高分求助.c#解析海量数据文本排重问题(求助高手帮忙)

changeme1314520 2009-10-14 08:07:51

我现在要解析一个800M的文本文件,然后进行数据统计,这个文本文件的格式基本如下
[2009-08-14 09:00:36] logtype:[photo] company:[dell] uid[2031]
[2009-08-14 09:00:37] logtype:[music] company:[dell] uid[2039]
[2009-08-14 09:00:38] logtype:[music] company:[IBM] uid[2038]
[2009-08-14 09:00:39] logtype:[music] company:[NOKIA] uid[2059]

此数据文件我已经解析到临时表 DataTable 中。

我现在要进行排重操作

比如我要取 company 是 dell 的项，因为其中有两条，我在进行统计时只统计一次就可以了。在 SQL 中可以用 DISTINCT 进行操作。

我现在的前提是这个文本数据量太大不能插入到数据库里面，请高手指点如何利用datatable进行排重操作.谢谢！

...全文

112 6 打赏收藏转发到动态举报

写回复

用AI写文章

6 条回复

切换为时间正序

请发表友善的回复…

发表回复

dfzx 2009-10-14

打赏
举报

采用多线程把数据分组插到数据库中

读的时候没必要全部读完，可以一次只读一部分。

starj1 2009-10-14

打赏
举报

修改下:

        /// <summary>
        /// Parses the log file line by line when the form loads. Every successfully parsed
        /// record is kept in a list, and the first record seen for each company is kept in a
        /// dictionary (the de-duplicated view). The list is finally sorted by company.
        /// </summary>
        /// <param name="sender">Event source; not used by this handler.</param>
        /// <param name="e">Event data; not used by this handler.</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            // Expected line shape: [date time] logtype:[word] company:[word] uid[digits]
            string sRegSplit = @"\[(?<time>\d+-\d+-\d+ \d+:\d+:\d+)\] logtype:\[(?<logtype>\w+)\] company:\[(?<company>\w+)\] uid\[(?<uid>\d+)\]";
            System.Text.RegularExpressions.Regex reSplit =
                new System.Text.RegularExpressions.Regex(sRegSplit, System.Text.RegularExpressions.RegexOptions.Compiled);

            // NOTE(review): both collections are locals and are not used after this method
            // returns - presumably the caller is expected to consume them here; confirm.
            Dictionary<string, Information> dictT = new Dictionary<string, Information>();
            List<Information> lstT = new List<Information>();

            using (System.IO.StreamReader srT = new System.IO.StreamReader("数据文件", Encoding.Default))
            {
                string line;
                // ReadLine returns null at end of stream, so no separate EndOfStream probe is needed.
                while ((line = srT.ReadLine()) != null)
                {
                    System.Text.RegularExpressions.Match match = reSplit.Match(line);
                    if (!match.Success)
                    {
                        // Blank or malformed lines would otherwise be stored as records whose
                        // fields are all empty strings and would register a bogus "" company.
                        continue;
                    }

                    Information infor = new Information();
                    infor.time = match.Groups["time"].Value;
                    infor.logtype = match.Groups["logtype"].Value;
                    infor.company = match.Groups["company"].Value;
                    infor.uid = match.Groups["uid"].Value;

                    // Keep every record.
                    lstT.Add(infor);

                    // Keep only the first record per company (duplicates are dropped).
                    if (!dictT.ContainsKey(infor.company))
                    {
                        dictT.Add(infor.company, infor);
                    }
                }
            }

            lstT.Sort(CompareInformation);
        }



        /// <summary>
        /// Comparison callback that orders two records by their <c>company</c> field.
        /// A null company sorts before any non-null company; two nulls compare equal.
        /// </summary>
        /// <param name="I1">Left-hand record.</param>
        /// <param name="I2">Right-hand record.</param>
        /// <returns>
        /// A negative value, zero, or a positive value when <paramref name="I1"/> sorts before,
        /// equal to, or after <paramref name="I2"/> respectively.
        /// </returns>
        private static int CompareInformation(Information I1, Information I2)
        {
            // Change the two assignments below to compare on a different field.
            string left = I1.company;
            string right = I2.company;

            if (left == null)
            {
                return right == null ? 0 : -1;
            }

            return right == null ? 1 : left.CompareTo(right);
        }



        /// <summary>
        /// One parsed log line. Each field holds the text captured by the regex group of the
        /// same name in <c>Form1_Load</c>; all values are kept as strings.
        /// </summary>
        struct Information
        {
            /// <summary>Text captured by the <c>time</c> group.</summary>
            public string time;
            /// <summary>Text captured by the <c>logtype</c> group.</summary>
            public string logtype;
            /// <summary>Text captured by the <c>company</c> group; the de-duplication and sort key.</summary>
            public string company;
            /// <summary>Text captured by the <c>uid</c> group.</summary>
            public string uid;
        }

starj1 2009-10-14

打赏
举报

写了一个,希望你内存够大

        /// <summary>
        /// Reads the log file one line at a time on form load, collecting every parsed record
        /// in a list and the first record per company in a dictionary, then sorts the list by
        /// company. The whole data set is held in memory.
        /// </summary>
        /// <param name="sender">Event source; not used by this handler.</param>
        /// <param name="e">Event data; not used by this handler.</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            // Expected line shape: [date time] logtype:[word] company:[word] uid[digits]
            string sRegSplit = @"\[(?<time>\d+-\d+-\d+ \d+:\d+:\d+)\] logtype:\[(?<logtype>\w+)\] company:\[(?<company>\w+)\] uid\[(?<uid>\d+)\]";
            System.Text.RegularExpressions.Regex reSplit =
                new System.Text.RegularExpressions.Regex(sRegSplit, System.Text.RegularExpressions.RegexOptions.Compiled);

            Dictionary<string, Information> dictT = new Dictionary<string, Information>();
            List<Information> lstT = new List<Information>();

            using (System.IO.StreamReader srT = new System.IO.StreamReader("数据文件", Encoding.Default))
            {
                string line;
                // ReadLine must be invoked (the original passed the method group, which does
                // not compile); it returns null once the end of the stream is reached.
                while ((line = srT.ReadLine()) != null)
                {
                    System.Text.RegularExpressions.Match match = reSplit.Match(line);
                    if (!match.Success)
                    {
                        // Ignore lines that do not follow the log format instead of storing
                        // records whose fields are all empty strings.
                        continue;
                    }

                    Information infor = new Information();
                    infor.time = match.Groups["time"].Value;
                    infor.logtype = match.Groups["logtype"].Value;
                    infor.company = match.Groups["company"].Value;
                    infor.uid = match.Groups["uid"].Value;

                    // Full record list.
                    lstT.Add(infor);

                    // De-duplicated view: only the first record per company is stored.
                    if (!dictT.ContainsKey(infor.company))
                    {
                        dictT.Add(infor.company, infor);
                    }
                }
            }

            lstT.Sort(CompareInformation);
        }



        /// <summary>
        /// Comparison callback that orders two records by their <c>company</c> field.
        /// A null company sorts before any non-null company; two nulls compare equal.
        /// </summary>
        /// <param name="I1">Left-hand record.</param>
        /// <param name="I2">Right-hand record.</param>
        /// <returns>
        /// A negative value, zero, or a positive value when <paramref name="I1"/> sorts before,
        /// equal to, or after <paramref name="I2"/> respectively.
        /// </returns>
        private static int CompareInformation(Information I1, Information I2)
        {
            // Compare on a string field: the struct itself has no conversion to string.
            string x = I1.company;
            string y = I2.company;

            if (x == null)
            {
                if (y == null)
                {
                    return 0;
                }

                return -1;
            }

            if (y == null)
            {
                return 1;
            }

            // Return the comparison result so that every code path yields a value.
            return x.CompareTo(y);
        }



        /// <summary>
        /// One parsed log line. Each field holds the text captured by the regex group of the
        /// same name in <c>Form1_Load</c>; all values are kept as strings.
        /// </summary>
        struct Information
        {
            /// <summary>Text captured by the <c>time</c> group.</summary>
            public string time;
            /// <summary>Text captured by the <c>logtype</c> group.</summary>
            public string logtype;
            /// <summary>Text captured by the <c>company</c> group; the de-duplication key.</summary>
            public string company;
            /// <summary>Text captured by the <c>uid</c> group.</summary>
            public string uid;
        }