linq查询的性能问题

本文关键字:问题 性能 查询 linq | 更新日期: 2023-09-27 18:25:25

我有一个函数,它从目录中获取文件列表,然后在列表中搜索文件名以查找匹配项。性能非常糟糕。

以下是该函数:

/// <summary>
/// Lists every file under <paramref name="serverDirectory"/> (subdirectories included) and
/// reports which of the requested permit numbers have a file whose name starts with that number.
/// </summary>
/// <param name="permitNumbers">Permit numbers to look for; compared against the file-name prefix.</param>
/// <param name="serverDirectory">Root directory that is searched recursively.</param>
/// <param name="type">
/// Selects the prefix length: 5 characters for <c>Well</c> and <c>DrillerLog</c>,
/// 14 characters for <c>RasterLog</c>. Any other value produces no results.
/// </param>
/// <returns>
/// One entry (constructed with 1 and the file name) per matching file, followed by one entry
/// (constructed with 0 and an empty file name) per permit number without a match. The list is
/// empty when the directory does not exist, contains no files, or no permit numbers were given.
/// </returns>
public List<fileStatus> checkFilesStatus(List<string> permitNumbers, string serverDirectory, fileType type)
    {
        XmlConfigurator.Configure();
        log.Debug(string.Format("Beginning checkFilesStatus with following parameters > permitNumbers: {0} > serverDirectory: {1} > type: {2}", string.Join(",", permitNumbers.ToArray()), serverDirectory, type.ToString()));
        List<fileStatus> results = new List<fileStatus>();
        DirectoryInfo dirInfo = new DirectoryInfo(serverDirectory);
        if (dirInfo.Exists)
        {
            // GET LIST OF ALL FILES IN DIRECTORY
            string[] files = System.IO.Directory.GetFiles(serverDirectory, "*", System.IO.SearchOption.AllDirectories);
            log.Debug(string.Format("List of all files in directory: {0}", string.Join(",", files)));

            if (files.Length > 0 && permitNumbers.Count > 0)
            {
                log.Debug("Checking for matching files");
                // CHECK FOR MATCHING FILES
                //
                // f.Substring(f.LastIndexOf("\\") + 1) strips the directory part, leaving the bare
                // file name (everything after the last backslash).
                //
                // NOTE(perf): permitNumbers is a List<string>, so Contains is a linear scan for every
                // file. matchingFiles is a deferred query, so the Contains call inside
                // nonMatchingFiles re-runs it for each permit number.
                switch (type)
                {
                    case fileType.Well:
                        var matchingFiles = (from f in files
                                             where f.Substring(f.LastIndexOf("\\") + 1).Length > 4
                                             where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 5))
                                             select new fileStatus(fileType.Well, f.Substring(f.LastIndexOf("\\") + 1, 5), 1, f.Substring(f.LastIndexOf("\\") + 1)));

                        var permitNumbersWithMatches = (from x in matchingFiles
                                                       select x.PermitNumber);
                        var nonMatchingFiles = (from p in permitNumbers
                                                where !permitNumbersWithMatches.Contains(p)
                                                select new fileStatus(fileType.Well, p, 0, string.Empty));
                        results.AddRange(matchingFiles);
                        results.AddRange(nonMatchingFiles);
                        break;
                    case fileType.DrillerLog:
                        matchingFiles = (from f in files
                                         where f.Substring(f.LastIndexOf("\\") + 1).Length > 4
                                         where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 5))
                                         select new fileStatus(fileType.DrillerLog, f.Substring(f.LastIndexOf("\\") + 1, 5), 1, f.Substring(f.LastIndexOf("\\") + 1)));
                        permitNumbersWithMatches = (from x in matchingFiles
                                                       select x.PermitNumber);
                        nonMatchingFiles = (from p in permitNumbers
                                                where !permitNumbersWithMatches.Contains(p)
                                            select new fileStatus(fileType.DrillerLog, p, 0, string.Empty));

                        results.AddRange(matchingFiles);
                        results.AddRange(nonMatchingFiles);
                        break;
                    case fileType.RasterLog:
                        // Raster logs are matched on a 14-character prefix instead of 5.
                        matchingFiles = (from f in files
                                         where f.Substring(f.LastIndexOf("\\") + 1).Length > 13
                                         where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 14))
                                         select new fileStatus(fileType.RasterLog, f.Substring(f.LastIndexOf("\\") + 1, 14), 1, f.Substring(f.LastIndexOf("\\") + 1)));
                        permitNumbersWithMatches = (from x in matchingFiles
                                                       select x.PermitNumber);
                        nonMatchingFiles = (from p in permitNumbers
                                                where !permitNumbersWithMatches.Contains(p)
                                            select new fileStatus(fileType.RasterLog, p, 0, string.Empty));

                        results.AddRange(matchingFiles);
                        results.AddRange(nonMatchingFiles);
                        break;
                    default:
                        break;
                }
                log.Debug("Done checking for matching files");
            }
        }
        return results;
    }

一旦它到达为"matchingFiles"提供值的linq查询,它就会挂起。这是一个很大的"permitNumbers"集合(如5000),还有一个大的"files"集合。

我能做些什么来加快速度吗?

考虑到下面提供的建议,我将函数修改如下,现在性能符合预期。非常感谢大家!

/// <summary>
/// Lists every file under <paramref name="serverDirectory"/> (subdirectories included) and
/// reports which of the requested permit numbers have a file whose name starts with that number.
/// </summary>
/// <param name="permitNumbers">Permit numbers to look for; duplicates are collapsed into a set.</param>
/// <param name="serverDirectory">Root directory that is searched recursively.</param>
/// <param name="type">
/// Selects the prefix length: 5 characters for <c>Well</c> and <c>DrillerLog</c>,
/// 14 characters for <c>RasterLog</c>. Any other value produces no results.
/// </param>
/// <returns>
/// One entry (constructed with 1 and the file name) per distinct matching file name, followed by
/// one entry (constructed with 0 and an empty file name) per permit number without a match. The
/// list is empty when the directory does not exist, contains no files, or no permit numbers were given.
/// </returns>
public List<fileStatus> checkFilesStatus(List<string> permitNumbers, string serverDirectory, fileType type)
    {
        // Hash set gives constant-time membership checks instead of a linear List.Contains scan.
        HashSet<string> numbers = new HashSet<string>(permitNumbers);
        XmlConfigurator.Configure();
        log.Debug(string.Format("Beginning checkFilesStatus with following parameters > permitNumbers: {0} > serverDirectory: {1} > type: {2}", string.Join(",", permitNumbers.ToArray()), serverDirectory, type.ToString()));
        List<fileStatus> results = new List<fileStatus>();
        DirectoryInfo dirInfo = new DirectoryInfo(serverDirectory);
        if (!dirInfo.Exists)
        {
            return results;
        }

        // GET LIST OF ALL FILES IN DIRECTORY
        string[] files = System.IO.Directory.GetFiles(serverDirectory, "*", System.IO.SearchOption.AllDirectories);
        // Only the bare file name takes part in matching; identical names found in different
        // subdirectories collapse into a single entry.
        HashSet<string> fileNames = new HashSet<string>(files.Select(f => Path.GetFileName(f)));
        log.Debug(string.Format("List of all files in directory: {0}", string.Join(",", files)));

        if (fileNames.Count == 0 || numbers.Count == 0)
        {
            return results;
        }

        log.Debug("Checking for matching files");

        // The file types differ only in how many leading characters of the file name
        // form the permit number.
        int prefixLength;
        switch (type)
        {
            case fileType.Well:
            case fileType.DrillerLog:
                prefixLength = 5;
                break;
            case fileType.RasterLog:
                prefixLength = 14;
                break;
            default:
                // Unknown type: nothing to match, the result stays empty.
                prefixLength = 0;
                break;
        }

        if (prefixLength > 0)
        {
            // CHECK FOR MATCHING FILES
            // Materialized once: the list is consumed both by AddRange and by Except below, and a
            // deferred query would be re-run (constructing every fileStatus again) for each of them.
            List<fileStatus> matchingFiles = (from f in fileNames
                                              where f.Length >= prefixLength
                                              where numbers.Contains(f.Substring(0, prefixLength))
                                              select new fileStatus(type, f.Substring(0, prefixLength), 1, f)).ToList();

            var permitNumbersWithMatches = matchingFiles.Select(x => x.PermitNumber);
            // Set difference: permit numbers for which no file was found.
            var nonMatchingFiles = numbers.Except(permitNumbersWithMatches)
                .Select(p => new fileStatus(type, p, 0, string.Empty));

            results.AddRange(matchingFiles);
            results.AddRange(nonMatchingFiles);
        }

        log.Debug("Done checking for matching files");
        return results;
    }

linq查询的性能问题

您创建了一个查询matchingFiles,每次迭代它时,都会遍历您拥有的所有文件,以多种方式对它们进行处理,并对您的许可证号集合进行线性搜索。然后,您针对每一个许可证号都执行一次这个查询(这需要从磁盘上重复读取大量数据,如果数据量大到超出缓存容量,代价会非常高),并在其结果上执行线性搜索。这导致了O(N^2*M)的渐近复杂度,其中N是许可证号的数量,M是文件的数量。这……非常糟糕。

这里的关键是避免1)进行线性搜索和2)多次迭代复杂查询,尤其是避免对其他序列中的每个项目进行迭代。

对于#1,只需将permitNumbers设为HashSet<string>而不是列表,然后检查其中是否包含项目将变成O(1)操作。

对于#2,将您的第三个查询替换为只需要迭代源序列一次的操作:

// Permit numbers that have no matching file: a single set-difference pass
// instead of a Contains lookup per permit number.
var nonMatchingFiles = (from p in permitNumbers.Except(permitNumbersWithMatches)
                        select new fileStatus(fileType.Well, p, 0, string.Empty));

我会通过对Path.GetFileName(f)的一次调用来消除对f.Substring(f.LastIndexOf("\\")+1)的所有重复调用。

例如

// Extract each bare file name once instead of repeating the Substring/LastIndexOf expression.
var fileNames = files.Select(f => Path.GetFileName(f));
// Keep names of at least 5 characters whose 5-character prefix is a requested permit number.
var matchingFiles = (from fname in fileNames
                     where fname.Length > 4
                     where permitNumbers.Contains(fname.Substring(0, 5))
                     select new fileStatus(fileType.Well, fname.Substring(0, 5), 1, fname));