linq查询的性能问题
本文关键字:问题 性能 查询 linq | 更新日期: 2023-09-27 18:25:25
我有一个函数,它先从目录中获取文件列表,然后在该列表中按文件名查找匹配项。它的性能非常糟糕。
以下是该函数:
/// <summary>
/// Recursively lists the files under <paramref name="serverDirectory"/> and reports which
/// permit numbers have a file whose name starts with that number.
/// </summary>
/// <remarks>
/// This is the original (slow) listing that the surrounding text analyses. Only the path
/// separator literal has been repaired; the query shapes are intentionally left untouched.
/// </remarks>
/// <param name="permitNumbers">Permit numbers to look for.</param>
/// <param name="serverDirectory">Root directory; all subdirectories are searched as well.</param>
/// <param name="type">
/// Selects the <c>fileType</c> stamped on each result and the file-name prefix length that is
/// compared against the permit numbers (5 for Well and DrillerLog, 14 for RasterLog).
/// </param>
/// <returns>
/// One entry per matching file (constructed with 1 and the file name) followed by one entry per
/// permit number without a match (constructed with 0 and an empty name). The list is empty when
/// the directory does not exist, when there are no files or no permit numbers, or when
/// <paramref name="type"/> is not one of the three handled values.
/// </returns>
public List<fileStatus> checkFilesStatus(List<string> permitNumbers, string serverDirectory, fileType type)
{
    XmlConfigurator.Configure();
    log.Debug(string.Format("Beginning checkFilesStatus with following parameters > permitNumbers: {0} > serverDirectory: {1} > type: {2}", string.Join(",", permitNumbers.ToArray()), serverDirectory, type.ToString()));
    List<fileStatus> results = new List<fileStatus>();
    DirectoryInfo dirInfo = new DirectoryInfo(serverDirectory);
    if (dirInfo.Exists)
    {
        // GET LIST OF ALL FILES IN DIRECTORY
        string[] files = System.IO.Directory.GetFiles(serverDirectory, "*", System.IO.SearchOption.AllDirectories);
        log.Debug(string.Format("List of all files in directory: {0}", string.Join(",", files)));
        if (files.Length > 0 && permitNumbers.Count > 0)
        {
            log.Debug("Checking for matching files");
            // CHECK FOR MATCHING FILES
            // In every case below:
            //  - f.Substring(f.LastIndexOf("\\") + 1) is the file name (text after the last backslash);
            //  - permitNumbers.Contains is a linear search because permitNumbers is a List;
            //  - the queries are deferred, so nonMatchingFiles re-runs matchingFiles for every
            //    permit number it tests.
            switch (type)
            {
                case fileType.Well:
                    var matchingFiles = (from f in files
                                         where f.Substring(f.LastIndexOf("\\") + 1).Length > 4
                                         where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 5))
                                         select new fileStatus(fileType.Well, f.Substring(f.LastIndexOf("\\") + 1, 5), 1, f.Substring(f.LastIndexOf("\\") + 1)));
                    var permitNumbersWithMatches = (from x in matchingFiles
                                                    select x.PermitNumber);
                    var nonMatchingFiles = (from p in permitNumbers
                                            where !permitNumbersWithMatches.Contains(p)
                                            select new fileStatus(fileType.Well, p, 0, string.Empty));
                    results.AddRange(matchingFiles);
                    results.AddRange(nonMatchingFiles);
                    break;
                case fileType.DrillerLog:
                    matchingFiles = (from f in files
                                     where f.Substring(f.LastIndexOf("\\") + 1).Length > 4
                                     where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 5))
                                     select new fileStatus(fileType.DrillerLog, f.Substring(f.LastIndexOf("\\") + 1, 5), 1, f.Substring(f.LastIndexOf("\\") + 1)));
                    permitNumbersWithMatches = (from x in matchingFiles
                                                select x.PermitNumber);
                    nonMatchingFiles = (from p in permitNumbers
                                        where !permitNumbersWithMatches.Contains(p)
                                        select new fileStatus(fileType.DrillerLog, p, 0, string.Empty));
                    results.AddRange(matchingFiles);
                    results.AddRange(nonMatchingFiles);
                    break;
                case fileType.RasterLog:
                    // Raster logs are keyed by a 14-character prefix instead of 5.
                    matchingFiles = (from f in files
                                     where f.Substring(f.LastIndexOf("\\") + 1).Length > 13
                                     where permitNumbers.Contains(f.Substring(f.LastIndexOf("\\") + 1, 14))
                                     select new fileStatus(fileType.RasterLog, f.Substring(f.LastIndexOf("\\") + 1, 14), 1, f.Substring(f.LastIndexOf("\\") + 1)));
                    permitNumbersWithMatches = (from x in matchingFiles
                                                select x.PermitNumber);
                    nonMatchingFiles = (from p in permitNumbers
                                        where !permitNumbersWithMatches.Contains(p)
                                        select new fileStatus(fileType.RasterLog, p, 0, string.Empty));
                    results.AddRange(matchingFiles);
                    results.AddRange(nonMatchingFiles);
                    break;
                default:
                    break;
            }
            log.Debug("Done checking for matching files");
        }
    }
    return results;
}
一旦执行到为"matchingFiles"赋值的linq查询,程序就会挂起。"permitNumbers"集合很大(约5000项),"files"集合也很大。
我能做些什么来加快速度吗?
根据下面给出的建议,我将函数修改如下,现在性能已符合预期。非常感谢大家!
/// <summary>
/// Recursively lists the files under <paramref name="serverDirectory"/> and reports which
/// permit numbers have a file whose name starts with that number.
/// </summary>
/// <param name="permitNumbers">Permit numbers to look for; duplicates are collapsed.</param>
/// <param name="serverDirectory">Root directory; all subdirectories are searched as well.</param>
/// <param name="type">
/// Selects the <c>fileType</c> stamped on each result and the file-name prefix length that is
/// compared against the permit numbers (5 for Well and DrillerLog, 14 for RasterLog).
/// </param>
/// <returns>
/// One entry per distinct matching file name (constructed with 1 and the file name) followed by
/// one entry per permit number without a match (constructed with 0 and an empty name). The list
/// is empty when the directory does not exist, when there are no files or no permit numbers, or
/// when <paramref name="type"/> is not one of the three handled values.
/// </returns>
public List<fileStatus> checkFilesStatus(List<string> permitNumbers, string serverDirectory, fileType type)
{
    // Hash set so that each membership test below is a hash lookup instead of a list scan.
    HashSet<string> numbers = new HashSet<string>(permitNumbers);
    XmlConfigurator.Configure();
    log.Debug(string.Format("Beginning checkFilesStatus with following parameters > permitNumbers: {0} > serverDirectory: {1} > type: {2}", string.Join(",", permitNumbers.ToArray()), serverDirectory, type.ToString()));
    List<fileStatus> results = new List<fileStatus>();
    DirectoryInfo dirInfo = new DirectoryInfo(serverDirectory);
    if (!dirInfo.Exists)
    {
        return results;
    }

    // GET LIST OF ALL FILES IN DIRECTORY
    string[] files = System.IO.Directory.GetFiles(serverDirectory, "*", System.IO.SearchOption.AllDirectories);
    // Only the file name (no directory part) takes part in matching; equal names found in
    // different subdirectories collapse into one entry.
    HashSet<string> fileNames = new HashSet<string>(files.Select(f => Path.GetFileName(f)));
    log.Debug(string.Format("List of all files in directory: {0}", string.Join(",", files)));
    if (fileNames.Count == 0 || numbers.Count == 0)
    {
        return results;
    }

    log.Debug("Checking for matching files");

    // The three handled types differ only in how many leading characters of the file name
    // form the permit number; 0 marks an unhandled type.
    int prefixLength;
    switch (type)
    {
        case fileType.Well:
        case fileType.DrillerLog:
            prefixLength = 5;
            break;
        case fileType.RasterLog:
            prefixLength = 14;
            break;
        default:
            prefixLength = 0;
            break;
    }

    if (prefixLength > 0)
    {
        // CHECK FOR MATCHING FILES
        // Materialised once: the result is consumed twice below, and a deferred query would
        // rebuild every fileStatus on each enumeration.
        List<fileStatus> matchingFiles = (from f in fileNames
                                          where f.Length >= prefixLength
                                          let permit = f.Substring(0, prefixLength)
                                          where numbers.Contains(permit)
                                          select new fileStatus(type, permit, 1, f)).ToList();

        // Permit numbers that did not show up as a prefix of any file name.
        IEnumerable<fileStatus> nonMatchingFiles = numbers
            .Except(matchingFiles.Select(m => m.PermitNumber))
            .Select(p => new fileStatus(type, p, 0, string.Empty));

        results.AddRange(matchingFiles);
        results.AddRange(nonMatchingFiles);
    }

    log.Debug("Done checking for matching files");
    return results;
}
您创建了一个查询 matchingFiles,它在被迭代时会遍历您所有的文件,以多种方式处理它们,并且对您的许可证号集合进行线性搜索。然后,您针对每个许可证号都重新执行一次这个查询(需要从磁盘上重复读取大量数据,如果数据量大到超出缓存,这将非常昂贵),并在其结果上再做线性搜索。这导致了O(N^2*M)的渐近复杂度,其中N是许可证号的数量,M是文件的数量。这……非常糟糕。
这里的关键是避免1)进行线性搜索和2)多次迭代复杂查询,尤其是避免对其他序列中的每个项目进行迭代。
对于#1,只需将permitNumbers设为HashSet<string>而不是列表,这样检查其中是否包含某一项就变成了O(1)操作。
对于#2,将您的第三个查询替换为只需要迭代源序列一次的操作:
// Set difference first: permit numbers that never appear among the matched ones.
var unmatchedPermits = permitNumbers.Except(permitNumbersWithMatches);
// Then wrap each remaining permit number in a "no file found" status entry.
var nonMatchingFiles = unmatchedPermits.Select(permit => new fileStatus(fileType.Well, permit, 0, string.Empty));
我会改为对每个路径只调用一次Path.GetFileName(f),以消除对f.Substring(f.LastIndexOf("\\")+1)的所有重复调用。
例如:
// Strip the directory part once per path instead of repeating Substring/LastIndexOf.
var fileNames = files.Select(f => Path.GetFileName(f));
// A file matches when its first five characters are one of the permit numbers.
var matchingFiles = (from fname in fileNames
                     where fname.Length > 4
                     where permitNumbers.Contains(fname.Substring(0, 5))
                     select new fileStatus(fileType.Well, fname.Substring(0, 5), 1, fname));