在.NET中获取目录数据的最快方法
本文关键字:方法 数据 NET 获取 | 更新日期: 2023-09-27 18:26:13
我正在开发一个文件同步服务,用于在不同机器上的两个文件夹之间同步文件。我需要找到一种非常快速的方法来枚举目录并从中提取以下信息:
- 该目录中所有文件路径和子目录路径的一种或多种数据结构,包括每个文件或子目录的最后写入时间
- 对于在当前目录下的任何级别上找到的每个子目录,同上
到目前为止,我已经想出了这个:
/// <summary>
/// Benchmark driver: repeatedly enumerates the whole of drive C: and prints how many
/// milliseconds each full pass took, pausing one second between passes.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    // Both lists are assigned by the callee through its out parameters, so nothing is
    // allocated up front.
    List<Tuple<string, DateTime>> files;
    List<Tuple<string, DateTime>> directories;
    Stopwatch watch = new Stopwatch();
    while (true)
    {
        // Restart() zeroes the elapsed time and starts timing in a single call.
        watch.Restart();
        // Verbatim string: exactly one backslash, i.e. the root of drive C.
        while (!CheckFolderRecursiveSingleThreaded(@"C:\", out files, out directories))
        {
            // You can assume for all intents and purposes that drive C does exist and that you have access to it, which will cause this sleep to not get called.
            Thread.Sleep(1000);
        }
        watch.Stop();
        Console.WriteLine(watch.ElapsedMilliseconds);
        // Do something with the information.
        Thread.Sleep(1000);
    }
}
/// <summary>
/// Walks <paramref name="path"/> depth-first on the calling thread and collects the full
/// name and UTC last-write time of every file and sub-directory below it.
/// </summary>
/// <param name="path">Directory to start from.</param>
/// <param name="files">Receives one (full path, last write UTC) entry per file, or null on failure.</param>
/// <param name="directories">Receives one (full path, last write UTC) entry per sub-directory, or null on failure.</param>
/// <returns>
/// true when <paramref name="path"/> itself could be listed; false when any exception was
/// raised while listing it. A sub-directory that fails is still recorded in
/// <paramref name="directories"/>, but its contents are left out.
/// </returns>
static bool CheckFolderRecursiveSingleThreaded(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    try
    {
        var root = new DirectoryInfo(path);
        var collectedFiles = new List<Tuple<string, DateTime>>();
        var collectedDirectories = new List<Tuple<string, DateTime>>();

        foreach (FileInfo entry in root.GetFiles())
        {
            collectedFiles.Add(Tuple.Create(entry.FullName, entry.LastWriteTimeUtc));
        }

        foreach (DirectoryInfo child in root.GetDirectories())
        {
            // A set ReparsePoint flag indicates a symbolic link; those are neither recorded nor entered.
            if ((child.Attributes & FileAttributes.ReparsePoint) == FileAttributes.ReparsePoint)
            {
                continue;
            }

            collectedDirectories.Add(Tuple.Create(child.FullName, child.LastWriteTimeUtc));

            List<Tuple<string, DateTime>> nestedFiles;
            List<Tuple<string, DateTime>> nestedDirectories;
            if (!CheckFolderRecursiveSingleThreaded(child.FullName, out nestedFiles, out nestedDirectories))
            {
                // The child could not be listed: keep its own entry, skip its contents.
                continue;
            }

            collectedFiles.AddRange(nestedFiles);
            collectedDirectories.AddRange(nestedDirectories);
        }

        files = collectedFiles;
        directories = collectedDirectories;
        return true;
    }
    catch
    {
        files = null;
        directories = null;
        return false;
    }
}
从性能角度来看,枚举我的 C:\ 驱动器并生成一个包含549254个文件和83235个文件夹的列表大约需要22秒(无论是发布模式还是调试模式,均在未附加调试器的情况下运行),但是否可以更快?我愿意接受任何建议,甚至是MSVC++方面的建议。
编辑:借助多线程(必须在发布模式下测试),使用LINQ的AsParallel后耗时降到约12秒。请注意,这里只对 C:\ 的直接子文件夹做并行处理,递归调用仍然使用我上面的单线程实现,否则对所有文件夹都进行并行处理将花费很长时间!
/// <summary>
/// Lists <paramref name="path"/> and then walks each of its immediate sub-directories in
/// parallel (PLINQ), using the single-threaded recursive walker inside every sub-directory.
/// </summary>
/// <param name="path">Directory to start from.</param>
/// <param name="files">Receives one (full path, last write UTC) entry per file, or null on failure.</param>
/// <param name="directories">Receives one (full path, last write UTC) entry per sub-directory, or null on failure.</param>
/// <returns>true on success; false when any exception was raised while listing <paramref name="path"/> or inside a worker.</returns>
/// <remarks>
/// The order of entries contributed by different sub-directories depends on scheduling and
/// is not deterministic.
/// </remarks>
static bool CheckFolderParallelled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    try
    {
        DirectoryInfo directoryInformation = new DirectoryInfo(path);
        List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
        foreach (FileInfo file in directoryInformation.GetFiles())
        {
            fileList.Add(new Tuple<string, DateTime>(file.FullName, file.LastWriteTimeUtc));
        }
        List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
        // List<T> is not safe for concurrent writers, so every mutation made by a worker
        // below happens while holding this lock.
        object resultsLock = new object();
        directoryInformation.GetDirectories().AsParallel().ForAll(directory =>
        {
            // Check for the ReparsePoint flag, which will indicate a symbolic link.
            if (directory.Attributes.HasFlag(FileAttributes.ReparsePoint))
            {
                return;
            }
            Tuple<string, DateTime> entry = new Tuple<string, DateTime>(directory.FullName, directory.LastWriteTimeUtc);
            List<Tuple<string, DateTime>> directoryFiles;
            List<Tuple<string, DateTime>> directoryFolders;
            // The expensive walk runs outside the lock; only the merge is serialized.
            bool walked = CheckFolderRecursiveSingleThreaded(directory.FullName, out directoryFiles, out directoryFolders);
            lock (resultsLock)
            {
                directoryList.Add(entry);
                if (walked)
                {
                    fileList.AddRange(directoryFiles);
                    directoryList.AddRange(directoryFolders);
                }
            }
        });
        files = fileList;
        directories = directoryList;
        return true;
    }
    catch
    {
        files = null;
        directories = null;
        return false;
    }
}
编辑:采用Alexei所给链接中Mark Gravell的被采纳答案的方案,仍然需要大约21秒。这种非递归技术并不是最快的(维护该Queue数据结构的成本,可能与在调用栈上压入和弹出该方法的调用一样昂贵):
/// <summary>
/// Breadth-first, non-recursive walk of <paramref name="path"/>: directories waiting to be
/// listed are kept in a queue instead of on the call stack.
/// </summary>
/// <param name="path">Directory to start from.</param>
/// <param name="files">Receives one (full path, last write UTC) entry per file, or null on failure.</param>
/// <param name="directories">Receives one (full path, last write UTC) entry per sub-directory, or null on failure.</param>
/// <returns>
/// true when the walk ran to completion (directories that raise while being listed are
/// skipped silently); false when an exception escaped the walk itself.
/// </returns>
static bool CheckFolderNonRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    try
    {
        var collectedFiles = new List<Tuple<string, DateTime>>();
        var collectedDirectories = new List<Tuple<string, DateTime>>();
        var worklist = new ConcurrentQueue<DirectoryInfo>();
        worklist.Enqueue(new DirectoryInfo(path));

        DirectoryInfo current;
        // Only this thread touches the queue, so a failed dequeue means it is empty.
        while (worklist.TryDequeue(out current))
        {
            try
            {
                foreach (FileInfo entry in current.GetFiles())
                {
                    collectedFiles.Add(Tuple.Create(entry.FullName, entry.LastWriteTimeUtc));
                }

                foreach (DirectoryInfo child in current.GetDirectories())
                {
                    // A set ReparsePoint flag indicates a symbolic link; those are skipped.
                    if ((child.Attributes & FileAttributes.ReparsePoint) == FileAttributes.ReparsePoint)
                    {
                        continue;
                    }

                    collectedDirectories.Add(Tuple.Create(child.FullName, child.LastWriteTimeUtc));
                    worklist.Enqueue(child);
                }
            }
            catch
            {
                // Ignore directories with no access rights.
            }
        }

        files = collectedFiles;
        directories = collectedDirectories;
        return true;
    }
    catch
    {
        files = null;
        directories = null;
        return false;
    }
}
编辑:这个问题并不局限于.NET,因为使用像boost这样的MSVC++库可能有更快的方法,但我还没有找到。如果有人能用C++写出一个更快的C驱动器枚举器来提取相同的数据,击败我的C#方法,首先要赞扬你做得更快,其次我真的很想看看它,第三它会帮助很多人(不仅仅是我自己)。我对boost的尝试目前只到这一步:我发现以下方法花费了大约200000毫秒,比我上面发布的任何代码都长得多:
#include "stdafx.h"
#include <iostream>
#include <Windows.h>
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <boost/timer.hpp>
namespace fs = boost::filesystem;
bool IterateDirectory(const wchar_t *directory);
// Benchmark driver: walks drive C: over and over, printing the elapsed time of each pass
// in milliseconds and sleeping one second between passes.
int _tmain(int argc, _TCHAR* argv[])
{
    boost::timer timer;
    while (true)
    {
        timer.restart();
        // L makes the literal wide, since IterateDirectory takes wchar_t.
        // R makes it a raw string literal, so the backslash is taken as-is instead of
        // starting an escape sequence; the literal is the root of drive C.
        IterateDirectory(LR"(C:\)");
        std::cout << "Elapsed time: " << timer.elapsed() * 1000 << " ms" << std::endl;
        Sleep(1000);
    }
    return 0; // Not reached: the loop above never exits.
}
// IterateDirectory takes wchar_t because path.c_str() always returns wchar_t whether you are using unicode or multibyte.
bool IterateDirectory(const wchar_t *directory)
{
if (boost::filesystem::exists(directory))
{
fs::directory_iterator it(directory), eod;
BOOST_FOREACH(fs::path path, std::make_pair(it, eod))
{
try
{
if (is_regular_file(path))
{
//std::cout << path << ", last write time: " << last_write_time(path) << '.' << std::endl;
}
if (is_directory(path))
{
//std::cout << path << ", last write time: " << last_write_time(path) << '.' << std::endl;
// path.c_str() always returns wchar_t, whether you are using unicode or multibyte. This is probably because of multi-language support inside of the Windows operating system and file structure.
IterateDirectory(path.c_str());
}
}
catch (...) { } // Ignore directories we don't have access to.
}
return true;
}
return false;
}
编辑:通过P/Invoke调用FindFirstFile和FindNextFile,迭代我的整个C驱动器大约花了6秒(感谢重复问题的链接和Sam Saffron的回答)。但是它还能更快吗?
// P/Invoke declarations for the kernel32.dll directory-search functions used below.
// Strings are marshalled as Unicode for the first two imports (CharSet.Unicode).

// Starts a search for lpFileName and fills lpFindFileData with the first match.
// Callers in this file compare the returned handle against INVALID_HANDLE_VALUE to detect failure.
// SetLastError = true asks the marshaller to preserve the Win32 last-error code.
[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);
// Advances the search identified by hFindFile; callers in this file loop while it returns true.
[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);
// Releases a search handle obtained from FindFirstFileW.
[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);
// Managed mirror of the record filled in by FindFirstFileW / FindNextFile.
// Sequential layout: the field order and sizes below must not be changed.
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW {
// Attribute flags of the entry; tested in this file for Directory and ReparsePoint.
public FileAttributes dwFileAttributes;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
// The only timestamp read by the enumerators in this file (via ToDateTime()).
internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
public int nFileSizeHigh;
public int nFileSizeLow;
public int dwReserved0;
public int dwReserved1;
// Entry name, marshalled from an inline fixed-size buffer of 260 characters.
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
public string cFileName;
// Alternate name, marshalled from an inline fixed-size buffer of 14 characters.
// NOTE(review): presumably the 8.3 short name — confirm against the Win32 documentation.
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
public string cAlternateFileName;
}
// Sentinel that FindFirstFileW returns when a search could not be started.
static readonly IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);

/// <summary>
/// Recursively enumerates <paramref name="path"/> through FindFirstFileW / FindNextFile and
/// collects the full path and UTC last-write time of every file and sub-directory below it.
/// </summary>
/// <param name="path">Directory to enumerate, with or without a trailing backslash.</param>
/// <param name="files">Receives one (full path, last write UTC) entry per file, or null on failure.</param>
/// <param name="directories">Receives one (full path, last write UTC) entry per sub-directory, or null on failure.</param>
/// <returns>
/// true when no exception occurred, including the case where the search could not be
/// started and both lists stay empty; false when an exception was caught, which is also
/// written to the console.
/// </returns>
static bool FindNextFilePInvokeRecursive(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
    List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    try
    {
        // Normalise once so the search pattern and every child path get exactly one separator.
        string prefix = path.EndsWith(@"\", StringComparison.Ordinal) ? path : path + @"\";
        WIN32_FIND_DATAW findData;
        findHandle = FindFirstFileW(prefix + "*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip the current-directory and parent-directory entries.
                if (findData.cFileName == "." || findData.cFileName == "..") continue;
                string fullPath = prefix + findData.cFileName;
                if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                {
                    fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
                // Directories that are reparse points (symbolic links) are neither recorded nor
                // entered, since they could lead to repeated entries as well as infinite loops.
                else if (!findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                {
                    directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                    List<Tuple<string, DateTime>> subDirectoryFileList;
                    List<Tuple<string, DateTime>> subDirectoryDirectoryList;
                    if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                    {
                        fileList.AddRange(subDirectoryFileList);
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            }
            while (FindNextFile(findHandle, out findData));
        }
        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        files = null;
        directories = null;
        return false;
    }
    finally
    {
        // Single release point for the search handle on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
    }
}
/// <summary>
/// Conversion helpers for the COM FILETIME structure.
/// </summary>
public static class FILETIMEExtensions
{
    /// <summary>
    /// Combines the two 32-bit halves of <paramref name="filetime"/> into a 64-bit file time
    /// and converts it to a UTC <see cref="DateTime"/>.
    /// </summary>
    /// <param name="filetime">The file time to convert.</param>
    /// <returns>The equivalent UTC date and time.</returns>
    public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME filetime)
    {
        long highBits = (long)filetime.dwHighDateTime << 32;
        // dwLowDateTime is declared as a signed int. Reinterpret it as unsigned before
        // widening; otherwise a low half with bit 31 set is sign-extended and the result is
        // 2^32 ticks too small.
        long lowBits = (uint)filetime.dwLowDateTime;
        return DateTime.FromFileTimeUtc(highBits | lowBits);
    }
}
编辑:是的,可以更快。通过对目标文件夹各个子目录的递归进行并行化,并使用上面的FindNextFilePInvokeRecursive方法,我把时间缩短到了4秒。也就是说,用4秒就能迭代我的整个C驱动器并取得我需要的数据。我可以在进程监视器中看到,我消耗了大约30%的CPU,磁盘最多只占用了1%,这对我来说有点奇怪,目前不确定为什么会这样,也许只是这种链表遍历的方式使磁盘占用可以忽略不计。理想情况下,它至少应该把CPU用满(100%),但这可能取决于被并行处理的子文件夹的数量和深度。但它还能更快吗?
/// <summary>
/// Lists <paramref name="path"/> through FindFirstFileW / FindNextFile and then walks each
/// of its immediate sub-directories in parallel with FindNextFilePInvokeRecursive.
/// </summary>
/// <param name="path">Directory to enumerate, with or without a trailing backslash.</param>
/// <param name="files">Receives one (full path, last write UTC) entry per file, or null on failure.</param>
/// <param name="directories">Receives one (full path, last write UTC) entry per sub-directory, or null on failure.</param>
/// <returns>true when no exception occurred; false when one was caught, which is also written to the console.</returns>
/// <remarks>The order of entries contributed by different sub-directories is not deterministic.</remarks>
static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<Tuple<string, DateTime>> files, out List<Tuple<string, DateTime>> directories)
{
    List<Tuple<string, DateTime>> fileList = new List<Tuple<string, DateTime>>();
    List<Tuple<string, DateTime>> directoryList = new List<Tuple<string, DateTime>>();
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    try
    {
        // Normalise once so the search pattern and every child path get exactly one separator.
        string prefix = path.EndsWith(@"\", StringComparison.Ordinal) ? path : path + @"\";
        WIN32_FIND_DATAW findData;
        findHandle = FindFirstFileW(prefix + "*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip the current-directory and parent-directory entries.
                if (findData.cFileName == "." || findData.cFileName == "..") continue;
                string fullPath = prefix + findData.cFileName;
                // Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
                if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                {
                    directoryList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
                else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                {
                    fileList.Add(new Tuple<string, DateTime>(fullPath, findData.ftLastWriteTime.ToDateTime()));
                }
            }
            while (FindNextFile(findHandle, out findData));

            // Iterate over a snapshot: the workers append to directoryList, and a list must
            // not be modified while it is the source of a running query.
            Tuple<string, DateTime>[] topLevelDirectories = directoryList.ToArray();
            // List<T> is not safe for concurrent writers, so merges are serialized.
            object resultsLock = new object();
            topLevelDirectories.AsParallel().ForAll(x =>
            {
                List<Tuple<string, DateTime>> subDirectoryFileList;
                List<Tuple<string, DateTime>> subDirectoryDirectoryList;
                if (FindNextFilePInvokeRecursive(x.Item1, out subDirectoryFileList, out subDirectoryDirectoryList))
                {
                    lock (resultsLock)
                    {
                        fileList.AddRange(subDirectoryFileList);
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            });
        }
        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        files = null;
        directories = null;
        return false;
    }
    finally
    {
        // Single release point for the search handle on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
    }
}
编辑:之前使用并行时忘记添加并发锁,不加锁可能会抛出异常。另外,为了满足我的需要,去掉了元组,改用FileInformation/DirectoryInformation类。这又缩短了0.5秒。现在枚举我的C:驱动器只需3.5秒。
// P/Invoke declarations for the kernel32.dll directory-search functions used below.
// Strings are marshalled as Unicode for the first two imports (CharSet.Unicode).

// Starts a search for lpFileName and fills lpFindFileData with the first match.
// Callers in this file compare the returned handle against INVALID_HANDLE_VALUE to detect failure.
// SetLastError = true asks the marshaller to preserve the Win32 last-error code.
[DllImport("kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
public static extern IntPtr FindFirstFileW(string lpFileName, out WIN32_FIND_DATAW lpFindFileData);
// Advances the search identified by hFindFile; callers in this file loop while it returns true.
[DllImport("kernel32.dll", CharSet = CharSet.Unicode)]
public static extern bool FindNextFile(IntPtr hFindFile, out WIN32_FIND_DATAW lpFindFileData);
// Releases a search handle obtained from FindFirstFileW.
[DllImport("kernel32.dll")]
public static extern bool FindClose(IntPtr hFindFile);
// Managed mirror of the record filled in by FindFirstFileW / FindNextFile.
// Sequential layout: the field order and sizes below must not be changed.
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
public struct WIN32_FIND_DATAW {
// Attribute flags of the entry; tested in this file for Directory and ReparsePoint.
public FileAttributes dwFileAttributes;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftCreationTime;
internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastAccessTime;
// The only timestamp read by the enumerators in this file (via ToDateTime()).
internal System.Runtime.InteropServices.ComTypes.FILETIME ftLastWriteTime;
public int nFileSizeHigh;
public int nFileSizeLow;
public int dwReserved0;
public int dwReserved1;
// Entry name, marshalled from an inline fixed-size buffer of 260 characters.
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 260)]
public string cFileName;
// Alternate name, marshalled from an inline fixed-size buffer of 14 characters.
// NOTE(review): presumably the 8.3 short name — confirm against the Win32 documentation.
[MarshalAs(UnmanagedType.ByValTStr, SizeConst = 14)]
public string cAlternateFileName;
}
// Sentinel that FindFirstFileW returns when a search could not be started.
static readonly IntPtr INVALID_HANDLE_VALUE = new IntPtr(-1);

/// <summary>
/// Recursively enumerates <paramref name="path"/> through FindFirstFileW / FindNextFile and
/// collects the full path and UTC last-write time of every file and sub-directory below it.
/// </summary>
/// <param name="path">Directory to enumerate, with or without a trailing backslash.</param>
/// <param name="files">Receives one entry per file, or null on failure.</param>
/// <param name="directories">Receives one entry per sub-directory, or null on failure.</param>
/// <returns>
/// true when no exception occurred, including the case where the search could not be
/// started and both lists stay empty; false when an exception was caught, which is also
/// written to the console.
/// </returns>
static bool FindNextFilePInvokeRecursive(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
    List<FileInformation> fileList = new List<FileInformation>();
    List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    try
    {
        // Normalise once so the search pattern and every child path get exactly one
        // separator, even when the caller passes a path that already ends with one.
        string prefix = path.EndsWith(@"\", StringComparison.Ordinal) ? path : path + @"\";
        WIN32_FIND_DATAW findData;
        findHandle = FindFirstFileW(prefix + "*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip current directory and parent directory symbols that are returned.
                if (findData.cFileName == "." || findData.cFileName == "..") continue;
                string fullPath = prefix + findData.cFileName;
                if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                {
                    fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                }
                // Directories that are reparse points (symbolic links) are neither recorded nor
                // entered, since they could lead to repeated entries as well as infinite loops.
                else if (!findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                {
                    directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                    List<FileInformation> subDirectoryFileList;
                    List<DirectoryInformation> subDirectoryDirectoryList;
                    if (FindNextFilePInvokeRecursive(fullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                    {
                        fileList.AddRange(subDirectoryFileList);
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            }
            while (FindNextFile(findHandle, out findData));
        }
        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        files = null;
        directories = null;
        return false;
    }
    finally
    {
        // Single release point for the search handle on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
    }
}
/// <summary>
/// Lists <paramref name="path"/> through FindFirstFileW / FindNextFile and then walks each
/// of its immediate sub-directories in parallel with FindNextFilePInvokeRecursive, merging
/// the per-directory results under locks.
/// </summary>
/// <param name="path">Directory to enumerate, with or without a trailing backslash.</param>
/// <param name="files">Receives one entry per file, or null on failure.</param>
/// <param name="directories">Receives one entry per sub-directory, or null on failure.</param>
/// <returns>true when no exception occurred; false when one was caught, which is also written to the console.</returns>
/// <remarks>The order of entries contributed by different sub-directories is not deterministic.</remarks>
static bool FindNextFilePInvokeRecursiveParalleled(string path, out List<FileInformation> files, out List<DirectoryInformation> directories)
{
    List<FileInformation> fileList = new List<FileInformation>();
    object fileListLock = new object();
    List<DirectoryInformation> directoryList = new List<DirectoryInformation>();
    object directoryListLock = new object();
    IntPtr findHandle = INVALID_HANDLE_VALUE;
    try
    {
        // Normalise once so the search pattern and every child path get exactly one separator.
        string prefix = path.EndsWith(@"\", StringComparison.Ordinal) ? path : path + @"\";
        WIN32_FIND_DATAW findData;
        findHandle = FindFirstFileW(prefix + "*", out findData);
        if (findHandle != INVALID_HANDLE_VALUE)
        {
            do
            {
                // Skip current directory and parent directory symbols that are returned.
                if (findData.cFileName == "." || findData.cFileName == "..") continue;
                string fullPath = prefix + findData.cFileName;
                // Check if this is a directory and not a symbolic link since symbolic links could lead to repeated files and folders as well as infinite loops.
                if (findData.dwFileAttributes.HasFlag(FileAttributes.Directory) && !findData.dwFileAttributes.HasFlag(FileAttributes.ReparsePoint))
                {
                    directoryList.Add(new DirectoryInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                }
                else if (!findData.dwFileAttributes.HasFlag(FileAttributes.Directory))
                {
                    fileList.Add(new FileInformation { FullPath = fullPath, LastWriteTime = findData.ftLastWriteTime.ToDateTime() });
                }
            }
            while (FindNextFile(findHandle, out findData));

            // Iterate over a snapshot: the workers append to directoryList, and the lock only
            // serializes writers — it does not protect the query that is reading the list.
            DirectoryInformation[] topLevelDirectories = directoryList.ToArray();
            topLevelDirectories.AsParallel().ForAll(x =>
            {
                List<FileInformation> subDirectoryFileList;
                List<DirectoryInformation> subDirectoryDirectoryList;
                if (FindNextFilePInvokeRecursive(x.FullPath, out subDirectoryFileList, out subDirectoryDirectoryList))
                {
                    lock (fileListLock)
                    {
                        fileList.AddRange(subDirectoryFileList);
                    }
                    lock (directoryListLock)
                    {
                        directoryList.AddRange(subDirectoryDirectoryList);
                    }
                }
            });
        }
        files = fileList;
        directories = directoryList;
        return true;
    }
    catch (Exception exception)
    {
        Console.WriteLine("Caught exception while trying to enumerate a directory. {0}", exception.ToString());
        files = null;
        directories = null;
        return false;
    }
    finally
    {
        // Single release point for the search handle on every exit path.
        if (findHandle != INVALID_HANDLE_VALUE) FindClose(findHandle);
    }
}
// Plain data holder for one file found during enumeration.
public class FileInformation
{
// Full path of the file, as built by the enumerator (directory prefix + entry name).
public string FullPath;
// Last write time of the file; the enumerators in this file fill it from
// ftLastWriteTime via ToDateTime().
public DateTime LastWriteTime;
}
// Plain data holder for one directory found during enumeration.
public class DirectoryInformation
{
// Full path of the directory, as built by the enumerator (directory prefix + entry name).
public string FullPath;
// Last write time of the directory; the enumerators in this file fill it from
// ftLastWriteTime via ToDateTime().
public DateTime LastWriteTime;
}
编辑:B.K.询问了如何从FILETIME转换为DateTime:
/// <summary>
/// Conversion helpers for the COM FILETIME structure.
/// </summary>
public static class FILETIMEExtensions
{
    /// <summary>
    /// Combines the two 32-bit halves of <paramref name="time"/> into a 64-bit file time and
    /// converts it to a UTC <see cref="DateTime"/>.
    /// </summary>
    /// <param name="time">The file time to convert.</param>
    /// <returns>The equivalent UTC date and time.</returns>
    public static DateTime ToDateTime(this System.Runtime.InteropServices.ComTypes.FILETIME time)
    {
        // Both halves are declared as signed int. Go through uint first: casting a negative
        // int straight to ulong sign-extends it, which would corrupt the upper 32 bits.
        ulong high = (uint)time.dwHighDateTime;
        ulong low = (uint)time.dwLowDateTime;
        long fileTime = (long)((high << 32) | low);
        return DateTime.FromFileTimeUtc(fileTime);
    }
}
使用LINQ和并行任务
// Sketch only: dir, p, k, f and a are placeholders the reader is expected to supply.
// Collect every file below dir, sub-directories included, in one call.
var stuff = dir.GetFiles("*.*", System.IO.SearchOption.AllDirectories);
Parallel.ForEach(stuff, file =>
{
    // do things in parallel..
});
// or this
var q = stuff.AsParallel().Where(x => p(x)).OrderBy(x => k(x)).Select(x => f(x));
foreach (var e in q) a(e);