c#中查找字符串匹配列表的最佳比较算法
本文关键字:最佳 比较 算法 列表 串匹配 查找 字符 字符串 | 更新日期: 2023-09-27 17:52:43
假设我有一个10万个单词的列表。我想找出一个给定的字符串是否与列表中的任何单词匹配,我想以最快的方式完成它。我还想知道是否有其他单词,由该字符串中的第一个字符开始,出现在列表中。
例如:假设你有一个字符串"icedtgg"
"我"集成电路"冰"冰"icedt"icedtg"icedtgg"
我正在尝试想出一个最优的比较算法,告诉我以下单词是否在我的列表中。
到目前为止,我的100,000个单词的列表存储在
中Dicitonary<char, List<string>> WordList;
其中char
是单词的第一个字符,List<string>
是所有以该字符开头的单词。
所以WordList['a']
包含所有以"a"开头的单词("ape","apple","art"等),"b"包含所有以"b"开头的单词,等等。
因为我知道我所有的单词都以"I"开头,我可以首先将我的解决方案从100,000个单词缩小到仅以"I"开头的单词。
List<string> CurrentWordList = WordList['i'];
现在我检查
if( CurrentWordList[0].Length == 1 )
那么我知道我的第一个字符串是一个匹配" I ",因为" I "将是列表中的第一个单词。这些列表事先按字母顺序排序,以免减慢匹配速度。
任何想法?
*不,这不是一个HW的任务,我是一个专业的软件架构师,试图找到一个最佳匹配算法的乐趣/爱好/游戏开发。
我决定添加这个答案,不是因为它是您问题的最佳解决方案,而是为了说明两种相对简单的可能解决方案,并且与您似乎正在遵循的方法有些一致。
下面的(未优化的)示例提供了一个极其简单的前缀树实现,它为每个消耗的字符使用一个节点。
public class SimplePrefixTrie
{
private readonly Node _root = new Node(); // root represents empty string.
private class Node
{
public Dictionary<char, Node> Children;
public bool IsTerminal; // whether a full word ends here.
public Node Find(string word, int index)
{
var child = default(Node);
if (index < word.Length && Children != null)
Children.TryGetValue(word[index], out child);
return child;
}
public Node Add(string word, int toConsume)
{
var child = default(Node);
if (toConsume == word.Length)
this.IsTerminal = true;
else if (Children == null || !Children.TryGetValue(word[toConsume], out child))
{
if (Children == null)
Children = new Dictionary<char, Node>();
Children[word[toConsume]] = child = new Node();
}
return child;
}
}
public void AddWord(string word)
{
var ndx = 0;
var cur = _root;
while (cur != null)
cur = cur.Add(word, ndx++);
}
public IEnumerable<string> FindWordsMatchingPrefixesOf(string searchWord)
{
var ndx = 0;
var cur = _root;
while (cur != null)
{
if (cur.IsTerminal)
yield return searchWord.Substring(0, ndx);
cur = cur.Find(searchWord, ndx++);
}
}
}
下面还添加了一个压缩前缀树的简单实现。它遵循与上面示例几乎相同的方法,但是存储共享的前缀部分,而不是单个字符。当现有存储的前缀变为共享时,它将拆分节点,并将其拆分为两个部分。
public class SimpleCompressedPrefixTrie
{
private readonly Node _root = new Node();
private class Node
{
private Dictionary<char, Node> _children;
public string PrefixValue = string.Empty;
public bool IsTerminal;
public Node Add(string word, ref int startIndex)
{
var n = FindSharedPrefix(word, startIndex);
startIndex += n;
if (n == PrefixValue.Length) // full prefix match
{
if (startIndex == word.Length) // full match
IsTerminal = true;
else
return AddToChild(word, ref startIndex);
}
else // partial match, need to split this node's prefix.
SplittingAdd(word, n, ref startIndex);
return null;
}
public Node Find(string word, ref int startIndex, out int matchLen)
{
var n = FindSharedPrefix(word, startIndex);
startIndex += n;
matchLen = -1;
if (n == PrefixValue.Length)
{
if (IsTerminal)
matchLen = startIndex;
var child = default(Node);
if (_children != null && startIndex < word.Length && _children.TryGetValue(word[startIndex], out child))
{
startIndex++; // consumed map key character.
return child;
}
}
return null;
}
private Node AddToChild(string word, ref int startIndex)
{
var key = word[startIndex++]; // consume the mapping character
var nextNode = default(Node);
if (_children == null)
_children = new Dictionary<char, Node>();
else if (_children.TryGetValue(key, out nextNode))
return nextNode;
var remainder = word.Substring(startIndex);
_children[key] = new Node() { PrefixValue = remainder, IsTerminal = true };
return null; // consumed.
}
private void SplittingAdd(string word, int n, ref int startIndex)
{
var curChildren = _children;
_children = new Dictionary<char, Node>();
_children[PrefixValue[n]] = new Node()
{
PrefixValue = this.PrefixValue.Substring(n + 1),
IsTerminal = this.IsTerminal,
_children = curChildren
};
PrefixValue = PrefixValue.Substring(0, n);
IsTerminal = startIndex == word.Length;
if (!IsTerminal)
{
var prefix = word.Length > startIndex + 1 ? word.Substring(startIndex + 1) : string.Empty;
_children[word[startIndex]] = new Node() { PrefixValue = prefix, IsTerminal = true };
startIndex++;
}
}
private int FindSharedPrefix(string word, int startIndex)
{
var n = Math.Min(PrefixValue.Length, word.Length - startIndex);
var len = 0;
while (len < n && PrefixValue[len] == word[len + startIndex])
len++;
return len;
}
}
public void AddWord(string word)
{
var ndx = 0;
var cur = _root;
while (cur != null)
cur = cur.Add(word, ref ndx);
}
public IEnumerable<string> FindWordsMatchingPrefixesOf(string searchWord)
{
var startNdx = 0;
var cur = _root;
while (cur != null)
{
var matchLen = 0;
cur = cur.Find(searchWord, ref startNdx, out matchLen);
if (matchLen > 0)
yield return searchWord.Substring(0, matchLen);
};
}
}
用法示例:
var trie = new SimplePrefixTrie(); // or new SimpleCompressedPrefixTrie();
trie.AddWord("hello");
trie.AddWord("iced");
trie.AddWord("i");
trie.AddWord("ice");
trie.AddWord("icecone");
trie.AddWord("dtgg");
trie.AddWord("hicet");
foreach (var w in trie.FindWordsMatchingPrefixesOf("icedtgg"))
Console.WriteLine(w);
与输出:i
ice
iced
UPDATE:选择正确的数据结构很重要
我认为一个更新可以提供一些价值,说明选择一个适合问题的数据结构是多么重要,以及涉及到什么样的权衡。因此,我创建了一个小型基准应用程序,与基准参考实现相比,它可以测试到目前为止为这个问题提供的答案中的策略。
- Naive:是最简单的Naive解决方案。
- JimMischel:是基于这个答案的方法。
- MattyMerrix:是基于你自己的回答。
- JimMattyDSL:结合了'JimMischel'和'MattyMerrix'方法,并在排序列表中使用更优的二进制字符串搜索。
- SimpleTrie和CompessedTrie基于本回答中描述的两种实现。
将5000个单词匹配到最大长度为25的10000个词条的字典
Method Memory (MB) Build Time (s) Lookup Time (s)
Naive 0.64-0.64, 0.64 0.001-0.002, 0.001 6.136-6.312, 6.210
JimMischel 0.84-0.84, 0.84 0.013-0.018, 0.016 0.083-0.113, 0.102
JimMattyDSL 0.80-0.81, 0.80 0.013-0.018, 0.016 0.008-0.011, 0.010
SimpleTrie 24.55-24.56, 24.56 0.042-0.056, 0.051 0.002-0.002, 0.002
CompessedTrie 1.84-1.84, 1.84 0.003-0.003, 0.003 0.002-0.002, 0.002
MattyMerrix 0.83-0.83, 0.83 0.017-0.017, 0.017 0.034-0.034, 0.034
将5000个单词匹配到最大长度为25的100000个词条的字典
Method Memory (MB) Build Time (s) Lookup Time (s)
Naive 6.01-6.01, 6.01 0.024-0.026, 0.025 65.651-65.758, 65.715
JimMischel 6.32-6.32, 6.32 0.232-0.236, 0.233 1.208-1.254, 1.235
JimMattyDSL 5.95-5.96, 5.96 0.264-0.269, 0.266 0.050-0.052, 0.051
SimpleTrie 226.49-226.49, 226.49 0.932-0.962, 0.951 0.004-0.004, 0.004
CompessedTrie 16.10-16.10, 16.10 0.101-0.126, 0.111 0.003-0.003, 0.003
MattyMerrix 6.15-6.15, 6.15 0.254-0.269, 0.259 0.414-0.418, 0.416
将5000个单词匹配到最大长度为25的1000000个词条的字典
Method Memory (MB) Build Time (s) Lookup Time (s)
JimMischel 57.69-57.69, 57.69 3.027-3.086, 3.052 16.341-16.415, 16.373
JimMattyDSL 60.88-60.88, 60.88 3.396-3.484, 3.453 0.399-0.400, 0.399
SimpleTrie 2124.57-2124.57, 2124.57 11.622-11.989, 11.860 0.006-0.006, 0.006
CompessedTrie 166.59-166.59, 166.59 2.813-2.832, 2.823 0.005-0.005, 0.005
MattyMerrix 62.71-62.73, 62.72 3.230-3.270, 3.251 6.996-7.015, 7.008
可以看到,(非空间优化的)尝试所需的内存要高得多。对于所有测试的实现,它增加了字典的大小O(N)。
正如预期的那样,尝试的查找时间或多或少是恒定的:O(k),仅取决于搜索条件的长度。对于其他实现,时间将根据要搜索的字典的大小而增加。
请注意,可以为这个问题构建更优的实现,这将接近O(k)的搜索时间,并允许更紧凑的存储和更少的内存占用。如果你映射到一个简化的字母表(例如。只有'A'-'Z'),这也是可以利用的。
所以你只想在字典中找到作为输入字符串前缀的单词?您可以比所提出的任何方法更有效地做到这一点。这只是一个修改后的合并。
如果您的单词列表由一个按首字母键入的字典组成,并且每个条目包含以该字母开头的单词的排序列表,则此操作可以完成。最坏的情况是0 (n + m),其中n是以字母开头的单词数,m是输入字符串的长度。
var inputString = "icegdt";
// get list of words that start with the first character
var wordsList = MyDictionary[input_string[0]];
// find all words that are prefixes of the input string
var iInput = 0;
var iWords = 0;
var prefix = inputString.Substring(0, iInput+1);
while (iInput < inputString.Length && iWords < wordsList.Count)
{
if (wordsList[iWords] == prefix)
{
// wordsList[iWords] is found!
++iWords;
}
else if (wordsList[iWords] > prefix)
{
// The current word is alphabetically after the prefix.
// So we need the next character.
++iInput;
if (iInput < inputString.Length)
{
prefix = inputString.Substring(0, iInput+1);
}
}
else
{
// The prefix is alphabetically after the current word.
// Advance the current word.
++iWord;
}
}
如果这就是您想要做的(查找作为输入字符串前缀的字典单词),那么按第一个字符索引字典就没有特别的理由了。给定一个排序好的单词列表,您可以对第一个字母进行二分搜索以找到起点。这将比字典查找花费稍微更多的时间,但是与搜索单词列表查找匹配所花费的时间相比,时间差异将非常小。此外,排序后的单词列表比字典方法占用更少的内存。
如果要进行不区分大小写的比较,请将比较代码更改为:
var result = String.Compare(wordsList[iWords], prefix, true);
if (result == 0)
{
// wordsList[iWords] is found!
++iWords;
}
else if (result > 0)
{
还将每次迭代的字符串比较次数减少到每次迭代恰好一次。
while (x < str.Length-1)
{
if (ChrW(10) == GetChar(str, x) && ChrW(13) == GetChar(str, x+1))
{
// x+2 - This new line
}
x++;
}
这是我的第一次尝试,我想把它发布出去,以防我今天无法完成。
public class CompareHelper
{
//Should always be sorted in alphabetical order.
public static Dictionary<char, List<string>> MyDictionary;
public static List<string> CurrentWordList;
public static List<string> MatchedWordList;
//The word we are trying to find matches for.
public static char InitChar;
public static StringBuilder ThisWord;
/// <summary>
/// Initialize the Compare. Set the first character. See if there are any 1 letter words
/// for that character.
/// </summary>
/// <param name="firstChar">The first character in the word string.</param>
/// <returns>True if a word was found.</returns>
public static bool InitCompare(char firstChar)
{
InitChar = firstChar;
//Get all words that start with the firstChar.
CurrentWordList = MyDictionary[InitChar];
ThisWord = new StringBuilder();
ThisWord.Append(firstChar);
if (CurrentWordList[0].Length == 1)
{
//Match.
return true;
}
//No matches.
return false;
}
/// <summary>
/// Append this letter to our ThisWord. See if there are any matching words.
/// </summary>
/// <param name="nextChar">The next character in the word string.</param>
/// <returns>True if a word was found.</returns>
public static bool NextCompare(char nextChar)
{
ThisWord.Append(nextChar);
int currentIndex = ThisWord.Length - 1;
if (FindRemainingWords(nextChar, currentIndex))
{
if (CurrentWordList[0].Length == currentIndex)
{
//Match.
return true;
}
}
//No matches.
return false;
}
/// <summary>
/// Trim down our CurrentWordList until it only contains words
/// that at currIndex start with the currChar.
/// </summary>
/// <param name="currChar">The next letter in our ThisWord.</param>
/// <param name="currIndex">The index of the letter.</param>
/// <returns>True if there are words remaining in CurrentWordList.</returns>
private static bool FindRemainingWords(char currChar, int currIndex)
{
//Null check.
if (CurrentWordList == null || CurrentWordList.Count < 1)
{
return false;
}
bool doneSearching = false;
while(!doneSearching)
{
int middleIndex = CurrentWordList.Count / 2;
//TODO: test for CurrentWordList.count 2 or 1 ...
//TODO: test for wordToCheck.length < curr index
char middleLetter = CurrentWordList[middleIndex][currIndex];
LetterPositionEnum returnEnum = GetLetterPosition(currChar, middleLetter);
switch(returnEnum)
{
case LetterPositionEnum.Before:
CurrentWordList = CurrentWordList.GetRange(middleIndex, (CurrentWordList.Count - middleIndex));
break;
case LetterPositionEnum.PREV:
CurrentWordList = CurrentWordList.GetRange(middleIndex, (CurrentWordList.Count - middleIndex));
break;
case LetterPositionEnum.MATCH:
CurrentWordList = CurrentWordList.GetRange(middleIndex, (CurrentWordList.Count - middleIndex));
break;
case LetterPositionEnum.NEXT:
CurrentWordList = CurrentWordList.GetRange(0, middleIndex);
break;
case LetterPositionEnum.After:
CurrentWordList = CurrentWordList.GetRange(0, middleIndex);
break;
default:
break;
}
}
TrimWords(currChar, currIndex);
//Null check.
if (CurrentWordList == null || CurrentWordList.Count < 1)
{
return false;
}
//There are still words left in CurrentWordList.
return true;
}
//Trim all words in CurrentWordList
//that are LetterPositionEnum.PREV and LetterPositionEnum.NEXT
private static void TrimWords(char currChar, int currIndex)
{
int startIndex = 0;
int endIndex = CurrentWordList.Count;
bool startIndexFound = false;
//Loop through all of the words.
for ( int i = startIndex; i < endIndex; i++)
{
//If we havent found the start index then the first match of currChar
//will be the start index.
if( !startIndexFound && currChar == CurrentWordList[i][currIndex] )
{
startIndex = i;
startIndexFound = true;
}
//If we have found the start index then the next letter that isnt
//currChar will be the end index.
if( startIndexFound && currChar != CurrentWordList[i][currIndex])
{
endIndex = i;
break;
}
}
//Trim the words that dont start with currChar.
CurrentWordList = CurrentWordList.GetRange(startIndex, endIndex);
}
//In order to find all words that begin with a given character, we should search
//for the last word that begins with the previous character (PREV) and the
//first word that begins with the next character (NEXT).
//Anything else Before or After that is trash and we will throw out.
public enum LetterPositionEnum
{
Before,
PREV,
MATCH,
NEXT,
After
};
//We want to ignore all letters that come before this one.
public static LetterPositionEnum GetLetterPosition(char currChar, char compareLetter)
{
switch (currChar)
{
case 'A':
switch (compareLetter)
{
case 'A': return LetterPositionEnum.MATCH;
case 'B': return LetterPositionEnum.NEXT;
case 'C': return LetterPositionEnum.After;
case 'D': return LetterPositionEnum.After;
case 'E': return LetterPositionEnum.After;
case 'F': return LetterPositionEnum.After;
case 'G': return LetterPositionEnum.After;
case 'H': return LetterPositionEnum.After;
case 'I': return LetterPositionEnum.After;
case 'J': return LetterPositionEnum.After;
case 'K': return LetterPositionEnum.After;
case 'L': return LetterPositionEnum.After;
case 'M': return LetterPositionEnum.After;
case 'N': return LetterPositionEnum.After;
case 'O': return LetterPositionEnum.After;
case 'P': return LetterPositionEnum.After;
case 'Q': return LetterPositionEnum.After;
case 'R': return LetterPositionEnum.After;
case 'S': return LetterPositionEnum.After;
case 'T': return LetterPositionEnum.After;
case 'U': return LetterPositionEnum.After;
case 'V': return LetterPositionEnum.After;
case 'W': return LetterPositionEnum.After;
case 'X': return LetterPositionEnum.After;
case 'Y': return LetterPositionEnum.After;
case 'Z': return LetterPositionEnum.After;
default: return LetterPositionEnum.After;
}
case 'B':
switch (compareLetter)
{
case 'A': return LetterPositionEnum.PREV;
case 'B': return LetterPositionEnum.MATCH;
case 'C': return LetterPositionEnum.NEXT;
case 'D': return LetterPositionEnum.After;
case 'E': return LetterPositionEnum.After;
case 'F': return LetterPositionEnum.After;
case 'G': return LetterPositionEnum.After;
case 'H': return LetterPositionEnum.After;
case 'I': return LetterPositionEnum.After;
case 'J': return LetterPositionEnum.After;
case 'K': return LetterPositionEnum.After;
case 'L': return LetterPositionEnum.After;
case 'M': return LetterPositionEnum.After;
case 'N': return LetterPositionEnum.After;
case 'O': return LetterPositionEnum.After;
case 'P': return LetterPositionEnum.After;
case 'Q': return LetterPositionEnum.After;
case 'R': return LetterPositionEnum.After;
case 'S': return LetterPositionEnum.After;
case 'T': return LetterPositionEnum.After;
case 'U': return LetterPositionEnum.After;
case 'V': return LetterPositionEnum.After;
case 'W': return LetterPositionEnum.After;
case 'X': return LetterPositionEnum.After;
case 'Y': return LetterPositionEnum.After;
case 'Z': return LetterPositionEnum.After;
default: return LetterPositionEnum.After;
}
case 'C':
switch (compareLetter)
{
case 'A': return LetterPositionEnum.Before;
case 'B': return LetterPositionEnum.PREV;
case 'C': return LetterPositionEnum.MATCH;
case 'D': return LetterPositionEnum.NEXT;
case 'E': return LetterPositionEnum.After;
case 'F': return LetterPositionEnum.After;
case 'G': return LetterPositionEnum.After;
case 'H': return LetterPositionEnum.After;
case 'I': return LetterPositionEnum.After;
case 'J': return LetterPositionEnum.After;
case 'K': return LetterPositionEnum.After;
case 'L': return LetterPositionEnum.After;
case 'M': return LetterPositionEnum.After;
case 'N': return LetterPositionEnum.After;
case 'O': return LetterPositionEnum.After;
case 'P': return LetterPositionEnum.After;
case 'Q': return LetterPositionEnum.After;
case 'R': return LetterPositionEnum.After;
case 'S': return LetterPositionEnum.After;
case 'T': return LetterPositionEnum.After;
case 'U': return LetterPositionEnum.After;
case 'V': return LetterPositionEnum.After;
case 'W': return LetterPositionEnum.After;
case 'X': return LetterPositionEnum.After;
case 'Y': return LetterPositionEnum.After;
case 'Z': return LetterPositionEnum.After;
default: return LetterPositionEnum.After;
}
//etc. Stack Overflow limits characters to 30,000 contact me for full switch case.
default: return LetterPositionEnum.After;
}
}
}
好的,这是我想出的最终解决方案,我不确定它是否是最优的最优的,但似乎是相当快,我喜欢的逻辑和代码的简洁。
基本上在应用程序启动时,你传递一个任意长度的单词列表给InitWords。这将对单词进行排序,并将它们放入有26个键的字典中,每个键代表字母表中的每个字母。
然后在游戏过程中,你将遍历字符集,总是从第一个字母开始,然后是第一个和第二个字母,依此类推。整个过程中,你都在减少CurrentWordList中的单词数量。
如果你有字符串'icedgt'。你会调用InitCompare with 'i',这会从MyDictionary中抓取KeyValuePair with Key 'i',然后你会看到第一个单词的长度是否为1,因为它们已经按字母顺序排列了,单词'i'将是第一个单词。然后在下一次迭代时,将'c'传递给NextCompare,这再次通过使用Linq只返回第二个字符为'c'的单词来减小列表大小。然后接下来你会做另一个NextCompare并传入'e',再次使用Linq减少CurrentWordList中的单词数量。
在第一次迭代之后你的CurrentWordList有每个以'i'开头的单词,在NextCompare中你会有每个以'ic'开头的单词在NextCompare中你会有一个子集每个单词以'ice'开头等等。
我不确定Linq是否能在速度方面胜过我的手动巨大的Switch Case,但它简单而优雅。为此我很高兴。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Xuzzle.Code
{
public class CompareHelper
{
//Should always be sorted in alphabetical order.
public static Dictionary<char, List<string>> MyDictionary;
public static List<string> CurrentWordList;
//The word we are trying to find matches for.
public static char InitChar;
public static StringBuilder ThisWord;
/// <summary>
/// Init MyDictionary with the list of words passed in. Make a new
/// key value pair with each Letter.
/// </summary>
/// <param name="listOfWords"></param>
public static void InitWords(List<string> listOfWords)
{
MyDictionary = new Dictionary<char, List<string>>();
foreach (char currChar in LetterHelper.Alphabet)
{
var wordsParsed = listOfWords.Where(currWord => char.ToUpper(currWord[0]) == currChar).ToArray();
Array.Sort(wordsParsed);
MyDictionary.Add(currChar, wordsParsed.ToList());
}
}
/// <summary>
/// Initialize the Compare. Set the first character. See if there are any 1 letter words
/// for that character.
/// </summary>
/// <param name="firstChar">The first character in the word string.</param>
/// <returns>True if a word was found.</returns>
public static bool InitCompare(char firstChar)
{
InitChar = firstChar;
//Get all words that start with the firstChar.
CurrentWordList = MyDictionary[InitChar];
ThisWord = new StringBuilder();
ThisWord.Append(firstChar);
if (CurrentWordList[0].Length == 1)
{
//Match.
return true;
}
//No matches.
return false;
}
/// <summary>
/// Append this letter to our ThisWord. See if there are any matching words.
/// </summary>
/// <param name="nextChar">The next character in the word string.</param>
/// <returns>True if a word was found.</returns>
public static bool NextCompare(char nextChar)
{
ThisWord.Append(nextChar);
int currentIndex = ThisWord.Length - 1;
if (CurrentWordList != null && CurrentWordList.Count > 0)
{
CurrentWordList = CurrentWordList.Where(word => (word.Length > currentIndex && word[currentIndex] == nextChar)).ToList();
if (CurrentWordList != null && CurrentWordList.Count > 0)
{
if (CurrentWordList[0].Length == ThisWord.Length)
{
//Match.
return true;
}
}
}
//No matches.
return false;
}
}
}