从给定的单词集合中获取单词以进行校对

本文关键字:单词 获取 集合 | 更新日期: 2023-09-27 18:31:51

我有一个单词集合存储在列表对象中,例如这里的
标题集合

Lorem Ipsum
Centuries
Electronic

这是我想要从中查找这些单词的示例段落:
Lorem Ipsum 只是印刷和排版行业的虚拟文本。自 1500 年代以来,Loren Ipsum 一直是行业标准的虚拟文本,当时一位不知名的印刷工拿了一盘铅字并将其打乱,制成了一本字体样本书。它不仅存活了五个世纪,而且还飞跃进入电子排版,基本保持不变。它在 1960 年代随着包含 LorenIpsum 段落的 Letraset 转印纸的发布而普及,最近又随着 Aldus PageMaker 等包含 LoremIpsum 版本的桌面出版软件而普及。

我的目标是从该段落中提取出这些单词;即使它们的拼写有误也没关系,因为最终目的正是纠正大小写错误和拼写错误的单词。

我在这里的预期结果是

lorem ipsum
Loren Ipsum
centuries
electornic
LorenIpsum
LoremIpsum

但不限于这些,因为这个处理需要应用于整篇文章,并且要处理大量文章。

抱歉,我还没有编写任何代码,但我计划在这里使用 RegEx for C#。

从给定的单词集合中获取单词以进行校对

互联网上有许多算法可以检查两个单词之间的相似性。GetEdits就是其中之一。

可以使用以下代码。但是,它可能不是很有效。

/// <summary>
/// Computes the case-insensitive Levenshtein edit distance between two strings:
/// the minimum number of single-character insertions, deletions and
/// substitutions needed to turn one into the other.
/// </summary>
/// <param name="answer">The reference string (e.g. the correctly spelled word).</param>
/// <param name="guess">The candidate string to compare against <paramref name="answer"/>.</param>
/// <returns>The edit distance; 0 when the strings are equal ignoring case.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="answer"/> or <paramref name="guess"/> is null.
/// </exception>
static int GetEdits(string answer, string guess)
{
    if (answer == null)
    {
        throw new ArgumentNullException(nameof(answer));
    }

    if (guess == null)
    {
        throw new ArgumentNullException(nameof(guess));
    }

    // Lowercase with the invariant culture so the result does not depend on the
    // current thread culture (e.g. under tr-TR, "I".ToLower() is a dotless "ı",
    // which would make "TITLE" and "title" differ).
    guess = guess.ToLowerInvariant();
    answer = answer.ToLowerInvariant();

    // d[i, j] = edit distance between the first i chars of answer
    // and the first j chars of guess.
    int[,] d = new int[answer.Length + 1, guess.Length + 1];

    // Turning a prefix into the empty string costs one deletion per character.
    for (int i = 0; i <= answer.Length; i++)
    {
        d[i, 0] = i;
    }

    // Building a prefix from the empty string costs one insertion per character.
    for (int j = 0; j <= guess.Length; j++)
    {
        d[0, j] = j;
    }

    for (int j = 1; j <= guess.Length; j++)
    {
        for (int i = 1; i <= answer.Length; i++)
        {
            if (answer[i - 1] == guess[j - 1])
            {
                // Characters match: no operation needed.
                d[i, j] = d[i - 1, j - 1];
            }
            else
            {
                int deletion = d[i - 1, j] + 1;
                int insertion = d[i, j - 1] + 1;
                int substitution = d[i - 1, j - 1] + 1;
                d[i, j] = Math.Min(Math.Min(deletion, insertion), substitution);
            }
        }
    }

    return d[answer.Length, guess.Length];
}
/// <summary>
/// Demo entry point: scans a sample paragraph and prints every token (or pair
/// of adjacent tokens) whose edit distance, as computed by GetEdits, to one of
/// the target words is at most <c>MaxErrors</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    const string text = @"lorem ipsum is simply dummy text of the printing and typesetting industry. Loren Ipsum has been the industrys standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing LorenIpsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of LoremIpsum.";
    var findWords = new string[]
    {
        "Lorem Ipsum",
        "Centuries",
        "Electronic"
    };

    // Largest edit distance at which a token still counts as a match.
    const int MaxErrors = 2;

    // Tokenize on spaces, commas and periods. Two adjacent separators
    // (such as ", " or ". ") produce an empty token between them.
    var tokens = text.Split(' ', ',', '.');
    for (int i = 0; i < tokens.Length; i++)
    {
        if (tokens[i].Length == 0)
        {
            continue;
        }

        foreach (var findWord in findWords)
        {
            if (GetEdits(findWord, tokens[i]) <= MaxErrors)
            {
                Console.WriteLine(tokens[i]);
                break;
            }

            // For multi-word targets, join with the next token and check again.
            // An empty next token is skipped: joining with it would only append
            // a trailing space to the candidate, which distorts the distance
            // and would be printed as part of the match.
            bool canJoinNext = findWord.IndexOf(' ') >= 0
                && i + 1 < tokens.Length
                && tokens[i + 1].Length != 0;
            if (canJoinNext)
            {
                string token = tokens[i] + " " + tokens[i + 1];
                if (GetEdits(findWord, token) <= MaxErrors)
                {
                    Console.WriteLine(token);
                    // The next token was consumed by this match; skip over it.
                    i++;
                    break;
                }
            }
        }
    }
}