两个字符串之间的所有公共子字符串

本文关键字:字符串 之间 两个 | 更新日期: 2023-09-27 18:17:52

我正在c#中查找两个字符串之间的所有公共子字符串。例如,如果输入是:S1= "需要电子邮件协助"S2= "email求助"

输出应该是- 'need assistance email'

下面的代码返回最长的公共子字符串,但是我希望我的代码返回所有的公共子字符串。任何帮助都非常感激!

static void commonsubstrings()
    {
        input1 = "need asssitance with email";
        input2 = "email assistance needed"
        if (input2.Length > input1.Length)
        {
            swap = input1;
            input1 = input2;
            input2 = swap;
        }
        int k = 1;
        String temp;
        String longTemp = "";
        for (int i = 0; (i <= input1.Length); i++)
        {
            if ((i == input1.Length))
            {
                if (longest != null)
                {
                    k = longest.Length + 1;
                }
                else
                {
                    k = 1;
                }
                temp = input1.Substring(1, input1.Length - 1);
                if (temp.Equals(""))
                {
                    break;
                }
                if (k <= temp.Length)
                {
                    i = k - 1;
                    input1 = temp;
                    if ((longest != null) && (longest.Length > longTemp.Length))
                    {
                        longTemp = longest;
                    }
                }
            }
            holder1 = input1.Substring(0, k);
            for (int j = 0; (j < input2.Length) && (j + k <= input2.Length); j++)
            {
                check1 = input2.Substring(j, k);
                if (holder1.Equals(check1))
                {
                    longest = holder1;
                    break;
                }
            }
            k++;
        }
        Console.WriteLine(longest);
        Console.ReadLine(); 

}

两个字符串之间的所有公共子字符串

另一种方法:您可以使用levenshtein距离来查找两个单词的相似度。如果距离小于指定值,则可以将两个字符串视为相等。然后,您可以对Enumerable.Intersect使用水平比较器。

那么就像这样简单:

string S1= "need asssitance with email" ;
string S2 = "email assistance needed";
string[] words1 = S1.Split();
string[] words2 = S2.Split();
var wordsIntersecting = words1.Intersect(words2, new LevenshteinComparer());
string output = string.Join(" ", wordsIntersecting);

输出:need asssitance email

下面是自定义比较器:

class LevenshteinComparer : IEqualityComparer<string>
{
    public int MaxDistance { get; set; }
    private Levenshtein _Levenshtein = new Levenshtein();
    public LevenshteinComparer() : this(50) { }
    public LevenshteinComparer(int maxDistance)
    {
        this.MaxDistance = maxDistance;
    }
    public bool Equals(string x, string y)
    {
        int distance = _Levenshtein.iLD(x, y);
        return distance <= MaxDistance;
    }
    public int GetHashCode(string obj)
    {
        return 0; 
    }
}

,这里是levelshtein算法的实现:

public class Levenshtein
{
    ///*****************************
    /// Compute Levenshtein distance 
    /// Memory efficient version
    ///*****************************
    public int iLD(String sRow, String sCol)
    {
        int RowLen = sRow.Length;  // length of sRow
        int ColLen = sCol.Length;  // length of sCol
        int RowIdx;                // iterates through sRow
        int ColIdx;                // iterates through sCol
        char Row_i;                // ith character of sRow
        char Col_j;                // jth character of sCol
        int cost;                   // cost
        /// Test string length
        if (Math.Max(sRow.Length, sCol.Length) > Math.Pow(2, 31))
            throw (new Exception("'nMaximum string length in Levenshtein.iLD is " + Math.Pow(2, 31) + ".'nYours is " + Math.Max(sRow.Length, sCol.Length) + "."));
        // Step 1
        if (RowLen == 0)
        {
            return ColLen;
        }
        if (ColLen == 0)
        {
            return RowLen;
        }
        /// Create the two vectors
        int[] v0 = new int[RowLen + 1];
        int[] v1 = new int[RowLen + 1];
        int[] vTmp;

        /// Step 2
        /// Initialize the first vector
        for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
        {
            v0[RowIdx] = RowIdx;
        }
        // Step 3
        /// Fore each column
        for (ColIdx = 1; ColIdx <= ColLen; ColIdx++)
        {
            /// Set the 0'th element to the column number
            v1[0] = ColIdx;
            Col_j = sCol[ColIdx - 1];

            // Step 4
            /// Fore each row
            for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
            {
                Row_i = sRow[RowIdx - 1];

                // Step 5
                if (Row_i == Col_j)
                {
                    cost = 0;
                }
                else
                {
                    cost = 1;
                }
                // Step 6
                /// Find minimum
                int m_min = v0[RowIdx] + 1;
                int b = v1[RowIdx - 1] + 1;
                int c = v0[RowIdx - 1] + cost;
                if (b < m_min)
                {
                    m_min = b;
                }
                if (c < m_min)
                {
                    m_min = c;
                }
                v1[RowIdx] = m_min;
            }
            /// Swap the vectors
            vTmp = v0;
            v0 = v1;
            v1 = vTmp;
        }

        // Step 7
        /// Value between 0 - 100
        /// 0==perfect match 100==totaly different
        /// 
        /// The vectors where swaped one last time at the end of the last loop,
        /// that is why the result is now in v0 rather than in v1
        //System.Console.WriteLine("iDist=" + v0[RowLen]);
        int max = System.Math.Max(RowLen, ColLen);
        return ((100 * v0[RowLen]) / max);
    }


    ///*****************************
    /// Compute the min
    ///*****************************
    private int Minimum(int a, int b, int c)
    {
        int mi = a;
        if (b < mi)
        {
            mi = b;
        }
        if (c < mi)
        {
            mi = c;
        }
        return mi;
    }
    ///*****************************
    /// Compute Levenshtein distance         
    ///*****************************
    public int LD(String sNew, String sOld)
    {
        int[,] matrix;              // matrix
        int sNewLen = sNew.Length;  // length of sNew
        int sOldLen = sOld.Length;  // length of sOld
        int sNewIdx; // iterates through sNew
        int sOldIdx; // iterates through sOld
        char sNew_i; // ith character of sNew
        char sOld_j; // jth character of sOld
        int cost; // cost
        /// Test string length
        if (Math.Max(sNew.Length, sOld.Length) > Math.Pow(2, 31))
            throw (new Exception("'nMaximum string length in Levenshtein.LD is " + Math.Pow(2, 31) + ".'nYours is " + Math.Max(sNew.Length, sOld.Length) + "."));
        // Step 1
        if (sNewLen == 0)
        {
            return sOldLen;
        }
        if (sOldLen == 0)
        {
            return sNewLen;
        }
        matrix = new int[sNewLen + 1, sOldLen + 1];
        // Step 2
        for (sNewIdx = 0; sNewIdx <= sNewLen; sNewIdx++)
        {
            matrix[sNewIdx, 0] = sNewIdx;
        }
        for (sOldIdx = 0; sOldIdx <= sOldLen; sOldIdx++)
        {
            matrix[0, sOldIdx] = sOldIdx;
        }
        // Step 3
        for (sNewIdx = 1; sNewIdx <= sNewLen; sNewIdx++)
        {
            sNew_i = sNew[sNewIdx - 1];
            // Step 4
            for (sOldIdx = 1; sOldIdx <= sOldLen; sOldIdx++)
            {
                sOld_j = sOld[sOldIdx - 1];
                // Step 5
                if (sNew_i == sOld_j)
                {
                    cost = 0;
                }
                else
                {
                    cost = 1;
                }
                // Step 6
                matrix[sNewIdx, sOldIdx] = Minimum(matrix[sNewIdx - 1, sOldIdx] + 1, matrix[sNewIdx, sOldIdx - 1] + 1, matrix[sNewIdx - 1, sOldIdx - 1] + cost);
            }
        }
        // Step 7
        /// Value between 0 - 100
        /// 0==perfect match 100==totaly different
        System.Console.WriteLine("Dist=" + matrix[sNewLen, sOldLen]);
        int max = System.Math.Max(sNewLen, sOldLen);
        return (100 * matrix[sNewLen, sOldLen]) / max;
    }
}

归功于Levenshtein类:http://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm

    public static string [] CommonString(string left, string right)
    {
        List<string> result = new List<string>();
        string [] rightArray = right.Split();
        string [] leftArray = left.Split();
        result.AddRange(rightArray.Where(r => leftArray.Any(l => l.StartsWith(r))));
        // must check other way in case left array contains smaller words than right array
        result.AddRange(leftArray.Where(l => rightArray.Any(r => r.StartsWith(l))));
        return result.Distinct().ToArray();
    }

使用Set-Intersections

从查找字符串的所有可能子字符串的例程开始。这是在Python中,这是一个"练习"读者把它翻译成c#:

def allSubstr(instring):
  retset = set()
  retset.add(instring)
  totlen = len(instring)
  for thislen in range(0, totlen):
    for startpos in range(0, totlen):
      # print "startpos: %s, thislen: %s" % (startpos, thislen)
      addStr = instring[startpos:startpos+thislen]
      # print "addstr: %s" % (addStr)
      retset.add(addStr)
  print "retset total: %s" % (retset)
  return retset
set1 = allSubstr('abcdefg')
set2 = allSubstr('cdef')
print set1.intersection(set2)

输出如下:

>>> set1 = allSubstr('abcdefg')
retset total: set(['', 'cde', 'ab', 'ef', 'cd', 'abcdef', 'abc', 'efg', 'bcde', 'cdefg', 'bc', 'de',   'bcdef', 'abcd', 'defg', 'fg', 'cdef', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'bcd', 'abcde', 'abcdefg', 'bcdefg', 'def'])
>>> set2 = allSubstr('cdef')
retset total: set(['', 'cde', 'c', 'ef', 'e', 'd', 'f', 'de', 'cd', 'cdef', 'def'])
>>> 
>>> set1.intersection(set2)
set(['', 'cde', 'c', 'de', 'e', 'd', 'f', 'ef', 'cd', 'cdef', 'def'])

不,你对长度为1的子集不感兴趣。但是,您总是可以在调用set.add()之前添加长度限制。