将c# Levenshtein Distance移植到Java
本文关键字:Java Distance Levenshtein | 更新日期: 2023-09-27 17:51:13
我正在从一个现有的拼写检查程序开发一个应用程序,谁能帮我修改这部分代码,我不能移植指针或stackalloc
到Java,因为没有等价物存在。一个具有完全相同功能的java方法。
public static unsafe double GNULevenstein(string word1, string word2)
{
// this algorithm normally computes un-normalized distance between two string.
fixed (char* word1Ptr = word1)
fixed (char* word2Ptr = word2)
{
char* pointerToWord1 = word1Ptr;
char* pointerToWord2 = word2Ptr;
/* skip equal start sequence, if any */
if (word1.Length >= word2.Length)
{
while (*pointerToWord1 == *pointerToWord2)
{
/* if we already used up one string,
* then the result is the length of the other */
if (*pointerToWord1 == ''0') break;
pointerToWord1++;
pointerToWord2++;
}
}
else // wordl < word2
{
while (*pointerToWord1 == *pointerToWord2)
{
/* if we already used up one string,
* then the result is the length of the other */
if (*pointerToWord2 == ''0') break;
pointerToWord1++;
pointerToWord2++;
}
}
/* length count #1*/
int len1 = word1.Length - (int)(pointerToWord1 - word1Ptr);
int len2 = word2.Length - (int)(pointerToWord2 - word2Ptr);
/* if we already used up one string, then
the result is the length of the other */
if (*pointerToWord1 == ''0')
return ExportResult( len2 , word1.Length,word2.Length , false);
if (*pointerToWord2 == ''0')
return ExportResult(len1, word1.Length, word2.Length, false);
/* length count #2*/
pointerToWord1 += len1;
pointerToWord2 += len2;
/* cut of equal tail sequence, if any */
while (*--pointerToWord1 == *--pointerToWord2)
{
len1--;
len2--;
}
/* reset pointers, adjust length */
pointerToWord1 -= len1++;
pointerToWord2 -= len2++;
/* possible dist to great? */
//if ((len1 - len2 >= 0 ? len1 - len2 : -(len1 - len2)) >= char.MaxValue) return 1;
if (Math.Abs(len1 - len2) >= char.MaxValue)
return ExportResult(1, false); // no similarity
char* tmp;
/* swap if l2 longer than l1 */
if (len1 < len2)
{
tmp = pointerToWord1;
pointerToWord1 = pointerToWord2;
pointerToWord2 = tmp;
len1 ^= len2;
len2 ^= len1;
len1 ^= len2;
}
/* fill initial row */
int i, j, n;
n = (*pointerToWord1 != *pointerToWord2) ? 1 : 0;
char* r = stackalloc char[len1 * 2];
char* p1, p2;
for (i = 0, p1 = r; i < len1; i++, *p1++ = (char)n++, p1++)
{ /*empty*/}
/* calc. rowwise */
for (j = 1; j < len2; j++)
{
/* init pointers and col#0 */
p1 = r + ((j & 1) == 0 ? 1 : 0);
p2 = r + (j & 1);
n = *p1 + 1;
*p2++ = (char)n; p2++;
pointerToWord2++;
/* foreach column */
for (i = 1; i < len1; i++)
{
if (*p1 < n) n = *p1 + (*(pointerToWord1 + i) != *pointerToWord2 ? 1 : 0); /* replace cheaper than delete? */
p1++;
if (*++p1 < n) n = *p1 + 1; /* insert cheaper then insert ? */
*p2++ = (char)n++; /* update field and cost for next col's delete */
p2++;
}
}
/* return result */
return ExportResult( n - 1, word1.Length, word2.Length, false);
}
}
这个方法看起来像是从C/c++移植过来的,而不是用c#编写的。c#中的字符串操作通常足够快,使用unsafe
并直接使用char*
s是浪费时间和精力…
我刚谷歌了方法名,看起来你只是想要一个Java实现的Levenshtein距离,所以,从相同的链接:
public class LevenshteinDistance {
private static int minimum(int a, int b, int c) {
return Math.min(Math.min(a, b), c);
}
public static int computeLevenshteinDistance(CharSequence str1,
CharSequence str2) {
int[][] distance = new int[str1.length() + 1][str2.length() + 1];
for (int i = 0; i <= str1.length(); i++)
distance[i][0] = i;
for (int j = 1; j <= str2.length(); j++)
distance[0][j] = j;
for (int i = 1; i <= str1.length(); i++)
for (int j = 1; j <= str2.length(); j++)
distance[i][j] = minimum(
distance[i - 1][j] + 1,
distance[i][j - 1] + 1,
distance[i - 1][j - 1]
+ ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0
: 1));
return distance[str1.length()][str2.length()];
}
}