使用区分区域性的比较从字符串中获取子字符串

本文关键字:字符串 获取 比较 用区 区域性 | 更新日期: 2023-09-27 18:35:24

有没有办法使用区分区域性的相等比较从字符串中获取匹配的子字符串?例如,在 en-US 区域性下,æ 和 ae 被视为相等。"Encyclopædia".IndexOf("aed") 的计算结果为 8,表示存在匹配;但是,有没有一种方法可以在不遍历源字符串的情况下提取实际匹配的子字符串 æd?请注意,要查找的子字符串与实际匹配的子字符串的长度可能相差若干个字符。

使用区分区域性的比较从字符串中获取子字符串

我最终通过以下方式解决了这个问题:首先调用 IndexOf 获取匹配项的起始位置,然后通过迭代尝试确定其长度。我针对匹配长度与指定子字符串长度相同的热路径进行了优化;在这种情况下,只需执行一次比较。

/// <summary>
/// String extensions that locate a substring and also report how many characters of the
/// source the match spans. Under culture-sensitive comparisons the matched text can be
/// shorter or longer than the text that was searched for.
/// </summary>
public static class StringExtensions
{
    /// <summary>
    /// Searches the whole of <paramref name="source"/> for <paramref name="substring"/>.
    /// </summary>
    /// <param name="source">The string to search in.</param>
    /// <param name="substring">The string to search for.</param>
    /// <param name="comparisonType">The comparison rules used for both the search and the length probing.</param>
    /// <param name="matchIndex">Receives the index of the match in <paramref name="source"/>, or -1 when there is none.</param>
    /// <param name="matchLength">Receives the number of characters of <paramref name="source"/> covered by the match, or -1 when there is none.</param>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="substring"/> is null.</exception>
    public static void Find(this string source, string substring, StringComparison comparisonType, out int matchIndex, out int matchLength)
    {
        // Guard here: an extension method invoked on null would otherwise fail on source.Length.
        if (source == null)
        {
            throw new ArgumentNullException(nameof(source));
        }

        Find(source, substring, 0, source.Length, comparisonType, out matchIndex, out matchLength);
    }

    /// <summary>
    /// Searches <paramref name="source"/> for <paramref name="substring"/>, starting at
    /// <paramref name="searchIndex"/> and continuing to the end of the string.
    /// </summary>
    /// <param name="source">The string to search in.</param>
    /// <param name="substring">The string to search for.</param>
    /// <param name="searchIndex">The position in <paramref name="source"/> at which the search starts.</param>
    /// <param name="comparisonType">The comparison rules used for both the search and the length probing.</param>
    /// <param name="matchIndex">Receives the index of the match in <paramref name="source"/>, or -1 when there is none.</param>
    /// <param name="matchLength">Receives the number of characters of <paramref name="source"/> covered by the match, or -1 when there is none.</param>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="substring"/> is null.</exception>
    public static void Find(this string source, string substring, int searchIndex, StringComparison comparisonType, out int matchIndex, out int matchLength)
    {
        if (source == null)
        {
            throw new ArgumentNullException(nameof(source));
        }

        Find(source, substring, searchIndex, source.Length - searchIndex, comparisonType, out matchIndex, out matchLength);
    }

    /// <summary>
    /// Searches the window of <paramref name="source"/> that starts at <paramref name="searchIndex"/>
    /// and spans <paramref name="searchLength"/> characters for <paramref name="substring"/>.
    /// </summary>
    /// <param name="source">The string to search in.</param>
    /// <param name="substring">The string to search for.</param>
    /// <param name="searchIndex">The position in <paramref name="source"/> at which the search starts.</param>
    /// <param name="searchLength">The number of characters of <paramref name="source"/> to examine.</param>
    /// <param name="comparisonType">The comparison rules used for both the search and the length probing.</param>
    /// <param name="matchIndex">Receives the index of the match in <paramref name="source"/>, or -1 when there is none.</param>
    /// <param name="matchLength">Receives the number of characters of <paramref name="source"/> covered by the match, or -1 when there is none.</param>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="substring"/> is null.</exception>
    /// <remarks>
    /// The search window and <paramref name="comparisonType"/> are passed unchanged to
    /// <c>string.IndexOf</c>, so any exception it raises for them propagates to the caller.
    /// </remarks>
    public static void Find(this string source, string substring, int searchIndex, int searchLength, StringComparison comparisonType, out int matchIndex, out int matchLength)
    {
        if (source == null)
        {
            throw new ArgumentNullException(nameof(source));
        }

        if (substring == null)
        {
            throw new ArgumentNullException(nameof(substring));
        }

        matchIndex = source.IndexOf(substring, searchIndex, searchLength, comparisonType);
        if (matchIndex == -1)
        {
            matchLength = -1;
            return;
        }

        matchLength = FindMatchLength(source, substring, searchIndex, searchLength, comparisonType, matchIndex);

        // Defensive: IndexOf reported a match, so a length is expected to be found.
        // If none is, report "not found" consistently through both out parameters.
        if (matchLength == -1)
        {
            matchIndex = -1;
        }
    }

    /// <summary>
    /// Determines how many characters of <paramref name="source"/>, starting at
    /// <paramref name="matchIndex"/>, compare as equal to <paramref name="substring"/>.
    /// Returns -1 if no candidate length inside the search window compares as equal.
    /// </summary>
    private static int FindMatchLength(string source, string substring, int searchIndex, int searchLength, StringComparison comparisonType, int matchIndex)
    {
        // Resolve the comparer once; every probe below uses the same rules.
        CompareInfo compareInfo;
        CompareOptions compareOptions;
        ResolveComparison(comparisonType, out compareInfo, out compareOptions);

        // The match cannot extend past the end of the search window.
        int maximumLength = searchLength - (matchIndex - searchIndex);
        int initialLength = Math.Min(substring.Length, maximumLength);

        // Hot path: the match is exactly as long as the searched-for text.
        if (IsMatch(compareInfo, compareOptions, source, matchIndex, initialLength, substring))
        {
            return initialLength;
        }

        // Probe outwards from the initial guess, trying the shorter candidate before the
        // longer one at each distance, so the length closest to substring.Length wins.
        for (int distance = 1; ; distance++)
        {
            int shorter = initialLength - distance;
            int longer = initialLength + distance;
            bool canShrink = shorter >= 0;
            bool canGrow = longer <= maximumLength;

            if (!canShrink && !canGrow)
            {
                break;
            }

            if (canShrink && IsMatch(compareInfo, compareOptions, source, matchIndex, shorter, substring))
            {
                return shorter;
            }

            if (canGrow && IsMatch(compareInfo, compareOptions, source, matchIndex, longer, substring))
            {
                return longer;
            }
        }

        // Should never happen: IndexOf already reported a match at matchIndex.
        return -1;
    }

    /// <summary>
    /// Checks whether <paramref name="length"/> characters of <paramref name="source"/> starting at
    /// <paramref name="index"/> compare as equal to the whole of <paramref name="substring"/>.
    /// </summary>
    private static bool IsMatch(CompareInfo compareInfo, CompareOptions compareOptions, string source, int index, int length, string substring)
    {
        return compareInfo.Compare(source, index, length, substring, 0, substring.Length, compareOptions) == 0;
    }

    /// <summary>
    /// Maps a <see cref="StringComparison"/> value to the <see cref="CompareInfo"/> and
    /// <see cref="CompareOptions"/> pair used to perform the comparison.
    /// </summary>
    /// <exception cref="ArgumentException"><paramref name="comparisonType"/> is not one of the handled values.</exception>
    private static void ResolveComparison(StringComparison comparisonType, out CompareInfo compareInfo, out CompareOptions compareOptions)
    {
        switch (comparisonType)
        {
            case StringComparison.CurrentCulture:
                compareInfo = CultureInfo.CurrentCulture.CompareInfo;
                compareOptions = CompareOptions.None;
                break;
            case StringComparison.CurrentCultureIgnoreCase:
                compareInfo = CultureInfo.CurrentCulture.CompareInfo;
                compareOptions = CompareOptions.IgnoreCase;
                break;
            case StringComparison.InvariantCulture:
                compareInfo = CultureInfo.InvariantCulture.CompareInfo;
                compareOptions = CompareOptions.None;
                break;
            case StringComparison.InvariantCultureIgnoreCase:
                compareInfo = CultureInfo.InvariantCulture.CompareInfo;
                compareOptions = CompareOptions.IgnoreCase;
                break;
            case StringComparison.Ordinal:
                compareInfo = CultureInfo.InvariantCulture.CompareInfo;
                compareOptions = CompareOptions.Ordinal;
                break;
            case StringComparison.OrdinalIgnoreCase:
                compareInfo = CultureInfo.InvariantCulture.CompareInfo;
                compareOptions = CompareOptions.OrdinalIgnoreCase;
                break;
            default:
                throw new ArgumentException("The string comparison type passed in is currently not supported.", nameof(comparisonType));
        }
    }
}

使用示例:

// Remove the first occurrence of `remove` from `source`, compared with current-culture rules.
int index;
int length;
source.Find(remove, StringComparison.CurrentCulture, out index, out length);

// A negative index means nothing matched, so the text is kept unchanged.
string clean = index >= 0
    ? source.Remove(index, length)
    : source;

自 .NET 5.0 起,System.Globalization.CompareInfo 提供了可直接返回匹配长度的方法:
int IndexOf(ReadOnlySpan<char> source, ReadOnlySpan<char> value, CompareOptions options, out int matchLength);
参见 CompareInfo.IndexOf 方法。