减去包含所有搜索条件的最短字符串

本文关键字:字符串 条件 搜索 包含所 | 更新日期: 2023-09-27 17:53:29

我有一个问题要解决,其中给定字符串source和搜索条件criteria的集合,该算法必须返回source的最短可能子字符串,其中包含criteria的所有项。

=================================

  • 相同的搜索条件可能存在于多个源字符串中次了。在这种情况下,它需要返回子字符串包含搜索条件的特定实例,以便它是所有可能子字符串中最短的。
  • 搜索项中可以包含空格,如hello world
  • 查找条件的顺序无关紧要,只要它们都在结果子字符串
  • 中即可。

==================================

String source = "aaa wwwww fgffsd ththththt sss sgsgsgsghs bfbfb hhh sdfg kkk dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss nbnbn";
List<String> criteria = new List<string> { "kkk", "aaa", "sss", "hhh" };

上面的输入应该返回以下子字符串:kkk wdwd aaa vcvc hhh zxzx sss

不幸的是,我花了很多时间试图写这样一个算法,但我不能得到它刚刚好。下面是我目前得到的代码:

public struct Extraction
{
    public int Start { get; set; }
    public int End { get; set; }
    public int Length
    {
        get
        {
            var length = this.End - this.Start;
            return length;
        }
    }
    public Extraction(int start, int end)
    {
        this.Start = start;
        this.End = end;
    }
}
public class TextExtractor
{
    private String _source;
    private Dictionary<String, List<Int32>> _criteriaIndexes;
    private Dictionary<String, int> _entryIndex;
    public TextExtractor(String source, List<String> searchCriteria)
    {
        this._source = source;
        this._criteriaIndexes = this.ExtractIndexes(source, searchCriteria);
        this._entryIndex = _criteriaIndexes.ToDictionary(x => x.Key, v => 0);
    }
    public String Extract()
    {
        List<Extraction> possibleExtractions = new List<Extraction>();
        int index = 0;
        int min = int.MaxValue;
        int max = 0;
        bool shouldStop = false;
        while (index < _criteriaIndexes.Count && !shouldStop)
        {
            Boolean compareWithAll = index == _criteriaIndexes.Count - 1;
            if (!compareWithAll)
            {
                var current = _criteriaIndexes.ElementAt(index);
                this.CalculateMinMax(current, ref min, ref max);
                index++;
            }
            else
            {
                var entry = _criteriaIndexes.Last();
                while (_entryIndex[entry.Key] < entry.Value.Count)
                {
                    int a = min;
                    int b = max;
                    this.CalculateMinMax(entry, ref a, ref b);
                    _entryIndex[entry.Key]++;
                    Extraction ext = new Extraction(a, b);
                    possibleExtractions.Add(ext);
                }
                int k = index - 1;
                while (k >= 0)
                {
                    var prev = _criteriaIndexes.ElementAt(k);
                    if (prev.Value.Count - 1 > _entryIndex[prev.Key])
                    {
                        _entryIndex[prev.Key]++;
                        break;
                    }
                    else
                    {
                        k--;
                    }
                }
                shouldStop = _criteriaIndexes.All(x => x.Value.Count - 1 <= _entryIndex[x.Key]);
                _entryIndex[entry.Key] = 0;
                index = 0;
                min = int.MaxValue;
                max = 0;
            }
        }
        Extraction shortest = possibleExtractions.First(x => x.Length.Equals(possibleExtractions.Min(p => p.Length)));
        String result = _source.Substring(shortest.Start, shortest.Length);
        return result;
    }
    private Dictionary<String, List<Int32>> ExtractIndexes(String source, List<String> searchCriteria)
    {
        Dictionary<String, List<Int32>> result = new Dictionary<string, List<int>>();
        foreach (var criteria in searchCriteria)
        {
            Int32 i = 0;
            Int32 startingIndex = 0;
            var indexes = new List<int>();
            while (i > -1)
            {
                i = source.IndexOf(criteria, startingIndex);
                if (i > -1)
                {
                    startingIndex = i + 1;
                    indexes.Add(i);
                }
            }
            if (indexes.Any())
            {
                result.Add(criteria, indexes);
            }
        }
        return result;
    }
    private void CalculateMinMax(KeyValuePair<String, List<int>> current, ref int min, ref int max)
    {
        int j = current.Value[_entryIndex[current.Key]];
        if (j < min)
        {
            min = j;
        }
        int indexPlusWordLength = j + current.Key.Length;
        if (indexPlusWordLength > max)
        {
            max = indexPlusWordLength;
        }
    }
}
如果有人能指出我在算法中哪里出了问题,我会很感激的。此外,我觉得这是一个非常幼稚的实现。也许有比尝试索引组合更好的方法来解决这个问题?

谢谢!

减去包含所有搜索条件的最短字符串

这是一个更简单的算法,它会给你最短的子字符串。

void Main()
{
    String source = "aaa wwwww fgffsd ththththt sss ww sgsgsgsghs bfbfb hhh sdfg kkk " +
        "dhdhtrherhrhrthrthrt ddfhdetehehe kkk wdwd aaa vcvc hhh zxzx sss ww nbnbn";
    List<String> criteria = new List<string> { "kkk", "aaa", "sss ww", "hhh" };
    var result = GetAllSubstringContainingCriteria(source, criteria)
        .OrderBy(sub => sub.Length).FirstOrDefault();
    // result is "kkk wdwd aaa vcvc hhh zxzx sss ww"
}
private IEnumerable<string> GetAllSubstringContainingCriteria(
    string source, List<string> criteria)
{
    for (int i = 0; i < source.Length; i++)
    {
        var subString = source.Substring(i);
        if (criteria.Any(crit => subString.StartsWith(crit)))
        {
            var lastWordIndex = 
                GetLastCharacterIndexFromLastCriteriaInSubstring(subString, criteria);
            if (lastWordIndex >= 0)
                yield return string.Join(" ", subString.Substring(0, lastWordIndex));
        }
        else
            continue;
    }
}
private int GetLastCharacterIndexFromLastCriteriaInSubstring(
    string subString, List<string> criteria)
{
    var results = criteria.Select(crit => new { 
            index = subString.IndexOf(crit),
            criteria = crit});
    return results.All(result => result.index >= 0)
        ? results.Select(result => result.index + result.criteria.Length).Max()
        : -1;
}

让Java内置类完成这项工作。如何将您的标准转换为正则表达式模式?如果条件是X或Y或Z…,将其转换为形式为"(X)|(Y)|(Z)|…"的正则表达式,编译它,并针对源字符串执行它。

这当然返回最左边的匹配。您可以编写一个非常简单的循环,遍历所有出现的元素,缓存它们,并选择最短的元素——或者最左边最短的元素——或者,如果两个或多个元素同样短,则选择所有这些元素。