如何查询关键字数量不固定的倒排索引

本文关键字:倒排索引 关键字 何查询 查询 | 更新日期: 2023-09-27 18:05:32

如果关键字是动态的,如何查询倒排索引列表/集合(用户可以随心所欲地输入)?

我在构建"WHERE"子句时很困惑,因为关键字的数量不是固定的。

以防有人不熟悉倒排索引:http://en.wikipedia.org/wiki/Inverted_index

这是索引的类模型:

public class Index
{
    public string word;
    public List<int> referenceIDs;
    //constructor
}

这是集合:

List<Index> invertedIndices = new List<Index>();

谢谢。

注:如果可能的话,我更喜欢lambda表达式的答案,尽管任何基于sql的语言也应该很好。

编辑:

  1. 我将字段从private编辑为public,使其更易于理解。
  2. 如果你不熟悉倒排索引,你需要阅读wiki(它真的很简单,因为这个例子真的很容易理解)。

如何查询关键字数量不固定的倒排索引

        var final = invertedIndices.Where(x => words.Contains(x.word))
                                   .SelectMany(y => y.referenceIDs)
                                   .GroupBy(z => z)
                                   .Where(a => a.Count() == words.Count())
                                   .Select(b => b.Key);

下面的单元测试证明该查询只检索预期的结果。如果将"words"列表转换为字典或某种自定义引用类型,则可以使用JOIN。因此,不能将字符串列表与引用类型列表连接起来。

    [TestMethod]
    public void InvertedIndiciesSearchReturnsMatchOnAllKeywords()
    {
        var words = new List<string>() { "Cow", "Horse" };
        var invertedIndices = new List<Index>()
        {
            new Index { word = "Pig", referenceIDs = new List<int>() { 1, 2, 8 }},
            new Index { word = "Chicken", referenceIDs = new List<int>() { 4, 8 }},
            new Index { word = "Horse", referenceIDs = new List<int>() { 1, 2, 8 }},
            new Index { word = "Goat", referenceIDs = new List<int>() { 3 }},
            new Index { word = "Cow", referenceIDs = new List<int>() { 1, 3 }},
            new Index { word = "Coward", referenceIDs = new List<int>() { 999 }}
        };
        // Contains is searching for x.word _in the list_ "words", not searching
        // to see if any of the _strings within words_ contains x.word.
        var final = invertedIndices.Where(x => words.Contains(x.word))
                                   .SelectMany(y => y.referenceIDs)
            // now limit the results by getting only those reference IDs
            // that appeared for every item in the input list
                                   .GroupBy(z => z)
                                   .Where(a => a.Count() == words.Count())
                                   .Select(b => b.Key);

        Assert.AreEqual(1, final.Count(), "result count");
        Assert.AreEqual(1, final.First(), "value '1' is shared by Cow and Horse and should be the only result");
    }

这样做的一种方法是设计您自己的集合。Dictionary<string, List<int>>可以很好地用作底层集合。这使得您的查找非常快。下面是这样一个类的部分实现,它展示了这样一个查找是怎样的:

public class InvertedIndexCollection : IDictionary<string, List<int>>
{
    public class IndexedWord
    {
        public string Key
        {
            get
            {
                return kvp.Key;
            }
        }
        public List<int> Value
        {
            get
            {
                return kvp.Value;
            }
        }
        private KeyValuePair<string, List<int>> kvp = new KeyValuePair<string, List<int>>();
        public IndexedWord()
        {
        }
        public IndexedWord(string _key, List<int> _newvalue)
        {
            kvp = new KeyValuePair<string, List<int>>(_key, _newvalue.OrderBy(x => x).ToList());
        }
    }
    private Dictionary<string, List<int>> Collection = new Dictionary<string, List<int>>();
    public int Count
    {
        get
        {
            return Collection.Count;
        }
    }
    public InvertedIndexCollection()
    {
    }
    public InvertedIndexCollection(Dictionary<string, List<int>> NewCollection)
    {
        Collection = NewCollection;
    }
    public List<int> this[string key]
    {
        get
        {
            return Collection[key];
        }
        set
        {
            Collection[key] = value;
        }
    }
    public void Add(IndexedWord NewItem)
    {
        if(Collection.ContainsKey(NewItem.Key))
            Collection[NewItem.Key].AddRange(NewItem.Value.Where(x => !Collection[NewItem.Key].Contains(x)));
        else
            Collection.Add(NewItem.Key, NewItem.Value);
    }
    public void Add(string Newkey, int Index)
    {
        if(Collection.ContainsKey(Newkey))
        {
            Collection[Newkey].Add(Index);
            Collection[Newkey].Sort();
        }
        else
            Collection.Add(Newkey, new List<int> { Index });
    }
    public List<int> FindIndices(string InputString, string Delimiter)
    {
        return FindIndices(InputString.Split(Delimiter.ToArray(), StringSplitOptions.RemoveEmptyEntries));
    }
    //This return a list of indices that contain all the search words.  You will
    //probably need to work out how to implement partial results, but this
    //should give you a start
    public List<int> FindIndices(IEnumerable<string> InputArray)
    {
        //Get a list of indices for each word
        var templist = (from word in InputArray
                        where Collection.ContainsKey(word)
                        select Collection[word]);
        //Flatten the lists and remove duplicates and return every index that is
        //common to all the words.
        return (from index in templist.SelectMany(x => x).Distinct()
                where templist.All(x => x.Contains(index))
                select index).ToList();
    }
    public void Add(string key, List<int> value)
    {
        Collection.Add(key, value);
    }
    public bool ContainsKey(string key)
    {
        return Collection.ContainsKey(key);
    }
    public ICollection<string> Keys
    {
        get
        {
            return Collection.Keys;
        }
    }
    public bool Remove(string key)
    {
        return Collection.Remove(key);
    }
    public bool TryGetValue(string key, out List<int> value)
    {
        return Collection.TryGetValue(key, out value);
    }
    public ICollection<List<int>> Values
    {
        get
        {
            return Collection.Values;
        }
    }
    public void Clear()
    {
        Collection.Clear();
    }
    public bool Contains(KeyValuePair<string, List<int>> item)
    {
        return Collection.Contains(item);
    }

    public IEnumerator<KeyValuePair<string, List<int>>> GetEnumerator()
    {
        return Collection.GetEnumerator();
    }
    System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
    {
        return Collection.GetEnumerator();
    }
    public void Add(KeyValuePair<string, List<int>> item)
    {
        throw new NotImplementedException();
    }
    public void CopyTo(KeyValuePair<string, List<int>>[] array, int arrayIndex)
    {
        throw new NotImplementedException();
    }
    public bool IsReadOnly
    {
        get
        {
            throw new NotImplementedException();
        }
    }
    public bool Remove(KeyValuePair<string, List<int>> item)
    {
        throw new NotImplementedException();
    }
}