在一定权重下找到所有后续替换的最佳算法

本文关键字:替换 算法 最佳 权重 | 更新日期: 2023-09-27 18:23:44

让我们有一个数字序列。说{1,2,3,4,5}。以及一组具有权重的子序列替换。例如:
{1} ->{3}:0.2-这意味着1可以被重量为0.1的3取代
{2,3,4}->{4,3}:0.3
{5} ->{2}:0.4

我需要找到所有的序列,我们可以使用有重量限制的替换。

限制是这样工作的:任何n(3)个项目(在一行中)的替换权重之和应该小于给定的数量e<=0.5
对于我们的示例,结果是:
{1,2,3,4,5}:替换权重:{0,0,0、0,0}。一行中任意3项的总和小于0.5
{3,2,3,4,5}:替换权重:{0.2,0,0,1,0}
{1,4,3,5}:替换权重:{0,0.3/3,0.3/3、0.3/3,0}3个符号的序列so/3
{3,4,3,5}取代权重:{0.2,0.3/3,0.3/3、0.3/3,0}{1,2,3,4,2}替换权重:{0,0,0、0、0.4}
{3,2,3,4,2}替换权重:{0.2,0,0,0.4}

我们不允许{1,4,3,2},因为替换权重{0,0.3/3,0.3/3、0.3/3、0.4}有3个最后的项,和权重=0.6。

在实际例子中,子序列替换集很大。几乎任何短子序列都有一个替换。很明显,这项任务可以用蛮力完成。但我正在寻找一种快速完成的方法。如有任何帮助,我们将不胜感激。

更新
实际上,我处理的是字符串,而不是数字序列。到目前为止,我提出了以下算法(在C#中实现):

public class Substitution
{
    public string SubstituteWhat { get; private set; }
    public string SubstituteTo { get; private set; }
    public double  TotalWeight { get; private set; }
    public double  SymbolWeight { get; private set; }
    public Substitution(string substituteWhat, string substituteTo, double totalWeight)
    {
        SubstituteWhat = substituteWhat;
        SubstituteTo = substituteTo;
        TotalWeight = totalWeight;
        SymbolWeight = TotalWeight/SubstituteWhat.Length;
    }
    public override string ToString()
    {
        return string.Format("SubstituteWhat: {0}, SubstituteTo: {1}, TotalWeight: {2}", SubstituteWhat, SubstituteTo, TotalWeight);
    }
}
class SubstitutedStringWindowImpl
{
    public string OriginalPhrase { get; set; }
    public string SubstitutedPhrase { get; set; }
    private double[] weightVector;
    private double windowSum;
    private int addItemPointer;
    private int substructItemPoiner;
    public SubstitutedStringWindowImpl(string phrase, int windowLength)
    {
        this.OriginalPhrase = phrase;
        SubstitutedPhrase = String.Empty;
        weightVector = new double[phrase.Length + windowLength ];
        windowSum = 0;
        substructItemPoiner = 0;
        addItemPointer = windowLength;
    }
    public bool PerformSubstitution(
        Substitution substitution, 
        double maxWeight,
        out SubstitutedStringWindowImpl substitutedString)
    {
        substitutedString = MemberwiseClone() as SubstitutedStringWindowImpl;
        substitutedString.weightVector = weightVector.ToArray();
        for (int i = 0; i < substitution.SubstituteWhat.Length; i++)
        {
            substitutedString.weightVector[substitutedString.addItemPointer] = substitution.SymbolWeight;
            substitutedString.windowSum = substitutedString.windowSum -
                                          substitutedString.weightVector[substitutedString.substructItemPoiner] +
                                          substitutedString.weightVector[substitutedString.addItemPointer];
            substitutedString.substructItemPoiner++;
            substitutedString.addItemPointer++;
            if (substitutedString.windowSum > maxWeight)
                return false;
            if (substitutedString.addItemPointer == substitutedString.weightVector.Length)
                break;
        }
        substitutedString.SubstitutedPhrase = SubstitutedPhrase + substitution.SubstituteTo;
        return true;
    }
}
internal class SubstitutionManagerWindowImpl
{
    private readonly Dictionary<string, List<Substitution>> substitutionsDict;
    private readonly double maxWeight;
    private readonly int maxSubstitutionLength;
    private readonly int windowLength;
    public SubstitutionManagerWindowImpl(
        List<Substitution> substitutions,
        double maxWeight,
        int windowLength)
    {
        this.substitutionsDict = substitutions.GroupBy(x => x.SubstituteWhat)
            .ToDictionary(x => x.Key, x => x.ToList());
        this.maxWeight = maxWeight;
        this.windowLength = windowLength;
        maxSubstitutionLength = substitutions.Max(x => x.SubstituteWhat.Length);
    }

    private List<SubstitutedStringWindowImpl> GetAllSubstitutionsPrivate(
        SubstitutedStringWindowImpl stringToHandle, int symbolCount)
    {
        if (stringToHandle.OriginalPhrase.Length == symbolCount)
            return new List<SubstitutedStringWindowImpl> {stringToHandle};
        var result = new List<SubstitutedStringWindowImpl>();
        for (int i = 1;
            i <= Math.Min(maxSubstitutionLength, stringToHandle.OriginalPhrase.Length - symbolCount);
            i++)
        {
            var subphraseToSubstitute = stringToHandle.OriginalPhrase.Substring(symbolCount, i);
            List<Substitution> appropriateSubstitutions;
            if (!substitutionsDict.TryGetValue(subphraseToSubstitute, out appropriateSubstitutions))
                continue;
            foreach (var substitution in appropriateSubstitutions)
            {
                SubstitutedStringWindowImpl handledString;
                if (!stringToHandle.PerformSubstitution(substitution, maxWeight, out handledString))
                    continue;
                result.AddRange(GetAllSubstitutionsPrivate(handledString,
                    symbolCount + substitution.SubstituteWhat.Length));
            }
        }
        return result;
    }
    // this is the entry function
    public List<string> GetAllSubstitutions(string phrase)
    {
        var res = GetAllSubstitutionsPrivate(new SubstitutedStringWindowImpl(phrase,windowLength), 0);
        return res.Select(x => x.SubstitutedPhrase).ToList();
    }
}

但它似乎做得不够快。有什么改进的建议吗?

在一定权重下找到所有后续替换的最佳算法

似乎不需要太多的复杂度就可以用多项式延迟进行枚举。定义递归过程

all_substitutions(substitutions, first_half, last_n_weights, second_half)

其中first_half是输入的取代前缀,second_half是输入的未取代后缀。all_substitutions的主体应该测试second_half是否为空。如果是,则生成first_half并返回。否则,对于second_half开始时的每个可行替换,使其适当地递归(其中,我们将单例替换为自身视为权重为0的可行替换)。

如果沿着这些路线的过程不够快,那么查看实际的代码会很有帮助,因为进一步的改进将是数据结构。

我试着看了看,但我似乎无法缓存结果,这通常是一种非常有效的优化,这让我很烦恼。经过一番调整,我找到了一条路。

代码会尽快替换一个序列,然后递归地替换右边的剩余序列。它不需要将先前的替换成本传递给被替换的子序列,因为在没有第一次替换的情况下,子序列将在稍后重新分析,从而更改先前的成本值。

然而,当父序列接收到子序列中的替换结果时,会考虑成本:它只保留那些满足其替换成本不会超过成本限制这一事实的子序列。当然,就像在每种缓存机制中一样,你用内存换取速度,但缓存在这里非常有用。(无论如何,理想情况下,缓存应该独立于CappedCostSubstitutionEnumerator;使用AOP,您可以在需要时注入)

我们首先定义了SequenceSubstitution对象

public class Sequence<T>: List<T>
{
    public double Cost { get; set; }
    public override string ToString() // for the cache key
    {
        return string.Join("|", this.Select(t => t.ToString()));
    }
    public Sequence(IEnumerable<T> sequence)
    {
        this.AddRange(sequence);
        Cost = 0;
    }
}
public class Substitution<T>
{
    public IEnumerable<T> OriginalSequence { get; set; }
    public IEnumerable<T> SwappedSequence { get; set; }
    public double Cost { get; set; }
}

现在我们可以定义替换枚举器

public class CappedCostSubstitutionEnumerator<T>
{
    private static Dictionary<string, List<Sequence<T>>> Cache { get; set; }
    static CappedCostSubstitutionEnumerator()
    {
        Cache = new Dictionary<string, List<Sequence<T>>>();
    }
    public double AuthorizedCost { get; set; }
    public List<Substitution<T>> Substitutions { get; set; }
    public CappedCostSubstitutionEnumerator(double maxAuthorizedCost)
    {
        Substitutions = new List<Substitution<T>>();
        AuthorizedCost = maxAuthorizedCost;
    }
    public List<List<T>> EnumerateSequenceSubstitutions(List<T> sequence)
    {
        var values = EnumerateSequenceSubstitutions(new Sequence<T>(sequence));
        return values.Select(s => s as List<T>).ToList();
    }
    private List<Sequence<T>> EnumerateSequenceSubstitutions(Sequence<T> sourceSequence)
    {
        var sourceSequenceKey = sourceSequence.ToString();
        if (Cache.ContainsKey(sourceSequenceKey))
        {
            return Cache[sourceSequenceKey];
        }
        else
        {
            var sequenceSubstitutionsResults = new List<Sequence<T>> { sourceSequence };
            foreach (var substitution in Substitutions.Where(substitution => substitution.Cost <= AuthorizedCost))
            {
                SubstituteWhereOriginalSubstitutionSequenceIsFound(sourceSequence, substitution, sequenceSubstitutionsResults);
            }
            Cache.Add(sourceSequenceKey, sequenceSubstitutionsResults);
            return sequenceSubstitutionsResults;
        }
    }
    private void SubstituteWhereOriginalSubstitutionSequenceIsFound(Sequence<T> sourceSequence, Substitution<T> substitution,
        List<Sequence<T>> sequenceSubstitutionsResults)
    {
        var indexInSequence = 0;
        var originalSequenceLength = substitution.OriginalSequence.Count();
        var upperIndexInSequence = sourceSequence.Count() - originalSequenceLength;
        // we are going to look for the substitution pattern at each possible place in the source sequence
        while (indexInSequence <= upperIndexInSequence)
        {
            var evaluatedMatch = sourceSequence.Skip(indexInSequence).Take(originalSequenceLength);
            if (evaluatedMatch.SequenceEqual<T>(substitution.OriginalSequence))
            {
                var start = sourceSequence.Take(indexInSequence);
                var substituted = substitution.SwappedSequence;
                var remain = sourceSequence.Skip(indexInSequence + originalSequenceLength);
                // now we want the subsequences without taking cost into account
                var subSequences = EnumerateSequenceSubstitutions(new Sequence<T>(remain));
                // we filter the subsequences on the total cost here
                foreach (
                    var subSequence in
                        subSequences.Where(
                            subSeq => (subSeq.Cost + substitution.Cost) <= (AuthorizedCost - sourceSequence.Cost)))
                {
                    sequenceSubstitutionsResults.Add(
                        new Sequence<T>(start.Concat(substituted).Concat(subSequence))
                        {
                            Cost = substitution.Cost + subSequence.Cost
                        });
                }
            }
            indexInSequence++;
        }
    }
}

一些要点:

  • 我使用列表,但它可能不是最有效的数据持有者;事实上,我确信内存碎片可能是个问题。如果内存池会导致问题,那么使用它可能是个好主意
  • 一次遍历一个元素并不是最有效的,有一些更好的方法可以找到序列的出现
  • 缓存确实有帮助,例如下面的例子,它被命中1796次,插入53次

要使用它,请执行以下操作:

private static void Main(string[] args)
{
    var p = new CappedCostSubstitutionEnumerator<int>(0.5);
    p.Substitutions.Add(new Substitution<int>() {OriginalSequence = new int[] {1}, SwappedSequence = new int[] {3}, Cost = 0.2});
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2,3,4 }, SwappedSequence = new int[] { 4,3 }, Cost = 0.3 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 5 }, SwappedSequence = new int[] { 2 }, Cost = 0.4 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 5,1 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2,3 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.1 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] {  4 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
    var results = p.EnumerateSequenceSubstitutions(new List<int>() { 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 5 });
    // results contains 5390 values
}

这是我的解决方案以及一些测试。在我的计算机上,30个代币序列的替换在不到一秒钟的时间内计算出来。

它作为二叉树工作,并将中间结果缓存在上下文对象中,因此所有分支共享缓存。问题是将源代码拆分为两个子树并不容易,因为有不同的令牌长度。它是通过根据每个可能的令牌长度在不同点重复拆分来解决的,所以每个令牌长度在两个子树中都有机会。

涉及到许多优化,包括按"预算"插槽预先过滤替换代币(不可能为每个预算进行计算)。

请注意,原始项并没有作为递归函数的优化包含在结果中。此外,为了避免舍入错误,我将double替换为decimal,因为它从预算中减去了成本。实际上,我建议用不同范围的整数来代替成本。

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;
namespace SubstitutionsWithBudget
{
    [TestClass]
    public class SubstitutionTest
    {
        private static readonly List<Substitution> Substitutions = new List<Substitution>
            {
                new Substitution("1", "3", 0.2m),
                new Substitution("234", "43", 0.3m),
                new Substitution("5", "2", 0.4m)
            };
        private readonly SubstitutionsDefinition substitutionsDefinition = new SubstitutionsDefinition(Substitutions);
        [TestMethod]
        public void OriginalQuestion()
        {
            string source = "12345";
            var variants = EnumerateSubstitutions(new Context(), substitutionsDefinition, source, 0.6m);
            foreach (var variant in variants)
            {
                Console.WriteLine(variant);
            }
            Assert.AreEqual(5, variants.Count());
        }
        [TestMethod]
        public void MultiplicityTest()
        {
            const int multiplicity = 6;
            string source = string.Join("", Enumerable.Repeat("12345", multiplicity));
            var variants = EnumerateSubstitutions(new Context(), substitutionsDefinition, source, multiplicity * 0.6m).ToList();
            foreach (var variant in variants.Take(10))
            {
                Console.WriteLine(variant);
            }
        }
        [TestMethod]
        public void SomeUsefulApplication()
        {
            var substitutions = new List<Substitution>
            {
                new Substitution("monkey", "elephant", 2m),
                new Substitution("monkey", "shark", 3m),
                new Substitution("banana", "apple", 1m),
                new Substitution("feed", "kill", 4m),
            };
            var resultingSubstitutions = EnumerateSubstitutions(
                new Context(),
                new SubstitutionsDefinition(substitutions),
                "feed monkey with banana",
                4)
                .OrderBy(s => s.Item2).ToList();
            foreach (var substitution in resultingSubstitutions)
            {
                Console.WriteLine(substitution);
            }
            Assert.IsTrue(resultingSubstitutions.Any(
                s => s.Item1 == "feed elephant with banana"));
            Assert.IsFalse(resultingSubstitutions.Any(
                s => s.Item1 == "kill shark with banana"));
        }
        IEnumerable<Tuple<string, decimal>> EnumerateSubstitutions(Context context, SubstitutionsDefinition substitutionsDefinition, string source, decimal maxCost)
        {
            if (source.Length == 0)
            {
                yield break;
            }
            var possibleSubstitutions = substitutionsDefinition.GetSubstitutions(source, maxCost).ToList();
            // find substitutions of whole string
            foreach (var possibleSubstitution in possibleSubstitutions)
            {
                yield return Tuple.Create(possibleSubstitution.Destination, possibleSubstitution.Weight);
            }
            // Variable split boundary to accomodate tokens of different length
            var middle = source.Length / 2;
            var window = substitutionsDefinition.MaxLength - 1;
            if (window > source.Length - 2)
            {
                window = source.Length - 2;
            }
            var min = middle - window / 2;
            var returned = new HashSet<Tuple<string, decimal>> { Tuple.Create(source, 0m) };
            for (var i = 0; i <= window; i++)
            {
                var divideAt = min + i;
                var left = source.Substring(0, divideAt);
                var right = source.Substring(divideAt, source.Length - divideAt);
                var q =
                    from leftSubstitution in context.GetCachedResult(Tuple.Create(left, maxCost),
                        () => EnumerateSubstitutions(context, substitutionsDefinition, left, maxCost)).Concat(new[] { Tuple.Create(left, 0m) })
                    let leftCost = leftSubstitution.Item2
                    from rightSubstitution in context.GetCachedResult(Tuple.Create(right, maxCost - leftCost),
                    () => EnumerateSubstitutions(context, substitutionsDefinition, right, maxCost - leftCost)).Concat(new[] { Tuple.Create(right, 0m) })
                    where leftCost + rightSubstitution.Item2 <= maxCost
                    select new { leftSubstitution, rightSubstitution };
                q = q.ToList();
                foreach (var item in q.Select(pair => Tuple.Create(pair.leftSubstitution.Item1 + pair.rightSubstitution.Item1,
                    pair.leftSubstitution.Item2 + pair.rightSubstitution.Item2)).Where(item => !returned.Contains(item)))
                {
                    yield return item;
                    returned.Add(item);
                }
            }
        }
    }
    public struct Substitution
    {
        public readonly string Souce;
        public readonly string Destination;
        public readonly decimal Weight;
        public Substitution(string souce, string destination, decimal weight)
        {
            Souce = souce;
            Destination = destination;
            Weight = weight;
        }
    }
    public class Context
    {
        private readonly Dictionary<Tuple<string, decimal>, List<Tuple<string, decimal>>> cache = new Dictionary<Tuple<string, decimal>, List<Tuple<string, decimal>>>();
        public IEnumerable<Tuple<string, decimal>> GetCachedResult(Tuple<string, decimal> key, Func<IEnumerable<Tuple<string, decimal>>> create)
        {
            List<Tuple<string, decimal>> result;
            cache.TryGetValue(key, out result);
            if (result != null) return result;
            result = create().ToList();
            cache.Add(key, result);
            return result;
        }
        public void AddToCache(Tuple<string, decimal> key, List<Tuple<string, decimal>> result)
        {
            cache.Add(key, result);
        }
    }
    public class SubstitutionsDefinition
    {
        private readonly decimal maxCost;
        private const int Granularity = 10;
        /// <summary>
        /// Holds substitutions lookups based on budget slots.
        /// A slot with higher budget also holds values of all lower-budget slots.
        /// </summary>
        private readonly ILookup<int, ILookup<string, Substitution>> byCost;
        private readonly int maxLength;
        private readonly ILookup<string, Substitution> simple;
        private bool simpleMode;
        public int MaxLength { get { return maxLength; } }
        // Really helpful if there are a lot of substitutions
        public IEnumerable<Substitution> GetSubstitutions(string token, decimal budget)
        {
            return simpleMode
                ? GetSubstitutionsSmallSet(token, budget)
                : GetSubstitutionsLargeSet(token, budget);
        }
        private IEnumerable<Substitution> GetSubstitutionsLargeSet(string token, decimal budget)
        {
            return byCost[GetSlot(budget)].SelectMany(i => i[token]).Where(s => s.Weight <= budget);
        }
        public IEnumerable<Substitution> GetSubstitutionsSmallSet(string token, decimal budget)
        {
            return simple[token].Where(s => s.Weight <= budget);
        }
        public SubstitutionsDefinition(IEnumerable<Substitution> substitutions)
        {
            var list = substitutions.ToList();
            simpleMode = list.Count < 20; // Need to be found experimentally.
            simple = list.ToLookup(i => i.Souce);
            maxCost = list.Max(s => s.Weight);
            maxLength = list.Max(s => s.Souce.Length);
            var byLength = list.ToLookup(s => s.Souce.Length);
            var q =
                from length in list.Select(i => i.Souce.Length).Distinct()
                from budgetSlot in Enumerable.Range(0, Granularity + 1)
                from item in byLength[length]
                where item.Weight <= GetCost(budgetSlot)
                group item by budgetSlot;
            byCost = q.ToLookup(i => i.Key, i => i.ToLookup(j => j.Souce));
        }
        private decimal GetCost(int slot)
        {
            var d = maxCost * slot;
            return d / Granularity;
        }
        private int GetSlot(decimal weight)
        {
            var maxWeight = Math.Min(weight, maxCost);
            return (int)(Granularity * maxWeight / maxCost);
        }
    }
}