在一定权重下找到所有后续替换的最佳算法
本文关键字:替换 算法 最佳 权重 | 更新日期: 2023-09-27 18:23:44
让我们有一个数字序列。说{1,2,3,4,5}。以及一组具有权重的子序列替换。例如:
{1} ->{3}:0.2-这意味着1可以被重量为0.1的3取代
{2,3,4}->{4,3}:0.3
{5} ->{2}:0.4
我需要找到所有的序列,我们可以使用有重量限制的替换。
限制是这样工作的:任何n(3)个项目(在一行中)的替换权重之和应该小于给定的数量e<=0.5
对于我们的示例,结果是:
{1,2,3,4,5}:替换权重:{0,0,0、0,0}。一行中任意3项的总和小于0.5
{3,2,3,4,5}:替换权重:{0.2,0,0,1,0}
{1,4,3,5}:替换权重:{0,0.3/3,0.3/3、0.3/3,0}3个符号的序列so/3
{3,4,3,5}取代权重:{0.2,0.3/3,0.3/3、0.3/3,0}{1,2,3,4,2}替换权重:{0,0,0、0、0.4}
{3,2,3,4,2}替换权重:{0.2,0,0,0.4}
我们不允许{1,4,3,2},因为替换权重{0,0.3/3,0.3/3、0.3/3、0.4}有3个最后的项,和权重=0.6。
在实际例子中,子序列替换集很大。几乎任何短子序列都有一个替换。很明显,这项任务可以用蛮力完成。但我正在寻找一种快速完成的方法。如有任何帮助,我们将不胜感激。
更新
实际上,我处理的是字符串,而不是数字序列。到目前为止,我提出了以下算法(在C#中实现):
public class Substitution
{
public string SubstituteWhat { get; private set; }
public string SubstituteTo { get; private set; }
public double TotalWeight { get; private set; }
public double SymbolWeight { get; private set; }
public Substitution(string substituteWhat, string substituteTo, double totalWeight)
{
SubstituteWhat = substituteWhat;
SubstituteTo = substituteTo;
TotalWeight = totalWeight;
SymbolWeight = TotalWeight/SubstituteWhat.Length;
}
public override string ToString()
{
return string.Format("SubstituteWhat: {0}, SubstituteTo: {1}, TotalWeight: {2}", SubstituteWhat, SubstituteTo, TotalWeight);
}
}
class SubstitutedStringWindowImpl
{
public string OriginalPhrase { get; set; }
public string SubstitutedPhrase { get; set; }
private double[] weightVector;
private double windowSum;
private int addItemPointer;
private int substructItemPoiner;
public SubstitutedStringWindowImpl(string phrase, int windowLength)
{
this.OriginalPhrase = phrase;
SubstitutedPhrase = String.Empty;
weightVector = new double[phrase.Length + windowLength ];
windowSum = 0;
substructItemPoiner = 0;
addItemPointer = windowLength;
}
public bool PerformSubstitution(
Substitution substitution,
double maxWeight,
out SubstitutedStringWindowImpl substitutedString)
{
substitutedString = MemberwiseClone() as SubstitutedStringWindowImpl;
substitutedString.weightVector = weightVector.ToArray();
for (int i = 0; i < substitution.SubstituteWhat.Length; i++)
{
substitutedString.weightVector[substitutedString.addItemPointer] = substitution.SymbolWeight;
substitutedString.windowSum = substitutedString.windowSum -
substitutedString.weightVector[substitutedString.substructItemPoiner] +
substitutedString.weightVector[substitutedString.addItemPointer];
substitutedString.substructItemPoiner++;
substitutedString.addItemPointer++;
if (substitutedString.windowSum > maxWeight)
return false;
if (substitutedString.addItemPointer == substitutedString.weightVector.Length)
break;
}
substitutedString.SubstitutedPhrase = SubstitutedPhrase + substitution.SubstituteTo;
return true;
}
}
internal class SubstitutionManagerWindowImpl
{
private readonly Dictionary<string, List<Substitution>> substitutionsDict;
private readonly double maxWeight;
private readonly int maxSubstitutionLength;
private readonly int windowLength;
public SubstitutionManagerWindowImpl(
List<Substitution> substitutions,
double maxWeight,
int windowLength)
{
this.substitutionsDict = substitutions.GroupBy(x => x.SubstituteWhat)
.ToDictionary(x => x.Key, x => x.ToList());
this.maxWeight = maxWeight;
this.windowLength = windowLength;
maxSubstitutionLength = substitutions.Max(x => x.SubstituteWhat.Length);
}
private List<SubstitutedStringWindowImpl> GetAllSubstitutionsPrivate(
SubstitutedStringWindowImpl stringToHandle, int symbolCount)
{
if (stringToHandle.OriginalPhrase.Length == symbolCount)
return new List<SubstitutedStringWindowImpl> {stringToHandle};
var result = new List<SubstitutedStringWindowImpl>();
for (int i = 1;
i <= Math.Min(maxSubstitutionLength, stringToHandle.OriginalPhrase.Length - symbolCount);
i++)
{
var subphraseToSubstitute = stringToHandle.OriginalPhrase.Substring(symbolCount, i);
List<Substitution> appropriateSubstitutions;
if (!substitutionsDict.TryGetValue(subphraseToSubstitute, out appropriateSubstitutions))
continue;
foreach (var substitution in appropriateSubstitutions)
{
SubstitutedStringWindowImpl handledString;
if (!stringToHandle.PerformSubstitution(substitution, maxWeight, out handledString))
continue;
result.AddRange(GetAllSubstitutionsPrivate(handledString,
symbolCount + substitution.SubstituteWhat.Length));
}
}
return result;
}
// this is the entry function
public List<string> GetAllSubstitutions(string phrase)
{
var res = GetAllSubstitutionsPrivate(new SubstitutedStringWindowImpl(phrase,windowLength), 0);
return res.Select(x => x.SubstitutedPhrase).ToList();
}
}
但它似乎做得不够快。有什么改进的建议吗?
似乎不需要太多的复杂度就可以用多项式延迟进行枚举。定义递归过程
all_substitutions(substitutions, first_half, last_n_weights, second_half)
其中first_half
是输入的取代前缀,second_half
是输入的未取代后缀。all_substitutions
的主体应该测试second_half
是否为空。如果是,则生成first_half
并返回。否则,对于second_half
开始时的每个可行替换,使其适当地递归(其中,我们将单例替换为自身视为权重为0的可行替换)。
如果沿着这些路线的过程不够快,那么查看实际的代码会很有帮助,因为进一步的改进将是数据结构。
我试着看了看,但我似乎无法缓存结果,这通常是一种非常有效的优化,这让我很烦恼。经过一番调整,我找到了一条路。
代码会尽快替换一个序列,然后递归地替换右边的剩余序列。它不需要将先前的替换成本传递给被替换的子序列,因为在没有第一次替换的情况下,子序列将在稍后重新分析,从而更改先前的成本值。
然而,当父序列接收到子序列中的替换结果时,会考虑成本:它只保留那些满足其替换成本不会超过成本限制这一事实的子序列。当然,就像在每种缓存机制中一样,你用内存换取速度,但缓存在这里非常有用。(无论如何,理想情况下,缓存应该独立于CappedCostSubstitutionEnumerator;使用AOP,您可以在需要时注入)
我们首先定义了Sequence
和Substitution
对象
public class Sequence<T>: List<T>
{
public double Cost { get; set; }
public override string ToString() // for the cache key
{
return string.Join("|", this.Select(t => t.ToString()));
}
public Sequence(IEnumerable<T> sequence)
{
this.AddRange(sequence);
Cost = 0;
}
}
public class Substitution<T>
{
public IEnumerable<T> OriginalSequence { get; set; }
public IEnumerable<T> SwappedSequence { get; set; }
public double Cost { get; set; }
}
现在我们可以定义替换枚举器
public class CappedCostSubstitutionEnumerator<T>
{
private static Dictionary<string, List<Sequence<T>>> Cache { get; set; }
static CappedCostSubstitutionEnumerator()
{
Cache = new Dictionary<string, List<Sequence<T>>>();
}
public double AuthorizedCost { get; set; }
public List<Substitution<T>> Substitutions { get; set; }
public CappedCostSubstitutionEnumerator(double maxAuthorizedCost)
{
Substitutions = new List<Substitution<T>>();
AuthorizedCost = maxAuthorizedCost;
}
public List<List<T>> EnumerateSequenceSubstitutions(List<T> sequence)
{
var values = EnumerateSequenceSubstitutions(new Sequence<T>(sequence));
return values.Select(s => s as List<T>).ToList();
}
private List<Sequence<T>> EnumerateSequenceSubstitutions(Sequence<T> sourceSequence)
{
var sourceSequenceKey = sourceSequence.ToString();
if (Cache.ContainsKey(sourceSequenceKey))
{
return Cache[sourceSequenceKey];
}
else
{
var sequenceSubstitutionsResults = new List<Sequence<T>> { sourceSequence };
foreach (var substitution in Substitutions.Where(substitution => substitution.Cost <= AuthorizedCost))
{
SubstituteWhereOriginalSubstitutionSequenceIsFound(sourceSequence, substitution, sequenceSubstitutionsResults);
}
Cache.Add(sourceSequenceKey, sequenceSubstitutionsResults);
return sequenceSubstitutionsResults;
}
}
private void SubstituteWhereOriginalSubstitutionSequenceIsFound(Sequence<T> sourceSequence, Substitution<T> substitution,
List<Sequence<T>> sequenceSubstitutionsResults)
{
var indexInSequence = 0;
var originalSequenceLength = substitution.OriginalSequence.Count();
var upperIndexInSequence = sourceSequence.Count() - originalSequenceLength;
// we are going to look for the substitution pattern at each possible place in the source sequence
while (indexInSequence <= upperIndexInSequence)
{
var evaluatedMatch = sourceSequence.Skip(indexInSequence).Take(originalSequenceLength);
if (evaluatedMatch.SequenceEqual<T>(substitution.OriginalSequence))
{
var start = sourceSequence.Take(indexInSequence);
var substituted = substitution.SwappedSequence;
var remain = sourceSequence.Skip(indexInSequence + originalSequenceLength);
// now we want the subsequences without taking cost into account
var subSequences = EnumerateSequenceSubstitutions(new Sequence<T>(remain));
// we filter the subsequences on the total cost here
foreach (
var subSequence in
subSequences.Where(
subSeq => (subSeq.Cost + substitution.Cost) <= (AuthorizedCost - sourceSequence.Cost)))
{
sequenceSubstitutionsResults.Add(
new Sequence<T>(start.Concat(substituted).Concat(subSequence))
{
Cost = substitution.Cost + subSequence.Cost
});
}
}
indexInSequence++;
}
}
}
一些要点:
- 我使用列表,但它可能不是最有效的数据持有者;事实上,我确信内存碎片可能是个问题。如果内存池会导致问题,那么使用它可能是个好主意
- 一次遍历一个元素并不是最有效的,有一些更好的方法可以找到序列的出现
- 缓存确实有帮助,例如下面的例子,它被命中1796次,插入53次
要使用它,请执行以下操作:
private static void Main(string[] args)
{
var p = new CappedCostSubstitutionEnumerator<int>(0.5);
p.Substitutions.Add(new Substitution<int>() {OriginalSequence = new int[] {1}, SwappedSequence = new int[] {3}, Cost = 0.2});
p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2,3,4 }, SwappedSequence = new int[] { 4,3 }, Cost = 0.3 });
p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 5 }, SwappedSequence = new int[] { 2 }, Cost = 0.4 });
p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 5,1 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2,3 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.1 });
p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 4 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
var results = p.EnumerateSequenceSubstitutions(new List<int>() { 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 5 });
// results contains 5390 values
}
这是我的解决方案以及一些测试。在我的计算机上,30个代币序列的替换在不到一秒钟的时间内计算出来。
它作为二叉树工作,并将中间结果缓存在上下文对象中,因此所有分支共享缓存。问题是将源代码拆分为两个子树并不容易,因为有不同的令牌长度。它是通过根据每个可能的令牌长度在不同点重复拆分来解决的,所以每个令牌长度在两个子树中都有机会。
涉及到许多优化,包括按"预算"插槽预先过滤替换代币(不可能为每个预算进行计算)。
请注意,原始项并没有作为递归函数的优化包含在结果中。此外,为了避免舍入错误,我将double替换为decimal,因为它从预算中减去了成本。实际上,我建议用不同范围的整数来代替成本。
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;
namespace SubstitutionsWithBudget
{
[TestClass]
public class SubstitutionTest
{
private static readonly List<Substitution> Substitutions = new List<Substitution>
{
new Substitution("1", "3", 0.2m),
new Substitution("234", "43", 0.3m),
new Substitution("5", "2", 0.4m)
};
private readonly SubstitutionsDefinition substitutionsDefinition = new SubstitutionsDefinition(Substitutions);
[TestMethod]
public void OriginalQuestion()
{
string source = "12345";
var variants = EnumerateSubstitutions(new Context(), substitutionsDefinition, source, 0.6m);
foreach (var variant in variants)
{
Console.WriteLine(variant);
}
Assert.AreEqual(5, variants.Count());
}
[TestMethod]
public void MultiplicityTest()
{
const int multiplicity = 6;
string source = string.Join("", Enumerable.Repeat("12345", multiplicity));
var variants = EnumerateSubstitutions(new Context(), substitutionsDefinition, source, multiplicity * 0.6m).ToList();
foreach (var variant in variants.Take(10))
{
Console.WriteLine(variant);
}
}
[TestMethod]
public void SomeUsefulApplication()
{
var substitutions = new List<Substitution>
{
new Substitution("monkey", "elephant", 2m),
new Substitution("monkey", "shark", 3m),
new Substitution("banana", "apple", 1m),
new Substitution("feed", "kill", 4m),
};
var resultingSubstitutions = EnumerateSubstitutions(
new Context(),
new SubstitutionsDefinition(substitutions),
"feed monkey with banana",
4)
.OrderBy(s => s.Item2).ToList();
foreach (var substitution in resultingSubstitutions)
{
Console.WriteLine(substitution);
}
Assert.IsTrue(resultingSubstitutions.Any(
s => s.Item1 == "feed elephant with banana"));
Assert.IsFalse(resultingSubstitutions.Any(
s => s.Item1 == "kill shark with banana"));
}
IEnumerable<Tuple<string, decimal>> EnumerateSubstitutions(Context context, SubstitutionsDefinition substitutionsDefinition, string source, decimal maxCost)
{
if (source.Length == 0)
{
yield break;
}
var possibleSubstitutions = substitutionsDefinition.GetSubstitutions(source, maxCost).ToList();
// find substitutions of whole string
foreach (var possibleSubstitution in possibleSubstitutions)
{
yield return Tuple.Create(possibleSubstitution.Destination, possibleSubstitution.Weight);
}
// Variable split boundary to accomodate tokens of different length
var middle = source.Length / 2;
var window = substitutionsDefinition.MaxLength - 1;
if (window > source.Length - 2)
{
window = source.Length - 2;
}
var min = middle - window / 2;
var returned = new HashSet<Tuple<string, decimal>> { Tuple.Create(source, 0m) };
for (var i = 0; i <= window; i++)
{
var divideAt = min + i;
var left = source.Substring(0, divideAt);
var right = source.Substring(divideAt, source.Length - divideAt);
var q =
from leftSubstitution in context.GetCachedResult(Tuple.Create(left, maxCost),
() => EnumerateSubstitutions(context, substitutionsDefinition, left, maxCost)).Concat(new[] { Tuple.Create(left, 0m) })
let leftCost = leftSubstitution.Item2
from rightSubstitution in context.GetCachedResult(Tuple.Create(right, maxCost - leftCost),
() => EnumerateSubstitutions(context, substitutionsDefinition, right, maxCost - leftCost)).Concat(new[] { Tuple.Create(right, 0m) })
where leftCost + rightSubstitution.Item2 <= maxCost
select new { leftSubstitution, rightSubstitution };
q = q.ToList();
foreach (var item in q.Select(pair => Tuple.Create(pair.leftSubstitution.Item1 + pair.rightSubstitution.Item1,
pair.leftSubstitution.Item2 + pair.rightSubstitution.Item2)).Where(item => !returned.Contains(item)))
{
yield return item;
returned.Add(item);
}
}
}
}
public struct Substitution
{
public readonly string Souce;
public readonly string Destination;
public readonly decimal Weight;
public Substitution(string souce, string destination, decimal weight)
{
Souce = souce;
Destination = destination;
Weight = weight;
}
}
public class Context
{
private readonly Dictionary<Tuple<string, decimal>, List<Tuple<string, decimal>>> cache = new Dictionary<Tuple<string, decimal>, List<Tuple<string, decimal>>>();
public IEnumerable<Tuple<string, decimal>> GetCachedResult(Tuple<string, decimal> key, Func<IEnumerable<Tuple<string, decimal>>> create)
{
List<Tuple<string, decimal>> result;
cache.TryGetValue(key, out result);
if (result != null) return result;
result = create().ToList();
cache.Add(key, result);
return result;
}
public void AddToCache(Tuple<string, decimal> key, List<Tuple<string, decimal>> result)
{
cache.Add(key, result);
}
}
public class SubstitutionsDefinition
{
private readonly decimal maxCost;
private const int Granularity = 10;
/// <summary>
/// Holds substitutions lookups based on budget slots.
/// A slot with higher budget also holds values of all lower-budget slots.
/// </summary>
private readonly ILookup<int, ILookup<string, Substitution>> byCost;
private readonly int maxLength;
private readonly ILookup<string, Substitution> simple;
private bool simpleMode;
public int MaxLength { get { return maxLength; } }
// Really helpful if there are a lot of substitutions
public IEnumerable<Substitution> GetSubstitutions(string token, decimal budget)
{
return simpleMode
? GetSubstitutionsSmallSet(token, budget)
: GetSubstitutionsLargeSet(token, budget);
}
private IEnumerable<Substitution> GetSubstitutionsLargeSet(string token, decimal budget)
{
return byCost[GetSlot(budget)].SelectMany(i => i[token]).Where(s => s.Weight <= budget);
}
public IEnumerable<Substitution> GetSubstitutionsSmallSet(string token, decimal budget)
{
return simple[token].Where(s => s.Weight <= budget);
}
public SubstitutionsDefinition(IEnumerable<Substitution> substitutions)
{
var list = substitutions.ToList();
simpleMode = list.Count < 20; // Need to be found experimentally.
simple = list.ToLookup(i => i.Souce);
maxCost = list.Max(s => s.Weight);
maxLength = list.Max(s => s.Souce.Length);
var byLength = list.ToLookup(s => s.Souce.Length);
var q =
from length in list.Select(i => i.Souce.Length).Distinct()
from budgetSlot in Enumerable.Range(0, Granularity + 1)
from item in byLength[length]
where item.Weight <= GetCost(budgetSlot)
group item by budgetSlot;
byCost = q.ToLookup(i => i.Key, i => i.ToLookup(j => j.Souce));
}
private decimal GetCost(int slot)
{
var d = maxCost * slot;
return d / Granularity;
}
private int GetSlot(decimal weight)
{
var maxWeight = Math.Min(weight, maxCost);
return (int)(Granularity * maxWeight / maxCost);
}
}
}