在两个字节数组中查找最大的字节序列
本文关键字:字节 查找 字节数 两个 数组 | 更新日期: 2023-09-27 18:03:41
示例:
{54,87,23,87,45,67,7,85,65,65,3,4,55,76,65,64,5,6,4,54,45,6,4};
{76,57,65,3,4,55,76,65,64,5,6,4,54,45,8,65,66,57,6,7,7,56,6,8,76,54,67};
基本上,我有两个字节[],需要在这两个字节中找到最大的相同字节序列。
我尝试了显而易见的事情,并编写了一些代码来破坏结果:
var bestIndex = 0;
var bestCount = 0;
for (var i1 = 0; i1 + bestCount < data1.Length; i1++)
{
var currentCount = 0;
for (var i2 = 0; i2 < data2.Length; i2++)
{
if (data1[i1 + currentCount] == data2[i2])
{
currentCount++;
if (i1 + currentCount == data1.Length)
{
bestCount = currentCount;
bestIndex = i1;
break;
}
}
else
{
if (currentCount > bestCount)
{
bestCount = currentCount;
bestIndex = i1;
}
currentCount = 0;
}
}
if (currentCount > bestCount)
{
bestCount = currentCount;
bestIndex = i1;
}
}
然而,在我的应用程序中,字节数组会大得多,甚至高达GB。因此,基本上我需要一个提示/代码来说明如何提高效率。
我对此有一些想法。我不确定这是有帮助还是有伤害,但你有没有考虑过先通过最大的可能性反向工作,这样你就可以在找到匹配项后立即终止。
byte[] b1 = { 54, 87, 23, 87, 45, 67, 7, 85, 65, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 6, 4 };
byte[] b2 = { 76, 57, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 8, 65, 66, 57, 6, 7, 7, 56, 6, 7, 44, 57, 8, 76, 54, 67 };
//figure out which one is smaller, since that one will limit the range options
byte[] smaller;
byte[] bigger;
if (b1.Count() > b2.Count())
{
bigger = b1;
smaller = b2;
}
else
{
bigger = b2;
smaller = b1;
}
// doesn't matter what order we put these in, since they will be ordered later by length
List<Tuple<int, int>> ranges = new List<Tuple<int, int>>();
Parallel.For(0, smaller.Count(), (i1) => {
Parallel.For(i1 + 1, smaller.Count(), (i2) =>
{
ranges.Add(new Tuple<int, int>(i1, i2));
});
});
// order by length of slice produced by range in descending order
// this way, once we get an answer, we know nothing else can be longer
ranges = ranges.OrderByDescending(x => x.Item2 - x.Item1).ToList();
Tuple<int, int> largestMatchingRange = new Tuple<int, int>(0, 0);
foreach (Tuple<int, int> range in ranges)
{
bool match = true; // set in outer loop to allow for break
for (int i1 = 0; i1 < bigger.Count(); i1++)
{
if (bigger.Count() <= i1 + (range.Item2 - range.Item1))
{
//short cut if the available slice from the bigger array is shorter than the range length
match = false;
continue;
}
match = true; // reset to true to allow for new attempt for each larger array slice
for (int i2 = range.Item1, i1Temp = i1; i2 < range.Item2; i2++, i1Temp++)
{
if (bigger[i1Temp] != smaller[i2])
{
match = false;
break;
}
}
if (match)
{
largestMatchingRange = range;
break;
}
}
if (match)
{
break;
}
}
byte[] largestMatchingBytes = smaller.Skip(largestMatchingRange.Item1).Take(largestMatchingRange.Item2 - largestMatchingRange.Item1).ToArray();
您可以将每个字节值的索引位置保存在列表字典中,而不是逐个检查字节。在您的情况下,256个列表的数组可能会更好。
List<int>[] index(byte[] a) { // List<long> if the array can be more than 2GB
var lists = new List<int>[256];
for(int i = 0; i < a.Length; i++) {
var b = a[i];
if (lists[b] == null) lists[b] = new List<int>();
lists[b].Add(i);
}
return lists;
}
然后你可以循环256个可能的字节值
byte[] data1 = { 54, 87, 23, 87, 45, 67, 7, 85, 65, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 6, 4 };
byte[] data2 = { 76, 57, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 8, 65, 66, 57, 6, 7, 7, 56, 6, 7, 44, 57, 8, 76, 54, 67 };
var indexes1 = index(data1);
var indexes2 = index(data2);
var bestIndex = 0;
var bestCount = 0;
for (var b = 0; b < 256; b++)
{
var list1 = indexes1[b]; if (list1 == null) continue;
var list2 = indexes1[b]; if (list2 == null) continue;
foreach(var index1 in list1)
{
foreach (var index2 in list2)
{
// your code here
for (var i1 = index1; i1 < data1.Length - bestCount; i1++)
{
var currentCount = 0;
for (var i2 = index2; i2 < data2.Length; i2++)
{
if (data1[i1 + currentCount] == data2[i2])
{
currentCount++;
if (i1 + currentCount == data1.Length)
{
bestCount = currentCount;
bestIndex = i1;
break;
}
}
else
{
if (currentCount > bestCount)
{
bestCount = currentCount;
bestIndex = i1;
}
currentCount = 0;
}
}
if (currentCount > bestCount)
{
bestCount = currentCount;
bestIndex = i1;
}
}
}
}
}
var best = data1.Skip(bestIndex).Take(bestCount);
Debug.Print(bestIndex + ", " + bestCount + ": " + string.Join(", ", best));
理论上,这感觉对更大的数组来说需要更少的比较,但在实践中,它会有更多的内存缓存未命中,所以我不确定它是否会比其他答案中的线性并行版本更快。我没有想太多,但希望它能给你一些想法,以防我弄错了。
更新
我刚刚意识到,对于一台内存小于32GB的普通机器来说,这个想法有多糟糕,因为索引列表将占用字节数组内存的4倍以上。
我发现了循环,这个应该更快。
byte[] data1 = { 54, 87, 23, 87, 45, 67, 7, 85, 65, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 6, 4 };
byte[] data2 = { 76, 57, 65, 3, 4, 55, 76, 65, 64, 5, 6, 4, 54, 45, 8, 65, 66, 57, 6, 7, 7, 56, 6, 7, 44, 57, 8, 76, 54, 67 };
//figure out which one is smaller, since that one will limit the range options
byte[] smaller;
byte[] bigger;
if (data1.Count() > data2.Count())
{
bigger = data1;
smaller = data2;
}
else
{
bigger = data2;
smaller = data1;
}
Tuple<int, int> largestMatchingRange = new Tuple<int, int>(0, 0);
//iterate over slices in reverse length order
for (int length = smaller.Count() - 1; length > 0; length--)
{
int numberOfSlicesForLength = smaller.Count() - length;
bool match = true; // set in outer loop to allow for break
for (int start = 0; start < numberOfSlicesForLength; start++)
{
//within a collection of similarly sized slices, we start with the slice found first within the array
Tuple<int, int> range = new Tuple<int, int>(start, start + length);
for (int i1 = 0; i1 < bigger.Count(); i1++)
{
if (bigger.Count() <= i1 + (range.Item2 - range.Item1))
{
//short cut if the available slice from the bigger array is shorter than the range length
match = false;
continue;
}
match = true; // reset to true to allow for new attempt for each larger array slice
for (int i2 = range.Item1, i1Temp = i1; i2 < range.Item2; i2++, i1Temp++)
{
if (bigger[i1Temp] != smaller[i2])
{
match = false;
break;
}
}
if (match)
{
largestMatchingRange = range;
break;
}
}
if (match)
{
break;
}
}
if (match)
{
break;
}
}
byte[] largestMatchingBytes = smaller.Skip(largestMatchingRange.Item1).Take(largestMatchingRange.Item2 - largestMatchingRange.Item1).ToArray();