如何按带有容差因子的数值分组对象
本文关键字:对象 何按带 | 更新日期: 2023-09-27 18:18:41
我有一个c#对象列表,其简化数据如下:
ID, Price
2, 80.0
8, 44.25
14, 43.5
30, 79.98
54, 44.24
74, 80.01
我正在尝试按最低的数字分组,同时考虑到容忍因素。例如,在tolerance = 0.02的情况下,我的预期结果应该是:
44.24 -> 8, 54
43.5 -> 14
79.98 -> 2, 30, 74
我如何在实现大型数据集的良好性能的同时做到这一点?在这种情况下,LINQ是可行的吗?
在我看来,如果您有一个大型数据集,您将希望避免对值进行排序,然后在遍历已排序列表时收集它们的直接解决方案,因为对大型集合进行排序可能会很昂贵。我能想到的最有效的解决方案是不做任何显式排序,即构建一个树,其中每个节点包含键位于"连续"范围内的项。range(其中所有键都在彼此的tolerance
范围内)—每次添加一个小于tolerance
的项时,每个节点的范围都会扩展。我实现了一个解决方案——结果比我想象的要复杂和有趣——根据我的粗略基准测试,看起来用这种方法比直接解决方案花费的时间少一半。
这是我作为扩展方法的实现(因此您可以链接它,尽管像正常的Group
方法一样,一旦结果IEnumerable
迭代,它将完全迭代source
)。
public static IEnumerable<IGrouping<double, TValue>> GroupWithTolerance<TValue>(
this IEnumerable<TValue> source,
double tolerance,
Func<TValue, double> keySelector)
{
if(source == null)
throw new ArgumentNullException("source");
return GroupWithToleranceHelper<TValue>.Group(source, tolerance, keySelector);
}
private static class GroupWithToleranceHelper<TValue>
{
public static IEnumerable<IGrouping<double, TValue>> Group(
IEnumerable<TValue> source,
double tolerance,
Func<TValue, double> keySelector)
{
Node root = null, current = null;
foreach (var item in source)
{
var key = keySelector(item);
if(root == null) root = new Node(key);
current = root;
while(true){
if(key < current.Min - tolerance) { current = (current.Left ?? (current.Left = new Node(key))); }
else if(key > current.Max + tolerance) {current = (current.Right ?? (current.Right = new Node(key)));}
else
{
current.Values.Add(item);
if(current.Max < key){
current.Max = key;
current.Redistribute(tolerance);
}
if(current.Min > key) {
current.Min = key;
current.Redistribute(tolerance);
}
break;
}
}
}
if (root != null)
{
foreach (var entry in InOrder(root))
{
yield return entry;
}
}
else
{
//Return an empty collection
yield break;
}
}
private static IEnumerable<IGrouping<double, TValue>> InOrder(Node node)
{
if(node.Left != null)
foreach (var element in InOrder(node.Left))
yield return element;
yield return node;
if(node.Right != null)
foreach (var element in InOrder(node.Right))
yield return element;
}
private class Node : IGrouping<double, TValue>
{
public double Min;
public double Max;
public readonly List<TValue> Values = new List<TValue>();
public Node Left;
public Node Right;
public Node(double key) {
Min = key;
Max = key;
}
public double Key { get { return Min; } }
IEnumerator IEnumerable.GetEnumerator() { return GetEnumerator(); }
public IEnumerator<TValue> GetEnumerator() { return Values.GetEnumerator(); }
public IEnumerable<TValue> GetLeftValues(){
return Left == null ? Values : Values.Concat(Left.GetLeftValues());
}
public IEnumerable<TValue> GetRightValues(){
return Right == null ? Values : Values.Concat(Right.GetRightValues());
}
public void Redistribute(double tolerance)
{
if(this.Left != null) {
this.Left.Redistribute(tolerance);
if(this.Left.Max + tolerance > this.Min){
this.Values.AddRange(this.Left.GetRightValues());
this.Min = this.Left.Min;
this.Left = this.Left.Left;
}
}
if(this.Right != null) {
this.Right.Redistribute(tolerance);
if(this.Right.Min - tolerance < this.Max){
this.Values.AddRange(this.Right.GetLeftValues());
this.Max = this.Right.Max;
this.Right = this.Right.Right;
}
}
}
}
}
如果需要,可以将double
转换为另一种类型(我希望c#有numeric
的通用约束)。
最直接的方法是设计自己的IEqualityComparer<double>
。
public class ToleranceEqualityComparer : IEqualityComparer<double>
{
public double Tolerance { get; set; } = 0.02;
public bool Equals(double x, double y)
{
return x - Tolerance <= y && x + Tolerance > y;
}
//This is to force the use of Equals methods.
public int GetHashCode(double obj) => 1;
}
你应该像这样使用
var dataByPrice = data.GroupBy(d => d.Price, new ToleranceEqualityComparer());
这是一个新的实现,它最终通过了其他两个解决方案失败的单元测试。它实现了与当前接受的答案相同的签名。检查单元测试以确保没有组导致的最小值和最大值大于容差,并且分组的项目数量与提供的项目匹配。
如何使用
var values = new List<Tuple<double, string>>
{
new Tuple<double, string>(113.5, "Text Item 1"),
new Tuple<double, string>(109.62, "Text Item 2"),
new Tuple<double, string>(159.06, "Text Item 3"),
new Tuple<double, string>(114, "Text Item 4")
};
var groups = values.GroupWithTolerance(5, a => a.Item1).ToList();
扩展方法
/// <summary>
/// Groups items of an IEnumerable collection while allowing a tolerance that all items within the group will fall within
/// </summary>
/// <typeparam name="TValue"></typeparam>
/// <param name="source"></param>
/// <param name="tolerance"></param>
/// <param name="keySelector"></param>
/// <returns></returns>
/// <exception cref="ArgumentNullException"></exception>
public static IEnumerable<IGrouping<double, TValue>> GroupWithTolerance<TValue>(
this IEnumerable<TValue> source,
double tolerance,
Func<TValue, double> keySelector
)
{
var sortedValuesWithKey = source
.Select((a, i) => Tuple.Create(a, keySelector(a), i))
.OrderBy(a => a.Item2)
.ToList();
var diffsByIndex = sortedValuesWithKey
.Skip(1)
//i will start at 0 but we are targeting the diff between 0 and 1.
.Select((a, i) => Tuple.Create(i + 1, sortedValuesWithKey[i + 1].Item2 - sortedValuesWithKey[i].Item2))
.ToList();
var groupBreaks = diffsByIndex
.Where(a => a.Item2 > tolerance)
.Select(a => a.Item1)
.ToHashSet();
var groupKeys = new double[sortedValuesWithKey.Count];
void AddRange(int startIndex, int endIndex)
{
//If there is just one value in the group, take a short cut.
if (endIndex - startIndex == 0)
{
groupKeys[sortedValuesWithKey[startIndex].Item3] = sortedValuesWithKey[startIndex].Item2;
return;
}
var min = sortedValuesWithKey[startIndex].Item2;
var max = sortedValuesWithKey[endIndex].Item2;
//If the range is within tolerance, we are done with this group.
if (max - min < tolerance)
{
//Get the average value of the group and assign it to all elements.
var rangeValues = new List<double>(endIndex - startIndex);
for (var x = startIndex; x <= endIndex; x++)
rangeValues.Add(sortedValuesWithKey[x].Item2);
var average = rangeValues.Average();
for (var x = startIndex; x <= endIndex; x++)
groupKeys[sortedValuesWithKey[x].Item3] = average;
return;
}
//The range is not within tolerance and needs to be divided again.
//Find the largest gap and divide.
double maxDiff = -1;
var splitIndex = -1;
for (var i = startIndex; i < endIndex; i++)
{
var currentDif = diffsByIndex[i].Item2;
if (currentDif > maxDiff)
{
maxDiff = currentDif;
splitIndex = i;
}
}
AddRange(startIndex, splitIndex);
AddRange(splitIndex + 1, endIndex);
}
var groupStartIndex = 0;
for (var i = 1; i < sortedValuesWithKey.Count; i++)
{
//There isn't a group break here, at least not yet, so continue.
if (!groupBreaks.Contains(i))
continue;
AddRange(groupStartIndex, i - 1);
groupStartIndex = i;
}
//Add the last group's keys if we haven't already.
if (groupStartIndex < sortedValuesWithKey.Count)
AddRange(groupStartIndex, sortedValuesWithKey.Count - 1);
return sortedValuesWithKey.GroupBy(a => groupKeys[a.Item3], a => a.Item1);
}