在LINQ中分组会使这个更快吗?这可能吗?
本文关键字:LINQ 分组会 | 更新日期: 2023-09-27 18:07:02
我有一个LINQ数据表,看起来像这样:
numberColumn Value
1 3
4 1
30 6
20 10
50 5
我也有一个范围列表,看起来像这样(可以有任意数量的范围)
Ranges
range
lowerRangeInclusive = 0
upperRangeExclusive = 10
average = null
range
lowerRangeInclusive = 10
upperRangeExclusive = 40
average = null
range
lowerRangeInclusive = 40
upperRangeExclusive = 100
average = null
我需要以快速的方式计算numberColumn的每个范围的平均值(我的实际数据非常大)。成千上万行,100多个范围)。在上面的例子中,它应该是这样的:
Ranges
range
lowerRangeInclusive = 0
upperRangeExclusive = 10
average = 2
range
lowerRangeInclusive = 10
upperRangeExclusive = 40
average = 8
range
lowerRangeInclusive = 40
upperRangeExclusive = 100
average = 5
现在我只有一个这样的东西,但是它很慢。这是糟糕的未优化,还是无论如何都会变慢?:
var table = GetTable();
foreach (var range in Ranges)
{
range.Average = table.Where(n => n.numberColumn >= range.lowerRangeInclusive &&
n.numberColumn < range.upperRangeExclusive)
.Select(x => x.Value).Average();
}
有更快的方法吗?
你没有说"很慢"对你意味着什么。我将数据放入列表中,每个列表对应一个范围。在我的计算机上,处理200个范围内的100,000个项目只花了不到一秒钟的时间。
LINQ可能是优雅和快速的编写,但它并不总是最快的方式来处理数据。
如果这段代码不能帮助你理解"binning"是什么意思,也许直方图可以帮助你。
using System;
using System.Collections;
using System.Collections.Generic;
using System.Data;
using System.Diagnostics;
using System.Linq;
static class Module1
{
const int NUMBERCOLUMNMAX = 1000;
const int VALUEMAX = 100;
const int SAMPLEDATASIZE = 100000; // how many rows to populate the DataTable with.
const int NBINS = 200; // equivalent to quantity of ranges
static Random rand = new Random();
static DataTable dt;
static List<Range> ranges;
public class Range
{
public int LowerRangeInclusive { get; set; }
public int UpperRangeExclusive { get; set; }
public double Average { get; set; }
}
public static DataTable GetData()
{
// create DataTable
DataTable dt = new DataTable();
DataColumn dcNum = new DataColumn
{
ColumnName = "numberColumn",
DataType = Type.GetType("System.Int32")
};
DataColumn dcVal = new DataColumn
{
ColumnName = "Value",
DataType = Type.GetType("System.Int32")
};
dt.Columns.Add(dcNum);
dt.Columns.Add(dcVal);
// populate DataTable
for (int i = 1; i <= SAMPLEDATASIZE; i++)
{
DataRow dr = dt.NewRow();
dr[0] = rand.Next(1, NUMBERCOLUMNMAX + 1);
dr[1] = rand.Next(1, VALUEMAX + 1);
dt.Rows.Add(dr);
}
return dt;
}
public static List<Range> GetRanges()
{
ranges = new List<Range>();
int nRanges = NBINS;
for (int i = 0; i < nRanges; i++)
{
Range thisRange = new Range
{
LowerRangeInclusive = Convert.ToInt32(Math.Floor(Convert.ToDouble(NUMBERCOLUMNMAX) * i / nRanges)),
UpperRangeExclusive = Convert.ToInt32(Math.Floor(Convert.ToDouble(NUMBERCOLUMNMAX) * (i + 1) / nRanges))
};
ranges.Add(thisRange);
}
return ranges;
}
public static void SetAverages(List<Range> ranges, DataTable dt)
{
int nRanges = ranges.Count;
List<int>[] bins = new List<int>[nRanges];
for (int i = 0; i < nRanges; i++)
{
bins[i] = new List<int>();
}
int numCol = dt.Columns["numberColumn"].Ordinal;
int valCol = dt.Columns["Value"].Ordinal;
foreach (DataRow dr in dt.Rows)
{
for (int i = 0; i < nRanges; i++)
{
if (Convert.ToInt32(dr[numCol]) >= ranges[i].LowerRangeInclusive && Convert.ToInt32(dr[numCol]) < ranges[i].UpperRangeExclusive)
{
bins[i].Add(Convert.ToInt32(dr[valCol]));
break;
}
}
}
//TODO: Do something meaningful in the case where ranges(i).Count == 0 instead of the average being zero.
for (int i = 0; i < nRanges; i++)
{
if (bins[i].Count > 0)
{
ranges[i].Average = bins[i].Average();
}
}
}
public static void Main()
{
Console.Write("Init...");
dt = GetData();
ranges = GetRanges();
Console.WriteLine("done.");
// Show ranges
//foreach (Range r in ranges)
//{
// Console.WriteLine(String.Format("{0,4} {1,5}", r.LowerRangeInclusive, r.UpperRangeExclusive));
//}
// Show datarows
//for (int i = 0; i < dt.Rows.Count; i++)
//{
// Console.WriteLine("{0,4} {1,5}", dt.Rows[i][0], dt.Rows[i][1]);
//}
//Time it so that comparisons can be made to other methods.
Stopwatch sw = new Stopwatch();
sw.Start();
SetAverages(ranges, dt);
sw.Stop();
Console.WriteLine(string.Format("{0} rows processed in {1} bins in {2}ms.", dt.Rows.Count, NBINS, sw.ElapsedMilliseconds));
// Show the results
foreach (Range r in ranges)
{
Console.WriteLine(string.Format("[{0},{1}): {2}", r.LowerRangeInclusive, r.UpperRangeExclusive, r.Average));
}
Console.ReadLine();
}
}
注:我最初是用VB写的,并使用了一个在线转换器来获得c#,所以可能有一些东西可以针对c#进行优化,这是我不知道的。