评估两个列表之间序列差异的算法
本文关键字:算法 之间 列表 两个 评估 | 更新日期: 2023-09-27 18:06:30
我正在寻找一种算法来比较两个序列。
序列A -将是一个最优顺序的整数id列表
序列B -将是一个相同id的列表,其顺序可能不同。
我想检测两个列表之间的顺序差异,因此我正在寻找一种算法来做到这一点。不知道这是不是一个以前已经被解决过的常见问题。
正如Julián Urbano建议的那样,Kendall Tau相关系数是一个很好的度量方法。我决定在.NET中使用LINQ实现它。下面是我的代码,它实现了Tau-A(用于没有并列排名的数据)和Tau-B(允许并列排名,即ties)。代码假设您的数据尚未排序,因此它先根据Measure1对数据进行一次排序以获得第一组排名值,然后根据Measure2对数据进行排序以获得第二组排名值。参与相关性计算的是排名,而不是原始数据。(如果measure lambda函数原样返回传入的对象,则可以把它直接应用于已有的排名。)
using System;
using System.Collections.Generic;
using System.Linq;
using static System.Math;
namespace Statistics
{
    /// <summary>
    /// Computes the Kendall Tau correlation of two orderings of the same items, a non-parametric
    /// correlation that compares how the items rank, not the measured values themselves.
    ///
    /// The correlation is one if all items are in the same order when sorted by the two measures.
    /// The correlation is minus one if the second ordering is the reverse of the first.
    /// The correlation is zero if the two orderings are completely uncorrelated.
    ///
    /// Two algorithms are provided: TauA and TauB. TauB accounts properly for duplicate values (ties),
    /// unlike TauA. Both compare every pair of items, so the work grows with the square of the item count.
    /// </summary>
    /// <typeparam name="T">Type of the items being ranked.</typeparam>
    /// <typeparam name="C">Type of the value returned by both measures; it only has to be comparable to itself.</typeparam>
    public class KendallTauCorrelation<T, C> where C : IComparable<C>
    {
        private Func<T, C> Measure1 { get; }

        private Func<T, C> Measure2 { get; }

        /// <summary>
        /// Creates a correlation between the rankings produced by two measures.
        /// </summary>
        /// <param name="measure1">Maps an item to the value that determines its first ranking.</param>
        /// <param name="measure2">Maps an item to the value that determines its second ranking.</param>
        /// <exception cref="ArgumentNullException">Either measure is null.</exception>
        public KendallTauCorrelation(Func<T, C> measure1, Func<T, C> measure2)
        {
            if (measure1 == null)
            {
                throw new ArgumentNullException(nameof(measure1));
            }

            if (measure2 == null)
            {
                throw new ArgumentNullException(nameof(measure2));
            }

            Measure1 = measure1;
            Measure2 = measure2;
        }

        /// <summary>
        /// Compute the Tau-a rank correlation, which is suitable if there are no ties in rank.
        /// Items with equal measure values still receive distinct ranks (in the order the stable sort
        /// leaves them), so use TauB when ties are possible.
        /// </summary>
        /// <returns>A value between -1 and 1.
        /// If the items are ranked the same by both measures, returns 1.
        /// If the items are ranked in exactly opposite order, returns -1.
        /// The more items that are out of sequence, the lower the score.
        /// If the measures are completely uncorrelated, returns zero.
        /// Also returns zero when there are fewer than two items, because no pair exists to compare.
        /// </returns>
        /// <param name="data">Data to be ranked according to two measures and then correlated.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public double TauA(IList<T> data)
        {
            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            // Rank by the first measure, then re-sort by the second measure and rank again,
            // so every entry carries its position in both orderings.
            var ranked = data
                .OrderBy(Measure1)
                .Select((item, index) => new { Data = item, Rank1 = index + 1 })
                .OrderBy(pair => Measure2(pair.Data))
                .Select((pair, index) => new { pair.Rank1, Rank2 = index + 1 })
                .ToList();

            var n = ranked.Count;
            if (n <= 1)
            {
                // Without this guard the result would be 0 / 0.0, i.e. NaN.
                return 0;
            }

            // 64-bit arithmetic: the number of pairs exceeds int.MaxValue once n passes ~65,000.
            var denominator = (long)n * (n - 1) / 2.0;
            long numerator = 0;
            for (var i = 1; i < n; i++)
            {
                for (var j = 0; j < i; j++)
                {
                    // +1 for a pair ordered the same way by both rankings, -1 for a pair ordered oppositely.
                    numerator += Sign(ranked[i].Rank1 - ranked[j].Rank1)
                               * Sign(ranked[i].Rank2 - ranked[j].Rank2);
                }
            }

            return numerator / denominator;
        }

        /// <summary>
        /// Compute the Tau-b correlation, which accounts for ties.
        ///
        ///     tau_b = (nc - nd) / √((n0 - n1) * (n0 - n2))
        ///
        /// where:
        ///     n0 = n(n-1)/2
        ///     n1 = Σ t[i] * (t[i] - 1) / 2
        ///     n2 = Σ t[j] * (t[j] - 1) / 2
        ///     t[i] = # of ties for the ith group according to measure 1.
        ///     t[j] = # of ties for the jth group according to measure 2.
        ///     nc = # of concordant pairs
        ///     nd = # of discordant pairs
        /// </summary>
        /// <returns>A correlation value between -1 (perfect reverse correlation)
        /// and +1 (perfect correlation).
        /// Zero means uncorrelated. Zero is also returned when there are fewer than two items
        /// or when either measure ties every item with every other item.</returns>
        /// <param name="data">Data to be ranked according to two measures and then correlated.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public double TauB(IEnumerable<T> data)
        {
            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            // Evaluate both measures exactly once per item.
            var measured = data
                .Select(item => new { M1 = Measure1(item), M2 = Measure2(item) })
                .ToArray();

            var n = measured.Length;
            if (n <= 1)
            {
                return 0; // No data or one data point. Impossible to establish correlation.
            }

            // Only the relative order of each pair matters, so the measure values are compared directly
            // instead of being converted to dense ranks first. Using one comparer for both the ordering
            // and the tie counts keeps the two consistent, and the default comparer tolerates null values.
            var comparer = Comparer<C>.Default;

            // 64-bit arithmetic: the number of pairs exceeds int.MaxValue once n passes ~65,000.
            long n0 = (long)n * (n - 1) / 2;
            long n1 = 0;
            long n2 = 0;
            long numerator = 0; // Stores nc - nd as a single value, rather than computing them separately.
            for (var i = 1; i < n; i++)
            {
                for (var j = 0; j < i; j++)
                {
                    var order1 = Sign(comparer.Compare(measured[i].M1, measured[j].M1));
                    var order2 = Sign(comparer.Compare(measured[i].M2, measured[j].M2));
                    numerator += order1 * order2;

                    // Keep track of ties. Because the indices run over a triangle, each group of
                    // t tied items contributes t * (t - 1) / 2 pairs, which is exactly n1 and n2.
                    if (order1 == 0)
                    {
                        n1++;
                    }

                    if (order2 == 0)
                    {
                        n2++;
                    }
                }
            }

            if (n0 == n1 || n0 == n2)
            {
                return 0; // All ties, so everything has the same rank.
            }

            // Observe that if n1 = n2 = 0, this formula is identical to Tau-a.
            return numerator / Sqrt((double)(n0 - n1) * (n0 - n2));
        }
    }
}
下面是NUnit中的单元测试:
using System;
using NUnit.Framework;
using static System.Math; // New C# 6.0 feature that allows one to import static methods and call them without their class name.
namespace Statistics
{
    /// <summary>
    /// NUnit tests for the Tau-a and Tau-b computations of <c>KendallTauCorrelation</c>.
    /// Every test ranks the integers 1..10 by their own value and by a second, derived measure.
    /// </summary>
    [TestFixture]
    public class KendallTauCorrelationTests
    {
        /// <summary>The integers 1 through 10 in ascending order; the input shared by all tests.</summary>
        public static int[] OneToTen = new[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };

        // Correlates each value with the same value multiplied by a constant factor.
        private static KendallTauCorrelation<int, int> ScaledBy(int factor)
            => new KendallTauCorrelation<int, int>(value => value, value => value * factor);

        // Correlates each value with the entry found at its 1-based position in the given table.
        private static KendallTauCorrelation<int, int> MappedThrough(int[] table)
            => new KendallTauCorrelation<int, int>(value => value, value => table[value - 1]);

        #region Tau-a

        [Test]
        public void TauA_SameOrder()
        {
            var tau = ScaledBy(10).TauA(OneToTen);

            Assert.AreEqual(1.0, tau, "Numbers that sort in the same order should be perfectly correlated.");
        }

        [Test]
        public void TauA_ReverseOrder()
        {
            var tau = ScaledBy(-10).TauA(OneToTen);

            Assert.AreEqual(-1.0, tau, "Numbers that sort in reverse order should be perfectly anti-correlated.");
        }

        [Test]
        public void TauA_OneSwap()
        {
            // 4 and 5 trade places, so exactly one of the 45 pairs is discordant.
            var table = new[] { 1, 2, 3, 5, 4, 6, 7, 8, 9, 10 };

            var tau = MappedThrough(table).TauA(OneToTen);

            Assert.AreEqual(
                43.0 / 45.0,
                tau,
                0.00001,
                "If a single number is out of place the sequences should be almost perfectly correlated.");
        }

        #endregion

        #region Tau-b

        [Test]
        public void TauB_SameOrder()
        {
            var tau = ScaledBy(10).TauB(OneToTen);

            Assert.AreEqual(1.0, tau, "Numbers that sort in the same order should be perfectly correlated.");
        }

        [Test]
        public void TauB_ReverseOrder()
        {
            var tau = ScaledBy(-10).TauB(OneToTen);

            Assert.AreEqual(-1.0, tau, "Numbers that sort in reverse order should be perfectly anti-correlated.");
        }

        [Test]
        public void TauB_OneSwap_NoTies()
        {
            // 4 and 5 trade places, so exactly one of the 45 pairs is discordant.
            var table = new[] { 1, 2, 3, 5, 4, 6, 7, 8, 9, 10 };

            var tau = MappedThrough(table).TauB(OneToTen);

            Assert.AreEqual(
                43.0 / 45.0,
                tau,
                0.00001,
                "If a single number is out of place the sequences should be almost perfectly correlated.");
        }

        [Test]
        public void TauB_Ties()
        {
            // The first three items share the value 1 under the second measure: three tied pairs.
            var table = new[] { 1, 1, 1, 4, 5, 6, 7, 8, 9, 10 };

            var tau = MappedThrough(table).TauB(OneToTen);

            Assert.AreEqual(
                42.0 / Sqrt(42.0 * 45.0),
                tau,
                0.00001,
                "Adding a few ties should be almost perfectly correlated.");
        }

        #endregion
    }
}
注意:这里使用的是穷举式的O(N^2)算法。据说可以借助修改后的归并排序把复杂度降到O(N log N),但我还没有见过具体的实现方式。
注意:这个泛型类假设两个度量返回相同的数据类型。让类支持两个不同的泛型度量类型只需一个简单的改动:每种类型只需自身可比较,而不需要彼此之间可以相互比较。
如果你只是想衡量有多不同,但你不关心差异发生在哪里,你可以使用肯德尔相关系数。它会给你一个从-1(列表顺序相反)到+1(列表顺序相同)的分数。
它基本上是统计在两个列表中顺序一致的元素对的数量,减去顺序相反的元素对的数量,然后除以元素对的总数:
// Two orderings of the same eight ids; a is the reference order.
int[] a = { 1, 2, 3, 4, 5, 6, 7, 8 };
int[] b = { 3, 4, 1, 8, 6, 7, 2, 5 };

// Visit every unordered pair once: +1 when both lists order the pair the same way,
// -1 when they disagree, 0 when either list has equal values at the two positions.
double numer = 0;
for (int j = 1; j < a.Length; j++)
{
    for (int i = 0; i < j; i++)
    {
        int orderInA = Math.Sign(a[i] - a[j]);
        int orderInB = Math.Sign(b[i] - b[j]);
        numer += orderInA * orderInB;
    }
}

// n * (n - 1) is always even, so the integer division by 2 is exact.
int pairCount = a.Length * (a.Length - 1) / 2;
double tau = numer / pairCount;