评估两个列表之间序列差异的算法

本文关键字:算法 之间 列表 两个 评估 | 更新日期: 2023-09-27 18:06:30

我正在寻找一种算法来比较两个序列。

序列A -将是一个最优顺序的整数id列表

序列B -将是一个相同id的列表,其顺序可能不同。

我想检测两个列表之间的顺序差异。

因此,我正在寻找一种能够做到这一点的算法。不知道这是不是一个以前已经被解决过的常见问题。

评估两个列表之间序列差异的算法

正如 Julián Urbano 建议的那样,Kendall Tau 相关系数是一个很好的度量方法。我决定在 .NET 中使用 LINQ 来实现它。下面是我的代码,它同时实现了 Tau-A(适用于没有并列排名的数据)和 Tau-B(允许出现并列排名)。代码假设您的数据尚未排序,因此它先根据 Measure1 对数据排序以获得第一组排名值,再根据 Measure2 对数据排序以获得第二组排名值。用于计算相关性的是排名,而不是原始数据。(如果度量 lambda 函数原样返回原始对象,则可以将其直接应用于已有的排名。)

using System;
using System.Collections.Generic;
using System.Linq;
using static System.Math;
namespace Statistics
{
    /// <summary>
    /// Compute the Kendall Tau Correlation of two orderings of values, a non-parametric correlation that compares the ranking
    /// of values, not the values themselves.
    /// 
    /// The correlation of the measures is one if all values are in the same order when sorted by two different measures.
    /// The correlation is minus one if the second ordering is the reverse of the first.
    /// The correlation is zero if the values are completely uncorrelated.
    /// 
    /// Two algorithms are provided: TauA and TauB. TauB accounts properly for duplicate values (ties), unlike TauA.
    /// Both use the exhaustive pairwise comparison, so their running time grows with the square of the item count.
    /// </summary>
    /// <typeparam name="T">Type of the items being ranked.</typeparam>
    /// <typeparam name="C">Type of the value produced by both measures; it only has to be comparable to itself.</typeparam>
    public class KendallTauCorrelation<T, C> where C : IComparable<C>
    {
        private Func<T, C> Measure1 { get; }
        private Func<T, C> Measure2 { get; }

        /// <summary>
        /// Create a correlation between the orderings induced by two measures.
        /// </summary>
        /// <param name="measure1">Maps an item to the value that determines its first ranking.</param>
        /// <param name="measure2">Maps an item to the value that determines its second ranking.</param>
        /// <exception cref="ArgumentNullException">Either measure is null.</exception>
        public KendallTauCorrelation(Func<T, C> measure1, Func<T, C> measure2)
        {
            // Fail at construction time rather than deep inside a LINQ pipeline on first use.
            if (measure1 == null)
            {
                throw new ArgumentNullException(nameof(measure1));
            }
            if (measure2 == null)
            {
                throw new ArgumentNullException(nameof(measure2));
            }
            Measure1 = measure1;
            Measure2 = measure2;
        }

        /// <summary>
        /// Compute the Tau-a rank correlation, which is suitable if there are no ties in rank.
        /// Items whose measures tie are still given distinct ranks (the sorts are stable, so
        /// ties keep their incoming order); use <see cref="TauB"/> when ties matter.
        /// </summary>
        /// <returns>A value between -1 and 1. 
        /// If the items are ranked the same by both measures, returns 1.
        /// If the items are ranked in exactly opposite order, returns -1.
        /// The more items that are out of sequence, the lower the score.
        /// If the measures are completely uncorrelated, returns zero.
        /// Returns zero when there are fewer than two items, because no pair exists to compare.
        /// </returns>
        /// <param name="data">Data to be ranked according to two measures and then correlated.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public double TauA(IList<T> data)
        {
            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            var ranked = data
                .OrderBy(Measure1)
                .Select((item, index) => new { Data = item, Rank1 = index + 1 })
                .OrderBy(pair => Measure2(pair.Data))
                .Select((pair, index) => new { pair.Rank1, Rank2 = index + 1 })
                .ToList();

            var n = ranked.Count;
            if (n <= 1)
            {
                // Without this guard the result would be 0 / 0.0 = NaN; match TauB instead.
                return 0;
            }

            // 64-bit arithmetic: n * (n - 1) no longer fits in an int once n exceeds ~46,000.
            long numerator = 0;
            var denominator = (long)n * (n - 1) / 2.0;
            for (var i = 1; i < n; i++)
            {
                for (var j = 0; j < i; j++)
                {
                    numerator += Sign(ranked[i].Rank1 - ranked[j].Rank1)
                               * Sign(ranked[i].Rank2 - ranked[j].Rank2);
                }
            }
            return numerator / denominator;
        }

        /// <summary>
        /// Compute the Tau-b correlation, which accounts for ties.
        /// 
        ///             n  - n
        ///              c    d
        ///  τ  = -----------------------
        ///   b    _____________________
        ///       / (n  -  n )(n  -  n )
        ///      √    0     1   0     2
        /// 
        /// where:
        ///        n0 = n(n-1)/2
        ///               
        ///        n1 =  Σ  t (t - 1)/2
        ///              i   i  i
        /// 
        ///        n2 =  Σ  t (t - 1)/2
        ///              j   j  j
        /// 
        ///      t[i] = # of ties for the ith group according to measure 1.
        ///      t[j] = # of ties for the jth group according to measure 2.
        ///        nc = # of concordant pairs
        ///        nd = # of discordant pairs
        /// </summary>
        /// <returns>A correlation value between -1 (perfect reverse correlation)
        ///  and +1 (perfect correlation). 
        /// Zero means uncorrelated. Zero is also returned when there are fewer than two items
        /// or when every item ties under one of the measures.</returns>
        /// <param name="data">Data to be measured twice and then correlated.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        public double TauB(IEnumerable<T> data)
        {
            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            // Evaluate each measure exactly once per item and enumerate the input only once.
            var measured = data
                .Select(item => new { M1 = Measure1(item), M2 = Measure2(item) })
                .ToArray();

            var n = measured.Length;
            if (n <= 1)
            {
                return 0; // No data or one data point. Impossible to establish correlation.
            }

            // The sign of a dense-rank difference equals the sign of comparing the measured
            // values directly, so no explicit ranking pass is needed. Using one comparer for
            // both the ordering and the tie counts keeps them consistent, and the default
            // comparer tolerates null measure values.
            var comparer = Comparer<C>.Default;

            // 64-bit arithmetic: n * (n - 1) no longer fits in an int once n exceeds ~46,000.
            long n0 = (long)n * (n - 1) / 2;
            long n1 = 0;
            long n2 = 0;
            long numerator = 0; // Stores nc - nd as a single value, rather than computing them separately.
            for (var i = 1; i < n; i++)
            {
                for (var j = 0; j < i; j++)
                {
                    var order1 = Sign(comparer.Compare(measured[i].M1, measured[j].M1));
                    var order2 = Sign(comparer.Compare(measured[i].M2, measured[j].M2));
                    numerator += order1 * order2;

                    // Keep track of ties. Because we are running the indices in a triangle,
                    // we automatically get this for n1 and n2: ties * (ties - 1) / 2
                    if (order1 == 0)
                    {
                        n1++;
                    }
                    if (order2 == 0)
                    {
                        n2++;
                    }
                }
            }

            if (n0 == n1 || n0 == n2)
            {
                return 0; // All ties under one measure, so everything has the same rank.
            }

            // Observe that if n1 = n2 = 0, that this formula is identical to Tau-a.
            return numerator / Sqrt((double)(n0 - n1) * (n0 - n2));
        }
    }
}

下面是NUnit中的单元测试:

using System;
using NUnit.Framework;
using static System.Math; // New C# 6.0 feature that allows one to import static methods and call them without their class name.
namespace Statistics
{
    /// <summary>
    /// NUnit tests for the Tau-a and Tau-b methods of KendallTauCorrelation.
    /// Every test ranks the integers one through ten by their own value and by a second,
    /// test-specific measure, then checks the resulting correlation.
    /// </summary>
    [TestFixture]
    public class KendallTauCorrelationTests
    {
        public static int[] OneToTen = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };

        // Correlates the identity measure with the value multiplied by a constant factor.
        private static KendallTauCorrelation<int, int> AgainstMultiple(int factor)
        {
            return new KendallTauCorrelation<int, int>(number => number, number => number * factor);
        }

        // Correlates the identity measure with a one-based lookup into a replacement table.
        private static KendallTauCorrelation<int, int> AgainstTable(int[] table)
        {
            return new KendallTauCorrelation<int, int>(number => number, number => table[number - 1]);
        }

        #region Tau-a
        [Test]
        public void TauA_SameOrder()
        {
            var correlation = AgainstMultiple(10).TauA(OneToTen);

            Assert.AreEqual(1.0, correlation, "Numbers that sort in the same order should be perfectly correlated.");
        }

        [Test]
        public void TauA_ReverseOrder()
        {
            var correlation = AgainstMultiple(-10).TauA(OneToTen);

            Assert.AreEqual(-1.0, correlation, "Numbers that sort in reverse order should be perfectly anti-correlated.");
        }

        [Test]
        public void TauA_OneSwap()
        {
            // Four and five trade places: one discordant pair out of forty-five.
            var table = new[] { 1, 2, 3, 5, 4, 6, 7, 8, 9, 10 };

            var correlation = AgainstTable(table).TauA(OneToTen);

            Assert.AreEqual(
                43.0 / 45.0,
                correlation,
                0.00001,
                "If a single number is out of place the sequences should be almost perfectly correlated.");
        }
        #endregion

        #region Tau-b
        [Test]
        public void TauB_SameOrder()
        {
            var correlation = AgainstMultiple(10).TauB(OneToTen);

            Assert.AreEqual(1.0, correlation, "Numbers that sort in the same order should be perfectly correlated.");
        }

        [Test]
        public void TauB_ReverseOrder()
        {
            var correlation = AgainstMultiple(-10).TauB(OneToTen);

            Assert.AreEqual(-1.0, correlation, "Numbers that sort in reverse order should be perfectly anti-correlated.");
        }

        [Test]
        public void TauB_OneSwap_NoTies()
        {
            // Four and five trade places: one discordant pair out of forty-five.
            var table = new[] { 1, 2, 3, 5, 4, 6, 7, 8, 9, 10 };

            var correlation = AgainstTable(table).TauB(OneToTen);

            Assert.AreEqual(
                43.0 / 45.0,
                correlation,
                0.00001,
                "If a single number is out of place the sequences should be almost perfectly correlated.");
        }

        [Test]
        public void TauB_Ties()
        {
            // The first three items share one value under the second measure (three tied pairs).
            var table = new[] { 1, 1, 1, 4, 5, 6, 7, 8, 9, 10 };

            var correlation = AgainstTable(table).TauB(OneToTen);

            Assert.AreEqual(
                42.0 / Sqrt(42.0 * 45.0),
                correlation,
                0.00001,
                "Adding a few ties should be almost perfectly correlated.");
        }
        #endregion
    }
}

注意:这里使用的是穷举式的 O(N²) 算法。据我所知,还有一种更高效的做法,即使用改造过的归并排序,可以达到 O(N log N),但我还没有见过它的具体实现。

注意:这个泛型类假设两个度量返回相同的数据类型。让类拥有两个泛型度量类型只是一个简单的改动:两个类型各自只需要是可比较的,而不需要能够相互比较。

如果你只是想衡量有多不同,但你不关心差异发生在哪里,你可以使用肯德尔相关系数。它会给你一个从-1(列表顺序相反)到+1(列表顺序相同)的分数。

它的基本做法是:用两个列表中顺序一致的元素对数量减去顺序相反的元素对数量,然后除以元素对的总数:

// Reference ordering and the ordering to compare against it.
int[] a = { 1, 2, 3, 4, 5, 6, 7, 8 };
int[] b = { 3, 4, 1, 8, 6, 7, 2, 5 };

int count = a.Length;
int pairCount = count * (count - 1) / 2;

// Visit every unordered pair of positions exactly once (first < second). A pair adds +1
// when both arrays order its two elements the same way and -1 when they disagree.
double agreement = 0;
for (int second = 1; second < count; second++)
{
    for (int first = 0; first < second; first++)
    {
        agreement += Math.Sign(a[first] - a[second]) * Math.Sign(b[first] - b[second]);
    }
}

// Normalise by the number of pairs to obtain the final score.
double tau = agreement / pairCount;