复杂的多属性分组问题

本文关键字:问题 属性 复杂 | 更新日期: 2023-09-27 18:03:13

我有大量目录类型的数据,包含电子邮件、电话、邮政编码等属性。我想基于多个属性检测重复项并进行分组:同一组内的记录可以在不同的属性上重复,并且不要求组内每两条记录都直接重复,只要能通过另一条记录间接关联即可。

示例数据:

Name       |     Email     |     Tel     |     Postcode
John Sim   |     j@j.com   |     111     |     C67
J Sim      |               |     111     |
John S     |               |             |     C67

我希望找到一种方法,能够根据电子邮件、电话和邮政编码属性检测重复项,即使各条记录并不完全匹配。因此,在上面的示例中,这3条记录应该被分到同一组:虽然最后一条与中间一条没有直接匹配,但第一条与这两条都匹配。

希望我说清楚了!显然,这个示例经过了大幅简化;实际上我有数百条记录,我想把它们分组,以便显示各个重复组。

到目前为止,我已经找到了一些非常低效的方法来做到这一点,包括遍历每一行,对任何直接重复进行分组,然后检查这些重复是否也有重复,如果有,则将它们全部移到一个新组中。但我正试图找到一些灵感和更有效的方法来做这件事:)

谢谢!

复杂的多属性分组问题

您可以为每个属性建立索引(字典),并在foreach循环的每次迭代中合并分组,从而将复杂度降低到O(n):

        // Single pass over the entries: look each entry up in the email, tel and
        // postcode indexes, try to join every group found, and then either start a
        // new group (nothing accepted the entry) or merge the groups that did.
        foreach (var entry in list)
        {
            Group emailGroup = null;
            Group telGroup = null;
            Group postcodeGroup = null;
            // A group found through an index may still reject the entry (Add returns
            // false); in that case it is treated as "not found".
            if (entry.Email != null && _emailGroups.TryGetValue(entry.Email, out emailGroup))
                if (!emailGroup.Add(entry)) emailGroup = null;
            if (entry.Tel != null && _telGroups.TryGetValue(entry.Tel, out telGroup))
                if (!telGroup.Add(entry)) telGroup = null;
            // The property is declared as Entry.PostCode; C# member names are case-sensitive.
            if (entry.PostCode != null && _postcodeGroups.TryGetValue(entry.PostCode, out postcodeGroup))
                if (!postcodeGroup.Add(entry)) postcodeGroup = null;
            if (emailGroup == null && telGroup == null && postcodeGroup == null)
            {
                CreateGroup(entry);
                continue;
            }
            CombineGroups(emailGroup, telGroup, postcodeGroup);
        }

当然,您必须决定并处理各种冲突情况;如果需要,还可以添加名称匹配逻辑(例如将姓名拆分为名、中间名和姓),然后对每一部分做双向包含判断(开销相当大,所以可能需要考虑使用字符串索引)。

完整代码+测试

参见下面完整代码中的测试方法:

[Test]
public void Test()

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using FluentAssertions;
using NUnit.Framework;
namespace StackOverflow
{
    [TestFixture]
    public class Class1
    {
        // Lookup indexes from an attribute value to the group registered under that value.
        // They are not initialised here; the Test method assigns fresh dictionaries before use.

        // Email value -> group.
        private Dictionary<string, Group> _emailGroups;
        // Telephone value -> group.
        private Dictionary<string, Group> _telGroups;
        // Postcode value -> group.
        private Dictionary<string, Group> _postcodeGroups;
        /// <summary>
        /// Starts a new group seeded with <paramref name="entry"/> and registers it in every
        /// lookup index for which the entry supplies a key that is not already registered.
        /// </summary>
        /// <param name="entry">The entry that was not accepted by any existing group.</param>
        private void CreateGroup(Entry entry)
        {
            var group = new Group(entry);

            // Each key is checked against its OWN index. Checking the email index for a
            // tel/postcode key would almost always report "absent" and overwrite the
            // mapping that belongs to an existing group, making that group unreachable.
            if (group.Email != null && !_emailGroups.ContainsKey(group.Email))
                _emailGroups[group.Email] = group;
            if (group.Tel != null && !_telGroups.ContainsKey(group.Tel))
                _telGroups[group.Tel] = group;
            if (group.PostCode != null && !_postcodeGroups.ContainsKey(group.PostCode))
                _postcodeGroups[group.PostCode] = group;
        }
        /// <summary>
        /// Merges the groups an entry was found in (by email, tel and postcode) wherever they
        /// are compatible, then makes sure every surviving group is reachable through all of
        /// the keys it currently holds.
        /// </summary>
        /// <param name="emailGroup">Group found via the email index, or null.</param>
        /// <param name="telGroup">Group found via the tel index, or null.</param>
        /// <param name="postcodeGroup">Group found via the postcode index, or null.</param>
        private void CombineGroups(Group emailGroup, Group telGroup, Group postcodeGroup)
        {
            // Same pairwise order as before: email<-tel, email<-postcode, tel<-postcode.
            // An absorbed group is nulled so it is not merged (or indexed) again.
            if (TryAbsorb(emailGroup, telGroup))
                telGroup = null;
            if (TryAbsorb(emailGroup, postcodeGroup))
                postcodeGroup = null;
            if (TryAbsorb(telGroup, postcodeGroup))
                postcodeGroup = null;

            // Group.Add fills in keys the group did not have yet, but nothing registered
            // those keys in the indexes, so later entries sharing only such a key could
            // not find the group. Register them now for every group that survived.
            IndexAcquiredKeys(emailGroup);
            IndexAcquiredKeys(telGroup);
            IndexAcquiredKeys(postcodeGroup);
        }

        /// <summary>
        /// Folds <paramref name="absorbed"/> into <paramref name="survivor"/> when both exist,
        /// are different groups and <c>CanCombine</c> allows it; re-points the indexes afterwards.
        /// </summary>
        /// <returns>true when the merge happened; otherwise false.</returns>
        private bool TryAbsorb(Group survivor, Group absorbed)
        {
            if (survivor == null || absorbed == null || survivor == absorbed)
                return false;
            if (!survivor.CanCombine(absorbed))
                return false;

            survivor.Add(absorbed);
            UpdateIndexes(survivor, absorbed);
            return true;
        }

        /// <summary>
        /// Registers <paramref name="group"/> under each of its non-null keys that has no
        /// index entry yet. Existing index entries are left untouched. No-op for null.
        /// </summary>
        private void IndexAcquiredKeys(Group group)
        {
            if (group == null)
                return;

            if (group.Email != null && !_emailGroups.ContainsKey(group.Email))
                _emailGroups[group.Email] = group;
            if (group.Tel != null && !_telGroups.ContainsKey(group.Tel))
                _telGroups[group.Tel] = group;
            if (group.PostCode != null && !_postcodeGroups.ContainsKey(group.PostCode))
                _postcodeGroups[group.PostCode] = group;
        }
        /// <summary>
        /// After <paramref name="oldGroup"/> has been merged into <paramref name="newGroup"/>,
        /// redirects every index entry that is keyed by one of the old group's values and
        /// still points at the old group so that it points at the new group instead.
        /// </summary>
        /// <param name="newGroup">The group that now owns the merged entries.</param>
        /// <param name="oldGroup">The group that was merged away.</param>
        private void UpdateIndexes(Group newGroup, Group oldGroup)
        {
            RepointIfOwnedBy(_emailGroups, oldGroup.Email, oldGroup, newGroup);
            RepointIfOwnedBy(_telGroups, oldGroup.Tel, oldGroup, newGroup);
            RepointIfOwnedBy(_postcodeGroups, oldGroup.PostCode, oldGroup, newGroup);
        }

        /// <summary>
        /// Replaces <c>index[key]</c> with <paramref name="replacement"/>, but only when the key
        /// is non-null, present, and currently mapped to <paramref name="owner"/>.
        /// </summary>
        private static void RepointIfOwnedBy(Dictionary<string, Group> index, string key, Group owner, Group replacement)
        {
            if (key == null)
            {
                return;
            }

            Group current;
            if (!index.TryGetValue(key, out current))
            {
                return;
            }

            // Entries that belong to some other group must not be touched.
            if (current == owner)
            {
                index[key] = replacement;
            }
        }
        /// <summary>
        /// A set of entries regarded as duplicates of each other, plus the one email, tel and
        /// postcode value the group currently holds for each attribute (null when unknown).
        /// </summary>
        public class Group
        {
            /// <summary>The entries that belong to this group.</summary>
            public HashSet<Entry> Entries = new HashSet<Entry>();

            /// <summary>Creates a group containing only <paramref name="entry"/> and adopts its keys.</summary>
            public Group(Entry entry)
            {
                Entries.Add(entry);
                Email = entry.Email;
                Tel = entry.Tel;
                PostCode = entry.PostCode;
            }

            public string Email { get; set; }

            public string Tel { get; set; }

            public string PostCode { get; set; }

            /// <summary>
            /// Tells whether <paramref name="entry"/> is compatible with this group: for every
            /// attribute, either side may be null, otherwise the two values must be equal.
            /// </summary>
            public bool Matches(Entry entry)
            {
                return (Email == null || entry.Email == null || entry.Email == Email)
                    && (Tel == null || entry.Tel == null || entry.Tel == Tel)
                    && (PostCode == null || entry.PostCode == null || entry.PostCode == PostCode);
            }

            /// <summary>
            /// Adds <paramref name="entry"/> when it matches this group and fills in any key the
            /// group was still missing from the entry.
            /// </summary>
            /// <returns>false when the entry does not match (nothing is changed); otherwise true.</returns>
            public bool Add(Entry entry)
            {
                if (Matches(entry))
                {
                    Entries.Add(entry);

                    // Keep a key that is already set; otherwise take the entry's (possibly null) value.
                    Email = Email ?? entry.Email;
                    Tel = Tel ?? entry.Tel;
                    PostCode = PostCode ?? entry.PostCode;
                    return true;
                }

                return false;
            }

            /// <summary>
            /// Tells whether another group can be merged into this one: no attribute may carry
            /// two different non-null values across the two groups.
            /// </summary>
            public bool CanCombine(Group entry)
            {
                return (Email == null || entry.Email == null || Email == entry.Email)
                    && (Tel == null || entry.Tel == null || Tel == entry.Tel)
                    && (PostCode == null || entry.PostCode == null || PostCode == entry.PostCode);
            }

            /// <summary>Adds every entry of <paramref name="group"/> through <see cref="Add(Entry)"/>.</summary>
            public void Add(Group group)
            {
                foreach (var member in group.Entries)
                    Add(member);
            }

            /// <summary>Returns a "Key: ..." header line followed by one line per entry.</summary>
            public override string ToString()
            {
                var text = new StringBuilder();
                text.AppendLine($"Key: {Email ?? "null"} | {Tel ?? "null"} | {PostCode ?? "null"}");
                foreach (var member in Entries)
                    text.AppendLine(member.ToString());

                return text.ToString();
            }
        }
        /// <summary>
        /// One directory record: a name plus optional email, telephone and postcode values.
        /// </summary>
        public class Entry
        {
            /// <summary>Creates an entry from the four attribute values; any of them may be null.</summary>
            public Entry(string name, string email, string tel, string postCode)
            {
                this.Name = name;
                this.Email = email;
                this.Tel = tel;
                this.PostCode = postCode;
            }

            public string Name { get; set; }

            public string Email { get; set; }

            public string Tel { get; set; }

            public string PostCode { get; set; }

            /// <summary>Formats the entry on one line, printing the text "null" for missing values.</summary>
            public override string ToString() =>
                $"Entry: {Name ?? "null"} | {Email ?? "null"} | {Tel ?? "null"} | {PostCode ?? "null"}";
        }
        /// <summary>
        /// Runs the grouping pass over three entries that are linked only through the third
        /// one (shared tel with one, shared postcode with the other) and expects a single
        /// group containing all three.
        /// </summary>
        [Test]
        public void Test()
        {
            _emailGroups = new Dictionary<string, Group>();
            _telGroups = new Dictionary<string, Group>();
            _postcodeGroups = new Dictionary<string, Group>();

            var list = new List<Entry>
            {
                new Entry("John S", email: null, tel: null, postCode: "C67"),
                new Entry("J Sim", email: null, tel: "111", postCode: null),
                new Entry("John Sim", email: "j@j.com", tel: "111", postCode: "C67")
            };

            foreach (var entry in list)
            {
                Group emailGroup = null;
                Group telGroup = null;
                Group postcodeGroup = null;

                // A group located through an index only counts if it accepts the entry.
                if (entry.Email != null
                    && _emailGroups.TryGetValue(entry.Email, out emailGroup)
                    && !emailGroup.Add(entry))
                {
                    emailGroup = null;
                }

                if (entry.Tel != null
                    && _telGroups.TryGetValue(entry.Tel, out telGroup)
                    && !telGroup.Add(entry))
                {
                    telGroup = null;
                }

                if (entry.PostCode != null
                    && _postcodeGroups.TryGetValue(entry.PostCode, out postcodeGroup)
                    && !postcodeGroup.Add(entry))
                {
                    postcodeGroup = null;
                }

                var joinedSomeGroup = emailGroup != null || telGroup != null || postcodeGroup != null;
                if (joinedSomeGroup)
                {
                    CombineGroups(emailGroup, telGroup, postcodeGroup);
                }
                else
                {
                    CreateGroup(entry);
                }
            }

            // Collect each group once, in index order: email, then tel, then postcode.
            var groups = _emailGroups.Values
                .Concat(_telGroups.Values)
                .Concat(_postcodeGroups.Values)
                .Distinct()
                .ToList();

            groups.ForEach(grp => Console.WriteLine(grp.ToString()));

            groups.Should().HaveCount(1);
            groups.First().Entries.Should().HaveCount(3);
        }
    }
}