复杂的多属性分组问题
本文关键字:问题 属性 复杂 | 更新日期: 2023-09-27 18:03:13
我有大量目录类型的数据,包含电子邮件、电话、邮政编码等属性。我想检测重复项并对其分组,而且要同时考虑多个属性:同一组内的记录可以是在不同属性上重复的,组内各行不必两两直接重复,只要能通过另一行间接关联即可。
示例数据:
Name | Email | Tel | Postcode
John Sim | j@j.com | 111 | C67
J Sim | | 111 |
John S | | | C67
我希望找到一种方法,能够检测电子邮件、电话和邮政编码属性上的重复,即使各行并不完全匹配。因此,在上面的示例中,这3行应该被归入同一组:虽然最后一行与中间一行没有直接匹配,但第一行与这两行都有匹配。
希望这说得通!显然,这个例子被大大简化了;我实际有数百条记录,希望把它们分组,以便按组显示重复项。
到目前为止,我已经找到了一些非常低效的方法来做到这一点,包括遍历每一行,对任何直接重复进行分组,然后检查这些重复是否也有重复,如果有,则将它们全部移到一个新组中。但我正试图找到一些灵感和更有效的方法来做这件事:)谢谢!
您可以使用索引(字典),并在 foreach 循环的每次迭代中合并分组,从而将复杂度降低到 O(n):
// Core grouping loop: look each entry up in the three per-attribute indexes,
// try to join every group found, then either start a new group or merge the
// groups this entry links together.
foreach (var entry in list)
{
Group emailGroup = null;
Group telGroup = null;
Group postcodeGroup = null;
// A group found through an index is kept only if the entry could actually be added to it.
if (entry.Email != null && _emailGroups.TryGetValue(entry.Email, out emailGroup))
if (!emailGroup.Add(entry)) emailGroup = null;
if (entry.Tel != null && _telGroups.TryGetValue(entry.Tel, out telGroup))
if (!telGroup.Add(entry)) telGroup = null;
// The property is declared as PostCode on Entry (capital C).
if (entry.PostCode != null && _postcodeGroups.TryGetValue(entry.PostCode, out postcodeGroup))
if (!postcodeGroup.Add(entry)) postcodeGroup = null;
// The entry joined no existing group: it seeds a new one.
if (emailGroup == null && telGroup == null && postcodeGroup == null)
{
CreateGroup(entry);
continue;
}
// The entry may have joined several distinct groups: merge them where possible.
CombineGroups(emailGroup, telGroup, postcodeGroup);
}
当然,您必须自行决定如何处理冲突等情况;如有需要,还可以加入针对姓名的逻辑(例如把姓名拆分为名、中间名和姓),然后对每一部分做双向包含比较(这相当昂贵,所以可能需要考虑使用字符串索引)。
完整代码+测试
参见下面完整代码中的 `[Test] public void Test()` 方法。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using FluentAssertions;
using NUnit.Framework;
namespace StackOverflow
{
[TestFixture]
public class Class1
{
private Dictionary<string, Group> _emailGroups;
private Dictionary<string, Group> _telGroups;
private Dictionary<string, Group> _postcodeGroups;
/// <summary>
/// Starts a new group seeded with <paramref name="entry"/> and registers it in
/// every lookup index for which the entry carries a value that is not indexed yet.
/// </summary>
/// <param name="entry">The entry that could not join any existing group.</param>
private void CreateGroup(Entry entry)
{
    var group = new Group(entry);

    // Each key is checked against its OWN index. A key that is already claimed
    // by another group is left untouched rather than silently overwritten.
    if (group.Email != null && !_emailGroups.ContainsKey(group.Email))
    {
        _emailGroups[group.Email] = group;
    }

    if (group.Tel != null && !_telGroups.ContainsKey(group.Tel))
    {
        _telGroups[group.Tel] = group;
    }

    if (group.PostCode != null && !_postcodeGroups.ContainsKey(group.PostCode))
    {
        _postcodeGroups[group.PostCode] = group;
    }
}
/// <summary>
/// Merges the groups an entry was added to (found through the email, tel and
/// postcode indexes) and makes sure every surviving group is reachable through
/// all of the keys it currently holds.
/// </summary>
/// <param name="emailGroup">Group joined through the email index, or null.</param>
/// <param name="telGroup">Group joined through the tel index, or null.</param>
/// <param name="postcodeGroup">Group joined through the postcode index, or null.</param>
private void CombineGroups(Group emailGroup, Group telGroup, Group postcodeGroup)
{
    // Merge order matters: the email group is the preferred target, then the tel group.
    // A merged group is nulled so it is neither merged twice nor indexed below.
    if (TryMerge(emailGroup, telGroup))
    {
        telGroup = null;
    }

    if (TryMerge(emailGroup, postcodeGroup))
    {
        postcodeGroup = null;
    }

    if (TryMerge(telGroup, postcodeGroup))
    {
        postcodeGroup = null;
    }

    // A group may have picked up a key (e.g. an email) from the entry that was just
    // added to it. Without indexing that key, a later entry sharing only that value
    // would start a separate group, making the result depend on input order.
    IndexLearnedKeys(emailGroup);
    IndexLearnedKeys(telGroup);
    IndexLearnedKeys(postcodeGroup);
}

/// <summary>
/// Folds <paramref name="source"/> into <paramref name="target"/> when both exist,
/// are distinct and have compatible keys; re-points the indexes accordingly.
/// </summary>
/// <returns>true when the merge happened; otherwise false.</returns>
private bool TryMerge(Group target, Group source)
{
    if (target == null || source == null || target == source)
    {
        return false;
    }

    if (!target.CanCombine(source))
    {
        return false;
    }

    target.Add(source);
    UpdateIndexes(target, source);
    return true;
}

/// <summary>
/// Registers <paramref name="group"/> under each of its non-null keys that is not
/// indexed yet. Keys already claimed by some group are left as they are.
/// </summary>
private void IndexLearnedKeys(Group group)
{
    if (group == null)
    {
        return;
    }

    if (group.Email != null && !_emailGroups.ContainsKey(group.Email))
    {
        _emailGroups[group.Email] = group;
    }

    if (group.Tel != null && !_telGroups.ContainsKey(group.Tel))
    {
        _telGroups[group.Tel] = group;
    }

    if (group.PostCode != null && !_postcodeGroups.ContainsKey(group.PostCode))
    {
        _postcodeGroups[group.PostCode] = group;
    }
}
/// <summary>
/// After <paramref name="oldGroup"/> has been folded into <paramref name="newGroup"/>,
/// redirects every index slot that still refers to the old group.
/// </summary>
/// <param name="newGroup">The group that absorbed the other one.</param>
/// <param name="oldGroup">The group that was absorbed.</param>
private void UpdateIndexes(Group newGroup, Group oldGroup)
{
    Repoint(_emailGroups, oldGroup.Email, oldGroup, newGroup);
    Repoint(_telGroups, oldGroup.Tel, oldGroup, newGroup);
    Repoint(_postcodeGroups, oldGroup.PostCode, oldGroup, newGroup);
}

/// <summary>
/// Replaces the value stored under <paramref name="key"/> with <paramref name="to"/>,
/// but only when the slot exists and currently holds <paramref name="from"/>.
/// </summary>
private static void Repoint(Dictionary<string, Group> index, string key, Group from, Group to)
{
    if (key == null)
    {
        return;
    }

    Group current;
    if (index.TryGetValue(key, out current) && current == from)
    {
        index[key] = to;
    }
}
/// <summary>
/// A set of entries treated as duplicates of one another, together with the
/// single email / tel / postcode value the group currently holds for each attribute.
/// </summary>
public class Group
{
    /// <summary>The entries collected in this group.</summary>
    public HashSet<Entry> Entries = new HashSet<Entry>();

    /// <summary>Creates a group whose keys and first member come from <paramref name="entry"/>.</summary>
    public Group(Entry entry)
    {
        Email = entry.Email;
        Tel = entry.Tel;
        PostCode = entry.PostCode;
        Entries.Add(entry);
    }

    public string Email { get; set; }

    public string Tel { get; set; }

    public string PostCode { get; set; }

    /// <summary>
    /// Tells whether <paramref name="entry"/> is compatible with this group: for every
    /// attribute, either side is null or both values are equal.
    /// </summary>
    public bool Matches(Entry entry)
    {
        return (Email == null || entry.Email == null || entry.Email == Email)
            && (Tel == null || entry.Tel == null || entry.Tel == Tel)
            && (PostCode == null || entry.PostCode == null || entry.PostCode == PostCode);
    }

    /// <summary>
    /// Adds <paramref name="entry"/> when it matches the group and adopts any key
    /// the group did not have yet.
    /// </summary>
    /// <returns>false when the entry conflicts with the group's keys; otherwise true.</returns>
    public bool Add(Entry entry)
    {
        if (!Matches(entry))
        {
            return false;
        }

        Entries.Add(entry);

        // Keys already held by the group win; only missing ones are filled in.
        Email = Email ?? entry.Email;
        Tel = Tel ?? entry.Tel;
        PostCode = PostCode ?? entry.PostCode;
        return true;
    }

    /// <summary>
    /// Tells whether another group's keys are compatible with this group's keys
    /// (per attribute: either side null, or equal values).
    /// </summary>
    public bool CanCombine(Group entry)
    {
        return (Email == null || entry.Email == null || Email == entry.Email)
            && (Tel == null || entry.Tel == null || Tel == entry.Tel)
            && (PostCode == null || entry.PostCode == null || PostCode == entry.PostCode);
    }

    /// <summary>Adds every entry of <paramref name="group"/> to this group via <see cref="Add(Entry)"/>.</summary>
    public void Add(Group group)
    {
        foreach (var member in group.Entries)
        {
            Add(member);
        }
    }

    /// <summary>Renders the group's keys on one line followed by one line per entry.</summary>
    public override string ToString()
    {
        var text = new StringBuilder();
        text.AppendLine($"Key: {Email ?? "null"} | {Tel ?? "null"} | {PostCode ?? "null"}");
        foreach (var member in Entries)
        {
            text.AppendLine(member.ToString());
        }

        return text.ToString();
    }
}
/// <summary>
/// One directory row: a name plus optional email, telephone and postcode values.
/// </summary>
public class Entry
{
    /// <summary>Creates an entry; any argument may be null.</summary>
    public Entry(string name, string email, string tel, string postCode)
    {
        Name = name;
        Email = email;
        Tel = tel;
        PostCode = postCode;
    }

    public string Name { get; set; }

    public string Email { get; set; }

    public string Tel { get; set; }

    public string PostCode { get; set; }

    /// <summary>Formats the entry as "Entry: name | email | tel | postcode", printing "null" for missing values.</summary>
    public override string ToString()
    {
        var columns = new[]
        {
            Name ?? "null",
            Email ?? "null",
            Tel ?? "null",
            PostCode ?? "null"
        };
        return "Entry: " + string.Join(" | ", columns);
    }
}
/// <summary>
/// Feeds three rows that are only transitively related (shared tel / shared postcode)
/// through the grouping loop and expects them to end up in a single group.
/// </summary>
[Test]
public void Test()
{
    _emailGroups = new Dictionary<string, Group>();
    _telGroups = new Dictionary<string, Group>();
    _postcodeGroups = new Dictionary<string, Group>();

    var list = new List<Entry>
    {
        new Entry("John S", null, null, "C67"),
        new Entry("J Sim", null, "111", null),
        new Entry("John Sim", "j@j.com", "111", "C67")
    };

    foreach (var row in list)
    {
        // For each attribute: the group indexed under the row's value, if the row could join it.
        Group byEmail = JoinIndexedGroup(_emailGroups, row.Email, row);
        Group byTel = JoinIndexedGroup(_telGroups, row.Tel, row);
        Group byPostcode = JoinIndexedGroup(_postcodeGroups, row.PostCode, row);

        bool joinedNothing = byEmail == null && byTel == null && byPostcode == null;
        if (joinedNothing)
        {
            CreateGroup(row);
        }
        else
        {
            CombineGroups(byEmail, byTel, byPostcode);
        }
    }

    // Union already yields each group once, so the three indexes collapse to the distinct groups.
    var groups = _emailGroups.Values
        .Union(_telGroups.Values)
        .Union(_postcodeGroups.Values)
        .ToList();

    foreach (var found in groups)
    {
        Console.WriteLine(found.ToString());
    }

    groups.Should().HaveCount(1);
    groups.First().Entries.Should().HaveCount(3);
}

/// <summary>
/// Looks <paramref name="key"/> up in <paramref name="index"/> and tries to add
/// <paramref name="entry"/> to the group found there.
/// </summary>
/// <returns>The group the entry was added to, or null when the key is null, not indexed, or the add was refused.</returns>
private static Group JoinIndexedGroup(Dictionary<string, Group> index, string key, Entry entry)
{
    Group candidate;
    if (key == null || !index.TryGetValue(key, out candidate))
    {
        return null;
    }

    return candidate.Add(entry) ? candidate : null;
}
}
}