LINQ 联接 HashSet 中的实体,Join vs Dictionary vs HashSet 性能

本文关键字:vs HashSet Join 性能 Dictionary 实体 联接 LINQ | 更新日期: 2023-09-27 17:57:02

我有HashSet,每个存储T,我编写了一个测试应用程序来比较我能想到的不同关系算法,但是我对我得到的结果并不满意。

是否存在更有效的实现对象关系的方法,然后我正在测试的一次?

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Diagnostics;
namespace LINQTests
{
    class Program
    {
        static void Main(string[] args)
        {
            HashSet<User> UserTable = new HashSet<User>();
            HashSet<UserProperty> UserPropertyTable = new HashSet<UserProperty>();
            #region Lets create some dummy data.
            Console.WriteLine("Please wait while creating dummy data, this can take a while...");
            Console.WriteLine("");
            int rows = 1000000;
            for(int x = 0; x < rows; x++)
            {
                Random rnd = new Random();
                // Add new user.
                User user = new User(string.Format("Joachim-{0}", x));
                if(!UserTable.Add(user))
                {
                    // throw new Exception.
                }
                else
                {
                    UserProperty age = new UserProperty(user, "Age", rnd.Next(25, 30).ToString());
                    if(!UserPropertyTable.Add(age))
                    {
                        // throw new Exception.
                    }
                    UserProperty sex = new UserProperty(user, "Sex", "Male");
                    if (!UserPropertyTable.Add(sex))
                    {
                        // throw new Exception.
                    }
                    UserProperty location = new UserProperty(user, "Location", "Norway");
                    if (!UserPropertyTable.Add(location))
                    {
                        // throw new Exception.
                    }
                }
            }
            #endregion
            #region Lets do some query tests.
            IEnumerable<User> Users;
            Stopwatch stopwatch = new Stopwatch();
            int matches = 0;
            // Lets find all users who are of age 29.
            Console.WriteLine("Finding all users who are of age 29");
            Console.WriteLine("");
            Console.WriteLine("---------------------------------------------------");
            Console.WriteLine("{0,-20} | {1,6} | {2,9}", "Search Strategy", "Found", "Time");
            Console.WriteLine("---------------------------------------------------");
            // Join test.
            stopwatch.Start();
            Users = (from user in UserTable
                     join property in UserPropertyTable on user.Id equals property.UserId
                     where property.Key == "Age" && property.Value == "29"
                     select user);
            matches = Users.Count();
            stopwatch.Stop();
            Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "Joining Tables", matches, stopwatch.ElapsedMilliseconds);
            // Dictionary test.
            stopwatch.Restart();
            var DictionarySearch = (from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t).ToDictionary(x => x.UserId);
            Users = (from t in UserTable where DictionarySearch.ContainsKey(t.Id) select t);
            matches = Users.Count();
            stopwatch.Stop();
            Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "Dictionary Contain", matches, stopwatch.ElapsedMilliseconds);
            // HashSet test.
            stopwatch.Restart();
            var HashsetSearch = new HashSet<Guid>(from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t.UserId);
            Users = (from t in UserTable where HashsetSearch.Contains(t.Id) select t);
            matches = Users.Count();
            stopwatch.Stop();
            Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "HashSet Contain", matches, stopwatch.ElapsedMilliseconds);
            // Following takes so long that we wont run them!
            //// Array test.
            //stopwatch.Restart();
            //var ArrayMatch = (from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t.UserId).ToArray();
            //Users = (from t in UserTable where ArrayMatch.Contains(t.Id) select t);
            //matches = Users.Count();
            //stopwatch.Stop();
            //Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "Array Contain", matches, stopwatch.ElapsedMilliseconds);

            //// List test.
            //stopwatch.Restart();
            //var ListMatch = (from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t.UserId).ToList();
            //Users = (from t in UserTable where ListMatch.Contains(t.Id) select t);
            //matches = Users.Count();
            //stopwatch.Stop();
            //Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "List Contain", matches, stopwatch.ElapsedMilliseconds);

            Console.WriteLine("---------------------------------------------------");
            #endregion

            Console.WriteLine("");
            Console.WriteLine("Hit return to exit...");
            Console.Read();
        }
    }
    public class User
    {
        public User(string UserName)
        {
            this.Id = Guid.NewGuid();
            this.UserName = UserName;
        }
        public Guid Id { get; set; }
        public string UserName { get; set; }
        public override bool Equals(object obj)
        {
            User other = obj as User;
            if (other == null)
                return false;
            return this.Id == other.Id;
        }
        public override int GetHashCode()
        {
            return Id.GetHashCode();
        }
    }
    public class UserProperty
    {
        public UserProperty(User user, string key, string value)
        {
            this.Id = Guid.NewGuid();
            this.UserId = user.Id;
            this.Key = key;
            this.Value = value;
        }
        public Guid Id { get; private set; }
        public Guid UserId {get; private set;}
        public string Key { get; set; }
        public string Value { get; set; }
        public override bool Equals(object obj)
        {
            UserProperty other = obj as UserProperty;
            if (other == null)
                return false;
            return this.UserId == other.UserId && this.Key == other.Key;
        }
        public override int GetHashCode()
        {
            return string.Format("{0}-{1}", this.UserId, this.Key).GetHashCode();
        }
    }
}

LINQ 联接 HashSet 中的实体,Join vs Dictionary vs HashSet 性能

这将使 linq/join 与其他方法相当:

var properties = UserPropertyTable
    .Where(p=>p.Key == "Age" && p.Value == "29")
    .ToArray();
Users = (from user in UserTable
         join property in properties
         on user.Id equals property.UserId
         select user);

这是我能达到的最快(~2X):

var filteredUserIds = new HashSet<Guid>(
        UserPropertyTable
            .Where(p=>p.Key == "Age" && p.Value == "29")
            .Select(p=>p.UserId));
Users = (from user in UserTable
         where filteredUserIds.Contains(user.Id)
         select user);

带输出

---------------------------------------------------搜索策略 | 找到 |     时间---------------------------------------------------我的方法 |210366 |   157 毫秒字典包含 |210366 |   325 毫秒哈希集包含 |210366 |   325 毫秒---------------------------------------------------

以下是您可以执行的一些操作:

  1. 对于添加的每个元素和正在探测的每个元素,都会调用 GetHashCode(哈希表如何比较两个哈希代码?每个探测器至少调用一次等于。优化这些。不要使用字符串。格式(!它需要解析格式字符串并分配内存。 UserId.GHC() * 37 ^ Name.GHC()应该做的。
  2. 不要使用 LINQ。编写手动循环。LINQ 对正在处理的每个项目有多个虚拟调用或委托调用。这会增加指令,使CPU停止并防止内联。
  3. 你能预先计算数据结构吗?预先计算哈希表或排序列表。合并两个排序列表可能比使用哈希表更快。这不是框架内置的。要为合并联接编写大量困难的自定义代码。
  4. 为什么要在测量中包含property.Key == "Age" && property.Value == "29"谓词?是否需要为每次执行运行?您可以尝试通过使用整数值而不是字符串来优化它。也许您可以预先计算UserPropertyTable的索引,以便您可以在恒定时间内获得匹配的元素。

这应该会给你 1-10 倍的加速,具体取决于你实现的程度。如果你需要比这更快,我需要先问一些问题。通常,您可以找到仅适用于特定情况的专用技巧。