64位整数上的C++与C#的逐位运算-性能

本文关键字:运算 性能 整数 C++ 64位 | 更新日期: 2023-09-27 18:21:39

我有一个二维字段,存储在由5个64位无符号整数(unsigned long)组成的数组中。我想获得尽可能好的性能。我在C#中工作,但我试图通过在C++中实现我的类来设置一个基准。

这里的问题是,C#实现大约需要10秒才能完成,而C++大约需要1秒,这使得它的速度快了10倍。C++是VS2015中的x64版本。C#位于x64 VS2015.NET 4.6中。当然,两者都在Release中。

编辑:稍微优化C#代码后,它仍然需要7到8秒,而C++需要1.3秒。

注意:x86中的C++大约需要6秒才能完成。我正在64位机器上运行代码。

问题:是什么让C++更快?有没有一种方法可以优化C#代码,使其至少达到类似的速度?(也许是一些不安全的魔法?)

让我困惑的是,我们谈论的只是遍历数组和逐位运算。JIT编译后的代码难道不应该与C++做几乎相同的事情吗?

示例代码:实现中有两个简单的函数。Left()和Right()分别将整个字段向左、向右移动1位,移出的位会在相邻的两个long之间进位。

C++

#include <chrono>
#include <cstdlib>
#include <iostream>
using namespace std;
using namespace std::chrono;
// A 320-bit field stored as five 64-bit words that can be shifted by one bit
// in either direction.
//
// Word order: Left() carries the top bit of Cells_l[i] into the bottom bit of
// Cells_l[i + 1], so Cells_l[0] holds the lowest bits of the field and
// Cells_l[4] the highest. Bits shifted past either end of the field are
// discarded.
class BitField
{
private:
    // Compile-time constants shared by all instances (previously mutable
    // per-instance data members, which enlarged every object).
    static constexpr unsigned long long LEFTMOST_BIT = 0x8000000000000000ULL;
    static constexpr unsigned long long RIGHTMOST_BIT = 1ULL;
    static constexpr int CELL_COUNT = 5;
public:
    unsigned long long Cells_l[CELL_COUNT];

    // Fills every word with a value from rand(). rand() returns an int in
    // [0, RAND_MAX], so only the low bits of each word start out non-zero.
    BitField()
    {
        for (int i = 0; i < CELL_COUNT; i++)
        {
            Cells_l[i] = rand(); // Random initialization
        }
    }

    // Shifts the whole field left by one bit. The top bit of Cells_l[4] is lost.
    void Left()
    {
        unsigned long long carry = 0;
        for (int i = 0; i < CELL_COUNT; i++)
        {
            // Capture the outgoing top bit before the word is overwritten.
            const unsigned long long nextCarry = (Cells_l[i] & LEFTMOST_BIT) >> 63;
            Cells_l[i] = Cells_l[i] << 1 | carry;
            carry = nextCarry;
        }
    }

    // Shifts the whole field right by one bit. The bottom bit of Cells_l[0] is lost.
    void Right()
    {
        unsigned long long carry = 0;
        for (int i = CELL_COUNT - 1; i >= 0; i--)
        {
            // Capture the outgoing bottom bit, already positioned as the top
            // bit of the next (lower-index) word.
            const unsigned long long nextCarry = (Cells_l[i] & RIGHTMOST_BIT) << 63;
            Cells_l[i] = Cells_l[i] >> 1 | carry;
            carry = nextCarry;
        }
    }
};
// Benchmark driver: applies a fixed pattern of Left()/Right() shifts to one
// BitField 100,000,000 times, prints the elapsed wall time in milliseconds,
// then prints the five words of the field.
int main()
{
    BitField bf;
    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by system clock adjustments; high_resolution_clock gives no such guarantee.
    const steady_clock::time_point t1 = steady_clock::now();
    for (int i = 0; i < 100000000; i++)
    {
        bf.Left();
        bf.Left();
        bf.Left();
        bf.Right();
        bf.Right();
        bf.Left();
        bf.Right();
        bf.Right();
    }
    const steady_clock::time_point t2 = steady_clock::now();
    const auto duration = duration_cast<milliseconds>(t2 - t1).count();
    cout << "Time: " << duration << endl << endl;
    // Print to avoid compiler optimizations
    for (size_t i = 0; i < 5; i++)
    {
        cout << bf.Cells_l[i] << endl;
    }
    return 0;
}

C#

using System;
using System.Diagnostics;
namespace TestCS
{
    /// <summary>
    /// A 320-bit field held in five ulong words that can be shifted one bit
    /// left or right. Left() carries the top bit of Cells[i] into the bottom
    /// bit of Cells[i + 1]; bits shifted past either end are discarded.
    /// </summary>
    class BitField
    {
        const ulong LEFTMOST_BIT = 0x8000000000000000;
        const ulong RIGHTMOST_BIT = 1;
        static Random rnd = new Random();
        ulong[] Cells;

        /// <summary>Allocates the five words and fills each with rnd.Next().</summary>
        public BitField()
        {
            Cells = new ulong[5];
            int slot = 0;
            while (slot < 5)
            {
                Cells[slot] = (ulong)rnd.Next(); // Random initialization
                slot++;
            }
        }

        /// <summary>Shifts the whole field left by one bit.</summary>
        public void Left()
        {
            ulong carryIn = 0;
            for (int idx = 0; idx < 5; idx++)
            {
                // Remember the outgoing top bit before the word is overwritten.
                ulong carryOut = (Cells[idx] & LEFTMOST_BIT) >> 63;
                Cells[idx] = (Cells[idx] << 1) | carryIn;
                carryIn = carryOut;
            }
        }

        /// <summary>Shifts the whole field right by one bit.</summary>
        public void Right()
        {
            ulong carryIn = 0;
            for (int idx = 4; idx >= 0; idx--)
            {
                // Remember the outgoing bottom bit, moved to the top-bit position.
                ulong carryOut = (Cells[idx] & RIGHTMOST_BIT) << 63;
                Cells[idx] = (Cells[idx] >> 1) | carryIn;
                carryIn = carryOut;
            }
        }
    }
    /// <summary>
    /// Benchmark driver: times 100,000,000 rounds of a fixed Left()/Right()
    /// pattern on a single BitField and prints the elapsed milliseconds.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            var field = new BitField();

            // Call each method once up front to remove the compilation time
            // from the measurement.
            field.Left();
            field.Right();

            var timer = Stopwatch.StartNew();
            for (int round = 0; round < 100000000; round++)
            {
                field.Left();
                field.Left();
                field.Left();
                field.Right();
                field.Right();
                field.Left();
                field.Right();
                field.Right();
            }
            timer.Stop();

            double elapsedMs = timer.Elapsed.TotalMilliseconds;
            Console.WriteLine($"Done in: {elapsedMs.ToString()}ms");
        }
    }
}

编辑:修复了示例代码中的"nextCarry"拼写错误。

64位整数上的C++与C#的逐位运算-性能

我从@AntoninLejsek的评论和删除的回答中得到了足够的信息,我可以自己回答这个问题。

TL;DR:C++编译器在优化方面做得更好,而在循环中进行C#托管数组访问的开销很大。然而,仅靠unsafe代码和fixed访问还不足以匹配C++。

我们似乎需要手动优化C#代码,以获得与C++相当的性能。

  1. 展开循环
  2. 使用unsafe代码和fixed进行数组访问
  3. 不要重复访问数组,而是将项存储到本地变量中

下面的C#代码的运行速度与C++代码一样快(实际上大约快100毫秒)。在.NET 4.6 VS 2015 x64版本上编译。

/// <summary>
/// A 320-bit field held in a fixed-size buffer of five ulong words inside the
/// struct itself, accessed through pointers.
/// </summary>
unsafe struct BitField
{
    static Random rnd = new Random();
    public fixed ulong Cells[5];

    /// <summary>Fills the five words with values from rnd.Next().</summary>
    /// <param name="nothing">Unused; the constructor does not read it.</param>
    public BitField(int nothing)
    {
        fixed (ulong* p = Cells)
        {
            for (int i = 0; i < 5; i++)
            {
                p[i] = (ulong)rnd.Next(); // Just some random number
            }
        }
    }

    /// <summary>
    /// Runs the benchmark pattern (left, left, left, right, right, left,
    /// right, right) with every one-bit shift unrolled by hand. The five words
    /// are copied into locals first and written back once at the end, so the
    /// loop body itself contains no array access.
    /// </summary>
    /// <param name="iterations">
    /// Number of times the eight-shift pattern is applied. Defaults to the
    /// original hard-coded count of 100000000.
    /// </param>
    public void StuffUnrolledNonManaged(int iterations = 100000000)
    {
        ulong u0;
        ulong u1;
        ulong u2;
        ulong u3;
        ulong u4;
        fixed (ulong* p = Cells)
        {
            u0 = p[0];
            u1 = p[1];
            u2 = p[2];
            u3 = p[3];
            u4 = p[4];
        }
        ulong carry = 0;
        ulong nextCarry = 0;
        for (int i = 0; i < iterations; i++)
        {
            // In each left shift the top bit of u4 is discarded; in each right
            // shift the bottom bit of u0 is discarded.

            //left
            carry = 0;
            nextCarry = u0 >> 63;
            u0 = u0 << 1 | carry;
            carry = nextCarry;
            nextCarry = u1 >> 63;
            u1 = u1 << 1 | carry;
            carry = nextCarry;
            nextCarry = u2 >> 63;
            u2 = u2 << 1 | carry;
            carry = nextCarry;
            nextCarry = u3 >> 63;
            u3 = u3 << 1 | carry;
            carry = nextCarry;
            u4 = u4 << 1 | carry;
            //left
            carry = 0;
            nextCarry = u0 >> 63;
            u0 = u0 << 1 | carry;
            carry = nextCarry;
            nextCarry = u1 >> 63;
            u1 = u1 << 1 | carry;
            carry = nextCarry;
            nextCarry = u2 >> 63;
            u2 = u2 << 1 | carry;
            carry = nextCarry;
            nextCarry = u3 >> 63;
            u3 = u3 << 1 | carry;
            carry = nextCarry;
            u4 = u4 << 1 | carry;
            //left
            carry = 0;
            nextCarry = u0 >> 63;
            u0 = u0 << 1 | carry;
            carry = nextCarry;
            nextCarry = u1 >> 63;
            u1 = u1 << 1 | carry;
            carry = nextCarry;
            nextCarry = u2 >> 63;
            u2 = u2 << 1 | carry;
            carry = nextCarry;
            nextCarry = u3 >> 63;
            u3 = u3 << 1 | carry;
            carry = nextCarry;
            u4 = u4 << 1 | carry;
            //right
            carry = 0;
            nextCarry = u4 << 63;
            u4 = u4 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u3 << 63;
            u3 = u3 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u2 << 63;
            u2 = u2 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u1 << 63;
            u1 = u1 >> 1 | carry;
            carry = nextCarry;
            u0 = u0 >> 1 | carry;
            //right
            carry = 0;
            nextCarry = u4 << 63;
            u4 = u4 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u3 << 63;
            u3 = u3 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u2 << 63;
            u2 = u2 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u1 << 63;
            u1 = u1 >> 1 | carry;
            carry = nextCarry;
            u0 = u0 >> 1 | carry;
            //left
            carry = 0;
            nextCarry = u0 >> 63;
            u0 = u0 << 1 | carry;
            carry = nextCarry;
            nextCarry = u1 >> 63;
            u1 = u1 << 1 | carry;
            carry = nextCarry;
            nextCarry = u2 >> 63;
            u2 = u2 << 1 | carry;
            carry = nextCarry;
            nextCarry = u3 >> 63;
            u3 = u3 << 1 | carry;
            carry = nextCarry;
            u4 = u4 << 1 | carry;
            //right
            carry = 0;
            nextCarry = u4 << 63;
            u4 = u4 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u3 << 63;
            u3 = u3 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u2 << 63;
            u2 = u2 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u1 << 63;
            u1 = u1 >> 1 | carry;
            carry = nextCarry;
            u0 = u0 >> 1 | carry;
            //right
            carry = 0;
            nextCarry = u4 << 63;
            u4 = u4 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u3 << 63;
            u3 = u3 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u2 << 63;
            u2 = u2 >> 1 | carry;
            carry = nextCarry;
            nextCarry = u1 << 63;
            u1 = u1 >> 1 | carry;
            carry = nextCarry;
            u0 = u0 >> 1 | carry;
        }
        // Write the locals back so the result is visible through Cells.
        fixed (ulong* p = Cells)
        {
            p[0] = u0;
            p[1] = u1;
            p[2] = u2;
            p[3] = u3;
            p[4] = u4;
        }
    }
}

测试代码

// Benchmark driver for the unrolled version: one untimed run, then one timed
// run whose elapsed milliseconds are printed.
static void Main(string[] args)
{
    var field = new BitField(0);

    // Untimed call to remove the compilation time from the measurement.
    field.StuffUnrolledNonManaged();

    var timer = Stopwatch.StartNew();
    field.StuffUnrolledNonManaged();
    timer.Stop();

    double elapsedMs = timer.Elapsed.TotalMilliseconds;
    Console.WriteLine($"Non managed access unrolled in: {elapsedMs.ToString()}ms");
}

此代码大约在1.1秒内完成

注意:仅靠fixed数组访问不足以匹配C++的性能。如果我们不使用局部变量(即把u0的每个实例都替换为p[0],依此类推),时间大约为3.6秒。

如果我们对问题中的代码只使用fixed访问(在循环中调用Left()和Right()函数),时间约为5.8秒。

部分差异可能是由于两个版本之间的代码差异——您在C++的Left中和C#的Right中都没有给nextCarry赋值,但这些可能只是示例中的拼写错误。

你可能想看看两者的反汇编来看看区别,但主要是因为C++编译器有更多的时间来优化代码。在这种情况下,它展开循环,内联所有函数调用(包括构造函数),并将Cells_l中的所有内容全部放入寄存器。所以最终是一个只使用寄存器的大循环,没有任何内存访问。

我还没有看过C#编译的输出,但我怀疑它是否能做到这一点。

此外,如评论中所述,将C#代码中的所有Cells.Length调用替换为5(就像C++代码中一样)。