Summing array of integers with SIMD v non-SIMD approach

Date Added (UTC):

14 Apr 2024 @ 23:52

Date Updated (UTC):

14 Apr 2024 @ 23:52

.NET Version(s):

.NET 8

Tag(s):

#SIMD

Added By:

Dave Callan

Dave_DotNet

Blog

Ireland

.NET Developer and tech lead from Ireland!

Benchmark Results:

Benchmark Code:

View in Carbon

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Columns;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Reports;
using System;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;

[Config(typeof(Config))]
[HideColumns(Column.Job, Column.RatioSD, Column.AllocRatio)]
[MemoryDiagnoser]
[ReturnValueValidator(failOnError: true)]
public class SIMDBenchmark
{
    private int[] data;

    [Params(1024, 2048, 4096)] // Different sizes of the array
    public int N;

    [GlobalSetup]
    public void Setup()
    {
        data = new int[N];
        var random = new Random(42);
        for (int i = 0; i < N; i++)
        {
            data[i] = random.Next(100);
        }
    }

    [Benchmark(Baseline = true)]
    public int SumNonSIMD()
    {
        int sum = 0;
        for (int i = 0; i < data.Length; i++)
        {
            sum += data[i];
        }
        return sum;
    }

    [Benchmark]
    public int SumSIMD()
    {
        Vector128<int> vsum = Vector128<int>.Zero;
        int i = 0;
        int limit = data.Length - data.Length % 4;

        for (; i < limit; i += 4)
        {
            var v = Vector128.Create(data[i], data[i + 1], data[i + 2], data[i + 3]);
            vsum = Sse2.Add(vsum, v);
        }

        int sum = vsum.ToScalar() + vsum.GetElement(1) + vsum.GetElement(2) + vsum.GetElement(3);
        for (; i < data.Length; i++)
        {
            sum += data[i];
        }

        return sum;
    }

    private class Config : ManualConfig
    {
        public Config()
        {
            SummaryStyle =
                SummaryStyle.Default.WithRatioStyle(RatioStyle.Trend);
        }
    }
}

// .NET 8
public int SumNonSIMD()
{
    int num = 0;
    int num2 = 0;
    while (num2 < data.Length)
    {
        num += data[num2];
        num2++;
    }
    return num;
}

// .NET 8
public int SumSIMD()
{
    Vector128<int> vector = Vector128<int>.Zero;
    int num = 0;
    int num2 = data.Length - data.Length % 4;
    while (num < num2)
    {
        Vector128<int> right = Vector128.Create(data[num], data[num + 1], data[num + 2], data[num + 3]);
        vector = Sse2.Add(vector, right);
        num += 4;
    }
    int num3 = Vector128.ToScalar(vector) + Vector128.GetElement(vector, 1) + Vector128.GetElement(vector, 2) + Vector128.GetElement(vector, 3);
    while (num < data.Length)
    {
        num3 += data[num];
        num++;
    }
    return num3;
}

// .NET 8
.method public hidebysig 
    instance int32 SumNonSIMD () cil managed 
{
    .custom instance void [BenchmarkDotNet.Annotations]BenchmarkDotNet.Attributes.BenchmarkAttribute::.ctor(int32, string) = (
        01 00 2a 00 00 00 01 5f 01 00 54 02 08 42 61 73
        65 6c 69 6e 65 01
    )
    // Method begins at RVA 0x2098
    // Code size 34 (0x22)
    .maxstack 3
    .locals init (
        [0] int32 sum,
        [1] int32 i
    )

    // sequence point: (line 45, col 9) to (line 45, col 21) in _
    IL_0000: ldc.i4.0
    IL_0001: stloc.0
    // sequence point: (line 46, col 14) to (line 46, col 23) in _
    IL_0002: ldc.i4.0
    IL_0003: stloc.1
    // sequence point: hidden
    IL_0004: br.s IL_0015
    // loop start (head: IL_0015)
        // sequence point: (line 48, col 13) to (line 48, col 28) in _
        IL_0006: ldloc.0
        IL_0007: ldarg.0
        IL_0008: ldfld int32[] SIMDBenchmark::data
        IL_000d: ldloc.1
        IL_000e: ldelem.i4
        IL_000f: add
        IL_0010: stloc.0
        // sequence point: (line 46, col 42) to (line 46, col 45) in _
        IL_0011: ldloc.1
        IL_0012: ldc.i4.1
        IL_0013: add
        IL_0014: stloc.1

        // sequence point: (line 46, col 25) to (line 46, col 40) in _
        IL_0015: ldloc.1
        IL_0016: ldarg.0
        IL_0017: ldfld int32[] SIMDBenchmark::data
        IL_001c: ldlen
        IL_001d: conv.i4
        IL_001e: blt.s IL_0006
    // end loop

    // sequence point: (line 50, col 9) to (line 50, col 20) in _
    IL_0020: ldloc.0
    IL_0021: ret
}

// .NET 8
.method public hidebysig 
    instance int32 SumSIMD () cil managed 
{
    .custom instance void [BenchmarkDotNet.Annotations]BenchmarkDotNet.Attributes.BenchmarkAttribute::.ctor(int32, string) = (
        01 00 35 00 00 00 01 5f 00 00
    )
    // Method begins at RVA 0x20c8
    // Code size 153 (0x99)
    .maxstack 6
    .locals init (
        [0] valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32> vsum,
        [1] int32 i,
        [2] int32 limit,
        [3] int32 sum,
        [4] valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32> v
    )

    // sequence point: (line 56, col 9) to (line 56, col 51) in _
    IL_0000: call valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!0> valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32>::get_Zero()
    IL_0005: stloc.0
    // sequence point: (line 57, col 9) to (line 57, col 19) in _
    IL_0006: ldc.i4.0
    IL_0007: stloc.1
    // sequence point: (line 58, col 9) to (line 58, col 51) in _
    IL_0008: ldarg.0
    IL_0009: ldfld int32[] SIMDBenchmark::data
    IL_000e: ldlen
    IL_000f: conv.i4
    IL_0010: ldarg.0
    IL_0011: ldfld int32[] SIMDBenchmark::data
    IL_0016: ldlen
    IL_0017: conv.i4
    IL_0018: ldc.i4.4
    IL_0019: rem
    IL_001a: sub
    IL_001b: stloc.2
    // sequence point: hidden
    IL_001c: br.s IL_0058
    // loop start (head: IL_0058)
        // sequence point: (line 62, col 13) to (line 62, col 86) in _
        IL_001e: ldarg.0
        IL_001f: ldfld int32[] SIMDBenchmark::data
        IL_0024: ldloc.1
        IL_0025: ldelem.i4
        IL_0026: ldarg.0
        IL_0027: ldfld int32[] SIMDBenchmark::data
        IL_002c: ldloc.1
        IL_002d: ldc.i4.1
        IL_002e: add
        IL_002f: ldelem.i4
        IL_0030: ldarg.0
        IL_0031: ldfld int32[] SIMDBenchmark::data
        IL_0036: ldloc.1
        IL_0037: ldc.i4.2
        IL_0038: add
        IL_0039: ldelem.i4
        IL_003a: ldarg.0
        IL_003b: ldfld int32[] SIMDBenchmark::data
        IL_0040: ldloc.1
        IL_0041: ldc.i4.3
        IL_0042: add
        IL_0043: ldelem.i4
        IL_0044: call valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32> [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::Create(int32, int32, int32, int32)
        IL_0049: stloc.s 4
        // sequence point: (line 63, col 13) to (line 63, col 38) in _
        IL_004b: ldloc.0
        IL_004c: ldloc.s 4
        IL_004e: call valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32> [System.Runtime.Intrinsics]System.Runtime.Intrinsics.X86.Sse2::Add(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32>, valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32>)
        IL_0053: stloc.0
        // sequence point: (line 60, col 27) to (line 60, col 33) in _
        IL_0054: ldloc.1
        IL_0055: ldc.i4.4
        IL_0056: add
        IL_0057: stloc.1

        // sequence point: (line 60, col 16) to (line 60, col 25) in _
        IL_0058: ldloc.1
        IL_0059: ldloc.2
        IL_005a: blt.s IL_001e
    // end loop

    // sequence point: (line 66, col 9) to (line 66, col 98) in _
    IL_005c: ldloc.0
    IL_005d: call !!0 [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::ToScalar<int32>(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!!0>)
    IL_0062: ldloc.0
    IL_0063: ldc.i4.1
    IL_0064: call !!0 [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::GetElement<int32>(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!!0>, int32)
    IL_0069: add
    IL_006a: ldloc.0
    IL_006b: ldc.i4.2
    IL_006c: call !!0 [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::GetElement<int32>(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!!0>, int32)
    IL_0071: add
    IL_0072: ldloc.0
    IL_0073: ldc.i4.3
    IL_0074: call !!0 [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::GetElement<int32>(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!!0>, int32)
    IL_0079: add
    IL_007a: stloc.3
    // sequence point: hidden
    IL_007b: br.s IL_008c
    // loop start (head: IL_008c)
        // sequence point: (line 69, col 13) to (line 69, col 28) in _
        IL_007d: ldloc.3
        IL_007e: ldarg.0
        IL_007f: ldfld int32[] SIMDBenchmark::data
        IL_0084: ldloc.1
        IL_0085: ldelem.i4
        IL_0086: add
        IL_0087: stloc.3
        // sequence point: (line 67, col 33) to (line 67, col 36) in _
        IL_0088: ldloc.1
        IL_0089: ldc.i4.1
        IL_008a: add
        IL_008b: stloc.1

        // sequence point: (line 67, col 16) to (line 67, col 31) in _
        IL_008c: ldloc.1
        IL_008d: ldarg.0
        IL_008e: ldfld int32[] SIMDBenchmark::data
        IL_0093: ldlen
        IL_0094: conv.i4
        IL_0095: blt.s IL_007d
    // end loop

    // sequence point: (line 72, col 9) to (line 72, col 20) in _
    IL_0097: ldloc.3
    IL_0098: ret
}

// .NET 8 (X64)
SumNonSIMD()
    L0000: sub rsp, 0x28
    L0004: xor eax, eax
    L0006: xor edx, edx
    L0008: mov rcx, [rcx+8]
    L000c: cmp dword ptr [rcx+8], 0
    L0010: jle short L0038
    L0012: nop [rax]
    L0019: nop [rax]
    L0020: mov r8, rcx
    L0023: cmp edx, [r8+8]
    L0027: jae short L003d
    L0029: mov r10d, edx
    L002c: add eax, [r8+r10*4+0x10]
    L0031: inc edx
    L0033: cmp [rcx+8], edx
    L0036: jg short L0020
    L0038: add rsp, 0x28
    L003c: ret
    L003d: call 0x00007ffe687b0da0
    L0042: int3

// .NET 8 (X64)
SumSIMD()
    L0000: sub rsp, 0x28
    L0004: vzeroupper
    L0007: vxorps xmm0, xmm0, xmm0
    L000b: xor eax, eax
    L000d: mov rcx, [rcx+8]
    L0011: mov edx, [rcx+8]
    L0014: mov r8d, [rcx+8]
    L0018: and r8d, 3
    L001c: sub edx, r8d
    L001f: test edx, edx
    L0021: jle short L008f
    L0023: mov r8, rcx
    L0026: mov r10d, [r8+8]
    L002a: cmp eax, r10d
    L002d: jae L00e3
    L0033: mov r9d, eax
    L0036: vmovd xmm1, [r8+r9*4+0x10]
    L003d: mov r8, rcx
    L0040: lea r9d, [rax+1]
    L0044: cmp r9d, r10d
    L0047: jae L00e3
    L004d: vpinsrd xmm1, xmm1, [r8+r9*4+0x10], 1
    L0055: mov r8, rcx
    L0058: lea r9d, [rax+2]
    L005c: cmp r9d, r10d
    L005f: jae L00e3
    L0065: vpinsrd xmm1, xmm1, [r8+r9*4+0x10], 2
    L006d: mov r8, rcx
    L0070: lea r9d, [rax+3]
    L0074: cmp r9d, r10d
    L0077: jae short L00e3
    L0079: mov r10d, r9d
    L007c: vpinsrd xmm1, xmm1, [r8+r10*4+0x10], 3
    L0084: vpaddd xmm0, xmm0, xmm1
    L0088: add eax, 4
    L008b: cmp eax, edx
    L008d: jl short L0023
    L008f: vmovd edx, xmm0
    L0093: vpextrd r8d, xmm0, 1
    L0099: add edx, r8d
    L009c: vpextrd r8d, xmm0, 2
    L00a2: add edx, r8d
    L00a5: vpextrd r8d, xmm0, 3
    L00ab: add r8d, edx
    L00ae: cmp [rcx+8], eax
    L00b1: jle short L00db
    L00b3: nop [rax+rax]
    L00b8: nop [rax+rax]
    L00c0: mov rdx, rcx
    L00c3: mov r10d, [rdx+8]
    L00c7: cmp eax, r10d
    L00ca: jae short L00e3
    L00cc: mov r10d, eax
    L00cf: add r8d, [rdx+r10*4+0x10]
    L00d4: inc eax
    L00d6: cmp [rcx+8], eax
    L00d9: jg short L00c0
    L00db: mov eax, r8d
    L00de: add rsp, 0x28
    L00e2: ret
    L00e3: call 0x00007ffe687b0da0
    L00e8: int3

Benchmark Description:

This benchmark uses SSE2 intrinsics to sum the array in chunks of four integers at a time, which should be faster on systems where SIMD is supported.

The provided benchmark code is designed to compare the performance of summing elements in an integer array using two different methods: a traditional loop (non-SIMD) and a SIMD (Single Instruction, Multiple Data) approach. It uses the BenchmarkDotNet library, a popular .NET benchmarking tool. The benchmark is configured to test different sizes of the array to see how each method scales with data size. The setup and rationale behind each method are as follows:

### General Setup

- **.NET Version**: The code does not explicitly mention the .NET version, but it uses features (e.g., `System.Runtime.Intrinsics`) that are available in .NET Core 2.1 and later versions, including .NET 5 and .NET 6.
- **Configuration**: The benchmark class uses a custom configuration defined in the `Config` class, which hides certain columns in the output and modifies the summary style to focus on trend ratios.
- **MemoryDiagnoser**: Enabled to report memory allocation statistics, which is crucial for understanding the memory efficiency of each method.
- **ReturnValueValidator**: Ensures that the methods return the correct value, adding a layer of correctness verification to the performance testing.

### Benchmark Methods

#### 1. `SumNonSIMD`

- **Purpose**: This method sums the elements of an integer array using a straightforward loop, which represents a common non-SIMD approach.
- **Performance Aspect**: It measures the baseline performance of array summation without any hardware acceleration or parallel processing techniques. This method is expected to be slower on large datasets compared to SIMD, as it processes one element at a time.
- **Importance**: Serving as the baseline, it allows us to quantify the performance gains achieved by SIMD operations.

#### 2. `SumSIMD`

- **Purpose**: This method demonstrates how to use SIMD operations to sum an integer array. SIMD allows processing multiple data points in a single instruction, potentially offering significant performance improvements for operations that can be parallelized.
- **Performance Aspect**: It specifically tests the performance benefits of using SIMD instructions (via `System.Runtime.Intrinsics`) for arithmetic operations on arrays. The method is designed to leverage CPU vector instructions to sum blocks of four integers at a time.
- **Importance**: Understanding the performance gains from SIMD is crucial for optimizing compute-bound applications, especially those dealing with large datasets or requiring real-time processing. SIMD can offer substantial speedups by fully utilizing modern CPU capabilities.

### Expected Insights

Running these benchmarks should provide insights into:

- **Performance Gains**: The difference in execution time between the non-SIMD and SIMD methods, highlighting the potential speedup from using SIMD instructions.
- **Scalability**: How each method's performance scales with the size of the input array. SIMD operations might show more significant benefits as the data size increases.
- **Memory Usage**: Although both methods are expected to have similar memory footprints, the benchmark's memory diagnostics will confirm if SIMD operations introduce any additional memory overhead.

In summary, these benchmarks are designed to illustrate the benefits of SIMD for parallelizable operations, offering a clear comparison against traditional loop-based methods in terms of speed and efficiency.

Summing array of integers with SIMD v non-SIMD approach

Benchmark Results:

Benchmark Code:

Original

Lowered C# Code

IL Code

JIT Asm Code

Benchmark Description:

User Added:

AI Added:

Benchmark Comments: