Summing array of integers with SIMD v non-SIMD approach
Date Added (UTC):
14 Apr 2024 @ 23:52
Date Updated (UTC):14 Apr 2024 @ 23:52
.NET Version(s): Tag(s):
Added By:
.NET Developer and tech lead from Ireland!
Benchmark Results:
Benchmark Code:
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Columns;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Reports;
using System;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
[Config(typeof(Config))]
[HideColumns(Column.Job, Column.RatioSD, Column.AllocRatio)]
[MemoryDiagnoser]
[ReturnValueValidator(failOnError: true)]
public class SIMDBenchmark
{
private int[] data;
[Params(1024, 2048, 4096)] // Different sizes of the array
public int N;
[GlobalSetup]
public void Setup()
{
data = new int[N];
var random = new Random(42);
for (int i = 0; i < N; i++)
{
data[i] = random.Next(100);
}
}
[Benchmark(Baseline = true)]
public int SumNonSIMD()
{
int sum = 0;
for (int i = 0; i < data.Length; i++)
{
sum += data[i];
}
return sum;
}
[Benchmark]
public int SumSIMD()
{
Vector128<int> vsum = Vector128<int>.Zero;
int i = 0;
int limit = data.Length - data.Length % 4;
for (; i < limit; i += 4)
{
var v = Vector128.Create(data[i], data[i + 1], data[i + 2], data[i + 3]);
vsum = Sse2.Add(vsum, v);
}
int sum = vsum.ToScalar() + vsum.GetElement(1) + vsum.GetElement(2) + vsum.GetElement(3);
for (; i < data.Length; i++)
{
sum += data[i];
}
return sum;
}
private class Config : ManualConfig
{
public Config()
{
SummaryStyle =
SummaryStyle.Default.WithRatioStyle(RatioStyle.Trend);
}
}
}
Powered by SharpLab
// .NET 8
public int SumNonSIMD()
{
int num = 0;
int num2 = 0;
while (num2 < data.Length)
{
num += data[num2];
num2++;
}
return num;
}
// .NET 8
public int SumSIMD()
{
Vector128<int> vector = Vector128<int>.Zero;
int num = 0;
int num2 = data.Length - data.Length % 4;
while (num < num2)
{
Vector128<int> right = Vector128.Create(data[num], data[num + 1], data[num + 2], data[num + 3]);
vector = Sse2.Add(vector, right);
num += 4;
}
int num3 = Vector128.ToScalar(vector) + Vector128.GetElement(vector, 1) + Vector128.GetElement(vector, 2) + Vector128.GetElement(vector, 3);
while (num < data.Length)
{
num3 += data[num];
num++;
}
return num3;
}
Powered by SharpLab
// .NET 8
.method public hidebysig
instance int32 SumNonSIMD () cil managed
{
.custom instance void [BenchmarkDotNet.Annotations]BenchmarkDotNet.Attributes.BenchmarkAttribute::.ctor(int32, string) = (
01 00 2a 00 00 00 01 5f 01 00 54 02 08 42 61 73
65 6c 69 6e 65 01
)
// Method begins at RVA 0x2098
// Code size 34 (0x22)
.maxstack 3
.locals init (
[0] int32 sum,
[1] int32 i
)
// sequence point: (line 45, col 9) to (line 45, col 21) in _
IL_0000: ldc.i4.0
IL_0001: stloc.0
// sequence point: (line 46, col 14) to (line 46, col 23) in _
IL_0002: ldc.i4.0
IL_0003: stloc.1
// sequence point: hidden
IL_0004: br.s IL_0015
// loop start (head: IL_0015)
// sequence point: (line 48, col 13) to (line 48, col 28) in _
IL_0006: ldloc.0
IL_0007: ldarg.0
IL_0008: ldfld int32[] SIMDBenchmark::data
IL_000d: ldloc.1
IL_000e: ldelem.i4
IL_000f: add
IL_0010: stloc.0
// sequence point: (line 46, col 42) to (line 46, col 45) in _
IL_0011: ldloc.1
IL_0012: ldc.i4.1
IL_0013: add
IL_0014: stloc.1
// sequence point: (line 46, col 25) to (line 46, col 40) in _
IL_0015: ldloc.1
IL_0016: ldarg.0
IL_0017: ldfld int32[] SIMDBenchmark::data
IL_001c: ldlen
IL_001d: conv.i4
IL_001e: blt.s IL_0006
// end loop
// sequence point: (line 50, col 9) to (line 50, col 20) in _
IL_0020: ldloc.0
IL_0021: ret
}
// .NET 8
.method public hidebysig
instance int32 SumSIMD () cil managed
{
.custom instance void [BenchmarkDotNet.Annotations]BenchmarkDotNet.Attributes.BenchmarkAttribute::.ctor(int32, string) = (
01 00 35 00 00 00 01 5f 00 00
)
// Method begins at RVA 0x20c8
// Code size 153 (0x99)
.maxstack 6
.locals init (
[0] valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32> vsum,
[1] int32 i,
[2] int32 limit,
[3] int32 sum,
[4] valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32> v
)
// sequence point: (line 56, col 9) to (line 56, col 51) in _
IL_0000: call valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!0> valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32>::get_Zero()
IL_0005: stloc.0
// sequence point: (line 57, col 9) to (line 57, col 19) in _
IL_0006: ldc.i4.0
IL_0007: stloc.1
// sequence point: (line 58, col 9) to (line 58, col 51) in _
IL_0008: ldarg.0
IL_0009: ldfld int32[] SIMDBenchmark::data
IL_000e: ldlen
IL_000f: conv.i4
IL_0010: ldarg.0
IL_0011: ldfld int32[] SIMDBenchmark::data
IL_0016: ldlen
IL_0017: conv.i4
IL_0018: ldc.i4.4
IL_0019: rem
IL_001a: sub
IL_001b: stloc.2
// sequence point: hidden
IL_001c: br.s IL_0058
// loop start (head: IL_0058)
// sequence point: (line 62, col 13) to (line 62, col 86) in _
IL_001e: ldarg.0
IL_001f: ldfld int32[] SIMDBenchmark::data
IL_0024: ldloc.1
IL_0025: ldelem.i4
IL_0026: ldarg.0
IL_0027: ldfld int32[] SIMDBenchmark::data
IL_002c: ldloc.1
IL_002d: ldc.i4.1
IL_002e: add
IL_002f: ldelem.i4
IL_0030: ldarg.0
IL_0031: ldfld int32[] SIMDBenchmark::data
IL_0036: ldloc.1
IL_0037: ldc.i4.2
IL_0038: add
IL_0039: ldelem.i4
IL_003a: ldarg.0
IL_003b: ldfld int32[] SIMDBenchmark::data
IL_0040: ldloc.1
IL_0041: ldc.i4.3
IL_0042: add
IL_0043: ldelem.i4
IL_0044: call valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32> [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::Create(int32, int32, int32, int32)
IL_0049: stloc.s 4
// sequence point: (line 63, col 13) to (line 63, col 38) in _
IL_004b: ldloc.0
IL_004c: ldloc.s 4
IL_004e: call valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32> [System.Runtime.Intrinsics]System.Runtime.Intrinsics.X86.Sse2::Add(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32>, valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<int32>)
IL_0053: stloc.0
// sequence point: (line 60, col 27) to (line 60, col 33) in _
IL_0054: ldloc.1
IL_0055: ldc.i4.4
IL_0056: add
IL_0057: stloc.1
// sequence point: (line 60, col 16) to (line 60, col 25) in _
IL_0058: ldloc.1
IL_0059: ldloc.2
IL_005a: blt.s IL_001e
// end loop
// sequence point: (line 66, col 9) to (line 66, col 98) in _
IL_005c: ldloc.0
IL_005d: call !!0 [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::ToScalar<int32>(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!!0>)
IL_0062: ldloc.0
IL_0063: ldc.i4.1
IL_0064: call !!0 [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::GetElement<int32>(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!!0>, int32)
IL_0069: add
IL_006a: ldloc.0
IL_006b: ldc.i4.2
IL_006c: call !!0 [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::GetElement<int32>(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!!0>, int32)
IL_0071: add
IL_0072: ldloc.0
IL_0073: ldc.i4.3
IL_0074: call !!0 [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128::GetElement<int32>(valuetype [System.Runtime.Intrinsics]System.Runtime.Intrinsics.Vector128`1<!!0>, int32)
IL_0079: add
IL_007a: stloc.3
// sequence point: hidden
IL_007b: br.s IL_008c
// loop start (head: IL_008c)
// sequence point: (line 69, col 13) to (line 69, col 28) in _
IL_007d: ldloc.3
IL_007e: ldarg.0
IL_007f: ldfld int32[] SIMDBenchmark::data
IL_0084: ldloc.1
IL_0085: ldelem.i4
IL_0086: add
IL_0087: stloc.3
// sequence point: (line 67, col 33) to (line 67, col 36) in _
IL_0088: ldloc.1
IL_0089: ldc.i4.1
IL_008a: add
IL_008b: stloc.1
// sequence point: (line 67, col 16) to (line 67, col 31) in _
IL_008c: ldloc.1
IL_008d: ldarg.0
IL_008e: ldfld int32[] SIMDBenchmark::data
IL_0093: ldlen
IL_0094: conv.i4
IL_0095: blt.s IL_007d
// end loop
// sequence point: (line 72, col 9) to (line 72, col 20) in _
IL_0097: ldloc.3
IL_0098: ret
}
Powered by SharpLab
|
|
Benchmark Description:
This benchmark uses SSE2 intrinsics to sum the array in chunks of four integers at a time, which should be faster on systems where SIMD is supported.
The provided benchmark code is designed to compare the performance of summing elements in an integer array using two different methods: a traditional loop (non-SIMD) and a SIMD (Single Instruction, Multiple Data) approach. It uses the BenchmarkDotNet library, a popular .NET benchmarking tool. The benchmark is configured to test different sizes of the array to see how each method scales with data size. The setup and rationale behind each method are as follows:
### General Setup
- **.NET Version**: The code does not explicitly mention the .NET version, but it uses features (e.g., `System.Runtime.Intrinsics`) that are available in .NET Core 2.1 and later versions, including .NET 5 and .NET 6.
- **Configuration**: The benchmark class uses a custom configuration defined in the `Config` class, which hides certain columns in the output and modifies the summary style to focus on trend ratios.
- **MemoryDiagnoser**: Enabled to report memory allocation statistics, which is crucial for understanding the memory efficiency of each method.
- **ReturnValueValidator**: Ensures that the methods return the correct value, adding a layer of correctness verification to the performance testing.
### Benchmark Methods
#### 1. `SumNonSIMD`
- **Purpose**: This method sums the elements of an integer array using a straightforward loop, which represents a common non-SIMD approach.
- **Performance Aspect**: It measures the baseline performance of array summation without any hardware acceleration or parallel processing techniques. This method is expected to be slower on large datasets compared to SIMD, as it processes one element at a time.
- **Importance**: Serving as the baseline, it allows us to quantify the performance gains achieved by SIMD operations.
#### 2. `SumSIMD`
- **Purpose**: This method demonstrates how to use SIMD operations to sum an integer array. SIMD allows processing multiple data points in a single instruction, potentially offering significant performance improvements for operations that can be parallelized.
- **Performance Aspect**: It specifically tests the performance benefits of using SIMD instructions (via `System.Runtime.Intrinsics`) for arithmetic operations on arrays. The method is designed to leverage CPU vector instructions to sum blocks of four integers at a time.
- **Importance**: Understanding the performance gains from SIMD is crucial for optimizing compute-bound applications, especially those dealing with large datasets or requiring real-time processing. SIMD can offer substantial speedups by fully utilizing modern CPU capabilities.
### Expected Insights
Running these benchmarks should provide insights into:
- **Performance Gains**: The difference in execution time between the non-SIMD and SIMD methods, highlighting the potential speedup from using SIMD instructions.
- **Scalability**: How each method's performance scales with the size of the input array. SIMD operations might show more significant benefits as the data size increases.
- **Memory Usage**: Although both methods are expected to have similar memory footprints, the benchmark's memory diagnostics will confirm if SIMD operations introduce any additional memory overhead.
In summary, these benchmarks are designed to illustrate the benefits of SIMD for parallelizable operations, offering a clear comparison against traditional loop-based methods in terms of speed and efficiency.