├── Core3Intrinsics ├── Core3Intrinsics.csproj ├── Program.cs ├── Validator.cs ├── Transpose.cs ├── Intro.cs └── Mandelbrot.cs ├── Core3IntrinsicsBenchmarks ├── Program.cs ├── Core3IntrinsicsBenchmarks.csproj ├── AlignedMemoryHandle.cs ├── ReadmeBenches.cs ├── TrigonometricOps.cs ├── AlignedArrayPool.cs ├── Mandelbrot.cs ├── MemoryBenches.cs ├── IntegerBasicOps.cs └── BasicOps.cs ├── LICENSE ├── Core3Intrinsics.sln ├── .gitattributes ├── ExtraFiles ├── MemoryBenches2.md ├── MemoryBenches-Aligned.md └── MemoryBenches-1.md ├── .gitignore └── Readme.md /Core3Intrinsics/Core3Intrinsics.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp3.0 6 | 8.0 7 | 8 | 9 | 10 | x64 11 | true 12 | 13 | 14 | 15 | x64 16 | true 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using BenchmarkDotNet.Running; 3 | using BenchmarkDotNet.Configs; 4 | using System.Collections.Generic; 5 | 6 | namespace Core3IntrinsicsBenchmarks 7 | { 8 | class Program 9 | { 10 | static void Main() 11 | { 12 | //var summary = BenchmarkRunner.Run(); 13 | //_ = BenchmarkRunner.Run(); 14 | //var summary = BenchmarkRunner.Run(); 15 | //var summary = BenchmarkRunner.Run(); 16 | var summary = BenchmarkRunner.Run(); 17 | //var summary = BenchmarkRunner.Run(); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/Core3IntrinsicsBenchmarks.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp3.0 6 | 7.3 7 | 8 | 9 | 10 | x64 11 | true 12 | pdbonly 13 | true 14 | 15 | 16 | 17 | x64 18 | true 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 C. B. Gonzalez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
// [listing residue, kept verbatim: last (blank) line of /LICENSE] 22 |
// -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/AlignedMemoryHandle.cs: --------------------------------------------------------------------------------
using System;
using System.Buffers;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;

namespace Core3IntrinsicsBenchmarks
{
    /// <summary>
    /// Describes a caller-pinned block of bytes and exposes it as raw bytes or as
    /// elements of <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">Element type the bytes are reinterpreted as.</typeparam>
    public unsafe class AlignedMemoryHandle<T> where T : struct
    {
        private MemoryHandle memoryHandle;
        readonly byte* bytePointer;
        readonly int byteArrayLength;
        readonly Memory<T> memory;

        /// <summary>The <see cref="System.Buffers.MemoryHandle"/> built from the pointer and GC handle given to the constructor.</summary>
        public MemoryHandle MemoryHandle => memoryHandle;

        /// <summary>Reference to the first byte at the stored pointer.</summary>
        public ref byte ByteRef => ref GetByteRef();

        /// <summary>
        /// Reference to the first <typeparamref name="T"/> at the stored pointer.
        /// Reads and writes go to the underlying buffer itself.
        /// </summary>
        public ref T TRef => ref GetTRef();

        /// <summary>
        /// A managed copy of the buffer contents taken when the handle was constructed.
        /// It does not alias the pinned buffer, so later writes to the buffer are not visible here.
        /// </summary>
        public Memory<T> Memory => memory;

        /// <summary>Length in bytes that was passed to the constructor.</summary>
        public int ByteArrayLength => byteArrayLength;

        /// <summary>Creates a handle over <paramref name="byteLength"/> bytes starting at <paramref name="pointer"/>.</summary>
        /// <param name="pointer">Start of the buffer; must not be null.</param>
        /// <param name="handle">GC handle stored in <see cref="MemoryHandle"/>.</param>
        /// <param name="arrayStart">Not used by this type; kept so existing callers compile.</param>
        /// <param name="byteLength">Number of bytes available at <paramref name="pointer"/>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="pointer"/> is null.</exception>
        public unsafe AlignedMemoryHandle(void* pointer, GCHandle handle, ref T arrayStart, int byteLength)
        {
            if (pointer == null)
            {
                throw new ArgumentNullException(nameof(pointer));
            }

            memoryHandle = new MemoryHandle(pointer, handle);
            bytePointer = (byte*)pointer;
            byteArrayLength = byteLength;
            // Snapshot only: ToArray() copies the current contents into a new managed array.
            memory = new Memory<T>(MemoryMarshal.Cast<byte, T>(new Span<byte>(pointer, byteLength)).ToArray());
        }

        private unsafe ref byte GetByteRef()
        {
            return ref bytePointer[0];
        }

        private unsafe ref T GetTRef()
        {
            // Index the reinterpreted span directly. The previous ToArray() call returned a
            // reference into a throw-away copy, so writes through TRef were silently lost.
            Span<T> typed = MemoryMarshal.Cast<byte, T>(new Span<byte>(bytePointer, byteArrayLength));
            return ref typed[0];
        }
    }
}

// -------------------------------------------------------------------------------- /Core3Intrinsics/Program.cs: --------------------------------------------------------------------------------
namespace Core3Intrinsics
{
    // Usings sit inside the namespace so this merged listing remains one valid compilation unit.
    using System;
    using System.Runtime.InteropServices;

    class Program
    {
        /// <summary>
        /// Runs the scalar and Vector256 Mandelbrot routines, compares their outputs with
        /// <c>Validator.CompareValuesFloat</c>, prints a summary and waits for Enter.
        /// </summary>
        static unsafe void Main()
        {
            Console.WriteLine("Starting test ...");
            Console.WriteLine("\tMandelBrot");
            var man = new Mandelbrot();
            man.FloatMandel();
            man.Vector256Mandel();
            (bool areEqual, System.Collections.Generic.List<int> errorList, int maxDifference) = Validator.CompareValuesFloat(man.results.Span.ToArray(), man.results2.Span.ToArray());
            Console.WriteLine($"\t\tMandelBrot successful: {areEqual}, Number of differences: {errorList.Count}, max. difference: {maxDifference}");
            Console.WriteLine($"\t\tDone with mandelbrot, total bytes: {man.SizeInBytes}");
            //Transpose.CreateArrays();
            //bool res1 = Transpose.SerializeColorsInt();
            //bool res2 = Transpose.SerializedColorsVector256();
            //if(res1 && res2)
            //{
            //    (bool areEqual, System.Collections.Generic.List<int> errorList) = Validator.CompareValues(Transpose.transposed1, Transpose.transposed2);
            //    Console.WriteLine($"Transpose ended with success {areEqual}, number of differences {errorList.Count}");
            //}
            //else
            //{
            //    Console.WriteLine($"Error running Transpose");
            //}
            _ = Console.ReadLine();
        }
    }
}

// -------------------------------------------------------------------------------- /Core3Intrinsics.sln: --------------------------------------------------------------------------------
// [listing residue, kept verbatim: first lines of the solution file, continued on the next listing line]
// 1 |
// 2 | Microsoft Visual Studio Solution File, Format Version 12.00
// 3 | # Visual Studio Version 16
// 4 | VisualStudioVersion = 16.0.29215.179
// 5 | MinimumVisualStudioVersion = 10.0.40219.1
// 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Core3Intrinsics", "Core3Intrinsics\Core3Intrinsics.csproj", "{8ABE3139-8924-46FE-B8D4-155FE20DD285}"
// 7 | EndProject
// 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Core3IntrinsicsBenchmarks", "Core3IntrinsicsBenchmarks\Core3IntrinsicsBenchmarks.csproj", "{FFEC9419-D276-46DB-8136-4642054E1C99}"
// 9 | EndProject
// 10 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{A3F9D91A-297A-40E2-9714-D009F3FB9CF0}"
// 11 | ProjectSection(SolutionItems) = preProject
// 12 | Readme.md = Readme.md
// 13 | EndProjectSection
14 | EndProject 15 | Global 16 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 17 | Debug|Any CPU = Debug|Any CPU 18 | Release|Any CPU = Release|Any CPU 19 | EndGlobalSection 20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 21 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 22 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Debug|Any CPU.Build.0 = Debug|Any CPU 23 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Release|Any CPU.ActiveCfg = Release|Any CPU 24 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Release|Any CPU.Build.0 = Release|Any CPU 25 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 26 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Debug|Any CPU.Build.0 = Debug|Any CPU 27 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Release|Any CPU.ActiveCfg = Release|Any CPU 28 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Release|Any CPU.Build.0 = Release|Any CPU 29 | EndGlobalSection 30 | GlobalSection(SolutionProperties) = preSolution 31 | HideSolutionNode = FALSE 32 | EndGlobalSection 33 | GlobalSection(ExtensibilityGlobals) = postSolution 34 | SolutionGuid = {0AA0631C-9878-463C-8661-45CA8F282505} 35 | EndGlobalSection 36 | EndGlobal 37 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 
11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
// [listing residue, kept verbatim: tail of /.gitattributes]
// 53 | ###############################################################################
// 54 | #*.doc diff=astextplain
// 55 | #*.DOC diff=astextplain
// 56 | #*.docx diff=astextplain
// 57 | #*.DOCX diff=astextplain
// 58 | #*.dot diff=astextplain
// 59 | #*.DOT diff=astextplain
// 60 | #*.pdf diff=astextplain
// 61 | #*.PDF diff=astextplain
// 62 | #*.rtf diff=astextplain
// 63 | #*.RTF diff=astextplain
// 64 |
// -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/ReadmeBenches.cs: --------------------------------------------------------------------------------
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Exporters;
using BenchmarkDotNet.Exporters.Csv;
using System;
using System.Buffers;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace Core3IntrinsicsBenchmarks
{
    /// <summary>
    /// Benchmarks two ways of taking the square root of every element of a float array
    /// with AVX: through <see cref="Span{T}"/> reinterpretation and through raw pointers.
    /// </summary>
    [DisassemblyDiagnoser(printAsm: true, printSource: true)]
    public class ReadmeBenches
    {
        [Params(4096/*, 1048576*/)]
        public int NumberOfFloats { get; set; }

        private static float[] inputData;

        /// <summary>Fills the input array with 1, 2, 3, ... <see cref="NumberOfFloats"/>.</summary>
        [GlobalSetup]
        public void GlobalSetup()
        {
            inputData = new float[NumberOfFloats];
            for (int i = 0; i < inputData.Length; i++)
            {
                inputData[i] = i + 1;
            }
        }

        /// <summary>
        /// Span-based variant. Element 0 of the returned array is overwritten with the first
        /// lane of a demonstration dot product, exactly as in the original code.
        /// </summary>
        /// <returns>A new array with the square roots (except element 0, see summary).</returns>
        [Benchmark(Baseline = true)]
        public float[] ProcessData()
        {
            var left = Vector256.Create(-2.5f); // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5>
            var right = Vector256.Create(5.0f); // <5, 5, 5, 5, 5, 5, 5, 5>
            Vector256<float> result = Avx.DotProduct(left, right, 0b1111_0001); // result = <-50, 0, 0, 0, -50, 0, 0, 0>
            float[] results = new float[inputData.Length];
            Span<Vector256<float>> resultVectors = MemoryMarshal.Cast<float, Vector256<float>>(results);

            ReadOnlySpan<Vector256<float>> inputVectors = MemoryMarshal.Cast<float, Vector256<float>>(inputData);

            for (int i = 0; i < inputVectors.Length; i++)
            {
                resultVectors[i] = Avx.Sqrt(inputVectors[i]);
            }

            // Cast drops a trailing partial vector; finish those elements one by one so
            // lengths that are not a multiple of 8 are still fully processed.
            for (int i = inputVectors.Length * Vector256<float>.Count; i < results.Length; i++)
            {
                results[i] = MathF.Sqrt(inputData[i]);
            }

            if (results.Length > 0)
            {
                results[0] = result.GetElement(0);
            }

            return results;
        }

        /// <summary>Pointer-based variant using unaligned AVX loads and stores.</summary>
        /// <returns>A new array with the square root of every input element.</returns>
        [Benchmark]
        public unsafe float[] ProcessDataUnsafe()
        {
            int length = inputData.Length;
            float[] results = new float[length];

            // Pinning the arrays themselves (rather than &array[0]) is also valid for empty arrays.
            fixed (float* inputPtr = inputData, resultPtr = results)
            {
                float* inCurrent = inputPtr;
                float* resCurrent = resultPtr;
                // Only whole vectors go through the 8-wide store; the old loop could write
                // past the end of the array when fewer than 8 elements were left.
                float* vectorEnd = resultPtr + (length - (length % Vector256<float>.Count));
                float* resEnd = resultPtr + length;

                while (resCurrent < vectorEnd)
                {
                    Avx.Store(resCurrent, Avx.Sqrt(Avx.LoadVector256(inCurrent)));
                    resCurrent += 8;
                    inCurrent += 8;
                }

                while (resCurrent < resEnd)
                {
                    *resCurrent = MathF.Sqrt(*inCurrent);
                    resCurrent++;
                    inCurrent++;
                }
            }

            return results;
        }
    }
}

// -------------------------------------------------------------------------------- /Core3Intrinsics/Validator.cs: --------------------------------------------------------------------------------
namespace Core3Intrinsics
{
    // Usings sit inside the namespace so this merged listing remains one valid compilation unit.
    using System;
    using System.Collections.Generic;
    using System.Text;

    /// <summary>Element-by-element comparison helpers for two arrays of equal length.</summary>
    public static class Validator
    {
        /// <summary>Compares two arrays with the default equality comparer of <typeparamref name="T"/>.</summary>
        /// <returns>Whether all elements are equal, and the indexes at which they differ.</returns>
        /// <exception cref="ArgumentNullException">Either array is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The arrays differ in length.</exception>
        public static (bool, List<int>) CompareValues<T>(T[] left, T[] right) where T : struct
        {
            EnsureComparable(left, right);

            var differIndexes = new List<int>();
            EqualityComparer<T> comparer = EqualityComparer<T>.Default;
            for (int i = 0; i < left.Length; i++)
            {
                if (!comparer.Equals(left[i], right[i]))
                {
                    differIndexes.Add(i);
                }
            }

            return (differIndexes.Count == 0, differIndexes);
        }

        /// <summary>Compares two float arrays with exact (<c>!=</c>) comparison.</summary>
        /// <returns>
        /// Whether all elements are equal, the differing indexes, and the largest absolute
        /// difference truncated to a whole number (differences below 1 report 0; values
        /// beyond the int range report <see cref="int.MaxValue"/>).
        /// </returns>
        /// <exception cref="ArgumentNullException">Either array is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The arrays differ in length.</exception>
        public static (bool, List<int>, int) CompareValuesFloat(float[] left, float[] right)
        {
            EnsureComparable(left, right);

            var differIndexes = new List<int>();
            double largest = 0;
            for (int i = 0; i < left.Length; i++)
            {
                if (left[i] != right[i])
                {
                    differIndexes.Add(i);
                    double magnitude = Math.Abs(left[i] - right[i]);
                    if (magnitude > largest)
                    {
                        largest = magnitude;
                    }
                }
            }

            return (differIndexes.Count == 0, differIndexes, ToWholeDifference(largest));
        }

        /// <summary>Compares two double arrays with exact (<c>!=</c>) comparison.</summary>
        /// <returns>
        /// Whether all elements are equal, the differing indexes, and the largest absolute
        /// difference truncated to a whole number (differences below 1 report 0; values
        /// beyond the int range report <see cref="int.MaxValue"/>).
        /// </returns>
        /// <exception cref="ArgumentNullException">Either array is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The arrays differ in length.</exception>
        public static (bool, List<int>, int) CompareValuesDouble(double[] left, double[] right)
        {
            EnsureComparable(left, right);

            var differIndexes = new List<int>();
            double largest = 0;
            for (int i = 0; i < left.Length; i++)
            {
                if (left[i] != right[i])
                {
                    differIndexes.Add(i);
                    double magnitude = Math.Abs(left[i] - right[i]);
                    if (magnitude > largest)
                    {
                        largest = magnitude;
                    }
                }
            }

            return (differIndexes.Count == 0, differIndexes, ToWholeDifference(largest));
        }

        // Shared argument validation for all comparison methods.
        private static void EnsureComparable(Array left, Array right)
        {
            if (left is null)
            {
                throw new ArgumentNullException(nameof(left));
            }

            if (right is null)
            {
                throw new ArgumentNullException(nameof(right));
            }

            if (left.Length != right.Length)
            {
                // The single-string constructor treats its argument as the parameter name,
                // so the message has to go in the second position.
                throw new ArgumentOutOfRangeException(nameof(right), $"Arrays not of the same length: {nameof(left)} {nameof(right)}.");
            }
        }

        // Truncates toward zero and clamps, because casting an out-of-range floating-point
        // value to int yields an unspecified result in an unchecked context.
        private static int ToWholeDifference(double value)
        {
            return value >= int.MaxValue ? int.MaxValue : (int)value;
        }
    }
}

// -------------------------------------------------------------------------------- /ExtraFiles/MemoryBenches2.md: --------------------------------------------------------------------------------
// [listing residue, kept verbatim: start of the markdown file, continued on the next listing line]
// 1 | ``` ini
// 2 |
// 3 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362
// 4 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores
// 5 | .NET Core SDK=3.0.100-preview9-014004
// 6 | [Host] : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
// 7 | DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
// 8 |
// 9 |
// 10 | ```
// 11 | | Method | numberOfBytes | Mean | Error | StdDev | Ratio | RatioSD |
// 12 | |-------------------------------- |-------------- |---------------:|--------------:|--------------:|------:|--------:|
// 13 | | **ScalarStoreBlock** | **16384** | **298.5 ns** | **5.924 ns** | **9.047 ns** | **1.00** | **0.00** |
// 14 | |
// [listing residue, kept verbatim: remainder of /ExtraFiles/MemoryBenches2.md, continuing row 14 from the previous listing line]
// VectorStoreArrayMemPtr | 16384 | 394.1 ns | 10.456 ns | 16.885 ns | 1.32 | 0.06 |
// 15 | | VectorStoreArrayMemPtrUnaligned | 16384 | 495.0 ns | 9.477 ns | 10.140 ns | 1.66 | 0.07 |
// 16 | | | | | | | | |
// 17 | | **ScalarStoreBlock** | **131072** | **6,225.2 ns** | **116.328 ns** | **103.122 ns** | **1.00** | **0.00** |
// 18 | | VectorStoreArrayMemPtr | 131072 | 6,772.1 ns | 77.929 ns | 65.074 ns | 1.09 | 0.02 |
// 19 | | VectorStoreArrayMemPtrUnaligned | 131072 | 7,245.7 ns | 130.736 ns | 115.894 ns | 1.16 | 0.03 |
// 20 | | | | | | | | |
// 21 | | **ScalarStoreBlock** | **1048576** | **67,515.4 ns** | **2,549.673 ns** | **2,618.326 ns** | **1.00** | **0.00** |
// 22 | | VectorStoreArrayMemPtr | 1048576 | 80,868.2 ns | 1,569.923 ns | 1,928.007 ns | 1.20 | 0.05 |
// 23 | | VectorStoreArrayMemPtrUnaligned | 1048576 | 83,708.5 ns | 1,995.286 ns | 2,134.934 ns | 1.24 | 0.05 |
// 24 | | | | | | | | |
// 25 | | **ScalarStoreBlock** | **2097152** | **189,619.0 ns** | **7,155.162 ns** | **21,097.157 ns** | **1.00** | **0.00** |
// 26 | | VectorStoreArrayMemPtr | 2097152 | 271,783.7 ns | 5,376.659 ns | 11,914.305 ns | 1.41 | 0.17 |
// 27 | | VectorStoreArrayMemPtrUnaligned | 2097152 | 274,970.6 ns | 5,310.311 ns | 5,453.298 ns | 1.44 | 0.15 |
// 28 | | | | | | | | |
// 29 | | **ScalarStoreBlock** | **8388608** | **1,105,687.5 ns** | **10,205.821 ns** | **8,522.323 ns** | **1.00** | **0.00** |
// 30 | | VectorStoreArrayMemPtr | 8388608 | 1,573,145.8 ns | 31,795.047 ns | 29,741.107 ns | 1.42 | 0.02 |
// 31 | | VectorStoreArrayMemPtrUnaligned | 8388608 | 1,568,842.2 ns | 28,942.750 ns | 27,073.066 ns | 1.42 | 0.03 |
// 32 |
// -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/TrigonometricOps.cs: --------------------------------------------------------------------------------
using System;
using System.Numerics;
using System.Runtime.Intrinsics;
using BenchmarkDotNet.Attributes;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics.X86;
using System.Buffers;

namespace Core3IntrinsicsBenchmarks
{
    /// <summary>
    /// Benchmarks scalar cosine over pooled buffers: <see cref="Math.Cos(double)"/> on floats
    /// (with a narrowing cast), <see cref="MathF.Cos(float)"/> on floats, and
    /// <see cref="Math.Cos(double)"/> on doubles.
    /// </summary>
    [DisassemblyDiagnoser(printAsm: true, printSource: true)]
    public class TrigonometricOps
    {
        const int l1CacheSize = 32 * 1024; // one L1 cache, 32 kB
        private int numberOfItems;
        public static int algn = 32;
        public AlignedArrayPool<float> floatPool;
        public AlignedArrayPool<double> doublePool;
        AlignedMemoryHandle<float> dataMemory, resultMemory;
        AlignedMemoryHandle<double> dataDoubleMemory, resultDoubleMemory;

        /// <summary>
        /// Rents one input and one result buffer per element type and initialises them:
        /// inputs become <c>i + 0.01</c>, results become zero.
        /// </summary>
        [GlobalSetup]
        public unsafe void GlobalSetup()
        {
            // 32768 / 8 / 2 - 8 = 2040 items; the same count is used for floats and doubles.
            numberOfItems = l1CacheSize / sizeof(double) / 2 - 8;
            floatPool = new AlignedArrayPool<float>();
            doublePool = new AlignedArrayPool<double>();
            dataMemory = floatPool.Rent(numberOfItems);
            resultMemory = floatPool.Rent(numberOfItems);
            dataDoubleMemory = doublePool.Rent(numberOfItems);
            resultDoubleMemory = doublePool.Rent(numberOfItems);
            Span<float> dataSpan = new Span<float>(dataMemory.MemoryHandle.Pointer, numberOfItems);
            Span<float> resultSpan = new Span<float>(resultMemory.MemoryHandle.Pointer, numberOfItems);
            Span<double> dataDoubleSpan = new Span<double>(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems);
            Span<double> resultDoubleSpan = new Span<double>(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems);

            for (int i = 0; i < numberOfItems; i++)
            {
                dataSpan[i] = i + 0.01f;
                resultSpan[i] = 0.0f;
                dataDoubleSpan[i] = i + 0.01;
                resultDoubleSpan[i] = 0.0;
            }
        }

        /// <summary>Returns all four buffers to their pools and disposes both pools.</summary>
        [GlobalCleanup]
        public void GlobalCleanup()
        {
            floatPool.Return(resultMemory, false);
            floatPool.Return(dataMemory, false);
            doublePool.Return(resultDoubleMemory, false);
            doublePool.Return(dataDoubleMemory, false);
            floatPool.Dispose();
            doublePool.Dispose();
        }

        /// <summary>Float input through <see cref="Math.Cos(double)"/>, narrowed back to float.</summary>
        [Benchmark]
        public unsafe void Cos()
        {
            ReadOnlySpan<float> sp1 = new ReadOnlySpan<float>(dataMemory.MemoryHandle.Pointer, numberOfItems);
            Span<float> sp2 = new Span<float>(resultMemory.MemoryHandle.Pointer, numberOfItems);

            for (int i = 0; i < sp1.Length; i++)
            {
                sp2[i] = (float)Math.Cos(sp1[i]);
            }
        }

        /// <summary>Float input through <see cref="MathF.Cos(float)"/>.</summary>
        [Benchmark]
        public unsafe void CosMathF()
        {
            ReadOnlySpan<float> sp1 = new ReadOnlySpan<float>(dataMemory.MemoryHandle.Pointer, numberOfItems);
            Span<float> sp2 = new Span<float>(resultMemory.MemoryHandle.Pointer, numberOfItems);

            for (int i = 0; i < sp1.Length; i++)
            {
                sp2[i] = MathF.Cos(sp1[i]);
            }
        }

        /// <summary>Double input through <see cref="Math.Cos(double)"/>.</summary>
        [Benchmark]
        public unsafe void CosDouble()
        {
            ReadOnlySpan<double> sp1 = new ReadOnlySpan<double>(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems);
            Span<double> sp2 = new Span<double>(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems);

            for (int i = 0; i < sp1.Length; i++)
            {
                sp2[i] = Math.Cos(sp1[i]);
            }
        }
    }
}

// -------------------------------------------------------------------------------- /ExtraFiles/MemoryBenches-Aligned.md: --------------------------------------------------------------------------------
// [listing residue, kept verbatim: start of the markdown file, row 15 continues on the next listing line]
// 1 | ``` ini
// 2 |
// 3 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362
// 4 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores
// 5 | .NET Core SDK=3.0.100-preview9-014004
// 6 | [Host] : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
// 7 | DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
// 8 |
// 9 |
// 10 | ```
// 11 | | Method | NumberOfBytes | Mean | Error | StdDev | Median | Ratio | RatioSD |
// 12 | |----------------------------- |-------------- |---------------:|--------------:|--------------:|---------------:|------:|--------:|
// 13 | | **VectorStoreAligned** | **16384** | **504.7 ns** | **7.635 ns** | **8.792 ns** | **504.6 ns** | **1.00** | **0.00** |
// 14 | | VectorStoreArrayMemPtr | 16384 | 385.1 ns | 6.161 ns | 4.810 ns | 383.8 ns | 0.76 | 0.01 |
// 15 | | VectorStoreArrayMemSafe | 16384 |
597.0 ns | 11.873 ns | 12.193 ns | 595.5 ns | 1.18 | 0.03 | 16 | | VectorStoreArraySimpleBuffer | 16384 | 640.5 ns | 22.126 ns | 18.476 ns | 636.5 ns | 1.27 | 0.05 | 17 | | | | | | | | | | 18 | | **VectorStoreAligned** | **131072** | **9,865.0 ns** | **199.512 ns** | **279.687 ns** | **9,767.2 ns** | **1.00** | **0.00** | 19 | | VectorStoreArrayMemPtr | 131072 | 9,637.7 ns | 94.004 ns | 83.332 ns | 9,645.3 ns | 0.97 | 0.03 | 20 | | VectorStoreArrayMemSafe | 131072 | 6,181.7 ns | 120.563 ns | 148.062 ns | 6,144.4 ns | 0.63 | 0.03 | 21 | | VectorStoreArraySimpleBuffer | 131072 | 9,925.4 ns | 260.502 ns | 230.929 ns | 9,855.4 ns | 1.00 | 0.03 | 22 | | | | | | | | | | 23 | | **VectorStoreAligned** | **1048576** | **79,435.3 ns** | **1,865.323 ns** | **2,220.535 ns** | **78,294.8 ns** | **1.00** | **0.00** | 24 | | VectorStoreArrayMemPtr | 1048576 | 98,353.8 ns | 2,720.589 ns | 2,271.815 ns | 97,951.3 ns | 1.24 | 0.03 | 25 | | VectorStoreArrayMemSafe | 1048576 | 79,803.5 ns | 1,712.943 ns | 3,000.081 ns | 78,598.9 ns | 1.01 | 0.06 | 26 | | VectorStoreArraySimpleBuffer | 1048576 | 79,867.6 ns | 2,257.561 ns | 2,318.349 ns | 79,063.7 ns | 1.00 | 0.05 | 27 | | | | | | | | | | 28 | | **VectorStoreAligned** | **2097152** | **216,500.1 ns** | **4,992.955 ns** | **14,164.183 ns** | **212,591.0 ns** | **1.00** | **0.00** | 29 | | VectorStoreArrayMemPtr | 2097152 | 346,242.9 ns | 6,797.722 ns | 9,304.799 ns | 341,851.5 ns | 1.58 | 0.12 | 30 | | VectorStoreArrayMemSafe | 2097152 | 205,378.0 ns | 3,818.530 ns | 3,188.646 ns | 205,488.9 ns | 0.93 | 0.07 | 31 | | VectorStoreArraySimpleBuffer | 2097152 | 228,231.7 ns | 4,517.376 ns | 10,736.022 ns | 225,121.4 ns | 1.06 | 0.09 | 32 | | | | | | | | | | 33 | | **VectorStoreAligned** | **8388608** | **1,503,050.0 ns** | **28,335.402 ns** | **27,829.153 ns** | **1,490,845.2 ns** | **1.00** | **0.00** | 34 | | VectorStoreArrayMemPtr | 8388608 | 1,506,756.1 ns | 19,681.599 ns | 17,447.225 ns | 1,503,300.3 ns | 1.00 | 0.02 | 35 | | 
// [listing residue, kept verbatim: remainder of /ExtraFiles/MemoryBenches-Aligned.md, continuing row 35 from the previous listing line]
// VectorStoreArrayMemSafe | 8388608 | 1,536,087.1 ns | 26,551.526 ns | 23,537.236 ns | 1,531,720.1 ns | 1.02 | 0.03 |
// 36 | | VectorStoreArraySimpleBuffer | 8388608 | 1,541,513.7 ns | 32,303.380 ns | 30,216.602 ns | 1,536,127.9 ns | 1.02 | 0.03 |
// 37 |
// -------------------------------------------------------------------------------- /Core3Intrinsics/Transpose.cs: --------------------------------------------------------------------------------
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace Core3Intrinsics
{
    /// <summary>
    /// Converts interleaved 4-component pixels into per-component runs, eight pixels at a
    /// time: each group of 32 ints <c>c0 c1 c2 c3 | c0 c1 c2 c3 | ...</c> becomes
    /// eight first components, then eight second components, and so on.
    /// A scalar and an AVX2 implementation write to separate output arrays so they can be compared.
    /// </summary>
    public static class Transpose
    {
        private const int defWidth = 1920, defHeight = 1080, numberOfElements = 8;
        private const int componentsPerPixel = 4;
        private const int intsPerGroup = numberOfElements * componentsPerPixel;
        private static int currWidth, currHeight;
        private static int[] original;
        public static int[] transposed1, transposed2;

        private static bool isInitialized = false;

        /// <summary>Scalar implementation; writes into <see cref="transposed1"/>.</summary>
        /// <returns>
        /// <c>false</c> when <see cref="CreateArrays"/> has not run or the pixel count is not a
        /// multiple of eight; otherwise <c>true</c>.
        /// </returns>
        public static bool SerializeColorsInt()
        {
            if (!isInitialized || !HasWholeGroups())
            {
                return false;
            }

            // Walk the buffer in flat groups of eight pixels. For widths that are a multiple
            // of eight this yields the same output as the former row-by-row loop, and it no
            // longer overruns a row buffer for other widths.
            for (int groupStart = 0; groupStart < original.Length; groupStart += intsPerGroup)
            {
                for (int pixel = 0; pixel < numberOfElements; pixel++)
                {
                    int source = groupStart + (pixel * componentsPerPixel);
                    int target = groupStart + pixel;
                    transposed1[target] = original[source];
                    transposed1[target + numberOfElements] = original[source + 1];
                    transposed1[target + (2 * numberOfElements)] = original[source + 2];
                    transposed1[target + (3 * numberOfElements)] = original[source + 3];
                }
            }

            return true;
        }

        /// <summary>AVX2 implementation; writes into <see cref="transposed2"/>.</summary>
        /// <returns>
        /// <c>false</c> when <see cref="CreateArrays"/> has not run, AVX2 is not available, or the
        /// pixel count is not a multiple of eight; otherwise <c>true</c>.
        /// </returns>
        public static bool SerializedColorsVector256()
        {
            if (!isInitialized || !Avx2.IsSupported || !HasWholeGroups())
            {
                return false;
            }

            Span<Vector256<int>> originVectors = MemoryMarshal.Cast<int, Vector256<int>>(original);
            Span<Vector256<int>> transposedVectors = MemoryMarshal.Cast<int, Vector256<int>>(transposed2);
            Vector256<int> pm0, pm1, pm2, pm3, up0, up1, up2, up3;
            for (int i = 0; i < originVectors.Length; i += 4)
            {
                // Each source vector holds two pixels. Regroup the 128-bit halves so that
                // pm0 = pixels 0,4; pm1 = pixels 2,6; pm2 = pixels 1,5; pm3 = pixels 3,7.
                pm0 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x20);
                pm1 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x20);
                pm2 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x31);
                pm3 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x31);

                // First interleave: components 0/1 of even and odd pixels (low), 2/3 (high).
                up0 = Avx2.UnpackLow(pm0, pm1);
                up1 = Avx2.UnpackHigh(pm0, pm1);
                up2 = Avx2.UnpackLow(pm2, pm3);
                up3 = Avx2.UnpackHigh(pm2, pm3);

                // Second interleave leaves one component of all eight pixels per vector.
                transposedVectors[i] = Avx2.UnpackLow(up0, up2);
                transposedVectors[i + 1] = Avx2.UnpackHigh(up0, up2);
                transposedVectors[i + 2] = Avx2.UnpackLow(up1, up3);
                transposedVectors[i + 3] = Avx2.UnpackHigh(up1, up3);
            }

            return true;
        }

        /// <summary>
        /// Allocates the source and both output arrays (four ints per pixel) and fills the
        /// source with 0, 1, 2, ...
        /// </summary>
        /// <param name="width">Image width in pixels.</param>
        /// <param name="height">Image height in pixels.</param>
        /// <exception cref="OverflowException">The total element count does not fit in an int.</exception>
        public static void CreateArrays(int width = defWidth, int height = defHeight)
        {
            // checked: fail loudly instead of silently wrapping to a smaller array size.
            int length = checked(componentsPerPixel * width * height);

            int[] source = new int[length];
            int[] scalarOutput = new int[length];
            int[] vectorOutput = new int[length];

            for (int i = 0; i < source.Length; i++)
            {
                source[i] = i;
            }

            // Publish only after every allocation succeeded so state is never half-updated.
            currWidth = width;
            currHeight = height;
            original = source;
            transposed1 = scalarOutput;
            transposed2 = vectorOutput;
            isInitialized = true;
        }

        // True when the buffer consists solely of complete eight-pixel groups.
        private static bool HasWholeGroups()
        {
            return original.Length % intsPerGroup == 0;
        }
    }
}

// -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/AlignedArrayPool.cs: --------------------------------------------------------------------------------
// [listing residue, kept verbatim: first lines of AlignedArrayPool.cs; the class continues on the next listing line]
// 1 | using System;
// 2 | using System.Buffers;
// 3 | using System.Collections.Generic;
// 4 | using System.Runtime.InteropServices;
// 5 | using System.Text;
// 6 |
// 7 | namespace Core3IntrinsicsBenchmarks
// 8 | {
// 9 | public class AlignedArrayPool : IDisposable where T : struct
// 10 | {
// 11 | private bool disposedValue = false; // To detect redundant calls
// 12 |
// 13 |
private static readonly object lockObject = new object(); 14 | private readonly ArrayPool pool = ArrayPool.Shared; 15 | private const int defaultByteAlignment = 32; 16 | 17 | 18 | private readonly int tSize, currentAlignment; 19 | private readonly List<(byte[], GCHandle, IntPtr, int)> allBuffers; 20 | private readonly List<(MemoryHandle, GCHandle, byte[])> allMemoryHandles; 21 | 22 | public AlignedArrayPool() 23 | { 24 | Type tp = typeof(T); 25 | tSize = Marshal.SizeOf(tp); 26 | if (!tp.IsValueType || tp.IsEnum) 27 | { 28 | throw new ArgumentException("Invalid type, must be numeric."); 29 | } 30 | currentAlignment = defaultByteAlignment; 31 | allMemoryHandles = new List<(MemoryHandle, GCHandle, byte[])>(); 32 | allBuffers = new List<(byte[], GCHandle,IntPtr, int)>(); 33 | } 34 | 35 | public unsafe AlignedMemoryHandle Rent(int minimumLength, int byteAlignment) 36 | { 37 | byte[] buff = pool.Rent(minimumLength * tSize + 2 * byteAlignment); // see comment below, could just be 1 * 38 | var handle = GCHandle.Alloc(buff, GCHandleType.Pinned); 39 | allBuffers.Add((buff, handle, IntPtr.Zero, 0)); 40 | MemoryHandle memHand; 41 | AlignedMemoryHandle alMemHand; 42 | int currIdx; 43 | lock (lockObject) 44 | { 45 | currIdx = allBuffers.Count - 1; 46 | IntPtr ptr = AlignBuffer(currIdx); 47 | T[] tBuff = MemoryMarshal.Cast(new Span(ptr.ToPointer(), minimumLength * tSize)).ToArray(); 48 | memHand = new MemoryHandle(ptr.ToPointer(), handle); 49 | alMemHand = new AlignedMemoryHandle(ptr.ToPointer(), handle, ref tBuff[0], minimumLength * tSize); 50 | allMemoryHandles.Add((memHand, handle, buff)); 51 | } 52 | return alMemHand; 53 | 54 | unsafe IntPtr AlignBuffer(int bufferIndex) 55 | { 56 | (byte[], GCHandle, IntPtr, int) currentBuff = allBuffers[bufferIndex]; 57 | allBuffers.RemoveAt(bufferIndex); 58 | long lPtr = currentBuff.Item2.AddrOfPinnedObject().ToInt64(); 59 | long lPtr2 = (lPtr + currentAlignment - 1) & ~(currentAlignment - 1); 60 | // For benchmarking purposes, we avoid 
chance 32 byte alignment 61 | if(lPtr2 % 32 == 0) 62 | { 63 | lPtr2 += byteAlignment; 64 | } 65 | currentBuff.Item4 = (int)(lPtr2 - lPtr); 66 | currentBuff.Item3 = new IntPtr(lPtr2); 67 | allBuffers.Add(currentBuff); 68 | return new IntPtr(lPtr2); 69 | } 70 | } 71 | 72 | public AlignedMemoryHandle Rent(int minimumLength) 73 | { 74 | return Rent(minimumLength, defaultByteAlignment); 75 | } 76 | 77 | public unsafe void Return(AlignedMemoryHandle bufferHandle, bool clearArray = false) 78 | { 79 | (MemoryHandle memHandle, GCHandle gcHandle, byte[] buff) item; 80 | lock (lockObject) 81 | { 82 | for (int i = 0; i < allMemoryHandles.Count; i++) 83 | { 84 | item = allMemoryHandles[i]; 85 | if (item.memHandle.Pointer == bufferHandle.MemoryHandle.Pointer) 86 | { 87 | if (item.gcHandle.IsAllocated) 88 | { 89 | item.gcHandle.Free(); 90 | } 91 | pool.Return(item.buff, clearArray); 92 | allMemoryHandles.RemoveAt(i); 93 | break; 94 | } 95 | } 96 | } 97 | } 98 | 99 | #region IDisposable Support 100 | 101 | protected virtual void Dispose(bool disposing) 102 | { 103 | if (!disposedValue) 104 | { 105 | if (disposing) 106 | { 107 | // TODO: dispose managed state (managed objects). 108 | } 109 | 110 | // TODO: free unmanaged resources (unmanaged objects) and override a finalizer below. 111 | // TODO: set large fields to null. 112 | if (allMemoryHandles.Count > 0) 113 | { 114 | (MemoryHandle memHandle, GCHandle gcHandle, byte[] buff) item; 115 | for (int i = 0; i < allMemoryHandles.Count; i++) 116 | { 117 | item = allMemoryHandles[i]; 118 | if (item.gcHandle.IsAllocated) 119 | { 120 | item.gcHandle.Free(); 121 | } 122 | pool.Return(item.buff); 123 | 124 | } 125 | allMemoryHandles.Clear(); 126 | allBuffers.Clear(); 127 | } 128 | 129 | disposedValue = true; 130 | } 131 | } 132 | 133 | // TODO: override a finalizer only if Dispose(bool disposing) above has code to free unmanaged resources. 134 | ~AlignedArrayPool() 135 | { 136 | // Do not change this code. 
Put cleanup code in Dispose(bool disposing) above. 137 | Dispose(false); 138 | } 139 | 140 | // This code added to correctly implement the disposable pattern. 141 | public void Dispose() 142 | { 143 | // Do not change this code. Put cleanup code in Dispose(bool disposing) above. 144 | Dispose(true); 145 | // TODO: uncomment the following line if the finalizer is overridden above. 146 | GC.SuppressFinalize(this); 147 | } 148 | #endregion 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /ExtraFiles/MemoryBenches-1.md: -------------------------------------------------------------------------------- 1 | ``` ini 2 | 3 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362 4 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores 5 | .NET Core SDK=3.0.100-preview9-014004 6 | [Host] : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT 7 | DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT 8 | 9 | 10 | ``` 11 | | Method | numberOfBytes | Mean | Error | StdDev | Median | Ratio | RatioSD | 12 | |------------------------------ |-------------- |---------------:|--------------:|--------------:|---------------:|------:|--------:| 13 | | **ScalarStoreUnrolled** | **16384** | **2,292.3 ns** | **60.087 ns** | **50.176 ns** | **2,284.6 ns** | **7.52** | **0.30** | 14 | | ScalarStoreBlock | 16384 | 306.1 ns | 8.539 ns | 12.246 ns | 302.8 ns | 1.00 | 0.00 | 15 | | VectorStoreAligned | 16384 | 493.2 ns | 9.847 ns | 12.453 ns | 493.2 ns | 1.60 | 0.06 | 16 | | VectorStoreArrayMemPtr | 16384 | 401.3 ns | 8.049 ns | 12.998 ns | 397.5 ns | 1.32 | 0.07 | 17 | | VectorStoreArrayMemSafe | 16384 | 473.3 ns | 9.507 ns | 13.327 ns | 470.7 ns | 1.55 | 0.08 | 18 | | VectorStoreUnaligned | 16384 | 577.2 ns | 10.582 ns | 9.381 ns | 576.0 ns | 1.89 | 0.07 | 19 | | VectorStoreUnalignedMemPtr | 16384 | 504.7 ns | 15.461 ns | 
20.641 ns | 498.6 ns | 1.65 | 0.09 | 20 | | VectorStoreUnalignedToAligned | 16384 | 492.7 ns | 9.763 ns | 16.311 ns | 485.8 ns | 1.61 | 0.08 | 21 | | | | | | | | | | 22 | | **ScalarStoreUnrolled** | **131072** | **18,656.4 ns** | **343.541 ns** | **321.348 ns** | **18,589.3 ns** | **3.02** | **0.06** | 23 | | ScalarStoreBlock | 131072 | 6,185.0 ns | 77.250 ns | 64.508 ns | 6,174.3 ns | 1.00 | 0.00 | 24 | | VectorStoreAligned | 131072 | 6,873.3 ns | 65.477 ns | 54.676 ns | 6,880.6 ns | 1.11 | 0.02 | 25 | | VectorStoreArrayMemPtr | 131072 | 6,653.6 ns | 141.340 ns | 132.209 ns | 6,610.1 ns | 1.08 | 0.03 | 26 | | VectorStoreArrayMemSafe | 131072 | 6,931.2 ns | 138.136 ns | 282.176 ns | 6,822.8 ns | 1.13 | 0.06 | 27 | | VectorStoreUnaligned | 131072 | 7,556.5 ns | 114.427 ns | 89.337 ns | 7,537.2 ns | 1.22 | 0.02 | 28 | | VectorStoreUnalignedMemPtr | 131072 | 7,319.7 ns | 145.018 ns | 221.457 ns | 7,239.3 ns | 1.19 | 0.04 | 29 | | VectorStoreUnalignedToAligned | 131072 | 6,928.4 ns | 138.061 ns | 141.779 ns | 6,892.1 ns | 1.12 | 0.03 | 30 | | | | | | | | | | 31 | | **ScalarStoreUnrolled** | **1048576** | **159,693.3 ns** | **2,764.505 ns** | **2,308.487 ns** | **159,156.2 ns** | **2.43** | **0.07** | 32 | | ScalarStoreBlock | 1048576 | 65,713.1 ns | 1,277.124 ns | 1,132.137 ns | 65,699.8 ns | 1.00 | 0.00 | 33 | | VectorStoreAligned | 1048576 | 85,778.4 ns | 2,106.262 ns | 5,975.114 ns | 83,181.5 ns | 1.31 | 0.10 | 34 | | VectorStoreArrayMemPtr | 1048576 | 78,964.1 ns | 1,518.257 ns | 1,624.518 ns | 78,922.6 ns | 1.20 | 0.03 | 35 | | VectorStoreArrayMemSafe | 1048576 | 80,763.9 ns | 1,389.509 ns | 1,160.303 ns | 80,709.0 ns | 1.23 | 0.03 | 36 | | VectorStoreUnaligned | 1048576 | 84,741.3 ns | 1,680.962 ns | 2,185.725 ns | 84,040.2 ns | 1.29 | 0.04 | 37 | | VectorStoreUnalignedMemPtr | 1048576 | 82,595.5 ns | 1,816.659 ns | 2,019.212 ns | 82,142.8 ns | 1.26 | 0.04 | 38 | | VectorStoreUnalignedToAligned | 1048576 | 86,209.3 ns | 1,984.263 ns | 5,693.224 ns | 85,122.7 ns | 
1.30 | 0.09 | 39 | | | | | | | | | | 40 | | **ScalarStoreUnrolled** | **2097152** | **386,240.6 ns** | **7,648.523 ns** | **19,188.650 ns** | **381,202.7 ns** | **2.26** | **0.11** | 41 | | ScalarStoreBlock | 2097152 | 171,998.1 ns | 3,435.604 ns | 5,142.251 ns | 170,366.1 ns | 1.00 | 0.00 | 42 | | VectorStoreAligned | 2097152 | 250,602.9 ns | 3,544.961 ns | 2,960.203 ns | 250,186.1 ns | 1.45 | 0.05 | 43 | | VectorStoreArrayMemPtr | 2097152 | 253,581.1 ns | 5,065.490 ns | 9,003.903 ns | 251,693.9 ns | 1.48 | 0.06 | 44 | | VectorStoreArrayMemSafe | 2097152 | 254,647.4 ns | 5,565.014 ns | 10,034.868 ns | 251,608.8 ns | 1.49 | 0.07 | 45 | | VectorStoreUnaligned | 2097152 | 258,129.5 ns | 5,127.175 ns | 7,018.136 ns | 256,494.3 ns | 1.50 | 0.06 | 46 | | VectorStoreUnalignedMemPtr | 2097152 | 259,253.1 ns | 5,207.113 ns | 8,408.518 ns | 257,269.9 ns | 1.51 | 0.07 | 47 | | VectorStoreUnalignedToAligned | 2097152 | 268,083.3 ns | 5,350.387 ns | 14,736.521 ns | 270,760.6 ns | 1.55 | 0.10 | 48 | | | | | | | | | | 49 | | **ScalarStoreUnrolled** | **8388608** | **1,792,974.9 ns** | **34,861.894 ns** | **59,198.142 ns** | **1,773,807.8 ns** | **1.64** | **0.07** | 50 | | ScalarStoreBlock | 8388608 | 1,106,074.5 ns | 17,544.390 ns | 14,650.360 ns | 1,107,074.2 ns | 1.00 | 0.00 | 51 | | VectorStoreAligned | 8388608 | 1,564,931.4 ns | 38,160.539 ns | 37,478.752 ns | 1,549,061.2 ns | 1.42 | 0.04 | 52 | | VectorStoreArrayMemPtr | 8388608 | 1,573,258.0 ns | 34,312.238 ns | 44,615.601 ns | 1,561,962.8 ns | 1.43 | 0.05 | 53 | | VectorStoreArrayMemSafe | 8388608 | 1,559,172.6 ns | 17,596.260 ns | 15,598.626 ns | 1,559,339.7 ns | 1.41 | 0.03 | 54 | | VectorStoreUnaligned | 8388608 | 1,541,325.1 ns | 18,699.861 ns | 14,599.621 ns | 1,541,280.2 ns | 1.39 | 0.02 | 55 | | VectorStoreUnalignedMemPtr | 8388608 | 1,561,604.8 ns | 22,459.313 ns | 19,909.596 ns | 1,558,538.2 ns | 1.41 | 0.03 | 56 | | VectorStoreUnalignedToAligned | 8388608 | 1,546,770.0 ns | 19,669.857 ns | 15,356.930 ns | 
1,543,577.9 ns | 1.40 | 0.02 | 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Build results 17 | [Dd]ebug/ 18 | [Dd]ebugPublic/ 19 | [Rr]elease/ 20 | [Rr]eleases/ 21 | x64/ 22 | x86/ 23 | [Aa][Rr][Mm]/ 24 | [Aa][Rr][Mm]64/ 25 | bld/ 26 | [Bb]in/ 27 | [Oo]bj/ 28 | [Ll]og/ 29 | 30 | # Visual Studio 2015/2017 cache/options directory 31 | .vs/ 32 | # Uncomment if you have tasks that create the project's static files in wwwroot 33 | #wwwroot/ 34 | 35 | # Visual Studio 2017 auto generated files 36 | Generated\ Files/ 37 | 38 | # MSTest test Results 39 | [Tt]est[Rr]esult*/ 40 | [Bb]uild[Ll]og.* 41 | 42 | # NUNIT 43 | *.VisualState.xml 44 | TestResult.xml 45 | 46 | # Build Results of an ATL Project 47 | [Dd]ebugPS/ 48 | [Rr]eleasePS/ 49 | dlldata.c 50 | 51 | # Benchmark Results 52 | BenchmarkDotNet.Artifacts/ 53 | 54 | # .NET Core 55 | project.lock.json 56 | project.fragment.lock.json 57 | artifacts/ 58 | 59 | # StyleCop 60 | StyleCopReport.xml 61 | 62 | # Files built by Visual Studio 63 | *_i.c 64 | *_p.c 65 | *_h.h 66 | *.ilk 67 | *.meta 68 | *.obj 69 | *.iobj 70 | *.pch 71 | *.pdb 72 | *.ipdb 73 | *.pgc 74 | *.pgd 75 | *.rsp 76 | *.sbr 77 | *.tlb 78 | *.tli 79 | *.tlh 80 | *.tmp 81 | *.tmp_proj 82 | *_wpftmp.csproj 83 | *.log 84 | *.vspscc 85 | *.vssscc 86 | .builds 87 | *.pidb 88 | *.svclog 89 | *.scc 90 | 91 | # Chutzpah Test files 92 | _Chutzpah* 93 | 94 | # Visual C++ cache files 95 | ipch/ 96 | *.aps 97 | *.ncb 98 | *.opendb 
99 | *.opensdf 100 | *.sdf 101 | *.cachefile 102 | *.VC.db 103 | *.VC.VC.opendb 104 | 105 | # Visual Studio profiler 106 | *.psess 107 | *.vsp 108 | *.vspx 109 | *.sap 110 | 111 | # Visual Studio Trace Files 112 | *.e2e 113 | 114 | # TFS 2012 Local Workspace 115 | $tf/ 116 | 117 | # Guidance Automation Toolkit 118 | *.gpState 119 | 120 | # ReSharper is a .NET coding add-in 121 | _ReSharper*/ 122 | *.[Rr]e[Ss]harper 123 | *.DotSettings.user 124 | 125 | # JustCode is a .NET coding add-in 126 | .JustCode 127 | 128 | # TeamCity is a build add-in 129 | _TeamCity* 130 | 131 | # DotCover is a Code Coverage Tool 132 | *.dotCover 133 | 134 | # AxoCover is a Code Coverage Tool 135 | .axoCover/* 136 | !.axoCover/settings.json 137 | 138 | # Visual Studio code coverage results 139 | *.coverage 140 | *.coveragexml 141 | 142 | # NCrunch 143 | _NCrunch_* 144 | .*crunch*.local.xml 145 | nCrunchTemp_* 146 | 147 | # MightyMoose 148 | *.mm.* 149 | AutoTest.Net/ 150 | 151 | # Web workbench (sass) 152 | .sass-cache/ 153 | 154 | # Installshield output folder 155 | [Ee]xpress/ 156 | 157 | # DocProject is a documentation generator add-in 158 | DocProject/buildhelp/ 159 | DocProject/Help/*.HxT 160 | DocProject/Help/*.HxC 161 | DocProject/Help/*.hhc 162 | DocProject/Help/*.hhk 163 | DocProject/Help/*.hhp 164 | DocProject/Help/Html2 165 | DocProject/Help/html 166 | 167 | # Click-Once directory 168 | publish/ 169 | 170 | # Publish Web Output 171 | *.[Pp]ublish.xml 172 | *.azurePubxml 173 | # Note: Comment the next line if you want to checkin your web deploy settings, 174 | # but database connection strings (with potential passwords) will be unencrypted 175 | *.pubxml 176 | *.publishproj 177 | 178 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 179 | # checkin your Azure Web App publish settings, but sensitive information contained 180 | # in these scripts will be unencrypted 181 | PublishScripts/ 182 | 183 | # NuGet Packages 184 | *.nupkg 185 | # The packages folder can be ignored because of Package Restore 186 | **/[Pp]ackages/* 187 | # except build/, which is used as an MSBuild target. 188 | !**/[Pp]ackages/build/ 189 | # Uncomment if necessary however generally it will be regenerated when needed 190 | #!**/[Pp]ackages/repositories.config 191 | # NuGet v3's project.json files produces more ignorable files 192 | *.nuget.props 193 | *.nuget.targets 194 | 195 | # Microsoft Azure Build Output 196 | csx/ 197 | *.build.csdef 198 | 199 | # Microsoft Azure Emulator 200 | ecf/ 201 | rcf/ 202 | 203 | # Windows Store app package directories and files 204 | AppPackages/ 205 | BundleArtifacts/ 206 | Package.StoreAssociation.xml 207 | _pkginfo.txt 208 | *.appx 209 | 210 | # Visual Studio cache files 211 | # files ending in .cache can be ignored 212 | *.[Cc]ache 213 | # but keep track of directories ending in .cache 214 | !?*.[Cc]ache/ 215 | 216 | # Others 217 | ClientBin/ 218 | ~$* 219 | *~ 220 | *.dbmdl 221 | *.dbproj.schemaview 222 | *.jfm 223 | *.pfx 224 | *.publishsettings 225 | orleans.codegen.cs 226 | 227 | # Including strong name files can present a security risk 228 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 229 | #*.snk 230 | 231 | # Since there are multiple workflows, uncomment next line to ignore bower_components 232 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 233 | #bower_components/ 234 | 235 | # RIA/Silverlight projects 236 | Generated_Code/ 237 | 238 | # Backup & report files from converting an old project file 239 | # to a newer Visual Studio version. 
Backup files are not needed, 240 | # because we have git ;-) 241 | _UpgradeReport_Files/ 242 | Backup*/ 243 | UpgradeLog*.XML 244 | UpgradeLog*.htm 245 | ServiceFabricBackup/ 246 | *.rptproj.bak 247 | 248 | # SQL Server files 249 | *.mdf 250 | *.ldf 251 | *.ndf 252 | 253 | # Business Intelligence projects 254 | *.rdl.data 255 | *.bim.layout 256 | *.bim_*.settings 257 | *.rptproj.rsuser 258 | *- Backup*.rdl 259 | 260 | # Microsoft Fakes 261 | FakesAssemblies/ 262 | 263 | # GhostDoc plugin setting file 264 | *.GhostDoc.xml 265 | 266 | # Node.js Tools for Visual Studio 267 | .ntvs_analysis.dat 268 | node_modules/ 269 | 270 | # Visual Studio 6 build log 271 | *.plg 272 | 273 | # Visual Studio 6 workspace options file 274 | *.opt 275 | 276 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 277 | *.vbw 278 | 279 | # Visual Studio LightSwitch build output 280 | **/*.HTMLClient/GeneratedArtifacts 281 | **/*.DesktopClient/GeneratedArtifacts 282 | **/*.DesktopClient/ModelManifest.xml 283 | **/*.Server/GeneratedArtifacts 284 | **/*.Server/ModelManifest.xml 285 | _Pvt_Extensions 286 | 287 | # Paket dependency manager 288 | .paket/paket.exe 289 | paket-files/ 290 | 291 | # FAKE - F# Make 292 | .fake/ 293 | 294 | # JetBrains Rider 295 | .idea/ 296 | *.sln.iml 297 | 298 | # CodeRush personal settings 299 | .cr/personal 300 | 301 | # Python Tools for Visual Studio (PTVS) 302 | __pycache__/ 303 | *.pyc 304 | 305 | # Cake - Uncomment if you are using it 306 | # tools/** 307 | # !tools/packages.config 308 | 309 | # Tabs Studio 310 | *.tss 311 | 312 | # Telerik's JustMock configuration file 313 | *.jmconfig 314 | 315 | # BizTalk build output 316 | *.btp.cs 317 | *.btm.cs 318 | *.odx.cs 319 | *.xsd.cs 320 | 321 | # OpenCover UI analysis results 322 | OpenCover/ 323 | 324 | # Azure Stream Analytics local run output 325 | ASALocalRun/ 326 | 327 | # MSBuild Binary and Structured Log 328 | *.binlog 329 | 330 | # NVidia Nsight GPU debugger configuration 
file 331 | *.nvuser 332 | 333 | # MFractors (Xamarin productivity tool) working folder 334 | .mfractor/ 335 | 336 | # Local History for Visual Studio 337 | .localhistory/ 338 | 339 | # BeatPulse healthcheck temp database 340 | healthchecksdb -------------------------------------------------------------------------------- /Core3Intrinsics/Intro.cs: --------------------------------------------------------------------------------
using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace Core3Intrinsics
{
    /// <summary>
    /// Scratch-pad of small examples for the System.Runtime.Intrinsics API (Vector64/128/256, SSE,
    /// AVX, FMA) plus two element-wise square-root helpers written with safe and unsafe code.
    /// </summary>
    public class Intro
    {
        /// <summary>
        /// Runs a sequence of intrinsic calls whose results are meant to be inspected in a debugger.
        /// Nothing is stored; the comments give the value each call produces.
        /// </summary>
        public Intro()
        {
            var middleVector = Vector128.Create(1.0f); // middleVector = <1,1,1,1>
            middleVector = Vector128.CreateScalar(-1.0f); // middleVector = <-1,0,0,0>
            var floatBytes = Vector64.AsByte(Vector64.Create(1.0f, -1.0f)); // floatBytes = <0, 0, 128, 63, 0, 0, 128, 191>
            if (Avx.IsSupported)
            {
                var left = Vector256.Create(-2.5f); // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5>
                var right = Vector256.Create(5.0f); // <5, 5, 5, 5, 5, 5, 5, 5>
                Vector256<float> result = Avx.AddSubtract(left, right); // result = <-7.5, 2.5, -7.5, 2.5, -7.5, 2.5, -7.5, 2.5>
                left = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, - 70.0f, -80.0f);
                right = Vector256.Create(0.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f);
                result = Avx.UnpackHigh(left, right); // result = <-3, 3, -4, 4, -70, 70, -80, 80>
                result = Avx.UnpackLow(left, right); // result = <-1, 0, -2, 2, -50, 50, -60, 60>   (right[0] is 0)
                result = Avx.DotProduct(left, right, 0b1111_0001); // result = <-29, 0, 0, 0, -17400, 0, 0, 0>
                bool testResult = Avx.TestC(left, right); // testResult = true
                testResult = Avx.TestC(right, left); // testResult = false
                Vector256<float> result1 = Avx.Divide(left, right);
                var plusOne = Vector256.Create(1.0f);
                result = Avx.Compare(right, result1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
                result = Avx.Compare(right, result1, FloatComparisonMode.UnorderedNotLessThanNonSignaling);
                left = Vector256.Create(0.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f);
                right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                Vector256<float> nanInFirstPosition = Avx.Divide(left, right); // 0 / 0 -> NaN in element 0
                left = Vector256.Create(1.1f, 3.3333333f, -3.0f, 4.22f, -50.0f, 60.0f, -70.0f, 80.0f);
                Vector256<float> InfInFirstPosition = Avx.Divide(left, right); // 1.1 / 0 -> +Infinity in element 0

                left = Vector256.Create(-1.1f, 3.0f, 1.0f/3.0f, MathF.PI, -50.0f, 60.0f, -70.0f, 80.0f);
                right = Vector256.Create(0.0f, 2.0f, 3.1f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                // A "true" lane has all bits set, which reads back as NaN when viewed as a float.
                Vector256<float> compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN>
                Vector256<float> mixed = Avx.BlendVariable(left, right, compareResult); // mixed = <-1.1, 2, 0.333.., 2, -50, -60, -70, -80>

                //left = Vector256.Create(-1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f);
                //right = Vector256.Create(1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f);
                Vector256<float> other = right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                bool bRes = Avx.TestZ(plusOne, compareResult);
                bool bRes2 = Avx.TestC(plusOne, compareResult);
                // NOTE(review): !TestZ(x, x) is true as soon as ANY lane of x has its sign bit set,
                // so despite the name this is an "any true" check - confirm intent.
                bool allTrue = !Avx.TestZ(compareResult, compareResult);
                compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.OrderedEqualNonSignaling); // compareResult = <0, 0, 0, NaN, 0, 0, 0, 0> (only element 3 is equal; the NaN lane compares false)
                compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.UnorderedEqualNonSignaling);
                compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.UnorderedNotLessThanOrEqualNonSignaling);
                compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.OrderedGreaterThanNonSignaling);
                var left128 = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
                var right128 = Vector128.Create(2.0f, 3.0f, 4.0f, 5.0f);
                Vector128<float> compResult128 = Sse.CompareGreaterThan(left128, right128); // compResult128 = <0, 0, 0, 0>

                int res = Avx.MoveMask(compareResult);
                if (Fma.IsSupported)
                {
                    Vector256<float> resultFma = Fma.MultiplyAdd(left, right, other); // = left * right + other for each element
                    resultFma = Fma.MultiplyAddNegated(left, right, other); // = -(left * right) + other for each element
                    resultFma = Fma.MultiplySubtract(left, right, other); // = left * right - other for each element
                    Fma.MultiplyAddSubtract(left, right, other); // even elements (0, 2, ...) like MultiplySubtract, odd elements like MultiplyAdd

                }
                // The mask 0b1010_0001 multiplies elements 1 and 3 of each 128-bit lane only.
                result = Avx.DotProduct(left, right, 0b1010_0001); // result = <~12.28 (3*2 + PI*2), 0, 0, 0, -10000, 0, 0, 0>

                // NOTE: the example values in the next six comments were worked out for the broadcast
                // inputs left = <-2.5 x 8> and right = <5 x 8> used at the top of this block; at this
                // point left and right hold the mixed vectors assigned above, so the actual results differ.
                result = Avx.Floor(left); // result = <-3, -3, -3, -3, -3, -3, -3, -3>
                result = Avx.Add(left, right); // result = <2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5>
                result = Avx.Ceiling(left); // result = <-2, -2, -2, -2, -2, -2, -2, -2>
                result = Avx.Multiply(left, right); // result = <-12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5>
                result = Avx.HorizontalAdd(left, right); // result = <-5, -5, 10, 10, -5, -5, 10, 10>
                result = Avx.HorizontalSubtract(left, right); // result = <0, 0, 0, 0, 0, 0, 0, 0>
                double[] someDoubles = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 };
                double[] someOtherDoubles = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
                double[] someResult = new double[someDoubles.Length];
                float[] someFloats = new float[] { 1, 2, 3, 4, 10, 20, 30, 40, 0 };
                float[] someOtherFloats = new float[] { 1, 1, 1, 1, 1, 1, 1, 1 };
                unsafe
                {
                    fixed (double* ptr = &someDoubles[1])
                    {
                        fixed (double* ptr2 = &someResult[0])
                        {
                            Vector256<double> res2 = Avx.LoadVector256(ptr); // res2 = <3, -2.5, 7.5, 10.8>
                            Avx.Store(ptr2, res2);
                        }
                    }

                    fixed (float* ptr = &someFloats[0])
                    {
                        fixed (float* ptr2 = &someOtherFloats[0])
                        {
                            Vector256<float> res2 = Avx.DotProduct(Avx.LoadVector256(ptr), Avx.LoadVector256(ptr2), 0b0001_0001);
                            //Avx.Store(ptr2, res2);
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Returns the element-wise square root of <paramref name="input"/>.
        /// Full groups of eight floats are processed with AVX when available; any remaining
        /// elements (and everything on hardware without AVX) are processed with <see cref="MathF.Sqrt"/>.
        /// </summary>
        /// <param name="input">Values to take the square root of; not modified.</param>
        /// <returns>A new array with the same length as <paramref name="input"/>.</returns>
        public float[] ProcessData(ref Span<float> input)
        {
            float[] results = new float[input.Length];
            int processed = 0;

            if (Avx.IsSupported)
            {
                // Cast only covers whole Vector256<float> blocks; a tail of Length % 8 floats is left over.
                Span<Vector256<float>> resultVectors = MemoryMarshal.Cast<float, Vector256<float>>(results.AsSpan());
                ReadOnlySpan<Vector256<float>> inputVectors = MemoryMarshal.Cast<float, Vector256<float>>(input);

                for (int i = 0; i < inputVectors.Length; i++)
                {
                    resultVectors[i] = Avx.Sqrt(inputVectors[i]);
                }
                processed = inputVectors.Length * Vector256<float>.Count;
            }

            // Scalar tail: previously these elements were silently left at 0.
            for (int i = processed; i < input.Length; i++)
            {
                results[i] = MathF.Sqrt(input[i]);
            }

            return results;
        }

        /// <summary>
        /// Pointer-based variant of <see cref="ProcessData"/>: element-wise square root using
        /// unaligned AVX loads/stores for full blocks of eight and scalar code for the remainder.
        /// </summary>
        /// <param name="input">Values to take the square root of; not modified.</param>
        /// <returns>A new array with the same length as <paramref name="input"/> (empty for empty input).</returns>
        public unsafe float[] ProcessDataUnsafe(ref Span<float> input)
        {
            int length = input.Length;
            float[] results = new float[length];
            if (length == 0)
            {
                return results; // nothing to pin; &input[0] would throw
            }

            fixed (float* inputPtr = input)
            fixed (float* resultPtr = results)
            {
                int i = 0;
                if (Avx.IsSupported)
                {
                    // Stop before the last partial block: loading/storing 8 floats there would run
                    // past the end of both buffers.
                    int lastFullBlock = length - Vector256<float>.Count;
                    for (; i <= lastFullBlock; i += Vector256<float>.Count)
                    {
                        Avx.Store(resultPtr + i, Avx.Sqrt(Avx.LoadVector256(inputPtr + i)));
                    }
                }
                for (; i < length; i++)
                {
                    resultPtr[i] = MathF.Sqrt(inputPtr[i]);
                }
            }
            return results;
        }
    }
}
-------------------------------------------------------------------------------- /Core3Intrinsics/Mandelbrot.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.Intrinsics; 3 | using System.Runtime.InteropServices; 4 | using System.Runtime.Intrinsics.X86; 5 | 6 | namespace Core3Intrinsics 7 | { 8 | public class Mandelbrot 9 | { 10 | readonly int TOTALBYTES = 16 * 1024 * 1024;//4 * 1024 * 1024; 11 | public int numberOfTasks = 1; 12 | 13 | const float LEFT_X = -2.5f; 14 | const float RIGHT_X = 1.0f; 15 | const float TOP_Y = 1.0f; 16 | const float BOTT_Y = -1.0f; 17 | 18 | int resolutionX, resolutionY; 19 | readonly float ratioy_x = (TOP_Y - BOTT_Y) / (RIGHT_X - LEFT_X); 20 | float STEP_X; 21 | 
float STEP_Y;
        public Memory<float> results, results2, testValue1, testValue2;
        public int SizeInBytes => numberOfPoints * sizeof(float);
        Memory<float> xPoints, yPoints;
        int numberOfPoints;

        /// <summary>
        /// Derives the grid resolution from TOTALBYTES and fills xPoints / yPoints with the
        /// coordinates of every column and row. Shared by both Mandelbrot implementations so that
        /// they always work on exactly the same grid.
        /// </summary>
        private void InitializeGrid()
        {
            int floatL3Size = TOTALBYTES / sizeof(float);

            // Both resolutions are rounded down to a multiple of 8 so that every row splits into
            // whole Vector256<float> blocks.
            resolutionX = (int)MathF.Floor(MathF.Sqrt(floatL3Size * ratioy_x));
            resolutionX -= resolutionX % 8;
            resolutionY = (int)MathF.Floor(resolutionX * ratioy_x);
            resolutionY -= resolutionY % 8;

            STEP_X = (RIGHT_X - LEFT_X) / resolutionX;
            STEP_Y = STEP_X; // ratioy_x * STEP_X; Bug from reddit comment

            // Product of two multiples of 8, hence itself a multiple of 8: no further rounding is
            // needed (the former "+= numberOfPoints % 8" could never fire and was not a round-up).
            numberOfPoints = resolutionX * resolutionY;

            xPoints = new float[resolutionX];
            yPoints = new float[resolutionY];
            Span<float> xs = xPoints.Span;
            Span<float> ys = yPoints.Span;
            for (int i = 0; i < resolutionX; i++)
            {
                xs[i] = LEFT_X + i * STEP_X;
            }
            for (int i = 0; i < resolutionY; i++)
            {
                ys[i] = TOP_Y - i * STEP_Y;
            }
        }

        /// <summary>
        /// Scalar reference implementation. For every grid point stores the number of iterations
        /// that stayed within |z|^2 &lt;= 4 (at most 256) in <c>results</c> and the final
        /// x^2 + y^2 in <c>testValue1</c>. Also (re)allocates <c>testValue2</c>.
        /// </summary>
        public void FloatMandel()
        {
            InitializeGrid();
            results = new float[numberOfPoints];
            testValue1 = new float[numberOfPoints];
            testValue2 = new float[numberOfPoints];

            const int maxInter = 256;
            ReadOnlySpan<float> ySpan = yPoints.Span;
            ReadOnlySpan<float> xSpan = xPoints.Span;
            Span<float> res = results.Span;
            Span<float> test = testValue1.Span;
            int floatCounter = 0;

            for (int countY = 0; countY < resolutionY; countY++)
            {
                float currentY = ySpan[countY];
                for (int countX = 0; countX < resolutionX; countX++)
                {
                    float currentX = xSpan[countX];
                    float zSquare = 0.0f, xSquare = 0.0f, ySquare = 0.0f;
                    int inter = 0;
                    while (xSquare + ySquare <= 4.0f && inter < maxInter)
                    {
                        // Iteration written in terms of x^2, y^2 and (x + y)^2.
                        float x = xSquare - ySquare + currentX;
                        float y = zSquare - ySquare - xSquare + currentY;
                        xSquare = x * x;
                        ySquare = y * y;
                        zSquare = (x + y) * (x + y);
                        if (xSquare + ySquare <= 4.0f)
                        {
                            inter++; // the escaping iteration itself is not counted
                        }
                    }
                    //res[countY * resolutionX + countX] = inter;
                    res[floatCounter] = inter;
                    test[floatCounter] = xSquare + ySquare;
                    floatCounter++;
                }
            }
        }

        /// <summary>
        /// AVX implementation processing eight neighbouring x positions at once. Stores the
        /// per-lane iteration counts in <c>results2</c> and the final x^2 + y^2 of each lane in
        /// <c>testValue2</c>. All eight lanes keep iterating until none of them is within
        /// |z|^2 &lt;= 4 any more or the iteration limit is reached.
        /// </summary>
        public unsafe void Vector256Mandel()
        {
            InitializeGrid();
            results2 = new float[numberOfPoints];
            // testValue2 used to be allocated by FloatMandel() only, so calling this method first
            // (or after the grid size changed) indexed into an empty/short span.
            if (testValue2.Length != numberOfPoints)
            {
                testValue2 = new float[numberOfPoints];
            }

            const int maxInter = 256;
            ReadOnlySpan<float> ySpan = yPoints.Span;// MemoryMarshal.Cast<float, Vector256<float>>(yPoints.Span);
            ReadOnlySpan<Vector256<float>> xSpan = MemoryMarshal.Cast<float, Vector256<float>>(xPoints.Span);
            Span<Vector256<float>> res = MemoryMarshal.Cast<float, Vector256<float>>(results2.Span);
            Span<Vector256<float>> testSpan = MemoryMarshal.Cast<float, Vector256<float>>(testValue2.Span);
            int resVectorNumber = 0;

            Vector256<float> xVec, yVec;
            var oneVec = Vector256.Create(1.0f);
            var fourVec = Vector256.Create(4.0f);

            for (int countY = 0; countY < ySpan.Length; countY++)
            {
                var currYVec = Vector256.Create(ySpan[countY]);
                for (int countX = 0; countX < xSpan.Length; countX++)
                {
                    Vector256<float> currXVec = xSpan[countX];
                    var xSquVec = Vector256.Create(0.0f);
                    var ySquVec = Vector256.Create(0.0f);
                    var zSquVec = Vector256.Create(0.0f);
                    var interVec = Vector256.Create(0.0f);
                    Vector256<float> sumVector = oneVec; // per-lane increment: 1 while alive, 0 once escaped
                    int inter = 0;
                    bool goOn = true;
                    while (goOn)
                    {
                        xVec = Avx.Add(Avx.Subtract(xSquVec, ySquVec), currXVec);
                        yVec = Avx.Add(Avx.Subtract(Avx.Subtract(zSquVec, ySquVec), xSquVec), currYVec);
                        xSquVec = Avx.Multiply(xVec, xVec);
                        ySquVec = Avx.Multiply(yVec, yVec);
                        zSquVec = Avx.Multiply(Avx.Add(xVec, yVec), Avx.Add(xVec, yVec));
                        Vector256<float> test = Avx.Compare(Avx.Add(xSquVec, ySquVec), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling); // <= 4.0?
                        sumVector = Avx.BlendVariable(Vector256<float>.Zero, sumVector, test); // selects from second if true, from first otherwise
                        goOn = (Avx.MoveMask(test) > 0) & (inter < maxInter); //any of the values still alive, and inter still below cutoff value?
                        if (goOn)
                        {
                            interVec = Avx.Add(interVec, sumVector);
                            inter++;
                        }
                    }
                    testSpan[resVectorNumber] = Avx.Add(xSquVec, ySquVec);
                    res[resVectorNumber] = interVec;
                    resVectorNumber++;
                }
            }
        }
    }
}
-------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/Mandelbrot.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Numerics; 3 | using System.Runtime.Intrinsics; 4 | using BenchmarkDotNet.Attributes; 5 | using System.Runtime.InteropServices; 6 | using System.Runtime.Intrinsics.X86; 7 | 8 | namespace Core3IntrinsicsBenchmarks 9 | { 10 | [DisassemblyDiagnoser(printAsm: true, printSource: true)] 11 | public class Mandelbrot 12 | { 13 | //[Params(4 * 1024 * 1024, 16 * 1024 * 1024)] //L3, 4 * L3 14 | public int TotalBytes {get; set; } 15 | 16 | public int numberOfTasks = 2; 17 | const float LEFT_X = -2.5f; 18 | const float RIGHT_X = 1.0f; 19 | const float TOP_Y = 1.0f; 20 | const float BOTT_Y = -1.0f; 21 | const float RATIO_Y_X = (TOP_Y - BOTT_Y) / (RIGHT_X - LEFT_X); 22 | 23 | int resolutionX, resolutionY; 24 | readonly float ratioy_x = RATIO_Y_X; 25 | public Memory results; 26 | public int SizeInBytes => numberOfPoints * sizeof(float); 27 | Memory xPoints, yPoints; 28 | int numberOfPoints; 29 | 30 | [GlobalSetup] 31 | public void GlobalSetup() 32 | { 33 | resolutionX = 1920; 34 | resolutionY = (int)MathF.Floor(resolutionX * ratioy_x); 35 | float STEP_X = (RIGHT_X - LEFT_X) / resolutionX; 36 | float STEP_Y = STEP_X; // (TOP_Y - BOTT_Y) / resolutionY; Bug from reddit comment 37 | 38 | numberOfPoints = resolutionX * resolutionY; 39 | results = new float[numberOfPoints]; 40 | xPoints = new float[resolutionX]; 41 | yPoints = new float[resolutionY]; 42 | for(int i = 0; i < resolutionX; i++) 43 | { 44 | xPoints.Span[i] = LEFT_X + i * STEP_X; 45 | } 46 | for (int i = 0; i < resolutionY; i++) 47 | { 48 | 
yPoints.Span[i] = TOP_Y - i * STEP_Y; 49 | } 50 | } 51 | 52 | [Benchmark(Baseline = true)] 53 | public void FloatMandel() 54 | { 55 | float currentY; 56 | float currentX; 57 | int countX = 0, countY = 0; 58 | int maxInter = 256; 59 | int inter; 60 | float zSquare, xSquare, ySquare, x, y; 61 | ReadOnlySpan ySpan = yPoints.Span; 62 | ReadOnlySpan xSpan = xPoints.Span; 63 | Span res = results.Span; 64 | int floatCounter = 0; 65 | float q; 66 | float one16 = 1.0f / 16.0f; 67 | while (countY < resolutionY) 68 | { 69 | currentY = ySpan[countY]; 70 | while (countX < resolutionX) 71 | { 72 | currentX = xSpan[countX]; 73 | zSquare = xSquare = ySquare = 0.0f; 74 | inter = 0; 75 | bool goOn;// = true; 76 | float temp = (currentX - 0.25f); 77 | float temp1 = currentY * currentY; 78 | q = temp * temp + temp1; 79 | goOn = (q * (q + (temp)) > 0.25f * temp1); // out of cardioid? see https://en.wikipedia.org/wiki/Mandelbrot_set#Cardioid_/_bulb_checking 80 | if (goOn) 81 | { 82 | goOn = (currentX + 1.0f) * (currentX + 1.0f) + temp1 > one16; // out of period-2 bulb? 83 | if (!goOn) 84 | { 85 | inter = 255; 86 | } 87 | } 88 | 89 | while (goOn && inter < maxInter) 90 | { 91 | x = xSquare - ySquare + currentX; 92 | y = zSquare - ySquare - xSquare + currentY; 93 | xSquare = x * x; 94 | ySquare = y * y; 95 | zSquare = (x + y) * (x + y); 96 | goOn = xSquare + ySquare <= 4.0f; 97 | 98 | inter = goOn ? 
inter + 1 : inter; 99 | } 100 | res[floatCounter] = inter; 101 | countX++; 102 | floatCounter++; 103 | } 104 | countX = 0; 105 | countY++; 106 | } 107 | } 108 | 109 | [Benchmark] 110 | public unsafe void Vector256Mandel() 111 | { 112 | int countX = 0, countY = 0; 113 | int maxInter = 256; 114 | int inter; 115 | ReadOnlySpan ySpan = yPoints.Span; 116 | ReadOnlySpan> xSpan = MemoryMarshal.Cast>(xPoints.Span); 117 | Span> res = MemoryMarshal.Cast>(results.Span); 118 | int resVectorNumber = 0; 119 | 120 | Vector256 xVec, yVec; 121 | Vector256 zeroVec = Vector256.Zero; 122 | var oneVec = Vector256.Create(1.0f); 123 | var fourVec = Vector256.Create(4.0f); 124 | var one4Vec = Vector256.Create(0.25f); 125 | var one16Vec = Vector256.Create(1.0f/16.0f); 126 | Vector256 qVec; 127 | Vector256 test; 128 | 129 | while (countY < ySpan.Length) 130 | { 131 | var currYVec = Vector256.Create(ySpan[countY]); 132 | while (countX < xSpan.Length) 133 | { 134 | Vector256 currXVec = xSpan[countX]; 135 | Vector256 xSquVec = zeroVec; 136 | Vector256 ySquVec = zeroVec; 137 | Vector256 zSquVec = zeroVec; 138 | Vector256 interVec = zeroVec; 139 | Vector256 sumVector; 140 | 141 | inter = 0; 142 | bool goOn; 143 | Vector256 temp = Avx.Subtract(currXVec, one4Vec); 144 | Vector256 temp1 = Avx.Multiply(currYVec, currYVec); 145 | qVec = Avx.Add(Avx.Multiply(temp, temp), temp1); 146 | Vector256 temp2 = Avx.Multiply(qVec, Avx.Add(qVec, temp)); 147 | test = Avx.Compare(temp2, Avx.Multiply(one4Vec, temp1), FloatComparisonMode.OrderedGreaterThanNonSignaling); 148 | goOn = (Avx.MoveMask(test) > 0); 149 | if(goOn) 150 | { 151 | temp2 = Avx.Add(currXVec, oneVec); 152 | temp = Avx.Add(Avx.Multiply(temp2, temp2), temp1); 153 | test = Avx.Compare(temp, one16Vec, FloatComparisonMode.OrderedGreaterThanNonSignaling); 154 | goOn = Avx.MoveMask(test) > 0; 155 | if (!goOn) 156 | { 157 | interVec = Vector256.Create(255.0f); // make all point = maximum value 158 | } 159 | } 160 | while (goOn) 161 | { 162 | xVec = 
Avx.Add(Avx.Subtract(xSquVec, ySquVec), currXVec); 163 | yVec = Avx.Add(Avx.Subtract(Avx.Subtract(zSquVec, ySquVec), xSquVec), currYVec); 164 | xSquVec = Avx.Multiply(xVec, xVec); 165 | ySquVec = Avx.Multiply(yVec, yVec); 166 | temp = Avx.Add(xVec, yVec); 167 | zSquVec = Avx.Multiply(temp, temp); 168 | test = Avx.Compare(Avx.Add(xSquVec, ySquVec), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling); // <= 4.0? 169 | sumVector = Avx.BlendVariable(zeroVec, oneVec, test); 170 | 171 | goOn = (Avx.MoveMask(test) > 0) & (inter < maxInter); //any of the values still alive, and inter still below cutoff value? 172 | if (goOn) 173 | { 174 | interVec = Avx.Add(interVec, sumVector); 175 | } 176 | inter = goOn ? inter + 1 : inter; 177 | } 178 | res[resVectorNumber] = interVec; 179 | resVectorNumber++; 180 | countX++; 181 | } 182 | countX = 0; 183 | countY++; 184 | } 185 | } 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/MemoryBenches.cs: -------------------------------------------------------------------------------- 1 | using BenchmarkDotNet.Attributes; 2 | using BenchmarkDotNet.Configs; 3 | using BenchmarkDotNet.Exporters; 4 | using BenchmarkDotNet.Exporters.Csv; 5 | using System; 6 | using System.Buffers; 7 | using System.Runtime.CompilerServices; 8 | using System.Runtime.InteropServices; 9 | using System.Runtime.Intrinsics; 10 | using System.Runtime.Intrinsics.X86; 11 | 12 | namespace Core3IntrinsicsBenchmarks 13 | { 14 | //[DisassemblyDiagnoser(printAsm: true, printSource: true)] 15 | //[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)] 16 | //[CategoriesColumn] 17 | //[Config(typeof(Config))] // only used for plots 18 | public class MemoryBenches 19 | { 20 | private class Config : ManualConfig // only used for plots 21 | { 22 | public Config() 23 | { 24 | Add(CsvMeasurementsExporter.Default); 25 | Add(RPlotExporter.Default); 26 | } 27 | } 28 | 29 | [Params(16 * 1024, 128 
/// <summary>
/// Derives the element/vector counts from <see cref="NumberOfBytes"/>, rents the four
/// buffers and fills both source buffers with 0, 1, 2, ...
/// </summary>
[GlobalSetup]
public unsafe void GlobalSetup()
{
    vectorFloatStep = Vector256<float>.Count;
    numberOfFloatItems = NumberOfBytes / sizeof(float);
    vectorNumberOfItems = numberOfFloatItems / vectorFloatStep;

    // The "16" overloads back the "Unaligned Memory" benchmarks
    // (presumably a 16-byte alignment request - confirm against AlignedArrayPool.Rent).
    dataMemory = alignedArrayPool.Rent(numberOfFloatItems);
    storeMemory = alignedArrayPool.Rent(numberOfFloatItems);
    data16Memory = alignedArrayPool.Rent(numberOfFloatItems, 16);
    store16Memory = alignedArrayPool.Rent(numberOfFloatItems, 16);

    var source = dataMemory.Memory.Span;
    var source16 = data16Memory.Memory.Span;
    int index = 0;
    while (index < numberOfFloatItems)
    {
        source[index] = index;
        source16[index] = index;
        index++;
    }
}

/// <summary>Hands the four rented buffers back to the pool.</summary>
[GlobalCleanup]
public void GlobalCleanup()
{
    foreach (var handle in new[] { dataMemory, storeMemory, data16Memory, store16Memory })
    {
        alignedArrayPool.Return(handle);
    }
}
/// <summary>Copies the whole source buffer to the store buffer with a single <c>Unsafe.CopyBlock</c>.</summary>
[BenchmarkCategory("Aligned Memory"), Benchmark]
public void ScalarCopyBlock()
{
    uint byteCount = (uint)(numberOfFloatItems * sizeof(float));
    Unsafe.CopyBlock(ref storeMemory.ByteRef, ref dataMemory.ByteRef, byteCount);
}

/// <summary>
/// Baseline: copies the buffer one <see cref="Vector256{T}"/> at a time using the
/// aligned load/store instructions.
/// </summary>
[BenchmarkCategory("Aligned Memory"), Benchmark(Baseline = true)]
public unsafe void VectorStoreAlignedUnsafe()
{
    float* source = (float*)dataMemory.MemoryHandle.Pointer;
    float* destination = (float*)storeMemory.MemoryHandle.Pointer;

    for (int vectorIndex = 0; vectorIndex < vectorNumberOfItems; vectorIndex++)
    {
        Avx.StoreAligned(destination, Avx.LoadAlignedVector256(source));
        source += vectorFloatStep;
        destination += vectorFloatStep;
    }
}
/// <summary>
/// Copies the "16" buffers one <see cref="Vector256{T}"/> at a time using the
/// unaligned load/store instructions.
/// </summary>
[BenchmarkCategory("Unaligned Memory"), Benchmark]
public unsafe void VectorStoreUnalignedUnsafe()
{
    float* source = (float*)data16Memory.MemoryHandle.Pointer;
    float* destination = (float*)store16Memory.MemoryHandle.Pointer;

    for (int vectorIndex = 0; vectorIndex < vectorNumberOfItems; vectorIndex++)
    {
        Avx.Store(destination, Avx.LoadVector256(source));
        source += vectorFloatStep;
        destination += vectorFloatStep;
    }
}

/// <summary>
/// Same unaligned load/store instructions, but applied to the default-aligned buffers
/// (the ones used by the aligned baseline).
/// </summary>
[BenchmarkCategory("Unaligned Memory"), Benchmark]
public unsafe void VectorStoreUnalignedToAlignedUnsafe()
{
    float* source = (float*)dataMemory.MemoryHandle.Pointer;
    float* destination = (float*)storeMemory.MemoryHandle.Pointer;

    for (int vectorIndex = 0; vectorIndex < vectorNumberOfItems; vectorIndex++)
    {
        Avx.Store(destination, Avx.LoadVector256(source));
        source += vectorFloatStep;
        destination += vectorFloatStep;
    }
}
/// <summary>
/// Creates the three pools, rents every buffer and fills the int/short/long inputs with
/// deterministic pseudo-random data (fixed seed) and the bitmap input with 0, 1, 2, ...
/// </summary>
[GlobalSetup]
public void GlobalSetup()
{
    intPool = new AlignedArrayPool<int>();
    shortPool = new AlignedArrayPool<short>();
    longPool = new AlignedArrayPool<long>();

    intData = intPool.Rent(NumberOfItems);
    intStore = intPool.Rent(NumberOfItems);
    bmpData = intPool.Rent(bmpWidth * bmpHeight * 4);
    bmpStore = intPool.Rent(bmpWidth * bmpHeight * 4);
    shortData = shortPool.Rent(NumberOfItems);
    shortStore = shortPool.Rent(NumberOfItems);
    longData = longPool.Rent(NumberOfItems);
    longStore = longPool.Rent(NumberOfItems);

    var r = new Random(1); // fixed seed: identical data on every run
    for (int i = 0; i < NumberOfItems; i++)
    {
        intData.Memory.Span[i] = i * 2 + r.Next(-1000, 1000);
        intStore.Memory.Span[i] = i + r.Next(-1000, 1000);
        // The short/long inputs mirror the int inputs (shorts are truncated by the cast).
        shortData.Memory.Span[i] = (short)intData.Memory.Span[i];
        shortStore.Memory.Span[i] = (short)intStore.Memory.Span[i];
        longData.Memory.Span[i] = intData.Memory.Span[i];
        longStore.Memory.Span[i] = intStore.Memory.Span[i];
    }
    for (int i = 0; i < bmpData.Memory.Span.Length; i++)
    {
        bmpData.Memory.Span[i] = i;
    }
}

/// <summary>
/// Returns every rented buffer to its pool and disposes all three pools.
/// </summary>
[GlobalCleanup]
public void GlobalCleanup()
{
    intPool.Return(intData);
    intPool.Return(intStore);
    intPool.Return(bmpData);
    intPool.Return(bmpStore);
    shortPool.Return(shortData);
    shortPool.Return(shortStore);
    longPool.Return(longData);
    longPool.Return(longStore);

    // Previously only intPool was disposed, leaving the short and long pools alive.
    intPool.Dispose();
    shortPool.Dispose();
    longPool.Dispose();
}
ShortAndNot() 99 | { 100 | var sp1 = new ReadOnlySpan(shortData.MemoryHandle.Pointer, NumberOfItems); 101 | var sp2 = new Span(shortStore.MemoryHandle.Pointer, NumberOfItems); 102 | 103 | for (int i = 0; i < NumberOfItems; i++) 104 | { 105 | sp2[i] = (short)(sp1[i] & ~sp2[i]); 106 | } 107 | } 108 | 109 | [BenchmarkCategory("Short"), Benchmark] 110 | public unsafe void ShortAndNotVector256() 111 | { 112 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(shortData.Memory.Span); 113 | Span> sp2 = MemoryMarshal.Cast>(shortStore.Memory.Span); 114 | 115 | for (int i = 0; i < sp1.Length; i++) 116 | { 117 | sp2[i] = Avx2.AndNot(sp1[i],sp2[i]); 118 | } 119 | } 120 | 121 | [BenchmarkCategory("Short"), Benchmark] 122 | public unsafe void ShortShiftLeft() 123 | { 124 | var sp1 = new ReadOnlySpan(shortData.MemoryHandle.Pointer, NumberOfItems); 125 | var sp2 = new Span(shortStore.MemoryHandle.Pointer, NumberOfItems); 126 | 127 | for (int i = 0; i < NumberOfItems; i++) 128 | { 129 | sp2[i] = (short)(sp1[i] << 5); 130 | } 131 | } 132 | 133 | [BenchmarkCategory("Short"), Benchmark] 134 | public unsafe void ShortShiftLeftVector256() 135 | { 136 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(shortData.Memory.Span); 137 | Span> sp2 = MemoryMarshal.Cast>(shortStore.Memory.Span); 138 | 139 | for (int i = 0; i < sp1.Length; i++) 140 | { 141 | sp2[i] = Avx2.ShiftLeftLogical(sp1[i], 5); 142 | } 143 | } */ 144 | /* 145 | [BenchmarkCategory("Integer"), Benchmark(Baseline = true)] 146 | public unsafe void IntAdd() 147 | { 148 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 149 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 150 | 151 | for(int i = 0; i < NumberOfItems; i++) 152 | { 153 | sp2[i] = sp1[i] + sp2[i]; 154 | } 155 | } 156 | 157 | [BenchmarkCategory("Integer"), Benchmark] 158 | public unsafe void IntAddVector256() 159 | { 160 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 161 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 
162 | 163 | for (int i = 0; i < sp1.Length; i++) 164 | { 165 | sp2[i] = Avx2.Add(sp1[i], sp2[i]); 166 | } 167 | } 168 | 169 | [BenchmarkCategory("Integer"), Benchmark] 170 | public unsafe void IntXor() 171 | { 172 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 173 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 174 | 175 | for (int i = 0; i < NumberOfItems; i++) 176 | { 177 | sp2[i] = sp1[i] ^ sp2[i]; 178 | } 179 | } 180 | 181 | [BenchmarkCategory("Integer"), Benchmark] 182 | public unsafe void IntXorVector256() 183 | { 184 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 185 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 186 | 187 | for (int i = 0; i < sp1.Length; i++) 188 | { 189 | sp2[i] = Avx2.Xor(sp1[i], sp2[i]); 190 | } 191 | } 192 | 193 | [BenchmarkCategory("Integer"), Benchmark] 194 | public unsafe void IntMultiply() 195 | { 196 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 197 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 198 | 199 | for (int i = 0; i < NumberOfItems; i++) 200 | { 201 | sp2[i] = sp1[i] * sp2[i]; 202 | } 203 | } 204 | 205 | [BenchmarkCategory("Integer"), Benchmark] 206 | public unsafe void IntMultiplyLowVector256() 207 | { 208 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 209 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 210 | 211 | for (int i = 0; i < sp1.Length; i++) 212 | { 213 | sp2[i] = Avx2.MultiplyLow(sp1[i], sp2[i]); 214 | } 215 | } 216 | 217 | [BenchmarkCategory("Integer"), Benchmark] 218 | public unsafe void IntShiftLeft() 219 | { 220 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 221 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 222 | 223 | for (int i = 0; i < NumberOfItems; i++) 224 | { 225 | sp2[i] = sp1[i] << 5; 226 | } 227 | } 228 | 229 | 230 | [BenchmarkCategory("Integer"), Benchmark] 231 | public unsafe void 
/// <summary>
/// Scalar baseline for the chained operation <c>(max(a, b) &lt;&lt; 2) * 3</c>, stored back into b.
/// </summary>
[BenchmarkCategory("Chained1"), Benchmark(Baseline = true)]
public unsafe void IntMultipleOps()
{
    var sp1 = new ReadOnlySpan<int>(intData.MemoryHandle.Pointer, NumberOfItems);
    var sp2 = new Span<int>(intStore.MemoryHandle.Pointer, NumberOfItems);

    for (int i = 0; i < NumberOfItems; i++)
    {
        // Fixed: the "true" branch used sp1[1] (constant index) instead of sp1[i],
        // so the baseline did not compute the element-wise max that the Avx2.Max version does.
        sp2[i] = ((sp1[i] > sp2[i] ? sp1[i] : sp2[i]) << 2) * 3;
    }
}

/// <summary>
/// AVX2 version of the same chain: Max, ShiftLeftLogical by 2, MultiplyLow by 3.
/// </summary>
[BenchmarkCategory("Chained1"), Benchmark]
public unsafe void IntMultipleOpsvector256()
{
    ReadOnlySpan<Vector256<int>> sp1 = MemoryMarshal.Cast<int, Vector256<int>>(intData.Memory.Span);
    Span<Vector256<int>> sp2 = MemoryMarshal.Cast<int, Vector256<int>>(intStore.Memory.Span);

    Vector256<int> three = Vector256.Create(3);

    for (int i = 0; i < sp1.Length; i++)
    {
        sp2[i] = Avx2.MultiplyLow(Avx2.ShiftLeftLogical(Avx2.Max(sp1[i], sp2[i]), 2), three);
    }
}

/// <summary>
/// Scalar baseline for the transpose: within every run of 8 four-int elements of a row,
/// the 8 first components are written first, then the 8 second, third and fourth components.
/// </summary>
[BenchmarkCategory("Chained2"), Benchmark(Baseline = true)]
public unsafe void IntTranspose()
{
    var sp1 = new ReadOnlySpan<int>(bmpData.MemoryHandle.Pointer, bmpHeight * bmpWidth * 4);
    var sp2 = new Span<int>(bmpStore.MemoryHandle.Pointer, bmpHeight * bmpWidth * 4);
    int numberOfElements = Vector256<int>.Count;

    int[] colorComponents = new int[bmpWidth * 4]; // scratch buffer for one row
    int runningCounter = 0;
    for (int y = 0; y < bmpHeight; y++)
    {
        // Destination row; taken before runningCounter advances through the row.
        Span<int> currColors = sp2.Slice(runningCounter, bmpWidth * 4);
        for (int x = 0; x < bmpWidth; x += numberOfElements)
        {
            for (int i = 0; i < numberOfElements; i++)
            {
                int start = x * 4 + i;
                colorComponents[start] = sp1[runningCounter];
                colorComponents[start + numberOfElements] = sp1[runningCounter + 1];
                colorComponents[start + (2 * numberOfElements)] = sp1[runningCounter + 2];
                colorComponents[start + (3 * numberOfElements)] = sp1[runningCounter + 3];
                runningCounter += 4;
            }
        }
        colorComponents.CopyTo(currColors);
    }
}
/// <summary>
/// AVX2 transpose: processes the bitmap four vectors (32 ints) at a time with
/// 128-bit lane permutes followed by two rounds of unpack low/high.
/// See https://software.intel.com/sites/default/files/m/d/4/1/d/8/Image_Processing_-_whitepaper_-_100pct_CCEreviewed_update.pdf
/// </summary>
[BenchmarkCategory("Chained2"), Benchmark]
public unsafe void IntTransposeVector256()
{
    Span<Vector256<int>> source = MemoryMarshal.Cast<int, Vector256<int>>(bmpData.Memory.Span);
    Span<Vector256<int>> target = MemoryMarshal.Cast<int, Vector256<int>>(bmpStore.Memory.Span);

    int group = 0;
    while (group < source.Length)
    {
        Vector256<int> row0 = source[group];
        Vector256<int> row1 = source[group + 1];
        Vector256<int> row2 = source[group + 2];
        Vector256<int> row3 = source[group + 3];

        // Step 1: combine matching 128-bit halves of rows (0, 2) and (1, 3).
        Vector256<int> lowHalves02 = Avx.Permute2x128(row0, row2, 0x20);
        Vector256<int> lowHalves13 = Avx.Permute2x128(row1, row3, 0x20);
        Vector256<int> highHalves02 = Avx.Permute2x128(row0, row2, 0x31);
        Vector256<int> highHalves13 = Avx.Permute2x128(row1, row3, 0x31);

        // Step 2: first interleave.
        Vector256<int> lowPairLow = Avx2.UnpackLow(lowHalves02, lowHalves13);
        Vector256<int> lowPairHigh = Avx2.UnpackHigh(lowHalves02, lowHalves13);
        Vector256<int> highPairLow = Avx2.UnpackLow(highHalves02, highHalves13);
        Vector256<int> highPairHigh = Avx2.UnpackHigh(highHalves02, highHalves13);

        // Step 3: second interleave produces the four output vectors.
        target[group] = Avx2.UnpackLow(lowPairLow, highPairLow);
        target[group + 1] = Avx2.UnpackHigh(lowPairLow, highPairLow);
        target[group + 2] = Avx2.UnpackLow(lowPairHigh, highPairHigh);
        target[group + 3] = Avx2.UnpackHigh(lowPairHigh, highPairHigh);

        group += 4;
    }
}
386 | [BenchmarkCategory("Long"), Benchmark] 387 | public unsafe void IntMultiplyVector256ToLong() 388 | { 389 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 390 | ReadOnlySpan> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 391 | Span> sp3 = MemoryMarshal.Cast>(longStore.Memory.Span); 392 | 393 | for (int i = 0; i < sp1.Length; i++) 394 | { 395 | sp3[i] = Avx2.Multiply(sp1[i], sp2[i]); 396 | } 397 | } 398 | */ 399 | } 400 | } 401 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/BasicOps.cs: -------------------------------------------------------------------------------- 1 | using BenchmarkDotNet.Attributes; 2 | using BenchmarkDotNet.Configs; 3 | using System; 4 | using System.Buffers; 5 | using System.Numerics; 6 | using System.Runtime.InteropServices; 7 | using System.Runtime.Intrinsics; 8 | using System.Runtime.Intrinsics.X86; 9 | 10 | namespace Core3IntrinsicsBenchmarks 11 | { 12 | [DisassemblyDiagnoser(printAsm: true, printSource: true)] 13 | //[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)] 14 | //[CategoriesColumn] 15 | public class BasicOps 16 | { 17 | [Params(/*256 * 1024,*/ 10 * 4 * 1024 * 1024)] 18 | public int ParamCacheSizeBytes { get; set; } 19 | 20 | private int numberOfFloatItems, numberOfDoubleItems; 21 | //To test aligned vs unaligned memory 22 | //private readonly AlignedMemoryHandle dataMemory; 23 | //private readonly AlignedMemoryHandle dataMemory2; 24 | //private readonly AlignedMemoryHandle dataMemory3; 25 | //private readonly AlignedMemoryHandle resultMemory; 26 | //private readonly AlignedMemoryHandle dataDoubleMemory; 27 | //private readonly AlignedMemoryHandle resultDoubleMemory; 28 | private float[] data, data2, data3, result; 29 | private double[] dataD, dataD2, dataD3, resultD; 30 | 31 | [GlobalSetup] 32 | public unsafe void GlobalSetup() 33 | { 34 | numberOfFloatItems = ParamCacheSizeBytes / sizeof(float) / 4; // make sure that all data fits 35 | 
/// <summary>
/// Sizes the float/double data sets from <see cref="ParamCacheSizeBytes"/>, rents all
/// working arrays from the shared pool and initialises every input and result array.
/// </summary>
[GlobalSetup]
public unsafe void GlobalSetup()
{
    numberOfFloatItems = ParamCacheSizeBytes / sizeof(float) / 4; // make sure that all data fits
    numberOfDoubleItems = ParamCacheSizeBytes / sizeof(double) / 4;
    //To test aligned vs unaligned memory
    //floatPool = new AlignedArrayPool<float>();
    //doublePool = new AlignedArrayPool<double>();
    //dataMemory = floatPool.Rent(numberOfFloatItems);
    //dataMemory2 = floatPool.Rent(numberOfFloatItems);
    //dataMemory3 = floatPool.Rent(numberOfFloatItems);
    //resultMemory = floatPool.Rent(numberOfFloatItems);
    //dataDoubleMemory = doublePool.Rent(numberOfDoubleItems);
    //resultDoubleMemory = doublePool.Rent(numberOfDoubleItems);
    data = ArrayPool<float>.Shared.Rent(numberOfFloatItems);
    data2 = ArrayPool<float>.Shared.Rent(numberOfFloatItems);
    data3 = ArrayPool<float>.Shared.Rent(numberOfFloatItems);
    result = ArrayPool<float>.Shared.Rent(numberOfFloatItems);
    dataD = ArrayPool<double>.Shared.Rent(numberOfDoubleItems);
    dataD2 = ArrayPool<double>.Shared.Rent(numberOfDoubleItems);
    dataD3 = ArrayPool<double>.Shared.Rent(numberOfDoubleItems);
    resultD = ArrayPool<double>.Shared.Rent(numberOfDoubleItems);

    for (int i = 0; i < numberOfFloatItems; i++)
    {
        data[i] = i + 1.0f;
        data2[i] = i + 1.0f;
        data3[i] = i + 1.0f;
        result[i] = 0.0f;
    }
    for (int i = 0; i < numberOfDoubleItems; i++)
    {
        dataD[i] = i + 1.0;
        // Rented arrays are not guaranteed to be cleared, so the second and third double
        // inputs are initialised as well (previously left with whatever the pool returned).
        dataD2[i] = i + 1.0;
        dataD3[i] = i + 1.0;
        resultD[i] = 0.0;
    }
}

/// <summary>Returns every rented array to the shared pool.</summary>
[GlobalCleanup]
public void GlobalCleanup()
{
    //To test aligned vs unaligned memory
    //floatPool.Return(resultMemory, false);
    //floatPool.Return(dataMemory, false);
    //floatPool.Return(dataMemory2, false);
    //floatPool.Return(dataMemory3, false);
    //doublePool.Return(resultDoubleMemory, false);
    //doublePool.Return(dataDoubleMemory, false);
    //floatPool.Dispose();
    //doublePool.Dispose();
    ArrayPool<float>.Shared.Return(data);
    ArrayPool<float>.Shared.Return(data2);
    ArrayPool<float>.Shared.Return(data3);
    ArrayPool<float>.Shared.Return(result);
    ArrayPool<double>.Shared.Return(dataD);
    ArrayPool<double>.Shared.Return(dataD2);
    ArrayPool<double>.Shared.Return(dataD3);
    ArrayPool<double>.Shared.Return(resultD);
}
/// <summary>
/// Scalar baseline: three dependent multiply-add steps per element,
/// <c>r = a*b + c; r = r*a + a; r = a*b + r</c>.
/// </summary>
[BenchmarkCategory("MultiplyAdd"), Benchmark(Baseline = true)]
public void ScalarFloatMultipleOps()
{
    var sp1 = new ReadOnlySpan<float>(data, 0, numberOfFloatItems);
    var sp12 = new ReadOnlySpan<float>(data2, 0, numberOfFloatItems);
    var sp13 = new ReadOnlySpan<float>(data3, 0, numberOfFloatItems);
    var sp2 = new Span<float>(result, 0, numberOfFloatItems);

    for (int i = 0; i < sp1.Length; i++)
    {
        sp2[i] = sp1[i] * sp12[i] + sp13[i];
        sp2[i] = sp2[i] * sp1[i] + sp1[i];
        // Fixed: this step multiplied sp1[i] * sp1[i] while every vectorised variant
        // multiplies the first and second inputs; the baseline now does the same work.
        sp2[i] = sp1[i] * sp12[i] + sp2[i];
    }
}

/// <summary>
/// Vector256 version of the same three steps using fused multiply-add.
/// </summary>
[BenchmarkCategory("MultiplyAdd"), Benchmark]
public void Vector256FloatMultipleOps()
{
    ReadOnlySpan<Vector256<float>> d1 = MemoryMarshal.Cast<float, Vector256<float>>(new Span<float>(data, 0, numberOfFloatItems));
    ReadOnlySpan<Vector256<float>> d2 = MemoryMarshal.Cast<float, Vector256<float>>(new Span<float>(data2, 0, numberOfFloatItems));
    ReadOnlySpan<Vector256<float>> d3 = MemoryMarshal.Cast<float, Vector256<float>>(new Span<float>(data3, 0, numberOfFloatItems));
    Span<Vector256<float>> r = MemoryMarshal.Cast<float, Vector256<float>>(new Span<float>(result, 0, numberOfFloatItems));

    for (int i = 0; i < d1.Length; i++)
    {
        r[i] = Fma.MultiplyAdd(d1[i], d2[i], d3[i]);
        r[i] = Fma.MultiplyAdd(r[i], d1[i], d1[i]);
        r[i] = Fma.MultiplyAdd(d1[i], d2[i], r[i]);
    }
}
/// <summary>
/// Pointer-based Vector256 version of the three multiply-add steps: the arrays are pinned
/// and walked eight floats at a time with unaligned loads/stores.
/// </summary>
[BenchmarkCategory("MultiplyAdd"), Benchmark]
public unsafe void Vector256FloatMultipleOpsUnsafe()
{
    fixed (float* first = &data[0], second = &data2[0], third = &data3[0], output = &result[0])
    {
        for (int offset = 0; offset < numberOfFloatItems; offset += 8)
        {
            float* a = first + offset;
            float* b = second + offset;
            float* c = third + offset;
            float* dst = output + offset;

            // Each step stores its result and the next step reloads it from memory.
            Avx.Store(dst, Fma.MultiplyAdd(Avx.LoadVector256(a), Avx.LoadVector256(b), Avx.LoadVector256(c)));
            Avx.Store(dst, Fma.MultiplyAdd(Avx.LoadVector256(dst), Avx.LoadVector256(a), Avx.LoadVector256(a)));
            Avx.Store(dst, Fma.MultiplyAdd(Avx.LoadVector256(a), Avx.LoadVector256(b), Avx.LoadVector256(dst)));
        }
    }
}

/// <summary>
/// Portable <see cref="Vector{T}"/> version of the same three steps, written with the
/// * and + operators.
/// </summary>
[BenchmarkCategory("MultiplyAdd"), Benchmark]
public void VectorTFloatMultipleOps()
{
    ReadOnlySpan<Vector<float>> first = MemoryMarshal.Cast<float, Vector<float>>(new Span<float>(data, 0, numberOfFloatItems));
    ReadOnlySpan<Vector<float>> second = MemoryMarshal.Cast<float, Vector<float>>(new Span<float>(data2, 0, numberOfFloatItems));
    ReadOnlySpan<Vector<float>> third = MemoryMarshal.Cast<float, Vector<float>>(new Span<float>(data3, 0, numberOfFloatItems));
    Span<Vector<float>> output = MemoryMarshal.Cast<float, Vector<float>>(new Span<float>(result, 0, numberOfFloatItems));

    int index = 0;
    while (index < first.Length)
    {
        output[index] = first[index] * second[index] + third[index];
        output[index] = output[index] * first[index] + first[index];
        output[index] = first[index] * second[index] + output[index];
        index++;
    }
}
* sp1[i] + sp1[i]; 201 | sp2[i] = sp1[i] * sp1[i] + sp2[i]; 202 | } 203 | } 204 | 205 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 206 | public unsafe void FmaMultiplyAddvector256Float() 207 | { 208 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(data, 0, numberOfFloatItems)); 209 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(data2, 0, numberOfFloatItems)); 210 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(data3, 0, numberOfFloatItems)); 211 | Span> r = MemoryMarshal.Cast>(new Span(result, 0, numberOfFloatItems)); 212 | 213 | for (int i = 0; i < d1.Length; i++) 214 | { 215 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], d3[i]); 216 | } 217 | }*/ 218 | 219 | /* 220 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 221 | public unsafe void FmaMultiplyAddvectorTFloat() 222 | { 223 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(data, 0, numberOfFloatItems)); 224 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(data2, 0, numberOfFloatItems)); 225 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(data3, 0, numberOfFloatItems)); 226 | Span> r = MemoryMarshal.Cast>(new Span(result, 0, numberOfFloatItems)); 227 | 228 | for (int i = 0; i < d1.Length; i++) 229 | { 230 | r[i] = d1[i] * d2[i] + d3[i]; 231 | } 232 | } 233 | 234 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 235 | public unsafe void Vector256DoubleMultipleOps() 236 | { 237 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(dataD, 0, numberOfDoubleItems)); 238 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(dataD2, 0, numberOfDoubleItems)); 239 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(dataD3, 0, numberOfDoubleItems)); 240 | Span> r = MemoryMarshal.Cast>(new Span(resultD, 0, numberOfDoubleItems)); 241 | 242 | for (int i = 0; i < d1.Length; i++) 243 | { 244 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], d3[i]); 245 | r[i] = Fma.MultiplyAdd(r[i], d1[i], d1[i]); 246 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], r[i]); 247 | } 248 | } */ 249 | 250 | 251 | /* 252 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 253 | 
public unsafe void FmaMultiplyAddSpanAMH() 254 | { 255 | //int step = Vector256.Count; 256 | 257 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(dataMemory.MemoryHandle.Pointer, dataMemory.ByteArrayLength)); 258 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(dataMemory2.MemoryHandle.Pointer, dataMemory2.ByteArrayLength)); 259 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(dataMemory3.MemoryHandle.Pointer, dataMemory3.ByteArrayLength)); 260 | Span> r = MemoryMarshal.Cast>(new Span(resultMemory.MemoryHandle.Pointer, resultMemory.ByteArrayLength)); 261 | 262 | for (int i = 0; i < d1.Length; i++) 263 | { 264 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], d3[i]); 265 | } 266 | } 267 | 268 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 269 | public unsafe void FmaMultiplyAddAMHPtr() 270 | { 271 | int step = Vector256.Count; 272 | 273 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 274 | float* currSpPtr12 = (float*)dataMemory.MemoryHandle.Pointer; 275 | float* currSpPtr13 = (float*)dataMemory.MemoryHandle.Pointer; 276 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 277 | 278 | for (int i = 0; i < numberOfItems; i += step) 279 | { 280 | Avx.StoreAligned(currSpPtr2, Fma.MultiplyAdd(Avx.LoadAlignedVector256(currSpPtr), Avx.LoadAlignedVector256(currSpPtr12), Avx.LoadAlignedVector256(currSpPtr13))); 281 | currSpPtr += step; 282 | currSpPtr12 += step; 283 | currSpPtr13 += step; 284 | currSpPtr2 += step; 285 | } 286 | } 287 | 288 | [BenchmarkCategory("Negative MultiplyAdd"), Benchmark(Baseline = true)] 289 | public unsafe void NegMultiplyAdd() 290 | { 291 | var sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 292 | var sp12 = new ReadOnlySpan(dataMemory2.MemoryHandle.Pointer, numberOfItems); 293 | var sp13 = new ReadOnlySpan(dataMemory3.MemoryHandle.Pointer, numberOfItems); 294 | var sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 295 | 296 | for (int i = 0; i < sp1.Length; i++) 297 | { 298 | sp2[i] = 
-(sp1[i] * sp12[i]) + sp13[i]; 299 | } 300 | } 301 | 302 | [BenchmarkCategory("Negative MultiplyAdd"), Benchmark] 303 | public unsafe void FmaNegMultiplyAdd() 304 | { 305 | int step = Vector256.Count; 306 | 307 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 308 | float* currSpPtr12 = (float*)dataMemory.MemoryHandle.Pointer; 309 | float* currSpPtr13 = (float*)dataMemory.MemoryHandle.Pointer; 310 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 311 | 312 | for (int i = 0; i < numberOfItems; i += step) 313 | { 314 | Avx.StoreAligned(currSpPtr2, Fma.MultiplyAddNegated(Avx.LoadAlignedVector256(currSpPtr), Avx.LoadAlignedVector256(currSpPtr12), Avx.LoadAlignedVector256(currSpPtr13))); 315 | currSpPtr += step; 316 | currSpPtr12 += step; 317 | currSpPtr13 += step; 318 | currSpPtr2 += step; 319 | } 320 | } 321 | 322 | 323 | [BenchmarkCategory("Reciprocal"), Benchmark(Baseline = true)] 324 | public unsafe void Reciprocal() 325 | { 326 | var sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 327 | var sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 328 | 329 | for (int i = 0; i < sp1.Length; i++) 330 | { 331 | sp2[i] = 1.0f / sp1[i]; 332 | } 333 | } 334 | 335 | [BenchmarkCategory("Reciprocal"), Benchmark] 336 | public unsafe void ReciprocalDouble() 337 | { 338 | var sp1 = new ReadOnlySpan(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems); 339 | var sp2 = new Span(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems); 340 | 341 | for (int i = 0; i < sp1.Length; i++) 342 | { 343 | sp2[i] = 1.0 / sp1[i]; 344 | } 345 | } 346 | 347 | 348 | [BenchmarkCategory("Reciprocal"), Benchmark] 349 | public unsafe void VectorReciprocal() 350 | { 351 | int step = Vector256.Count; 352 | 353 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 354 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 355 | 356 | for (int i = 0; i < numberOfItems; i += step) 357 | { 358 | Avx.StoreAligned(currSpPtr2, 
Avx.Reciprocal(Avx.LoadAlignedVector256(currSpPtr))); 359 | currSpPtr += step; 360 | currSpPtr2 += step; 361 | } 362 | } 363 | 364 | 365 | [BenchmarkCategory("Reciprocal"), Benchmark] 366 | public unsafe void VecReciprocal() 367 | { 368 | var sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 369 | ReadOnlySpan> vecSpan = MemoryMarshal.Cast>(sp1); 370 | 371 | var sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 372 | Span> vecSpan2 = MemoryMarshal.Cast>(sp2); 373 | 374 | for (int i = 0; i < vecSpan.Length; i++) 375 | { 376 | vecSpan2[i] = Vector.One / vecSpan[i]; 377 | } 378 | } 379 | 380 | [BenchmarkCategory("Reciprocal"), Benchmark] 381 | public unsafe void VectorReciprocalDouble() 382 | { 383 | double one = 1.0; 384 | double* onePtr = &one; 385 | 386 | int step = Vector256.Count; 387 | 388 | Vector256 oneVector = Avx.BroadcastScalarToVector256(onePtr); 389 | 390 | double* currSpPtr = (double*)dataMemory.MemoryHandle.Pointer; 391 | double* currSpPtr2 = (double*)resultDoubleMemory.MemoryHandle.Pointer; 392 | 393 | for (int i = 0; i < numberOfItems; i += step) 394 | { 395 | Avx.StoreAligned(currSpPtr2, Avx.Divide(oneVector, Avx.LoadAlignedVector256(currSpPtr))); 396 | currSpPtr += step; 397 | currSpPtr2 += step; 398 | } 399 | } */ 400 | /* 401 | [Benchmark] 402 | public unsafe void RecSquareRoot() 403 | { 404 | ReadOnlySpan sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 405 | Span sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 406 | 407 | for (int i = 0; i < sp1.Length; i++) 408 | { 409 | sp2[i] = 1.0f / MathF.Sqrt(sp1[i]); 410 | } 411 | } 412 | 413 | [Benchmark] 414 | public unsafe void RecSquareRootDouble() 415 | { 416 | ReadOnlySpan sp1 = new ReadOnlySpan(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems); 417 | Span sp2 = new Span(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems); 418 | 419 | for (int i = 0; i < sp1.Length; i++) 420 | { 421 | sp2[i] = 1.0 / 
Math.Sqrt(sp1[i]); 422 | } 423 | } 424 | 425 | [Benchmark] 426 | public unsafe void VectorRecSquareRoot() 427 | { 428 | int step = Vector256.Count; 429 | 430 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 431 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 432 | 433 | for (int i = 0; i < numberOfItems; i += step) 434 | { 435 | Avx.StoreAligned(currSpPtr2, Avx.Reciprocal(Avx.Sqrt(Avx.LoadAlignedVector256(currSpPtr)))); 436 | currSpPtr += step; 437 | currSpPtr2 += step; 438 | } 439 | } 440 | 441 | [Benchmark] 442 | public unsafe void VectorReciprocalSqrt() 443 | { 444 | int step = Vector256.Count; 445 | 446 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 447 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 448 | 449 | for (int i = 0; i < numberOfItems; i += step) 450 | { 451 | Avx.StoreAligned(currSpPtr2, Avx.ReciprocalSqrt(Avx.LoadAlignedVector256(currSpPtr))); 452 | currSpPtr += step; 453 | currSpPtr2 += step; 454 | } 455 | } 456 | 457 | [Benchmark] 458 | public unsafe void VectorRecSquareRootDouble() 459 | { 460 | double one = 1.0; 461 | double* onePt = &one; 462 | Vector256 oneVec = Avx.BroadcastScalarToVector256(onePt); 463 | 464 | int step = Vector256.Count; 465 | 466 | double* currSpPtr = (double*)dataMemory.MemoryHandle.Pointer; 467 | double* currSpPtr2 = (double*)resultDoubleMemory.MemoryHandle.Pointer; 468 | 469 | for (int i = 0; i < numberOfItems; i += step) 470 | { 471 | 472 | Avx.StoreAligned(currSpPtr2, Avx.Divide(oneVec, Avx.Sqrt(Avx.LoadAlignedVector256(currSpPtr)))); 473 | currSpPtr += step; 474 | currSpPtr2 += step; 475 | } 476 | } */ 477 | /* 478 | [BenchmarkCategory("Square root"), Benchmark(Baseline = true)] 479 | public unsafe void SquareRoot() 480 | { 481 | var sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 482 | var sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 483 | 484 | for (int i = 0; i < sp1.Length; i++) 485 | { 486 | sp2[i] = 
MathF.Sqrt(sp1[i]); 487 | } 488 | } 489 | 490 | [BenchmarkCategory("Square root"), Benchmark] 491 | public unsafe void SquareRootDouble() 492 | { 493 | var sp1 = new ReadOnlySpan(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems); 494 | var sp2 = new Span(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems); 495 | 496 | for (int i = 0; i < sp1.Length; i++) 497 | { 498 | sp2[i] = Math.Sqrt(sp1[i]); 499 | } 500 | } 501 | 502 | [BenchmarkCategory("Square root"), Benchmark] 503 | public unsafe void VectorSquareRoot() 504 | { 505 | int step = Vector256.Count; 506 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 507 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 508 | 509 | for (int i = 0; i < numberOfItems; i += step) 510 | { 511 | Avx.StoreAligned(currSpPtr2, Avx.Sqrt(Avx.LoadAlignedVector256(currSpPtr))); 512 | currSpPtr += step; 513 | currSpPtr2 += step; 514 | } 515 | } 516 | 517 | [BenchmarkCategory("Square root"), Benchmark] 518 | public unsafe void VecSquareRoot() 519 | { 520 | var sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 521 | ReadOnlySpan> vecSpan = MemoryMarshal.Cast>(sp1); 522 | 523 | var sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 524 | Span> vecSpan2 = MemoryMarshal.Cast>(sp2); 525 | 526 | for (int i = 0; i < vecSpan.Length; i++) 527 | { 528 | vecSpan2[i] = Vector.SquareRoot(vecSpan[i]); 529 | } 530 | } */ 531 | } 532 | 533 | } 534 | 535 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | ## Introduction to Core 3 Intrinsics in C#, with Benchmarks ## 2 | 3 | Taking the new `System.Runtime.Intrinsics` namespace for a spin and comparing it to scalar `float` and `Vector` operations. 4 | 5 | #### Contents #### 6 | - [Introduction to Intrinsics](#Intro) 7 | - [First steps](#First) 8 | - [Loading and storing data](#Load) 9 | - [Aligned vs. 
Unaligned Memory](#Aligned) 10 | - [Dataset Sizes vs Caches](#Cache) 11 | - [Basic Operations](#Basic) 12 | - [Comparisons](#Compare) 13 | - [What´s Missing?](#Missing) 14 | - [Some Benchmark Results](#Benchmarks) 15 | 16 | -------------------------------------- 17 | -------------------------------------- 18 | #### Introduction to Intrinsics #### 19 | 20 | The new functionality (available in .NET Core 3.0 and beyond) under the `System.Runtime.Intrinsics` namespace will open up some of the Intel and AMD processor intrinsics (see [Intel´s full guide here](https://software.intel.com/sites/landingpage/IntrinsicsGuide) and a [Microsoft blog entry](https://devblogs.microsoft.com/dotnet/hardware-intrinsics-in-net-core/) by Tanner Gooding on the subject). The coverage is not 100% but I imagine it will grow further as time passes. ARM processor support is in the future. 21 | 22 | In a nutshell, the new functionality expands SIMD processing beyond what´s possible using `System.Numerics.Vector` by adding dozens of new instructions. 23 | 24 | -------------------------------------- 25 | -------------------------------------- 26 | #### First steps #### 27 | 28 | You prepare your code by adding some `using` statements: 29 | ```C# 30 | using System.Runtime.Intrinsics; 31 | using System.Runtime.Intrinsics.X86; 32 | ``` 33 | `Intrinsics` contains the different new vector classes and structures ([Microsoft documentation](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.intrinsics?view=netcore-3.0)): `Vector64`, `Vector128` and `Vector256`. The number refers to the bit-length of the vector, as expected. 34 | 35 | The classes offer functions for creating and transforming vectors: `Vector256.Create(1.0f)` creates a new `Vector256`, with every component `float` initialized to `1.0f`; `Vector128.AsByte(someVector128)` creates a new `Vector128` that reinterprets the bits of the `float` values as `byte`s (no numeric conversion takes place). Also, you can create vectors using `Create` and explicitly passing all elements. 
36 | 37 | ```C# 38 | using System.Runtime.Intrinsics; 39 | 40 | namespace Core3Intrinsics 41 | { 42 | public class Intro 43 | { 44 | public Intro() 45 | { 46 | Vector128<float> middleVector = Vector128.Create(1.0f); // middleVector = <1,1,1,1> 47 | middleVector = Vector128.CreateScalar(-1.0f); // middleVector = <-1,0,0,0> 48 | Vector64<byte> floatBytes = Vector64.AsByte(Vector64.Create(1.0f, -1.0f)); // floatBytes = <0, 0, 128, 63, 0, 0, 128, 191> 49 | Vector256<float> left = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f); 50 | } 51 | } 52 | } 53 | ``` 54 | 55 | `Intrinsics.X86` contains the SIMD classes, like `Sse` and `Avx`. It can be quite daunting (see [Microsoft´s documentation here](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.intrinsics.x86?view=netcore-3.0)) since it does not contain any explanation of the functionality. For functions like `Add` it might not be necessary but the `Blend` name itself is not necessarily enlightening (unless you are already familiar with Intel´s intrinsics). 56 | 57 | All classes within `Intrinsics.X86` contain a static `IsSupported` `bool`: if `true` all is well and the platform supports the specific functionality (e.g. AVX2). If `false`, you are on your own, no software fallback is provided. If your code does not check for availability and happens to run on a hardware platform which does not support the functionality you are using, a `PlatformNotSupportedException` will be thrown at runtime. 58 | 59 | These classes contain all the currently supported SIMD functions, like `Add`, `LoadVector256` and many more. 
60 | 61 | ```C# 62 | using System.Runtime.Intrinsics; 63 | using System.Runtime.Intrinsics.X86; 64 | 65 | namespace Core3Intrinsics 66 | { 67 | public class Intro 68 | { 69 | public Intro() 70 | { 71 | if(Avx.IsSupported) 72 | { 73 | var left = Vector256.Create(-2.5f); 74 | var right = Vector256.Create(5.0f); 75 | Vector256 result = Avx.Add(left, right); // result = <2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5> 76 | result = Avx.Multiply(left, right); // result = <-12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5> 77 | 78 | double[] someDoubles = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 }; 79 | unsafe 80 | { 81 | fixed (double* ptr = &someDoubles[1]) 82 | { 83 | Vector256 res2 = Avx.LoadVector256(ptr); // res2 = <3, -2.5, 7.5, 10.8> 84 | } 85 | } 86 | } 87 | } 88 | } 89 | } 90 | ``` 91 | 92 | The [documentation](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.intrinsics.x86?view=netcore-3.0) contains the intrinsic function used by the processor (for `Add(Vector256, Vector256)` for example, the instruction is `__m256 _mm256_add_ps (__m256 a, __m256 b)`). This comes in handy in order to find the equivalent instruction in the [Intel guide](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=884,287,2825,136&text=_mm256_add_ps): 93 | 94 | ``` 95 | __m256 _mm256_add_ps (__m256 a, __m256 b) 96 | Synopsis 97 | __m256 _mm256_add_ps (__m256 a, __m256 b) 98 | #include 99 | Instruction: vaddps ymm, ymm, ymm 100 | CPUID Flags: AVX 101 | Description 102 | Add packed single-precision (32-bit) floating-point elements in a and b, and store the results in dst. 
103 | Operation 104 | FOR j := 0 to 7 105 | i := j*32 106 | dst[i+31:i] := a[i+31:i] + b[i+31:i] 107 | ENDFOR 108 | dst[MAX:256] := 0 109 | 110 | Performance 111 | | Architecture | Latency | Throughput (CPI) 112 | | ---------------|---------|----------------- 113 | | Skylake | 4 | 0.5 114 | | Broadwell | 3 | 1 115 | | Haswell | 3 | 1 116 | | Ivy Bridge | 3 | 1 117 | ``` 118 | 119 | This gives you the exact description of the operation(s) being performed and also performance data (the "Latency" value is "the number of processor clocks it takes for an instruction to have its data available for use by another instruction", the "Throughput" is "the number of processor clocks it takes for an instruction to execute or perform its calculations". See [Intel´s definition here](https://software.intel.com/en-us/articles/measuring-instruction-latency-and-throughput)). 120 | 121 | -------------------------------------- 122 | -------------------------------------- 123 | #### Loading and storing data #### 124 | 125 | ##### Creating Vectors 126 | 127 | As seen above, you can create vectors one-by-one using the various `Create` functions. Another possibility is to use the (unsafe) `Loadxxx()` functions. 128 | 129 | Storing data can be achieved with the `Storexxx()` functions. 
130 | 131 | ``` C# 132 | double[] someDoubles = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 }; 133 | double[] someResult = new double[someDoubles.Length]; 134 | unsafe 135 | { 136 | fixed (double* ptr = &someDoubles[1]) 137 | { 138 | fixed (double* ptr2 = &someResult[0]) 139 | { 140 | Vector256 res2 = Avx.LoadVector256(ptr); // res2 = <3, -2.5, 7.5, 10.8> 141 | Avx.Store(ptr2, res2); 142 | } 143 | } 144 | } 145 | 146 | ``` 147 | 148 | You can also create a new vector by interleaving two others: 149 | ``` C# 150 | left = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, - 70.0f, -80.0f); 151 | right = Vector256.Create(1.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f); 152 | result = Avx.UnpackLow(left, right); // result = <-1, 1, -2, 2, -50, 50, -60, 60> 153 | result = Avx.UnpackHigh(left, right); // result = <-3, 3, -4, 4, -70, 70, -80, 80> 154 | ``` 155 | ``` ini 156 | R = UnpackLow(A, B) 157 | 158 | |------|------|------|------|------|------|------|------| 159 | | A0 | A1 | A2 | A3 | A4 | A5 | A6 | A7 | 160 | |------|------|------|------|------|------|------|------| 161 | |------|------|------|------|------|------|------|------| 162 | | B0 | B1 | B2 | B3 | B4 | B5 | B6 | B7 | 163 | |------|------|------|------|------|------|------|------| 164 | 165 | R0 R1 R2 R3 R4 R5 R6 R7 166 | |------|------|------|------|------|------|------|------| 167 | | A0 | B0 | A1 | B1 | A4 | B4 | A5 | B5 | 168 | |------|------|------|------|------|------|------|------| 169 | ``` 170 | ##### Vectors from Arrays ##### 171 | 172 | Many times you´ll use the intrinsics for huge amounts of data, so a more practical approach to create vectors could be: 173 | 174 | ``` C# 175 | public float[] ProcessData(ref Span input) 176 | { 177 | float[] results = new float[input.Length]; 178 | Span> resultVectors = MemoryMarshal.Cast>(results); 179 | 180 | ReadOnlySpan> inputVectors = MemoryMarshal.Cast>(input); 181 | 182 | for(int i = 0; i < inputVectors.Length; i++) 183 | { 184 | 
resultVectors[i] = Avx.Sqrt(inputVectors[i]); 185 | } 186 | 187 | return results; 188 | } 189 | ``` 190 | 191 | `System.Runtime.InteropServices.MemoryMarshal.Cast()` will cast values in place (i.e. no copying involved). At the end of the loop, the `results` array will automagically contain the individual floats from the vector operation (btw, the above example does not check if the `input` array fits neatly into `Vector256`, i.e. whether its length is a multiple of 8; normally you´d need to process any remaining elements in a scalar way). 192 | 193 | You can also go `unsafe` and loop through pointers, of course: 194 | 195 | ``` C# 196 | public unsafe float[] ProcessDataUnsafe(ref Span<float> input) 197 | { 198 | float[] results = new float[input.Length]; 199 | fixed (float* inputPtr = &input[0]) 200 | { 201 | float* inCurrent = inputPtr; 202 | fixed (float* resultPtr = &results[0]) 203 | { 204 | float* resEnd = resultPtr + results.Length; 205 | float* resCurrent = resultPtr; 206 | while (resCurrent < resEnd) 207 | { 208 | Avx.Store(resCurrent, Avx.Sqrt(Avx.LoadVector256(inCurrent))); 209 | resCurrent += 8; 210 | inCurrent += 8; 211 | } 212 | } 213 | } 214 | return results; 215 | } 216 | ``` 217 | No performance difference on my machine, though. Note that this loop also assumes that `input.Length` is a multiple of 8: otherwise the last `LoadVector256`/`Store` pair reads and writes past the end of the buffers. 218 | 219 | ##### Using intrinsics to copy memory? A disappointment... ##### 220 | 221 | Although moving data around using vectors seems pretty efficient, I was surprised to measure `System.Runtime.CompilerServices.Unsafe.CopyBlock(ref byte destination, ref byte source, uint byteCount)` as faster, independently of data size (i.e. even data far bigger than cache will be copied efficiently). Of course it´s unsafe in the sense that you need to know what you are doing (not `unsafe` though). 
222 | 223 | ``` 224 | | Method | numberOfBytes | Mean | Error | StdDev | Median | Ratio | RatioSD | 225 | |------------------------------ |-------------- |---------------:|--------------:|--------------:|---------------:|------:|--------:| 226 | | ScalarStoreBlock | 16384 | 306.1 ns | 8.539 ns | 12.246 ns | 302.8 ns | 1.00 | 0.00 | 227 | | VectorStoreArrayMemPtr | 16384 | 401.3 ns | 8.049 ns | 12.998 ns | 397.5 ns | 1.32 | 0.07 | 228 | 229 | | ScalarStoreBlock | 8388608 | 1,106,074.5 ns | 17,544.390 ns | 14,650.360 ns | 1,107,074.2 ns | 1.00 | 0.00 | 230 | | VectorStoreArrayMemPtr | 8388608 | 1,573,258.0 ns | 34,312.238 ns | 44,615.601 ns | 1,561,962.8 ns | 1.43 | 0.05 | 231 | 232 | ``` 233 | An impressive 32 - 43% advantage... It shows that a properly optimized scalar method (probably using some very smart assembly instructions) beats a naïve vectorization with ease. 234 | 235 | -------------------------------------- 236 | -------------------------------------- 237 | #### Aligned vs. Unaligned Memory 238 | 239 | If you look through the different `Load...` instructions available, you´ll notice that you have, for example, `LoadVector256(T*)` and `LoadAlignedVector256(T*)`. 240 | 241 | > :warning: The "Aligned" part refers to memory alignment of the pointer to the beginning of the data: in order to use the `LoadAligned` version of the functions, your data needs to start at a specific boundary: for 256 bit vectors (32 bytes), the data ***needs*** to start at a location (pointer address) that is a multiple of 32 (for 128 bit vectors it needs to be aligned at 16 byte boundaries). Failure to do so can result in a runtime ***general protection fault***. 242 | 243 | In the past, aligned data used to work much better than unaligned data, but modern processors don´t really care, as long as your data is aligned to the OS´s natural boundary in order to avoid straddling cache line or page boundaries (see [this comment by T. 
Gooding](https://devblogs.microsoft.com/dotnet/hardware-intrinsics-in-net-core/#comment-2942), for example). 244 | 245 | Comparing aligned to unaligned on my machine: 246 | 247 | | Method | NumberOfBytes | Mean | Error | StdDev | Median | Ratio | RatioSD | 248 | |------------------------------------ |-------------- |---------------:|--------------:|--------------:|---------------:|------:|--------:| 249 | | VectorStoreAlignedUnsafe | 8388608 | 1,508,063.2 ns | 30,407.034 ns | 26,955.044 ns | 1,501,035.9 ns | 1.00 | 0.00 | 250 | | VectorStoreUnalignedUnsafe | 8388608 | 1,527,444.0 ns | 29,279.764 ns | 30,068.162 ns | 1,514,013.7 ns | 1.02 | 0.03 | 251 | | VectorStoreUnalignedToAlignedUnsafe | 8388608 | 1,485,540.1 ns | 12,131.046 ns | 10,129.973 ns | 1,486,236.1 ns | 0.99 | 0.02 | 252 | 253 | There´s really no meaningful difference for bigger data chunks. 254 | 255 | -------------------------------------- 256 | -------------------------------------- 257 | #### Dataset Sizes vs Caches #### 258 | 259 | Often overlooked, the size of your datasets may have an important impact on your processing times (apart from the obvious increase in elements): if only a few operations are performed per data point, memory access times will be crucial, so once the data no longer fits in a processor core´s cache you´ll notice a non-linear increase in processing time vs. data size. 260 | 261 | > :warning: In other words, when you measure your loop in order to determine your gains (if any!) from using intrinsics, it´s important to test with *data sizes close to the real data*. For huge data, test with arrays several times bigger than the available cache size, at least. 262 | 263 | -------------------------------------- 264 | -------------------------------------- 265 | #### Basic Floating Point Math Operations #### 266 | 267 | As mentioned above, `System.Runtime.Intrinsics.X86` contains the SSE, AVX etc. functionality. You can add, subtract, multiply and divide all kinds of vectors. 
268 | 269 | You also have `Sqrt` and `ReciprocalSqrt`, `Min` and `Max`, they all do what you expect. 270 | 271 | Some more exotic operations are: 272 | 273 | ##### AddSubtract ##### 274 | *__m256d _mm256_addsub_pd (__m256d a, __m256d b)* 275 | 276 | ``` C# 277 | var left = Vector256.Create(-2.5f); // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5> 278 | var right = Vector256.Create(5.0f); // <5, 5, 5, 5, 5, 5, 5, 5> 279 | Vector256 result = Avx.AddSubtract(left, right); // result = <-7.5, 2.5, -7.5, 2.5, -7.5, 2.5, -7.5, 2.5> 280 | ``` 281 | 282 | 283 | `AddSubtract` will *subtract* the even components (0, 2, ...) and *add* the odd ones (1, 3, ...). 284 | 285 | ``` ini 286 | |------|------|------|------|------| 287 | | A0 | A1 | A2 | A3 | ... | 288 | |------|------|------|------|------| 289 | - + - + ... 290 | |------|------|------|------|------| 291 | | B0 | B1 | B2 | B3 | ... | 292 | |------|------|------|------|------| 293 | 294 | ``` 295 | 296 | 297 | ##### DotProduct ##### 298 | 299 | *__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)* 300 | 301 | The `Avx.DotProduct` is a bit out of the common: 302 | 303 | ``` C# 304 | left = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, - 70.0f, -80.0f); 305 | right = Vector256.Create(1.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f); 306 | result = Avx.DotProduct(left, right, 0b1111_0001); // result = <-30, 0, 0, 0, -17400, 0, 0, 0> 307 | ``` 308 | This will actually create **2** dot products of 128 bit vectors: from the first four elements of `left` and `right`, stored on the first element of `result`, and the same for the right 4 elements, stored on the 5th element. In other words, it will perform a dot product on two 128 bit float vectors independently. It can be visualized as doing the dot product of 2 four float element vectors separately and simultaneously. 
309 | 310 | You can control which product is performed by using the 4 high order bits of the third parameter **in reverse order**: all ones means do all 4 products (on each 128 bit half). A value of `0b0001` would mean that only the **first** element´s products is performed, a value of `0b1010` will multiply second and fourth: 311 | 312 | ``` C# 313 | result = Avx.DotProduct(left, right, 0b1010_0001); // result = <-20, 0, 0, 0, -10000, 0, 0, 0> 314 | ``` 315 | 316 | If you think of vectors with x, y, z and w components, the order in which you turn the product on or off is thus (w, z, y, x). 317 | 318 | The second half of the third parameter byte indicates **where to store** the dot product results, again in **reverse order**: `0001` means store the result in the first elements of each 128 bit vector. 319 | 320 | ``` ini 321 | R = DotProduct(A, B, bitMask) 322 | 323 | bit mask = b7 b6 b5 b4 0 0 0 1 324 | 325 | b4 b5 b6 b7 b4 b5 b6 b7 326 | |------|------|------|------||------|------|------|------| 327 | | A0 | A1 | A2 | A3 || A4 | A5 | A6 | A7 | 328 | |------|------|------|------||------|------|------|------| 329 | * * * * * * * * 330 | |------|------|------|------||------|------|------|------| 331 | | B0 | B1 | B2 | B3 || B4 | B5 | B6 | B7 | 332 | |------|------|------|------||------|------|------|------| 333 | = = ... 334 | 0 0 0 0 335 | or + or ... or + or ... 336 | A0*B0 A1*B1 A4*B4 A5*B5 337 | |__________________________||__________________________| 338 | 339 | stored in stored in 340 | | | 341 | | | 342 | 343 | 1 0 0 0 1 0 0 0 344 | |------|------|------|------||------|------|------|------| 345 | | R0 | 0 | 0 | 0 || R4 | 0 | 0 | 0 | 346 | |------|------|------|------||------|------|------|------| 347 | ``` 348 | 349 | > :warning: You should do some benchmarking before using this instruction, its performance doesn´t seem to be too hot. 
350 | 351 | ##### Floor, Ceiling ##### 352 | These do what you expect: 353 | 354 | ``` C# 355 | var left = Vector256.Create(-2.5f); // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5> 356 | var right = Vector256.Create(5.0f); // <5, 5, 5, 5, 5, 5, 5, 5> 357 | 358 | result = Avx.Floor(left); // result = <-3, -3, -3, -3, -3, -3, -3, -3> 359 | result = Avx.Ceiling(left); // result = <-2, -2, -2, -2, -2, -2, -2, -2> 360 | ``` 361 | 362 | In order to have finer control you also have `RoundToNearestInteger` , `RoundToNegativeInfinity` etc. 363 | 364 | ##### Horizontal Add, Subtract ##### 365 | 366 | *__m256 _mm256_hadd_ps (__m256 a, __m256 b)* 367 | 368 | *__m256 _mm256_hsub_ps (__m256 a, __m256 b)* 369 | 370 | ``` C# 371 | var left = Vector256.Create(-2.5f); // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5> 372 | var right = Vector256.Create(5.0f); // <5, 5, 5, 5, 5, 5, 5, 5> 373 | result = Avx.HorizontalAdd(left, right); // result = <-5, -5, 10, 10, -5, -5, 10, 10> 374 | result = Avx.HorizontalSubtract(left, right); // result = <0, 0, 0, 0, 0, 0, 0, 0> 375 | ``` 376 | 377 | `HorizontalAdd` will add element 0 and 1 from `left`, then elements 2 and 3. They get stored in elements 0 and 1 of `result`. Then it goes on with the same for `right` and stores the results in elements 2 and 3 of `result`; then further... 
378 | 379 | ``` ini 380 | R = HorizontalAdd(A, B) 381 | |------|------|------|------|------|------|------|------| 382 | | A0 | A1 | A2 | A3 | A4 | A5 | A6 | A7 | 383 | |------|------|------|------|------|------|------|------| 384 | |------|------|------|------|------|------|------|------| 385 | | B0 | B1 | B2 | B3 | B4 | B5 | B6 | B7 | 386 | |------|------|------|------|------|------|------|------| 387 | 388 | R0 R1 R2 R3 R4 R5 R6 R7 389 | |----------|----------|----------|----------|----------|----------|----------|----------| 390 | | A0 + A1 | A2 + A3 | B0 + B1 | B2 + B3 | A4 + A5 | A6 + A7 | B4 + B5 | B6 + B7 | 391 | |----------|----------|----------|----------|----------|----------|----------|----------| 392 | 393 | ``` 394 | 395 | ##### FMA - Fused Multiply Operations ##### 396 | 397 | *__m256 _mm256_fmadd_ps* etc. 398 | 399 | ``` C# 400 | if (Fma.IsSupported) 401 | { 402 | var resultFma = Fma.MultiplyAdd(left, right, other); // = left * right + other for each element 403 | resultFma = Fma.MultiplyAddNegated(left, right, other); // = -(left * right) + other for each element 404 | resultFma = Fma.MultiplySubtract(left, right, other); // = left * right - other for each element 405 | Fma.MultiplyAddSubtract(left, right, other); // even elements (0, 2, ...) like MultiplySubtract, odd elements (1, 3, ...) like MultiplyAdd 406 | } 407 | ``` 408 | These instructions will combine multiplies with add or subtract in several variants. 409 | 410 | -------------------------------------- 411 | -------------------------------------- 412 | #### Vector Comparisons #### 413 | 414 | There are several intrinsics to compare vectors. 
415 | 416 | ##### Vector results 417 | 418 | A set of `Sse.Compare...` exist for **128-bit vectors**: 419 | 420 | ``` C# 421 | var left128 = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f); 422 | var right128 = Vector128.Create(2.0f, 3.0f, 4.0f, 5.0f); 423 | Vector128 compResult128 = Sse.CompareGreaterThan(left128, right128); // compResult128 = <0, 0, 0, 0> 424 | ``` 425 | 426 | You also have `CompareLessThanOrEqual`, `CompareNotEqual` and many more. 427 | 428 | If the comparison is `false` for a given element, the result vector will have a zero in that position. If `true` the position will be occupied by a value of all bits set to 1 (which results in `NaN` for `float` and `double`). 429 | 430 | For **256-bit vectors**, `Avx.Compare(vector a, vector b, flag)` will compare both vectors according to the `FloatComparisonMode` flag given. 431 | ``` C# 432 | left = Vector256.Create(-1.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f); 433 | right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f); 434 | var compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN> 435 | ``` 436 | `FloatComparisonMode.OrderedGreaterThanNonSignaling` will compare if elements in `left` are greater than elements in `right`. As above, if the comparison is `false`, the result vector will have a zero in that position. If `true` the position will be occupied by a value of all bits set to 1 (which results in `NaN` for `float` and `double`). 437 | 438 | > The `Ordered...` part of the flag´s name refers to how `NaN` in the vectors are treated, the `...NonSignaling` means to not throw exceptions when NaNs occur, although I am not really sure how this works yet [TO BE CONTINUED]. 
439 | 440 | Once you have the comparison result, there are several things you can do with it: 441 | 442 | ``` C# 443 | left = Vector256.Create(-1.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f); 444 | right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f); 445 | var compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN> 446 | int res = Avx.MoveMask(compareResult); // res = 0b10101010 = 0xAA = 170 447 | 448 | if (res != 0) 449 | { 450 | // At least one comparison is true, do something 451 | } 452 | ``` 453 | 454 | `MoveMask` will create an `int` whose bits indicate the elements which are `true` (in reality, it will copy each element's highest order bit, which comes down to the same, since `true` has all bits set). Element 0 ends up in the lowest-order bit, so when the `int` is written as a binary literal the elements appear **in reverse order**. 455 | 456 | If you don't need to know which elements satisfy the comparison but just want to determine whether all of them did, you can do: 457 | 458 | ``` C# 459 | left = Vector256.Create(-1.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f); 460 | right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f); 461 | var compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN> 462 | bool areAllTrue = Avx.MoveMask(compareResult) == 0b11111111; // areAllTrue = false (note: !Avx.TestZ(compareResult, compareResult) tells you whether ANY element is true, not whether all are) 463 | 464 | if(!areAllTrue) 465 | { 466 | // At least one comparison is false, do something 467 | } 468 | ``` 469 | 470 | You can also use the resulting vector to selectively load vector elements: 471 | 472 | ``` C# 473 | left = Vector256.Create(-1.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f); 474 | right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f); 475 | var mask = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // mask = <0, NaN, 0, NaN, 0, NaN, 0, NaN> 476 | Vector256<float> 
mixed = Avx.BlendVariable(left, right, mask); // mixed = <-1, 2, -3, 2, -50, -60, -70, -80> 477 | ``` 478 | 479 | For each element in the third parameter (`mask`), `BlendVariable` will pick the corresponding element from the **second** vector (`right` in the above snippet) if the mask's value is **`true`**; otherwise it will pick the element from the first vector. 480 | 481 | In the above snippet, `left`[0] = `-1.0f`, `right`[0] = `0.0f`. The mask is `0` (false) at this position, so the result vector's first position gets the value from the **first** vector: `-1.0f`. 482 | 483 | 484 | ##### Scalar Results ##### 485 | 486 | As mentioned above, there are some intrinsics to compare values that return a scalar (`int` or `bool`): `TestZ`, `TestC` etc., and `MoveMask`. 487 | 488 | -------------------------------------- 489 | -------------------------------------- 490 | 491 | #### What´s Missing? #### 492 | 493 | There are no [trigonometric functions](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#cats=Trigonometry) as yet: cosine, sine etc. are all missing. Maybe some others are missing too, but that's the category that caught my eye. 494 | 495 | -------------------------------------- 496 | -------------------------------------- 497 | 498 | #### Benchmark Results #### 499 | 500 | Some benchmarks, with small data sizes (i.e. the data should fit into L2 cache) and larger sizes (i.e. 10 x L3 cache size) on my machine. 
501 | 502 | ##### FMA with `floats` ##### 503 | 504 | 505 | A simple scalar loop: 506 | 507 | ``` C# 508 | [BenchmarkCategory("MultiplyAdd"), Benchmark(Baseline = true)] 509 | public unsafe void MultiplyAddScalarFloat() 510 | { 511 | var sp1 = new ReadOnlySpan<float>(data, 0, numberOfFloatItems); 512 | var sp12 = new ReadOnlySpan<float>(data2, 0, numberOfFloatItems); 513 | var sp13 = new ReadOnlySpan<float>(data3, 0, numberOfFloatItems); 514 | var sp2 = new Span<float>(result, 0, numberOfFloatItems); 515 | 516 | for (int i = 0; i < sp1.Length; i++) 517 | { 518 | sp2[i] = sp1[i] * sp12[i] + sp13[i]; 519 | } 520 | } 521 | ``` 522 | 523 | The same using `Fma`: 524 | 525 | ``` C# 526 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 527 | public unsafe void FmaMultiplyAddvector256Float() 528 | { 529 | ReadOnlySpan<Vector256<float>> d1 = MemoryMarshal.Cast<float, Vector256<float>>(new Span<float>(data, 0, numberOfFloatItems)); 530 | ReadOnlySpan<Vector256<float>> d2 = MemoryMarshal.Cast<float, Vector256<float>>(new Span<float>(data2, 0, numberOfFloatItems)); 531 | ReadOnlySpan<Vector256<float>> d3 = MemoryMarshal.Cast<float, Vector256<float>>(new Span<float>(data3, 0, numberOfFloatItems)); 532 | Span<Vector256<float>> r = MemoryMarshal.Cast<float, Vector256<float>>(new Span<float>(result, 0, numberOfFloatItems)); 533 | 534 | for (int i = 0; i < d1.Length; i++) 535 | { 536 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], d3[i]); 537 | } 538 | } 539 | ``` 540 | Comparing both gives: 541 | 542 | ``` ini 543 | 544 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362 545 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores 546 | .NET Core SDK=3.0.100-rc1-014190 547 | [Host] : .NET Core 3.0.0-rc1-19456-20 (CoreCLR 4.700.19.45506, CoreFX 4.700.19.45604), 64bit RyuJIT 548 | DefaultJob : .NET Core 3.0.0-rc1-19456-20 (CoreCLR 4.700.19.45506, CoreFX 4.700.19.45604), 64bit RyuJIT 549 | 550 | 551 | ``` 552 | | Method | ParamCacheSizeBytes | Mean | Error | StdDev | Ratio | RatioSD | 553 | |------------------------------ |-------------------- |-------------:|------------:|------------:|------:|--------:| 554 | | **MultiplyAddScalarFloat** | **262144** | **20.128 us** | **0.5597 us** | 
**0.8377 us** | **1.00** | **0.00** | 555 | | FmaMultiplyAddvector256Float | 262144 | 6.750 us | 0.1338 us | 0.1186 us | 0.33 | 0.02 | 556 | | | | | | | | | 557 | | **MultiplyAddScalarFloat** | **41943040** | **5,208.768 us** | **103.2312 us** | **257.0815 us** | **1.00** | **0.00** | 558 | | FmaMultiplyAddvector256Float | 41943040 | 4,021.671 us | 75.5671 us | 70.6856 us | 0.78 | 0.04 | 559 | 560 | 561 | As expected for a small number of operations inside the loop, the memory access time takes its toll: only a 22% time reduction for larger data sizes with vector intrinsics, using safe operations. (Although 22% could really be many hours for really huge jobs, of course...) 562 | 563 | If, on the other hand, we perform 3 FMA operations per step in the loop and use pointers for the vectors, we get a more consistent speedup: still 1.67x for bigger data sets (see the source code for the implementation of the test): 564 | 565 | | Method | ParamCacheSizeBytes | Mean | Error | StdDev | Median | Ratio | RatioSD | 566 | |-------------------------------- |-------------------- |------------:|------------:|------------:|------------:|------:|--------:| 567 | | ScalarFloatMultipleOps | 262144 | 40.88 us | 1.1768 us | 1.2592 us | 40.44 us | 1.00 | 0.00 | 568 | | Vector256FloatMultipleOpsUnsafe | 262144 | 17.75 us | 0.0963 us | 0.0752 us | 17.74 us | 0.43 | 0.02 | 569 | | VectorTFloatMultipleOps | 262144 | 18.29 us | 0.1063 us | 0.0942 us | 18.29 us | 0.45 | 0.01 | 570 | | | | | | | | | | 571 | | ScalarFloatMultipleOps | 41943040 | 7,877.13 us | 142.2414 us | 126.0933 us | 7,851.68 us | 1.00 | 0.00 | 572 | | Vector256FloatMultipleOpsUnsafe | 41943040 | 4,659.43 us | 91.8353 us | 176.9355 us | 4,731.19 us | 0.60 | 0.02 | 573 | | VectorTFloatMultipleOps | 41943040 | 5,239.05 us | 126.9370 us | 118.7370 us | 5,246.28 us | 0.67 | 0.02 | 574 | 575 | The processor will likely prefetch data while it performs operations, I'd assume, effectively hiding the access time. 
576 | 577 | `Vector<T>` is slower, as expected, partly (probably) because it doesn't implement `Fma` and partly since we are using safe code and the loop will be slowed down by range checks. 578 | 579 | 580 | ##### Mixed `float` operations ##### 581 | 582 | The [Mandelbrot set](https://en.wikipedia.org/wiki/Mandelbrot_set) is an all-time favorite to show off parallel processing. On my machine I get the following results for a 1920 x 1080 image (this is just generating values, not creating a bitmap): 583 | 584 | | Method | Mean | Error | StdDev | Ratio | 585 | |---------------- |----------:|----------:|----------:|------:| 586 | | FloatMandel | 134.92 ms | 1.1073 ms | 0.9247 ms | 1.00 | 587 | | Vector256Mandel | 25.98 ms | 0.1739 ms | 0.1626 ms | 0.19 | 588 | 589 | A 5.2x speedup is nice! The vector loop could probably be further optimized though, I just did a naïve translation of the scalar code. 590 | 591 | ##### Some `int` Operations ##### 592 | 593 | Some basic integer operations show an average 1.5 - 1.6x speed increase with intrinsics for one operation, probably illustrating that modern processors are already very good at handling ints and, again perhaps, that memory access times dominate. 
594 | 595 | | Method | NumberOfItems | Mean | Error | StdDev | Ratio | RatioSD | 596 | |------------------------ |-------------- |---------:|----------:|----------:|------:|--------:| 597 | | IntAdd | 4096000 | 4.811 ms | 0.0262 ms | 0.0219 ms | 1.00 | 0.00 | 598 | | IntAddVector256 | 4096000 | 3.041 ms | 0.0499 ms | 0.0442 ms | 0.63 | 0.01 | 599 | | IntXor | 4096000 | 4.834 ms | 0.0838 ms | 0.0700 ms | 1.00 | 0.01 | 600 | | IntXorVector256 | 4096000 | 3.028 ms | 0.0457 ms | 0.0405 ms | 0.63 | 0.01 | 601 | | IntMultiply | 4096000 | 4.777 ms | 0.1163 ms | 0.0971 ms | 0.99 | 0.02 | 602 | | IntMultiplyLowVector256 | 4096000 | 3.013 ms | 0.0380 ms | 0.0337 ms | 0.63 | 0.01 | 603 | | IntShiftLeft | 4096000 | 4.057 ms | 0.1107 ms | 0.1036 ms | 0.84 | 0.02 | 604 | | IntShiftLeftVector256 | 4096000 | 3.063 ms | 0.0638 ms | 0.0597 ms | 0.64 | 0.01 | 605 | | IntMax | 4096000 | 4.757 ms | 0.1295 ms | 0.1272 ms | 0.99 | 0.03 | 606 | | IntMaxVector256 | 4096000 | 3.018 ms | 0.0286 ms | 0.0239 ms | 0.63 | 0.01 | 607 | 608 | Chaining three ops inside the loop gives: 609 | 610 | | Method | NumberOfItems | Mean | Error | StdDev | Ratio | 611 | |------------------------ |-------------- |---------:|----------:|----------:|------:| 612 | | IntMultipleOps | 4096000 | 5.430 ms | 0.1067 ms | 0.1048 ms | 1.00 | 613 | | IntMultipleOpsvector256 | 4096000 | 3.016 ms | 0.0551 ms | 0.0516 ms | 0.56 | 614 | 615 | Only a small improvement, so probably processor optimization plays the bigger role. --------------------------------------------------------------------------------