├── Core3Intrinsics ├── Core3Intrinsics.csproj ├── Program.cs ├── Validator.cs ├── Transpose.cs ├── Intro.cs └── Mandelbrot.cs ├── Core3IntrinsicsBenchmarks ├── Program.cs ├── Core3IntrinsicsBenchmarks.csproj ├── AlignedMemoryHandle.cs ├── ReadmeBenches.cs ├── TrigonometricOps.cs ├── AlignedArrayPool.cs ├── Mandelbrot.cs ├── MemoryBenches.cs ├── IntegerBasicOps.cs └── BasicOps.cs ├── LICENSE ├── Core3Intrinsics.sln ├── .gitattributes ├── ExtraFiles ├── MemoryBenches2.md ├── MemoryBenches-Aligned.md └── MemoryBenches-1.md ├── .gitignore └── Readme.md /Core3Intrinsics/Core3Intrinsics.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp3.0 6 | 8.0 7 | 8 | 9 | 10 | x64 11 | true 12 | 13 | 14 | 15 | x64 16 | true 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using BenchmarkDotNet.Running; 3 | using BenchmarkDotNet.Configs; 4 | using System.Collections.Generic; 5 | 6 | namespace Core3IntrinsicsBenchmarks 7 | { 8 | class Program 9 | { 10 | static void Main() 11 | { 12 | //var summary = BenchmarkRunner.Run(); 13 | //_ = BenchmarkRunner.Run(); 14 | //var summary = BenchmarkRunner.Run(); 15 | //var summary = BenchmarkRunner.Run(); 16 | var summary = BenchmarkRunner.Run(); 17 | //var summary = BenchmarkRunner.Run(); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/Core3IntrinsicsBenchmarks.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp3.0 6 | 7.3 7 | 8 | 9 | 10 | x64 11 | true 12 | pdbonly 13 | true 14 | 15 | 16 | 17 | x64 18 | true 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 C. B. Gonzalez 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/AlignedMemoryHandle.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Buffers; 3 | using System.Collections.Generic; 4 | using System.Runtime.InteropServices; 5 | using System.Text; 6 | 7 | namespace Core3IntrinsicsBenchmarks 8 | { 9 | public unsafe class AlignedMemoryHandle where T : struct 10 | { 11 | private MemoryHandle memoryHandle; 12 | readonly byte* bytePointer; 13 | readonly int byteArrayLength; 14 | readonly Memory memory; 15 | 16 | public MemoryHandle MemoryHandle => memoryHandle; 17 | 18 | public ref byte ByteRef => ref GetByteRef(); 19 | 20 | public ref T TRef => ref GetTRef(); 21 | 22 | public Memory Memory => memory; 23 | 24 | public int ByteArrayLength => byteArrayLength; 25 | 26 | public unsafe AlignedMemoryHandle(void* pointer, GCHandle handle, ref T arrayStart, int byteLength) 27 | { 28 | memoryHandle = new MemoryHandle(pointer, handle); 29 | bytePointer = (byte*)pointer; 30 | ref T tRef = ref arrayStart; 31 | byteArrayLength = byteLength; 32 | memory = new Memory(MemoryMarshal.Cast(new Span(pointer, byteLength)).ToArray()); 33 | } 34 | 35 | private unsafe ref byte GetByteRef() 36 | { 37 | return ref bytePointer[0]; 38 | } 39 | 40 | private unsafe ref T GetTRef() 41 | { 42 | return ref MemoryMarshal.Cast(new Span((void*)bytePointer, byteArrayLength)).ToArray()[0]; 43 | } 44 | 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /Core3Intrinsics/Program.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.InteropServices; 3 | 4 | namespace Core3Intrinsics 5 | { 6 | class Program 7 | { 8 | static unsafe void Main() 9 | { 10 | Console.WriteLine("Starting test ..."); 11 | Console.WriteLine("\tMandelBrot"); 12 | var man = new Mandelbrot(); 13 | man.FloatMandel(); 14 | man.Vector256Mandel(); 15 | (bool areEqual, System.Collections.Generic.List errorList, int maxDifference) = Validator.CompareValuesFloat(man.results.Span.ToArray(), man.results2.Span.ToArray()); 16 | Console.WriteLine($"\t\tMandelBrot successful: {areEqual}, Number of differences: {errorList.Count}, max. difference: {maxDifference}"); 17 | Console.WriteLine($"\t\tDone with mandelbrot, total bytes: {man.SizeInBytes}"); 18 | //Transpose.CreateArrays(); 19 | //bool res1 = Transpose.SerializeColorsInt(); 20 | //bool res2 = Transpose.SerializedColorsVector256(); 21 | //if(res1 && res2) 22 | //{ 23 | // (bool areEqual, System.Collections.Generic.List errorList) = Validator.CompareValues(Transpose.transposed1, Transpose.transposed2); 24 | // Console.WriteLine($"Transpose ended with success {areEqual}, number of differences {errorList.Count}"); 25 | //} 26 | //else 27 | //{ 28 | // Console.WriteLine($"Error running Transpose"); 29 | //} 30 | _ = Console.ReadLine(); 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /Core3Intrinsics.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.29215.179 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Core3Intrinsics", "Core3Intrinsics\Core3Intrinsics.csproj", "{8ABE3139-8924-46FE-B8D4-155FE20DD285}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Core3IntrinsicsBenchmarks", "Core3IntrinsicsBenchmarks\Core3IntrinsicsBenchmarks.csproj", "{FFEC9419-D276-46DB-8136-4642054E1C99}" 9 | EndProject 10 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{A3F9D91A-297A-40E2-9714-D009F3FB9CF0}" 11 | ProjectSection(SolutionItems) = preProject 12 | Readme.md = Readme.md 13 | EndProjectSection 14 | EndProject 15 | Global 16 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 17 | Debug|Any CPU = Debug|Any CPU 18 | Release|Any CPU = Release|Any CPU 19 | EndGlobalSection 20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 21 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 22 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Debug|Any CPU.Build.0 = Debug|Any CPU 23 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Release|Any CPU.ActiveCfg = Release|Any CPU 24 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Release|Any CPU.Build.0 = Release|Any CPU 25 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 26 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Debug|Any CPU.Build.0 = Debug|Any CPU 27 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Release|Any CPU.ActiveCfg = Release|Any CPU 28 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Release|Any CPU.Build.0 = Release|Any CPU 29 | EndGlobalSection 30 | GlobalSection(SolutionProperties) = preSolution 31 | HideSolutionNode = FALSE 32 | EndGlobalSection 33 | GlobalSection(ExtensibilityGlobals) = postSolution 34 | SolutionGuid = {0AA0631C-9878-463C-8661-45CA8F282505} 35 | EndGlobalSection 36 | EndGlobal 37 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/ReadmeBenches.cs: -------------------------------------------------------------------------------- 1 | using BenchmarkDotNet.Attributes; 2 | using BenchmarkDotNet.Configs; 3 | using BenchmarkDotNet.Exporters; 4 | using BenchmarkDotNet.Exporters.Csv; 5 | using System; 6 | using System.Buffers; 7 | using System.Runtime.CompilerServices; 8 | using System.Runtime.InteropServices; 9 | using System.Runtime.Intrinsics; 10 | using System.Runtime.Intrinsics.X86; 11 | 12 | namespace Core3IntrinsicsBenchmarks 13 | { 14 | [DisassemblyDiagnoser(printAsm: true, printSource: true)] 15 | public class ReadmeBenches 16 | { 17 | [Params(4096/*, 1048576*/)] 18 | public int NumberOfFloats { get; set; } 19 | 20 | private static float[] inputData; 21 | 22 | [GlobalSetup] 23 | public void GlobalSetup() 24 | { 25 | inputData = new float[NumberOfFloats]; 26 | for(int i = 0; i < inputData.Length; i++) 27 | { 28 | inputData[i] = i + 1; 29 | } 30 | } 31 | 32 | [Benchmark(Baseline = true)] 33 | public float[] ProcessData() 34 | { 35 | var left = Vector256.Create(-2.5f); // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5> 36 | var right = Vector256.Create(5.0f); // <5, 5, 5, 5, 5, 5, 5, 5> 37 | Vector256 result = Avx.DotProduct(left, right, 0b1111_0001); // result = <-50, 0, 0, 0, -50, 0, 0, 0> 38 | float[] results = new float[inputData.Length]; 39 | Span> resultVectors = MemoryMarshal.Cast>(results); 40 | 41 | ReadOnlySpan> inputVectors = MemoryMarshal.Cast>(inputData); 42 | 43 | for (int i = 0; i < inputVectors.Length; i++) 44 | { 45 | resultVectors[i] = Avx.Sqrt(inputVectors[i]); 46 | } 47 | results[0] = result.GetElement(0); 48 | return results; 49 | } 50 | 51 | [Benchmark] 52 | public unsafe float[] ProcessDataUnsafe() 53 | { 54 | float[] results = new float[inputData.Length]; 55 | fixed (float* inputPtr = &inputData[0]) 56 | { 57 | float* inCurrent = inputPtr; 58 | fixed (float* resultPtr = &results[0]) 59 | { 60 | float* resEnd = resultPtr + results.Length; 61 | float* resCurrent = resultPtr; 62 | while (resCurrent < resEnd) 63 | { 64 | Avx.Store(resCurrent, Avx.Sqrt(Avx.LoadVector256(inCurrent))); 65 | resCurrent += 8; 66 | inCurrent += 8; 67 | } 68 | } 69 | } 70 | return results; 71 | } 72 | 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /Core3Intrinsics/Validator.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace Core3Intrinsics 6 | { 7 | public static class Validator 8 | { 9 | public static (bool, List) CompareValues(T[] left, T[] right) where T : struct 10 | { 11 | var differIndexes = new List(); 12 | bool allEqual = true; 13 | if(left.Length != right.Length) 14 | { 15 | throw new ArgumentOutOfRangeException($"Arrays not of the same length: {nameof(left)} {nameof(right)}."); 16 | } 17 | for(int i = 0; i < left.Length; i++) 18 | { 19 | if(!EqualityComparer.Default.Equals(left[i], right[i])) 20 | { 21 | differIndexes.Add(i); 22 | 23 | allEqual &= false; 24 | } 25 | } 26 | 27 | return (allEqual, differIndexes); 28 | } 29 | 30 | public static (bool, List, int) CompareValuesFloat(float[] left, float[] right) 31 | { 32 | var differIndexes = new List(); 33 | int maxDifference = 0; 34 | bool allEqual = true; 35 | if (left.Length != right.Length) 36 | { 37 | throw new ArgumentOutOfRangeException($"Arrays not of the same length: {nameof(left)} {nameof(right)}."); 38 | } 39 | for (int i = 0; i < left.Length; i++) 40 | { 41 | if (left[i] != right[i]) 42 | { 43 | differIndexes.Add(i); 44 | if(Math.Abs(left[i] - right[i]) > maxDifference) 45 | { 46 | maxDifference = (int)Math.Abs(left[i] - right[i]); 47 | } 48 | allEqual &= false; 49 | } 50 | } 51 | 52 | return (allEqual, differIndexes, maxDifference); 53 | } 54 | 55 | public static (bool, List, int) CompareValuesDouble(double[] left, double[] right) 56 | { 57 | var differIndexes = new List(); 58 | int maxDifference = 0; 59 | bool allEqual = true; 60 | if (left.Length != right.Length) 61 | { 62 | throw new ArgumentOutOfRangeException($"Arrays not of the same length: {nameof(left)} {nameof(right)}."); 63 | } 64 | for (int i = 0; i < left.Length; i++) 65 | { 66 | if (left[i] != right[i]) 67 | { 68 | differIndexes.Add(i); 69 | if (Math.Abs(left[i] - right[i]) > maxDifference) 70 | { 71 | maxDifference = (int)Math.Abs(left[i] - right[i]); 72 | } 73 | allEqual &= false; 74 | } 75 | } 76 | 77 | return (allEqual, differIndexes, maxDifference); 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /ExtraFiles/MemoryBenches2.md: -------------------------------------------------------------------------------- 1 | ``` ini 2 | 3 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362 4 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores 5 | .NET Core SDK=3.0.100-preview9-014004 6 | [Host] : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT 7 | DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT 8 | 9 | 10 | ``` 11 | | Method | numberOfBytes | Mean | Error | StdDev | Ratio | RatioSD | 12 | |-------------------------------- |-------------- |---------------:|--------------:|--------------:|------:|--------:| 13 | | **ScalarStoreBlock** | **16384** | **298.5 ns** | **5.924 ns** | **9.047 ns** | **1.00** | **0.00** | 14 | | VectorStoreArrayMemPtr | 16384 | 394.1 ns | 10.456 ns | 16.885 ns | 1.32 | 0.06 | 15 | | VectorStoreArrayMemPtrUnaligned | 16384 | 495.0 ns | 9.477 ns | 10.140 ns | 1.66 | 0.07 | 16 | | | | | | | | | 17 | | **ScalarStoreBlock** | **131072** | **6,225.2 ns** | **116.328 ns** | **103.122 ns** | **1.00** | **0.00** | 18 | | VectorStoreArrayMemPtr | 131072 | 6,772.1 ns | 77.929 ns | 65.074 ns | 1.09 | 0.02 | 19 | | VectorStoreArrayMemPtrUnaligned | 131072 | 7,245.7 ns | 130.736 ns | 115.894 ns | 1.16 | 0.03 | 20 | | | | | | | | | 21 | | **ScalarStoreBlock** | **1048576** | **67,515.4 ns** | **2,549.673 ns** | **2,618.326 ns** | **1.00** | **0.00** | 22 | | VectorStoreArrayMemPtr | 1048576 | 80,868.2 ns | 1,569.923 ns | 1,928.007 ns | 1.20 | 0.05 | 23 | | VectorStoreArrayMemPtrUnaligned | 1048576 | 83,708.5 ns | 1,995.286 ns | 2,134.934 ns | 1.24 | 0.05 | 24 | | | | | | | | | 25 | | **ScalarStoreBlock** | **2097152** | **189,619.0 ns** | **7,155.162 ns** | **21,097.157 ns** | **1.00** | **0.00** | 26 | | VectorStoreArrayMemPtr | 2097152 | 271,783.7 ns | 5,376.659 ns | 11,914.305 ns | 1.41 | 0.17 | 27 | | VectorStoreArrayMemPtrUnaligned | 2097152 | 274,970.6 ns | 5,310.311 ns | 5,453.298 ns | 1.44 | 0.15 | 28 | | | | | | | | | 29 | | **ScalarStoreBlock** | **8388608** | **1,105,687.5 ns** | **10,205.821 ns** | **8,522.323 ns** | **1.00** | **0.00** | 30 | | VectorStoreArrayMemPtr | 8388608 | 1,573,145.8 ns | 31,795.047 ns | 29,741.107 ns | 1.42 | 0.02 | 31 | | VectorStoreArrayMemPtrUnaligned | 8388608 | 1,568,842.2 ns | 28,942.750 ns | 27,073.066 ns | 1.42 | 0.03 | 32 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/TrigonometricOps.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Numerics; 3 | using System.Runtime.Intrinsics; 4 | using BenchmarkDotNet.Attributes; 5 | using System.Runtime.InteropServices; 6 | using System.Runtime.Intrinsics.X86; 7 | using System.Buffers; 8 | 9 | namespace Core3IntrinsicsBenchmarks 10 | { 11 | [DisassemblyDiagnoser(printAsm: true, printSource: true)] 12 | public class TrigonometricOps 13 | { 14 | const int l1CacheSize = 32 * 1024; // one L1 cache, 32 kB 15 | private int numberOfItems; 16 | public static int algn = 32; 17 | public AlignedArrayPool floatPool; 18 | public AlignedArrayPool doublePool; 19 | AlignedMemoryHandle dataMemory, resultMemory; 20 | AlignedMemoryHandle dataDoubleMemory, resultDoubleMemory; 21 | 22 | [GlobalSetup] 23 | public unsafe void GlobalSetup() 24 | { 25 | numberOfItems = l1CacheSize / sizeof(double) / 2 - 8; 26 | floatPool = new AlignedArrayPool(); 27 | doublePool = new AlignedArrayPool(); 28 | dataMemory = floatPool.Rent(numberOfItems); 29 | resultMemory = floatPool.Rent(numberOfItems); 30 | dataDoubleMemory = doublePool.Rent(numberOfItems); 31 | resultDoubleMemory = doublePool.Rent(numberOfItems); 32 | Span dataSpan = new Span(dataMemory.MemoryHandle.Pointer, numberOfItems); 33 | Span resultSpan = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 34 | Span dataDoubleSpan = new Span(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems); 35 | Span resultDoubleSpan = new Span(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems); 36 | 37 | for (int i = 0; i < numberOfItems; i++) 38 | { 39 | dataSpan[i] = i + 0.01f; 40 | resultSpan[i] = 0.0f; 41 | dataDoubleSpan[i] = i + 0.01; 42 | resultDoubleSpan[i] = 0.0; 43 | } 44 | } 45 | 46 | [GlobalCleanup] 47 | public void GlobalCleanup() 48 | { 49 | floatPool.Return(resultMemory, false); 50 | floatPool.Return(dataMemory, false); 51 | doublePool.Return(resultDoubleMemory, false); 52 | doublePool.Return(dataDoubleMemory, false); 53 | floatPool.Dispose(); 54 | doublePool.Dispose(); 55 | } 56 | 57 | [Benchmark] 58 | public unsafe void Cos() 59 | { 60 | ReadOnlySpan sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 61 | Span sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 62 | 63 | for (int i = 0; i < sp1.Length; i++) 64 | { 65 | sp2[i] = (float)Math.Cos(sp1[i]); 66 | } 67 | } 68 | 69 | [Benchmark] 70 | public unsafe void CosMathF() 71 | { 72 | ReadOnlySpan sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 73 | Span sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 74 | 75 | for (int i = 0; i < sp1.Length; i++) 76 | { 77 | sp2[i] = MathF.Cos(sp1[i]); 78 | } 79 | } 80 | 81 | [Benchmark] 82 | public unsafe void CosDouble() 83 | { 84 | ReadOnlySpan sp1 = new ReadOnlySpan(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems); 85 | Span sp2 = new Span(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems); 86 | 87 | for (int i = 0; i < sp1.Length; i++) 88 | { 89 | sp2[i] = Math.Cos(sp1[i]); 90 | 91 | } 92 | } 93 | 94 | 95 | 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /ExtraFiles/MemoryBenches-Aligned.md: -------------------------------------------------------------------------------- 1 | ``` ini 2 | 3 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362 4 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores 5 | .NET Core SDK=3.0.100-preview9-014004 6 | [Host] : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT 7 | DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT 8 | 9 | 10 | ``` 11 | | Method | NumberOfBytes | Mean | Error | StdDev | Median | Ratio | RatioSD | 12 | |----------------------------- |-------------- |---------------:|--------------:|--------------:|---------------:|------:|--------:| 13 | | **VectorStoreAligned** | **16384** | **504.7 ns** | **7.635 ns** | **8.792 ns** | **504.6 ns** | **1.00** | **0.00** | 14 | | VectorStoreArrayMemPtr | 16384 | 385.1 ns | 6.161 ns | 4.810 ns | 383.8 ns | 0.76 | 0.01 | 15 | | VectorStoreArrayMemSafe | 16384 | 597.0 ns | 11.873 ns | 12.193 ns | 595.5 ns | 1.18 | 0.03 | 16 | | VectorStoreArraySimpleBuffer | 16384 | 640.5 ns | 22.126 ns | 18.476 ns | 636.5 ns | 1.27 | 0.05 | 17 | | | | | | | | | | 18 | | **VectorStoreAligned** | **131072** | **9,865.0 ns** | **199.512 ns** | **279.687 ns** | **9,767.2 ns** | **1.00** | **0.00** | 19 | | VectorStoreArrayMemPtr | 131072 | 9,637.7 ns | 94.004 ns | 83.332 ns | 9,645.3 ns | 0.97 | 0.03 | 20 | | VectorStoreArrayMemSafe | 131072 | 6,181.7 ns | 120.563 ns | 148.062 ns | 6,144.4 ns | 0.63 | 0.03 | 21 | | VectorStoreArraySimpleBuffer | 131072 | 9,925.4 ns | 260.502 ns | 230.929 ns | 9,855.4 ns | 1.00 | 0.03 | 22 | | | | | | | | | | 23 | | **VectorStoreAligned** | **1048576** | **79,435.3 ns** | **1,865.323 ns** | **2,220.535 ns** | **78,294.8 ns** | **1.00** | **0.00** | 24 | | VectorStoreArrayMemPtr | 1048576 | 98,353.8 ns | 2,720.589 ns | 2,271.815 ns | 97,951.3 ns | 1.24 | 0.03 | 25 | | VectorStoreArrayMemSafe | 1048576 | 79,803.5 ns | 1,712.943 ns | 3,000.081 ns | 78,598.9 ns | 1.01 | 0.06 | 26 | | VectorStoreArraySimpleBuffer | 1048576 | 79,867.6 ns | 2,257.561 ns | 2,318.349 ns | 79,063.7 ns | 1.00 | 0.05 | 27 | | | | | | | | | | 28 | | **VectorStoreAligned** | **2097152** | **216,500.1 ns** | **4,992.955 ns** | **14,164.183 ns** | **212,591.0 ns** | **1.00** | **0.00** | 29 | | VectorStoreArrayMemPtr | 2097152 | 346,242.9 ns | 6,797.722 ns | 9,304.799 ns | 341,851.5 ns | 1.58 | 0.12 | 30 | | VectorStoreArrayMemSafe | 2097152 | 205,378.0 ns | 3,818.530 ns | 3,188.646 ns | 205,488.9 ns | 0.93 | 0.07 | 31 | | VectorStoreArraySimpleBuffer | 2097152 | 228,231.7 ns | 4,517.376 ns | 10,736.022 ns | 225,121.4 ns | 1.06 | 0.09 | 32 | | | | | | | | | | 33 | | **VectorStoreAligned** | **8388608** | **1,503,050.0 ns** | **28,335.402 ns** | **27,829.153 ns** | **1,490,845.2 ns** | **1.00** | **0.00** | 34 | | VectorStoreArrayMemPtr | 8388608 | 1,506,756.1 ns | 19,681.599 ns | 17,447.225 ns | 1,503,300.3 ns | 1.00 | 0.02 | 35 | | VectorStoreArrayMemSafe | 8388608 | 1,536,087.1 ns | 26,551.526 ns | 23,537.236 ns | 1,531,720.1 ns | 1.02 | 0.03 | 36 | | VectorStoreArraySimpleBuffer | 8388608 | 1,541,513.7 ns | 32,303.380 ns | 30,216.602 ns | 1,536,127.9 ns | 1.02 | 0.03 | 37 | -------------------------------------------------------------------------------- /Core3Intrinsics/Transpose.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.InteropServices; 3 | using System.Runtime.Intrinsics; 4 | using System.Runtime.Intrinsics.X86; 5 | 6 | namespace Core3Intrinsics 7 | { 8 | public static class Transpose 9 | { 10 | private const int defWidth = 1920, defHeight = 1080, numberOfElements = 8; 11 | private static int currWidth, currHeight; 12 | private static int[] original; 13 | public static int[]transposed1, transposed2; 14 | 15 | private static bool isInitialized = false; 16 | 17 | public static bool SerializeColorsInt() 18 | { 19 | if(!isInitialized) 20 | { 21 | return false; 22 | } 23 | int[] colorComponents = new int[currWidth * 4]; 24 | Span colorsSpan = transposed1; 25 | int runningCounter = 0;//, byteCounter; 26 | int start; 27 | for (int y = 0; y < currHeight; y++) 28 | { 29 | Span currColors = colorsSpan.Slice(runningCounter, currWidth * 4); 30 | for (int x = 0; x < currWidth; x+= numberOfElements) 31 | { 32 | for (int i = 0; i < numberOfElements; i++) 33 | { 34 | start = x * 4 + i; 35 | colorComponents[start] = original[runningCounter]; 36 | colorComponents[start + numberOfElements] = original[runningCounter + 1]; 37 | colorComponents[start + (2 * numberOfElements)] = original[runningCounter + 2]; 38 | colorComponents[start + (3 * numberOfElements)] = original[runningCounter + 3]; 39 | runningCounter += 4; 40 | } 41 | } 42 | colorComponents.CopyTo(currColors); 43 | 44 | } 45 | return true; 46 | } 47 | 48 | public static bool SerializedColorsVector256() 49 | { 50 | if (!isInitialized) 51 | { 52 | return false; 53 | } 54 | Span> originVectors = MemoryMarshal.Cast>(original); 55 | Span> transposedVectors = MemoryMarshal.Cast>(transposed2); 56 | Vector256 pm0, pm1, pm2, pm3, up0, up1, up2, up3; 57 | for(int i = 0; i < originVectors.Length; i += 4) 58 | { 59 | pm0 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x20); 60 | pm1 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x20); 61 | pm2 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x31); 62 | pm3 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x31); 63 | 64 | up0 = Avx2.UnpackLow(pm0, pm1); 65 | up1 = Avx2.UnpackHigh(pm0, pm1); 66 | up2 = Avx2.UnpackLow(pm2, pm3); 67 | up3 = Avx2.UnpackHigh(pm2, pm3); 68 | 69 | transposedVectors[i] = Avx2.UnpackLow(up0, up2); 70 | transposedVectors[i + 1] = Avx2.UnpackHigh(up0, up2); 71 | transposedVectors[i + 2] = Avx2.UnpackLow(up1, up3); 72 | transposedVectors[i + 3] = Avx2.UnpackHigh(up1, up3); 73 | } 74 | 75 | return true; 76 | } 77 | 78 | public static void CreateArrays(int width = defWidth, int height = defHeight) 79 | { 80 | currWidth = width; 81 | currHeight = height; 82 | 83 | original = new int[4 * currWidth * currHeight]; 84 | transposed1 = new int[4 * currHeight * currWidth]; 85 | transposed2 = new int[4 * currHeight * currWidth]; 86 | 87 | for (int i = 0; i < original.Length; i++) 88 | { 89 | original[i] = i; 90 | } 91 | 92 | isInitialized = true; 93 | } 94 | 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/AlignedArrayPool.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Buffers; 3 | using System.Collections.Generic; 4 | using System.Runtime.InteropServices; 5 | using System.Text; 6 | 7 | namespace Core3IntrinsicsBenchmarks 8 | { 9 | public class AlignedArrayPool : IDisposable where T : struct 10 | { 11 | private bool disposedValue = false; // To detect redundant calls 12 | 13 | private static readonly object lockObject = new object(); 14 | private readonly ArrayPool pool = ArrayPool.Shared; 15 | private const int defaultByteAlignment = 32; 16 | 17 | 18 | private readonly int tSize, currentAlignment; 19 | private readonly List<(byte[], GCHandle, IntPtr, int)> allBuffers; 20 | private readonly List<(MemoryHandle, GCHandle, byte[])> allMemoryHandles; 21 | 22 | public AlignedArrayPool() 23 | { 24 | Type tp = typeof(T); 25 | tSize = Marshal.SizeOf(tp); 26 | if (!tp.IsValueType || tp.IsEnum) 27 | { 28 | throw new ArgumentException("Invalid type, must be numeric."); 29 | } 30 | currentAlignment = defaultByteAlignment; 31 | allMemoryHandles = new List<(MemoryHandle, GCHandle, byte[])>(); 32 | allBuffers = new List<(byte[], GCHandle,IntPtr, int)>(); 33 | } 34 | 35 | public unsafe AlignedMemoryHandle Rent(int minimumLength, int byteAlignment) 36 | { 37 | byte[] buff = pool.Rent(minimumLength * tSize + 2 * byteAlignment); // see comment below, could just be 1 * 38 | var handle = GCHandle.Alloc(buff, GCHandleType.Pinned); 39 | allBuffers.Add((buff, handle, IntPtr.Zero, 0)); 40 | MemoryHandle memHand; 41 | AlignedMemoryHandle alMemHand; 42 | int currIdx; 43 | lock (lockObject) 44 | { 45 | currIdx = allBuffers.Count - 1; 46 | IntPtr ptr = AlignBuffer(currIdx); 47 | T[] tBuff = MemoryMarshal.Cast(new Span(ptr.ToPointer(), minimumLength * tSize)).ToArray(); 48 | memHand = new MemoryHandle(ptr.ToPointer(), handle); 49 | alMemHand = new AlignedMemoryHandle(ptr.ToPointer(), handle, ref tBuff[0], minimumLength * tSize); 50 | allMemoryHandles.Add((memHand, handle, buff)); 51 | } 52 | return alMemHand; 53 | 54 | unsafe IntPtr AlignBuffer(int bufferIndex) 55 | { 56 | (byte[], GCHandle, IntPtr, int) currentBuff = allBuffers[bufferIndex]; 57 | allBuffers.RemoveAt(bufferIndex); 58 | long lPtr = currentBuff.Item2.AddrOfPinnedObject().ToInt64(); 59 | long lPtr2 = (lPtr + currentAlignment - 1) & ~(currentAlignment - 1); 60 | // For benchmarking purposes, we avoid chance 32 byte alignment 61 | if(lPtr2 % 32 == 0) 62 | { 63 | lPtr2 += byteAlignment; 64 | } 65 | currentBuff.Item4 = (int)(lPtr2 - lPtr); 66 | currentBuff.Item3 = new IntPtr(lPtr2); 67 | allBuffers.Add(currentBuff); 68 | return new IntPtr(lPtr2); 69 | } 70 | } 71 | 72 | public AlignedMemoryHandle Rent(int minimumLength) 73 | { 74 | return Rent(minimumLength, defaultByteAlignment); 75 | } 76 | 77 | public unsafe void Return(AlignedMemoryHandle bufferHandle, bool clearArray = false) 78 | { 79 | (MemoryHandle memHandle, GCHandle gcHandle, byte[] buff) item; 80 | lock (lockObject) 81 | { 82 | for (int i = 0; i < allMemoryHandles.Count; i++) 83 | { 84 | item = allMemoryHandles[i]; 85 | if (item.memHandle.Pointer == bufferHandle.MemoryHandle.Pointer) 86 | { 87 | if (item.gcHandle.IsAllocated) 88 | { 89 | item.gcHandle.Free(); 90 | } 91 | pool.Return(item.buff, clearArray); 92 | allMemoryHandles.RemoveAt(i); 93 | break; 94 | } 95 | } 96 | } 97 | } 98 | 99 | #region IDisposable Support 100 | 101 | protected virtual void Dispose(bool disposing) 102 | { 103 | if (!disposedValue) 104 | { 105 | if (disposing) 106 | { 107 | // TODO: dispose managed state (managed objects). 108 | } 109 | 110 | // TODO: free unmanaged resources (unmanaged objects) and override a finalizer below. 111 | // TODO: set large fields to null. 112 | if (allMemoryHandles.Count > 0) 113 | { 114 | (MemoryHandle memHandle, GCHandle gcHandle, byte[] buff) item; 115 | for (int i = 0; i < allMemoryHandles.Count; i++) 116 | { 117 | item = allMemoryHandles[i]; 118 | if (item.gcHandle.IsAllocated) 119 | { 120 | item.gcHandle.Free(); 121 | } 122 | pool.Return(item.buff); 123 | 124 | } 125 | allMemoryHandles.Clear(); 126 | allBuffers.Clear(); 127 | } 128 | 129 | disposedValue = true; 130 | } 131 | } 132 | 133 | // TODO: override a finalizer only if Dispose(bool disposing) above has code to free unmanaged resources. 134 | ~AlignedArrayPool() 135 | { 136 | // Do not change this code. Put cleanup code in Dispose(bool disposing) above. 137 | Dispose(false); 138 | } 139 | 140 | // This code added to correctly implement the disposable pattern. 141 | public void Dispose() 142 | { 143 | // Do not change this code. Put cleanup code in Dispose(bool disposing) above. 144 | Dispose(true); 145 | // TODO: uncomment the following line if the finalizer is overridden above. 146 | GC.SuppressFinalize(this); 147 | } 148 | #endregion 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /ExtraFiles/MemoryBenches-1.md: -------------------------------------------------------------------------------- 1 | ``` ini 2 | 3 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362 4 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores 5 | .NET Core SDK=3.0.100-preview9-014004 6 | [Host] : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT 7 | DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT 8 | 9 | 10 | ``` 11 | | Method | numberOfBytes | Mean | Error | StdDev | Median | Ratio | RatioSD | 12 | |------------------------------ |-------------- |---------------:|--------------:|--------------:|---------------:|------:|--------:| 13 | | **ScalarStoreUnrolled** | **16384** | **2,292.3 ns** | **60.087 ns** | **50.176 ns** | **2,284.6 ns** | **7.52** | **0.30** | 14 | | ScalarStoreBlock | 16384 | 306.1 ns | 8.539 ns | 12.246 ns | 302.8 ns | 1.00 | 0.00 | 15 | | VectorStoreAligned | 16384 | 493.2 ns | 9.847 ns | 12.453 ns | 493.2 ns | 1.60 | 0.06 | 16 | | VectorStoreArrayMemPtr | 16384 | 401.3 ns | 8.049 ns | 12.998 ns | 397.5 ns | 1.32 | 0.07 | 17 | | VectorStoreArrayMemSafe | 16384 | 473.3 ns | 9.507 ns | 13.327 ns | 470.7 ns | 1.55 | 0.08 | 18 | | VectorStoreUnaligned | 16384 | 577.2 ns | 10.582 ns | 9.381 ns | 576.0 ns | 1.89 | 0.07 | 19 | | VectorStoreUnalignedMemPtr | 16384 | 504.7 ns | 15.461 ns | 20.641 ns | 498.6 ns | 1.65 | 0.09 | 20 | | VectorStoreUnalignedToAligned | 16384 | 492.7 ns | 9.763 ns | 16.311 ns | 485.8 ns | 1.61 | 0.08 | 21 | | | | | | | | | | 22 | | **ScalarStoreUnrolled** | **131072** | **18,656.4 ns** | **343.541 ns** | **321.348 ns** | **18,589.3 ns** | **3.02** | **0.06** | 23 | | ScalarStoreBlock | 131072 | 6,185.0 ns | 77.250 ns | 64.508 ns | 6,174.3 ns | 1.00 | 0.00 | 24 | | VectorStoreAligned | 131072 | 6,873.3 ns | 65.477 ns | 54.676 ns | 6,880.6 ns | 1.11 | 0.02 | 25 | | VectorStoreArrayMemPtr | 131072 | 6,653.6 ns | 141.340 ns | 132.209 ns | 6,610.1 ns | 1.08 | 0.03 | 26 | | VectorStoreArrayMemSafe | 131072 | 6,931.2 ns | 138.136 ns | 282.176 ns | 6,822.8 ns | 1.13 | 0.06 | 27 | | VectorStoreUnaligned | 131072 | 7,556.5 ns | 114.427 ns | 89.337 ns | 7,537.2 ns | 1.22 | 0.02 | 28 | | VectorStoreUnalignedMemPtr | 131072 | 7,319.7 ns | 145.018 ns | 221.457 ns | 7,239.3 ns | 1.19 | 0.04 | 29 | | VectorStoreUnalignedToAligned | 131072 | 6,928.4 ns | 138.061 ns | 141.779 ns | 6,892.1 ns | 1.12 | 0.03 | 30 | | | | | | | | | | 31 | | **ScalarStoreUnrolled** | **1048576** | **159,693.3 ns** | **2,764.505 ns** | **2,308.487 ns** | **159,156.2 ns** | **2.43** | **0.07** | 32 | | ScalarStoreBlock | 1048576 | 65,713.1 ns | 1,277.124 ns | 1,132.137 ns | 65,699.8 ns | 1.00 | 0.00 | 33 | | VectorStoreAligned | 1048576 | 85,778.4 ns | 2,106.262 ns | 5,975.114 ns | 83,181.5 ns | 1.31 | 0.10 | 34 | | VectorStoreArrayMemPtr | 1048576 | 78,964.1 ns | 1,518.257 ns | 1,624.518 ns | 78,922.6 ns | 1.20 | 0.03 | 35 | | VectorStoreArrayMemSafe | 1048576 | 80,763.9 ns | 1,389.509 ns | 1,160.303 ns | 80,709.0 ns | 1.23 | 0.03 | 36 | | VectorStoreUnaligned | 1048576 | 84,741.3 ns | 1,680.962 ns | 2,185.725 ns | 84,040.2 ns | 1.29 | 0.04 | 37 | | VectorStoreUnalignedMemPtr | 1048576 | 82,595.5 ns | 1,816.659 ns | 2,019.212 ns | 82,142.8 ns | 1.26 | 0.04 | 38 | | VectorStoreUnalignedToAligned | 1048576 | 86,209.3 ns | 1,984.263 ns | 5,693.224 ns | 85,122.7 ns | 1.30 | 0.09 | 39 | | | | | | | | | | 40 | | **ScalarStoreUnrolled** | **2097152** | **386,240.6 ns** | **7,648.523 ns** | **19,188.650 ns** | **381,202.7 ns** | **2.26** | **0.11** | 41 | | ScalarStoreBlock | 2097152 | 171,998.1 ns | 3,435.604 ns | 5,142.251 ns | 170,366.1 ns | 1.00 | 0.00 | 42 | | VectorStoreAligned | 2097152 | 250,602.9 ns | 3,544.961 ns | 2,960.203 ns | 250,186.1 ns | 1.45 | 0.05 | 43 | | VectorStoreArrayMemPtr | 2097152 | 253,581.1 ns | 5,065.490 ns | 9,003.903 ns | 251,693.9 ns | 1.48 | 0.06 | 44 | | VectorStoreArrayMemSafe | 2097152 | 254,647.4 ns | 5,565.014 ns | 10,034.868 ns | 251,608.8 ns | 1.49 | 0.07 | 45 | | VectorStoreUnaligned | 2097152 | 258,129.5 ns | 5,127.175 ns | 7,018.136 ns | 256,494.3 ns | 1.50 | 0.06 | 46 | | VectorStoreUnalignedMemPtr | 2097152 | 259,253.1 ns | 5,207.113 ns | 8,408.518 ns | 257,269.9 ns | 1.51 | 0.07 | 47 | | VectorStoreUnalignedToAligned | 2097152 | 268,083.3 ns | 5,350.387 ns | 14,736.521 ns | 270,760.6 ns | 1.55 | 0.10 | 48 | | | | | | | | | | 49 | | **ScalarStoreUnrolled** | **8388608** | **1,792,974.9 ns** | **34,861.894 ns** | **59,198.142 ns** | **1,773,807.8 ns** | **1.64** | **0.07** | 50 | | ScalarStoreBlock | 8388608 | 1,106,074.5 ns | 17,544.390 ns | 14,650.360 ns | 1,107,074.2 ns | 1.00 | 0.00 | 51 | | VectorStoreAligned | 8388608 | 1,564,931.4 ns | 38,160.539 ns | 37,478.752 ns | 1,549,061.2 ns | 1.42 | 0.04 | 52 | | VectorStoreArrayMemPtr | 8388608 | 1,573,258.0 ns | 34,312.238 ns | 44,615.601 ns | 1,561,962.8 ns | 1.43 | 0.05 | 53 | | VectorStoreArrayMemSafe | 8388608 | 1,559,172.6 ns | 17,596.260 ns | 15,598.626 ns | 1,559,339.7 ns | 1.41 | 0.03 | 54 | | VectorStoreUnaligned | 8388608 | 1,541,325.1 ns | 18,699.861 ns | 14,599.621 ns | 1,541,280.2 ns | 1.39 | 0.02 | 55 | | VectorStoreUnalignedMemPtr | 8388608 | 1,561,604.8 ns | 22,459.313 ns | 19,909.596 ns | 1,558,538.2 ns | 1.41 | 0.03 | 56 | | VectorStoreUnalignedToAligned | 8388608 | 1,546,770.0 ns | 19,669.857 ns | 15,356.930 ns | 1,543,577.9 ns | 1.40 | 0.02 | 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Build results 17 | [Dd]ebug/ 18 | [Dd]ebugPublic/ 19 | [Rr]elease/ 20 | [Rr]eleases/ 21 | x64/ 22 | x86/ 23 | [Aa][Rr][Mm]/ 24 | [Aa][Rr][Mm]64/ 25 | bld/ 26 | [Bb]in/ 27 | [Oo]bj/ 28 | [Ll]og/ 29 | 30 | # Visual Studio 2015/2017 cache/options directory 31 | .vs/ 32 | # Uncomment if you have tasks that create the project's static files in wwwroot 33 | #wwwroot/ 34 | 35 | # Visual Studio 2017 auto generated files 36 | Generated\ Files/ 37 | 38 | # MSTest test Results 39 | [Tt]est[Rr]esult*/ 40 | [Bb]uild[Ll]og.* 41 | 42 | # NUNIT 43 | *.VisualState.xml 44 | TestResult.xml 45 | 46 | # Build Results of an ATL Project 47 | [Dd]ebugPS/ 48 | [Rr]eleasePS/ 49 | dlldata.c 50 | 51 | # Benchmark Results 52 | BenchmarkDotNet.Artifacts/ 53 | 54 | # .NET Core 55 | project.lock.json 56 | project.fragment.lock.json 57 | artifacts/ 58 | 59 | # StyleCop 60 | StyleCopReport.xml 61 | 62 | # Files built by Visual Studio 63 | *_i.c 64 | *_p.c 65 | *_h.h 66 | *.ilk 67 | *.meta 68 | *.obj 69 | *.iobj 70 | *.pch 71 | *.pdb 72 | *.ipdb 73 | *.pgc 74 | *.pgd 75 | *.rsp 76 | *.sbr 77 | *.tlb 78 | *.tli 79 | *.tlh 80 | *.tmp 81 | *.tmp_proj 82 | *_wpftmp.csproj 83 | *.log 84 | *.vspscc 85 | *.vssscc 86 | .builds 87 | *.pidb 88 | *.svclog 89 | *.scc 90 | 91 | # Chutzpah Test files 92 | _Chutzpah* 93 | 94 | # Visual C++ cache files 95 | ipch/ 96 | *.aps 97 | *.ncb 98 | *.opendb 99 | *.opensdf 100 | *.sdf 101 | *.cachefile 102 | *.VC.db 103 | *.VC.VC.opendb 104 | 105 | # Visual Studio profiler 106 | *.psess 107 | *.vsp 108 | *.vspx 109 | *.sap 110 | 111 | # Visual Studio Trace Files 112 | *.e2e 113 | 114 | # TFS 2012 Local Workspace 115 | $tf/ 116 | 117 | # Guidance Automation Toolkit 118 | *.gpState 119 | 120 | # ReSharper is a .NET coding add-in 121 | _ReSharper*/ 122 | *.[Rr]e[Ss]harper 123 | *.DotSettings.user 124 | 125 | # JustCode is a .NET coding add-in 126 | .JustCode 127 | 128 | # TeamCity is a build add-in 129 | _TeamCity* 130 | 131 | # DotCover is a Code Coverage Tool 132 | *.dotCover 133 | 134 | # AxoCover is a Code Coverage Tool 135 | .axoCover/* 136 | !.axoCover/settings.json 137 | 138 | # Visual Studio code coverage results 139 | *.coverage 140 | *.coveragexml 141 | 142 | # NCrunch 143 | _NCrunch_* 144 | .*crunch*.local.xml 145 | nCrunchTemp_* 146 | 147 | # MightyMoose 148 | *.mm.* 149 | AutoTest.Net/ 150 | 151 | # Web workbench (sass) 152 | .sass-cache/ 153 | 154 | # Installshield output folder 155 | [Ee]xpress/ 156 | 157 | # DocProject is a documentation generator add-in 158 | DocProject/buildhelp/ 159 | DocProject/Help/*.HxT 160 | DocProject/Help/*.HxC 161 | DocProject/Help/*.hhc 162 | DocProject/Help/*.hhk 163 | DocProject/Help/*.hhp 164 | DocProject/Help/Html2 165 | DocProject/Help/html 166 | 167 | # Click-Once directory 168 | publish/ 169 | 170 | # Publish Web Output 171 | *.[Pp]ublish.xml 172 | *.azurePubxml 173 | # Note: Comment the next line if you want to checkin your web deploy settings, 174 | # but database connection strings (with potential passwords) will be unencrypted 175 | *.pubxml 176 | *.publishproj 177 | 178 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 179 | # checkin your Azure Web App publish settings, but sensitive information contained 180 | # in these scripts will be unencrypted 181 | PublishScripts/ 182 | 183 | # NuGet Packages 184 | *.nupkg 185 | # The packages folder can be ignored because of Package Restore 186 | **/[Pp]ackages/* 187 | # except build/, which is used as an MSBuild target. 188 | !**/[Pp]ackages/build/ 189 | # Uncomment if necessary however generally it will be regenerated when needed 190 | #!**/[Pp]ackages/repositories.config 191 | # NuGet v3's project.json files produces more ignorable files 192 | *.nuget.props 193 | *.nuget.targets 194 | 195 | # Microsoft Azure Build Output 196 | csx/ 197 | *.build.csdef 198 | 199 | # Microsoft Azure Emulator 200 | ecf/ 201 | rcf/ 202 | 203 | # Windows Store app package directories and files 204 | AppPackages/ 205 | BundleArtifacts/ 206 | Package.StoreAssociation.xml 207 | _pkginfo.txt 208 | *.appx 209 | 210 | # Visual Studio cache files 211 | # files ending in .cache can be ignored 212 | *.[Cc]ache 213 | # but keep track of directories ending in .cache 214 | !?*.[Cc]ache/ 215 | 216 | # Others 217 | ClientBin/ 218 | ~$* 219 | *~ 220 | *.dbmdl 221 | *.dbproj.schemaview 222 | *.jfm 223 | *.pfx 224 | *.publishsettings 225 | orleans.codegen.cs 226 | 227 | # Including strong name files can present a security risk 228 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 229 | #*.snk 230 | 231 | # Since there are multiple workflows, uncomment next line to ignore bower_components 232 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 233 | #bower_components/ 234 | 235 | # RIA/Silverlight projects 236 | Generated_Code/ 237 | 238 | # Backup & report files from converting an old project file 239 | # to a newer Visual Studio version. Backup files are not needed, 240 | # because we have git ;-) 241 | _UpgradeReport_Files/ 242 | Backup*/ 243 | UpgradeLog*.XML 244 | UpgradeLog*.htm 245 | ServiceFabricBackup/ 246 | *.rptproj.bak 247 | 248 | # SQL Server files 249 | *.mdf 250 | *.ldf 251 | *.ndf 252 | 253 | # Business Intelligence projects 254 | *.rdl.data 255 | *.bim.layout 256 | *.bim_*.settings 257 | *.rptproj.rsuser 258 | *- Backup*.rdl 259 | 260 | # Microsoft Fakes 261 | FakesAssemblies/ 262 | 263 | # GhostDoc plugin setting file 264 | *.GhostDoc.xml 265 | 266 | # Node.js Tools for Visual Studio 267 | .ntvs_analysis.dat 268 | node_modules/ 269 | 270 | # Visual Studio 6 build log 271 | *.plg 272 | 273 | # Visual Studio 6 workspace options file 274 | *.opt 275 | 276 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 277 | *.vbw 278 | 279 | # Visual Studio LightSwitch build output 280 | **/*.HTMLClient/GeneratedArtifacts 281 | **/*.DesktopClient/GeneratedArtifacts 282 | **/*.DesktopClient/ModelManifest.xml 283 | **/*.Server/GeneratedArtifacts 284 | **/*.Server/ModelManifest.xml 285 | _Pvt_Extensions 286 | 287 | # Paket dependency manager 288 | .paket/paket.exe 289 | paket-files/ 290 | 291 | # FAKE - F# Make 292 | .fake/ 293 | 294 | # JetBrains Rider 295 | .idea/ 296 | *.sln.iml 297 | 298 | # CodeRush personal settings 299 | .cr/personal 300 | 301 | # Python Tools for Visual Studio (PTVS) 302 | __pycache__/ 303 | *.pyc 304 | 305 | # Cake - Uncomment if you are using it 306 | # tools/** 307 | # !tools/packages.config 308 | 309 | # Tabs Studio 310 | *.tss 311 | 312 | # Telerik's JustMock configuration file 313 | *.jmconfig 314 | 315 | # BizTalk build output 316 | *.btp.cs 317 | *.btm.cs 318 | *.odx.cs 319 | *.xsd.cs 320 | 321 | # OpenCover UI analysis results 322 | OpenCover/ 323 | 324 | # Azure Stream Analytics local run output 325 | ASALocalRun/ 326 | 327 | # MSBuild Binary and Structured Log 328 | *.binlog 329 | 330 | # NVidia Nsight GPU debugger configuration file 331 | *.nvuser 332 | 333 | # MFractors (Xamarin productivity tool) working folder 334 | .mfractor/ 335 | 336 | # Local History for Visual Studio 337 | .localhistory/ 338 | 339 | # BeatPulse healthcheck temp database 340 | healthchecksdb -------------------------------------------------------------------------------- /Core3Intrinsics/Intro.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.InteropServices; 3 | using System.Runtime.Intrinsics; 4 | using System.Runtime.Intrinsics.X86; 5 | 6 | namespace Core3Intrinsics 7 | { 8 | public class Intro 9 | { 10 | public Intro() 11 | { 12 | var middleVector = Vector128.Create(1.0f); // middleVector = <1,1,1,1> 13 | middleVector = Vector128.CreateScalar(-1.0f); // middleVector = <-1,0,0,0> 14 | var floatBytes = Vector64.AsByte(Vector64.Create(1.0f, -1.0f)); // floatBytes = <0, 0, 128, 63, 0, 0, 128, 191> 15 | if(Avx.IsSupported) 16 | { 17 | var left = Vector256.Create(-2.5f); // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5> 18 | var right = Vector256.Create(5.0f); // <5, 5, 5, 5, 5, 5, 5, 5> 19 | Vector256 result = Avx.AddSubtract(left, right); // result = <-7.5, 2.5, -7.5, 2.5, -7.5, 2.5, -7.5, 2.5>xit 20 | left = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, - 70.0f, -80.0f); 21 | right = Vector256.Create(0.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f); 22 | result = Avx.UnpackHigh(left, right); // result = <-3, 3, -4, 4, -70, 70, -80, 80> 23 | result = Avx.UnpackLow(left, right); // result = <-1, 1, -2, 2, -50, 50, -60, 60> 24 | result = Avx.DotProduct(left, right, 0b1111_0001); // result = <-30, 0, 0, 0, -17400, 0, 0, 0> 25 | bool testResult = Avx.TestC(left, right); // testResult = true 26 | testResult = Avx.TestC(right, left); // testResult = false 27 | Vector256 result1 = Avx.Divide(left, right); 28 | var plusOne = Vector256.Create(1.0f); 29 | result = Avx.Compare(right, result1, FloatComparisonMode.OrderedGreaterThanNonSignaling); 30 | result = Avx.Compare(right, result1, FloatComparisonMode.UnorderedNotLessThanNonSignaling); 31 | left = Vector256.Create(0.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f); 32 | right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f); 33 | Vector256 nanInFirstPosition = Avx.Divide(left, right); 34 | left = Vector256.Create(1.1f, 3.3333333f, -3.0f, 4.22f, -50.0f, 60.0f, -70.0f, 80.0f); 35 | Vector256 InfInFirstPosition = Avx.Divide(left, right); 36 | 37 | left = Vector256.Create(-1.1f, 3.0f, 1.0f/3.0f, MathF.PI, -50.0f, 60.0f, -70.0f, 80.0f); 38 | right = Vector256.Create(0.0f, 2.0f, 3.1f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f); 39 | Vector256 compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN> 40 | Vector256 mixed = Avx.BlendVariable(left, right, compareResult); // mixed = <-1, 2, -3, 2, -50, -60, -70, -80> 41 | 42 | //left = Vector256.Create(-1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f); 43 | //right = Vector256.Create(1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f); 44 | Vector256 other = right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f); 45 | bool bRes = Avx.TestZ(plusOne, compareResult); 46 | bool bRes2 = Avx.TestC(plusOne, compareResult); 47 | bool allTrue = !Avx.TestZ(compareResult, compareResult); 48 | compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.OrderedEqualNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN> 49 | compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.UnorderedEqualNonSignaling); 50 | compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.UnorderedNotLessThanOrEqualNonSignaling); 51 | compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); 52 | var left128 = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f); 53 | var right128 = Vector128.Create(2.0f, 3.0f, 4.0f, 5.0f); 54 | Vector128 compResult128 = Sse.CompareGreaterThan(left128, right128); // compResult128 = <0, 0, 0, 0> 55 | 56 | int res = Avx.MoveMask(compareResult); 57 | if (Fma.IsSupported) 58 | { 59 | Vector256 resultFma = Fma.MultiplyAdd(left, right, other); // = left * right + other for each element 60 | resultFma = Fma.MultiplyAddNegated(left, right, other); // = -(left * right + other) for each element 61 | resultFma = Fma.MultiplySubtract(left, right, other); // = left * right - other for each element 62 | Fma.MultiplyAddSubtract(left, right, other); // even elements (0, 2, ...) like MultiplyAdd, odd elements like MultiplySubtract 63 | 64 | } 65 | result = Avx.DotProduct(left, right, 0b1010_0001); // result = <-20, 0, 0, 0, -10000, 0, 0, 0> 66 | result = Avx.Floor(left); // result = <-3, -3, -3, -3, -3, -3, -3, -3> 67 | result = Avx.Add(left, right); // result = <2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5> 68 | result = Avx.Ceiling(left); // result = <-2, -2, -2, -2, -2, -2, -2, -2> 69 | result = Avx.Multiply(left, right); // result = <-12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5, -12.5> 70 | result = Avx.HorizontalAdd(left, right); // result = <-5, -5, 10, 10, -5, -5, 10, 10> 71 | result = Avx.HorizontalSubtract(left, right); // result = <0, 0, 0, 0, 0, 0, 0, 0> 72 | double[] someDoubles = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 }; 73 | double[] someOtherDoubles = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; 74 | double[] someResult = new double[someDoubles.Length]; 75 | float[] someFloats = new float[] { 1, 2, 3, 4, 10, 20, 30, 40, 0 }; 76 | float[] someOtherFloats = new float[] { 1, 1, 1, 1, 1, 1, 1, 1 }; 77 | unsafe 78 | { 79 | fixed (double* ptr = &someDoubles[1]) 80 | { 81 | fixed (double* ptr2 = &someResult[0]) 82 | { 83 | Vector256 res2 = Avx.LoadVector256(ptr); // res2 = <3, -2.5, 7.5, 10.8> 84 | Avx.Store(ptr2, res2); 85 | } 86 | } 87 | 88 | fixed (float* ptr = &someFloats[0]) 89 | { 90 | fixed (float* ptr2 = &someOtherFloats[0]) 91 | { 92 | Vector256 res2 = Avx.DotProduct(Avx.LoadVector256(ptr), Avx.LoadVector256(ptr2), 0b0001_0001); 93 | //Avx.Store(ptr2, res2); 94 | } 95 | } 96 | } 97 | 98 | 99 | 100 | } 101 | } 102 | 103 | public float[] ProcessData(ref Span input) 104 | { 105 | float[] results = new float[input.Length]; 106 | Span> resultVectors = MemoryMarshal.Cast>(results); 107 | 108 | ReadOnlySpan> inputVectors = MemoryMarshal.Cast>(input); 109 | 110 | for(int i = 0; i < inputVectors.Length; i++) 111 | { 112 | resultVectors[i] = Avx.Sqrt(inputVectors[i]); 113 | } 114 | 115 | return results; 116 | } 117 | 118 | public unsafe float[] ProcessDataUnsafe(ref Span input) 119 | { 120 | float[] results = new float[input.Length]; 121 | fixed (float* inputPtr = &input[0]) 122 | { 123 | float* inCurrent = inputPtr; 124 | fixed (float* resultPtr = &results[0]) 125 | { 126 | float* resEnd = resultPtr + results.Length; 127 | float* resCurrent = resultPtr; 128 | while (resCurrent < resEnd) 129 | { 130 | Avx.Store(resCurrent, Avx.Sqrt(Avx.LoadVector256(inCurrent))); 131 | resCurrent += 8; 132 | inCurrent += 8; 133 | } 134 | } 135 | } 136 | return results; 137 | } 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /Core3Intrinsics/Mandelbrot.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.Intrinsics; 3 | using System.Runtime.InteropServices; 4 | using System.Runtime.Intrinsics.X86; 5 | 6 | namespace Core3Intrinsics 7 | { 8 | public class Mandelbrot 9 | { 10 | readonly int TOTALBYTES = 16 * 1024 * 1024;//4 * 1024 * 1024; 11 | public int numberOfTasks = 1; 12 | 13 | const float LEFT_X = -2.5f; 14 | const float RIGHT_X = 1.0f; 15 | const float TOP_Y = 1.0f; 16 | const float BOTT_Y = -1.0f; 17 | 18 | int resolutionX, resolutionY; 19 | readonly float ratioy_x = (TOP_Y - BOTT_Y) / (RIGHT_X - LEFT_X); 20 | float STEP_X; 21 | float STEP_Y; 22 | public Memory results, results2, testValue1, testValue2; 23 | public int SizeInBytes => numberOfPoints * sizeof(float); 24 | Memory xPoints, yPoints; 25 | int numberOfPoints; 26 | 27 | public void FloatMandel() 28 | { 29 | int floatL3Size = TOTALBYTES / sizeof(float); 30 | resolutionX = (int)MathF.Floor(MathF.Sqrt(floatL3Size * ratioy_x)); 31 | if (resolutionX % 8 != 0) 32 | { 33 | resolutionX -= resolutionX % 8; 34 | } 35 | resolutionY = (int)MathF.Floor(resolutionX * ratioy_x); 36 | if (resolutionY % 8 != 0) 37 | { 38 | resolutionY -= resolutionY % 8; 39 | } 40 | STEP_X = (RIGHT_X - LEFT_X) / resolutionX; 41 | STEP_Y = STEP_X; // ratioy_x * STEP_X; Bug from reddit comment 42 | numberOfPoints = resolutionX * resolutionY; 43 | if(numberOfPoints % 8 != 0) 44 | { 45 | numberOfPoints += numberOfPoints % 8; 46 | } 47 | results = new float[numberOfPoints]; 48 | testValue1 = new float [numberOfPoints]; 49 | testValue2 = new float [numberOfPoints]; 50 | 51 | xPoints = new float[resolutionX]; 52 | yPoints = new float[resolutionY]; 53 | for (int i = 0; i < resolutionX; i++) 54 | { 55 | xPoints.Span[i] = LEFT_X + i * STEP_X; 56 | } 57 | for (int i = 0; i < resolutionY; i++) 58 | { 59 | yPoints.Span[i] = TOP_Y - i * STEP_Y; 60 | } 61 | 62 | float currentY; 63 | float currentX; 64 | int countX = 0, countY = 0; 65 | int maxInter = 256; 66 | int inter; 67 | float zSquare, xSquare, ySquare, x, y; 68 | ReadOnlySpan ySpan = yPoints.Span; 69 | ReadOnlySpan xSpan = xPoints.Span; 70 | Span res = results.Span; 71 | int floatCounter = 0; 72 | while (countY < resolutionY) 73 | { 74 | 75 | currentY = ySpan[countY]; 76 | while (countX < resolutionX) 77 | { 78 | 79 | currentX = xSpan[countX]; 80 | zSquare = xSquare = ySquare = 0.0f; 81 | inter = 0; 82 | bool goOn; 83 | while (xSquare + ySquare <= 4.0f && inter < maxInter) 84 | { 85 | x = xSquare - ySquare + currentX; 86 | y = zSquare - ySquare - xSquare + currentY; 87 | xSquare = x * x; 88 | ySquare = y * y; 89 | zSquare = (x + y) * (x + y); 90 | goOn = xSquare + ySquare <= 4.0f; 91 | 92 | inter = goOn ? inter + 1 : inter; 93 | } 94 | //res[countY * resolutionX + countX] = inter; 95 | res[floatCounter] = inter; 96 | testValue1.Span[floatCounter] = xSquare + ySquare; 97 | countX++; 98 | floatCounter++; 99 | } 100 | countX = 0; 101 | countY++; 102 | } 103 | } 104 | 105 | public unsafe void Vector256Mandel() 106 | { 107 | int floatL3Size = TOTALBYTES / sizeof(float); 108 | resolutionX = (int)MathF.Floor(MathF.Sqrt(floatL3Size * ratioy_x)); 109 | if (resolutionX % 8 != 0) 110 | { 111 | resolutionX -= resolutionX % 8; 112 | } 113 | resolutionY = (int)MathF.Floor(resolutionX * ratioy_x); 114 | if (resolutionY % 8 != 0) 115 | { 116 | resolutionY -= resolutionY % 8; 117 | } 118 | STEP_X = (RIGHT_X - LEFT_X) / resolutionX; 119 | STEP_Y = STEP_X; // ratioy_x * STEP_X; Bug from reddit comment 120 | numberOfPoints = resolutionX * resolutionY; 121 | results2 = new float[numberOfPoints]; 122 | 123 | xPoints = new float[resolutionX]; 124 | yPoints = new float[resolutionY]; 125 | for (int i = 0; i < resolutionX; i++) 126 | { 127 | xPoints.Span[i] = LEFT_X + i * STEP_X; 128 | } 129 | for (int i = 0; i < resolutionY; i++) 130 | { 131 | yPoints.Span[i] = TOP_Y - i * STEP_Y; 132 | } 133 | 134 | int countX = 0, countY = 0; 135 | int maxInter = 256; 136 | int inter; 137 | ReadOnlySpan ySpan = yPoints.Span;// MemoryMarshal.Cast>(yPoints.Span); 138 | ReadOnlySpan> xSpan = MemoryMarshal.Cast>(xPoints.Span); 139 | Span> res = MemoryMarshal.Cast>(results2.Span); 140 | Span> testSpan = MemoryMarshal.Cast>(testValue2.Span); 141 | int resVectorNumber = 0; 142 | 143 | Vector256 xVec, yVec; 144 | var oneVec = Vector256.Create(1.0f); 145 | var fourVec = Vector256.Create(4.0f); 146 | 147 | while (countY < ySpan.Length) 148 | { 149 | var currYVec = Vector256.Create(ySpan[countY]); 150 | while (countX < xSpan.Length) 151 | { 152 | 153 | Vector256 currXVec = xSpan[countX]; 154 | var xSquVec = Vector256.Create(0.0f); 155 | var ySquVec = Vector256.Create(0.0f); 156 | var zSquVec = Vector256.Create(0.0f); 157 | var interVec = Vector256.Create(0.0f); 158 | Vector256 sumVector = oneVec; 159 | inter = 0; 160 | bool goOn = true; 161 | while (goOn) 162 | { 163 | xVec = Avx.Add(Avx.Subtract(xSquVec, ySquVec), currXVec); 164 | yVec = Avx.Add(Avx.Subtract(Avx.Subtract(zSquVec, ySquVec), xSquVec), currYVec); 165 | xSquVec = Avx.Multiply(xVec, xVec); 166 | ySquVec = Avx.Multiply(yVec, yVec); 167 | zSquVec = Avx.Multiply(Avx.Add(xVec, yVec), Avx.Add(xVec, yVec)); 168 | Vector256 test = Avx.Compare(Avx.Add(xSquVec, ySquVec), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling); // <= 4.0? 169 | sumVector = Avx.BlendVariable(Vector256.Zero, sumVector, test); // selects from second if true, from first otherwise 170 | goOn = (Avx.MoveMask(test) > 0) & (inter < maxInter); //any of the values still alive, and inter still below cutoff value? 171 | if (goOn) 172 | { 173 | interVec = Avx.Add(interVec, sumVector); 174 | } 175 | inter = goOn ? inter + 1 : inter; 176 | } 177 | testSpan[resVectorNumber] = Avx.Add(xSquVec, ySquVec); 178 | res[resVectorNumber] = interVec; 179 | resVectorNumber++; 180 | countX++; 181 | } 182 | countX = 0; 183 | countY++; 184 | } 185 | 186 | } 187 | } 188 | } 189 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/Mandelbrot.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Numerics; 3 | using System.Runtime.Intrinsics; 4 | using BenchmarkDotNet.Attributes; 5 | using System.Runtime.InteropServices; 6 | using System.Runtime.Intrinsics.X86; 7 | 8 | namespace Core3IntrinsicsBenchmarks 9 | { 10 | [DisassemblyDiagnoser(printAsm: true, printSource: true)] 11 | public class Mandelbrot 12 | { 13 | //[Params(4 * 1024 * 1024, 16 * 1024 * 1024)] //L3, 4 * L3 14 | public int TotalBytes {get; set; } 15 | 16 | public int numberOfTasks = 2; 17 | const float LEFT_X = -2.5f; 18 | const float RIGHT_X = 1.0f; 19 | const float TOP_Y = 1.0f; 20 | const float BOTT_Y = -1.0f; 21 | const float RATIO_Y_X = (TOP_Y - BOTT_Y) / (RIGHT_X - LEFT_X); 22 | 23 | int resolutionX, resolutionY; 24 | readonly float ratioy_x = RATIO_Y_X; 25 | public Memory results; 26 | public int SizeInBytes => numberOfPoints * sizeof(float); 27 | Memory xPoints, yPoints; 28 | int numberOfPoints; 29 | 30 | [GlobalSetup] 31 | public void GlobalSetup() 32 | { 33 | resolutionX = 1920; 34 | resolutionY = (int)MathF.Floor(resolutionX * ratioy_x); 35 | float STEP_X = (RIGHT_X - LEFT_X) / resolutionX; 36 | float STEP_Y = STEP_X; // (TOP_Y - BOTT_Y) / resolutionY; Bug from reddit comment 37 | 38 | numberOfPoints = resolutionX * resolutionY; 39 | results = new float[numberOfPoints]; 40 | xPoints = new float[resolutionX]; 41 | yPoints = new float[resolutionY]; 42 | for(int i = 0; i < resolutionX; i++) 43 | { 44 | xPoints.Span[i] = LEFT_X + i * STEP_X; 45 | } 46 | for (int i = 0; i < resolutionY; i++) 47 | { 48 | yPoints.Span[i] = TOP_Y - i * STEP_Y; 49 | } 50 | } 51 | 52 | [Benchmark(Baseline = true)] 53 | public void FloatMandel() 54 | { 55 | float currentY; 56 | float currentX; 57 | int countX = 0, countY = 0; 58 | int maxInter = 256; 59 | int inter; 60 | float zSquare, xSquare, ySquare, x, y; 61 | ReadOnlySpan ySpan = yPoints.Span; 62 | ReadOnlySpan xSpan = xPoints.Span; 63 | Span res = results.Span; 64 | int floatCounter = 0; 65 | float q; 66 | float one16 = 1.0f / 16.0f; 67 | while (countY < resolutionY) 68 | { 69 | currentY = ySpan[countY]; 70 | while (countX < resolutionX) 71 | { 72 | currentX = xSpan[countX]; 73 | zSquare = xSquare = ySquare = 0.0f; 74 | inter = 0; 75 | bool goOn;// = true; 76 | float temp = (currentX - 0.25f); 77 | float temp1 = currentY * currentY; 78 | q = temp * temp + temp1; 79 | goOn = (q * (q + (temp)) > 0.25f * temp1); // out of cardioid? see https://en.wikipedia.org/wiki/Mandelbrot_set#Cardioid_/_bulb_checking 80 | if (goOn) 81 | { 82 | goOn = (currentX + 1.0f) * (currentX + 1.0f) + temp1 > one16; // out of period-2 bulb? 83 | if (!goOn) 84 | { 85 | inter = 255; 86 | } 87 | } 88 | 89 | while (goOn && inter < maxInter) 90 | { 91 | x = xSquare - ySquare + currentX; 92 | y = zSquare - ySquare - xSquare + currentY; 93 | xSquare = x * x; 94 | ySquare = y * y; 95 | zSquare = (x + y) * (x + y); 96 | goOn = xSquare + ySquare <= 4.0f; 97 | 98 | inter = goOn ? inter + 1 : inter; 99 | } 100 | res[floatCounter] = inter; 101 | countX++; 102 | floatCounter++; 103 | } 104 | countX = 0; 105 | countY++; 106 | } 107 | } 108 | 109 | [Benchmark] 110 | public unsafe void Vector256Mandel() 111 | { 112 | int countX = 0, countY = 0; 113 | int maxInter = 256; 114 | int inter; 115 | ReadOnlySpan ySpan = yPoints.Span; 116 | ReadOnlySpan> xSpan = MemoryMarshal.Cast>(xPoints.Span); 117 | Span> res = MemoryMarshal.Cast>(results.Span); 118 | int resVectorNumber = 0; 119 | 120 | Vector256 xVec, yVec; 121 | Vector256 zeroVec = Vector256.Zero; 122 | var oneVec = Vector256.Create(1.0f); 123 | var fourVec = Vector256.Create(4.0f); 124 | var one4Vec = Vector256.Create(0.25f); 125 | var one16Vec = Vector256.Create(1.0f/16.0f); 126 | Vector256 qVec; 127 | Vector256 test; 128 | 129 | while (countY < ySpan.Length) 130 | { 131 | var currYVec = Vector256.Create(ySpan[countY]); 132 | while (countX < xSpan.Length) 133 | { 134 | Vector256 currXVec = xSpan[countX]; 135 | Vector256 xSquVec = zeroVec; 136 | Vector256 ySquVec = zeroVec; 137 | Vector256 zSquVec = zeroVec; 138 | Vector256 interVec = zeroVec; 139 | Vector256 sumVector; 140 | 141 | inter = 0; 142 | bool goOn; 143 | Vector256 temp = Avx.Subtract(currXVec, one4Vec); 144 | Vector256 temp1 = Avx.Multiply(currYVec, currYVec); 145 | qVec = Avx.Add(Avx.Multiply(temp, temp), temp1); 146 | Vector256 temp2 = Avx.Multiply(qVec, Avx.Add(qVec, temp)); 147 | test = Avx.Compare(temp2, Avx.Multiply(one4Vec, temp1), FloatComparisonMode.OrderedGreaterThanNonSignaling); 148 | goOn = (Avx.MoveMask(test) > 0); 149 | if(goOn) 150 | { 151 | temp2 = Avx.Add(currXVec, oneVec); 152 | temp = Avx.Add(Avx.Multiply(temp2, temp2), temp1); 153 | test = Avx.Compare(temp, one16Vec, FloatComparisonMode.OrderedGreaterThanNonSignaling); 154 | goOn = Avx.MoveMask(test) > 0; 155 | if (!goOn) 156 | { 157 | interVec = Vector256.Create(255.0f); // make all point = maximum value 158 | } 159 | } 160 | while (goOn) 161 | { 162 | xVec = Avx.Add(Avx.Subtract(xSquVec, ySquVec), currXVec); 163 | yVec = Avx.Add(Avx.Subtract(Avx.Subtract(zSquVec, ySquVec), xSquVec), currYVec); 164 | xSquVec = Avx.Multiply(xVec, xVec); 165 | ySquVec = Avx.Multiply(yVec, yVec); 166 | temp = Avx.Add(xVec, yVec); 167 | zSquVec = Avx.Multiply(temp, temp); 168 | test = Avx.Compare(Avx.Add(xSquVec, ySquVec), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling); // <= 4.0? 169 | sumVector = Avx.BlendVariable(zeroVec, oneVec, test); 170 | 171 | goOn = (Avx.MoveMask(test) > 0) & (inter < maxInter); //any of the values still alive, and inter still below cutoff value? 172 | if (goOn) 173 | { 174 | interVec = Avx.Add(interVec, sumVector); 175 | } 176 | inter = goOn ? inter + 1 : inter; 177 | } 178 | res[resVectorNumber] = interVec; 179 | resVectorNumber++; 180 | countX++; 181 | } 182 | countX = 0; 183 | countY++; 184 | } 185 | } 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/MemoryBenches.cs: -------------------------------------------------------------------------------- 1 | using BenchmarkDotNet.Attributes; 2 | using BenchmarkDotNet.Configs; 3 | using BenchmarkDotNet.Exporters; 4 | using BenchmarkDotNet.Exporters.Csv; 5 | using System; 6 | using System.Buffers; 7 | using System.Runtime.CompilerServices; 8 | using System.Runtime.InteropServices; 9 | using System.Runtime.Intrinsics; 10 | using System.Runtime.Intrinsics.X86; 11 | 12 | namespace Core3IntrinsicsBenchmarks 13 | { 14 | //[DisassemblyDiagnoser(printAsm: true, printSource: true)] 15 | //[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)] 16 | //[CategoriesColumn] 17 | //[Config(typeof(Config))] // only used for plots 18 | public class MemoryBenches 19 | { 20 | private class Config : ManualConfig // only used for plots 21 | { 22 | public Config() 23 | { 24 | Add(CsvMeasurementsExporter.Default); 25 | Add(RPlotExporter.Default); 26 | } 27 | } 28 | 29 | [Params(16 * 1024, 128 * 1024, 1024 * 1024, 2 * 1024 * 1024, 8 * 1024 * 1024)] // half L1, half L2, half L3, 2 * L3 30 | public int NumberOfBytes { get ; set; } 31 | 32 | private int vectorNumberOfItems, vectorFloatStep; 33 | private int numberOfFloatItems; 34 | 35 | private static readonly AlignedArrayPool alignedArrayPool = new AlignedArrayPool(); 36 | private static AlignedMemoryHandle dataMemory, storeMemory, data16Memory, store16Memory; 37 | //private static float[] arr1, arr2; 38 | 39 | [GlobalSetup] 40 | public unsafe void GlobalSetup() 41 | { 42 | vectorFloatStep = Vector256.Count; 43 | numberOfFloatItems = NumberOfBytes / sizeof(float); 44 | vectorNumberOfItems = numberOfFloatItems / vectorFloatStep; 45 | 46 | dataMemory = alignedArrayPool.Rent(numberOfFloatItems); 47 | storeMemory = alignedArrayPool.Rent(numberOfFloatItems); 48 | data16Memory = alignedArrayPool.Rent(numberOfFloatItems, 16); 49 | store16Memory = alignedArrayPool.Rent(numberOfFloatItems, 16); 50 | 51 | for (int i = 0; i < numberOfFloatItems; i++) 52 | { 53 | dataMemory.Memory.Span[i] = i; 54 | data16Memory.Memory.Span[i] = i; 55 | } 56 | } 57 | 58 | [GlobalCleanup] 59 | public void GlobalCleanup() 60 | { 61 | alignedArrayPool.Return(dataMemory); 62 | alignedArrayPool.Return(storeMemory); 63 | alignedArrayPool.Return(data16Memory); 64 | alignedArrayPool.Return(store16Memory); 65 | } 66 | 67 | /* 68 | [BenchmarkCategory("Aligned Memory"), Benchmark] 69 | public unsafe void ScalarStore() 70 | { 71 | ReadOnlySpan dataAl = MemoryMarshal.Cast(new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, dataMemory.ByteArrayLength)); 72 | Span storeAl = MemoryMarshal.Cast(new Span(storeMemory.MemoryHandle.Pointer, storeMemory.ByteArrayLength)); 73 | for (int i = 0; i < dataAl.Length; i++) 74 | { 75 | storeAl[i] = dataAl[i]; 76 | } 77 | } 78 | 79 | [BenchmarkCategory("Aligned Memory"), Benchmark] 80 | public unsafe void ScalarStoreUnrolled() 81 | { 82 | ReadOnlySpan dataAl = MemoryMarshal.Cast(new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, dataMemory.ByteArrayLength)); 83 | Span storeAl = MemoryMarshal.Cast(new Span(storeMemory.MemoryHandle.Pointer, storeMemory.ByteArrayLength)); 84 | 85 | int step = 4; 86 | for (int i = 0; i < dataAl.Length; i += step) 87 | { 88 | storeAl[i] = dataAl[i]; 89 | storeAl[i + 1] = dataAl[i + 1]; 90 | storeAl[i + 2] = dataAl[i + 2]; 91 | storeAl[i + 3] = dataAl[i + 3]; 92 | } 93 | } 94 | 95 | [BenchmarkCategory("Unaligned Memory"), Benchmark] 96 | public unsafe void PtrCopyUnrolled() 97 | { 98 | float* arr1Ptr = (float*)data16Memory.MemoryHandle.Pointer; 99 | float* arr2Ptr = (float*)store16Memory.MemoryHandle.Pointer; 100 | 101 | int i = 0; 102 | while (i < numberOfFloatItems) 103 | { 104 | *arr2Ptr = *arr1Ptr; 105 | arr1Ptr++; 106 | arr2Ptr++; 107 | *arr2Ptr = *arr1Ptr; 108 | arr1Ptr++; 109 | arr2Ptr++; 110 | *arr2Ptr = *arr1Ptr; 111 | arr1Ptr++; 112 | arr2Ptr++; 113 | *arr2Ptr = *arr1Ptr; 114 | arr1Ptr++; 115 | arr2Ptr++; 116 | 117 | i += 4; 118 | } 119 | 120 | 121 | } */ 122 | 123 | [BenchmarkCategory("Aligned Memory"), Benchmark] 124 | public void ScalarCopyBlock() 125 | { 126 | Unsafe.CopyBlock(ref storeMemory.ByteRef, ref dataMemory.ByteRef, (uint)(numberOfFloatItems * sizeof(float))); 127 | } 128 | 129 | 130 | [BenchmarkCategory("Aligned Memory"), Benchmark(Baseline = true)] 131 | public unsafe void VectorStoreAlignedUnsafe() 132 | { 133 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 134 | float* currSpPtr2 = (float*)storeMemory.MemoryHandle.Pointer; 135 | 136 | int i = 0; 137 | while (i < vectorNumberOfItems) 138 | { 139 | Avx.StoreAligned(currSpPtr2, Avx.LoadAlignedVector256(currSpPtr)); 140 | currSpPtr += vectorFloatStep; 141 | currSpPtr2 += vectorFloatStep; 142 | i++; 143 | } 144 | } 145 | /* 146 | [BenchmarkCategory("Aligned Memory"), Benchmark] 147 | public unsafe void VectorStoreArrayMemPtr() 148 | { 149 | ReadOnlySpan> readMem = MemoryMarshal.Cast>(new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfFloatItems)); 150 | Span> writeMem = MemoryMarshal.Cast>(new Span(storeMemory.MemoryHandle.Pointer, numberOfFloatItems)); 151 | 152 | int i = 0; 153 | 154 | while (i < readMem.Length) 155 | { 156 | writeMem[i] = readMem[i]; 157 | i++; 158 | } 159 | } 160 | 161 | [BenchmarkCategory("Aligned Memory"), Benchmark] 162 | public void VectorStoreArrayMemSafe() 163 | { 164 | ReadOnlySpan> readMem = MemoryMarshal.Cast>(dataMemory.Memory.Span); 165 | Span> writeMem = MemoryMarshal.Cast>(storeMemory.Memory.Span); 166 | 167 | int i = 0; 168 | 169 | while (i < readMem.Length) 170 | { 171 | writeMem[i] = readMem[i]; 172 | i++; 173 | } 174 | } 175 | 176 | [BenchmarkCategory("Unaligned Memory"), Benchmark] 177 | public unsafe void VectorStoreArrayMemPtrUnaligned() 178 | { 179 | ReadOnlySpan> readMem = MemoryMarshal.Cast>(new ReadOnlySpan(data16Memory.MemoryHandle.Pointer, numberOfFloatItems)); 180 | Span> writeMem = MemoryMarshal.Cast>(new Span(store16Memory.MemoryHandle.Pointer, numberOfFloatItems)); 181 | 182 | int i = 0; 183 | while (i < readMem.Length) 184 | { 185 | writeMem[i] = readMem[i]; 186 | i++; 187 | } 188 | } 189 | 190 | [BenchmarkCategory("Unaligned Memory"), Benchmark] 191 | public void VectorArraySafeUnaligned() 192 | { 193 | ReadOnlySpan> readMem = MemoryMarshal.Cast>(data16Memory.Memory.Span); 194 | Span> writeMem = MemoryMarshal.Cast>(store16Memory.Memory.Span); 195 | 196 | int i = 0; 197 | while (i < readMem.Length) 198 | { 199 | writeMem[i] = readMem[i]; 200 | i++; 201 | } 202 | } */ 203 | 204 | [BenchmarkCategory("Unaligned Memory"), Benchmark] 205 | public unsafe void VectorStoreUnalignedUnsafe() 206 | { 207 | float* currSpPtr = (float*)data16Memory.MemoryHandle.Pointer; 208 | float* currSpPtr2 = (float*)store16Memory.MemoryHandle.Pointer; 209 | 210 | int i = 0; 211 | while (i < vectorNumberOfItems) 212 | { 213 | Avx.Store(currSpPtr2, Avx.LoadVector256(currSpPtr)); 214 | currSpPtr += vectorFloatStep; 215 | currSpPtr2 += vectorFloatStep; 216 | i++; 217 | } 218 | } 219 | 220 | [BenchmarkCategory("Unaligned Memory"), Benchmark] 221 | public unsafe void VectorStoreUnalignedToAlignedUnsafe() 222 | { 223 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 224 | float* currSpPtr2 = (float*)storeMemory.MemoryHandle.Pointer; 225 | 226 | int i = 0; 227 | while (i < vectorNumberOfItems) 228 | { 229 | Avx.Store(currSpPtr2, Avx.LoadVector256(currSpPtr)); 230 | currSpPtr += vectorFloatStep; 231 | currSpPtr2 += vectorFloatStep; 232 | i++; 233 | } 234 | } 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/IntegerBasicOps.cs: -------------------------------------------------------------------------------- 1 | using BenchmarkDotNet.Attributes; 2 | using BenchmarkDotNet.Configs; 3 | using System; 4 | using System.Numerics; 5 | using System.Runtime.InteropServices; 6 | using System.Runtime.Intrinsics; 7 | using System.Runtime.Intrinsics.X86; 8 | 9 | namespace Core3IntrinsicsBenchmarks 10 | { 11 | //[DisassemblyDiagnoser(printAsm: true, printSource: true)] 12 | [GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)] 13 | [CategoriesColumn] 14 | public class IntegerBasicOps 15 | { 16 | [Params(/*4 * 1024,*/ 4000 * 1024)] 17 | public int NumberOfItems {get; set;} 18 | 19 | private const int bmpWidth = 1920, bmpHeight = 1080; 20 | private AlignedArrayPool intPool; 21 | private AlignedArrayPool shortPool; 22 | private AlignedArrayPool longPool; 23 | private AlignedMemoryHandle intData, intStore, bmpData, bmpStore; 24 | private AlignedMemoryHandle shortData, shortStore; 25 | private AlignedMemoryHandle longData, longStore; 26 | 27 | [GlobalSetup] 28 | public void GlobalSetup() 29 | { 30 | intPool = new AlignedArrayPool(); 31 | shortPool = new AlignedArrayPool(); 32 | longPool = new AlignedArrayPool(); 33 | 34 | intData = intPool.Rent(NumberOfItems); 35 | intStore = intPool.Rent(NumberOfItems); 36 | bmpData = intPool.Rent(bmpWidth * bmpHeight * 4); 37 | bmpStore = intPool.Rent(bmpWidth * bmpHeight * 4); 38 | shortData = shortPool.Rent(NumberOfItems); 39 | shortStore = shortPool.Rent(NumberOfItems); 40 | longData = longPool.Rent(NumberOfItems); 41 | longStore = longPool.Rent(NumberOfItems); 42 | 43 | var r = new Random(1); 44 | for (int i = 0; i < NumberOfItems; i++) 45 | { 46 | intData.Memory.Span[i] = i * 2 + r.Next(-1000, 1000); 47 | intStore.Memory.Span[i] = i + r.Next(-1000, 1000); 48 | shortData.Memory.Span[i] = (short)intData.Memory.Span[i]; 49 | shortStore.Memory.Span[i] = (short)intStore.Memory.Span[i]; 50 | longData.Memory.Span[i] = intData.Memory.Span[i]; 51 | longStore.Memory.Span[i] = intStore.Memory.Span[i]; 52 | } 53 | for(int i = 0; i < bmpData.Memory.Span.Length; i++) 54 | { 55 | bmpData.Memory.Span[i] = i; 56 | } 57 | } 58 | 59 | [GlobalCleanup] 60 | public void GlobalCleanup() 61 | { 62 | intPool.Return(intData); 63 | intPool.Return(intStore); 64 | intPool.Return(bmpData); 65 | intPool.Return(bmpStore); 66 | shortPool.Return(shortData); 67 | shortPool.Return(shortStore); 68 | longPool.Return(longData); 69 | longPool.Return(longStore); 70 | intPool.Dispose(); 71 | } 72 | /* 73 | [BenchmarkCategory("Short"), Benchmark(Baseline = true)] 74 | public unsafe void ShortAdd() 75 | { 76 | var sp1 = new ReadOnlySpan(shortData.MemoryHandle.Pointer, NumberOfItems); 77 | var sp2 = new Span(shortStore.MemoryHandle.Pointer, NumberOfItems); 78 | 79 | for (int i = 0; i < NumberOfItems; i++) 80 | { 81 | sp2[i] = (short)(sp1[i] + sp2[i]); 82 | } 83 | } 84 | 85 | [BenchmarkCategory("Short"), Benchmark] 86 | public unsafe void ShortAddVector256() 87 | { 88 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(shortData.Memory.Span); 89 | Span> sp2 = MemoryMarshal.Cast>(shortStore.Memory.Span); 90 | 91 | for (int i = 0; i < sp1.Length; i++) 92 | { 93 | sp2[i] = Avx2.Add(sp1[i], sp2[i]); 94 | } 95 | } 96 | 97 | [BenchmarkCategory("Short"), Benchmark] 98 | public unsafe void ShortAndNot() 99 | { 100 | var sp1 = new ReadOnlySpan(shortData.MemoryHandle.Pointer, NumberOfItems); 101 | var sp2 = new Span(shortStore.MemoryHandle.Pointer, NumberOfItems); 102 | 103 | for (int i = 0; i < NumberOfItems; i++) 104 | { 105 | sp2[i] = (short)(sp1[i] & ~sp2[i]); 106 | } 107 | } 108 | 109 | [BenchmarkCategory("Short"), Benchmark] 110 | public unsafe void ShortAndNotVector256() 111 | { 112 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(shortData.Memory.Span); 113 | Span> sp2 = MemoryMarshal.Cast>(shortStore.Memory.Span); 114 | 115 | for (int i = 0; i < sp1.Length; i++) 116 | { 117 | sp2[i] = Avx2.AndNot(sp1[i],sp2[i]); 118 | } 119 | } 120 | 121 | [BenchmarkCategory("Short"), Benchmark] 122 | public unsafe void ShortShiftLeft() 123 | { 124 | var sp1 = new ReadOnlySpan(shortData.MemoryHandle.Pointer, NumberOfItems); 125 | var sp2 = new Span(shortStore.MemoryHandle.Pointer, NumberOfItems); 126 | 127 | for (int i = 0; i < NumberOfItems; i++) 128 | { 129 | sp2[i] = (short)(sp1[i] << 5); 130 | } 131 | } 132 | 133 | [BenchmarkCategory("Short"), Benchmark] 134 | public unsafe void ShortShiftLeftVector256() 135 | { 136 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(shortData.Memory.Span); 137 | Span> sp2 = MemoryMarshal.Cast>(shortStore.Memory.Span); 138 | 139 | for (int i = 0; i < sp1.Length; i++) 140 | { 141 | sp2[i] = Avx2.ShiftLeftLogical(sp1[i], 5); 142 | } 143 | } */ 144 | /* 145 | [BenchmarkCategory("Integer"), Benchmark(Baseline = true)] 146 | public unsafe void IntAdd() 147 | { 148 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 149 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 150 | 151 | for(int i = 0; i < NumberOfItems; i++) 152 | { 153 | sp2[i] = sp1[i] + sp2[i]; 154 | } 155 | } 156 | 157 | [BenchmarkCategory("Integer"), Benchmark] 158 | public unsafe void IntAddVector256() 159 | { 160 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 161 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 162 | 163 | for (int i = 0; i < sp1.Length; i++) 164 | { 165 | sp2[i] = Avx2.Add(sp1[i], sp2[i]); 166 | } 167 | } 168 | 169 | [BenchmarkCategory("Integer"), Benchmark] 170 | public unsafe void IntXor() 171 | { 172 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 173 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 174 | 175 | for (int i = 0; i < NumberOfItems; i++) 176 | { 177 | sp2[i] = sp1[i] ^ sp2[i]; 178 | } 179 | } 180 | 181 | [BenchmarkCategory("Integer"), Benchmark] 182 | public unsafe void IntXorVector256() 183 | { 184 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 185 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 186 | 187 | for (int i = 0; i < sp1.Length; i++) 188 | { 189 | sp2[i] = Avx2.Xor(sp1[i], sp2[i]); 190 | } 191 | } 192 | 193 | [BenchmarkCategory("Integer"), Benchmark] 194 | public unsafe void IntMultiply() 195 | { 196 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 197 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 198 | 199 | for (int i = 0; i < NumberOfItems; i++) 200 | { 201 | sp2[i] = sp1[i] * sp2[i]; 202 | } 203 | } 204 | 205 | [BenchmarkCategory("Integer"), Benchmark] 206 | public unsafe void IntMultiplyLowVector256() 207 | { 208 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 209 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 210 | 211 | for (int i = 0; i < sp1.Length; i++) 212 | { 213 | sp2[i] = Avx2.MultiplyLow(sp1[i], sp2[i]); 214 | } 215 | } 216 | 217 | [BenchmarkCategory("Integer"), Benchmark] 218 | public unsafe void IntShiftLeft() 219 | { 220 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 221 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 222 | 223 | for (int i = 0; i < NumberOfItems; i++) 224 | { 225 | sp2[i] = sp1[i] << 5; 226 | } 227 | } 228 | 229 | 230 | [BenchmarkCategory("Integer"), Benchmark] 231 | public unsafe void IntShiftLeftVector256() 232 | { 233 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 234 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 235 | 236 | for (int i = 0; i < sp1.Length; i++) 237 | { 238 | sp2[i] = Avx2.ShiftLeftLogical(sp1[i], 5); 239 | } 240 | } 241 | 242 | [BenchmarkCategory("Integer"), Benchmark] 243 | public unsafe void IntMax() 244 | { 245 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 246 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 247 | 248 | for (int i = 0; i < NumberOfItems; i++) 249 | { 250 | sp2[i] = sp1[i] > sp2[i] ? sp1[1] : sp2[i]; 251 | } 252 | } 253 | 254 | [BenchmarkCategory("Integer"), Benchmark] 255 | public unsafe void IntMaxVector256() 256 | { 257 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 258 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 259 | 260 | for (int i = 0; i < sp1.Length; i++) 261 | { 262 | sp2[i] = Avx2.Max(sp1[i], sp2[i]); 263 | } 264 | } */ 265 | 266 | [BenchmarkCategory("Chained1"), Benchmark(Baseline = true)] 267 | public unsafe void IntMultipleOps() 268 | { 269 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems); 270 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems); 271 | 272 | for (int i = 0; i < NumberOfItems; i++) 273 | { 274 | sp2[i] = ((sp1[i] > sp2[i] ? sp1[1] : sp2[i]) << 2) * 3; 275 | } 276 | } 277 | 278 | [BenchmarkCategory("Chained1"), Benchmark] 279 | public unsafe void IntMultipleOpsvector256() 280 | { 281 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 282 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 283 | 284 | Vector256 three = Vector256.Create(3); 285 | 286 | for (int i = 0; i < sp1.Length; i++) 287 | { 288 | sp2[i] = Avx2.MultiplyLow(Avx2.ShiftLeftLogical(Avx2.Max(sp1[i], sp2[i]), 2), three); 289 | } 290 | } 291 | 292 | [BenchmarkCategory("Chained2"), Benchmark(Baseline = true)] 293 | public unsafe void IntTranspose() 294 | { 295 | var sp1 = new ReadOnlySpan(bmpData.MemoryHandle.Pointer, bmpHeight * bmpWidth * 4); 296 | var sp2 = new Span(bmpStore.MemoryHandle.Pointer, bmpHeight * bmpWidth * 4); 297 | int numberOfElements = Vector256.Count; 298 | 299 | int[] colorComponents = new int[bmpWidth * 4]; 300 | int runningCounter = 0;//, byteCounter; 301 | int start; 302 | for (int y = 0; y < bmpHeight; y++) 303 | { 304 | Span currColors = sp2.Slice(runningCounter, bmpWidth * 4); 305 | for (int x = 0; x < bmpWidth; x += numberOfElements) 306 | { 307 | for (int i = 0; i < numberOfElements; i++) 308 | { 309 | start = x * 4 + i; 310 | colorComponents[start] = sp1[runningCounter]; 311 | colorComponents[start + numberOfElements] = sp1[runningCounter + 1]; 312 | colorComponents[start + (2 * numberOfElements)] = sp1[runningCounter + 2]; 313 | colorComponents[start + (3 * numberOfElements)] = sp1[runningCounter + 3]; 314 | runningCounter += 4; 315 | } 316 | } 317 | colorComponents.CopyTo(currColors); 318 | 319 | } 320 | } 321 | 322 | [BenchmarkCategory("Chained2"), Benchmark] 323 | public unsafe void IntTransposeVector256() // see https://software.intel.com/sites/default/files/m/d/4/1/d/8/Image_Processing_-_whitepaper_-_100pct_CCEreviewed_update.pdf 324 | { 325 | Span> originVectors = MemoryMarshal.Cast>(bmpData.Memory.Span); 326 | Span> transposedVectors = MemoryMarshal.Cast>(bmpStore.Memory.Span); 327 | Vector256 pm0, pm1, pm2, pm3, up0, up1, up2, up3; 328 | for (int i = 0; i < originVectors.Length; i += 4) 329 | { 330 | pm0 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x20); 331 | pm1 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x20); 332 | pm2 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x31); 333 | pm3 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x31); 334 | 335 | up0 = Avx2.UnpackLow(pm0, pm1); 336 | up1 = Avx2.UnpackHigh(pm0, pm1); 337 | up2 = Avx2.UnpackLow(pm2, pm3); 338 | up3 = Avx2.UnpackHigh(pm2, pm3); 339 | 340 | transposedVectors[i] = Avx2.UnpackLow(up0, up2); 341 | transposedVectors[i + 1] = Avx2.UnpackHigh(up0, up2); 342 | transposedVectors[i + 2] = Avx2.UnpackLow(up1, up3); 343 | transposedVectors[i + 3] = Avx2.UnpackHigh(up1, up3); 344 | } 345 | } 346 | 347 | /* 348 | [BenchmarkCategory("Long"), Benchmark(Baseline = true)] 349 | public unsafe void LongAdd() 350 | { 351 | var sp1 = new ReadOnlySpan(longData.MemoryHandle.Pointer, NumberOfItems); 352 | var sp2 = new Span(longStore.MemoryHandle.Pointer, NumberOfItems); 353 | 354 | for (int i = 0; i < NumberOfItems; i++) 355 | { 356 | sp2[i] = sp1[i] + sp2[i]; 357 | } 358 | } 359 | 360 | [BenchmarkCategory("Long"), Benchmark] 361 | public unsafe void LongMultiply() 362 | { 363 | var sp1 = new ReadOnlySpan(longData.MemoryHandle.Pointer, NumberOfItems); 364 | var sp2 = new Span(longStore.MemoryHandle.Pointer, NumberOfItems); 365 | 366 | for (int i = 0; i < NumberOfItems; i++) 367 | { 368 | sp2[i] = sp1[i] * sp2[i]; 369 | } 370 | } 371 | 372 | 373 | [BenchmarkCategory("Long"), Benchmark] 374 | public unsafe void LongAddVector256() 375 | { 376 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(longData.Memory.Span); 377 | Span> sp2 = MemoryMarshal.Cast>(longStore.Memory.Span); 378 | 379 | for (int i = 0; i < sp1.Length; i++) 380 | { 381 | sp2[i] = Avx2.Add(sp1[i], sp2[i]); 382 | } 383 | } 384 | 385 | 386 | [BenchmarkCategory("Long"), Benchmark] 387 | public unsafe void IntMultiplyVector256ToLong() 388 | { 389 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span); 390 | ReadOnlySpan> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span); 391 | Span> sp3 = MemoryMarshal.Cast>(longStore.Memory.Span); 392 | 393 | for (int i = 0; i < sp1.Length; i++) 394 | { 395 | sp3[i] = Avx2.Multiply(sp1[i], sp2[i]); 396 | } 397 | } 398 | */ 399 | } 400 | } 401 | -------------------------------------------------------------------------------- /Core3IntrinsicsBenchmarks/BasicOps.cs: -------------------------------------------------------------------------------- 1 | using BenchmarkDotNet.Attributes; 2 | using BenchmarkDotNet.Configs; 3 | using System; 4 | using System.Buffers; 5 | using System.Numerics; 6 | using System.Runtime.InteropServices; 7 | using System.Runtime.Intrinsics; 8 | using System.Runtime.Intrinsics.X86; 9 | 10 | namespace Core3IntrinsicsBenchmarks 11 | { 12 | [DisassemblyDiagnoser(printAsm: true, printSource: true)] 13 | //[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)] 14 | //[CategoriesColumn] 15 | public class BasicOps 16 | { 17 | [Params(/*256 * 1024,*/ 10 * 4 * 1024 * 1024)] 18 | public int ParamCacheSizeBytes { get; set; } 19 | 20 | private int numberOfFloatItems, numberOfDoubleItems; 21 | //To test aligned vs unaligned memory 22 | //private readonly AlignedMemoryHandle dataMemory; 23 | //private readonly AlignedMemoryHandle dataMemory2; 24 | //private readonly AlignedMemoryHandle dataMemory3; 25 | //private readonly AlignedMemoryHandle resultMemory; 26 | //private readonly AlignedMemoryHandle dataDoubleMemory; 27 | //private readonly AlignedMemoryHandle resultDoubleMemory; 28 | private float[] data, data2, data3, result; 29 | private double[] dataD, dataD2, dataD3, resultD; 30 | 31 | [GlobalSetup] 32 | public unsafe void GlobalSetup() 33 | { 34 | numberOfFloatItems = ParamCacheSizeBytes / sizeof(float) / 4; // make sure that all data fits 35 | numberOfDoubleItems = ParamCacheSizeBytes / sizeof(double) / 4; 36 | //To test aligned vs unaligned memory 37 | //floatPool = new AlignedArrayPool(); 38 | //doublePool = new AlignedArrayPool(); 39 | //dataMemory = floatPool.Rent(numberOfFloatItems); 40 | //dataMemory2 = floatPool.Rent(numberOfFloatItems); 41 | //dataMemory3 = floatPool.Rent(numberOfFloatItems); 42 | //resultMemory = floatPool.Rent(numberOfFloatItems); 43 | //dataDoubleMemory = doublePool.Rent(numberOfDoubleItems); 44 | //resultDoubleMemory = doublePool.Rent(numberOfDoubleItems); 45 | data = ArrayPool.Shared.Rent(numberOfFloatItems); 46 | data2 = ArrayPool.Shared.Rent(numberOfFloatItems); 47 | data3 = ArrayPool.Shared.Rent(numberOfFloatItems); 48 | result = ArrayPool.Shared.Rent(numberOfFloatItems); 49 | dataD = ArrayPool.Shared.Rent(numberOfDoubleItems); 50 | dataD2 = ArrayPool.Shared.Rent(numberOfDoubleItems); 51 | dataD3 = ArrayPool.Shared.Rent(numberOfDoubleItems); 52 | resultD = ArrayPool.Shared.Rent(numberOfDoubleItems); 53 | 54 | 55 | for (int i = 0; i < numberOfFloatItems; i++) 56 | { 57 | data[i] = i + 1.0f; 58 | data2[i] = i + 1.0f; 59 | data3[i] = i + 1.0f; 60 | result[i] = 0.0f; 61 | } 62 | for(int i = 0; i < numberOfDoubleItems; i++) 63 | { 64 | dataD[i] = i + 1.0; 65 | resultD[i] = 0.0; 66 | } 67 | } 68 | 69 | [GlobalCleanup] 70 | public void GlobalCleanup() 71 | { 72 | //To test aligned vs unaligned memory 73 | //floatPool.Return(resultMemory, false); 74 | //floatPool.Return(dataMemory, false); 75 | //floatPool.Return(dataMemory2, false); 76 | //floatPool.Return(dataMemory3, false); 77 | //doublePool.Return(resultDoubleMemory, false); 78 | //doublePool.Return(dataDoubleMemory, false); 79 | //floatPool.Dispose(); 80 | //doublePool.Dispose(); 81 | ArrayPool.Shared.Return(data); 82 | ArrayPool.Shared.Return(data2); 83 | ArrayPool.Shared.Return(data3); 84 | ArrayPool.Shared.Return(result); 85 | ArrayPool.Shared.Return(dataD); 86 | ArrayPool.Shared.Return(dataD2); 87 | ArrayPool.Shared.Return(dataD3); 88 | ArrayPool.Shared.Return(resultD); 89 | } 90 | 91 | 92 | /*[BenchmarkCategory("MultiplyAdd"), Benchmark(Baseline = true)] 93 | public unsafe void MultiplyAddScalarFloat() 94 | { 95 | var sp1 = new ReadOnlySpan(data, 0, numberOfFloatItems); 96 | var sp12 = new ReadOnlySpan(data2, 0, numberOfFloatItems); 97 | var sp13 = new ReadOnlySpan(data3, 0, numberOfFloatItems); 98 | var sp2 = new Span(result, 0, numberOfFloatItems); 99 | 100 | for (int i = 0; i < sp1.Length; i++) 101 | { 102 | sp2[i] = sp1[i] * sp12[i] + sp13[i]; 103 | } 104 | } */ 105 | 106 | [BenchmarkCategory("MultiplyAdd"), Benchmark(Baseline = true)] 107 | public void ScalarFloatMultipleOps() 108 | { 109 | var sp1 = new ReadOnlySpan(data, 0, numberOfFloatItems); 110 | var sp12 = new ReadOnlySpan(data2, 0, numberOfFloatItems); 111 | var sp13 = new ReadOnlySpan(data3, 0, numberOfFloatItems); 112 | var sp2 = new Span(result, 0, numberOfFloatItems); 113 | 114 | for (int i = 0; i < sp1.Length; i++) 115 | { 116 | sp2[i] = sp1[i] * sp12[i] + sp13[i]; 117 | sp2[i] = sp2[i] * sp1[i] + sp1[i]; 118 | sp2[i] = sp1[i] * sp1[i] + sp2[i]; 119 | } 120 | } 121 | 122 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 123 | public void Vector256FloatMultipleOps() 124 | { 125 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(data, 0, numberOfFloatItems)); 126 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(data2, 0, numberOfFloatItems)); 127 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(data3, 0, numberOfFloatItems)); 128 | Span> r = MemoryMarshal.Cast>(new Span(result, 0, numberOfFloatItems)); 129 | 130 | for (int i = 0; i < d1.Length; i++) 131 | { 132 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], d3[i]); 133 | r[i] = Fma.MultiplyAdd(r[i], d1[i], d1[i]); 134 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], r[i]); 135 | } 136 | } 137 | 138 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 139 | public unsafe void Vector256FloatMultipleOpsUnsafe() 140 | { 141 | fixed (float* d1Ptr = &data[0]) 142 | { 143 | fixed (float* d2Ptr = &data2[0]) 144 | { 145 | fixed (float* d3Ptr = &data3[0]) 146 | { 147 | fixed (float* resPtr = &result[0]) 148 | { 149 | float* currD1 = d1Ptr; 150 | float* currD2 = d2Ptr; 151 | float* currD3 = d3Ptr; 152 | float* currRes = resPtr; 153 | float* limitPtr = d1Ptr + numberOfFloatItems; 154 | while (currD1 < limitPtr) 155 | { 156 | Avx.Store(currRes, Fma.MultiplyAdd(Avx.LoadVector256(currD1), Avx.LoadVector256(currD2), Avx.LoadVector256(currD3))); 157 | Avx.Store(currRes, Fma.MultiplyAdd(Avx.LoadVector256(currRes), Avx.LoadVector256(currD1), Avx.LoadVector256(currD1))); 158 | Avx.Store(currRes, Fma.MultiplyAdd(Avx.LoadVector256(currD1), Avx.LoadVector256(currD2), Avx.LoadVector256(currRes))); 159 | currD1 += 8; 160 | currD2 += 8; 161 | currD3 += 8; 162 | currRes += 8; 163 | } 164 | } 165 | } 166 | } 167 | 168 | } 169 | } 170 | 171 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 172 | public void VectorTFloatMultipleOps() 173 | { 174 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(data, 0, numberOfFloatItems)); 175 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(data2, 0, numberOfFloatItems)); 176 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(data3, 0, numberOfFloatItems)); 177 | Span> r = MemoryMarshal.Cast>(new Span(result, 0, numberOfFloatItems)); 178 | 179 | for (int i = 0; i < d1.Length; i++) 180 | { 181 | r[i] = d1[i] * d2[i] + d3[i]; 182 | r[i] = r [i] * d1[i] + d1[i]; 183 | r[i] = d1[i] * d2[i] + r[i]; 184 | } 185 | } 186 | 187 | 188 | /* 189 | [BenchmarkCategory("MultiplyAdd"), Benchmark(Baseline = true)] 190 | public unsafe void DoubleMultipleOps() 191 | { 192 | var sp1 = new ReadOnlySpan(dataD, 0, numberOfDoubleItems); 193 | var sp12 = new ReadOnlySpan(dataD2, 0, numberOfDoubleItems); 194 | var sp13 = new ReadOnlySpan(dataD3, 0, numberOfDoubleItems); 195 | var sp2 = new Span(resultD, 0, numberOfDoubleItems); 196 | 197 | for (int i = 0; i < sp1.Length; i++) 198 | { 199 | sp2[i] = sp1[i] * sp12[i] + sp13[i]; 200 | sp2[i] = sp2[i] * sp1[i] + sp1[i]; 201 | sp2[i] = sp1[i] * sp1[i] + sp2[i]; 202 | } 203 | } 204 | 205 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 206 | public unsafe void FmaMultiplyAddvector256Float() 207 | { 208 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(data, 0, numberOfFloatItems)); 209 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(data2, 0, numberOfFloatItems)); 210 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(data3, 0, numberOfFloatItems)); 211 | Span> r = MemoryMarshal.Cast>(new Span(result, 0, numberOfFloatItems)); 212 | 213 | for (int i = 0; i < d1.Length; i++) 214 | { 215 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], d3[i]); 216 | } 217 | }*/ 218 | 219 | /* 220 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 221 | public unsafe void FmaMultiplyAddvectorTFloat() 222 | { 223 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(data, 0, numberOfFloatItems)); 224 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(data2, 0, numberOfFloatItems)); 225 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(data3, 0, numberOfFloatItems)); 226 | Span> r = MemoryMarshal.Cast>(new Span(result, 0, numberOfFloatItems)); 227 | 228 | for (int i = 0; i < d1.Length; i++) 229 | { 230 | r[i] = d1[i] * d2[i] + d3[i]; 231 | } 232 | } 233 | 234 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 235 | public unsafe void Vector256DoubleMultipleOps() 236 | { 237 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(dataD, 0, numberOfDoubleItems)); 238 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(dataD2, 0, numberOfDoubleItems)); 239 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(dataD3, 0, numberOfDoubleItems)); 240 | Span> r = MemoryMarshal.Cast>(new Span(resultD, 0, numberOfDoubleItems)); 241 | 242 | for (int i = 0; i < d1.Length; i++) 243 | { 244 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], d3[i]); 245 | r[i] = Fma.MultiplyAdd(r[i], d1[i], d1[i]); 246 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], r[i]); 247 | } 248 | } */ 249 | 250 | 251 | /* 252 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 253 | public unsafe void FmaMultiplyAddSpanAMH() 254 | { 255 | //int step = Vector256.Count; 256 | 257 | ReadOnlySpan> d1 = MemoryMarshal.Cast>(new Span(dataMemory.MemoryHandle.Pointer, dataMemory.ByteArrayLength)); 258 | ReadOnlySpan> d2 = MemoryMarshal.Cast>(new Span(dataMemory2.MemoryHandle.Pointer, dataMemory2.ByteArrayLength)); 259 | ReadOnlySpan> d3 = MemoryMarshal.Cast>(new Span(dataMemory3.MemoryHandle.Pointer, dataMemory3.ByteArrayLength)); 260 | Span> r = MemoryMarshal.Cast>(new Span(resultMemory.MemoryHandle.Pointer, resultMemory.ByteArrayLength)); 261 | 262 | for (int i = 0; i < d1.Length; i++) 263 | { 264 | r[i] = Fma.MultiplyAdd(d1[i], d2[i], d3[i]); 265 | } 266 | } 267 | 268 | [BenchmarkCategory("MultiplyAdd"), Benchmark] 269 | public unsafe void FmaMultiplyAddAMHPtr() 270 | { 271 | int step = Vector256.Count; 272 | 273 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 274 | float* currSpPtr12 = (float*)dataMemory.MemoryHandle.Pointer; 275 | float* currSpPtr13 = (float*)dataMemory.MemoryHandle.Pointer; 276 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 277 | 278 | for (int i = 0; i < numberOfItems; i += step) 279 | { 280 | Avx.StoreAligned(currSpPtr2, Fma.MultiplyAdd(Avx.LoadAlignedVector256(currSpPtr), Avx.LoadAlignedVector256(currSpPtr12), Avx.LoadAlignedVector256(currSpPtr13))); 281 | currSpPtr += step; 282 | currSpPtr12 += step; 283 | currSpPtr13 += step; 284 | currSpPtr2 += step; 285 | } 286 | } 287 | 288 | [BenchmarkCategory("Negative MultiplyAdd"), Benchmark(Baseline = true)] 289 | public unsafe void NegMultiplyAdd() 290 | { 291 | var sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 292 | var sp12 = new ReadOnlySpan(dataMemory2.MemoryHandle.Pointer, numberOfItems); 293 | var sp13 = new ReadOnlySpan(dataMemory3.MemoryHandle.Pointer, numberOfItems); 294 | var sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 295 | 296 | for (int i = 0; i < sp1.Length; i++) 297 | { 298 | sp2[i] = -(sp1[i] * sp12[i]) + sp13[i]; 299 | } 300 | } 301 | 302 | [BenchmarkCategory("Negative MultiplyAdd"), Benchmark] 303 | public unsafe void FmaNegMultiplyAdd() 304 | { 305 | int step = Vector256.Count; 306 | 307 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 308 | float* currSpPtr12 = (float*)dataMemory.MemoryHandle.Pointer; 309 | float* currSpPtr13 = (float*)dataMemory.MemoryHandle.Pointer; 310 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 311 | 312 | for (int i = 0; i < numberOfItems; i += step) 313 | { 314 | Avx.StoreAligned(currSpPtr2, Fma.MultiplyAddNegated(Avx.LoadAlignedVector256(currSpPtr), Avx.LoadAlignedVector256(currSpPtr12), Avx.LoadAlignedVector256(currSpPtr13))); 315 | currSpPtr += step; 316 | currSpPtr12 += step; 317 | currSpPtr13 += step; 318 | currSpPtr2 += step; 319 | } 320 | } 321 | 322 | 323 | [BenchmarkCategory("Reciprocal"), Benchmark(Baseline = true)] 324 | public unsafe void Reciprocal() 325 | { 326 | var sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 327 | var sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 328 | 329 | for (int i = 0; i < sp1.Length; i++) 330 | { 331 | sp2[i] = 1.0f / sp1[i]; 332 | } 333 | } 334 | 335 | [BenchmarkCategory("Reciprocal"), Benchmark] 336 | public unsafe void ReciprocalDouble() 337 | { 338 | var sp1 = new ReadOnlySpan(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems); 339 | var sp2 = new Span(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems); 340 | 341 | for (int i = 0; i < sp1.Length; i++) 342 | { 343 | sp2[i] = 1.0 / sp1[i]; 344 | } 345 | } 346 | 347 | 348 | [BenchmarkCategory("Reciprocal"), Benchmark] 349 | public unsafe void VectorReciprocal() 350 | { 351 | int step = Vector256.Count; 352 | 353 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 354 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 355 | 356 | for (int i = 0; i < numberOfItems; i += step) 357 | { 358 | Avx.StoreAligned(currSpPtr2, Avx.Reciprocal(Avx.LoadAlignedVector256(currSpPtr))); 359 | currSpPtr += step; 360 | currSpPtr2 += step; 361 | } 362 | } 363 | 364 | 365 | [BenchmarkCategory("Reciprocal"), Benchmark] 366 | public unsafe void VecReciprocal() 367 | { 368 | var sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 369 | ReadOnlySpan> vecSpan = MemoryMarshal.Cast>(sp1); 370 | 371 | var sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 372 | Span> vecSpan2 = MemoryMarshal.Cast>(sp2); 373 | 374 | for (int i = 0; i < vecSpan.Length; i++) 375 | { 376 | vecSpan2[i] = Vector.One / vecSpan[i]; 377 | } 378 | } 379 | 380 | [BenchmarkCategory("Reciprocal"), Benchmark] 381 | public unsafe void VectorReciprocalDouble() 382 | { 383 | double one = 1.0; 384 | double* onePtr = &one; 385 | 386 | int step = Vector256.Count; 387 | 388 | Vector256 oneVector = Avx.BroadcastScalarToVector256(onePtr); 389 | 390 | double* currSpPtr = (double*)dataMemory.MemoryHandle.Pointer; 391 | double* currSpPtr2 = (double*)resultDoubleMemory.MemoryHandle.Pointer; 392 | 393 | for (int i = 0; i < numberOfItems; i += step) 394 | { 395 | Avx.StoreAligned(currSpPtr2, Avx.Divide(oneVector, Avx.LoadAlignedVector256(currSpPtr))); 396 | currSpPtr += step; 397 | currSpPtr2 += step; 398 | } 399 | } */ 400 | /* 401 | [Benchmark] 402 | public unsafe void RecSquareRoot() 403 | { 404 | ReadOnlySpan sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 405 | Span sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 406 | 407 | for (int i = 0; i < sp1.Length; i++) 408 | { 409 | sp2[i] = 1.0f / MathF.Sqrt(sp1[i]); 410 | } 411 | } 412 | 413 | [Benchmark] 414 | public unsafe void RecSquareRootDouble() 415 | { 416 | ReadOnlySpan sp1 = new ReadOnlySpan(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems); 417 | Span sp2 = new Span(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems); 418 | 419 | for (int i = 0; i < sp1.Length; i++) 420 | { 421 | sp2[i] = 1.0 / Math.Sqrt(sp1[i]); 422 | } 423 | } 424 | 425 | [Benchmark] 426 | public unsafe void VectorRecSquareRoot() 427 | { 428 | int step = Vector256.Count; 429 | 430 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 431 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 432 | 433 | for (int i = 0; i < numberOfItems; i += step) 434 | { 435 | Avx.StoreAligned(currSpPtr2, Avx.Reciprocal(Avx.Sqrt(Avx.LoadAlignedVector256(currSpPtr)))); 436 | currSpPtr += step; 437 | currSpPtr2 += step; 438 | } 439 | } 440 | 441 | [Benchmark] 442 | public unsafe void VectorReciprocalSqrt() 443 | { 444 | int step = Vector256.Count; 445 | 446 | float* currSpPtr = (float*)dataMemory.MemoryHandle.Pointer; 447 | float* currSpPtr2 = (float*)resultMemory.MemoryHandle.Pointer; 448 | 449 | for (int i = 0; i < numberOfItems; i += step) 450 | { 451 | Avx.StoreAligned(currSpPtr2, Avx.ReciprocalSqrt(Avx.LoadAlignedVector256(currSpPtr))); 452 | currSpPtr += step; 453 | currSpPtr2 += step; 454 | } 455 | } 456 | 457 | [Benchmark] 458 | public unsafe void VectorRecSquareRootDouble() 459 | { 460 | double one = 1.0; 461 | double* onePt = &one; 462 | Vector256 oneVec = Avx.BroadcastScalarToVector256(onePt); 463 | 464 | int step = Vector256.Count; 465 | 466 | double* currSpPtr = (double*)dataMemory.MemoryHandle.Pointer; 467 | double* currSpPtr2 = (double*)resultDoubleMemory.MemoryHandle.Pointer; 468 | 469 | for (int i = 0; i < numberOfItems; i += step) 470 | { 471 | 472 | Avx.StoreAligned(currSpPtr2, Avx.Divide(oneVec, Avx.Sqrt(Avx.LoadAlignedVector256(currSpPtr)))); 473 | currSpPtr += step; 474 | currSpPtr2 += step; 475 | } 476 | } */ 477 | /* 478 | [BenchmarkCategory("Square root"), Benchmark(Baseline = true)] 479 | public unsafe void SquareRoot() 480 | { 481 | var sp1 = new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfItems); 482 | var sp2 = new Span(resultMemory.MemoryHandle.Pointer, numberOfItems); 483 | 484 | for (int i = 0; i < sp1.Length; i++) 485 | { 486 | sp2[i] = MathF.Sqrt(sp1[i]); 487 | } 488 | } 489 | 490 | [BenchmarkCategory("Square root"), Benchmark] 491 | public unsafe void SquareRootDouble() 492 | { 493 | var sp1 = new ReadOnlySpan(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems); 494 | var sp2 = new Span(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems); 495 | 496 | for (int i = 0; i < sp1.Length; i++) 497 | { 498 | sp2[i] = Math.Sqrt(sp1[i]); 499 | } 500 | } 501 | 502 | [BenchmarkCategory("Square root"), Benchmark] 503 | public unsafe void VectorSquareRoot() 504 | { 505 | int step = Vector256