├── Core3Intrinsics
├── Core3Intrinsics.csproj
├── Program.cs
├── Validator.cs
├── Transpose.cs
├── Intro.cs
└── Mandelbrot.cs
├── Core3IntrinsicsBenchmarks
├── Program.cs
├── Core3IntrinsicsBenchmarks.csproj
├── AlignedMemoryHandle.cs
├── ReadmeBenches.cs
├── TrigonometricOps.cs
├── AlignedArrayPool.cs
├── Mandelbrot.cs
├── MemoryBenches.cs
├── IntegerBasicOps.cs
└── BasicOps.cs
├── LICENSE
├── Core3Intrinsics.sln
├── .gitattributes
├── ExtraFiles
├── MemoryBenches2.md
├── MemoryBenches-Aligned.md
└── MemoryBenches-1.md
├── .gitignore
└── Readme.md
/Core3Intrinsics/Core3Intrinsics.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | netcoreapp3.0
6 | 8.0
7 |
8 |
9 |
10 | x64
11 | true
12 |
13 |
14 |
15 | x64
16 | true
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/Core3IntrinsicsBenchmarks/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using BenchmarkDotNet.Running;
3 | using BenchmarkDotNet.Configs;
4 | using System.Collections.Generic;
5 |
6 | namespace Core3IntrinsicsBenchmarks
7 | {
8 | class Program // Console entry point of the benchmark project.
9 | {
10 | static void Main() // Executes a single BenchmarkRunner.Run call; the alternatives are kept commented out.
11 | { // NOTE(review): all Run calls below read identically in this listing; presumably each one originally selected a different benchmark class through a generic type argument - confirm against the repository.
12 | //var summary = BenchmarkRunner.Run();
13 | //_ = BenchmarkRunner.Run();
14 | //var summary = BenchmarkRunner.Run();
15 | //var summary = BenchmarkRunner.Run();
16 | var summary = BenchmarkRunner.Run(); // The only active call; 'summary' is assigned but not read afterwards.
17 | //var summary = BenchmarkRunner.Run();
18 | }
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/Core3IntrinsicsBenchmarks/Core3IntrinsicsBenchmarks.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | netcoreapp3.0
6 | 7.3
7 |
8 |
9 |
10 | x64
11 | true
12 | pdbonly
13 | true
14 |
15 |
16 |
17 | x64
18 | true
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 C. B. Gonzalez
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Core3IntrinsicsBenchmarks/AlignedMemoryHandle.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Buffers;
3 | using System.Collections.Generic;
4 | using System.Runtime.InteropServices;
5 | using System.Text;
6 |
7 | namespace Core3IntrinsicsBenchmarks
8 | {
/// <summary>
/// Describes a block of memory identified by a raw pointer plus the <see cref="GCHandle"/>
/// that keeps it alive, and exposes it as bytes or as elements of <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Element type the bytes are reinterpreted as.</typeparam>
public unsafe class AlignedMemoryHandle<T> where T : struct
{
    private MemoryHandle memoryHandle;
    readonly byte* bytePointer;
    readonly int byteArrayLength;
    readonly Memory<T> memory;

    /// <summary>The <see cref="System.Buffers.MemoryHandle"/> built from the pointer and GC handle given to the constructor.</summary>
    public MemoryHandle MemoryHandle => memoryHandle;

    /// <summary>Reference to the first byte of the wrapped block.</summary>
    public ref byte ByteRef => ref GetByteRef();

    /// <summary>
    /// Reference to the first <typeparamref name="T"/> element of the wrapped block.
    /// Writes through this reference modify the wrapped memory itself.
    /// </summary>
    public ref T TRef => ref GetTRef();

    /// <summary>
    /// A managed copy of the block's contents taken when the handle was constructed.
    /// It does not alias the wrapped memory: later writes to either side are not visible on the other.
    /// </summary>
    public Memory<T> Memory => memory;

    /// <summary>Length of the wrapped block in bytes, as passed to the constructor.</summary>
    public int ByteArrayLength => byteArrayLength;

    /// <summary>Creates a handle over <paramref name="byteLength"/> bytes starting at <paramref name="pointer"/>.</summary>
    /// <param name="pointer">Start address of the block.</param>
    /// <param name="handle">GC handle stored inside the resulting <see cref="MemoryHandle"/>.</param>
    /// <param name="arrayStart">Not used by this type; kept so existing callers keep compiling.</param>
    /// <param name="byteLength">Size of the block in bytes.</param>
    public unsafe AlignedMemoryHandle(void* pointer, GCHandle handle, ref T arrayStart, int byteLength)
    {
        memoryHandle = new MemoryHandle(pointer, handle);
        bytePointer = (byte*)pointer;
        byteArrayLength = byteLength;
        // Memory<T> cannot wrap a raw pointer directly, so a snapshot copy is stored instead.
        memory = new Memory<T>(MemoryMarshal.Cast<byte, T>(new Span<byte>(pointer, byteLength)).ToArray());
    }

    private unsafe ref byte GetByteRef()
    {
        return ref bytePointer[0];
    }

    private unsafe ref T GetTRef()
    {
        // Index the reinterpreted span directly. The previous implementation called ToArray()
        // first and therefore handed out a reference into a throw-away copy.
        Span<T> elements = MemoryMarshal.Cast<byte, T>(new Span<byte>((void*)bytePointer, byteArrayLength));
        return ref elements[0];
    }
}
46 | }
47 |
--------------------------------------------------------------------------------
/Core3Intrinsics/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.InteropServices;
3 |
4 | namespace Core3Intrinsics
5 | {
/// <summary>
/// Console driver: runs the scalar and the Vector256 Mandelbrot implementations,
/// compares their outputs with <c>Validator.CompareValuesFloat</c> and prints the outcome.
/// </summary>
class Program
{
    static unsafe void Main()
    {
        Console.WriteLine("Starting test ...");
        Console.WriteLine("\tMandelBrot");

        var mandelbrot = new Mandelbrot();
        mandelbrot.FloatMandel();
        mandelbrot.Vector256Mandel();

        // Snapshot both result buffers as arrays so they can be compared element by element.
        float[] scalarValues = mandelbrot.results.Span.ToArray();
        float[] vectorValues = mandelbrot.results2.Span.ToArray();
        var comparison = Validator.CompareValuesFloat(scalarValues, vectorValues);

        Console.WriteLine($"\t\tMandelBrot successful: {comparison.Item1}, Number of differences: {comparison.Item2.Count}, max. difference: {comparison.Item3}");
        Console.WriteLine($"\t\tDone with mandelbrot, total bytes: {mandelbrot.SizeInBytes}");

        // Disabled Transpose check, kept for reference:
        //   Transpose.CreateArrays();
        //   bool res1 = Transpose.SerializeColorsInt();
        //   bool res2 = Transpose.SerializedColorsVector256();
        //   if (res1 && res2)
        //   {
        //       var (areEqual, errorList) = Validator.CompareValues(Transpose.transposed1, Transpose.transposed2);
        //       Console.WriteLine($"Transpose ended with success {areEqual}, number of differences {errorList.Count}");
        //   }
        //   else
        //   {
        //       Console.WriteLine($"Error running Transpose");
        //   }

        // Keep the console window open until the user presses Enter.
        _ = Console.ReadLine();
    }
}
33 | }
34 |
--------------------------------------------------------------------------------
/Core3Intrinsics.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.29215.179
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Core3Intrinsics", "Core3Intrinsics\Core3Intrinsics.csproj", "{8ABE3139-8924-46FE-B8D4-155FE20DD285}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Core3IntrinsicsBenchmarks", "Core3IntrinsicsBenchmarks\Core3IntrinsicsBenchmarks.csproj", "{FFEC9419-D276-46DB-8136-4642054E1C99}"
9 | EndProject
10 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{A3F9D91A-297A-40E2-9714-D009F3FB9CF0}"
11 | ProjectSection(SolutionItems) = preProject
12 | Readme.md = Readme.md
13 | EndProjectSection
14 | EndProject
15 | Global
16 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
17 | Debug|Any CPU = Debug|Any CPU
18 | Release|Any CPU = Release|Any CPU
19 | EndGlobalSection
20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
21 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
22 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Debug|Any CPU.Build.0 = Debug|Any CPU
23 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Release|Any CPU.ActiveCfg = Release|Any CPU
24 | {8ABE3139-8924-46FE-B8D4-155FE20DD285}.Release|Any CPU.Build.0 = Release|Any CPU
25 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
26 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Debug|Any CPU.Build.0 = Debug|Any CPU
27 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Release|Any CPU.ActiveCfg = Release|Any CPU
28 | {FFEC9419-D276-46DB-8136-4642054E1C99}.Release|Any CPU.Build.0 = Release|Any CPU
29 | EndGlobalSection
30 | GlobalSection(SolutionProperties) = preSolution
31 | HideSolutionNode = FALSE
32 | EndGlobalSection
33 | GlobalSection(ExtensibilityGlobals) = postSolution
34 | SolutionGuid = {0AA0631C-9878-463C-8661-45CA8F282505}
35 | EndGlobalSection
36 | EndGlobal
37 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is needed for earlier builds of msysgit that do not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/Core3IntrinsicsBenchmarks/ReadmeBenches.cs:
--------------------------------------------------------------------------------
1 | using BenchmarkDotNet.Attributes;
2 | using BenchmarkDotNet.Configs;
3 | using BenchmarkDotNet.Exporters;
4 | using BenchmarkDotNet.Exporters.Csv;
5 | using System;
6 | using System.Buffers;
7 | using System.Runtime.CompilerServices;
8 | using System.Runtime.InteropServices;
9 | using System.Runtime.Intrinsics;
10 | using System.Runtime.Intrinsics.X86;
11 |
12 | namespace Core3IntrinsicsBenchmarks
13 | {
/// <summary>
/// Benchmarks used for the readme: element-wise square root over a float array,
/// once through spans of <see cref="Vector256{T}"/> and once through raw pointers.
/// </summary>
[DisassemblyDiagnoser(printAsm: true, printSource: true)]
public class ReadmeBenches
{
    /// <summary>Number of floats in the input array created by <see cref="GlobalSetup"/>.</summary>
    [Params(4096/*, 1048576*/)]
    public int NumberOfFloats { get; set; }

    private static float[] inputData;

    /// <summary>Allocates the input array and fills it with 1, 2, 3, ...</summary>
    [GlobalSetup]
    public void GlobalSetup()
    {
        inputData = new float[NumberOfFloats];
        for (int i = 0; i < inputData.Length; i++)
        {
            inputData[i] = i + 1;
        }
    }

    /// <summary>
    /// Span-based variant. Besides the square roots it evaluates one dot product and stores
    /// its first element in <c>results[0]</c>.
    /// </summary>
    /// <returns>A new array with the square roots; element 0 holds the dot-product value instead.</returns>
    [Benchmark(Baseline = true)]
    public float[] ProcessData()
    {
        var left = Vector256.Create(-2.5f); // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5>
        var right = Vector256.Create(5.0f); // <5, 5, 5, 5, 5, 5, 5, 5>
        Vector256<float> result = Avx.DotProduct(left, right, 0b1111_0001); // result = <-50, 0, 0, 0, -50, 0, 0, 0>
        float[] results = new float[inputData.Length];
        Span<Vector256<float>> resultVectors = MemoryMarshal.Cast<float, Vector256<float>>(results);

        ReadOnlySpan<Vector256<float>> inputVectors = MemoryMarshal.Cast<float, Vector256<float>>(inputData);

        for (int i = 0; i < inputVectors.Length; i++)
        {
            resultVectors[i] = Avx.Sqrt(inputVectors[i]);
        }

        // MemoryMarshal.Cast drops elements that do not fill a whole vector; finish them one by one.
        for (int i = inputVectors.Length * Vector256<float>.Count; i < results.Length; i++)
        {
            results[i] = MathF.Sqrt(inputData[i]);
        }

        // Guarded so that an empty input no longer throws.
        if (results.Length > 0)
        {
            results[0] = result.GetElement(0);
        }
        return results;
    }

    /// <summary>Pointer-based variant: loads, takes the square root of and stores eight floats per iteration.</summary>
    /// <returns>A new array holding the square root of every input element.</returns>
    [Benchmark]
    public unsafe float[] ProcessDataUnsafe()
    {
        float[] results = new float[inputData.Length];
        // Only whole vectors go through the AVX loop; stepping by 8 up to the raw length
        // would read and write past the end of the arrays when the length is not a multiple of 8.
        int vectorizedLength = results.Length - (results.Length % Vector256<float>.Count);

        // 'fixed' on the array itself (instead of &array[0]) also accepts empty arrays.
        fixed (float* inputPtr = inputData)
        fixed (float* resultPtr = results)
        {
            float* inCurrent = inputPtr;
            float* resCurrent = resultPtr;

            float* vectorEnd = resultPtr + vectorizedLength;
            while (resCurrent < vectorEnd)
            {
                Avx.Store(resCurrent, Avx.Sqrt(Avx.LoadVector256(inCurrent)));
                resCurrent += 8;
                inCurrent += 8;
            }

            // Scalar loop for the remaining (at most seven) elements.
            float* resEnd = resultPtr + results.Length;
            while (resCurrent < resEnd)
            {
                *resCurrent = MathF.Sqrt(*inCurrent);
                resCurrent++;
                inCurrent++;
            }
        }
        return results;
    }
}
74 | }
75 |
--------------------------------------------------------------------------------
/Core3Intrinsics/Validator.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace Core3Intrinsics
6 | {
/// <summary>
/// Helpers that compare two arrays element by element and report the indexes at which they differ.
/// </summary>
public static class Validator
{
    /// <summary>Compares two arrays using <see cref="EqualityComparer{T}.Default"/>.</summary>
    /// <param name="left">First array.</param>
    /// <param name="right">Second array; must have the same length as <paramref name="left"/>.</param>
    /// <returns>Whether all elements are equal, and the list of indexes whose elements differ.</returns>
    /// <exception cref="ArgumentNullException">Either array is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The arrays have different lengths.</exception>
    public static (bool, List<int>) CompareValues<T>(T[] left, T[] right) where T : struct
    {
        EnsureComparable(left, right);

        var differIndexes = new List<int>();
        EqualityComparer<T> comparer = EqualityComparer<T>.Default;
        for (int i = 0; i < left.Length; i++)
        {
            if (!comparer.Equals(left[i], right[i]))
            {
                differIndexes.Add(i);
            }
        }

        return (differIndexes.Count == 0, differIndexes);
    }

    /// <summary>Compares two float arrays with <c>!=</c> and tracks the largest absolute difference.</summary>
    /// <param name="left">First array.</param>
    /// <param name="right">Second array; must have the same length as <paramref name="left"/>.</param>
    /// <returns>
    /// Whether all elements are equal, the indexes whose elements differ, and the largest absolute
    /// difference truncated to an integer (so differences below 1 are reported as 0).
    /// </returns>
    /// <exception cref="ArgumentNullException">Either array is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The arrays have different lengths.</exception>
    public static (bool, List<int>, int) CompareValuesFloat(float[] left, float[] right)
    {
        EnsureComparable(left, right);

        var differIndexes = new List<int>();
        int maxDifference = 0;
        for (int i = 0; i < left.Length; i++)
        {
            // NaN compares unequal to everything, itself included, so NaN entries are always listed.
            if (left[i] != right[i])
            {
                differIndexes.Add(i);
                float difference = Math.Abs(left[i] - right[i]);
                if (difference > maxDifference)
                {
                    maxDifference = (int)difference;
                }
            }
        }

        return (differIndexes.Count == 0, differIndexes, maxDifference);
    }

    /// <summary>Compares two double arrays with <c>!=</c> and tracks the largest absolute difference.</summary>
    /// <param name="left">First array.</param>
    /// <param name="right">Second array; must have the same length as <paramref name="left"/>.</param>
    /// <returns>
    /// Whether all elements are equal, the indexes whose elements differ, and the largest absolute
    /// difference truncated to an integer (so differences below 1 are reported as 0).
    /// </returns>
    /// <exception cref="ArgumentNullException">Either array is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The arrays have different lengths.</exception>
    public static (bool, List<int>, int) CompareValuesDouble(double[] left, double[] right)
    {
        EnsureComparable(left, right);

        var differIndexes = new List<int>();
        int maxDifference = 0;
        for (int i = 0; i < left.Length; i++)
        {
            // NaN compares unequal to everything, itself included, so NaN entries are always listed.
            if (left[i] != right[i])
            {
                differIndexes.Add(i);
                double difference = Math.Abs(left[i] - right[i]);
                if (difference > maxDifference)
                {
                    maxDifference = (int)difference;
                }
            }
        }

        return (differIndexes.Count == 0, differIndexes, maxDifference);
    }

    // Shared argument validation: both arrays must be non-null and equally long.
    private static void EnsureComparable(Array left, Array right)
    {
        if (left is null)
        {
            throw new ArgumentNullException(nameof(left));
        }
        if (right is null)
        {
            throw new ArgumentNullException(nameof(right));
        }
        if (left.Length != right.Length)
        {
            // The single-string constructor treats its argument as the parameter name,
            // so the text has to be passed through the (paramName, message) overload.
            throw new ArgumentOutOfRangeException(nameof(right), $"Arrays not of the same length: {nameof(left)} {nameof(right)}.");
        }
    }
}
80 | }
81 |
--------------------------------------------------------------------------------
/ExtraFiles/MemoryBenches2.md:
--------------------------------------------------------------------------------
1 | ``` ini
2 |
3 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362
4 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores
5 | .NET Core SDK=3.0.100-preview9-014004
6 | [Host] : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
7 | DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
8 |
9 |
10 | ```
11 | | Method | numberOfBytes | Mean | Error | StdDev | Ratio | RatioSD |
12 | |-------------------------------- |-------------- |---------------:|--------------:|--------------:|------:|--------:|
13 | | **ScalarStoreBlock** | **16384** | **298.5 ns** | **5.924 ns** | **9.047 ns** | **1.00** | **0.00** |
14 | | VectorStoreArrayMemPtr | 16384 | 394.1 ns | 10.456 ns | 16.885 ns | 1.32 | 0.06 |
15 | | VectorStoreArrayMemPtrUnaligned | 16384 | 495.0 ns | 9.477 ns | 10.140 ns | 1.66 | 0.07 |
16 | | | | | | | | |
17 | | **ScalarStoreBlock** | **131072** | **6,225.2 ns** | **116.328 ns** | **103.122 ns** | **1.00** | **0.00** |
18 | | VectorStoreArrayMemPtr | 131072 | 6,772.1 ns | 77.929 ns | 65.074 ns | 1.09 | 0.02 |
19 | | VectorStoreArrayMemPtrUnaligned | 131072 | 7,245.7 ns | 130.736 ns | 115.894 ns | 1.16 | 0.03 |
20 | | | | | | | | |
21 | | **ScalarStoreBlock** | **1048576** | **67,515.4 ns** | **2,549.673 ns** | **2,618.326 ns** | **1.00** | **0.00** |
22 | | VectorStoreArrayMemPtr | 1048576 | 80,868.2 ns | 1,569.923 ns | 1,928.007 ns | 1.20 | 0.05 |
23 | | VectorStoreArrayMemPtrUnaligned | 1048576 | 83,708.5 ns | 1,995.286 ns | 2,134.934 ns | 1.24 | 0.05 |
24 | | | | | | | | |
25 | | **ScalarStoreBlock** | **2097152** | **189,619.0 ns** | **7,155.162 ns** | **21,097.157 ns** | **1.00** | **0.00** |
26 | | VectorStoreArrayMemPtr | 2097152 | 271,783.7 ns | 5,376.659 ns | 11,914.305 ns | 1.41 | 0.17 |
27 | | VectorStoreArrayMemPtrUnaligned | 2097152 | 274,970.6 ns | 5,310.311 ns | 5,453.298 ns | 1.44 | 0.15 |
28 | | | | | | | | |
29 | | **ScalarStoreBlock** | **8388608** | **1,105,687.5 ns** | **10,205.821 ns** | **8,522.323 ns** | **1.00** | **0.00** |
30 | | VectorStoreArrayMemPtr | 8388608 | 1,573,145.8 ns | 31,795.047 ns | 29,741.107 ns | 1.42 | 0.02 |
31 | | VectorStoreArrayMemPtrUnaligned | 8388608 | 1,568,842.2 ns | 28,942.750 ns | 27,073.066 ns | 1.42 | 0.03 |
32 |
--------------------------------------------------------------------------------
/Core3IntrinsicsBenchmarks/TrigonometricOps.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Numerics;
3 | using System.Runtime.Intrinsics;
4 | using BenchmarkDotNet.Attributes;
5 | using System.Runtime.InteropServices;
6 | using System.Runtime.Intrinsics.X86;
7 | using System.Buffers;
8 |
9 | namespace Core3IntrinsicsBenchmarks
10 | {
/// <summary>
/// Cosine benchmarks over buffers rented from <c>AlignedArrayPool</c>:
/// <c>Math.Cos</c> on floats, <c>MathF.Cos</c> on floats and <c>Math.Cos</c> on doubles.
/// </summary>
[DisassemblyDiagnoser(printAsm: true, printSource: true)]
public class TrigonometricOps
{
    const int l1CacheSize = 32 * 1024; // one L1 cache, 32 kB
    private int numberOfItems;
    public static int algn = 32;
    public AlignedArrayPool<float> floatPool;
    public AlignedArrayPool<double> doublePool;
    AlignedMemoryHandle<float> dataMemory, resultMemory;
    AlignedMemoryHandle<double> dataDoubleMemory, resultDoubleMemory;

    /// <summary>Creates both pools, rents input and output buffers and initialises their contents.</summary>
    [GlobalSetup]
    public unsafe void GlobalSetup()
    {
        numberOfItems = (l1CacheSize / sizeof(double) / 2) - 8;

        floatPool = new AlignedArrayPool<float>();
        doublePool = new AlignedArrayPool<double>();

        dataMemory = floatPool.Rent(numberOfItems);
        resultMemory = floatPool.Rent(numberOfItems);
        dataDoubleMemory = doublePool.Rent(numberOfItems);
        resultDoubleMemory = doublePool.Rent(numberOfItems);

        var floatInput = new Span<float>(dataMemory.MemoryHandle.Pointer, numberOfItems);
        var floatOutput = new Span<float>(resultMemory.MemoryHandle.Pointer, numberOfItems);
        var doubleInput = new Span<double>(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems);
        var doubleOutput = new Span<double>(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems);

        // Inputs receive index + 0.01, outputs start zeroed.
        for (int index = 0; index < numberOfItems; index++)
        {
            floatInput[index] = index + 0.01f;
            floatOutput[index] = 0.0f;
            doubleInput[index] = index + 0.01;
            doubleOutput[index] = 0.0;
        }
    }

    /// <summary>Returns the four rented buffers to their pools and disposes both pools.</summary>
    [GlobalCleanup]
    public void GlobalCleanup()
    {
        floatPool.Return(resultMemory, clearArray: false);
        floatPool.Return(dataMemory, clearArray: false);
        doublePool.Return(resultDoubleMemory, clearArray: false);
        doublePool.Return(dataDoubleMemory, clearArray: false);
        floatPool.Dispose();
        doublePool.Dispose();
    }

    /// <summary>Float cosine computed through <c>Math.Cos</c> with a cast back to float.</summary>
    [Benchmark]
    public unsafe void Cos()
    {
        var input = new ReadOnlySpan<float>(dataMemory.MemoryHandle.Pointer, numberOfItems);
        var output = new Span<float>(resultMemory.MemoryHandle.Pointer, numberOfItems);

        for (int index = 0; index < input.Length; index++)
        {
            output[index] = (float)Math.Cos(input[index]);
        }
    }

    /// <summary>Float cosine computed through <c>MathF.Cos</c>.</summary>
    [Benchmark]
    public unsafe void CosMathF()
    {
        var input = new ReadOnlySpan<float>(dataMemory.MemoryHandle.Pointer, numberOfItems);
        var output = new Span<float>(resultMemory.MemoryHandle.Pointer, numberOfItems);

        for (int index = 0; index < input.Length; index++)
        {
            output[index] = MathF.Cos(input[index]);
        }
    }

    /// <summary>Double cosine computed through <c>Math.Cos</c>.</summary>
    [Benchmark]
    public unsafe void CosDouble()
    {
        var input = new ReadOnlySpan<double>(dataDoubleMemory.MemoryHandle.Pointer, numberOfItems);
        var output = new Span<double>(resultDoubleMemory.MemoryHandle.Pointer, numberOfItems);

        for (int index = 0; index < input.Length; index++)
        {
            output[index] = Math.Cos(input[index]);
        }
    }
}
97 | }
98 |
--------------------------------------------------------------------------------
/ExtraFiles/MemoryBenches-Aligned.md:
--------------------------------------------------------------------------------
1 | ``` ini
2 |
3 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362
4 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores
5 | .NET Core SDK=3.0.100-preview9-014004
6 | [Host] : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
7 | DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
8 |
9 |
10 | ```
11 | | Method | NumberOfBytes | Mean | Error | StdDev | Median | Ratio | RatioSD |
12 | |----------------------------- |-------------- |---------------:|--------------:|--------------:|---------------:|------:|--------:|
13 | | **VectorStoreAligned** | **16384** | **504.7 ns** | **7.635 ns** | **8.792 ns** | **504.6 ns** | **1.00** | **0.00** |
14 | | VectorStoreArrayMemPtr | 16384 | 385.1 ns | 6.161 ns | 4.810 ns | 383.8 ns | 0.76 | 0.01 |
15 | | VectorStoreArrayMemSafe | 16384 | 597.0 ns | 11.873 ns | 12.193 ns | 595.5 ns | 1.18 | 0.03 |
16 | | VectorStoreArraySimpleBuffer | 16384 | 640.5 ns | 22.126 ns | 18.476 ns | 636.5 ns | 1.27 | 0.05 |
17 | | | | | | | | | |
18 | | **VectorStoreAligned** | **131072** | **9,865.0 ns** | **199.512 ns** | **279.687 ns** | **9,767.2 ns** | **1.00** | **0.00** |
19 | | VectorStoreArrayMemPtr | 131072 | 9,637.7 ns | 94.004 ns | 83.332 ns | 9,645.3 ns | 0.97 | 0.03 |
20 | | VectorStoreArrayMemSafe | 131072 | 6,181.7 ns | 120.563 ns | 148.062 ns | 6,144.4 ns | 0.63 | 0.03 |
21 | | VectorStoreArraySimpleBuffer | 131072 | 9,925.4 ns | 260.502 ns | 230.929 ns | 9,855.4 ns | 1.00 | 0.03 |
22 | | | | | | | | | |
23 | | **VectorStoreAligned** | **1048576** | **79,435.3 ns** | **1,865.323 ns** | **2,220.535 ns** | **78,294.8 ns** | **1.00** | **0.00** |
24 | | VectorStoreArrayMemPtr | 1048576 | 98,353.8 ns | 2,720.589 ns | 2,271.815 ns | 97,951.3 ns | 1.24 | 0.03 |
25 | | VectorStoreArrayMemSafe | 1048576 | 79,803.5 ns | 1,712.943 ns | 3,000.081 ns | 78,598.9 ns | 1.01 | 0.06 |
26 | | VectorStoreArraySimpleBuffer | 1048576 | 79,867.6 ns | 2,257.561 ns | 2,318.349 ns | 79,063.7 ns | 1.00 | 0.05 |
27 | | | | | | | | | |
28 | | **VectorStoreAligned** | **2097152** | **216,500.1 ns** | **4,992.955 ns** | **14,164.183 ns** | **212,591.0 ns** | **1.00** | **0.00** |
29 | | VectorStoreArrayMemPtr | 2097152 | 346,242.9 ns | 6,797.722 ns | 9,304.799 ns | 341,851.5 ns | 1.58 | 0.12 |
30 | | VectorStoreArrayMemSafe | 2097152 | 205,378.0 ns | 3,818.530 ns | 3,188.646 ns | 205,488.9 ns | 0.93 | 0.07 |
31 | | VectorStoreArraySimpleBuffer | 2097152 | 228,231.7 ns | 4,517.376 ns | 10,736.022 ns | 225,121.4 ns | 1.06 | 0.09 |
32 | | | | | | | | | |
33 | | **VectorStoreAligned** | **8388608** | **1,503,050.0 ns** | **28,335.402 ns** | **27,829.153 ns** | **1,490,845.2 ns** | **1.00** | **0.00** |
34 | | VectorStoreArrayMemPtr | 8388608 | 1,506,756.1 ns | 19,681.599 ns | 17,447.225 ns | 1,503,300.3 ns | 1.00 | 0.02 |
35 | | VectorStoreArrayMemSafe | 8388608 | 1,536,087.1 ns | 26,551.526 ns | 23,537.236 ns | 1,531,720.1 ns | 1.02 | 0.03 |
36 | | VectorStoreArraySimpleBuffer | 8388608 | 1,541,513.7 ns | 32,303.380 ns | 30,216.602 ns | 1,536,127.9 ns | 1.02 | 0.03 |
37 |
--------------------------------------------------------------------------------
/Core3Intrinsics/Transpose.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.InteropServices;
3 | using System.Runtime.Intrinsics;
4 | using System.Runtime.Intrinsics.X86;
5 |
6 | namespace Core3Intrinsics
7 | {
/// <summary>
/// Regroups an interleaved array of 4-component items (presumably colours, judging by the
/// method names) so that, for every run of eight items, the eight first components come first,
/// then the eight second components, and so on. A scalar and an AVX2 implementation are
/// provided so that their outputs can be compared.
/// </summary>
public static class Transpose
{
    private const int defWidth = 1920, defHeight = 1080, numberOfElements = 8;
    private static int currWidth, currHeight;
    private static int[] original;
    public static int[] transposed1, transposed2;

    private static bool isInitialized = false;

    /// <summary>Scalar implementation; writes its output to <see cref="transposed1"/>.</summary>
    /// <returns><c>false</c> if <see cref="CreateArrays"/> has not been called, otherwise <c>true</c>.</returns>
    /// <remarks>The width passed to <see cref="CreateArrays"/> has to be a multiple of 8; other widths index past the row buffer.</remarks>
    public static bool SerializeColorsInt()
    {
        if (!isInitialized)
        {
            return false;
        }

        int[] colorComponents = new int[currWidth * 4];
        Span<int> colorsSpan = transposed1;
        int runningCounter = 0;
        for (int y = 0; y < currHeight; y++)
        {
            // runningCounter still points at the start of the row here, so this is the row's output slice.
            Span<int> currColors = colorsSpan.Slice(runningCounter, currWidth * 4);
            for (int x = 0; x < currWidth; x += numberOfElements)
            {
                for (int i = 0; i < numberOfElements; i++)
                {
                    // Item i of the current group of eight: its four components are written
                    // eight slots apart.
                    int start = x * 4 + i;
                    colorComponents[start] = original[runningCounter];
                    colorComponents[start + numberOfElements] = original[runningCounter + 1];
                    colorComponents[start + (2 * numberOfElements)] = original[runningCounter + 2];
                    colorComponents[start + (3 * numberOfElements)] = original[runningCounter + 3];
                    runningCounter += 4;
                }
            }
            colorComponents.CopyTo(currColors);
        }
        return true;
    }

    /// <summary>AVX2 implementation; writes its output to <see cref="transposed2"/>.</summary>
    /// <returns>
    /// <c>false</c> if <see cref="CreateArrays"/> has not been called or the CPU does not support AVX2,
    /// otherwise <c>true</c>.
    /// </returns>
    /// <remarks>Processes four vectors (32 ints) per iteration, so width * height has to be a multiple of 8.</remarks>
    public static bool SerializedColorsVector256()
    {
        // Without this guard the intrinsics below throw PlatformNotSupportedException on CPUs lacking AVX2.
        if (!isInitialized || !Avx2.IsSupported)
        {
            return false;
        }

        Span<Vector256<int>> originVectors = MemoryMarshal.Cast<int, Vector256<int>>(original);
        Span<Vector256<int>> transposedVectors = MemoryMarshal.Cast<int, Vector256<int>>(transposed2);
        Vector256<int> pm0, pm1, pm2, pm3, up0, up1, up2, up3;
        for (int i = 0; i < originVectors.Length; i += 4)
        {
            // Pair up 128-bit halves (one item each): pm0 = items 0,4  pm1 = items 2,6  pm2 = items 1,5  pm3 = items 3,7.
            pm0 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x20);
            pm1 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x20);
            pm2 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x31);
            pm3 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x31);

            // First interleave: components 0/1 end up in up0 and up2, components 2/3 in up1 and up3.
            up0 = Avx2.UnpackLow(pm0, pm1);
            up1 = Avx2.UnpackHigh(pm0, pm1);
            up2 = Avx2.UnpackLow(pm2, pm3);
            up3 = Avx2.UnpackHigh(pm2, pm3);

            // Second interleave: each output vector holds one component of items 0..7 in order.
            transposedVectors[i] = Avx2.UnpackLow(up0, up2);
            transposedVectors[i + 1] = Avx2.UnpackHigh(up0, up2);
            transposedVectors[i + 2] = Avx2.UnpackLow(up1, up3);
            transposedVectors[i + 3] = Avx2.UnpackHigh(up1, up3);
        }

        return true;
    }

    /// <summary>Allocates the input and both output arrays and fills the input with 0, 1, 2, ...</summary>
    /// <param name="width">Items per row; defaults to 1920.</param>
    /// <param name="height">Number of rows; defaults to 1080.</param>
    public static void CreateArrays(int width = defWidth, int height = defHeight)
    {
        currWidth = width;
        currHeight = height;

        original = new int[4 * currWidth * currHeight];
        transposed1 = new int[4 * currHeight * currWidth];
        transposed2 = new int[4 * currHeight * currWidth];

        for (int i = 0; i < original.Length; i++)
        {
            original[i] = i;
        }

        isInitialized = true;
    }
}
96 | }
97 |
--------------------------------------------------------------------------------
/Core3IntrinsicsBenchmarks/AlignedArrayPool.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Buffers;
3 | using System.Collections.Generic;
4 | using System.Runtime.InteropServices;
5 | using System.Text;
6 |
7 | namespace Core3IntrinsicsBenchmarks
8 | {
9 | public class AlignedArrayPool : IDisposable where T : struct
10 | {
11 | private bool disposedValue = false; // To detect redundant calls
12 |
13 | private static readonly object lockObject = new object();
14 | private readonly ArrayPool pool = ArrayPool.Shared;
15 | private const int defaultByteAlignment = 32;
16 |
17 |
18 | private readonly int tSize, currentAlignment;
19 | private readonly List<(byte[], GCHandle, IntPtr, int)> allBuffers;
20 | private readonly List<(MemoryHandle, GCHandle, byte[])> allMemoryHandles;
21 |
22 | public AlignedArrayPool()
23 | {
24 | Type tp = typeof(T);
25 | tSize = Marshal.SizeOf(tp);
26 | if (!tp.IsValueType || tp.IsEnum)
27 | {
28 | throw new ArgumentException("Invalid type, must be numeric.");
29 | }
30 | currentAlignment = defaultByteAlignment;
31 | allMemoryHandles = new List<(MemoryHandle, GCHandle, byte[])>();
32 | allBuffers = new List<(byte[], GCHandle,IntPtr, int)>();
33 | }
34 |
35 | public unsafe AlignedMemoryHandle Rent(int minimumLength, int byteAlignment)
36 | {
37 | byte[] buff = pool.Rent(minimumLength * tSize + 2 * byteAlignment); // see comment below, could just be 1 *
38 | var handle = GCHandle.Alloc(buff, GCHandleType.Pinned);
39 | allBuffers.Add((buff, handle, IntPtr.Zero, 0));
40 | MemoryHandle memHand;
41 | AlignedMemoryHandle alMemHand;
42 | int currIdx;
43 | lock (lockObject)
44 | {
45 | currIdx = allBuffers.Count - 1;
46 | IntPtr ptr = AlignBuffer(currIdx);
47 | T[] tBuff = MemoryMarshal.Cast(new Span(ptr.ToPointer(), minimumLength * tSize)).ToArray();
48 | memHand = new MemoryHandle(ptr.ToPointer(), handle);
49 | alMemHand = new AlignedMemoryHandle(ptr.ToPointer(), handle, ref tBuff[0], minimumLength * tSize);
50 | allMemoryHandles.Add((memHand, handle, buff));
51 | }
52 | return alMemHand;
53 |
54 | unsafe IntPtr AlignBuffer(int bufferIndex)
55 | {
56 | (byte[], GCHandle, IntPtr, int) currentBuff = allBuffers[bufferIndex];
57 | allBuffers.RemoveAt(bufferIndex);
58 | long lPtr = currentBuff.Item2.AddrOfPinnedObject().ToInt64();
59 | long lPtr2 = (lPtr + currentAlignment - 1) & ~(currentAlignment - 1);
60 | // For benchmarking purposes, we avoid chance 32 byte alignment
61 | if(lPtr2 % 32 == 0)
62 | {
63 | lPtr2 += byteAlignment;
64 | }
65 | currentBuff.Item4 = (int)(lPtr2 - lPtr);
66 | currentBuff.Item3 = new IntPtr(lPtr2);
67 | allBuffers.Add(currentBuff);
68 | return new IntPtr(lPtr2);
69 | }
70 | }
71 |
72 | public AlignedMemoryHandle Rent(int minimumLength)
73 | {
74 | return Rent(minimumLength, defaultByteAlignment);
75 | }
76 |
77 | public unsafe void Return(AlignedMemoryHandle bufferHandle, bool clearArray = false)
78 | {
79 | (MemoryHandle memHandle, GCHandle gcHandle, byte[] buff) item;
80 | lock (lockObject)
81 | {
82 | for (int i = 0; i < allMemoryHandles.Count; i++)
83 | {
84 | item = allMemoryHandles[i];
85 | if (item.memHandle.Pointer == bufferHandle.MemoryHandle.Pointer)
86 | {
87 | if (item.gcHandle.IsAllocated)
88 | {
89 | item.gcHandle.Free();
90 | }
91 | pool.Return(item.buff, clearArray);
92 | allMemoryHandles.RemoveAt(i);
93 | break;
94 | }
95 | }
96 | }
97 | }
98 |
99 | #region IDisposable Support
100 |
101 | /// <summary>
102 | /// Core cleanup shared by Dispose() and the finalizer. For every entry still in
103 | /// allMemoryHandles it frees the GC handle (when allocated) and gives the array back to pool,
104 | /// then clears both bookkeeping lists. A second call is a no-op.
105 | /// </summary>
106 | /// <param name="disposing">Not consulted: the same work is done on both code paths.</param>
107 | protected virtual void Dispose(bool disposing)
108 | {
109 | if (disposedValue)
110 | {
111 | return;
112 | }
113 |
114 | // allBuffers is only emptied together with allMemoryHandles, i.e. when at least one handle is outstanding.
115 | if (allMemoryHandles.Count != 0)
116 | {
117 | foreach (var (_, pin, rented) in allMemoryHandles)
118 | {
119 | if (pin.IsAllocated)
120 | {
121 | pin.Free();
122 | }
123 |
124 | pool.Return(rented);
125 | }
126 |
127 | allMemoryHandles.Clear();
128 | allBuffers.Clear();
129 | }
130 | disposedValue = true;
131 | }
132 |
133 | // Finalizer: runs the shared cleanup path with disposing = false.
134 | // It is skipped when Dispose() has already run, because Dispose() calls
135 | // GC.SuppressFinalize(this). Keep this member trivial; cleanup logic
136 | // belongs in Dispose(bool disposing) above.
137 | ~AlignedArrayPool()
138 | => Dispose(false);
139 |
140 | // Public entry point of the dispose pattern: runs the shared cleanup with disposing = true.
141 | public void Dispose()
142 | {
143 | // Do not change this code. Put cleanup code in Dispose(bool disposing) above.
144 | Dispose(true);
145 | // The class declares a finalizer above, so tell the GC it no longer needs to run it.
146 | GC.SuppressFinalize(this);
147 | }
148 | #endregion
149 | }
150 | }
151 |
--------------------------------------------------------------------------------
/ExtraFiles/MemoryBenches-1.md:
--------------------------------------------------------------------------------
1 | ``` ini
2 |
3 | BenchmarkDotNet=v0.11.5, OS=Windows 10.0.18362
4 | Intel Core i7-4500U CPU 1.80GHz (Haswell), 1 CPU, 4 logical and 2 physical cores
5 | .NET Core SDK=3.0.100-preview9-014004
6 | [Host] : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
7 | DefaultJob : .NET Core 3.0.0-preview9-19423-09 (CoreCLR 4.700.19.42102, CoreFX 4.700.19.42104), 64bit RyuJIT
8 |
9 |
10 | ```
11 | | Method | numberOfBytes | Mean | Error | StdDev | Median | Ratio | RatioSD |
12 | |------------------------------ |-------------- |---------------:|--------------:|--------------:|---------------:|------:|--------:|
13 | | **ScalarStoreUnrolled** | **16384** | **2,292.3 ns** | **60.087 ns** | **50.176 ns** | **2,284.6 ns** | **7.52** | **0.30** |
14 | | ScalarStoreBlock | 16384 | 306.1 ns | 8.539 ns | 12.246 ns | 302.8 ns | 1.00 | 0.00 |
15 | | VectorStoreAligned | 16384 | 493.2 ns | 9.847 ns | 12.453 ns | 493.2 ns | 1.60 | 0.06 |
16 | | VectorStoreArrayMemPtr | 16384 | 401.3 ns | 8.049 ns | 12.998 ns | 397.5 ns | 1.32 | 0.07 |
17 | | VectorStoreArrayMemSafe | 16384 | 473.3 ns | 9.507 ns | 13.327 ns | 470.7 ns | 1.55 | 0.08 |
18 | | VectorStoreUnaligned | 16384 | 577.2 ns | 10.582 ns | 9.381 ns | 576.0 ns | 1.89 | 0.07 |
19 | | VectorStoreUnalignedMemPtr | 16384 | 504.7 ns | 15.461 ns | 20.641 ns | 498.6 ns | 1.65 | 0.09 |
20 | | VectorStoreUnalignedToAligned | 16384 | 492.7 ns | 9.763 ns | 16.311 ns | 485.8 ns | 1.61 | 0.08 |
21 | | | | | | | | | |
22 | | **ScalarStoreUnrolled** | **131072** | **18,656.4 ns** | **343.541 ns** | **321.348 ns** | **18,589.3 ns** | **3.02** | **0.06** |
23 | | ScalarStoreBlock | 131072 | 6,185.0 ns | 77.250 ns | 64.508 ns | 6,174.3 ns | 1.00 | 0.00 |
24 | | VectorStoreAligned | 131072 | 6,873.3 ns | 65.477 ns | 54.676 ns | 6,880.6 ns | 1.11 | 0.02 |
25 | | VectorStoreArrayMemPtr | 131072 | 6,653.6 ns | 141.340 ns | 132.209 ns | 6,610.1 ns | 1.08 | 0.03 |
26 | | VectorStoreArrayMemSafe | 131072 | 6,931.2 ns | 138.136 ns | 282.176 ns | 6,822.8 ns | 1.13 | 0.06 |
27 | | VectorStoreUnaligned | 131072 | 7,556.5 ns | 114.427 ns | 89.337 ns | 7,537.2 ns | 1.22 | 0.02 |
28 | | VectorStoreUnalignedMemPtr | 131072 | 7,319.7 ns | 145.018 ns | 221.457 ns | 7,239.3 ns | 1.19 | 0.04 |
29 | | VectorStoreUnalignedToAligned | 131072 | 6,928.4 ns | 138.061 ns | 141.779 ns | 6,892.1 ns | 1.12 | 0.03 |
30 | | | | | | | | | |
31 | | **ScalarStoreUnrolled** | **1048576** | **159,693.3 ns** | **2,764.505 ns** | **2,308.487 ns** | **159,156.2 ns** | **2.43** | **0.07** |
32 | | ScalarStoreBlock | 1048576 | 65,713.1 ns | 1,277.124 ns | 1,132.137 ns | 65,699.8 ns | 1.00 | 0.00 |
33 | | VectorStoreAligned | 1048576 | 85,778.4 ns | 2,106.262 ns | 5,975.114 ns | 83,181.5 ns | 1.31 | 0.10 |
34 | | VectorStoreArrayMemPtr | 1048576 | 78,964.1 ns | 1,518.257 ns | 1,624.518 ns | 78,922.6 ns | 1.20 | 0.03 |
35 | | VectorStoreArrayMemSafe | 1048576 | 80,763.9 ns | 1,389.509 ns | 1,160.303 ns | 80,709.0 ns | 1.23 | 0.03 |
36 | | VectorStoreUnaligned | 1048576 | 84,741.3 ns | 1,680.962 ns | 2,185.725 ns | 84,040.2 ns | 1.29 | 0.04 |
37 | | VectorStoreUnalignedMemPtr | 1048576 | 82,595.5 ns | 1,816.659 ns | 2,019.212 ns | 82,142.8 ns | 1.26 | 0.04 |
38 | | VectorStoreUnalignedToAligned | 1048576 | 86,209.3 ns | 1,984.263 ns | 5,693.224 ns | 85,122.7 ns | 1.30 | 0.09 |
39 | | | | | | | | | |
40 | | **ScalarStoreUnrolled** | **2097152** | **386,240.6 ns** | **7,648.523 ns** | **19,188.650 ns** | **381,202.7 ns** | **2.26** | **0.11** |
41 | | ScalarStoreBlock | 2097152 | 171,998.1 ns | 3,435.604 ns | 5,142.251 ns | 170,366.1 ns | 1.00 | 0.00 |
42 | | VectorStoreAligned | 2097152 | 250,602.9 ns | 3,544.961 ns | 2,960.203 ns | 250,186.1 ns | 1.45 | 0.05 |
43 | | VectorStoreArrayMemPtr | 2097152 | 253,581.1 ns | 5,065.490 ns | 9,003.903 ns | 251,693.9 ns | 1.48 | 0.06 |
44 | | VectorStoreArrayMemSafe | 2097152 | 254,647.4 ns | 5,565.014 ns | 10,034.868 ns | 251,608.8 ns | 1.49 | 0.07 |
45 | | VectorStoreUnaligned | 2097152 | 258,129.5 ns | 5,127.175 ns | 7,018.136 ns | 256,494.3 ns | 1.50 | 0.06 |
46 | | VectorStoreUnalignedMemPtr | 2097152 | 259,253.1 ns | 5,207.113 ns | 8,408.518 ns | 257,269.9 ns | 1.51 | 0.07 |
47 | | VectorStoreUnalignedToAligned | 2097152 | 268,083.3 ns | 5,350.387 ns | 14,736.521 ns | 270,760.6 ns | 1.55 | 0.10 |
48 | | | | | | | | | |
49 | | **ScalarStoreUnrolled** | **8388608** | **1,792,974.9 ns** | **34,861.894 ns** | **59,198.142 ns** | **1,773,807.8 ns** | **1.64** | **0.07** |
50 | | ScalarStoreBlock | 8388608 | 1,106,074.5 ns | 17,544.390 ns | 14,650.360 ns | 1,107,074.2 ns | 1.00 | 0.00 |
51 | | VectorStoreAligned | 8388608 | 1,564,931.4 ns | 38,160.539 ns | 37,478.752 ns | 1,549,061.2 ns | 1.42 | 0.04 |
52 | | VectorStoreArrayMemPtr | 8388608 | 1,573,258.0 ns | 34,312.238 ns | 44,615.601 ns | 1,561,962.8 ns | 1.43 | 0.05 |
53 | | VectorStoreArrayMemSafe | 8388608 | 1,559,172.6 ns | 17,596.260 ns | 15,598.626 ns | 1,559,339.7 ns | 1.41 | 0.03 |
54 | | VectorStoreUnaligned | 8388608 | 1,541,325.1 ns | 18,699.861 ns | 14,599.621 ns | 1,541,280.2 ns | 1.39 | 0.02 |
55 | | VectorStoreUnalignedMemPtr | 8388608 | 1,561,604.8 ns | 22,459.313 ns | 19,909.596 ns | 1,558,538.2 ns | 1.41 | 0.03 |
56 | | VectorStoreUnalignedToAligned | 8388608 | 1,546,770.0 ns | 19,669.857 ns | 15,356.930 ns | 1,543,577.9 ns | 1.40 | 0.02 |
57 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Build results
17 | [Dd]ebug/
18 | [Dd]ebugPublic/
19 | [Rr]elease/
20 | [Rr]eleases/
21 | x64/
22 | x86/
23 | [Aa][Rr][Mm]/
24 | [Aa][Rr][Mm]64/
25 | bld/
26 | [Bb]in/
27 | [Oo]bj/
28 | [Ll]og/
29 |
30 | # Visual Studio 2015/2017 cache/options directory
31 | .vs/
32 | # Uncomment if you have tasks that create the project's static files in wwwroot
33 | #wwwroot/
34 |
35 | # Visual Studio 2017 auto generated files
36 | Generated\ Files/
37 |
38 | # MSTest test Results
39 | [Tt]est[Rr]esult*/
40 | [Bb]uild[Ll]og.*
41 |
42 | # NUNIT
43 | *.VisualState.xml
44 | TestResult.xml
45 |
46 | # Build Results of an ATL Project
47 | [Dd]ebugPS/
48 | [Rr]eleasePS/
49 | dlldata.c
50 |
51 | # Benchmark Results
52 | BenchmarkDotNet.Artifacts/
53 |
54 | # .NET Core
55 | project.lock.json
56 | project.fragment.lock.json
57 | artifacts/
58 |
59 | # StyleCop
60 | StyleCopReport.xml
61 |
62 | # Files built by Visual Studio
63 | *_i.c
64 | *_p.c
65 | *_h.h
66 | *.ilk
67 | *.meta
68 | *.obj
69 | *.iobj
70 | *.pch
71 | *.pdb
72 | *.ipdb
73 | *.pgc
74 | *.pgd
75 | *.rsp
76 | *.sbr
77 | *.tlb
78 | *.tli
79 | *.tlh
80 | *.tmp
81 | *.tmp_proj
82 | *_wpftmp.csproj
83 | *.log
84 | *.vspscc
85 | *.vssscc
86 | .builds
87 | *.pidb
88 | *.svclog
89 | *.scc
90 |
91 | # Chutzpah Test files
92 | _Chutzpah*
93 |
94 | # Visual C++ cache files
95 | ipch/
96 | *.aps
97 | *.ncb
98 | *.opendb
99 | *.opensdf
100 | *.sdf
101 | *.cachefile
102 | *.VC.db
103 | *.VC.VC.opendb
104 |
105 | # Visual Studio profiler
106 | *.psess
107 | *.vsp
108 | *.vspx
109 | *.sap
110 |
111 | # Visual Studio Trace Files
112 | *.e2e
113 |
114 | # TFS 2012 Local Workspace
115 | $tf/
116 |
117 | # Guidance Automation Toolkit
118 | *.gpState
119 |
120 | # ReSharper is a .NET coding add-in
121 | _ReSharper*/
122 | *.[Rr]e[Ss]harper
123 | *.DotSettings.user
124 |
125 | # JustCode is a .NET coding add-in
126 | .JustCode
127 |
128 | # TeamCity is a build add-in
129 | _TeamCity*
130 |
131 | # DotCover is a Code Coverage Tool
132 | *.dotCover
133 |
134 | # AxoCover is a Code Coverage Tool
135 | .axoCover/*
136 | !.axoCover/settings.json
137 |
138 | # Visual Studio code coverage results
139 | *.coverage
140 | *.coveragexml
141 |
142 | # NCrunch
143 | _NCrunch_*
144 | .*crunch*.local.xml
145 | nCrunchTemp_*
146 |
147 | # MightyMoose
148 | *.mm.*
149 | AutoTest.Net/
150 |
151 | # Web workbench (sass)
152 | .sass-cache/
153 |
154 | # Installshield output folder
155 | [Ee]xpress/
156 |
157 | # DocProject is a documentation generator add-in
158 | DocProject/buildhelp/
159 | DocProject/Help/*.HxT
160 | DocProject/Help/*.HxC
161 | DocProject/Help/*.hhc
162 | DocProject/Help/*.hhk
163 | DocProject/Help/*.hhp
164 | DocProject/Help/Html2
165 | DocProject/Help/html
166 |
167 | # Click-Once directory
168 | publish/
169 |
170 | # Publish Web Output
171 | *.[Pp]ublish.xml
172 | *.azurePubxml
173 | # Note: Comment the next line if you want to checkin your web deploy settings,
174 | # but database connection strings (with potential passwords) will be unencrypted
175 | *.pubxml
176 | *.publishproj
177 |
178 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
179 | # checkin your Azure Web App publish settings, but sensitive information contained
180 | # in these scripts will be unencrypted
181 | PublishScripts/
182 |
183 | # NuGet Packages
184 | *.nupkg
185 | # The packages folder can be ignored because of Package Restore
186 | **/[Pp]ackages/*
187 | # except build/, which is used as an MSBuild target.
188 | !**/[Pp]ackages/build/
189 | # Uncomment if necessary however generally it will be regenerated when needed
190 | #!**/[Pp]ackages/repositories.config
191 | # NuGet v3's project.json files produces more ignorable files
192 | *.nuget.props
193 | *.nuget.targets
194 |
195 | # Microsoft Azure Build Output
196 | csx/
197 | *.build.csdef
198 |
199 | # Microsoft Azure Emulator
200 | ecf/
201 | rcf/
202 |
203 | # Windows Store app package directories and files
204 | AppPackages/
205 | BundleArtifacts/
206 | Package.StoreAssociation.xml
207 | _pkginfo.txt
208 | *.appx
209 |
210 | # Visual Studio cache files
211 | # files ending in .cache can be ignored
212 | *.[Cc]ache
213 | # but keep track of directories ending in .cache
214 | !?*.[Cc]ache/
215 |
216 | # Others
217 | ClientBin/
218 | ~$*
219 | *~
220 | *.dbmdl
221 | *.dbproj.schemaview
222 | *.jfm
223 | *.pfx
224 | *.publishsettings
225 | orleans.codegen.cs
226 |
227 | # Including strong name files can present a security risk
228 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
229 | #*.snk
230 |
231 | # Since there are multiple workflows, uncomment next line to ignore bower_components
232 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
233 | #bower_components/
234 |
235 | # RIA/Silverlight projects
236 | Generated_Code/
237 |
238 | # Backup & report files from converting an old project file
239 | # to a newer Visual Studio version. Backup files are not needed,
240 | # because we have git ;-)
241 | _UpgradeReport_Files/
242 | Backup*/
243 | UpgradeLog*.XML
244 | UpgradeLog*.htm
245 | ServiceFabricBackup/
246 | *.rptproj.bak
247 |
248 | # SQL Server files
249 | *.mdf
250 | *.ldf
251 | *.ndf
252 |
253 | # Business Intelligence projects
254 | *.rdl.data
255 | *.bim.layout
256 | *.bim_*.settings
257 | *.rptproj.rsuser
258 | *- Backup*.rdl
259 |
260 | # Microsoft Fakes
261 | FakesAssemblies/
262 |
263 | # GhostDoc plugin setting file
264 | *.GhostDoc.xml
265 |
266 | # Node.js Tools for Visual Studio
267 | .ntvs_analysis.dat
268 | node_modules/
269 |
270 | # Visual Studio 6 build log
271 | *.plg
272 |
273 | # Visual Studio 6 workspace options file
274 | *.opt
275 |
276 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
277 | *.vbw
278 |
279 | # Visual Studio LightSwitch build output
280 | **/*.HTMLClient/GeneratedArtifacts
281 | **/*.DesktopClient/GeneratedArtifacts
282 | **/*.DesktopClient/ModelManifest.xml
283 | **/*.Server/GeneratedArtifacts
284 | **/*.Server/ModelManifest.xml
285 | _Pvt_Extensions
286 |
287 | # Paket dependency manager
288 | .paket/paket.exe
289 | paket-files/
290 |
291 | # FAKE - F# Make
292 | .fake/
293 |
294 | # JetBrains Rider
295 | .idea/
296 | *.sln.iml
297 |
298 | # CodeRush personal settings
299 | .cr/personal
300 |
301 | # Python Tools for Visual Studio (PTVS)
302 | __pycache__/
303 | *.pyc
304 |
305 | # Cake - Uncomment if you are using it
306 | # tools/**
307 | # !tools/packages.config
308 |
309 | # Tabs Studio
310 | *.tss
311 |
312 | # Telerik's JustMock configuration file
313 | *.jmconfig
314 |
315 | # BizTalk build output
316 | *.btp.cs
317 | *.btm.cs
318 | *.odx.cs
319 | *.xsd.cs
320 |
321 | # OpenCover UI analysis results
322 | OpenCover/
323 |
324 | # Azure Stream Analytics local run output
325 | ASALocalRun/
326 |
327 | # MSBuild Binary and Structured Log
328 | *.binlog
329 |
330 | # NVidia Nsight GPU debugger configuration file
331 | *.nvuser
332 |
333 | # MFractors (Xamarin productivity tool) working folder
334 | .mfractor/
335 |
336 | # Local History for Visual Studio
337 | .localhistory/
338 |
339 | # BeatPulse healthcheck temp database
340 | healthchecksdb
--------------------------------------------------------------------------------
/Core3Intrinsics/Intro.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.InteropServices;
3 | using System.Runtime.Intrinsics;
4 | using System.Runtime.Intrinsics.X86;
5 |
6 | namespace Core3Intrinsics
7 | {
8 | public class Intro // Tour of hardware intrinsics: the constructor runs illustrative Vector64/128/256, SSE, AVX and FMA calls; ProcessData / ProcessDataUnsafe compute element-wise square roots with AVX.
9 | {
10 | public Intro() // Demo only: every result is a throw-away local meant to be inspected in a debugger. The AVX section is skipped when Avx.IsSupported is false.
11 | {
12 | var middleVector = Vector128.Create(1.0f); // middleVector = <1,1,1,1>
13 | middleVector = Vector128.CreateScalar(-1.0f); // middleVector = <-1,0,0,0>
14 | var floatBytes = Vector64.AsByte(Vector64.Create(1.0f, -1.0f)); // floatBytes = <0, 0, 128, 63, 0, 0, 128, 191>
15 | if(Avx.IsSupported)
16 | {
17 | var left = Vector256.Create(-2.5f); // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5>
18 | var right = Vector256.Create(5.0f); // <5, 5, 5, 5, 5, 5, 5, 5>
19 | Vector256<float> result = Avx.AddSubtract(left, right); // result = <-7.5, 2.5, -7.5, 2.5, -7.5, 2.5, -7.5, 2.5>
20 | left = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, - 70.0f, -80.0f);
21 | right = Vector256.Create(0.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f);
22 | result = Avx.UnpackHigh(left, right); // result = <-3, 3, -4, 4, -70, 70, -80, 80>
23 | result = Avx.UnpackLow(left, right); // result = <-1, 0, -2, 2, -50, 50, -60, 60>
24 | result = Avx.DotProduct(left, right, 0b1111_0001); // result = <-29, 0, 0, 0, -17400, 0, 0, 0> (right[0] is 0, so the first product adds nothing)
25 | bool testResult = Avx.TestC(left, right); // testResult = true
26 | testResult = Avx.TestC(right, left); // testResult = false
27 | Vector256<float> result1 = Avx.Divide(left, right); // result1 = <-Infinity, -1, -1, -1, -1, -1, -1, -1>
28 | var plusOne = Vector256.Create(1.0f);
29 | result = Avx.Compare(right, result1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
30 | result = Avx.Compare(right, result1, FloatComparisonMode.UnorderedNotLessThanNonSignaling);
31 | left = Vector256.Create(0.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f);
32 | right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
33 | Vector256<float> nanInFirstPosition = Avx.Divide(left, right); // lane 0 is 0/0 = NaN
34 | left = Vector256.Create(1.1f, 3.3333333f, -3.0f, 4.22f, -50.0f, 60.0f, -70.0f, 80.0f);
35 | Vector256<float> InfInFirstPosition = Avx.Divide(left, right); // lane 0 is 1.1/0 = +Infinity
36 |
37 | left = Vector256.Create(-1.1f, 3.0f, 1.0f/3.0f, MathF.PI, -50.0f, 60.0f, -70.0f, 80.0f);
38 | right = Vector256.Create(0.0f, 2.0f, 3.1f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
39 | Vector256<float> compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN>
40 | Vector256<float> mixed = Avx.BlendVariable(left, right, compareResult); // mixed = <-1.1, 2, 0.333..., 2, -50, -60, -70, -80> (right where the mask is set, left elsewhere)
41 |
42 | //left = Vector256.Create(-1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f, -1.0f, 1.0f);
43 | //right = Vector256.Create(1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f);
44 | Vector256<float> other = right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
45 | bool bRes = Avx.TestZ(plusOne, compareResult); // true: no lane has the sign bit set in both operands
46 | bool bRes2 = Avx.TestC(plusOne, compareResult); // false: compareResult has sign bits set where plusOne has none
47 | bool allTrue = !Avx.TestZ(compareResult, compareResult); // NOTE: despite the name this is true as soon as ANY lane of the mask is set
48 | compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.OrderedEqualNonSignaling); // compareResult = <0, 0, 0, NaN, 0, 0, 0, 0> (only lane 3, 2 == 2, matches; NaN is never ordered-equal)
49 | compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.UnorderedEqualNonSignaling); // compareResult = <NaN, 0, 0, NaN, 0, 0, 0, 0> (the NaN lane now counts as a match)
50 | compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.UnorderedNotLessThanOrEqualNonSignaling);
51 | compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.OrderedGreaterThanNonSignaling);
52 | var left128 = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
53 | var right128 = Vector128.Create(2.0f, 3.0f, 4.0f, 5.0f);
54 | Vector128<float> compResult128 = Sse.CompareGreaterThan(left128, right128); // compResult128 = <0, 0, 0, 0>
55 |
56 | int res = Avx.MoveMask(compareResult); // one bit per lane, taken from each lane's sign bit
57 | if (Fma.IsSupported)
58 | {
59 | Vector256<float> resultFma = Fma.MultiplyAdd(left, right, other); // = left * right + other for each element
60 | resultFma = Fma.MultiplyAddNegated(left, right, other); // = -(left * right) + other for each element
61 | resultFma = Fma.MultiplySubtract(left, right, other); // = left * right - other for each element
62 | Fma.MultiplyAddSubtract(left, right, other); // even elements (0, 2, ...) like MultiplySubtract, odd elements like MultiplyAdd; the result is discarded here
63 |
64 | }
65 | result = Avx.DotProduct(left, right, 0b1010_0001); // result = <~12.28, 0, 0, 0, -10000, 0, 0, 0> (only elements 1 and 3 of each half are multiplied)
66 | result = Avx.Floor(left); // result = <-2, 3, 0, 3, -50, 60, -70, 80>
67 | result = Avx.Add(left, right); // result = <-1.1, 5, ~3.33, ~5.14, 0, 0, 0, 0>
68 | result = Avx.Ceiling(left); // result = <-1, 3, 1, 4, -50, 60, -70, 80>
69 | result = Avx.Multiply(left, right); // result = <-0, 6, ~1, ~6.28, -2500, -3600, -4900, -6400>
70 | result = Avx.HorizontalAdd(left, right); // result = <1.9, ~3.47, 2, 5, 10, 10, -10, -10>
71 | result = Avx.HorizontalSubtract(left, right); // result = <-4.1, ~-2.81, -2, 1, -110, -150, 110, 150>
72 | double[] someDoubles = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 };
73 | double[] someOtherDoubles = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
74 | double[] someResult = new double[someDoubles.Length];
75 | float[] someFloats = new float[] { 1, 2, 3, 4, 10, 20, 30, 40, 0 };
76 | float[] someOtherFloats = new float[] { 1, 1, 1, 1, 1, 1, 1, 1 };
77 | unsafe
78 | {
79 | fixed (double* ptr = &someDoubles[1])
80 | {
81 | fixed (double* ptr2 = &someResult[0])
82 | {
83 | Vector256<double> res2 = Avx.LoadVector256(ptr); // res2 = <3, -2.5, 7.5, 10.8>
84 | Avx.Store(ptr2, res2);
85 | }
86 | }
87 |
88 | fixed (float* ptr = &someFloats[0])
89 | {
90 | fixed (float* ptr2 = &someOtherFloats[0])
91 | {
92 | Vector256<float> res2 = Avx.DotProduct(Avx.LoadVector256(ptr), Avx.LoadVector256(ptr2), 0b0001_0001); // res2 = <1, 0, 0, 0, 10, 0, 0, 0>
93 | //Avx.Store(ptr2, res2);
94 | }
95 | }
96 | }
97 |
98 |
99 |
100 | }
101 | }
102 |
103 | public float[] ProcessData(ref Span<float> input) // Returns a new array holding the square root of every element of input. Requires AVX; input is not modified.
104 | {
105 | float[] results = new float[input.Length];
106 | Span<Vector256<float>> resultVectors = MemoryMarshal.Cast<float, Vector256<float>>(results);
107 | // MemoryMarshal.Cast truncates: the vector spans only cover whole groups of 8 floats.
108 | ReadOnlySpan<Vector256<float>> inputVectors = MemoryMarshal.Cast<float, Vector256<float>>(input);
109 |
110 | for(int i = 0; i < inputVectors.Length; i++)
111 | {
112 | resultVectors[i] = Avx.Sqrt(inputVectors[i]);
113 | }
114 | for (int i = inputVectors.Length * 8; i < results.Length; i++) { results[i] = MathF.Sqrt(input[i]); } // fix: the up-to-7 trailing elements used to be left at 0
115 | return results;
116 | }
117 |
118 | public unsafe float[] ProcessDataUnsafe(ref Span<float> input) // Pointer-based variant of ProcessData with the same result. Requires AVX; an empty input yields an empty array.
119 | {
120 | float[] results = new float[input.Length];
121 | int vectorizedLength = input.Length - (input.Length % 8); // fix: the old loop stepped 8 floats until it passed the end, reading and writing out of bounds for lengths not divisible by 8
122 | fixed (float* inputPtr = input) // fix: pinning the span/array directly (instead of &x[0]) does not throw when the input is empty
123 | {
124 | fixed (float* resultPtr = results)
125 | {
126 | for (int i = 0; i < vectorizedLength; i += 8)
127 | {
128 | Avx.Store(resultPtr + i, Avx.Sqrt(Avx.LoadVector256(inputPtr + i)));
129 | }
130 | for (int i = vectorizedLength; i < results.Length; i++) // scalar tail for the remaining elements
131 | {
132 | resultPtr[i] = MathF.Sqrt(inputPtr[i]);
133 | }
134 | }
135 | }
136 | return results;
137 | }
138 | }
139 | }
140 |
--------------------------------------------------------------------------------
/Core3Intrinsics/Mandelbrot.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.Intrinsics;
3 | using System.Runtime.InteropServices;
4 | using System.Runtime.Intrinsics.X86;
5 |
6 | namespace Core3Intrinsics
7 | {
8 | public class Mandelbrot // Computes Mandelbrot iteration counts over [LEFT_X, RIGHT_X] x [BOTT_Y, TOP_Y] twice - scalar (FloatMandel -> results/testValue1) and AVX (Vector256Mandel -> results2/testValue2) - so the two outputs can be compared.
9 | {
10 | readonly int TOTALBYTES = 16 * 1024 * 1024;//4 * 1024 * 1024;
11 | public int numberOfTasks = 1;
12 |
13 | const float LEFT_X = -2.5f;
14 | const float RIGHT_X = 1.0f;
15 | const float TOP_Y = 1.0f;
16 | const float BOTT_Y = -1.0f;
17 |
18 | int resolutionX, resolutionY;
19 | readonly float ratioy_x = (TOP_Y - BOTT_Y) / (RIGHT_X - LEFT_X);
20 | float STEP_X;
21 | float STEP_Y;
22 | public Memory<float> results, results2, testValue1, testValue2; // iteration counts (results*) and final |z|^2 values (testValue*) of the scalar (1) and vector (2) runs
23 | public int SizeInBytes => numberOfPoints * sizeof(float);
24 | Memory<float> xPoints, yPoints;
25 | int numberOfPoints;
26 |
27 | public void FloatMandel() // Scalar reference run: sizes the grid from TOTALBYTES, (re)allocates results/testValue1/testValue2 and fills results with the iteration count (0..256) of every point, row by row.
28 | {
29 | int floatL3Size = TOTALBYTES / sizeof(float);
30 | resolutionX = (int)MathF.Floor(MathF.Sqrt(floatL3Size * ratioy_x));
31 | if (resolutionX % 8 != 0)
32 | {
33 | resolutionX -= resolutionX % 8;
34 | }
35 | resolutionY = (int)MathF.Floor(resolutionX * ratioy_x);
36 | if (resolutionY % 8 != 0)
37 | {
38 | resolutionY -= resolutionY % 8;
39 | }
40 | STEP_X = (RIGHT_X - LEFT_X) / resolutionX;
41 | STEP_Y = STEP_X; // ratioy_x * STEP_X; Bug from reddit comment
42 | numberOfPoints = resolutionX * resolutionY;
43 | if(numberOfPoints % 8 != 0)
44 | {
45 | numberOfPoints += 8 - numberOfPoints % 8; // fix: round UP to the next multiple of 8 (adding the remainder itself does not do that); unreachable today because both resolutions are already multiples of 8
46 | }
47 | results = new float[numberOfPoints];
48 | testValue1 = new float [numberOfPoints];
49 | testValue2 = new float [numberOfPoints];
50 |
51 | xPoints = new float[resolutionX];
52 | yPoints = new float[resolutionY];
53 | for (int i = 0; i < resolutionX; i++)
54 | {
55 | xPoints.Span[i] = LEFT_X + i * STEP_X;
56 | }
57 | for (int i = 0; i < resolutionY; i++)
58 | {
59 | yPoints.Span[i] = TOP_Y - i * STEP_Y;
60 | }
61 |
62 | float currentY;
63 | float currentX;
64 | int countX = 0, countY = 0;
65 | int maxInter = 256;
66 | int inter;
67 | float zSquare, xSquare, ySquare, x, y;
68 | ReadOnlySpan<float> ySpan = yPoints.Span;
69 | ReadOnlySpan<float> xSpan = xPoints.Span;
70 | Span<float> res = results.Span;
71 | int floatCounter = 0;
72 | while (countY < resolutionY)
73 | {
74 |
75 | currentY = ySpan[countY];
76 | while (countX < resolutionX)
77 | {
78 |
79 | currentX = xSpan[countX];
80 | zSquare = xSquare = ySquare = 0.0f;
81 | inter = 0;
82 | bool goOn;
83 | while (xSquare + ySquare <= 4.0f && inter < maxInter)
84 | {
85 | x = xSquare - ySquare + currentX;
86 | y = zSquare - ySquare - xSquare + currentY; // 2xy obtained as (x+y)^2 - x^2 - y^2, using the squares kept from the previous step
87 | xSquare = x * x;
88 | ySquare = y * y;
89 | zSquare = (x + y) * (x + y);
90 | goOn = xSquare + ySquare <= 4.0f;
91 |
92 | inter = goOn ? inter + 1 : inter;
93 | }
94 | //res[countY * resolutionX + countX] = inter;
95 | res[floatCounter] = inter;
96 | testValue1.Span[floatCounter] = xSquare + ySquare;
97 | countX++;
98 | floatCounter++;
99 | }
100 | countX = 0;
101 | countY++;
102 | }
103 | }
104 |
105 | public unsafe void Vector256Mandel() // AVX run: same grid as FloatMandel, but 8 x-positions are iterated per Vector256; fills results2 (iteration counts) and testValue2 (final |z|^2). Requires AVX.
106 | {
107 | int floatL3Size = TOTALBYTES / sizeof(float);
108 | resolutionX = (int)MathF.Floor(MathF.Sqrt(floatL3Size * ratioy_x));
109 | if (resolutionX % 8 != 0)
110 | {
111 | resolutionX -= resolutionX % 8;
112 | }
113 | resolutionY = (int)MathF.Floor(resolutionX * ratioy_x);
114 | if (resolutionY % 8 != 0)
115 | {
116 | resolutionY -= resolutionY % 8;
117 | }
118 | STEP_X = (RIGHT_X - LEFT_X) / resolutionX;
119 | STEP_Y = STEP_X; // ratioy_x * STEP_X; Bug from reddit comment
120 | numberOfPoints = resolutionX * resolutionY;
121 | results2 = new float[numberOfPoints];
122 | if (testValue2.Length != numberOfPoints) { testValue2 = new float[numberOfPoints]; } // fix: testValue2 was only allocated by FloatMandel, so calling this method first indexed an empty span and threw
123 | xPoints = new float[resolutionX];
124 | yPoints = new float[resolutionY];
125 | for (int i = 0; i < resolutionX; i++)
126 | {
127 | xPoints.Span[i] = LEFT_X + i * STEP_X;
128 | }
129 | for (int i = 0; i < resolutionY; i++)
130 | {
131 | yPoints.Span[i] = TOP_Y - i * STEP_Y;
132 | }
133 |
134 | int countX = 0, countY = 0;
135 | int maxInter = 256;
136 | int inter;
137 | ReadOnlySpan<float> ySpan = yPoints.Span;// MemoryMarshal.Cast<float, Vector256<float>>(yPoints.Span);
138 | ReadOnlySpan<Vector256<float>> xSpan = MemoryMarshal.Cast<float, Vector256<float>>(xPoints.Span);
139 | Span<Vector256<float>> res = MemoryMarshal.Cast<float, Vector256<float>>(results2.Span);
140 | Span<Vector256<float>> testSpan = MemoryMarshal.Cast<float, Vector256<float>>(testValue2.Span);
141 | int resVectorNumber = 0;
142 |
143 | Vector256<float> xVec, yVec;
144 | var oneVec = Vector256.Create(1.0f);
145 | var fourVec = Vector256.Create(4.0f);
146 |
147 | while (countY < ySpan.Length)
148 | {
149 | var currYVec = Vector256.Create(ySpan[countY]);
150 | while (countX < xSpan.Length)
151 | {
152 |
153 | Vector256<float> currXVec = xSpan[countX];
154 | var xSquVec = Vector256.Create(0.0f);
155 | var ySquVec = Vector256.Create(0.0f);
156 | var zSquVec = Vector256.Create(0.0f);
157 | var interVec = Vector256.Create(0.0f);
158 | Vector256<float> sumVector = oneVec; // per-lane increment: 1 while the lane is still bounded, 0 for good once it has escaped
159 | inter = 0;
160 | bool goOn = true;
161 | while (goOn)
162 | {
163 | xVec = Avx.Add(Avx.Subtract(xSquVec, ySquVec), currXVec);
164 | yVec = Avx.Add(Avx.Subtract(Avx.Subtract(zSquVec, ySquVec), xSquVec), currYVec);
165 | xSquVec = Avx.Multiply(xVec, xVec);
166 | ySquVec = Avx.Multiply(yVec, yVec);
167 | zSquVec = Avx.Multiply(Avx.Add(xVec, yVec), Avx.Add(xVec, yVec));
168 | Vector256<float> test = Avx.Compare(Avx.Add(xSquVec, ySquVec), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling); // <= 4.0?
169 | sumVector = Avx.BlendVariable(Vector256<float>.Zero, sumVector, test); // selects from second if true, from first otherwise
170 | goOn = (Avx.MoveMask(test) > 0) & (inter < maxInter); //any of the values still alive, and inter still below cutoff value?
171 | if (goOn)
172 | {
173 | interVec = Avx.Add(interVec, sumVector);
174 | }
175 | inter = goOn ? inter + 1 : inter;
176 | }
177 | testSpan[resVectorNumber] = Avx.Add(xSquVec, ySquVec);
178 | res[resVectorNumber] = interVec;
179 | resVectorNumber++;
180 | countX++;
181 | }
182 | countX = 0;
183 | countY++;
184 | }
185 |
186 | }
187 | }
188 | }
189 |
--------------------------------------------------------------------------------
/Core3IntrinsicsBenchmarks/Mandelbrot.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Numerics;
3 | using System.Runtime.Intrinsics;
4 | using BenchmarkDotNet.Attributes;
5 | using System.Runtime.InteropServices;
6 | using System.Runtime.Intrinsics.X86;
7 |
8 | namespace Core3IntrinsicsBenchmarks
9 | {
10 | [DisassemblyDiagnoser(printAsm: true, printSource: true)]
11 | public class Mandelbrot // BenchmarkDotNet benchmark: scalar (baseline) versus AVX Mandelbrot iteration over a 1920-wide grid, both with a cardioid / period-2-bulb shortcut.
12 | {
13 | //[Params(4 * 1024 * 1024, 16 * 1024 * 1024)] //L3, 4 * L3
14 | public int TotalBytes {get; set; }
15 |
16 | public int numberOfTasks = 2;
17 | const float LEFT_X = -2.5f;
18 | const float RIGHT_X = 1.0f;
19 | const float TOP_Y = 1.0f;
20 | const float BOTT_Y = -1.0f;
21 | const float RATIO_Y_X = (TOP_Y - BOTT_Y) / (RIGHT_X - LEFT_X);
22 |
23 | int resolutionX, resolutionY;
24 | readonly float ratioy_x = RATIO_Y_X;
25 | public Memory<float> results; // one iteration count per grid point, row by row; written by both benchmarks
26 | public int SizeInBytes => numberOfPoints * sizeof(float);
27 | Memory<float> xPoints, yPoints;
28 | int numberOfPoints;
29 |
30 | [GlobalSetup]
31 | public void GlobalSetup() // Allocates the result buffer and precomputes the x and y coordinates of the grid (same step on both axes).
32 | {
33 | resolutionX = 1920;
34 | resolutionY = (int)MathF.Floor(resolutionX * ratioy_x);
35 | float STEP_X = (RIGHT_X - LEFT_X) / resolutionX;
36 | float STEP_Y = STEP_X; // (TOP_Y - BOTT_Y) / resolutionY; Bug from reddit comment
37 |
38 | numberOfPoints = resolutionX * resolutionY;
39 | results = new float[numberOfPoints];
40 | xPoints = new float[resolutionX];
41 | yPoints = new float[resolutionY];
42 | for(int i = 0; i < resolutionX; i++)
43 | {
44 | xPoints.Span[i] = LEFT_X + i * STEP_X;
45 | }
46 | for (int i = 0; i < resolutionY; i++)
47 | {
48 | yPoints.Span[i] = TOP_Y - i * STEP_Y;
49 | }
50 | }
51 |
52 | [Benchmark(Baseline = true)]
53 | public void FloatMandel() // Scalar baseline: one point at a time; points inside the main cardioid or the period-2 bulb skip the iteration loop.
54 | {
55 | float currentY;
56 | float currentX;
57 | int countX = 0, countY = 0;
58 | int maxInter = 256;
59 | int inter;
60 | float zSquare, xSquare, ySquare, x, y;
61 | ReadOnlySpan<float> ySpan = yPoints.Span;
62 | ReadOnlySpan<float> xSpan = xPoints.Span;
63 | Span<float> res = results.Span;
64 | int floatCounter = 0;
65 | float q;
66 | float one16 = 1.0f / 16.0f;
67 | while (countY < resolutionY)
68 | {
69 | currentY = ySpan[countY];
70 | while (countX < resolutionX)
71 | {
72 | currentX = xSpan[countX];
73 | zSquare = xSquare = ySquare = 0.0f;
74 | inter = 0;
75 | bool goOn;// = true;
76 | float temp = (currentX - 0.25f);
77 | float temp1 = currentY * currentY;
78 | q = temp * temp + temp1;
79 | goOn = (q * (q + (temp)) > 0.25f * temp1); // out of cardioid? see https://en.wikipedia.org/wiki/Mandelbrot_set#Cardioid_/_bulb_checking
80 | if (goOn)
81 | {
82 | goOn = (currentX + 1.0f) * (currentX + 1.0f) + temp1 > one16; // out of period-2 bulb?
83 | }
84 | if (!goOn) // fix: cardioid points used to fall through with inter = 0; both shortcuts now report the same value (note: fully iterated points end at 256, the shortcut keeps the original 255)
85 | {
86 | inter = 255;
87 | }
88 |
89 | while (goOn && inter < maxInter)
90 | {
91 | x = xSquare - ySquare + currentX;
92 | y = zSquare - ySquare - xSquare + currentY; // 2xy obtained as (x+y)^2 - x^2 - y^2
93 | xSquare = x * x;
94 | ySquare = y * y;
95 | zSquare = (x + y) * (x + y);
96 | goOn = xSquare + ySquare <= 4.0f;
97 |
98 | inter = goOn ? inter + 1 : inter;
99 | }
100 | res[floatCounter] = inter;
101 | countX++;
102 | floatCounter++;
103 | }
104 | countX = 0;
105 | countY++;
106 | }
107 | }
108 |
109 | [Benchmark]
110 | public unsafe void Vector256Mandel() // AVX version: 8 x-positions per Vector256; the shortcut is only taken when ALL 8 lanes are inside the cardioid, or all are inside the bulb. Requires AVX.
111 | {
112 | int countX = 0, countY = 0;
113 | int maxInter = 256;
114 | int inter;
115 | ReadOnlySpan<float> ySpan = yPoints.Span;
116 | ReadOnlySpan<Vector256<float>> xSpan = MemoryMarshal.Cast<float, Vector256<float>>(xPoints.Span);
117 | Span<Vector256<float>> res = MemoryMarshal.Cast<float, Vector256<float>>(results.Span);
118 | int resVectorNumber = 0;
119 |
120 | Vector256<float> xVec, yVec;
121 | Vector256<float> zeroVec = Vector256<float>.Zero;
122 | var oneVec = Vector256.Create(1.0f);
123 | var fourVec = Vector256.Create(4.0f);
124 | var one4Vec = Vector256.Create(0.25f);
125 | var one16Vec = Vector256.Create(1.0f/16.0f);
126 | Vector256<float> qVec;
127 | Vector256<float> test;
128 |
129 | while (countY < ySpan.Length)
130 | {
131 | var currYVec = Vector256.Create(ySpan[countY]);
132 | while (countX < xSpan.Length)
133 | {
134 | Vector256<float> currXVec = xSpan[countX];
135 | Vector256<float> xSquVec = zeroVec;
136 | Vector256<float> ySquVec = zeroVec;
137 | Vector256<float> zSquVec = zeroVec;
138 | Vector256<float> interVec = zeroVec;
139 | Vector256<float> sumVector;
140 |
141 | inter = 0;
142 | bool goOn;
143 | Vector256<float> temp = Avx.Subtract(currXVec, one4Vec);
144 | Vector256<float> temp1 = Avx.Multiply(currYVec, currYVec);
145 | qVec = Avx.Add(Avx.Multiply(temp, temp), temp1);
146 | Vector256<float> temp2 = Avx.Multiply(qVec, Avx.Add(qVec, temp));
147 | test = Avx.Compare(temp2, Avx.Multiply(one4Vec, temp1), FloatComparisonMode.OrderedGreaterThanNonSignaling);
148 | goOn = (Avx.MoveMask(test) > 0); // at least one lane lies outside the cardioid
149 | if(goOn)
150 | {
151 | temp2 = Avx.Add(currXVec, oneVec);
152 | temp = Avx.Add(Avx.Multiply(temp2, temp2), temp1);
153 | test = Avx.Compare(temp, one16Vec, FloatComparisonMode.OrderedGreaterThanNonSignaling);
154 | goOn = Avx.MoveMask(test) > 0; // at least one lane lies outside the period-2 bulb
155 | }
156 | if (!goOn) // fix: an all-cardioid vector used to be stored as all zeros; it now gets the same value as an all-bulb vector
157 | {
158 | interVec = Vector256.Create(255.0f); // make all point = maximum value
159 | }
160 | while (goOn)
161 | {
162 | xVec = Avx.Add(Avx.Subtract(xSquVec, ySquVec), currXVec);
163 | yVec = Avx.Add(Avx.Subtract(Avx.Subtract(zSquVec, ySquVec), xSquVec), currYVec);
164 | xSquVec = Avx.Multiply(xVec, xVec);
165 | ySquVec = Avx.Multiply(yVec, yVec);
166 | temp = Avx.Add(xVec, yVec);
167 | zSquVec = Avx.Multiply(temp, temp);
168 | test = Avx.Compare(Avx.Add(xSquVec, ySquVec), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling); // <= 4.0?
169 | sumVector = Avx.BlendVariable(zeroVec, oneVec, test); // 1 for lanes still bounded in this step, 0 otherwise
170 |
171 | goOn = (Avx.MoveMask(test) > 0) & (inter < maxInter); //any of the values still alive, and inter still below cutoff value?
172 | if (goOn)
173 | {
174 | interVec = Avx.Add(interVec, sumVector);
175 | }
176 | inter = goOn ? inter + 1 : inter;
177 | }
178 | res[resVectorNumber] = interVec;
179 | resVectorNumber++;
180 | countX++;
181 | }
182 | countX = 0;
183 | countY++;
184 | }
185 | }
186 | }
187 | }
188 |
--------------------------------------------------------------------------------
/Core3IntrinsicsBenchmarks/MemoryBenches.cs:
--------------------------------------------------------------------------------
1 | using BenchmarkDotNet.Attributes;
2 | using BenchmarkDotNet.Configs;
3 | using BenchmarkDotNet.Exporters;
4 | using BenchmarkDotNet.Exporters.Csv;
5 | using System;
6 | using System.Buffers;
7 | using System.Runtime.CompilerServices;
8 | using System.Runtime.InteropServices;
9 | using System.Runtime.Intrinsics;
10 | using System.Runtime.Intrinsics.X86;
11 |
12 | namespace Core3IntrinsicsBenchmarks
13 | {
14 | //[DisassemblyDiagnoser(printAsm: true, printSource: true)]
15 | //[GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)]
16 | //[CategoriesColumn]
17 | //[Config(typeof(Config))] // only used for plots
18 | public class MemoryBenches
19 | {
20 |         private class Config : ManualConfig // plotting-only configuration; the [Config] attribute that would use it is commented out above
21 |         {
22 |             public Config()
23 |             {
24 |                 this.Add(CsvMeasurementsExporter.Default); // raw measurements as CSV
25 |                 this.Add(RPlotExporter.Default); // R plot exporter
26 |             }
27 |         }
28 |
29 | [Params(16 * 1024, 128 * 1024, 1024 * 1024, 2 * 1024 * 1024, 8 * 1024 * 1024)] // buffer sizes in bytes; author's labels: half L1, half L2, half L3, 2 * L3 - NOTE(review): five sizes but only four labels, confirm which size each label refers to
30 | public int NumberOfBytes { get ; set; } // size of each buffer in bytes; GlobalSetup divides it by sizeof(float) to get the element count
31 |
32 | private int vectorNumberOfItems, vectorFloatStep;
33 | private int numberOfFloatItems;
34 |
35 | private static readonly AlignedArrayPool alignedArrayPool = new AlignedArrayPool();
36 | private static AlignedMemoryHandle dataMemory, storeMemory, data16Memory, store16Memory;
37 | //private static float[] arr1, arr2;
38 |
39 |         [GlobalSetup]
40 |         public unsafe void GlobalSetup() // Derives the element counts from NumberOfBytes, rents the four buffers and fills both source buffers with 0, 1, 2, ...
41 |         {
42 |             vectorFloatStep = Vector256<float>.Count; // floats per 256-bit vector (type argument restored; the listing had lost it)
43 |             numberOfFloatItems = NumberOfBytes / sizeof(float);
44 |             vectorNumberOfItems = numberOfFloatItems / vectorFloatStep; // number of whole vectors in each buffer
45 |             dataMemory = alignedArrayPool.Rent(numberOfFloatItems);
46 |             storeMemory = alignedArrayPool.Rent(numberOfFloatItems);
47 |             data16Memory = alignedArrayPool.Rent(numberOfFloatItems, 16); // NOTE(review): second argument is presumably the byte alignment - confirm in AlignedArrayPool
48 |             store16Memory = alignedArrayPool.Rent(numberOfFloatItems, 16);
49 |             var dataSpan = dataMemory.Memory.Span; // fetch each span once instead of on every loop iteration
50 |             var data16Span = data16Memory.Memory.Span;
51 |             for (int i = 0; i < numberOfFloatItems; i++)
52 |             {
53 |                 dataSpan[i] = i;
54 |                 data16Span[i] = i;
55 |             }
56 |         }
57 |
58 |         [GlobalCleanup]
59 |         public void GlobalCleanup() // Gives the four rented buffers back to the shared pool
60 |         {
61 |             foreach (var handle in new[] { dataMemory, storeMemory, data16Memory, store16Memory }) // same order as they were rented
62 |             {
63 |                 alignedArrayPool.Return(handle);
64 |             }
65 |         }
66 |
67 | /*
68 | [BenchmarkCategory("Aligned Memory"), Benchmark]
69 | public unsafe void ScalarStore()
70 | {
71 | ReadOnlySpan dataAl = MemoryMarshal.Cast(new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, dataMemory.ByteArrayLength));
72 | Span storeAl = MemoryMarshal.Cast(new Span(storeMemory.MemoryHandle.Pointer, storeMemory.ByteArrayLength));
73 | for (int i = 0; i < dataAl.Length; i++)
74 | {
75 | storeAl[i] = dataAl[i];
76 | }
77 | }
78 |
79 | [BenchmarkCategory("Aligned Memory"), Benchmark]
80 | public unsafe void ScalarStoreUnrolled()
81 | {
82 | ReadOnlySpan dataAl = MemoryMarshal.Cast(new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, dataMemory.ByteArrayLength));
83 | Span storeAl = MemoryMarshal.Cast(new Span(storeMemory.MemoryHandle.Pointer, storeMemory.ByteArrayLength));
84 |
85 | int step = 4;
86 | for (int i = 0; i < dataAl.Length; i += step)
87 | {
88 | storeAl[i] = dataAl[i];
89 | storeAl[i + 1] = dataAl[i + 1];
90 | storeAl[i + 2] = dataAl[i + 2];
91 | storeAl[i + 3] = dataAl[i + 3];
92 | }
93 | }
94 |
95 | [BenchmarkCategory("Unaligned Memory"), Benchmark]
96 | public unsafe void PtrCopyUnrolled()
97 | {
98 | float* arr1Ptr = (float*)data16Memory.MemoryHandle.Pointer;
99 | float* arr2Ptr = (float*)store16Memory.MemoryHandle.Pointer;
100 |
101 | int i = 0;
102 | while (i < numberOfFloatItems)
103 | {
104 | *arr2Ptr = *arr1Ptr;
105 | arr1Ptr++;
106 | arr2Ptr++;
107 | *arr2Ptr = *arr1Ptr;
108 | arr1Ptr++;
109 | arr2Ptr++;
110 | *arr2Ptr = *arr1Ptr;
111 | arr1Ptr++;
112 | arr2Ptr++;
113 | *arr2Ptr = *arr1Ptr;
114 | arr1Ptr++;
115 | arr2Ptr++;
116 |
117 | i += 4;
118 | }
119 |
120 |
121 | } */
122 |
123 |         [BenchmarkCategory("Aligned Memory"), Benchmark]
124 |         public void ScalarCopyBlock() =>
125 |             Unsafe.CopyBlock(
126 |                 ref storeMemory.ByteRef, ref dataMemory.ByteRef,
127 |                 (uint)(sizeof(float) * numberOfFloatItems)); // one bulk copy of numberOfFloatItems floats from dataMemory into storeMemory
128 |
129 |
130 |         [BenchmarkCategory("Aligned Memory"), Benchmark(Baseline = true)]
131 |         public unsafe void VectorStoreAlignedUnsafe()
132 |         {
133 |             // Baseline: copies dataMemory into storeMemory one 256-bit vector at a time, using the aligned load and aligned store intrinsics.
134 |             float* source = (float*)dataMemory.MemoryHandle.Pointer;
135 |             float* destination = (float*)storeMemory.MemoryHandle.Pointer;
136 |
137 |             for (int vectorIndex = 0; vectorIndex < vectorNumberOfItems; vectorIndex++)
138 |             {
139 |                 var chunk = Avx.LoadAlignedVector256(source);
140 |                 Avx.StoreAligned(destination, chunk);
141 |                 source += vectorFloatStep; // advance both cursors by one vector's worth of floats
142 |                 destination += vectorFloatStep;
143 |             }
144 |         }
145 | /*
146 | [BenchmarkCategory("Aligned Memory"), Benchmark]
147 | public unsafe void VectorStoreArrayMemPtr()
148 | {
149 | ReadOnlySpan> readMem = MemoryMarshal.Cast>(new ReadOnlySpan(dataMemory.MemoryHandle.Pointer, numberOfFloatItems));
150 | Span> writeMem = MemoryMarshal.Cast>(new Span(storeMemory.MemoryHandle.Pointer, numberOfFloatItems));
151 |
152 | int i = 0;
153 |
154 | while (i < readMem.Length)
155 | {
156 | writeMem[i] = readMem[i];
157 | i++;
158 | }
159 | }
160 |
161 | [BenchmarkCategory("Aligned Memory"), Benchmark]
162 | public void VectorStoreArrayMemSafe()
163 | {
164 | ReadOnlySpan> readMem = MemoryMarshal.Cast>(dataMemory.Memory.Span);
165 | Span> writeMem = MemoryMarshal.Cast>(storeMemory.Memory.Span);
166 |
167 | int i = 0;
168 |
169 | while (i < readMem.Length)
170 | {
171 | writeMem[i] = readMem[i];
172 | i++;
173 | }
174 | }
175 |
176 | [BenchmarkCategory("Unaligned Memory"), Benchmark]
177 | public unsafe void VectorStoreArrayMemPtrUnaligned()
178 | {
179 | ReadOnlySpan> readMem = MemoryMarshal.Cast>(new ReadOnlySpan(data16Memory.MemoryHandle.Pointer, numberOfFloatItems));
180 | Span> writeMem = MemoryMarshal.Cast>(new Span(store16Memory.MemoryHandle.Pointer, numberOfFloatItems));
181 |
182 | int i = 0;
183 | while (i < readMem.Length)
184 | {
185 | writeMem[i] = readMem[i];
186 | i++;
187 | }
188 | }
189 |
190 | [BenchmarkCategory("Unaligned Memory"), Benchmark]
191 | public void VectorArraySafeUnaligned()
192 | {
193 | ReadOnlySpan> readMem = MemoryMarshal.Cast>(data16Memory.Memory.Span);
194 | Span> writeMem = MemoryMarshal.Cast>(store16Memory.Memory.Span);
195 |
196 | int i = 0;
197 | while (i < readMem.Length)
198 | {
199 | writeMem[i] = readMem[i];
200 | i++;
201 | }
202 | } */
203 |
204 |         [BenchmarkCategory("Unaligned Memory"), Benchmark]
205 |         public unsafe void VectorStoreUnalignedUnsafe()
206 |         {
207 |             // Copies data16Memory into store16Memory one 256-bit vector at a time, using the unaligned load and store intrinsics.
208 |             float* source = (float*)data16Memory.MemoryHandle.Pointer;
209 |             float* destination = (float*)store16Memory.MemoryHandle.Pointer;
210 |
211 |             for (int vectorIndex = 0; vectorIndex < vectorNumberOfItems; vectorIndex++)
212 |             {
213 |                 var chunk = Avx.LoadVector256(source);
214 |                 Avx.Store(destination, chunk);
215 |                 source += vectorFloatStep; // advance both cursors by one vector's worth of floats
216 |                 destination += vectorFloatStep;
217 |             }
218 |         }
219 |
220 |         [BenchmarkCategory("Unaligned Memory"), Benchmark]
221 |         public unsafe void VectorStoreUnalignedToAlignedUnsafe()
222 |         {
223 |             // Runs the unaligned load and store intrinsics over dataMemory and storeMemory, the same buffers the aligned baseline uses.
224 |             float* source = (float*)dataMemory.MemoryHandle.Pointer;
225 |             float* destination = (float*)storeMemory.MemoryHandle.Pointer;
226 |
227 |             for (int vectorIndex = 0; vectorIndex < vectorNumberOfItems; vectorIndex++)
228 |             {
229 |                 var chunk = Avx.LoadVector256(source);
230 |                 Avx.Store(destination, chunk);
231 |                 source += vectorFloatStep; // advance both cursors by one vector's worth of floats
232 |                 destination += vectorFloatStep;
233 |             }
234 |         }
235 | }
236 | }
237 |
--------------------------------------------------------------------------------
/Core3IntrinsicsBenchmarks/IntegerBasicOps.cs:
--------------------------------------------------------------------------------
1 | using BenchmarkDotNet.Attributes;
2 | using BenchmarkDotNet.Configs;
3 | using System;
4 | using System.Numerics;
5 | using System.Runtime.InteropServices;
6 | using System.Runtime.Intrinsics;
7 | using System.Runtime.Intrinsics.X86;
8 |
9 | namespace Core3IntrinsicsBenchmarks
10 | {
11 | //[DisassemblyDiagnoser(printAsm: true, printSource: true)]
12 | [GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)]
13 | [CategoriesColumn]
14 | public class IntegerBasicOps
15 | {
16 | [Params(/*4 * 1024,*/ 4000 * 1024)]
17 | public int NumberOfItems {get; set;}
18 |
19 | private const int bmpWidth = 1920, bmpHeight = 1080;
20 | private AlignedArrayPool intPool;
21 | private AlignedArrayPool shortPool;
22 | private AlignedArrayPool longPool;
23 | private AlignedMemoryHandle intData, intStore, bmpData, bmpStore;
24 | private AlignedMemoryHandle shortData, shortStore;
25 | private AlignedMemoryHandle longData, longStore;
26 |
27 |         [GlobalSetup]
28 |         public void GlobalSetup() // Creates the three pools, rents every buffer and fills the source data with reproducible pseudo-random values
29 |         {
30 |             intPool = new AlignedArrayPool<int>(); // element types restored; the listing had lost the generic arguments
31 |             shortPool = new AlignedArrayPool<short>();
32 |             longPool = new AlignedArrayPool<long>();
33 |
34 |             intData = intPool.Rent(NumberOfItems);
35 |             intStore = intPool.Rent(NumberOfItems);
36 |             bmpData = intPool.Rent(bmpWidth * bmpHeight * 4); // four ints per pixel
37 |             bmpStore = intPool.Rent(bmpWidth * bmpHeight * 4);
38 |             shortData = shortPool.Rent(NumberOfItems);
39 |             shortStore = shortPool.Rent(NumberOfItems);
40 |             longData = longPool.Rent(NumberOfItems);
41 |             longStore = longPool.Rent(NumberOfItems);
42 |
43 |             var r = new Random(1); // fixed seed so every run sees the same data
44 |             for (int i = 0; i < NumberOfItems; i++)
45 |             {
46 |                 intData.Memory.Span[i] = i * 2 + r.Next(-1000, 1000);
47 |                 intStore.Memory.Span[i] = i + r.Next(-1000, 1000);
48 |                 shortData.Memory.Span[i] = (short)intData.Memory.Span[i]; // truncating cast of the int value
49 |                 shortStore.Memory.Span[i] = (short)intStore.Memory.Span[i];
50 |                 longData.Memory.Span[i] = intData.Memory.Span[i]; // long buffers mirror the int values
51 |                 longStore.Memory.Span[i] = intStore.Memory.Span[i];
52 |             }
53 |             for (int i = 0; i < bmpData.Memory.Span.Length; i++)
54 |             {
55 |                 bmpData.Memory.Span[i] = i; // bitmap source is simply its own index
56 |             }
57 |         }
58 |
59 |         [GlobalCleanup]
60 |         public void GlobalCleanup() // Returns every rented buffer to its pool, then disposes all three pools
61 |         {
62 |             intPool.Return(intData);
63 |             intPool.Return(intStore);
64 |             intPool.Return(bmpData);
65 |             intPool.Return(bmpStore);
66 |             shortPool.Return(shortData);
67 |             shortPool.Return(shortStore);
68 |             longPool.Return(longData);
69 |             longPool.Return(longStore);
70 |             intPool.Dispose();
71 |             shortPool.Dispose(); // previously only intPool was disposed, leaving the other two pools undisposed
72 |             longPool.Dispose();
73 |         }
72 | /*
73 | [BenchmarkCategory("Short"), Benchmark(Baseline = true)]
74 | public unsafe void ShortAdd()
75 | {
76 | var sp1 = new ReadOnlySpan(shortData.MemoryHandle.Pointer, NumberOfItems);
77 | var sp2 = new Span(shortStore.MemoryHandle.Pointer, NumberOfItems);
78 |
79 | for (int i = 0; i < NumberOfItems; i++)
80 | {
81 | sp2[i] = (short)(sp1[i] + sp2[i]);
82 | }
83 | }
84 |
85 | [BenchmarkCategory("Short"), Benchmark]
86 | public unsafe void ShortAddVector256()
87 | {
88 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(shortData.Memory.Span);
89 | Span> sp2 = MemoryMarshal.Cast>(shortStore.Memory.Span);
90 |
91 | for (int i = 0; i < sp1.Length; i++)
92 | {
93 | sp2[i] = Avx2.Add(sp1[i], sp2[i]);
94 | }
95 | }
96 |
97 | [BenchmarkCategory("Short"), Benchmark]
98 | public unsafe void ShortAndNot()
99 | {
100 | var sp1 = new ReadOnlySpan(shortData.MemoryHandle.Pointer, NumberOfItems);
101 | var sp2 = new Span(shortStore.MemoryHandle.Pointer, NumberOfItems);
102 |
103 | for (int i = 0; i < NumberOfItems; i++)
104 | {
105 | sp2[i] = (short)(sp1[i] & ~sp2[i]);
106 | }
107 | }
108 |
109 | [BenchmarkCategory("Short"), Benchmark]
110 | public unsafe void ShortAndNotVector256()
111 | {
112 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(shortData.Memory.Span);
113 | Span> sp2 = MemoryMarshal.Cast>(shortStore.Memory.Span);
114 |
115 | for (int i = 0; i < sp1.Length; i++)
116 | {
117 | sp2[i] = Avx2.AndNot(sp1[i],sp2[i]);
118 | }
119 | }
120 |
121 | [BenchmarkCategory("Short"), Benchmark]
122 | public unsafe void ShortShiftLeft()
123 | {
124 | var sp1 = new ReadOnlySpan(shortData.MemoryHandle.Pointer, NumberOfItems);
125 | var sp2 = new Span(shortStore.MemoryHandle.Pointer, NumberOfItems);
126 |
127 | for (int i = 0; i < NumberOfItems; i++)
128 | {
129 | sp2[i] = (short)(sp1[i] << 5);
130 | }
131 | }
132 |
133 | [BenchmarkCategory("Short"), Benchmark]
134 | public unsafe void ShortShiftLeftVector256()
135 | {
136 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(shortData.Memory.Span);
137 | Span> sp2 = MemoryMarshal.Cast>(shortStore.Memory.Span);
138 |
139 | for (int i = 0; i < sp1.Length; i++)
140 | {
141 | sp2[i] = Avx2.ShiftLeftLogical(sp1[i], 5);
142 | }
143 | } */
144 | /*
145 | [BenchmarkCategory("Integer"), Benchmark(Baseline = true)]
146 | public unsafe void IntAdd()
147 | {
148 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems);
149 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems);
150 |
151 | for(int i = 0; i < NumberOfItems; i++)
152 | {
153 | sp2[i] = sp1[i] + sp2[i];
154 | }
155 | }
156 |
157 | [BenchmarkCategory("Integer"), Benchmark]
158 | public unsafe void IntAddVector256()
159 | {
160 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span);
161 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span);
162 |
163 | for (int i = 0; i < sp1.Length; i++)
164 | {
165 | sp2[i] = Avx2.Add(sp1[i], sp2[i]);
166 | }
167 | }
168 |
169 | [BenchmarkCategory("Integer"), Benchmark]
170 | public unsafe void IntXor()
171 | {
172 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems);
173 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems);
174 |
175 | for (int i = 0; i < NumberOfItems; i++)
176 | {
177 | sp2[i] = sp1[i] ^ sp2[i];
178 | }
179 | }
180 |
181 | [BenchmarkCategory("Integer"), Benchmark]
182 | public unsafe void IntXorVector256()
183 | {
184 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span);
185 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span);
186 |
187 | for (int i = 0; i < sp1.Length; i++)
188 | {
189 | sp2[i] = Avx2.Xor(sp1[i], sp2[i]);
190 | }
191 | }
192 |
193 | [BenchmarkCategory("Integer"), Benchmark]
194 | public unsafe void IntMultiply()
195 | {
196 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems);
197 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems);
198 |
199 | for (int i = 0; i < NumberOfItems; i++)
200 | {
201 | sp2[i] = sp1[i] * sp2[i];
202 | }
203 | }
204 |
205 | [BenchmarkCategory("Integer"), Benchmark]
206 | public unsafe void IntMultiplyLowVector256()
207 | {
208 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span);
209 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span);
210 |
211 | for (int i = 0; i < sp1.Length; i++)
212 | {
213 | sp2[i] = Avx2.MultiplyLow(sp1[i], sp2[i]);
214 | }
215 | }
216 |
217 | [BenchmarkCategory("Integer"), Benchmark]
218 | public unsafe void IntShiftLeft()
219 | {
220 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems);
221 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems);
222 |
223 | for (int i = 0; i < NumberOfItems; i++)
224 | {
225 | sp2[i] = sp1[i] << 5;
226 | }
227 | }
228 |
229 |
230 | [BenchmarkCategory("Integer"), Benchmark]
231 | public unsafe void IntShiftLeftVector256()
232 | {
233 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span);
234 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span);
235 |
236 | for (int i = 0; i < sp1.Length; i++)
237 | {
238 | sp2[i] = Avx2.ShiftLeftLogical(sp1[i], 5);
239 | }
240 | }
241 |
242 | [BenchmarkCategory("Integer"), Benchmark]
243 | public unsafe void IntMax()
244 | {
245 | var sp1 = new ReadOnlySpan(intData.MemoryHandle.Pointer, NumberOfItems);
246 | var sp2 = new Span(intStore.MemoryHandle.Pointer, NumberOfItems);
247 |
248 | for (int i = 0; i < NumberOfItems; i++)
249 | {
250 | sp2[i] = sp1[i] > sp2[i] ? sp1[1] : sp2[i];
251 | }
252 | }
253 |
254 | [BenchmarkCategory("Integer"), Benchmark]
255 | public unsafe void IntMaxVector256()
256 | {
257 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span);
258 | Span> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span);
259 |
260 | for (int i = 0; i < sp1.Length; i++)
261 | {
262 | sp2[i] = Avx2.Max(sp1[i], sp2[i]);
263 | }
264 | } */
265 |
266 |         [BenchmarkCategory("Chained1"), Benchmark(Baseline = true)]
267 |         public unsafe void IntMultipleOps() // Scalar baseline: intStore[i] = (max(intData[i], intStore[i]) << 2) * 3 for every element
268 |         {
269 |             var sp1 = new ReadOnlySpan<int>(intData.MemoryHandle.Pointer, NumberOfItems);
270 |             var sp2 = new Span<int>(intStore.MemoryHandle.Pointer, NumberOfItems);
271 |
272 |             for (int i = 0; i < NumberOfItems; i++)
273 |             {
274 |                 sp2[i] = ((sp1[i] > sp2[i] ? sp1[i] : sp2[i]) << 2) * 3; // was sp1[1]: the max must use element i to match the Vector256 version
275 |             }
276 |         }
277 |
278 |         [BenchmarkCategory("Chained1"), Benchmark]
279 |         public unsafe void IntMultipleOpsvector256() // AVX2 version of IntMultipleOps: eight ints per step, max -> shift left by 2 -> multiply by 3
280 |         {
281 |             ReadOnlySpan<Vector256<int>> sp1 = MemoryMarshal.Cast<int, Vector256<int>>(intData.Memory.Span); // generic arguments restored; the listing had lost them
282 |             Span<Vector256<int>> sp2 = MemoryMarshal.Cast<int, Vector256<int>>(intStore.Memory.Span);
283 |
284 |             Vector256<int> three = Vector256.Create(3); // the multiplier broadcast to all eight lanes
285 |
286 |             for (int i = 0; i < sp1.Length; i++)
287 |             {
288 |                 sp2[i] = Avx2.MultiplyLow(Avx2.ShiftLeftLogical(Avx2.Max(sp1[i], sp2[i]), 2), three);
289 |             }
290 |         }
291 |
292 |         [BenchmarkCategory("Chained2"), Benchmark(Baseline = true)]
293 |         public unsafe void IntTranspose() // Scalar baseline: regroups each run of 8 four-int pixels into four runs of 8 same-position ints
294 |         {
295 |             var sp1 = new ReadOnlySpan<int>(bmpData.MemoryHandle.Pointer, bmpHeight * bmpWidth * 4);
296 |             var sp2 = new Span<int>(bmpStore.MemoryHandle.Pointer, bmpHeight * bmpWidth * 4);
297 |             int numberOfElements = Vector256<int>.Count; // pixels handled per group (ints per 256-bit vector)
298 |
299 |             int[] colorComponents = new int[bmpWidth * 4]; // scratch buffer holding one regrouped row
300 |             int runningCounter = 0; // read position in sp1; advances by 4 per pixel
301 |             // Generic type arguments in this method were restored; the listing had lost them.
302 |             for (int y = 0; y < bmpHeight; y++)
303 |             {
304 |                 Span<int> currColors = sp2.Slice(runningCounter, bmpWidth * 4); // destination row, taken before runningCounter moves on
305 |                 for (int x = 0; x < bmpWidth; x += numberOfElements)
306 |                 {
307 |                     for (int i = 0; i < numberOfElements; i++)
308 |                     {
309 |                         int start = x * 4 + i; // slot of pixel x + i inside the first run of its group
310 |                         colorComponents[start] = sp1[runningCounter];
311 |                         colorComponents[start + numberOfElements] = sp1[runningCounter + 1];
312 |                         colorComponents[start + (2 * numberOfElements)] = sp1[runningCounter + 2];
313 |                         colorComponents[start + (3 * numberOfElements)] = sp1[runningCounter + 3];
314 |                         runningCounter += 4;
315 |                     }
316 |                 }
317 |                 colorComponents.CopyTo(currColors);
318 |
319 |             }
320 |         }
321 |
322 |         [BenchmarkCategory("Chained2"), Benchmark]
323 |         public unsafe void IntTransposeVector256() // see https://software.intel.com/sites/default/files/m/d/4/1/d/8/Image_Processing_-_whitepaper_-_100pct_CCEreviewed_update.pdf
324 |         {
325 |             Span<Vector256<int>> originVectors = MemoryMarshal.Cast<int, Vector256<int>>(bmpData.Memory.Span); // generic arguments restored; the listing had lost them
326 |             Span<Vector256<int>> transposedVectors = MemoryMarshal.Cast<int, Vector256<int>>(bmpStore.Memory.Span);
327 |             Vector256<int> pm0, pm1, pm2, pm3, up0, up1, up2, up3;
328 |             for (int i = 0; i < originVectors.Length; i += 4) // four vectors = 8 pixels of four ints each
329 |             {
330 |                 pm0 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x20); // low 128-bit halves of vectors i and i + 2
331 |                 pm1 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x20);
332 |                 pm2 = Avx.Permute2x128(originVectors[i], originVectors[i + 2], 0x31); // high 128-bit halves of vectors i and i + 2
333 |                 pm3 = Avx.Permute2x128(originVectors[i + 1], originVectors[i + 3], 0x31);
334 |
335 |                 up0 = Avx2.UnpackLow(pm0, pm1); // first interleave pass
336 |                 up1 = Avx2.UnpackHigh(pm0, pm1);
337 |                 up2 = Avx2.UnpackLow(pm2, pm3);
338 |                 up3 = Avx2.UnpackHigh(pm2, pm3);
339 |
340 |                 transposedVectors[i] = Avx2.UnpackLow(up0, up2); // second pass: each output vector holds one int position of all 8 pixels
341 |                 transposedVectors[i + 1] = Avx2.UnpackHigh(up0, up2);
342 |                 transposedVectors[i + 2] = Avx2.UnpackLow(up1, up3);
343 |                 transposedVectors[i + 3] = Avx2.UnpackHigh(up1, up3);
344 |             }
345 |         }
346 |
347 | /*
348 | [BenchmarkCategory("Long"), Benchmark(Baseline = true)]
349 | public unsafe void LongAdd()
350 | {
351 | var sp1 = new ReadOnlySpan(longData.MemoryHandle.Pointer, NumberOfItems);
352 | var sp2 = new Span(longStore.MemoryHandle.Pointer, NumberOfItems);
353 |
354 | for (int i = 0; i < NumberOfItems; i++)
355 | {
356 | sp2[i] = sp1[i] + sp2[i];
357 | }
358 | }
359 |
360 | [BenchmarkCategory("Long"), Benchmark]
361 | public unsafe void LongMultiply()
362 | {
363 | var sp1 = new ReadOnlySpan(longData.MemoryHandle.Pointer, NumberOfItems);
364 | var sp2 = new Span(longStore.MemoryHandle.Pointer, NumberOfItems);
365 |
366 | for (int i = 0; i < NumberOfItems; i++)
367 | {
368 | sp2[i] = sp1[i] * sp2[i];
369 | }
370 | }
371 |
372 |
373 | [BenchmarkCategory("Long"), Benchmark]
374 | public unsafe void LongAddVector256()
375 | {
376 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(longData.Memory.Span);
377 | Span> sp2 = MemoryMarshal.Cast>(longStore.Memory.Span);
378 |
379 | for (int i = 0; i < sp1.Length; i++)
380 | {
381 | sp2[i] = Avx2.Add(sp1[i], sp2[i]);
382 | }
383 | }
384 |
385 |
386 | [BenchmarkCategory("Long"), Benchmark]
387 | public unsafe void IntMultiplyVector256ToLong()
388 | {
389 | ReadOnlySpan> sp1 = MemoryMarshal.Cast>(intData.Memory.Span);
390 | ReadOnlySpan> sp2 = MemoryMarshal.Cast>(intStore.Memory.Span);
391 | Span> sp3 = MemoryMarshal.Cast