├── BenchmarkResults ├── Analysis.pdf ├── Analysis.xlsx └── Specialized.csv ├── SortingNetworks ├── SortingNetworks.csproj ├── AESRand.cs ├── MWC1616Rand.cs ├── Attic │ ├── PeriodicInt.cs │ ├── Periodic16Branchless.cs │ ├── PeriodicInt_Block.cs │ └── Periodic16Expr.cs ├── UnsafeSort.cs ├── PeriodicInt.cs ├── UnsafeRandom.cs ├── PeriodicInt_Block.cs ├── IntSorter.cs └── FloatSorter.cs ├── SNBenchmark ├── SNBenchmark.csproj ├── ArraySortConstantEstimation.cs ├── InvocationBenchmark.cs ├── Generators.cs ├── FloatBenchmark.cs ├── IntBenchmark.cs ├── Validation.cs └── Program.cs ├── LICENSE.txt ├── SortingNetworks.sln ├── .gitattributes ├── .gitignore └── README.md /BenchmarkResults/Analysis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zvrba/SortingNetworks/HEAD/BenchmarkResults/Analysis.pdf -------------------------------------------------------------------------------- /BenchmarkResults/Analysis.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zvrba/SortingNetworks/HEAD/BenchmarkResults/Analysis.xlsx -------------------------------------------------------------------------------- /SortingNetworks/SortingNetworks.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Library 5 | netcoreapp3.1 6 | 7 | 8 | 9 | 10 | 11 | true 12 | 13 | 14 | 15 | true 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /SNBenchmark/SNBenchmark.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp3.1 6 | 7 | 8 | 9 | true 10 | 11 | 12 | 13 | true 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /SNBenchmark/ArraySortConstantEstimation.cs: 
-------------------------------------------------------------------------------- 1 | using BenchmarkDotNet.Attributes; 2 | 3 | using System; 4 | 5 | namespace SNBenchmark 6 | { 7 | /// 8 | /// Array.Sort uses an introsort algorithm that has O(n*log(n)) complexity. This benchmark 9 | /// generates data for estimating the constant hidden in the O-term. Only random pattern is used. 10 | /// 11 | public class ArraySortConstantEstimation 12 | { 13 | readonly Generators generators = new Generators(); 14 | int[] d; 15 | 16 | [Params(32, 64, 128, 256, 1024, 2048, 4096, 8192, 16384)] 17 | public int Size { get; set; } 18 | 19 | [GlobalSetup] 20 | public void GlobalSetup() { 21 | d = new int[Size]; 22 | } 23 | 24 | [Benchmark(Baseline = true)] 25 | public void NoSort() { 26 | generators.Random(d); 27 | } 28 | 29 | [Benchmark] 30 | public void ArraySort() { 31 | generators.Random(d); 32 | Array.Sort(d); 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Stian Z. Vrba 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /SortingNetworks/AESRand.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.Intrinsics; 3 | using System.Runtime.Intrinsics.X86; 4 | 5 | namespace SortingNetworks 6 | { 7 | /// 8 | /// Random number generation using AES-NI instructions. 9 | /// Code adapted from https://github.com/dragontamer/AESRand/blob/master/AESRand/AESRand/AESRand.cpp 10 | /// 11 | public sealed class AESRand : UnsafeRandom { 12 | static readonly Vector128 PRIME_INCREMENT = Vector128.Create( 13 | 0x2f, 0x2b, 0x29, 0x25, 0x1f, 0x1d, 0x17, 0x13, 14 | 0x11, 0x0D, 0x0B, 0x07, 0x05, 0x03, 0x02, 0x01).AsUInt64(); 15 | 16 | Vector128 state; 17 | 18 | public AESRand(int[] seed) { 19 | if (seed.Length != 4) 20 | throw new ArgumentException("Seed must contain exactly 4 elements.", nameof(seed)); 21 | this.state = Vector128.Create(seed[0], seed[1], seed[2], seed[3]).AsUInt64(); 22 | } 23 | 24 | /// 25 | /// Overwrites the initial 4 elements of with random 32-bit integers. 26 | /// 27 | /// 28 | /// Array of length at least 4. Behaviour is UNDEFINED if the array is shorter. 
29 | /// 30 | public override Vector128 Get4() { 31 | state = Sse2.Add(state, PRIME_INCREMENT); 32 | var r1 = Aes.Encrypt(state.AsByte(), PRIME_INCREMENT.AsByte()); 33 | var r2 = Aes.Encrypt(r1, PRIME_INCREMENT.AsByte()).AsInt32(); 34 | return r2; 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /BenchmarkResults/Specialized.csv: -------------------------------------------------------------------------------- 1 | Method;Size;Pattern;Mean (ns);OnlySort (ns);Ratio;;;;;; 2 | NoSort;4;Asc;9,197;;;;;;;; 3 | ArraySort;4;Asc;34,932;25,735;3,084621839;;;;;; 4 | NetworkSort;4;Asc;17,54;8,343;1;;;;;; 5 | NoSort;4;Desc;10,197;;;;;;;; 6 | ArraySort;4;Desc;40,525;30,328;3,53020603;;;;;; 7 | NetworkSort;4;Desc;18,788;8,591;1;;;;;; 8 | NoSort;4;Rand;13,267;;;;;;;; 9 | ArraySort;4;Rand;54,718;41,451;4,557058047;;;;;; 10 | NetworkSort;4;Rand;22,363;9,096;1;;;;;; 11 | NoSort;8;Asc;17,591;;;;;;;; 12 | ArraySort;8;Asc;57,123;39,532;4,326110746;;;;;; 13 | NetworkSort;8;Asc;26,729;9,138;1;;;;;; 14 | NoSort;8;Desc;22,368;;;;;;;; 15 | ArraySort;8;Desc;82,407;60,039;10,20377294;;;;;; 16 | NetworkSort;8;Desc;28,252;5,884;1;;;;;; 17 | NoSort;8;Rand;23,726;;;;;;;; 18 | ArraySort;8;Rand;101,007;77,281;6,388971561;;;;;; 19 | NetworkSort;8;Rand;35,822;12,096;1;;;;;; 20 | NoSort;16;Asc;35,836;;;;;;;; 21 | ArraySort;16;Asc;92,05;56,214;4,597906102;;;;;; 22 | NetworkSort;16;Asc;48,062;12,226;1;;;;;; 23 | NoSort;16;Desc;34,743;;;;;;;; 24 | ArraySort;16;Desc;218,171;183,428;12,24976626;;;;;; 25 | NetworkSort;16;Desc;49,717;14,974;1;;;;;; 26 | NoSort;16;Rand;44,974;;;;;;;; 27 | ArraySort;16;Rand;223,987;179,013;7,861791831;;;;;; 28 | NetworkSort;16;Rand;67,744;22,77;1;;;;;; 29 | NoSort;32;Asc;66,193;;;;;;;; 30 | ArraySort;32;Asc;160,966;94,773;3,808591866;;;;;; 31 | NetworkSort;32;Asc;91,077;24,884;1;;;;;; 32 | NoSort;32;Desc;87,786;;;;;;;; 33 | ArraySort;32;Desc;205,57;117,784;4,844685752;;;;;; 34 | NetworkSort;32;Desc;112,098;24,312;1;;;;;; 
35 | NoSort;32;Rand;88,008;;;;;;;; 36 | ArraySort;32;Rand;673,462;585,454;11,42706016;;;;;; 37 | NetworkSort;32;Rand;139,242;51,234;1;;;;;; 38 | -------------------------------------------------------------------------------- /SortingNetworks/MWC1616Rand.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.Intrinsics; 3 | using System.Runtime.Intrinsics.X86; 4 | 5 | namespace SortingNetworks 6 | { 7 | /// 8 | /// Random number generator using MWC1616 (multiply with carry) algorithm. 9 | /// Code adapted from http://www.digicortex.net/node/22 10 | /// 11 | public sealed class MWC1616Rand : UnsafeRandom 12 | { 13 | // Single array so that we can pin it only once. 14 | 15 | Vector128 mask, m1, m2; // Constants 16 | Vector128 a, b; // State 17 | 18 | public MWC1616Rand(int[] seed) { 19 | if (seed.Length != 8) 20 | throw new ArgumentException("The seed array must contain exactly 8 elements.", nameof(seed)); 21 | 22 | mask = Vector128.Create(0xFFFFu); 23 | m1 = Vector128.Create(0x4650u); 24 | m2 = Vector128.Create(0x78B7u); 25 | a = Vector128.Create((uint)seed[0], (uint)seed[1], (uint)seed[2], (uint)seed[3]); 26 | b = Vector128.Create((uint)seed[4], (uint)seed[5], (uint)seed[6], (uint)seed[7]); 27 | } 28 | 29 | public override Vector128 Get4() { 30 | var amask = Sse2.And(a, mask); 31 | var ashift = Sse2.ShiftRightLogical(a, 0x10); 32 | var amul = Sse41.MultiplyLow(amask, m1); 33 | a = Sse2.Add(amul, ashift); 34 | 35 | var bmask = Sse2.And(b, mask); 36 | var bshift = Sse2.ShiftRightLogical(b, 0x10); 37 | var bmul = Sse41.MultiplyLow(bmask, m2); 38 | b = Sse2.Add(bmul, bshift); 39 | 40 | var t1 = Sse2.And(b, mask); 41 | var t2 = Sse2.ShiftLeftLogical(a, 0x10); 42 | var r = Sse2.Add(t1, t2); 43 | return r.AsInt32(); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /SNBenchmark/InvocationBenchmark.cs: 
-------------------------------------------------------------------------------- 1 | using System; 2 | using BenchmarkDotNet.Attributes; 3 | 4 | namespace SNBenchmark 5 | { 6 | [BenchmarkCategory("Invocation")] 7 | public class InvocationBenchmark 8 | { 9 | readonly int[] data = new int[16]; 10 | readonly SortingNetworks.UnsafeSort asorter = SortingNetworks.UnsafeSort.Create(16); 11 | readonly SortingNetworks.PeriodicInt csorter = new SortingNetworks.PeriodicInt(); 12 | 13 | [GlobalSetup] 14 | public void GlobalSetup() { 15 | for (int i = 0; i < data.Length; ++i) data[i] = i; 16 | } 17 | 18 | [Benchmark] 19 | public unsafe void AbstractInvoke() { 20 | fixed (int* p = data) 21 | asorter.Sorter(p, data.Length); 22 | } 23 | 24 | [Benchmark] 25 | public unsafe void ConcreteInvoke() { 26 | fixed (int* p = data) 27 | csorter.Sort16(p); 28 | } 29 | } 30 | 31 | #if false // Obsoleted, "Attic" is no longer included in build of SortingNetworks. 32 | [BenchmarkCategory("Invocation")] 33 | public class ExpressionInvocationBenchmark 34 | { 35 | readonly int[] data = new int[16]; 36 | readonly SortingNetworks.Attic.Periodic16Expr expr = new SortingNetworks.Attic.Periodic16Expr(); 37 | 38 | // Sets up data array to be sorted so as to have minimum possible data-dependent variation. 
39 | [GlobalSetup] 40 | public void GlobalSetup() { 41 | for (int i = 0; i < data.Length; ++i) data[i] = i; 42 | } 43 | 44 | [Benchmark] 45 | public unsafe void DirectInvoke() { 46 | fixed (int* p = data) 47 | SortingNetworks.Attic.Periodic16Branchless.Sort(p); 48 | } 49 | 50 | [Benchmark] 51 | public unsafe void ExpressionInvoke() { 52 | fixed (int* p = data) 53 | expr.Sort(p); 54 | } 55 | } 56 | #endif 57 | } 58 | -------------------------------------------------------------------------------- /SortingNetworks.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.31424.327 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SortingNetworks", "SortingNetworks\SortingNetworks.csproj", "{2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SNBenchmark", "SNBenchmark\SNBenchmark.csproj", "{F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}" 9 | EndProject 10 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{9B8ACCCE-973B-42F8-BC2A-2C6751AB5237}" 11 | ProjectSection(SolutionItems) = preProject 12 | LICENSE.txt = LICENSE.txt 13 | README.md = README.md 14 | EndProjectSection 15 | EndProject 16 | Global 17 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 18 | Debug|Any CPU = Debug|Any CPU 19 | Release|Any CPU = Release|Any CPU 20 | EndGlobalSection 21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 22 | {2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 23 | {2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}.Debug|Any CPU.Build.0 = Debug|Any CPU 24 | {2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}.Release|Any CPU.ActiveCfg = Release|Any CPU 25 | {2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}.Release|Any CPU.Build.0 = Release|Any CPU 26 | 
{F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 27 | {F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}.Debug|Any CPU.Build.0 = Debug|Any CPU 28 | {F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}.Release|Any CPU.ActiveCfg = Release|Any CPU 29 | {F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}.Release|Any CPU.Build.0 = Release|Any CPU 30 | EndGlobalSection 31 | GlobalSection(SolutionProperties) = preSolution 32 | HideSolutionNode = FALSE 33 | EndGlobalSection 34 | GlobalSection(ExtensibilityGlobals) = postSolution 35 | SolutionGuid = {0E45DB88-0408-4A77-8703-6DA36E579EAD} 36 | EndGlobalSection 37 | EndGlobal 38 | -------------------------------------------------------------------------------- /SNBenchmark/Generators.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.Intrinsics; 3 | using System.Runtime.Intrinsics.X86; 4 | 5 | namespace SNBenchmark 6 | { 7 | class Generators 8 | { 9 | readonly SortingNetworks.MWC1616Rand rng = new SortingNetworks.MWC1616Rand(new int[8] { 2, 3, 5, 7, 11, 13, 17, 19 }); 10 | 11 | /// 12 | /// Fills data with integers from 0 to data.Length-1 in ascending order. 13 | /// 14 | public void Ascending(int[] data) { 15 | for (int i = 0; i < data.Length; ++i) 16 | data[i] = i; 17 | } 18 | 19 | /// 20 | /// Fills data with integers from 0 to data.Length-1 in descending order. 21 | /// 22 | public void Descending(int[] data) { 23 | for (int i = 0; i < data.Length; ++i) 24 | data[i] = data.Length - 1 - i; 25 | } 26 | 27 | /// 28 | /// Fills data with pseudo-random numbers. Length of data must be a multiple of 4, otherwise the 29 | /// remaining elements will not be filled. 30 | /// 31 | /// 32 | public unsafe void Random(int[] data) { 33 | fixed (int* p = data) { 34 | for (int i = 0; i < data.Length / 4; ++i) 35 | rng.Get4(p + 4 * i); 36 | } 37 | } 38 | 39 | /// 40 | /// Rearranges the existing contents of according to a random permutation. 
41 | /// 42 | public unsafe void FisherYates(T[] data) where T : unmanaged { 43 | var r = stackalloc uint[4]; // Randomness 44 | int k = 4; // Randomness is initially used up. j is temp. 45 | int j; 46 | Vector128 ar; 47 | 48 | // Use pointer throughout to avoid bound checks. 49 | // Also, we're jumping around the array so the direction of the iteration doesn't matter. 50 | fixed (T* p = data) { 51 | for (int i = data.Length - 1; i > 0; --i) { 52 | // Generate randomness if empty. 53 | if (k == 4) { 54 | ar = rng.Get4().AsUInt32(); 55 | Sse2.Store(r, ar); 56 | k = 0; 57 | } 58 | j = (int)(r[k++] % (i + 1)); // Random int between [0, i] 59 | (p[i], p[j]) = (p[j], p[i]); // Exchange. 60 | } 61 | } 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). 
Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /SNBenchmark/FloatBenchmark.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | 4 | using BenchmarkDotNet.Attributes; 5 | 6 | namespace SNBenchmark 7 | { 8 | [XmlExporterAttribute.Brief] 9 | [XmlExporter(fileNameSuffix: "xml", indentXml: true, excludeMeasurements: true)] 10 | public class FloatBenchmark 11 | { 12 | readonly Generators generators = new Generators(); 13 | Action g; 14 | SortingNetworks.UnsafeSort n; 15 | float[] d; 16 | 17 | //[Params(4, 8, 12, 16, 32, 47, 64, 97, 128, 147, 256, 317, 512, 711, 1024, 1943, 2048, 3717, 4096)] 18 | [ParamsSource(nameof(Sizes))] 19 | public int Size { get; set; } 20 | 21 | [GlobalSetup] 22 | public void GlobalSetup() { 23 | g = generators.FisherYates; 24 | n = SortingNetworks.UnsafeSort.Create(Size); 25 | d = new float[Size]; 26 | Filler(); 27 | } 28 | 29 | // Also used to simulate sorting. 30 | void Filler() { 31 | for (int i = 0; i < d.Length; ++i) 32 | d[i] = i; 33 | } 34 | 35 | void ArraySorter() => Array.Sort(d); 36 | 37 | unsafe void NetworkSorter() { 38 | fixed (float* p = d) n.Sorter(p, d.Length); 39 | } 40 | 41 | void Template(Action sorter, string what) { 42 | g(d); 43 | sorter(); 44 | // Should leave the array sorted so no need to reinitialize it for the next iteration. 
45 | int i; 46 | for (i = 0; i < d.Length && d[i] == i; ++i) 47 | ; // no body 48 | if (i < d.Length) 49 | Environment.FailFast(what); 50 | } 51 | 52 | /// 53 | /// Baseline: Fill array with sorted numbers, overwrite with sorted sequence, and validate for being sorted. 54 | /// The first and last step are common for all benchmarks. 55 | /// 56 | [Benchmark(Baseline = true)] 57 | public void NoSort() => Template(Filler, "Unsorted [Baseline]."); 58 | 59 | /// 60 | /// Sorting by using Array.Sort(). 61 | /// 62 | [Benchmark] 63 | public void ArraySort() => Template(ArraySorter, "Unsorted [ArraySort]."); 64 | 65 | [Benchmark] 66 | public unsafe void NetworkSort() => Template(NetworkSorter, "Unsorted [NetworkSort]."); 67 | 68 | // The numbers in-between powers of two are deliberately set to odd numbers slightly lower/larger than half the interval. 69 | // This to test the sorters for various lengths. 70 | public IEnumerable Sizes => new int[] { 71 | 4, 8, 12, 16, 27, 32, 47, 64, 128, 177, 256, 364, 512, 748, 1024, 2048, 3389, 4096, 6793, 8192, 14289, 16384, 72 | 32768, 53151, 65536, 96317, 131072, 191217, 262144, 398853, 524288, 719289, 1048576 73 | }; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /SortingNetworks/Attic/PeriodicInt.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.Intrinsics; 4 | using System.Runtime.Intrinsics.X86; 5 | 6 | namespace SortingNetworks.Attic 7 | { 8 | using V = Vector256; 9 | 10 | /// 11 | /// Provides methods for sorting integer arrays of lengths that are a power of two. Invoking public members with 12 | /// arrays that are of shorter length will result in UNDEFINED BEHAVIOR (data corruption, crash). 
13 | /// 14 | public partial class PeriodicInt 15 | { 16 | readonly V Zero; // 00000000 17 | readonly V Complement; // FFFFFFFF 18 | readonly V AlternatingMaskHi128; // FFFF0000 19 | readonly V AlternatingMaskLo128; // 0000FFFF 20 | readonly V AlternatingMaskHi64; // FF00FF00 21 | readonly V AlternatingMaskLo32; // F0F0F0F0 22 | 23 | public PeriodicInt() { 24 | Zero = V.Zero; 25 | Complement = Avx2.CompareEqual(Zero, Zero); 26 | AlternatingMaskHi128 = Vector256.Create(0L, 0L, -1L, -1L).AsInt32(); 27 | AlternatingMaskLo128 = Vector256.Create(-1L, -1L, 0L, 0L).AsInt32(); 28 | AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8)); 29 | AlternatingMaskLo32 = Avx2.Xor(Complement.AsInt64(), Avx2.ShiftLeftLogical(Complement.AsInt64(), 32)).AsInt32(); 30 | } 31 | 32 | /// 33 | /// In-place sorts 16 elements starting at . 34 | /// 35 | public unsafe void Sort16(int* data) { 36 | var lo = Avx.LoadVector256(data); 37 | var hi = Avx.LoadVector256(data + 8); 38 | 39 | Block16(2, ref lo, ref hi); 40 | Block16(3, ref lo, ref hi); 41 | Block16(4, ref lo, ref hi); 42 | Block16(4, ref lo, ref hi); 43 | 44 | Avx.Store(data, lo); 45 | Avx.Store(data + 8, hi); 46 | } 47 | 48 | /// 49 | /// In-place sorts 32 elements starting at . 
50 | /// 51 | public unsafe void Sort32(int* data) { 52 | var v0 = Avx.LoadVector256(data + 0); 53 | var v1 = Avx.LoadVector256(data + 8); 54 | var v2 = Avx.LoadVector256(data + 16); 55 | var v3 = Avx.LoadVector256(data + 24); 56 | 57 | Block32(2, ref v0, ref v1, ref v2, ref v3); 58 | Block32(3, ref v0, ref v1, ref v2, ref v3); 59 | Block32(4, ref v0, ref v1, ref v2, ref v3); 60 | Block32(5, ref v0, ref v1, ref v2, ref v3); 61 | Block32(5, ref v0, ref v1, ref v2, ref v3); 62 | 63 | Avx.Store(data + 0, v0); 64 | Avx.Store(data + 8, v1); 65 | Avx.Store(data + 16, v2); 66 | Avx.Store(data + 24, v3); 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /SNBenchmark/IntBenchmark.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | 4 | using BenchmarkDotNet.Attributes; 5 | 6 | namespace SNBenchmark 7 | { 8 | [XmlExporterAttribute.Brief] 9 | [XmlExporter(fileNameSuffix: "xml", indentXml: true, excludeMeasurements: true)] 10 | public class IntBenchmark 11 | { 12 | readonly Generators generators = new Generators(); 13 | Action g; 14 | SortingNetworks.UnsafeSort n; 15 | int[] d; 16 | 17 | //[Params(4, 8, 12, 16, 32, 47, 64, 97, 128, 147, 256, 317, 512, 711, 1024, 1943, 2048, 3717, 4096)] 18 | [ParamsSource(nameof(Sizes))] 19 | public int Size { get; set; } 20 | 21 | //[Params("Asc", "Desc", "Rand")] 22 | [Params("Rand")] 23 | public string Pattern { get; set; } 24 | 25 | [GlobalSetup] 26 | public void GlobalSetup() { 27 | switch (Pattern) { 28 | case "Asc": g = generators.Ascending; break; 29 | case "Desc": g = generators.Descending; break; 30 | case "Rand": g = generators.FisherYates; break; 31 | default: throw new ArgumentOutOfRangeException(nameof(Pattern)); 32 | } 33 | n = SortingNetworks.UnsafeSort.Create(Size); 34 | d = new int[Size]; 35 | Filler(); 36 | } 37 | 38 | // Also used to simulate sorting. 
39 | void Filler() { 40 | for (int i = 0; i < d.Length; ++i) 41 | d[i] = i; 42 | } 43 | 44 | void ArraySorter() => Array.Sort(d); 45 | 46 | unsafe void NetworkSorter() { 47 | fixed (int* p = d) n.Sorter(p, d.Length); 48 | } 49 | 50 | void Template(Action sorter, string what) { 51 | g(d); 52 | sorter(); 53 | // Should leave the array sorted so no need to reinitialize it for the next iteration. 54 | int i; 55 | for (i = 0; i < d.Length && d[i] == i; ++i) 56 | ; // no body 57 | if (i < d.Length) 58 | Environment.FailFast(what); 59 | } 60 | 61 | /// 62 | /// Baseline: Fill array with sorted numbers, overwrite with sorted sequence, and validate for being sorted. 63 | /// The first and last step are common for all benchmarks. 64 | /// 65 | [Benchmark(Baseline = true)] 66 | public void NoSort() => Template(Filler, "Unsorted [Baseline]."); 67 | 68 | /// 69 | /// Sorting by using Array.Sort(). 70 | /// 71 | [Benchmark] 72 | public void ArraySort() => Template(ArraySorter, "Unsorted [ArraySort]."); 73 | 74 | [Benchmark] 75 | public unsafe void NetworkSort() => Template(NetworkSorter, "Unsorted [NetworkSort]."); 76 | 77 | // The numbers in-between powers of two are deliberately set to odd numbers slightly lower/larger than half the interval. 78 | // This to test the sorters for various lengths. 79 | public IEnumerable Sizes => new int[] { 80 | 4, 8, 12, 16, 27, 32, 47, 64, 128, 177, 256, 364, 512, 748, 1024, 2048, 3389, 4096, 6793, 8192, 14289, 16384, 81 | 32768, 53151, 65536, 96317, 131072, 191217, 262144, 398853, 524288, 719289, 1048576 82 | }; 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /SortingNetworks/UnsafeSort.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace SortingNetworks 4 | { 5 | /// 6 | /// Represents an in-place sorting method with possibly limited bounds on the valid values of . 7 | /// 8 | /// Type of elements being sorted. 
9 | /// Pointer to the beginning of the range to sort. 10 | /// Number of elements in the range. 11 | public unsafe delegate void Sorter(T* data, int c) where T : unmanaged; 12 | 13 | /// 14 | /// Provides methods for sorting arrays of ints or floats using a periodic sorting network. 15 | /// 16 | /// The type of array elements. 17 | /// 18 | /// WARNING! All methods taking pointer arguments require that the allocated size is correct wrt. the implied or specified 19 | /// length. Also, the input length must conform to and limits. Otherwise 20 | /// UNDEFINED BEHAVIOR occurs: incorrect result, data corruption or crash. 21 | /// 22 | public abstract class UnsafeSort where T : unmanaged 23 | { 24 | /// 25 | /// Creates an instance of UnsafeSort{T}. 26 | /// 27 | /// 28 | /// Maximum array length supported by the sorter. Sorters for sizes of up to 16 are more efficent than the general-length 29 | /// sorters and should therefore be used for small arrays. 30 | /// 31 | /// 32 | /// exceeds 2^24, which is the maximum supported value. - OR - 33 | /// is not int or float. 34 | /// 35 | public static UnsafeSort Create(int maxLength) { 36 | object ret = null; 37 | 38 | if (typeof(T) == typeof(int)) 39 | ret = new IntSorter(maxLength); 40 | if (typeof(T) == typeof(float)) 41 | ret = new FloatSorter(maxLength); 42 | 43 | if (ret == null) 44 | throw new ArgumentOutOfRangeException("Unsupported element type: " + typeof(T).Name); 45 | return (UnsafeSort)ret; 46 | } 47 | 48 | // This base is derivable only in this assembly. 49 | private protected UnsafeSort() 50 | { } 51 | 52 | /// 53 | /// Minimum array length supported by this sorter. 54 | /// 55 | public int MinLength { get; protected set; } 56 | 57 | /// 58 | /// Maximum array length supported by this sorter. 59 | /// 60 | public int MaxLength { get; protected set; } 61 | 62 | /// 63 | /// Delegate that performs the actual sorting. WARNING! The count argument given to the delegate must be between 64 | /// and (inclusive). 
No bounds are checked. 65 | /// 66 | public Sorter Sorter { get; protected set; } 67 | 68 | /// 69 | /// Convenience overload for use in "safe" code. Checks preconditions and then invokes . 70 | /// 71 | /// Array to sort. 72 | /// The array length is invalid. 73 | public unsafe void Sort(T[] data) { 74 | if (data.Length < MinLength || data.Length > MaxLength) 75 | throw new ArgumentOutOfRangeException(nameof(data), $"Invalid array length ({data.Length})."); 76 | fixed (T* p = data) 77 | Sorter(p, data.Length); 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /SNBenchmark/Validation.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace SNBenchmark 4 | { 5 | /// 6 | /// Validation methods for verifying output of a sorting network. 7 | /// 8 | static class Validation 9 | { 10 | /// 11 | /// Validates by exploiting theorem Z of TAOCOP section 5.3.4: it is 12 | /// sufficient to check that all 0-1 sequences (2^N of them) are sorted by the network. 13 | /// Only lengths of up to 28 are accepted. 14 | /// 15 | /// An instance of sorting network to test. 16 | /// Element count to test with. 17 | /// Sorter's length is larger than 28. 18 | /// Validation has failed. 
19 | public static unsafe void Check(SortingNetworks.UnsafeSort sort, int size) { 20 | if (size < 4 || size > 32) 21 | throw new ArgumentOutOfRangeException(nameof(size), "Valid range is [4, 32]."); 22 | 23 | var bits = new int[size]; 24 | 25 | fixed (int* pbits = bits) { 26 | for (uint i = 0; i <= (1 << size) - 1; ++i) { 27 | int popcnt = 0; // Number of ones in i 28 | for (uint j = i, k = 0; k < size; ++k, j >>= 1) { 29 | int b = (int)(j & 1); 30 | pbits[k] = b; 31 | popcnt += b; 32 | } 33 | 34 | sort.Sorter(pbits, size); 35 | 36 | for (int k = 0; k < size - popcnt; ++k) 37 | if (pbits[k] != 0) 38 | throw new NotImplementedException($"Result is not a permutation for bit pattern {i:X8}."); 39 | 40 | for (int k = size - popcnt; k < size; ++k) 41 | if (pbits[k] != 1) 42 | throw new NotImplementedException($"Result is not a permutation for bit pattern {i:X8}."); 43 | } 44 | } 45 | } 46 | 47 | /// 48 | /// Overload for float arrays; . 49 | /// 50 | public static unsafe void Check(SortingNetworks.UnsafeSort sort, int size) { 51 | if (size < 4 || size > 32) 52 | throw new ArgumentOutOfRangeException(nameof(size), "Valid range is [4, 32]."); 53 | 54 | var bits = new float[size]; 55 | 56 | fixed (float* pbits = bits) { 57 | for (uint i = 0; i <= (1 << size) - 1; ++i) { 58 | int popcnt = 0; // Number of ones in i 59 | for (uint j = i, k = 0; k < size; ++k, j >>= 1) { 60 | int b = (int)(j & 1); 61 | pbits[k] = b; 62 | popcnt += b; 63 | } 64 | 65 | sort.Sorter(pbits, size); 66 | 67 | for (int k = 0; k < size - popcnt; ++k) 68 | if (pbits[k] != 0) 69 | throw new NotImplementedException($"Result is not a permutation for bit pattern {i:X8}."); 70 | 71 | for (int k = size - popcnt; k < size; ++k) 72 | if (pbits[k] != 1) 73 | throw new NotImplementedException($"Result is not a permutation for bit pattern {i:X8}."); 74 | } 75 | } 76 | } 77 | 78 | /// 79 | /// Checks whether array is sorted. 80 | /// 81 | /// True if the input is sorted, false otherwise. 
82 | public static bool IsSorted(int[] data) { 83 | for (int i = 1; i < data.Length; ++i) 84 | if (data[i] < data[i - 1]) 85 | return false; 86 | return true; 87 | } 88 | 89 | /// 90 | /// Checks whether array is sorted. 91 | /// 92 | /// True if the input is sorted, false otherwise. 93 | public static bool IsSorted(float[] data) { 94 | for (int i = 1; i < data.Length; ++i) 95 | if (data[i] < data[i - 1]) 96 | return false; 97 | return true; 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /SortingNetworks/Attic/Periodic16Branchless.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.Intrinsics; 4 | using System.Runtime.Intrinsics.X86; 5 | 6 | namespace SortingNetworks.Attic 7 | { 8 | using V = System.Runtime.Intrinsics.Vector256; 9 | 10 | /// 11 | /// Reference, branchless implementation of 16-element periodic sorting network. 12 | /// 13 | public static class Periodic16Branchless 14 | { 15 | // All zeros 16 | static readonly V Zero = Vector256.Create(0); 17 | // All ones 18 | static readonly V Complement = Avx2.CompareEqual(Zero, Zero); 19 | // FF00FF00 (1 digit = 32 bits) 20 | static readonly V AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8)); 21 | // F0F0F0F0 22 | static readonly V AlternatingMaskLo32 = Avx2.Xor( 23 | Complement.AsInt64(), 24 | Avx2.ShiftLeftLogical(Complement.AsInt64(), 32) 25 | ).AsInt32(); 26 | 27 | /// 28 | /// In-place sorting of 16 elements starting at . 
29 | /// 30 | /// 31 | public static unsafe void Sort(int* data) { 32 | var lo = Avx.LoadVector256(data); 33 | var hi = Avx.LoadVector256(data + 8); 34 | 35 | Step(ref lo, ref hi); 36 | Step(ref lo, ref hi); 37 | Step(ref lo, ref hi); 38 | Step(ref lo, ref hi); 39 | 40 | Avx.Store(data, lo); 41 | Avx.Store(data + 8, hi); 42 | } 43 | 44 | /// 45 | /// Test method for debugging instruction sequences. 46 | /// 47 | public static unsafe void Test() { 48 | var data = new int[16]; 49 | for (int i = 0; i < 16; ++i) data[i] = i; 50 | fixed (int* p = data) { 51 | var lo = Avx.LoadVector256(p); 52 | var hi = Avx.LoadVector256(p + 8); 53 | Step(ref lo, ref hi); 54 | } 55 | } 56 | 57 | /// 58 | /// One step of the sorting network for 16 elements. Must be iterated 4 times. 59 | /// 60 | /// 61 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] 62 | static void Step(ref V lo, ref V hi) { 63 | V tmp1, tmp2; 64 | 65 | // lo, hi are intermediate results after each stage and input to next one. 
66 | 67 | // STAGE 1: 68 | // 76543210 69 | // 89ABCDEF 70 | 71 | tmp1 = Avx2.Shuffle(hi, 0x1B); // CDEF89AB 72 | hi = Avx2.Permute2x128(tmp1, tmp1, 1); // 89ABCDEF 73 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo)); 74 | 75 | // STAGE 2: 76 | // BA983210 77 | // CDEF4567 78 | 79 | tmp1 = Avx2.Permute2x128(lo, hi, 0x31); // 89AB7654 80 | tmp1 = Avx2.Shuffle(tmp1, 0x1B); // BA984567 81 | lo = Avx2.Permute2x128(lo, tmp1, 0x30); // BA983210 82 | hi = Avx2.Permute2x128(hi, tmp1, 0x02); // CDEF4567 83 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo)); 84 | 85 | // STAGE 3: 86 | // DC985410 87 | // EFAB6723 88 | 89 | Swap(ref lo, ref hi, AlternatingMaskHi64); // L:CD984510 - H:BAEF3267 90 | lo = Avx2.Shuffle(lo, 0b01001011); // 91 | hi = Avx2.Shuffle(hi, 0b10110100); // 92 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo)); 93 | 94 | // STAGE 4: 95 | // ECA86420 96 | // FDB97531 97 | 98 | Swap(ref lo, ref hi, AlternatingMaskLo32); // L:ECA86420 - H:DF9B5713 99 | hi = Avx2.Shuffle(hi, 0b10110001); 100 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo)); 101 | 102 | // Final stage: restore order. 103 | 104 | tmp1 = Avx2.UnpackLow(lo, hi); 105 | tmp2 = Avx2.UnpackHigh(lo, hi); 106 | lo = Avx2.Permute2x128(tmp1, tmp2, 0x20); 107 | hi = Avx2.Permute2x128(tmp1, tmp2, 0x31); 108 | } 109 | 110 | /// 111 | /// Swaps elements of and where is 1. 
112 | /// 113 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] 114 | static void Swap(ref V lo, ref V hi, V mask) { 115 | var t = Avx2.BlendVariable(lo, hi, mask); 116 | lo = Avx2.BlendVariable(hi, lo, mask); 117 | hi = t; 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /SNBenchmark/Program.cs: --------------------------------------------------------------------------------
using System;

using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Running;

namespace SNBenchmark
{
    /// <summary>
    /// Command-line driver: validates the int or float sorting networks, or runs the benchmarks.
    /// </summary>
    unsafe class Program
    {
        /// <summary>
        /// Dispatches on the first argument: "VI" validates int networks, "VF" validates float networks,
        /// "B" runs benchmarks with the remaining arguments. Anything else prints usage.
        /// The process always terminates through <see cref="Environment.Exit(int)"/> with code 0.
        /// </summary>
        static void Main(string[] args) {
            if (args.Length == 0)
                Usage();    // Does not return; Usage() exits the process.

            if (args[0] == "VI") {
                ValidateInt();
            }
            else if (args[0] == "VF") {
                ValidateFloat();
            }
            else if (args[0] == "B") {
                //var ss = new BenchmarkDotNet.Reports.BenchmarkReport
                // TODO: SummaryStyle; InvariantCulture
                var config = ManualConfig.Create(DefaultConfig.Instance)
                    .WithOptions(ConfigOptions.StopOnFirstError | ConfigOptions.JoinSummary);

                // Everything after "B" is forwarded to BenchmarkDotNet.
                var args1 = new string[args.Length - 1];
                Array.Copy(args, 1, args1, 0, args.Length - 1);

                // The config must be passed explicitly; otherwise the options above are never applied.
                BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args1, config);
            }
            else {
                Usage();
            }

            Environment.Exit(0);
        }

        /// <summary>
        /// Prints the command-line help and terminates the process.
        /// </summary>
        static void Usage() {
            Console.WriteLine("USAGE: {VI | VF | B} [argument...]");
            Console.WriteLine("VI validates int sorting networks for all sizes up to 32.");
            Console.WriteLine("VF validates float sorting networks for all sizes up to 32.");
            Console.WriteLine("B runs benchmarks with arguments following it.");
            Environment.Exit(0);
        }

        /// <summary>
        /// Runs the exhaustive 0-1 validation for int networks of sizes 4 through 32 and prints the
        /// outcome for each size.
        /// </summary>
        static void ValidateInt() {
            for (int size = 4; size <= 32; ++size) {
                var n = SortingNetworks.UnsafeSort<int>.Create(size);
                Console.Write($"Validating size {size:D2}: ");
                try {
                    Validation.Check(n, size);
                    Console.WriteLine("OK");
                }
                catch (NotImplementedException e) {
                    // Validation.Check reports a failed validation with this exception type.
                    Console.WriteLine($"FAILED: {e.Message}");
                }
            }
        }

        /// <summary>
        /// Runs the exhaustive 0-1 validation for float networks of sizes 4 through 32 and prints the
        /// outcome for each size.
        /// </summary>
        static void ValidateFloat() {
            for (int size = 4; size <= 32; ++size) {
                var n = SortingNetworks.UnsafeSort<float>.Create(size);
                Console.Write($"Validating size {size:D2}: ");
                try {
                    Validation.Check(n, size);
                    Console.WriteLine("OK");
                }
                catch (NotImplementedException e) {
                    // Validation.Check reports a failed validation with this exception type.
                    Console.WriteLine($"FAILED: {e.Message}");
                }
            }
        }

        // This exists only for sporadic testing and debugging.
        // Endlessly shuffles the identity permutation, sorts it and checks that the identity is restored.
        static void Test() {
            var d = new int[11157];
            int[] dc;
            var g = new Generators();
            var nn = SortingNetworks.UnsafeSort<int>.Create(d.Length);
            for (int i = 0; i < d.Length; ++i) d[i] = i;

            var iteration = 0;
            while (true) {
                ++iteration;
                //if ((iteration % 1000) == 0)
                //    Console.WriteLine(iteration);
                g.FisherYates(d);
                dc = (int[])d.Clone();  // NOTE(review): never read; presumably kept to inspect the failing input in a debugger.
                fixed (int* p = d)
                    nn.Sorter(p, d.Length);
                for (int i = 0; i < d.Length; ++i)
                    if (d[i] != i)
                        throw new NotImplementedException();
            }
        }

        // Debugging aid: calls every AESRand output method a few times; results are not checked.
        static unsafe void TestAESRand() {
            var r = new SortingNetworks.AESRand(new int[4] { 2, 3, 5, 7, });

            int[] idata = new int[4];
            float[] fdata = new float[4];

            for (int i = 0; i < 4; ++i) {
                fixed (int* p = idata)
                    r.Get4(p);
                fixed (float* p = fdata)
                    r.Get4U(p);
                fixed (float* p = fdata)
                    r.Get4N(p);
            }
        }

        // Debugging aid: calls every MWC1616Rand output method a few times; results are not checked.
        static unsafe void TestMWC1616Rand() {
            var r = new SortingNetworks.MWC1616Rand(new int[8] { 2, 3, 5, 7, 11, 13, 17, 19, });

            int[] idata = new int[4];
            float[] fdata = new float[4];

            for (int i = 0; i < 4; ++i) {
                fixed (int* p = idata)
                    r.Get4(p);
                fixed (float* p = fdata)
                    r.Get4U(p);
                fixed (float* p = fdata)
                    r.Get4N(p);
            }
        }
    }
}
-------------------------------------------------------------------------------- /SortingNetworks/PeriodicInt.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.Intrinsics; 4 | using System.Runtime.Intrinsics.X86; 5 | 6 | namespace SortingNetworks 7 | { 8 | using V = Vector256; 9 | 10 | /// 11 | /// Provides methods for sorting integer arrays of lengths that are a power of two. Invoking public members with 12 | /// arrays that are of shorter length will result in UNDEFINED BEHAVIOR (data corruption, crash). 13 | /// 14 | /// 15 | /// You're not expected to understand this code unless you have read the paper by Dowd et al. 16 | /// 17 | public unsafe partial class PeriodicInt 18 | { 19 | public readonly V Zero; // 00000000 20 | public readonly V Complement; // FFFFFFFF 21 | public readonly V AlternatingMaskLo128; // 0000FFFF 22 | public readonly V AlternatingMaskHi128; // FFFF0000 23 | public readonly V AlternatingMaskHi64; // FF00FF00 24 | public readonly V AlternatingMaskHi32; // F0F0F0F0 25 | public readonly V Max; // int.MaxValue in each element 26 | public readonly V ReversePermutation; // Input to VPERMD that reverses all 8 ints 27 | public readonly V LoadMask; // Input to AlignRight for creating load mask 28 | 29 | public PeriodicInt() { 30 | Zero = V.Zero; 31 | Complement = Avx2.CompareEqual(Zero, Zero); 32 | AlternatingMaskHi128 = Vector256.Create(0L, 0L, -1L, -1L).AsInt32(); 33 | AlternatingMaskLo128 = Vector256.Create(-1L, -1L, 0L, 0L).AsInt32(); 34 | AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8)); 35 | AlternatingMaskHi32 = Avx2.Xor(Complement.AsInt64(), Avx2.ShiftRightLogical(Complement.AsInt64(), 32)).AsInt32(); 36 | Max = Vector256.Create(int.MaxValue); 37 | ReversePermutation = Vector256.Create(7, 6, 5, 4, 3, 2, 1, 0); 38 | } 39 | 40 | // This is the last size that can be reasonably 
inlined due to code size (> 1kB of straight-line code) and # of used registers. 41 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 42 | public unsafe void Sort32(int* data) { 43 | var v0 = Avx.LoadVector256(data + 0); 44 | var v1 = Avx.LoadVector256(data + 8); 45 | var v2 = Avx.LoadVector256(data + 16); 46 | var v3 = Avx.LoadVector256(data + 24); 47 | Block_32_1(2, ref v0, ref v1, ref v2, ref v3); 48 | Block_32_1(3, ref v0, ref v1, ref v2, ref v3); 49 | Block_32_1(4, ref v0, ref v1, ref v2, ref v3); 50 | Block_32_1(5, ref v0, ref v1, ref v2, ref v3); 51 | Block_32_1(5, ref v0, ref v1, ref v2, ref v3); 52 | Avx.Store(data + 0, v0); 53 | Avx.Store(data + 8, v1); 54 | Avx.Store(data + 16, v2); 55 | Avx.Store(data + 24, v3); 56 | } 57 | 58 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 59 | public unsafe void Sort32(int* data, int c) { 60 | throw new NotImplementedException("Sort32"); 61 | } 62 | 63 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 64 | public unsafe void Sort16(int* data) { 65 | var v0 = Avx.LoadVector256(data + 0); 66 | var v1 = Avx.LoadVector256(data + 8); 67 | Block_16_1(2, ref v0, ref v1); 68 | Block_16_1(3, ref v0, ref v1); 69 | Block_16_1(4, ref v0, ref v1); 70 | Block_16_1(4, ref v0, ref v1); 71 | Avx.Store(data + 0, v0); 72 | Avx.Store(data + 8, v1); 73 | } 74 | 75 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 76 | public unsafe void Sort16(int* data, int c) { 77 | throw new NotImplementedException("Sort16"); 78 | } 79 | 80 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 81 | public unsafe void Sort8(int* data) { 82 | var v = Avx.LoadVector256(data); 83 | Block_8_1(2, ref v); 84 | Block_8_1(3, ref v); 85 | Block_8_1(3, ref v); 86 | Avx.Store(data, v); 87 | } 88 | 89 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 90 | public unsafe void Sort8(int* data, int c) { 91 | throw new NotImplementedException("Sort8"); 92 | } 93 | 94 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 95 | 
public unsafe void Sort4(int* data) { 96 | var v = Avx2.MaskLoad(data, AlternatingMaskLo128); 97 | Block_4_2(2, ref v); 98 | Block_4_2(2, ref v); 99 | Avx2.MaskStore(data, AlternatingMaskLo128, v); 100 | } 101 | 102 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 103 | public unsafe void Sort4(int* data, int c) { 104 | throw new NotImplementedException("Sort4"); 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /SortingNetworks/UnsafeRandom.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.Intrinsics; 3 | using System.Runtime.Intrinsics.X86; 4 | 5 | namespace SortingNetworks 6 | { 7 | /// 8 | /// Provides methods for fast, "unsafe" generation of integer or floating-point random numbers. 9 | /// 10 | public abstract class UnsafeRandom 11 | { 12 | readonly Vector128 oneMask; 13 | readonly Vector128 one; 14 | readonly Vector128 complement; 15 | 16 | protected UnsafeRandom() { 17 | oneMask = Vector128.Create(0x3F800000); 18 | one = Vector128.Create(1.0f); 19 | complement = Vector128.Create(-1); 20 | } 21 | 22 | /// 23 | /// Returns 4 random numbers in a vector. 24 | /// 25 | public abstract Vector128 Get4(); 26 | 27 | /// 28 | /// Overwrites the initial 4 elements of with random 32-bit integers. 29 | /// 30 | /// 31 | /// Pointer to a memory chunk of at least 4 integers. Behaviour is UNDEFINED if the allocated 32 | /// space for the chunk is shorter. 33 | /// 34 | public unsafe void Get4(int* data) { 35 | var v = Get4(); 36 | Sse2.Store(data, v); 37 | } 38 | 39 | /// 40 | /// Overwrites initial elements of with random 32-bit integers. 41 | /// 42 | /// 43 | /// Pointer to a memory chunk of at least integers. Behaviour is UNDEFINED if the allocated 44 | /// space for the chunk is shorter. 45 | /// 46 | /// 47 | /// Number of elements to write. Must be between 0 and 4. 
48 | /// 
public unsafe void Get(int* data, int c) {
    var random = Get4();

    // Shifting the all-ones vector right by 4*(4-c) bytes leaves ones only in the c lowest lanes,
    // so the masked store writes just those elements.
    var byteShift = (byte)(4 * (4 - c));
    var storeMask = Sse2.ShiftRightLogical128BitLane(complement, byteShift);

    Avx2.MaskStore(data, storeMask, random);
}

/// <summary>
/// Fills the first 4 elements of <paramref name="data"/> with random 32-bit integers converted to
/// floats, i.e. values in range [-2^31, 2^31).
/// </summary>
/// <param name="data">
/// Pointer to a memory chunk of at least 4 floats.
/// Behaviour is UNDEFINED if the allocated space for the chunk is shorter.
/// </param>
public unsafe void Get4U(float* data) {
    Sse.Store(data, Sse2.ConvertToVector128Single(Get4()));
}

/// <summary>
/// Fills the first <paramref name="c"/> elements of <paramref name="data"/> with random 32-bit
/// integers converted to floats, i.e. values in range [-2^31, 2^31).
/// </summary>
/// <param name="data">
/// Pointer to a memory chunk of at least <paramref name="c"/> floats. Behaviour is UNDEFINED if the
/// allocated space for the chunk is shorter.
/// </param>
/// <param name="c">
/// Number of elements to write. Must be between 0 and 4.
/// </param>
public unsafe void Get4U(float* data, int c) {
    var random = Get4();

    // Ones in the c lowest lanes only; see the byte shift of the all-ones vector.
    var byteShift = (byte)(4 * (4 - c));
    var storeMask = Sse2.ShiftRightLogical128BitLane(complement, byteShift).AsSingle();

    var converted = Sse2.ConvertToVector128Single(random);
    Avx.MaskStore(data, storeMask, converted);
}

/// <summary>
/// Fills the first 4 elements of <paramref name="data"/> with floats in range [0, 1).
/// </summary>
/// <param name="data">
/// Pointer to a memory chunk of at least 4 floats.
/// Behaviour is UNDEFINED if the allocated space for the chunk is shorter.
/// </param>
public unsafe void Get4N(float* data) {
    var random = Get4();

    // The 23 most significant random bits become the mantissa and OR-ing in the bit pattern of 1.0f
    // yields a float in [1.0, 2.0); subtracting 1.0 moves it to [0.0, 1.0).
    var mantissa = Sse2.ShiftRightLogical(random, 9);
    var oneToTwo = Sse2.Or(mantissa, oneMask).AsSingle();

    Sse.Store(data, Sse.Subtract(oneToTwo, one));
}

100 | /// 101 | /// Overwrites the initial 4 elements of with floats in range [0, 1). 102 | /// 103 | /// 104 | /// Pointer to a memory chunk of at least floats. Behaviour is UNDEFINED if the 105 | /// allocated space for the chunk is shorter. 
106 | /// 107 | /// 108 | /// Number of elements to write. Must be between 0 and 4. 109 | /// 110 | public unsafe void Get4N(float* data, int c) { 111 | var v = Get4(); 112 | v = Sse2.Or(Sse2.ShiftRightLogical(v, 9), oneMask); 113 | var m = Sse2.ShiftRightLogical128BitLane(complement, (byte)(4 * (4 - c))); 114 | var f = Sse.Subtract(v.AsSingle(), one); 115 | Avx.MaskStore(data, m.AsSingle(), f); 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /SortingNetworks/PeriodicInt_Block.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.Intrinsics; 4 | using System.Runtime.Intrinsics.X86; 5 | 6 | namespace SortingNetworks 7 | { 8 | using V = Vector256; 9 | 10 | partial class PeriodicInt 11 | { 12 | /// 13 | /// Used to implement a single compare-swap phase for N elements; this processes 32 items at a time. 14 | /// Range [b, b+16) is compared/exchanged with reversed range [e-16, e).. 15 | /// 16 | /// 17 | /// TODO: Should use aligned loads and non-temporal stores once it's possible to allocate aligned storage in .NET. 18 | /// 19 | /// Start of the block. 20 | /// One past the end of the block. 21 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] 22 | public unsafe void Phase_N_32(int* b, int* e) { 23 | V m0, m1; 24 | 25 | // Low half. 26 | var v0 = Avx.LoadVector256(b + 0); 27 | var v1 = Avx.LoadVector256(b + 8); 28 | 29 | // High half. Interleave loads with reversing. 30 | var v2 = Avx2.PermuteVar8x32(Avx.LoadVector256(e - 16), ReversePermutation); 31 | var v3 = Avx2.PermuteVar8x32(Avx.LoadVector256(e - 8), ReversePermutation); 32 | 33 | // Comparisons, interleaved with stores. Min/max have throughput of 0.5, so we can execute two at once. 34 | // Use m0 and m1 to exploit the fact that min/max have a throughput < 1. 
35 | m0 = Avx2.Min(v0, v3); 36 | m1 = Avx2.PermuteVar8x32(Avx2.Max(v0, v3), ReversePermutation); 37 | Avx.Store(b + 0, m0); 38 | Avx.Store(e - 8, m1); 39 | 40 | m0 = Avx2.Min(v1, v2); 41 | m1 = Avx2.PermuteVar8x32(Avx2.Max(v1, v2), ReversePermutation); 42 | Avx.Store(b + 8, m0); 43 | Avx.Store(e - 16, m1); 44 | } 45 | 46 | /// 47 | /// Block for sorting one vector of 32 elements (four registers). 48 | /// 49 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 50 | public void Block_32_1(int p, ref V _v0, ref V _v1, ref V _v2, ref V _v3) { 51 | V v0 = _v0, v1 = _v1, v2, v3, m0, m1; 52 | 53 | v2 = Avx2.PermuteVar8x32(_v2, ReversePermutation); 54 | v3 = Avx2.PermuteVar8x32(_v3, ReversePermutation); 55 | m0 = Avx2.Max(v0, v3); 56 | m1 = Avx2.Max(v1, v2); 57 | v0 = Avx2.Min(v0, v3); 58 | v1 = Avx2.Min(v1, v2); 59 | v2 = Avx2.PermuteVar8x32(m1, ReversePermutation); 60 | v3 = Avx2.PermuteVar8x32(m0, ReversePermutation); 61 | if (p == 1) 62 | goto done; 63 | 64 | Block_16_1(p - 1, ref v0, ref v1); 65 | Block_16_1(p - 1, ref v2, ref v3); 66 | 67 | done: 68 | _v0 = v0; _v1 = v1; 69 | _v2 = v2; _v3 = v3; 70 | } 71 | 72 | /// 73 | /// Block for sorting one vector of 16 elements (two registers). 74 | /// 75 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] 76 | public void Block_16_1(int p, ref V _v0, ref V _v1) { 77 | V v0 = _v0, v1, m; 78 | 79 | v1 = Avx2.PermuteVar8x32(_v1, ReversePermutation); 80 | m = Avx2.Max(v0, v1); 81 | v0 = Avx2.Min(v0, v1); 82 | v1 = Avx2.PermuteVar8x32(m, ReversePermutation); 83 | if (p == 1) 84 | goto done; 85 | 86 | Block_8_1(p - 1, ref v0); 87 | Block_8_1(p - 1, ref v1); 88 | 89 | done: 90 | _v0 = v0; 91 | _v1 = v1; 92 | } 93 | 94 | /// 95 | /// Block for sorting one vector of 8 elements. 
96 | /// 97 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] 98 | public void Block_8_1(int p, ref V _v) { 99 | V v0 = _v, v1, m; 100 | 101 | // PHASE1: 102 | // 76543210 103 | // 01234567 104 | 105 | v1 = Avx2.PermuteVar8x32(v0, ReversePermutation); 106 | m = Avx2.CompareGreaterThan(v0, v1); 107 | m = Avx2.Xor(m, AlternatingMaskHi128); 108 | v0 = Avx2.BlendVariable(v0, v1, m); 109 | if (p == 1) 110 | goto done; 111 | 112 | Block_4_2(p - 1, ref v0); 113 | 114 | done: 115 | _v = v0; 116 | } 117 | 118 | /// 119 | /// Block for sorting 2 independent vectors of 4 elements each. 120 | /// 121 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] 122 | public void Block_4_2(int p, ref V _v) { 123 | V v0 = _v, v1, m; 124 | 125 | // PHASE1: 126 | // 3210 (INPUT, same in both lanes) 127 | // 0123 128 | 129 | v1 = Avx2.Shuffle(v0, 0x1B); // 0123 130 | m = Avx2.CompareGreaterThan(v0, v1); 131 | m = Avx2.Xor(m, AlternatingMaskHi64); 132 | v0 = Avx2.BlendVariable(v0, v1, m); 133 | if (p == 1) 134 | goto done; 135 | 136 | // PHASE2: 137 | // 3210 138 | // 2301 139 | 140 | v1 = Avx2.Shuffle(v0, 0b10110001); // 2301 141 | m = Avx2.CompareGreaterThan(v0, v1); 142 | m = Avx2.Xor(m, AlternatingMaskHi32); 143 | v0 = Avx2.BlendVariable(v0, v1, m); 144 | 145 | done: 146 | _v = v0; 147 | } 148 | } 149 | } 150 | -------------------------------------------------------------------------------- /SortingNetworks/Attic/PeriodicInt_Block.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Runtime.CompilerServices; 3 | using System.Runtime.Intrinsics; 4 | using System.Runtime.Intrinsics.X86; 5 | 6 | namespace SortingNetworks.Attic 7 | { 8 | using V = Vector256; 9 | 10 | public partial class PeriodicInt 11 | { 12 | /// 13 | /// Operations of a (potentially partial) 32-block. 
The integers are ordered from the least significant element in 14 | /// to the most significant element in . 15 | /// 16 | /// Phase to stop at; must be 1-5. 17 | [MethodImpl(MethodImplOptions.AggressiveOptimization)] 18 | void Block32(int p, ref V _v0, ref V _v1, ref V _v2, ref V _v3) { 19 | var v2 = Avx2.Permute2x128(_v2, _v2, 0x01); 20 | var v3 = Avx2.Permute2x128(_v3, _v3, 0x01); 21 | v2 = Avx2.Shuffle(v2, 0x1B); 22 | v3 = Avx2.Shuffle(v3, 0x1B); 23 | 24 | Swap(ref _v0, ref v3, Avx2.CompareGreaterThan(v3, _v0)); // 0-7 : 31:24 25 | Swap(ref _v1, ref v2, Avx2.CompareGreaterThan(v2, _v1)); // 8-15 : 23:16 26 | 27 | v2 = Avx2.Shuffle(v2, 0x1B); 28 | v3 = Avx2.Shuffle(v3, 0x1B); 29 | v2 = Avx2.Permute2x128(v2, v2, 0x01); 30 | v3 = Avx2.Permute2x128(v3, v3, 0x01); 31 | 32 | Block16(p - 1, ref _v0, ref _v1); 33 | Block16(p - 1, ref v2, ref v3); 34 | _v2 = v2; _v3 = v3; 35 | } 36 | 37 | /// 38 | /// Performs the operations of a single, potentially partial, 16-block. The integers in each half (vector parameter) 39 | /// are ordered from least to most significant bits. 40 | /// 41 | /// Phase to stop at; must be 1, 2, 3 or 4. Unchecked; 4 or any other value will run the whole block. 42 | /// Low half of elements to sort. 43 | /// High half of elements to sort. 44 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] 45 | void Block16(int p, ref V _lo, ref V _hi) { 46 | V lo = _lo, hi = _hi; // Stack-allocated to eliminate unnecessary loads/stores to refs 47 | V tmp1, tmp2; 48 | 49 | // INPUT: 50 | // 76543210 51 | // FEDCBA98 52 | // lo, hi are intermediate results after each stage and input to next one. 
53 | 54 | // PHASE 1: 55 | // 76543210 56 | // 89ABCDEF 57 | 58 | tmp1 = Avx2.Shuffle(hi, 0x1B); // CDEF89AB 59 | hi = Avx2.Permute2x128(tmp1, tmp1, 0x01); // 89ABCDEF 60 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo)); 61 | if (p == 1) { 62 | hi = Avx2.Permute2x128(hi, hi, 0x01); 63 | hi = Avx2.Shuffle(hi, 0x1B); 64 | _lo = lo; _hi = hi; 65 | return; 66 | } 67 | 68 | // PHASE 2: 69 | // BA983210 70 | // CDEF4567 71 | 72 | tmp1 = Avx2.Permute2x128(lo, hi, 0x31); // 89AB7654 73 | tmp1 = Avx2.Shuffle(tmp1, 0x1B); // BA984567 74 | lo = Avx2.Permute2x128(lo, tmp1, 0x30); // BA983210 75 | hi = Avx2.Permute2x128(hi, tmp1, 0x02); // CDEF4567 76 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo)); 77 | if (p == 2) { 78 | hi = Avx2.Shuffle(hi, 0x1B); // FEDC7654 79 | tmp1 = Avx2.Permute2x128(lo, hi, 0x21); // 7654BA98 80 | Swap(ref lo, ref tmp1, AlternatingMaskLo128); 81 | Swap(ref hi, ref tmp1, AlternatingMaskHi128); 82 | _lo = lo; _hi = hi; 83 | return; 84 | } 85 | 86 | // PHASE 3: 87 | // DC985410 88 | // EFAB6723 89 | 90 | Swap(ref lo, ref hi, AlternatingMaskHi64); // L:CD984510 - H:BAEF3267 91 | lo = Avx2.Shuffle(lo, 0b01001011); // 92 | hi = Avx2.Shuffle(hi, 0b10110100); // 93 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo)); 94 | if (p == 3) { 95 | hi = Avx2.Shuffle(hi, 0b10110001); // FEBA7632 96 | tmp1 = Avx2.UnpackLow(lo.AsInt64(), 97 | hi.AsInt64()).AsInt32(); // BA983210 98 | tmp2 = Avx2.UnpackHigh(lo.AsInt64(), 99 | hi.AsInt64()).AsInt32(); // FEDC7654 100 | goto fixup; 101 | } 102 | 103 | // PHASE 4: 104 | // ECA86420 105 | // FDB97531 106 | 107 | Swap(ref lo, ref hi, AlternatingMaskLo32); // L:ECA86420 - H:DF9B5713 108 | hi = Avx2.Shuffle(hi, 0b10110001); 109 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo)); 110 | 111 | // Final stage: restore order. 
112 | 113 | tmp1 = Avx2.UnpackLow(lo, hi); 114 | tmp2 = Avx2.UnpackHigh(lo, hi); 115 | 116 | fixup: 117 | lo = Avx2.Permute2x128(tmp1, tmp2, 0x20); 118 | hi = Avx2.Permute2x128(tmp1, tmp2, 0x31); 119 | _lo = lo; _hi = hi; 120 | } 121 | 122 | /// 123 | /// Swaps elements of and where is 1. 124 | /// 125 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] 126 | static void Swap(ref V lo, ref V hi, V mask) { 127 | var t = Avx2.BlendVariable(lo, hi, mask); 128 | lo = Avx2.BlendVariable(hi, lo, mask); 129 | hi = t; 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /SortingNetworks/Attic/Periodic16Expr.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq.Expressions; 4 | using System.Reflection; 5 | using System.Runtime.Intrinsics; 6 | using System.Runtime.Intrinsics.X86; 7 | 8 | namespace SortingNetworks.Attic 9 | { 10 | using V = System.Runtime.Intrinsics.Vector256; 11 | 12 | /// 13 | /// Builds an expression for periodic sorting network as compiled lambda. 14 | /// The network is hard-coded to 16 elements. 
15 | /// 16 | public class Periodic16Expr 17 | { 18 | unsafe delegate void RegisterSort(ref V lo, ref V hi); 19 | 20 | static readonly Type TAVX = typeof(Avx); 21 | static readonly Type TAVX2 = typeof(Avx2); 22 | static readonly Type TV = typeof(V); 23 | 24 | // All zeros 25 | static readonly V Zero = Vector256.Create(0); 26 | // All ones 27 | static readonly V Complement = Avx2.CompareEqual(Zero, Zero); 28 | // FF00FF00 (1 digit = 32 bits) 29 | static readonly V AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8)); 30 | // F0F0F0F0 31 | static readonly V AlternatingMaskLo32 = Avx2.Xor( 32 | Complement.AsInt64(), 33 | Avx2.ShiftLeftLogical(Complement.AsInt64(), 32) 34 | ).AsInt32(); 35 | 36 | static readonly MethodInfo Shuffle = TAVX2.GetMethod("Shuffle", new Type[] { TV, typeof(byte) }); 37 | static readonly MethodInfo BlendVariable = TAVX2.GetMethod("BlendVariable", new Type[] { TV, TV, TV }); 38 | static readonly MethodInfo Permute2x128 = TAVX2.GetMethod("Permute2x128", new Type[] { TV, TV, typeof(byte) }); 39 | static readonly MethodInfo CompareGreaterThan = TAVX2.GetMethod("CompareGreaterThan", new Type[] { TV, TV }); 40 | static readonly MethodInfo UnpackLow = TAVX2.GetMethod("UnpackLow", new Type[] { TV, TV }); 41 | static readonly MethodInfo UnpackHigh = TAVX2.GetMethod("UnpackHigh", new Type[] { TV, TV }); 42 | 43 | readonly ParameterExpression lo; 44 | readonly ParameterExpression hi; 45 | readonly ParameterExpression tmp1; 46 | readonly ParameterExpression tmp2; 47 | readonly ParameterExpression tmp3; 48 | readonly RegisterSort sort; 49 | 50 | public Periodic16Expr() { 51 | lo = Expression.Parameter(TV.MakeByRefType(), "lo"); 52 | hi = Expression.Parameter(TV.MakeByRefType(), "hi"); 53 | tmp1 = Expression.Variable(TV, "tmp1"); 54 | tmp2 = Expression.Variable(TV, "tmp2"); 55 | tmp3 = Expression.Variable(TV, "tmp3"); 56 | 57 | var steps = new List(); 58 | for (int i = 0; i < 4; ++i) 59 | steps.AddRange(Step()); 60 | 
61 | var l = Expression.Lambda( 62 | Expression.Block(new ParameterExpression[] { tmp1, tmp2, tmp3 }, steps), 63 | new ParameterExpression[] { lo, hi }); 64 | sort = l.Compile(false); 65 | } 66 | 67 | public unsafe void Sort(int* data) { 68 | var lo = Avx.LoadVector256(data); 69 | var hi = Avx.LoadVector256(data + 8); 70 | sort(ref lo, ref hi); 71 | Avx.Store(data, lo); 72 | Avx.Store(data + 8, hi); 73 | } 74 | 75 | private List Step() { 76 | var es = new List(); 77 | 78 | // STAGE1 79 | 80 | es.AddRange(new Expression[] { 81 | Expression.Assign(tmp1, Expression.Call(Shuffle, hi, Expression.Constant((byte)0x1B))), 82 | Expression.Assign(hi, Expression.Call(Permute2x128, tmp1, tmp1, Expression.Constant((byte)1))), 83 | Expression.Assign(tmp1, Expression.Call(CompareGreaterThan, hi, lo)) 84 | }); 85 | es.AddRange(Swap(lo, hi, tmp1)); 86 | 87 | // STAGE2 88 | 89 | es.AddRange(new Expression[] { 90 | Expression.Assign(tmp1, Expression.Call(Permute2x128, lo, hi, Expression.Constant((byte)0x31))), 91 | Expression.Assign(tmp1, Expression.Call(Shuffle, tmp1, Expression.Constant((byte)0x1B))), 92 | Expression.Assign(lo, Expression.Call(Permute2x128, lo, tmp1, Expression.Constant((byte)0x30))), 93 | Expression.Assign(hi, Expression.Call(Permute2x128, hi, tmp1, Expression.Constant((byte)0x02))), 94 | Expression.Assign(tmp1, Expression.Call(CompareGreaterThan, hi, lo)) 95 | }); 96 | es.AddRange(Swap(lo, hi, tmp1)); 97 | 98 | // STAGE3 99 | 100 | es.AddRange(Swap(lo, hi, Expression.Constant(AlternatingMaskHi64))); 101 | es.AddRange(new Expression[] { 102 | Expression.Assign(lo, Expression.Call(Shuffle, lo, Expression.Constant((byte)0b01001011))), 103 | Expression.Assign(hi, Expression.Call(Shuffle, hi, Expression.Constant((byte)0b10110100))), 104 | Expression.Assign(tmp1, Expression.Call(CompareGreaterThan, hi, lo)) 105 | }); 106 | es.AddRange(Swap(lo, hi, tmp1)); 107 | 108 | // STAGE4 109 | 110 | es.AddRange(Swap(lo, hi, Expression.Constant(AlternatingMaskLo32))); 111 | 
es.AddRange(new Expression[] { 112 | Expression.Assign(hi, Expression.Call(Shuffle, hi, Expression.Constant((byte)0b10110001))), 113 | Expression.Assign(tmp1, Expression.Call(CompareGreaterThan, hi, lo)) 114 | }); 115 | es.AddRange(Swap(lo, hi, tmp1)); 116 | 117 | // RESTORE ORDER. 118 | es.AddRange(new Expression[] { 119 | Expression.Assign(tmp1, Expression.Call(UnpackLow, lo, hi)), 120 | Expression.Assign(tmp2, Expression.Call(UnpackHigh, lo, hi)), 121 | Expression.Assign(lo, Expression.Call(Permute2x128, tmp1, tmp2, Expression.Constant((byte)0x20))), 122 | Expression.Assign(hi, Expression.Call(Permute2x128, tmp1, tmp2, Expression.Constant((byte)0x31))) 123 | }); 124 | 125 | return es; 126 | } 127 | 128 | private Expression[] Swap(ParameterExpression lo, ParameterExpression hi, Expression mask) { 129 | return new Expression[] { 130 | Expression.Assign(tmp3, Expression.Call(BlendVariable, lo, hi, mask)), 131 | Expression.Assign(lo, Expression.Call(BlendVariable, hi, lo, mask)), 132 | Expression.Assign(hi, tmp3) 133 | }; 134 | } 135 | } 136 | } 137 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Ww][Ii][Nn]32/ 27 | [Aa][Rr][Mm]/ 28 | [Aa][Rr][Mm]64/ 29 | bld/ 30 | [Bb]in/ 31 | [Oo]bj/ 32 | [Oo]ut/ 33 | [Ll]og/ 34 | [Ll]ogs/ 35 | 36 | # Visual Studio 2015/2017 cache/options directory 37 | .vs/ 38 | # Uncomment if you have tasks that create the project's static files in wwwroot 39 | #wwwroot/ 40 | 41 | # Visual Studio 2017 auto generated files 42 | Generated\ Files/ 43 | 44 | # MSTest test Results 45 | [Tt]est[Rr]esult*/ 46 | [Bb]uild[Ll]og.* 47 | 48 | # NUnit 49 | *.VisualState.xml 50 | TestResult.xml 51 | nunit-*.xml 52 | 53 | # Build Results of an ATL Project 54 | [Dd]ebugPS/ 55 | [Rr]eleasePS/ 56 | dlldata.c 57 | 58 | # Benchmark Results 59 | BenchmarkDotNet.Artifacts/ 60 | 61 | # .NET Core 62 | project.lock.json 63 | project.fragment.lock.json 64 | artifacts/ 65 | 66 | # ASP.NET Scaffolding 67 | ScaffoldingReadMe.txt 68 | 69 | # StyleCop 70 | StyleCopReport.xml 71 | 72 | # Files built by Visual Studio 73 | *_i.c 74 | *_p.c 75 | *_h.h 76 | *.ilk 77 | *.meta 78 | *.obj 79 | *.iobj 80 | *.pch 81 | *.pdb 82 | *.ipdb 83 | *.pgc 84 | *.pgd 85 | *.rsp 86 | *.sbr 87 | *.tlb 88 | *.tli 89 | *.tlh 90 | *.tmp 91 | *.tmp_proj 92 | *_wpftmp.csproj 93 | *.log 94 | *.vspscc 95 | *.vssscc 96 | .builds 97 | *.pidb 98 | *.svclog 99 | *.scc 100 | 101 | # Chutzpah Test files 102 | _Chutzpah* 103 | 104 | # Visual C++ cache files 105 | ipch/ 106 | *.aps 107 | *.ncb 108 | *.opendb 109 | *.opensdf 110 | *.sdf 111 | *.cachefile 112 | *.VC.db 113 | *.VC.VC.opendb 114 | 115 | # Visual Studio profiler 116 | *.psess 117 | *.vsp 
118 | *.vspx 119 | *.sap 120 | 121 | # Visual Studio Trace Files 122 | *.e2e 123 | 124 | # TFS 2012 Local Workspace 125 | $tf/ 126 | 127 | # Guidance Automation Toolkit 128 | *.gpState 129 | 130 | # ReSharper is a .NET coding add-in 131 | _ReSharper*/ 132 | *.[Rr]e[Ss]harper 133 | *.DotSettings.user 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Coverlet is a free, cross platform Code Coverage Tool 146 | coverage*.json 147 | coverage*.xml 148 | coverage*.info 149 | 150 | # Visual Studio code coverage results 151 | *.coverage 152 | *.coveragexml 153 | 154 | # NCrunch 155 | _NCrunch_* 156 | .*crunch*.local.xml 157 | nCrunchTemp_* 158 | 159 | # MightyMoose 160 | *.mm.* 161 | AutoTest.Net/ 162 | 163 | # Web workbench (sass) 164 | .sass-cache/ 165 | 166 | # Installshield output folder 167 | [Ee]xpress/ 168 | 169 | # DocProject is a documentation generator add-in 170 | DocProject/buildhelp/ 171 | DocProject/Help/*.HxT 172 | DocProject/Help/*.HxC 173 | DocProject/Help/*.hhc 174 | DocProject/Help/*.hhk 175 | DocProject/Help/*.hhp 176 | DocProject/Help/Html2 177 | DocProject/Help/html 178 | 179 | # Click-Once directory 180 | publish/ 181 | 182 | # Publish Web Output 183 | *.[Pp]ublish.xml 184 | *.azurePubxml 185 | # Note: Comment the next line if you want to checkin your web deploy settings, 186 | # but database connection strings (with potential passwords) will be unencrypted 187 | *.pubxml 188 | *.publishproj 189 | 190 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 191 | # checkin your Azure Web App publish settings, but sensitive information contained 192 | # in these scripts will be unencrypted 193 | PublishScripts/ 194 | 195 | # NuGet Packages 196 | *.nupkg 197 | # NuGet Symbol Packages 198 | *.snupkg 199 | # The packages folder can be ignored because of Package Restore 200 | **/[Pp]ackages/* 201 | # except build/, which is used as an MSBuild target. 202 | !**/[Pp]ackages/build/ 203 | # Uncomment if necessary however generally it will be regenerated when needed 204 | #!**/[Pp]ackages/repositories.config 205 | # NuGet v3's project.json files produces more ignorable files 206 | *.nuget.props 207 | *.nuget.targets 208 | 209 | # Microsoft Azure Build Output 210 | csx/ 211 | *.build.csdef 212 | 213 | # Microsoft Azure Emulator 214 | ecf/ 215 | rcf/ 216 | 217 | # Windows Store app package directories and files 218 | AppPackages/ 219 | BundleArtifacts/ 220 | Package.StoreAssociation.xml 221 | _pkginfo.txt 222 | *.appx 223 | *.appxbundle 224 | *.appxupload 225 | 226 | # Visual Studio cache files 227 | # files ending in .cache can be ignored 228 | *.[Cc]ache 229 | # but keep track of directories ending in .cache 230 | !?*.[Cc]ache/ 231 | 232 | # Others 233 | ClientBin/ 234 | ~$* 235 | *~ 236 | *.dbmdl 237 | *.dbproj.schemaview 238 | *.jfm 239 | *.pfx 240 | *.publishsettings 241 | orleans.codegen.cs 242 | 243 | # Including strong name files can present a security risk 244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 245 | #*.snk 246 | 247 | # Since there are multiple workflows, uncomment next line to ignore bower_components 248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 249 | #bower_components/ 250 | 251 | # RIA/Silverlight projects 252 | Generated_Code/ 253 | 254 | # Backup & report files from converting an old project file 255 | # to a newer Visual Studio version. 
Backup files are not needed, 256 | # because we have git ;-) 257 | _UpgradeReport_Files/ 258 | Backup*/ 259 | UpgradeLog*.XML 260 | UpgradeLog*.htm 261 | ServiceFabricBackup/ 262 | *.rptproj.bak 263 | 264 | # SQL Server files 265 | *.mdf 266 | *.ldf 267 | *.ndf 268 | 269 | # Business Intelligence projects 270 | *.rdl.data 271 | *.bim.layout 272 | *.bim_*.settings 273 | *.rptproj.rsuser 274 | *- [Bb]ackup.rdl 275 | *- [Bb]ackup ([0-9]).rdl 276 | *- [Bb]ackup ([0-9][0-9]).rdl 277 | 278 | # Microsoft Fakes 279 | FakesAssemblies/ 280 | 281 | # GhostDoc plugin setting file 282 | *.GhostDoc.xml 283 | 284 | # Node.js Tools for Visual Studio 285 | .ntvs_analysis.dat 286 | node_modules/ 287 | 288 | # Visual Studio 6 build log 289 | *.plg 290 | 291 | # Visual Studio 6 workspace options file 292 | *.opt 293 | 294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 295 | *.vbw 296 | 297 | # Visual Studio LightSwitch build output 298 | **/*.HTMLClient/GeneratedArtifacts 299 | **/*.DesktopClient/GeneratedArtifacts 300 | **/*.DesktopClient/ModelManifest.xml 301 | **/*.Server/GeneratedArtifacts 302 | **/*.Server/ModelManifest.xml 303 | _Pvt_Extensions 304 | 305 | # Paket dependency manager 306 | .paket/paket.exe 307 | paket-files/ 308 | 309 | # FAKE - F# Make 310 | .fake/ 311 | 312 | # CodeRush personal settings 313 | .cr/personal 314 | 315 | # Python Tools for Visual Studio (PTVS) 316 | __pycache__/ 317 | *.pyc 318 | 319 | # Cake - Uncomment if you are using it 320 | # tools/** 321 | # !tools/packages.config 322 | 323 | # Tabs Studio 324 | *.tss 325 | 326 | # Telerik's JustMock configuration file 327 | *.jmconfig 328 | 329 | # BizTalk build output 330 | *.btp.cs 331 | *.btm.cs 332 | *.odx.cs 333 | *.xsd.cs 334 | 335 | # OpenCover UI analysis results 336 | OpenCover/ 337 | 338 | # Azure Stream Analytics local run output 339 | ASALocalRun/ 340 | 341 | # MSBuild Binary and Structured Log 342 | *.binlog 343 | 344 | # NVidia Nsight GPU debugger 
configuration file 345 | *.nvuser 346 | 347 | # MFractors (Xamarin productivity tool) working folder 348 | .mfractor/ 349 | 350 | # Local History for Visual Studio 351 | .localhistory/ 352 | 353 | # BeatPulse healthcheck temp database 354 | healthchecksdb 355 | 356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 357 | MigrationBackup/ 358 | 359 | # Ionide (cross platform F# VS Code tools) working folder 360 | .ionide/ 361 | 362 | # Fody - auto-generated XML schema 363 | FodyWeavers.xsd 364 | /SNBenchmark/Properties/launchSettings.json 365 | /SortingNetworks/Properties/launchSettings.json 366 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ARCHIVED 2 | 3 | This repository has been archived and development has been moved [here](https://github.com/zvrba/Podaga). 4 | 5 | # Sorting networks 6 | 7 | Playground for exploring implementation techniques for sorting networks. These can sort small arrays much faster 8 | than `Array.Sort()`; depending on the size (4-32) and pattern, the speedup is 3-6X. See [benchmarks](#benchmarks) below. 9 | The generated assembly in Release mode is lean and mean, and seems comparable with what would have been generated by 10 | a C++ compiler. 11 | 12 | ## Changes since v1 13 | 14 | - Implemented Fisher-Yates shuffle; it is now used in benchmarks for more reliable validation of sorters. 15 | - Support for arbitrary length arrays, i.e., not just lengths that are power of 2. 16 | - Exhaustive validation now checks sorters for all lengths 4-32. 17 | - Added (32-bit) float sorter. 18 | 19 | # Project structure 20 | 21 | The projects are developed with Visual Studio 2019 and target netcore3.1. The solution consists of two projects. 22 | 23 | ## SNBenchmark 24 | 25 | This project depends on BenchmarkDotNet.
It contains validation code, benchmarks and demonstrates the use of sorting methods. 26 | The main program must be run with a single argument: `VI`, `VF` or `B`. 27 | 28 | When run with `VI`, it runs an exhaustive validation of integer networks for element counts of up to 32. When run with `VF` 29 | it runs an exhaustive validation of float networks for element counts of up to 32. Larger sizes are infeasible, as `2^N` 30 | zero/one inputs would have to be tested. 31 | 32 | When run with "B", it passes the rest of the arguments to BenchmarkDotNet. Without any additional arguments, it will present a menu. 33 | All benchmarks call `Environment.FailFast` if the result is found to be unsorted so that this can be detected in the logs. 34 | 35 | ## SortingNetworks 36 | 37 | `SortingNetworks` project is the main code and has no dependencies. The high-performance public types use `unsafe` 38 | code and can only be used from `unsafe` methods. The code depends on AVX2 instruction set. In addition, `AESRand` 39 | class depends on AES-NI instruction set. 40 | 41 | ### Sorting 42 | 43 | The main interface is `UnsafeSort` class which exposes a static factory function. The actual sorting code is in 44 | `IntSorter` and `FloatSorter` classes. You are not expected to understand how it works without studying [references](#references). 45 | The code to handle lengths that are not power of two introduces some overhead even for small arrays, so `PeriodicInt` class is 46 | provided with methods for sorting arrays of lengths 4, 8, 16 and 32; see [benchmarks](#benchmarks) below. 47 | 48 | `UnsafeSort` and `PeriodicInt` classes have no mutable internal state, so it is recommended to use a single (non-static) instance 49 | throughout the program (see remark about statics below). 50 | 51 | Directory `Attic` contains the (failed) experiment with expression trees and earlier (less performant) iterations of the 52 | periodic network. 
53 | 54 | ### Random numbers 55 | 56 | This subsystem consists of three classes: an abstract `UnsafeRandom` class and two concrete classes: `AESRand` and `MWC1616Rand`. 57 | These can be instantiated directly. **NB!** The correctness of the code and the quality of random numbers have not been verified! 58 | Benchmarks use `MWC1616Rand` with a fixed seed as `AESRand` seemed to generate some obvious patterns. 59 | 60 | # Lessons learned 61 | These were learned by inspecting the generated assembly code in Release mode. 62 | 63 | Accessing static data has more overhead than accessing instance data: extraneous CALL instructions into the runtime 64 | are generated. My guess is that these ensure thread-safe, once-only static initialization semantics. 65 | 66 | Accessing `ref` parameters as in `Periodic16Branchless` generates a lot of load/store instructions. 67 | It is much more efficient to load ref parameters into locals at the beginning of the procedure and store 68 | results at the end, as in `PeriodicInt`. 69 | 70 | `Periodic16Expr` demonstrates how to build a sorter with expression trees. The generated assembly is OK, 71 | save for the long prologue/epilogue sequences. This makes the overhead of calling a lambda compiled at run-time 72 | way too big for this application. 73 | 74 | `unsafe` is not viral: Method `A` is allowed to call `unsafe` method `B` without `A` having to be marked 75 | unsafe as well. Also, it is allowed to assign an `unsafe` method to a non-unsafe delegate variable. 76 | 77 | `System.Random` does not have consistent timing: when used in the baseline benchmark, the results almost always 78 | contained a warning about it having a bimodal distribution. This makes it rather unusable in baseline benchmarks. 79 | Therefore `UnsafeRandom`, `AESRand` and `MWC1616Rand` classes were implemented. Of these, only MWC is being used. 80 | 81 | Generics suck for numeric code.
I couldn't figure out how to write a generic `bool IsSorted<T>(T[])` method that'd 82 | work for any numeric type. Adding `where T : unmanaged` doesn't help as the compiler doesn't know that unmanaged 83 | types are comparable with less-than and equal. Nor does it seem possible to write `void Iota<T>(T[] data)` that'd 84 | fill `data` with numbers from `0 .. Length-1`. This is apparently being actively worked on for new versions 85 | of .NET and C#. 86 | 87 | I attempted to make concrete benchmark classes `sealed`, but that makes BenchmarkDotNet fail because it apparently 88 | needs to derive from the benchmark class. 89 | 90 | RyuJIT has some impressive optimizations: despite branches in "block" methods in `PeriodicInt`, it manages to generate 91 | branchless code when constants that branches depend on are known at compile-time. It also elides unnecessary loads and 92 | stores to/from ref variables and inlines impressively. The generated machine code, however, is huge: 32-sorter is > 1kB 93 | in size. If considering larger sorters, inlining should be forced. 94 | 95 | # Benchmarks 96 | 97 | Raw benchmark data with Excel file used to generate the report are in [BenchmarkResults](BenchmarkResults). Main results 98 | with comments are presented [here (PDF)](BenchmarkResults/Analysis.pdf) with additional comments below. 99 | 100 | I couldn't figure out how to coerce BenchmarkDotNet into treating the baseline as additive overhead instead of, well, _baseline_. 101 | (Actually, that's what `[IterationSetup]` and `[IterationCleanup]` are for, but they come with a warning that they could spoil results 102 | of microbenchmarks.) The analysis presents results after subtracting the additive overhead. 103 | 104 | ## General observations 105 | 106 | Even for small sizes, `UnsafeSort` is slightly slower than `PeriodicInt` which works for fixed-length arrays only 107 | (compare "IntBenchmark" with "Specialized").
For example, `PeriodicInt` takes ~22ns to sort 16 elements, whereas 108 | `UnsafeSort` takes ~38ns. Even though the additional logic to handle all sizes below 16 is relatively simple, it 109 | shows in running times. 110 | 111 | Sorting floating point numbers seems to be slightly slower than integers ("Int vs Float"). 112 | 113 | ## Invocation: direct vs delegate vs compiled expression 114 | 115 | This project was initially started to investigate manual code generation using expression trees, but it turns out that 116 | these are unsuitable for high-performance scenarios as the prologue/epilogue in the generated code has way too high overhead 117 | (see `ExpressionInvocationBenchmark`): 118 | 119 | | Method | Mean | Error | StdDev | 120 | |----------------- |----------:|---------:|---------:| 121 | | DirectInvoke | 45.51 ns | 0.934 ns | 2.147 ns | 122 | | ExpressionInvoke | 124.08 ns | 2.512 ns | 6.747 ns | 123 | 124 | On the other hand, there is no substantial difference between directly invoking an instance method, or invoking it through an 125 | abstract base method. Thus there is no penalty in using the more convenient `UnsafeSort` class as opposed to directly calling 126 | methods on an instance of `PeriodicInt`: 127 | 128 | 129 | | Method | Mean | Error | StdDev | 130 | |--------------- |---------:|---------:|---------:| 131 | | AbstractInvoke | 23.80 ns | 0.421 ns | 0.603 ns | 132 | | ConcreteInvoke | 23.28 ns | 0.310 ns | 0.290 ns | 133 | 134 | NB! The results between the two benchmarks are not directly comparable as they run different algorithms. 135 | TODO: same for float. Re-export analysis. 136 | 137 | # References 138 | 139 | D. E. Knuth, The Art of Computer Programming, vol. 3, section 5.3.4 for basic exposition. The "periodic" network as 140 | implemented here appears in TAOCP exercise 53, but was first described by Dowd et al.: "The Periodic Balanced Sorting 141 | Network", JACM Vol. 36, No. 4, October 1989, pp. 738-757.
142 | 143 | Other references appear in code comments. 144 | -------------------------------------------------------------------------------- /SortingNetworks/IntSorter.cs: --------------------------------------------------------------------------------
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace SortingNetworks
{
    // Every use below (Avx2.CompareEqual result, AsInt32(), int* loads/stores) fixes V as Vector256<int>.
    using V = Vector256<int>;

    /// <summary>
    /// In-place sorter for <c>int</c> data built from AVX2 compare-exchange steps on 8-element vectors.
    /// The constructor picks one of three entry points (<c>Sort8</c>, <c>Sort16</c>, <c>Sort</c>) based on the
    /// requested maximum length and publishes it through the inherited <c>Sorter</c> member together with
    /// the supported length range (<c>MinLength</c>, <c>MaxLength</c>).
    /// </summary>
    // NOTE(review): the <int> type argument on the base class is inferred (Sorter is assigned int* method
    // groups here and float* method groups in FloatSorter); confirm against UnsafeSort.cs.
    sealed unsafe class IntSorter : UnsafeSort<int>
    {
        // TODO: Place these inside own unsafe struct.
        // Mask comments list the 8 elements from index 7 (left) down to 0 (right): F = all bits set, 0 = clear.
        readonly V Zero;                    // 00000000
        readonly V Complement;              // FFFFFFFF
        readonly V AlternatingMaskLo128;    // 0000FFFF
        readonly V AlternatingMaskHi128;    // FFFF0000
        readonly V AlternatingMaskHi64;     // FF00FF00
        readonly V AlternatingMaskHi32;     // F0F0F0F0
        readonly V Max;                     // int.MaxValue in each element; padding for partial loads
        readonly V ReversePermutation;      // Input to VPERMD that reverses all 8 ints
        readonly V[] CountMask;             // For loading 1-8 elements. VPALIGNR requires an immediate constant, which kills perf.

        // Builds the constant vectors and selects the sorting routine:
        //   maxLength <= 8   -> Sort8,  lengths 4..8
        //   maxLength <= 16  -> Sort16, lengths 9..16
        //   otherwise        -> Sort,   lengths 16..2^24
        // Throws ArgumentOutOfRangeException when maxLength exceeds 2^24.
        internal IntSorter(int maxLength) {
            Zero = V.Zero;
            Complement = Avx2.CompareEqual(Zero, Zero);
            AlternatingMaskHi128 = Vector256.Create(0L, 0L, -1L, -1L).AsInt32();
            AlternatingMaskLo128 = Vector256.Create(-1L, -1L, 0L, 0L).AsInt32();
            // Shifting all-ones right by 8 bytes per 128-bit lane clears the upper 64 bits; XOR flips that.
            AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8));
            // Same trick at 64-bit granularity: leaves the upper 32 bits of every 64-bit element set.
            AlternatingMaskHi32 = Avx2.Xor(Complement.AsInt64(), Avx2.ShiftRightLogical(Complement.AsInt64(), 32)).AsInt32();
            Max = Vector256.Create(int.MaxValue);
            ReversePermutation = Vector256.Create(7, 6, 5, 4, 3, 2, 1, 0);

            // CountMask[k] selects the k lowest elements; index 0 stands for a full vector (count & 7 == 0).
            CountMask = new V[8];
            CountMask[0] = Complement;
            CountMask[1] = Vector256.Create(-1, 0, 0, 0, 0, 0, 0, 0);
            CountMask[2] = Vector256.Create(-1, -1, 0, 0, 0, 0, 0, 0);
            CountMask[3] = Vector256.Create(-1, -1, -1, 0, 0, 0, 0, 0);
            CountMask[4] = Vector256.Create(-1, -1, -1, -1, 0, 0, 0, 0);
            CountMask[5] = Vector256.Create(-1, -1, -1, -1, -1, 0, 0, 0);
            CountMask[6] = Vector256.Create(-1, -1, -1, -1, -1, -1, 0, 0);
            CountMask[7] = Vector256.Create(-1, -1, -1, -1, -1, -1, -1, 0);

            if (maxLength <= 8) {
                MinLength = 4;
                MaxLength = 8;
                Sorter = Sort8;
            }
            else if (maxLength <= 16) {
                MinLength = 9;
                MaxLength = 16;
                Sorter = Sort16;
            }
            else {
                MinLength = 16;
                MaxLength = 1 << 24;
                Sorter = Sort;
                // The single-string constructor overload takes the parameter NAME, not the message,
                // so name the parameter explicitly and pass the offending value along.
                if (maxLength > MaxLength)
                    throw new ArgumentOutOfRangeException(nameof(maxLength), maxLength, "Maximum supported length is 2^24.");
            }
        }

        // Sorts c elements at data using a single vector; missing elements are padded with int.MaxValue
        // on load and not written back on store.
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void Sort8(int* data, int c) {
            var v = Load8(data, c);
            Block8(2, ref v);
            Block8(3, ref v);
            Block8(3, ref v);
            Store8(data, v, c);
        }

        // Sorts c elements at data using two vectors: the low 8 elements are loaded/stored in full,
        // the high vector holds the remaining c - 8 elements (padded with int.MaxValue).
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void Sort16(int* data, int c) {
            var v0 = Avx.LoadVector256(data);
            var v1 = Load8(data + 8, c - 8);
            Block16(2, ref v0, ref v1);
            Block16(3, ref v0, ref v1);
            Block16(4, ref v0, ref v1);
            Block16(4, ref v0, ref v1);
            Avx.Store(data, v0);
            Store8(data + 8, v1, c - 8);
        }

        // General entry point: runs log2(upsize) blocks over the data, where upsize is c rounded up to a
        // power of two. Block i runs min(i + 2, log2c) phases.
        unsafe void Sort(int* data, int c) {
            var (upsize, log2c) = UpSize(c);
            for (int i = 0; i < log2c; ++i)
                Block(i + 2 < log2c ? i + 2 : log2c, data, c, upsize);

            // Returns size rounded up to the next power of two together with the base-2 logarithm of that value.
            static (int upsize, int log2c) UpSize(int size) {
                // Smear the highest set bit of (size - 1) into all lower bits, then add one.
                --size;
                size |= size >> 1;
                size |= size >> 2;
                size |= size >> 4;
                size |= size >> 8;
                size |= size >> 16;

                var upsize = size + 1;
                int log2c = -1;
                for (size = upsize; size > 0; ++log2c, size >>= 1)
                    ;

                return (upsize, log2c);
            }
        }

        // b points to the first of the c elements to be sorted; upsize is c rounded up to a power of two.
        // Runs up to p phases. Each phase splits the range into twice as many sub-blocks of half the size
        // and stops once the sub-block size would drop below one vector (8 elements).
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void Block(int p, int* b, int c, int upsize) {
            int split = 1;
            for (; p > 0 && upsize >= 8; --p, split *= 2, upsize /= 2) {
                for (int i = 0, sb = 0; i < split && sb < c; ++i, sb += upsize) {
                    // Only the last sub-block that contains data can be shorter than upsize.
                    var sc = upsize;
                    if (sb + upsize > c)
                        sc = c - sb;
                    Phase(p, b + sb, sc, upsize);
                }
            }
        }

        // b points to block start, c is the actual # of elements in the block and upsize is the (power of two)
        // block size that c is padded up to.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe void Phase(int p, int* b, int c, int upsize) {
            if (upsize > 8) {
                var i0 = (upsize - c) >> 3;     // whole vectors missing at the top of the block
                var c0 = (upsize - c) & 7;      // elements missing from the last vector that holds data

                // e is the last vector that holds data. The first i0 low vectors are skipped because their
                // mirror partners would lie entirely in the missing part of the block.
                int* e = b + upsize - 8 * (i0 + 1);
                b += 8 * i0;

                // Outermost pair touches the partially filled vector; handle it with masked load/store.
                if (c0 != 0 && b < e) {
                    PhaseStep(1, b, e, 16 - c0);
                    b += 8;
                    e -= 8;
                }

                // Remaining pairs are full vectors, walking inwards from both ends.
                for (; b < e; b += 8, e -= 8)
                    PhaseStep(b, e);
            }
            else {
                Block8(p, b, c);
            }
        }

        // Full size (16) compare-exchange.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe void PhaseStep(int* lo, int* hi) {
            var v0 = Avx.LoadVector256(lo);
            var v1 = Avx.LoadVector256(hi);
            Block16(1, ref v0, ref v1);
            Avx.Store(lo, v0);
            Avx.Store(hi, v1);
        }

        // Compare-exchange where the high vector holds only c - 8 elements (c counts both vectors).
        // No inlining; executed at most once.
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void PhaseStep(int p, int* lo, int* hi, int c) {
            var v0 = Avx.LoadVector256(lo);
            var v1 = Load8(hi, c - 8);
            Block16(p, ref v0, ref v1);
            Avx.Store(lo, v0);
            Store8(hi, v1, c - 8);
        }

        // In-memory variant of Block8 for a single, possibly partial, vector of c elements.
        // No inlining; executed at most once.
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void Block8(int p, int* b, int c) {
            var v = Load8(b, c);
            Block8(p, ref v);
            Store8(b, v, c);
        }

        // First phase over 16 elements: element i of _v0 is compared with element 7 - i of _v1; the minimum
        // stays in _v0 and the maximum goes back to its position in _v1. If p > 1, the remaining p - 1 phases
        // are run on each vector separately.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe void Block16(int p, ref V _v0, ref V _v1) {
            V v0 = _v0, v1, m;

            v1 = Avx2.PermuteVar8x32(_v1, ReversePermutation);
            m = Avx2.Max(v0, v1);
            v0 = Avx2.Min(v0, v1);
            v1 = Avx2.PermuteVar8x32(m, ReversePermutation);
            if (--p == 0)
                goto done;

            Block8(p, ref v0);
            Block8(p, ref v1);

        done:
            _v0 = v0; _v1 = v1;
        }

        // Runs the first p (1..3) compare/swap phases on one vector. In every phase each element is compared
        // with its partner; the alternating mask flips the comparison for the upper element of each pair so
        // that the lower position receives the minimum and the upper position the maximum.
        // With p outside 1..3 the decrement never hits zero at a check and all three phases run.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        void Block8(int p, ref V v) {
            V v0 = v, v1, m;

            // COMPARE / SWAP PHASE
            // 76543210
            // 01234567

            v1 = Avx2.PermuteVar8x32(v0, ReversePermutation);
            m = Avx2.CompareGreaterThan(v0, v1);
            m = Avx2.Xor(m, AlternatingMaskHi128);
            v0 = Avx2.BlendVariable(v0, v1, m);
            if (--p == 0)
                goto done;

            // COMPARE / SWAP PHASE
            // 76543210
            // 45670123

            v1 = Avx2.Shuffle(v0, 0x1B);
            m = Avx2.CompareGreaterThan(v0, v1);
            m = Avx2.Xor(m, AlternatingMaskHi64);
            v0 = Avx2.BlendVariable(v0, v1, m);
            if (--p == 0)
                goto done;

            // COMPARE / SWAP PHASE
            // 76543210
            // 67452301

            v1 = Avx2.Shuffle(v0, 0b10110001);
            m = Avx2.CompareGreaterThan(v0, v1);
            m = Avx2.Xor(m, AlternatingMaskHi32);
            v0 = Avx2.BlendVariable(v0, v1, m);

        done:
            v = v0;
        }

        // Loads the (c & 7) lowest elements from v and fills the rest with int.MaxValue.
        // c & 7 == 0 selects the full mask, i.e. all 8 elements are read.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe V Load8(int* v, int c) {
            var m = CountMask[c & 7];
            return Avx2.BlendVariable(Max, Avx2.MaskLoad(v, m), m);
        }

        // Stores the (c & 7) lowest elements of v to a; c & 7 == 0 stores all 8 elements.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe void Store8(int* a, V v, int c) {
            var m = CountMask[c & 7];
            Avx2.MaskStore(a, m, v);
        }
    }
}
-------------------------------------------------------------------------------- /SortingNetworks/FloatSorter.cs: --------------------------------------------------------------------------------
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace SortingNetworks
{
    // V is fixed by the AsSingle() results and float* loads below; VI by the int-valued permutation vector.
    using V = Vector256<float>;
    using VI = Vector256<int>;

    /// <summary>
    /// In-place sorter for <c>float</c> data; structurally the same as <c>IntSorter</c>, using the
    /// floating-point AVX instructions for compare, blend, min and max.
    /// </summary>
    /// <remarks>
    /// NaN inputs are not supported. The ordered greater-than comparison is false in both directions when
    /// either operand is NaN, and the AVX min/max instructions both return their second operand in that case,
    /// so a compare-exchange pair can end up holding two copies of one element and lose the other.
    /// </remarks>
    // NOTE(review): the <float> type argument on the base class is inferred (Sorter is assigned float* method
    // groups); confirm against UnsafeSort.cs.
    sealed unsafe class FloatSorter : UnsafeSort<float>
    {
        // Mask comments list the 8 elements from index 7 (left) down to 0 (right): F = all bits set, 0 = clear.
        readonly V Zero;                    // 00000000
        readonly V Complement;              // FFFFFFFF
        readonly V AlternatingMaskLo128;    // 0000FFFF
        readonly V AlternatingMaskHi128;    // FFFF0000
        readonly V AlternatingMaskHi64;     // FF00FF00
        readonly V AlternatingMaskHi32;     // F0F0F0F0
        readonly V Max;                     // float.PositiveInfinity in each element; padding for partial loads
        readonly VI ReversePermutation;     // Index vector for PermuteVar8x32 that reverses all 8 elements
        readonly V[] CountMask;             // For loading 1-8 elements. VPALIGNR requires an immediate constant, which kills perf.

        // Builds the constant vectors and selects the sorting routine:
        //   maxLength <= 8   -> Sort8,  lengths 4..8
        //   maxLength <= 16  -> Sort16, lengths 9..16
        //   otherwise        -> Sort,   lengths 16..2^24
        // Throws ArgumentOutOfRangeException when maxLength exceeds 2^24.
        internal FloatSorter(int maxLength) {
            Zero = V.Zero;
            Complement = Vector256.Create(-1).AsSingle();
            AlternatingMaskHi128 = Vector256.Create(0L, 0L, -1L, -1L).AsSingle();
            AlternatingMaskLo128 = Vector256.Create(-1L, -1L, 0L, 0L).AsSingle();
            // Shifting all-ones right by 8 bytes per 128-bit lane clears the upper 64 bits; XOR flips that.
            AlternatingMaskHi64 = Avx2.Xor(Complement.AsByte(), Avx2.ShiftRightLogical128BitLane(Complement.AsByte(), 8)).AsSingle();
            // Same trick at 64-bit granularity: leaves the upper 32 bits of every 64-bit element set.
            AlternatingMaskHi32 = Avx2.Xor(Complement.AsInt64(), Avx2.ShiftRightLogical(Complement.AsInt64(), 32)).AsSingle();
            Max = Vector256.Create(float.PositiveInfinity);
            ReversePermutation = Vector256.Create(7, 6, 5, 4, 3, 2, 1, 0);

            // CountMask[k] selects the k lowest elements; index 0 stands for a full vector (count & 7 == 0).
            CountMask = new V[8];
            CountMask[0] = Complement;
            CountMask[1] = Vector256.Create(-1, 0, 0, 0, 0, 0, 0, 0).AsSingle();
            CountMask[2] = Vector256.Create(-1, -1, 0, 0, 0, 0, 0, 0).AsSingle();
            CountMask[3] = Vector256.Create(-1, -1, -1, 0, 0, 0, 0, 0).AsSingle();
            CountMask[4] = Vector256.Create(-1, -1, -1, -1, 0, 0, 0, 0).AsSingle();
            CountMask[5] = Vector256.Create(-1, -1, -1, -1, -1, 0, 0, 0).AsSingle();
            CountMask[6] = Vector256.Create(-1, -1, -1, -1, -1, -1, 0, 0).AsSingle();
            CountMask[7] = Vector256.Create(-1, -1, -1, -1, -1, -1, -1, 0).AsSingle();

            if (maxLength <= 8) {
                MinLength = 4;
                MaxLength = 8;
                Sorter = Sort8;
            } else if (maxLength <= 16) {
                MinLength = 9;
                MaxLength = 16;
                Sorter = Sort16;
            } else {
                MinLength = 16;
                MaxLength = 1 << 24;
                Sorter = Sort;
                // The single-string constructor overload takes the parameter NAME, not the message,
                // so name the parameter explicitly and pass the offending value along.
                if (maxLength > MaxLength)
                    throw new ArgumentOutOfRangeException(nameof(maxLength), maxLength, "Maximum supported length is 2^24.");
            }
        }

        // Sorts c elements at data using a single vector; missing elements are padded with +infinity
        // on load and not written back on store.
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void Sort8(float* data, int c) {
            var v = Load8(data, c);
            Block8(2, ref v);
            Block8(3, ref v);
            Block8(3, ref v);
            Store8(data, v, c);
        }

        // Sorts c elements at data using two vectors: the low 8 elements are loaded/stored in full,
        // the high vector holds the remaining c - 8 elements (padded with +infinity).
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void Sort16(float* data, int c) {
            var v0 = Avx.LoadVector256(data);
            var v1 = Load8(data + 8, c - 8);
            Block16(2, ref v0, ref v1);
            Block16(3, ref v0, ref v1);
            Block16(4, ref v0, ref v1);
            Block16(4, ref v0, ref v1);
            Avx.Store(data, v0);
            Store8(data + 8, v1, c - 8);
        }

        // General entry point: runs log2(upsize) blocks over the data, where upsize is c rounded up to a
        // power of two. Block i runs min(i + 2, log2c) phases.
        unsafe void Sort(float* data, int c) {
            var (upsize, log2c) = UpSize(c);
            for (int i = 0; i < log2c; ++i)
                Block(i + 2 < log2c ? i + 2 : log2c, data, c, upsize);

            // Returns size rounded up to the next power of two together with the base-2 logarithm of that value.
            static (int upsize, int log2c) UpSize(int size) {
                // Smear the highest set bit of (size - 1) into all lower bits, then add one.
                --size;
                size |= size >> 1;
                size |= size >> 2;
                size |= size >> 4;
                size |= size >> 8;
                size |= size >> 16;

                var upsize = size + 1;
                int log2c = -1;
                for (size = upsize; size > 0; ++log2c, size >>= 1)
                    ;

                return (upsize, log2c);
            }
        }


        // b points to the first of the c elements to be sorted; upsize is c rounded up to a power of two.
        // Runs up to p phases. Each phase splits the range into twice as many sub-blocks of half the size
        // and stops once the sub-block size would drop below one vector (8 elements).
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void Block(int p, float* b, int c, int upsize) {
            int split = 1;
            for (; p > 0 && upsize >= 8; --p, split *= 2, upsize /= 2) {
                for (int i = 0, sb = 0; i < split && sb < c; ++i, sb += upsize) {
                    // Only the last sub-block that contains data can be shorter than upsize.
                    var sc = upsize;
                    if (sb + upsize > c)
                        sc = c - sb;
                    Phase(p, b + sb, sc, upsize);
                }
            }
        }

        // b points to block start, c is the actual # of elements in the block and upsize is the (power of two)
        // block size that c is padded up to.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe void Phase(int p, float* b, int c, int upsize) {
            if (upsize > 8) {
                var i0 = (upsize - c) >> 3;     // whole vectors missing at the top of the block
                var c0 = (upsize - c) & 7;      // elements missing from the last vector that holds data

                // e is the last vector that holds data. The first i0 low vectors are skipped because their
                // mirror partners would lie entirely in the missing part of the block.
                float* e = b + upsize - 8 * (i0 + 1);
                b += 8 * i0;

                // Outermost pair touches the partially filled vector; handle it with masked load/store.
                if (c0 != 0 && b < e) {
                    PhaseStep(1, b, e, 16 - c0);
                    b += 8;
                    e -= 8;
                }

                // Remaining pairs are full vectors, walking inwards from both ends.
                for (; b < e; b += 8, e -= 8)
                    PhaseStep(b, e);
            } else {
                Block8(p, b, c);
            }
        }

        // Full size (16) compare-exchange.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe void PhaseStep(float* lo, float* hi) {
            var v0 = Avx.LoadVector256(lo);
            var v1 = Avx.LoadVector256(hi);
            Block16(1, ref v0, ref v1);
            Avx.Store(lo, v0);
            Avx.Store(hi, v1);
        }

        // Compare-exchange where the high vector holds only c - 8 elements (c counts both vectors).
        // No inlining; executed at most once.
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void PhaseStep(int p, float* lo, float* hi, int c) {
            var v0 = Avx.LoadVector256(lo);
            var v1 = Load8(hi, c - 8);
            Block16(p, ref v0, ref v1);
            Avx.Store(lo, v0);
            Store8(hi, v1, c - 8);
        }

        // In-memory variant of Block8 for a single, possibly partial, vector of c elements.
        // No inlining; executed at most once.
        [MethodImpl(MethodImplOptions.AggressiveOptimization)]
        unsafe void Block8(int p, float* b, int c) {
            var v = Load8(b, c);
            Block8(p, ref v);
            Store8(b, v, c);
        }

        // First phase over 16 elements: element i of _v0 is compared with element 7 - i of _v1; the minimum
        // stays in _v0 and the maximum goes back to its position in _v1. If p > 1, the remaining p - 1 phases
        // are run on each vector separately.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe void Block16(int p, ref V _v0, ref V _v1) {
            V v0 = _v0, v1, m;

            v1 = Avx2.PermuteVar8x32(_v1, ReversePermutation);
            m = Avx.Max(v0, v1);
            v0 = Avx.Min(v0, v1);
            v1 = Avx2.PermuteVar8x32(m, ReversePermutation);
            if (--p == 0)
                goto done;

            Block8(p, ref v0);
            Block8(p, ref v1);

        done:
            _v0 = v0; _v1 = v1;
        }

        // Runs the first p (1..3) compare/swap phases on one vector. In every phase each element is compared
        // with its partner; the alternating mask flips the comparison for the upper element of each pair so
        // that the lower position receives the minimum and the upper position the maximum.
        // With p outside 1..3 the decrement never hits zero at a check and all three phases run.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        void Block8(int p, ref V v) {
            V v0 = v, v1, m;

            // COMPARE / SWAP PHASE
            // 76543210
            // 01234567

            v1 = Avx2.PermuteVar8x32(v0, ReversePermutation);
            m = Avx.Compare(v0, v1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
            m = Avx.Xor(m, AlternatingMaskHi128);
            v0 = Avx.BlendVariable(v0, v1, m);
            if (--p == 0)
                goto done;

            // COMPARE / SWAP PHASE
            // 76543210
            // 45670123

            // Same Avx.Shuffle overload as in the next phase (previously reached through the Avx2 class name).
            v1 = Avx.Shuffle(v0, v0, 0x1B);
            m = Avx.Compare(v0, v1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
            m = Avx.Xor(m, AlternatingMaskHi64);
            v0 = Avx.BlendVariable(v0, v1, m);
            if (--p == 0)
                goto done;

            // COMPARE / SWAP PHASE
            // 76543210
            // 67452301

            v1 = Avx.Shuffle(v0, v0, 0b10110001);
            m = Avx.Compare(v0, v1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
            m = Avx.Xor(m, AlternatingMaskHi32);
            v0 = Avx.BlendVariable(v0, v1, m);

        done:
            v = v0;
        }

        // Loads the (c & 7) lowest elements from v and fills the rest with +infinity.
        // c & 7 == 0 selects the full mask, i.e. all 8 elements are read.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe V Load8(float* v, int c) {
            var m = CountMask[c & 7];
            return Avx.BlendVariable(Max, Avx.MaskLoad(v, m), m);
        }

        // Stores the (c & 7) lowest elements of v to a; c & 7 == 0 stores all 8 elements.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        unsafe void Store8(float* a, V v, int c) {
            var m = CountMask[c & 7];
            Avx.MaskStore(a, m, v);
        }
    }
}
--------------------------------------------------------------------------------