├── BenchmarkResults
├── Analysis.pdf
├── Analysis.xlsx
└── Specialized.csv
├── SortingNetworks
├── SortingNetworks.csproj
├── AESRand.cs
├── MWC1616Rand.cs
├── Attic
│ ├── PeriodicInt.cs
│ ├── Periodic16Branchless.cs
│ ├── PeriodicInt_Block.cs
│ └── Periodic16Expr.cs
├── UnsafeSort.cs
├── PeriodicInt.cs
├── UnsafeRandom.cs
├── PeriodicInt_Block.cs
├── IntSorter.cs
└── FloatSorter.cs
├── SNBenchmark
├── SNBenchmark.csproj
├── ArraySortConstantEstimation.cs
├── InvocationBenchmark.cs
├── Generators.cs
├── FloatBenchmark.cs
├── IntBenchmark.cs
├── Validation.cs
└── Program.cs
├── LICENSE.txt
├── SortingNetworks.sln
├── .gitattributes
├── .gitignore
└── README.md
/BenchmarkResults/Analysis.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zvrba/SortingNetworks/HEAD/BenchmarkResults/Analysis.pdf
--------------------------------------------------------------------------------
/BenchmarkResults/Analysis.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zvrba/SortingNetworks/HEAD/BenchmarkResults/Analysis.xlsx
--------------------------------------------------------------------------------
/SortingNetworks/SortingNetworks.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Library
5 | netcoreapp3.1
6 |
7 |
8 |
9 |
10 |
11 | true
12 |
13 |
14 |
15 | true
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/SNBenchmark/SNBenchmark.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | netcoreapp3.1
6 |
7 |
8 |
9 | true
10 |
11 |
12 |
13 | true
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/SNBenchmark/ArraySortConstantEstimation.cs:
--------------------------------------------------------------------------------
using BenchmarkDotNet.Attributes;

using System;

namespace SNBenchmark
{
    /// <summary>
    /// <c>Array.Sort</c> uses an introsort algorithm that has O(n*log(n)) complexity. This benchmark
    /// produces timings from which the constant hidden in the O-term can be estimated.
    /// Only the random input pattern is measured.
    /// </summary>
    public class ArraySortConstantEstimation
    {
        readonly Generators generators = new Generators();

        // Working array; reallocated in GlobalSetup for the current Size.
        int[] d;

        /// <summary>Length of the array being generated and sorted.</summary>
        [Params(32, 64, 128, 256, 1024, 2048, 4096, 8192, 16384)]
        public int Size { get; set; }

        /// <summary>Allocates the working array for the current <see cref="Size"/>.</summary>
        [GlobalSetup]
        public void GlobalSetup() => d = new int[Size];

        /// <summary>Baseline: regenerates the random contents without sorting them.</summary>
        [Benchmark(Baseline = true)]
        public void NoSort() => generators.Random(d);

        /// <summary>Regenerates the random contents and sorts them with <c>Array.Sort</c>.</summary>
        [Benchmark]
        public void ArraySort() {
            generators.Random(d);
            Array.Sort(d);
        }
    }
}

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Stian Z. Vrba
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/SortingNetworks/AESRand.cs:
--------------------------------------------------------------------------------
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace SortingNetworks
{
    /// <summary>
    /// Random number generation using AES-NI instructions.
    /// Code adapted from https://github.com/dragontamer/AESRand/blob/master/AESRand/AESRand/AESRand.cpp
    /// </summary>
    public sealed class AESRand : UnsafeRandom {
        // Added to the state on every call and also used as the round key of both AES rounds.
        static readonly Vector128<ulong> PRIME_INCREMENT = Vector128.Create(
            0x2f, 0x2b, 0x29, 0x25, 0x1f, 0x1d, 0x17, 0x13,
            0x11, 0x0D, 0x0B, 0x07, 0x05, 0x03, 0x02, 0x01).AsUInt64();

        // 128-bit generator state, advanced by 64-bit lane-wise addition in Get4.
        Vector128<ulong> state;

        /// <summary>
        /// Creates a generator whose initial state is the four seed values packed into one vector.
        /// </summary>
        /// <param name="seed">Exactly 4 seed values.</param>
        /// <exception cref="ArgumentNullException"><paramref name="seed"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="seed"/> does not contain exactly 4 elements.</exception>
        public AESRand(int[] seed) {
            if (seed == null)
                throw new ArgumentNullException(nameof(seed));
            if (seed.Length != 4)
                throw new ArgumentException("Seed must contain exactly 4 elements.", nameof(seed));
            this.state = Vector128.Create(seed[0], seed[1], seed[2], seed[3]).AsUInt64();
        }

        /// <summary>
        /// Advances the state and returns four pseudo-random 32-bit integers.
        /// </summary>
        /// <returns>The new state after two AES encryption rounds, reinterpreted as four ints.</returns>
        public override Vector128<int> Get4() {
            state = Sse2.Add(state, PRIME_INCREMENT);
            var roundKey = PRIME_INCREMENT.AsByte();
            var r1 = Aes.Encrypt(state.AsByte(), roundKey);
            return Aes.Encrypt(r1, roundKey).AsInt32();
        }
    }
}

--------------------------------------------------------------------------------
/BenchmarkResults/Specialized.csv:
--------------------------------------------------------------------------------
1 | Method;Size;Pattern;Mean (ns);OnlySort (ns);Ratio;;;;;;
2 | NoSort;4;Asc;9,197;;;;;;;;
3 | ArraySort;4;Asc;34,932;25,735;3,084621839;;;;;;
4 | NetworkSort;4;Asc;17,54;8,343;1;;;;;;
5 | NoSort;4;Desc;10,197;;;;;;;;
6 | ArraySort;4;Desc;40,525;30,328;3,53020603;;;;;;
7 | NetworkSort;4;Desc;18,788;8,591;1;;;;;;
8 | NoSort;4;Rand;13,267;;;;;;;;
9 | ArraySort;4;Rand;54,718;41,451;4,557058047;;;;;;
10 | NetworkSort;4;Rand;22,363;9,096;1;;;;;;
11 | NoSort;8;Asc;17,591;;;;;;;;
12 | ArraySort;8;Asc;57,123;39,532;4,326110746;;;;;;
13 | NetworkSort;8;Asc;26,729;9,138;1;;;;;;
14 | NoSort;8;Desc;22,368;;;;;;;;
15 | ArraySort;8;Desc;82,407;60,039;10,20377294;;;;;;
16 | NetworkSort;8;Desc;28,252;5,884;1;;;;;;
17 | NoSort;8;Rand;23,726;;;;;;;;
18 | ArraySort;8;Rand;101,007;77,281;6,388971561;;;;;;
19 | NetworkSort;8;Rand;35,822;12,096;1;;;;;;
20 | NoSort;16;Asc;35,836;;;;;;;;
21 | ArraySort;16;Asc;92,05;56,214;4,597906102;;;;;;
22 | NetworkSort;16;Asc;48,062;12,226;1;;;;;;
23 | NoSort;16;Desc;34,743;;;;;;;;
24 | ArraySort;16;Desc;218,171;183,428;12,24976626;;;;;;
25 | NetworkSort;16;Desc;49,717;14,974;1;;;;;;
26 | NoSort;16;Rand;44,974;;;;;;;;
27 | ArraySort;16;Rand;223,987;179,013;7,861791831;;;;;;
28 | NetworkSort;16;Rand;67,744;22,77;1;;;;;;
29 | NoSort;32;Asc;66,193;;;;;;;;
30 | ArraySort;32;Asc;160,966;94,773;3,808591866;;;;;;
31 | NetworkSort;32;Asc;91,077;24,884;1;;;;;;
32 | NoSort;32;Desc;87,786;;;;;;;;
33 | ArraySort;32;Desc;205,57;117,784;4,844685752;;;;;;
34 | NetworkSort;32;Desc;112,098;24,312;1;;;;;;
35 | NoSort;32;Rand;88,008;;;;;;;;
36 | ArraySort;32;Rand;673,462;585,454;11,42706016;;;;;;
37 | NetworkSort;32;Rand;139,242;51,234;1;;;;;;
38 |
--------------------------------------------------------------------------------
/SortingNetworks/MWC1616Rand.cs:
--------------------------------------------------------------------------------
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace SortingNetworks
{
    /// <summary>
    /// Random number generator using MWC1616 (multiply with carry) algorithm.
    /// Code adapted from http://www.digicortex.net/node/22
    /// </summary>
    public sealed class MWC1616Rand : UnsafeRandom
    {
        // Constants, broadcast to all four 32-bit lanes.
        readonly Vector128<uint> mask;  // 0xFFFF: selects the low 16 bits of a lane
        readonly Vector128<uint> m1;    // multiplier of stream a (0x4650 = 18000)
        readonly Vector128<uint> m2;    // multiplier of stream b (0x78B7 = 30903)

        // State: four independent (a, b) pairs, one per lane.
        Vector128<uint> a, b;

        /// <summary>
        /// Creates a generator seeded with eight values: the first four initialize the lanes of
        /// stream a, the last four the lanes of stream b.
        /// </summary>
        /// <param name="seed">Exactly 8 seed values.</param>
        /// <exception cref="ArgumentNullException"><paramref name="seed"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="seed"/> does not contain exactly 8 elements.</exception>
        public MWC1616Rand(int[] seed) {
            if (seed == null)
                throw new ArgumentNullException(nameof(seed));
            if (seed.Length != 8)
                throw new ArgumentException("The seed array must contain exactly 8 elements.", nameof(seed));

            mask = Vector128.Create(0xFFFFu);
            m1 = Vector128.Create(0x4650u);
            m2 = Vector128.Create(0x78B7u);
            a = Vector128.Create((uint)seed[0], (uint)seed[1], (uint)seed[2], (uint)seed[3]);
            b = Vector128.Create((uint)seed[4], (uint)seed[5], (uint)seed[6], (uint)seed[7]);
        }

        /// <summary>
        /// Advances both streams in every lane and returns four pseudo-random 32-bit integers.
        /// </summary>
        /// <returns>Per lane: <c>(a &lt;&lt; 16) + (b &amp; 0xFFFF)</c> computed from the updated state.</returns>
        public override Vector128<int> Get4() {
            // a = m1 * (a & 0xFFFF) + (a >> 16)
            var amask = Sse2.And(a, mask);
            var ashift = Sse2.ShiftRightLogical(a, 0x10);
            var amul = Sse41.MultiplyLow(amask, m1);
            a = Sse2.Add(amul, ashift);

            // b = m2 * (b & 0xFFFF) + (b >> 16)
            var bmask = Sse2.And(b, mask);
            var bshift = Sse2.ShiftRightLogical(b, 0x10);
            var bmul = Sse41.MultiplyLow(bmask, m2);
            b = Sse2.Add(bmul, bshift);

            // Combine: low 16 bits of b with a shifted into the upper half.
            var t1 = Sse2.And(b, mask);
            var t2 = Sse2.ShiftLeftLogical(a, 0x10);
            var r = Sse2.Add(t1, t2);
            return r.AsInt32();
        }
    }
}

--------------------------------------------------------------------------------
/SNBenchmark/InvocationBenchmark.cs:
--------------------------------------------------------------------------------
using System;
using BenchmarkDotNet.Attributes;

namespace SNBenchmark
{
    /// <summary>
    /// Compares sorting 16 ints through the <c>Sorter</c> delegate of <c>UnsafeSort</c> with calling
    /// <c>Sort16</c> directly on a <c>PeriodicInt</c> instance.
    /// </summary>
    [BenchmarkCategory("Invocation")]
    public class InvocationBenchmark
    {
        readonly int[] data = new int[16];
        readonly SortingNetworks.UnsafeSort<int> asorter = SortingNetworks.UnsafeSort<int>.Create(16);
        readonly SortingNetworks.PeriodicInt csorter = new SortingNetworks.PeriodicInt();

        /// <summary>Fills the array with 0..15 in ascending order, i.e. already sorted input.</summary>
        [GlobalSetup]
        public void GlobalSetup() {
            for (int i = 0; i < data.Length; ++i)
                data[i] = i;
        }

        /// <summary>Sorts through the delegate stored in the sorter's <c>Sorter</c> property.</summary>
        [Benchmark]
        public unsafe void AbstractInvoke() {
            fixed (int* p = data)
                asorter.Sorter(p, data.Length);
        }

        /// <summary>Sorts by calling <c>Sort16</c> on the concrete sorter instance.</summary>
        [Benchmark]
        public unsafe void ConcreteInvoke() {
            fixed (int* p = data)
                csorter.Sort16(p);
        }
    }

#if false   // Obsoleted, "Attic" is no longer included in build of SortingNetworks.
    [BenchmarkCategory("Invocation")]
    public class ExpressionInvocationBenchmark
    {
        readonly int[] data = new int[16];
        readonly SortingNetworks.Attic.Periodic16Expr expr = new SortingNetworks.Attic.Periodic16Expr();

        // Sets up data array to be sorted so as to have minimum possible data-dependent variation.
        [GlobalSetup]
        public void GlobalSetup() {
            for (int i = 0; i < data.Length; ++i)
                data[i] = i;
        }

        [Benchmark]
        public unsafe void DirectInvoke() {
            fixed (int* p = data)
                SortingNetworks.Attic.Periodic16Branchless.Sort(p);
        }

        [Benchmark]
        public unsafe void ExpressionInvoke() {
            fixed (int* p = data)
                expr.Sort(p);
        }
    }
#endif
}

--------------------------------------------------------------------------------
/SortingNetworks.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.31424.327
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SortingNetworks", "SortingNetworks\SortingNetworks.csproj", "{2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "SNBenchmark", "SNBenchmark\SNBenchmark.csproj", "{F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}"
9 | EndProject
10 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{9B8ACCCE-973B-42F8-BC2A-2C6751AB5237}"
11 | ProjectSection(SolutionItems) = preProject
12 | LICENSE.txt = LICENSE.txt
13 | README.md = README.md
14 | EndProjectSection
15 | EndProject
16 | Global
17 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
18 | Debug|Any CPU = Debug|Any CPU
19 | Release|Any CPU = Release|Any CPU
20 | EndGlobalSection
21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
22 | {2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
23 | {2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}.Debug|Any CPU.Build.0 = Debug|Any CPU
24 | {2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}.Release|Any CPU.ActiveCfg = Release|Any CPU
25 | {2C887C06-7BA1-4B47-A0D7-352FE0AAAF63}.Release|Any CPU.Build.0 = Release|Any CPU
26 | {F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
27 | {F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}.Debug|Any CPU.Build.0 = Debug|Any CPU
28 | {F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}.Release|Any CPU.ActiveCfg = Release|Any CPU
29 | {F6666BA3-4CEB-4962-B9A2-917EA4C7F65D}.Release|Any CPU.Build.0 = Release|Any CPU
30 | EndGlobalSection
31 | GlobalSection(SolutionProperties) = preSolution
32 | HideSolutionNode = FALSE
33 | EndGlobalSection
34 | GlobalSection(ExtensibilityGlobals) = postSolution
35 | SolutionGuid = {0E45DB88-0408-4A77-8703-6DA36E579EAD}
36 | EndGlobalSection
37 | EndGlobal
38 |
--------------------------------------------------------------------------------
/SNBenchmark/Generators.cs:
--------------------------------------------------------------------------------
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace SNBenchmark
{
    /// <summary>
    /// Input generators for the benchmarks. All randomness comes from one fixed-seed
    /// <c>MWC1616Rand</c> instance owned by this object.
    /// </summary>
    class Generators
    {
        readonly SortingNetworks.MWC1616Rand rng = new SortingNetworks.MWC1616Rand(new int[8] { 2, 3, 5, 7, 11, 13, 17, 19 });

        /// <summary>
        /// Fills data with integers from 0 to data.Length-1 in ascending order.
        /// </summary>
        public void Ascending(int[] data) {
            for (int i = 0; i < data.Length; ++i)
                data[i] = i;
        }

        /// <summary>
        /// Fills data with integers from 0 to data.Length-1 in descending order.
        /// </summary>
        public void Descending(int[] data) {
            for (int i = 0; i < data.Length; ++i)
                data[i] = data.Length - 1 - i;
        }

        /// <summary>
        /// Fills data with pseudo-random numbers. Whole groups of 4 elements are written directly by the
        /// generator; when the length is not a multiple of 4, the remaining 1-3 elements are taken from
        /// one additional generated vector.
        /// </summary>
        /// <param name="data">Array to overwrite.</param>
        public unsafe void Random(int[] data) {
            int groups = data.Length / 4;
            int tail = data.Length - 4 * groups;

            fixed (int* p = data) {
                for (int i = 0; i < groups; ++i)
                    rng.Get4(p + 4 * i);

                if (tail != 0) {
                    // Generate into a scratch buffer so that nothing is written past the array end.
                    var scratch = stackalloc int[4];
                    Sse2.Store(scratch, rng.Get4());
                    for (int i = 0; i < tail; ++i)
                        p[4 * groups + i] = scratch[i];
                }
            }
        }

        /// <summary>
        /// Rearranges the existing contents of <paramref name="data"/> according to a random permutation.
        /// </summary>
        /// <typeparam name="T">Element type.</typeparam>
        /// <param name="data">Array to shuffle in place.</param>
        public unsafe void FisherYates<T>(T[] data) where T : unmanaged {
            var r = stackalloc uint[4]; // Buffered randomness: four values per generator call.
            int k = 4;                  // Next unused slot in r; 4 means the buffer is used up.

            // Use pointer throughout to avoid bound checks.
            // Also, we're jumping around the array so the direction of the iteration doesn't matter.
            fixed (T* p = data) {
                for (int i = data.Length - 1; i > 0; --i) {
                    // Generate randomness if empty.
                    if (k == 4) {
                        Vector128<uint> ar = rng.Get4().AsUInt32();
                        Sse2.Store(r, ar);
                        k = 0;
                    }
                    int j = (int)(r[k++] % (i + 1));    // Random int between [0, i]
                    (p[i], p[j]) = (p[j], p[i]);        // Exchange.
                }
            }
        }
    }
}

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is needed for earlier builds of msysgit that do not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/SNBenchmark/FloatBenchmark.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;

using BenchmarkDotNet.Attributes;

namespace SNBenchmark
{
    /// <summary>
    /// Benchmarks sorting of float arrays: a no-sort baseline, <c>Array.Sort</c> and the sorting network.
    /// Every benchmark first shuffles the array and afterwards verifies that it holds 0..Size-1 in order.
    /// </summary>
    [XmlExporterAttribute.Brief]
    [XmlExporter(fileNameSuffix: "xml", indentXml: true, excludeMeasurements: true)]
    public class FloatBenchmark
    {
        readonly Generators generators = new Generators();
        Action<float[]> g;                      // Input generator run before every sort.
        SortingNetworks.UnsafeSort<float> n;    // Network sorter created for the current Size.
        float[] d;                              // Working array.

        //[Params(4, 8, 12, 16, 32, 47, 64, 97, 128, 147, 256, 317, 512, 711, 1024, 1943, 2048, 3717, 4096)]
        [ParamsSource(nameof(Sizes))]
        public int Size { get; set; }

        /// <summary>Selects the generator, creates the sorter and fills the array with 0..Size-1.</summary>
        [GlobalSetup]
        public void GlobalSetup() {
            g = generators.FisherYates;
            n = SortingNetworks.UnsafeSort<float>.Create(Size);
            d = new float[Size];
            Filler();
        }

        // Also used to simulate sorting.
        void Filler() {
            for (int i = 0; i < d.Length; ++i)
                d[i] = i;
        }

        void ArraySorter() => Array.Sort(d);

        unsafe void NetworkSorter() {
            fixed (float* p = d)
                n.Sorter(p, d.Length);
        }

        /// <summary>
        /// Common benchmark body: runs the generator on the array, invokes <paramref name="sorter"/> and
        /// checks that element i equals i for every index.
        /// </summary>
        /// <param name="sorter">Action expected to leave the array sorted.</param>
        /// <param name="what">Message passed to <c>Environment.FailFast</c> when the check fails.</param>
        void Template(Action sorter, string what) {
            g(d);
            sorter();
            // Should leave the array sorted so no need to reinitialize it for the next iteration.
            for (int i = 0; i < d.Length; ++i) {
                if (d[i] != i)
                    Environment.FailFast(what);
            }
        }

        /// <summary>
        /// Baseline: run the generator, overwrite the array with the sorted sequence instead of sorting it,
        /// and validate for being sorted. The first and last step are common for all benchmarks.
        /// </summary>
        [Benchmark(Baseline = true)]
        public void NoSort() => Template(Filler, "Unsorted [Baseline].");

        /// <summary>
        /// Sorting by using Array.Sort().
        /// </summary>
        [Benchmark]
        public void ArraySort() => Template(ArraySorter, "Unsorted [ArraySort].");

        /// <summary>
        /// Sorting by using the sorting network.
        /// </summary>
        [Benchmark]
        public unsafe void NetworkSort() => Template(NetworkSorter, "Unsorted [NetworkSort].");

        // The numbers in-between powers of two are deliberately chosen to lie slightly below or above
        // the middle of the interval. This is to test the sorters for various lengths.
        public IEnumerable<int> Sizes => new int[] {
            4, 8, 12, 16, 27, 32, 47, 64, 128, 177, 256, 364, 512, 748, 1024, 2048, 3389, 4096, 6793, 8192, 14289, 16384,
            32768, 53151, 65536, 96317, 131072, 191217, 262144, 398853, 524288, 719289, 1048576
        };
    }
}

--------------------------------------------------------------------------------
/SortingNetworks/Attic/PeriodicInt.cs:
--------------------------------------------------------------------------------
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace SortingNetworks.Attic
{
    using V = Vector256<int>;

    /// <summary>
    /// Provides methods for sorting integer arrays of lengths that are a power of two. Invoking public members with
    /// arrays that are of shorter length will result in UNDEFINED BEHAVIOR (data corruption, crash).
    /// </summary>
    public partial class PeriodicInt
    {
        // Masks over eight 32-bit elements. In the digit pictures one digit stands for one element,
        // most significant element first.
        readonly V Zero;                    // 00000000: all bits clear
        readonly V Complement;              // FFFFFFFF: all bits set
        readonly V AlternatingMaskHi128;    // FFFF0000: upper 128 bits set
        readonly V AlternatingMaskLo128;    // 0000FFFF: lower 128 bits set
        readonly V AlternatingMaskHi64;     // FF00FF00: upper 64 bits of each 128-bit lane set
        readonly V AlternatingMaskLo32;     // 0F0F0F0F: lower 32 bits of each 64-bit element set

        public PeriodicInt() {
            Zero = V.Zero;
            Complement = Avx2.CompareEqual(Zero, Zero);
            AlternatingMaskHi128 = Vector256.Create(0L, 0L, -1L, -1L).AsInt32();
            AlternatingMaskLo128 = Vector256.Create(-1L, -1L, 0L, 0L).AsInt32();
            // Shifting each lane right by 8 bytes clears its upper half; XOR with all-ones inverts that.
            AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8));
            // Shifting each 64-bit element left by 32 clears its lower half; XOR with all-ones inverts that.
            AlternatingMaskLo32 = Avx2.Xor(Complement.AsInt64(), Avx2.ShiftLeftLogical(Complement.AsInt64(), 32)).AsInt32();
        }

        /// <summary>
        /// In-place sorts 16 elements starting at <paramref name="data"/>.
        /// </summary>
        /// <param name="data">Pointer to at least 16 readable and writable ints.</param>
        public unsafe void Sort16(int* data) {
            var lo = Avx.LoadVector256(data);
            var hi = Avx.LoadVector256(data + 8);

            // Block16 is declared in another part of this partial class.
            Block16(2, ref lo, ref hi);
            Block16(3, ref lo, ref hi);
            Block16(4, ref lo, ref hi);
            Block16(4, ref lo, ref hi);

            Avx.Store(data, lo);
            Avx.Store(data + 8, hi);
        }

        /// <summary>
        /// In-place sorts 32 elements starting at <paramref name="data"/>.
        /// </summary>
        /// <param name="data">Pointer to at least 32 readable and writable ints.</param>
        public unsafe void Sort32(int* data) {
            var v0 = Avx.LoadVector256(data + 0);
            var v1 = Avx.LoadVector256(data + 8);
            var v2 = Avx.LoadVector256(data + 16);
            var v3 = Avx.LoadVector256(data + 24);

            // Block32 is declared in another part of this partial class.
            Block32(2, ref v0, ref v1, ref v2, ref v3);
            Block32(3, ref v0, ref v1, ref v2, ref v3);
            Block32(4, ref v0, ref v1, ref v2, ref v3);
            Block32(5, ref v0, ref v1, ref v2, ref v3);
            Block32(5, ref v0, ref v1, ref v2, ref v3);

            Avx.Store(data + 0, v0);
            Avx.Store(data + 8, v1);
            Avx.Store(data + 16, v2);
            Avx.Store(data + 24, v3);
        }
    }
}

--------------------------------------------------------------------------------
/SNBenchmark/IntBenchmark.cs:
--------------------------------------------------------------------------------
using System;
using System.Collections.Generic;

using BenchmarkDotNet.Attributes;

namespace SNBenchmark
{
    /// <summary>
    /// Benchmarks sorting of int arrays: a no-sort baseline, <c>Array.Sort</c> and the sorting network.
    /// Every benchmark first runs the generator selected by <see cref="Pattern"/> and afterwards verifies
    /// that the array holds 0..Size-1 in order.
    /// </summary>
    [XmlExporterAttribute.Brief]
    [XmlExporter(fileNameSuffix: "xml", indentXml: true, excludeMeasurements: true)]
    public class IntBenchmark
    {
        readonly Generators generators = new Generators();
        Action<int[]> g;                    // Input generator run before every sort.
        SortingNetworks.UnsafeSort<int> n;  // Network sorter created for the current Size.
        int[] d;                            // Working array.

        //[Params(4, 8, 12, 16, 32, 47, 64, 97, 128, 147, 256, 317, 512, 711, 1024, 1943, 2048, 3717, 4096)]
        [ParamsSource(nameof(Sizes))]
        public int Size { get; set; }

        /// <summary>Input pattern: "Asc", "Desc" or "Rand".</summary>
        //[Params("Asc", "Desc", "Rand")]
        [Params("Rand")]
        public string Pattern { get; set; }

        /// <summary>Selects the generator, creates the sorter and fills the array with 0..Size-1.</summary>
        /// <exception cref="ArgumentOutOfRangeException"><see cref="Pattern"/> is not a known pattern name.</exception>
        [GlobalSetup]
        public void GlobalSetup() {
            switch (Pattern) {
                case "Asc":
                    g = generators.Ascending;
                    break;
                case "Desc":
                    g = generators.Descending;
                    break;
                case "Rand":
                    g = generators.FisherYates;
                    break;
                default:
                    throw new ArgumentOutOfRangeException(nameof(Pattern));
            }
            n = SortingNetworks.UnsafeSort<int>.Create(Size);
            d = new int[Size];
            Filler();
        }

        // Also used to simulate sorting.
        void Filler() {
            for (int i = 0; i < d.Length; ++i)
                d[i] = i;
        }

        void ArraySorter() => Array.Sort(d);

        unsafe void NetworkSorter() {
            fixed (int* p = d)
                n.Sorter(p, d.Length);
        }

        /// <summary>
        /// Common benchmark body: runs the generator on the array, invokes <paramref name="sorter"/> and
        /// checks that element i equals i for every index.
        /// </summary>
        /// <param name="sorter">Action expected to leave the array sorted.</param>
        /// <param name="what">Message passed to <c>Environment.FailFast</c> when the check fails.</param>
        void Template(Action sorter, string what) {
            g(d);
            sorter();
            // Should leave the array sorted so no need to reinitialize it for the next iteration.
            for (int i = 0; i < d.Length; ++i) {
                if (d[i] != i)
                    Environment.FailFast(what);
            }
        }

        /// <summary>
        /// Baseline: run the generator, overwrite the array with the sorted sequence instead of sorting it,
        /// and validate for being sorted. The first and last step are common for all benchmarks.
        /// </summary>
        [Benchmark(Baseline = true)]
        public void NoSort() => Template(Filler, "Unsorted [Baseline].");

        /// <summary>
        /// Sorting by using Array.Sort().
        /// </summary>
        [Benchmark]
        public void ArraySort() => Template(ArraySorter, "Unsorted [ArraySort].");

        /// <summary>
        /// Sorting by using the sorting network.
        /// </summary>
        [Benchmark]
        public unsafe void NetworkSort() => Template(NetworkSorter, "Unsorted [NetworkSort].");

        // The numbers in-between powers of two are deliberately chosen to lie slightly below or above
        // the middle of the interval. This is to test the sorters for various lengths.
        public IEnumerable<int> Sizes => new int[] {
            4, 8, 12, 16, 27, 32, 47, 64, 128, 177, 256, 364, 512, 748, 1024, 2048, 3389, 4096, 6793, 8192, 14289, 16384,
            32768, 53151, 65536, 96317, 131072, 191217, 262144, 398853, 524288, 719289, 1048576
        };
    }
}

--------------------------------------------------------------------------------
/SortingNetworks/UnsafeSort.cs:
--------------------------------------------------------------------------------
using System;

namespace SortingNetworks
{
    /// <summary>
    /// Represents an in-place sorting method with possibly limited bounds on the valid values of <paramref name="c"/>.
    /// </summary>
    /// <typeparam name="T">Type of elements being sorted.</typeparam>
    /// <param name="data">Pointer to the beginning of the range to sort.</param>
    /// <param name="c">Number of elements in the range.</param>
    public unsafe delegate void Sorter<T>(T* data, int c) where T : unmanaged;

    /// <summary>
    /// Provides methods for sorting arrays of ints or floats using a periodic sorting network.
    /// </summary>
    /// <typeparam name="T">The type of array elements.</typeparam>
    /// <remarks>
    /// WARNING! All methods taking pointer arguments require that the allocated size is correct wrt. the implied or specified
    /// length. Also, the input length must conform to <see cref="MinLength"/> and <see cref="MaxLength"/> limits. Otherwise
    /// UNDEFINED BEHAVIOR occurs: incorrect result, data corruption or crash.
    /// </remarks>
    public abstract class UnsafeSort<T> where T : unmanaged
    {
        /// <summary>
        /// Creates an instance of UnsafeSort{T}.
        /// </summary>
        /// <param name="maxLength">
        /// Maximum array length supported by the sorter. Sorters for sizes of up to 16 are more efficient than the general-length
        /// sorters and should therefore be used for small arrays.
        /// </param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="maxLength"/> exceeds 2^24, which is the maximum supported value. - OR -
        /// <typeparamref name="T"/> is not int or float.
        /// </exception>
        public static UnsafeSort<T> Create(int maxLength) {
            // The detour through object is needed because the compiler cannot prove the cast for an open T.
            if (typeof(T) == typeof(int))
                return (UnsafeSort<T>)(object)new IntSorter(maxLength);
            if (typeof(T) == typeof(float))
                return (UnsafeSort<T>)(object)new FloatSorter(maxLength);

            // Two-argument overload: the single-string constructor would treat the text as a parameter name.
            throw new ArgumentOutOfRangeException(nameof(T), "Unsupported element type: " + typeof(T).Name);
        }

        // This base is derivable only in this assembly.
        private protected UnsafeSort()
        { }

        /// <summary>
        /// Minimum array length supported by this sorter.
        /// </summary>
        public int MinLength { get; protected set; }

        /// <summary>
        /// Maximum array length supported by this sorter.
        /// </summary>
        public int MaxLength { get; protected set; }

        /// <summary>
        /// Delegate that performs the actual sorting. WARNING! The count argument given to the delegate must be between
        /// <see cref="MinLength"/> and <see cref="MaxLength"/> (inclusive). No bounds are checked.
        /// </summary>
        public Sorter<T> Sorter { get; protected set; }

        /// <summary>
        /// Convenience overload for use in "safe" code. Checks preconditions and then invokes <see cref="Sorter"/>.
        /// </summary>
        /// <param name="data">Array to sort.</param>
        /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The array length is invalid.</exception>
        public unsafe void Sort(T[] data) {
            if (data == null)
                throw new ArgumentNullException(nameof(data));
            if (data.Length < MinLength || data.Length > MaxLength)
                throw new ArgumentOutOfRangeException(nameof(data), $"Invalid array length ({data.Length}).");
            fixed (T* p = data)
                Sorter(p, data.Length);
        }
    }
}

--------------------------------------------------------------------------------
/SNBenchmark/Validation.cs:
--------------------------------------------------------------------------------
1 | using System;
2 |
3 | namespace SNBenchmark
4 | {
5 | ///
6 | /// Validation methods for verifying output of a sorting network.
7 | ///
8 | static class Validation
9 | {
10 | ///
11 | /// Validates by exploiting theorem Z of TAOCOP section 5.3.4: it is
12 | /// sufficient to check that all 0-1 sequences (2^N of them) are sorted by the network.
13 | /// Only lengths of up to 28 are accepted.
14 | ///
15 | /// An instance of sorting network to test.
16 | /// Element count to test with.
17 | /// Sorter's length is larger than 28.
18 | /// Validation has failed.
19 | public static unsafe void Check(SortingNetworks.UnsafeSort sort, int size) {
20 | if (size < 4 || size > 32)
21 | throw new ArgumentOutOfRangeException(nameof(size), "Valid range is [4, 32].");
22 |
23 | var bits = new int[size];
24 |
25 | fixed (int* pbits = bits) {
26 | for (uint i = 0; i <= (1 << size) - 1; ++i) {
27 | int popcnt = 0; // Number of ones in i
28 | for (uint j = i, k = 0; k < size; ++k, j >>= 1) {
29 | int b = (int)(j & 1);
30 | pbits[k] = b;
31 | popcnt += b;
32 | }
33 |
34 | sort.Sorter(pbits, size);
35 |
36 | for (int k = 0; k < size - popcnt; ++k)
37 | if (pbits[k] != 0)
38 | throw new NotImplementedException($"Result is not a permutation for bit pattern {i:X8}.");
39 |
40 | for (int k = size - popcnt; k < size; ++k)
41 | if (pbits[k] != 1)
42 | throw new NotImplementedException($"Result is not a permutation for bit pattern {i:X8}.");
43 | }
44 | }
45 | }
46 |
47 | ///
48 | /// Overload for float arrays; .
49 | ///
50 | public static unsafe void Check(SortingNetworks.UnsafeSort sort, int size) {
51 | if (size < 4 || size > 32)
52 | throw new ArgumentOutOfRangeException(nameof(size), "Valid range is [4, 32].");
53 |
54 | var bits = new float[size];
55 |
56 | fixed (float* pbits = bits) {
57 | for (uint i = 0; i <= (1 << size) - 1; ++i) {
58 | int popcnt = 0; // Number of ones in i
59 | for (uint j = i, k = 0; k < size; ++k, j >>= 1) {
60 | int b = (int)(j & 1);
61 | pbits[k] = b;
62 | popcnt += b;
63 | }
64 |
65 | sort.Sorter(pbits, size);
66 |
67 | for (int k = 0; k < size - popcnt; ++k)
68 | if (pbits[k] != 0)
69 | throw new NotImplementedException($"Result is not a permutation for bit pattern {i:X8}.");
70 |
71 | for (int k = size - popcnt; k < size; ++k)
72 | if (pbits[k] != 1)
73 | throw new NotImplementedException($"Result is not a permutation for bit pattern {i:X8}.");
74 | }
75 | }
76 | }
77 |
78 | ///
79 | /// Checks whether the array is sorted in non-decreasing order (equal neighbours are allowed).
80 | ///
81 | /// True if no element is smaller than its predecessor, false otherwise. Arrays with fewer than two elements count as sorted.
82 | public static bool IsSorted(int[] data) {
83 | for (int i = 1; i < data.Length; ++i)
84 | if (data[i] < data[i - 1]) // A single adjacent pair out of order is enough to reject
85 | return false;
86 | return true;
87 | }
88 |
89 | ///
90 | /// Checks whether the array is sorted in non-decreasing order (equal neighbours are allowed).
91 | ///
92 | /// True if no element is smaller than its predecessor, false otherwise. Arrays with fewer than two elements count as sorted.
93 | public static bool IsSorted(float[] data) {
94 | for (int i = 1; i < data.Length; ++i)
95 | if (data[i] < data[i - 1]) // Any comparison involving NaN is false, so a NaN element is never reported as out of order
96 | return false;
97 | return true;
98 | }
99 | }
100 | }
101 |
--------------------------------------------------------------------------------
/SortingNetworks/Attic/Periodic16Branchless.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.Intrinsics;
4 | using System.Runtime.Intrinsics.X86;
5 |
6 | namespace SortingNetworks.Attic
7 | {
8 | using V = System.Runtime.Intrinsics.Vector256;
9 |
10 | ///
11 | /// Reference, branchless implementation of 16-element periodic sorting network.
12 | ///
13 | public static class Periodic16Branchless
14 | {
15 | // All zeros
16 | static readonly V Zero = Vector256.Create(0);
17 | // All ones
18 | static readonly V Complement = Avx2.CompareEqual(Zero, Zero);
19 | // FF00FF00 (1 digit = 32 bits)
20 | static readonly V AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8));
21 | // 0F0F0F0F (low 32 bits of every 64-bit element set)
22 | static readonly V AlternatingMaskLo32 = Avx2.Xor(
23 | Complement.AsInt64(),
24 | Avx2.ShiftLeftLogical(Complement.AsInt64(), 32)
25 | ).AsInt32();
26 |
27 | ///
28 | /// In-place sorting of 16 elements starting at .
29 | ///
30 | ///
31 | public static unsafe void Sort(int* data) {
32 | var lo = Avx.LoadVector256(data);
33 | var hi = Avx.LoadVector256(data + 8);
34 |
35 | Step(ref lo, ref hi);
36 | Step(ref lo, ref hi);
37 | Step(ref lo, ref hi);
38 | Step(ref lo, ref hi);
39 |
40 | Avx.Store(data, lo);
41 | Avx.Store(data + 8, hi);
42 | }
43 |
44 | ///
45 | /// Test method for debugging instruction sequences.
46 | ///
47 | public static unsafe void Test() {
48 | var data = new int[16];
49 | for (int i = 0; i < 16; ++i) data[i] = i;
50 | fixed (int* p = data) {
51 | var lo = Avx.LoadVector256(p);
52 | var hi = Avx.LoadVector256(p + 8);
53 | Step(ref lo, ref hi);
54 | }
55 | }
56 |
57 | ///
58 | /// One step of the sorting network for 16 elements. Must be iterated 4 times.
59 | ///
60 | ///
61 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
62 | static void Step(ref V lo, ref V hi) {
63 | V tmp1, tmp2;
64 |
65 | // lo, hi are intermediate results after each stage and input to next one.
66 |
67 | // STAGE 1:
68 | // 76543210
69 | // 89ABCDEF
70 |
71 | tmp1 = Avx2.Shuffle(hi, 0x1B); // CDEF89AB
72 | hi = Avx2.Permute2x128(tmp1, tmp1, 1); // 89ABCDEF
73 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo));
74 |
75 | // STAGE 2:
76 | // BA983210
77 | // CDEF4567
78 |
79 | tmp1 = Avx2.Permute2x128(lo, hi, 0x31); // 89AB7654
80 | tmp1 = Avx2.Shuffle(tmp1, 0x1B); // BA984567
81 | lo = Avx2.Permute2x128(lo, tmp1, 0x30); // BA983210
82 | hi = Avx2.Permute2x128(hi, tmp1, 0x02); // CDEF4567
83 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo));
84 |
85 | // STAGE 3:
86 | // DC985410
87 | // EFAB6723
88 |
89 | Swap(ref lo, ref hi, AlternatingMaskHi64); // L:CD984510 - H:BAEF3267
90 | lo = Avx2.Shuffle(lo, 0b01001011); //
91 | hi = Avx2.Shuffle(hi, 0b10110100); //
92 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo));
93 |
94 | // STAGE 4:
95 | // ECA86420
96 | // FDB97531
97 |
98 | Swap(ref lo, ref hi, AlternatingMaskLo32); // L:ECA86420 - H:DF9B5713
99 | hi = Avx2.Shuffle(hi, 0b10110001);
100 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo));
101 |
102 | // Final stage: restore order.
103 |
104 | tmp1 = Avx2.UnpackLow(lo, hi);
105 | tmp2 = Avx2.UnpackHigh(lo, hi);
106 | lo = Avx2.Permute2x128(tmp1, tmp2, 0x20);
107 | hi = Avx2.Permute2x128(tmp1, tmp2, 0x31);
108 | }
109 |
110 | ///
111 | /// Swaps elements of and where is 1.
112 | ///
113 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
114 | static void Swap(ref V lo, ref V hi, V mask) {
115 | var t = Avx2.BlendVariable(lo, hi, mask);
116 | lo = Avx2.BlendVariable(hi, lo, mask);
117 | hi = t;
118 | }
119 | }
120 | }
121 |
--------------------------------------------------------------------------------
/SNBenchmark/Program.cs:
--------------------------------------------------------------------------------
1 | using System;
2 |
3 | using BenchmarkDotNet.Configs;
4 | using BenchmarkDotNet.Running;
5 |
6 | namespace SNBenchmark
7 | {
8 | unsafe class Program
9 | {
10 | static void Main(string[] args) { // Entry point: dispatches on args[0] (VI, VF or B); anything else prints usage
11 | if (args.Length == 0)
12 | Usage(); // Usage() ends the process via Environment.Exit, so args[0] below is only read when it exists
13 |
14 | if (args[0] == "VI") {
15 | ValidateInt();
16 | }
17 | else if (args[0] == "VF") {
18 | ValidateFloat();
19 | }
20 | else if (args[0] == "B") {
21 | //var ss = new BenchmarkDotNet.Reports.BenchmarkReport
22 | // TODO: SummaryStyle; InvariantCulture
23 | var config = ManualConfig.Create(DefaultConfig.Instance)
24 | .WithOptions(ConfigOptions.StopOnFirstError | ConfigOptions.JoinSummary);
25 | var args1 = new string[args.Length - 1]; // Everything after "B" is forwarded to BenchmarkDotNet
26 | Array.Copy(args, 1, args1, 0, args.Length - 1);
27 | BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args1, config); // Pass config; it was previously built but never used
28 | }
29 | else {
30 | Usage();
31 | }
32 |
33 | Environment.Exit(0);
34 | }
35 |
36 | static void Usage() {
37 | Console.WriteLine("USAGE: {VI | VF | B} [argument...]");
38 | Console.WriteLine("VI validates int sorting networks for all sizes up to 32.");
39 | Console.WriteLine("VF validates float sorting networks for all sizes up to 32.");
40 | Console.WriteLine("B runs benchmarks with arguments following it.");
41 | Environment.Exit(0);
42 | }
43 |
44 | static void ValidateInt() {
45 | for (int size = 4; size <= 32; ++size) {
46 | var n = SortingNetworks.UnsafeSort.Create(size);
47 | Console.Write($"Validating size {size:D2}: ");
48 | try {
49 | Validation.Check(n, size);
50 | Console.WriteLine("OK");
51 | }
52 | catch (NotImplementedException e) {
53 | Console.WriteLine($"FAILED: {e.Message}");
54 | }
55 | }
56 | }
57 |
58 | static void ValidateFloat() {
59 | for (int size = 4; size <= 32; ++size) {
60 | var n = SortingNetworks.UnsafeSort.Create(size);
61 | Console.Write($"Validating size {size:D2}: ");
62 | try {
63 | Validation.Check(n, size);
64 | Console.WriteLine("OK");
65 | }
66 | catch (NotImplementedException e) {
67 | Console.WriteLine($"FAILED: {e.Message}");
68 | }
69 | }
70 | }
71 |
72 | // This exists only for sporadic testing and debugging.
73 | static void Test() {
74 | var d = new int[11157];
75 | int[] dc;
76 | var g = new Generators();
77 | var nn = SortingNetworks.UnsafeSort.Create(d.Length);
78 | for (int i = 0; i < d.Length; ++i) d[i] = i;
79 |
80 | var iteration = 0;
81 | while (true) {
82 | ++iteration;
83 | //if ((iteration % 1000) == 0)
84 | // Console.WriteLine(iteration);
85 | g.FisherYates(d);
86 | dc = (int[])d.Clone();
87 | fixed (int* p = d)
88 | nn.Sorter(p, d.Length);
89 | for (int i = 0; i < d.Length; ++i)
90 | if (d[i] != i)
91 | throw new NotImplementedException();
92 | }
93 | }
94 |
95 | static unsafe void TestAESRand() {
96 | var r = new SortingNetworks.AESRand(new int[4] { 2, 3, 5, 7, });
97 |
98 | int[] idata = new int[4];
99 | float[] fdata = new float[4];
100 |
101 | for (int i = 0; i < 4; ++i) {
102 | fixed (int* p = idata)
103 | r.Get4(p);
104 | fixed (float* p = fdata)
105 | r.Get4U(p);
106 | fixed (float* p = fdata)
107 | r.Get4N(p);
108 | }
109 | }
110 |
111 | static unsafe void TestMWC1616Rand() {
112 | var r = new SortingNetworks.MWC1616Rand(new int[8] { 2, 3, 5, 7, 11, 13, 17, 19, });
113 |
114 | int[] idata = new int[4];
115 | float[] fdata = new float[4];
116 |
117 | for (int i = 0; i < 4; ++i) {
118 | fixed (int* p = idata)
119 | r.Get4(p);
120 | fixed (float* p = fdata)
121 | r.Get4U(p);
122 | fixed (float* p = fdata)
123 | r.Get4N(p);
124 | }
125 | }
126 | }
127 | }
128 |
--------------------------------------------------------------------------------
/SortingNetworks/PeriodicInt.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.Intrinsics;
4 | using System.Runtime.Intrinsics.X86;
5 |
6 | namespace SortingNetworks
7 | {
8 | using V = Vector256;
9 |
10 | ///
11 | /// Provides methods for sorting integer arrays of lengths that are a power of two. Invoking public members with
12 | /// arrays that are of shorter length will result in UNDEFINED BEHAVIOR (data corruption, crash).
13 | ///
14 | ///
15 | /// You're not expected to understand this code unless you have read the paper by Dowd et al.
16 | ///
17 | public unsafe partial class PeriodicInt
18 | {
19 | public readonly V Zero; // 00000000
20 | public readonly V Complement; // FFFFFFFF
21 | public readonly V AlternatingMaskLo128; // 0000FFFF
22 | public readonly V AlternatingMaskHi128; // FFFF0000
23 | public readonly V AlternatingMaskHi64; // FF00FF00
24 | public readonly V AlternatingMaskHi32; // F0F0F0F0
25 | public readonly V Max; // int.MaxValue in each element
26 | public readonly V ReversePermutation; // Input to VPERMD that reverses all 8 ints
27 | public readonly V LoadMask; // Input to AlignRight for creating load mask
28 |
29 | public PeriodicInt() { // Builds the constant vectors (masks and permutation indices) used by the sorting blocks
30 | Zero = V.Zero;
31 | Complement = Avx2.CompareEqual(Zero, Zero); // x == x holds in every element, which yields all-ones
32 | AlternatingMaskHi128 = Vector256.Create(0L, 0L, -1L, -1L).AsInt32(); // Upper four ints set
33 | AlternatingMaskLo128 = Vector256.Create(-1L, -1L, 0L, 0L).AsInt32(); // Lower four ints set
34 | AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8)); // Upper 64 bits of each 128-bit lane set
35 | AlternatingMaskHi32 = Avx2.Xor(Complement.AsInt64(), Avx2.ShiftRightLogical(Complement.AsInt64(), 32)).AsInt32(); // Upper 32 bits of each 64-bit element set
36 | Max = Vector256.Create(int.MaxValue);
37 | ReversePermutation = Vector256.Create(7, 6, 5, 4, 3, 2, 1, 0); // NOTE(review): LoadMask is not assigned in this constructor and keeps its default value - confirm this is intended
38 | }
39 |
40 | // This is the last size that can be reasonably inlined due to code size (> 1kB of straight-line code) and # of used registers.
41 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
42 | public unsafe void Sort32(int* data) {
43 | var v0 = Avx.LoadVector256(data + 0);
44 | var v1 = Avx.LoadVector256(data + 8);
45 | var v2 = Avx.LoadVector256(data + 16);
46 | var v3 = Avx.LoadVector256(data + 24);
47 | Block_32_1(2, ref v0, ref v1, ref v2, ref v3);
48 | Block_32_1(3, ref v0, ref v1, ref v2, ref v3);
49 | Block_32_1(4, ref v0, ref v1, ref v2, ref v3);
50 | Block_32_1(5, ref v0, ref v1, ref v2, ref v3);
51 | Block_32_1(5, ref v0, ref v1, ref v2, ref v3);
52 | Avx.Store(data + 0, v0);
53 | Avx.Store(data + 8, v1);
54 | Avx.Store(data + 16, v2);
55 | Avx.Store(data + 24, v3);
56 | }
57 |
58 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
59 | public unsafe void Sort32(int* data, int c) {
60 | throw new NotImplementedException("Sort32");
61 | }
62 |
63 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
64 | public unsafe void Sort16(int* data) {
65 | var v0 = Avx.LoadVector256(data + 0);
66 | var v1 = Avx.LoadVector256(data + 8);
67 | Block_16_1(2, ref v0, ref v1);
68 | Block_16_1(3, ref v0, ref v1);
69 | Block_16_1(4, ref v0, ref v1);
70 | Block_16_1(4, ref v0, ref v1);
71 | Avx.Store(data + 0, v0);
72 | Avx.Store(data + 8, v1);
73 | }
74 |
75 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
76 | public unsafe void Sort16(int* data, int c) {
77 | throw new NotImplementedException("Sort16");
78 | }
79 |
80 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
81 | public unsafe void Sort8(int* data) {
82 | var v = Avx.LoadVector256(data);
83 | Block_8_1(2, ref v);
84 | Block_8_1(3, ref v);
85 | Block_8_1(3, ref v);
86 | Avx.Store(data, v);
87 | }
88 |
89 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
90 | public unsafe void Sort8(int* data, int c) {
91 | throw new NotImplementedException("Sort8");
92 | }
93 |
94 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
95 | public unsafe void Sort4(int* data) {
96 | var v = Avx2.MaskLoad(data, AlternatingMaskLo128);
97 | Block_4_2(2, ref v);
98 | Block_4_2(2, ref v);
99 | Avx2.MaskStore(data, AlternatingMaskLo128, v);
100 | }
101 |
102 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
103 | public unsafe void Sort4(int* data, int c) {
104 | throw new NotImplementedException("Sort4");
105 | }
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/SortingNetworks/UnsafeRandom.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.Intrinsics;
3 | using System.Runtime.Intrinsics.X86;
4 |
5 | namespace SortingNetworks
6 | {
7 | ///
8 | /// Provides methods for fast, "unsafe" generation of integer or floating-point random numbers.
9 | ///
10 | public abstract class UnsafeRandom
11 | {
12 | readonly Vector128 oneMask;
13 | readonly Vector128 one;
14 | readonly Vector128 complement;
15 |
16 | protected UnsafeRandom() {
17 | oneMask = Vector128.Create(0x3F800000);
18 | one = Vector128.Create(1.0f);
19 | complement = Vector128.Create(-1);
20 | }
21 |
22 | ///
23 | /// Returns 4 random numbers in a vector.
24 | ///
25 | public abstract Vector128 Get4();
26 |
27 | ///
28 | /// Overwrites the initial 4 elements of with random 32-bit integers.
29 | ///
30 | ///
31 | /// Pointer to a memory chunk of at least 4 integers. Behaviour is UNDEFINED if the allocated
32 | /// space for the chunk is shorter.
33 | ///
34 | public unsafe void Get4(int* data) {
35 | var v = Get4();
36 | Sse2.Store(data, v);
37 | }
38 |
39 | ///
40 | /// Overwrites the initial c elements of data with random 32-bit integers.
41 | ///
42 | ///
43 | /// Pointer to a memory chunk of at least c integers. Behaviour is UNDEFINED if the allocated
44 | /// space for the chunk is shorter.
45 | ///
46 | ///
47 | /// Number of elements to write. Must be between 0 and 4; the value is not validated here.
48 | ///
49 | public unsafe void Get(int* data, int c) {
50 | var v = Get4();
51 | var m = Sse2.ShiftRightLogical128BitLane(complement, (byte)(4 * (4 - c))); // Shift all-ones right by 4 * (4 - c) bytes: only the lowest c elements stay set
52 | Avx2.MaskStore(data, m, v); // Writes only the elements selected by the mask
53 | }
54 |
55 | ///
56 | /// Overwrites the initial 4 elements of with floats in range [-2^31, 2^31).
57 | ///
58 | ///
59 | /// Pointer to a memory chunk of at least 4 floats.
60 | /// Behaviour is UNDEFINED if the allocated space for the chunk is shorter.
61 | ///
62 | public unsafe void Get4U(float* data) {
63 | var v = Get4();
64 | var f = Sse2.ConvertToVector128Single(v);
65 | Sse.Store(data, f);
66 | }
67 |
68 | ///
69 | /// Overwrites the initial c elements of data with floats obtained by converting random 32-bit integers (magnitude up to 2^31).
70 | ///
71 | ///
72 | /// Pointer to a memory chunk of at least c floats. Behaviour is UNDEFINED if the
73 | /// allocated space for the chunk is shorter.
74 | ///
75 | ///
76 | /// Number of elements to write. Must be between 0 and 4; the value is not validated here.
77 | ///
78 | public unsafe void Get4U(float* data, int c) {
79 | var v = Get4();
80 | var m = Sse2.ShiftRightLogical128BitLane(complement, (byte)(4 * (4 - c))); // Shift all-ones right by 4 * (4 - c) bytes: only the lowest c elements stay set
81 | var f = Sse2.ConvertToVector128Single(v); // Numeric int -> float conversion, not a bit reinterpretation
82 | Avx.MaskStore(data, m.AsSingle(), f); // Writes only the elements selected by the mask
83 | }
84 |
85 | ///
86 | /// Overwrites the initial 4 elements of with floats in range [0, 1).
87 | ///
88 | ///
89 | /// Pointer to a memory chunk of at least 4 floats.
90 | /// Behaviour is UNDEFINED if the allocated space for the chunk is shorter.
91 | ///
92 | public unsafe void Get4N(float* data) {
93 | var v = Get4();
94 | // Keep 23 MSB bits of the random integer and convert to [1.0,2.0)
95 | v = Sse2.Or(Sse2.ShiftRightLogical(v, 9), oneMask);
96 | var f = Sse.Subtract(v.AsSingle(), one);
97 | Sse.Store(data, f);
98 | }
99 |
100 | ///
101 | /// Overwrites the initial c elements of data with floats in range [0, 1).
102 | ///
103 | ///
104 | /// Pointer to a memory chunk of at least c floats. Behaviour is UNDEFINED if the
105 | /// allocated space for the chunk is shorter.
106 | ///
107 | ///
108 | /// Number of elements to write. Must be between 0 and 4; the value is not validated here.
109 | ///
110 | public unsafe void Get4N(float* data, int c) {
111 | var v = Get4();
112 | v = Sse2.Or(Sse2.ShiftRightLogical(v, 9), oneMask); // Top 23 random bits become the mantissa under the bit pattern of 1.0f (0x3F800000), giving [1.0, 2.0)
113 | var m = Sse2.ShiftRightLogical128BitLane(complement, (byte)(4 * (4 - c))); // Shift all-ones right by 4 * (4 - c) bytes: only the lowest c elements stay set
114 | var f = Sse.Subtract(v.AsSingle(), one); // Move [1.0, 2.0) down to [0, 1)
115 | Avx.MaskStore(data, m.AsSingle(), f); // Writes only the elements selected by the mask
116 | }
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
/SortingNetworks/PeriodicInt_Block.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.Intrinsics;
4 | using System.Runtime.Intrinsics.X86;
5 |
6 | namespace SortingNetworks
7 | {
8 | using V = Vector256;
9 |
10 | partial class PeriodicInt
11 | {
12 | ///
13 | /// Used to implement a single compare-swap phase for N elements; this processes 32 items at a time.
14 | /// Range [b, b+16) is compared/exchanged with reversed range [e-16, e).
15 | ///
16 | ///
17 | /// TODO: Should use aligned loads and non-temporal stores once it's possible to allocate aligned storage in .NET.
18 | ///
19 | /// Start of the block.
20 | /// One past the end of the block.
21 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
22 | public unsafe void Phase_N_32(int* b, int* e) {
23 | V m0, m1;
24 |
25 | // Low half.
26 | var v0 = Avx.LoadVector256(b + 0);
27 | var v1 = Avx.LoadVector256(b + 8);
28 |
29 | // High half. Interleave loads with reversing.
30 | var v2 = Avx2.PermuteVar8x32(Avx.LoadVector256(e - 16), ReversePermutation);
31 | var v3 = Avx2.PermuteVar8x32(Avx.LoadVector256(e - 8), ReversePermutation);
32 |
33 | // Comparisons, interleaved with stores. Min/max have throughput of 0.5, so we can execute two at once.
34 | // Use m0 and m1 to exploit the fact that min/max have a throughput < 1.
35 | m0 = Avx2.Min(v0, v3);
36 | m1 = Avx2.PermuteVar8x32(Avx2.Max(v0, v3), ReversePermutation);
37 | Avx.Store(b + 0, m0);
38 | Avx.Store(e - 8, m1);
39 |
40 | m0 = Avx2.Min(v1, v2);
41 | m1 = Avx2.PermuteVar8x32(Avx2.Max(v1, v2), ReversePermutation);
42 | Avx.Store(b + 8, m0);
43 | Avx.Store(e - 16, m1);
44 | }
45 |
46 | ///
47 | /// Block for sorting one vector of 32 elements (four registers).
48 | ///
49 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
50 | public void Block_32_1(int p, ref V _v0, ref V _v1, ref V _v2, ref V _v3) {
51 | V v0 = _v0, v1 = _v1, v2, v3, m0, m1;
52 |
53 | v2 = Avx2.PermuteVar8x32(_v2, ReversePermutation);
54 | v3 = Avx2.PermuteVar8x32(_v3, ReversePermutation);
55 | m0 = Avx2.Max(v0, v3);
56 | m1 = Avx2.Max(v1, v2);
57 | v0 = Avx2.Min(v0, v3);
58 | v1 = Avx2.Min(v1, v2);
59 | v2 = Avx2.PermuteVar8x32(m1, ReversePermutation);
60 | v3 = Avx2.PermuteVar8x32(m0, ReversePermutation);
61 | if (p == 1)
62 | goto done;
63 |
64 | Block_16_1(p - 1, ref v0, ref v1);
65 | Block_16_1(p - 1, ref v2, ref v3);
66 |
67 | done:
68 | _v0 = v0; _v1 = v1;
69 | _v2 = v2; _v3 = v3;
70 | }
71 |
72 | ///
73 | /// Block for sorting one vector of 16 elements (two registers).
74 | ///
75 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
76 | public void Block_16_1(int p, ref V _v0, ref V _v1) {
77 | V v0 = _v0, v1, m;
78 |
79 | v1 = Avx2.PermuteVar8x32(_v1, ReversePermutation);
80 | m = Avx2.Max(v0, v1);
81 | v0 = Avx2.Min(v0, v1);
82 | v1 = Avx2.PermuteVar8x32(m, ReversePermutation);
83 | if (p == 1)
84 | goto done;
85 |
86 | Block_8_1(p - 1, ref v0);
87 | Block_8_1(p - 1, ref v1);
88 |
89 | done:
90 | _v0 = v0;
91 | _v1 = v1;
92 | }
93 |
94 | ///
95 | /// Block for sorting one vector of 8 elements.
96 | ///
97 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
98 | public void Block_8_1(int p, ref V _v) {
99 | V v0 = _v, v1, m;
100 |
101 | // PHASE1:
102 | // 76543210
103 | // 01234567
104 |
105 | v1 = Avx2.PermuteVar8x32(v0, ReversePermutation);
106 | m = Avx2.CompareGreaterThan(v0, v1);
107 | m = Avx2.Xor(m, AlternatingMaskHi128);
108 | v0 = Avx2.BlendVariable(v0, v1, m);
109 | if (p == 1)
110 | goto done;
111 |
112 | Block_4_2(p - 1, ref v0);
113 |
114 | done:
115 | _v = v0;
116 | }
117 |
118 | ///
119 | /// Block for sorting 2 independent vectors of 4 elements each.
120 | ///
121 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
122 | public void Block_4_2(int p, ref V _v) {
123 | V v0 = _v, v1, m;
124 |
125 | // PHASE1:
126 | // 3210 (INPUT, same in both lanes)
127 | // 0123
128 |
129 | v1 = Avx2.Shuffle(v0, 0x1B); // 0123
130 | m = Avx2.CompareGreaterThan(v0, v1);
131 | m = Avx2.Xor(m, AlternatingMaskHi64);
132 | v0 = Avx2.BlendVariable(v0, v1, m);
133 | if (p == 1)
134 | goto done;
135 |
136 | // PHASE2:
137 | // 3210
138 | // 2301
139 |
140 | v1 = Avx2.Shuffle(v0, 0b10110001); // 2301
141 | m = Avx2.CompareGreaterThan(v0, v1);
142 | m = Avx2.Xor(m, AlternatingMaskHi32);
143 | v0 = Avx2.BlendVariable(v0, v1, m);
144 |
145 | done:
146 | _v = v0;
147 | }
148 | }
149 | }
150 |
--------------------------------------------------------------------------------
/SortingNetworks/Attic/PeriodicInt_Block.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.Intrinsics;
4 | using System.Runtime.Intrinsics.X86;
5 |
6 | namespace SortingNetworks.Attic
7 | {
8 | using V = Vector256;
9 |
10 | public partial class PeriodicInt
11 | {
12 | ///
13 | /// Operations of a (potentially partial) 32-block. The integers are ordered from the least significant element in
14 | /// _v0 to the most significant element in _v3.
15 | ///
16 | /// Phase to stop at; must be 1-5. NOTE(review): no early exit for p == 1 is visible here; Block16 is always called with p - 1 - confirm callers pass p >= 2.
17 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
18 | void Block32(int p, ref V _v0, ref V _v1, ref V _v2, ref V _v3) {
19 | var v2 = Avx2.Permute2x128(_v2, _v2, 0x01); // Swap the 128-bit halves...
20 | var v3 = Avx2.Permute2x128(_v3, _v3, 0x01);
21 | v2 = Avx2.Shuffle(v2, 0x1B); // ...then reverse the four ints within each half: all 8 elements are now reversed
22 | v3 = Avx2.Shuffle(v3, 0x1B);
23 |
24 | Swap(ref _v0, ref v3, Avx2.CompareGreaterThan(v3, _v0)); // 0-7 : 31:24 (minima end up in _v0, maxima in v3)
25 | Swap(ref _v1, ref v2, Avx2.CompareGreaterThan(v2, _v1)); // 8-15 : 23:16 (minima end up in _v1, maxima in v2)
26 |
27 | v2 = Avx2.Shuffle(v2, 0x1B); // Undo the reversal so v2/v3 are back in their original element order
28 | v3 = Avx2.Shuffle(v3, 0x1B);
29 | v2 = Avx2.Permute2x128(v2, v2, 0x01);
30 | v3 = Avx2.Permute2x128(v3, v3, 0x01);
31 |
32 | Block16(p - 1, ref _v0, ref _v1); // Remaining phases run independently on each 16-element half
33 | Block16(p - 1, ref v2, ref v3);
34 | _v2 = v2; _v3 = v3;
35 | }
36 |
37 | ///
38 | /// Performs the operations of a single, potentially partial, 16-block. The integers in each half (vector parameter)
39 | /// are ordered from least to most significant bits.
40 | ///
41 | /// Phase to stop at; must be 1, 2, 3 or 4. Unchecked; 4 or any other value will run the whole block.
42 | /// Low half of elements to sort.
43 | /// High half of elements to sort.
44 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
45 | void Block16(int p, ref V _lo, ref V _hi) {
46 | V lo = _lo, hi = _hi; // Stack-allocated to eliminate unnecessary loads/stores to refs
47 | V tmp1, tmp2;
48 |
49 | // INPUT:
50 | // 76543210
51 | // FEDCBA98
52 | // lo, hi are intermediate results after each stage and input to next one.
53 |
54 | // PHASE 1:
55 | // 76543210
56 | // 89ABCDEF
57 |
58 | tmp1 = Avx2.Shuffle(hi, 0x1B); // CDEF89AB
59 | hi = Avx2.Permute2x128(tmp1, tmp1, 0x01); // 89ABCDEF
60 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo));
61 | if (p == 1) {
62 | hi = Avx2.Permute2x128(hi, hi, 0x01);
63 | hi = Avx2.Shuffle(hi, 0x1B);
64 | _lo = lo; _hi = hi;
65 | return;
66 | }
67 |
68 | // PHASE 2:
69 | // BA983210
70 | // CDEF4567
71 |
72 | tmp1 = Avx2.Permute2x128(lo, hi, 0x31); // 89AB7654
73 | tmp1 = Avx2.Shuffle(tmp1, 0x1B); // BA984567
74 | lo = Avx2.Permute2x128(lo, tmp1, 0x30); // BA983210
75 | hi = Avx2.Permute2x128(hi, tmp1, 0x02); // CDEF4567
76 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo));
77 | if (p == 2) {
78 | hi = Avx2.Shuffle(hi, 0x1B); // FEDC7654
79 | tmp1 = Avx2.Permute2x128(lo, hi, 0x21); // 7654BA98
80 | Swap(ref lo, ref tmp1, AlternatingMaskLo128);
81 | Swap(ref hi, ref tmp1, AlternatingMaskHi128);
82 | _lo = lo; _hi = hi;
83 | return;
84 | }
85 |
86 | // PHASE 3:
87 | // DC985410
88 | // EFAB6723
89 |
90 | Swap(ref lo, ref hi, AlternatingMaskHi64); // L:CD984510 - H:BAEF3267
91 | lo = Avx2.Shuffle(lo, 0b01001011); //
92 | hi = Avx2.Shuffle(hi, 0b10110100); //
93 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo));
94 | if (p == 3) {
95 | hi = Avx2.Shuffle(hi, 0b10110001); // FEBA7632
96 | tmp1 = Avx2.UnpackLow(lo.AsInt64(),
97 | hi.AsInt64()).AsInt32(); // BA983210
98 | tmp2 = Avx2.UnpackHigh(lo.AsInt64(),
99 | hi.AsInt64()).AsInt32(); // FEDC7654
100 | goto fixup;
101 | }
102 |
103 | // PHASE 4:
104 | // ECA86420
105 | // FDB97531
106 |
107 | Swap(ref lo, ref hi, AlternatingMaskLo32); // L:ECA86420 - H:DF9B5713
108 | hi = Avx2.Shuffle(hi, 0b10110001);
109 | Swap(ref lo, ref hi, Avx2.CompareGreaterThan(hi, lo));
110 |
111 | // Final stage: restore order.
112 |
113 | tmp1 = Avx2.UnpackLow(lo, hi);
114 | tmp2 = Avx2.UnpackHigh(lo, hi);
115 |
116 | fixup:
117 | lo = Avx2.Permute2x128(tmp1, tmp2, 0x20);
118 | hi = Avx2.Permute2x128(tmp1, tmp2, 0x31);
119 | _lo = lo; _hi = hi;
120 | }
121 |
122 | ///
123 | /// Swaps elements of and where is 1.
124 | ///
125 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
126 | static void Swap(ref V lo, ref V hi, V mask) {
127 | var t = Avx2.BlendVariable(lo, hi, mask);
128 | lo = Avx2.BlendVariable(hi, lo, mask);
129 | hi = t;
130 | }
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/SortingNetworks/Attic/Periodic16Expr.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq.Expressions;
4 | using System.Reflection;
5 | using System.Runtime.Intrinsics;
6 | using System.Runtime.Intrinsics.X86;
7 |
8 | namespace SortingNetworks.Attic
9 | {
10 | using V = System.Runtime.Intrinsics.Vector256;
11 |
12 | ///
13 | /// Builds an expression for periodic sorting network as compiled lambda.
14 | /// The network is hard-coded to 16 elements.
15 | ///
16 | public class Periodic16Expr
17 | {
18 | unsafe delegate void RegisterSort(ref V lo, ref V hi);
19 |
20 | static readonly Type TAVX = typeof(Avx);
21 | static readonly Type TAVX2 = typeof(Avx2);
22 | static readonly Type TV = typeof(V);
23 |
24 | // All zeros
25 | static readonly V Zero = Vector256.Create(0);
26 | // All ones
27 | static readonly V Complement = Avx2.CompareEqual(Zero, Zero);
28 | // FF00FF00 (1 digit = 32 bits)
29 | static readonly V AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8));
30 | // 0F0F0F0F (low 32 bits of every 64-bit element set)
31 | static readonly V AlternatingMaskLo32 = Avx2.Xor(
32 | Complement.AsInt64(),
33 | Avx2.ShiftLeftLogical(Complement.AsInt64(), 32)
34 | ).AsInt32();
35 |
36 | static readonly MethodInfo Shuffle = TAVX2.GetMethod("Shuffle", new Type[] { TV, typeof(byte) });
37 | static readonly MethodInfo BlendVariable = TAVX2.GetMethod("BlendVariable", new Type[] { TV, TV, TV });
38 | static readonly MethodInfo Permute2x128 = TAVX2.GetMethod("Permute2x128", new Type[] { TV, TV, typeof(byte) });
39 | static readonly MethodInfo CompareGreaterThan = TAVX2.GetMethod("CompareGreaterThan", new Type[] { TV, TV });
40 | static readonly MethodInfo UnpackLow = TAVX2.GetMethod("UnpackLow", new Type[] { TV, TV });
41 | static readonly MethodInfo UnpackHigh = TAVX2.GetMethod("UnpackHigh", new Type[] { TV, TV });
42 |
43 | readonly ParameterExpression lo;
44 | readonly ParameterExpression hi;
45 | readonly ParameterExpression tmp1;
46 | readonly ParameterExpression tmp2;
47 | readonly ParameterExpression tmp3;
48 | readonly RegisterSort sort;
49 |
50 | public Periodic16Expr() {
51 | lo = Expression.Parameter(TV.MakeByRefType(), "lo");
52 | hi = Expression.Parameter(TV.MakeByRefType(), "hi");
53 | tmp1 = Expression.Variable(TV, "tmp1");
54 | tmp2 = Expression.Variable(TV, "tmp2");
55 | tmp3 = Expression.Variable(TV, "tmp3");
56 |
57 | var steps = new List();
58 | for (int i = 0; i < 4; ++i)
59 | steps.AddRange(Step());
60 |
61 | var l = Expression.Lambda(
62 | Expression.Block(new ParameterExpression[] { tmp1, tmp2, tmp3 }, steps),
63 | new ParameterExpression[] { lo, hi });
64 | sort = l.Compile(false);
65 | }
66 |
67 | public unsafe void Sort(int* data) {
68 | var lo = Avx.LoadVector256(data);
69 | var hi = Avx.LoadVector256(data + 8);
70 | sort(ref lo, ref hi);
71 | Avx.Store(data, lo);
72 | Avx.Store(data + 8, hi);
73 | }
74 |
75 | private List Step() {
76 | var es = new List();
77 |
78 | // STAGE1
79 |
80 | es.AddRange(new Expression[] {
81 | Expression.Assign(tmp1, Expression.Call(Shuffle, hi, Expression.Constant((byte)0x1B))),
82 | Expression.Assign(hi, Expression.Call(Permute2x128, tmp1, tmp1, Expression.Constant((byte)1))),
83 | Expression.Assign(tmp1, Expression.Call(CompareGreaterThan, hi, lo))
84 | });
85 | es.AddRange(Swap(lo, hi, tmp1));
86 |
87 | // STAGE2
88 |
89 | es.AddRange(new Expression[] {
90 | Expression.Assign(tmp1, Expression.Call(Permute2x128, lo, hi, Expression.Constant((byte)0x31))),
91 | Expression.Assign(tmp1, Expression.Call(Shuffle, tmp1, Expression.Constant((byte)0x1B))),
92 | Expression.Assign(lo, Expression.Call(Permute2x128, lo, tmp1, Expression.Constant((byte)0x30))),
93 | Expression.Assign(hi, Expression.Call(Permute2x128, hi, tmp1, Expression.Constant((byte)0x02))),
94 | Expression.Assign(tmp1, Expression.Call(CompareGreaterThan, hi, lo))
95 | });
96 | es.AddRange(Swap(lo, hi, tmp1));
97 |
98 | // STAGE3
99 |
100 | es.AddRange(Swap(lo, hi, Expression.Constant(AlternatingMaskHi64)));
101 | es.AddRange(new Expression[] {
102 | Expression.Assign(lo, Expression.Call(Shuffle, lo, Expression.Constant((byte)0b01001011))),
103 | Expression.Assign(hi, Expression.Call(Shuffle, hi, Expression.Constant((byte)0b10110100))),
104 | Expression.Assign(tmp1, Expression.Call(CompareGreaterThan, hi, lo))
105 | });
106 | es.AddRange(Swap(lo, hi, tmp1));
107 |
108 | // STAGE4
109 |
110 | es.AddRange(Swap(lo, hi, Expression.Constant(AlternatingMaskLo32)));
111 | es.AddRange(new Expression[] {
112 | Expression.Assign(hi, Expression.Call(Shuffle, hi, Expression.Constant((byte)0b10110001))),
113 | Expression.Assign(tmp1, Expression.Call(CompareGreaterThan, hi, lo))
114 | });
115 | es.AddRange(Swap(lo, hi, tmp1));
116 |
117 | // RESTORE ORDER.
118 | es.AddRange(new Expression[] {
119 | Expression.Assign(tmp1, Expression.Call(UnpackLow, lo, hi)),
120 | Expression.Assign(tmp2, Expression.Call(UnpackHigh, lo, hi)),
121 | Expression.Assign(lo, Expression.Call(Permute2x128, tmp1, tmp2, Expression.Constant((byte)0x20))),
122 | Expression.Assign(hi, Expression.Call(Permute2x128, tmp1, tmp2, Expression.Constant((byte)0x31)))
123 | });
124 |
125 | return es;
126 | }
127 |
128 | private Expression[] Swap(ParameterExpression lo, ParameterExpression hi, Expression mask) {
129 | return new Expression[] {
130 | Expression.Assign(tmp3, Expression.Call(BlendVariable, lo, hi, mask)),
131 | Expression.Assign(lo, Expression.Call(BlendVariable, hi, lo, mask)),
132 | Expression.Assign(hi, tmp3)
133 | };
134 | }
135 | }
136 | }
137 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Mono auto generated files
17 | mono_crash.*
18 |
19 | # Build results
20 | [Dd]ebug/
21 | [Dd]ebugPublic/
22 | [Rr]elease/
23 | [Rr]eleases/
24 | x64/
25 | x86/
26 | [Ww][Ii][Nn]32/
27 | [Aa][Rr][Mm]/
28 | [Aa][Rr][Mm]64/
29 | bld/
30 | [Bb]in/
31 | [Oo]bj/
32 | [Oo]ut/
33 | [Ll]og/
34 | [Ll]ogs/
35 |
36 | # Visual Studio 2015/2017 cache/options directory
37 | .vs/
38 | # Uncomment if you have tasks that create the project's static files in wwwroot
39 | #wwwroot/
40 |
41 | # Visual Studio 2017 auto generated files
42 | Generated\ Files/
43 |
44 | # MSTest test Results
45 | [Tt]est[Rr]esult*/
46 | [Bb]uild[Ll]og.*
47 |
48 | # NUnit
49 | *.VisualState.xml
50 | TestResult.xml
51 | nunit-*.xml
52 |
53 | # Build Results of an ATL Project
54 | [Dd]ebugPS/
55 | [Rr]eleasePS/
56 | dlldata.c
57 |
58 | # Benchmark Results
59 | BenchmarkDotNet.Artifacts/
60 |
61 | # .NET Core
62 | project.lock.json
63 | project.fragment.lock.json
64 | artifacts/
65 |
66 | # ASP.NET Scaffolding
67 | ScaffoldingReadMe.txt
68 |
69 | # StyleCop
70 | StyleCopReport.xml
71 |
72 | # Files built by Visual Studio
73 | *_i.c
74 | *_p.c
75 | *_h.h
76 | *.ilk
77 | *.meta
78 | *.obj
79 | *.iobj
80 | *.pch
81 | *.pdb
82 | *.ipdb
83 | *.pgc
84 | *.pgd
85 | *.rsp
86 | *.sbr
87 | *.tlb
88 | *.tli
89 | *.tlh
90 | *.tmp
91 | *.tmp_proj
92 | *_wpftmp.csproj
93 | *.log
94 | *.vspscc
95 | *.vssscc
96 | .builds
97 | *.pidb
98 | *.svclog
99 | *.scc
100 |
101 | # Chutzpah Test files
102 | _Chutzpah*
103 |
104 | # Visual C++ cache files
105 | ipch/
106 | *.aps
107 | *.ncb
108 | *.opendb
109 | *.opensdf
110 | *.sdf
111 | *.cachefile
112 | *.VC.db
113 | *.VC.VC.opendb
114 |
115 | # Visual Studio profiler
116 | *.psess
117 | *.vsp
118 | *.vspx
119 | *.sap
120 |
121 | # Visual Studio Trace Files
122 | *.e2e
123 |
124 | # TFS 2012 Local Workspace
125 | $tf/
126 |
127 | # Guidance Automation Toolkit
128 | *.gpState
129 |
130 | # ReSharper is a .NET coding add-in
131 | _ReSharper*/
132 | *.[Rr]e[Ss]harper
133 | *.DotSettings.user
134 |
135 | # TeamCity is a build add-in
136 | _TeamCity*
137 |
138 | # DotCover is a Code Coverage Tool
139 | *.dotCover
140 |
141 | # AxoCover is a Code Coverage Tool
142 | .axoCover/*
143 | !.axoCover/settings.json
144 |
145 | # Coverlet is a free, cross platform Code Coverage Tool
146 | coverage*.json
147 | coverage*.xml
148 | coverage*.info
149 |
150 | # Visual Studio code coverage results
151 | *.coverage
152 | *.coveragexml
153 |
154 | # NCrunch
155 | _NCrunch_*
156 | .*crunch*.local.xml
157 | nCrunchTemp_*
158 |
159 | # MightyMoose
160 | *.mm.*
161 | AutoTest.Net/
162 |
163 | # Web workbench (sass)
164 | .sass-cache/
165 |
166 | # Installshield output folder
167 | [Ee]xpress/
168 |
169 | # DocProject is a documentation generator add-in
170 | DocProject/buildhelp/
171 | DocProject/Help/*.HxT
172 | DocProject/Help/*.HxC
173 | DocProject/Help/*.hhc
174 | DocProject/Help/*.hhk
175 | DocProject/Help/*.hhp
176 | DocProject/Help/Html2
177 | DocProject/Help/html
178 |
179 | # Click-Once directory
180 | publish/
181 |
182 | # Publish Web Output
183 | *.[Pp]ublish.xml
184 | *.azurePubxml
185 | # Note: Comment the next line if you want to checkin your web deploy settings,
186 | # but database connection strings (with potential passwords) will be unencrypted
187 | *.pubxml
188 | *.publishproj
189 |
190 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
191 | # checkin your Azure Web App publish settings, but sensitive information contained
192 | # in these scripts will be unencrypted
193 | PublishScripts/
194 |
195 | # NuGet Packages
196 | *.nupkg
197 | # NuGet Symbol Packages
198 | *.snupkg
199 | # The packages folder can be ignored because of Package Restore
200 | **/[Pp]ackages/*
201 | # except build/, which is used as an MSBuild target.
202 | !**/[Pp]ackages/build/
203 | # Uncomment if necessary however generally it will be regenerated when needed
204 | #!**/[Pp]ackages/repositories.config
205 | # NuGet v3's project.json files produces more ignorable files
206 | *.nuget.props
207 | *.nuget.targets
208 |
209 | # Microsoft Azure Build Output
210 | csx/
211 | *.build.csdef
212 |
213 | # Microsoft Azure Emulator
214 | ecf/
215 | rcf/
216 |
217 | # Windows Store app package directories and files
218 | AppPackages/
219 | BundleArtifacts/
220 | Package.StoreAssociation.xml
221 | _pkginfo.txt
222 | *.appx
223 | *.appxbundle
224 | *.appxupload
225 |
226 | # Visual Studio cache files
227 | # files ending in .cache can be ignored
228 | *.[Cc]ache
229 | # but keep track of directories ending in .cache
230 | !?*.[Cc]ache/
231 |
232 | # Others
233 | ClientBin/
234 | ~$*
235 | *~
236 | *.dbmdl
237 | *.dbproj.schemaview
238 | *.jfm
239 | *.pfx
240 | *.publishsettings
241 | orleans.codegen.cs
242 |
243 | # Including strong name files can present a security risk
244 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
245 | #*.snk
246 |
247 | # Since there are multiple workflows, uncomment next line to ignore bower_components
248 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
249 | #bower_components/
250 |
251 | # RIA/Silverlight projects
252 | Generated_Code/
253 |
254 | # Backup & report files from converting an old project file
255 | # to a newer Visual Studio version. Backup files are not needed,
256 | # because we have git ;-)
257 | _UpgradeReport_Files/
258 | Backup*/
259 | UpgradeLog*.XML
260 | UpgradeLog*.htm
261 | ServiceFabricBackup/
262 | *.rptproj.bak
263 |
264 | # SQL Server files
265 | *.mdf
266 | *.ldf
267 | *.ndf
268 |
269 | # Business Intelligence projects
270 | *.rdl.data
271 | *.bim.layout
272 | *.bim_*.settings
273 | *.rptproj.rsuser
274 | *- [Bb]ackup.rdl
275 | *- [Bb]ackup ([0-9]).rdl
276 | *- [Bb]ackup ([0-9][0-9]).rdl
277 |
278 | # Microsoft Fakes
279 | FakesAssemblies/
280 |
281 | # GhostDoc plugin setting file
282 | *.GhostDoc.xml
283 |
284 | # Node.js Tools for Visual Studio
285 | .ntvs_analysis.dat
286 | node_modules/
287 |
288 | # Visual Studio 6 build log
289 | *.plg
290 |
291 | # Visual Studio 6 workspace options file
292 | *.opt
293 |
294 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
295 | *.vbw
296 |
297 | # Visual Studio LightSwitch build output
298 | **/*.HTMLClient/GeneratedArtifacts
299 | **/*.DesktopClient/GeneratedArtifacts
300 | **/*.DesktopClient/ModelManifest.xml
301 | **/*.Server/GeneratedArtifacts
302 | **/*.Server/ModelManifest.xml
303 | _Pvt_Extensions
304 |
305 | # Paket dependency manager
306 | .paket/paket.exe
307 | paket-files/
308 |
309 | # FAKE - F# Make
310 | .fake/
311 |
312 | # CodeRush personal settings
313 | .cr/personal
314 |
315 | # Python Tools for Visual Studio (PTVS)
316 | __pycache__/
317 | *.pyc
318 |
319 | # Cake - Uncomment if you are using it
320 | # tools/**
321 | # !tools/packages.config
322 |
323 | # Tabs Studio
324 | *.tss
325 |
326 | # Telerik's JustMock configuration file
327 | *.jmconfig
328 |
329 | # BizTalk build output
330 | *.btp.cs
331 | *.btm.cs
332 | *.odx.cs
333 | *.xsd.cs
334 |
335 | # OpenCover UI analysis results
336 | OpenCover/
337 |
338 | # Azure Stream Analytics local run output
339 | ASALocalRun/
340 |
341 | # MSBuild Binary and Structured Log
342 | *.binlog
343 |
344 | # NVidia Nsight GPU debugger configuration file
345 | *.nvuser
346 |
347 | # MFractors (Xamarin productivity tool) working folder
348 | .mfractor/
349 |
350 | # Local History for Visual Studio
351 | .localhistory/
352 |
353 | # BeatPulse healthcheck temp database
354 | healthchecksdb
355 |
356 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
357 | MigrationBackup/
358 |
359 | # Ionide (cross platform F# VS Code tools) working folder
360 | .ionide/
361 |
362 | # Fody - auto-generated XML schema
363 | FodyWeavers.xsd
364 | /SNBenchmark/Properties/launchSettings.json
365 | /SortingNetworks/Properties/launchSettings.json
366 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ARCHIVED
2 |
3 | This repository has been archived and development has been moved [here](https://github.com/zvrba/Podaga).
4 |
5 | # Sorting networks
6 |
7 | Playground for exploring implementation techniques for sorting networks. These can sort small arrays much faster
8 | than `Array.Sort()`; depending on the size (4-32) and pattern, the speedup is 3-6X. See [benchmarks](#benchmarks) below.
9 | The generated assembly in Release mode is lean and mean, and seems comparable with what would have been generated by
10 | a C++ compiler.
11 |
12 | ## Changes since v1
13 |
14 | - Implemented Fisher-Yates shuffle; it is now used in benchmarks for more reliable validation of sorters.
15 | - Support for arrays of arbitrary length, i.e., not just lengths that are powers of 2.
16 | - Exhaustive validation now checks sorters for all lengths 4-32.
17 | - Added (32-bit) float sorter.
18 |
19 | # Project structure
20 |
21 | The projects are developed with Visual Studio 2019 and target .NET Core 3.1 (`netcoreapp3.1`). The solution consists of two projects.
22 |
23 | ## SNBenchmark
24 |
25 | This project depends on BenchmarkDotNet. It contains validation code, benchmarks and demonstrates the use of sorting methods.
26 | The main program must be run with a single argument: `VI`, `VF` or `B`.
27 |
28 | When run with `VI`, it runs an exhaustive validation of integer networks for element counts of up to 32. When run with `VF`
29 | it runs an exhaustive validation of float networks for element counts of up to 32. Larger sizes are infeasible, as `2^N`
30 | zero/one inputs would have to be tested.
31 |
32 | When run with `B`, it passes the rest of the arguments to BenchmarkDotNet. Without any additional arguments, it will present a menu.
33 | All benchmarks call `Environment.FailFast` if the result is found to be unsorted so that this can be detected in the logs.
34 |
35 | ## SortingNetworks
36 |
37 | `SortingNetworks` project is the main code and has no dependencies. The high-performance public types use `unsafe`
38 | code and can only be used from `unsafe` methods. The code depends on AVX2 instruction set. In addition, `AESRand`
39 | class depends on AES-NI instruction set.
40 |
41 | ### Sorting
42 |
43 | The main interface is `UnsafeSort` class which exposes a static factory function. The actual sorting code is in
44 | `IntSorter` and `FloatSorter` classes. You are not expected to understand how it works without studying [references](#references).
45 | The code to handle lengths that are not a power of two introduces some overhead even for small arrays, so `PeriodicInt` class is
46 | provided with methods for sorting arrays of lengths 4, 8, 16 and 32; see [benchmarks](#benchmarks) below.
47 |
48 | `UnsafeSort` and `PeriodicInt` classes have no mutable internal state, so it is recommended to use a single (non-static) instance
49 | throughout the program (see remark about statics below).
50 |
51 | Directory `Attic` contains the (failed) experiment with expression trees and earlier (less performant) iterations of the
52 | periodic network.
53 |
54 | ### Random numbers
55 |
56 | This subsystem consists of three classes: an abstract `UnsafeRandom` class and two concrete classes, `AESRand` and `MWC1616Rand`.
57 | These can be instantiated directly. **NB!** The correctness of the code and the quality of random numbers has not been verified!
58 | Benchmarks use `MWC1616Rand` with a fixed seed as `AESRand` seemed to generate some obvious patterns.
59 |
60 | # Lessons learned
61 | These were learned by inspecting the generated assembly code in Release mode.
62 |
63 | Accessing static data has more overhead than accessing instance data: extraneous CALL instructions into the runtime
64 | are generated. My guess is that these ensure thread-safe, once-only static initialization semantics.
65 |
66 | Accessing `ref` parameters as in `Periodic16Branchless` generates a lot of load/store instructions.
67 | It is much more efficient to load ref parameters into locals at the beginning of the procedure and store
68 | results at the end, as in `PeriodicInt`.
69 |
70 | `Periodic16Expr` demonstrates how to build a sorter with expression trees. The generated assembly is OK,
71 | save for the long prologue/epilogue sequences. This makes the overhead of calling a lambda compiled at run-time
72 | way too big for this application.
73 |
74 | `unsafe` is not viral: Method `A` is allowed to call `unsafe` method `B` without `A` having to be marked
75 | unsafe as well. Also, it is allowed to assign an `unsafe` method to a non-unsafe delegate variable.
76 |
77 | `System.Random` does not have consistent timing: when used in the baseline benchmark, the results almost always
78 | contained a warning about it having a bimodal distribution. This makes it rather unusable in baseline benchmarks.
79 | Therefore `UnsafeRandom`, `AESRand` and `MWC1616Rand` classes were implemented. Of these, only MWC is being used.
80 |
81 | Generics suck for numeric code. I couldn't figure out how to write a generic `bool IsSorted<T>(T[])` method that'd
82 | work for any numeric type. Adding `where T : unmanaged` doesn't help as the compiler doesn't know that unmanaged
83 | types are comparable with less-than and equal. Nor does it seem possible to write `void Iota<T>(T[] data)` that'd
84 | fill `data` with numbers from `0 .. Length-1`. This is apparently being actively worked on for new versions
85 | of .NET and C#.
86 |
87 | I attempted to make concrete benchmark classes `sealed`, but that makes BenchmarkDotNet fail because it apparently
88 | needs to derive from the benchmark class.
89 |
90 | RyuJIT has some impressive optimizations: despite branches in "block" methods in `PeriodicInt`, it manages to generate
91 | branchless code when constants that branches depend on are known at compile-time. It also elides unnecessary loads and
92 | stores to/from ref variables and inlines impressively. The generated machine code, however, is huge: 32-sorter is > 1kB
93 | in size. If considering larger sorters, inlining should be forced.
94 |
95 | # Benchmarks
96 |
97 | Raw benchmark data with excel file used to generate the report are in [BenchmarkResults](BenchmarkResults). Main results
98 | with comments are presented [here (PDF)](BenchmarkResults/Analysis.pdf) with additional comments below.
99 |
100 | I couldn't figure out how to coerce BenchmarkDotNet into treating the baseline as additive overhead instead of, well, _baseline_.
101 | (Actually, that's what `[IterationSetup]` and `[IterationCleanup]` are for, but they come with a warning that they could spoil results
102 | of microbenchmarks.) The analysis presents results after subtracting the additive overhead.
103 |
104 | ## General observations
105 |
106 | Even for small sizes, `UnsafeSort` is slightly slower than `PeriodicInt` which works for fixed-length arrays only
107 | (compare "IntBenchmark" with "Specialized"). For example, `PeriodicInt` takes ~22ns to sort 16 elements, whereas
108 | `UnsafeSort` takes ~38ns. Even though the additional logic to handle all sizes below 16 is relatively simple, it
109 | shows in running times.
110 |
111 | Sorting floating point numbers seems to be slightly slower than integers ("Int vs Float").
112 |
113 | ## Invocation: direct vs delegate vs compiled expression
114 |
115 | This project was initially started to investigate manual code generation using expression trees, but it turns out that
116 | these are unsuitable for high-performance scenarios as the prologue/epilogue in the generated code has way too high overhead
117 | (see `ExpressionInvocationBenchmark`):
118 |
119 | | Method | Mean | Error | StdDev |
120 | |----------------- |----------:|---------:|---------:|
121 | | DirectInvoke | 45.51 ns | 0.934 ns | 2.147 ns |
122 | | ExpressionInvoke | 124.08 ns | 2.512 ns | 6.747 ns |
123 |
124 | On the other hand, there is no substantial difference between directly invoking an instance method, or invoking it through an
125 | abstract base method. Thus there is no penalty in using the more convenient `UnsafeSort` class as opposed to directly calling
126 | methods on an instance of `PeriodicInt`:
127 |
128 |
129 | | Method | Mean | Error | StdDev |
130 | |--------------- |---------:|---------:|---------:|
131 | | AbstractInvoke | 23.80 ns | 0.421 ns | 0.603 ns |
132 | | ConcreteInvoke | 23.28 ns | 0.310 ns | 0.290 ns |
133 |
134 | NB! The results between the two benchmarks are not directly comparable as they run different algorithms.
135 | TODO: same for float. Re-export analysis.
136 |
137 | # References
138 |
139 | D. E. Knuth, The Art of Computer Programming, vol. 3, section 5.3.4 for basic exposition. The "periodic" network as
140 | implemented here appears in TAOCP exercise 53, but has first been described by Dowd et al.: "The Periodic Balanced Sorting
141 | Network", JACM Vol. 36, No. 4, October 1989, pp. 738-757.
142 |
143 | Other references appear in code comments.
144 |
--------------------------------------------------------------------------------
/SortingNetworks/IntSorter.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.Intrinsics;
4 | using System.Runtime.Intrinsics.X86;
5 |
6 | namespace SortingNetworks
7 | {
8 | using V = Vector256;
9 |
10 | sealed unsafe class IntSorter : UnsafeSort
11 | {
12 | // Vector constants, all assigned once in the constructor. Mask digits below list the highest element first. TODO: Place these inside own unsafe struct.
13 | readonly V Zero; // 00000000; only used to derive Complement
14 | readonly V Complement; // FFFFFFFF; every bit set
15 | readonly V AlternatingMaskLo128; // 0000FFFF; low 128 bits set (assigned but not read by the methods of this class)
16 | readonly V AlternatingMaskHi128; // FFFF0000; high 128 bits set
17 | readonly V AlternatingMaskHi64; // FF00FF00; high 64 bits of each 128-bit lane set
18 | readonly V AlternatingMaskHi32; // F0F0F0F0; high 32 bits of each 64-bit pair set
19 | readonly V Max; // int.MaxValue in each element; Load8 uses it to fill the elements it does not load
20 | readonly V ReversePermutation; // Input to VPERMD that reverses all 8 ints
21 | readonly V[] CountMask; // For loading 1-8 elements; indexed by count & 7, so entry 0 is the full mask. VPALIGNR requires an immediate constant, which kills perf.
22 | // Precomputes the vector constants and selects the sort routine for arrays of up to maxLength ints. Throws ArgumentOutOfRangeException when maxLength exceeds 2^24.
23 | internal IntSorter(int maxLength) {
24 | Zero = V.Zero;
25 | Complement = Avx2.CompareEqual(Zero, Zero); // x == x holds in every element, which yields all bits set
26 | AlternatingMaskHi128 = Vector256.Create(0L, 0L, -1L, -1L).AsInt32();
27 | AlternatingMaskLo128 = Vector256.Create(-1L, -1L, 0L, 0L).AsInt32();
28 | AlternatingMaskHi64 = Avx2.Xor(Complement, Avx2.ShiftRightLogical128BitLane(Complement, 8)); // clears the low 8 bytes of each 128-bit lane
29 | AlternatingMaskHi32 = Avx2.Xor(Complement.AsInt64(), Avx2.ShiftRightLogical(Complement.AsInt64(), 32)).AsInt32(); // clears the low 32 bits of each 64-bit element
30 | Max = Vector256.Create(int.MaxValue);
31 | ReversePermutation = Vector256.Create(7, 6, 5, 4, 3, 2, 1, 0);
32 | CountMask = new V[8];
33 | CountMask[0] = Complement; // Load8/Store8 index with c & 7, so a count of 8 lands on this full mask
34 | CountMask[1] = Vector256.Create(-1, 0, 0, 0, 0, 0, 0, 0);
35 | CountMask[2] = Vector256.Create(-1, -1, 0, 0, 0, 0, 0, 0);
36 | CountMask[3] = Vector256.Create(-1, -1, -1, 0, 0, 0, 0, 0);
37 | CountMask[4] = Vector256.Create(-1, -1, -1, -1, 0, 0, 0, 0);
38 | CountMask[5] = Vector256.Create(-1, -1, -1, -1, -1, 0, 0, 0);
39 | CountMask[6] = Vector256.Create(-1, -1, -1, -1, -1, -1, 0, 0);
40 | CountMask[7] = Vector256.Create(-1, -1, -1, -1, -1, -1, -1, 0);
41 | // MinLength, MaxLength and Sorter are not declared in this class; presumably inherited from UnsafeSort.
42 | if (maxLength <= 8) {
43 | MinLength = 4;
44 | MaxLength = 8;
45 | Sorter = Sort8; // single-vector network
46 | }
47 | else if (maxLength <= 16) {
48 | MinLength = 9;
49 | MaxLength = 16;
50 | Sorter = Sort16; // two-vector network
51 | }
52 | else {
53 | MinLength = 16;
54 | MaxLength = 1 << 24;
55 | Sorter = Sort; // general routine working directly on memory
56 | if (maxLength > MaxLength)
57 | throw new ArgumentOutOfRangeException(nameof(maxLength), "Maximum supported length is 2^24."); // two-argument overload: the one-string overload takes the parameter name, not the message
58 | }
59 | }
60 | // Sorts the c ints at data inside a single vector: masked load, three Block8 passes of 2, 3 and 3 phases, masked store.
61 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
62 | unsafe void Sort8(int* data, int c) {
63 | var v = Load8(data, c); // elements past c are filled with Max
64 | Block8(2, ref v);
65 | Block8(3, ref v);
66 | Block8(3, ref v);
67 | Store8(data, v, c); // writes back only the elements selected by the count mask
68 | }
69 | // Sorts the c ints at data using two vectors. The constructor pairs this routine with lengths 9..16: the first 8 elements are accessed whole, the other c - 8 through the count mask.
70 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
71 | unsafe void Sort16(int* data, int c) {
72 | var v0 = Avx.LoadVector256(data); // unmasked: reads 8 elements
73 | var v1 = Load8(data + 8, c - 8); // missing elements are filled with Max
74 | Block16(2, ref v0, ref v1);
75 | Block16(3, ref v0, ref v1);
76 | Block16(4, ref v0, ref v1);
77 | Block16(4, ref v0, ref v1);
78 | Avx.Store(data, v0);
79 | Store8(data + 8, v1, c - 8);
80 | }
81 | // General routine: rounds c up to a power of two (upsize = 2^log2c) and runs log2c Block passes over the data; pass i uses min(i + 2, log2c) phases.
82 | unsafe void Sort(int* data, int c) {
83 | var (upsize, log2c) = UpSize(c);
84 | for (int i = 0; i < log2c; ++i)
85 | Block(i + 2 < log2c ? i + 2 : log2c, data, c, upsize);
86 |
87 | static (int upsize, int log2c) UpSize(int size) { // returns the next power of two >= size together with its base-2 logarithm
88 | --size; // so that an exact power of two maps to itself
89 | size |= size >> 1; // the OR cascade copies the highest set bit into every lower position
90 | size |= size >> 2;
91 | size |= size >> 4;
92 | size |= size >> 8;
93 | size |= size >> 16;
94 |
95 | var upsize = size + 1;
96 | int log2c = -1; // starts at -1 because the loop below also counts the final shift down from 1
97 | for (size = upsize; size > 0; ++log2c, size >>= 1)
98 | ;
99 |
100 | return (upsize, log2c);
101 | }
102 | }
103 |
104 | // b points to the first element and c is the true number of elements to be sorted. upsize is c rounded up to a power of two. Runs up to p phases; each phase halves the sub-block size and doubles the sub-block count.
105 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
106 | unsafe void Block(int p, int* b, int c, int upsize) {
107 | int split = 1; // number of sub-blocks in the current phase
108 | for (; p > 0 && upsize >= 8; --p, split *= 2, upsize /= 2) { // stops early once sub-blocks would be smaller than one vector
109 | for (int i = 0, sb = 0; i < split && sb < c; ++i, sb += upsize) { // sb: start offset of sub-block i; sub-blocks starting at or past c are skipped
110 | var sc = upsize;
111 | if (sb + upsize > c)
112 | sc = c - sb; // last sub-block is truncated to the elements that exist
113 | Phase(p, b + sb, sc, upsize);
114 | }
115 | }
116 | }
117 |
118 | // b points to block start, c is the actual # of elements in the block and upsize is the nominal power-of-two block size (c <= upsize). Pairs vectors from the two ends of the block, moving inwards.
119 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
120 | unsafe void Phase(int p, int* b, int c, int upsize) {
121 | if (upsize > 8) {
122 | var i0 = (upsize - c) >> 3; // whole 8-element vectors missing at the end of the block
123 | var c0 = (upsize - c) & 7; // elements missing from the last vector that is partly present
124 |
125 | int* e = b + upsize - 8 * (i0 + 1); // last vector holding at least one real element
126 | b += 8 * i0; // its mirror partner; front vectors whose partner is entirely missing are skipped
127 |
128 | if (c0 != 0 && b < e) {
129 | PhaseStep(1, b, e, 16 - c0); // partial pair: 8 elements at b plus 8 - c0 at e
130 | b += 8;
131 | e -= 8;
132 | }
133 |
134 | for (; b < e; b += 8, e -= 8) // remaining pairs are complete on both sides
135 | PhaseStep(b, e);
136 | }
137 | else {
138 | Block8(p, b, c); // block fits in one vector: run the remaining p phases in-register
139 | }
140 | }
141 |
142 | // Full size (16) compare-exchange: runs one Block16 phase on the complete vectors at lo and hi and writes both back.
143 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
144 | unsafe void PhaseStep(int* lo, int* hi) {
145 | var v0 = Avx.LoadVector256(lo);
146 | var v1 = Avx.LoadVector256(hi);
147 | Block16(1, ref v0, ref v1); // p = 1: only the cross-vector min/max step, no Block8 follow-up
148 | Avx.Store(lo, v0);
149 | Avx.Store(hi, v1);
150 | }
151 |
152 | // No inlining; executed at most once. Partial variant: lo is a complete vector, hi holds only c - 8 real elements and is accessed through the count mask.
153 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
154 | unsafe void PhaseStep(int p, int* lo, int* hi, int c) {
155 | var v0 = Avx.LoadVector256(lo);
156 | var v1 = Load8(hi, c - 8); // missing elements are filled with Max
157 | Block16(p, ref v0, ref v1);
158 | Avx.Store(lo, v0);
159 | Store8(hi, v1, c - 8); // only the real elements are written back
160 | }
161 |
162 | // No inlining; executed at most once. Memory wrapper: loads the c elements at b, runs p in-register Block8 phases and stores the c elements back.
163 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
164 | unsafe void Block8(int p, int* b, int c) {
165 | var v = Load8(b, c);
166 | Block8(p, ref v);
167 | Store8(b, v, c);
168 | }
169 | // First phase compare-exchanges element i of _v0 with element 7 - i of _v1 (smaller value to _v0, larger to _v1); the remaining p - 1 phases run Block8 on each vector separately.
170 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
171 | unsafe void Block16(int p, ref V _v0, ref V _v1) {
172 | V v0 = _v0, v1, m;
173 |
174 | v1 = Avx2.PermuteVar8x32(_v1, ReversePermutation); // reverse so that partner elements line up
175 | m = Avx2.Max(v0, v1);
176 | v0 = Avx2.Min(v0, v1);
177 | v1 = Avx2.PermuteVar8x32(m, ReversePermutation); // put the maxima back into their original positions
178 | if (--p == 0)
179 | goto done;
180 |
181 | Block8(p, ref v0);
182 | Block8(p, ref v1);
183 |
184 | done:
185 | _v0 = v0; _v1 = v1;
186 | }
187 | // Runs up to three compare/swap phases inside one vector and stops after p of them. In each phase an element takes its partner's value wherever the adjusted comparison mask is set.
188 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
189 | void Block8(int p, ref V v) {
190 | V v0 = v, v1, m;
191 |
192 | // COMPARE / SWAP PHASE
193 | // 76543210
194 | // 01234567
195 |
196 | v1 = Avx2.PermuteVar8x32(v0, ReversePermutation);
197 | m = Avx2.CompareGreaterThan(v0, v1);
198 | m = Avx2.Xor(m, AlternatingMaskHi128); // inverts the test in the upper half, so those elements end up with the larger value
199 | v0 = Avx2.BlendVariable(v0, v1, m); // takes v1 where m is set
200 | if (--p == 0)
201 | goto done;
202 |
203 | // COMPARE / SWAP PHASE
204 | // 76543210
205 | // 45670123
206 |
207 | v1 = Avx2.Shuffle(v0, 0x1B); // reverses the 4 elements of each 128-bit lane
208 | m = Avx2.CompareGreaterThan(v0, v1);
209 | m = Avx2.Xor(m, AlternatingMaskHi64);
210 | v0 = Avx2.BlendVariable(v0, v1, m);
211 | if (--p == 0)
212 | goto done;
213 |
214 | // COMPARE / SWAP PHASE
215 | // 76543210
216 | // 67452301
217 |
218 | v1 = Avx2.Shuffle(v0, 0b10110001); // swaps neighbouring elements
219 | m = Avx2.CompareGreaterThan(v0, v1);
220 | m = Avx2.Xor(m, AlternatingMaskHi32);
221 | v0 = Avx2.BlendVariable(v0, v1, m);
222 |
223 | done:
224 | v = v0;
225 | }
226 | // Loads the elements selected by CountMask[c & 7] from v (c == 8 maps to the full mask in entry 0) and sets the remaining elements to Max.
227 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
228 | unsafe V Load8(int* v, int c) {
229 | var m = CountMask[c & 7];
230 | return Avx2.BlendVariable(Max, Avx2.MaskLoad(v, m), m); // loaded value where m is set, Max elsewhere
231 | }
232 | // Stores to a only the elements of v selected by CountMask[c & 7] (c == 8 maps to the full mask in entry 0).
233 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
234 | unsafe void Store8(int* a, V v, int c) {
235 | var m = CountMask[c & 7];
236 | Avx2.MaskStore(a, m, v);
237 | }
238 | }
239 | }
240 |
--------------------------------------------------------------------------------
/SortingNetworks/FloatSorter.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.Intrinsics;
4 | using System.Runtime.Intrinsics.X86;
5 |
6 | namespace SortingNetworks
7 | {
8 | using V = Vector256;
9 | using VI = Vector256;
10 |
11 | sealed unsafe class FloatSorter : UnsafeSort
12 | {
13 | readonly V Zero; // 00000000 (assigned but not read by the methods of this class)
14 | readonly V Complement; // FFFFFFFF; every bit set
15 | readonly V AlternatingMaskLo128; // 0000FFFF; low 128 bits set (assigned but not read by the methods of this class)
16 | readonly V AlternatingMaskHi128; // FFFF0000; high 128 bits set
17 | readonly V AlternatingMaskHi64; // FF00FF00; high 64 bits of each 128-bit lane set
18 | readonly V AlternatingMaskHi32; // F0F0F0F0; high 32 bits of each 64-bit pair set
19 | readonly V Max; // float.PositiveInfinity in each element; Load8 uses it to fill the elements it does not load
20 | readonly VI ReversePermutation; // Index vector for PermuteVar8x32 that reverses all 8 elements
21 | readonly V[] CountMask; // For loading 1-8 elements; indexed by count & 7, so entry 0 is the full mask. VPALIGNR requires an immediate constant, which kills perf.
22 | // Precomputes the vector constants and selects the sort routine for arrays of up to maxLength floats. Throws ArgumentOutOfRangeException when maxLength exceeds 2^24.
23 | internal FloatSorter(int maxLength) {
24 | Zero = V.Zero;
25 | Complement = Vector256.Create(-1).AsSingle(); // all bits set, reinterpreted as floats
26 | AlternatingMaskHi128 = Vector256.Create(0L, 0L, -1L, -1L).AsSingle();
27 | AlternatingMaskLo128 = Vector256.Create(-1L, -1L, 0L, 0L).AsSingle();
28 | AlternatingMaskHi64 = Avx2.Xor(Complement.AsByte(), Avx2.ShiftRightLogical128BitLane(Complement.AsByte(), 8)).AsSingle(); // clears the low 8 bytes of each 128-bit lane
29 | AlternatingMaskHi32 = Avx2.Xor(Complement.AsInt64(), Avx2.ShiftRightLogical(Complement.AsInt64(), 32)).AsSingle(); // clears the low 32 bits of each 64-bit element
30 | Max = Vector256.Create(float.PositiveInfinity);
31 | ReversePermutation = Vector256.Create(7, 6, 5, 4, 3, 2, 1, 0);
32 | CountMask = new V[8];
33 | CountMask[0] = Complement; // Load8/Store8 index with c & 7, so a count of 8 lands on this full mask
34 | CountMask[1] = Vector256.Create(-1, 0, 0, 0, 0, 0, 0, 0).AsSingle();
35 | CountMask[2] = Vector256.Create(-1, -1, 0, 0, 0, 0, 0, 0).AsSingle();
36 | CountMask[3] = Vector256.Create(-1, -1, -1, 0, 0, 0, 0, 0).AsSingle();
37 | CountMask[4] = Vector256.Create(-1, -1, -1, -1, 0, 0, 0, 0).AsSingle();
38 | CountMask[5] = Vector256.Create(-1, -1, -1, -1, -1, 0, 0, 0).AsSingle();
39 | CountMask[6] = Vector256.Create(-1, -1, -1, -1, -1, -1, 0, 0).AsSingle();
40 | CountMask[7] = Vector256.Create(-1, -1, -1, -1, -1, -1, -1, 0).AsSingle();
41 | // MinLength, MaxLength and Sorter are not declared in this class; presumably inherited from UnsafeSort.
42 | if (maxLength <= 8) {
43 | MinLength = 4;
44 | MaxLength = 8;
45 | Sorter = Sort8; // single-vector network
46 | } else if (maxLength <= 16) {
47 | MinLength = 9;
48 | MaxLength = 16;
49 | Sorter = Sort16; // two-vector network
50 | } else {
51 | MinLength = 16;
52 | MaxLength = 1 << 24;
53 | Sorter = Sort; // general routine working directly on memory
54 | if (maxLength > MaxLength)
55 | throw new ArgumentOutOfRangeException(nameof(maxLength), "Maximum supported length is 2^24."); // two-argument overload: the one-string overload takes the parameter name, not the message
56 | }
57 | }
58 | // Sorts the c floats at data inside a single vector: masked load, three Block8 passes of 2, 3 and 3 phases, masked store.
59 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
60 | unsafe void Sort8(float* data, int c) {
61 | var v = Load8(data, c); // elements past c are filled with Max
62 | Block8(2, ref v);
63 | Block8(3, ref v);
64 | Block8(3, ref v);
65 | Store8(data, v, c); // writes back only the elements selected by the count mask
66 | }
67 | // Sorts the c floats at data using two vectors. The constructor pairs this routine with lengths 9..16: the first 8 elements are accessed whole, the other c - 8 through the count mask.
68 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
69 | unsafe void Sort16(float* data, int c) {
70 | var v0 = Avx.LoadVector256(data); // unmasked: reads 8 elements
71 | var v1 = Load8(data + 8, c - 8); // missing elements are filled with Max
72 | Block16(2, ref v0, ref v1);
73 | Block16(3, ref v0, ref v1);
74 | Block16(4, ref v0, ref v1);
75 | Block16(4, ref v0, ref v1);
76 | Avx.Store(data, v0);
77 | Store8(data + 8, v1, c - 8);
78 | }
79 | // General routine: rounds c up to a power of two (upsize = 2^log2c) and runs log2c Block passes over the data; pass i uses min(i + 2, log2c) phases.
80 | unsafe void Sort(float* data, int c) {
81 | var (upsize, log2c) = UpSize(c);
82 | for (int i = 0; i < log2c; ++i)
83 | Block(i + 2 < log2c ? i + 2 : log2c, data, c, upsize);
84 |
85 | static (int upsize, int log2c) UpSize(int size) { // returns the next power of two >= size together with its base-2 logarithm
86 | --size; // so that an exact power of two maps to itself
87 | size |= size >> 1; // the OR cascade copies the highest set bit into every lower position
88 | size |= size >> 2;
89 | size |= size >> 4;
90 | size |= size >> 8;
91 | size |= size >> 16;
92 |
93 | var upsize = size + 1;
94 | int log2c = -1; // starts at -1 because the loop below also counts the final shift down from 1
95 | for (size = upsize; size > 0; ++log2c, size >>= 1)
96 | ;
97 |
98 | return (upsize, log2c);
99 | }
100 | }
101 |
102 |
103 | // b points to the first element and c is the true number of elements to be sorted. upsize is c rounded up to a power of two. Runs up to p phases; each phase halves the sub-block size and doubles the sub-block count.
104 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
105 | unsafe void Block(int p, float* b, int c, int upsize) {
106 | int split = 1; // number of sub-blocks in the current phase
107 | for (; p > 0 && upsize >= 8; --p, split *= 2, upsize /= 2) { // stops early once sub-blocks would be smaller than one vector
108 | for (int i = 0, sb = 0; i < split && sb < c; ++i, sb += upsize) { // sb: start offset of sub-block i; sub-blocks starting at or past c are skipped
109 | var sc = upsize;
110 | if (sb + upsize > c)
111 | sc = c - sb; // last sub-block is truncated to the elements that exist
112 | Phase(p, b + sb, sc, upsize);
113 | }
114 | }
115 | }
116 |
117 | // b points to block start, c is the actual # of elements in the block and upsize is the nominal power-of-two block size (c <= upsize). Pairs vectors from the two ends of the block, moving inwards.
118 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
119 | unsafe void Phase(int p, float* b, int c, int upsize) {
120 | if (upsize > 8) {
121 | var i0 = (upsize - c) >> 3; // whole 8-element vectors missing at the end of the block
122 | var c0 = (upsize - c) & 7; // elements missing from the last vector that is partly present
123 |
124 | float* e = b + upsize - 8 * (i0 + 1); // last vector holding at least one real element
125 | b += 8 * i0; // its mirror partner; front vectors whose partner is entirely missing are skipped
126 |
127 | if (c0 != 0 && b < e) {
128 | PhaseStep(1, b, e, 16 - c0); // partial pair: 8 elements at b plus 8 - c0 at e
129 | b += 8;
130 | e -= 8;
131 | }
132 |
133 | for (; b < e; b += 8, e -= 8) // remaining pairs are complete on both sides
134 | PhaseStep(b, e);
135 | } else {
136 | Block8(p, b, c); // block fits in one vector: run the remaining p phases in-register
137 | }
138 | }
139 |
140 | // Full size (16) compare-exchange: runs one Block16 phase on the complete vectors at lo and hi and writes both back.
141 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
142 | unsafe void PhaseStep(float* lo, float* hi) {
143 | var v0 = Avx.LoadVector256(lo);
144 | var v1 = Avx.LoadVector256(hi);
145 | Block16(1, ref v0, ref v1); // p = 1: only the cross-vector min/max step, no Block8 follow-up
146 | Avx.Store(lo, v0);
147 | Avx.Store(hi, v1);
148 | }
149 |
150 | // No inlining; executed at most once. Partial variant: lo is a complete vector, hi holds only c - 8 real elements and is accessed through the count mask.
151 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
152 | unsafe void PhaseStep(int p, float* lo, float* hi, int c) {
153 | var v0 = Avx.LoadVector256(lo);
154 | var v1 = Load8(hi, c - 8); // missing elements are filled with Max
155 | Block16(p, ref v0, ref v1);
156 | Avx.Store(lo, v0);
157 | Store8(hi, v1, c - 8); // only the real elements are written back
158 | }
159 |
160 | // No inlining; executed at most once. Memory wrapper: loads the c elements at b, runs p in-register Block8 phases and stores the c elements back.
161 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
162 | unsafe void Block8(int p, float* b, int c) {
163 | var v = Load8(b, c);
164 | Block8(p, ref v);
165 | Store8(b, v, c);
166 | }
167 | // First phase pairs element i of _v0 with element 7 - i of _v1 (Avx.Min result to _v0, Avx.Max result to _v1); the remaining p - 1 phases run Block8 on each vector separately.
168 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
169 | unsafe void Block16(int p, ref V _v0, ref V _v1) {
170 | V v0 = _v0, v1, m;
171 |
172 | v1 = Avx2.PermuteVar8x32(_v1, ReversePermutation); // reverse so that partner elements line up
173 | m = Avx.Max(v0, v1);
174 | v0 = Avx.Min(v0, v1);
175 | v1 = Avx2.PermuteVar8x32(m, ReversePermutation); // put the maxima back into their original positions
176 | if (--p == 0)
177 | goto done;
178 |
179 | Block8(p, ref v0);
180 | Block8(p, ref v1);
181 |
182 | done:
183 | _v0 = v0; _v1 = v1;
184 | }
185 | // Runs up to three compare/swap phases inside one vector and stops after p of them. In each phase an element takes its partner's value wherever the adjusted comparison mask is set.
186 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
187 | void Block8(int p, ref V v) {
188 | V v0 = v, v1, m;
189 |
190 | // COMPARE / SWAP PHASE
191 | // 76543210
192 | // 01234567
193 |
194 | v1 = Avx2.PermuteVar8x32(v0, ReversePermutation);
195 | m = Avx.Compare(v0, v1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
196 | m = Avx.Xor(m, AlternatingMaskHi128); // inverts the test in the upper half, so those elements end up with the larger value
197 | v0 = Avx.BlendVariable(v0, v1, m); // takes v1 where m is set
198 | if (--p == 0)
199 | goto done;
200 |
201 | // COMPARE / SWAP PHASE
202 | // 76543210
203 | // 45670123
204 |
205 | v1 = Avx.Shuffle(v0, v0, 0x1B); // reverses the 4 elements of each 128-bit lane; called via Avx like the third phase (same overload as before)
206 | m = Avx.Compare(v0, v1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
207 | m = Avx.Xor(m, AlternatingMaskHi64);
208 | v0 = Avx.BlendVariable(v0, v1, m);
209 | if (--p == 0)
210 | goto done;
211 |
212 | // COMPARE / SWAP PHASE
213 | // 76543210
214 | // 67452301
215 |
216 | v1 = Avx.Shuffle(v0, v0, 0b10110001); // swaps neighbouring elements
217 | m = Avx.Compare(v0, v1, FloatComparisonMode.OrderedGreaterThanNonSignaling);
218 | m = Avx.Xor(m, AlternatingMaskHi32);
219 | v0 = Avx.BlendVariable(v0, v1, m);
220 |
221 | done:
222 | v = v0;
223 | }
224 | // Loads the elements selected by CountMask[c & 7] from v (c == 8 maps to the full mask in entry 0) and sets the remaining elements to Max.
225 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
226 | unsafe V Load8(float* v, int c) {
227 | var m = CountMask[c & 7];
228 | return Avx.BlendVariable(Max, Avx.MaskLoad(v, m), m); // loaded value where m is set, Max elsewhere
229 | }
230 | // Stores to a only the elements of v selected by CountMask[c & 7] (c == 8 maps to the full mask in entry 0).
231 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
232 | unsafe void Store8(float* a, V v, int c) {
233 | var m = CountMask[c & 7];
234 | Avx.MaskStore(a, m, v);
235 | }
236 | }
237 | }
238 |
--------------------------------------------------------------------------------