├── Bench
├── .gitignore
├── restore-gfx.sh
├── drop-to-console.sh
├── run.cmd
├── Bench.cs
├── run.sh
├── Bench.csproj
├── prep.sh
├── SortBench.cs
├── SmallSortBench.cs
└── Utils
│ ├── TimePerNColumn.cs
│ ├── ValueGenerator.cs
│ ├── DatatableJsonExporter.cs
│ └── FullNameProvider.cs
├── vxsort.png
├── VxSort
├── LocalsInit.cs
├── FodyWeavers.xml
├── InternalsVisibleTo.cs
├── SpanExtensions.cs
├── FodyWeavers.xsd
├── VxSort.csproj
├── BitonicSort.cs
├── BitonicSort.Generated.tt
├── BytePermutationTables.cs
├── BitonicSort.Generated.cs
└── VectorizedSort.cs
├── .gitignore
├── nuget.config
├── Test
├── Test.csproj
├── DataGeneration.cs
├── BitonicSortTests.cs
├── PositionCountingSortTests.cs
├── ParityTests.cs
└── PermutationTableTests.cs
├── Directory.Build.props
├── vxsort.sln.DotSettings.user
├── LICENSE
├── Directory.Build.targets
├── vxsort.svg
├── vxsort.sln
├── .github
└── workflows
│ └── build.yml
├── vxsort.sln.DotSettings
└── README.md
/Bench/.gitignore:
--------------------------------------------------------------------------------
1 | BenchmarkDotNet.Artifacts
2 |
--------------------------------------------------------------------------------
/vxsort.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/damageboy/VxSort/HEAD/vxsort.png
--------------------------------------------------------------------------------
/Bench/restore-gfx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | sudo systemctl isolate graphical.target
3 |
--------------------------------------------------------------------------------
/Bench/drop-to-console.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | sudo systemctl isolate multi-user.target
3 |
--------------------------------------------------------------------------------
/VxSort/LocalsInit.cs:
--------------------------------------------------------------------------------
1 | using LocalsInit;
2 |
3 | [assembly: LocalsInit(false)]
4 |
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | obj/
2 | bin/
3 | .idea/
4 | .vscode/
5 | .vs/
6 | .ionide/
7 | Test/reports/
8 | TestBlog/reports/
9 | Test/TestResults/
10 | TestBlog/TestResults/
11 |
--------------------------------------------------------------------------------
/VxSort/FodyWeavers.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/VxSort/InternalsVisibleTo.cs:
--------------------------------------------------------------------------------
1 | using System.Runtime.CompilerServices;
2 | using LocalsInit;
3 |
4 | [assembly: InternalsVisibleTo("Test")]
5 | [assembly: InternalsVisibleTo("Bench")]
6 | [assembly: InternalsVisibleTo("Example")]
7 |
8 |
9 |
--------------------------------------------------------------------------------
/nuget.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/Bench/run.cmd:
--------------------------------------------------------------------------------
1 | FOR %%G IN (telegram,chrome,typora,firefox,rider64,devenv,msbuild,spotify) DO (pssuspend %%G 1> nul 2> nul)
2 |
3 | dotnet run -c release -- %1 %2 %3 %4 %5 %6 %7 %8 %9
4 |
5 | FOR %%G IN (telegram,chrome,typora,firefox,rider64,devenv,msbuild,spotify) DO (pssuspend -r %%G 1> nul 2> nul)
--------------------------------------------------------------------------------
/Bench/Bench.cs:
--------------------------------------------------------------------------------
1 | using System.Drawing;
2 | using BenchmarkDotNet.Running;
3 | using Microsoft.VisualBasic.CompilerServices;
4 |
5 | namespace Bench
6 | {
7 | class Program
8 | {
9 | static void Main(string[] args)
10 | {
11 | BenchmarkSwitcher
12 | .FromAssembly(typeof(Program).Assembly)
13 | .Run(args);
14 | }
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/Bench/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #hogs=$(pgrep -if "(typora|firefox|chrome|chromium-browser|rider|mono-sgen|spotify|msbuild|telegram|browsercore64)")
3 | hogs=$(pgrep -if "(typora|firefox|chrome|chromium-browser|rider|mono-sgen|msbuild|telegram)")
4 | echo Suspending $(echo $hogs | wc -w) procs before running BDN
5 | [[ -z "$hogs" ]] || echo $hogs | xargs kill -STOP
6 | dotnet run -c release -- "$@"
7 | echo Resuming $(echo $hogs | wc -w) procs after running BDN
8 | [[ -z "$hogs" ]] || echo $hogs | xargs kill -CONT
9 |
--------------------------------------------------------------------------------
/Test/Test.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 | netcoreapp3.1
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/Directory.Build.props:
--------------------------------------------------------------------------------
1 |
2 |
3 | true
4 | 8
5 | pdbonly
6 | true
7 | CA1810;CA1303;CA2207;NU5105
8 | AnyCPU
9 | true
10 | true
11 | snupkg
12 | 0.1.5
13 |
14 |
15 |
--------------------------------------------------------------------------------
/vxsort.sln.DotSettings.user:
--------------------------------------------------------------------------------
1 |
2 | VISIBLE_FILES
3 | <AssemblyExplorer>
4 | <Assembly Path="/home/dmg/.nuget/packages/newtonsoft.json/12.0.3-beta2/lib/netstandard2.0/Newtonsoft.Json.dll" />
5 | </AssemblyExplorer>
--------------------------------------------------------------------------------
/Bench/Bench.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | netcoreapp3.1
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/VxSort/SpanExtensions.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Diagnostics;
3 | using System.Globalization;
4 | using System.Runtime.InteropServices;
5 | using System.Text;
6 |
7 | namespace VxSortResearch.Utils
8 | {
9 | public static class SpanExtensions
10 | {
11 | public static unsafe void * AlignSpan(this ReadOnlySpan unalignedSpan, ulong alignment)
12 | {
13 | var alignedPtr = (byte*) Marshal.AllocHGlobal(unalignedSpan.Length + (int) alignment);
14 | var x = alignedPtr;
15 | if (((ulong) alignedPtr) % alignment != 0)
16 | alignedPtr = (byte *) (((ulong) alignedPtr + alignment) & ~(alignment - 1));
17 |
18 | Debug.Assert((ulong) alignedPtr % alignment == 0);
19 | unalignedSpan.CopyTo(new Span(alignedPtr, unalignedSpan.Length));
20 | return alignedPtr;
21 | }
22 | }
23 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 damageboy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/VxSort/FodyWeavers.xsd:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | Defines the default value for the localsinit flag.
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | 'true' to run assembly verification (PEVerify) on the target assembly after all weavers have been executed.
20 |
21 |
22 |
23 |
24 | A comma-separated list of error codes that can be safely ignored in assembly verification.
25 |
26 |
27 |
28 |
29 | 'false' to turn off automatic generation of the XML Schema file.
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/Directory.Build.targets:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | runtime; build; native; contentfiles; analyzers; buildtransitive
5 | all
6 |
7 |
8 | all
9 |
10 |
11 | all
12 | runtime; build; native; contentfiles; analyzers; buildtransitive
13 |
14 |
15 | all
16 | runtime; build; native; contentfiles; analyzers; buildtransitive
17 |
18 |
19 |
20 | runtime; build; native; contentfiles; analyzers; buildtransitive
21 | all
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/Bench/prep.sh:
--------------------------------------------------------------------------------
1 | # from https://www.alexgallego.org/perf/compiler/explorer/flatbuffers/smf/2018/06/30/effects-cpu-turbo.html
2 |
3 | function cpu_disable_performance_cpupower_state(){
4 | for c in /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor; do echo powersave > $c; done
5 | }
6 | function cpu_enable_performance_cpupower_state(){
7 | for c in /sys/devices/system/cpu/cpu[0-9]*/cpufreq/scaling_governor; do echo performance > $c; done
8 | }
9 | function cpu_available_frequencies() {
10 | local cpuspec=${1:-[0-9]}
11 |
12 | for i in /sys/devices/system/cpu/cpu$cpuspec*; do
13 | echo "$i:"
14 | echo " cpufreq/scaling_min_freq: $(cat $i/cpufreq/scaling_min_freq)";
15 | echo " cpufreq/scaling_max_freq: $(cat $i/cpufreq/scaling_max_freq)";
16 | done
17 | }
18 |
19 | function cpu_set_min_frequencies() {
20 | local freq=$1;
21 | local cpuspec=${2:-[0-9]}
22 | if [[ $freq == "" ]]; then exit 1; fi
23 | for i in /sys/devices/system/cpu/cpu$cpuspec*; do
24 | echo "$i:"
25 | echo "$i/cpufreq/scaling_min_freq: $(cat $i/cpufreq/scaling_min_freq)";
26 | echo "$freq" | sudo tee "$i/cpufreq/scaling_min_freq"
27 | echo "$i/cpufreq/scaling_min_freq: $(cat $i/cpufreq/scaling_min_freq)";
28 | done
29 | }
30 |
31 | function cpu_set_max_frequencies() {
32 | local freq=$1;
33 | local cpuspec=${2:-[0-9]}
34 | if [[ $freq == "" ]]; then exit 1; fi
35 | for i in /sys/devices/system/cpu/cpu$cpuspec*; do
36 | echo "$i:"
37 | echo "$i/cpufreq/scaling_max_freq: $(cat $i/cpufreq/scaling_max_freq)";
38 | echo "$freq" | sudo tee "$i/cpufreq/scaling_max_freq"
39 | echo "$i/cpufreq/scaling_max_freq: $(cat $i/cpufreq/scaling_max_freq)";
40 | done
41 | }
42 |
--------------------------------------------------------------------------------
/Test/DataGeneration.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.InteropServices;
3 | using System.Security.Cryptography;
4 |
5 | namespace Test {
6 | ///
7 | /// Tests + setup code comparing various quicksort implementations to Array.Sort in terms of correctness/parity
8 | ///
9 | public class DataGeneration
10 | {
11 | internal static (int[] randomData, int[] sortedData, string reproContext) GenerateData(
12 | int size, int seed, int forcedValue = -1, double forcedValueRate = double.NaN, int modulo = int.MaxValue, bool dontSort = false)
13 | {
14 | var r = new Random(seed);
15 | var data = new int[size];
16 | for (var i = 0; i < size; ++i)
17 | data[i] = double.IsNaN(forcedValueRate) ? r.Next() % modulo :
18 | r.NextDouble() > forcedValueRate ? forcedValue : (r.Next() % modulo);
19 |
20 | int[] sorted = null;
21 | if (!dontSort) {
22 | sorted = new int[size];
23 | data.CopyTo(sorted, 0);
24 | Array.Sort(sorted);
25 | }
26 |
27 | var reproContext = "";
28 |
29 | using (var sha1 = new SHA1CryptoServiceProvider()) {
30 | Span hash = stackalloc byte[20];
31 | sha1.TryComputeHash(MemoryMarshal.Cast(new ReadOnlySpan(data)), hash, out _);
32 | var dataHash = Convert.ToBase64String(hash);
33 | sha1.TryComputeHash(MemoryMarshal.Cast(new ReadOnlySpan(sorted)), hash, out _);
34 | var sortedHash = Convert.ToBase64String(hash);
35 |
36 | reproContext = $"[{size},{seed}] -> [{dataHash},{sortedHash}]";
37 | }
38 |
39 | return (data, sorted, reproContext);
40 | }
41 | }
42 | }
--------------------------------------------------------------------------------
/VxSort/VxSort.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netcoreapp3.1
5 | true
6 | VxSort
7 | damageboy
8 | Vectorized Sorting for .NET
9 | damageboy
10 | https://github.com/damageboy/VxSort/
11 | https://raw.githubusercontent.com/damageboy/VxSort/master/LICENSE
12 | https://github.com/damageboy/VxSort/
13 | sorting intrinsics
14 | true
15 | true
16 | true
17 | snupkg
18 | enable
19 | portable
20 | git
21 | https://raw.githubusercontent.com/damageboy/VxSort/master/LICENSE
22 | ..\vxsort.png
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 | TextTemplatingFileGenerator
35 | BitonicSort.Generated.cs
36 |
37 |
38 |
39 |
40 | BitonicSort.Generated.tt
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/vxsort.svg:
--------------------------------------------------------------------------------
1 |
2 |
76 |
--------------------------------------------------------------------------------
/vxsort.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.26124.0
5 | MinimumVisualStudioVersion = 15.0.26124.0
6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Bench", "Bench\Bench.csproj", "{2E6E16A9-9F99-4B6C-B766-94105E861C85}"
7 | EndProject
8 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VxSort", "VxSort\VxSort.csproj", "{FC447F4D-C6CD-4253-A048-3FD5BB1CA173}"
9 | EndProject
10 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{67F241C5-061D-48BB-A4AE-89FC52655A07}"
11 | EndProject
12 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = ".stuff", ".stuff", "{4723FC10-8AE7-4223-BC72-7C2A0D6EF660}"
13 | ProjectSection(SolutionItems) = preProject
14 | .gitignore = .gitignore
15 | Directory.Build.props = Directory.Build.props
16 | Directory.Build.targets = Directory.Build.targets
17 | LICENSE = LICENSE
18 | README.md = README.md
19 | EndProjectSection
20 | EndProject
21 | Global
22 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
23 | Debug|Any CPU = Debug|Any CPU
24 | Release|Any CPU = Release|Any CPU
25 | Stats|Any CPU = Stats|Any CPU
26 | EndGlobalSection
27 | GlobalSection(SolutionProperties) = preSolution
28 | HideSolutionNode = FALSE
29 | EndGlobalSection
30 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
31 | {2E6E16A9-9F99-4B6C-B766-94105E861C85}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
32 | {2E6E16A9-9F99-4B6C-B766-94105E861C85}.Debug|Any CPU.Build.0 = Debug|Any CPU
33 | {2E6E16A9-9F99-4B6C-B766-94105E861C85}.Release|Any CPU.ActiveCfg = Release|Any CPU
34 | {2E6E16A9-9F99-4B6C-B766-94105E861C85}.Release|Any CPU.Build.0 = Release|Any CPU
35 | {FC447F4D-C6CD-4253-A048-3FD5BB1CA173}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
36 | {FC447F4D-C6CD-4253-A048-3FD5BB1CA173}.Debug|Any CPU.Build.0 = Debug|Any CPU
37 | {FC447F4D-C6CD-4253-A048-3FD5BB1CA173}.Release|Any CPU.ActiveCfg = Release|Any CPU
38 | {FC447F4D-C6CD-4253-A048-3FD5BB1CA173}.Release|Any CPU.Build.0 = Release|Any CPU
39 | {67F241C5-061D-48BB-A4AE-89FC52655A07}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
40 | {67F241C5-061D-48BB-A4AE-89FC52655A07}.Debug|Any CPU.Build.0 = Debug|Any CPU
41 | {67F241C5-061D-48BB-A4AE-89FC52655A07}.Release|Any CPU.ActiveCfg = Release|Any CPU
42 | {67F241C5-061D-48BB-A4AE-89FC52655A07}.Release|Any CPU.Build.0 = Release|Any CPU
43 | EndGlobalSection
44 | EndGlobal
45 |
--------------------------------------------------------------------------------
/Bench/SortBench.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Net.Http.Headers;
3 | using Bench.Utils;
4 | using BenchmarkDotNet.Attributes;
5 | using BenchmarkDotNet.Columns;
6 | using BenchmarkDotNet.Configs;
7 | using BenchmarkDotNet.Extensions;
8 | using BenchmarkDotNet.Horology;
9 | using BenchmarkDotNet.Jobs;
10 | using BenchmarkDotNet.Reports;
11 | using VxSort;
12 |
13 | namespace Bench
14 | {
15 | class LongConfig : ManualConfig
16 | {
17 | public LongConfig()
18 | {
19 | SummaryStyle = new SummaryStyle(true, SizeUnit.GB, TimeUnit.Microsecond);
20 | Add(Job.LongRun);
21 | Add(new TimePerNColumn());
22 | }
23 | }
24 |
25 | class MediumConfig : ManualConfig
26 | {
27 | public MediumConfig()
28 | {
29 | SummaryStyle = new SummaryStyle(true, SizeUnit.GB, TimeUnit.Microsecond);
30 | Add(Job.MediumRun);
31 | Add(new TimePerNColumn());
32 | }
33 | }
34 |
35 | class ShortConfig : ManualConfig
36 | {
37 | public ShortConfig()
38 | {
39 | SummaryStyle = new SummaryStyle(true, SizeUnit.GB, TimeUnit.Microsecond);
40 | Add(Job.ShortRun);
41 | Add(new TimePerNColumn());
42 | }
43 | }
44 |
45 |
46 | public class SortBenchBase where T : unmanaged, IComparable
47 | {
48 | protected virtual int InvocationsPerIteration { get; }
49 | protected int _iterationIndex = 0;
50 | T[] _values;
51 | protected T[][] _arrays;
52 |
53 | [Params(10, 100, 1_000, 10_000, 100_000, 1_000_000)]//, 10_000_000)]
54 | public int N;
55 |
56 | [GlobalSetup]
57 | public void Setup() => _values = ValuesGenerator.ArrayOfUniqueValues(N);
58 |
59 | [IterationCleanup]
60 | public void CleanupIteration() => _iterationIndex = 0; // reset the index to 0 at the end of every iteration
61 |
62 | [IterationSetup]
63 | public void SetupArrayIteration() => ValuesGenerator.FillArrays(ref _arrays, InvocationsPerIteration, _values);
64 | }
65 |
66 | [GenericTypeArguments(typeof(int))] // value type
67 | [InvocationCount(InvocationsPerIterationValue)]
68 | [Config(typeof(MediumConfig))]
69 | [DatatableJsonExporter]
70 | public class UnstableSort : SortBenchBase where T : unmanaged, IComparable
71 | {
72 | const int InvocationsPerIterationValue = 3;
73 |
74 | protected override int InvocationsPerIteration => InvocationsPerIterationValue;
75 |
76 | [Benchmark(Baseline=true)]
77 | public void ArraySort() => Array.Sort(_arrays[_iterationIndex++]);
78 |
79 | [Benchmark]
80 | public void VxSort() => VectorizedSort.UnstableSort(_arrays[_iterationIndex++]);
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | name: Build
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | - '!research'
8 | tags:
9 | - 'v*'
10 |
11 | env:
12 | dotnet_version: '3.1.100'
13 | DOTNET_CLI_TELEMETRY_OPTOUT: true
14 | DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
15 |
16 |
17 | jobs:
18 | build:
19 | runs-on: ${{ matrix.os }}
20 |
21 | strategy:
22 | fail-fast: false
23 | matrix:
24 | # macOS runners on GH Actions don't seem to have AVX2-enabled CPUs yet?
25 | #os: [ ubuntu-latest, windows-latest, macos-latest ]
26 | os: [ ubuntu-latest, windows-latest ]
27 |
28 | steps:
29 | - name: Checkout
30 | uses: actions/checkout@v1
31 |
32 | - uses: actions/cache@v1
33 | with:
34 | path: ~/.nuget/packages
35 | key: ${{ runner.os }}-nuget-${{ hashFiles('**/Directory.Build.targets') }}
36 | restore-keys: |
37 | ${{ runner.os }}-nuget-
38 |
39 | - name: Setup .NET Core SDK
40 | uses: actions/setup-dotnet@v1.4.0
41 | with:
42 | dotnet-version: ${{ env.dotnet_version }}
43 |
44 | - name: Make SDK shut up
45 | shell: bash
46 | run: touch "${DOTNET_ROOT}/$(dotnet --version).dotnetFirstUseSentinel"
47 |
48 | - name: Test
49 | run: dotnet test -c release -v normal Test/Test.csproj
50 | shell: bash
51 |
52 |
53 | publish:
54 | needs: build
55 |
56 | runs-on: windows-latest
57 |
58 | steps:
59 | - name: Checkout
60 | uses: actions/checkout@v1
61 |
62 | - uses: actions/cache@v1
63 | with:
64 | path: ~/.nuget/packages
65 | key: ${{ runner.os }}-nuget-${{ hashFiles('**/Directory.Build.targets') }}
66 | restore-keys: |
67 | ${{ runner.os }}-nuget-
68 |
69 | - name: Setup .NET Core SDK
70 | uses: actions/setup-dotnet@v1.0.2 # Issues with DOTNET_ROOT
71 | with:
72 | dotnet-version: ${{ env.dotnet_version }}
73 |
74 | - name: Pack NuGet package (CI)
75 | if: startsWith(github.ref, 'refs/heads/')
76 | shell: bash
77 | run: dotnet pack VxSort --configuration release --output nupkgs --version-suffix "ci.$(date -u +%Y%m%dT%H%M%S)+sha.${GITHUB_SHA:0:9}"
78 |
79 | - name: Pack NuGet package (Release)
80 | if: startsWith(github.ref, 'refs/tags/v')
81 | shell: bash
82 | run: dotnet pack VxSort --configuration release --output nupkgs
83 |
84 | - name: Upload artifacts (nupkg)
85 | uses: actions/upload-artifact@v1
86 | with:
87 | name: VxSort.nupkgs
88 | path: nupkgs
89 |
90 | - name: Publish packages to GitHub Package Registry
91 | env:
92 | TOKEN: ${{ secrets.GITHUB_TOKEN }}
93 | SOURCE: https://nuget.pkg.github.com/damageboy/index.json
94 | shell: bash
95 | run: |
96 | cd nupkgs
97 | curl -o nuget -L https://dist.nuget.org/win-x86-commandline/latest/nuget.exe
98 | ./nuget sources add -Name "GitHub" -Source $SOURCE -UserName damageboy -Password $TOKEN
99 | ./nuget push "VxSort.*.nupkg" -Source "GitHub"
100 |
101 |
--------------------------------------------------------------------------------
/Test/BitonicSortTests.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using NUnit.Framework;
5 | using VxSort;
6 | using static Test.DataGeneration;
7 | using DataGenerator = System.Func<(int[] data, int[] sortedData, string reproContext)>;
8 |
9 | namespace Test
10 | {
11 | public class BitonicSortTests
12 | {
13 | static readonly int[] BitonicSizes = { 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128 };
14 |
15 | static IEnumerable PreSorted =>
16 | from size in BitonicSizes
17 | select new ParityTests.SortTestCaseData(() => (Enumerable.Range(0, size).ToArray(), Enumerable.Range(0, size).ToArray(), "pre-sorted") ).SetArgDisplayNames($"{size:000}/S");
18 |
19 | static IEnumerable ReverseSorted =>
20 | from size in BitonicSizes
21 | select new ParityTests.SortTestCaseData(() => (Enumerable.Range(0, size).Reverse().ToArray(), Enumerable.Range(0, size).ToArray(), "reverse-sorted") ).SetArgDisplayNames($"Ƨ{size:0000000}");
22 |
23 | static IEnumerable HalfMinValue =>
24 | from size in BitonicSizes
25 | from seed in new[] {666, 333, 999, 314159}
26 | select new ParityTests.SortTestCaseData(() => GenerateData(size, seed, int.MinValue, 0.5)).SetArgDisplayNames($"{size:000}/{seed}/0.5min");
27 |
28 | static IEnumerable HalfMaxValue =>
29 | from size in BitonicSizes
30 | from seed in new[] {666, 333, 999, 314159}
31 | select new ParityTests.SortTestCaseData(() => GenerateData(size, seed, int.MaxValue, 0.5)).SetArgDisplayNames($"{size:000}/{seed}/0.5max");
32 |
33 | static IEnumerable AllOnes =>
34 | from size in BitonicSizes
35 | select new ParityTests.SortTestCaseData(() => (Enumerable.Repeat(1, size).ToArray(), Enumerable.Repeat(1, size).ToArray(), "all-ones") ).SetArgDisplayNames($"1:{size:0000000}");
36 |
37 |
38 | static IEnumerable ConstantSeed =>
39 | from size in BitonicSizes
40 | from seed in new[] {666, 333, 999, 314159}
41 | select new ParityTests.SortTestCaseData(() => GenerateData(size, seed, modulo: 100)).SetArgDisplayNames($"{size:000}/{seed}");
42 |
43 | static IEnumerable TimeSeed =>
44 | from size in BitonicSizes
45 | let numIterations = int.Parse(Environment.GetEnvironmentVariable("NUM_CYCLES") ?? "100")
46 | from i in Enumerable.Range(0, numIterations)
47 | let seed = ((int) DateTime.Now.Ticks + i * 666) % int.MaxValue
48 | select new ParityTests.SortTestCaseData(() => GenerateData(size, seed)).SetArgDisplayNames($"{size:000}/R{i}");
49 |
50 | [TestCaseSource(nameof(PreSorted))]
51 | [TestCaseSource(nameof(HalfMinValue))]
52 | [TestCaseSource(nameof(HalfMaxValue))]
53 | [TestCaseSource(nameof(ConstantSeed))]
54 | [TestCaseSource(nameof(TimeSeed))]
55 | public unsafe void BitonicSortTest(DataGenerator generator)
56 | {
57 | var (randomData, sortedData, reproContext) = generator();
58 |
59 | fixed (int* p = &randomData[0]) {
60 | BitonicSort.Sort(p, randomData.Length);
61 | }
62 |
63 | Assert.That(randomData, Is.Ordered, reproContext);
64 | Assert.That(randomData, Is.EqualTo(sortedData), reproContext);
65 | }
66 |
67 | }
68 | }
--------------------------------------------------------------------------------
/Bench/SmallSortBench.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Linq;
3 | using System.Runtime.InteropServices;
4 | using Bench.Utils;
5 | using BenchmarkDotNet.Attributes;
6 | using BenchmarkDotNet.Columns;
7 | using BenchmarkDotNet.Configs;
8 | using BenchmarkDotNet.Extensions;
9 | using BenchmarkDotNet.Horology;
10 | using BenchmarkDotNet.Jobs;
11 | using BenchmarkDotNet.Reports;
12 | using VxSort;
13 |
14 | namespace Bench
15 | {
16 | class SmallSortConfig : ManualConfig
17 | {
18 | public SmallSortConfig()
19 | {
20 | SummaryStyle = new SummaryStyle(true, SizeUnit.GB, TimeUnit.Nanosecond);
21 | Add(Job.LongRun);
22 | Add(new TimePerNColumn());
23 | }
24 | }
25 |
26 | public class SmallSortBenchBase where T : unmanaged, IComparable
27 | {
28 | const ulong CACHELINE_SIZE = 64;
29 | protected virtual int InvocationsPerIteration { get; }
30 |
31 | protected int _iterationIndex = 0;
32 | T[] _originalValues;
33 | protected T[][] _arrays;
34 | GCHandle[] _gcHandles;
35 | protected unsafe T*[] _arrayPtrs;
36 |
37 |
38 | protected virtual int ArraySize { get; }
39 |
40 | protected unsafe T* _tmp;
41 |
42 | [GlobalSetup]
43 | public unsafe void Setup()
44 | {
45 | _tmp = (T*) Marshal.AllocHGlobal(sizeof(T) * 2 * ArraySize);
46 | var rolledUpArraySize = ArraySize + (int) (CACHELINE_SIZE / (ulong) sizeof(T));
47 | _originalValues = ValuesGenerator.ArrayOfUniqueValues(rolledUpArraySize);
48 | _arrays = Enumerable.Range(0, InvocationsPerIteration).Select(_ => new T[rolledUpArraySize])
49 | .ToArray();
50 | _gcHandles = _arrays.Select(a => GCHandle.Alloc(a, GCHandleType.Pinned)).ToArray();
51 | _arrayPtrs = new T*[InvocationsPerIteration];
52 | for (var i = 0; i < InvocationsPerIteration; i++) {
53 | var p = (T*) _gcHandles[i].AddrOfPinnedObject();
54 | if (((ulong) p) % CACHELINE_SIZE != 0)
55 | p = (T*) ((((ulong) p) + CACHELINE_SIZE) & ~(CACHELINE_SIZE - 1));
56 |
57 | _arrayPtrs[i] = p;
58 | }
59 | }
60 |
61 | [IterationCleanup]
62 | public void CleanupIteration() => _iterationIndex = 0; // reset the index to 0 at the end of every iteration
63 |
64 | [IterationSetup]
65 | public void SetupArrayIteration() =>
66 | ValuesGenerator.FillArrays(ref _arrays, InvocationsPerIteration, _originalValues);
67 | }
68 |
69 | [GenericTypeArguments(typeof(int))] // value type
70 | [InvocationCount(InvocationsPerIterationValue)]
71 | [Config(typeof(SmallSortConfig))]
72 | public class SmallSortBench : SmallSortBenchBase where T : unmanaged, IComparable
73 | {
74 | const int InvocationsPerIterationValue = 4096;
75 | protected override int InvocationsPerIteration => InvocationsPerIterationValue;
76 | protected override int ArraySize => N;
77 |
78 | [Params(8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128)]
79 | public int N;
80 |
81 | [Benchmark(Baseline = true)]
82 | public unsafe void ArraySort() => Array.Sort(_arrays[_iterationIndex++]);
83 |
84 |
85 | //[Benchmark]
86 | //public unsafe void PCSort() => PositionCountingSort.Sort((int *) _arrayPtrs[_iterationIndex++], N, (int *) _tmp);
87 |
88 | [Benchmark]
89 | public unsafe void BitonicSort() => BitonicSort.Sort((int *) _arrayPtrs[_iterationIndex++], N);
90 |
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/Bench/Utils/TimePerNColumn.cs:
--------------------------------------------------------------------------------
1 | using System.Linq;
2 | using System.Text;
3 | using BenchmarkDotNet.Columns;
4 | using BenchmarkDotNet.Environments;
5 | using BenchmarkDotNet.Horology;
6 | using BenchmarkDotNet.Reports;
7 | using BenchmarkDotNet.Running;
8 |
9 | namespace Bench.Utils
10 | {
11 | public static class CommonExtensions
12 | {
13 | public static string ToTimeStr(this double value, TimeUnit unit = null, int unitNameWidth = 1,
14 | bool showUnit = true, string format = "N4",
15 | Encoding encoding = null)
16 | {
17 | unit = unit ?? TimeUnit.GetBestTimeUnit(value);
18 | double unitValue = TimeUnit.Convert(value, TimeUnit.Nanosecond, unit);
19 | if (showUnit) {
20 | string unitName = unit.Name.ToString(encoding ?? Encoding.ASCII).PadLeft(unitNameWidth);
21 | return $"{unitValue.ToStr(format)} {unitName}";
22 | }
23 |
24 | return $"{unitValue.ToStr(format)}";
25 | }
26 |
27 | public static string ToStr(this double value, string format = "0.##")
28 | {
29 | // Here we should manually create an object[] for string.Format
30 | // If we write something like
31 | // string.Format(HostEnvironmentInfo.MainCultureInfo, $"{{0:{format}}}", value)
32 | // it will be resolved to:
33 | // string.Format(System.IFormatProvider, string, params object[]) // .NET 4.5
34 | // string.Format(System.IFormatProvider, string, object) // .NET 4.6
35 | // Unfortunately, Mono doesn't have the second overload (with object instead of params object[]).
36 | var args = new object[] {value};
37 | return string.Format(HostEnvironmentInfo.MainCultureInfo, $"{{0:{format}}}", args);
38 | }
39 |
40 | public static string ToTimeStr(this double value, TimeUnit unit, Encoding encoding, string format = "N4",
41 | int unitNameWidth = 1, bool showUnit = true)
42 | => value.ToTimeStr(unit, unitNameWidth, showUnit, format, encoding);
43 |
44 | public static string ToTimeStr(this double value, Encoding encoding, TimeUnit unit = null, string format = "N4",
45 | int unitNameWidth = 1,
46 | bool showUnit = true)
47 | => value.ToTimeStr(unit, unitNameWidth, showUnit, format, encoding);
48 | }
49 |
/// <summary>
/// A BenchmarkDotNet column that reports the mean benchmark time divided by the
/// benchmark's "N" parameter, i.e. the time spent per processed element.
/// </summary>
public class TimePerNColumn : IColumn
{
    public string Id => nameof(TimePerNColumn);
    public string ColumnName => "Time / N";

    public bool IsDefault(Summary summary, BenchmarkCase benchmarkCase) => false;

    // Style-less overload of GetValue; rendering happens in the styled overload below.
    public string GetValue(Summary summary, BenchmarkCase benchmarkCase)
    {
        return "";
    }

    public bool IsAvailable(Summary summary) => true;
    public bool AlwaysShow => true;
    public ColumnCategory Category => ColumnCategory.Statistics;
    public int PriorityInCategory => 0;
    public bool IsNumeric => true;
    public UnitType UnitType => UnitType.Time;
    // Plain string literal: the original used an interpolated string with no placeholders.
    public string Legend => "Time taken to process a single element";

    /// <summary>
    /// Computes mean-time / N for the benchmark case, where "N" is a benchmark
    /// parameter holding the element count (throws if it is absent — every
    /// benchmark using this column must declare it).
    /// </summary>
    public string GetValue(Summary summary, BenchmarkCase benchmarkCase, SummaryStyle style)
    {
        var valueOfN = (int) benchmarkCase.Parameters.Items.Single(p => p.Name == "N").Value;
        var timePerN = summary[benchmarkCase].ResultStatistics.Mean / valueOfN;
        return timePerN.ToTimeStr(benchmarkCase.Config.Encoding, TimeUnit.GetBestTimeUnit(timePerN));
    }

    public override string ToString() => ColumnName;
}
79 | }
--------------------------------------------------------------------------------
/vxsort.sln.DotSettings:
--------------------------------------------------------------------------------
1 |
2 | END_OF_LINE
3 | True
4 | END_OF_LINE
5 | END_OF_LINE
6 | TOGETHER_SAME_LINE
7 | END_OF_LINE
8 | True
9 | True
10 | True
11 | True
12 | True
13 | True
14 | True
15 | True
16 | END_OF_LINE
17 | AVX
18 | PC
19 | <Policy Inspect="True" Prefix="" Suffix="" Style="AA_BB"><ExtraRule Prefix="" Suffix="" Style="AaBb" /></Policy>
20 | True
21 | True
22 | True
23 | True
24 | True
25 | True
26 | True
27 | True
28 | True
29 | True
--------------------------------------------------------------------------------
/Test/PositionCountingSortTests.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using NUnit.Framework;
5 | using VxSort;
6 | using static Test.DataGeneration;
7 | using DataGenerator = System.Func<(int[] data, int[] sortedData, string reproContext)>;
8 |
9 | namespace Test
10 | {
/// <summary>
/// Test-case generators for the position-counting sort (PCSort) code path.
/// Sizes are multiples of the 8-element vector width.
/// </summary>
public class PositionCountingSortTests
{
    static readonly int[] _pcSortSizes = new[] { 8, 16, 24, 32, 40, 48, 56, 64 };

    static readonly int[] _sixTeen = new[] { 16 };

    public class SortTestCaseData : TestCaseData
    {
        public SortTestCaseData(DataGenerator generator) : base(generator) { }
    }

    // NOTE: SetArgDisplayNames returns TestCaseData, so the generators are typed
    // IEnumerable<TestCaseData> (not SortTestCaseData).
    static IEnumerable<TestCaseData> PreSorted =>
        from size in _pcSortSizes
        select new SortTestCaseData(() => (Enumerable.Range(0, size).ToArray(), Enumerable.Range(0, size).ToArray(), "pre-sorted") ).SetArgDisplayNames($"S{size}");

    static IEnumerable<TestCaseData> HalfMinValue =>
        from size in _pcSortSizes
        from seed in new[] {666, 333, 999, 314159}
        select new SortTestCaseData(() => GenerateData(size, seed, int.MinValue, 0.5)).SetArgDisplayNames($"{size}/{seed}/0.5min");

    static IEnumerable<TestCaseData> HalfMaxValue =>
        from size in _pcSortSizes
        from seed in new[] {666, 333, 999, 314159}
        select new SortTestCaseData(() => GenerateData(size, seed, int.MaxValue, 0.5)).SetArgDisplayNames($"{size}/{seed}/0.5max");

    static IEnumerable<TestCaseData> ConstantSeed =>
        from size in _pcSortSizes
        from seed in new[] {666, 333, 999, 314159}
        select new SortTestCaseData(() => GenerateData(size, seed)).SetArgDisplayNames($"{size}/{seed}");

    // Time-seeded cases; NUM_CYCLES controls how many random variations are produced.
    static IEnumerable<TestCaseData> TimeSeed =>
        from size in _pcSortSizes
        let numIterations = int.Parse(Environment.GetEnvironmentVariable("NUM_CYCLES") ?? "100")
        from i in Enumerable.Range(0, numIterations)
        let seed = ((int) DateTime.Now.Ticks + i * 666) % int.MaxValue
        select new SortTestCaseData(() => GenerateData(size, seed)).SetArgDisplayNames($"{size}/R{i}");

    static IEnumerable<TestCaseData> PreSorted16 =>
        from size in _sixTeen
        select new SortTestCaseData(() => (Enumerable.Range(0, size).ToArray(), Enumerable.Range(0, size).ToArray(), "pre-sorted") ).SetArgDisplayNames($"S{size}");

    static IEnumerable<TestCaseData> HalfMinValue16 =>
        from size in _sixTeen
        from seed in new[] {666, 333, 999, 314159}
        select new SortTestCaseData(() => GenerateData(size, seed, int.MinValue, 0.5)).SetArgDisplayNames($"{size}/{seed}/0.5min");

    static IEnumerable<TestCaseData> HalfMaxValue16 =>
        from size in _sixTeen
        from seed in new[] {666, 333, 999, 314159}
        select new SortTestCaseData(() => GenerateData(size, seed, int.MaxValue, 0.5)).SetArgDisplayNames($"{size}/{seed}/0.5max");

    static IEnumerable<TestCaseData> ConstantSeed16 =>
        from size in _sixTeen
        from seed in new[] {666, 333, 999, 314159}
        select new SortTestCaseData(() => GenerateData(size, seed)).SetArgDisplayNames($"{size}/{seed}");

    static IEnumerable<TestCaseData> TimeSeed16 =>
        from size in _sixTeen
        let numIterations = int.Parse(Environment.GetEnvironmentVariable("NUM_CYCLES") ?? "100")
        from i in Enumerable.Range(0, numIterations)
        let seed = ((int) DateTime.Now.Ticks + i * 666) % int.MaxValue
        select new SortTestCaseData(() => GenerateData(size, seed)).SetArgDisplayNames($"{size}/R{i}");

    static IEnumerable<TestCaseData> AllTests =>
        PreSorted.Concat(HalfMinValue).Concat(HalfMaxValue).Concat(ConstantSeed).Concat(TimeSeed);

    static IEnumerable<TestCaseData> AllTests16 =>
        PreSorted16.Concat(HalfMinValue16).Concat(HalfMaxValue16).Concat(ConstantSeed16).Concat(TimeSeed16);
}
83 | }
--------------------------------------------------------------------------------
/Test/ParityTests.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Net;
5 | using NUnit.Framework;
6 | using VxSort;
7 | using static Test.DataGeneration;
8 | using DataGenerator = System.Func<(int[] data, int[] sortedData, string reproContext)>;
9 |
10 | namespace Test
11 | {
[Parallelizable(ParallelScope.All)]
public class ParityTests
{
    // Number of size variations generated per base array size; overridable from CI.
    static int NumCycles => int.Parse(Environment.GetEnvironmentVariable("NUM_CYCLES") ?? "10");

    public class SortTestCaseData : TestCaseData
    {
        public SortTestCaseData(DataGenerator generator) : base(generator) { }
    }

    // Base sizes; the sorter's interesting thresholds are included explicitly.
    static readonly int[] ArraySizes = {
        10,
        100,
        VectorizedSort.VxUnstableSortInt32.SMALL_SORT_THRESHOLD_ELEMENTS,
        BitonicSort<int>.MaxBitonicSortSize,
        1_000,
        10_000,
        100_000,
        1_000_000
    };

    static readonly int[] ConstantSeeds = { 666, 333, 999, 314159 };

    // NOTE: SetArgDisplayNames returns TestCaseData, hence the IEnumerable<TestCaseData> type.
    static IEnumerable<TestCaseData> PreSorted =>
        from size in ArraySizes
        from i in Enumerable.Range(0, NumCycles)
        let realSize = size + i
        select new SortTestCaseData(() => (Enumerable.Range(0, realSize).ToArray(), Enumerable.Range(0, realSize).ToArray(), "pre-sorted") ).SetArgDisplayNames($"S{realSize:0000000}");

    static IEnumerable<TestCaseData> ReverseSorted =>
        from size in ArraySizes
        from i in Enumerable.Range(0, NumCycles)
        let realSize = size + i
        select new SortTestCaseData(() => (Enumerable.Range(0, realSize).Reverse().ToArray(), Enumerable.Range(0, realSize).ToArray(), "reverse-sorted") ).SetArgDisplayNames($"Ƨ{realSize:0000000}");

    static IEnumerable<TestCaseData> HalfMinValue =>
        from size in ArraySizes
        from seed in ConstantSeeds
        from i in Enumerable.Range(0, NumCycles)
        let realSize = size + i
        select new SortTestCaseData(() => GenerateData(realSize, seed, int.MinValue, 0.5)).SetArgDisplayNames($"{realSize:0000000}/{seed}/0.5min");

    static IEnumerable<TestCaseData> HalfMaxValue =>
        from size in ArraySizes
        from seed in ConstantSeeds
        from i in Enumerable.Range(0, NumCycles)
        let realSize = size + i
        select new SortTestCaseData(() => GenerateData(realSize, seed, int.MaxValue, 0.5)).SetArgDisplayNames($"{realSize:0000000}/{seed}/0.5max");

    static IEnumerable<TestCaseData> AllOnes =>
        from size in ArraySizes
        from i in Enumerable.Range(0, NumCycles)
        let realSize = size + i
        select new SortTestCaseData(() => (Enumerable.Repeat(1, realSize).ToArray(), Enumerable.Repeat(1, realSize).ToArray(), "all-ones") ).SetArgDisplayNames($"1:{realSize:0000000}");

    static IEnumerable<TestCaseData> ConstantSeed =>
        from size in ArraySizes
        from seed in ConstantSeeds
        from i in Enumerable.Range(0, NumCycles)
        let realSize = size + i
        select new SortTestCaseData(() => GenerateData(realSize, seed)).SetArgDisplayNames($"{realSize:0000000}/{seed}");

    static IEnumerable<TestCaseData> TimeSeed =>
        from size in ArraySizes
        from i in Enumerable.Range(0, NumCycles)
        let realSize = size + i
        let seed = ((int) DateTime.Now.Ticks + i * 666) % int.MaxValue
        select new SortTestCaseData(() => GenerateData(realSize, seed)).SetArgDisplayNames($"{realSize:0000000}/R{i}");

    /// <summary>
    /// Sorts the generated data with the vectorized unstable sort and checks both
    /// ordering and multiset-equality against the expected sorted copy.
    /// </summary>
    [TestCaseSource(nameof(PreSorted))]
    [TestCaseSource(nameof(ReverseSorted))]
    [TestCaseSource(nameof(HalfMinValue))]
    [TestCaseSource(nameof(HalfMaxValue))]
    [TestCaseSource(nameof(AllOnes))]
    [TestCaseSource(nameof(ConstantSeed))]
    [TestCaseSource(nameof(TimeSeed))]
    public void VxSortPrimitiveUnstable(DataGenerator generator)
    {
        var (randomData, sortedData, reproContext) = generator();
        VectorizedSort.UnstableSort(randomData);

        Assert.That(randomData, Is.Ordered, reproContext);
        Assert.That(randomData, Is.EqualTo(sortedData), reproContext);
    }
}
97 | }
98 |
--------------------------------------------------------------------------------
/VxSort/BitonicSort.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Runtime.CompilerServices;
3 | using System.Runtime.Intrinsics;
4 | using static System.Runtime.Intrinsics.X86.Avx2;
5 |
6 | namespace VxSort
7 | {
8 | using V = Vector256;
9 | static partial class BitonicSort where T : unmanaged, IComparable
10 | {
11 |
12 | // Legend:
13 | // X - shuffle/permute mask for generating a cross (X) shuffle
14 | // the numbers (1,2,4) denote the stride of the shuffle operation
15 | // B - Blend mask, used for blending two vectors according to a given order
16 | // the numbers (1,2,4) denote the "stride" of blending, e.g. 1 means switch vectors
17 | // every element, 2 means switch vectors every two elements and so on...
18 | // P - Permute mask, read specific comment about it below...
19 | const byte X_1 = 0b10_11_00_01;
20 | const byte X_2 = 0b01_00_11_10;
21 | const byte B_1 = 0b10_10_10_10;
22 | const byte B_2 = 0b11_00_11_00;
23 | const byte B_4 = 0b11_11_00_00;
24 |
25 | // Shuffle (X_R) + Permute (P_X) is a more efficient way
26 | // (copied shamelessly from LLVM through compiler explorer)
27 | // For implementing X_4, which requires a cross 128-bit lane operation.
28 | // A Shuffle (1c lat / 1c tp) + 64 bit permute (3c lat / 1c tp) take 1 more cycle to execute than the
29 | // the alternative: PermuteVar8x32 / VPERMD which takes (3c lat / 1c tp)
30 | // But, the latter requires loading the permutation entry from cache, which can take up to 5 cycles (when cached)
31 | // and costs one more register, which steals a register from us for high-count bitonic sorts.
32 | // In short, it's faster this way, from my attempts...
33 | const byte X_R = 0b00_01_10_11;
34 | const byte P_X = 0b01_00_11_10;
35 |
36 | // Basic 8-element bitonic sort
37 | // This will get composed and inlined throughout
38 | // the various bitonic-sort sizes:
39 | // BitonicSort1V will be directly embedded in BitonicSort{2,3,5,9}V
40 | // BitonicSort2V will be directly embedded in BitonicSort{3,4,6,10}V
41 | // BitonicSort3V will be directly embedded in BitonicSort{7,11}V
42 | // etc.
43 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
44 | static void BitonicSort01V(ref V d)
45 | {
46 | // ReSharper disable JoinDeclarationAndInitializer
47 | V min, max, s;
48 | // ReSharper restore JoinDeclarationAndInitializer
49 | s = Shuffle(d, X_1);
50 | min = Min(s, d);
51 | max = Max(s, d);
52 | d = Blend(min, max, B_1);
53 |
54 | s = Shuffle(d, X_R);
55 | min = Min(s, d);
56 | max = Max(s, d);
57 | d = Blend(min, max,B_2);
58 |
59 | s = Shuffle(d, X_1);
60 | min = Min(s, d);
61 | max = Max(s, d);
62 | d = Blend(min, max,B_1);
63 |
64 | s = Shuffle(d, X_R);
65 | s = Permute4x64(s.AsInt64(),P_X).AsInt32();
66 | min = Min(s, d);
67 | max = Max(s, d);
68 | d = Blend(min, max, B_4);
69 |
70 | s = Shuffle(d, X_2);
71 | min = Min(s, d);
72 | max = Max(s, d);
73 | d = Blend(min, max,B_2);
74 |
75 | s = Shuffle(d, X_1);
76 | min = Min(s, d);
77 | max = Max(s, d);
78 | d = Blend(min, max, B_1);
79 | }
80 |
81 | // Basic bitonic 8-element merge
82 | // This will get composed and inlined throughout
83 | // the code base for merging larger sized bitonic sorts temporary result states
84 | // BitonicSort1VFinish used for BitonicSort{2,3,5,9}V{,Finish}
85 | // BitonicSort2VFinish used for BitonicSort{3,4,6,10}V{,Finish}
86 |
87 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
88 | static void BitonicSort01VMerge(ref V d)
89 | {
90 | // ReSharper disable JoinDeclarationAndInitializer
91 | V min, max, s;
92 | // ReSharper restore JoinDeclarationAndInitializer
93 |
94 | s = Permute4x64(d.AsInt64(), P_X).AsInt32();
95 | min = Min(s, d);
96 | max = Max(s, d);
97 | d = Blend(min, max, B_4);
98 |
99 | s = Shuffle(d, X_2);
100 | min = Min(s, d);
101 | max = Max(s, d);
102 | d = Blend(min, max, B_2);
103 |
104 | s = Shuffle(d, X_1);
105 | min = Min(s, d);
106 | max = Max(s, d);
107 | d = Blend(min, max, B_1);
108 | }
109 | }
110 | }
--------------------------------------------------------------------------------
/VxSort/BitonicSort.Generated.tt:
--------------------------------------------------------------------------------
<#@ template language="C#" #>
<#@ template debug="false" hostspecific="false" language="C#" #>
<#@ assembly name="System.Core" #>
<#@ import namespace="System.Linq" #>
<#@ import namespace="System.Text" #>
<#@ import namespace="System.Collections.Generic" #>
<#@ output extension=".cs" #>
//
// This code was generated by a tool on <#= DateTime.UtcNow.ToString("yyyy-MM-dd-HH:mm:ss") #>
//
// Changes to this file may cause incorrect behavior and will be lost if
// the code is regenerated.
//

using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using static System.Runtime.Intrinsics.X86.Avx;
using static System.Runtime.Intrinsics.X86.Avx2;

namespace VxSort
{
    using V = Vector256<int>;
    static unsafe partial class BitonicSort<T>
    {
<#
    // Round v up to the next power of 2 (v > 0).
    static uint GetNextPowerOf2(uint v)
    {
        v--;
        v |= v >> 1;
        v |= v >> 2;
        v |= v >> 4;
        v |= v >> 8;
        v |= v >> 16;
        v++;
        return v;
    }

    // "ref d01, ref d02, ..." argument lists for composing the generated methods.
    string GenerateParamList(uint start, uint numParams)
        => string.Join(", ", Enumerable.Range((int) start, (int) numParams).Select(p => $"ref d{p:00}"));

    // "ref V d01, ref V d02, ..." parameter declaration lists.
    string GenerateParamDefList(uint numParams)
        => string.Join(", ", Enumerable.Range(1, (int) numParams).Select(p => $"ref V d{p:00}"));

    const uint MaxBitonicSortVectors = 16U;

    // Merge variants are only ever invoked for power-of-2 halves, so only
    // sizes up to the next power of 2 over the max, halved, are needed.
    uint LargestMergeVariantNeeded = GetNextPowerOf2(MaxBitonicSortVectors) / 2;
#>
<# for (var m = 2U; m <= MaxBitonicSortVectors; m++) {
       var s1 = GetNextPowerOf2(m) / 2;
       var s2 = m - s1; #>
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort<#= $"{m:00}" #>V(<#= GenerateParamDefList(m) #>)
        {
            V tmp;

            BitonicSort<#= $"{s1:00}" #>V(<#= GenerateParamList(1, s1) #>);
            BitonicSort<#= $"{s2:00}" #>V(<#= GenerateParamList(s1 + 1, s2) #>);

<#     for (var r = s1 + 1; r < m + 1; r++) {
           var x = s1 + 1 - (r - s1); #>
            tmp = Shuffle(d<#= $"{r:00}" #>, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d<#= $"{r:00}" #> = Max(d<#= $"{x:00}" #>, tmp);
            d<#= $"{x:00}" #> = Min(d<#= $"{x:00}" #>, tmp);

<#     } #>
            BitonicSort<#= $"{s1:00}" #>VMerge(<#= GenerateParamList(1, s1) #>);
            BitonicSort<#= $"{s2:00}" #>VMerge(<#= GenerateParamList(s1 + 1, s2) #>);
        }
<#     if (m <= LargestMergeVariantNeeded) { #>
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort<#= $"{m:00}" #>VMerge(<#= GenerateParamDefList(m) #>)
        {
            V tmp;

<#         for (var r = s1 + 1; r < m + 1; r++) {
               var x = (r - s1); #>
            tmp = d<#= $"{x:00}" #>;
            d<#= $"{x:00}" #> = Min(d<#= $"{r:00}" #>, d<#= $"{x:00}" #>);
            d<#= $"{r:00}" #> = Max(d<#= $"{r:00}" #>, tmp);

<#         } #>
            BitonicSort<#= $"{s1:00}" #>VMerge(<#= GenerateParamList(1, s1) #>);
            BitonicSort<#= $"{s2:00}" #>VMerge(<#= GenerateParamList(s1 + 1, s2) #>);
        }
<#     } #>

<# } #>

<# for (var m = 1U; m <= MaxBitonicSortVectors; m++) { #>
        [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort<#= $"{m:00}" #>V(int* ptr)
        {
            var N = V.Count;

<#     for (var l = 0; l < m; l++) { #>
            var d<#= $"{l + 1:00}" #> = LoadDquVector256(ptr + <#= $"{l:00}" #>*N);
<#     } #>

            BitonicSort<#= $"{m:00}" #>V(<#= GenerateParamList(1, m) #>);

<#     for (var l = 0; l < m; l++) { #>
            Store(ptr + <#= $"{l:00}" #>*N, d<#= $"{l + 1:00}" #>);
<#     } #>
        }
<# } #>

        public const int MinBitonicSortSize = 8;
        public const int MaxBitonicSortSize = <#= MaxBitonicSortVectors * 8 #>;

        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        public static void Sort(int* ptr, int length)
        {
            Debug.Assert(length % 8 == 0);
            Debug.Assert(length <= MaxBitonicSortSize);

            switch (length / 8) {
<# for (var m = 1U; m <= MaxBitonicSortVectors; m++) { #>
                case <#= $"{m:00}" #>: BitonicSort<#= $"{m:00}" #>V(ptr); return;
<# } #>

                default:
                    throw new NotSupportedException("length is not a multiple of 8 && <= <#= MaxBitonicSortVectors * 8 #>");
            }
        }
    }
}
133 |
--------------------------------------------------------------------------------
/Bench/Utils/ValueGenerator.cs:
--------------------------------------------------------------------------------
1 | // Licensed to the .NET Foundation under one or more agreements.
2 | // The .NET Foundation licenses this file to you under the MIT license.
3 | // See the LICENSE file in the project root for more information.
4 |
5 | using System;
6 | using System.Collections.Generic;
7 | using System.Linq;
8 | using System.Text;
9 |
10 | namespace Bench.Utils
11 | {
/// <summary>
/// Deterministic test-data generator for benchmarks: every method seeds its own
/// <see cref="Random"/> with a fixed seed so results are repeatable across runs.
/// </summary>
public static class ValuesGenerator
{
    const int Seed = 666; // we always use the same seed to have repeatable results!

    /// <summary>
    /// (Re)fills <paramref name="arrays"/> (allocating it on first use) with
    /// <paramref name="collectionsCount"/> copies of <paramref name="source"/>.
    /// </summary>
    internal static void FillArrays<T>(ref T[][] arrays, int collectionsCount, T[] source)
    {
        if (arrays == null)
            arrays = Enumerable.Range(0, collectionsCount).Select(_ => new T[source.Length]).ToArray();

        foreach (var array in arrays)
            System.Array.Copy(sourceArray: source, destinationArray: array, length: source.Length);

        if (arrays.Any(collection => collection.Length != source.Length)) // we dont use Debug.Assert here because this code will be executed mostly in Release
            throw new InvalidOperationException();
    }

    /// <summary>
    /// Copies <paramref name="source"/> into <paramref name="collectionsCount"/>
    /// pre-allocated native buffers (each assumed to hold at least source.Length elements).
    /// </summary>
    internal static unsafe void FillArrays<T>(T** arrays, int collectionsCount, T[] source) where T : unmanaged
    {
        for (var i = 0; i < collectionsCount; i++)
            new ReadOnlySpan<T>(source).CopyTo(new Span<T>(arrays[i], source.Length));
    }

    /// <summary>Returns a deterministic value of T that is not default(T).</summary>
    public static T GetNonDefaultValue<T>()
    {
        if (typeof(T) == typeof(byte)) // we can't use ArrayOfUniqueValues for byte
            return Array<T>(byte.MaxValue).First(value => !value.Equals(default(T)));
        else
            return ArrayOfUniqueValues<T>(2).First(value => !value.Equals(default(T)));
    }

    /// <summary>
    /// Does not support byte because there are only 256 unique byte values.
    /// </summary>
    public static T[] ArrayOfUniqueValues<T>(int count)
    {
        var random = new Random(Seed);

        var uniqueValues = new HashSet<T>();

        while (uniqueValues.Count != count)
        {
            T value = GenerateValue<T>(random);

            // HashSet.Add is a no-op for duplicates, so no Contains pre-check is needed.
            uniqueValues.Add(value);
        }

        return uniqueValues.ToArray();
    }

    /// <summary>Returns a deterministic array of <paramref name="count"/> random values.</summary>
    public static T[] Array<T>(int count)
    {
        var random = new Random(Seed);

        var result = new T[count];

        if (typeof(T) == typeof(byte))
        {
            // bytes are filled in bulk; GenerateValue does not support byte
            random.NextBytes((byte[])(object)result);
        }
        else
        {
            for (int i = 0; i < result.Length; i++)
            {
                result[i] = GenerateValue<T>(random);
            }
        }

        return result;
    }

    /// <summary>Returns a deterministic dictionary with <paramref name="count"/> unique keys.</summary>
    public static Dictionary<TKey, TValue> Dictionary<TKey, TValue>(int count)
    {
        var random = new Random(Seed);

        var dictionary = new Dictionary<TKey, TValue>();

        while (dictionary.Count != count)
        {
            TKey key = GenerateValue<TKey>(random);

            if (!dictionary.ContainsKey(key))
                dictionary.Add(key, GenerateValue<TValue>(random));
        }

        return dictionary;
    }

    // Supported: char, int, double, bool, string. Throws for anything else.
    private static T GenerateValue<T>(Random random)
    {
        if (typeof(T) == typeof(char))
            return (T)(object)(char)random.Next(char.MinValue, char.MaxValue);
        if (typeof(T) == typeof(int))
            return (T)(object)random.Next();
        if (typeof(T) == typeof(double))
            return (T)(object)random.NextDouble();
        if (typeof(T) == typeof(bool))
            return (T)(object)(random.NextDouble() > 0.5);
        if (typeof(T) == typeof(string))
            return (T) (object) GenerateRandomString(random, 1, 50);

        throw new NotImplementedException($"{typeof(T).Name} is not implemented");
    }

    // Random alphanumeric string with length in [minLength, maxLength).
    private static string GenerateRandomString(Random random, int minLength, int maxLength)
    {
        var length = random.Next(minLength, maxLength);

        var builder = new StringBuilder(length);
        for (int i = 0; i < length; i++)
        {
            var rangeSelector = random.Next(0, 3);

            if (rangeSelector == 0)
                builder.Append((char) random.Next('a', 'z'));
            else if (rangeSelector == 1)
                builder.Append((char) random.Next('A', 'Z'));
            else
                builder.Append((char) random.Next('0', '9'));
        }

        return builder.ToString();
    }
}
137 | }
138 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VxSort [](https://github.com/damageboy/vxsort/actions) [](https://www.nuget.org/packages/VxSort/)
2 |
3 |
4 |
5 |
6 |
7 | VxSort is a repository that contains both the code accompanying the [This goes to Eleven](https://bits.houmus.org/2020-01-28/this-goes-to-eleven-pt1) blog post series by [@damageboy](https://github.com/damageboy/).
8 |
In addition, this repository contains the source code for the NuGet package by the same name that provides a ready-to-use implementation for sorting with managed code at much higher speeds than what is currently possible with CoreCLR 3.0.
10 |
11 | ## Usage
12 |
13 | Add with Nuget.
14 |
15 | ```csharp
16 | using VxSort;
17 |
18 | // ...
var r = new Random((int) DateTime.UtcNow.Ticks);
int[] lotsOfNumbers = Enumerable.Range(0, 100_000_000).Select(_ => r.Next()).ToArray();

VectorizedSort.UnstableSort(lotsOfNumbers);
23 |
24 | // Wow
25 | ```
26 |
27 | ## Roadmap to 1.0
28 |
29 | Currently, VxSort is very feature-less, Here's what it **can** do:
30 |
31 | - [x] Sort 32-bit integers, ascending
32 |
33 | Here's what's **missing**, in terms of functionality, and the order at which it should probably be implemented:
34 |
35 | - [ ] Primitive Support:
36 | - [ ] Add 32-bit descending support: [#2](https://github.com/damageboy/VxSort/issues/2)
- [ ] Add 32-bit unsigned ascending support: [#3](https://github.com/damageboy/VxSort/issues/3) (slightly tricky):
38 | - There is no direct unsigned support in AVX2, e.g. we have:
39 | [`_mm256_cmpgt_epi32(__m256i a, __m256i b)`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32) / [`CompareGreaterThan(Vector256, Vector256`](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.intrinsics.x86.avx2.comparegreaterthan?view=netcore-3.0#System_Runtime_Intrinsics_X86_Avx2_CompareGreaterThan_System_Runtime_Intrinsics_Vector256_System_Int32__System_Runtime_Intrinsics_Vector256_System_Int32__)
40 | but no unsigned variant for the comparison operation.
41 | - Instead we could:
42 | - Perform a fake descending partition operation around the value 0, where all `>= 0` are on the left,
43 | and all "fake" `< 0` values (e.g what is really unsigned values with the top bit set...) go to the right.
- Proceed to partition the left portion with ascending semantics, while partitioning the right portion with descending semantics
45 | - (Unsigned) Profit!
46 | - [ ] Add 32-bit unsigned descending support.
47 | - [ ] Add 64-bit signed/unsigned ascending/descending support.
48 | - [ ] Support 32/64 bit floating point sorting.
49 | - Try to generalize the 32/64-bit support with generic wrappers to avoid code duplication
50 | - [ ] 16 bit support (annoying since there is no 16 bit permute so perf will go down doing 16 -> 32 bit and back)
51 | - [ ] 8 bit support (annoying since there is no 8 bit permute so perf will go down doing 16 -> 32 bit and back)
52 | - [ ] Key/Value Sorting:
53 | - [ ] Add a stable variant, tweaking the current double-pumped loop and switching to `PCSort` for stable sorting.
54 | This is substantially slower, but such is life
55 | - [ ] Add an explicit unstable variant of sorting, for those who don't care/need it
56 | - [ ] `IComparer`/`Comparison` -like based vectorized sorting:
57 | - In general, all hope is lost if `IComparer`/`Comparison` or anything of that sort is provided.
58 | - Unless the `IComparer`/`Comparison` is essentially some sort of a trivial/primitive "compare the struct/class by comparing member X, For example:
An `IComparer`/`Comparison` that is using the 3rd member of `T` which is at a constant offset of 10-bytes into the `T` struct/class.
60 | - Those sorts of trivial `IComparer`/`Comparison` could be actually solved for with a AVX2 gather operation:
61 | gather all the keys at a given offset and performs the regular vectorized sorting.
62 | - This would require a new type of API where the user provides (perhaps?) an Expression> that performs the key selection, that could be "reverse-engineered" to
63 | understand if the expression tree can be reduced to an AVX2 gather operation and so on...
64 | - General / Good practices?:
65 | - [ ] Transition to code-generating AVX2 Bitonic sorting to avoid maintaining source files with thousands of source lines that could be instead machine generated.
66 |
67 |
68 | ## Credits
69 |
70 | VxSort is based on the following ideas/papers:
71 | * [Fast Quicksort Implementation Using AVX Instructions](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.1009.7773&rep=rep1&type=pdf) by Shay Gueron & Vlad Krasnov for the basic idea of the Vectorized Partition Block.
72 | * [Position-counting sort](https://dirtyhandscoding.github.io/posts/vectorizing-small-fixed-size-sort.html) by @dirtyhandscoding
73 |
74 | VxSort uses the following projects:
75 |
76 | * [**`Fody`**](https://github.com/Fody/Fody) by the Fody developers.
77 | * [**`LocalsInit.Fody`**](https://github.com/ltrzesniewski/LocalsInit.Fody/tree/master/src) for getting rid of `.locals init` by [Lucas Trzesniewski](https://github.com/ltrzesniewski)
78 | * The Logo `sort` by Markus from the Noun Project
79 |
80 | ## Author
81 |
82 | Dan Shechter, a.k.a @damageboy
83 |
84 |
--------------------------------------------------------------------------------
/Bench/Utils/DatatableJsonExporter.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 | using System.Linq;
5 | using BenchmarkDotNet.Attributes;
6 | using BenchmarkDotNet.Engines;
7 | using BenchmarkDotNet.Exporters;
8 | using BenchmarkDotNet.Loggers;
9 | using BenchmarkDotNet.Reports;
10 | using Microsoft.CodeAnalysis.CSharp;
11 | using JsonSerializer = Bench.Utils.SimpleJson;
12 |
13 | namespace Bench.Utils
14 | {
/// <summary>
/// Config attribute that attaches a <see cref="DatatableJsonExporter"/> to a
/// benchmark class (or the whole assembly).
/// </summary>
[AttributeUsage(AttributeTargets.Class | AttributeTargets.Assembly, AllowMultiple = true)]
public class DatatableJsonExporterAttribute : ExporterConfigBaseAttribute
{
    /// <summary>Creates the attribute with a freshly configured exporter.</summary>
    public DatatableJsonExporterAttribute(bool indentJson = false, bool excludeMeasurements = false)
        : this(new DatatableJsonExporter(indentJson, excludeMeasurements)) { }

    DatatableJsonExporterAttribute(IExporter exporter) : base(exporter) { }
}
/// <summary>
/// Exports a benchmark <see cref="Summary"/> as a JSON array of flat per-benchmark
/// dictionaries (including per-N timing strings and colored measurement hints),
/// shaped for rendering with DataTables.
/// </summary>
public class DatatableJsonExporter : ExporterBase
{
    protected override string FileExtension => "datatable.json";

    bool IndentJson { get; } = true;
    private bool ExcludeMeasurements { get; }

    public DatatableJsonExporter(bool indentJson = false, bool excludeMeasurements = false)
    {
        IndentJson = indentJson;
        ExcludeMeasurements = excludeMeasurements;
    }

    public override void ExportToLog(Summary summary, ILogger logger)
    {
        var reportIndex = -1;
        var benchmarks = summary.Reports.Select(report => {
            reportIndex++;

            // "N" is the element-count parameter every benchmark exported here carries.
            var n = (int) report.BenchmarkCase.Parameters["N"];

            var data = new Dictionary<string, object> {
                { "FullName", FullNameProvider.GetBenchmarkName(report.BenchmarkCase) }, // do NOT remove this property, it is used for xunit-performance migration
                { "MethodName", FullNameProvider.GetMethodName(report.BenchmarkCase) },
                { "Mean", report.ResultStatistics.Mean },
                { "MeanDataTable", $"{report.ResultStatistics.Mean:0.0000} ({report.ResultStatistics.ConfidenceInterval.Lower:0.0000} - {report.ResultStatistics.ConfidenceInterval.Upper:0.0000})" },
                { "TimePerNDataTable", $"{report.ResultStatistics.Mean/n:0.0000} ({report.ResultStatistics.ConfidenceInterval.Lower/n:0.0000} - {report.ResultStatistics.ConfidenceInterval.Upper/n:0.0000})" },
                { "Median", report.ResultStatistics.Median },
            };

            foreach (var c in summary.Table.Columns)
                data.Add(c.OriginalColumn.Id, summary.Table.FullContent[reportIndex][c.Index]);

            foreach (var param in report.BenchmarkCase.Parameters.Items)
                data.Add(param.Name, param.Value);

            // We construct Measurements manually, so that we can have the IterationMode enum as text, rather than an integer

            var resultMeasurements = report.AllMeasurements.Where(m =>
                m.IterationMode == IterationMode.Workload &&
                m.IterationStage == IterationStage.Result).Select(m => m.Nanoseconds).ToArray();

            var min = resultMeasurements.Min();
            var max = resultMeasurements.Max();

            // min/max samples carry color hints for the rendered measurement sparkline
            var measurementString = string.Join(",",
                resultMeasurements.
                    Select(m =>
                        m switch {
                            var x when x == min => $"{x};#00AA00",
                            var x when x == max => $"{x};#AA0000",
                            _ => m.ToString()
                        }));

            data.Add("Measurements", measurementString);

            if (report.Metrics.Any())
            {
                data.Add("Metrics", report.Metrics.Values);
            }
            return data;
        });

        var flatData = benchmarks.ToArray();

        FixRatio(flatData);

        // NOTE(review): IndentJson is currently ignored — output is always indented;
        // confirm whether the hard-coded 'true' is intentional.
        JsonSerializer.CurrentJsonSerializerStrategy.Indent = true;
        logger.WriteLine(JsonSerializer.SerializeObject(flatData));
    }

    // Adds a "RatioDataTable" entry per benchmark: absolute + max-scaled ratio
    // percentages plus a color hint (gray for 1.0, red for slower, green for faster).
    void FixRatio(Dictionary<string, object>[] flatData)
    {
        var maxRatio =
            flatData.Select(data => double.Parse((string) data["BaselineRatioColumn.Mean"])).Max();

        foreach (var data in flatData) {
            var currentRatio = double.Parse((string) data["BaselineRatioColumn.Mean"]);

            var ratioColor = currentRatio == 1 ? "#aaaaaa" :
                currentRatio > 1 ? "#CC0000" : "#00CC00";

            var ratio100 = (int) (currentRatio * 100);
            var ratio100Scaled = (int) ((currentRatio / maxRatio) * 100);

            data.Add("RatioDataTable",
                $"({ratio100}:{ratio100Scaled});{currentRatio * 100:N2}%;{ratioColor}");
        }
    }
}
123 |
--------------------------------------------------------------------------------
/Test/PermutationTableTests.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Runtime.Intrinsics;
5 | using System.Runtime.Intrinsics.X86;
6 | using NUnit.Framework;
7 | using VxSort;
8 | using static System.Runtime.Intrinsics.X86.Avx;
9 | using static System.Runtime.Intrinsics.X86.Avx2;
10 | using static VxSort.BytePermutationTables;
11 |
12 | namespace Test
13 | {
14 | public class PermutationTableTests
15 | {
16 | static IEnumerable GenerateStableIntPermTableValues()
17 | {
18 | for (var mask = 0U; mask < 256U; mask++) {
19 | var data = new int[] { -1, -1, -1, -1, -1, -1, -1, -1};
20 | var left = 0;
21 | var right = 0;
22 |
23 | var numRight = (int) Popcnt.PopCount(mask);
24 | var numLeft = 8 - numRight;
25 | var leftSegment = new Span(data, 0, numLeft);
26 | var rightSegment = new Span(data, numLeft, numRight);
27 |
28 |
29 | for (var b = 0; b < 8; b++) {
30 | if (((mask >> b) & 1) == 0)
31 | leftSegment[left++] = b;
32 | else {
33 | rightSegment[right++] = b;
34 | }
35 | }
36 |
37 | for (var b = 0; b < 8; b++) {
38 | Assert.That(data[b], Is.Not.Negative);
39 | }
40 |
41 | yield return data;
42 | }
43 | }
44 |
45 | [Test]
46 | [Repeat(1000)]
47 | public unsafe void GeneratedPermutationsAreCorrect()
48 | {
49 | var perms = GenerateStableIntPermTableValues().ToArray();
50 |
51 | for (var i = 0U; i < 256U; i++) {
52 | var pivot = 666;
53 |
54 | var r = new Random((int) DateTime.UtcNow.Ticks);
55 |
56 | var data = new int[8] {-1, -1, -1, -1, -1, -1, -1, -1};
57 | for (var j = 0; j < 8; j++) {
58 | data[j] = (((i >> j) & 0x1) == 0) ? r.Next(0, 666) : r.Next(777, 1000);
59 | }
60 |
61 | // Check if I messed up and there's a -1 somewhere
62 | Assert.That(data, Is.All.Not.Negative);
63 |
64 | var permutedData = new int[8];
65 |
66 | fixed (int* perm = &perms[i][0])
67 | fixed (int* pSrc = &data[0])
68 | fixed (int* pDest = &permutedData[0]) {
69 | var dataVector = LoadDquVector256(pSrc);
70 | dataVector = PermuteVar8x32(dataVector, LoadDquVector256(perm));
71 | Store(pDest, dataVector);
72 | }
73 |
74 | var numLeft = 8 - (int) Popcnt.PopCount(i);
75 | Assert.That(permutedData[0..numLeft], Is.All.LessThan(pivot));
76 | Assert.That(permutedData[numLeft..], Is.All.GreaterThan(pivot));
77 | Assert.That(data.Except(permutedData), Is.Empty);
78 | }
79 | }
80 |
81 | [Test]
82 | public unsafe void GeneratedPermutationsAreStable()
83 | {
84 | var perms = GenerateStableIntPermTableValues().ToArray();
85 |
86 | for (var mask = 0U; mask < 256U; mask++) {
87 | var pivot = 666;
88 |
89 | var popCount = (int) Popcnt.PopCount(mask);
90 | var numRight = popCount;
91 | var numLeft = 8 - popCount;
92 |
93 | for (var numPivot = 0; numPivot < 4; numPivot++) {
94 | var data = new int[] {-1, -1, -1, -1, -1, -1, -1, -1};
95 | var smallerThanData = Enumerable.Range(100, numLeft).ToArray();
96 | var largerThanData = Enumerable.Range(777, numRight).ToArray();
97 |
98 |
99 | for (int b = 0, si = 0, li = 0; b < 8; b++) {
100 | data[b] = (((mask >> b) & 1) == 0) ? smallerThanData[si++] : largerThanData[li++];
101 | }
102 | var permutedData = new int[] {-1, -1, -1, -1, -1, -1, -1, -1};
103 |
104 | fixed (int* perm = &perms[mask][0])
105 | fixed (int* pSrc = &data[0])
106 | fixed (int* pDest = &permutedData[0]) {
107 | var dataVector = LoadDquVector256(pSrc);
108 | dataVector =
109 | PermuteVar8x32(dataVector, LoadDquVector256(perm));
110 | Store(pDest, dataVector);
111 | }
112 |
113 | var msg = $"mask is {mask}/{Convert.ToString(mask, 2).PadLeft(8, '0')}|numPivot={numPivot}";
114 | Assert.That(permutedData[0..numLeft], Is.All.LessThan(pivot), msg);
115 | Assert.That(permutedData[0..numLeft], Is.Ordered, msg);
116 | Assert.That(permutedData[^numRight..], Is.All.GreaterThan(pivot), msg);
117 | Assert.That(permutedData[^numRight..], Is.Ordered, msg);
118 | Assert.That(data.Except(permutedData), Is.Empty, msg);
119 | }
120 | }
121 | }
122 |
123 | [Test]
124 | public unsafe void CompiledBytePermTableAlignedPtrIsGood()
125 | {
126 | var perms = GenerateStableIntPermTableValues().ToArray();
127 | for (var i = 0U; i < 256U; i++) {
128 | fixed (int* p = &perms[i][0]) {
129 | var truth = LoadDquVector256(p);
130 | var test =
131 | And(GetBytePermutationAligned(BytePermTableAlignedPtr, i), Vector256.Create(0x7));
132 |
133 | Assert.That(truth, Is.EqualTo(test));
134 | }
135 | }
136 | }
137 | }
138 | }
139 |
--------------------------------------------------------------------------------
/Bench/Utils/FullNameProvider.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Reflection;
6 | using System.Text;
7 | using BenchmarkDotNet.Code;
8 | using BenchmarkDotNet.Parameters;
9 | using BenchmarkDotNet.Running;
10 |
11 | namespace Bench.Utils
12 | {
13 | public static class FullNameProvider
14 | {
15 | private static readonly IReadOnlyDictionary Aliases = new Dictionary
16 | {
17 | { typeof(byte), "byte" },
18 | { typeof(sbyte), "sbyte" },
19 | { typeof(short), "short" },
20 | { typeof(ushort), "ushort" },
21 | { typeof(int), "int" },
22 | { typeof(uint), "uint" },
23 | { typeof(long), "long" },
24 | { typeof(ulong), "ulong" },
25 | { typeof(float), "float" },
26 | { typeof(double), "double" },
27 | { typeof(decimal), "decimal" },
28 | { typeof(object), "object" },
29 | { typeof(bool), "bool" },
30 | { typeof(char), "char" },
31 | { typeof(string), "string" },
32 | { typeof(byte?), "byte?" },
33 | { typeof(sbyte?), "sbyte?" },
34 | { typeof(short?), "short?" },
35 | { typeof(ushort?), "ushort?" },
36 | { typeof(int?), "int?" },
37 | { typeof(uint?), "uint?" },
38 | { typeof(long?), "long?" },
39 | { typeof(ulong?), "ulong?" },
40 | { typeof(float?), "float?" },
41 | { typeof(double?), "double?" },
42 | { typeof(decimal?), "decimal?" },
43 | { typeof(bool?), "bool?" },
44 | { typeof(char?), "char?" }
45 | };
46 |
47 | public static string GetBenchmarkName(BenchmarkCase benchmarkCase)
48 | {
49 | var type = benchmarkCase.Descriptor.Type;
50 |
51 | // we can't just use type.FullName because we need sth different for generics (it reports SimpleGeneric`1[[System.Int32, mscorlib, Version=4.0.0.0)
52 | var name = new StringBuilder();
53 |
54 | if (!string.IsNullOrEmpty(type.Namespace))
55 | name.Append(type.Namespace).Append('.');
56 |
57 | name.Append(GetNestedTypes(type));
58 |
59 | name.Append(GetTypeName(type)).Append('.');
60 |
61 | name.Append(GetMethodName(benchmarkCase));
62 |
63 | return name.ToString();
64 | }
65 |
66 | private static string GetNestedTypes(Type type)
67 | {
68 | string nestedTypes = "";
69 | Type child = type, parent = type.DeclaringType;
70 | while (child.IsNested && parent != null)
71 | {
72 | nestedTypes = parent.Name + "+" + nestedTypes;
73 |
74 | child = parent;
75 | parent = parent.DeclaringType;
76 | }
77 |
78 | return nestedTypes;
79 | }
80 |
81 | internal static string GetTypeName(Type type)
82 | {
83 | if (!type.IsGenericType)
84 | return type.Name;
85 |
86 | string mainName = type.Name.Substring(0, type.Name.IndexOf('`'));
87 | string args = string.Join(", ", type.GetGenericArguments().Select(GetTypeName).ToArray());
88 |
89 | return $"{mainName}<{args}>";
90 | }
91 |
92 | internal static string GetMethodName(BenchmarkCase benchmarkCase)
93 | {
94 | var name = new StringBuilder(benchmarkCase.Descriptor.WorkloadMethod.Name);
95 |
96 | if (benchmarkCase.HasParameters)
97 | name.Append(GetBenchmarkParameters(benchmarkCase.Descriptor.WorkloadMethod, benchmarkCase.Parameters));
98 |
99 | return name.ToString();
100 | }
101 |
102 | private static string GetBenchmarkParameters(MethodInfo method, ParameterInstances benchmarkParameters)
103 | {
104 | var methodArguments = method.GetParameters();
105 | var benchmarkParams = benchmarkParameters.Items.Where(parameter => !parameter.IsArgument).ToArray();
106 | var parametersBuilder = new StringBuilder(methodArguments.Length * 20).Append('(');
107 |
108 | for (int i = 0; i < methodArguments.Length; i++)
109 | {
110 | if (i > 0)
111 | parametersBuilder.Append(", ");
112 |
113 | parametersBuilder.Append(methodArguments[i].Name).Append(':').Append(' ');
114 | parametersBuilder.Append(GetArgument(benchmarkParameters.GetArgument(methodArguments[i].Name).Value, methodArguments[i].ParameterType));
115 | }
116 |
117 | for (int i = 0; i < benchmarkParams.Length; i++)
118 | {
119 | var parameter = benchmarkParams[i];
120 |
121 | if (methodArguments.Length > 0 || i > 0)
122 | parametersBuilder.Append(", ");
123 |
124 | parametersBuilder.Append(parameter.Name).Append(':').Append(' ');
125 | parametersBuilder.Append(GetArgument(parameter.Value, parameter.Value?.GetType()));
126 | }
127 |
128 | return parametersBuilder.Append(')').ToString();
129 | }
130 |
131 | private static string GetArgument(object argumentValue, Type argumentType)
132 | {
133 | switch (argumentValue) {
134 | case null:
135 | return "null";
136 | case IParam iparam:
137 | return GetArgument(iparam.Value, argumentType);
138 | case object[] array when array.Length == 1:
139 | return GetArgument(array[0], argumentType);
140 | case string text:
141 | return $"\"{EscapeWhitespaces(text)}\"";
142 | case char character:
143 | return $"'{character}'";
144 | case DateTime time:
145 | return time.ToString("yyyy'-'MM'-'dd'T'HH':'mm':'ss.fffffffK");
146 | case Type type:
147 | return $"typeof({GetTypeArgumentName(type)})";
148 | }
149 |
150 | if (argumentType != null && argumentType.IsArray)
151 | return GetArray((IEnumerable)argumentValue);
152 |
153 | return argumentValue.ToString();
154 | }
155 |
156 | // it's not generic so I can't simply use .Skip and all other LINQ goodness
157 | private static string GetArray(IEnumerable collection)
158 | {
159 | var buffer = new StringBuilder().Append('[');
160 |
161 | int index = 0;
162 | foreach (var item in collection)
163 | {
164 | if (index > 0)
165 | buffer.Append(", ");
166 |
167 | if (index > 4)
168 | {
169 | buffer.Append("..."); // [0, 1, 2, 3, 4, ...]
170 | break;
171 | }
172 |
173 | buffer.Append(GetArgument(item, item?.GetType()));
174 |
175 | ++index;
176 | }
177 |
178 | buffer.Append(']');
179 |
180 | return buffer.ToString();
181 | }
182 |
183 | private static string EscapeWhitespaces(string text)
184 | => text.Replace("\t", "\\t")
185 | .Replace("\r\n", "\\r\\n");
186 |
187 | private static string GetTypeArgumentName(Type type)
188 | {
189 | if (Aliases.TryGetValue(type, out string alias))
190 | return alias;
191 |
192 | if (Nullable.GetUnderlyingType(type) != null)
193 | return $"{GetTypeArgumentName(Nullable.GetUnderlyingType(type))}?";
194 |
195 | if (!string.IsNullOrEmpty(type.Namespace))
196 | return $"{type.Namespace}.{GetTypeName(type)}";
197 |
198 | return GetTypeName(type);
199 | }
200 | }
201 | }
202 |
--------------------------------------------------------------------------------
/VxSort/BytePermutationTables.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Diagnostics;
3 | using System.Runtime.CompilerServices;
4 | using System.Runtime.Intrinsics;
5 | using System.Runtime.Intrinsics.X86;
6 | using VxSortResearch.Utils;
7 |
8 | namespace VxSort
9 | {
10 | public static unsafe class BytePermutationTables
11 | {
12 | internal static ReadOnlySpan BytePermTable => new byte[] {
13 | 64, 1, 2, 3, 4, 5, 6, 7, // 0b00000000 (0)|Left-PC: 8
14 | 57, 2, 3, 4, 5, 6, 7, 0, // 0b00000001 (1)|Left-PC: 7
15 | 56, 2, 3, 4, 5, 6, 7, 1, // 0b00000010 (2)|Left-PC: 7
16 | 50, 3, 4, 5, 6, 7, 0, 1, // 0b00000011 (3)|Left-PC: 6
17 | 56, 1, 3, 4, 5, 6, 7, 2, // 0b00000100 (4)|Left-PC: 7
18 | 49, 3, 4, 5, 6, 7, 0, 2, // 0b00000101 (5)|Left-PC: 6
19 | 48, 3, 4, 5, 6, 7, 1, 2, // 0b00000110 (6)|Left-PC: 6
20 | 43, 4, 5, 6, 7, 0, 1, 2, // 0b00000111 (7)|Left-PC: 5
21 | 56, 1, 2, 4, 5, 6, 7, 3, // 0b00001000 (8)|Left-PC: 7
22 | 49, 2, 4, 5, 6, 7, 0, 3, // 0b00001001 (9)|Left-PC: 6
23 | 48, 2, 4, 5, 6, 7, 1, 3, // 0b00001010 (10)|Left-PC: 6
24 | 42, 4, 5, 6, 7, 0, 1, 3, // 0b00001011 (11)|Left-PC: 5
25 | 48, 1, 4, 5, 6, 7, 2, 3, // 0b00001100 (12)|Left-PC: 6
26 | 41, 4, 5, 6, 7, 0, 2, 3, // 0b00001101 (13)|Left-PC: 5
27 | 40, 4, 5, 6, 7, 1, 2, 3, // 0b00001110 (14)|Left-PC: 5
28 | 36, 5, 6, 7, 0, 1, 2, 3, // 0b00001111 (15)|Left-PC: 4
29 | 56, 1, 2, 3, 5, 6, 7, 4, // 0b00010000 (16)|Left-PC: 7
30 | 49, 2, 3, 5, 6, 7, 0, 4, // 0b00010001 (17)|Left-PC: 6
31 | 48, 2, 3, 5, 6, 7, 1, 4, // 0b00010010 (18)|Left-PC: 6
32 | 42, 3, 5, 6, 7, 0, 1, 4, // 0b00010011 (19)|Left-PC: 5
33 | 48, 1, 3, 5, 6, 7, 2, 4, // 0b00010100 (20)|Left-PC: 6
34 | 41, 3, 5, 6, 7, 0, 2, 4, // 0b00010101 (21)|Left-PC: 5
35 | 40, 3, 5, 6, 7, 1, 2, 4, // 0b00010110 (22)|Left-PC: 5
36 | 35, 5, 6, 7, 0, 1, 2, 4, // 0b00010111 (23)|Left-PC: 4
37 | 48, 1, 2, 5, 6, 7, 3, 4, // 0b00011000 (24)|Left-PC: 6
38 | 41, 2, 5, 6, 7, 0, 3, 4, // 0b00011001 (25)|Left-PC: 5
39 | 40, 2, 5, 6, 7, 1, 3, 4, // 0b00011010 (26)|Left-PC: 5
40 | 34, 5, 6, 7, 0, 1, 3, 4, // 0b00011011 (27)|Left-PC: 4
41 | 40, 1, 5, 6, 7, 2, 3, 4, // 0b00011100 (28)|Left-PC: 5
42 | 33, 5, 6, 7, 0, 2, 3, 4, // 0b00011101 (29)|Left-PC: 4
43 | 32, 5, 6, 7, 1, 2, 3, 4, // 0b00011110 (30)|Left-PC: 4
44 | 29, 6, 7, 0, 1, 2, 3, 4, // 0b00011111 (31)|Left-PC: 3
45 | 56, 1, 2, 3, 4, 6, 7, 5, // 0b00100000 (32)|Left-PC: 7
46 | 49, 2, 3, 4, 6, 7, 0, 5, // 0b00100001 (33)|Left-PC: 6
47 | 48, 2, 3, 4, 6, 7, 1, 5, // 0b00100010 (34)|Left-PC: 6
48 | 42, 3, 4, 6, 7, 0, 1, 5, // 0b00100011 (35)|Left-PC: 5
49 | 48, 1, 3, 4, 6, 7, 2, 5, // 0b00100100 (36)|Left-PC: 6
50 | 41, 3, 4, 6, 7, 0, 2, 5, // 0b00100101 (37)|Left-PC: 5
51 | 40, 3, 4, 6, 7, 1, 2, 5, // 0b00100110 (38)|Left-PC: 5
52 | 35, 4, 6, 7, 0, 1, 2, 5, // 0b00100111 (39)|Left-PC: 4
53 | 48, 1, 2, 4, 6, 7, 3, 5, // 0b00101000 (40)|Left-PC: 6
54 | 41, 2, 4, 6, 7, 0, 3, 5, // 0b00101001 (41)|Left-PC: 5
55 | 40, 2, 4, 6, 7, 1, 3, 5, // 0b00101010 (42)|Left-PC: 5
56 | 34, 4, 6, 7, 0, 1, 3, 5, // 0b00101011 (43)|Left-PC: 4
57 | 40, 1, 4, 6, 7, 2, 3, 5, // 0b00101100 (44)|Left-PC: 5
58 | 33, 4, 6, 7, 0, 2, 3, 5, // 0b00101101 (45)|Left-PC: 4
59 | 32, 4, 6, 7, 1, 2, 3, 5, // 0b00101110 (46)|Left-PC: 4
60 | 28, 6, 7, 0, 1, 2, 3, 5, // 0b00101111 (47)|Left-PC: 3
61 | 48, 1, 2, 3, 6, 7, 4, 5, // 0b00110000 (48)|Left-PC: 6
62 | 41, 2, 3, 6, 7, 0, 4, 5, // 0b00110001 (49)|Left-PC: 5
63 | 40, 2, 3, 6, 7, 1, 4, 5, // 0b00110010 (50)|Left-PC: 5
64 | 34, 3, 6, 7, 0, 1, 4, 5, // 0b00110011 (51)|Left-PC: 4
65 | 40, 1, 3, 6, 7, 2, 4, 5, // 0b00110100 (52)|Left-PC: 5
66 | 33, 3, 6, 7, 0, 2, 4, 5, // 0b00110101 (53)|Left-PC: 4
67 | 32, 3, 6, 7, 1, 2, 4, 5, // 0b00110110 (54)|Left-PC: 4
68 | 27, 6, 7, 0, 1, 2, 4, 5, // 0b00110111 (55)|Left-PC: 3
69 | 40, 1, 2, 6, 7, 3, 4, 5, // 0b00111000 (56)|Left-PC: 5
70 | 33, 2, 6, 7, 0, 3, 4, 5, // 0b00111001 (57)|Left-PC: 4
71 | 32, 2, 6, 7, 1, 3, 4, 5, // 0b00111010 (58)|Left-PC: 4
72 | 26, 6, 7, 0, 1, 3, 4, 5, // 0b00111011 (59)|Left-PC: 3
73 | 32, 1, 6, 7, 2, 3, 4, 5, // 0b00111100 (60)|Left-PC: 4
74 | 25, 6, 7, 0, 2, 3, 4, 5, // 0b00111101 (61)|Left-PC: 3
75 | 24, 6, 7, 1, 2, 3, 4, 5, // 0b00111110 (62)|Left-PC: 3
76 | 22, 7, 0, 1, 2, 3, 4, 5, // 0b00111111 (63)|Left-PC: 2
77 | 56, 1, 2, 3, 4, 5, 7, 6, // 0b01000000 (64)|Left-PC: 7
78 | 49, 2, 3, 4, 5, 7, 0, 6, // 0b01000001 (65)|Left-PC: 6
79 | 48, 2, 3, 4, 5, 7, 1, 6, // 0b01000010 (66)|Left-PC: 6
80 | 42, 3, 4, 5, 7, 0, 1, 6, // 0b01000011 (67)|Left-PC: 5
81 | 48, 1, 3, 4, 5, 7, 2, 6, // 0b01000100 (68)|Left-PC: 6
82 | 41, 3, 4, 5, 7, 0, 2, 6, // 0b01000101 (69)|Left-PC: 5
83 | 40, 3, 4, 5, 7, 1, 2, 6, // 0b01000110 (70)|Left-PC: 5
84 | 35, 4, 5, 7, 0, 1, 2, 6, // 0b01000111 (71)|Left-PC: 4
85 | 48, 1, 2, 4, 5, 7, 3, 6, // 0b01001000 (72)|Left-PC: 6
86 | 41, 2, 4, 5, 7, 0, 3, 6, // 0b01001001 (73)|Left-PC: 5
87 | 40, 2, 4, 5, 7, 1, 3, 6, // 0b01001010 (74)|Left-PC: 5
88 | 34, 4, 5, 7, 0, 1, 3, 6, // 0b01001011 (75)|Left-PC: 4
89 | 40, 1, 4, 5, 7, 2, 3, 6, // 0b01001100 (76)|Left-PC: 5
90 | 33, 4, 5, 7, 0, 2, 3, 6, // 0b01001101 (77)|Left-PC: 4
91 | 32, 4, 5, 7, 1, 2, 3, 6, // 0b01001110 (78)|Left-PC: 4
92 | 28, 5, 7, 0, 1, 2, 3, 6, // 0b01001111 (79)|Left-PC: 3
93 | 48, 1, 2, 3, 5, 7, 4, 6, // 0b01010000 (80)|Left-PC: 6
94 | 41, 2, 3, 5, 7, 0, 4, 6, // 0b01010001 (81)|Left-PC: 5
95 | 40, 2, 3, 5, 7, 1, 4, 6, // 0b01010010 (82)|Left-PC: 5
96 | 34, 3, 5, 7, 0, 1, 4, 6, // 0b01010011 (83)|Left-PC: 4
97 | 40, 1, 3, 5, 7, 2, 4, 6, // 0b01010100 (84)|Left-PC: 5
98 | 33, 3, 5, 7, 0, 2, 4, 6, // 0b01010101 (85)|Left-PC: 4
99 | 32, 3, 5, 7, 1, 2, 4, 6, // 0b01010110 (86)|Left-PC: 4
100 | 27, 5, 7, 0, 1, 2, 4, 6, // 0b01010111 (87)|Left-PC: 3
101 | 40, 1, 2, 5, 7, 3, 4, 6, // 0b01011000 (88)|Left-PC: 5
102 | 33, 2, 5, 7, 0, 3, 4, 6, // 0b01011001 (89)|Left-PC: 4
103 | 32, 2, 5, 7, 1, 3, 4, 6, // 0b01011010 (90)|Left-PC: 4
104 | 26, 5, 7, 0, 1, 3, 4, 6, // 0b01011011 (91)|Left-PC: 3
105 | 32, 1, 5, 7, 2, 3, 4, 6, // 0b01011100 (92)|Left-PC: 4
106 | 25, 5, 7, 0, 2, 3, 4, 6, // 0b01011101 (93)|Left-PC: 3
107 | 24, 5, 7, 1, 2, 3, 4, 6, // 0b01011110 (94)|Left-PC: 3
108 | 21, 7, 0, 1, 2, 3, 4, 6, // 0b01011111 (95)|Left-PC: 2
109 | 48, 1, 2, 3, 4, 7, 5, 6, // 0b01100000 (96)|Left-PC: 6
110 | 41, 2, 3, 4, 7, 0, 5, 6, // 0b01100001 (97)|Left-PC: 5
111 | 40, 2, 3, 4, 7, 1, 5, 6, // 0b01100010 (98)|Left-PC: 5
112 | 34, 3, 4, 7, 0, 1, 5, 6, // 0b01100011 (99)|Left-PC: 4
113 | 40, 1, 3, 4, 7, 2, 5, 6, // 0b01100100 (100)|Left-PC: 5
114 | 33, 3, 4, 7, 0, 2, 5, 6, // 0b01100101 (101)|Left-PC: 4
115 | 32, 3, 4, 7, 1, 2, 5, 6, // 0b01100110 (102)|Left-PC: 4
116 | 27, 4, 7, 0, 1, 2, 5, 6, // 0b01100111 (103)|Left-PC: 3
117 | 40, 1, 2, 4, 7, 3, 5, 6, // 0b01101000 (104)|Left-PC: 5
118 | 33, 2, 4, 7, 0, 3, 5, 6, // 0b01101001 (105)|Left-PC: 4
119 | 32, 2, 4, 7, 1, 3, 5, 6, // 0b01101010 (106)|Left-PC: 4
120 | 26, 4, 7, 0, 1, 3, 5, 6, // 0b01101011 (107)|Left-PC: 3
121 | 32, 1, 4, 7, 2, 3, 5, 6, // 0b01101100 (108)|Left-PC: 4
122 | 25, 4, 7, 0, 2, 3, 5, 6, // 0b01101101 (109)|Left-PC: 3
123 | 24, 4, 7, 1, 2, 3, 5, 6, // 0b01101110 (110)|Left-PC: 3
124 | 20, 7, 0, 1, 2, 3, 5, 6, // 0b01101111 (111)|Left-PC: 2
125 | 40, 1, 2, 3, 7, 4, 5, 6, // 0b01110000 (112)|Left-PC: 5
126 | 33, 2, 3, 7, 0, 4, 5, 6, // 0b01110001 (113)|Left-PC: 4
127 | 32, 2, 3, 7, 1, 4, 5, 6, // 0b01110010 (114)|Left-PC: 4
128 | 26, 3, 7, 0, 1, 4, 5, 6, // 0b01110011 (115)|Left-PC: 3
129 | 32, 1, 3, 7, 2, 4, 5, 6, // 0b01110100 (116)|Left-PC: 4
130 | 25, 3, 7, 0, 2, 4, 5, 6, // 0b01110101 (117)|Left-PC: 3
131 | 24, 3, 7, 1, 2, 4, 5, 6, // 0b01110110 (118)|Left-PC: 3
132 | 19, 7, 0, 1, 2, 4, 5, 6, // 0b01110111 (119)|Left-PC: 2
133 | 32, 1, 2, 7, 3, 4, 5, 6, // 0b01111000 (120)|Left-PC: 4
134 | 25, 2, 7, 0, 3, 4, 5, 6, // 0b01111001 (121)|Left-PC: 3
135 | 24, 2, 7, 1, 3, 4, 5, 6, // 0b01111010 (122)|Left-PC: 3
136 | 18, 7, 0, 1, 3, 4, 5, 6, // 0b01111011 (123)|Left-PC: 2
137 | 24, 1, 7, 2, 3, 4, 5, 6, // 0b01111100 (124)|Left-PC: 3
138 | 17, 7, 0, 2, 3, 4, 5, 6, // 0b01111101 (125)|Left-PC: 2
139 | 16, 7, 1, 2, 3, 4, 5, 6, // 0b01111110 (126)|Left-PC: 2
140 | 15, 0, 1, 2, 3, 4, 5, 6, // 0b01111111 (127)|Left-PC: 1
141 | 56, 1, 2, 3, 4, 5, 6, 7, // 0b10000000 (128)|Left-PC: 7
142 | 49, 2, 3, 4, 5, 6, 0, 7, // 0b10000001 (129)|Left-PC: 6
143 | 48, 2, 3, 4, 5, 6, 1, 7, // 0b10000010 (130)|Left-PC: 6
144 | 42, 3, 4, 5, 6, 0, 1, 7, // 0b10000011 (131)|Left-PC: 5
145 | 48, 1, 3, 4, 5, 6, 2, 7, // 0b10000100 (132)|Left-PC: 6
146 | 41, 3, 4, 5, 6, 0, 2, 7, // 0b10000101 (133)|Left-PC: 5
147 | 40, 3, 4, 5, 6, 1, 2, 7, // 0b10000110 (134)|Left-PC: 5
148 | 35, 4, 5, 6, 0, 1, 2, 7, // 0b10000111 (135)|Left-PC: 4
149 | 48, 1, 2, 4, 5, 6, 3, 7, // 0b10001000 (136)|Left-PC: 6
150 | 41, 2, 4, 5, 6, 0, 3, 7, // 0b10001001 (137)|Left-PC: 5
151 | 40, 2, 4, 5, 6, 1, 3, 7, // 0b10001010 (138)|Left-PC: 5
152 | 34, 4, 5, 6, 0, 1, 3, 7, // 0b10001011 (139)|Left-PC: 4
153 | 40, 1, 4, 5, 6, 2, 3, 7, // 0b10001100 (140)|Left-PC: 5
154 | 33, 4, 5, 6, 0, 2, 3, 7, // 0b10001101 (141)|Left-PC: 4
155 | 32, 4, 5, 6, 1, 2, 3, 7, // 0b10001110 (142)|Left-PC: 4
156 | 28, 5, 6, 0, 1, 2, 3, 7, // 0b10001111 (143)|Left-PC: 3
157 | 48, 1, 2, 3, 5, 6, 4, 7, // 0b10010000 (144)|Left-PC: 6
158 | 41, 2, 3, 5, 6, 0, 4, 7, // 0b10010001 (145)|Left-PC: 5
159 | 40, 2, 3, 5, 6, 1, 4, 7, // 0b10010010 (146)|Left-PC: 5
160 | 34, 3, 5, 6, 0, 1, 4, 7, // 0b10010011 (147)|Left-PC: 4
161 | 40, 1, 3, 5, 6, 2, 4, 7, // 0b10010100 (148)|Left-PC: 5
162 | 33, 3, 5, 6, 0, 2, 4, 7, // 0b10010101 (149)|Left-PC: 4
163 | 32, 3, 5, 6, 1, 2, 4, 7, // 0b10010110 (150)|Left-PC: 4
164 | 27, 5, 6, 0, 1, 2, 4, 7, // 0b10010111 (151)|Left-PC: 3
165 | 40, 1, 2, 5, 6, 3, 4, 7, // 0b10011000 (152)|Left-PC: 5
166 | 33, 2, 5, 6, 0, 3, 4, 7, // 0b10011001 (153)|Left-PC: 4
167 | 32, 2, 5, 6, 1, 3, 4, 7, // 0b10011010 (154)|Left-PC: 4
168 | 26, 5, 6, 0, 1, 3, 4, 7, // 0b10011011 (155)|Left-PC: 3
169 | 32, 1, 5, 6, 2, 3, 4, 7, // 0b10011100 (156)|Left-PC: 4
170 | 25, 5, 6, 0, 2, 3, 4, 7, // 0b10011101 (157)|Left-PC: 3
171 | 24, 5, 6, 1, 2, 3, 4, 7, // 0b10011110 (158)|Left-PC: 3
172 | 21, 6, 0, 1, 2, 3, 4, 7, // 0b10011111 (159)|Left-PC: 2
173 | 48, 1, 2, 3, 4, 6, 5, 7, // 0b10100000 (160)|Left-PC: 6
174 | 41, 2, 3, 4, 6, 0, 5, 7, // 0b10100001 (161)|Left-PC: 5
175 | 40, 2, 3, 4, 6, 1, 5, 7, // 0b10100010 (162)|Left-PC: 5
176 | 34, 3, 4, 6, 0, 1, 5, 7, // 0b10100011 (163)|Left-PC: 4
177 | 40, 1, 3, 4, 6, 2, 5, 7, // 0b10100100 (164)|Left-PC: 5
178 | 33, 3, 4, 6, 0, 2, 5, 7, // 0b10100101 (165)|Left-PC: 4
179 | 32, 3, 4, 6, 1, 2, 5, 7, // 0b10100110 (166)|Left-PC: 4
180 | 27, 4, 6, 0, 1, 2, 5, 7, // 0b10100111 (167)|Left-PC: 3
181 | 40, 1, 2, 4, 6, 3, 5, 7, // 0b10101000 (168)|Left-PC: 5
182 | 33, 2, 4, 6, 0, 3, 5, 7, // 0b10101001 (169)|Left-PC: 4
183 | 32, 2, 4, 6, 1, 3, 5, 7, // 0b10101010 (170)|Left-PC: 4
184 | 26, 4, 6, 0, 1, 3, 5, 7, // 0b10101011 (171)|Left-PC: 3
185 | 32, 1, 4, 6, 2, 3, 5, 7, // 0b10101100 (172)|Left-PC: 4
186 | 25, 4, 6, 0, 2, 3, 5, 7, // 0b10101101 (173)|Left-PC: 3
187 | 24, 4, 6, 1, 2, 3, 5, 7, // 0b10101110 (174)|Left-PC: 3
188 | 20, 6, 0, 1, 2, 3, 5, 7, // 0b10101111 (175)|Left-PC: 2
189 | 40, 1, 2, 3, 6, 4, 5, 7, // 0b10110000 (176)|Left-PC: 5
190 | 33, 2, 3, 6, 0, 4, 5, 7, // 0b10110001 (177)|Left-PC: 4
191 | 32, 2, 3, 6, 1, 4, 5, 7, // 0b10110010 (178)|Left-PC: 4
192 | 26, 3, 6, 0, 1, 4, 5, 7, // 0b10110011 (179)|Left-PC: 3
193 | 32, 1, 3, 6, 2, 4, 5, 7, // 0b10110100 (180)|Left-PC: 4
194 | 25, 3, 6, 0, 2, 4, 5, 7, // 0b10110101 (181)|Left-PC: 3
195 | 24, 3, 6, 1, 2, 4, 5, 7, // 0b10110110 (182)|Left-PC: 3
196 | 19, 6, 0, 1, 2, 4, 5, 7, // 0b10110111 (183)|Left-PC: 2
197 | 32, 1, 2, 6, 3, 4, 5, 7, // 0b10111000 (184)|Left-PC: 4
198 | 25, 2, 6, 0, 3, 4, 5, 7, // 0b10111001 (185)|Left-PC: 3
199 | 24, 2, 6, 1, 3, 4, 5, 7, // 0b10111010 (186)|Left-PC: 3
200 | 18, 6, 0, 1, 3, 4, 5, 7, // 0b10111011 (187)|Left-PC: 2
201 | 24, 1, 6, 2, 3, 4, 5, 7, // 0b10111100 (188)|Left-PC: 3
202 | 17, 6, 0, 2, 3, 4, 5, 7, // 0b10111101 (189)|Left-PC: 2
203 | 16, 6, 1, 2, 3, 4, 5, 7, // 0b10111110 (190)|Left-PC: 2
204 | 14, 0, 1, 2, 3, 4, 5, 7, // 0b10111111 (191)|Left-PC: 1
205 | 48, 1, 2, 3, 4, 5, 6, 7, // 0b11000000 (192)|Left-PC: 6
206 | 41, 2, 3, 4, 5, 0, 6, 7, // 0b11000001 (193)|Left-PC: 5
207 | 40, 2, 3, 4, 5, 1, 6, 7, // 0b11000010 (194)|Left-PC: 5
208 | 34, 3, 4, 5, 0, 1, 6, 7, // 0b11000011 (195)|Left-PC: 4
209 | 40, 1, 3, 4, 5, 2, 6, 7, // 0b11000100 (196)|Left-PC: 5
210 | 33, 3, 4, 5, 0, 2, 6, 7, // 0b11000101 (197)|Left-PC: 4
211 | 32, 3, 4, 5, 1, 2, 6, 7, // 0b11000110 (198)|Left-PC: 4
212 | 27, 4, 5, 0, 1, 2, 6, 7, // 0b11000111 (199)|Left-PC: 3
213 | 40, 1, 2, 4, 5, 3, 6, 7, // 0b11001000 (200)|Left-PC: 5
214 | 33, 2, 4, 5, 0, 3, 6, 7, // 0b11001001 (201)|Left-PC: 4
215 | 32, 2, 4, 5, 1, 3, 6, 7, // 0b11001010 (202)|Left-PC: 4
216 | 26, 4, 5, 0, 1, 3, 6, 7, // 0b11001011 (203)|Left-PC: 3
217 | 32, 1, 4, 5, 2, 3, 6, 7, // 0b11001100 (204)|Left-PC: 4
218 | 25, 4, 5, 0, 2, 3, 6, 7, // 0b11001101 (205)|Left-PC: 3
219 | 24, 4, 5, 1, 2, 3, 6, 7, // 0b11001110 (206)|Left-PC: 3
220 | 20, 5, 0, 1, 2, 3, 6, 7, // 0b11001111 (207)|Left-PC: 2
221 | 40, 1, 2, 3, 5, 4, 6, 7, // 0b11010000 (208)|Left-PC: 5
222 | 33, 2, 3, 5, 0, 4, 6, 7, // 0b11010001 (209)|Left-PC: 4
223 | 32, 2, 3, 5, 1, 4, 6, 7, // 0b11010010 (210)|Left-PC: 4
224 | 26, 3, 5, 0, 1, 4, 6, 7, // 0b11010011 (211)|Left-PC: 3
225 | 32, 1, 3, 5, 2, 4, 6, 7, // 0b11010100 (212)|Left-PC: 4
226 | 25, 3, 5, 0, 2, 4, 6, 7, // 0b11010101 (213)|Left-PC: 3
227 | 24, 3, 5, 1, 2, 4, 6, 7, // 0b11010110 (214)|Left-PC: 3
228 | 19, 5, 0, 1, 2, 4, 6, 7, // 0b11010111 (215)|Left-PC: 2
229 | 32, 1, 2, 5, 3, 4, 6, 7, // 0b11011000 (216)|Left-PC: 4
230 | 25, 2, 5, 0, 3, 4, 6, 7, // 0b11011001 (217)|Left-PC: 3
231 | 24, 2, 5, 1, 3, 4, 6, 7, // 0b11011010 (218)|Left-PC: 3
232 | 18, 5, 0, 1, 3, 4, 6, 7, // 0b11011011 (219)|Left-PC: 2
233 | 24, 1, 5, 2, 3, 4, 6, 7, // 0b11011100 (220)|Left-PC: 3
234 | 17, 5, 0, 2, 3, 4, 6, 7, // 0b11011101 (221)|Left-PC: 2
235 | 16, 5, 1, 2, 3, 4, 6, 7, // 0b11011110 (222)|Left-PC: 2
236 | 13, 0, 1, 2, 3, 4, 6, 7, // 0b11011111 (223)|Left-PC: 1
237 | 40, 1, 2, 3, 4, 5, 6, 7, // 0b11100000 (224)|Left-PC: 5
238 | 33, 2, 3, 4, 0, 5, 6, 7, // 0b11100001 (225)|Left-PC: 4
239 | 32, 2, 3, 4, 1, 5, 6, 7, // 0b11100010 (226)|Left-PC: 4
240 | 26, 3, 4, 0, 1, 5, 6, 7, // 0b11100011 (227)|Left-PC: 3
241 | 32, 1, 3, 4, 2, 5, 6, 7, // 0b11100100 (228)|Left-PC: 4
242 | 25, 3, 4, 0, 2, 5, 6, 7, // 0b11100101 (229)|Left-PC: 3
243 | 24, 3, 4, 1, 2, 5, 6, 7, // 0b11100110 (230)|Left-PC: 3
244 | 19, 4, 0, 1, 2, 5, 6, 7, // 0b11100111 (231)|Left-PC: 2
245 | 32, 1, 2, 4, 3, 5, 6, 7, // 0b11101000 (232)|Left-PC: 4
246 | 25, 2, 4, 0, 3, 5, 6, 7, // 0b11101001 (233)|Left-PC: 3
247 | 24, 2, 4, 1, 3, 5, 6, 7, // 0b11101010 (234)|Left-PC: 3
248 | 18, 4, 0, 1, 3, 5, 6, 7, // 0b11101011 (235)|Left-PC: 2
249 | 24, 1, 4, 2, 3, 5, 6, 7, // 0b11101100 (236)|Left-PC: 3
250 | 17, 4, 0, 2, 3, 5, 6, 7, // 0b11101101 (237)|Left-PC: 2
251 | 16, 4, 1, 2, 3, 5, 6, 7, // 0b11101110 (238)|Left-PC: 2
252 | 12, 0, 1, 2, 3, 5, 6, 7, // 0b11101111 (239)|Left-PC: 1
253 | 32, 1, 2, 3, 4, 5, 6, 7, // 0b11110000 (240)|Left-PC: 4
254 | 25, 2, 3, 0, 4, 5, 6, 7, // 0b11110001 (241)|Left-PC: 3
255 | 24, 2, 3, 1, 4, 5, 6, 7, // 0b11110010 (242)|Left-PC: 3
256 | 18, 3, 0, 1, 4, 5, 6, 7, // 0b11110011 (243)|Left-PC: 2
257 | 24, 1, 3, 2, 4, 5, 6, 7, // 0b11110100 (244)|Left-PC: 3
258 | 17, 3, 0, 2, 4, 5, 6, 7, // 0b11110101 (245)|Left-PC: 2
259 | 16, 3, 1, 2, 4, 5, 6, 7, // 0b11110110 (246)|Left-PC: 2
260 | 11, 0, 1, 2, 4, 5, 6, 7, // 0b11110111 (247)|Left-PC: 1
261 | 24, 1, 2, 3, 4, 5, 6, 7, // 0b11111000 (248)|Left-PC: 3
262 | 17, 2, 0, 3, 4, 5, 6, 7, // 0b11111001 (249)|Left-PC: 2
263 | 16, 2, 1, 3, 4, 5, 6, 7, // 0b11111010 (250)|Left-PC: 2
264 | 10, 0, 1, 3, 4, 5, 6, 7, // 0b11111011 (251)|Left-PC: 1
265 | 16, 1, 2, 3, 4, 5, 6, 7, // 0b11111100 (252)|Left-PC: 2
266 | 9, 0, 2, 3, 4, 5, 6, 7, // 0b11111101 (253)|Left-PC: 1
267 | 8, 1, 2, 3, 4, 5, 6, 7, // 0b11111110 (254)|Left-PC: 1
268 | 0, 1, 2, 3, 4, 5, 6, 7, // 0b11111111 (255)|Left-PC: 0
269 | };
270 |
271 | [MethodImpl(MethodImplOptions.AggressiveInlining)]
272 | internal static Vector256 GetBytePermutationAligned(byte * pBase, uint index)
273 | {
274 | Debug.Assert(index <= 255);
275 | Debug.Assert(pBase != null);
276 | Debug.Assert(((ulong) (pBase + index * 8)) % 8 == 0);
277 | return Avx2.ConvertToVector256Int32(pBase + index * 8);
278 | }
279 |
280 | [MethodImpl(MethodImplOptions.AggressiveInlining)]
281 | internal static unsafe Vector256 GetBytePermutationAligned(byte * pBase, ulong index)
282 | {
283 | Debug.Assert(index <= 255);
284 | Debug.Assert(pBase != null);
285 | Debug.Assert(((ulong) (pBase + index * 8)) % 8 == 0);
286 | return Avx2.ConvertToVector256Int32(pBase + index * 8);
287 | }
288 |
289 | internal static readonly unsafe byte* BytePermTableAlignedPtr;
290 |
291 | const uint PAGE_SIZE = 4096U;
292 |
293 | static BytePermutationTables()
294 | {
295 | BytePermTableAlignedPtr = (byte*) BytePermTable.AlignSpan(PAGE_SIZE);
296 | }
297 | }
298 | }
--------------------------------------------------------------------------------
/VxSort/BitonicSort.Generated.cs:
--------------------------------------------------------------------------------
1 | //
2 | // This code was generated by a tool on 2020-01-20-15:08:20
3 | //
4 | // Changes to this file may cause incorrect behavior and will be lost if
5 | // the code is regenerated.
6 | //
7 |
8 | using System;
9 | using System.Diagnostics;
10 | using System.Runtime.CompilerServices;
11 | using System.Runtime.Intrinsics;
12 | using static System.Runtime.Intrinsics.X86.Avx;
13 | using static System.Runtime.Intrinsics.X86.Avx2;
14 |
15 | namespace VxSort
16 | {
17 | using V = Vector256;
18 | static unsafe partial class BitonicSort
19 | {
        // Generated: sorts the 16 int32 values held in two vectors.
        // Sorts each vector, cross compare-exchanges d01 against d02 (the
        // Shuffle + Permute4x64 with X_R/P_X presumably implements the full
        // 8-lane reversal -- both constants live in the hand-written part,
        // confirm there), then runs the intra-vector merge on both.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort02V(ref V d01, ref V d02)
        {
            V tmp;

            // Sort each vector independently.
            BitonicSort01V(ref d01);
            BitonicSort01V(ref d02);

            // Compare-exchange d01 with the lane-reversed d02: d01 keeps the
            // smaller element of each pair, d02 the larger.
            tmp = Shuffle(d02, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d02 = Max(d01, tmp);
            d01 = Min(d01, tmp);

            // Each vector is now a bitonic sequence; merge each in place.
            BitonicSort01VMerge(ref d01);
            BitonicSort01VMerge(ref d02);
        }
        // Generated: merge step for 2 vectors already forming a bitonic
        // sequence -- lane-wise min/max splits small/large values between
        // d01/d02, then each vector is merged internally.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort02VMerge(ref V d01, ref V d02)
        {
            V tmp;

            // Lane-by-lane compare-exchange between the two vectors.
            tmp = d01;
            d01 = Min(d02, d01);
            d02 = Max(d02, tmp);

            BitonicSort01VMerge(ref d01);
            BitonicSort01VMerge(ref d02);
        }
48 |
        // Generated: sorts 3 vectors (24 int32 values). Sorts the first two as a
        // pair and the third alone, cross-exchanges the third against the second
        // (via the X_R/P_X lane reversal), then merges the 2-vector prefix and
        // the single-vector suffix.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort03V(ref V d01, ref V d02, ref V d03)
        {
            V tmp;

            BitonicSort02V(ref d01, ref d02);
            BitonicSort01V(ref d03);

            // Compare-exchange d02 with the lane-reversed d03.
            tmp = Shuffle(d03, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d03 = Max(d02, tmp);
            d02 = Min(d02, tmp);

            BitonicSort02VMerge(ref d01, ref d02);
            BitonicSort01VMerge(ref d03);
        }
        // Generated: merge step for 3 vectors -- compare-exchanges d01 against
        // d03 (stride of half the rounded-up vector count), then recursively
        // merges the 2-vector prefix and the single-vector tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort03VMerge(ref V d01, ref V d02, ref V d03)
        {
            V tmp;

            tmp = d01;
            d01 = Min(d03, d01);
            d03 = Max(d03, tmp);

            BitonicSort02VMerge(ref d01, ref d02);
            BitonicSort01VMerge(ref d03);
        }
77 |
        // Generated: sorts 4 vectors (32 int32 values). Sorts the two halves as
        // pairs, then cross-exchanges across the middle with reversed pairing
        // (d03 vs d02, d04 vs d01 -- the classic bitonic pattern), then merges
        // each half.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort04V(ref V d01, ref V d02, ref V d03, ref V d04)
        {
            V tmp;

            BitonicSort02V(ref d01, ref d02);
            BitonicSort02V(ref d03, ref d04);

            // d02 <-> reversed d03
            tmp = Shuffle(d03, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d03 = Max(d02, tmp);
            d02 = Min(d02, tmp);

            // d01 <-> reversed d04
            tmp = Shuffle(d04, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d04 = Max(d01, tmp);
            d01 = Min(d01, tmp);

            BitonicSort02VMerge(ref d01, ref d02);
            BitonicSort02VMerge(ref d03, ref d04);
        }
        // Generated: merge step for 4 vectors -- compare-exchanges vector i with
        // vector i+2 (d01/d03, d02/d04), then merges each 2-vector half.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort04VMerge(ref V d01, ref V d02, ref V d03, ref V d04)
        {
            V tmp;

            tmp = d01;
            d01 = Min(d03, d01);
            d03 = Max(d03, tmp);

            tmp = d02;
            d02 = Min(d04, d02);
            d04 = Max(d04, tmp);

            BitonicSort02VMerge(ref d01, ref d02);
            BitonicSort02VMerge(ref d03, ref d04);
        }
115 |
        // Generated: sorts 5 vectors (40 int32 values). Sorts a 4-vector prefix
        // and the lone tail vector, cross-exchanges the tail against d04 only
        // (the other prefix vectors have no cross partner at this width), then
        // merges prefix and tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort05V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05)
        {
            V tmp;

            BitonicSort04V(ref d01, ref d02, ref d03, ref d04);
            BitonicSort01V(ref d05);

            // d04 <-> reversed d05
            tmp = Shuffle(d05, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d05 = Max(d04, tmp);
            d04 = Min(d04, tmp);

            BitonicSort04VMerge(ref d01, ref d02, ref d03, ref d04);
            BitonicSort01VMerge(ref d05);
        }
        // Bitonic merge of 5 vectors: min/max exchange between d01 and d05
        // (distance 4; d02..d04 have no partner at that distance), then
        // recursively merge the 4-vector head and the single tail vector.
        // Expects the inputs to satisfy the bitonic-merge precondition.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort05VMerge(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05)
        {
            V tmp;

            tmp = d01;
            d01 = Min(d05, d01);
            d05 = Max(d05, tmp);

            BitonicSort04VMerge(ref d01, ref d02, ref d03, ref d04);
            BitonicSort01VMerge(ref d05);
        }
144 |
        // In-register bitonic sort of 6 vectors: sort the first 4 and last 2
        // independently, reverse each tail vector (Shuffle X_R + Permute4x64
        // P_X -- presumably a full 8-lane reversal; confirm against the
        // X_R / P_X definitions), cross-compare against the top of the head
        // (d05 <-> d04, d06 <-> d03), then bitonic-merge the 4-vector head
        // and the 2-vector tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort06V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06)
        {
            V tmp;

            BitonicSort04V(ref d01, ref d02, ref d03, ref d04);
            BitonicSort02V(ref d05, ref d06);

            tmp = Shuffle(d05, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d05 = Max(d04, tmp);
            d04 = Min(d04, tmp);

            tmp = Shuffle(d06, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d06 = Max(d03, tmp);
            d03 = Min(d03, tmp);

            BitonicSort04VMerge(ref d01, ref d02, ref d03, ref d04);
            BitonicSort02VMerge(ref d05, ref d06);
        }
        // Bitonic merge of 6 vectors: min/max exchange at distance 4
        // (d01 <-> d05, d02 <-> d06), then recursively merge the 4-vector
        // head and the 2-vector tail. Expects the inputs to satisfy the
        // bitonic-merge precondition.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort06VMerge(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06)
        {
            V tmp;

            tmp = d01;
            d01 = Min(d05, d01);
            d05 = Max(d05, tmp);

            tmp = d02;
            d02 = Min(d06, d02);
            d06 = Max(d06, tmp);

            BitonicSort04VMerge(ref d01, ref d02, ref d03, ref d04);
            BitonicSort02VMerge(ref d05, ref d06);
        }
182 |
        // In-register bitonic sort of 7 vectors: sort the first 4 and last 3
        // independently, reverse each tail vector (Shuffle X_R + Permute4x64
        // P_X -- presumably a full 8-lane reversal; confirm against the
        // X_R / P_X definitions), cross-compare against the top of the head
        // (d05 <-> d04, d06 <-> d03, d07 <-> d02), then bitonic-merge the
        // 4-vector head and the 3-vector tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort07V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07)
        {
            V tmp;

            BitonicSort04V(ref d01, ref d02, ref d03, ref d04);
            BitonicSort03V(ref d05, ref d06, ref d07);

            tmp = Shuffle(d05, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d05 = Max(d04, tmp);
            d04 = Min(d04, tmp);

            tmp = Shuffle(d06, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d06 = Max(d03, tmp);
            d03 = Min(d03, tmp);

            tmp = Shuffle(d07, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d07 = Max(d02, tmp);
            d02 = Min(d02, tmp);

            BitonicSort04VMerge(ref d01, ref d02, ref d03, ref d04);
            BitonicSort03VMerge(ref d05, ref d06, ref d07);
        }
        // Bitonic merge of 7 vectors: min/max exchange at distance 4
        // (d01 <-> d05, d02 <-> d06, d03 <-> d07; d04 has no partner), then
        // recursively merge the 4-vector head and the 3-vector tail. Expects
        // the inputs to satisfy the bitonic-merge precondition.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort07VMerge(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07)
        {
            V tmp;

            tmp = d01;
            d01 = Min(d05, d01);
            d05 = Max(d05, tmp);

            tmp = d02;
            d02 = Min(d06, d02);
            d06 = Max(d06, tmp);

            tmp = d03;
            d03 = Min(d07, d03);
            d07 = Max(d07, tmp);

            BitonicSort04VMerge(ref d01, ref d02, ref d03, ref d04);
            BitonicSort03VMerge(ref d05, ref d06, ref d07);
        }
229 |
        // In-register bitonic sort of 8 vectors: sort each 4-vector half
        // independently, reverse each vector of the second half (Shuffle X_R
        // + Permute4x64 P_X -- presumably a full 8-lane reversal; confirm
        // against the X_R / P_X definitions), cross-compare mirrored pairs
        // (d05 <-> d04, d06 <-> d03, d07 <-> d02, d08 <-> d01) so smaller
        // values land in the lower-numbered vector, then bitonic-merge each
        // half.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort08V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08)
        {
            V tmp;

            BitonicSort04V(ref d01, ref d02, ref d03, ref d04);
            BitonicSort04V(ref d05, ref d06, ref d07, ref d08);

            tmp = Shuffle(d05, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d05 = Max(d04, tmp);
            d04 = Min(d04, tmp);

            tmp = Shuffle(d06, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d06 = Max(d03, tmp);
            d03 = Min(d03, tmp);

            tmp = Shuffle(d07, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d07 = Max(d02, tmp);
            d02 = Min(d02, tmp);

            tmp = Shuffle(d08, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d08 = Max(d01, tmp);
            d01 = Min(d01, tmp);

            BitonicSort04VMerge(ref d01, ref d02, ref d03, ref d04);
            BitonicSort04VMerge(ref d05, ref d06, ref d07, ref d08);
        }
        // Bitonic merge of 8 vectors: min/max exchange at distance 4
        // (d01 <-> d05 .. d04 <-> d08), then recursively merge each 4-vector
        // half. Expects the inputs to satisfy the bitonic-merge precondition.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort08VMerge(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08)
        {
            V tmp;

            tmp = d01;
            d01 = Min(d05, d01);
            d05 = Max(d05, tmp);

            tmp = d02;
            d02 = Min(d06, d02);
            d06 = Max(d06, tmp);

            tmp = d03;
            d03 = Min(d07, d03);
            d07 = Max(d07, tmp);

            tmp = d04;
            d04 = Min(d08, d04);
            d08 = Max(d08, tmp);

            BitonicSort04VMerge(ref d01, ref d02, ref d03, ref d04);
            BitonicSort04VMerge(ref d05, ref d06, ref d07, ref d08);
        }
285 |
        // In-register bitonic sort of 9 vectors: sort the first 8 and the
        // last 1 independently, reverse d09 (Shuffle X_R + Permute4x64 P_X --
        // presumably a full 8-lane reversal; confirm against the X_R / P_X
        // definitions), cross-compare it with d08, then bitonic-merge the
        // 8-vector head and the single tail vector.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort09V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08, ref V d09)
        {
            V tmp;

            BitonicSort08V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort01V(ref d09);

            tmp = Shuffle(d09, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d09 = Max(d08, tmp);
            d08 = Min(d08, tmp);

            BitonicSort08VMerge(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort01VMerge(ref d09);
        }
302 |
        // In-register bitonic sort of 10 vectors: sort the first 8 and last 2
        // independently, reverse each tail vector (Shuffle X_R + Permute4x64
        // P_X -- presumably a full 8-lane reversal; confirm against the
        // X_R / P_X definitions), cross-compare against the top of the head
        // (d09 <-> d08, d10 <-> d07), then bitonic-merge the 8-vector head
        // and the 2-vector tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort10V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08, ref V d09, ref V d10)
        {
            V tmp;

            BitonicSort08V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort02V(ref d09, ref d10);

            tmp = Shuffle(d09, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d09 = Max(d08, tmp);
            d08 = Min(d08, tmp);

            tmp = Shuffle(d10, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d10 = Max(d07, tmp);
            d07 = Min(d07, tmp);

            BitonicSort08VMerge(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort02VMerge(ref d09, ref d10);
        }
324 |
        // In-register bitonic sort of 11 vectors: sort the first 8 and last 3
        // independently, reverse each tail vector (Shuffle X_R + Permute4x64
        // P_X -- presumably a full 8-lane reversal; confirm against the
        // X_R / P_X definitions), cross-compare against the top of the head
        // (d09 <-> d08, d10 <-> d07, d11 <-> d06), then bitonic-merge the
        // 8-vector head and the 3-vector tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort11V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08, ref V d09, ref V d10, ref V d11)
        {
            V tmp;

            BitonicSort08V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort03V(ref d09, ref d10, ref d11);

            tmp = Shuffle(d09, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d09 = Max(d08, tmp);
            d08 = Min(d08, tmp);

            tmp = Shuffle(d10, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d10 = Max(d07, tmp);
            d07 = Min(d07, tmp);

            tmp = Shuffle(d11, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d11 = Max(d06, tmp);
            d06 = Min(d06, tmp);

            BitonicSort08VMerge(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort03VMerge(ref d09, ref d10, ref d11);
        }
351 |
        // In-register bitonic sort of 12 vectors: sort the first 8 and last 4
        // independently, reverse each tail vector (Shuffle X_R + Permute4x64
        // P_X -- presumably a full 8-lane reversal; confirm against the
        // X_R / P_X definitions), cross-compare against the top of the head
        // (d09 <-> d08 .. d12 <-> d05), then bitonic-merge the 8-vector head
        // and the 4-vector tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort12V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08, ref V d09, ref V d10, ref V d11, ref V d12)
        {
            V tmp;

            BitonicSort08V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort04V(ref d09, ref d10, ref d11, ref d12);

            tmp = Shuffle(d09, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d09 = Max(d08, tmp);
            d08 = Min(d08, tmp);

            tmp = Shuffle(d10, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d10 = Max(d07, tmp);
            d07 = Min(d07, tmp);

            tmp = Shuffle(d11, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d11 = Max(d06, tmp);
            d06 = Min(d06, tmp);

            tmp = Shuffle(d12, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d12 = Max(d05, tmp);
            d05 = Min(d05, tmp);

            BitonicSort08VMerge(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort04VMerge(ref d09, ref d10, ref d11, ref d12);
        }
383 |
        // In-register bitonic sort of 13 vectors: sort the first 8 and last 5
        // independently, reverse each tail vector (Shuffle X_R + Permute4x64
        // P_X -- presumably a full 8-lane reversal; confirm against the
        // X_R / P_X definitions), cross-compare against the top of the head
        // (d09 <-> d08 .. d13 <-> d04), then bitonic-merge the 8-vector head
        // and the 5-vector tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort13V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08, ref V d09, ref V d10, ref V d11, ref V d12, ref V d13)
        {
            V tmp;

            BitonicSort08V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort05V(ref d09, ref d10, ref d11, ref d12, ref d13);

            tmp = Shuffle(d09, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d09 = Max(d08, tmp);
            d08 = Min(d08, tmp);

            tmp = Shuffle(d10, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d10 = Max(d07, tmp);
            d07 = Min(d07, tmp);

            tmp = Shuffle(d11, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d11 = Max(d06, tmp);
            d06 = Min(d06, tmp);

            tmp = Shuffle(d12, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d12 = Max(d05, tmp);
            d05 = Min(d05, tmp);

            tmp = Shuffle(d13, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d13 = Max(d04, tmp);
            d04 = Min(d04, tmp);

            BitonicSort08VMerge(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort05VMerge(ref d09, ref d10, ref d11, ref d12, ref d13);
        }
420 |
        // In-register bitonic sort of 14 vectors: sort the first 8 and last 6
        // independently, reverse each tail vector (Shuffle X_R + Permute4x64
        // P_X -- presumably a full 8-lane reversal; confirm against the
        // X_R / P_X definitions), cross-compare against the top of the head
        // (d09 <-> d08 .. d14 <-> d03), then bitonic-merge the 8-vector head
        // and the 6-vector tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort14V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08, ref V d09, ref V d10, ref V d11, ref V d12, ref V d13, ref V d14)
        {
            V tmp;

            BitonicSort08V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort06V(ref d09, ref d10, ref d11, ref d12, ref d13, ref d14);

            tmp = Shuffle(d09, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d09 = Max(d08, tmp);
            d08 = Min(d08, tmp);

            tmp = Shuffle(d10, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d10 = Max(d07, tmp);
            d07 = Min(d07, tmp);

            tmp = Shuffle(d11, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d11 = Max(d06, tmp);
            d06 = Min(d06, tmp);

            tmp = Shuffle(d12, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d12 = Max(d05, tmp);
            d05 = Min(d05, tmp);

            tmp = Shuffle(d13, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d13 = Max(d04, tmp);
            d04 = Min(d04, tmp);

            tmp = Shuffle(d14, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d14 = Max(d03, tmp);
            d03 = Min(d03, tmp);

            BitonicSort08VMerge(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort06VMerge(ref d09, ref d10, ref d11, ref d12, ref d13, ref d14);
        }
462 |
        // In-register bitonic sort of 15 vectors: sort the first 8 and last 7
        // independently, reverse each tail vector (Shuffle X_R + Permute4x64
        // P_X -- presumably a full 8-lane reversal; confirm against the
        // X_R / P_X definitions), cross-compare against the top of the head
        // (d09 <-> d08 .. d15 <-> d02), then bitonic-merge the 8-vector head
        // and the 7-vector tail.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort15V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08, ref V d09, ref V d10, ref V d11, ref V d12, ref V d13, ref V d14, ref V d15)
        {
            V tmp;

            BitonicSort08V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort07V(ref d09, ref d10, ref d11, ref d12, ref d13, ref d14, ref d15);

            tmp = Shuffle(d09, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d09 = Max(d08, tmp);
            d08 = Min(d08, tmp);

            tmp = Shuffle(d10, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d10 = Max(d07, tmp);
            d07 = Min(d07, tmp);

            tmp = Shuffle(d11, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d11 = Max(d06, tmp);
            d06 = Min(d06, tmp);

            tmp = Shuffle(d12, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d12 = Max(d05, tmp);
            d05 = Min(d05, tmp);

            tmp = Shuffle(d13, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d13 = Max(d04, tmp);
            d04 = Min(d04, tmp);

            tmp = Shuffle(d14, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d14 = Max(d03, tmp);
            d03 = Min(d03, tmp);

            tmp = Shuffle(d15, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d15 = Max(d02, tmp);
            d02 = Min(d02, tmp);

            BitonicSort08VMerge(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort07VMerge(ref d09, ref d10, ref d11, ref d12, ref d13, ref d14, ref d15);
        }
509 |
        // In-register bitonic sort of 16 vectors (128 ints, the maximum this
        // unit handles): sort each 8-vector half independently, reverse each
        // vector of the second half (Shuffle X_R + Permute4x64 P_X --
        // presumably a full 8-lane reversal; confirm against the X_R / P_X
        // definitions), cross-compare mirrored pairs (d09 <-> d08 ..
        // d16 <-> d01), then bitonic-merge each half.
        [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
        static void BitonicSort16V(ref V d01, ref V d02, ref V d03, ref V d04, ref V d05, ref V d06, ref V d07, ref V d08, ref V d09, ref V d10, ref V d11, ref V d12, ref V d13, ref V d14, ref V d15, ref V d16)
        {
            V tmp;

            BitonicSort08V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort08V(ref d09, ref d10, ref d11, ref d12, ref d13, ref d14, ref d15, ref d16);

            tmp = Shuffle(d09, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d09 = Max(d08, tmp);
            d08 = Min(d08, tmp);

            tmp = Shuffle(d10, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d10 = Max(d07, tmp);
            d07 = Min(d07, tmp);

            tmp = Shuffle(d11, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d11 = Max(d06, tmp);
            d06 = Min(d06, tmp);

            tmp = Shuffle(d12, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d12 = Max(d05, tmp);
            d05 = Min(d05, tmp);

            tmp = Shuffle(d13, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d13 = Max(d04, tmp);
            d04 = Min(d04, tmp);

            tmp = Shuffle(d14, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d14 = Max(d03, tmp);
            d03 = Min(d03, tmp);

            tmp = Shuffle(d15, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d15 = Max(d02, tmp);
            d02 = Min(d02, tmp);

            tmp = Shuffle(d16, X_R);
            tmp = Permute4x64(tmp.AsInt64(), P_X).AsInt32();
            d16 = Max(d01, tmp);
            d01 = Min(d01, tmp);

            BitonicSort08VMerge(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
            BitonicSort08VMerge(ref d09, ref d10, ref d11, ref d12, ref d13, ref d14, ref d15, ref d16);
        }
561 |
562 |
563 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
564 | static void BitonicSort01V(int* ptr)
565 | {
566 | var N = V.Count;
567 |
568 | var d01 = LoadDquVector256(ptr + 00*N);
569 |
570 | BitonicSort01V(ref d01);
571 |
572 | Store(ptr + 00*N, d01);
573 | }
574 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
575 | static void BitonicSort02V(int* ptr)
576 | {
577 | var N = V.Count;
578 |
579 | var d01 = LoadDquVector256(ptr + 00*N);
580 | var d02 = LoadDquVector256(ptr + 01*N);
581 |
582 | BitonicSort02V(ref d01, ref d02);
583 |
584 | Store(ptr + 00*N, d01);
585 | Store(ptr + 01*N, d02);
586 | }
587 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
588 | static void BitonicSort03V(int* ptr)
589 | {
590 | var N = V.Count;
591 |
592 | var d01 = LoadDquVector256(ptr + 00*N);
593 | var d02 = LoadDquVector256(ptr + 01*N);
594 | var d03 = LoadDquVector256(ptr + 02*N);
595 |
596 | BitonicSort03V(ref d01, ref d02, ref d03);
597 |
598 | Store(ptr + 00*N, d01);
599 | Store(ptr + 01*N, d02);
600 | Store(ptr + 02*N, d03);
601 | }
602 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
603 | static void BitonicSort04V(int* ptr)
604 | {
605 | var N = V.Count;
606 |
607 | var d01 = LoadDquVector256(ptr + 00*N);
608 | var d02 = LoadDquVector256(ptr + 01*N);
609 | var d03 = LoadDquVector256(ptr + 02*N);
610 | var d04 = LoadDquVector256(ptr + 03*N);
611 |
612 | BitonicSort04V(ref d01, ref d02, ref d03, ref d04);
613 |
614 | Store(ptr + 00*N, d01);
615 | Store(ptr + 01*N, d02);
616 | Store(ptr + 02*N, d03);
617 | Store(ptr + 03*N, d04);
618 | }
619 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
620 | static void BitonicSort05V(int* ptr)
621 | {
622 | var N = V.Count;
623 |
624 | var d01 = LoadDquVector256(ptr + 00*N);
625 | var d02 = LoadDquVector256(ptr + 01*N);
626 | var d03 = LoadDquVector256(ptr + 02*N);
627 | var d04 = LoadDquVector256(ptr + 03*N);
628 | var d05 = LoadDquVector256(ptr + 04*N);
629 |
630 | BitonicSort05V(ref d01, ref d02, ref d03, ref d04, ref d05);
631 |
632 | Store(ptr + 00*N, d01);
633 | Store(ptr + 01*N, d02);
634 | Store(ptr + 02*N, d03);
635 | Store(ptr + 03*N, d04);
636 | Store(ptr + 04*N, d05);
637 | }
638 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
639 | static void BitonicSort06V(int* ptr)
640 | {
641 | var N = V.Count;
642 |
643 | var d01 = LoadDquVector256(ptr + 00*N);
644 | var d02 = LoadDquVector256(ptr + 01*N);
645 | var d03 = LoadDquVector256(ptr + 02*N);
646 | var d04 = LoadDquVector256(ptr + 03*N);
647 | var d05 = LoadDquVector256(ptr + 04*N);
648 | var d06 = LoadDquVector256(ptr + 05*N);
649 |
650 | BitonicSort06V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06);
651 |
652 | Store(ptr + 00*N, d01);
653 | Store(ptr + 01*N, d02);
654 | Store(ptr + 02*N, d03);
655 | Store(ptr + 03*N, d04);
656 | Store(ptr + 04*N, d05);
657 | Store(ptr + 05*N, d06);
658 | }
659 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
660 | static void BitonicSort07V(int* ptr)
661 | {
662 | var N = V.Count;
663 |
664 | var d01 = LoadDquVector256(ptr + 00*N);
665 | var d02 = LoadDquVector256(ptr + 01*N);
666 | var d03 = LoadDquVector256(ptr + 02*N);
667 | var d04 = LoadDquVector256(ptr + 03*N);
668 | var d05 = LoadDquVector256(ptr + 04*N);
669 | var d06 = LoadDquVector256(ptr + 05*N);
670 | var d07 = LoadDquVector256(ptr + 06*N);
671 |
672 | BitonicSort07V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07);
673 |
674 | Store(ptr + 00*N, d01);
675 | Store(ptr + 01*N, d02);
676 | Store(ptr + 02*N, d03);
677 | Store(ptr + 03*N, d04);
678 | Store(ptr + 04*N, d05);
679 | Store(ptr + 05*N, d06);
680 | Store(ptr + 06*N, d07);
681 | }
682 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
683 | static void BitonicSort08V(int* ptr)
684 | {
685 | var N = V.Count;
686 |
687 | var d01 = LoadDquVector256(ptr + 00*N);
688 | var d02 = LoadDquVector256(ptr + 01*N);
689 | var d03 = LoadDquVector256(ptr + 02*N);
690 | var d04 = LoadDquVector256(ptr + 03*N);
691 | var d05 = LoadDquVector256(ptr + 04*N);
692 | var d06 = LoadDquVector256(ptr + 05*N);
693 | var d07 = LoadDquVector256(ptr + 06*N);
694 | var d08 = LoadDquVector256(ptr + 07*N);
695 |
696 | BitonicSort08V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08);
697 |
698 | Store(ptr + 00*N, d01);
699 | Store(ptr + 01*N, d02);
700 | Store(ptr + 02*N, d03);
701 | Store(ptr + 03*N, d04);
702 | Store(ptr + 04*N, d05);
703 | Store(ptr + 05*N, d06);
704 | Store(ptr + 06*N, d07);
705 | Store(ptr + 07*N, d08);
706 | }
707 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
708 | static void BitonicSort09V(int* ptr)
709 | {
710 | var N = V.Count;
711 |
712 | var d01 = LoadDquVector256(ptr + 00*N);
713 | var d02 = LoadDquVector256(ptr + 01*N);
714 | var d03 = LoadDquVector256(ptr + 02*N);
715 | var d04 = LoadDquVector256(ptr + 03*N);
716 | var d05 = LoadDquVector256(ptr + 04*N);
717 | var d06 = LoadDquVector256(ptr + 05*N);
718 | var d07 = LoadDquVector256(ptr + 06*N);
719 | var d08 = LoadDquVector256(ptr + 07*N);
720 | var d09 = LoadDquVector256(ptr + 08*N);
721 |
722 | BitonicSort09V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08, ref d09);
723 |
724 | Store(ptr + 00*N, d01);
725 | Store(ptr + 01*N, d02);
726 | Store(ptr + 02*N, d03);
727 | Store(ptr + 03*N, d04);
728 | Store(ptr + 04*N, d05);
729 | Store(ptr + 05*N, d06);
730 | Store(ptr + 06*N, d07);
731 | Store(ptr + 07*N, d08);
732 | Store(ptr + 08*N, d09);
733 | }
734 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
735 | static void BitonicSort10V(int* ptr)
736 | {
737 | var N = V.Count;
738 |
739 | var d01 = LoadDquVector256(ptr + 00*N);
740 | var d02 = LoadDquVector256(ptr + 01*N);
741 | var d03 = LoadDquVector256(ptr + 02*N);
742 | var d04 = LoadDquVector256(ptr + 03*N);
743 | var d05 = LoadDquVector256(ptr + 04*N);
744 | var d06 = LoadDquVector256(ptr + 05*N);
745 | var d07 = LoadDquVector256(ptr + 06*N);
746 | var d08 = LoadDquVector256(ptr + 07*N);
747 | var d09 = LoadDquVector256(ptr + 08*N);
748 | var d10 = LoadDquVector256(ptr + 09*N);
749 |
750 | BitonicSort10V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08, ref d09, ref d10);
751 |
752 | Store(ptr + 00*N, d01);
753 | Store(ptr + 01*N, d02);
754 | Store(ptr + 02*N, d03);
755 | Store(ptr + 03*N, d04);
756 | Store(ptr + 04*N, d05);
757 | Store(ptr + 05*N, d06);
758 | Store(ptr + 06*N, d07);
759 | Store(ptr + 07*N, d08);
760 | Store(ptr + 08*N, d09);
761 | Store(ptr + 09*N, d10);
762 | }
763 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
764 | static void BitonicSort11V(int* ptr)
765 | {
766 | var N = V.Count;
767 |
768 | var d01 = LoadDquVector256(ptr + 00*N);
769 | var d02 = LoadDquVector256(ptr + 01*N);
770 | var d03 = LoadDquVector256(ptr + 02*N);
771 | var d04 = LoadDquVector256(ptr + 03*N);
772 | var d05 = LoadDquVector256(ptr + 04*N);
773 | var d06 = LoadDquVector256(ptr + 05*N);
774 | var d07 = LoadDquVector256(ptr + 06*N);
775 | var d08 = LoadDquVector256(ptr + 07*N);
776 | var d09 = LoadDquVector256(ptr + 08*N);
777 | var d10 = LoadDquVector256(ptr + 09*N);
778 | var d11 = LoadDquVector256(ptr + 10*N);
779 |
780 | BitonicSort11V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08, ref d09, ref d10, ref d11);
781 |
782 | Store(ptr + 00*N, d01);
783 | Store(ptr + 01*N, d02);
784 | Store(ptr + 02*N, d03);
785 | Store(ptr + 03*N, d04);
786 | Store(ptr + 04*N, d05);
787 | Store(ptr + 05*N, d06);
788 | Store(ptr + 06*N, d07);
789 | Store(ptr + 07*N, d08);
790 | Store(ptr + 08*N, d09);
791 | Store(ptr + 09*N, d10);
792 | Store(ptr + 10*N, d11);
793 | }
794 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
795 | static void BitonicSort12V(int* ptr)
796 | {
797 | var N = V.Count;
798 |
799 | var d01 = LoadDquVector256(ptr + 00*N);
800 | var d02 = LoadDquVector256(ptr + 01*N);
801 | var d03 = LoadDquVector256(ptr + 02*N);
802 | var d04 = LoadDquVector256(ptr + 03*N);
803 | var d05 = LoadDquVector256(ptr + 04*N);
804 | var d06 = LoadDquVector256(ptr + 05*N);
805 | var d07 = LoadDquVector256(ptr + 06*N);
806 | var d08 = LoadDquVector256(ptr + 07*N);
807 | var d09 = LoadDquVector256(ptr + 08*N);
808 | var d10 = LoadDquVector256(ptr + 09*N);
809 | var d11 = LoadDquVector256(ptr + 10*N);
810 | var d12 = LoadDquVector256(ptr + 11*N);
811 |
812 | BitonicSort12V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08, ref d09, ref d10, ref d11, ref d12);
813 |
814 | Store(ptr + 00*N, d01);
815 | Store(ptr + 01*N, d02);
816 | Store(ptr + 02*N, d03);
817 | Store(ptr + 03*N, d04);
818 | Store(ptr + 04*N, d05);
819 | Store(ptr + 05*N, d06);
820 | Store(ptr + 06*N, d07);
821 | Store(ptr + 07*N, d08);
822 | Store(ptr + 08*N, d09);
823 | Store(ptr + 09*N, d10);
824 | Store(ptr + 10*N, d11);
825 | Store(ptr + 11*N, d12);
826 | }
827 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
828 | static void BitonicSort13V(int* ptr)
829 | {
830 | var N = V.Count;
831 |
832 | var d01 = LoadDquVector256(ptr + 00*N);
833 | var d02 = LoadDquVector256(ptr + 01*N);
834 | var d03 = LoadDquVector256(ptr + 02*N);
835 | var d04 = LoadDquVector256(ptr + 03*N);
836 | var d05 = LoadDquVector256(ptr + 04*N);
837 | var d06 = LoadDquVector256(ptr + 05*N);
838 | var d07 = LoadDquVector256(ptr + 06*N);
839 | var d08 = LoadDquVector256(ptr + 07*N);
840 | var d09 = LoadDquVector256(ptr + 08*N);
841 | var d10 = LoadDquVector256(ptr + 09*N);
842 | var d11 = LoadDquVector256(ptr + 10*N);
843 | var d12 = LoadDquVector256(ptr + 11*N);
844 | var d13 = LoadDquVector256(ptr + 12*N);
845 |
846 | BitonicSort13V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08, ref d09, ref d10, ref d11, ref d12, ref d13);
847 |
848 | Store(ptr + 00*N, d01);
849 | Store(ptr + 01*N, d02);
850 | Store(ptr + 02*N, d03);
851 | Store(ptr + 03*N, d04);
852 | Store(ptr + 04*N, d05);
853 | Store(ptr + 05*N, d06);
854 | Store(ptr + 06*N, d07);
855 | Store(ptr + 07*N, d08);
856 | Store(ptr + 08*N, d09);
857 | Store(ptr + 09*N, d10);
858 | Store(ptr + 10*N, d11);
859 | Store(ptr + 11*N, d12);
860 | Store(ptr + 12*N, d13);
861 | }
862 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
863 | static void BitonicSort14V(int* ptr)
864 | {
865 | var N = V.Count;
866 |
867 | var d01 = LoadDquVector256(ptr + 00*N);
868 | var d02 = LoadDquVector256(ptr + 01*N);
869 | var d03 = LoadDquVector256(ptr + 02*N);
870 | var d04 = LoadDquVector256(ptr + 03*N);
871 | var d05 = LoadDquVector256(ptr + 04*N);
872 | var d06 = LoadDquVector256(ptr + 05*N);
873 | var d07 = LoadDquVector256(ptr + 06*N);
874 | var d08 = LoadDquVector256(ptr + 07*N);
875 | var d09 = LoadDquVector256(ptr + 08*N);
876 | var d10 = LoadDquVector256(ptr + 09*N);
877 | var d11 = LoadDquVector256(ptr + 10*N);
878 | var d12 = LoadDquVector256(ptr + 11*N);
879 | var d13 = LoadDquVector256(ptr + 12*N);
880 | var d14 = LoadDquVector256(ptr + 13*N);
881 |
882 | BitonicSort14V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08, ref d09, ref d10, ref d11, ref d12, ref d13, ref d14);
883 |
884 | Store(ptr + 00*N, d01);
885 | Store(ptr + 01*N, d02);
886 | Store(ptr + 02*N, d03);
887 | Store(ptr + 03*N, d04);
888 | Store(ptr + 04*N, d05);
889 | Store(ptr + 05*N, d06);
890 | Store(ptr + 06*N, d07);
891 | Store(ptr + 07*N, d08);
892 | Store(ptr + 08*N, d09);
893 | Store(ptr + 09*N, d10);
894 | Store(ptr + 10*N, d11);
895 | Store(ptr + 11*N, d12);
896 | Store(ptr + 12*N, d13);
897 | Store(ptr + 13*N, d14);
898 | }
899 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
900 | static void BitonicSort15V(int* ptr)
901 | {
902 | var N = V.Count;
903 |
904 | var d01 = LoadDquVector256(ptr + 00*N);
905 | var d02 = LoadDquVector256(ptr + 01*N);
906 | var d03 = LoadDquVector256(ptr + 02*N);
907 | var d04 = LoadDquVector256(ptr + 03*N);
908 | var d05 = LoadDquVector256(ptr + 04*N);
909 | var d06 = LoadDquVector256(ptr + 05*N);
910 | var d07 = LoadDquVector256(ptr + 06*N);
911 | var d08 = LoadDquVector256(ptr + 07*N);
912 | var d09 = LoadDquVector256(ptr + 08*N);
913 | var d10 = LoadDquVector256(ptr + 09*N);
914 | var d11 = LoadDquVector256(ptr + 10*N);
915 | var d12 = LoadDquVector256(ptr + 11*N);
916 | var d13 = LoadDquVector256(ptr + 12*N);
917 | var d14 = LoadDquVector256(ptr + 13*N);
918 | var d15 = LoadDquVector256(ptr + 14*N);
919 |
920 | BitonicSort15V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08, ref d09, ref d10, ref d11, ref d12, ref d13, ref d14, ref d15);
921 |
922 | Store(ptr + 00*N, d01);
923 | Store(ptr + 01*N, d02);
924 | Store(ptr + 02*N, d03);
925 | Store(ptr + 03*N, d04);
926 | Store(ptr + 04*N, d05);
927 | Store(ptr + 05*N, d06);
928 | Store(ptr + 06*N, d07);
929 | Store(ptr + 07*N, d08);
930 | Store(ptr + 08*N, d09);
931 | Store(ptr + 09*N, d10);
932 | Store(ptr + 10*N, d11);
933 | Store(ptr + 11*N, d12);
934 | Store(ptr + 12*N, d13);
935 | Store(ptr + 13*N, d14);
936 | Store(ptr + 14*N, d15);
937 | }
938 | [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.AggressiveOptimization)]
939 | static void BitonicSort16V(int* ptr)
940 | {
941 | var N = V.Count;
942 |
943 | var d01 = LoadDquVector256(ptr + 00*N);
944 | var d02 = LoadDquVector256(ptr + 01*N);
945 | var d03 = LoadDquVector256(ptr + 02*N);
946 | var d04 = LoadDquVector256(ptr + 03*N);
947 | var d05 = LoadDquVector256(ptr + 04*N);
948 | var d06 = LoadDquVector256(ptr + 05*N);
949 | var d07 = LoadDquVector256(ptr + 06*N);
950 | var d08 = LoadDquVector256(ptr + 07*N);
951 | var d09 = LoadDquVector256(ptr + 08*N);
952 | var d10 = LoadDquVector256(ptr + 09*N);
953 | var d11 = LoadDquVector256(ptr + 10*N);
954 | var d12 = LoadDquVector256(ptr + 11*N);
955 | var d13 = LoadDquVector256(ptr + 12*N);
956 | var d14 = LoadDquVector256(ptr + 13*N);
957 | var d15 = LoadDquVector256(ptr + 14*N);
958 | var d16 = LoadDquVector256(ptr + 15*N);
959 |
960 | BitonicSort16V(ref d01, ref d02, ref d03, ref d04, ref d05, ref d06, ref d07, ref d08, ref d09, ref d10, ref d11, ref d12, ref d13, ref d14, ref d15, ref d16);
961 |
962 | Store(ptr + 00*N, d01);
963 | Store(ptr + 01*N, d02);
964 | Store(ptr + 02*N, d03);
965 | Store(ptr + 03*N, d04);
966 | Store(ptr + 04*N, d05);
967 | Store(ptr + 05*N, d06);
968 | Store(ptr + 06*N, d07);
969 | Store(ptr + 07*N, d08);
970 | Store(ptr + 08*N, d09);
971 | Store(ptr + 09*N, d10);
972 | Store(ptr + 10*N, d11);
973 | Store(ptr + 11*N, d12);
974 | Store(ptr + 12*N, d13);
975 | Store(ptr + 13*N, d14);
976 | Store(ptr + 14*N, d15);
977 | Store(ptr + 15*N, d16);
978 | }
979 |
980 | public const int MinBitonicSortSize = 8;
981 | public const int MaxBitonicSortSize = 128;
982 |
983 | [MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
984 | public static void Sort(int* ptr, int length)
985 | {
986 | Debug.Assert(length % 8 == 0);
987 | Debug.Assert(length <= MaxBitonicSortSize);
988 |
989 | switch (length / 8) {
990 | case 01: BitonicSort01V(ptr); return;
991 | case 02: BitonicSort02V(ptr); return;
992 | case 03: BitonicSort03V(ptr); return;
993 | case 04: BitonicSort04V(ptr); return;
994 | case 05: BitonicSort05V(ptr); return;
995 | case 06: BitonicSort06V(ptr); return;
996 | case 07: BitonicSort07V(ptr); return;
997 | case 08: BitonicSort08V(ptr); return;
998 | case 09: BitonicSort09V(ptr); return;
999 | case 10: BitonicSort10V(ptr); return;
1000 | case 11: BitonicSort11V(ptr); return;
1001 | case 12: BitonicSort12V(ptr); return;
1002 | case 13: BitonicSort13V(ptr); return;
1003 | case 14: BitonicSort14V(ptr); return;
1004 | case 15: BitonicSort15V(ptr); return;
1005 | case 16: BitonicSort16V(ptr); return;
1006 |
1007 | default:
1008 | throw new NotSupportedException("length is not power a multiple of 8 && <= 128");
1009 | }
1010 | }
1011 | }
1012 | }
1013 |
--------------------------------------------------------------------------------
/VxSort/VectorizedSort.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Diagnostics;
3 | using System.Runtime.CompilerServices;
4 | using System.Runtime.Intrinsics;
5 | using System.Runtime.Intrinsics.X86;
6 | using static System.Runtime.Intrinsics.X86.Avx;
7 | using static System.Runtime.Intrinsics.X86.Avx2;
8 | using static System.Runtime.Intrinsics.X86.Popcnt.X64;
9 | using static System.Runtime.Intrinsics.X86.Popcnt;
10 | using static VxSort.BytePermutationTables;
11 |
12 | namespace VxSort
13 | {
14 | public static class VectorizedSort
15 | {
16 | public static unsafe void UnstableSort<T>(T[] array) where T : unmanaged, IComparable<T>
17 | {
18 | if (array == null) {
19 | throw new ArgumentNullException(nameof(array));
20 | }
21 |
22 | if (array.Length == 0) {
23 | return;
24 | }
25 |
26 | if (!Avx2.IsSupported) {
27 | throw new NotSupportedException($"{nameof(VxSort)} requires x86/AVX2 support in the processor");
28 | }
29 |
30 | fixed (T* p = &array[0]) {
31 | // Yes this looks horrid, but the C# JIT will happily elide
32 | // the irrelevant code per each type being compiled, so we're good
33 | if (typeof(T) == typeof(int)) {
34 | var left = (int*) p;
35 | var sorter = new VxUnstableSortInt32(startPtr: left, endPtr: left + array.Length - 1);
36 | sorter.Sort();
37 | }
38 | else {
39 | throw new NotImplementedException($"{nameof(VxSort)} does not yet support {typeof(T).Name}");
40 | }
41 | }
42 | }
43 |
44 | static int FloorLog2PlusOne(uint n)
45 | {
46 | var result = 0;
47 | while (n >= 1)
48 | {
49 | result++;
50 | n /= 2;
51 | }
52 | return result;
53 | }
54 |
55 | static unsafe void Swap<TX>(TX *left, TX *right) where TX : unmanaged
56 | {
57 | var tmp = *left;
58 | *left = *right;
59 | *right = tmp;
60 | }
61 |
62 | static void Swap<TX>(Span<TX> span, int left, int right)
63 | {
64 | var tmp = span[left];
65 | span[left] = span[right];
66 | span[right] = tmp;
67 | }
68 |
69 | static unsafe void SwapIfGreater<TX>(TX *left, TX *right) where TX : unmanaged, IComparable<TX>
70 | {
71 | if ((*left).CompareTo(*right) <= 0) return;
72 | Swap(left, right);
73 | }
74 |
75 | [MethodImpl(MethodImplOptions.AggressiveInlining)]
76 | static unsafe void InsertionSort<TX>(TX * left, TX * right) where TX : unmanaged, IComparable<TX>
77 | {
78 | for (var i = left; i < right; i++) {
79 | var j = i;
80 | var t = *(i + 1);
81 | while (j >= left && t.CompareTo(*j) < 0) {
82 | *(j + 1) = *j;
83 | j--;
84 | }
85 | *(j + 1) = t;
86 | }
87 | }
88 |
89 | static void HeapSort<TX>(Span<TX> keys) where TX : unmanaged, IComparable<TX>
90 | {
91 | Debug.Assert(!keys.IsEmpty);
92 |
93 | var lo = 0;
94 | var hi = keys.Length - 1;
95 |
96 | var n = hi - lo + 1;
97 | for (var i = n / 2; i >= 1; i = i - 1)
98 | {
99 | DownHeap(keys, i, n, lo);
100 | }
101 |
102 | for (var i = n; i > 1; i--)
103 | {
104 | Swap(keys, lo, lo + i - 1);
105 | DownHeap(keys, 1, i - 1, lo);
106 | }
107 | }
108 |
109 | static void DownHeap<TX>(Span<TX> keys, int i, int n, int lo) where TX : unmanaged, IComparable<TX>
110 | {
111 | Debug.Assert(lo >= 0);
112 | Debug.Assert(lo < keys.Length);
113 |
114 | var d = keys[lo + i - 1];
115 | while (i <= n / 2) {
116 | var child = 2 * i;
117 | if (child < n && keys[lo + child - 1].CompareTo(keys[lo + child]) < 0) {
118 | child++;
119 | }
120 |
121 | if (keys[lo + child - 1].CompareTo(d) < 0)
122 | break;
123 |
124 | keys[lo + i - 1] = keys[lo + child - 1];
125 | i = child;
126 | }
127 |
128 | keys[lo + i - 1] = d;
129 | }
130 |
131 | // How much initial room needs to be made
132 | // during setup in full Vector256 units
133 | const int SLACK_PER_SIDE_IN_VECTORS = 8;
134 |
135 | // Once we come out of the first unrolled loop
136 | // this will be the size of the second unrolled loop.
137 | const int UNROLL2_SIZE_IN_VECTORS = 4;
138 |
139 | // Alignment in bytes
140 | const ulong ALIGN = 32;
141 | const ulong ALIGN_MASK = ALIGN - 1;
142 |
143 | internal unsafe ref struct VxUnstableSortInt32
144 | {
145 | // We need this as a compile time constant
146 | const int V256_N = 256 / 8 / sizeof(int);
147 |
148 | internal const int SMALL_SORT_THRESHOLD_ELEMENTS = 112;
149 | const int SLACK_PER_SIDE_IN_ELEMENTS = SLACK_PER_SIDE_IN_VECTORS * V256_N;
150 | const int UNROLL2_SLACK_PER_SIDE_IN_ELEMENTS = UNROLL2_SIZE_IN_VECTORS * V256_N;
151 | const int EIGHTH_SLACK_PER_SIDE_IN_ELEMENTS = V256_N;
152 |
153 | // The formula goes like this:
154 | // 2 x the number of slack elements on each side +
155 | // 2 x amount of maximal bytes needed for alignment (32)
156 | // 8 more elements since we write with 8-way stores from both ends of the temporary area
157 | // and we must make sure not to accidentally over-write from left -> right or vice-versa right on that edge...
158 | const int PARTITION_TMP_SIZE_IN_ELEMENTS = (int) (2 * SLACK_PER_SIDE_IN_ELEMENTS + 2 * ALIGN / sizeof(int) + V256_N);
159 |
160 | const long REALIGN_LEFT = 0x666;
161 | const long REALIGN_RIGHT = 0x66600000000;
162 | internal const long REALIGN_BOTH = REALIGN_LEFT | REALIGN_RIGHT;
163 | readonly int* _startPtr;
164 | readonly int* _endPtr;
165 | readonly int* _tempStart;
166 | readonly int* _tempEnd;
167 | #pragma warning disable 649
168 | fixed int _temp[PARTITION_TMP_SIZE_IN_ELEMENTS];
169 | int _depth;
170 | #pragma warning restore 649
171 | internal long Length => _endPtr - _startPtr + 1;
172 |
173 | public VxUnstableSortInt32(int* startPtr, int* endPtr) : this()
174 | {
175 | Debug.Assert(SMALL_SORT_THRESHOLD_ELEMENTS % V256_N == 0);
176 |
177 | _depth = 0;
178 | _startPtr = startPtr;
179 | _endPtr = endPtr;
180 | fixed (int* pTemp = _temp) {
181 | _tempStart = pTemp;
182 | _tempEnd = pTemp + PARTITION_TMP_SIZE_IN_ELEMENTS;
183 | }
184 | }
185 |
186 |
187 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
188 | internal void Sort()
189 | {
190 | // It makes no sense to sort arrays smaller than the max supported
191 | // bitonic sort with hybrid partitioning, so we special case those sized
192 | // and just copy the entire source to the tmp memory, pad it with
193 | // int.MaxValue and call BitonicSort
194 | var cachedLength = (uint) (ulong) Length;
195 | if (cachedLength <= BitonicSort.MaxBitonicSortSize) {
196 | CopyAndSortWithBitonic(cachedLength);
197 | return;
198 | }
199 |
200 | var depthLimit = 2 * FloorLog2PlusOne(cachedLength);
201 | HybridSort(_startPtr, _endPtr, REALIGN_BOTH, depthLimit);
202 | }
203 |
204 | void CopyAndSortWithBitonic(uint cachedLength)
205 | {
206 | var start = _startPtr;
207 | var tmp = _tempStart;
208 | var byteCount = cachedLength * sizeof(int);
209 |
210 | var adjustedLength = cachedLength & ~0b111;
211 | Store(tmp + adjustedLength, Vector256.Create(int.MaxValue));
212 | Unsafe.CopyBlockUnaligned(tmp, start, byteCount);
213 | BitonicSort.Sort(tmp, (int) Math.Min(adjustedLength + 8, BitonicSort.MaxBitonicSortSize));
214 | Unsafe.CopyBlockUnaligned(start, tmp, byteCount);
215 | }
216 |
217 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
218 | internal void HybridSort(int* left, int* right, long realignHint, int depthLimit)
219 | {
220 | // In case of bad separation we might encounter a partition size of -1
221 | Debug.Assert(left <= right + 1);
222 |
223 | var length = (int) (right - left + 1);
224 |
225 | int* mid;
226 | switch (length) {
227 |
228 | case -1:
229 | case 0:
230 | case 1:
231 | return;
232 | case 2:
233 | SwapIfGreater(left, right);
234 | return;
235 | case 3:
236 | mid = right - 1;
237 | SwapIfGreater(left, mid);
238 | SwapIfGreater(left, right);
239 | SwapIfGreater(mid, right);
240 | return;
241 | }
242 |
243 | _depth++;
244 |
245 | // SMALL_SORT_THRESHOLD_ELEMENTS is guaranteed (and asserted) to be a multiple of 8
246 | // So we can check if length is strictly smaller, knowing that we will round up to
247 | // SMALL_SORT_THRESHOLD_ELEMENTS exactly and no more
248 | // This is kind of critical given that we only limited # of implementation of
249 | // vectorized bitonic sort
250 | if (length < SMALL_SORT_THRESHOLD_ELEMENTS) {
251 | var nextLength = (length & 7) > 0 ? (length + V256_N) & ~7: length;
252 |
253 | Debug.Assert(nextLength <= BitonicSort.MaxBitonicSortSize);
254 | var extraSpaceNeeded = nextLength - length;
255 | var fakeLeft = left - extraSpaceNeeded;
256 | if (fakeLeft >= _startPtr) {
257 | BitonicSort.Sort(fakeLeft, nextLength);
258 | }
259 | else {
260 | InsertionSort(left, right);
261 | }
262 | _depth--;
263 | return;
264 | }
265 |
266 | // Detect a whole bunch of bad cases where partitioning
267 | // will not do well:
268 | // 1. Reverse sorted array
269 | // 2. High degree of repeated values (dutch flag problem, one value)
270 | if (depthLimit == 0)
271 | {
272 | HeapSort(new Span<int>(left, (int) (right - left + 1)));
273 | _depth--;
274 | return;
275 | }
276 | depthLimit--;
277 |
278 | // This is going to be a bit weird:
279 | // Pre/Post alignment calculations happen here: we prepare hints to the
280 | // partition function of how much to align and in which direction (pre/post).
281 | // The motivation to do these calculations here and the actual alignment inside the partitioning code is
282 | // that here, we can cache those calculations.
283 | // As we recurse to the left we can reuse the left cached calculation, And when we recurse
284 | // to the right we reuse the right calculation, so we can avoid re-calculating the same aligned addresses
285 | // throughout the recursion, at the cost of a minor code complexity
286 | // Since we branch on the magic values REALIGN_LEFT & REALIGN_RIGHT it's safe to assume
287 | // that we are not torturing the branch predictor.
288 |
289 | // We use a long as a "struct" to pass on alignment hints to the partitioning
290 | // By packing 2 32 bit elements into it, as the JIT seem to not do this.
291 | // In reality we need more like 2x 4bits for each side, but I don't think
292 | // there is a real difference
293 |
294 | var preAlignedLeft = (int*) ((ulong) left & ~ALIGN_MASK);
295 | var cannotPreAlignLeft = (preAlignedLeft - _startPtr) >> 63;
296 | var preAlignLeftOffset = (preAlignedLeft - left) + (V256_N & cannotPreAlignLeft);
297 | if ((realignHint & REALIGN_LEFT) != 0) {
298 | // Alignment flow:
299 | // * Calculate pre-alignment on the left
300 | // * See it would cause us an out-of bounds read
301 | // * Since we'd like to avoid that, we adjust for post-alignment
302 | // * There are no branches since we do branch->arithmetic
303 | realignHint &= unchecked((long) 0xFFFFFFFF00000000UL);
304 | realignHint |= preAlignLeftOffset;
305 | }
306 |
307 | var preAlignedRight = (int*) (((ulong) right - 1 & ~ALIGN_MASK) + ALIGN);
308 | var cannotPreAlignRight = (_endPtr - preAlignedRight) >> 63;
309 | var preAlignRightOffset = (preAlignedRight - right - (V256_N & cannotPreAlignRight));
310 | if ((realignHint & REALIGN_RIGHT) != 0) {
311 | // right is pointing just PAST the last element we intend to partition (where we also store the pivot)
312 | // So we calculate alignment based on right - 1, and YES: I am casting to ulong before doing the -1, this
313 | // is intentional since the whole thing is either aligned to 32 bytes or not, so decrementing the POINTER value
314 | // by 1 is sufficient for the alignment, and the JIT sucks at this anyway
315 | realignHint &= 0xFFFFFFFF;
316 | realignHint |= preAlignRightOffset << 32;
317 | }
318 |
319 | Debug.Assert(((ulong) (left + (realignHint & 0xFFFFFFFF)) & ALIGN_MASK) == 0);
320 | Debug.Assert(((ulong) (right + (realignHint >> 32)) & ALIGN_MASK) == 0);
321 |
322 | // Compute median-of-three, of:
323 | // the first, mid and one before last elements
324 | mid = left + (right - left) / 2;
325 | SwapIfGreater(left, mid);
326 | SwapIfGreater(left, right - 1);
327 | SwapIfGreater(mid, right - 1);
328 |
329 | // Pivot is mid, place it in the right hand side
330 | Swap(mid, right);
331 |
332 | var sep = length < PARTITION_TMP_SIZE_IN_ELEMENTS ?
333 | Partition1VectorInPlace(left, right, realignHint) :
334 | Partition8VectorsInPlace(left, right, realignHint);
335 |
336 | HybridSort(left, sep - 1, realignHint | REALIGN_RIGHT, depthLimit);
337 | HybridSort(sep + 1, right, realignHint | REALIGN_LEFT, depthLimit);
338 | _depth--;
339 | }
340 |
341 | /// <summary>
342 | /// Partition using Vectorized AVX2 intrinsics
343 | /// </summary>
344 | /// <param name="left">pointer (inclusive) to the first element to partition</param>
345 | /// <param name="right">pointer (exclusive) to the last element to partition, actually points to where the pivot before partitioning</param>
346 | /// <param name="hint">alignment instructions</param>
347 | /// <returns>Position of the pivot that was passed to the function inside the array</returns>
348 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
349 | internal int* Partition8VectorsInPlace(int* left, int* right, long hint)
350 | {
351 | Debug.Assert(right - left >= SMALL_SORT_THRESHOLD_ELEMENTS, $"Not enough elements: {right-left} >= {SMALL_SORT_THRESHOLD_ELEMENTS}");
352 |
353 | Debug.Assert((((ulong) left) & 0x3) == 0);
354 | Debug.Assert((((ulong) right) & 0x3) == 0);
355 | // Vectorized double-pumped (dual-sided) partitioning:
356 | // We start with picking a pivot using the median-of-3 "method"
357 | // Once we have sensible pivot stored as the last element of the array
358 | // We process the array from both ends.
359 | //
360 | // To get this rolling, we first read 2 Vector256 elements from the left and
361 | // another 2 from the right, and store them in some temporary space
362 | // in order to leave enough "space" inside the vector for storing partitioned values.
363 | // Why 2 from each side? Because we need n+1 from each side
364 | // where n is the number of Vector256 elements we process in each iteration...
365 | // The reasoning behind the +1 is because of the way we decide from *which*
366 | // side to read, we may end up reading up to one more vector from any given side
367 | // and writing it in its entirety to the opposite side (this becomes slightly clearer
368 | // when reading the code below...)
369 | // Conceptually, the bulk of the processing looks like this after clearing out some initial
370 | // space as described above:
371 |
372 | // [.............................................................................]
373 | // ^wl ^rl rr^ wr^
374 | // Where:
375 | // wl = writeLeft
376 | // rl = readLeft
377 | // rr = readRight
378 | // wr = writeRight
379 |
380 | // In every iteration, we select what side to read from based on how much
381 | // space is left between head read/write pointer on each side...
382 | // We read from where there is a smaller gap, e.g. that side
383 | // that is closer to the unfortunate possibility of its write head overwriting
384 | // its read head... By reading from THAT side, we're ensuring this does not happen
385 |
386 | // An additional unfortunate complexity we need to deal with is that the right pointer
387 | // must be decremented by another Vector256.Count elements
388 | // Since the Load/Store primitives obviously accept start addresses
389 | var N = Vector256<int>.Count; // Treated as constant @ JIT time
390 | var pivot = *right;
391 | // We do this here just in case we need to pre-align to the right
392 | // We end up
393 | *right = int.MaxValue;
394 |
395 | var readLeft = left;
396 | var readRight = right;
397 | var writeLeft = left;
398 | var crappyWriteRight = right - N;
399 |
400 | var tmpStartLeft = _tempStart;
401 | var tmpLeft = tmpStartLeft;
402 | var tmpStartRight = _tempEnd;
403 | var tmpRight = tmpStartRight;
404 |
405 | // Broadcast the selected pivot
406 | var P = Vector256.Create(pivot);
407 | var pBase = BytePermTableAlignedPtr;
408 | tmpRight -= N;
409 |
410 | #region Vector256 Alignment
411 | // the read heads always advance by 8 elements, or 32 bytes,
412 | // We can spend some extra time here to align the pointers
413 | // so they start at a cache-line boundary
414 | // Once that happens, we can read with Avx.LoadAlignedVector256
415 | // And also know for sure that our reads will never cross cache-lines
416 | // Otherwise, 50% of our AVX2 Loads will need to read from two cache-lines
417 | var leftAlign = unchecked((int) (hint & 0xFFFFFFFF));
418 | var rightAlign = unchecked((int) (hint >> 32));
419 |
420 | var preAlignedLeft = left + leftAlign;
421 | var preAlignedRight = right + rightAlign - N;
422 |
423 | // We preemptively go through the motions of
424 | // vectorized alignment, and at worst we re-neg
425 | // by not advancing the various read/tmp pointers
426 | // as if nothing ever happened if the conditions
427 | // are wrong from vectorized alignment
428 | var RT0 = LoadAlignedVector256(preAlignedRight);
429 | var LT0 = LoadAlignedVector256(preAlignedLeft);
430 | var rtMask = (uint) MoveMask(CompareGreaterThan(RT0, P).AsSingle());
431 | var ltMask = (uint) MoveMask(CompareGreaterThan(LT0, P).AsSingle());
432 | var rtPopCount = Math.Max(PopCount(rtMask), (uint) rightAlign);
433 | var ltPopCount = PopCount(ltMask);
434 | RT0 = PermuteVar8x32(RT0, GetBytePermutationAligned(pBase, rtMask));
435 | LT0 = PermuteVar8x32(LT0, GetBytePermutationAligned(pBase, ltMask));
436 | Store(tmpRight, RT0);
437 | Store(tmpLeft, LT0);
438 |
439 | var rightAlignMask = ~((rightAlign - 1) >> 31);
440 | var leftAlignMask = leftAlign >> 31;
441 |
442 | tmpRight -= rtPopCount & rightAlignMask;
443 | rtPopCount = V256_N - rtPopCount;
444 | readRight += (rightAlign - N) & rightAlignMask;
445 |
446 | Store(tmpRight, LT0);
447 | tmpRight -= ltPopCount & leftAlignMask;
448 | ltPopCount = V256_N - ltPopCount;
449 | tmpLeft += ltPopCount & leftAlignMask;
450 | tmpStartLeft += -leftAlign & leftAlignMask;
451 | readLeft += (leftAlign + N) & leftAlignMask;
452 |
453 | Store(tmpLeft, RT0);
454 | tmpLeft += rtPopCount & rightAlignMask;
455 | tmpStartRight -= rightAlign & rightAlignMask;
456 |
457 | if (leftAlign > 0) {
458 | tmpRight += N;
459 | readLeft = AlignLeftScalarUncommon(readLeft, pivot, ref tmpLeft, ref tmpRight);
460 | tmpRight -= N;
461 | }
462 |
463 | if (rightAlign < 0) {
464 | tmpRight += N;
465 | readRight = AlignRightScalarUncommon(readRight, pivot, ref tmpLeft, ref tmpRight);
466 | tmpRight -= N;
467 | }
468 | Debug.Assert(((ulong) readLeft & ALIGN_MASK) == 0);
469 | Debug.Assert(((ulong) readRight & ALIGN_MASK) == 0);
470 |
471 | Debug.Assert((((byte *) readRight - (byte *) readLeft) % (long) ALIGN) == 0);
472 | Debug.Assert((readRight - readLeft) >= SLACK_PER_SIDE_IN_ELEMENTS * 2);
473 |
474 | #endregion
475 |
476 | // Make 8 vectors worth of space on each side by partitioning them straight into the temporary memory
477 | LoadAndPartition8Vectors(readLeft, P, pBase, ref tmpLeft, ref tmpRight);
478 | LoadAndPartition8Vectors(readRight - SLACK_PER_SIDE_IN_ELEMENTS, P, pBase, ref tmpLeft, ref tmpRight);
479 | tmpRight += N;
480 |
481 | // Adjust for the reading that was made above
482 | readLeft += SLACK_PER_SIDE_IN_ELEMENTS;
483 | readRight -= SLACK_PER_SIDE_IN_ELEMENTS * 2;
484 |
485 | var writeRight = crappyWriteRight;
486 |
487 | while (readLeft < readRight) {
488 | int* nextPtr;
489 | if ((byte *) writeRight - (byte *) readRight < (2*SLACK_PER_SIDE_IN_ELEMENTS - N)*sizeof(int)) {
490 | nextPtr = readRight;
491 | readRight -= SLACK_PER_SIDE_IN_ELEMENTS;
492 | } else {
493 | nextPtr = readLeft;
494 | readLeft += SLACK_PER_SIDE_IN_ELEMENTS;
495 | }
496 |
497 | LoadAndPartition8Vectors(nextPtr, P, pBase, ref writeLeft, ref writeRight);
498 | }
499 |
500 | readRight += UNROLL2_SLACK_PER_SIDE_IN_ELEMENTS;
501 |
502 | while (readLeft < readRight) {
503 | int* nextPtr;
504 | if ((byte *) writeRight - (byte *) readRight < (2*UNROLL2_SLACK_PER_SIDE_IN_ELEMENTS - N) * sizeof(int)) {
505 | nextPtr = readRight;
506 | readRight -= UNROLL2_SLACK_PER_SIDE_IN_ELEMENTS;
507 | } else {
508 | nextPtr = readLeft;
509 | readLeft += UNROLL2_SLACK_PER_SIDE_IN_ELEMENTS;
510 | }
511 |
512 | Debug.Assert(readLeft - writeLeft >= UNROLL2_SLACK_PER_SIDE_IN_ELEMENTS, $"left head overwrite {readLeft - writeLeft}");
513 | Debug.Assert(writeRight - readRight >= UNROLL2_SLACK_PER_SIDE_IN_ELEMENTS, $"right head overwrite {writeRight - readRight}");
514 |
515 | LoadAndPartition4Vectors(nextPtr, P, pBase, ref writeLeft, ref writeRight);
516 | }
517 |
518 | readRight += UNROLL2_SLACK_PER_SIDE_IN_ELEMENTS - N;
519 |
520 | while (readLeft <= readRight) {
521 | int* nextPtr;
522 | if (((byte *) writeRight - (byte *) readRight) < N * sizeof(int)) {
523 | nextPtr = readRight;
524 | readRight -= N;
525 | } else {
526 | nextPtr = readLeft;
527 | readLeft += N;
528 | }
529 |
530 | PartitionBlock1V(LoadAlignedVector256(nextPtr), P, pBase, ref writeLeft, ref writeRight);
531 | }
532 |
533 | var boundary = writeLeft;
534 |
535 | // 3. Copy-back the 4 registers + remainder we partitioned in the beginning
536 | var leftTmpSize = (uint) (ulong) (tmpLeft - tmpStartLeft);
537 | Unsafe.CopyBlockUnaligned(boundary, tmpStartLeft, leftTmpSize*sizeof(int));
538 | boundary += leftTmpSize;
539 | var rightTmpSize = (uint) (ulong) (tmpStartRight - tmpRight);
540 | Unsafe.CopyBlockUnaligned(boundary, tmpRight, rightTmpSize*sizeof(int));
541 |
542 | // Shove the pivot back to the boundary
543 | var value = *boundary;
544 | *right = value;
545 | *boundary = pivot;
546 |
547 | Debug.Assert(boundary > left);
548 | Debug.Assert(boundary <= right);
549 |
550 | return boundary;
551 | }
552 |
553 | [MethodImpl(MethodImplOptions.AggressiveInlining|MethodImplOptions.AggressiveOptimization)]
554 | static void LoadAndPartition8Vectors(int* dataPtr, Vector256<int> P, byte* pBase, ref int* writeLeftPtr, ref int* writeRightPtr)
555 | {
556 | var N = Vector256<int>.Count; // Treated as constant @ JIT time
557 |
558 | var L0 = LoadAlignedVector256(dataPtr + 0 * N);
559 | var L1 = LoadAlignedVector256(dataPtr + 1 * N);
560 | var L2 = LoadAlignedVector256(dataPtr + 2 * N);
561 | var L3 = LoadAlignedVector256(dataPtr + 3 * N);
562 | var L4 = LoadAlignedVector256(dataPtr + 4 * N);
563 | var L5 = LoadAlignedVector256(dataPtr + 5 * N);
564 | var L6 = LoadAlignedVector256(dataPtr + 6 * N);
565 | var L7 = LoadAlignedVector256(dataPtr + 7 * N);
566 | PartitionBlock4V(P, L0, L1, L2, L3, pBase, ref writeLeftPtr, ref writeRightPtr);
567 | PartitionBlock4V(P, L4, L5, L6, L7, pBase, ref writeLeftPtr, ref writeRightPtr);
568 | }
569 |
570 | [MethodImpl(MethodImplOptions.AggressiveInlining|MethodImplOptions.AggressiveOptimization)]
571 | static void LoadAndPartition4Vectors(int* dataPtr, Vector256<int> P, byte* pBase, ref int* writeLeft, ref int* writeRight)
572 | {
573 | var N = Vector256<int>.Count; // Treated as constant @ JIT time
574 |
575 | var L0 = LoadAlignedVector256(dataPtr + 0 * N);
576 | var L1 = LoadAlignedVector256(dataPtr + 1 * N);
577 | var L2 = LoadAlignedVector256(dataPtr + 2 * N);
578 | var L3 = LoadAlignedVector256(dataPtr + 3 * N);
579 | PartitionBlock4V(P, L0, L1, L2, L3, pBase, ref writeLeft, ref writeRight);
580 | }
581 |
582 | [MethodImpl(MethodImplOptions.AggressiveInlining|MethodImplOptions.AggressiveOptimization)]
583 | static void PartitionBlock4V(Vector256<int> P, Vector256<int> L0, Vector256<int> L1, Vector256<int> L2,
584 | Vector256<int> L3, byte* pBase,
585 | ref int* writeLeft,
586 | ref int* writeRight)
587 | {
588 | PartitionBlock1V(L0, P, pBase, ref writeLeft, ref writeRight);
589 | PartitionBlock1V(L1, P, pBase, ref writeLeft, ref writeRight);
590 | PartitionBlock1V(L2, P, pBase, ref writeLeft, ref writeRight);
591 | PartitionBlock1V(L3, P, pBase, ref writeLeft, ref writeRight);
592 | }
593 |
594 | [MethodImpl(MethodImplOptions.AggressiveInlining|MethodImplOptions.AggressiveOptimization)]
595 | static void PartitionBlock1V(Vector256<int> L0, Vector256<int> P, byte* pBase, ref int* writeLeft, ref int* writeRight)
596 | {
597 | // Looks kinda silly, the (ulong) (uint) thingy right?
598 | // Well, it's making a yucky lemonade out of lemons is what it is.
599 | // This is a crappy way of making the jit generate slightly less worse code
600 | // due to: https://github.com/dotnet/runtime/issues/431#issuecomment-568280829
601 | // To summarize: VMOVMASK is mis-understood as a 32-bit write by the CoreCLR 3.x JIT.
602 | // It's really a 64 bit write in 64 bit mode, in other words, it clears the entire register.
603 | // Again, the JIT *should* be aware that the destination register just had it's top 32 bits cleared.
604 | // It doesn't.
605 | // This causes a variety of issues, here it's that GetBytePermutation* method is generated
606 | // with suboptimal x86 code (see above issue/comment).
607 | // By forcefully clearing the 32-top bits by casting to ulong, we "help" the JIT further down the road
608 | // and the rest of the code is generated more cleanly.
609 | // In other words, until the issue is resolved we "pay" with a 2-byte instruction for this useless cast
610 | // But this helps the JIT generate slightly better code below (saving 3 bytes).
611 | var m0 = (ulong) (uint) MoveMask(CompareGreaterThan(L0, P).AsSingle());
612 | L0 = PermuteVar8x32(L0, GetBytePermutationAligned(pBase, m0));
613 | // We make sure the last use of m0 is for this PopCount operation. Why?
614 | // Again, this is to get the best code generated on an Intel CPU. This round it's intel's fault, yay.
615 | // There's a relatively well know CPU errata where POPCNT has a false dependency on the destination operand.
616 | // The JIT is already aware of this, so it will clear the destination operand before emitting a POPCNT:
617 | // https://github.com/dotnet/coreclr/issues/19555
618 | // By "delaying" the PopCount to this stage, it is highly likely (I don't know why, I just know it is...)
619 | // that the JIT will emit a POPCNT X,X instruction, where X is now both the source and the destination
620 | // for PopCount. This means that there is no need for clearing the destination register (it would even be
621 | // an error to do so). This saves about two bytes in the instruction stream.
622 | var pc = -((long) (int) PopCount(m0));
623 | Store(writeLeft, L0);
624 | Store(writeRight, L0);
625 | // I comfortably ignored having negated the PopCount result after casting to (long)
626 | // The reasoning behind this is that be storing the PopCount as a negative
627 | // while also expressing the pointer bumping (next two lines) in this very specific form that
628 | // it is expressed: a summation of two variables with an optional constant (that CAN be negative)
629 | // We are allowing the JIT to encode this as two LEA opcodes in x64: https://www.felixcloutier.com/x86/lea
630 | // This saves a considerable amount of space in the instruction stream, which are then exploded
631 | // when this block is unrolled. All in all this is has a very clear benefit in perf while decreasing code
632 | // size.
633 | // TODO: Currently the entire sorting operation generates a right-hand popcount that needs to be negated
634 | // If/When I re-write it to do left-hand comparison/pop-counting we can save another two bytes
635 | // for the negation operation, which will also do its share to speed things up while lowering
636 | // the native code size, yay for future me!
637 | writeRight = writeRight + pc;
638 | writeLeft = writeLeft + pc + V256_N;
639 | }
640 |
641 | /// <summary>
642 | /// Partition using Vectorized AVX2 intrinsics
643 | /// </summary>
644 | /// <param name="left">pointer (inclusive) to the first element to partition</param>
645 | /// <param name="right">pointer (exclusive) to the last element to partition, actually points to where the pivot before partitioning</param>
646 | /// <param name="hint">alignment instructions</param>
647 | /// <returns>Position of the pivot that was passed to the function inside the array</returns>
648 | [MethodImpl(MethodImplOptions.AggressiveOptimization)]
649 | internal int* Partition1VectorInPlace(int* left, int* right, long hint)
650 | {
651 | Debug.Assert((((ulong) left) & 0x3) == 0);
652 | Debug.Assert((((ulong) right) & 0x3) == 0);
653 | // Vectorized double-pumped (dual-sided) partitioning:
654 | // We start with picking a pivot using the median-of-3 "method"
655 | // Once we have sensible pivot stored as the last element of the array
656 | // We process the array from both ends.
657 | //
658 | // To get this rolling, we first read 2 Vector256 elements from the left and
659 | // another 2 from the right, and store them in some temporary space
660 | // in order to leave enough "space" inside the vector for storing partitioned values.
661 | // Why 2 from each side? Because we need n+1 from each side
662 | // where n is the number of Vector256 elements we process in each iteration...
663 | // The reasoning behind the +1 is because of the way we decide from *which*
664 | // side to read, we may end up reading up to one more vector from any given side
665 | // and writing it in its entirety to the opposite side (this becomes slightly clearer
666 | // when reading the code below...)
667 | // Conceptually, the bulk of the processing looks like this after clearing out some initial
668 | // space as described above:
669 |
670 | // [.............................................................................]
671 | // ^wl ^rl rr^ wr^
672 | // Where:
673 | // wl = writeLeft
674 | // rl = readLeft
675 | // rr = readRight
676 | // wr = writeRight
677 |
678 | // In every iteration, we select what side to read from based on how much
679 | // space is left between head read/write pointer on each side...
680 | // We read from where there is a smaller gap, e.g. that side
681 | // that is closer to the unfortunate possibility of its write head overwriting
682 | // its read head... By reading from THAT side, we're ensuring this does not happen
683 |
684 | // An additional unfortunate complexity we need to deal with is that the right pointer
685 | // must be decremented by another Vector256.Count elements
686 | // Since the Load/Store primitives obviously accept start addresses
687 | var N = Vector256<int>.Count; // Treated as constant @ JIT time
688 | var pivot = *right;
689 | // We do this here just in case we need to pre-align to the right
690 | // We end up
691 | *right = int.MaxValue;
692 |
693 | var readLeft = left;
694 | var readRight = right;
695 | var writeLeft = readLeft;
696 | var writeRight = readRight - N;
697 |
698 | var tmpStartLeft = _tempStart;
699 | var tmpLeft = tmpStartLeft;
700 | var tmpStartRight = _tempEnd;
701 | var tmpRight = tmpStartRight;
702 |
703 | // Broadcast the selected pivot
704 | var P = Vector256.Create(pivot);
705 | var pBase = BytePermTableAlignedPtr;
706 | tmpRight -= N;
707 |
708 | // the read heads always advance by 8 elements, or 32 bytes,
709 | // We can spend some extra time here to align the pointers
710 | // so they start at a cache-line boundary
711 | // Once that happens, we can read with Avx.LoadAlignedVector256
712 | // And also know for sure that our reads will never cross cache-lines
713 | // Otherwise, 50% of our AVX2 Loads will need to read from two cache-lines
714 |
715 | var leftAlign = unchecked((int) (hint & 0xFFFFFFFF));
716 | var rightAlign = unchecked((int) (hint >> 32));
717 |
718 | var preAlignedLeft = left + leftAlign;
719 | var preAlignedRight = right + rightAlign - N;
720 |
721 | // We preemptively go through the motions of
722 | // vectorized alignment, and at worst we re-neg
723 | // by not advancing the various read/tmp pointers
724 | // as if nothing ever happened if the conditions
725 | // are wrong from vectorized alignment
726 | var RT0 = LoadAlignedVector256(preAlignedRight);
727 | var LT0 = LoadAlignedVector256(preAlignedLeft);
728 | var rtMask = (uint) MoveMask(CompareGreaterThan(RT0, P).AsSingle());
729 | var ltMask = (uint) MoveMask(CompareGreaterThan(LT0, P).AsSingle());
730 | var rtPopCount = Math.Max(PopCount(rtMask), (uint) rightAlign);
731 | var ltPopCount = PopCount(ltMask);
732 | RT0 = PermuteVar8x32(RT0, GetBytePermutationAligned(pBase, rtMask));
733 | LT0 = PermuteVar8x32(LT0, GetBytePermutationAligned(pBase, ltMask));
734 | Avx.Store(tmpRight, RT0);
735 | Avx.Store(tmpLeft, LT0);
736 |
737 | var rightAlignMask = ~((rightAlign - 1) >> 31);
738 | var leftAlignMask = leftAlign >> 31;
739 |
740 | tmpRight -= rtPopCount & rightAlignMask;
741 | rtPopCount = V256_N - rtPopCount;
742 | readRight += (rightAlign - N) & rightAlignMask;
743 |
744 | Avx.Store(tmpRight, LT0);
745 | tmpRight -= ltPopCount & leftAlignMask;
746 | ltPopCount = V256_N - ltPopCount;
747 | tmpLeft += ltPopCount & leftAlignMask;
748 | tmpStartLeft += -leftAlign & leftAlignMask;
749 | readLeft += (leftAlign + N) & leftAlignMask;
750 |
751 | Avx.Store(tmpLeft, RT0);
752 | tmpLeft += rtPopCount & rightAlignMask;
753 | tmpStartRight -= rightAlign & rightAlignMask;
754 |
755 | if (leftAlign > 0) {
756 | tmpRight += N;
757 | readLeft = AlignLeftScalarUncommon(readLeft, pivot, ref tmpLeft, ref tmpRight);
758 | tmpRight -= N;
759 | }
760 |
761 | if (rightAlign < 0) {
762 | tmpRight += N;
763 | readRight = AlignRightScalarUncommon(readRight, pivot, ref tmpLeft, ref tmpRight);
764 | tmpRight -= N;
765 | }
766 | Debug.Assert(((ulong) readLeft & ALIGN_MASK) == 0);
767 | Debug.Assert(((ulong) readRight & ALIGN_MASK) == 0);
768 |
769 | Debug.Assert((((byte *) readRight - (byte *) readLeft) % (long) ALIGN) == 0);
770 | Debug.Assert((readRight - readLeft) >= EIGHTH_SLACK_PER_SIDE_IN_ELEMENTS * 2);
771 |
772 | // Read ahead from left+right
773 | LT0 = LoadAlignedVector256(readLeft + 0*N);
774 | RT0 = LoadAlignedVector256(readRight - 1*N);
775 |
776 | // Adjust for the reading that was made above
777 | readLeft += 1*N;
778 | readRight -= 2*N;
779 |
780 | ltMask = (uint) MoveMask(CompareGreaterThan(LT0, P).AsSingle());
781 | rtMask = (uint) MoveMask(CompareGreaterThan(RT0, P).AsSingle());
782 |
783 | ltPopCount = PopCount(ltMask);
784 | rtPopCount = PopCount(rtMask);
785 |
786 | LT0 = PermuteVar8x32(LT0, GetBytePermutationAligned(pBase, ltMask));
787 | RT0 = PermuteVar8x32(RT0, GetBytePermutationAligned(pBase, rtMask));
788 |
789 | Store(tmpRight, LT0);
790 | tmpRight -= ltPopCount;
791 | ltPopCount = V256_N - ltPopCount;
792 | Store(tmpRight, RT0);
793 | tmpRight -= rtPopCount;
794 | rtPopCount = V256_N - rtPopCount;
795 | tmpRight += N;
796 |
797 | Store(tmpLeft, LT0);
798 | tmpLeft += ltPopCount;
799 | Store(tmpLeft, RT0);
800 | tmpLeft += rtPopCount;
801 |
802 | while (readRight >= readLeft) {
803 |
804 | int* nextPtr;
805 | if (((byte *) writeRight - (byte *) readRight) < N * sizeof(int)) {
806 | nextPtr = readRight;
807 | readRight -= N;
808 | } else {
809 | nextPtr = readLeft;
810 | readLeft += N;
811 | }
812 |
813 | PartitionBlock1V(LoadAlignedVector256(nextPtr), P, pBase, ref writeLeft, ref writeRight);
814 | }
815 |
816 | var boundary = writeLeft;
817 |
818 | // 3. Copy-back the 4 registers + remainder we partitioned in the beginning
819 | var leftTmpSize = (uint) (ulong) (tmpLeft - tmpStartLeft);
820 | Unsafe.CopyBlockUnaligned(boundary, tmpStartLeft, leftTmpSize*sizeof(int));
821 | boundary += leftTmpSize;
822 | var rightTmpSize = (uint) (ulong) (tmpStartRight - tmpRight);
823 | Unsafe.CopyBlockUnaligned(boundary, tmpRight, rightTmpSize*sizeof(int));
824 |
825 | // Shove the pivot back to the boundary
826 | var value = *boundary;
827 | *right = value;
828 | *boundary = pivot;
829 |
830 | Debug.Assert(boundary > left);
831 | Debug.Assert(boundary <= right);
832 |
833 | return boundary;
834 | }
835 |
/// <summary>
/// Called when the left-hand side of the entire array does not have enough elements
/// for us to align the memory with vectorized operations, so we fall back to this
/// uncommon, slower scalar alternative.
/// Generally speaking this is probably called for all the partitioning calls on the
/// left edge of the array.
/// </summary>
/// <param name="readLeft">Pointer to the first unread element on the left side.</param>
/// <param name="pivot">The pivot value being partitioned around.</param>
/// <param name="tmpLeft">Write head for elements &lt;= pivot; advances forward.</param>
/// <param name="tmpRight">Write head for elements &gt; pivot; advances backward.</param>
/// <returns>The advanced, now ALIGN-aligned, left read pointer.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static int* AlignLeftScalarUncommon(int* readLeft, int pivot, ref int* tmpLeft, ref int* tmpRight)
{
    // Already sitting on an ALIGN boundary: nothing to do.
    if (((ulong) readLeft & ALIGN_MASK) == 0)
        return readLeft;

    // Round up to the next ALIGN boundary and scalar-partition every
    // element between here and there into the temporary areas.
    var alignedEnd = (int*) (((ulong) readLeft + ALIGN) & ~ALIGN_MASK);
    for (; readLeft < alignedEnd; readLeft++) {
        var current = *readLeft;
        if (current > pivot) {
            *--tmpRight = current;
        } else {
            *tmpLeft++ = current;
        }
    }

    return readLeft;
}
864 |
/// <summary>
/// Called when the right-hand side of the entire array does not have enough elements
/// for us to align the memory with vectorized operations, so we fall back to this
/// uncommon, slower scalar alternative.
/// Generally speaking this is probably called for all the partitioning calls on the
/// right edge of the array.
/// </summary>
/// <param name="readRight">Pointer one past the last unread element on the right side.</param>
/// <param name="pivot">The pivot value being partitioned around.</param>
/// <param name="tmpLeft">Write head for elements &lt;= pivot; advances forward.</param>
/// <param name="tmpRight">Write head for elements &gt; pivot; advances backward.</param>
/// <returns>The retreated, now ALIGN-aligned, right read pointer.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
static int* AlignRightScalarUncommon(int* readRight, int pivot, ref int* tmpLeft, ref int* tmpRight)
{
    // Already sitting on an ALIGN boundary: nothing to do.
    if (((ulong) readRight & ALIGN_MASK) == 0)
        return readRight;

    // Round down to the previous ALIGN boundary and scalar-partition every
    // element between there and here into the temporary areas.
    var alignedStart = (int*) ((ulong) readRight & ~ALIGN_MASK);
    while (alignedStart < readRight) {
        var current = *--readRight;
        if (current > pivot) {
            *--tmpRight = current;
        } else {
            *tmpLeft++ = current;
        }
    }

    return readRight;
}
893 | }
894 | }
895 | }
896 |
--------------------------------------------------------------------------------