├── .clang-format
├── .github
│   └── workflows
│       └── build.yml
├── fuzz.cpp
├── LICENSE.md
├── tests.cpp
├── benchmark.cpp
├── README.md
├── nanosort.hpp
└── extern
    ├── hybrid_qsort.h
    └── pdqsort.h

/.clang-format:
--------------------------------------------------------------------------------
BasedOnStyle: Google
--------------------------------------------------------------------------------

/.github/workflows/build.yml:
--------------------------------------------------------------------------------
name: build

on: [push, pull_request]

jobs:
  unix:
    strategy:
      matrix:
        os: [ubuntu, macos]
    name: ${{matrix.os}}
    runs-on: ${{matrix.os}}-latest
    steps:
    - uses: actions/checkout@v1
    - name: test
      run: |
        g++ tests.cpp -o tests
        ./tests

  windows:
    runs-on: windows-latest
    steps:
    - uses: actions/checkout@v1
    - uses: ilammy/msvc-dev-cmd@v1
    - name: test
      shell: bash # necessary for fail-fast
      run: |
        cl tests.cpp
        ./tests
--------------------------------------------------------------------------------

/fuzz.cpp:
--------------------------------------------------------------------------------
// This file is part of nanosort library; see nanosort.hpp for license details
#include "nanosort.hpp"

#include <assert.h>
#include <stdint.h>

#include <algorithm>
#include <functional>
#include <vector>
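// Note: this is a libFuzzer-style harness; a typical build command (the exact
// flags are illustrative and not taken from this repository's CI) would be:
//   clang++ -O2 -g -fsanitize=fuzzer,address fuzz.cpp -o fuzz && ./fuzz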
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* Data, size_t Size) {
  typedef uint16_t T;

  const T* elements = reinterpret_cast<const T*>(Data);
  size_t count = Size / sizeof(T);

  std::vector<T> ss(elements, elements + count);
  std::vector<T> ns(elements, elements + count);
  std::vector<T> hs(elements, elements + count);

  std::sort(ss.begin(), ss.end());
  nanosort(ns.begin(), ns.end());
  nanosort_detail::heap_sort(hs.begin(), hs.end(), std::less<T>());

  assert(ss == ns);
  assert(ss == hs);
  return 0;
}
--------------------------------------------------------------------------------

/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Arseny Kapoulkine

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/tests.cpp:
--------------------------------------------------------------------------------
// This file is part of nanosort library; see nanosort.hpp for license details
#include <assert.h>

#include <algorithm>
#include <functional>
#include <vector>

#include "nanosort.hpp"

template <typename T, typename Compare = std::less<T> >
void test_sort(const std::vector<T>& a, Compare comp = std::less<T>()) {
  std::vector<T> hs = a;
  nanosort_detail::heap_sort(hs.begin(), hs.end(), comp);

  assert(std::is_sorted(hs.begin(), hs.end(), comp));

  std::vector<T> ss = a;
  nanosort_detail::small_sort<T>(ss.begin(), ss.end(), comp);

  assert(std::is_sorted(ss.begin(), ss.end(), comp));

  std::vector<T> ns = a;
  nanosort(ns.begin(), ns.end(), comp);

  assert(std::is_sorted(ns.begin(), ns.end(), comp));

  std::vector<T> es = a;
  std::stable_sort(es.begin(), es.end());
  std::stable_sort(ns.begin(), ns.end());
  std::stable_sort(hs.begin(), hs.end());
  std::stable_sort(ss.begin(), ss.end());

  assert(es == ns);
  assert(es == hs);
  assert(es == ss);
}
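// The CI workflow (.github/workflows/build.yml) builds and runs this file
// directly, e.g. g++ tests.cpp -o tests && ./tests; there is no test framework
// dependency, so failures surface as assert() aborts.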
int main() {
  const size_t N = 1000;

  {
    std::vector<int> A(N);
    for (size_t i = 0; i < N; ++i) A[i] = i;
    test_sort(A);
    test_sort(A, std::greater<int>());
  }

  {
    std::vector<int> A(N);
    for (size_t i = 0; i < N; ++i) A[i] = N - i;
    test_sort(A);
    test_sort(A, std::greater<int>());
  }

  {
    std::vector<float> A(N);
    for (size_t i = 0; i < N; ++i) A[i] = float(N - i);
    test_sort(A);
    test_sort(A, std::greater<float>());
  }

  {
    std::vector<unsigned> A(N);
    for (size_t i = 0; i < N; ++i) A[i] = i * 123456789;
    test_sort(A);
    test_sort(A, std::greater<unsigned>());
  }

  {
    std::vector<int> A(N);
    for (size_t i = 0; i < N; ++i) A[i] = 0;
    test_sort(A);
    test_sort(A, std::greater<int>());
  }

  {
    std::vector<int> A(N);
    for (size_t i = 0; i < N; ++i) A[i] = i % 16;
    test_sort(A);
    test_sort(A, std::greater<int>());
  }

  {
    std::vector<int> A;
    test_sort(A);
  }
}
--------------------------------------------------------------------------------

/benchmark.cpp:
--------------------------------------------------------------------------------
// This file is part of nanosort library; see nanosort.hpp for license details
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#include <algorithm>
#include <string>
#include <vector>

#include "extern/hybrid_qsort.h"
#include "extern/pdqsort.h"
#include "nanosort.hpp"

const double kBenchRun = 0.1;

#if defined(__linux__)
double timestamp() {
  timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return double(ts.tv_sec) + 1e-9 * double(ts.tv_nsec);
}
#elif defined(_WIN32)
struct LARGE_INTEGER {
  __int64 QuadPart;
};
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceCounter(
    LARGE_INTEGER* lpPerformanceCount);
extern "C" __declspec(dllimport) int __stdcall QueryPerformanceFrequency(
    LARGE_INTEGER* lpFrequency);

double timestamp() {
  LARGE_INTEGER freq, counter;
  QueryPerformanceFrequency(&freq);
  QueryPerformanceCounter(&counter);
  return double(counter.QuadPart) / double(freq.QuadPart);
}
#else
double timestamp() { return double(clock()) / double(CLOCKS_PER_SEC); }
#endif

typedef struct {
  uint64_t state;
  uint64_t inc;
} pcg32_random_t;

uint32_t pcg32_random_r(pcg32_random_t* rng) {
  uint64_t oldstate = rng->state;
  // Advance internal state
  rng->state = oldstate * 6364136223846793005ULL + (rng->inc | 1);
  // Calculate output function (XSH RR), uses old state for max ILP
  uint32_t xorshifted = ((oldstate >> 18u) ^ oldstate) >> 27u;
  uint32_t rot = oldstate >> 59u;
  return (xorshifted >> rot) | (xorshifted << ((-rot) & 31));
}

template <typename Sort, typename T>
double runbench(Sort sort, const std::vector<T>& data) {
  double divider = data.size() * log2(double(data.size()));

  std::vector<T> copy(data.size());

  double time = 0;
  double start = timestamp();

  while (timestamp() - start < kBenchRun) {
    copy = data;

    double ts0 = timestamp();
    sort(copy.begin(), copy.end());
    double ts1 = timestamp();

    if (ts1 - ts0 < time || time == 0) time = ts1 - ts0;
  }

  return time * 1e9 / divider;
}

template <typename T>
void bench(const std::string& name, const std::vector<T>& data) {
  double t1 = runbench([](auto beg, auto end) { std::sort(beg, end); }, data);
  double t2 = runbench([](auto beg, auto end) { pdqsort(beg, end); }, data);
  double t3 = runbench(
      [](auto beg, auto end) { exp_gerbens::QuickSort(beg, end); }, data);
  double t4 = runbench([](auto beg, auto end) { nanosort(beg, end); }, data);

  printf("%s | %.2f ns/op | %.2f ns/op | %.2f ns/op | %.2f ns/op\n",
         name.c_str(), t1, t2, t3, t4);
}

struct Pair {
  uint32_t key;
  uint32_t value;

  bool operator<(const Pair& other) const { return key < other.key; }
};

struct PairString {
  const char* key;
  uint32_t value;

  bool operator<(const PairString& other) const {
    return strcmp(key, other.key) < 0;
  }
};

int main() {
  pcg32_random_t rng = {42, 0};
  std::vector<uint32_t> test(1000000);

  for (size_t i = 0; i < test.size(); ++i) test[i] = pcg32_random_r(&rng);
  bench("random int", test);

  for (size_t i = 0; i < test.size(); ++i) test[i] = uint32_t(i);
  bench("sorted int", test);

  for (size_t i = 0; i < 100; ++i) test[pcg32_random_r(&rng) % test.size()] = 0;
  bench("sroted int", test);

  for (size_t i = 0; i < test.size(); ++i)
    test[i] = (i % 100 == 0) ? pcg32_random_r(&rng) : test[i - 1] + 1;
  bench("run100 int", test);

  for (size_t i = 0; i < test.size(); ++i) test[i] = uint32_t(test.size() - i);
  bench("sortre int", test);

  for (size_t i = 0; i < test.size(); ++i)
    test[i] = pcg32_random_r(&rng) % 1000;
  bench("eq1000 int", test);

  std::vector<Pair> test2(test.size());

  for (size_t i = 0; i < test.size(); ++i) test2[i].key = pcg32_random_r(&rng);
  bench("randompair", test2);

  std::vector<std::string> dict;
  for (size_t i = 0; i < test.size(); ++i) dict.push_back(std::to_string(i));

  std::vector<PairString> test3(test.size());
  for (size_t i = 0; i < test.size(); ++i)
    test3[i].key = dict[pcg32_random_r(&rng) % dict.size()].c_str();
  bench("randomstrp", test3);

  std::vector<float> test4(test.size());
  for (size_t i = 0; i < test.size(); ++i)
    test4[i] = float(pcg32_random_r(&rng) % test.size());
  bench("random flt", test4);

  std::vector<std::string> test5(test.size());
  for (size_t i = 0; i < test.size(); ++i)
    test5[i] = "longprefixtopushtoheap" + std::to_string(pcg32_random_r(&rng));
  bench("randomstr!", test5);
}
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# nanosort [![Actions Status](https://github.com/zeux/nanosort/workflows/build/badge.svg)](https://github.com/zeux/nanosort/actions) ![MIT](https://img.shields.io/badge/license-MIT-blue.svg)

## Algorithm

nanosort aims to be a fast comparison-based sorting algorithm, tuned for POD types of reasonably small sizes. nanosort implements an algorithm similar to introsort: divide & conquer quicksort with small subarrays sorted using a quadratic sort, and a fallback to heap sort to guarantee worst-case NlogN execution time. Like the aforementioned algorithms, nanosort is *not* stable. To get high performance, nanosort uses the following techniques:

- Instead of classical partition algorithms, nanosort uses a Lomuto-inspired branchless partition (sketched below). Due to its unique construction, this partition delivers consistently excellent performance with minimal code size.
- Instead of classical insertion sort, nanosort uses a 2-at-a-time bubble sort discovered by Gerben Stavenga; a branchless implementation of this algorithm similarly results in excellent performance with reasonable code size.
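The following is the essence of the branchless partition, simplified from the `partition` loop in `nanosort.hpp` (pivot selection and the equal-element fallback are omitted):

```c++
// Moves elements with comp(x, pivot) true to the front; returns the split
// point. The swap runs on every iteration and `res` advances by the boolean
// comparison result, so the loop body contains no data-dependent branches.
template <typename T, typename It, typename Compare>
It partition(T pivot, It first, It last, Compare comp) {
  It res = first;
  for (It it = first; it != last; ++it) {
    bool r = comp(*it, pivot);
    swap(*res, *it);  // no-op whenever res == it
    res += r;
  }
  return res;
}
```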
To reach high performance, it's critical that the key loops in nanosort (`partition`, `small_sort`) as well as the `median5` selection network are compiled into efficient branchless code, using instructions similar to `setb` and `cmov`. Not all compilers can do this properly; as such, nanosort presently has variable performance across different compilers.

Crucially, nanosort guarantees a worst-case complexity of NlogN and does not run into undefined behavior even when the comparison function doesn't implement a strict weak ordering. This is in stark contrast to most STL implementations (for example, `libc++` has a worst-case complexity of O(N^2) on certain inputs; all STL implementations can crash, including performing out-of-bounds *writes*, when given an input array of floats that contains NaN values).
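For example, the following is safe with nanosort even though `operator<` on floats is not a strict weak ordering in the presence of NaNs (a small usage sketch; the resulting position of the NaN elements is unspecified, but the call completes in NlogN time without reading or writing out of bounds):

```c++
#include <math.h>

#include "nanosort.hpp"

int main() {
  float data[5] = {3.0f, NAN, 1.0f, NAN, 2.0f};
  nanosort(data, data + 5);
}
```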
nanosort values predictability of execution time: most sequences of a given size and type are going to take more or less the same amount of time to sort. Because of this, nanosort can lose to algorithms that detect sorted or partially sorted inputs, although even a fairly small number of random swaps in a sorted input is enough to make nanosort competitive with algorithms like pdqsort.

## Implementation

nanosort is implemented as a header-only library that should compile on any compiler that supports C++03; nanosort optionally supports C++11 (and will use move construction/assignment to reduce copy costs). nanosort has no dependencies, including STL.

nanosort compiles to ~1KB of x64 assembly code when using clang with -O2 and sorting an array of integers.

To use nanosort, include the header and call the `nanosort` function with or without a comparator:

```c++
#include "nanosort.hpp"

...
nanosort(data, data + count);
nanosort(data, data + count, std::greater<int>());
...
```
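Any callable that implements a strict weak ordering works as the comparator, so POD structs can be sorted by a key field without defining `operator<` (an illustrative sketch in the same fragment style; `Item` and `ItemByKey` are made-up names, not part of the library):

```c++
struct Item {
  unsigned key;
  unsigned payload;
};

struct ItemByKey {
  bool operator()(const Item& l, const Item& r) const { return l.key < r.key; }
};

...
nanosort(items, items + count, ItemByKey());  // C++03-compatible; with C++11, a lambda works too
...
```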
## Benchmarks

All benchmarks were run on an Intel Core i7-8700K.

All benchmarks sort POD data types except for `randomstr!`, which sorts std::string objects.

### clang 11 / libc++

nanosort performs very well on clang, beating the other sorts most of the time, with two notable exceptions:

- on "sorted int" and "sortre int", libc++ std::sort and pdqsort are linear while all the other sorts here are NlogN
- on "randomstr!", the cost of extra std::string copies in nanosort outweighs all improvements

benchmark | std::sort | pdqsort | exp_gerbens | nanosort
-----------|------------|------------|-------------|----------
random int | 2.61 ns/op | 1.26 ns/op | 0.86 ns/op | 0.79 ns/op
sorted int | 0.03 ns/op | 0.04 ns/op | 0.76 ns/op | 0.81 ns/op
sroted int | 0.46 ns/op | 0.79 ns/op | 11.03 ns/op | 0.69 ns/op
run100 int | 0.76 ns/op | 1.18 ns/op | 0.84 ns/op | 0.78 ns/op
sortre int | 0.07 ns/op | 0.08 ns/op | 6.98 ns/op | 0.74 ns/op
eq1000 int | 1.40 ns/op | 0.48 ns/op | 0.49 ns/op | 0.42 ns/op
randompair | 2.65 ns/op | 2.72 ns/op | 1.07 ns/op | 0.95 ns/op
randomstrp | 9.99 ns/op | 9.58 ns/op | 11.50 ns/op | 10.94 ns/op
random flt | 2.92 ns/op | 1.40 ns/op | 1.29 ns/op | 1.41 ns/op
randomstr! | 13.80 ns/op | 13.13 ns/op | 35.02 ns/op | 24.03 ns/op

### gcc 10 / libstdc++

gcc currently doesn't generate a proper branchless sequence for some of the algorithms, leading to worse performance on the random benchmarks compared to clang. nanosort still has good performance but doesn't win as convincingly.

The author plans to look into tuning nanosort to generate better code on gcc.

benchmark | std::sort | pdqsort | exp_gerbens | nanosort
-----------|------------|------------|-------------|----------
random int | 2.60 ns/op | 1.28 ns/op | 1.66 ns/op | 1.27 ns/op
sorted int | 0.54 ns/op | 0.03 ns/op | 1.06 ns/op | 0.85 ns/op
sroted int | 0.86 ns/op | 0.84 ns/op | 20.16 ns/op | 0.76 ns/op
run100 int | 0.98 ns/op | 1.22 ns/op | 1.30 ns/op | 0.96 ns/op
sortre int | 0.44 ns/op | 0.08 ns/op | 13.51 ns/op | 0.90 ns/op
eq1000 int | 1.60 ns/op | 0.48 ns/op | 0.86 ns/op | 0.46 ns/op
randompair | 2.77 ns/op | 2.77 ns/op | 1.79 ns/op | 2.48 ns/op
randomstrp | 9.59 ns/op | 8.90 ns/op | 12.09 ns/op | 11.11 ns/op
random flt | 2.89 ns/op | 1.33 ns/op | 1.83 ns/op | 1.44 ns/op
randomstr! | 15.29 ns/op | 12.82 ns/op | 35.67 ns/op | 22.12 ns/op

### MSVC 2019 / MSVC STL

MSVC does generate branchless code for most algorithms used by nanosort, but it has several severe codegen performance issues that lead to excessive serialization of execution and, as a result, much worse IPC on this code.

The author plans to work with Microsoft on fixing the code generation here, as it can be indicative of similar problems in other tight loops, and/or to implement workarounds in nanosort code.

benchmark | std::sort | pdqsort | exp_gerbens | nanosort
-----------|------------|------------|-------------|----------
random int | 3.18 ns/op | 2.63 ns/op | 1.97 ns/op | 2.00 ns/op
sorted int | 0.43 ns/op | 0.04 ns/op | 0.80 ns/op | 2.02 ns/op
sroted int | 0.84 ns/op | 0.59 ns/op | 9.65 ns/op | 1.77 ns/op
run100 int | 1.32 ns/op | 0.83 ns/op | 0.93 ns/op | 2.00 ns/op
sortre int | 0.51 ns/op | 0.05 ns/op | 9.16 ns/op | 1.92 ns/op
eq1000 int | 1.56 ns/op | 1.34 ns/op | 1.49 ns/op | 1.22 ns/op
randompair | 3.29 ns/op | 2.75 ns/op | 0.91 ns/op | 2.42 ns/op
randomstrp | 10.21 ns/op | 8.40 ns/op | 12.90 ns/op | 12.60 ns/op
random flt | 3.50 ns/op | 2.97 ns/op | 2.93 ns/op | 1.40 ns/op
randomstr! | 19.45 ns/op | 18.36 ns/op | 87.24 ns/op | 28.72 ns/op

## License

This library is available to anybody free of charge, under the terms of the MIT License (see LICENSE.md).
--------------------------------------------------------------------------------
/nanosort.hpp:
--------------------------------------------------------------------------------
/**
 * nanosort
 *
 * Copyright (C) 2021, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 * Report bugs and download new versions at https://github.com/zeux/nanosort
 *
 * This library is distributed under the MIT License. See notice at the end of
 * this file.
 *
 * Thank you to Andrei Alexandrescu for his branchless Lomuto partition code and
 * Gerben Stavenga for further research of branchless partitions; their work
 * inspired this algorithm.
 */
#pragma once

#include <assert.h>
#include <stddef.h>

#ifdef _MSC_VER
#define NANOSORT_NOINLINE __declspec(noinline)
#define NANOSORT_UNLIKELY(c) (c)
#else
#define NANOSORT_NOINLINE __attribute__((noinline))
#define NANOSORT_UNLIKELY(c) __builtin_expect(c, 0)
#endif

#if __cplusplus >= 201103L
#define NANOSORT_MOVE(v) static_cast<T&&>(v)
#else
#define NANOSORT_MOVE(v) v
#endif

namespace nanosort_detail {

struct Less {
  template <typename T>
  bool operator()(const T& l, const T& r) const {
    return l < r;
  }
};

template <typename It>
struct IteratorTraits {
  typedef typename It::value_type value_type;
};

template <typename T>
struct IteratorTraits<T*> {
  typedef T value_type;
};

template <typename T>
void swap(T& l, T& r) {
  T t(NANOSORT_MOVE(l));
  l = NANOSORT_MOVE(r);
  r = NANOSORT_MOVE(t);
}

// Return median of 5 elements in the array
template <typename T, typename It, typename Compare>
T median5(It first, It last, Compare comp) {
  size_t n = last - first;
  assert(n >= 5);

  T e0 = first[(n >> 2) * 0];
  T e1 = first[(n >> 2) * 1];
  T e2 = first[(n >> 2) * 2];
  T e3 = first[(n >> 2) * 3];
  T e4 = first[n - 1];

  if (comp(e1, e0)) swap(e1, e0);
  if (comp(e4, e3)) swap(e4, e3);
  if (comp(e3, e0)) swap(e3, e0);

  if (comp(e1, e4)) swap(e1, e4);
  if (comp(e2, e1)) swap(e2, e1);
  if (comp(e3, e2)) swap(e2, e3);

  if (comp(e2, e1)) swap(e2, e1);

  return e2;
}

// Split array into x<pivot and x>=pivot
template <typename T, typename It, typename Compare>
It partition(T pivot, It first, It last, Compare comp) {
  It res = first;
  for (It it = first; it != last; ++it) {
    bool r = comp(*it, pivot);
    swap(*res, *it);
    res += r;
  }
  return res;
}

// Split array into x<=pivot and x>pivot
template <typename T, typename It, typename Compare>
It partition_rev(T pivot, It first, It last, Compare comp) {
  It res = first;
  for (It it = first; it != last; ++it) {
    bool r = comp(pivot, *it);
    swap(*res, *it);
    res += !r;
  }
  return res;
}
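// Example: partitioning {3, 1, 4, 1, 5} with pivot 3 moves the elements less
// than the pivot to the front ({1, 1, ...}) and returns an iterator to the
// first element that is not less than the pivot. Both partition variants swap
// unconditionally and advance `res` by the comparison result, which lets
// compilers emit setb/cmov-style straight-line code instead of branches.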
// Push root down through the heap
template <typename It, typename Compare>
void heap_sift(It heap, size_t count, size_t root, Compare comp) {
  assert(count > 0);
  size_t last = (count - 1) >> 1;

  while (root < last) {
    assert(root * 2 + 2 < count);

    size_t next = root;
    next = comp(heap[next], heap[root * 2 + 1]) ? root * 2 + 1 : next;
    next = comp(heap[next], heap[root * 2 + 2]) ? root * 2 + 2 : next;

    if (next == root) break;
    swap(heap[root], heap[next]);
    root = next;
  }

  if (root == last && root * 2 + 1 < count &&
      comp(heap[root], heap[root * 2 + 1])) {
    swap(heap[root], heap[root * 2 + 1]);
  }
}

// Sort array using heap sort
template <typename It, typename Compare>
void heap_sort(It first, It last, Compare comp) {
  if (first == last) return;

  It heap = first;
  size_t count = last - first;

  for (size_t i = count / 2; i > 0; --i) {
    heap_sift(heap, count, i - 1, comp);
  }

  for (size_t i = count - 1; i > 0; --i) {
    swap(heap[0], heap[i]);
    heap_sift(heap, i, 0, comp);
  }
}

// Sort small arrays using Gerben Stavenga's 2-at-a-time bubble sort
template <typename T, typename It, typename Compare>
void small_sort(It first, It last, Compare comp) {
  size_t n = last - first;

  for (size_t i = n; i > 1; i -= 2) {
    T x = NANOSORT_MOVE(first[0]);
    T y = NANOSORT_MOVE(first[1]);
    if (comp(y, x)) swap(y, x);

    for (size_t j = 2; j < i; j++) {
      T z = NANOSORT_MOVE(first[j]);

      if (comp(x, z)) swap(x, z);
      if (comp(y, z)) swap(y, z);
      if (comp(y, x)) swap(y, x);

      first[j - 2] = NANOSORT_MOVE(z);
    }

    first[i - 2] = NANOSORT_MOVE(x);
    first[i - 1] = NANOSORT_MOVE(y);
  }
}

template <typename T, typename It, typename Compare>
void sort(It first, It last, size_t limit, Compare comp) {
  for (;;) {
    if (last - first < 16) {
      small_sort<T>(first, last, comp);
      return;
    }

    if (NANOSORT_UNLIKELY(limit == 0)) {
      heap_sort(first, last, comp);
      return;
    }

    T pivot = median5<T>(first, last, comp);
    It mid = partition(pivot, first, last, comp);

    // For skewed partitions compute new midpoint by separating equal elements
    It midr = mid;
    if (NANOSORT_UNLIKELY(mid - first <= (last - first) >> 3)) {
      midr = partition_rev(pivot, mid, last, comp);
    }

    // Per MSVC STL, this allows 1.5 log2(N) recursive steps
    limit = (limit >> 1) + (limit >> 2);

    if (mid - first <= last - midr) {
      sort<T>(first, mid, limit, comp);
      first = midr;
    } else {
      sort<T>(midr, last, limit, comp);
      last = mid;
    }
  }
}

}  // namespace nanosort_detail

template <typename It, typename Compare>
void nanosort(It first, It last, Compare comp) {
  typedef typename nanosort_detail::IteratorTraits<It>::value_type T;
  nanosort_detail::sort<T>(first, last, last - first, comp);
}

template <typename It>
void nanosort(It first, It last) {
  typedef typename nanosort_detail::IteratorTraits<It>::value_type T;
  nanosort_detail::sort<T>(first, last, last - first, nanosort_detail::Less());
}
/**
 * Copyright (c) 2021 Arseny Kapoulkine
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
--------------------------------------------------------------------------------

/extern/hybrid_qsort.h:
--------------------------------------------------------------------------------
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef EXPERIMENTAL_USERS_GERBENS_HYBRID_QSORT_H_
#define EXPERIMENTAL_USERS_GERBENS_HYBRID_QSORT_H_

#include <algorithm>
#include <cstddef>
#include <functional>
#include <type_traits>
#include <utility>

namespace exp_gerbens {

constexpr ptrdiff_t kSmallSortThreshold = 16;

// Returns median of first, middle, last
template <typename RandomIt, typename Compare = std::less<>>
auto MedianOfThree(RandomIt first, RandomIt last, Compare comp = std::less<>{}) {
  auto n = last - first;
  auto f = *first;
  auto m = first[n >> 1];
  auto l = last[-1];
  using std::swap;
  if (comp(m, f)) swap(f, m);
  if (comp(l, f)) swap(f, l);
  if (comp(l, m)) swap(l, m);
  return m;
}

template <typename RandomIt, typename Compare>
void BranchlessSwap(RandomIt a, RandomIt b, Compare comp) {
  auto x = *a;
  auto y = *b;
  if (comp(y, x)) std::swap(a, b);
  *a = x;
  *b = y;
}

// Moves median of first, middle, last to the end
template <typename RandomIt, typename Compare>
void MoveMedianOfThreeToEnd(RandomIt first, RandomIt last, Compare comp) {
  auto mid = first + ((last - first) >> 1);
  auto back = last - 1;
  BranchlessSwap(first, mid, comp);
  BranchlessSwap(first, back, comp);
  BranchlessSwap(back, mid, comp);
}

// BubbleSort works better: it has N(N-1)/2 stores, but x is updated in the
// inner loop, which is a cmp/cmov sequence, making the inner loop 2 cycles.
template <typename RandomIt, typename Compare = std::less<>>
void BubbleSort(RandomIt first, RandomIt last, Compare comp = std::less<>{}) {
  auto n = last - first;
  for (auto i = n; i > 1; i--) {
    auto x = first[0];
    for (decltype(n) j = 1; j < i; j++) {
      auto y = first[j];
      bool is_smaller = comp(y, x);
      first[j - 1] = is_smaller ? y : x;
      x = is_smaller ? x : y;
    }
    first[i - 1] = x;
  }
}

// BubbleSort2 bubbles two elements at a time. This means it's doing N(N+1)/4
// iterations and therefore far fewer stores. By ordering the cmov's correctly
// it is still possible to execute the inner loop in 2 cycles with respect to
// data dependencies. So in effect this cuts running time by 2x, even though
// it's not cutting the number of comparisons.
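// Example: one outer pass of BubbleSort2 over {4, 3, 2, 1} (i == 4) carries
// the two largest elements through the array as the (x, y) pair, producing
// {2, 1, 3, 4}; the next pass (i == 2) orders the remaining prefix, giving
// {1, 2, 3, 4}. Each pass places two elements, halving the number of passes.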
template <typename RandomIt, typename Compare = std::less<>>
void BubbleSort2(RandomIt first, RandomIt last, Compare comp = std::less<>{}) {
  auto n = last - first;
  for (auto i = n; i > 1; i -= 2) {
    auto x = first[0];
    auto y = first[1];
    if (y < x) std::swap(x, y);
    for (decltype(n) j = 2; j < i; j++) {
      auto z = first[j];
      bool is_smaller = comp(z, y);
      auto w = is_smaller ? z : y;
      y = is_smaller ? y : z;
      is_smaller = comp(z, x);
      first[j - 2] = is_smaller ? z : x;
      x = is_smaller ? x : w;
    }
    first[i - 2] = x;
    first[i - 1] = y;
  }
}

template <typename RandomIt, typename Compare>
void SmallSort(RandomIt first, RandomIt last, Compare comp) {
  BubbleSort2(first, last, comp);
}

template <typename It, typename ScratchIt, typename Compare>
ScratchIt PartitionInto(It first, It last, ScratchIt out, Compare comp) {
  auto n = last - first;
  auto pivot = first[n - 1];
  auto l = out + n - 1;
#ifdef __clang__
#pragma clang loop unroll_count(2)
#endif
  for (ptrdiff_t i = -(n - 1); i < 0; i++) {
    auto x = first[i + n - 1];
    bool is_larger = !comp(x, pivot);
    auto dest = is_larger ? 0 : i;
    l[dest] = x;
    l -= is_larger;
  }
  *l = pivot;
  return l;
}

template <typename RandomIt, typename ScratchIt, typename Compare>
void QuickSortScratch(RandomIt first, RandomIt last, ScratchIt scratch, Compare comp);

template <typename RandomIt, typename OutIt, typename Compare>
void QuickSortInto(RandomIt first, RandomIt last, OutIt out, Compare comp) {
  auto n = last - first;
  if (n > kSmallSortThreshold) {
    MoveMedianOfThreeToEnd(first, last, comp);
    auto p = PartitionInto(first, last, out, comp);
    QuickSortScratch(out, p, first, comp);
    QuickSortScratch(p + 1, out + n, first, comp);
  } else {
    SmallSort(first, last, comp);
    std::move(first, last, out);
  }
}

template <typename RandomIt, typename ScratchIt, typename Compare>
void QuickSortScratch(RandomIt first, RandomIt last, ScratchIt scratch, Compare comp) {
  auto n = last - first;
  if (n > kSmallSortThreshold) {
    MoveMedianOfThreeToEnd(first, last, comp);
    auto p = PartitionInto(first, last, scratch, comp);
    QuickSortInto(scratch, p, first, comp);
    first[p - scratch] = *p;
    QuickSortInto(p + 1, scratch + n, first + (p - scratch) + 1, comp);
  } else {
    SmallSort(first, last, comp);
  }
}

// Lomuto-inspired partitioning, except it's not in-place and therefore is
// much like bucket sort. It distributes as many elements of the interval
// [first, last) as possible into two buckets. The elements smaller than the
// pivot are distributed in-place at [first, ret). The elements larger than or
// equal to the pivot are distributed to the scratch buffer, filling it
// backwards. Execution stops when either the scratch buffer is full or all
// elements are processed.
template <typename T, typename RandomIt, typename ScratchIt, typename Compare>
RandomIt DistributeForward(T pivot, RandomIt first, RandomIt last, ScratchIt scratch,
                           ptrdiff_t scratch_size, Compare comp) {
  ptrdiff_t larger = 0;
  auto scratch_end = scratch + scratch_size - 1;
  while (first < last) {
    auto x = *first;
    bool is_larger = !comp(x, pivot);
    auto dest = is_larger ? &scratch_end[larger] : &first[larger];
    *dest = x;
    first++;
    larger -= is_larger;
    if (larger == -scratch_size) break;
  }
  return first + larger;
}
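// Example: with pivot 5 and scratch_size 2, DistributeForward over
// {7, 1, 9, 2, 8} compacts 1 in place, stores 7 and 9 into the scratch buffer
// (filled backwards), and stops once the scratch is full, returning an
// iterator just past the compacted smaller elements.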
// Same as above, only reversed. This fills the scratch buffer starting at the
// beginning.
template <typename T, typename RandomIt, typename ScratchIt, typename Compare>
RandomIt DistributeBackward(T pivot, RandomIt first, RandomIt last, ScratchIt scratch,
                            ptrdiff_t scratch_size, Compare comp) {
  ptrdiff_t smaller = 0;
  while (first < last) {
    --last;
    auto x = *last;
    bool is_smaller = comp(x, pivot);
    auto dest = is_smaller ? &scratch[smaller] : &last[smaller];
    *dest = x;
    smaller += is_smaller;
    if (smaller == scratch_size) break;
  }
  return last + smaller;
}

// New partition algorithm. It's a branch-"reduced" hybrid between the Hoare
// and a simplified Lomuto partitioning scheme. Lomuto partitioning works by
// ensuring that the first part of the array is properly partitioned with
// respect to the pivot and growing it by the next element, swapping if needed.
// Obviously there is also a reverse Lomuto partitioning scheme that works
// backwards, mutatis mutandis. Hoare's algorithm is more symmetrical, as it
// starts from both ends, working inwards while swapping elements. Lomuto's
// scheme can be implemented branch-free, but it has the overhead of doing two
// stores per iteration, necessary for a branchless implementation of swap.
// Furthermore, it runs into the problem that the load at the partition index
// potentially depends on previous stores, which quickly disables CPU
// load-store reordering.
//
// We can weaken the Lomuto partitioning scheme by unconditionally storing
// elements in one of two buckets. This is not so much partitioning as it is
// distributing: the algorithm distributes the elements over the two buckets
// based on the pivot. This is much simpler and cheaper. The bucket containing
// the elements smaller than the pivot can overlap with the array; however, we
// need a temporary buffer to hold the other elements. At the end we can copy
// the elements of the temporary buffer to the end of the array to achieve a
// partition. Note this would lead to a stable quicksort. Unfortunately such an
// algorithm would not be in-place, as it needs O(n) additional memory.
//
// Let's call this distribution algorithm L'; just like Lomuto, there is a
// reverse version of it as well. If we make our temporary buffer a small
// fixed-size buffer, we have to terminate the distribution when the fixed
// buffer is full, at which point only a part of the array will have been
// processed. Luckily we can leverage a modified version of Hoare's algorithm:
// applying L' backwards with another temporary buffer of the same fixed size
// will terminate with that buffer full. Now there is enough space in the array
// to swap the temporary buffers with their proper place in the array. What we
// get is a tunable Hoare algorithm that works bulk-wise; in the limiting case
// where the temporary buffers are of size 1, we recover the original Hoare
// algorithm.
//
// This scheme greatly improves on branchless Lomuto partitioning by reducing
// the amount of work that needs to be done in the inner loop, and it greatly
// improves on Hoare's algorithm by only hitting branch misses every N elements
// and swapping elements wholesale.
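// Concretely: DistributeForward first fills the external scratch buffer with
// larger-or-equal elements; the loop below then alternates DistributeBackward
// and DistributeForward, each pass reusing the hole the previous pass left in
// the array as the next scratch block, until the two frontiers meet, at which
// point the original scratch contents are moved into the hole at the split.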
template <ptrdiff_t kScratchSize, typename T, typename RandomIt, typename Compare>
RandomIt HoareLomutoHybridPartition(T pivot, RandomIt first, RandomIt last,
                                    T* scratch, Compare comp) {
  auto pfirst = DistributeForward(pivot, first, last, scratch, kScratchSize, comp);
  if (auto size = last - pfirst; size <= kScratchSize) {
    std::move(scratch + kScratchSize - size, scratch + kScratchSize, pfirst);
    return pfirst;
  }
  first = pfirst + kScratchSize;
  RandomIt res;
  while (true) {
    last = DistributeBackward(pivot, first, last, first - kScratchSize,
                              kScratchSize, comp) - kScratchSize;
    if (last <= first) {
      res = last;
      break;
    }
    first = DistributeForward(pivot, first, last, last, kScratchSize, comp) +
            kScratchSize;
    if (last <= first) {
      res = first - kScratchSize;
      break;
    }
  }
  std::move(scratch, scratch + kScratchSize, res);
  return res;
}

template <ptrdiff_t kScratchSize, typename RandomIt, typename T, typename Compare>
std::pair<RandomIt, RandomIt> ChoosePivotAndPartition(RandomIt first, RandomIt last,
                                                      T* scratch, Compare comp) {
  auto pivot = MedianOfThree(first, last, comp);
  auto res = HoareLomutoHybridPartition<kScratchSize>(pivot, first, last, scratch, comp);
  auto n = last - first;
  auto m = res - first;
  if (m < (n >> 3)) {
    // Fallback path, a surprisingly skewed partition has happened. Likely the
    // pivot has many identical elements.
    return {res, std::partition(res, last, [&](const T& p) { return !comp(pivot, p); })};
  }
  return {res, res};
}

template <ptrdiff_t kScratchSize, typename RandomIt, typename T, typename Compare>
void QuickSortImpl(RandomIt first, RandomIt last, T* scratch, Compare comp) {
  while (last - first > kScratchSize) {
    auto p = ChoosePivotAndPartition<kScratchSize>(first, last, scratch, comp);
    auto nleft = p.first - first;
    auto nright = last - p.second;
    // Recurse only on the smallest partition, guaranteeing O(log n) stack.
    if (nleft <= nright) {
      QuickSortImpl<kScratchSize>(first, p.first, scratch, comp);
      first = p.second;
    } else {
      QuickSortImpl<kScratchSize>(p.second, last, scratch, comp);
      last = p.first;
    }
  }
  // SmallSort(first, last, comp);
  QuickSortScratch(first, last, scratch, comp);
}

template <ptrdiff_t kScratchSize = 512, typename RandomIt, typename Compare>
void QuickSort(RandomIt first, RandomIt last, Compare comp) {
  static_assert(kScratchSize > 0, "Must have a positive scratch space size");
  // Note: the scratch size default was lost in transcription; 512 is a
  // plausible reconstruction, not a value confirmed by the upstream source.
  using T = typename std::decay<decltype(*first)>::type;
  T scratch[kScratchSize];
  QuickSortImpl<kScratchSize>(first, last, scratch, comp);
}

template <ptrdiff_t kScratchSize = 512, typename RandomIt>
void QuickSort(RandomIt first, RandomIt last) {
  QuickSort<kScratchSize>(first, last, std::less<>{});
}

}  // namespace exp_gerbens

#endif  // EXPERIMENTAL_USERS_GERBENS_HYBRID_QSORT_H_
--------------------------------------------------------------------------------

/extern/pdqsort.h:
--------------------------------------------------------------------------------
/*
    pdqsort.h - Pattern-defeating quicksort.

    Copyright (c) 2015 Orson Peters

    This software is provided 'as-is', without any express or implied warranty. In no event will the
    authors be held liable for any damages arising from the use of this software.

    Permission is granted to anyone to use this software for any purpose, including commercial
    applications, and to alter it and redistribute it freely, subject to the following restrictions:

    1. The origin of this software must not be misrepresented; you must not claim that you wrote the
       original software.
       If you use this software in a product, an acknowledgment in the product
       documentation would be appreciated but is not required.

    2. Altered source versions must be plainly marked as such, and must not be misrepresented as
       being the original software.

    3. This notice may not be removed or altered from any source distribution.
*/


#ifndef PDQSORT_H
#define PDQSORT_H

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iterator>
#include <utility>

#if __cplusplus >= 201103L
    #include <cstdint>
    #include <type_traits>
    #define PDQSORT_PREFER_MOVE(x) std::move(x)
#else
    #define PDQSORT_PREFER_MOVE(x) (x)
#endif


namespace pdqsort_detail {
    enum {
        // Partitions below this size are sorted using insertion sort.
        insertion_sort_threshold = 24,

        // Partitions above this size use Tukey's ninther to select the pivot.
        ninther_threshold = 128,

        // When we detect an already sorted partition, attempt an insertion sort that allows this
        // amount of element moves before giving up.
        partial_insertion_sort_limit = 8,

        // Must be multiple of 8 due to loop unrolling, and < 256 to fit in unsigned char.
        block_size = 64,

        // Cacheline size, assumes power of two.
        cacheline_size = 64

    };

#if __cplusplus >= 201103L
    template<class T> struct is_default_compare : std::false_type { };
    template<class T> struct is_default_compare<std::less<T>> : std::true_type { };
    template<class T> struct is_default_compare<std::greater<T>> : std::true_type { };
#endif

    // Returns floor(log2(n)), assumes n > 0.
    template<class T>
    inline int log2(T n) {
        int log = 0;
        while (n >>= 1) ++log;
        return log;
    }

    // Sorts [begin, end) using insertion sort with the given comparison function.
    template<class Iter, class Compare>
    inline void insertion_sort(Iter begin, Iter end, Compare comp) {
        typedef typename std::iterator_traits<Iter>::value_type T;
        if (begin == end) return;

        for (Iter cur = begin + 1; cur != end; ++cur) {
            Iter sift = cur;
            Iter sift_1 = cur - 1;

            // Compare first so we can avoid 2 moves for an element already positioned correctly.
            if (comp(*sift, *sift_1)) {
                T tmp = PDQSORT_PREFER_MOVE(*sift);

                do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
                while (sift != begin && comp(tmp, *--sift_1));

                *sift = PDQSORT_PREFER_MOVE(tmp);
            }
        }
    }

    // Sorts [begin, end) using insertion sort with the given comparison function. Assumes
    // *(begin - 1) is an element smaller than or equal to any element in [begin, end).
    template<class Iter, class Compare>
    inline void unguarded_insertion_sort(Iter begin, Iter end, Compare comp) {
        typedef typename std::iterator_traits<Iter>::value_type T;
        if (begin == end) return;

        for (Iter cur = begin + 1; cur != end; ++cur) {
            Iter sift = cur;
            Iter sift_1 = cur - 1;

            // Compare first so we can avoid 2 moves for an element already positioned correctly.
            if (comp(*sift, *sift_1)) {
                T tmp = PDQSORT_PREFER_MOVE(*sift);

                do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
                while (comp(tmp, *--sift_1));

                *sift = PDQSORT_PREFER_MOVE(tmp);
            }
        }
    }

    // Attempts to use insertion sort on [begin, end). Will return false if more than
    // partial_insertion_sort_limit elements were moved, and abort sorting. Otherwise it will
    // successfully sort and return true.
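    // For example, an input that is sorted except for a handful of displaced
    // elements finishes in a single pass here; pdqsort_loop relies on this to
    // return early when a balanced partition reports already_partitioned.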
    template<class Iter, class Compare>
    inline bool partial_insertion_sort(Iter begin, Iter end, Compare comp) {
        typedef typename std::iterator_traits<Iter>::value_type T;
        if (begin == end) return true;

        std::size_t limit = 0;
        for (Iter cur = begin + 1; cur != end; ++cur) {
            Iter sift = cur;
            Iter sift_1 = cur - 1;

            // Compare first so we can avoid 2 moves for an element already positioned correctly.
            if (comp(*sift, *sift_1)) {
                T tmp = PDQSORT_PREFER_MOVE(*sift);

                do { *sift-- = PDQSORT_PREFER_MOVE(*sift_1); }
                while (sift != begin && comp(tmp, *--sift_1));

                *sift = PDQSORT_PREFER_MOVE(tmp);
                limit += cur - sift;
            }

            if (limit > partial_insertion_sort_limit) return false;
        }

        return true;
    }

    template<class Iter, class Compare>
    inline void sort2(Iter a, Iter b, Compare comp) {
        if (comp(*b, *a)) std::iter_swap(a, b);
    }

    // Sorts the elements *a, *b and *c using comparison function comp.
    template<class Iter, class Compare>
    inline void sort3(Iter a, Iter b, Iter c, Compare comp) {
        sort2(a, b, comp);
        sort2(b, c, comp);
        sort2(a, b, comp);
    }

    template<class T>
    inline T* align_cacheline(T* p) {
#if defined(UINTPTR_MAX) && __cplusplus >= 201103L
        std::uintptr_t ip = reinterpret_cast<std::uintptr_t>(p);
#else
        std::size_t ip = reinterpret_cast<std::size_t>(p);
#endif
        ip = (ip + cacheline_size - 1) & -cacheline_size;
        return reinterpret_cast<T*>(ip);
    }

    template<class Iter>
    inline void swap_offsets(Iter first, Iter last,
                             unsigned char* offsets_l, unsigned char* offsets_r,
                             int num, bool use_swaps) {
        typedef typename std::iterator_traits<Iter>::value_type T;
        if (use_swaps) {
            // This case is needed for the descending distribution, where we need
            // to have proper swapping for pdqsort to remain O(n).
            for (int i = 0; i < num; ++i) {
                std::iter_swap(first + offsets_l[i], last - offsets_r[i]);
            }
        } else if (num > 0) {
            Iter l = first + offsets_l[0]; Iter r = last - offsets_r[0];
            T tmp(PDQSORT_PREFER_MOVE(*l)); *l = PDQSORT_PREFER_MOVE(*r);
            for (int i = 1; i < num; ++i) {
                l = first + offsets_l[i]; *r = PDQSORT_PREFER_MOVE(*l);
                r = last - offsets_r[i]; *l = PDQSORT_PREFER_MOVE(*r);
            }
            *r = PDQSORT_PREFER_MOVE(tmp);
        }
    }

    // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
    // to the pivot are put in the right-hand partition. Returns the position of the pivot after
    // partitioning and whether the passed sequence already was correctly partitioned. Assumes the
    // pivot is a median of at least 3 elements and that [begin, end) is at least
    // insertion_sort_threshold long. Uses branchless partitioning.
    template<class Iter, class Compare>
    inline std::pair<Iter, bool> partition_right_branchless(Iter begin, Iter end, Compare comp) {
        typedef typename std::iterator_traits<Iter>::value_type T;

        // Move pivot into local for speed.
        T pivot(PDQSORT_PREFER_MOVE(*begin));
        Iter first = begin;
        Iter last = end;

        // Find the first element greater than or equal than the pivot (the median of 3 guarantees
        // this exists).
        while (comp(*++first, pivot));

        // Find the first element strictly smaller than the pivot. We have to guard this search if
        // there was no element before *first.
        if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
        else                    while (                !comp(*--last, pivot));

        // If the first pair of elements that should be swapped to partition are the same element,
        // the passed in sequence already was correctly partitioned.
        bool already_partitioned = first >= last;
        if (!already_partitioned) {
            std::iter_swap(first, last);
            ++first;
        }

        // The following branchless partitioning is derived from "BlockQuicksort: How Branch
        // Mispredictions don’t affect Quicksort" by Stefan Edelkamp and Armin Weiss.
        unsigned char offsets_l_storage[block_size + cacheline_size];
        unsigned char offsets_r_storage[block_size + cacheline_size];
        unsigned char* offsets_l = align_cacheline(offsets_l_storage);
        unsigned char* offsets_r = align_cacheline(offsets_r_storage);
        int num_l, num_r, start_l, start_r;
        num_l = num_r = start_l = start_r = 0;

        while (last - first > 2 * block_size) {
            // Fill up offset blocks with elements that are on the wrong side.
            if (num_l == 0) {
                start_l = 0;
                Iter it = first;
                for (unsigned char i = 0; i < block_size;) {
                    offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                    offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                    offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                    offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                    offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                    offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                    offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                    offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
                }
            }
            if (num_r == 0) {
                start_r = 0;
                Iter it = last;
                for (unsigned char i = 0; i < block_size;) {
                    offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                    offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                    offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                    offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                    offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                    offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                    offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                    offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
                }
            }

            // Swap elements and update block sizes and first/last boundaries.
            int num = std::min(num_l, num_r);
            swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r,
                         num, num_l == num_r);
            num_l -= num; num_r -= num;
            start_l += num; start_r += num;
            if (num_l == 0) first += block_size;
            if (num_r == 0) last -= block_size;
        }

        int l_size = 0, r_size = 0;
        int unknown_left = (int)(last - first) - ((num_r || num_l) ? block_size : 0);
        if (num_r) {
            // Handle leftover block by assigning the unknown elements to the other block.
            l_size = unknown_left;
            r_size = block_size;
        } else if (num_l) {
            l_size = block_size;
            r_size = unknown_left;
        } else {
            // No leftover block, split the unknown elements in two blocks.
            l_size = unknown_left / 2;
            r_size = unknown_left - l_size;
        }

        // Fill offset buffers if needed.
        if (unknown_left && !num_l) {
            start_l = 0;
            Iter it = first;
            for (unsigned char i = 0; i < l_size;) {
                offsets_l[num_l] = i++; num_l += !comp(*it, pivot); ++it;
            }
        }
        if (unknown_left && !num_r) {
            start_r = 0;
            Iter it = last;
            for (unsigned char i = 0; i < r_size;) {
                offsets_r[num_r] = ++i; num_r += comp(*--it, pivot);
            }
        }

        int num = std::min(num_l, num_r);
        swap_offsets(first, last, offsets_l + start_l, offsets_r + start_r, num, num_l == num_r);
        num_l -= num; num_r -= num;
        start_l += num; start_r += num;
        if (num_l == 0) first += l_size;
        if (num_r == 0) last -= r_size;

        // We have now fully identified [first, last)'s proper position. Swap the last elements.
        if (num_l) {
            offsets_l += start_l;
            while (num_l--) std::iter_swap(first + offsets_l[num_l], --last);
            first = last;
        }
        if (num_r) {
            offsets_r += start_r;
            while (num_r--) std::iter_swap(last - offsets_r[num_r], first), ++first;
            last = first;
        }

        // Put the pivot in the right place.
        Iter pivot_pos = first - 1;
        *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
        *pivot_pos = PDQSORT_PREFER_MOVE(pivot);

        return std::make_pair(pivot_pos, already_partitioned);
    }

    // Partitions [begin, end) around pivot *begin using comparison function comp. Elements equal
    // to the pivot are put in the right-hand partition. Returns the position of the pivot after
    // partitioning and whether the passed sequence already was correctly partitioned. Assumes the
    // pivot is a median of at least 3 elements and that [begin, end) is at least
    // insertion_sort_threshold long.
    template<class Iter, class Compare>
    inline std::pair<Iter, bool> partition_right(Iter begin, Iter end, Compare comp) {
        typedef typename std::iterator_traits<Iter>::value_type T;

        // Move pivot into local for speed.
        T pivot(PDQSORT_PREFER_MOVE(*begin));

        Iter first = begin;
        Iter last = end;

        // Find the first element greater than or equal than the pivot (the median of 3 guarantees
        // this exists).
        while (comp(*++first, pivot));

        // Find the first element strictly smaller than the pivot. We have to guard this search if
        // there was no element before *first.
        if (first - 1 == begin) while (first < last && !comp(*--last, pivot));
        else                    while (                !comp(*--last, pivot));

        // If the first pair of elements that should be swapped to partition are the same element,
        // the passed in sequence already was correctly partitioned.
        bool already_partitioned = first >= last;

        // Keep swapping pairs of elements that are on the wrong side of the pivot. Previously
        // swapped pairs guard the searches, which is why the first iteration is special-cased
        // above.
        while (first < last) {
            std::iter_swap(first, last);
            while (comp(*++first, pivot));
            while (!comp(*--last, pivot));
        }

        // Put the pivot in the right place.
        Iter pivot_pos = first - 1;
        *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
        *pivot_pos = PDQSORT_PREFER_MOVE(pivot);

        return std::make_pair(pivot_pos, already_partitioned);
    }

    // Similar function to the one above, except elements equal to the pivot are put to the left of
    // the pivot and it doesn't check or return if the passed sequence already was partitioned.
    // Since this is rarely used (the many equal case), and in that case pdqsort already has O(n)
    // performance, no block quicksort is applied here for simplicity.
    template<class Iter, class Compare>
    inline Iter partition_left(Iter begin, Iter end, Compare comp) {
        typedef typename std::iterator_traits<Iter>::value_type T;

        T pivot(PDQSORT_PREFER_MOVE(*begin));
        Iter first = begin;
        Iter last = end;

        while (comp(pivot, *--last));

        if (last + 1 == end) while (first < last && !comp(pivot, *++first));
        else                 while (                !comp(pivot, *++first));

        while (first < last) {
            std::iter_swap(first, last);
            while (comp(pivot, *--last));
            while (!comp(pivot, *++first));
        }

        Iter pivot_pos = last;
        *begin = PDQSORT_PREFER_MOVE(*pivot_pos);
        *pivot_pos = PDQSORT_PREFER_MOVE(pivot);

        return pivot_pos;
    }


    template<class Iter, class Compare, bool Branchless>
    inline void pdqsort_loop(Iter begin, Iter end, Compare comp, int bad_allowed, bool leftmost = true) {
        typedef typename std::iterator_traits<Iter>::difference_type diff_t;

        // Use a while loop for tail recursion elimination.
        while (true) {
            diff_t size = end - begin;

            // Insertion sort is faster for small arrays.
            if (size < insertion_sort_threshold) {
                if (leftmost) insertion_sort(begin, end, comp);
                else unguarded_insertion_sort(begin, end, comp);
                return;
            }

            // Choose pivot as median of 3 or pseudomedian of 9.
            diff_t s2 = size / 2;
            if (size > ninther_threshold) {
                sort3(begin, begin + s2, end - 1, comp);
                sort3(begin + 1, begin + (s2 - 1), end - 2, comp);
                sort3(begin + 2, begin + (s2 + 1), end - 3, comp);
                sort3(begin + (s2 - 1), begin + s2, begin + (s2 + 1), comp);
                std::iter_swap(begin, begin + s2);
            } else sort3(begin + s2, begin, end - 1, comp);

            // If *(begin - 1) is the end of the right partition of a previous partition operation
            // there is no element in [begin, end) that is smaller than *(begin - 1). Then if our
            // pivot compares equal to *(begin - 1) we change strategy, putting equal elements in
            // the left partition, greater elements in the right partition. We do not have to
            // recurse on the left partition, since it's sorted (all equal).
            if (!leftmost && !comp(*(begin - 1), *begin)) {
                begin = partition_left(begin, end, comp) + 1;
                continue;
            }

            // Partition and get results.
            std::pair<Iter, bool> part_result =
                Branchless ? partition_right_branchless(begin, end, comp)
                           : partition_right(begin, end, comp);
            Iter pivot_pos = part_result.first;
            bool already_partitioned = part_result.second;

            // Check for a highly unbalanced partition.
            diff_t l_size = pivot_pos - begin;
            diff_t r_size = end - (pivot_pos + 1);
            bool highly_unbalanced = l_size < size / 8 || r_size < size / 8;

            // If we got a highly unbalanced partition we shuffle elements to break many patterns.
            if (highly_unbalanced) {
                // If we had too many bad partitions, switch to heapsort to guarantee O(n log n).
                if (--bad_allowed == 0) {
                    std::make_heap(begin, end, comp);
                    std::sort_heap(begin, end, comp);
                    return;
                }

                if (l_size >= insertion_sort_threshold) {
                    std::iter_swap(begin, begin + l_size / 4);
                    std::iter_swap(pivot_pos - 1, pivot_pos - l_size / 4);

                    if (l_size > ninther_threshold) {
                        std::iter_swap(begin + 1, begin + (l_size / 4 + 1));
                        std::iter_swap(begin + 2, begin + (l_size / 4 + 2));
                        std::iter_swap(pivot_pos - 2, pivot_pos - (l_size / 4 + 1));
                        std::iter_swap(pivot_pos - 3, pivot_pos - (l_size / 4 + 2));
                    }
                }

                if (r_size >= insertion_sort_threshold) {
                    std::iter_swap(pivot_pos + 1, pivot_pos + (1 + r_size / 4));
                    std::iter_swap(end - 1, end - r_size / 4);

                    if (r_size > ninther_threshold) {
                        std::iter_swap(pivot_pos + 2, pivot_pos + (2 + r_size / 4));
                        std::iter_swap(pivot_pos + 3, pivot_pos + (3 + r_size / 4));
                        std::iter_swap(end - 2, end - (1 + r_size / 4));
                        std::iter_swap(end - 3, end - (2 + r_size / 4));
                    }
                }
            } else {
                // If we were decently balanced and we tried to sort an already partitioned
                // sequence try to use insertion sort.
                if (already_partitioned && partial_insertion_sort(begin, pivot_pos, comp)
                                        && partial_insertion_sort(pivot_pos + 1, end, comp)) return;
            }

            // Sort the left partition first using recursion and do tail recursion elimination for
            // the right-hand partition.
            pdqsort_loop<Iter, Compare, Branchless>(begin, pivot_pos, comp, bad_allowed, leftmost);
            begin = pivot_pos + 1;
            leftmost = false;
        }
    }
}


template<class Iter, class Compare>
inline void pdqsort(Iter begin, Iter end, Compare comp) {
    if (begin == end) return;

#if __cplusplus >= 201103L
    pdqsort_detail::pdqsort_loop<Iter, Compare,
        pdqsort_detail::is_default_compare<typename std::decay<Compare>::type>::value &&
        std::is_arithmetic<typename std::iterator_traits<Iter>::value_type>::value>(
        begin, end, comp, pdqsort_detail::log2(end - begin));
#else
    pdqsort_detail::pdqsort_loop<Iter, Compare, false>(
        begin, end, comp, pdqsort_detail::log2(end - begin));
#endif
}

template<class Iter>
inline void pdqsort(Iter begin, Iter end) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    pdqsort(begin, end, std::less<T>());
}

template<class Iter, class Compare>
inline void pdqsort_branchless(Iter begin, Iter end, Compare comp) {
    if (begin == end) return;
    pdqsort_detail::pdqsort_loop<Iter, Compare, true>(
        begin, end, comp, pdqsort_detail::log2(end - begin));
}

template<class Iter>
inline void pdqsort_branchless(Iter begin, Iter end) {
    typedef typename std::iterator_traits<Iter>::value_type T;
    pdqsort_branchless(begin, end, std::less<T>());
}


#undef PDQSORT_PREFER_MOVE

#endif
--------------------------------------------------------------------------------