├── LICENSE ├── Makefile ├── README.md ├── benchmarks ├── benchmark.cpp ├── generators.h ├── performancecounters │ ├── apple_arm_events.h │ ├── benchmarker.h │ ├── event_counter.h │ └── linux-perf-events.h └── stream.cpp ├── gnuplot ├── README.md ├── icelake.data ├── icelake.datalehmer.pdf ├── icelake.datapcg64.pdf ├── icelake.dataratio.pdf ├── icelakestream.data ├── m2.data ├── m2.datalehmer.pdf ├── m2.datapcg64.pdf ├── m2.dataratio.pdf ├── m2stream.data └── plot.gnuplot ├── include ├── partial-shuffle-inl.h ├── random_bounded.h └── template_shuffle.h ├── src ├── batch_shuffle_dice.c ├── chacha.c ├── chacha.h ├── lehmer64.h ├── pcg64.h ├── random_bounded.c └── splitmix64.h └── tests └── basic.cpp /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2024 The batched_random authors 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
19 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: benchmark basic stream 2 | CXX=clang++ 3 | CC=clang 4 | benchmark: benchmarks/benchmark.cpp random_bounded.o 5 | $(CXX) $(CXXFLAGS) -std=c++17 -O3 -Wall -Wextra -o benchmark benchmarks/benchmark.cpp random_bounded.o -Iinclude -Ibenchmarks 6 | stream: benchmarks/stream.cpp random_bounded.o 7 | $(CXX) $(CXXFLAGS) -std=c++17 -O3 -Wall -Wextra -o stream benchmarks/stream.cpp random_bounded.o -Iinclude -Ibenchmarks 8 | basic : tests/basic.cpp random_bounded.o 9 | $(CXX) $(CXXFLAGS) -std=c++17 -O3 -Wall -Wextra -o basic tests/basic.cpp random_bounded.o -Iinclude 10 | random_bounded.o: src/batch_shuffle_dice.c src/random_bounded.c include/random_bounded.h src/lehmer64.h src/splitmix64.h 11 | $(CC) $(CFLAGS) -std=c11 -O3 -Wall -Wextra -Wconversion -c src/random_bounded.c 12 | 13 | clean: 14 | rm -f random_bounded.o benchmark basic stream -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Batched Random 2 | 3 | We benchmark fast shuffling functions using batched random index generation. 4 | It is meant for research purposes. Though we have good benchmarks and tests, 5 | this code is not meant to be production-ready. 6 | 7 | ### Reference 8 | 9 | * Nevin Brackett-Rozinsky, Daniel Lemire, [Batched Ranged Random Integer Generation](https://arxiv.org/abs/2408.06213), Software: Practice and Experience 55 (1), 2024. 10 | 11 | ### Requirements 12 | 13 | - Recent LLVM clang and clang++ compilers 14 | - Make 15 | 16 | ### Running Benchmarks 17 | 18 | 19 | ``` 20 | make 21 | ./benchmark 22 | ``` 23 | 24 | To get the C++ benchmarks, you can type `./benchmark --cpp`. They are disabled by default. 
/// Applies a precomputed Fisher-Yates style shuffle to `storage` in place.
///
/// Walks `i` from `size - 1` down to 1 and swaps `storage[i]` with
/// `storage[precomputed[i]]`. `precomputed[0]` is never read.
///
/// @param storage      array holding at least `size` 64-bit words.
/// @param size         number of elements to shuffle; 0 and 1 are no-ops.
/// @param precomputed  swap targets, one per position; every
///                     `precomputed[i]` that is read must be a valid index
///                     into `storage`.
void precomp_shuffle(uint64_t *storage, uint64_t size,
                     const uint32_t *precomputed) {
  // Decrement inside the loop body: starting from `size - 1` would wrap around
  // to a huge index when `size` is 0 and run far out of bounds.
  for (uint64_t i = size; i > 1;) {
    --i;
    const uint32_t nextpos = precomputed[i];
    const uint64_t tmp = storage[i];        // likely in cache
    const uint64_t val = storage[nextpos];  // could be costly
    storage[i] = val;
    storage[nextpos] = tmp;  // you might have to read this store later
  }
}
(collector.has_events()) { 40 | printf(" %5.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns()); 41 | printf(" %5.2f c/b ", agg.fastest_cycles() / bytes); 42 | printf(" %5.2f i/b ", agg.fastest_instructions() / bytes); 43 | printf(" %5.2f i/e ", agg.fastest_instructions() / volume); 44 | printf(" %5.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles()); 45 | } 46 | printf("\n"); 47 | } 48 | 49 | void bench(size_t size, bool include_cpp) { 50 | constexpr size_t min_volume = 4096; 51 | if (size == 0) { 52 | return; 53 | } 54 | size_t volume = size; 55 | if (size < min_volume) { 56 | volume *= min_volume / size; 57 | } 58 | std::vector input(volume); 59 | std::random_device rd; 60 | 61 | if (size > 0xFFFFFFFF) { 62 | std::cerr << "WARNING: Volume too large for precomputed shuffle." 63 | << std::endl; 64 | } 65 | std::vector precomputed(volume); 66 | for (size_t i = 0; i < volume; i++) { 67 | input[i] = i; 68 | uint64_t bound = (i % size) + 1; 69 | precomputed[i] = random_bounded_lehmer(bound); 70 | } 71 | 72 | std::cout << "Size of precomputed values " 73 | << size * sizeof(uint32_t) / 1024 << " kB" 74 | << std::endl; 75 | std::cout << "Size of shuffle : " << size << " words" << std::endl; 76 | std::cout << "Size of shuffle : " << size * sizeof(uint64_t) / 1024 / 1024. 
77 | << " MB" << std::endl; 78 | 79 | size_t min_repeat = 10; 80 | size_t min_time_ns = 100000000; 81 | size_t max_repeat = 100000; 82 | 83 | if (size < volume) { 84 | printf("inner repeat: %zu\n", volume / size); 85 | } 86 | 87 | if (include_cpp) { 88 | lehmer64 lehmerGenerator{rd()}; 89 | std::mt19937_64 mtGenerator{rd()}; 90 | 91 | // C++ Lehmer 92 | 93 | pretty_print(volume, volume * sizeof(uint64_t), 94 | "C++ std::shuffle (lehmer)", 95 | bench( 96 | [&input, &lehmerGenerator, size]() { 97 | for (auto t = input.begin(); t < input.end(); t += size) { 98 | std::shuffle(t, t + size, lehmerGenerator); 99 | } 100 | }, 101 | min_repeat, min_time_ns, max_repeat)); 102 | 103 | pretty_print(volume, volume * sizeof(uint64_t), 104 | "C++ shuffle 2 (lehmer)", 105 | bench( 106 | [&input, &lehmerGenerator, size]() { 107 | for (auto t = input.begin(); t < input.end(); t += size) { 108 | batched_random::shuffle_2(t, t + size, lehmerGenerator); 109 | } 110 | }, 111 | min_repeat, min_time_ns, max_repeat)); 112 | 113 | pretty_print(volume, volume * sizeof(uint64_t), 114 | "C++ shuffle 2-6 (lehmer)", 115 | bench( 116 | [&input, &lehmerGenerator, size]() { 117 | for (auto t = input.begin(); t < input.end(); t += size) { 118 | batched_random::shuffle_23456(t, t + size, lehmerGenerator); 119 | } 120 | }, 121 | min_repeat, min_time_ns, max_repeat)); 122 | 123 | // C++ Mersenne twister 124 | 125 | pretty_print(volume, volume * sizeof(uint64_t), 126 | "C++ std::shuffle (mersenne)", 127 | bench( 128 | [&input, &mtGenerator, size]() { 129 | for (auto t = input.begin(); t < input.end(); t += size) { 130 | std::shuffle(t, t + size, mtGenerator); 131 | } 132 | }, 133 | min_repeat, min_time_ns, max_repeat)); 134 | 135 | pretty_print(volume, volume * sizeof(uint64_t), 136 | "C++ shuffle 2 (mersenne)", 137 | bench( 138 | [&input, &mtGenerator, size]() { 139 | for (auto t = input.begin(); t < input.end(); t += size) { 140 | batched_random::shuffle_2(t, t + size, mtGenerator); 141 | } 142 | }, 
143 | min_repeat, min_time_ns, max_repeat)); 144 | 145 | pretty_print(volume, volume * sizeof(uint64_t), 146 | "C++ shuffle 2-6 (mersenne)", 147 | bench( 148 | [&input, &mtGenerator, size]() { 149 | for (auto t = input.begin(); t < input.end(); t += size) { 150 | batched_random::shuffle_23456(t, t + size, mtGenerator); 151 | } 152 | }, 153 | min_repeat, min_time_ns, max_repeat)); 154 | } 155 | 156 | // Lehmer 157 | 158 | pretty_print(volume, volume * sizeof(uint64_t), 159 | "standard shuffle (lehmer)", 160 | bench( 161 | [&input, size, volume]() { 162 | for (size_t t = 0; t < volume; t += size) { 163 | shuffle_lehmer(input.data() + t, size); 164 | } 165 | }, 166 | min_repeat, min_time_ns, max_repeat)); 167 | 168 | pretty_print(volume, volume * sizeof(uint64_t), 169 | "batch shuffle 2 (lehmer)", 170 | bench( 171 | [&input, size, volume]() { 172 | for (size_t t = 0; t < volume; t += size) { 173 | shuffle_lehmer_2(input.data() + t, size); 174 | } 175 | }, 176 | min_repeat, min_time_ns, max_repeat)); 177 | 178 | pretty_print(volume, volume * sizeof(uint64_t), 179 | "batch shuffle 2-6 (lehmer)", 180 | bench( 181 | [&input, size, volume]() { 182 | for (size_t t = 0; t < volume; t += size) { 183 | shuffle_lehmer_23456(input.data() + t, size); 184 | } 185 | }, 186 | min_repeat, min_time_ns, max_repeat)); 187 | 188 | pretty_print(volume, volume * sizeof(uint64_t), 189 | "naive batch shuffle 2 (lehmer)", 190 | bench( 191 | [&input, size, volume]() { 192 | for (size_t t = 0; t < volume; t += size) { 193 | naive_shuffle_lehmer_2(input.data() + t, size); 194 | } 195 | }, 196 | min_repeat, min_time_ns, max_repeat)); 197 | 198 | // PCG 199 | 200 | pretty_print(volume, volume * sizeof(uint64_t), 201 | "standard shuffle (PCG)", 202 | bench( 203 | [&input, size, volume]() { 204 | for (size_t t = 0; t < volume; t += size) { 205 | shuffle_pcg(input.data() + t, size); 206 | } 207 | }, 208 | min_repeat, min_time_ns, max_repeat)); 209 | 210 | pretty_print(volume, volume * 
sizeof(uint64_t), 211 | "batch shuffle 2 (PCG)", 212 | bench( 213 | [&input, size, volume]() { 214 | for (size_t t = 0; t < volume; t += size) { 215 | shuffle_pcg_2(input.data() + t, size); 216 | } 217 | }, 218 | min_repeat, min_time_ns, max_repeat)); 219 | 220 | pretty_print(volume, volume * sizeof(uint64_t), 221 | "batch shuffle 2-6 (PCG)", 222 | bench( 223 | [&input, size, volume]() { 224 | for (size_t t = 0; t < volume; t += size) { 225 | shuffle_pcg_23456(input.data() + t, size); 226 | } 227 | }, 228 | min_repeat, min_time_ns, max_repeat)); 229 | 230 | 231 | 232 | pretty_print(volume, volume * sizeof(uint64_t), 233 | "naive batch shuffle 2 (PCG)", 234 | bench( 235 | [&input, size, volume]() { 236 | for (size_t t = 0; t < volume; t += size) { 237 | naive_shuffle_pcg_2(input.data() + t, size); 238 | } 239 | }, 240 | min_repeat, min_time_ns, max_repeat)); 241 | // chacha 242 | 243 | pretty_print(volume, volume * sizeof(uint64_t), 244 | "standard shuffle (chacha)", 245 | bench( 246 | [&input, size, volume]() { 247 | for (size_t t = 0; t < volume; t += size) { 248 | shuffle_chacha(input.data() + t, size); 249 | } 250 | }, 251 | min_repeat, min_time_ns, max_repeat)); 252 | 253 | pretty_print(volume, volume * sizeof(uint64_t), 254 | "batch shuffle 2 (chacha)", 255 | bench( 256 | [&input, size, volume]() { 257 | for (size_t t = 0; t < volume; t += size) { 258 | shuffle_chacha_2(input.data() + t, size); 259 | } 260 | }, 261 | min_repeat, min_time_ns, max_repeat)); 262 | 263 | pretty_print(volume, volume * sizeof(uint64_t), 264 | "batch shuffle 2-6 (chacha)", 265 | bench( 266 | [&input, size, volume]() { 267 | for (size_t t = 0; t < volume; t += size) { 268 | shuffle_chacha_23456(input.data() + t, size); 269 | } 270 | }, 271 | min_repeat, min_time_ns, max_repeat)); 272 | 273 | pretty_print(volume, volume * sizeof(uint64_t), 274 | "naive batch shuffle 2 (chacha)", 275 | bench( 276 | [&input, size, volume]() { 277 | for (size_t t = 0; t < volume; t += size) { 278 | 
/// 64-bit pseudo-random generator built on a 128-bit multiplicative state.
///
/// Provides `result_type`, `min()`, `max()` and `operator()`, which is the
/// interface the standard library expects from a uniform random bit generator
/// (e.g. as the last argument of std::shuffle).
class lehmer64 {
public:
  using result_type = uint64_t;

  // The parentheses around the names keep function-like `min`/`max` macros
  // from expanding here.
  static constexpr result_type(min)() { return 0; }
  static constexpr result_type(max)() { return UINT64_MAX; }

  /// Default-seeded generator.
  /// NOTE(review): 1234 is even, whereas the seeding constructor below always
  /// forces the low bit to 1. Left unchanged so the default sequence stays the
  /// same; confirm whether an odd default was intended.
  lehmer64() : m_state(1234) {}

  /// Seeds the state with `seed | 1`, i.e. the state always starts odd.
  lehmer64(uint64_t seed) : m_state(seed | 1) {}

  // Copy and move simply duplicate the state. Defaulting them (instead of the
  // former `lehmer64(lehmer64&)`) lets const generators be copied and makes
  // assignment available again.
  lehmer64(const lehmer64 &) = default;
  lehmer64(lehmer64 &&) = default;
  lehmer64 &operator=(const lehmer64 &) = default;
  lehmer64 &operator=(lehmer64 &&) = default;

  /// Advances the 128-bit state by one multiplication (wraps modulo 2^128).
  void step() { m_state *= UINT64_C(0xda942042e4dd58b5); }

  /// Advances the state and returns its upper 64 bits.
  result_type operator()() {
    step();
    return static_cast<result_type>(m_state >> 64);
  }

  /// Generates and throws away `n` values.
  void discard(unsigned long long n) {
    for (unsigned long long i = 0; i < n; ++i) {
      operator()();
    }
  }

private:
  __uint128_t m_state;
};
-------------------------------------------------------------------------------- /benchmarks/performancecounters/apple_arm_events.h: -------------------------------------------------------------------------------- 1 | /* clang-format off */ 2 | 3 | // Original design from: 4 | // ============================================================================= 5 | // XNU kperf/kpc 6 | // Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges 7 | // 8 | // References: 9 | // 10 | // XNU source (since xnu 2422.1.72): 11 | // https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h 12 | // https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c 13 | // 14 | // Lightweight PET (Profile Every Thread, since xnu 3789.1.32): 15 | // https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c 16 | // https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c 17 | // 18 | // System Private frameworks (since macOS 10.11, iOS 8.0): 19 | // /System/Library/PrivateFrameworks/kperf.framework 20 | // /System/Library/PrivateFrameworks/kperfdata.framework 21 | // 22 | // Xcode framework (since Xcode 7.0): 23 | // /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework 24 | // 25 | // CPU database (plist files) 26 | // macOS (since macOS 10.11): 27 | // /usr/share/kpep/.plist 28 | // iOS (copied from Xcode, since iOS 10.0, Xcode 8.0): 29 | // /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform 30 | // /DeviceSupport//DeveloperDiskImage.dmg/usr/share/kpep/.plist 31 | // 32 | // 33 | // Created by YaoYuan on 2021. 34 | // Released into the public domain (unlicense.org). 
/// Four counter readings (cycles, branches, missed branches, instructions)
/// stored as doubles, together with element-wise arithmetic helpers.
struct performance_counters {
  double cycles;
  double branches;
  double missed_branches;
  double instructions;

  /// Builds from raw 64-bit integer readings; each value is converted to
  /// double.
  performance_counters(uint64_t c, uint64_t b, uint64_t m, uint64_t i)
      : cycles(static_cast<double>(c)),
        branches(static_cast<double>(b)),
        missed_branches(static_cast<double>(m)),
        instructions(static_cast<double>(i)) {}

  /// Builds from four already-converted values.
  performance_counters(double c, double b, double m, double i)
      : cycles(c), branches(b), missed_branches(m), instructions(i) {}

  /// Sets all four fields to the same value.
  performance_counters(double init)
      : cycles(init),
        branches(init),
        missed_branches(init),
        instructions(init) {}

  /// Field-by-field subtraction in place.
  performance_counters &operator-=(const performance_counters &other) {
    instructions -= other.instructions;
    missed_branches -= other.missed_branches;
    branches -= other.branches;
    cycles -= other.cycles;
    return *this;
  }

  /// Keeps, for every field independently, the smaller of the current value
  /// and the one in `other`.
  performance_counters &min(const performance_counters &other) {
    if (other.cycles < cycles) {
      cycles = other.cycles;
    }
    if (other.branches < branches) {
      branches = other.branches;
    }
    if (other.missed_branches < missed_branches) {
      missed_branches = other.missed_branches;
    }
    if (other.instructions < instructions) {
      instructions = other.instructions;
    }
    return *this;
  }

  /// Field-by-field addition in place.
  performance_counters &operator+=(const performance_counters &other) {
    instructions += other.instructions;
    missed_branches += other.missed_branches;
    branches += other.branches;
    cycles += other.cycles;
    return *this;
  }

  /// Divides every field by `numerator` (despite its name, the argument is the
  /// divisor).
  performance_counters &operator/=(double numerator) {
    instructions /= numerator;
    missed_branches /= numerator;
    branches /= numerator;
    cycles /= numerator;
    return *this;
  }
};
133 | #define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED) // 1 134 | #define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE) // 2 135 | #define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER) // 4 136 | #define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU) // 8 137 | 138 | // PMU version constants. 139 | #define KPC_PMU_ERROR (0) // Error 140 | #define KPC_PMU_INTEL_V3 (1) // Intel 141 | #define KPC_PMU_ARM_APPLE (2) // ARM64 142 | #define KPC_PMU_INTEL_V2 (3) // Old Intel 143 | #define KPC_PMU_ARM_V2 (4) // Old ARM 144 | 145 | // The maximum number of counters we could read from every class in one go. 146 | // ARMV7: FIXED: 1, CONFIGURABLE: 4 147 | // ARM32: FIXED: 2, CONFIGURABLE: 6 148 | // ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8) 149 | // x86: 32 150 | #define KPC_MAX_COUNTERS 32 151 | 152 | // Bits for defining what to do on an action. 153 | // Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h 154 | #define KPERF_SAMPLER_TH_INFO (1U << 0) 155 | #define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1) 156 | #define KPERF_SAMPLER_KSTACK (1U << 2) 157 | #define KPERF_SAMPLER_USTACK (1U << 3) 158 | #define KPERF_SAMPLER_PMC_THREAD (1U << 4) 159 | #define KPERF_SAMPLER_PMC_CPU (1U << 5) 160 | #define KPERF_SAMPLER_PMC_CONFIG (1U << 6) 161 | #define KPERF_SAMPLER_MEMINFO (1U << 7) 162 | #define KPERF_SAMPLER_TH_SCHEDULING (1U << 8) 163 | #define KPERF_SAMPLER_TH_DISPATCH (1U << 9) 164 | #define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10) 165 | #define KPERF_SAMPLER_SYS_MEM (1U << 11) 166 | #define KPERF_SAMPLER_TH_INSCYC (1U << 12) 167 | #define KPERF_SAMPLER_TK_INFO (1U << 13) 168 | 169 | // Maximum number of kperf action ids. 170 | #define KPERF_ACTION_MAX (32) 171 | 172 | // Maximum number of kperf timer ids. 
173 | #define KPERF_TIMER_MAX (8) 174 | 175 | // x86/arm config registers are 64-bit 176 | typedef u64 kpc_config_t; 177 | 178 | /// Print current CPU identification string to the buffer (same as snprintf), 179 | /// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC 180 | /// database in /usr/share/kpep. 181 | /// @return string's length, or negative value if error occurs. 182 | /// @note This method does not requires root privileges. 183 | /// @details sysctl get(hw.cputype), get(hw.cpusubtype), 184 | /// get(hw.cpufamily), get(machdep.cpu.model) 185 | static int (*kpc_cpu_string)(char *buf, usize buf_size); 186 | 187 | /// Get the version of KPC that's being run. 188 | /// @return See `PMU version constants` above. 189 | /// @details sysctl get(kpc.pmu_version) 190 | static u32 (*kpc_pmu_version)(void); 191 | 192 | /// Get running PMC classes. 193 | /// @return See `class mask constants` above, 194 | /// 0 if error occurs or no class is set. 195 | /// @details sysctl get(kpc.counting) 196 | static u32 (*kpc_get_counting)(void); 197 | 198 | /// Set PMC classes to enable counting. 199 | /// @param classes See `class mask constants` above, set 0 to shutdown counting. 200 | /// @return 0 for success. 201 | /// @details sysctl set(kpc.counting) 202 | static int (*kpc_set_counting)(u32 classes); 203 | 204 | /// Get running PMC classes for current thread. 205 | /// @return See `class mask constants` above, 206 | /// 0 if error occurs or no class is set. 207 | /// @details sysctl get(kpc.thread_counting) 208 | static u32 (*kpc_get_thread_counting)(void); 209 | 210 | /// Set PMC classes to enable counting for current thread. 211 | /// @param classes See `class mask constants` above, set 0 to shutdown counting. 212 | /// @return 0 for success. 213 | /// @details sysctl set(kpc.thread_counting) 214 | static int (*kpc_set_thread_counting)(u32 classes); 215 | 216 | /// Get how many config registers there are for a given mask. 
217 | /// For example: Intel may return 1 for `KPC_CLASS_FIXED_MASK`, 218 | /// and 4 for `KPC_CLASS_CONFIGURABLE_MASK`. 219 | /// @param classes See `class mask constants` above. 220 | /// @return 0 if error occurs or no class is set. 221 | /// @note This method does not require root privileges. 222 | /// @details sysctl get(kpc.config_count) 223 | static u32 (*kpc_get_config_count)(u32 classes); 224 | 225 | /// Get config registers. 226 | /// @param classes see `class mask constants` above. 227 | /// @param config Config buffer to receive values, should not be smaller than 228 | /// kpc_get_config_count(classes) * sizeof(kpc_config_t). 229 | /// @return 0 for success. 230 | /// @details sysctl get(kpc.config_count), get(kpc.config) 231 | static int (*kpc_get_config)(u32 classes, kpc_config_t *config); 232 | 233 | /// Set config registers. 234 | /// @param classes see `class mask constants` above. 235 | /// @param config Config buffer, should not be smaller than 236 | /// kpc_get_config_count(classes) * sizeof(kpc_config_t). 237 | /// @return 0 for success. 238 | /// @details sysctl get(kpc.config_count), set(kpc.config) 239 | static int (*kpc_set_config)(u32 classes, kpc_config_t *config); 240 | 241 | /// Get how many counters there are for a given mask. 242 | /// For example: Intel may return 3 for `KPC_CLASS_FIXED_MASK`, 243 | /// and 4 for `KPC_CLASS_CONFIGURABLE_MASK`. 244 | /// @param classes See `class mask constants` above. 245 | /// @note This method does not require root privileges. 246 | /// @details sysctl get(kpc.counter_count) 247 | static u32 (*kpc_get_counter_count)(u32 classes); 248 | 249 | /// Get counter accumulations. 250 | /// If `all_cpus` is true, the buffer count should not be smaller than 251 | /// (cpu_count * counter_count). Otherwise, the buffer count should not be smaller 252 | /// than (counter_count). 253 | /// @see kpc_get_counter_count(), kpc_cpu_count(). 254 | /// @param all_cpus true for all CPUs, false for current cpu. 
255 | /// @param classes See `class mask constants` above. 256 | /// @param curcpu A pointer to receive current cpu id, can be NULL. 257 | /// @param buf Buffer to receive counter's value. 258 | /// @return 0 for success. 259 | /// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters) 260 | static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu, 261 | u64 *buf); 262 | 263 | /// Get counter accumulations for current thread. 264 | /// @param tid Thread id, should be 0. 265 | /// @param buf_count The number of buf's elements (not bytes), 266 | /// should not smaller than kpc_get_counter_count(). 267 | /// @param buf Buffer to receive counter's value. 268 | /// @return 0 for success. 269 | /// @details sysctl get(kpc.thread_counters) 270 | static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf); 271 | 272 | /// Acquire/release the counters used by the Power Manager. 273 | /// @param val 1:acquire, 0:release 274 | /// @return 0 for success. 275 | /// @details sysctl set(kpc.force_all_ctrs) 276 | static int (*kpc_force_all_ctrs_set)(int val); 277 | 278 | /// Get the state of all_ctrs. 279 | /// @return 0 for success. 280 | /// @details sysctl get(kpc.force_all_ctrs) 281 | static int (*kpc_force_all_ctrs_get)(int *val_out); 282 | 283 | /// Set number of actions, should be `KPERF_ACTION_MAX`. 284 | /// @details sysctl set(kperf.action.count) 285 | static int (*kperf_action_count_set)(u32 count); 286 | 287 | /// Get number of actions. 288 | /// @details sysctl get(kperf.action.count) 289 | static int (*kperf_action_count_get)(u32 *count); 290 | 291 | /// Set what to sample when a trigger fires an action, e.g. 292 | /// `KPERF_SAMPLER_PMC_CPU`. 293 | /// @details sysctl set(kperf.action.samplers) 294 | static int (*kperf_action_samplers_set)(u32 actionid, u32 sample); 295 | 296 | /// Get what to sample when a trigger fires an action. 
297 | /// @details sysctl get(kperf.action.samplers) 298 | static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample); 299 | 300 | /// Apply a task filter to the action, -1 to disable filter. 301 | /// @details sysctl set(kperf.action.filter_by_task) 302 | static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port); 303 | 304 | /// Apply a pid filter to the action, -1 to disable filter. 305 | /// @details sysctl set(kperf.action.filter_by_pid) 306 | static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid); 307 | 308 | /// Set number of time triggers, should be `KPERF_TIMER_MAX`. 309 | /// @details sysctl set(kperf.timer.count) 310 | static int (*kperf_timer_count_set)(u32 count); 311 | 312 | /// Get number of time triggers. 313 | /// @details sysctl get(kperf.timer.count) 314 | static int (*kperf_timer_count_get)(u32 *count); 315 | 316 | /// Set timer number and period. 317 | /// @details sysctl set(kperf.timer.period) 318 | static int (*kperf_timer_period_set)(u32 actionid, u64 tick); 319 | 320 | /// Get timer number and period. 321 | /// @details sysctl get(kperf.timer.period) 322 | static int (*kperf_timer_period_get)(u32 actionid, u64 *tick); 323 | 324 | /// Set timer number and actionid. 325 | /// @details sysctl set(kperf.timer.action) 326 | static int (*kperf_timer_action_set)(u32 actionid, u32 timerid); 327 | 328 | /// Get timer number and actionid. 329 | /// @details sysctl get(kperf.timer.action) 330 | static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid); 331 | 332 | /// Set which timer ID does PET (Profile Every Thread). 333 | /// @details sysctl set(kperf.timer.pet_timer) 334 | static int (*kperf_timer_pet_set)(u32 timerid); 335 | 336 | /// Get which timer ID does PET (Profile Every Thread). 337 | /// @details sysctl get(kperf.timer.pet_timer) 338 | static int (*kperf_timer_pet_get)(u32 *timerid); 339 | 340 | /// Enable or disable sampling. 
/// KPEP event (size: 48/28 bytes on 64/32 bit OS)
///
/// Describes one event of the CPU event database. The quoted size matches the
/// members below: five pointers, one u32 and four u8.
/// NOTE(review): this declaration sits in the "reverse engineered" part of the
/// header, so member order and widths presumably have to mirror the
/// framework's own struct; do not reorder without confirming.
typedef struct kpep_event {
  const char *name;        ///< Unique name of an event, such as "INST_RETIRED.ANY".
  const char *description; ///< Description for this event.
  const char *errata;      ///< Errata, currently NULL.
  const char *alias;       ///< Alias name, such as "Instructions", "Cycles".
  const char *fallback;    ///< Fallback event name for fixed counter.
  u32 mask;                ///< NOTE(review): meaning not documented in this header.
  u8 number;               ///< NOTE(review): presumably the event number; confirm.
  u8 umask;                ///< NOTE(review): presumably a unit mask; confirm.
  u8 reserved;             ///< NOTE(review): presumably unused/reserved by the framework.
  u8 is_fixed;             ///< NOTE(review): presumably non-zero for fixed-counter events; confirm.
} kpep_event;
/// KPEP config (size: 80/44 bytes on 64/32 bit OS)
///
/// Created through kpep_config_create() from a kpep_db and released with
/// kpep_config_free().
/// NOTE(review): six pointers, two usize and four u32 add up to the stated
/// 80 bytes with 8-byte pointers; with 4-byte pointers the same members add up
/// to 48, not 44. Confirm the 32-bit figure.
/// NOTE(review): the per-array comments below quote `sizeof(T *)` although the
/// arrays hold `T` elements (for `flags` these differ on 64-bit). Kept as
/// written in the original; confirm against the framework.
typedef struct kpep_config {
  kpep_db *db;          ///< Database handed to kpep_config_create() (presumably; confirm).
  kpep_event **ev_arr;  ///< (sizeof(kpep_event *) * counter_count), init NULL
  usize *ev_map;        ///< (sizeof(usize *) * counter_count), init 0
  usize *ev_idx;        ///< (sizeof(usize *) * counter_count), init -1
  u32 *flags;           ///< (sizeof(u32 *) * counter_count), init 0
  u64 *kpc_periods;     ///< (sizeof(u64 *) * counter_count), init 0
  usize event_count;    ///< kpep_config_events_count()
  usize counter_count;
  u32 classes;          ///< See `class mask constants` above.
  u32 config_counter;
  u32 power_counter;
  u32 reserved;
} kpep_config;
typedef enum {
  KPEP_CONFIG_ERROR_NONE = 0,
  KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1,
  KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2,
  KPEP_CONFIG_ERROR_IO = 3,
  KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4,
  KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5,
  KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6,
  KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7,
  KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8,
  KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9,
  KPEP_CONFIG_ERROR_DB_CORRUPT = 10,
  KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11,
  KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12,
  KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13,
  KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14,
  KPEP_CONFIG_ERROR_ERRNO = 15,
  KPEP_CONFIG_ERROR_MAX ///< Number of codes; not an error itself.
} kpep_config_error_code;

/// Short text for every kpep_config_error_code, indexed by the code's value.
static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = {
    "none",
    "invalid argument",
    "out of memory",
    "I/O",
    "buffer too small",
    "current system unknown",
    "database path invalid",
    "database not found",
    "database architecture unsupported",
    "database version unsupported",
    "database corrupt",
    "event not found",
    "conflicting events",
    "all counters must be forced",
    "event unavailable",
    "check errno"};

/// Translate an error code into its short description.
/// @param code A kpep_config_error_code value (any int is accepted).
/// @return The matching entry of kpep_config_error_names, or "unknown error"
///         when `code` lies outside [0, KPEP_CONFIG_ERROR_MAX).
static const char *kpep_config_error_desc(int code) {
  const bool in_range = (code >= 0) && (code < KPEP_CONFIG_ERROR_MAX);
  return in_range ? kpep_config_error_names[code] : "unknown error";
}

// Redeclaration of the struct tags defined earlier in this header, so that
// the declaration below also reads (and compiles) on its own.
struct kpep_db;
struct kpep_config;

/// Create a config.
/// @param db A kpep db, see kpep_db_create()
/// @param cfg_ptr A pointer to receive the new config.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr);

/// Free the config.
static void (*kpep_config_free)(kpep_config *cfg);

// Every pointer in this section is NULL until lib_init() resolves it from
// kperfdata.framework via dlsym(); lib_deinit() resets them to NULL again.

/// Add an event to config.
/// @param cfg The config.
/// @param ev_ptr An event pointer.
/// @param flag 0: all, 1: user space only
/// @param err Error bitmap pointer, can be NULL.
///            If return value is `CONFLICTING_EVENTS`, this bitmap contains
///            the conflicted event indices, e.g. "1 << 2" means index 2.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr,
                                    u32 flag, u32 *err);

/// Remove event at index.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx);

/// Force all counters.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_force_counters)(kpep_config *cfg);

/// Get events count.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr);

/// Get all event pointers.
/// @param buf A buffer to receive event pointers.
/// @param buf_size The buffer's size in bytes, should not be smaller than
///                 kpep_config_events_count() * sizeof(void *).
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf,
                                 usize buf_size);

/// Get kpc register configs.
/// @param buf A buffer to receive kpc register configs.
/// @param buf_size The buffer's size in bytes, should not be smaller than
///                 kpep_config_kpc_count() * sizeof(kpc_config_t).
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf,
                              usize buf_size);

/// Get kpc register config count.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr);

/// Get kpc classes.
/// @param classes_ptr Receives the class mask, see `class mask constants`
///                    above.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr);

/// Get the index mapping from event to counter.
/// @param buf A buffer to receive indexes.
/// @param buf_size The buffer's size in bytes, should not be smaller than
///                 kpep_config_events_count() * sizeof(kpc_config_t).
///                 NOTE(review): `buf` holds usize elements, so sizeof(usize)
///                 may be what is meant here; confirm.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size);

/// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/".
/// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8".
///             Pass NULL for current CPU.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_create)(const char *name, kpep_db **db_ptr);

/// Free the kpep database.
static void (*kpep_db_free)(kpep_db *db);

/// Get the database's name.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_name)(kpep_db *db, const char **name);

/// Get the event alias count.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_aliases_count)(kpep_db *db, usize *count);

/// Get all aliases.
/// @param buf A buffer to receive all alias strings.
/// @param buf_size The buffer's size in bytes,
///        should not be smaller than kpep_db_aliases_count() * sizeof(void *).
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size);

/// Get counters count for given classes.
/// @param classes 1: Fixed, 2: Configurable.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count);

/// Get the count of all events.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_events_count)(kpep_db *db, usize *count);

/// Get all events.
/// @param buf A buffer to receive all event pointers.
/// @param buf_size The buffer's size in bytes,
///        should not be smaller than kpep_db_events_count() * sizeof(void *).
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size);

/// Get one event by name.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr);

/// Get event's name.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr);

/// Get event's alias.
/// @return kpep_config_error_code, 0 for success.
static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr);

/// Get event's description.
/// @return kpep_config_error_code, 0 for success.
595 | static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr); 596 | 597 | // ----------------------------------------------------------------------------- 598 | // load kperf/kperfdata dynamic library 599 | // ----------------------------------------------------------------------------- 600 | 601 | typedef struct { 602 | const char *name; 603 | void **impl; 604 | } lib_symbol; 605 | 606 | #define lib_nelems(x) (sizeof(x) / sizeof((x)[0])) 607 | #define lib_symbol_def(name) \ 608 | { #name, (void **)&name } 609 | 610 | static const lib_symbol lib_symbols_kperf[] = { 611 | lib_symbol_def(kpc_pmu_version), 612 | lib_symbol_def(kpc_cpu_string), 613 | lib_symbol_def(kpc_set_counting), 614 | lib_symbol_def(kpc_get_counting), 615 | lib_symbol_def(kpc_set_thread_counting), 616 | lib_symbol_def(kpc_get_thread_counting), 617 | lib_symbol_def(kpc_get_config_count), 618 | lib_symbol_def(kpc_get_counter_count), 619 | lib_symbol_def(kpc_set_config), 620 | lib_symbol_def(kpc_get_config), 621 | lib_symbol_def(kpc_get_cpu_counters), 622 | lib_symbol_def(kpc_get_thread_counters), 623 | lib_symbol_def(kpc_force_all_ctrs_set), 624 | lib_symbol_def(kpc_force_all_ctrs_get), 625 | lib_symbol_def(kperf_action_count_set), 626 | lib_symbol_def(kperf_action_count_get), 627 | lib_symbol_def(kperf_action_samplers_set), 628 | lib_symbol_def(kperf_action_samplers_get), 629 | lib_symbol_def(kperf_action_filter_set_by_task), 630 | lib_symbol_def(kperf_action_filter_set_by_pid), 631 | lib_symbol_def(kperf_timer_count_set), 632 | lib_symbol_def(kperf_timer_count_get), 633 | lib_symbol_def(kperf_timer_period_set), 634 | lib_symbol_def(kperf_timer_period_get), 635 | lib_symbol_def(kperf_timer_action_set), 636 | lib_symbol_def(kperf_timer_action_get), 637 | lib_symbol_def(kperf_sample_set), 638 | lib_symbol_def(kperf_sample_get), 639 | lib_symbol_def(kperf_reset), 640 | lib_symbol_def(kperf_timer_pet_set), 641 | lib_symbol_def(kperf_timer_pet_get), 642 | 
lib_symbol_def(kperf_ns_to_ticks), 643 | lib_symbol_def(kperf_ticks_to_ns), 644 | lib_symbol_def(kperf_tick_frequency), 645 | }; 646 | 647 | static const lib_symbol lib_symbols_kperfdata[] = { 648 | lib_symbol_def(kpep_config_create), 649 | lib_symbol_def(kpep_config_free), 650 | lib_symbol_def(kpep_config_add_event), 651 | lib_symbol_def(kpep_config_remove_event), 652 | lib_symbol_def(kpep_config_force_counters), 653 | lib_symbol_def(kpep_config_events_count), 654 | lib_symbol_def(kpep_config_events), 655 | lib_symbol_def(kpep_config_kpc), 656 | lib_symbol_def(kpep_config_kpc_count), 657 | lib_symbol_def(kpep_config_kpc_classes), 658 | lib_symbol_def(kpep_config_kpc_map), 659 | lib_symbol_def(kpep_db_create), 660 | lib_symbol_def(kpep_db_free), 661 | lib_symbol_def(kpep_db_name), 662 | lib_symbol_def(kpep_db_aliases_count), 663 | lib_symbol_def(kpep_db_aliases), 664 | lib_symbol_def(kpep_db_counters_count), 665 | lib_symbol_def(kpep_db_events_count), 666 | lib_symbol_def(kpep_db_events), 667 | lib_symbol_def(kpep_db_event), 668 | lib_symbol_def(kpep_event_name), 669 | lib_symbol_def(kpep_event_alias), 670 | lib_symbol_def(kpep_event_description), 671 | }; 672 | 673 | #define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf" 674 | #define lib_path_kperfdata \ 675 | "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata" 676 | 677 | static bool lib_inited = false; 678 | static bool lib_has_err = false; 679 | static char lib_err_msg[256]; 680 | 681 | static void *lib_handle_kperf = NULL; 682 | static void *lib_handle_kperfdata = NULL; 683 | 684 | static void lib_deinit(void) { 685 | lib_inited = false; 686 | lib_has_err = false; 687 | if (lib_handle_kperf) dlclose(lib_handle_kperf); 688 | if (lib_handle_kperfdata) dlclose(lib_handle_kperfdata); 689 | lib_handle_kperf = NULL; 690 | lib_handle_kperfdata = NULL; 691 | for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { 692 | const lib_symbol *symbol = &lib_symbols_kperf[i]; 693 | 
*symbol->impl = NULL; 694 | } 695 | for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { 696 | const lib_symbol *symbol = &lib_symbols_kperfdata[i]; 697 | *symbol->impl = NULL; 698 | } 699 | } 700 | 701 | static bool lib_init(void) { 702 | #define return_err() \ 703 | do { \ 704 | lib_deinit(); \ 705 | lib_inited = true; \ 706 | lib_has_err = true; \ 707 | return false; \ 708 | } while (false) 709 | 710 | if (lib_inited) return !lib_has_err; 711 | 712 | // load dynamic library 713 | lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY); 714 | if (!lib_handle_kperf) { 715 | snprintf(lib_err_msg, sizeof(lib_err_msg), 716 | "Failed to load kperf.framework, message: %s.", dlerror()); 717 | return_err(); 718 | } 719 | lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY); 720 | if (!lib_handle_kperfdata) { 721 | snprintf(lib_err_msg, sizeof(lib_err_msg), 722 | "Failed to load kperfdata.framework, message: %s.", dlerror()); 723 | return_err(); 724 | } 725 | 726 | // load symbol address from dynamic library 727 | for (usize i = 0; i < lib_nelems(lib_symbols_kperf); i++) { 728 | const lib_symbol *symbol = &lib_symbols_kperf[i]; 729 | *symbol->impl = dlsym(lib_handle_kperf, symbol->name); 730 | if (!*symbol->impl) { 731 | snprintf(lib_err_msg, sizeof(lib_err_msg), 732 | "Failed to load kperf function: %s.", symbol->name); 733 | return_err(); 734 | } 735 | } 736 | for (usize i = 0; i < lib_nelems(lib_symbols_kperfdata); i++) { 737 | const lib_symbol *symbol = &lib_symbols_kperfdata[i]; 738 | *symbol->impl = dlsym(lib_handle_kperfdata, symbol->name); 739 | if (!*symbol->impl) { 740 | snprintf(lib_err_msg, sizeof(lib_err_msg), 741 | "Failed to load kperfdata function: %s.", symbol->name); 742 | return_err(); 743 | } 744 | } 745 | 746 | lib_inited = true; 747 | lib_has_err = false; 748 | return true; 749 | 750 | #undef return_err 751 | } 752 | 753 | // ----------------------------------------------------------------------------- 754 | // kdebug private structs 
// https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h
// -----------------------------------------------------------------------------

/*
 * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
 * structure.
 */
#if defined(__arm64__)
typedef uint64_t kd_buf_argtype;
#else
typedef uintptr_t kd_buf_argtype;
#endif

/* One kdebug trace record. */
typedef struct {
  uint64_t timestamp;
  kd_buf_argtype arg1;
  kd_buf_argtype arg2;
  kd_buf_argtype arg3;
  kd_buf_argtype arg4;
  kd_buf_argtype arg5; /* the thread ID */
  uint32_t debugid;    /* see <sys/kdebug.h> */

/*
 * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
 * structure.
 */
#if defined(__LP64__) || defined(__arm64__)
  uint32_t cpuid; /* cpu index, from 0 */
  kd_buf_argtype unused;
#endif
} kd_buf;

/* bits for the type field of kd_regtype */
#define KDBG_CLASSTYPE 0x10000
#define KDBG_SUBCLSTYPE 0x20000
#define KDBG_RANGETYPE 0x40000
#define KDBG_TYPENONE 0x80000
#define KDBG_CKTYPES 0xF0000

/* only trace at most 4 types of events, at the code granularity */
#define KDBG_VALCHECK 0x00200000U

/* Trace filter: `type` takes the KDBG_* bits above; the meaning of
 * value1..value4 depends on that type. */
typedef struct {
  unsigned int type;
  unsigned int value1;
  unsigned int value2;
  unsigned int value3;
  unsigned int value4;
} kd_regtype;

/* State of the kdebug trace buffers. */
typedef struct {
  /* number of events that can fit in the buffers */
  int nkdbufs;
  /* set if trace is disabled */
  int nolog;
  /* kd_ctrl_page.flags */
  unsigned int flags;
  /* number of threads in thread map */
  int nkdthreads;
  /* the owning pid */
  int bufid;
} kbufinfo_t;

// -----------------------------------------------------------------------------
// kdebug utils
//
----------------------------------------------------------------------------- 821 | 822 | #define EVENT_NAME_MAX 8 823 | typedef struct { 824 | const char *alias; /// name for print 825 | const char *names[EVENT_NAME_MAX]; /// name from pmc db 826 | } event_alias; 827 | 828 | /// Event names from /usr/share/kpep/.plist 829 | static const event_alias profile_events[] = { 830 | {"cycles", 831 | { 832 | "FIXED_CYCLES", // Apple A7-A15 833 | "CPU_CLK_UNHALTED.THREAD", // Intel Core 1th-10th 834 | "CPU_CLK_UNHALTED.CORE", // Intel Yonah, Merom 835 | }}, 836 | {"instructions", 837 | { 838 | "FIXED_INSTRUCTIONS", // Apple A7-A15 839 | "INST_RETIRED.ANY" // Intel Yonah, Merom, Core 1th-10th 840 | }}, 841 | {"branches", 842 | { 843 | "INST_BRANCH", // Apple A7-A15 844 | "BR_INST_RETIRED.ALL_BRANCHES", // Intel Core 1th-10th 845 | "INST_RETIRED.ANY", // Intel Yonah, Merom 846 | }}, 847 | {"branch-misses", 848 | { 849 | "BRANCH_MISPRED_NONSPEC", // Apple A7-A15, since iOS 15, macOS 12 850 | "BRANCH_MISPREDICT", // Apple A7-A14 851 | "BR_MISP_RETIRED.ALL_BRANCHES", // Intel Core 2th-10th 852 | "BR_INST_RETIRED.MISPRED", // Intel Yonah, Merom 853 | }}, 854 | }; 855 | 856 | static kpep_event *get_event(kpep_db *db, const event_alias *alias) { 857 | for (usize j = 0; j < EVENT_NAME_MAX; j++) { 858 | const char *name = alias->names[j]; 859 | if (!name) break; 860 | kpep_event *ev = NULL; 861 | if (kpep_db_event(db, name, &ev) == 0) { 862 | return ev; 863 | } 864 | } 865 | return NULL; 866 | } 867 | 868 | struct AppleEvents { 869 | kpc_config_t regs[KPC_MAX_COUNTERS] = {0}; 870 | usize counter_map[KPC_MAX_COUNTERS] = {0}; 871 | u64 counters_0[KPC_MAX_COUNTERS] = {0}; 872 | u64 counters_1[KPC_MAX_COUNTERS] = {0}; 873 | static constexpr usize ev_count = 874 | sizeof(profile_events) / sizeof(profile_events[0]); 875 | bool init = false; 876 | bool worked = false; 877 | inline bool setup_performance_counters() { 878 | if (init) { 879 | return worked; 880 | } 881 | init = true; 882 | 883 
| // load dylib 884 | if (!lib_init()) { 885 | printf("Error: %s\n", lib_err_msg); 886 | return (worked = false); 887 | } 888 | 889 | // check permission 890 | int force_ctrs = 0; 891 | if (kpc_force_all_ctrs_get(&force_ctrs)) { 892 | return (worked = false); 893 | } 894 | int ret; 895 | // load pmc db 896 | kpep_db *db = NULL; 897 | if ((ret = kpep_db_create(NULL, &db))) { 898 | printf("Error: cannot load pmc database: %d.\n", ret); 899 | return (worked = false); 900 | } 901 | // printf("loaded db: %s (%s)\n", db->name, db->marketing_name); 902 | // printf("number of fixed counters: %zu\n", db->fixed_counter_count); 903 | // printf("number of configurable counters: %zu\n", 904 | // db->config_counter_count); 905 | 906 | // create a config 907 | kpep_config *cfg = NULL; 908 | if ((ret = kpep_config_create(db, &cfg))) { 909 | printf("Failed to create kpep config: %d (%s).\n", ret, 910 | kpep_config_error_desc(ret)); 911 | return (worked = false); 912 | } 913 | if ((ret = kpep_config_force_counters(cfg))) { 914 | printf("Failed to force counters: %d (%s).\n", ret, 915 | kpep_config_error_desc(ret)); 916 | return (worked = false); 917 | } 918 | 919 | // get events 920 | kpep_event *ev_arr[ev_count] = {0}; 921 | for (usize i = 0; i < ev_count; i++) { 922 | const event_alias *alias = profile_events + i; 923 | ev_arr[i] = get_event(db, alias); 924 | if (!ev_arr[i]) { 925 | printf("Cannot find event: %s.\n", alias->alias); 926 | return (worked = false); 927 | } 928 | } 929 | 930 | // add event to config 931 | for (usize i = 0; i < ev_count; i++) { 932 | kpep_event *ev = ev_arr[i]; 933 | if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) { 934 | printf("Failed to add event: %d (%s).\n", ret, 935 | kpep_config_error_desc(ret)); 936 | return (worked = false); 937 | } 938 | } 939 | 940 | // prepare buffer and config 941 | u32 classes = 0; 942 | usize reg_count = 0; 943 | if ((ret = kpep_config_kpc_classes(cfg, &classes))) { 944 | printf("Failed get kpc classes: %d 
(%s).\n", ret, 945 | kpep_config_error_desc(ret)); 946 | return (worked = false); 947 | } 948 | if ((ret = kpep_config_kpc_count(cfg, ®_count))) { 949 | printf("Failed get kpc count: %d (%s).\n", ret, 950 | kpep_config_error_desc(ret)); 951 | return (worked = false); 952 | } 953 | if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) { 954 | printf("Failed get kpc map: %d (%s).\n", ret, 955 | kpep_config_error_desc(ret)); 956 | return (worked = false); 957 | } 958 | if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) { 959 | printf("Failed get kpc registers: %d (%s).\n", ret, 960 | kpep_config_error_desc(ret)); 961 | return (worked = false); 962 | } 963 | 964 | // set config to kernel 965 | if ((ret = kpc_force_all_ctrs_set(1))) { 966 | printf("Failed force all ctrs: %d.\n", ret); 967 | return (worked = false); 968 | } 969 | if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) { 970 | if ((ret = kpc_set_config(classes, regs))) { 971 | printf("Failed set kpc config: %d.\n", ret); 972 | return (worked = false); 973 | } 974 | } 975 | 976 | // start counting 977 | if ((ret = kpc_set_counting(classes))) { 978 | printf("Failed set counting: %d.\n", ret); 979 | return (worked = false); 980 | } 981 | if ((ret = kpc_set_thread_counting(classes))) { 982 | printf("Failed set thread counting: %d.\n", ret); 983 | return (worked = false); 984 | } 985 | 986 | return (worked = true); 987 | } 988 | 989 | inline performance_counters get_counters() { 990 | static bool warned = false; 991 | int ret; 992 | // get counters before 993 | if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) { 994 | if (!warned) { 995 | printf("Failed get thread counters before: %d.\n", ret); 996 | warned = true; 997 | } 998 | return 1; 999 | } 1000 | return performance_counters{ 1001 | counters_0[counter_map[0]], counters_0[counter_map[3]], 1002 | counters_0[counter_map[2]], counters_0[counter_map[1]]}; 1003 | } 1004 | }; 1005 | 1006 | #endif 1007 | 
-------------------------------------------------------------------------------- /benchmarks/performancecounters/benchmarker.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "performancecounters/event_counter.h" 4 | #include 5 | #include 6 | 7 | event_collector collector; 8 | 9 | template 10 | event_aggregate bench(const function_type &function, size_t min_repeat = 1, 11 | size_t min_time_ns = 1000000000, 12 | size_t max_repeat = 1000000, double tolerance = 2.0) { 13 | // run it a few times to warm up the cache 14 | for (size_t i = 0; i < 10; i++) { 15 | function(); 16 | } 17 | 18 | size_t N = min_repeat; 19 | if (N == 0) { 20 | N = 1; 21 | } 22 | size_t max_trials = 30; 23 | size_t trial = 0; 24 | std::pair best{std::numeric_limits::max(), 25 | event_aggregate{}}; 26 | do { 27 | event_aggregate aggregate{}; 28 | for (size_t i = 0; i < N; i++) { 29 | std::atomic_thread_fence(std::memory_order_acquire); 30 | collector.start(); 31 | function(); 32 | std::atomic_thread_fence(std::memory_order_release); 33 | event_count allocate_count = collector.end(); 34 | aggregate << allocate_count; 35 | if ((i + 1 == N) && (aggregate.total_elapsed_ns() < min_time_ns) && 36 | (N < max_repeat)) { 37 | N *= 10; 38 | } 39 | } 40 | double ratio = aggregate.elapsed_ns() / aggregate.fastest_elapsed_ns(); 41 | trial++; 42 | if(ratio < tolerance) { 43 | return aggregate; 44 | } 45 | if(ratio < best.first) { 46 | best = {ratio, aggregate}; 47 | } 48 | if(trial >= max_trials) { 49 | //fprintf(stderr, "Warning: failed to converge after %zu trials got %f \n", max_trials, best.first); 50 | return best.second; 51 | } 52 | } while(true); 53 | } 54 | -------------------------------------------------------------------------------- /benchmarks/performancecounters/event_counter.h: -------------------------------------------------------------------------------- 1 | #ifndef __EVENT_COUNTER_H 2 | #define __EVENT_COUNTER_H 3 | 4 | 
/// One measurement: wall-clock time plus raw hardware counter values.
struct event_count {
  std::chrono::duration<double> elapsed; ///< Elapsed time in seconds.
  /// Raw counter values; the default constructor creates five zeroed slots.
  std::vector<unsigned long long> event_counts;

  event_count() : elapsed(0), event_counts{0, 0, 0, 0, 0} {}
  event_count(const std::chrono::duration<double> _elapsed,
              const std::vector<unsigned long long> &_event_counts)
      : elapsed(_elapsed), event_counts(_event_counts) {}
  // Copy construction and assignment are the compiler-generated ones.

  // The types of counters (so we can read the getter more easily)
  enum event_counter_types {
    CPU_CYCLES,
    INSTRUCTIONS,
  };

  double elapsed_sec() const {
    return std::chrono::duration<double>(elapsed).count();
  }
  double elapsed_ns() const {
    return std::chrono::duration<double, std::nano>(elapsed).count();
  }
  /// Requires event_counts to hold at least CPU_CYCLES + 1 entries.
  double cycles() const {
    return static_cast<double>(event_counts[CPU_CYCLES]);
  }
  /// Requires event_counts to hold at least INSTRUCTIONS + 1 entries.
  double instructions() const {
    return static_cast<double>(event_counts[INSTRUCTIONS]);
  }

  /// Element-wise sum of both measurements. When the two counter vectors
  /// differ in length the result has the longer length, and the missing
  /// entries of the shorter one count as zero.
  event_count operator+(const event_count &other) const {
    const bool this_is_longer =
        event_counts.size() >= other.event_counts.size();
    const std::vector<unsigned long long> &longer =
        this_is_longer ? event_counts : other.event_counts;
    const std::vector<unsigned long long> &shorter =
        this_is_longer ? other.event_counts : event_counts;
    std::vector<unsigned long long> summed(longer);
    for (size_t i = 0; i < shorter.size(); i++) {
      summed[i] += shorter[i];
    }
    return event_count(elapsed + other.elapsed, summed);
  }

  void operator+=(const event_count &other) { *this = *this + other; }
};

/// Running statistics over a series of event_count samples.
struct event_aggregate {
  bool has_events = false;
  int iterations = 0;  ///< Number of samples added so far.
  event_count total{}; ///< Sum of all samples.
  event_count best{};  ///< Sample with the smallest elapsed time.
  event_count worst{}; ///< Sample with the largest elapsed time.

  event_aggregate() = default;

  /// Add one sample.
  void operator<<(const event_count &other) {
    const bool first = (iterations == 0);
    if (first || other.elapsed < best.elapsed) {
      best = other;
    }
    if (first || other.elapsed > worst.elapsed) {
      worst = other;
    }
    iterations++;
    total += other;
  }

  // Averages divide by `iterations`; they are only meaningful once at least
  // one sample has been added.
  double elapsed_sec() const { return total.elapsed_sec() / iterations; }
  double total_elapsed_ns() const { return total.elapsed_ns(); }
  double elapsed_ns() const { return total.elapsed_ns() / iterations; }
  double cycles() const { return total.cycles() / iterations; }
  double instructions() const { return total.instructions() / iterations; }
  double fastest_elapsed_ns() const { return best.elapsed_ns(); }
  double fastest_cycles() const { return best.cycles(); }
  double fastest_instructions() const { return best.instructions(); }
};
77 | event_count worst{}; 78 | 79 | event_aggregate() = default; 80 | 81 | void operator<<(const event_count &other) { 82 | if (iterations == 0 || other.elapsed < best.elapsed) { 83 | best = other; 84 | } 85 | if (iterations == 0 || other.elapsed > worst.elapsed) { 86 | worst = other; 87 | } 88 | iterations++; 89 | total += other; 90 | } 91 | 92 | double elapsed_sec() const { return total.elapsed_sec() / iterations; } 93 | double total_elapsed_ns() const { return total.elapsed_ns(); } 94 | double elapsed_ns() const { return total.elapsed_ns() / iterations; } 95 | double cycles() const { return total.cycles() / iterations; } 96 | double instructions() const { return total.instructions() / iterations; } 97 | double fastest_elapsed_ns() const { return best.elapsed_ns(); } 98 | double fastest_cycles() const { return best.cycles(); } 99 | double fastest_instructions() const { return best.instructions(); } 100 | }; 101 | 102 | struct event_collector { 103 | event_count count{}; 104 | std::chrono::time_point start_clock{}; 105 | 106 | #if defined(__linux__) 107 | LinuxEvents linux_events; 108 | event_collector() 109 | : linux_events(std::vector{ 110 | PERF_COUNT_HW_CPU_CYCLES, 111 | PERF_COUNT_HW_INSTRUCTIONS, 112 | }) {} 113 | bool has_events() { return linux_events.is_working(); } 114 | #elif __APPLE__ && __aarch64__ 115 | AppleEvents apple_events; 116 | performance_counters diff; 117 | event_collector() : diff(0) { apple_events.setup_performance_counters(); } 118 | bool has_events() { return apple_events.setup_performance_counters(); } 119 | #else 120 | event_collector() {} 121 | bool has_events() { return false; } 122 | #endif 123 | 124 | inline void start() { 125 | #if defined(__linux) 126 | linux_events.start(); 127 | #elif __APPLE__ && __aarch64__ 128 | if (has_events()) { 129 | diff = apple_events.get_counters(); 130 | } 131 | #endif 132 | start_clock = std::chrono::steady_clock::now(); 133 | } 134 | inline event_count &end() { 135 | const auto end_clock = 
#ifdef __linux__

#include <asm/unistd.h>       // for __NR_perf_event_open
#include <linux/perf_event.h> // for perf event constants
#include <sys/ioctl.h>        // for ioctl
#include <unistd.h>           // for syscall

#include <cerrno>  // for errno
#include <cstring> // for memset
#include <stdexcept>

#include <iostream>
#include <vector>

/// Counts a group of perf events for the current process.
///
/// The events in `config_vec` are opened as one perf group (user space only)
/// and are reset/enabled by start() and disabled/read by end(). Any failure
/// clears the flag reported by is_working() instead of throwing.
///
/// NOTE(review): the class owns file descriptors but is still copyable, as
/// before; copying an instance would close the descriptors twice.
template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
  int fd; // group leader; -1 while no event has been opened
  bool working;
  perf_event_attr attribs{};
  size_t num_events{};
  std::vector<uint64_t> temp_result_vec{};
  std::vector<uint64_t> ids{};
  std::vector<int> fds{}; // every descriptor we opened, closed on destruction

public:
  /// @param config_vec One perf config value (e.g. PERF_COUNT_HW_CPU_CYCLES)
  ///                   per event to count.
  explicit LinuxEvents(std::vector<int> config_vec) : fd(-1), working(true) {
    memset(&attribs, 0, sizeof(attribs));
    attribs.type = TYPE;
    attribs.size = sizeof(attribs);
    attribs.disabled = 1;
    attribs.exclude_kernel = 1;
    attribs.exclude_hv = 1;

    attribs.sample_period = 0;
    attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
    const int pid = 0;  // the current process
    const int cpu = -1; // all CPUs
    const unsigned long flags = 0;

    int group = -1; // no group
    num_events = config_vec.size();
    ids.resize(config_vec.size());
    fds.reserve(config_vec.size());
    size_t i = 0;
    for (auto config : config_vec) {
      attribs.config = config;
      const int _fd = static_cast<int>(
          syscall(__NR_perf_event_open, &attribs, pid, cpu, group, flags));
      uint64_t &id = ids[i++];
      if (_fd == -1) {
        // Nothing was opened, so there is nothing to query or to track.
        report_error("perf_event_open");
        continue;
      }
      fds.push_back(_fd);
      if (ioctl(_fd, PERF_EVENT_IOC_ID, &id) == -1) {
        report_error("ioctl(PERF_EVENT_IOC_ID)");
      }
      if (group == -1) {
        // The first event that opens becomes the group leader.
        group = _fd;
        fd = _fd;
      }
    }

    // Layout read back from the group: [nr, value0, id0, value1, id1, ...]
    temp_result_vec.resize(num_events * 2 + 1);
  }

  ~LinuxEvents() {
    // Close every event descriptor, not only the group leader.
    for (const int opened : fds) {
      close(opened);
    }
  }

  /// Reset and enable the whole group.
  inline void start() {
    if (fd != -1) {
      if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
        report_error("ioctl(PERF_EVENT_IOC_RESET)");
      }

      if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
        report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
      }
    }
  }

  /// Disable the group and copy the counter values into `results`
  /// (results[k] receives the k-th configured event). Slots beyond
  /// results.size() are skipped rather than written out of bounds.
  inline void end(std::vector<unsigned long long> &results) {
    if (fd != -1) {
      if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
        report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
      }

      if (read(fd, temp_result_vec.data(),
               temp_result_vec.size() * sizeof(uint64_t)) == -1) {
        report_error("read");
      }
    }
    // our actual results are in slots 1,3,5, ... of this structure
    for (size_t i = 1; i < temp_result_vec.size(); i += 2) {
      const size_t slot = i / 2;
      if (slot < results.size()) {
        results[slot] = temp_result_vec[i];
      }
    }
    // slots 2,4,6, ... carry the event ids; they must match what we opened
    for (size_t i = 2; i < temp_result_vec.size(); i += 2) {
      if (ids[i / 2 - 1] != temp_result_vec[i]) {
        report_error("event mismatch");
      }
    }
  }

  /// @return false once any perf call has failed.
  bool is_working() { return working; }

private:
  void report_error(const std::string &) { working = false; }
};
#endif
/// Swap targets consumed by precomp_shuffle(): entry i is the position that
/// is exchanged with position i - 1.
/// NOTE(review): the element type was lost in this copy of the file;
/// uint32_t is inferred from the `nextpos` local below. Confirm.
std::vector<uint32_t> precomputed;

/// Shuffle `storage[0 .. size)` in place using the swap targets stored in
/// `precomputed` instead of drawing random numbers.
///
/// For i = size down to 2, storage[i - 1] is exchanged with
/// storage[precomputed[i]]. The caller must therefore provide
/// precomputed[2 .. size], each value being a valid index into `storage`.
void precomp_shuffle(uint64_t *storage, uint64_t size) {
  for (size_t i = size; i > 1; i--) {
    const uint32_t nextpos = precomputed[i];
    const uint64_t tmp = storage[i - 1];   // likely in cache
    const uint64_t val = storage[nextpos]; // could be costly
    storage[i - 1] = val;
    storage[nextpos] = tmp; // you might have to read this store later
  }
}
{"naive_shuffle_pcg_2", naive_shuffle_pcg_2}, 51 | {"shuffle_pcg_2", shuffle_pcg_2}, 52 | {"shuffle_pcg_23456", shuffle_pcg_23456}, 53 | {"shuffle_chacha", shuffle_chacha}, 54 | {"naive_shuffle_chacha_2", naive_shuffle_chacha_2}, 55 | {"shuffle_chacha_2", shuffle_chacha_2}, 56 | {"shuffle_chacha_23456", shuffle_chacha_23456}}; 57 | 58 | using cpp_shuffle_function = void (*)(std::vector::iterator, 59 | std::vector::iterator, 60 | std::mt19937_64 &); 61 | 62 | using fast_cpp_shuffle_function = void (*)(std::vector::iterator, 63 | std::vector::iterator, 64 | lehmer64 &); 65 | 66 | struct named_cpp_function { 67 | std::string name; 68 | cpp_shuffle_function function; 69 | }; 70 | 71 | named_cpp_function cppfunc[] = { 72 | {"std::shuffle-mersenne", 73 | [](std::vector::iterator first, 74 | std::vector::iterator last, 75 | std::mt19937_64 &g) { std::shuffle(first, last, g); }}, 76 | {"batched_random::shuffle_2-mersenne", 77 | [](std::vector::iterator first, 78 | std::vector::iterator last, 79 | std::mt19937_64 &g) { batched_random::shuffle_2(first, last, g); }}, 80 | {"batched_random::shuffle_23456-mersenne", 81 | [](std::vector::iterator first, 82 | std::vector::iterator last, std::mt19937_64 &g) { 83 | batched_random::shuffle_23456(first, last, g); 84 | }}}; 85 | 86 | struct named_fast_cpp_function { 87 | std::string name; 88 | fast_cpp_shuffle_function function; 89 | }; 90 | 91 | named_fast_cpp_function fastcppfunc[] = { 92 | {"std::shuffle-lehmer", [](std::vector::iterator first, 93 | std::vector::iterator last, 94 | lehmer64 &g) { std::shuffle(first, last, g); }}, 95 | {"batched_random::shuffle_2-lehmer", 96 | [](std::vector::iterator first, 97 | std::vector::iterator last, 98 | lehmer64 &g) { batched_random::shuffle_2(first, last, g); }}, 99 | {"batched_random::shuffle_23456-lehmer", 100 | [](std::vector::iterator first, 101 | std::vector::iterator last, 102 | lehmer64 &g) { batched_random::shuffle_23456(first, last, g); }}}; 103 | 104 | void bench_line(std::vector 
&input) { 105 | size_t volume = input.size(); 106 | printf("%zu\t\t", volume); 107 | precomputed.resize(volume + 1); 108 | for (size_t i = 1; i < volume + 1; i++) { 109 | precomputed[i] = random_bounded_lehmer(i); 110 | } 111 | std::random_device rd; 112 | size_t min_repeat = 1; 113 | size_t min_time_ns = 1000000; // 1 ms 114 | size_t max_repeat = 100000; 115 | size_t repeat = 1; 116 | double tolerance = 1.1; 117 | if (volume * repeat < 10000) { 118 | repeat++; 119 | } 120 | std::mt19937_64 mtGenerator{rd()}; 121 | lehmer64 lehmerGenerator{rd()}; 122 | 123 | size_t counter = 0; 124 | for (auto &f : func) { 125 | pretty_print(volume * repeat, repeat * volume * sizeof(uint64_t), f.name, 126 | bench( 127 | [&input, &f, repeat]() { 128 | for (size_t r = 0; r < repeat; r++) { 129 | f.function(input.data(), input.size()); 130 | } 131 | }, 132 | min_repeat, min_time_ns, max_repeat, tolerance)); 133 | counter++; 134 | if((counter)%4 == 0) { printf(" "); } 135 | } 136 | } 137 | 138 | void bench_table(size_t start, size_t end, size_t lines) { 139 | double b = pow(double(end) / start, 1.0 / lines); 140 | printf("# for each scheme, we give the average " 141 | "time/item in ns \n"); 142 | printf("# Volume\t"); 143 | for (auto &f : fastcppfunc) { 144 | printf("\t%s", f.name.c_str()); 145 | } 146 | for (auto &f : cppfunc) { 147 | printf("\t%s", f.name.c_str()); 148 | } 149 | for (auto &f : func) { 150 | printf("\t%s", f.name.c_str()); 151 | } 152 | printf("\n"); 153 | for (double i = start; round(i) <= end; i *= b) { 154 | std::vector input(round(i)); 155 | bench_line(input); 156 | std::cout << std::endl; 157 | } 158 | } 159 | 160 | int main(int, char **) { 161 | seed(1234); 162 | bench_table(100, 150000, 15); 163 | return EXIT_SUCCESS; 164 | } 165 | -------------------------------------------------------------------------------- /gnuplot/README.md: -------------------------------------------------------------------------------- 1 | Example: 2 | 3 | ``` 4 | ./stream > stream.txt 
5 | cd gnuplot 6 | gnuplot -e "filename='../stream.txt'" plot.gnuplot 7 | ``` 8 | 9 | 10 | m2.data : LLVM 14, Apple M2 11 | 12 | icelake.data : LLVM 16, Ice Lake processor -------------------------------------------------------------------------------- /gnuplot/icelake.data: -------------------------------------------------------------------------------- 1 | # for each scheme, we give the best time/item and the average time/item in ns 2 | # Volume std::shuffle-lehmer batched_random::shuffle_2-lehmer batched_random::shuffle_23456-lehmer std::shuffle-mersenne batched_random::shuffle_2-mersenne batched_random::shuffle_23456-mersenne shuffle_lehmer naive_shuffle_lehmer_2 shuffle_lehmer_2 shuffle_lehmer_23456 shuffle_pcg naive_shuffle_pcg_2 shuffle_pcg_2 shuffle_pcg_23456 shuffle_chacha naive_shuffle_chacha_2 shuffle_chacha_2 shuffle_chacha_23456 3 | 100 1.71 1.92 1.63 1.21 2.66 2.32 2.09 1.49 16.55 11.07 9.15 3.98 4 | 163 1.68 1.79 1.46 1.10 2.63 2.16 1.90 1.37 15.91 10.18 8.25 3.47 5 | 265 1.63 1.62 1.30 1.01 2.61 1.99 1.71 1.35 15.98 10.00 8.11 3.38 6 | 432 1.56 1.54 1.22 0.97 2.55 1.91 1.62 1.28 16.04 9.89 8.04 3.37 7 | 703 1.54 1.50 1.16 0.92 2.52 1.86 1.56 1.19 16.08 9.62 7.95 3.44 8 | 1145 1.51 1.47 1.13 0.87 2.50 1.84 1.53 1.10 16.16 9.78 8.00 3.58 9 | 1864 1.50 1.46 1.08 0.82 2.47 1.81 1.50 1.05 16.17 9.76 7.95 3.61 10 | 3035 1.46 1.39 1.03 0.79 2.44 1.73 1.44 1.01 16.20 9.57 7.88 3.83 11 | 4942 1.43 1.37 1.01 0.76 2.44 1.71 1.41 0.99 16.19 9.35 7.78 3.95 12 | 8047 1.42 1.33 0.99 0.76 2.42 1.68 1.38 0.99 16.17 9.27 7.76 4.05 13 | 13104 1.41 1.31 0.98 0.78 2.41 1.66 1.36 0.97 16.14 9.08 7.73 4.09 14 | 21337 1.41 1.30 0.97 0.83 2.40 1.64 1.35 0.97 16.09 9.21 7.75 4.46 15 | 34743 1.41 1.30 0.99 0.89 2.41 1.64 1.34 0.99 16.10 9.14 7.71 4.85 16 | 56573 1.40 1.29 1.02 0.97 2.40 1.64 1.35 1.03 16.11 9.12 7.77 5.13 17 | 92120 1.46 1.69 1.07 1.03 2.40 2.07 1.37 1.08 16.12 9.63 7.74 5.31 18 | 150000 1.54 2.18 1.17 1.13 2.47 2.56 1.55 1.22 16.15 10.21 8.01 5.56 
-------------------------------------------------------------------------------- /gnuplot/icelake.datalehmer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/icelake.datalehmer.pdf -------------------------------------------------------------------------------- /gnuplot/icelake.datapcg64.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/icelake.datapcg64.pdf -------------------------------------------------------------------------------- /gnuplot/icelake.dataratio.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/icelake.dataratio.pdf -------------------------------------------------------------------------------- /gnuplot/icelakestream.data: -------------------------------------------------------------------------------- 1 | # for each scheme, we give the best time/item and the average time/item in ns 2 | # Volume std::shuffle batched_random::shuffle_2 batched_random::shuffle_2_4 batched_random::shuffle_2_4_6 precomp_shuffle shuffle shuffle_batch_2 shuffle_batch_2_4 shuffle_batch_2_4_6 shuffle_pcg64 shuffle_batch_2_pcg64 shuffle_batch_2_4_pcg64 shuffle_batch_2_4_6_pcg64 3 | 256 3.61 4.03 4.16 4.38 3.12 3.45 3.51 4.02 0.84 0.97 2.16 2.43 1.28 1.54 1.48 1.98 1.20 1.99 2.68 3.31 1.75 2.19 1.62 2.43 1.50 2.17 4 | 320 3.43 3.56 4.04 4.26 2.94 3.18 2.59 3.17 0.77 0.87 2.16 2.39 1.24 1.47 1.50 1.90 1.25 1.96 2.64 3.25 1.71 2.13 1.60 2.38 1.55 2.14 5 | 384 3.30 3.43 3.83 3.97 2.72 2.96 2.47 3.12 0.74 0.80 2.15 2.36 1.22 1.43 1.53 1.85 1.24 1.90 2.62 3.20 1.72 2.08 1.59 2.34 1.45 2.09 6 | 448 3.21 3.30 3.77 3.90 2.75 2.98 2.46 2.81 0.76 0.84 2.14 2.34 1.20 
1.40 1.53 1.82 1.21 1.85 2.60 3.17 1.68 2.05 1.57 2.30 1.51 2.07 7 | 512 3.14 3.22 3.66 3.77 2.67 2.90 2.28 2.96 0.81 0.89 2.14 2.32 1.19 1.37 1.52 1.79 1.17 1.81 2.59 3.15 1.66 2.01 1.58 2.25 1.52 2.04 8 | 576 3.09 3.15 3.67 3.82 2.51 2.69 2.42 2.71 0.68 0.74 2.12 2.30 1.18 1.35 1.50 1.77 1.25 1.85 2.59 3.12 1.65 1.98 1.53 2.21 1.57 2.05 9 | 640 3.32 3.42 3.86 4.01 2.54 2.74 2.58 2.95 0.71 0.77 2.12 2.29 1.17 1.33 1.48 1.75 1.30 1.85 2.58 3.10 1.63 1.96 1.53 2.17 1.58 2.03 10 | 704 3.26 3.36 3.84 3.98 2.46 2.63 2.27 2.52 0.70 0.73 2.12 2.28 1.16 1.32 1.49 1.73 1.32 1.84 2.58 3.08 1.63 1.94 1.48 2.13 1.58 2.00 11 | 768 3.21 3.29 3.72 3.86 2.46 2.68 2.40 2.67 0.71 0.78 2.11 2.27 1.16 1.30 1.48 1.72 1.35 1.82 2.57 3.07 1.62 1.92 1.43 2.10 1.58 1.98 12 | 832 3.17 3.24 3.71 3.84 2.43 2.63 2.22 2.43 0.70 0.75 2.10 2.26 1.15 1.29 1.47 1.71 1.35 1.80 2.56 3.05 1.61 1.90 1.44 2.07 1.56 1.96 13 | 896 3.14 3.21 3.66 3.79 2.42 2.57 2.38 2.61 0.77 0.79 2.11 2.25 1.16 1.28 1.48 1.70 1.34 1.79 2.55 3.04 1.60 1.89 1.42 2.04 1.56 1.95 14 | 960 3.11 3.20 3.62 3.74 2.38 2.57 2.47 2.71 0.71 0.76 2.11 2.24 1.14 1.27 1.47 1.69 1.41 1.78 2.55 3.03 1.60 1.87 1.41 2.02 1.60 1.94 15 | 1024 3.08 3.14 3.60 3.72 2.42 2.60 2.26 2.55 0.70 0.73 2.11 2.24 1.15 1.26 1.45 1.68 1.39 1.77 2.55 3.02 1.59 1.87 1.41 2.00 1.60 1.93 16 | 1088 3.05 3.11 3.56 3.65 2.35 2.50 2.15 2.40 0.71 0.73 2.11 2.23 1.14 1.26 1.44 1.67 1.40 1.76 2.55 3.01 1.59 1.85 1.41 1.97 1.58 1.92 17 | 1152 3.03 3.10 3.59 3.79 2.36 2.53 2.37 2.68 0.67 0.70 2.10 2.23 1.14 1.25 1.45 1.66 1.42 1.81 2.55 3.01 1.60 1.87 1.39 2.02 1.61 1.99 18 | 1216 3.01 3.07 3.54 3.78 2.39 2.63 2.42 2.73 0.70 0.72 2.10 2.22 1.14 1.24 1.45 1.65 1.44 1.82 2.54 3.00 1.60 1.86 1.38 2.01 1.59 2.00 19 | 1280 3.16 3.22 3.67 3.84 2.53 2.91 2.21 2.59 0.66 0.70 2.11 2.22 1.14 1.24 1.43 1.64 1.42 1.81 2.55 3.00 1.59 1.86 1.39 2.00 1.60 2.00 20 | 1344 3.11 3.17 3.61 3.89 2.39 2.58 2.26 2.56 0.69 0.74 2.09 2.21 1.13 1.23 1.44 1.64 1.42 1.81 2.54 3.00 1.58 1.85 1.40 
1.99 1.61 2.00 21 | 1408 3.11 3.17 3.58 3.84 2.52 2.85 2.38 2.61 0.66 0.69 2.10 2.21 1.13 1.23 1.41 1.63 1.44 1.81 2.54 2.99 1.58 1.85 1.39 1.98 1.60 2.00 22 | 1472 3.08 3.15 3.58 3.73 2.40 2.66 2.41 2.74 0.67 0.69 2.10 2.20 1.13 1.22 1.42 1.62 1.44 1.81 2.54 2.98 1.60 1.84 1.35 1.97 1.62 2.00 23 | 1536 3.06 3.11 3.59 3.77 2.45 2.72 2.22 2.44 0.69 0.71 2.10 2.20 1.14 1.22 1.43 1.62 1.45 1.81 2.54 2.98 1.58 1.84 1.39 1.97 1.62 2.00 24 | 1600 3.04 3.10 3.56 3.78 2.39 2.68 2.27 2.51 0.71 0.74 2.09 2.20 1.13 1.22 1.41 1.61 1.47 1.80 2.54 2.98 1.58 1.83 1.39 1.96 1.62 2.00 25 | 1664 3.05 3.10 3.54 3.82 2.39 2.54 2.37 2.64 0.68 0.70 2.10 2.19 1.13 1.21 1.42 1.61 1.42 1.80 2.54 2.97 1.58 1.83 1.39 1.95 1.62 2.00 26 | 1728 3.02 3.07 3.54 3.80 2.49 2.68 2.31 2.56 0.68 0.70 2.09 2.19 1.13 1.21 1.35 1.60 1.46 1.80 2.53 2.97 1.58 1.83 1.35 1.94 1.62 2.00 27 | 1792 3.01 3.07 3.53 3.74 2.42 2.64 2.32 2.69 0.68 0.71 2.09 2.19 1.13 1.21 1.43 1.60 1.45 1.80 2.54 2.96 1.58 1.82 1.38 1.94 1.60 2.00 28 | 1856 2.99 3.08 3.48 3.68 2.38 2.56 2.30 2.55 0.67 0.68 2.09 2.18 1.13 1.20 1.42 1.59 1.47 1.80 2.54 2.96 1.59 1.82 1.30 1.93 1.62 2.00 29 | 1920 3.07 3.14 3.56 3.70 2.34 2.53 2.33 2.64 0.67 0.70 2.09 2.18 1.12 1.20 1.42 1.59 1.45 1.79 2.54 2.96 1.56 1.81 1.38 1.92 1.61 2.00 30 | 1984 3.08 3.13 3.57 3.69 2.40 2.71 2.32 2.66 0.68 0.69 2.10 2.18 1.12 1.20 1.41 1.58 1.47 1.79 2.54 2.95 1.58 1.81 1.37 1.91 1.63 1.99 31 | 2048 3.05 3.11 3.59 3.77 2.33 2.56 2.30 2.75 0.69 0.71 2.09 2.18 1.13 1.20 1.42 1.58 1.46 1.78 2.53 2.95 1.58 1.81 1.36 1.91 1.62 1.99 32 | 2112 3.04 3.09 3.55 3.72 2.35 2.50 2.36 2.86 0.66 0.68 2.09 2.18 1.12 1.19 1.41 1.57 1.48 1.78 2.54 2.95 1.59 1.80 1.36 1.90 1.61 1.99 33 | 2176 3.03 3.09 3.53 3.71 2.38 2.53 2.27 2.56 0.67 0.70 2.09 2.17 1.13 1.19 1.28 1.57 1.48 1.78 2.54 2.95 1.58 1.80 1.37 1.89 1.63 1.98 34 | 2240 3.02 3.06 3.54 3.66 2.31 2.59 2.34 2.70 0.65 0.68 2.09 2.17 1.12 1.19 1.40 1.56 1.47 1.77 2.54 2.94 1.58 1.80 1.35 1.89 1.62 1.98 35 | 2304 3.00 3.06 3.52 
3.82 2.28 2.45 2.21 2.46 0.66 0.68 2.09 2.17 1.12 1.19 1.41 1.56 1.48 1.78 2.53 2.94 1.57 1.80 1.34 1.88 1.63 1.98 36 | 2368 3.03 3.08 3.50 3.62 2.27 2.43 2.34 2.61 0.65 0.68 2.09 2.17 1.12 1.19 1.35 1.56 1.47 1.77 2.54 2.94 1.56 1.79 1.36 1.87 1.62 1.98 37 | 2432 3.00 3.04 3.52 3.73 2.28 2.45 2.22 2.60 0.67 0.70 2.09 2.16 1.12 1.18 1.33 1.55 1.46 1.77 2.54 2.94 1.57 1.79 1.33 1.87 1.63 1.97 38 | 2496 2.99 3.03 3.47 3.66 2.29 2.65 2.23 2.44 0.65 0.68 2.08 2.16 1.12 1.18 1.40 1.55 1.48 1.76 2.53 2.93 1.57 1.79 1.33 1.86 1.61 1.97 39 | 2560 3.05 3.11 3.55 3.73 2.38 2.57 2.24 2.51 0.65 0.67 2.09 2.16 1.12 1.18 1.41 1.55 1.48 1.76 2.53 2.93 1.57 1.78 1.32 1.86 1.62 1.97 40 | 2624 3.06 3.12 3.56 3.74 2.36 2.65 2.25 2.54 0.66 0.67 2.08 2.16 1.12 1.18 1.40 1.54 1.45 1.76 2.53 2.93 1.56 1.78 1.35 1.85 1.63 1.96 41 | 2688 3.05 3.11 3.55 3.67 2.38 2.57 2.35 2.64 0.67 0.69 2.09 2.16 1.12 1.18 1.41 1.54 1.47 1.75 2.53 2.93 1.57 1.78 1.34 1.84 1.63 1.96 42 | 2752 3.02 3.08 3.55 3.71 2.32 2.49 2.26 2.51 0.68 0.69 2.09 2.16 1.12 1.18 1.41 1.54 1.48 1.75 2.53 2.92 1.56 1.78 1.34 1.84 1.63 1.96 43 | 2816 3.00 3.06 3.54 3.76 2.33 2.52 2.30 2.62 0.66 0.67 2.08 2.16 1.12 1.17 1.40 1.54 1.48 1.75 2.53 2.92 1.56 1.77 1.34 1.83 1.63 1.96 44 | 2880 3.00 3.06 3.55 3.67 2.30 2.45 2.32 2.80 0.68 0.71 2.08 2.15 1.12 1.17 1.29 1.53 1.47 1.75 2.53 2.92 1.57 1.77 1.35 1.83 1.63 1.95 45 | 2944 2.99 3.05 3.50 3.69 2.36 2.50 2.36 2.68 0.67 0.70 2.09 2.15 1.12 1.17 1.31 1.53 1.49 1.75 2.53 2.92 1.57 1.77 1.31 1.82 1.62 1.95 46 | 3008 2.99 3.04 3.51 3.69 2.31 2.45 2.21 2.51 0.67 0.71 2.09 2.15 1.12 1.17 1.32 1.53 1.50 1.75 2.53 2.91 1.56 1.77 1.31 1.82 1.63 1.95 47 | 3072 2.98 3.02 3.49 3.62 2.31 2.46 2.30 2.63 0.68 0.71 2.09 2.15 1.12 1.17 1.33 1.52 1.48 1.74 2.53 2.91 1.57 1.76 1.31 1.81 1.63 1.94 48 | 3136 3.04 3.10 3.56 3.68 2.33 2.47 2.25 2.51 0.67 0.70 2.08 2.15 1.12 1.17 1.29 1.52 1.49 1.74 2.53 2.91 1.56 1.76 1.33 1.81 1.62 1.94 49 | 3200 3.02 3.07 3.57 3.76 2.28 2.51 2.26 2.55 0.68 0.70 2.08 
2.15 1.12 1.17 1.34 1.52 1.49 1.74 2.52 2.91 1.56 1.76 1.31 1.80 1.63 1.94 50 | 3264 3.08 3.15 3.52 3.61 2.25 2.35 2.23 2.40 0.68 0.70 2.09 2.15 1.12 1.17 1.30 1.52 1.47 1.60 2.53 2.90 1.56 1.71 1.31 1.68 1.60 1.76 51 | 3328 3.02 3.07 3.50 3.60 2.26 2.36 2.22 2.38 0.69 0.73 2.09 2.15 1.11 1.17 1.40 1.52 1.46 1.60 2.52 2.90 1.56 1.71 1.29 1.68 1.60 1.75 52 | 3392 3.01 3.06 3.50 3.58 2.25 2.37 2.27 2.47 0.68 0.70 2.08 2.15 1.12 1.17 1.40 1.52 1.47 1.61 2.53 2.89 1.54 1.71 1.28 1.67 1.61 1.76 53 | 3456 3.02 3.07 3.51 3.60 2.23 2.38 2.26 2.49 0.69 0.71 2.08 2.15 1.12 1.17 1.41 1.52 1.47 1.61 2.52 2.89 1.55 1.71 1.30 1.67 1.60 1.76 54 | 3520 3.00 3.11 3.54 3.68 2.25 2.38 2.19 2.39 0.68 0.71 2.08 2.15 1.11 1.16 1.33 1.52 1.46 1.61 2.53 2.89 1.56 1.71 1.29 1.67 1.61 1.76 55 | 3584 3.04 3.14 3.49 3.58 2.22 2.35 2.19 2.39 0.68 0.70 2.09 2.15 1.11 1.16 1.23 1.52 1.46 1.61 2.53 2.89 1.56 1.71 1.30 1.67 1.60 1.76 56 | 3648 2.99 3.03 3.50 3.59 2.26 2.42 2.20 2.44 0.67 0.70 2.08 2.14 1.11 1.16 1.29 1.52 1.47 1.61 2.53 2.89 1.56 1.71 1.29 1.66 1.60 1.76 57 | 3712 2.99 3.03 3.50 3.60 2.24 2.37 2.22 2.42 0.68 0.71 2.08 2.14 1.11 1.16 1.40 1.51 1.46 1.61 2.53 2.89 1.56 1.71 1.31 1.66 1.61 1.77 58 | 3776 3.02 3.07 3.54 3.64 2.27 2.39 2.20 2.45 0.70 0.73 2.09 2.14 1.11 1.16 1.31 1.51 1.47 1.61 2.53 2.89 1.56 1.71 1.30 1.66 1.60 1.77 59 | 3840 3.11 3.18 3.52 3.62 2.29 2.43 2.20 2.40 0.68 0.73 2.09 2.14 1.11 1.16 1.26 1.51 1.46 1.61 2.53 2.89 1.55 1.71 1.29 1.66 1.61 1.77 60 | 3904 3.09 3.18 3.50 3.59 2.30 2.45 2.21 2.42 0.69 0.72 2.09 2.14 1.11 1.16 1.40 1.51 1.48 1.61 2.53 2.89 1.55 1.71 1.29 1.66 1.61 1.77 61 | 3968 3.03 3.11 3.50 3.61 2.29 2.42 2.32 2.56 0.68 0.70 2.08 2.14 1.11 1.16 1.30 1.51 1.47 1.61 2.53 2.89 1.55 1.71 1.30 1.66 1.59 1.77 62 | 4032 3.04 3.11 3.48 3.61 2.28 2.40 2.23 2.43 0.70 0.73 2.09 2.14 1.11 1.16 1.33 1.51 1.47 1.61 2.53 2.89 1.55 1.71 1.27 1.66 1.61 1.77 63 | 4096 3.01 3.08 3.57 3.70 2.25 2.41 2.28 2.50 0.68 0.70 2.09 2.14 1.11 1.16 1.35 1.51 1.48 1.61 2.53 
2.89 1.56 1.71 1.29 1.66 1.61 1.77 64 | 4160 3.07 3.19 3.49 3.58 2.25 2.44 2.22 2.40 0.70 0.72 2.08 2.14 1.11 1.16 1.27 1.51 1.47 1.61 2.53 2.89 1.55 1.71 1.29 1.65 1.60 1.77 65 | 4224 2.99 3.06 3.51 3.69 2.31 2.50 2.32 2.52 0.72 0.76 2.08 2.13 1.11 1.15 1.26 1.49 1.50 1.71 2.53 2.89 1.56 1.74 1.28 1.74 1.63 1.91 66 | 4288 2.98 3.03 3.52 3.65 2.32 2.48 2.28 2.52 0.73 0.77 2.08 2.13 1.11 1.15 1.23 1.49 1.50 1.70 2.53 2.89 1.56 1.74 1.31 1.74 1.63 1.90 67 | 4352 2.98 3.02 3.49 3.60 2.31 2.49 2.30 2.57 0.73 0.77 2.08 2.13 1.11 1.15 1.31 1.49 1.48 1.71 2.53 2.89 1.56 1.73 1.31 1.74 1.64 1.90 68 | 4416 3.01 3.06 3.54 3.67 2.28 2.47 2.20 2.51 0.71 0.75 2.08 2.13 1.11 1.15 1.24 1.49 1.49 1.70 2.53 2.89 1.55 1.73 1.30 1.74 1.64 1.90 69 | 4480 3.01 3.06 3.53 3.68 2.30 2.42 2.29 2.55 0.70 0.72 2.08 2.13 1.11 1.15 1.24 1.49 1.49 1.70 2.53 2.89 1.56 1.73 1.30 1.73 1.63 1.90 70 | 4544 3.01 3.07 3.51 3.63 2.25 2.41 2.23 2.50 0.73 0.78 2.08 2.13 1.11 1.15 1.31 1.49 1.50 1.70 2.53 2.89 1.55 1.73 1.31 1.73 1.63 1.90 71 | 4608 3.01 3.06 3.51 3.65 2.26 2.42 2.23 2.42 0.71 0.74 2.08 2.13 1.11 1.15 1.25 1.49 1.50 1.70 2.53 2.88 1.56 1.73 1.30 1.73 1.63 1.90 72 | 4672 3.00 3.05 3.50 3.60 2.27 2.47 2.21 2.42 0.71 0.74 2.08 2.13 1.11 1.15 1.27 1.49 1.48 1.70 2.53 2.88 1.56 1.73 1.31 1.73 1.64 1.89 73 | 4736 2.99 3.06 3.50 3.64 2.29 2.44 2.26 2.46 0.70 0.73 2.08 2.13 1.11 1.15 1.29 1.49 1.50 1.70 2.53 2.88 1.56 1.73 1.31 1.73 1.63 1.89 74 | 4800 3.01 3.06 3.51 3.62 2.25 2.40 2.24 2.50 0.71 0.73 2.08 2.13 1.11 1.15 1.38 1.49 1.50 1.69 2.53 2.88 1.56 1.73 1.32 1.72 1.64 1.89 75 | 4864 3.00 3.05 3.49 3.60 2.27 2.45 2.22 2.44 0.70 0.75 2.08 2.13 1.11 1.15 1.31 1.49 1.50 1.70 2.53 2.88 1.56 1.73 1.31 1.72 1.64 1.89 76 | 4928 3.00 3.06 3.48 3.65 2.23 2.42 2.22 2.40 0.70 0.73 2.09 2.13 1.11 1.15 1.30 1.49 1.51 1.69 2.53 2.88 1.56 1.73 1.32 1.72 1.63 1.89 77 | 4992 2.97 3.02 3.50 3.63 2.26 2.40 2.25 2.46 0.72 0.76 2.08 2.13 1.11 1.15 1.26 1.49 1.51 1.69 2.53 2.88 1.57 1.73 1.32 1.72 1.63 1.89 78 | 
5056 3.01 3.09 3.52 3.63 2.30 2.48 2.23 2.44 0.73 0.77 2.08 2.13 1.11 1.15 1.36 1.49 1.51 1.69 2.53 2.88 1.55 1.73 1.30 1.72 1.64 1.89 79 | 5120 3.02 3.07 3.52 3.72 2.32 2.50 2.24 2.57 0.70 0.72 2.09 2.13 1.11 1.15 1.22 1.48 1.48 1.69 2.53 2.88 1.57 1.73 1.31 1.72 1.62 1.89 80 | 5184 3.00 3.05 3.52 3.70 2.29 2.45 2.26 2.51 0.71 0.73 2.09 2.13 1.11 1.15 1.30 1.48 1.50 1.69 2.53 2.88 1.56 1.73 1.31 1.72 1.64 1.89 81 | 5248 3.01 3.06 3.50 3.68 2.27 2.41 2.28 2.51 0.71 0.75 2.09 2.13 1.11 1.15 1.27 1.49 1.51 1.69 2.53 2.88 1.56 1.73 1.29 1.71 1.64 1.88 82 | 5312 2.99 3.04 3.49 3.60 2.24 2.35 2.26 2.48 0.72 0.76 2.08 2.13 1.11 1.15 1.22 1.48 1.48 1.69 2.53 2.88 1.57 1.72 1.31 1.71 1.64 1.88 83 | 5376 3.01 3.06 3.51 3.62 2.33 2.56 2.33 2.63 0.71 0.74 2.09 2.13 1.11 1.15 1.26 1.48 1.49 1.69 2.53 2.88 1.56 1.72 1.31 1.71 1.65 1.88 84 | 5440 2.98 3.02 3.50 3.66 2.29 2.50 2.26 2.47 0.72 0.75 2.09 2.13 1.11 1.15 1.20 1.48 1.50 1.69 2.53 2.88 1.56 1.72 1.31 1.71 1.65 1.88 85 | 5504 2.99 3.04 3.50 3.62 2.27 2.42 2.27 2.51 0.74 0.75 2.08 2.13 1.11 1.15 1.22 1.48 1.51 1.68 2.53 2.88 1.57 1.72 1.30 1.70 1.66 1.88 86 | 5568 2.98 3.03 3.49 3.64 2.28 2.48 2.24 2.45 0.73 0.75 2.08 2.13 1.11 1.15 1.23 1.48 1.50 1.68 2.54 2.87 1.56 1.72 1.29 1.70 1.66 1.88 87 | 5632 3.02 3.06 3.53 3.67 2.28 2.44 2.26 2.55 0.72 0.75 2.08 2.13 1.11 1.15 1.22 1.48 1.50 1.68 2.53 2.87 1.56 1.72 1.30 1.70 1.67 1.88 88 | 5696 3.02 3.09 3.53 3.70 2.24 2.36 2.22 2.43 0.71 0.77 2.08 2.13 1.11 1.15 1.23 1.48 1.49 1.68 2.53 2.87 1.56 1.72 1.30 1.70 1.65 1.88 89 | 5760 3.00 3.05 3.51 3.69 2.27 2.45 2.24 2.55 0.72 0.75 2.08 2.13 1.11 1.15 1.19 1.48 1.50 1.68 2.53 2.87 1.56 1.72 1.29 1.70 1.66 1.87 90 | 5824 3.00 3.05 3.50 3.65 2.27 2.42 2.26 2.55 0.72 0.75 2.09 2.13 1.11 1.18 1.32 1.48 1.49 1.61 2.53 2.87 1.56 1.70 1.29 1.63 1.63 1.78 91 | 5888 2.99 3.03 3.50 3.67 2.24 2.40 2.21 2.40 0.70 0.71 2.09 2.13 1.11 1.15 1.20 1.48 1.49 1.61 2.53 2.87 1.55 1.70 1.28 1.63 1.62 1.78 92 | 5952 3.01 3.06 3.48 3.57 2.24 2.38 2.21 
2.39 0.72 0.73 2.09 2.13 1.11 1.15 1.20 1.48 1.49 1.61 2.53 2.87 1.56 1.70 1.29 1.63 1.64 1.78 93 | 6016 2.99 3.04 3.48 3.61 2.22 2.32 2.20 2.37 0.71 0.73 2.08 2.13 1.11 1.15 1.20 1.49 1.49 1.61 2.52 2.87 1.57 1.70 1.29 1.63 1.62 1.78 94 | 6080 2.98 3.02 3.48 3.56 2.25 2.41 2.21 2.39 0.72 0.73 2.09 2.13 1.11 1.15 1.23 1.48 1.49 1.61 2.53 2.86 1.57 1.70 1.28 1.63 1.62 1.78 95 | 6144 2.98 3.03 3.48 3.58 2.23 2.34 2.20 2.40 0.71 0.73 2.08 2.13 1.11 1.15 1.25 1.48 1.49 1.61 2.52 2.87 1.56 1.70 1.31 1.63 1.64 1.78 96 | 6208 2.99 3.04 3.46 3.54 2.24 2.35 2.19 2.36 0.71 0.73 2.08 2.13 1.11 1.15 1.25 1.48 1.49 1.61 2.53 2.86 1.56 1.70 1.32 1.63 1.63 1.78 97 | 6272 3.01 3.06 3.50 3.59 2.27 2.37 2.23 2.45 0.71 0.72 2.09 2.13 1.11 1.15 1.21 1.48 1.49 1.61 2.52 2.87 1.57 1.70 1.30 1.63 1.63 1.78 98 | 6336 3.01 3.07 3.49 3.59 2.27 2.37 2.22 2.41 0.73 0.74 2.09 2.13 1.11 1.15 1.22 1.48 1.50 1.61 2.53 2.86 1.57 1.70 1.30 1.63 1.62 1.78 99 | 6400 3.00 3.05 3.50 3.60 2.25 2.36 2.22 2.40 0.71 0.73 2.09 2.13 1.11 1.14 1.24 1.48 1.50 1.61 2.53 2.86 1.56 1.70 1.29 1.63 1.64 1.78 100 | 6464 2.99 3.05 3.49 3.59 2.26 2.38 2.23 2.45 0.71 0.75 2.09 2.13 1.11 1.14 1.23 1.48 1.50 1.61 2.52 2.86 1.56 1.70 1.30 1.63 1.63 1.78 101 | 6528 2.99 3.04 3.47 3.56 2.24 2.34 2.22 2.39 0.73 0.75 2.08 2.13 1.11 1.14 1.27 1.48 1.49 1.61 2.53 2.86 1.56 1.70 1.31 1.63 1.63 1.78 102 | 6592 2.99 3.05 3.50 3.60 2.24 2.34 2.20 2.40 0.71 0.72 2.09 2.13 1.11 1.14 1.25 1.48 1.50 1.61 2.53 2.86 1.56 1.70 1.31 1.63 1.64 1.78 103 | 6656 2.98 3.03 3.48 3.57 2.25 2.35 2.23 2.44 0.71 0.72 2.09 2.13 1.11 1.14 1.23 1.48 1.50 1.61 2.53 2.86 1.56 1.70 1.31 1.63 1.64 1.78 104 | 6720 2.98 3.03 3.48 3.58 2.23 2.34 2.21 2.41 0.71 0.74 2.09 2.13 1.11 1.14 1.24 1.48 1.51 1.61 2.53 2.86 1.57 1.69 1.30 1.63 1.63 1.78 105 | 6784 2.99 3.04 3.48 3.58 2.25 2.38 2.20 2.37 0.71 0.73 2.09 2.13 1.11 1.14 1.28 1.48 1.50 1.61 2.52 2.87 1.56 1.69 1.30 1.63 1.64 1.78 106 | 6848 2.98 3.02 3.47 3.58 2.28 2.39 2.21 2.38 0.71 0.73 2.09 2.13 1.11 
1.14 1.40 1.48 1.50 1.61 2.53 2.86 1.57 1.69 1.31 1.63 1.64 1.78 107 | 6912 3.00 3.05 3.49 3.60 2.24 2.35 2.24 2.42 0.73 0.74 2.09 2.13 1.11 1.14 1.23 1.48 1.49 1.61 2.53 2.86 1.57 1.69 1.30 1.62 1.64 1.78 108 | 6976 3.00 3.04 3.51 3.63 2.23 2.34 2.22 2.47 0.72 0.74 2.09 2.13 1.11 1.14 1.30 1.48 1.51 1.61 2.52 2.86 1.57 1.69 1.30 1.62 1.64 1.78 109 | 7040 3.01 3.05 3.49 3.60 2.27 2.41 2.22 2.46 0.72 0.74 2.08 2.12 1.11 1.14 1.25 1.48 1.50 1.61 2.53 2.86 1.57 1.69 1.31 1.62 1.63 1.78 110 | 7104 2.99 3.06 3.49 3.59 2.24 2.37 2.23 2.49 0.72 0.75 2.09 2.12 1.11 1.14 1.27 1.48 1.51 1.61 2.52 2.86 1.57 1.69 1.31 1.62 1.63 1.78 111 | 7168 2.99 3.03 3.49 3.60 2.24 2.34 2.21 2.42 0.72 0.74 2.09 2.12 1.11 1.14 1.20 1.48 1.50 1.61 2.53 2.86 1.56 1.69 1.31 1.62 1.63 1.78 112 | 7232 2.99 3.06 3.48 3.58 2.22 2.33 2.23 2.42 0.72 0.75 2.08 2.12 1.11 1.14 1.28 1.48 1.50 1.61 2.52 2.86 1.56 1.69 1.30 1.62 1.64 1.78 113 | 7296 2.98 3.03 3.47 3.58 2.22 2.33 2.21 2.38 0.72 0.74 2.09 2.12 1.11 1.14 1.40 1.48 1.50 1.61 2.53 2.86 1.57 1.69 1.30 1.62 1.65 1.78 114 | 7360 2.99 3.04 3.48 3.57 2.23 2.33 2.20 2.39 0.73 0.76 2.08 2.12 1.11 1.14 1.40 1.48 1.50 1.61 2.53 2.86 1.57 1.69 1.29 1.62 1.63 1.78 115 | 7424 2.98 3.02 3.47 3.58 2.21 2.31 2.22 2.47 0.73 0.76 2.09 2.12 1.11 1.14 1.40 1.48 1.50 1.61 2.53 2.86 1.56 1.69 1.30 1.62 1.64 1.78 116 | 7488 2.98 3.02 3.47 3.56 2.22 2.33 2.20 2.37 0.74 0.77 2.09 2.12 1.11 1.14 1.31 1.48 1.50 1.61 2.52 2.86 1.57 1.69 1.31 1.62 1.64 1.78 117 | 7552 2.99 3.04 3.50 3.60 2.24 2.34 2.20 2.38 0.72 0.74 2.09 2.12 1.11 1.14 1.37 1.48 1.50 1.61 2.53 2.86 1.56 1.69 1.30 1.62 1.64 1.78 118 | 7616 3.00 3.04 3.50 3.62 2.23 2.35 2.19 2.36 0.73 0.77 2.09 2.12 1.11 1.14 1.40 1.48 1.50 1.61 2.52 2.86 1.56 1.69 1.31 1.61 1.63 1.78 119 | 7680 3.00 3.05 3.49 3.59 2.27 2.42 2.22 2.42 0.74 0.76 2.09 2.12 1.11 1.14 1.27 1.48 1.51 1.61 2.53 2.86 1.57 1.69 1.29 1.61 1.64 1.78 120 | 7744 3.00 3.06 3.49 3.61 2.24 2.39 2.23 2.41 0.72 0.74 2.08 2.12 1.11 1.14 1.24 1.48 1.51 1.61 
2.53 2.86 1.56 1.69 1.31 1.61 1.63 1.78 121 | 7808 2.98 3.03 3.50 3.58 2.25 2.36 2.22 2.41 0.72 0.74 2.09 2.12 1.11 1.14 1.35 1.47 1.50 1.60 2.53 2.85 1.57 1.69 1.29 1.61 1.64 1.78 122 | 7872 2.98 3.03 3.48 3.56 2.25 2.37 2.22 2.41 0.73 0.75 2.08 2.12 1.11 1.14 1.38 1.47 1.49 1.61 2.53 2.85 1.57 1.69 1.28 1.61 1.64 1.78 123 | 7936 2.99 3.03 3.48 3.58 2.24 2.34 2.24 2.47 0.73 0.75 2.08 2.12 1.11 1.14 1.35 1.47 1.50 1.61 2.52 2.85 1.57 1.69 1.30 1.61 1.64 1.78 124 | 8000 2.98 3.03 3.47 3.57 2.23 2.34 2.21 2.41 0.74 0.76 2.08 2.12 1.11 1.14 1.27 1.47 1.51 1.60 2.52 2.85 1.56 1.69 1.31 1.61 1.65 1.78 125 | 8064 2.98 3.02 -------------------------------------------------------------------------------- /gnuplot/m2.data: -------------------------------------------------------------------------------- 1 | # for each scheme, we give the best time/item and the average time/item in ns 2 | # Volume std::shuffle-lehmer batched_random::shuffle_2-lehmer batched_random::shuffle_23456-lehmer std::shuffle-mersenne batched_random::shuffle_2-mersenne batched_random::shuffle_23456-mersenne shuffle_lehmer naive_shuffle_lehmer_2 shuffle_lehmer_2 shuffle_lehmer_23456 shuffle_pcg naive_shuffle_pcg_2 shuffle_pcg_2 shuffle_pcg_23456 shuffle_chacha naive_shuffle_chacha_2 shuffle_chacha_2 shuffle_chacha_23456 3 | 100 1.79 1.72 1.40 1.20 2.54 2.02 1.84 1.35 7.95 5.03 4.70 2.47 4 | 163 1.68 1.50 1.26 1.03 2.16 1.75 1.55 1.17 7.76 4.73 4.44 2.24 5 | 265 1.56 1.44 1.09 0.90 2.08 1.59 1.49 1.04 7.69 4.55 4.29 2.08 6 | 432 1.47 1.22 0.99 0.81 1.97 1.45 1.28 0.92 7.66 4.54 4.29 1.97 7 | 703 1.40 1.11 0.90 0.73 1.93 1.32 1.18 0.83 7.60 4.32 4.23 1.93 8 | 1145 1.34 1.03 0.84 0.65 1.86 1.24 1.11 0.76 7.72 4.33 4.07 2.00 9 | 1864 1.30 0.96 0.80 0.60 1.80 1.22 1.06 0.70 7.58 4.19 4.04 1.98 10 | 3035 1.29 0.95 0.77 0.57 1.73 1.16 1.02 0.67 7.75 4.17 4.00 2.07 11 | 4942 1.28 0.90 0.75 0.57 1.70 1.09 1.00 0.67 7.55 4.32 3.98 2.08 12 | 8047 1.26 0.84 0.73 0.55 1.71 1.11 0.97 0.65 7.57 4.21 4.03 2.19 13 | 
13104 1.23 0.83 0.73 0.54 1.68 1.07 1.02 0.65 7.72 4.13 3.98 2.30 14 | 21337 1.24 0.82 0.71 0.56 1.69 1.11 1.04 0.68 7.71 4.29 4.23 2.51 15 | 34743 1.24 0.84 0.76 0.59 1.69 1.08 0.95 0.70 7.71 4.27 4.11 2.63 16 | 56573 1.24 0.84 0.75 0.70 1.67 1.08 0.96 0.74 7.81 4.35 4.17 2.89 17 | 92120 1.22 0.87 0.82 0.73 1.68 1.04 0.99 0.80 7.90 4.63 4.31 2.95 18 | 150000 1.23 0.88 0.83 0.81 1.68 1.05 0.97 0.83 7.93 4.75 4.27 3.04 19 | -------------------------------------------------------------------------------- /gnuplot/m2.datalehmer.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/m2.datalehmer.pdf -------------------------------------------------------------------------------- /gnuplot/m2.datapcg64.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/m2.datapcg64.pdf -------------------------------------------------------------------------------- /gnuplot/m2.dataratio.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/m2.dataratio.pdf -------------------------------------------------------------------------------- /gnuplot/m2stream.data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/m2stream.data -------------------------------------------------------------------------------- /gnuplot/plot.gnuplot: -------------------------------------------------------------------------------- 1 | # gnuplot -e "filename='foo.data'" plot.gnuplot 2 | 3 | set term pdf 4 | set ylabel "time per item (ns)" 5 | set xlabel "number of entries" 6 | 
stats filename using 1 nooutput name 'X_' 7 | set style fill border 8 | 9 | # Access the min and max X-values 10 | xmin = X_min 11 | xmax = X_max 12 | set xrange [xmin:xmax] 13 | set yrange [0:] 14 | set key bottom center 15 | set logscale x 2 16 | set format x "2^{%L}" 17 | 18 | 19 | set out filename . "lehmer.pdf" 20 | 21 | plot filename using 1:2 with lines lw 5 title 'shuffle' , \ 22 | "" using 1:3 with lines lw 5 title 'naive shuffle\_2' , \ 23 | "" using 1:4 with lines lw 5 title 'shuffle\_2' , \ 24 | "" using 1:5 with lines lw 5 title 'shuffle\_6' 25 | 26 | set out filename . "pcg64.pdf" 27 | 28 | plot filename using 1:6 with lines lw 5 title 'shuffle' , \ 29 | "" using 1:7 with lines lw 5 title 'naive shuffle\_2' ,\ 30 | "" using 1:8 with lines lw 5 title 'shuffle\_2' , \ 31 | "" using 1:9 with lines lw 5 title 'shuffle\_6' 32 | 33 | set out filename . "chacha.pdf" 34 | 35 | plot filename using 1:10 with lines lw 5 title 'shuffle' , \ 36 | "" using 1:11 with lines lw 5 title 'naive shuffle\_2' , \ 37 | "" using 1:12 with lines lw 5 title 'shuffle\_2' , \ 38 | "" using 1:13 with lines lw 5 title 'shuffle\_6' 39 | 40 | 41 | set ylabel "speed ratio (shuffle\\\_6/shuffle)" 42 | set yrange [1:5] 43 | set out filename . "ratio.pdf" 44 | 45 | plot filename using 1:($10/$13) with lines lw 5 title 'ChaCha', \ 46 | "" using 1:($6/$9) with lines lw 5 title 'PCG64', \ 47 | "" using 1:($2/$5) with lines lw 5 title 'Lehmer' -------------------------------------------------------------------------------- /include/partial-shuffle-inl.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * This header contains C++ helper templates. They are not meant for end users. 4 | */ 5 | #ifndef PARTIAL_SHUFFLE_INL_H 6 | #define PARTIAL_SHUFFLE_INL_H 7 | 8 | #include 9 | #include 10 | 11 | namespace batched_random { 12 | 13 | // Performs k steps of a Fisher-Yates shuffle on n elements, in the array 14 | // `storage`. 
15 | // 16 | // Preconditions: 17 | // n >= k >= 1 18 | // bound >= n*(n-1)*...*(n-(k-1)), which must not overflow 19 | // g() produces uniformly random 64-bit values 20 | // 21 | // The return value is usable as `bound` for smaller batches of size k. 22 | template 23 | inline uint64_t partial_shuffle_64b(RandomIt storage, uint64_t n, uint64_t k, 24 | uint64_t bound, URBG &g) { 25 | static_assert(std::is_same::value, "result_type must be uint64_t"); 26 | __uint128_t x; 27 | uint64_t r = g(); 28 | uint64_t indexes[7]; // We know that k <= 7 29 | 30 | for (uint64_t i = 0; i < k; i++) { 31 | x = (__uint128_t)(n - i) * (__uint128_t)r; 32 | r = (uint64_t)x; 33 | indexes[i] = (uint64_t)(x >> 64); 34 | } 35 | 36 | if (r < bound) { 37 | bound = n; 38 | for (uint64_t i = 1; i < k; i++) { 39 | bound *= n - i; 40 | } 41 | uint64_t t = -bound % bound; 42 | 43 | while (r < t) { 44 | r = g(); 45 | for (uint64_t i = 0; i < k; i++) { 46 | x = (__uint128_t)(n - i) * (__uint128_t)r; 47 | r = (uint64_t)x; 48 | indexes[i] = (uint64_t)(x >> 64); 49 | } 50 | } 51 | } 52 | for (uint64_t i = 0; i < k; i++) { 53 | std::iter_swap(storage + n - i - 1, storage + indexes[i]); 54 | } 55 | 56 | return bound; 57 | } 58 | 59 | } // namespace batched_random 60 | 61 | #endif // PARTIAL_SHUFFLE_INL_H 62 | -------------------------------------------------------------------------------- /include/random_bounded.h: -------------------------------------------------------------------------------- 1 | /*** 2 | * This header contains function declarations for C array shuffling functions. 3 | * It can be called by C code. 4 | */ 5 | #ifndef BATCHED_RANDOM_H 6 | #define BATCHED_RANDOM_H 7 | #include 8 | 9 | // call this one before calling random_bounded and other shuffling functions. 
10 | void seed(uint64_t s); 11 | 12 | 13 | // shuffle the storage array, you need to provide your own random number 14 | // generator (rng) 15 | void shuffle(uint64_t *storage, uint64_t size, uint64_t (*rng)(void)); 16 | void shuffle_batch_2(uint64_t *storage, uint64_t size, uint64_t (*rng)(void)); 17 | void shuffle_batch_23456(uint64_t *storage, uint64_t size, 18 | uint64_t (*rng)(void)); 19 | void naive_shuffle_batch_2(uint64_t *storage, uint64_t size, uint64_t (*rng)(void)); 20 | 21 | // shuffle with lehmer rng 22 | void shuffle_lehmer(uint64_t *storage, uint64_t size); 23 | void shuffle_lehmer_2(uint64_t *storage, uint64_t size); 24 | void shuffle_lehmer_23456(uint64_t *storage, uint64_t size); 25 | void naive_shuffle_lehmer_2(uint64_t *storage, uint64_t size); 26 | 27 | // shuffle with pcg64 rng 28 | void shuffle_pcg(uint64_t *storage, uint64_t size); 29 | void shuffle_pcg_2(uint64_t *storage, uint64_t size); 30 | void shuffle_pcg_23456(uint64_t *storage, uint64_t size); 31 | void naive_shuffle_pcg_2(uint64_t *storage, uint64_t size); 32 | 33 | 34 | // shuffle with chacha rng 35 | void shuffle_chacha(uint64_t *storage, uint64_t size); 36 | void shuffle_chacha_2(uint64_t *storage, uint64_t size); 37 | void shuffle_chacha_23456(uint64_t *storage, uint64_t size); 38 | void naive_shuffle_chacha_2(uint64_t *storage, uint64_t size); 39 | 40 | 41 | // returns a random number in the range [0, range) 42 | uint64_t random_bounded_lehmer(uint64_t range); 43 | 44 | #endif // BATCHED_RANDOM_H 45 | -------------------------------------------------------------------------------- /include/template_shuffle.h: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * This header contains C++ templates that shuffle the elements in the range [first, 4 | * last) using the random number generator g. They are meant to emulate 5 | * the standard std::shuffle function and can often act as drop-in replacement. 
6 | */ 7 | #ifndef TEMPLATE_SHUFFLE_H 8 | #define TEMPLATE_SHUFFLE_H 9 | 10 | #include "partial-shuffle-inl.h" 11 | 12 | // This code is meant to look like the C++ standard library. 13 | namespace batched_random { 14 | 15 | // This is a template function that shuffles the elements in the range [first, 16 | // last). 17 | // 18 | // It is similar to std::shuffle, but it uses a different algorithm. 19 | template 20 | extern void shuffle_2(RandomIt first, RandomIt last, URBG &&g) { 21 | uint64_t i = std::distance(first, last); 22 | for (; i > 1 << 30; i--) { 23 | partial_shuffle_64b(first, i, 1, i, g); 24 | } 25 | 26 | // Batches of 2 for sizes up to 2^30 elements 27 | uint64_t bound = (uint64_t)1 << 60; 28 | for (; i > 1; i -= 2) { 29 | bound = partial_shuffle_64b(first, i, 2, bound, g); 30 | } 31 | } 32 | 33 | // This is a template function that shuffles the elements in the range [first, 34 | // last) 35 | // 36 | // It is similar to std::shuffle, but it uses a different algorithm. 37 | // 38 | // Performance note: This function might be slow under GCC: see shuffle_2. 
39 | template 40 | extern void shuffle_23456(RandomIt first, RandomIt last, URBG &&g) { 41 | uint64_t i = std::distance(first, last); 42 | for (; i > 1 << 30; i--) { 43 | partial_shuffle_64b(first, i, 1, i, g); 44 | } 45 | 46 | // Batches of 2 for sizes up to 2^30 elements 47 | uint64_t bound = (uint64_t)1 << 60; 48 | for (; i > 1 << 19; i -= 2) { 49 | bound = partial_shuffle_64b(first, i, 2, bound, g); 50 | } 51 | 52 | // Batches of 3 for sizes up to 2^19 elements 53 | bound = (uint64_t)1 << 57; 54 | for (; i > 1 << 14; i -= 3) { 55 | bound = partial_shuffle_64b(first, i, 3, bound, g); 56 | } 57 | 58 | // Batches of 4 for sizes up to 2^14 elements 59 | bound = (uint64_t)1 << 56; 60 | for (; i > 1 << 11; i -= 4) { 61 | bound = partial_shuffle_64b(first, i, 4, bound, g); 62 | } 63 | 64 | // Batches of 5 for sizes up to 2^11 elements 65 | bound = (uint64_t)1 << 55; 66 | for (; i > 1 << 9; i -= 5) { 67 | bound = partial_shuffle_64b(first, i, 5, bound, g); 68 | } 69 | 70 | // Batches of 6 for sizes up to 2^9 elements 71 | bound = (uint64_t)1 << 54; 72 | for (; i > 6; i -= 6) { 73 | bound = partial_shuffle_64b(first, i, 6, bound, g); 74 | } 75 | 76 | if (i > 1) { 77 | partial_shuffle_64b(first, i, i - 1, 720, g); 78 | } 79 | } 80 | 81 | } // namespace batched_random 82 | 83 | #endif // TEMPLATE_SHUFFLE_H 84 | -------------------------------------------------------------------------------- /src/batch_shuffle_dice.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | uint64_t random_bounded(uint64_t range, uint64_t (*rng)(void)) { 4 | __uint128_t random64bit, multiresult; 5 | uint64_t leftover; 6 | uint64_t threshold; 7 | random64bit = rng(); 8 | multiresult = random64bit * range; 9 | leftover = (uint64_t)multiresult; 10 | if (leftover < range) { 11 | threshold = -range % range; 12 | while (leftover < threshold) { 13 | random64bit = rng(); 14 | multiresult = random64bit * range; 15 | leftover = (uint64_t)multiresult; 16 | 
} 17 | } 18 | return (uint64_t)(multiresult >> 64); // [0, range) 19 | } 20 | 21 | // This is a naive batched shuffle. We generate a single random number r in n*(n-1)*...*(n-(k-1)). 22 | // Then we get the random index as 23 | // r % n -> pos1 24 | // r = (r / n) 25 | // r % (n-1) -> pos2 26 | // r = (r / (n-1)) 27 | // ... 28 | // r % (n-k+1) -> posk (can omit the modulo here) 29 | 30 | inline void naive_partial_shuffle_64b(uint64_t *storage, uint64_t n, uint64_t k, uint64_t (*rng)(void)) { 31 | uint64_t pos1, pos2; 32 | uint64_t val1, val2; 33 | uint64_t bound = n; 34 | for (uint64_t i = 1; i < k; i++) { 35 | bound *= n - i; 36 | } 37 | // Next we generate a random integer in [0, bound) 38 | uint64_t r = random_bounded(bound, rng); 39 | for (uint64_t i = 0; i < k - 1; i++) { 40 | pos2 = r % (n - i); 41 | r /= (n - i); 42 | pos1 = n - i - 1; 43 | val1 = storage[pos1]; 44 | val2 = storage[pos2]; 45 | storage[pos1] = val2; 46 | storage[pos2] = val1; 47 | } 48 | // the last one does not need a modulo 49 | pos2 = r; 50 | pos1 = n - k; 51 | val1 = storage[pos1]; 52 | val2 = storage[pos2]; 53 | storage[pos1] = val2; 54 | storage[pos2] = val1; 55 | } 56 | 57 | 58 | // Performs k steps of a Fisher-Yates shuffle on n elements, in the array 59 | // `storage`. 60 | // 61 | // Preconditions: 62 | // n >= k >= 1 63 | // bound >= n*(n-1)*...*(n-(k-1)), which must not overflow 64 | // rng() produces uniformly random 64-bit values 65 | // 66 | // The return value is usable as `bound` for smaller batches of size k. 
67 | static inline uint64_t partial_shuffle_64b(uint64_t *storage, uint64_t n, uint64_t k, 68 | uint64_t bound, uint64_t (*rng)(void)) { 69 | __uint128_t x; 70 | uint64_t r = rng(); 71 | uint64_t pos1, pos2; 72 | uint64_t val1, val2; 73 | uint64_t indexes[7]; // We know that k <= 7 74 | 75 | for (uint64_t i = 0; i < k; i++) { 76 | x = (__uint128_t)(n - i) * (__uint128_t)r; 77 | r = (uint64_t)x; 78 | indexes[i] = (uint64_t)(x >> 64); 79 | } 80 | 81 | if (r < bound) { 82 | bound = n; 83 | for (uint64_t i = 1; i < k; i++) { 84 | bound *= n - i; 85 | } 86 | uint64_t t = -bound % bound; 87 | 88 | while (r < t) { 89 | r = rng(); 90 | for (uint64_t i = 0; i < k; i++) { 91 | x = (__uint128_t)(n - i) * (__uint128_t)r; 92 | r = (uint64_t)x; 93 | indexes[i] = (uint64_t)(x >> 64); 94 | } 95 | } 96 | } 97 | for (uint64_t i = 0; i < k; i++) { 98 | pos1 = n - i - 1; 99 | pos2 = indexes[i]; 100 | val1 = storage[pos1]; // should be in cache 101 | val2 = storage[pos2]; // might not be in cache 102 | storage[pos1] = val2; 103 | storage[pos2] = val1; // will be read later 104 | } 105 | return bound; 106 | } 107 | 108 | // Rolls a batch of fair dice with sizes n, n-1, ..., n-(k-1) 109 | // 110 | // Preconditions: 111 | // n >= k 112 | // bound >= n*(n-1)*...*(n-(k-1)), which must not overflow 113 | // rng() produces uniformly random 64-bit values 114 | // result has length at least k 115 | // 116 | // The dice rolls are put in the `result` array: 117 | // result[i] is an (n-i) sided die roll 118 | // 119 | // The return value is usable as `bound` for smaller batches of size k. 
120 | inline uint64_t partial_shuffle_dice_64b(uint64_t n, uint64_t k, uint64_t bound, 121 | uint64_t (*rng)(void), 122 | uint64_t *result) { 123 | __uint128_t x; 124 | uint64_t r = rng(); 125 | 126 | for (uint64_t i = 0; i < k; i++) { 127 | x = (__uint128_t)(n - i) * (__uint128_t)r; 128 | r = (uint64_t)x; 129 | result[i] = (uint64_t)(x >> 64); 130 | } 131 | 132 | if (r < bound) { 133 | bound = n; 134 | for (uint64_t i = 1; i < k; i++) { 135 | bound *= n - i; 136 | } 137 | uint64_t t = -bound % bound; 138 | while (r < t) { 139 | r = rng(); 140 | for (uint64_t i = 0; i < k; i++) { 141 | x = (__uint128_t)(n - i) * (__uint128_t)r; 142 | r = (uint64_t)x; 143 | result[i] = (uint64_t)(x >> 64); 144 | } 145 | } 146 | } 147 | 148 | return bound; 149 | } 150 | 151 | // Rolls fair dice with sizes n, n-1, ..., n - (4*k - 1) 152 | // in four interleaved batches. The first die in batch j 153 | // has size n-j, and each subsequent die is smaller by 4 154 | // 155 | // Preconditions: 156 | // n >= 4*k 157 | // bound >= n*(n-4)*...*(n - 4*(k-1)), which must not overflow 158 | // rng() produces uniformly random 64-bit values 159 | // result has length at least 4*k 160 | // 161 | // The dice rolls are put in the `result` array: 162 | // result[i] is an (n-i) sided die roll 163 | // 164 | // The return value is usable as `bound` with the same k and smaller n 165 | inline uint64_t partial_shuffle_dice_64b_interleaved_4x(uint64_t n, uint64_t k, 166 | uint64_t bound, 167 | uint64_t (*rng)(void), 168 | uint64_t *result) { 169 | __uint128_t x; 170 | uint64_t r[4]; 171 | 172 | for (int j = 0; j < 4; j++) { 173 | r[j] = rng(); 174 | } 175 | 176 | for (uint64_t i = 0; i < k; i++) { 177 | for (uint64_t j = 0; j < 4; j++) { 178 | x = (__uint128_t)(n - 4 * i - j) * (__uint128_t)r[j]; 179 | r[j] = (uint64_t)x; 180 | result[4 * i + j] = (uint64_t)(x >> 64); 181 | } 182 | } 183 | 184 | for (uint64_t j = 0; j < 4; j++) { 185 | if (r[j] < bound) { 186 | uint64_t m = n - j; 187 | bound = m; 188 | for 
(uint64_t i = 1; i < k; i++) { 189 | bound *= m - 4 * i; 190 | } 191 | uint64_t t = -bound % bound; 192 | while (r[j] < t) { 193 | r[j] = rng(); 194 | for (uint64_t i = 0; i < k; i++) { 195 | x = (__uint128_t)(m - 4 * i) * (__uint128_t)r[j]; 196 | r[j] = (uint64_t)x; 197 | result[4 * i + j] = (uint64_t)(x >> 64); 198 | } 199 | } 200 | } 201 | } 202 | 203 | return bound; 204 | } 205 | 206 | // Rolls a batch of fair dice with sizes 2, 3, ..., 17 207 | // 208 | // Preconditions: 209 | // rng() produces uniformly random 64-bit values 210 | // result has length at least 16 211 | // 212 | // The dice rolls are put in the `result` array: 213 | // result[i] is an (i+2) sided die roll 214 | inline void shuffle_17_dice_16b_interleaved(uint64_t (*rng)(void), 215 | uint16_t *result) { 216 | uint16_t r[4]; 217 | uint16_t m[4] = {(1 << 10) - 1, (1 << 8) - 1, (1 << 12) - 1, (1 << 12) - 1}; 218 | 219 | do { 220 | uint64_t bits = rng(); 221 | for (int i = 0; i < 4; i++) { 222 | r[i] = (uint16_t)(bits >> (16 * i)); 223 | } 224 | } while (((r[0] & m[0]) == 0) || ((r[1] & m[1]) == 0) || 225 | ((r[2] & m[2]) == 0) || ((r[3] & m[3]) == 0)); 226 | 227 | // Each column of n is a batch. 228 | uint16_t n[4][4] = { 229 | {2, 5, 7, 12}, {3, 6, 8, 13}, {4, 16, 9, 14}, {11, 17, 10, 15}}; 230 | uint32_t x[4]; 231 | 232 | for (int i = 0; i < 4; i++) { 233 | for (int j = 0; j < 4; j++) { 234 | x[j] = (uint32_t)n[i][j] * (uint32_t)r[j]; 235 | } 236 | // These are separate loops so the above multiplication 237 | // can take advantage of instruction-level parallelism. 
238 | for (int j = 0; j < 4; j++) { 239 | result[n[i][j] - 2] = (uint16_t)(x[j] >> 16); 240 | r[j] = (uint16_t)x[j]; 241 | } 242 | } 243 | } 244 | 245 | // Rolls a batch of fair dice with sizes 2, 3, ..., 17 246 | // 247 | // Preconditions: 248 | // rng() produces uniformly random 64-bit values 249 | // result has length at least 16 250 | // 251 | // The dice rolls are put in the `result` array: 252 | // result[i] is an (i+2) sided die roll 253 | inline void shuffle_17_dice_16b_linear(uint64_t (*rng)(void), 254 | uint16_t *result) { 255 | uint16_t r[4]; 256 | uint16_t m[4] = {(1 << 10) - 1, (1 << 8) - 1, (1 << 12) - 1, (1 << 12) - 1}; 257 | 258 | do { 259 | uint64_t bits = rng(); 260 | for (int i = 0; i < 4; i++) { 261 | r[i] = (uint16_t)(bits >> (16 * i)); 262 | } 263 | } while (((r[0] & m[0]) == 0) || ((r[1] & m[1]) == 0) || 264 | ((r[2] & m[2]) == 0) || ((r[3] & m[3]) == 0)); 265 | 266 | uint16_t p[16] = {r[0], r[0], r[0], r[1], r[1], r[2], r[2], r[2], 267 | r[2], r[0], r[3], r[3], r[3], r[3], r[1], r[1]}; 268 | uint16_t d[16] = {1, 2, 6, 1, 5, 1, 7, 56, 269 | 504, 24, 1, 12, 156, 2184, 30, 480}; 270 | 271 | for (int i = 0; i < 16; i++) { 272 | p[i] *= d[i]; 273 | } 274 | 275 | uint16_t n[16] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}; 276 | 277 | for (int i = 0; i < 16; i++) { 278 | uint32_t x = (uint32_t)p[i] * (uint32_t)n[i]; 279 | result[i] = (uint16_t)(x >> 16); 280 | } 281 | } 282 | -------------------------------------------------------------------------------- /src/chacha.c: -------------------------------------------------------------------------------- 1 | // copyright: https://github.com/nixberg/chacha-rng-c (MIT License) 2 | // with some modifications by D. 
Lemire 3 | #include 4 | #include 5 | #include 6 | 7 | #include "chacha.h" 8 | 9 | static void chacha_init(ChaCha *rng, size_t rounds, const uint32_t seed[8], uint64_t stream) { 10 | rng->state[ 0] = 0x61707865; 11 | rng->state[ 1] = 0x3320646e; 12 | rng->state[ 2] = 0x79622d32; 13 | rng->state[ 3] = 0x6b206574; 14 | 15 | rng->state[ 4] = seed[0]; 16 | rng->state[ 5] = seed[1]; 17 | rng->state[ 6] = seed[2]; 18 | rng->state[ 7] = seed[3]; 19 | rng->state[ 8] = seed[4]; 20 | rng->state[ 9] = seed[5]; 21 | rng->state[10] = seed[6]; 22 | rng->state[11] = seed[7]; 23 | 24 | rng->state[12] = 0; 25 | rng->state[13] = 0; 26 | rng->state[14] = (uint32_t)stream; 27 | rng->state[15] = (uint32_t)(stream >> 32); 28 | 29 | rng->rounds = rounds; 30 | 31 | rng->word_index = 16; 32 | } 33 | 34 | void chacha8_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream) { 35 | chacha_init(rng, 8, seed, stream); 36 | } 37 | 38 | void chacha8_zero(ChaCha *rng, uint64_t stream) { 39 | uint32_t seed[8] = { 0 }; 40 | chacha_init(rng, 8, seed, stream); 41 | } 42 | 43 | void chacha12_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream) { 44 | chacha_init(rng, 12, seed, stream); 45 | } 46 | 47 | void chacha12_zero(ChaCha *rng, uint64_t stream) { 48 | uint32_t seed[8] = { 0 }; 49 | chacha_init(rng, 12, seed, stream); 50 | } 51 | 52 | void chacha20_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream) { 53 | chacha_init(rng, 20, seed, stream); 54 | } 55 | 56 | void chacha20_zero(ChaCha *rng, uint64_t stream) { 57 | uint32_t seed[8] = { 0 }; 58 | chacha_init(rng, 20, seed, stream); 59 | } 60 | 61 | uint8_t chacha_u8(ChaCha *rng) { 62 | return (uint8_t)chacha_u32(rng); 63 | } 64 | 65 | void chacha_fill_u8(ChaCha *rng, uint8_t *array, size_t count) { 66 | size_t tail_count = count % 4; 67 | 68 | for (size_t i = 0; i < (count - tail_count); i += 4) { 69 | uint32_t word = chacha_u32(rng); 70 | array[i + 0] = (uint8_t)word; 71 | array[i + 1] = (uint8_t)(word >> 8); 72 | array[i + 2] = 
(uint8_t)(word >> 16); 73 | array[i + 3] = (uint8_t)(word >> 24); 74 | } 75 | 76 | if (tail_count > 0) { 77 | uint32_t word = chacha_u32(rng); 78 | for (size_t i = tail_count; i > 0; i--) { 79 | array[count - i] = (uint8_t)word; 80 | word >>= 8; 81 | } 82 | } 83 | } 84 | 85 | uint16_t chacha_u16(ChaCha *rng) { 86 | return (uint16_t)chacha_u32(rng); 87 | } 88 | 89 | void chacha_fill_u16(ChaCha *rng, uint16_t *array, size_t count) { 90 | size_t tail_count = count % 2; 91 | 92 | for (size_t i = 0; i < (count - tail_count); i += 2) { 93 | uint32_t word = chacha_u32(rng); 94 | array[i + 0] = (uint16_t)word; 95 | array[i + 1] = (uint16_t)word >> 16; 96 | } 97 | 98 | if (tail_count > 0) { 99 | uint32_t word = chacha_u32(rng); 100 | array[count - 1] = (uint16_t)word; 101 | } 102 | } 103 | 104 | static void double_round(uint32_t state[16]); 105 | 106 | static inline void increment_counter(ChaCha *rng) { 107 | rng->state[12]++; 108 | if (rng->state[12] == 0) { 109 | rng->state[13]++; 110 | if (rng->state[13] == 0) { 111 | exit(EXIT_FAILURE); 112 | } 113 | } 114 | } 115 | 116 | uint32_t chacha_u32(ChaCha *rng) { 117 | assert(rng->word_index <= 16); 118 | 119 | if (rng->word_index == 16) { 120 | for (size_t i = 0; i < 16; i++) { 121 | rng->working_state[i] = rng->state[i]; 122 | } 123 | 124 | for (size_t i = 0; i < rng->rounds; i += 2) { 125 | double_round(rng->working_state); 126 | } 127 | 128 | for (size_t i = 0; i < 16; i++) { 129 | rng->working_state[i] += rng->state[i]; 130 | } 131 | 132 | increment_counter(rng); 133 | rng->word_index = 0; 134 | } 135 | 136 | uint32_t result = rng->working_state[rng->word_index]; 137 | 138 | rng->word_index++; 139 | 140 | return result; 141 | } 142 | 143 | void chacha_fill_u32(ChaCha *rng, uint32_t *array, size_t count) { 144 | for (size_t i = 0; i < count; i++) { 145 | array[i] = chacha_u32(rng); 146 | } 147 | } 148 | 149 | uint64_t chacha_u64(ChaCha *rng) { 150 | uint64_t lo = chacha_u32(rng); 151 | uint64_t hi = chacha_u32(rng); 152 | 
return (hi << 32) | lo; 153 | } 154 | 155 | uint64_t chacha_u64_global() { 156 | return chacha_u64(&chacha_rng); 157 | } 158 | 159 | void chacha_fill_u64(ChaCha *rng, uint64_t *array, size_t count) { 160 | for (size_t i = 0; i < count; i++) { 161 | array[i] = chacha_u64(rng); 162 | } 163 | } 164 | 165 | float chacha_f32(ChaCha *rng) { 166 | return (float)(chacha_u32(rng) >> 8) * 0x1p-24f; 167 | } 168 | 169 | void chacha_fill_f32(ChaCha *rng, float *array, size_t count) { 170 | for (size_t i = 0; i < count; i++) { 171 | array[i] = chacha_f32(rng); 172 | } 173 | } 174 | 175 | double chacha_f64(ChaCha *rng) { 176 | return (float)(chacha_u64(rng) >> 11) * 0x1p-53; 177 | } 178 | 179 | void chacha_fill_f64(ChaCha *rng, double *array, size_t count) { 180 | for (size_t i = 0; i < count; i++) { 181 | array[i] = chacha_f64(rng); 182 | } 183 | } 184 | 185 | static inline uint32_t rotated_left(uint32_t value, uint32_t count) { 186 | return (value << count) | (value >> (32 - count)); 187 | } 188 | 189 | #define QUARTER_ROUND(a, b, c, d) \ 190 | state[a] += state[b]; state[d] = rotated_left(state[d] ^ state[a], 16); \ 191 | state[c] += state[d]; state[b] = rotated_left(state[b] ^ state[c], 12); \ 192 | state[a] += state[b]; state[d] = rotated_left(state[d] ^ state[a], 8); \ 193 | state[c] += state[d]; state[b] = rotated_left(state[b] ^ state[c], 7); 194 | 195 | static inline void double_round(uint32_t state[16]) { 196 | QUARTER_ROUND(0, 4, 8, 12) 197 | QUARTER_ROUND(1, 5, 9, 13) 198 | QUARTER_ROUND(2, 6, 10, 14) 199 | QUARTER_ROUND(3, 7, 11, 15) 200 | 201 | QUARTER_ROUND(0, 5, 10, 15) 202 | QUARTER_ROUND(1, 6, 11, 12) 203 | QUARTER_ROUND(2, 7, 8, 13) 204 | QUARTER_ROUND(3, 4, 9, 14) 205 | } 206 | -------------------------------------------------------------------------------- /src/chacha.h: -------------------------------------------------------------------------------- 1 | // copyright: https://github.com/nixberg/chacha-rng-c (MIT License) 2 | // with some modifications by D. 
Lemire 3 | #ifndef chacha_h 4 | #define chacha_h 5 | 6 | #include 7 | 8 | typedef struct { 9 | uint32_t state[16]; 10 | uint32_t working_state[16]; 11 | size_t rounds; 12 | size_t word_index; 13 | } ChaCha; 14 | ChaCha chacha_rng; 15 | 16 | void chacha8_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream); 17 | 18 | void chacha8_zero(ChaCha *rng, uint64_t stream); 19 | 20 | void chacha12_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream); 21 | 22 | void chacha12_zero(ChaCha *rng, uint64_t stream); 23 | 24 | void chacha20_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream); 25 | 26 | void chacha20_zero(ChaCha *rng, uint64_t stream); 27 | 28 | uint8_t chacha_u8(ChaCha *rng); 29 | 30 | uint16_t chacha_u16(ChaCha *rng); 31 | 32 | uint32_t chacha_u32(ChaCha *rng); 33 | 34 | uint64_t chacha_u64(ChaCha *rng); 35 | 36 | float chacha_f32(ChaCha *rng); 37 | 38 | double chacha_f64(ChaCha *rng); 39 | 40 | void chacha_fill_u8(ChaCha *rng, uint8_t *array, size_t count); 41 | 42 | void chacha_fill_u16(ChaCha *rng, uint16_t *array, size_t count); 43 | 44 | void chacha_fill_u32(ChaCha *rng, uint32_t *array, size_t count); 45 | 46 | void chacha_fill_u64(ChaCha *rng, uint64_t *array, size_t count); 47 | 48 | void chacha_fill_f32(ChaCha *rng, float *array, size_t count); 49 | 50 | void chacha_fill_f64(ChaCha *rng, double *array, size_t count); 51 | 52 | #endif /* chacha_h */ 53 | -------------------------------------------------------------------------------- /src/lehmer64.h: -------------------------------------------------------------------------------- 1 | #ifndef LEHMER64_H 2 | #define LEHMER64_H 3 | #include 4 | 5 | #include "splitmix64.h" 6 | 7 | __uint128_t g_lehmer64_state = UINT64_C(0x853c49e6748fea9b); 8 | 9 | /** 10 | * D. H. Lehmer, Mathematical methods in large-scale computing units. 11 | * Proceedings of a Second Symposium on Large Scale Digital Calculating 12 | * Machinery; 13 | * Annals of the Computation Laboratory, Harvard Univ. 26 (1951), pp. 
141-146. 14 | */ 15 | 16 | static inline void lehmer64_seed(uint64_t seed) { 17 | g_lehmer64_state = (((__uint128_t)splitmix64_stateless(seed)) << 64) + 18 | splitmix64_stateless(seed + 1); 19 | } 20 | 21 | static inline uint64_t lehmer64() { 22 | g_lehmer64_state *= UINT64_C(0xda942042e4dd58b5); 23 | return (uint64_t)(g_lehmer64_state >> 64); 24 | } 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /src/pcg64.h: -------------------------------------------------------------------------------- 1 | #ifndef PCG64_H 2 | #define PCG64_H 3 | 4 | /* Modified by D. Lemire based on original code by M. O'Neill, August 2017 */ 5 | #include "splitmix64.h" // we are going to leverage splitmix64 to generate the seed 6 | #include 7 | 8 | typedef __uint128_t pcg128_t; 9 | #define PCG_128BIT_CONSTANT(high, low) ((((pcg128_t)high) << 64) + low) 10 | #define PCG_DEFAULT_MULTIPLIER_128 \ 11 | PCG_128BIT_CONSTANT(2549297995355413924ULL, 4865540595714422341ULL) 12 | #define PCG_DEFAULT_INCREMENT_128 \ 13 | PCG_128BIT_CONSTANT(6364136223846793005ULL, 1442695040888963407ULL) 14 | 15 | struct pcg_state_setseq_128 { 16 | pcg128_t state; 17 | pcg128_t inc; 18 | }; 19 | 20 | typedef struct pcg_state_setseq_128 pcg64_random_t; 21 | 22 | inline void pcg_setseq_128_step_r(struct pcg_state_setseq_128 *rng) { 23 | rng->state = rng->state * PCG_DEFAULT_MULTIPLIER_128 + rng->inc; 24 | } 25 | 26 | inline void pcg_setseq_128_srandom_r(struct pcg_state_setseq_128 *rng, 27 | pcg128_t initstate, pcg128_t initseq) { 28 | rng->state = 0U; 29 | rng->inc = (initseq << 1u) | 1u; 30 | pcg_setseq_128_step_r(rng); 31 | rng->state += initstate; 32 | pcg_setseq_128_step_r(rng); 33 | } 34 | 35 | // verbatim from O'Neill's except that we skip her assembly: 36 | inline uint64_t pcg_rotr_64(uint64_t value, unsigned int rot) { 37 | return (value >> rot) | (value << ((-rot) & 63)); 38 | } 39 | 40 | inline uint64_t pcg_output_xsl_rr_128_64(pcg128_t state) { 41 | return 
pcg_rotr_64(((uint64_t)(state >> 64u)) ^ (uint64_t)state, 42 | (unsigned int)(state >> 122u)); 43 | } 44 | 45 | inline uint64_t 46 | pcg_setseq_128_xsl_rr_64_random_r(struct pcg_state_setseq_128 *rng) { 47 | pcg_setseq_128_step_r(rng); 48 | return pcg_output_xsl_rr_128_64(rng->state); 49 | } 50 | 51 | // use use a global state: 52 | pcg64_random_t pcg64_global; // global state 53 | 54 | // call this once before calling pcg64_random_r 55 | inline void pcg64_seed(uint64_t seed) { 56 | pcg128_t initstate = 57 | PCG_128BIT_CONSTANT(splitmix64_stateless_offset(seed, 0), 58 | splitmix64_stateless_offset(seed, 1)); 59 | // we pick a sequence at random 60 | pcg128_t initseq = PCG_128BIT_CONSTANT(splitmix64_stateless_offset(seed, 2), 61 | splitmix64_stateless_offset(seed, 3)); 62 | initseq |= 1; // should not be necessary, but let us be careful. 63 | 64 | pcg_setseq_128_srandom_r(&pcg64_global, initstate, initseq); 65 | } 66 | 67 | #define pcg64_random_r pcg_setseq_128_xsl_rr_64_random_r 68 | 69 | static inline uint64_t pcg64(void) { return pcg64_random_r(&pcg64_global); } 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /src/random_bounded.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | #include "chacha.c" 6 | #include "batch_shuffle_dice.c" 7 | #include "lehmer64.h" 8 | #include "pcg64.h" 9 | 10 | void seed(uint64_t s) { 11 | lehmer64_seed(s); 12 | pcg64_seed(s); 13 | chacha8_zero(&chacha_rng, s); 14 | } 15 | 16 | 17 | 18 | // Fisher-Yates shuffle, rolling one die at a time 19 | void shuffle(uint64_t *storage, uint64_t size, uint64_t (*rng)(void)) { 20 | uint64_t i; 21 | for (i = size; i > 1; i--) { 22 | uint64_t nextpos = random_bounded(i, rng); 23 | uint64_t tmp = storage[i - 1]; // likely in cache 24 | uint64_t val = storage[nextpos]; // could be costly 25 | storage[i - 1] = val; 26 | storage[nextpos] = tmp; // you might have to read this 
store later 27 | } 28 | } 29 | 30 | // Fisher-Yates shuffle, rolling up to two dice at a time 31 | void shuffle_batch_2(uint64_t *storage, uint64_t size, uint64_t (*rng)(void)) { 32 | uint64_t i = size; 33 | for (; i > 1 << 30; i--) { 34 | partial_shuffle_64b(storage, i, 1, i, rng); 35 | } 36 | 37 | // Batches of 2 for sizes up to 2^30 elements 38 | uint64_t bound = (uint64_t)1 << 60; 39 | for (; i > 1; i -= 2) { 40 | bound = partial_shuffle_64b(storage, i, 2, bound, rng); 41 | } 42 | } 43 | 44 | // Fisher-Yates shuffle, rolling up to six dice at a time 45 | void shuffle_batch_23456(uint64_t *storage, uint64_t size, 46 | uint64_t (*rng)(void)) { 47 | uint64_t i = size; 48 | for (; i > 1 << 30; i--) { 49 | partial_shuffle_64b(storage, i, 1, i, rng); 50 | } 51 | 52 | // Batches of 2 for sizes up to 2^30 elements 53 | uint64_t bound = (uint64_t)1 << 60; 54 | for (; i > 1 << 19; i -= 2) { 55 | bound = partial_shuffle_64b(storage, i, 2, bound, rng); 56 | } 57 | 58 | // Batches of 3 for sizes up to 2^19 elements 59 | bound = (uint64_t)1 << 57; 60 | for (; i > 1 << 14; i -= 3) { 61 | bound = partial_shuffle_64b(storage, i, 3, bound, rng); 62 | } 63 | 64 | // Batches of 4 for sizes up to 2^14 elements 65 | bound = (uint64_t)1 << 56; 66 | for (; i > 1 << 11; i -= 4) { 67 | bound = partial_shuffle_64b(storage, i, 4, bound, rng); 68 | } 69 | 70 | // Batches of 5 for sizes up to 2^11 elements 71 | bound = (uint64_t)1 << 55; 72 | for (; i > 1 << 9; i -= 5) { 73 | bound = partial_shuffle_64b(storage, i, 5, bound, rng); 74 | } 75 | 76 | // Batches of 6 for sizes up to 2^9 elements 77 | bound = (uint64_t)1 << 54; 78 | for (; i > 6; i -= 6) { 79 | bound = partial_shuffle_64b(storage, i, 6, bound, rng); 80 | } 81 | 82 | if (i > 1) { 83 | partial_shuffle_64b(storage, i, i - 1, 720, rng); 84 | } 85 | } 86 | 87 | 88 | // Fisher-Yates shuffle, rolling up to two dice at a time 89 | void naive_shuffle_batch_2(uint64_t *storage, uint64_t size, uint64_t (*rng)(void)) { 90 | uint64_t i = 
size; 91 | for (; i > (UINT64_C(1) << 32); i--) { 92 | naive_partial_shuffle_64b(storage, i, 1, rng); 93 | } 94 | for (; i > 1; i -= 2) { 95 | naive_partial_shuffle_64b(storage, i, 2, rng); 96 | } 97 | } 98 | 99 | 100 | // Shuffle with Lehmer RNG 101 | 102 | void shuffle_lehmer(uint64_t *storage, uint64_t size) { 103 | shuffle(storage, size, lehmer64); 104 | } 105 | 106 | void shuffle_lehmer_2(uint64_t *storage, uint64_t size) { 107 | shuffle_batch_2(storage, size, lehmer64); 108 | } 109 | 110 | void shuffle_lehmer_23456(uint64_t *storage, uint64_t size) { 111 | shuffle_batch_23456(storage, size, lehmer64); 112 | } 113 | 114 | void naive_shuffle_lehmer_2(uint64_t *storage, uint64_t size) { 115 | naive_shuffle_batch_2(storage, size, lehmer64); 116 | } 117 | 118 | // Shuffle with PCG RNG 119 | 120 | void shuffle_pcg(uint64_t *storage, uint64_t size) { 121 | shuffle(storage, size, pcg64); 122 | } 123 | 124 | void shuffle_pcg_2(uint64_t *storage, uint64_t size) { 125 | shuffle_batch_2(storage, size, pcg64); 126 | } 127 | 128 | void shuffle_pcg_23456(uint64_t *storage, uint64_t size) { 129 | shuffle_batch_23456(storage, size, pcg64); 130 | } 131 | 132 | void naive_shuffle_pcg_2(uint64_t *storage, uint64_t size) { 133 | naive_shuffle_batch_2(storage, size, pcg64); 134 | } 135 | 136 | // Shuffle with ChaCha RNG 137 | void shuffle_chacha(uint64_t *storage, uint64_t size) { 138 | shuffle(storage, size, chacha_u64_global); 139 | } 140 | 141 | void shuffle_chacha_2(uint64_t *storage, uint64_t size) { 142 | shuffle_batch_2(storage, size, chacha_u64_global); 143 | } 144 | 145 | void shuffle_chacha_23456(uint64_t *storage, uint64_t size) { 146 | shuffle_batch_23456(storage, size, chacha_u64_global); 147 | } 148 | 149 | void naive_shuffle_chacha_2(uint64_t *storage, uint64_t size) { 150 | naive_shuffle_batch_2(storage, size, chacha_u64_global); 151 | } 152 | // Random bounded Lehmer 153 | 154 | uint64_t random_bounded_lehmer(uint64_t range) { 155 | return random_bounded(range, 
lehmer64); 156 | } 157 | -------------------------------------------------------------------------------- /src/splitmix64.h: -------------------------------------------------------------------------------- 1 | #ifndef SPLITMIX64_H 2 | #define SPLITMIX64_H 3 | 4 | #include 5 | 6 | inline uint64_t splitmix64_stateless(uint64_t index) { 7 | uint64_t z = (index * UINT64_C(0x9E3779B97F4A7C15)); 8 | z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9); 9 | z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB); 10 | return z ^ (z >> 31); 11 | } 12 | 13 | // floor( ( (1+sqrt(5))/2 ) * 2**64 MOD 2**64) 14 | #define GOLDEN_GAMMA UINT64_C(0x9E3779B97F4A7C15) 15 | 16 | inline uint64_t splitmix64_r(uint64_t *seed) { 17 | uint64_t z = (*seed += GOLDEN_GAMMA); 18 | // David Stafford's Mix13 for MurmurHash3's 64-bit finalizer 19 | z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9); 20 | z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB); 21 | return z ^ (z >> 31); 22 | } 23 | 24 | // returns the value of splitmix64 "offset" steps from seed 25 | inline uint64_t splitmix64_stateless_offset(uint64_t seed, uint64_t offset) { 26 | seed += offset * GOLDEN_GAMMA; 27 | return splitmix64_r(&seed); 28 | } 29 | 30 | #endif // SPLITMIX64_H 31 | -------------------------------------------------------------------------------- /tests/basic.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | extern "C" { 9 | #include "random_bounded.h" 10 | } 11 | #include "template_shuffle.h" 12 | 13 | /*** 14 | * How do we test a shuffle function? 15 | * There are many tests that one could apply to a shuffle function. 
16 | */ 17 | using shuffle_function = void (*)(uint64_t *, uint64_t); 18 | 19 | template 20 | bool everyone_can_move_everywhere(const function_type &function) { 21 | constexpr size_t size = 512; 22 | uint64_t input[size]; 23 | std::bitset bits[size]; 24 | for (size_t trial = 0; trial < size * size; trial++) { 25 | // We always start from the same input. 26 | std::iota(input, input + size, 0); 27 | // We shuffle: 28 | function(input, size); 29 | // Mark that at position i we found value input[i]. 30 | for (size_t i = 0; i < size; i++) { 31 | bits[i][input[i]] = 1; 32 | } 33 | } 34 | for (const std::bitset &b : bits) { 35 | if (!b.all()) { 36 | return false; 37 | } 38 | } 39 | return true; 40 | } 41 | 42 | template 43 | bool uniformity_test(const function_type &function) { 44 | constexpr size_t size = 512; 45 | uint64_t input[size]; 46 | std::array bits[size]{}; 47 | size_t volume = size * size; 48 | for (size_t trial = 0; trial < volume; trial++) { 49 | // We always start from the same input. 50 | std::iota(input, input + size, 0); 51 | // We shuffle: 52 | function(input, size); 53 | // Mark that at position i we found value input[i]. 
54 | for (size_t i = 0; i < size; i++) { 55 | bits[i][input[i]] += 1; 56 | } 57 | } 58 | size_t overall_min {std::numeric_limits::max() }; 59 | size_t overall_max = 0; 60 | size_t average = 0; 61 | 62 | for (const std::array &b : bits) { 63 | average += std::accumulate(b.begin(), b.end(), 0); 64 | size_t max_value = *std::max_element(b.begin(), b.end()); 65 | size_t min_value = *std::min_element(b.begin(), b.end()); 66 | if (max_value > overall_max) { 67 | overall_max = max_value; 68 | } 69 | if (min_value < overall_min) { 70 | overall_min = min_value; 71 | } 72 | } 73 | size_t gap = overall_max - overall_min; 74 | double mean = (double)average / volume; 75 | double relative_gap = (double)gap / mean; 76 | 77 | printf("relative gap: %f, ", relative_gap); 78 | 79 | return relative_gap < 0.6; 80 | } 81 | 82 | template 83 | bool any_possible_pair_at_the_start(const function_type &function) { 84 | constexpr size_t size = 64; 85 | uint64_t input[size]; 86 | std::bitset bits; 87 | for (size_t trial = 0; trial < size * size * size; trial++) { 88 | // We always start from the same input. 89 | std::iota(input, input + size, 0); 90 | // We shuffle: 91 | function(input, size); 92 | bits[input[0] * size + input[1]] = 1; 93 | } 94 | for (size_t i = 0; i < size; i++) { 95 | for (size_t j = 0; j < size; j++) { 96 | if (i == j) { 97 | if (bits[i * size + j]) { 98 | return false; 99 | } 100 | } else { 101 | if (!bits[i * size + j]) { 102 | return false; 103 | } 104 | } 105 | } 106 | } 107 | return true; 108 | } 109 | 110 | template 111 | bool any_possible_pair_at_the_end(const function_type &function) { 112 | constexpr size_t size = 64; 113 | uint64_t input[size]; 114 | std::bitset bits; 115 | for (size_t trial = 0; trial < size * size * size; trial++) { 116 | // We always start from the same input. 
117 | std::iota(input, input + size, 0); 118 | // We shuffle: 119 | function(input, size); 120 | bits[input[0] * size + input[1]] = 1; 121 | } 122 | for (size_t i = 0; i < size; i++) { 123 | for (size_t j = 0; j < size; j++) { 124 | if (i == j) { 125 | if (bits[i * size + j]) { 126 | return false; 127 | } 128 | } else { 129 | if (!bits[i * size + j]) { 130 | return false; 131 | } 132 | } 133 | } 134 | } 135 | return true; 136 | } 137 | 138 | struct named_function { 139 | std::string name; 140 | shuffle_function function; 141 | }; 142 | 143 | named_function func[] = { 144 | {"shuffle_lehmer", shuffle_lehmer}, 145 | {"shuffle_lehmer_2", shuffle_lehmer_2}, 146 | {"shuffle_lehmer_23456", shuffle_lehmer_23456}, 147 | {"shuffle_pcg", shuffle_pcg}, 148 | {"shuffle_pcg_2", shuffle_pcg_2}, 149 | {"shuffle_pcg_23456", shuffle_pcg_23456} 150 | }; 151 | 152 | bool test_everyone_can_move_everywhere() { 153 | std::cout << __FUNCTION__ << std::endl; 154 | for (const auto &f : func) { 155 | std::cout << std::setw(40) << f.name << ": "; 156 | std::cout.flush(); 157 | if (!everyone_can_move_everywhere(f.function)) { 158 | std::cerr << "!!!Test failed for " << f.name << std::endl; 159 | return false; 160 | } else { 161 | std::cout << "passed" << std::endl; 162 | } 163 | } 164 | return true; 165 | } 166 | 167 | bool test_uniformity_test() { 168 | std::cout << __FUNCTION__ << std::endl; 169 | for (const auto &f : func) { 170 | std::cout << std::setw(40) << f.name << ": "; 171 | std::cout.flush(); 172 | if (!uniformity_test(f.function)) { 173 | std::cerr << "!!!Test failed for " << f.name << std::endl; 174 | return false; 175 | } else { 176 | std::cout << "passed" << std::endl; 177 | } 178 | } 179 | return true; 180 | } 181 | 182 | bool test_any_possible_pair_at_the_start() { 183 | std::cout << __FUNCTION__ << std::endl; 184 | for (const auto &f : func) { 185 | std::cout << std::setw(40) << f.name << ": "; 186 | std::cout.flush(); 187 | if (!any_possible_pair_at_the_start(f.function)) { 
188 | std::cerr << "!!!Test failed for " << f.name << std::endl; 189 | return false; 190 | } else { 191 | std::cout << "passed" << std::endl; 192 | } 193 | } 194 | return true; 195 | } 196 | 197 | bool test_any_possible_pair_at_the_end() { 198 | std::cout << __FUNCTION__ << std::endl; 199 | for (const auto &f : func) { 200 | std::cout << std::setw(40) << f.name << ": "; 201 | std::cout.flush(); 202 | if (!any_possible_pair_at_the_end(f.function)) { 203 | std::cerr << "!!!Test failed for " << f.name << std::endl; 204 | return false; 205 | } else { 206 | std::cout << "passed" << std::endl; 207 | } 208 | } 209 | return true; 210 | } 211 | 212 | int main() { 213 | seed(1234); 214 | bool success = true; 215 | success &= test_uniformity_test(); 216 | success &= test_any_possible_pair_at_the_end(); 217 | success &= test_any_possible_pair_at_the_start(); 218 | success &= test_everyone_can_move_everywhere(); 219 | if (success) { 220 | std::cout << "All tests passed" << std::endl; 221 | } else { 222 | std::cerr << "Some tests failed" << std::endl; 223 | } 224 | return success ? EXIT_SUCCESS : EXIT_FAILURE; 225 | } 226 | --------------------------------------------------------------------------------