├── LICENSE
├── Makefile
├── README.md
├── benchmarks
    ├── benchmark.cpp
    ├── generators.h
    ├── performancecounters
    │   ├── apple_arm_events.h
    │   ├── benchmarker.h
    │   ├── event_counter.h
    │   └── linux-perf-events.h
    └── stream.cpp
├── gnuplot
    ├── README.md
    ├── icelake.data
    ├── icelake.datalehmer.pdf
    ├── icelake.datapcg64.pdf
    ├── icelake.dataratio.pdf
    ├── icelakestream.data
    ├── m2.data
    ├── m2.datalehmer.pdf
    ├── m2.datapcg64.pdf
    ├── m2.dataratio.pdf
    ├── m2stream.data
    └── plot.gnuplot
├── include
    ├── partial-shuffle-inl.h
    ├── random_bounded.h
    └── template_shuffle.h
├── src
    ├── batch_shuffle_dice.c
    ├── chacha.c
    ├── chacha.h
    ├── lehmer64.h
    ├── pcg64.h
    ├── random_bounded.c
    └── splitmix64.h
└── tests
    └── basic.cpp


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2024 The batched_random authors
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software is furnished to do so,
 8 | subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | all:    benchmark basic stream
 2 | CXX=clang++
 3 | CC=clang
 4 | benchmark: benchmarks/benchmark.cpp random_bounded.o
 5 | 	$(CXX) $(CXXFLAGS) -std=c++17 -O3 -Wall  -Wextra  -o benchmark benchmarks/benchmark.cpp random_bounded.o  -Iinclude -Ibenchmarks 
 6 | stream: benchmarks/stream.cpp random_bounded.o
 7 | 	$(CXX) $(CXXFLAGS) -std=c++17 -O3 -Wall  -Wextra  -o stream benchmarks/stream.cpp random_bounded.o  -Iinclude -Ibenchmarks 
 8 | basic : tests/basic.cpp random_bounded.o
 9 | 	$(CXX) $(CXXFLAGS) -std=c++17 -O3 -Wall  -Wextra  -o basic tests/basic.cpp random_bounded.o  -Iinclude
10 | random_bounded.o: src/batch_shuffle_dice.c src/random_bounded.c include/random_bounded.h src/lehmer64.h  src/splitmix64.h
11 | 	$(CC) $(CFLAGS) -std=c11 -O3 -Wall -Wextra -Wconversion -c src/random_bounded.c
12 | 
13 | clean:
14 | 	rm -f random_bounded.o benchmark basic stream


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Batched Random
 2 | 
 3 | We benchmark fast shuffling functions using batched random index generation.
 4 | This code is meant for research purposes: although we have good benchmarks and
 5 | tests, it is not meant to be production-ready.
 6 | 
 7 | ### Reference
 8 | 
 9 | * Nevin Brackett-Rozinsky, Daniel Lemire, [Batched Ranged Random Integer Generation](https://arxiv.org/abs/2408.06213), Software: Practice and Experience 55 (1), 2024.
10 | 
11 | ### Requirements
12 | 
13 | - Recent LLVM clang and clang++ compilers
14 | - Make
15 | 
16 | ### Running Benchmarks
17 | 
18 | 
19 | ```
20 | make
21 | ./benchmark
22 | ```
23 | 
24 | To get the C++ benchmarks, you can type `./benchmark --cpp`. They are disabled by default.
25 | 
26 | To run tests:
27 | ```
28 | ./basic
29 | ```
30 | 
31 | ## Code
32 | 
33 | See the `src` directory for the main code.
34 | 
35 | ## Other Compilers
36 | 
37 | We use LLVM/clang for benchmarking.
38 | 
39 | The code is portable and other compilers can be used. The performance
40 | of the C++ code might be sensitive to the C++ compiler used.
41 | Specifically, we find that `shuffle_23456` has relatively poor performance
42 | with GCC compared to LLVM/clang. GCC fails to optimize `shuffle_23456` properly.
43 | We recommend that GCC users prefer `shuffle_2`.
44 | 


--------------------------------------------------------------------------------
/benchmarks/benchmark.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "performancecounters/benchmarker.h"
  3 | #include <algorithm>
  4 | #include <charconv>
  5 | #include <filesystem>
  6 | #include <fstream>
  7 | #include <iostream>
  8 | #include <random>
  9 | #include <stdlib.h>
 10 | #include <vector>
 11 | extern "C" {
 12 | #include "random_bounded.h"
 13 | }
 14 | #include "generators.h"
 15 | #include "template_shuffle.h"
 16 | 
 17 | void precomp_shuffle(uint64_t *storage, uint64_t size,
 18 |                      const uint32_t *precomputed) {
 19 |   uint64_t tmp, val;
 20 |   uint32_t nextpos;
 21 |   for (size_t i = size - 1; i > 0; i--) {
 22 |     nextpos = precomputed[i];
 23 |     tmp = storage[i];   // likely in cache
 24 |     val = storage[nextpos]; // could be costly
 25 |     storage[i] = val;
 26 |     storage[nextpos] = tmp; // you might have to read this store later
 27 |   }
 28 | }
 29 | 
 30 | void pretty_print(size_t volume, size_t bytes, std::string name,
 31 |                   event_aggregate agg) {
 32 |   printf("%-45s : ", name.c_str());
 33 |   printf(" %5.2f Gi/s ", volume / agg.fastest_elapsed_ns());
 34 |   double best_speed = volume / agg.fastest_elapsed_ns();
 35 |   double avg_speed = volume / agg.elapsed_ns();
 36 |   double range = (best_speed - avg_speed) / avg_speed * 100.0;
 37 |   printf(" %5.2f GB/s best, %5.2f GB/s average, (%2.0f %%) ",
 38 |          bytes / agg.fastest_elapsed_ns(), bytes / agg.elapsed_ns(), range);
 39 |   if (collector.has_events()) {
 40 |     printf(" %5.2f GHz ", agg.fastest_cycles() / agg.fastest_elapsed_ns());
 41 |     printf(" %5.2f c/b ", agg.fastest_cycles() / bytes);
 42 |     printf(" %5.2f i/b ", agg.fastest_instructions() / bytes);
 43 |     printf(" %5.2f i/e ", agg.fastest_instructions() / volume);
 44 |     printf(" %5.2f i/c ", agg.fastest_instructions() / agg.fastest_cycles());
 45 |   }
 46 |   printf("\n");
 47 | }
 48 | 
 49 | void bench(size_t size, bool include_cpp) {
 50 |   constexpr size_t min_volume = 4096;
 51 |   if (size == 0) {
 52 |     return;
 53 |   }
 54 |   size_t volume = size;
 55 |   if (size < min_volume) {
 56 |     volume *= min_volume / size;
 57 |   }
 58 |   std::vector<uint64_t> input(volume);
 59 |   std::random_device rd;
 60 | 
 61 |   if (size > 0xFFFFFFFF) {
 62 |     std::cerr << "WARNING: Volume too large for precomputed shuffle."
 63 |               << std::endl;
 64 |   }
 65 |   std::vector<uint32_t> precomputed(volume);
 66 |   for (size_t i = 0; i < volume; i++) {
 67 |     input[i] = i;
 68 |     uint64_t bound = (i % size) + 1;
 69 |     precomputed[i] = random_bounded_lehmer(bound);
 70 |   }
 71 | 
 72 |   std::cout << "Size of precomputed values "
 73 |             << size * sizeof(uint32_t) / 1024 << " kB"
 74 |             << std::endl;
 75 |   std::cout << "Size of shuffle      : " << size << " words" << std::endl;
 76 |   std::cout << "Size of shuffle      : " << size * sizeof(uint64_t) / 1024 / 1024.
 77 |             << " MB" << std::endl;
 78 | 
 79 |   size_t min_repeat = 10;
 80 |   size_t min_time_ns = 100000000;
 81 |   size_t max_repeat = 100000;
 82 | 
 83 |   if (size < volume) {
 84 |     printf("inner repeat: %zu\n", volume / size);
 85 |   }
 86 | 
 87 |   if (include_cpp) {
 88 |     lehmer64 lehmerGenerator{rd()};
 89 |     std::mt19937_64 mtGenerator{rd()};
 90 | 
 91 |     // C++ Lehmer
 92 | 
 93 |     pretty_print(volume, volume * sizeof(uint64_t),
 94 |                  "C++ std::shuffle (lehmer)",
 95 |                  bench(
 96 |                      [&input, &lehmerGenerator, size]() {
 97 |                        for (auto t = input.begin(); t < input.end(); t += size) {
 98 |                          std::shuffle(t, t + size, lehmerGenerator);
 99 |                        }
100 |                      },
101 |                      min_repeat, min_time_ns, max_repeat));
102 | 
103 |     pretty_print(volume, volume * sizeof(uint64_t),
104 |                  "C++ shuffle 2 (lehmer)",
105 |                  bench(
106 |                      [&input, &lehmerGenerator, size]() {
107 |                        for (auto t = input.begin(); t < input.end(); t += size) {
108 |                          batched_random::shuffle_2(t, t + size, lehmerGenerator);
109 |                        }
110 |                      },
111 |                      min_repeat, min_time_ns, max_repeat));
112 | 
113 |     pretty_print(volume, volume * sizeof(uint64_t),
114 |                  "C++ shuffle 2-6 (lehmer)",
115 |                  bench(
116 |                      [&input, &lehmerGenerator, size]() {
117 |                        for (auto t = input.begin(); t < input.end(); t += size) {
118 |                          batched_random::shuffle_23456(t, t + size, lehmerGenerator);
119 |                        }
120 |                      },
121 |                      min_repeat, min_time_ns, max_repeat));
122 | 
123 |     // C++ Mersenne twister
124 | 
125 |     pretty_print(volume, volume * sizeof(uint64_t),
126 |                  "C++ std::shuffle (mersenne)",
127 |                  bench(
128 |                      [&input, &mtGenerator, size]() {
129 |                        for (auto t = input.begin(); t < input.end(); t += size) {
130 |                          std::shuffle(t, t + size, mtGenerator);
131 |                        }
132 |                      },
133 |                      min_repeat, min_time_ns, max_repeat));
134 | 
135 |     pretty_print(volume, volume * sizeof(uint64_t),
136 |                  "C++ shuffle 2 (mersenne)",
137 |                  bench(
138 |                      [&input, &mtGenerator, size]() {
139 |                        for (auto t = input.begin(); t < input.end(); t += size) {
140 |                          batched_random::shuffle_2(t, t + size, mtGenerator);
141 |                        }
142 |                      },
143 |                      min_repeat, min_time_ns, max_repeat));
144 | 
145 |     pretty_print(volume, volume * sizeof(uint64_t),
146 |                  "C++ shuffle 2-6 (mersenne)",
147 |                  bench(
148 |                      [&input, &mtGenerator, size]() {
149 |                        for (auto t = input.begin(); t < input.end(); t += size) {
150 |                          batched_random::shuffle_23456(t, t + size, mtGenerator);
151 |                        }
152 |                      },
153 |                      min_repeat, min_time_ns, max_repeat));
154 |   }
155 | 
156 |   // Lehmer
157 | 
158 |   pretty_print(volume, volume * sizeof(uint64_t),
159 |                "standard shuffle (lehmer)",
160 |                bench(
161 |                    [&input, size, volume]() {
162 |                      for (size_t t = 0; t < volume; t += size) {
163 |                        shuffle_lehmer(input.data() + t, size);
164 |                      }
165 |                    },
166 |                    min_repeat, min_time_ns, max_repeat));
167 | 
168 |   pretty_print(volume, volume * sizeof(uint64_t),
169 |                "batch shuffle 2 (lehmer)",
170 |                bench(
171 |                    [&input, size, volume]() {
172 |                      for (size_t t = 0; t < volume; t += size) {
173 |                        shuffle_lehmer_2(input.data() + t, size);
174 |                      }
175 |                    },
176 |                    min_repeat, min_time_ns, max_repeat));
177 | 
178 |   pretty_print(volume, volume * sizeof(uint64_t),
179 |                "batch shuffle 2-6 (lehmer)",
180 |                bench(
181 |                    [&input, size, volume]() {
182 |                      for (size_t t = 0; t < volume; t += size) {
183 |                        shuffle_lehmer_23456(input.data() + t, size);
184 |                      }
185 |                    },
186 |                    min_repeat, min_time_ns, max_repeat));
187 | 
188 |   pretty_print(volume, volume * sizeof(uint64_t),
189 |                "naive batch shuffle 2 (lehmer)",
190 |                bench(
191 |                    [&input, size, volume]() {
192 |                      for (size_t t = 0; t < volume; t += size) {
193 |                        naive_shuffle_lehmer_2(input.data() + t, size);
194 |                      }
195 |                    },
196 |                    min_repeat, min_time_ns, max_repeat));
197 | 
198 |   // PCG
199 | 
200 |   pretty_print(volume, volume * sizeof(uint64_t),
201 |                "standard shuffle (PCG)",
202 |                bench(
203 |                    [&input, size, volume]() {
204 |                      for (size_t t = 0; t < volume; t += size) {
205 |                        shuffle_pcg(input.data() + t, size);
206 |                      }
207 |                    },
208 |                    min_repeat, min_time_ns, max_repeat));
209 | 
210 |   pretty_print(volume, volume * sizeof(uint64_t),
211 |                "batch shuffle 2 (PCG)",
212 |                bench(
213 |                    [&input, size, volume]() {
214 |                      for (size_t t = 0; t < volume; t += size) {
215 |                        shuffle_pcg_2(input.data() + t, size);
216 |                      }
217 |                    },
218 |                    min_repeat, min_time_ns, max_repeat));
219 | 
220 |   pretty_print(volume, volume * sizeof(uint64_t),
221 |                "batch shuffle 2-6 (PCG)",
222 |                bench(
223 |                    [&input, size, volume]() {
224 |                      for (size_t t = 0; t < volume; t += size) {
225 |                        shuffle_pcg_23456(input.data() + t, size);
226 |                      }
227 |                    },
228 |                    min_repeat, min_time_ns, max_repeat));
229 | 
230 | 
231 | 
232 |   pretty_print(volume, volume * sizeof(uint64_t),
233 |                "naive batch shuffle 2 (PCG)",
234 |                bench(
235 |                    [&input, size, volume]() {
236 |                      for (size_t t = 0; t < volume; t += size) {
237 |                        naive_shuffle_pcg_2(input.data() + t, size);
238 |                      }
239 |                    },
240 |                    min_repeat, min_time_ns, max_repeat));
241 |   // chacha
242 | 
243 |   pretty_print(volume, volume * sizeof(uint64_t),
244 |                "standard shuffle (chacha)",
245 |                bench(
246 |                    [&input, size, volume]() {
247 |                      for (size_t t = 0; t < volume; t += size) {
248 |                        shuffle_chacha(input.data() + t, size);
249 |                      }
250 |                    },
251 |                    min_repeat, min_time_ns, max_repeat));
252 | 
253 |   pretty_print(volume, volume * sizeof(uint64_t),
254 |                "batch shuffle 2 (chacha)",
255 |                bench(
256 |                    [&input, size, volume]() {
257 |                      for (size_t t = 0; t < volume; t += size) {
258 |                        shuffle_chacha_2(input.data() + t, size);
259 |                      }
260 |                    },
261 |                    min_repeat, min_time_ns, max_repeat));
262 | 
263 |   pretty_print(volume, volume * sizeof(uint64_t),
264 |                "batch shuffle 2-6 (chacha)",
265 |                bench(
266 |                    [&input, size, volume]() {
267 |                      for (size_t t = 0; t < volume; t += size) {
268 |                        shuffle_chacha_23456(input.data() + t, size);
269 |                      }
270 |                    },
271 |                    min_repeat, min_time_ns, max_repeat));
272 | 
273 |   pretty_print(volume, volume * sizeof(uint64_t),
274 |                "naive batch shuffle 2 (chacha)",
275 |                bench(
276 |                    [&input, size, volume]() {
277 |                      for (size_t t = 0; t < volume; t += size) {
278 |                        naive_shuffle_chacha_2(input.data() + t, size);
279 |                      }
280 |                    },
281 |                    min_repeat, min_time_ns, max_repeat));
282 |   // Precomputed
283 | 
284 |   pretty_print(volume, volume * sizeof(uint64_t),
285 |                "directed_shuffle (as a reference)",
286 |                bench(
287 |                    [&input, precomputed, size, volume]() {
288 |                      for (size_t t = 0; t < volume; t += size) {
289 |                        precomp_shuffle(input.data() + t, size,
290 |                                        precomputed.data() + t);
291 |                      }
292 |                    },
293 |                    min_repeat, min_time_ns, max_repeat));
294 | }
295 | 
296 | int main(int argc, char **argv) {
297 |   seed(1234);
298 |   bool include_cpp = false;
299 |   if (argc > 1) {
300 |     if (std::string(argv[1]) == "--cpp") {
301 |       include_cpp = true;
302 |     }
303 |   }
304 | 
305 |   // We want to make sure we extend the range far enough to see regressions
306 |   // for large arrays, if any.
307 |   for (size_t i = 1 << 9; i <= 1 << 20; i <<= 1) {
308 |     bench(i, include_cpp);
309 |     std::cout << std::endl;
310 |   }
311 | 
312 |   return EXIT_SUCCESS;
313 | }
314 | 


--------------------------------------------------------------------------------
/benchmarks/generators.h:
--------------------------------------------------------------------------------
 1 | #ifndef BENCHMARKS_GENERATORS_H
 2 | #define BENCHMARKS_GENERATORS_H
 3 | #include <random>
 4 | 
 5 | class lehmer64 {
 6 | public:
 7 |   using result_type = uint64_t;
 8 |   static constexpr result_type(min)() { return 0; }
 9 |   static constexpr result_type(max)() { return UINT64_MAX; }
10 | 
11 |   lehmer64() : m_state(1234) {}
12 |   lehmer64(uint64_t seed) : m_state(seed|1) {}
13 |   lehmer64(lehmer64&& l) : m_state(l.m_state) {}
14 |   lehmer64(lehmer64& l) : m_state(l.m_state) {}
15 | 
16 | 
17 |   void step() { m_state *= UINT64_C(0xda942042e4dd58b5); }
18 | 
19 |   result_type operator()() {
20 |     step();
21 |     return (uint64_t)(m_state >> 64);
22 |   }
23 | 
24 |   void discard(unsigned long long n) {
25 |     for (unsigned long long i = 0; i < n; ++i)
26 |       operator()();
27 |   }
28 | 
29 | private:
30 |   __uint128_t m_state;
31 | };
32 | 
33 | #endif


--------------------------------------------------------------------------------
/benchmarks/performancecounters/apple_arm_events.h:
--------------------------------------------------------------------------------
   1 | /* clang-format off */
   2 | 
   3 | // Original design from:
   4 | // =============================================================================
   5 | // XNU kperf/kpc
   6 | // Available for 64-bit Intel/Apple Silicon, macOS/iOS, with root privileges
   7 | //
   8 | // References:
   9 | //
  10 | // XNU source (since xnu 2422.1.72):
  11 | // https://github.com/apple/darwin-xnu/blob/main/osfmk/kern/kpc.h
  12 | // https://github.com/apple/darwin-xnu/blob/main/bsd/kern/kern_kpc.c
  13 | //
  14 | // Lightweight PET (Profile Every Thread, since xnu 3789.1.32):
  15 | // https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/pet.c
  16 | // https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/kperf_kpc.c
  17 | //
  18 | // System Private frameworks (since macOS 10.11, iOS 8.0):
  19 | // /System/Library/PrivateFrameworks/kperf.framework
  20 | // /System/Library/PrivateFrameworks/kperfdata.framework
  21 | //
  22 | // Xcode framework (since Xcode 7.0):
  23 | // /Applications/Xcode.app/Contents/SharedFrameworks/DVTInstrumentsFoundation.framework
  24 | //
  25 | // CPU database (plist files)
  26 | // macOS (since macOS 10.11):
  27 | //     /usr/share/kpep/<name>.plist
  28 | // iOS (copied from Xcode, since iOS 10.0, Xcode 8.0):
  29 | //     /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
  30 | //     /DeviceSupport/<version>/DeveloperDiskImage.dmg/usr/share/kpep/<name>.plist
  31 | //
  32 | //
  33 | // Created by YaoYuan <ibireme@gmail.com> on 2021.
  34 | // Released into the public domain (unlicense.org).
  35 | // =============================================================================
  36 | 
  37 | #ifndef M1CYCLES_H
  38 | #define M1CYCLES_H
  39 | 
  40 | #include <stdbool.h>
  41 | #include <stdint.h>
  42 | #include <stdio.h>
  43 | #include <stdlib.h>
  44 | #include <string.h>
  45 | 
  46 | #include <dlfcn.h>           // for dlopen() and dlsym()
  47 | #include <mach/mach_time.h>  // for mach_absolute_time()
  48 | #include <sys/kdebug.h>      // for kdebug trace decode
  49 | #include <sys/sysctl.h>      // for sysctl()
  50 | #include <unistd.h>          // for usleep()
  51 | 
  52 | struct performance_counters {
  53 |   double cycles;
  54 |   double branches;
  55 |   double missed_branches;
  56 |   double instructions;
  57 |   performance_counters(uint64_t c, uint64_t b, uint64_t m, uint64_t i)
  58 |       : cycles(c), branches(b), missed_branches(m), instructions(i) {}
  59 |   performance_counters(double c, double b, double m, double i)
  60 |       : cycles(c), branches(b), missed_branches(m), instructions(i) {}
  61 |   performance_counters(double init)
  62 |       : cycles(init),
  63 |         branches(init),
  64 |         missed_branches(init),
  65 |         instructions(init) {}
  66 | 
  67 |   inline performance_counters &operator-=(const performance_counters &other) {
  68 |     cycles -= other.cycles;
  69 |     branches -= other.branches;
  70 |     missed_branches -= other.missed_branches;
  71 |     instructions -= other.instructions;
  72 |     return *this;
  73 |   }
  74 |   inline performance_counters &min(const performance_counters &other) {
  75 |     cycles = other.cycles < cycles ? other.cycles : cycles;
  76 |     branches = other.branches < branches ? other.branches : branches;
  77 |     missed_branches = other.missed_branches < missed_branches
  78 |                           ? other.missed_branches
  79 |                           : missed_branches;
  80 |     instructions =
  81 |         other.instructions < instructions ? other.instructions : instructions;
  82 |     return *this;
  83 |   }
  84 |   inline performance_counters &operator+=(const performance_counters &other) {
  85 |     cycles += other.cycles;
  86 |     branches += other.branches;
  87 |     missed_branches += other.missed_branches;
  88 |     instructions += other.instructions;
  89 |     return *this;
  90 |   }
  91 | 
  92 |   inline performance_counters &operator/=(double numerator) {
  93 |     cycles /= numerator;
  94 |     branches /= numerator;
  95 |     missed_branches /= numerator;
  96 |     instructions /= numerator;
  97 |     return *this;
  98 |   }
  99 | };
 100 | 
 101 | inline performance_counters operator-(const performance_counters &a,
 102 |                                       const performance_counters &b) {
 103 |   return performance_counters(a.cycles - b.cycles, a.branches - b.branches,
 104 |                               a.missed_branches - b.missed_branches,
 105 |                               a.instructions - b.instructions);
 106 | }
 107 | 
 108 | typedef float f32;
 109 | typedef double f64;
 110 | typedef int8_t i8;
 111 | typedef uint8_t u8;
 112 | typedef int16_t i16;
 113 | typedef uint16_t u16;
 114 | typedef int32_t i32;
 115 | typedef uint32_t u32;
 116 | typedef int64_t i64;
 117 | typedef uint64_t u64;
 118 | typedef size_t usize;
 119 | 
 120 | // -----------------------------------------------------------------------------
 121 | // <kperf.framework> header (reverse engineered)
 122 | // This framework wraps some sysctl calls to communicate with the kpc in kernel.
 123 | // Most functions require root privileges, or the process must be "blessed".
 124 | // -----------------------------------------------------------------------------
 125 | 
 126 | // Cross-platform class constants.
 127 | #define KPC_CLASS_FIXED (0)
 128 | #define KPC_CLASS_CONFIGURABLE (1)
 129 | #define KPC_CLASS_POWER (2)
 130 | #define KPC_CLASS_RAWPMU (3)
 131 | 
 132 | // Cross-platform class mask constants.
 133 | #define KPC_CLASS_FIXED_MASK (1u << KPC_CLASS_FIXED)                // 1
 134 | #define KPC_CLASS_CONFIGURABLE_MASK (1u << KPC_CLASS_CONFIGURABLE)  // 2
 135 | #define KPC_CLASS_POWER_MASK (1u << KPC_CLASS_POWER)                // 4
 136 | #define KPC_CLASS_RAWPMU_MASK (1u << KPC_CLASS_RAWPMU)              // 8
 137 | 
 138 | // PMU version constants.
 139 | #define KPC_PMU_ERROR (0)      // Error
 140 | #define KPC_PMU_INTEL_V3 (1)   // Intel
 141 | #define KPC_PMU_ARM_APPLE (2)  // ARM64
 142 | #define KPC_PMU_INTEL_V2 (3)   // Old Intel
 143 | #define KPC_PMU_ARM_V2 (4)     // Old ARM
 144 | 
 145 | // The maximum number of counters we could read from every class in one go.
 146 | // ARMV7: FIXED: 1, CONFIGURABLE: 4
 147 | // ARM32: FIXED: 2, CONFIGURABLE: 6
 148 | // ARM64: FIXED: 2, CONFIGURABLE: CORE_NCTRS - FIXED (6 or 8)
 149 | // x86: 32
 150 | #define KPC_MAX_COUNTERS 32
 151 | 
 152 | // Bits for defining what to do on an action.
 153 | // Defined in https://github.com/apple/darwin-xnu/blob/main/osfmk/kperf/action.h
 154 | #define KPERF_SAMPLER_TH_INFO (1U << 0)
 155 | #define KPERF_SAMPLER_TH_SNAPSHOT (1U << 1)
 156 | #define KPERF_SAMPLER_KSTACK (1U << 2)
 157 | #define KPERF_SAMPLER_USTACK (1U << 3)
 158 | #define KPERF_SAMPLER_PMC_THREAD (1U << 4)
 159 | #define KPERF_SAMPLER_PMC_CPU (1U << 5)
 160 | #define KPERF_SAMPLER_PMC_CONFIG (1U << 6)
 161 | #define KPERF_SAMPLER_MEMINFO (1U << 7)
 162 | #define KPERF_SAMPLER_TH_SCHEDULING (1U << 8)
 163 | #define KPERF_SAMPLER_TH_DISPATCH (1U << 9)
 164 | #define KPERF_SAMPLER_TK_SNAPSHOT (1U << 10)
 165 | #define KPERF_SAMPLER_SYS_MEM (1U << 11)
 166 | #define KPERF_SAMPLER_TH_INSCYC (1U << 12)
 167 | #define KPERF_SAMPLER_TK_INFO (1U << 13)
 168 | 
 169 | // Maximum number of kperf action ids.
 170 | #define KPERF_ACTION_MAX (32)
 171 | 
 172 | // Maximum number of kperf timer ids.
 173 | #define KPERF_TIMER_MAX (8)
 174 | 
 175 | // x86/arm config registers are 64-bit
 176 | typedef u64 kpc_config_t;
 177 | 
 178 | /// Print current CPU identification string to the buffer (same as snprintf),
 179 | /// such as "cpu_7_8_10b282dc_46". This string can be used to locate the PMC
 180 | /// database in /usr/share/kpep.
 181 | /// @return string's length, or negative value if error occurs.
 182 | /// @note This method does not require root privileges.
 183 | /// @details sysctl get(hw.cputype), get(hw.cpusubtype),
 184 | ///                 get(hw.cpufamily), get(machdep.cpu.model)
 185 | static int (*kpc_cpu_string)(char *buf, usize buf_size);
 186 | 
 187 | /// Get the version of KPC that's being run.
 188 | /// @return See `PMU version constants` above.
 189 | /// @details sysctl get(kpc.pmu_version)
 190 | static u32 (*kpc_pmu_version)(void);
 191 | 
 192 | /// Get running PMC classes.
 193 | /// @return See `class mask constants` above,
 194 | ///         0 if error occurs or no class is set.
 195 | /// @details sysctl get(kpc.counting)
 196 | static u32 (*kpc_get_counting)(void);
 197 | 
 198 | /// Set PMC classes to enable counting.
 199 | /// @param classes See `class mask constants` above, set 0 to shutdown counting.
 200 | /// @return 0 for success.
 201 | /// @details sysctl set(kpc.counting)
 202 | static int (*kpc_set_counting)(u32 classes);
 203 | 
 204 | /// Get running PMC classes for current thread.
 205 | /// @return See `class mask constants` above,
 206 | ///         0 if error occurs or no class is set.
 207 | /// @details sysctl get(kpc.thread_counting)
 208 | static u32 (*kpc_get_thread_counting)(void);
 209 | 
 210 | /// Set PMC classes to enable counting for current thread.
 211 | /// @param classes See `class mask constants` above, set 0 to shutdown counting.
 212 | /// @return 0 for success.
 213 | /// @details sysctl set(kpc.thread_counting)
 214 | static int (*kpc_set_thread_counting)(u32 classes);
 215 | 
 216 | /// Get how many config registers there are for a given mask.
 217 | /// For example: Intel may return 1 for `KPC_CLASS_FIXED_MASK`,
 218 | ///                     and return 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
 219 | /// @param classes See `class mask constants` above.
 220 | /// @return 0 if error occurs or no class is set.
 221 | /// @note This method does not require root privileges.
 222 | /// @details sysctl get(kpc.config_count)
 223 | static u32 (*kpc_get_config_count)(u32 classes);
 224 | 
 225 | /// Get config registers.
 226 | /// @param classes see `class mask constants` above.
 227 | /// @param config Config buffer to receive values, should not be smaller than
 228 | ///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
 229 | /// @return 0 for success.
 230 | /// @details sysctl get(kpc.config_count), get(kpc.config)
 231 | static int (*kpc_get_config)(u32 classes, kpc_config_t *config);
 232 | 
 233 | /// Set config registers.
 234 | /// @param classes see `class mask constants` above.
 235 | /// @param config Config buffer, should not be smaller than
 236 | ///               kpc_get_config_count(classes) * sizeof(kpc_config_t).
 237 | /// @return 0 for success.
 238 | /// @details sysctl get(kpc.config_count), set(kpc.config)
 239 | static int (*kpc_set_config)(u32 classes, kpc_config_t *config);
 240 | 
 241 | /// Get how many counters there are for a given mask.
 242 | /// For example: Intel may return 3 for `KPC_CLASS_FIXED_MASK`,
 243 | ///                     and return 4 for `KPC_CLASS_CONFIGURABLE_MASK`.
 244 | /// @param classes See `class mask constants` above.
 245 | /// @note This method does not require root privileges.
 246 | /// @details sysctl get(kpc.counter_count)
 247 | static u32 (*kpc_get_counter_count)(u32 classes);
 248 | 
 249 | /// Get counter accumulations.
 250 | /// If `all_cpus` is true, the buffer count should not be smaller than
 251 | /// (cpu_count * counter_count). Otherwise, the buffer count should not be
 252 | /// smaller than (counter_count).
 253 | /// @see kpc_get_counter_count(), kpc_cpu_count().
 254 | /// @param all_cpus true for all CPUs, false for current cpu.
 255 | /// @param classes See `class mask constants` above.
 256 | /// @param curcpu A pointer to receive current cpu id, can be NULL.
 257 | /// @param buf Buffer to receive counter's value.
 258 | /// @return 0 for success.
 259 | /// @details sysctl get(hw.ncpu), get(kpc.counter_count), get(kpc.counters)
 260 | static int (*kpc_get_cpu_counters)(bool all_cpus, u32 classes, int *curcpu,
 261 |                                    u64 *buf);
 262 | 
 263 | /// Get counter accumulations for current thread.
 264 | /// @param tid Thread id, should be 0.
 265 | /// @param buf_count The number of buf's elements (not bytes),
 266 | ///                  should not be smaller than kpc_get_counter_count().
 267 | /// @param buf Buffer to receive counter's value.
 268 | /// @return 0 for success.
 269 | /// @details sysctl get(kpc.thread_counters)
 270 | static int (*kpc_get_thread_counters)(u32 tid, u32 buf_count, u64 *buf);
 271 | 
 272 | /// Acquire/release the counters used by the Power Manager.
 273 | /// @param val 1:acquire, 0:release
 274 | /// @return 0 for success.
 275 | /// @details sysctl set(kpc.force_all_ctrs)
 276 | static int (*kpc_force_all_ctrs_set)(int val);
 277 | 
 278 | /// Get the state of all_ctrs.
 279 | /// @return 0 for success.
 280 | /// @details sysctl get(kpc.force_all_ctrs)
 281 | static int (*kpc_force_all_ctrs_get)(int *val_out);
 282 | 
 283 | /// Set number of actions, should be `KPERF_ACTION_MAX`.
 284 | /// @details sysctl set(kperf.action.count)
 285 | static int (*kperf_action_count_set)(u32 count);
 286 | 
 287 | /// Get number of actions.
 288 | /// @details sysctl get(kperf.action.count)
 289 | static int (*kperf_action_count_get)(u32 *count);
 290 | 
 291 | /// Set what to sample when a trigger fires an action, e.g.
 292 | /// `KPERF_SAMPLER_PMC_CPU`.
 293 | /// @details sysctl set(kperf.action.samplers)
 294 | static int (*kperf_action_samplers_set)(u32 actionid, u32 sample);
 295 | 
 296 | /// Get what to sample when a trigger fires an action.
 297 | /// @details sysctl get(kperf.action.samplers)
 298 | static int (*kperf_action_samplers_get)(u32 actionid, u32 *sample);
 299 | 
 300 | /// Apply a task filter to the action, -1 to disable filter.
 301 | /// @details sysctl set(kperf.action.filter_by_task)
 302 | static int (*kperf_action_filter_set_by_task)(u32 actionid, i32 port);
 303 | 
 304 | /// Apply a pid filter to the action, -1 to disable filter.
 305 | /// @details sysctl set(kperf.action.filter_by_pid)
 306 | static int (*kperf_action_filter_set_by_pid)(u32 actionid, i32 pid);
 307 | 
 308 | /// Set number of time triggers, should be `KPERF_TIMER_MAX`.
 309 | /// @details sysctl set(kperf.timer.count)
 310 | static int (*kperf_timer_count_set)(u32 count);
 311 | 
 312 | /// Get number of time triggers.
 313 | /// @details sysctl get(kperf.timer.count)
 314 | static int (*kperf_timer_count_get)(u32 *count);
 315 | 
 316 | /// Set timer number and period.
 317 | /// @details sysctl set(kperf.timer.period)
 318 | static int (*kperf_timer_period_set)(u32 actionid, u64 tick);
 319 | 
 320 | /// Get timer number and period.
 321 | /// @details sysctl get(kperf.timer.period)
 322 | static int (*kperf_timer_period_get)(u32 actionid, u64 *tick);
 323 | 
 324 | /// Set timer number and actionid.
 325 | /// @details sysctl set(kperf.timer.action)
 326 | static int (*kperf_timer_action_set)(u32 actionid, u32 timerid);
 327 | 
 328 | /// Get timer number and actionid.
 329 | /// @details sysctl get(kperf.timer.action)
 330 | static int (*kperf_timer_action_get)(u32 actionid, u32 *timerid);
 331 | 
 332 | /// Set which timer ID does PET (Profile Every Thread).
 333 | /// @details sysctl set(kperf.timer.pet_timer)
 334 | static int (*kperf_timer_pet_set)(u32 timerid);
 335 | 
 336 | /// Get which timer ID does PET (Profile Every Thread).
 337 | /// @details sysctl get(kperf.timer.pet_timer)
 338 | static int (*kperf_timer_pet_get)(u32 *timerid);
 339 | 
 340 | /// Enable or disable sampling.
 341 | /// @details sysctl set(kperf.sampling)
 342 | static int (*kperf_sample_set)(u32 enabled);
 343 | 
 344 | /// Get whether sampling is currently enabled.
 345 | /// @details sysctl get(kperf.sampling)
 346 | static int (*kperf_sample_get)(u32 *enabled);
 347 | 
 348 | /// Reset kperf: stop sampling, kdebug, timers and actions.
 349 | /// @return 0 for success.
 350 | static int (*kperf_reset)(void);
 351 | 
 352 | /// Nanoseconds to CPU ticks.
 353 | static u64 (*kperf_ns_to_ticks)(u64 ns);
 354 | 
 355 | /// CPU ticks to nanoseconds.
 356 | static u64 (*kperf_ticks_to_ns)(u64 ticks);
 357 | 
 358 | /// CPU ticks frequency (mach_absolute_time).
 359 | static u64 (*kperf_tick_frequency)(void);
 360 | 
 361 | // -----------------------------------------------------------------------------
 362 | // <kperfdata.framework> header (reverse engineered)
 363 | // This framework provides some functions to access the local CPU database.
 364 | // These functions do not require root privileges.
 365 | // -----------------------------------------------------------------------------
 366 | 
 367 | // KPEP CPU architecture constants.
 368 | #define KPEP_ARCH_I386 0
 369 | #define KPEP_ARCH_X86_64 1
 370 | #define KPEP_ARCH_ARM 2
 371 | #define KPEP_ARCH_ARM64 3
 372 | 
 373 | /// KPEP event (size: 48/28 bytes on 64/32 bit OS).
 374 | typedef struct kpep_event {  // reverse-engineered layout: keep field order and sizes
 375 |   const char *name;  ///< Unique name of an event, such as "INST_RETIRED.ANY".
 376 |   const char *description;  ///< Description for this event.
 377 |   const char *errata;       ///< Errata, currently NULL.
 378 |   const char *alias;        ///< Alias name, such as "Instructions", "Cycles".
 379 |   const char *fallback;     ///< Fallback event name for fixed counter.
 380 |   u32 mask;
 381 |   u8 number;
 382 |   u8 umask;
 383 |   u8 reserved;
 384 |   u8 is_fixed;  ///< Presumably nonzero for fixed-counter events (unverified).
 385 | } kpep_event;
 386 | 
 387 | /// KPEP database (size: 144/80 bytes on 64/32 bit OS).
 388 | typedef struct kpep_db {  // reverse-engineered layout: keep field order and sizes
 389 |   const char *name;            ///< Database name, such as "haswell".
 390 |   const char *cpu_id;          ///< Plist name, such as "cpu_7_8_10b282dc".
 391 |   const char *marketing_name;  ///< Marketing name, such as "Intel Haswell".
 392 |   void *plist_data;            ///< Plist data (CFDataRef), currently NULL.
 393 |   void *event_map;  ///< All events (CFDict<CFSTR(event_name), kpep_event *>).
 394 |   kpep_event
 395 |       *event_arr;  ///< Event struct buffer (sizeof(kpep_event) * event_count).
 396 |   kpep_event **fixed_event_arr;  ///< Fixed counter events (sizeof(kpep_event *)
 397 |                                  ///< * fixed_counter_count)
 398 |   void *alias_map;  ///< All aliases (CFDict<CFSTR(event_name), kpep_event *>).
 399 |   usize reserved_1;
 400 |   usize reserved_2;
 401 |   usize reserved_3;
 402 |   usize event_count;  ///< All events count.
 403 |   usize alias_count;  ///< Presumably the number of aliases (unverified).
 404 |   usize fixed_counter_count;  ///< Element count of fixed_event_arr.
 405 |   usize config_counter_count;
 406 |   usize power_counter_count;
 407 |   u32 archtecture;  ///< One of the KPEP_ARCH_* constants above (name kept as-is).
 408 |   u32 fixed_counter_bits;
 409 |   u32 config_counter_bits;
 410 |   u32 power_counter_bits;
 411 | } kpep_db;
 412 | 
 413 | /// KPEP config (size: 80/44 bytes on 64/32 bit OS).
 414 | typedef struct kpep_config {  // reverse-engineered layout: keep field order and sizes
 415 |   kpep_db *db;  ///< Presumably the db given to kpep_config_create() (unverified).
 416 |   kpep_event **ev_arr;  ///< (sizeof(kpep_event *) * counter_count), init NULL
 417 |   usize *ev_map;        ///< (sizeof(usize *) * counter_count), init 0
 418 |   usize *ev_idx;        ///< (sizeof(usize *) * counter_count), init -1
 419 |   u32 *flags;           ///< (sizeof(u32 *) * counter_count), init 0
 420 |   u64 *kpc_periods;     ///< (sizeof(u64 *) * counter_count), init 0
 421 |   usize event_count;    ///< See kpep_config_events_count().
 422 |   usize counter_count;  ///< Element count of the per-counter arrays above.
 423 |   u32 classes;  ///< See `class mask constants` above.
 424 |   u32 config_counter;
 425 |   u32 power_counter;
 426 |   u32 reserved;
 427 | } kpep_config;
 428 | 
 429 | /// Error codes returned by the kpep_config_xxx() and kpep_db_xxx() functions.
 430 | typedef enum {  // values double as indices into kpep_config_error_names
 431 |   KPEP_CONFIG_ERROR_NONE = 0,
 432 |   KPEP_CONFIG_ERROR_INVALID_ARGUMENT = 1,
 433 |   KPEP_CONFIG_ERROR_OUT_OF_MEMORY = 2,
 434 |   KPEP_CONFIG_ERROR_IO = 3,
 435 |   KPEP_CONFIG_ERROR_BUFFER_TOO_SMALL = 4,
 436 |   KPEP_CONFIG_ERROR_CUR_SYSTEM_UNKNOWN = 5,
 437 |   KPEP_CONFIG_ERROR_DB_PATH_INVALID = 6,
 438 |   KPEP_CONFIG_ERROR_DB_NOT_FOUND = 7,
 439 |   KPEP_CONFIG_ERROR_DB_ARCH_UNSUPPORTED = 8,
 440 |   KPEP_CONFIG_ERROR_DB_VERSION_UNSUPPORTED = 9,
 441 |   KPEP_CONFIG_ERROR_DB_CORRUPT = 10,
 442 |   KPEP_CONFIG_ERROR_EVENT_NOT_FOUND = 11,
 443 |   KPEP_CONFIG_ERROR_CONFLICTING_EVENTS = 12,
 444 |   KPEP_CONFIG_ERROR_COUNTERS_NOT_FORCED = 13,
 445 |   KPEP_CONFIG_ERROR_EVENT_UNAVAILABLE = 14,
 446 |   KPEP_CONFIG_ERROR_ERRNO = 15,
 447 |   KPEP_CONFIG_ERROR_MAX  ///< Number of codes; sizes kpep_config_error_names.
 448 | } kpep_config_error_code;
 449 | 
 450 | /// Error descriptions, indexed by kpep_config_error_code (one entry per code).
 451 | static const char *kpep_config_error_names[KPEP_CONFIG_ERROR_MAX] = {
 452 |     "none",
 453 |     "invalid argument",
 454 |     "out of memory",
 455 |     "I/O",
 456 |     "buffer too small",
 457 |     "current system unknown",
 458 |     "database path invalid",
 459 |     "database not found",
 460 |     "database architecture unsupported",
 461 |     "database version unsupported",
 462 |     "database corrupt",
 463 |     "event not found",
 464 |     "conflicting events",
 465 |     "all counters must be forced",
 466 |     "event unavailable",
 467 |     "check errno"};
 468 | 
 469 | /// Error description.
 470 | static const char *kpep_config_error_desc(int code) {
 471 |   if (0 <= code && code < KPEP_CONFIG_ERROR_MAX) {
 472 |     return kpep_config_error_names[code];
 473 |   }
 474 |   return "unknown error";
 475 | }
 476 | 
 477 | /// Create a config.
 478 | /// @param db A kpep db, see kpep_db_create()
 479 | /// @param cfg_ptr A pointer to receive the new config.
 480 | /// @return kpep_config_error_code, 0 for success.
 481 | static int (*kpep_config_create)(kpep_db *db, kpep_config **cfg_ptr);
 482 | 
 483 | /// Free the config.
 484 | static void (*kpep_config_free)(kpep_config *cfg);
 485 | 
 486 | /// Add an event to config.
 487 | /// @param cfg The config.
 488 | /// @param ev_ptr A event pointer.
 489 | /// @param flag 0: all, 1: user space only
 490 | /// @param err Error bitmap pointer, can be NULL.
 491 | ///            If return value is `CONFLICTING_EVENTS`, this bitmap contains
 492 | ///            the conflicted event indices, e.g. "1 << 2" means index 2.
 493 | /// @return kpep_config_error_code, 0 for success.
 494 | static int (*kpep_config_add_event)(kpep_config *cfg, kpep_event **ev_ptr,
 495 |                                     u32 flag, u32 *err);
 496 | 
 497 | /// Remove event at index.
 498 | /// @return kpep_config_error_code, 0 for success.
 499 | static int (*kpep_config_remove_event)(kpep_config *cfg, usize idx);
 500 | 
 501 | /// Force all counters.
 502 | /// @return kpep_config_error_code, 0 for success.
 503 | static int (*kpep_config_force_counters)(kpep_config *cfg);
 504 | 
 505 | /// Get events count.
 506 | /// @return kpep_config_error_code, 0 for success.
 507 | static int (*kpep_config_events_count)(kpep_config *cfg, usize *count_ptr);
 508 | 
 509 | /// Get all event pointers.
 510 | /// @param buf A buffer to receive event pointers.
 511 | /// @param buf_size The buffer's size in bytes, should not be smaller than
 512 | ///                 kpep_config_events_count() * sizeof(void *).
 513 | /// @return kpep_config_error_code, 0 for success.
 514 | static int (*kpep_config_events)(kpep_config *cfg, kpep_event **buf,
 515 |                                  usize buf_size);
 516 | 
 517 | /// Get kpc register configs.
 518 | /// @param buf A buffer to receive kpc register configs.
 519 | /// @param buf_size The buffer's size in bytes, should not be smaller than
 520 | ///                 kpep_config_kpc_count() * sizeof(kpc_config_t).
 521 | /// @return kpep_config_error_code, 0 for success.
 522 | static int (*kpep_config_kpc)(kpep_config *cfg, kpc_config_t *buf,
 523 |                               usize buf_size);
 524 | 
 525 | /// Get kpc register config count.
 526 | /// @return kpep_config_error_code, 0 for success.
 527 | static int (*kpep_config_kpc_count)(kpep_config *cfg, usize *count_ptr);
 528 | 
 529 | /// Get kpc classes.
 530 | /// @param classes_ptr Receives the classes; see `class mask constants` above.
 531 | /// @return kpep_config_error_code, 0 for success.
 532 | static int (*kpep_config_kpc_classes)(kpep_config *cfg, u32 *classes_ptr);
 533 | 
 534 | /// Get the index mapping from event to counter.
 535 | /// @param buf A buffer to receive indexes.
 536 | /// @param buf_size The buffer's size in bytes, should not be smaller than
 537 | ///                 kpep_config_events_count() * sizeof(usize).
 538 | /// @return kpep_config_error_code, 0 for success.
 539 | static int (*kpep_config_kpc_map)(kpep_config *cfg, usize *buf, usize buf_size);
 540 | 
 541 | /// Open a kpep database file in "/usr/share/kpep/" or "/usr/local/share/kpep/".
 542 | /// @param name File name, for example "haswell", "cpu_100000c_1_92fb37c8".
 543 | ///             Pass NULL for current CPU.
 544 | /// @return kpep_config_error_code, 0 for success.
 545 | static int (*kpep_db_create)(const char *name, kpep_db **db_ptr);
 546 | 
 547 | /// Free the kpep database.
 548 | static void (*kpep_db_free)(kpep_db *db);
 549 | 
 550 | /// Get the database's name.
 551 | /// @return kpep_config_error_code, 0 for success.
 552 | static int (*kpep_db_name)(kpep_db *db, const char **name);
 553 | 
 554 | /// Get the event alias count.
 555 | /// @return kpep_config_error_code, 0 for success.
 556 | static int (*kpep_db_aliases_count)(kpep_db *db, usize *count);
 557 | 
 558 | /// Get all aliases.
 559 | /// @param buf A buffer to receive all alias strings.
 560 | /// @param buf_size The buffer's size in bytes,
 561 | ///        should not be smaller than kpep_db_aliases_count() * sizeof(void *).
 562 | /// @return kpep_config_error_code, 0 for success.
 563 | static int (*kpep_db_aliases)(kpep_db *db, const char **buf, usize buf_size);
 564 | 
 565 | /// Get counters count for given classes.
 566 | /// @param classes 1: Fixed, 2: Configurable.
 567 | /// @return kpep_config_error_code, 0 for success.
 568 | static int (*kpep_db_counters_count)(kpep_db *db, u8 classes, usize *count);
 569 | 
 570 | /// Get the count of all events.
 571 | /// @return kpep_config_error_code, 0 for success.
 572 | static int (*kpep_db_events_count)(kpep_db *db, usize *count);
 573 | 
 574 | /// Get all events.
 575 | /// @param buf A buffer to receive all event pointers.
 576 | /// @param buf_size The buffer's size in bytes,
 577 | ///        should not be smaller than kpep_db_events_count() * sizeof(void *).
 578 | /// @return kpep_config_error_code, 0 for success.
 579 | static int (*kpep_db_events)(kpep_db *db, kpep_event **buf, usize buf_size);
 580 | 
 581 | /// Get one event by name.
 582 | /// @return kpep_config_error_code, 0 for success.
 583 | static int (*kpep_db_event)(kpep_db *db, const char *name, kpep_event **ev_ptr);
 584 | 
 585 | /// Get event's name.
 586 | /// @return kpep_config_error_code, 0 for success.
 587 | static int (*kpep_event_name)(kpep_event *ev, const char **name_ptr);
 588 | 
 589 | /// Get event's alias.
 590 | /// @return kpep_config_error_code, 0 for success.
 591 | static int (*kpep_event_alias)(kpep_event *ev, const char **alias_ptr);
 592 | 
 593 | /// Get event's description.
 594 | /// @return kpep_config_error_code, 0 for success.
 595 | static int (*kpep_event_description)(kpep_event *ev, const char **str_ptr);
 596 | 
 597 | // -----------------------------------------------------------------------------
 598 | // load kperf/kperfdata dynamic library
 599 | // -----------------------------------------------------------------------------
 600 | 
 601 | typedef struct {  // one entry of the symbol tables resolved by lib_init()
 602 |   const char *name;  ///< Symbol name handed to dlsym().
 603 |   void **impl;       ///< Address of the function-pointer variable to fill in.
 604 | } lib_symbol;
 605 | 
 606 | #define lib_nelems(x) (sizeof(x) / sizeof((x)[0]))
 607 | #define lib_symbol_def(name) \
 608 |   { #name, (void **)&name }
 609 | 
 610 | static const lib_symbol lib_symbols_kperf[] = {  // resolved from kperf by lib_init()
 611 |     lib_symbol_def(kpc_pmu_version),
 612 |     lib_symbol_def(kpc_cpu_string),
 613 |     lib_symbol_def(kpc_set_counting),
 614 |     lib_symbol_def(kpc_get_counting),
 615 |     lib_symbol_def(kpc_set_thread_counting),
 616 |     lib_symbol_def(kpc_get_thread_counting),
 617 |     lib_symbol_def(kpc_get_config_count),
 618 |     lib_symbol_def(kpc_get_counter_count),
 619 |     lib_symbol_def(kpc_set_config),
 620 |     lib_symbol_def(kpc_get_config),
 621 |     lib_symbol_def(kpc_get_cpu_counters),
 622 |     lib_symbol_def(kpc_get_thread_counters),
 623 |     lib_symbol_def(kpc_force_all_ctrs_set),
 624 |     lib_symbol_def(kpc_force_all_ctrs_get),
 625 |     lib_symbol_def(kperf_action_count_set),
 626 |     lib_symbol_def(kperf_action_count_get),
 627 |     lib_symbol_def(kperf_action_samplers_set),
 628 |     lib_symbol_def(kperf_action_samplers_get),
 629 |     lib_symbol_def(kperf_action_filter_set_by_task),
 630 |     lib_symbol_def(kperf_action_filter_set_by_pid),
 631 |     lib_symbol_def(kperf_timer_count_set),
 632 |     lib_symbol_def(kperf_timer_count_get),
 633 |     lib_symbol_def(kperf_timer_period_set),
 634 |     lib_symbol_def(kperf_timer_period_get),
 635 |     lib_symbol_def(kperf_timer_action_set),
 636 |     lib_symbol_def(kperf_timer_action_get),
 637 |     lib_symbol_def(kperf_sample_set),
 638 |     lib_symbol_def(kperf_sample_get),
 639 |     lib_symbol_def(kperf_reset),
 640 |     lib_symbol_def(kperf_timer_pet_set),
 641 |     lib_symbol_def(kperf_timer_pet_get),
 642 |     lib_symbol_def(kperf_ns_to_ticks),
 643 |     lib_symbol_def(kperf_ticks_to_ns),
 644 |     lib_symbol_def(kperf_tick_frequency),
 645 | };
 646 | 
 647 | static const lib_symbol lib_symbols_kperfdata[] = {  // resolved from kperfdata
 648 |     lib_symbol_def(kpep_config_create),
 649 |     lib_symbol_def(kpep_config_free),
 650 |     lib_symbol_def(kpep_config_add_event),
 651 |     lib_symbol_def(kpep_config_remove_event),
 652 |     lib_symbol_def(kpep_config_force_counters),
 653 |     lib_symbol_def(kpep_config_events_count),
 654 |     lib_symbol_def(kpep_config_events),
 655 |     lib_symbol_def(kpep_config_kpc),
 656 |     lib_symbol_def(kpep_config_kpc_count),
 657 |     lib_symbol_def(kpep_config_kpc_classes),
 658 |     lib_symbol_def(kpep_config_kpc_map),
 659 |     lib_symbol_def(kpep_db_create),
 660 |     lib_symbol_def(kpep_db_free),
 661 |     lib_symbol_def(kpep_db_name),
 662 |     lib_symbol_def(kpep_db_aliases_count),
 663 |     lib_symbol_def(kpep_db_aliases),
 664 |     lib_symbol_def(kpep_db_counters_count),
 665 |     lib_symbol_def(kpep_db_events_count),
 666 |     lib_symbol_def(kpep_db_events),
 667 |     lib_symbol_def(kpep_db_event),
 668 |     lib_symbol_def(kpep_event_name),
 669 |     lib_symbol_def(kpep_event_alias),
 670 |     lib_symbol_def(kpep_event_description),
 671 | };
 672 | 
 673 | #define lib_path_kperf "/System/Library/PrivateFrameworks/kperf.framework/kperf"
 674 | #define lib_path_kperfdata \
 675 |   "/System/Library/PrivateFrameworks/kperfdata.framework/kperfdata"
 676 | 
 677 | static bool lib_inited = false;   ///< Set once lib_init() has run, pass or fail.
 678 | static bool lib_has_err = false;  ///< Set when lib_init() failed.
 679 | static char lib_err_msg[256];     ///< Text written by lib_init() on failure.
 680 | 
 681 | static void *lib_handle_kperf = NULL;      ///< dlopen() handle for kperf.
 682 | static void *lib_handle_kperfdata = NULL;  ///< dlopen() handle for kperfdata.
 683 | 
 684 | /// Unload both frameworks and reset every loader global to its initial state.
 685 | static void lib_deinit(void) {
 686 |   lib_inited = false;
 687 |   lib_has_err = false;
 688 |   // Close whichever libraries were opened (kperf first), then drop the handles.
 689 |   if (lib_handle_kperf != NULL) dlclose(lib_handle_kperf);
 690 |   if (lib_handle_kperfdata != NULL) dlclose(lib_handle_kperfdata);
 691 |   lib_handle_kperf = lib_handle_kperfdata = NULL;
 692 |   // Clear every function pointer that the symbol tables refer to.
 693 |   for (const lib_symbol &entry : lib_symbols_kperf) {
 694 |     *entry.impl = NULL;
 695 |   }
 696 |   for (const lib_symbol &entry : lib_symbols_kperfdata) {
 697 |     *entry.impl = NULL;
 698 |   }
 699 | }
 700 | 
 701 | /// Load kperf/kperfdata and resolve every symbol in both tables; runs once and
 702 | /// caches the outcome. On failure returns false and fills lib_err_msg.
 703 | static bool lib_init(void) {
 704 |   if (lib_inited) return !lib_has_err;  // an earlier call already decided
 705 | 
 706 |   // Shared failure path: undo partial loading, then record the failed attempt.
 707 |   const auto fail = []() -> bool {
 708 |     lib_deinit();
 709 |     lib_inited = true;
 710 |     lib_has_err = true;
 711 |     return false;
 712 |   };
 713 | 
 714 |   // Open both private frameworks.
 715 |   lib_handle_kperf = dlopen(lib_path_kperf, RTLD_LAZY);
 716 |   if (lib_handle_kperf == NULL) {
 717 |     snprintf(lib_err_msg, sizeof(lib_err_msg),
 718 |              "Failed to load kperf.framework, message: %s.", dlerror());
 719 |     return fail();
 720 |   }
 721 |   lib_handle_kperfdata = dlopen(lib_path_kperfdata, RTLD_LAZY);
 722 |   if (lib_handle_kperfdata == NULL) {
 723 |     snprintf(lib_err_msg, sizeof(lib_err_msg),
 724 |              "Failed to load kperfdata.framework, message: %s.", dlerror());
 725 |     return fail();
 726 |   }
 727 | 
 728 |   // Resolve each table entry; the first missing symbol aborts the whole load.
 729 |   for (const lib_symbol &entry : lib_symbols_kperf) {
 730 |     void *addr = dlsym(lib_handle_kperf, entry.name);
 731 |     *entry.impl = addr;
 732 |     if (addr == NULL) {
 733 |       snprintf(lib_err_msg, sizeof(lib_err_msg),
 734 |                "Failed to load kperf function: %s.", entry.name);
 735 |       return fail();
 736 |     }
 737 |   }
 738 |   for (const lib_symbol &entry : lib_symbols_kperfdata) {
 739 |     void *addr = dlsym(lib_handle_kperfdata, entry.name);
 740 |     *entry.impl = addr;
 741 |     if (addr == NULL) {
 742 |       snprintf(lib_err_msg, sizeof(lib_err_msg),
 743 |                "Failed to load kperfdata function: %s.", entry.name);
 744 |       return fail();
 745 |     }
 746 |   }
 747 | 
 748 |   lib_inited = true;
 749 |   lib_has_err = false;
 750 |   return true;
 751 | }
 752 | 
 753 | // -----------------------------------------------------------------------------
 754 | // kdebug private structs
 755 | // https://github.com/apple/darwin-xnu/blob/main/bsd/sys_private/kdebug_private.h
 756 | // -----------------------------------------------------------------------------
 757 | 
 758 | /*
 759 |  * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
 760 |  * structure.
 761 |  */
 762 | #if defined(__arm64__)
 763 | typedef uint64_t kd_buf_argtype;
 764 | #else
 765 | typedef uintptr_t kd_buf_argtype;
 766 | #endif
 767 | 
 768 | typedef struct { /* layout mirrored from xnu's kdebug_private.h; do not reorder */
 769 |   uint64_t timestamp;
 770 |   kd_buf_argtype arg1;
 771 |   kd_buf_argtype arg2;
 772 |   kd_buf_argtype arg3;
 773 |   kd_buf_argtype arg4;
 774 |   kd_buf_argtype arg5; /* the thread ID */
 775 |   uint32_t debugid;    /* see <sys/kdebug.h> */
 776 | 
 777 | /*
 778 |  * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
 779 |  * structure: the two fields below exist on every LP64 or arm64 build.
 780 |  */
 781 | #if defined(__LP64__) || defined(__arm64__)
 782 |   uint32_t cpuid; /* cpu index, from 0 */
 783 |   kd_buf_argtype unused;
 784 | #endif
 785 | } kd_buf;
 786 | 
 787 | /* bits for the type field of kd_regtype */
 788 | #define KDBG_CLASSTYPE 0x10000
 789 | #define KDBG_SUBCLSTYPE 0x20000
 790 | #define KDBG_RANGETYPE 0x40000
 791 | #define KDBG_TYPENONE 0x80000
 792 | #define KDBG_CKTYPES 0xF0000
 793 | 
 794 | /* only trace at most 4 types of events, at the code granularity */
 795 | #define KDBG_VALCHECK 0x00200000U
 796 | 
 797 | typedef struct { /* layout mirrored from xnu's kdebug_private.h */
 798 |   unsigned int type; /* takes the KDBG_*TYPE bits defined above */
 799 |   unsigned int value1;
 800 |   unsigned int value2;
 801 |   unsigned int value3;
 802 |   unsigned int value4;
 803 | } kd_regtype;
 804 | 
 805 | typedef struct { /* layout mirrored from xnu's kdebug_private.h */
 806 |   /* number of events that can fit in the buffers */
 807 |   int nkdbufs;
 808 |   /* set if trace is disabled */
 809 |   int nolog;
 810 |   /* kd_ctrl_page.flags */
 811 |   unsigned int flags;
 812 |   /* number of threads in thread map */
 813 |   int nkdthreads;
 814 |   /* the owning pid */
 815 |   int bufid;
 816 | } kbufinfo_t;
 817 | 
 818 | // -----------------------------------------------------------------------------
 819 | // kdebug utils
 820 | // -----------------------------------------------------------------------------
 821 | 
 822 | #define EVENT_NAME_MAX 8
 823 | typedef struct {  // one printable event plus its candidate database names
 824 |   const char *alias;                  ///< Name used when printing.
 825 |   const char *names[EVENT_NAME_MAX];  ///< pmc db names; a NULL entry ends the list.
 826 | } event_alias;
 827 | 
 828 | /// Events to profile; candidate names come from /usr/share/kpep/<name>.plist.
 829 | static const event_alias profile_events[] = {
 830 |     {"cycles",
 831 |      {
 832 |          "FIXED_CYCLES",             // Apple A7-A15
 833 |          "CPU_CLK_UNHALTED.THREAD",  // Intel Core 1st-10th gen
 834 |          "CPU_CLK_UNHALTED.CORE",    // Intel Yonah, Merom
 835 |      }},
 836 |     {"instructions",
 837 |      {
 838 |          "FIXED_INSTRUCTIONS",  // Apple A7-A15
 839 |          "INST_RETIRED.ANY"     // Intel Yonah, Merom, Core 1st-10th gen
 840 |      }},
 841 |     {"branches",
 842 |      {
 843 |          "INST_BRANCH",                   // Apple A7-A15
 844 |          "BR_INST_RETIRED.ALL_BRANCHES",  // Intel Core 1st-10th gen
 845 |          "INST_RETIRED.ANY",              // Intel Yonah, Merom
 846 |      }},
 847 |     {"branch-misses",
 848 |      {
 849 |          "BRANCH_MISPRED_NONSPEC",  // Apple A7-A15, since iOS 15, macOS 12
 850 |          "BRANCH_MISPREDICT",       // Apple A7-A14
 851 |          "BR_MISP_RETIRED.ALL_BRANCHES",  // Intel Core 2nd-10th gen
 852 |          "BR_INST_RETIRED.MISPRED",       // Intel Yonah, Merom
 853 |      }},
 854 | };
 855 | 
 856 | /// Look up an alias in `db`: the first candidate name that kpep_db_event()
 857 | /// accepts wins. Returns NULL when no candidate is accepted.
 858 | static kpep_event *get_event(kpep_db *db, const event_alias *alias) {
 859 |   // Candidates are tried in order; a NULL name marks the end of the list.
 860 |   for (usize idx = 0; idx < EVENT_NAME_MAX && alias->names[idx]; idx++) {
 861 |     kpep_event *found = NULL;
 862 |     if (kpep_db_event(db, alias->names[idx], &found) != 0) continue;
 863 |     return found;
 864 |   }
 865 |   return NULL;
 866 | }
 867 | 
 868 | struct AppleEvents {  // kpc counter access; run setup_performance_counters() first
 869 |   kpc_config_t regs[KPC_MAX_COUNTERS] = {0};  ///< Filled by kpep_config_kpc().
 870 |   usize counter_map[KPC_MAX_COUNTERS] = {0};  ///< Filled by kpep_config_kpc_map().
 871 |   u64 counters_0[KPC_MAX_COUNTERS] = {0};  ///< Values read by get_counters().
 872 |   u64 counters_1[KPC_MAX_COUNTERS] = {0};  ///< Not used by the members below.
 873 |   static constexpr usize ev_count =
 874 |       sizeof(profile_events) / sizeof(profile_events[0]);  ///< profile_events length
 875 |   bool init = false;    ///< True once setup has been attempted.
 876 |   bool worked = false;  ///< Result of that attempt, returned by later calls.
 877 |   inline bool setup_performance_counters() {  // one-shot; result is cached
 878 |     if (init) {
 879 |       return worked;
 880 |     }
 881 |     init = true;
 882 | 
 883 |     // load the kperf/kperfdata dylibs and resolve their symbols
 884 |     if (!lib_init()) {
 885 |       printf("Error: %s\n", lib_err_msg);
 886 |       return (worked = false);
 887 |     }
 888 | 
 889 |     // check permission; this failure path prints nothing
 890 |     int force_ctrs = 0;
 891 |     if (kpc_force_all_ctrs_get(&force_ctrs)) {
 892 |       return (worked = false);
 893 |     }
 894 |     int ret;
 895 |     // load pmc db (NULL selects the current CPU)
 896 |     kpep_db *db = NULL;  // NOTE(review): never released with kpep_db_free()
 897 |     if ((ret = kpep_db_create(NULL, &db))) {
 898 |       printf("Error: cannot load pmc database: %d.\n", ret);
 899 |       return (worked = false);
 900 |     }
 901 |     // printf("loaded db: %s (%s)\n", db->name, db->marketing_name);
 902 |     // printf("number of fixed counters: %zu\n", db->fixed_counter_count);
 903 |     // printf("number of configurable counters: %zu\n",
 904 |     // db->config_counter_count);
 905 | 
 906 |     // create a config
 907 |     kpep_config *cfg = NULL;  // NOTE(review): never released with kpep_config_free()
 908 |     if ((ret = kpep_config_create(db, &cfg))) {
 909 |       printf("Failed to create kpep config: %d (%s).\n", ret,
 910 |              kpep_config_error_desc(ret));
 911 |       return (worked = false);
 912 |     }
 913 |     if ((ret = kpep_config_force_counters(cfg))) {
 914 |       printf("Failed to force counters: %d (%s).\n", ret,
 915 |              kpep_config_error_desc(ret));
 916 |       return (worked = false);
 917 |     }
 918 | 
 919 |     // look up every profiled event; all of them must exist in the db
 920 |     kpep_event *ev_arr[ev_count] = {0};
 921 |     for (usize i = 0; i < ev_count; i++) {
 922 |       const event_alias *alias = profile_events + i;
 923 |       ev_arr[i] = get_event(db, alias);
 924 |       if (!ev_arr[i]) {
 925 |         printf("Cannot find event: %s.\n", alias->alias);
 926 |         return (worked = false);
 927 |       }
 928 |     }
 929 | 
 930 |     // add the events to the config, in profile_events order
 931 |     for (usize i = 0; i < ev_count; i++) {
 932 |       kpep_event *ev = ev_arr[i];
 933 |       if ((ret = kpep_config_add_event(cfg, &ev, 0, NULL))) {
 934 |         printf("Failed to add event: %d (%s).\n", ret,
 935 |                kpep_config_error_desc(ret));
 936 |         return (worked = false);
 937 |       }
 938 |     }
 939 | 
 940 |     // prepare buffer and config: classes, register count, map and registers
 941 |     u32 classes = 0;
 942 |     usize reg_count = 0;
 943 |     if ((ret = kpep_config_kpc_classes(cfg, &classes))) {
 944 |       printf("Failed get kpc classes: %d (%s).\n", ret,
 945 |              kpep_config_error_desc(ret));
 946 |       return (worked = false);
 947 |     }
 948 |     if ((ret = kpep_config_kpc_count(cfg, &reg_count))) {
 949 |       printf("Failed get kpc count: %d (%s).\n", ret,
 950 |              kpep_config_error_desc(ret));
 951 |       return (worked = false);
 952 |     }
 953 |     if ((ret = kpep_config_kpc_map(cfg, counter_map, sizeof(counter_map)))) {
 954 |       printf("Failed get kpc map: %d (%s).\n", ret,
 955 |              kpep_config_error_desc(ret));
 956 |       return (worked = false);
 957 |     }
 958 |     if ((ret = kpep_config_kpc(cfg, regs, sizeof(regs)))) {
 959 |       printf("Failed get kpc registers: %d (%s).\n", ret,
 960 |              kpep_config_error_desc(ret));
 961 |       return (worked = false);
 962 |     }
 963 | 
 964 |     // set config to kernel (registers are only written for configurable counters)
 965 |     if ((ret = kpc_force_all_ctrs_set(1))) {
 966 |       printf("Failed force all ctrs: %d.\n", ret);
 967 |       return (worked = false);
 968 |     }
 969 |     if ((classes & KPC_CLASS_CONFIGURABLE_MASK) && reg_count) {
 970 |       if ((ret = kpc_set_config(classes, regs))) {
 971 |         printf("Failed set kpc config: %d.\n", ret);
 972 |         return (worked = false);
 973 |       }
 974 |     }
 975 | 
 976 |     // start counting, both globally and for threads
 977 |     if ((ret = kpc_set_counting(classes))) {
 978 |       printf("Failed set counting: %d.\n", ret);
 979 |       return (worked = false);
 980 |     }
 981 |     if ((ret = kpc_set_thread_counting(classes))) {
 982 |       printf("Failed set thread counting: %d.\n", ret);
 983 |       return (worked = false);
 984 |     }
 985 | 
 986 |     return (worked = true);
 987 |   }
 988 | 
 989 |   inline performance_counters get_counters() {  // snapshot of thread counters
 990 |     static bool warned = false;  // so the failure message is printed only once
 991 |     int ret;
 992 |     // read the current thread's counters into counters_0
 993 |     if ((ret = kpc_get_thread_counters(0, KPC_MAX_COUNTERS, counters_0))) {
 994 |       if (!warned) {
 995 |         printf("Failed get thread counters before: %d.\n", ret);
 996 |         warned = true;
 997 |       }
 998 |       return 1;  // converts to performance_counters; NOTE(review): confirm intent
 999 |     }
1000 |     return performance_counters{  // profile_events entries 0, 3, 2, 1, i.e.
1001 |         counters_0[counter_map[0]], counters_0[counter_map[3]],  // cycles, misses,
1002 |         counters_0[counter_map[2]], counters_0[counter_map[1]]};  // branches, instr.
1003 |   }  // NOTE(review): check that order against performance_counters' fields
1004 | };
1005 | 
1006 | #endif
1007 | 


--------------------------------------------------------------------------------
/benchmarks/performancecounters/benchmarker.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "performancecounters/event_counter.h"
 4 | #include <atomic>
 5 | #include <cstdio>
 6 | 
 7 | event_collector collector;  // used by bench(); NOTE(review): non-inline global in a header
 8 | 
 9 | /// Benchmark `function`: after 10 warm-up calls, run up to 30 trials and return
10 | /// the first whose elapsed_ns()/fastest_elapsed_ns() ratio is under `tolerance`,
11 | /// otherwise the trial with the lowest ratio seen. `min_repeat`, `min_time_ns`
12 | /// and `max_repeat` control how many measured calls each trial makes.
13 | template <class function_type>
14 | event_aggregate bench(const function_type &function, size_t min_repeat = 1,
15 |                       size_t min_time_ns = 1000000000,
16 |                       size_t max_repeat = 1000000, double tolerance = 2.0) {
17 |   // run it a few times to warm up the cache
18 |   for (size_t i = 0; i < 10; i++) function();
19 | 
20 |   size_t N = (min_repeat == 0) ? 1 : min_repeat;  // at least one measured call
21 |   const size_t max_trials = 30;
22 |   size_t trial = 0;
23 |   std::pair<double, event_aggregate> best{std::numeric_limits<double>::max(),
24 |                                           event_aggregate{}};
25 |   do {
26 |     event_aggregate aggregate{};
27 |     for (size_t i = 0; i < N; i++) {
28 |       std::atomic_thread_fence(std::memory_order_acquire);
29 |       collector.start();
30 |       function();
31 |       std::atomic_thread_fence(std::memory_order_release);
32 |       event_count allocate_count = collector.end();
33 |       aggregate << allocate_count;
34 |       // Run too short so far: grow N so this same loop keeps measuring.
35 |       if ((i + 1 == N) && (aggregate.total_elapsed_ns() < min_time_ns) &&
36 |           (N < max_repeat)) {
37 |         N *= 10;
38 |       }
39 |     }
40 |     double ratio = aggregate.elapsed_ns() / aggregate.fastest_elapsed_ns();
41 |     trial++;
42 |     if (ratio < tolerance) return aggregate;
43 |     // A fastest time of 0 ns makes the ratio inf/NaN, which never compares
44 |     // below best.first; still keep the first trial's data in that case so the
45 |     // fallback is real measurements rather than an empty aggregate.
46 |     if (ratio < best.first) {
47 |       best = {ratio, aggregate};
48 |     } else if (trial == 1) {
49 |       best.second = aggregate;
50 |     }
51 |     if (trial >= max_trials) return best.second;
52 |   } while (true);
53 | }
54 | 


--------------------------------------------------------------------------------
/benchmarks/performancecounters/event_counter.h:
--------------------------------------------------------------------------------
  1 | #ifndef __EVENT_COUNTER_H
  2 | #define __EVENT_COUNTER_H
  3 | 
  4 | #include <cctype>
  5 | #ifndef _MSC_VER
  6 | #include <dirent.h>
  7 | #endif
  8 | #include <cinttypes>
  9 | 
 10 | #include <cstring>
 11 | 
 12 | #include <chrono>
 13 | #include <vector>
 14 | 
 15 | #include "linux-perf-events.h"
 16 | #ifdef __linux__
 17 | #include <libgen.h>
 18 | #endif
 19 | 
 20 | #if __APPLE__ && __aarch64__
 21 | #include "apple_arm_events.h"
 22 | #endif
 23 | 
 24 | // Elapsed wall-clock time plus raw counter readings for one measurement.
 25 | // Slot 0 holds CPU cycles and slot 1 instructions (see event_counter_types);
 26 | // any further slots are platform-specific and are only summed here.
 27 | struct event_count {
 28 |   std::chrono::duration<double> elapsed;
 29 |   std::vector<unsigned long long> event_counts;
 30 |   event_count() : elapsed(0), event_counts{0, 0, 0, 0, 0} {}
 31 |   event_count(const std::chrono::duration<double> _elapsed,
 32 |               const std::vector<unsigned long long> _event_counts)
 33 |       : elapsed(_elapsed), event_counts(_event_counts) {}
 34 |   // Copy and move operations are the implicitly generated member-wise ones.
 35 | 
 36 |   // The types of counters (so we can read the getter more easily)
 37 |   enum event_counter_types {
 38 |     CPU_CYCLES,
 39 |     INSTRUCTIONS,
 40 |   };
 41 | 
 42 |   double elapsed_sec() const {
 43 |     return std::chrono::duration<double>(elapsed).count();
 44 |   }
 45 |   double elapsed_ns() const {
 46 |     return std::chrono::duration<double, std::nano>(elapsed).count();
 47 |   }
 48 |   double cycles() const { return static_cast<double>(event_counts[CPU_CYCLES]); }
 49 |   double instructions() const { return static_cast<double>(event_counts[INSTRUCTIONS]); }
 50 | 
 51 |   // Returns the element-wise sum of both readings (elapsed time and counters);
 52 |   // the result has as many counter slots as the longer operand.
 53 |   event_count operator+(const event_count &other) const {
 54 |     event_count sum(*this);
 55 |     sum += other;
 56 |     return sum;
 57 |   }
 58 | 
 59 |   // Adds `other` in place. A shorter counter vector is treated as zero-padded,
 60 |   // so operands of different lengths are never read out of bounds.
 61 |   void operator+=(const event_count &other) {
 62 |     elapsed += other.elapsed;
 63 |     if (event_counts.size() < other.event_counts.size()) {
 64 |       event_counts.resize(other.event_counts.size(), 0);
 65 |     }
 66 |     for (std::size_t i = 0; i < other.event_counts.size(); i++) {
 67 |       event_counts[i] += other.event_counts[i];
 68 |     }
 69 |   }
 70 | };
 71 | 
 72 | // Running summary of a series of event_count samples: their sum plus the
 73 | // fastest and slowest sample seen so far (compared by elapsed time).
 74 | struct event_aggregate {
 75 |   bool has_events = false;
 76 |   int iterations = 0;
 77 |   event_count total{};
 78 |   event_count best{};
 79 |   event_count worst{};
 80 | 
 81 |   event_aggregate() = default;
 82 | 
 83 |   // Folds one sample in; the first sample becomes both best and worst.
 84 |   void operator<<(const event_count &other) {
 85 |     const bool first_sample = (iterations == 0);
 86 |     if (first_sample || other.elapsed < best.elapsed) { best = other; }
 87 |     if (first_sample || other.elapsed > worst.elapsed) { worst = other; }
 88 |     total += other;
 89 |     iterations++;
 90 |   }
 91 | 
 92 |   double total_elapsed_ns() const { return total.elapsed_ns(); }
 93 |   double elapsed_sec() const { return total.elapsed_sec() / iterations; }
 94 |   double elapsed_ns() const { return total.elapsed_ns() / iterations; }
 95 |   double cycles() const { return total.cycles() / iterations; }
 96 |   double instructions() const { return total.instructions() / iterations; }
 97 |   double fastest_elapsed_ns() const { return best.elapsed_ns(); }
 98 |   double fastest_cycles() const { return best.cycles(); }
 99 |   double fastest_instructions() const { return best.instructions(); }
100 | };
101 | 
102 | // Measures wall-clock time and, where supported, hardware counters around a
103 | // region of code: call start(), run the code, then end() to get the reading.
104 | struct event_collector {
105 |   event_count count{};
106 |   std::chrono::time_point<std::chrono::steady_clock> start_clock{};
107 | 
108 | #if defined(__linux__)
109 |   LinuxEvents<PERF_TYPE_HARDWARE> linux_events;
110 |   event_collector()
111 |       : linux_events(std::vector<int>{PERF_COUNT_HW_CPU_CYCLES,
112 |                                       PERF_COUNT_HW_INSTRUCTIONS}) {}
113 |   bool has_events() { return linux_events.is_working(); }
114 | #elif __APPLE__ && __aarch64__
115 |   AppleEvents apple_events;
116 |   performance_counters diff;
117 |   event_collector() : diff(0) { apple_events.setup_performance_counters(); }
118 |   bool has_events() { return apple_events.setup_performance_counters(); }
119 | #else
120 |   event_collector() {}
121 |   bool has_events() { return false; }
122 | #endif
123 | 
124 |   // Starts a measurement. Tests __linux__, the macro guarding linux_events.
125 |   inline void start() {
126 | #if defined(__linux__)
127 |     linux_events.start();
128 | #elif __APPLE__ && __aarch64__
129 |     if (has_events()) { diff = apple_events.get_counters(); }
130 | #endif
131 |     start_clock = std::chrono::steady_clock::now();
132 |   }
133 |   // Ends the measurement; the returned reference is to the reused `count`.
134 |   inline event_count &end() {
135 |     const auto end_clock = std::chrono::steady_clock::now();
136 | #if defined(__linux__)
137 |     linux_events.end(count.event_counts);
138 | #elif __APPLE__ && __aarch64__
139 |     if (has_events()) {
140 |       performance_counters end = apple_events.get_counters();
141 |       diff = end - diff;
142 |     }
143 |     count.event_counts[0] = diff.cycles;
144 |     count.event_counts[1] = diff.instructions;
145 |     count.event_counts[2] = diff.missed_branches;
146 |     count.event_counts[3] = 0;
147 |     count.event_counts[4] = diff.branches;
148 | #endif
149 |     count.elapsed = end_clock - start_clock;
150 |     return count;
151 |   }
152 | };
153 | 
154 | #endif
155 | 


--------------------------------------------------------------------------------
/benchmarks/performancecounters/linux-perf-events.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #ifdef __linux__
  3 | 
  4 | #include <asm/unistd.h>       // for __NR_perf_event_open
  5 | #include <linux/perf_event.h> // for perf event constants
  6 | #include <sys/ioctl.h>        // for ioctl
  7 | #include <unistd.h>           // for syscall
  8 | 
  9 | #include <cerrno>  // for errno
 10 | #include <cstring> // for memset
 11 | #include <stdexcept>
 12 | 
 13 | #include <iostream>
 14 | #include <vector>
 15 | 
 16 | // Thin wrapper around a group of Linux perf events (perf_event_open).
 17 | // The first event opened successfully is the group leader; start()/end() act
 18 | // on the whole group through it. Any failure makes is_working() return false.
 19 | template <int TYPE = PERF_TYPE_HARDWARE> class LinuxEvents {
 20 |   int fd; // group leader descriptor, or -1 when no event could be opened
 21 |   bool working;
 22 |   perf_event_attr attribs{};
 23 |   size_t num_events{};
 24 |   std::vector<uint64_t> temp_result_vec{};
 25 |   std::vector<uint64_t> ids{};
 26 |   std::vector<int> fds{}; // every descriptor opened; closed by the destructor
 27 | 
 28 | public:
 29 |   // Opens one counter per entry of config_vec for the calling process.
 30 |   explicit LinuxEvents(std::vector<int> config_vec) : fd(-1), working(true) {
 31 |     memset(&attribs, 0, sizeof(attribs));
 32 |     attribs.type = TYPE;
 33 |     attribs.size = sizeof(attribs);
 34 |     attribs.disabled = 1;
 35 |     attribs.exclude_kernel = 1;
 36 |     attribs.exclude_hv = 1;
 37 |     attribs.sample_period = 0;
 38 |     attribs.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
 39 |     const int pid = 0;  // the current process
 40 |     const int cpu = -1; // all CPUs
 41 |     const unsigned long flags = 0;
 42 | 
 43 |     num_events = config_vec.size();
 44 |     ids.resize(num_events);
 45 |     temp_result_vec.resize(num_events * 2 + 1);
 46 |     for (size_t e = 0; e < num_events; e++) {
 47 |       attribs.config = config_vec[e];
 48 |       // fd doubles as the group argument: -1 (new group) until a leader exists.
 49 |       const int _fd = static_cast<int>(
 50 |           syscall(__NR_perf_event_open, &attribs, pid, cpu, fd, flags));
 51 |       if (_fd == -1) {
 52 |         report_error("perf_event_open");
 53 |         continue; // nothing to query or close for this event
 54 |       }
 55 |       fds.push_back(_fd);
 56 |       if (ioctl(_fd, PERF_EVENT_IOC_ID, &ids[e]) == -1) {
 57 |         report_error("ioctl(PERF_EVENT_IOC_ID)");
 58 |       }
 59 |       if (fd == -1) { fd = _fd; }
 60 |     }
 61 |   }
 62 | 
 63 |   // Closes every descriptor that was opened, not only the group leader.
 64 |   ~LinuxEvents() {
 65 |     for (const int opened : fds) { close(opened); }
 66 |   }
 67 | 
 68 |   // Resets and enables all counters of the group (no-op without a leader).
 69 |   inline void start() {
 70 |     if (fd != -1) {
 71 |       if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
 72 |         report_error("ioctl(PERF_EVENT_IOC_RESET)");
 73 |       }
 74 |       if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
 75 |         report_error("ioctl(PERF_EVENT_IOC_ENABLE)");
 76 |       }
 77 |     }
 78 |   }
 79 | 
 80 |   // Disables the group and stores counter e in results[e]; `results` must
 81 |   // hold at least as many entries as events were requested.
 82 |   inline void end(std::vector<unsigned long long> &results) {
 83 |     if (fd != -1) {
 84 |       if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
 85 |         report_error("ioctl(PERF_EVENT_IOC_DISABLE)");
 86 |       }
 87 |       if (read(fd, temp_result_vec.data(), temp_result_vec.size() * 8) == -1) {
 88 |         report_error("read");
 89 |       }
 90 |     }
 91 |     // our actual results are in slots 1,3,5, ...; the ids in slots 2,4,6, ...
 92 |     for (size_t e = 0; e < num_events; e++) {
 93 |       results[e] = temp_result_vec[2 * e + 1];
 94 |       if (ids[e] != temp_result_vec[2 * e + 2]) {
 95 |         report_error("event mismatch");
 96 |       }
 97 |     }
 98 |   }
 99 | 
100 |   bool is_working() { return working; }
101 | 
102 | private:
103 |   void report_error(const std::string &) { working = false; }
104 | };
104 | #endif


--------------------------------------------------------------------------------
/benchmarks/stream.cpp:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "performancecounters/benchmarker.h"
  3 | #include <algorithm>
  4 | #include <charconv>
  5 | #include <filesystem>
  6 | #include <fstream>
  7 | #include <iostream>
  8 | #include <random>
  9 | #include <stdlib.h>
 10 | #include <vector>
 11 | extern "C" {
 12 | #include "random_bounded.h"
 13 | }
 14 | #include "generators.h"
 15 | #include "template_shuffle.h"
 16 | std::vector<uint32_t> precomputed;
 17 | // Shuffles storage[0..size) by swapping, for i = size down to 2, element
 18 | // i - 1 with the element at the precomputed position precomputed[i].
 19 | void precomp_shuffle(uint64_t *storage, uint64_t size) {
 20 |   for (size_t remaining = size; remaining > 1; remaining--) {
 21 |     uint64_t &current = storage[remaining - 1];         // likely in cache
 22 |     uint64_t &target = storage[precomputed[remaining]]; // could be costly
 23 |     const uint64_t saved = current;
 24 |     current = target;
 25 |     target = saved; // you might have to read this store later
 26 |   }
 27 | }
 28 | // Prints the average time per item in ns; `bytes` and `name` are unused.
 29 | void pretty_print(size_t volume, size_t bytes, std::string name,
 30 |                   event_aggregate agg) {
 31 |   static_cast<void>(bytes);
 32 |   static_cast<void>(name);
 33 |   printf(" %5.2f  ", agg.elapsed_ns() / static_cast<double>(volume));
 34 |   fflush(stdout);
 35 | }
 36 | // Signature shared by the C shuffle routines: (array, element count).
 37 | using shuffle_function = void (*)(uint64_t *, uint64_t);
 38 | // A shuffle routine paired with the name printed in the table header.
 39 | struct named_function {
 40 |   std::string name;
 41 |   shuffle_function function;
 42 | };
 43 | // The routines timed by bench_line, in output column order.
 44 | named_function func[] = {
 45 |     {"shuffle_lehmer", shuffle_lehmer},
 46 |     {"naive_shuffle_lehmer_2", naive_shuffle_lehmer_2},
 47 |     {"shuffle_lehmer_2", shuffle_lehmer_2},
 48 |     {"shuffle_lehmer_23456", shuffle_lehmer_23456},
 49 |     {"shuffle_pcg", shuffle_pcg},
 50 |     {"naive_shuffle_pcg_2", naive_shuffle_pcg_2},
 51 |     {"shuffle_pcg_2", shuffle_pcg_2},
 52 |     {"shuffle_pcg_23456", shuffle_pcg_23456},
 53 |     {"shuffle_chacha", shuffle_chacha},
 54 |     {"naive_shuffle_chacha_2", naive_shuffle_chacha_2},
 55 |     {"shuffle_chacha_2", shuffle_chacha_2},
 56 |     {"shuffle_chacha_23456", shuffle_chacha_23456}};
 57 | // Iterator-range shuffle driven by a std::mt19937_64 generator.
 58 | using cpp_shuffle_function = void (*)(std::vector<uint64_t>::iterator,
 59 |                                       std::vector<uint64_t>::iterator,
 60 |                                       std::mt19937_64 &);
 61 | // Iterator-range shuffle driven by a lehmer64 generator.
 62 | using fast_cpp_shuffle_function = void (*)(std::vector<uint64_t>::iterator,
 63 |                                            std::vector<uint64_t>::iterator,
 64 |                                            lehmer64 &);
 65 | // A Mersenne-Twister-based shuffle paired with its display name.
 66 | struct named_cpp_function {
 67 |   std::string name;
 68 |   cpp_shuffle_function function;
 69 | };
 70 | // In this file only the names are used (printed by bench_table).
 71 | named_cpp_function cppfunc[] = {
 72 |     {"std::shuffle-mersenne",
 73 |      [](std::vector<uint64_t>::iterator first,
 74 |         std::vector<uint64_t>::iterator last,
 75 |         std::mt19937_64 &g) { std::shuffle(first, last, g); }},
 76 |     {"batched_random::shuffle_2-mersenne",
 77 |      [](std::vector<uint64_t>::iterator first,
 78 |         std::vector<uint64_t>::iterator last,
 79 |         std::mt19937_64 &g) { batched_random::shuffle_2(first, last, g); }},
 80 |     {"batched_random::shuffle_23456-mersenne",
 81 |      [](std::vector<uint64_t>::iterator first,
 82 |         std::vector<uint64_t>::iterator last, std::mt19937_64 &g) {
 83 |        batched_random::shuffle_23456(first, last, g);
 84 |      }}};
 85 | // A lehmer64-based shuffle paired with its display name.
 86 | struct named_fast_cpp_function {
 87 |   std::string name;
 88 |   fast_cpp_shuffle_function function;
 89 | };
 90 | // In this file only the names are used (printed by bench_table).
 91 | named_fast_cpp_function fastcppfunc[] = {
 92 |     {"std::shuffle-lehmer", [](std::vector<uint64_t>::iterator first,
 93 |                                std::vector<uint64_t>::iterator last,
 94 |                                lehmer64 &g) { std::shuffle(first, last, g); }},
 95 |     {"batched_random::shuffle_2-lehmer",
 96 |      [](std::vector<uint64_t>::iterator first,
 97 |         std::vector<uint64_t>::iterator last,
 98 |         lehmer64 &g) { batched_random::shuffle_2(first, last, g); }},
 99 |     {"batched_random::shuffle_23456-lehmer",
100 |      [](std::vector<uint64_t>::iterator first,
101 |         std::vector<uint64_t>::iterator last,
102 |         lehmer64 &g) { batched_random::shuffle_23456(first, last, g); }}};
103 | // Benchmarks every entry of `func` on `input` and prints one table row.
104 | void bench_line(std::vector<uint64_t> &input) {
105 |   const size_t volume = input.size();
106 |   printf("%zu\t\t", volume);
107 |   // Fill precomputed[1..volume] with bounded random values.
108 |   precomputed.resize(volume + 1);
109 |   for (size_t bound = 1; bound <= volume; bound++) {
110 |     precomputed[bound] = random_bounded_lehmer(bound);
111 |   }
112 |   std::random_device rd;
113 |   const size_t min_repeat = 1;
114 |   const size_t min_time_ns = 1000000; // 1 ms
115 |   const size_t max_repeat = 100000;
116 |   const double tolerance = 1.1;
117 |   // Inputs below 10000 items are shuffled twice per timed call.
118 |   const size_t repeat = (volume < 10000) ? 2 : 1;
119 |   std::mt19937_64 mtGenerator{rd()};
120 |   lehmer64 lehmerGenerator{rd()};
121 | 
122 |   size_t column = 0;
123 |   for (auto &f : func) {
124 |     auto timed_call = [&input, &f, repeat]() {
125 |       for (size_t r = 0; r < repeat; r++) {
126 |         f.function(input.data(), input.size());
127 |       }
128 |     };
129 |     const event_aggregate result =
130 |         bench(timed_call, min_repeat, min_time_ns, max_repeat, tolerance);
131 |     pretty_print(volume * repeat, repeat * volume * sizeof(uint64_t), f.name,
132 |                  result);
133 |     column++;
134 |     if (column % 4 == 0) { printf("          "); }
135 |   }
136 | }
137 | // Prints the header, then one row per input size. Sizes are geometrically
138 | // spaced: start * (end / start)^(k / lines) for k = 0..lines, rounded.
139 | void bench_table(size_t start, size_t end, size_t lines) {
140 |   const double growth = double(end) / start;
141 |   printf("# for each scheme, we give the average time/item in ns \n");
142 |   printf("# Volume\t");
143 |   // NOTE(review): bench_line emits data for `func` only, not the first two.
144 |   for (auto &f : fastcppfunc) { printf("\t%s", f.name.c_str()); }
145 |   for (auto &f : cppfunc) { printf("\t%s", f.name.c_str()); }
146 |   for (auto &f : func) { printf("\t%s", f.name.c_str()); }
147 |   printf("\n");
148 |   // Each size is computed from k directly: multiplying a running value by a
149 |   // constant never advances when start == end and can drift past `end`.
150 |   for (size_t k = 0; k <= lines; k++) {
151 |     const double exponent = (lines == 0) ? 0.0 : double(k) / lines;
152 |     const double volume = round(start * pow(growth, exponent));
153 |     if (!(volume <= end)) { break; } // also stops on NaN (e.g. start == 0)
154 |     std::vector<uint64_t> input(static_cast<size_t>(volume));
155 |     bench_line(input);
156 |     std::cout << std::endl;
157 |   }
158 | }
159 | // Entry point: calls seed(1234) (declared elsewhere), then prints the table.
160 | int main(int, char **) {
161 |   seed(1234);
162 |   bench_table(100, 150000, 15);
163 |   return EXIT_SUCCESS;
164 | }
165 | 


--------------------------------------------------------------------------------
/gnuplot/README.md:
--------------------------------------------------------------------------------
 1 | Example:
 2 | 
 3 | ```
 4 | ./stream > stream.txt
 5 | cd gnuplot
 6 | gnuplot -e "filename='../stream.txt'" plot.gnuplot
 7 | ```
 8 | 
 9 | 
10 | m2.data : LLVM 14, Apple M2
11 | 
12 | icelake.data : LLVM 16, Ice Lake processor


--------------------------------------------------------------------------------
/gnuplot/icelake.data:
--------------------------------------------------------------------------------
 1 | # for each scheme, we give the best time/item and the average time/item in ns
 2 | # Volume		std::shuffle-lehmer	batched_random::shuffle_2-lehmer	batched_random::shuffle_23456-lehmer	std::shuffle-mersenne	batched_random::shuffle_2-mersenne	batched_random::shuffle_23456-mersenne	shuffle_lehmer	naive_shuffle_lehmer_2	shuffle_lehmer_2	shuffle_lehmer_23456	shuffle_pcg 	naive_shuffle_pcg_2	shuffle_pcg_2	shuffle_pcg_23456	shuffle_chacha	naive_shuffle_chacha_2	shuffle_chacha_2	shuffle_chacha_23456
 3 | 100		  1.71    1.92    1.63    1.21              2.66    2.32    2.09    1.49             16.55   11.07    9.15    3.98
 4 | 163		  1.68    1.79    1.46    1.10              2.63    2.16    1.90    1.37             15.91   10.18    8.25    3.47
 5 | 265		  1.63    1.62    1.30    1.01              2.61    1.99    1.71    1.35             15.98   10.00    8.11    3.38
 6 | 432		  1.56    1.54    1.22    0.97              2.55    1.91    1.62    1.28             16.04    9.89    8.04    3.37
 7 | 703		  1.54    1.50    1.16    0.92              2.52    1.86    1.56    1.19             16.08    9.62    7.95    3.44
 8 | 1145		  1.51    1.47    1.13    0.87              2.50    1.84    1.53    1.10             16.16    9.78    8.00    3.58
 9 | 1864		  1.50    1.46    1.08    0.82              2.47    1.81    1.50    1.05             16.17    9.76    7.95    3.61
10 | 3035		  1.46    1.39    1.03    0.79              2.44    1.73    1.44    1.01             16.20    9.57    7.88    3.83
11 | 4942		  1.43    1.37    1.01    0.76              2.44    1.71    1.41    0.99             16.19    9.35    7.78    3.95
12 | 8047		  1.42    1.33    0.99    0.76              2.42    1.68    1.38    0.99             16.17    9.27    7.76    4.05
13 | 13104		  1.41    1.31    0.98    0.78              2.41    1.66    1.36    0.97             16.14    9.08    7.73    4.09
14 | 21337		  1.41    1.30    0.97    0.83              2.40    1.64    1.35    0.97             16.09    9.21    7.75    4.46
15 | 34743		  1.41    1.30    0.99    0.89              2.41    1.64    1.34    0.99             16.10    9.14    7.71    4.85
16 | 56573		  1.40    1.29    1.02    0.97              2.40    1.64    1.35    1.03             16.11    9.12    7.77    5.13
17 | 92120		  1.46    1.69    1.07    1.03              2.40    2.07    1.37    1.08             16.12    9.63    7.74    5.31
18 | 150000		  1.54    2.18    1.17    1.13              2.47    2.56    1.55    1.22             16.15   10.21    8.01    5.56


--------------------------------------------------------------------------------
/gnuplot/icelake.datalehmer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/icelake.datalehmer.pdf


--------------------------------------------------------------------------------
/gnuplot/icelake.datapcg64.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/icelake.datapcg64.pdf


--------------------------------------------------------------------------------
/gnuplot/icelake.dataratio.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/icelake.dataratio.pdf


--------------------------------------------------------------------------------
/gnuplot/icelakestream.data:
--------------------------------------------------------------------------------
  1 | # for each scheme, we give the best time/item and the average time/item in ns 
  2 | # Volume		std::shuffle	batched_random::shuffle_2	batched_random::shuffle_2_4	batched_random::shuffle_2_4_6	precomp_shuffle	shuffle	shuffle_batch_2	shuffle_batch_2_4	shuffle_batch_2_4_6	shuffle_pcg64	shuffle_batch_2_pcg64	shuffle_batch_2_4_pcg64	shuffle_batch_2_4_6_pcg64
  3 | 256		  3.61    4.03  	  4.16    4.38  	  3.12    3.45  	  3.51    4.02  	  0.84    0.97  	  2.16    2.43  	  1.28    1.54  	  1.48    1.98  	  1.20    1.99  	  2.68    3.31  	  1.75    2.19  	  1.62    2.43  	  1.50    2.17  	
  4 | 320		  3.43    3.56  	  4.04    4.26  	  2.94    3.18  	  2.59    3.17  	  0.77    0.87  	  2.16    2.39  	  1.24    1.47  	  1.50    1.90  	  1.25    1.96  	  2.64    3.25  	  1.71    2.13  	  1.60    2.38  	  1.55    2.14  	
  5 | 384		  3.30    3.43  	  3.83    3.97  	  2.72    2.96  	  2.47    3.12  	  0.74    0.80  	  2.15    2.36  	  1.22    1.43  	  1.53    1.85  	  1.24    1.90  	  2.62    3.20  	  1.72    2.08  	  1.59    2.34  	  1.45    2.09  	
  6 | 448		  3.21    3.30  	  3.77    3.90  	  2.75    2.98  	  2.46    2.81  	  0.76    0.84  	  2.14    2.34  	  1.20    1.40  	  1.53    1.82  	  1.21    1.85  	  2.60    3.17  	  1.68    2.05  	  1.57    2.30  	  1.51    2.07  	
  7 | 512		  3.14    3.22  	  3.66    3.77  	  2.67    2.90  	  2.28    2.96  	  0.81    0.89  	  2.14    2.32  	  1.19    1.37  	  1.52    1.79  	  1.17    1.81  	  2.59    3.15  	  1.66    2.01  	  1.58    2.25  	  1.52    2.04  	
  8 | 576		  3.09    3.15  	  3.67    3.82  	  2.51    2.69  	  2.42    2.71  	  0.68    0.74  	  2.12    2.30  	  1.18    1.35  	  1.50    1.77  	  1.25    1.85  	  2.59    3.12  	  1.65    1.98  	  1.53    2.21  	  1.57    2.05  	
  9 | 640		  3.32    3.42  	  3.86    4.01  	  2.54    2.74  	  2.58    2.95  	  0.71    0.77  	  2.12    2.29  	  1.17    1.33  	  1.48    1.75  	  1.30    1.85  	  2.58    3.10  	  1.63    1.96  	  1.53    2.17  	  1.58    2.03  	
 10 | 704		  3.26    3.36  	  3.84    3.98  	  2.46    2.63  	  2.27    2.52  	  0.70    0.73  	  2.12    2.28  	  1.16    1.32  	  1.49    1.73  	  1.32    1.84  	  2.58    3.08  	  1.63    1.94  	  1.48    2.13  	  1.58    2.00  	
 11 | 768		  3.21    3.29  	  3.72    3.86  	  2.46    2.68  	  2.40    2.67  	  0.71    0.78  	  2.11    2.27  	  1.16    1.30  	  1.48    1.72  	  1.35    1.82  	  2.57    3.07  	  1.62    1.92  	  1.43    2.10  	  1.58    1.98  	
 12 | 832		  3.17    3.24  	  3.71    3.84  	  2.43    2.63  	  2.22    2.43  	  0.70    0.75  	  2.10    2.26  	  1.15    1.29  	  1.47    1.71  	  1.35    1.80  	  2.56    3.05  	  1.61    1.90  	  1.44    2.07  	  1.56    1.96  	
 13 | 896		  3.14    3.21  	  3.66    3.79  	  2.42    2.57  	  2.38    2.61  	  0.77    0.79  	  2.11    2.25  	  1.16    1.28  	  1.48    1.70  	  1.34    1.79  	  2.55    3.04  	  1.60    1.89  	  1.42    2.04  	  1.56    1.95  	
 14 | 960		  3.11    3.20  	  3.62    3.74  	  2.38    2.57  	  2.47    2.71  	  0.71    0.76  	  2.11    2.24  	  1.14    1.27  	  1.47    1.69  	  1.41    1.78  	  2.55    3.03  	  1.60    1.87  	  1.41    2.02  	  1.60    1.94  	
 15 | 1024		  3.08    3.14  	  3.60    3.72  	  2.42    2.60  	  2.26    2.55  	  0.70    0.73  	  2.11    2.24  	  1.15    1.26  	  1.45    1.68  	  1.39    1.77  	  2.55    3.02  	  1.59    1.87  	  1.41    2.00  	  1.60    1.93  	
 16 | 1088		  3.05    3.11  	  3.56    3.65  	  2.35    2.50  	  2.15    2.40  	  0.71    0.73  	  2.11    2.23  	  1.14    1.26  	  1.44    1.67  	  1.40    1.76  	  2.55    3.01  	  1.59    1.85  	  1.41    1.97  	  1.58    1.92  	
 17 | 1152		  3.03    3.10  	  3.59    3.79  	  2.36    2.53  	  2.37    2.68  	  0.67    0.70  	  2.10    2.23  	  1.14    1.25  	  1.45    1.66  	  1.42    1.81  	  2.55    3.01  	  1.60    1.87  	  1.39    2.02  	  1.61    1.99  	
 18 | 1216		  3.01    3.07  	  3.54    3.78  	  2.39    2.63  	  2.42    2.73  	  0.70    0.72  	  2.10    2.22  	  1.14    1.24  	  1.45    1.65  	  1.44    1.82  	  2.54    3.00  	  1.60    1.86  	  1.38    2.01  	  1.59    2.00  	
 19 | 1280		  3.16    3.22  	  3.67    3.84  	  2.53    2.91  	  2.21    2.59  	  0.66    0.70  	  2.11    2.22  	  1.14    1.24  	  1.43    1.64  	  1.42    1.81  	  2.55    3.00  	  1.59    1.86  	  1.39    2.00  	  1.60    2.00  	
 20 | 1344		  3.11    3.17  	  3.61    3.89  	  2.39    2.58  	  2.26    2.56  	  0.69    0.74  	  2.09    2.21  	  1.13    1.23  	  1.44    1.64  	  1.42    1.81  	  2.54    3.00  	  1.58    1.85  	  1.40    1.99  	  1.61    2.00  	
 21 | 1408		  3.11    3.17  	  3.58    3.84  	  2.52    2.85  	  2.38    2.61  	  0.66    0.69  	  2.10    2.21  	  1.13    1.23  	  1.41    1.63  	  1.44    1.81  	  2.54    2.99  	  1.58    1.85  	  1.39    1.98  	  1.60    2.00  	
 22 | 1472		  3.08    3.15  	  3.58    3.73  	  2.40    2.66  	  2.41    2.74  	  0.67    0.69  	  2.10    2.20  	  1.13    1.22  	  1.42    1.62  	  1.44    1.81  	  2.54    2.98  	  1.60    1.84  	  1.35    1.97  	  1.62    2.00  	
 23 | 1536		  3.06    3.11  	  3.59    3.77  	  2.45    2.72  	  2.22    2.44  	  0.69    0.71  	  2.10    2.20  	  1.14    1.22  	  1.43    1.62  	  1.45    1.81  	  2.54    2.98  	  1.58    1.84  	  1.39    1.97  	  1.62    2.00  	
 24 | 1600		  3.04    3.10  	  3.56    3.78  	  2.39    2.68  	  2.27    2.51  	  0.71    0.74  	  2.09    2.20  	  1.13    1.22  	  1.41    1.61  	  1.47    1.80  	  2.54    2.98  	  1.58    1.83  	  1.39    1.96  	  1.62    2.00  	
 25 | 1664		  3.05    3.10  	  3.54    3.82  	  2.39    2.54  	  2.37    2.64  	  0.68    0.70  	  2.10    2.19  	  1.13    1.21  	  1.42    1.61  	  1.42    1.80  	  2.54    2.97  	  1.58    1.83  	  1.39    1.95  	  1.62    2.00  	
 26 | 1728		  3.02    3.07  	  3.54    3.80  	  2.49    2.68  	  2.31    2.56  	  0.68    0.70  	  2.09    2.19  	  1.13    1.21  	  1.35    1.60  	  1.46    1.80  	  2.53    2.97  	  1.58    1.83  	  1.35    1.94  	  1.62    2.00  	
 27 | 1792		  3.01    3.07  	  3.53    3.74  	  2.42    2.64  	  2.32    2.69  	  0.68    0.71  	  2.09    2.19  	  1.13    1.21  	  1.43    1.60  	  1.45    1.80  	  2.54    2.96  	  1.58    1.82  	  1.38    1.94  	  1.60    2.00  	
 28 | 1856		  2.99    3.08  	  3.48    3.68  	  2.38    2.56  	  2.30    2.55  	  0.67    0.68  	  2.09    2.18  	  1.13    1.20  	  1.42    1.59  	  1.47    1.80  	  2.54    2.96  	  1.59    1.82  	  1.30    1.93  	  1.62    2.00  	
 29 | 1920		  3.07    3.14  	  3.56    3.70  	  2.34    2.53  	  2.33    2.64  	  0.67    0.70  	  2.09    2.18  	  1.12    1.20  	  1.42    1.59  	  1.45    1.79  	  2.54    2.96  	  1.56    1.81  	  1.38    1.92  	  1.61    2.00  	
 30 | 1984		  3.08    3.13  	  3.57    3.69  	  2.40    2.71  	  2.32    2.66  	  0.68    0.69  	  2.10    2.18  	  1.12    1.20  	  1.41    1.58  	  1.47    1.79  	  2.54    2.95  	  1.58    1.81  	  1.37    1.91  	  1.63    1.99  	
 31 | 2048		  3.05    3.11  	  3.59    3.77  	  2.33    2.56  	  2.30    2.75  	  0.69    0.71  	  2.09    2.18  	  1.13    1.20  	  1.42    1.58  	  1.46    1.78  	  2.53    2.95  	  1.58    1.81  	  1.36    1.91  	  1.62    1.99  	
 32 | 2112		  3.04    3.09  	  3.55    3.72  	  2.35    2.50  	  2.36    2.86  	  0.66    0.68  	  2.09    2.18  	  1.12    1.19  	  1.41    1.57  	  1.48    1.78  	  2.54    2.95  	  1.59    1.80  	  1.36    1.90  	  1.61    1.99  	
 33 | 2176		  3.03    3.09  	  3.53    3.71  	  2.38    2.53  	  2.27    2.56  	  0.67    0.70  	  2.09    2.17  	  1.13    1.19  	  1.28    1.57  	  1.48    1.78  	  2.54    2.95  	  1.58    1.80  	  1.37    1.89  	  1.63    1.98  	
 34 | 2240		  3.02    3.06  	  3.54    3.66  	  2.31    2.59  	  2.34    2.70  	  0.65    0.68  	  2.09    2.17  	  1.12    1.19  	  1.40    1.56  	  1.47    1.77  	  2.54    2.94  	  1.58    1.80  	  1.35    1.89  	  1.62    1.98  	
 35 | 2304		  3.00    3.06  	  3.52    3.82  	  2.28    2.45  	  2.21    2.46  	  0.66    0.68  	  2.09    2.17  	  1.12    1.19  	  1.41    1.56  	  1.48    1.78  	  2.53    2.94  	  1.57    1.80  	  1.34    1.88  	  1.63    1.98  	
 36 | 2368		  3.03    3.08  	  3.50    3.62  	  2.27    2.43  	  2.34    2.61  	  0.65    0.68  	  2.09    2.17  	  1.12    1.19  	  1.35    1.56  	  1.47    1.77  	  2.54    2.94  	  1.56    1.79  	  1.36    1.87  	  1.62    1.98  	
 37 | 2432		  3.00    3.04  	  3.52    3.73  	  2.28    2.45  	  2.22    2.60  	  0.67    0.70  	  2.09    2.16  	  1.12    1.18  	  1.33    1.55  	  1.46    1.77  	  2.54    2.94  	  1.57    1.79  	  1.33    1.87  	  1.63    1.97  	
 38 | 2496		  2.99    3.03  	  3.47    3.66  	  2.29    2.65  	  2.23    2.44  	  0.65    0.68  	  2.08    2.16  	  1.12    1.18  	  1.40    1.55  	  1.48    1.76  	  2.53    2.93  	  1.57    1.79  	  1.33    1.86  	  1.61    1.97  	
 39 | 2560		  3.05    3.11  	  3.55    3.73  	  2.38    2.57  	  2.24    2.51  	  0.65    0.67  	  2.09    2.16  	  1.12    1.18  	  1.41    1.55  	  1.48    1.76  	  2.53    2.93  	  1.57    1.78  	  1.32    1.86  	  1.62    1.97  	
 40 | 2624		  3.06    3.12  	  3.56    3.74  	  2.36    2.65  	  2.25    2.54  	  0.66    0.67  	  2.08    2.16  	  1.12    1.18  	  1.40    1.54  	  1.45    1.76  	  2.53    2.93  	  1.56    1.78  	  1.35    1.85  	  1.63    1.96  	
 41 | 2688		  3.05    3.11  	  3.55    3.67  	  2.38    2.57  	  2.35    2.64  	  0.67    0.69  	  2.09    2.16  	  1.12    1.18  	  1.41    1.54  	  1.47    1.75  	  2.53    2.93  	  1.57    1.78  	  1.34    1.84  	  1.63    1.96  	
 42 | 2752		  3.02    3.08  	  3.55    3.71  	  2.32    2.49  	  2.26    2.51  	  0.68    0.69  	  2.09    2.16  	  1.12    1.18  	  1.41    1.54  	  1.48    1.75  	  2.53    2.92  	  1.56    1.78  	  1.34    1.84  	  1.63    1.96  	
 43 | 2816		  3.00    3.06  	  3.54    3.76  	  2.33    2.52  	  2.30    2.62  	  0.66    0.67  	  2.08    2.16  	  1.12    1.17  	  1.40    1.54  	  1.48    1.75  	  2.53    2.92  	  1.56    1.77  	  1.34    1.83  	  1.63    1.96  	
 44 | 2880		  3.00    3.06  	  3.55    3.67  	  2.30    2.45  	  2.32    2.80  	  0.68    0.71  	  2.08    2.15  	  1.12    1.17  	  1.29    1.53  	  1.47    1.75  	  2.53    2.92  	  1.57    1.77  	  1.35    1.83  	  1.63    1.95  	
 45 | 2944		  2.99    3.05  	  3.50    3.69  	  2.36    2.50  	  2.36    2.68  	  0.67    0.70  	  2.09    2.15  	  1.12    1.17  	  1.31    1.53  	  1.49    1.75  	  2.53    2.92  	  1.57    1.77  	  1.31    1.82  	  1.62    1.95  	
 46 | 3008		  2.99    3.04  	  3.51    3.69  	  2.31    2.45  	  2.21    2.51  	  0.67    0.71  	  2.09    2.15  	  1.12    1.17  	  1.32    1.53  	  1.50    1.75  	  2.53    2.91  	  1.56    1.77  	  1.31    1.82  	  1.63    1.95  	
 47 | 3072		  2.98    3.02  	  3.49    3.62  	  2.31    2.46  	  2.30    2.63  	  0.68    0.71  	  2.09    2.15  	  1.12    1.17  	  1.33    1.52  	  1.48    1.74  	  2.53    2.91  	  1.57    1.76  	  1.31    1.81  	  1.63    1.94  	
 48 | 3136		  3.04    3.10  	  3.56    3.68  	  2.33    2.47  	  2.25    2.51  	  0.67    0.70  	  2.08    2.15  	  1.12    1.17  	  1.29    1.52  	  1.49    1.74  	  2.53    2.91  	  1.56    1.76  	  1.33    1.81  	  1.62    1.94  	
 49 | 3200		  3.02    3.07  	  3.57    3.76  	  2.28    2.51  	  2.26    2.55  	  0.68    0.70  	  2.08    2.15  	  1.12    1.17  	  1.34    1.52  	  1.49    1.74  	  2.52    2.91  	  1.56    1.76  	  1.31    1.80  	  1.63    1.94  	
 50 | 3264		  3.08    3.15  	  3.52    3.61  	  2.25    2.35  	  2.23    2.40  	  0.68    0.70  	  2.09    2.15  	  1.12    1.17  	  1.30    1.52  	  1.47    1.60  	  2.53    2.90  	  1.56    1.71  	  1.31    1.68  	  1.60    1.76  	
 51 | 3328		  3.02    3.07  	  3.50    3.60  	  2.26    2.36  	  2.22    2.38  	  0.69    0.73  	  2.09    2.15  	  1.11    1.17  	  1.40    1.52  	  1.46    1.60  	  2.52    2.90  	  1.56    1.71  	  1.29    1.68  	  1.60    1.75  	
 52 | 3392		  3.01    3.06  	  3.50    3.58  	  2.25    2.37  	  2.27    2.47  	  0.68    0.70  	  2.08    2.15  	  1.12    1.17  	  1.40    1.52  	  1.47    1.61  	  2.53    2.89  	  1.54    1.71  	  1.28    1.67  	  1.61    1.76  	
 53 | 3456		  3.02    3.07  	  3.51    3.60  	  2.23    2.38  	  2.26    2.49  	  0.69    0.71  	  2.08    2.15  	  1.12    1.17  	  1.41    1.52  	  1.47    1.61  	  2.52    2.89  	  1.55    1.71  	  1.30    1.67  	  1.60    1.76  	
 54 | 3520		  3.00    3.11  	  3.54    3.68  	  2.25    2.38  	  2.19    2.39  	  0.68    0.71  	  2.08    2.15  	  1.11    1.16  	  1.33    1.52  	  1.46    1.61  	  2.53    2.89  	  1.56    1.71  	  1.29    1.67  	  1.61    1.76  	
 55 | 3584		  3.04    3.14  	  3.49    3.58  	  2.22    2.35  	  2.19    2.39  	  0.68    0.70  	  2.09    2.15  	  1.11    1.16  	  1.23    1.52  	  1.46    1.61  	  2.53    2.89  	  1.56    1.71  	  1.30    1.67  	  1.60    1.76  	
 56 | 3648		  2.99    3.03  	  3.50    3.59  	  2.26    2.42  	  2.20    2.44  	  0.67    0.70  	  2.08    2.14  	  1.11    1.16  	  1.29    1.52  	  1.47    1.61  	  2.53    2.89  	  1.56    1.71  	  1.29    1.66  	  1.60    1.76  	
 57 | 3712		  2.99    3.03  	  3.50    3.60  	  2.24    2.37  	  2.22    2.42  	  0.68    0.71  	  2.08    2.14  	  1.11    1.16  	  1.40    1.51  	  1.46    1.61  	  2.53    2.89  	  1.56    1.71  	  1.31    1.66  	  1.61    1.77  	
 58 | 3776		  3.02    3.07  	  3.54    3.64  	  2.27    2.39  	  2.20    2.45  	  0.70    0.73  	  2.09    2.14  	  1.11    1.16  	  1.31    1.51  	  1.47    1.61  	  2.53    2.89  	  1.56    1.71  	  1.30    1.66  	  1.60    1.77  	
 59 | 3840		  3.11    3.18  	  3.52    3.62  	  2.29    2.43  	  2.20    2.40  	  0.68    0.73  	  2.09    2.14  	  1.11    1.16  	  1.26    1.51  	  1.46    1.61  	  2.53    2.89  	  1.55    1.71  	  1.29    1.66  	  1.61    1.77  	
 60 | 3904		  3.09    3.18  	  3.50    3.59  	  2.30    2.45  	  2.21    2.42  	  0.69    0.72  	  2.09    2.14  	  1.11    1.16  	  1.40    1.51  	  1.48    1.61  	  2.53    2.89  	  1.55    1.71  	  1.29    1.66  	  1.61    1.77  	
 61 | 3968		  3.03    3.11  	  3.50    3.61  	  2.29    2.42  	  2.32    2.56  	  0.68    0.70  	  2.08    2.14  	  1.11    1.16  	  1.30    1.51  	  1.47    1.61  	  2.53    2.89  	  1.55    1.71  	  1.30    1.66  	  1.59    1.77  	
 62 | 4032		  3.04    3.11  	  3.48    3.61  	  2.28    2.40  	  2.23    2.43  	  0.70    0.73  	  2.09    2.14  	  1.11    1.16  	  1.33    1.51  	  1.47    1.61  	  2.53    2.89  	  1.55    1.71  	  1.27    1.66  	  1.61    1.77  	
 63 | 4096		  3.01    3.08  	  3.57    3.70  	  2.25    2.41  	  2.28    2.50  	  0.68    0.70  	  2.09    2.14  	  1.11    1.16  	  1.35    1.51  	  1.48    1.61  	  2.53    2.89  	  1.56    1.71  	  1.29    1.66  	  1.61    1.77  	
 64 | 4160		  3.07    3.19  	  3.49    3.58  	  2.25    2.44  	  2.22    2.40  	  0.70    0.72  	  2.08    2.14  	  1.11    1.16  	  1.27    1.51  	  1.47    1.61  	  2.53    2.89  	  1.55    1.71  	  1.29    1.65  	  1.60    1.77  	
 65 | 4224		  2.99    3.06  	  3.51    3.69  	  2.31    2.50  	  2.32    2.52  	  0.72    0.76  	  2.08    2.13  	  1.11    1.15  	  1.26    1.49  	  1.50    1.71  	  2.53    2.89  	  1.56    1.74  	  1.28    1.74  	  1.63    1.91  	
 66 | 4288		  2.98    3.03  	  3.52    3.65  	  2.32    2.48  	  2.28    2.52  	  0.73    0.77  	  2.08    2.13  	  1.11    1.15  	  1.23    1.49  	  1.50    1.70  	  2.53    2.89  	  1.56    1.74  	  1.31    1.74  	  1.63    1.90  	
 67 | 4352		  2.98    3.02  	  3.49    3.60  	  2.31    2.49  	  2.30    2.57  	  0.73    0.77  	  2.08    2.13  	  1.11    1.15  	  1.31    1.49  	  1.48    1.71  	  2.53    2.89  	  1.56    1.73  	  1.31    1.74  	  1.64    1.90  	
 68 | 4416		  3.01    3.06  	  3.54    3.67  	  2.28    2.47  	  2.20    2.51  	  0.71    0.75  	  2.08    2.13  	  1.11    1.15  	  1.24    1.49  	  1.49    1.70  	  2.53    2.89  	  1.55    1.73  	  1.30    1.74  	  1.64    1.90  	
 69 | 4480		  3.01    3.06  	  3.53    3.68  	  2.30    2.42  	  2.29    2.55  	  0.70    0.72  	  2.08    2.13  	  1.11    1.15  	  1.24    1.49  	  1.49    1.70  	  2.53    2.89  	  1.56    1.73  	  1.30    1.73  	  1.63    1.90  	
 70 | 4544		  3.01    3.07  	  3.51    3.63  	  2.25    2.41  	  2.23    2.50  	  0.73    0.78  	  2.08    2.13  	  1.11    1.15  	  1.31    1.49  	  1.50    1.70  	  2.53    2.89  	  1.55    1.73  	  1.31    1.73  	  1.63    1.90  	
 71 | 4608		  3.01    3.06  	  3.51    3.65  	  2.26    2.42  	  2.23    2.42  	  0.71    0.74  	  2.08    2.13  	  1.11    1.15  	  1.25    1.49  	  1.50    1.70  	  2.53    2.88  	  1.56    1.73  	  1.30    1.73  	  1.63    1.90  	
 72 | 4672		  3.00    3.05  	  3.50    3.60  	  2.27    2.47  	  2.21    2.42  	  0.71    0.74  	  2.08    2.13  	  1.11    1.15  	  1.27    1.49  	  1.48    1.70  	  2.53    2.88  	  1.56    1.73  	  1.31    1.73  	  1.64    1.89  	
 73 | 4736		  2.99    3.06  	  3.50    3.64  	  2.29    2.44  	  2.26    2.46  	  0.70    0.73  	  2.08    2.13  	  1.11    1.15  	  1.29    1.49  	  1.50    1.70  	  2.53    2.88  	  1.56    1.73  	  1.31    1.73  	  1.63    1.89  	
 74 | 4800		  3.01    3.06  	  3.51    3.62  	  2.25    2.40  	  2.24    2.50  	  0.71    0.73  	  2.08    2.13  	  1.11    1.15  	  1.38    1.49  	  1.50    1.69  	  2.53    2.88  	  1.56    1.73  	  1.32    1.72  	  1.64    1.89  	
 75 | 4864		  3.00    3.05  	  3.49    3.60  	  2.27    2.45  	  2.22    2.44  	  0.70    0.75  	  2.08    2.13  	  1.11    1.15  	  1.31    1.49  	  1.50    1.70  	  2.53    2.88  	  1.56    1.73  	  1.31    1.72  	  1.64    1.89  	
 76 | 4928		  3.00    3.06  	  3.48    3.65  	  2.23    2.42  	  2.22    2.40  	  0.70    0.73  	  2.09    2.13  	  1.11    1.15  	  1.30    1.49  	  1.51    1.69  	  2.53    2.88  	  1.56    1.73  	  1.32    1.72  	  1.63    1.89  	
 77 | 4992		  2.97    3.02  	  3.50    3.63  	  2.26    2.40  	  2.25    2.46  	  0.72    0.76  	  2.08    2.13  	  1.11    1.15  	  1.26    1.49  	  1.51    1.69  	  2.53    2.88  	  1.57    1.73  	  1.32    1.72  	  1.63    1.89  	
 78 | 5056		  3.01    3.09  	  3.52    3.63  	  2.30    2.48  	  2.23    2.44  	  0.73    0.77  	  2.08    2.13  	  1.11    1.15  	  1.36    1.49  	  1.51    1.69  	  2.53    2.88  	  1.55    1.73  	  1.30    1.72  	  1.64    1.89  	
 79 | 5120		  3.02    3.07  	  3.52    3.72  	  2.32    2.50  	  2.24    2.57  	  0.70    0.72  	  2.09    2.13  	  1.11    1.15  	  1.22    1.48  	  1.48    1.69  	  2.53    2.88  	  1.57    1.73  	  1.31    1.72  	  1.62    1.89  	
 80 | 5184		  3.00    3.05  	  3.52    3.70  	  2.29    2.45  	  2.26    2.51  	  0.71    0.73  	  2.09    2.13  	  1.11    1.15  	  1.30    1.48  	  1.50    1.69  	  2.53    2.88  	  1.56    1.73  	  1.31    1.72  	  1.64    1.89  	
 81 | 5248		  3.01    3.06  	  3.50    3.68  	  2.27    2.41  	  2.28    2.51  	  0.71    0.75  	  2.09    2.13  	  1.11    1.15  	  1.27    1.49  	  1.51    1.69  	  2.53    2.88  	  1.56    1.73  	  1.29    1.71  	  1.64    1.88  	
 82 | 5312		  2.99    3.04  	  3.49    3.60  	  2.24    2.35  	  2.26    2.48  	  0.72    0.76  	  2.08    2.13  	  1.11    1.15  	  1.22    1.48  	  1.48    1.69  	  2.53    2.88  	  1.57    1.72  	  1.31    1.71  	  1.64    1.88  	
 83 | 5376		  3.01    3.06  	  3.51    3.62  	  2.33    2.56  	  2.33    2.63  	  0.71    0.74  	  2.09    2.13  	  1.11    1.15  	  1.26    1.48  	  1.49    1.69  	  2.53    2.88  	  1.56    1.72  	  1.31    1.71  	  1.65    1.88  	
 84 | 5440		  2.98    3.02  	  3.50    3.66  	  2.29    2.50  	  2.26    2.47  	  0.72    0.75  	  2.09    2.13  	  1.11    1.15  	  1.20    1.48  	  1.50    1.69  	  2.53    2.88  	  1.56    1.72  	  1.31    1.71  	  1.65    1.88  	
 85 | 5504		  2.99    3.04  	  3.50    3.62  	  2.27    2.42  	  2.27    2.51  	  0.74    0.75  	  2.08    2.13  	  1.11    1.15  	  1.22    1.48  	  1.51    1.68  	  2.53    2.88  	  1.57    1.72  	  1.30    1.70  	  1.66    1.88  	
 86 | 5568		  2.98    3.03  	  3.49    3.64  	  2.28    2.48  	  2.24    2.45  	  0.73    0.75  	  2.08    2.13  	  1.11    1.15  	  1.23    1.48  	  1.50    1.68  	  2.54    2.87  	  1.56    1.72  	  1.29    1.70  	  1.66    1.88  	
 87 | 5632		  3.02    3.06  	  3.53    3.67  	  2.28    2.44  	  2.26    2.55  	  0.72    0.75  	  2.08    2.13  	  1.11    1.15  	  1.22    1.48  	  1.50    1.68  	  2.53    2.87  	  1.56    1.72  	  1.30    1.70  	  1.67    1.88  	
 88 | 5696		  3.02    3.09  	  3.53    3.70  	  2.24    2.36  	  2.22    2.43  	  0.71    0.77  	  2.08    2.13  	  1.11    1.15  	  1.23    1.48  	  1.49    1.68  	  2.53    2.87  	  1.56    1.72  	  1.30    1.70  	  1.65    1.88  	
 89 | 5760		  3.00    3.05  	  3.51    3.69  	  2.27    2.45  	  2.24    2.55  	  0.72    0.75  	  2.08    2.13  	  1.11    1.15  	  1.19    1.48  	  1.50    1.68  	  2.53    2.87  	  1.56    1.72  	  1.29    1.70  	  1.66    1.87  	
 90 | 5824		  3.00    3.05  	  3.50    3.65  	  2.27    2.42  	  2.26    2.55  	  0.72    0.75  	  2.09    2.13  	  1.11    1.18  	  1.32    1.48  	  1.49    1.61  	  2.53    2.87  	  1.56    1.70  	  1.29    1.63  	  1.63    1.78  	
 91 | 5888		  2.99    3.03  	  3.50    3.67  	  2.24    2.40  	  2.21    2.40  	  0.70    0.71  	  2.09    2.13  	  1.11    1.15  	  1.20    1.48  	  1.49    1.61  	  2.53    2.87  	  1.55    1.70  	  1.28    1.63  	  1.62    1.78  	
 92 | 5952		  3.01    3.06  	  3.48    3.57  	  2.24    2.38  	  2.21    2.39  	  0.72    0.73  	  2.09    2.13  	  1.11    1.15  	  1.20    1.48  	  1.49    1.61  	  2.53    2.87  	  1.56    1.70  	  1.29    1.63  	  1.64    1.78  	
 93 | 6016		  2.99    3.04  	  3.48    3.61  	  2.22    2.32  	  2.20    2.37  	  0.71    0.73  	  2.08    2.13  	  1.11    1.15  	  1.20    1.49  	  1.49    1.61  	  2.52    2.87  	  1.57    1.70  	  1.29    1.63  	  1.62    1.78  	
 94 | 6080		  2.98    3.02  	  3.48    3.56  	  2.25    2.41  	  2.21    2.39  	  0.72    0.73  	  2.09    2.13  	  1.11    1.15  	  1.23    1.48  	  1.49    1.61  	  2.53    2.86  	  1.57    1.70  	  1.28    1.63  	  1.62    1.78  	
 95 | 6144		  2.98    3.03  	  3.48    3.58  	  2.23    2.34  	  2.20    2.40  	  0.71    0.73  	  2.08    2.13  	  1.11    1.15  	  1.25    1.48  	  1.49    1.61  	  2.52    2.87  	  1.56    1.70  	  1.31    1.63  	  1.64    1.78  	
 96 | 6208		  2.99    3.04  	  3.46    3.54  	  2.24    2.35  	  2.19    2.36  	  0.71    0.73  	  2.08    2.13  	  1.11    1.15  	  1.25    1.48  	  1.49    1.61  	  2.53    2.86  	  1.56    1.70  	  1.32    1.63  	  1.63    1.78  	
 97 | 6272		  3.01    3.06  	  3.50    3.59  	  2.27    2.37  	  2.23    2.45  	  0.71    0.72  	  2.09    2.13  	  1.11    1.15  	  1.21    1.48  	  1.49    1.61  	  2.52    2.87  	  1.57    1.70  	  1.30    1.63  	  1.63    1.78  	
 98 | 6336		  3.01    3.07  	  3.49    3.59  	  2.27    2.37  	  2.22    2.41  	  0.73    0.74  	  2.09    2.13  	  1.11    1.15  	  1.22    1.48  	  1.50    1.61  	  2.53    2.86  	  1.57    1.70  	  1.30    1.63  	  1.62    1.78  	
 99 | 6400		  3.00    3.05  	  3.50    3.60  	  2.25    2.36  	  2.22    2.40  	  0.71    0.73  	  2.09    2.13  	  1.11    1.14  	  1.24    1.48  	  1.50    1.61  	  2.53    2.86  	  1.56    1.70  	  1.29    1.63  	  1.64    1.78  	
100 | 6464		  2.99    3.05  	  3.49    3.59  	  2.26    2.38  	  2.23    2.45  	  0.71    0.75  	  2.09    2.13  	  1.11    1.14  	  1.23    1.48  	  1.50    1.61  	  2.52    2.86  	  1.56    1.70  	  1.30    1.63  	  1.63    1.78  	
101 | 6528		  2.99    3.04  	  3.47    3.56  	  2.24    2.34  	  2.22    2.39  	  0.73    0.75  	  2.08    2.13  	  1.11    1.14  	  1.27    1.48  	  1.49    1.61  	  2.53    2.86  	  1.56    1.70  	  1.31    1.63  	  1.63    1.78  	
102 | 6592		  2.99    3.05  	  3.50    3.60  	  2.24    2.34  	  2.20    2.40  	  0.71    0.72  	  2.09    2.13  	  1.11    1.14  	  1.25    1.48  	  1.50    1.61  	  2.53    2.86  	  1.56    1.70  	  1.31    1.63  	  1.64    1.78  	
103 | 6656		  2.98    3.03  	  3.48    3.57  	  2.25    2.35  	  2.23    2.44  	  0.71    0.72  	  2.09    2.13  	  1.11    1.14  	  1.23    1.48  	  1.50    1.61  	  2.53    2.86  	  1.56    1.70  	  1.31    1.63  	  1.64    1.78  	
104 | 6720		  2.98    3.03  	  3.48    3.58  	  2.23    2.34  	  2.21    2.41  	  0.71    0.74  	  2.09    2.13  	  1.11    1.14  	  1.24    1.48  	  1.51    1.61  	  2.53    2.86  	  1.57    1.69  	  1.30    1.63  	  1.63    1.78  	
105 | 6784		  2.99    3.04  	  3.48    3.58  	  2.25    2.38  	  2.20    2.37  	  0.71    0.73  	  2.09    2.13  	  1.11    1.14  	  1.28    1.48  	  1.50    1.61  	  2.52    2.87  	  1.56    1.69  	  1.30    1.63  	  1.64    1.78  	
106 | 6848		  2.98    3.02  	  3.47    3.58  	  2.28    2.39  	  2.21    2.38  	  0.71    0.73  	  2.09    2.13  	  1.11    1.14  	  1.40    1.48  	  1.50    1.61  	  2.53    2.86  	  1.57    1.69  	  1.31    1.63  	  1.64    1.78  	
107 | 6912		  3.00    3.05  	  3.49    3.60  	  2.24    2.35  	  2.24    2.42  	  0.73    0.74  	  2.09    2.13  	  1.11    1.14  	  1.23    1.48  	  1.49    1.61  	  2.53    2.86  	  1.57    1.69  	  1.30    1.62  	  1.64    1.78  	
108 | 6976		  3.00    3.04  	  3.51    3.63  	  2.23    2.34  	  2.22    2.47  	  0.72    0.74  	  2.09    2.13  	  1.11    1.14  	  1.30    1.48  	  1.51    1.61  	  2.52    2.86  	  1.57    1.69  	  1.30    1.62  	  1.64    1.78  	
109 | 7040		  3.01    3.05  	  3.49    3.60  	  2.27    2.41  	  2.22    2.46  	  0.72    0.74  	  2.08    2.12  	  1.11    1.14  	  1.25    1.48  	  1.50    1.61  	  2.53    2.86  	  1.57    1.69  	  1.31    1.62  	  1.63    1.78  	
110 | 7104		  2.99    3.06  	  3.49    3.59  	  2.24    2.37  	  2.23    2.49  	  0.72    0.75  	  2.09    2.12  	  1.11    1.14  	  1.27    1.48  	  1.51    1.61  	  2.52    2.86  	  1.57    1.69  	  1.31    1.62  	  1.63    1.78  	
111 | 7168		  2.99    3.03  	  3.49    3.60  	  2.24    2.34  	  2.21    2.42  	  0.72    0.74  	  2.09    2.12  	  1.11    1.14  	  1.20    1.48  	  1.50    1.61  	  2.53    2.86  	  1.56    1.69  	  1.31    1.62  	  1.63    1.78  	
112 | 7232		  2.99    3.06  	  3.48    3.58  	  2.22    2.33  	  2.23    2.42  	  0.72    0.75  	  2.08    2.12  	  1.11    1.14  	  1.28    1.48  	  1.50    1.61  	  2.52    2.86  	  1.56    1.69  	  1.30    1.62  	  1.64    1.78  	
113 | 7296		  2.98    3.03  	  3.47    3.58  	  2.22    2.33  	  2.21    2.38  	  0.72    0.74  	  2.09    2.12  	  1.11    1.14  	  1.40    1.48  	  1.50    1.61  	  2.53    2.86  	  1.57    1.69  	  1.30    1.62  	  1.65    1.78  	
114 | 7360		  2.99    3.04  	  3.48    3.57  	  2.23    2.33  	  2.20    2.39  	  0.73    0.76  	  2.08    2.12  	  1.11    1.14  	  1.40    1.48  	  1.50    1.61  	  2.53    2.86  	  1.57    1.69  	  1.29    1.62  	  1.63    1.78  	
115 | 7424		  2.98    3.02  	  3.47    3.58  	  2.21    2.31  	  2.22    2.47  	  0.73    0.76  	  2.09    2.12  	  1.11    1.14  	  1.40    1.48  	  1.50    1.61  	  2.53    2.86  	  1.56    1.69  	  1.30    1.62  	  1.64    1.78  	
116 | 7488		  2.98    3.02  	  3.47    3.56  	  2.22    2.33  	  2.20    2.37  	  0.74    0.77  	  2.09    2.12  	  1.11    1.14  	  1.31    1.48  	  1.50    1.61  	  2.52    2.86  	  1.57    1.69  	  1.31    1.62  	  1.64    1.78  	
117 | 7552		  2.99    3.04  	  3.50    3.60  	  2.24    2.34  	  2.20    2.38  	  0.72    0.74  	  2.09    2.12  	  1.11    1.14  	  1.37    1.48  	  1.50    1.61  	  2.53    2.86  	  1.56    1.69  	  1.30    1.62  	  1.64    1.78  	
118 | 7616		  3.00    3.04  	  3.50    3.62  	  2.23    2.35  	  2.19    2.36  	  0.73    0.77  	  2.09    2.12  	  1.11    1.14  	  1.40    1.48  	  1.50    1.61  	  2.52    2.86  	  1.56    1.69  	  1.31    1.61  	  1.63    1.78  	
119 | 7680		  3.00    3.05  	  3.49    3.59  	  2.27    2.42  	  2.22    2.42  	  0.74    0.76  	  2.09    2.12  	  1.11    1.14  	  1.27    1.48  	  1.51    1.61  	  2.53    2.86  	  1.57    1.69  	  1.29    1.61  	  1.64    1.78  	
120 | 7744		  3.00    3.06  	  3.49    3.61  	  2.24    2.39  	  2.23    2.41  	  0.72    0.74  	  2.08    2.12  	  1.11    1.14  	  1.24    1.48  	  1.51    1.61  	  2.53    2.86  	  1.56    1.69  	  1.31    1.61  	  1.63    1.78  	
121 | 7808		  2.98    3.03  	  3.50    3.58  	  2.25    2.36  	  2.22    2.41  	  0.72    0.74  	  2.09    2.12  	  1.11    1.14  	  1.35    1.47  	  1.50    1.60  	  2.53    2.85  	  1.57    1.69  	  1.29    1.61  	  1.64    1.78  	
122 | 7872		  2.98    3.03  	  3.48    3.56  	  2.25    2.37  	  2.22    2.41  	  0.73    0.75  	  2.08    2.12  	  1.11    1.14  	  1.38    1.47  	  1.49    1.61  	  2.53    2.85  	  1.57    1.69  	  1.28    1.61  	  1.64    1.78  	
123 | 7936		  2.99    3.03  	  3.48    3.58  	  2.24    2.34  	  2.24    2.47  	  0.73    0.75  	  2.08    2.12  	  1.11    1.14  	  1.35    1.47  	  1.50    1.61  	  2.52    2.85  	  1.57    1.69  	  1.30    1.61  	  1.64    1.78  	
124 | 8000		  2.98    3.03  	  3.47    3.57  	  2.23    2.34  	  2.21    2.41  	  0.74    0.76  	  2.08    2.12  	  1.11    1.14  	  1.27    1.47  	  1.51    1.60  	  2.52    2.85  	  1.56    1.69  	  1.31    1.61  	  1.65    1.78  	
125 | 8064		  2.98    3.02  	


--------------------------------------------------------------------------------
/gnuplot/m2.data:
--------------------------------------------------------------------------------
 1 | # for each scheme, we give the best time/item and the average time/item in ns 
 2 | # Volume		std::shuffle-lehmer	batched_random::shuffle_2-lehmer	batched_random::shuffle_23456-lehmer	std::shuffle-mersenne	batched_random::shuffle_2-mersenne	batched_random::shuffle_23456-mersenne	shuffle_lehmer	naive_shuffle_lehmer_2	shuffle_lehmer_2	shuffle_lehmer_23456	shuffle_pcg	naive_shuffle_pcg_2	shuffle_pcg_2	shuffle_pcg_23456	shuffle_chacha	naive_shuffle_chacha_2	shuffle_chacha_2	shuffle_chacha_23456
 3 | 100		  1.79    1.72    1.40    1.20              2.54    2.02    1.84    1.35              7.95    5.03    4.70    2.47            
 4 | 163		  1.68    1.50    1.26    1.03              2.16    1.75    1.55    1.17              7.76    4.73    4.44    2.24            
 5 | 265		  1.56    1.44    1.09    0.90              2.08    1.59    1.49    1.04              7.69    4.55    4.29    2.08            
 6 | 432		  1.47    1.22    0.99    0.81              1.97    1.45    1.28    0.92              7.66    4.54    4.29    1.97            
 7 | 703		  1.40    1.11    0.90    0.73              1.93    1.32    1.18    0.83              7.60    4.32    4.23    1.93            
 8 | 1145		  1.34    1.03    0.84    0.65              1.86    1.24    1.11    0.76              7.72    4.33    4.07    2.00            
 9 | 1864		  1.30    0.96    0.80    0.60              1.80    1.22    1.06    0.70              7.58    4.19    4.04    1.98            
10 | 3035		  1.29    0.95    0.77    0.57              1.73    1.16    1.02    0.67              7.75    4.17    4.00    2.07            
11 | 4942		  1.28    0.90    0.75    0.57              1.70    1.09    1.00    0.67              7.55    4.32    3.98    2.08            
12 | 8047		  1.26    0.84    0.73    0.55              1.71    1.11    0.97    0.65              7.57    4.21    4.03    2.19            
13 | 13104		  1.23    0.83    0.73    0.54              1.68    1.07    1.02    0.65              7.72    4.13    3.98    2.30            
14 | 21337		  1.24    0.82    0.71    0.56              1.69    1.11    1.04    0.68              7.71    4.29    4.23    2.51            
15 | 34743		  1.24    0.84    0.76    0.59              1.69    1.08    0.95    0.70              7.71    4.27    4.11    2.63            
16 | 56573		  1.24    0.84    0.75    0.70              1.67    1.08    0.96    0.74              7.81    4.35    4.17    2.89            
17 | 92120		  1.22    0.87    0.82    0.73              1.68    1.04    0.99    0.80              7.90    4.63    4.31    2.95            
18 | 150000		  1.23    0.88    0.83    0.81              1.68    1.05    0.97    0.83              7.93    4.75    4.27    3.04            
19 | 


--------------------------------------------------------------------------------
/gnuplot/m2.datalehmer.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/m2.datalehmer.pdf


--------------------------------------------------------------------------------
/gnuplot/m2.datapcg64.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/m2.datapcg64.pdf


--------------------------------------------------------------------------------
/gnuplot/m2.dataratio.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/m2.dataratio.pdf


--------------------------------------------------------------------------------
/gnuplot/m2stream.data:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lemire/batched_random/75a907e8ce2ba5a107ddd628ff349b8fc326deaa/gnuplot/m2stream.data


--------------------------------------------------------------------------------
/gnuplot/plot.gnuplot:
--------------------------------------------------------------------------------
 1 | # gnuplot -e "filename='foo.data'" plot.gnuplot
 2 | 
 3 | set term pdf
 4 | set ylabel "time per item (ns)"
 5 | set xlabel "number of entries"
 6 | stats filename using 1 nooutput name 'X_'
 7 | set style fill border
 8 | 
 9 | # Access the min and max X-values
10 | xmin = X_min
11 | xmax = X_max
12 | set xrange [xmin:xmax]
13 | set yrange [0:]
14 | set key bottom center
15 | set logscale x 2
16 | set format x "2^{%L}"
17 | 
18 | 
19 | set out filename . "lehmer.pdf"
20 | 
21 | plot filename using 1:2 with lines lw 5 title 'shuffle' , \
22 | "" using 1:3 with lines lw 5 title 'naive shuffle\_2' , \
23 | "" using 1:4 with lines lw 5 title 'shuffle\_2' , \
24 | "" using 1:5 with lines lw 5 title 'shuffle\_6' 
25 | 
26 | set out filename . "pcg64.pdf"
27 | 
28 | plot filename using 1:6 with lines lw 5 title 'shuffle' , \
29 | "" using 1:7 with lines lw 5 title 'naive shuffle\_2' ,\
30 | "" using 1:8 with lines lw 5 title 'shuffle\_2' , \
31 | "" using 1:9 with lines lw 5 title 'shuffle\_6'
32 | 
33 | set out filename . "chacha.pdf"
34 | 
35 | plot filename using 1:10 with lines lw 5 title 'shuffle' , \
36 | "" using 1:11 with lines lw 5 title 'naive shuffle\_2' , \
37 | "" using 1:12 with lines lw 5 title 'shuffle\_2' , \
38 | "" using 1:13 with lines lw 5 title 'shuffle\_6' 
39 | 
40 | 
41 | set ylabel "speed ratio (shuffle\\\_6/shuffle)"
42 | set yrange [1:5]
43 | set out filename . "ratio.pdf"
44 | 
45 | plot filename using 1:($10/$13) with lines lw 5 title 'ChaCha', \
46 | "" using 1:($6/$9) with lines lw 5 title 'PCG64', \
47 | "" using 1:($2/$5) with lines lw 5 title 'Lehmer'


--------------------------------------------------------------------------------
/include/partial-shuffle-inl.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /**
 3 |  * This header contains C++ helper templates. They are not meant for end users.
 4 |  */
 5 | #ifndef PARTIAL_SHUFFLE_INL_H
 6 | #define PARTIAL_SHUFFLE_INL_H
 7 | 
 8 | #include <algorithm>
 9 | #include <cstdint>
10 | 
11 | namespace batched_random {
12 | 
13 | // Performs k steps of a Fisher-Yates shuffle on n elements, in the array
14 | // `storage`.
15 | //
16 | // Preconditions:
17 | //   n >= k >= 1
18 | //   bound >= n*(n-1)*...*(n-(k-1)), which must not overflow
19 | //   rng() produces uniformly random 64-bit values
20 | //
21 | // The return value is usable as `bound` for smaller batches of size k.
22 | template <class RandomIt, class URBG>
23 | inline uint64_t partial_shuffle_64b(RandomIt storage, uint64_t n, uint64_t k,
24 |                                     uint64_t bound, URBG &g) {
25 |   static_assert(std::is_same<typename URBG::result_type, uint64_t>::value, "result_type must be uint64_t");
26 |   __uint128_t x;
27 |   uint64_t r = g();
28 |   uint64_t indexes[7]; // We know that k <= 7
29 | 
30 |   for (uint64_t i = 0; i < k; i++) {
31 |     x = (__uint128_t)(n - i) * (__uint128_t)r;
32 |     r = (uint64_t)x;
33 |     indexes[i] = (uint64_t)(x >> 64);
34 |   }
35 | 
36 |   if (r < bound) {
37 |     bound = n;
38 |     for (uint64_t i = 1; i < k; i++) {
39 |       bound *= n - i;
40 |     }
41 |     uint64_t t = -bound % bound;
42 | 
43 |     while (r < t) {
44 |       r = g();
45 |       for (uint64_t i = 0; i < k; i++) {
46 |         x = (__uint128_t)(n - i) * (__uint128_t)r;
47 |         r = (uint64_t)x;
48 |         indexes[i] = (uint64_t)(x >> 64);
49 |       }
50 |     }
51 |   }
52 |   for (uint64_t i = 0; i < k; i++) {
53 |     std::iter_swap(storage + n - i - 1, storage + indexes[i]);
54 |   }
55 | 
56 |   return bound;
57 | }
58 | 
59 | } // namespace batched_random
60 | 
61 | #endif // TEMPLATE_SHUFFLE_H
62 | 


--------------------------------------------------------------------------------
/include/random_bounded.h:
--------------------------------------------------------------------------------
 1 | /***
 2 |  * This header contains function declarations for C array shuffling functions.
 3 |  * It can be called by C code.
 4 |  */
 5 | #ifndef BATCHED_RANDOM_H
 6 | #define BATCHED_RANDOM_H
 7 | #include <stdint.h>
 8 | 
 9 | // call this one before calling random_bounded and other shuffling functions.
10 | void seed(uint64_t s);
11 | 
12 | 
13 | // shuffle the storage array, you need to provide your own random number
14 | // generator (rng)
15 | void shuffle(uint64_t *storage, uint64_t size, uint64_t (*rng)(void));
16 | void shuffle_batch_2(uint64_t *storage, uint64_t size, uint64_t (*rng)(void));
17 | void shuffle_batch_23456(uint64_t *storage, uint64_t size,
18 |                          uint64_t (*rng)(void));
19 | void naive_shuffle_batch_2(uint64_t *storage, uint64_t size, uint64_t (*rng)(void));
20 | 
21 | // shuffle with lehmer rng
22 | void shuffle_lehmer(uint64_t *storage, uint64_t size);
23 | void shuffle_lehmer_2(uint64_t *storage, uint64_t size);
24 | void shuffle_lehmer_23456(uint64_t *storage, uint64_t size);
25 | void naive_shuffle_lehmer_2(uint64_t *storage, uint64_t size);
26 | 
27 | // shuffle with pcg64 rng
28 | void shuffle_pcg(uint64_t *storage, uint64_t size);
29 | void shuffle_pcg_2(uint64_t *storage, uint64_t size);
30 | void shuffle_pcg_23456(uint64_t *storage, uint64_t size);
31 | void naive_shuffle_pcg_2(uint64_t *storage, uint64_t size);
32 | 
33 | 
34 | // shuffle with chacha rng
35 | void shuffle_chacha(uint64_t *storage, uint64_t size);
36 | void shuffle_chacha_2(uint64_t *storage, uint64_t size);
37 | void shuffle_chacha_23456(uint64_t *storage, uint64_t size);
38 | void naive_shuffle_chacha_2(uint64_t *storage, uint64_t size);
39 | 
40 | 
41 | // returns a random number in the range [0, range)
42 | uint64_t random_bounded_lehmer(uint64_t range);
43 | 
44 | #endif // BATCHED_RANDOM_H
45 | 


--------------------------------------------------------------------------------
/include/template_shuffle.h:
--------------------------------------------------------------------------------
 1 | 
 2 | /**
 3 |  * This header contains C++ templates that shuffle the elements in the range [first,
 4 |  * last) using the random number generator g. They are meant to emulate
 5 |  * the standard std::shuffle function and can often act as drop-in replacement.
 6 |  */
 7 | #ifndef TEMPLATE_SHUFFLE_H
 8 | #define TEMPLATE_SHUFFLE_H
 9 |  
10 | #include "partial-shuffle-inl.h"
11 | 
12 | // This code is meant to look like the C++ standard library.
namespace batched_random {

// Shuffles the elements of [first, last) using the generator g.
//
// Same calling convention as std::shuffle, but the work is delegated to
// partial_shuffle_64b: one index per random word while more than 2^30
// elements remain, then two indexes per random word.
template <class RandomIt, class URBG>
extern void shuffle_2(RandomIt first, RandomIt last, URBG &&g) {
  uint64_t remaining = std::distance(first, last);

  // Too many elements for a batch of 2: one Fisher-Yates step at a time.
  while (remaining > ((uint64_t)1 << 30)) {
    partial_shuffle_64b(first, remaining, 1, remaining, g);
    remaining--;
  }

  // At most 2^30 elements remain, so n*(n-1) stays below 2^60.
  uint64_t bound = (uint64_t)1 << 60;
  while (remaining > 1) {
    bound = partial_shuffle_64b(first, remaining, 2, bound, g);
    remaining -= 2;
  }
}

// Shuffles the elements of [first, last) using the generator g.
//
// Same calling convention as std::shuffle. As the number of remaining
// elements shrinks, progressively larger batches (2, 3, 4, 5, then 6 indexes
// per random word) are requested from partial_shuffle_64b; each stage starts
// from a power-of-two bound that covers the largest product possible at that
// stage.
//
// Performance note: this function might be slow under GCC.
template <class RandomIt, class URBG>
extern void shuffle_23456(RandomIt first, RandomIt last, URBG &&g) {
  uint64_t remaining = std::distance(first, last);

  // More than 2^30 elements: single steps.
  while (remaining > ((uint64_t)1 << 30)) {
    partial_shuffle_64b(first, remaining, 1, remaining, g);
    remaining--;
  }

  // Up to 2^30 elements: batches of 2, product below 2^60.
  uint64_t bound = (uint64_t)1 << 60;
  while (remaining > ((uint64_t)1 << 19)) {
    bound = partial_shuffle_64b(first, remaining, 2, bound, g);
    remaining -= 2;
  }

  // Up to 2^19 elements: batches of 3, product at most 2^57.
  bound = (uint64_t)1 << 57;
  while (remaining > ((uint64_t)1 << 14)) {
    bound = partial_shuffle_64b(first, remaining, 3, bound, g);
    remaining -= 3;
  }

  // Up to 2^14 elements: batches of 4, product at most 2^56.
  bound = (uint64_t)1 << 56;
  while (remaining > ((uint64_t)1 << 11)) {
    bound = partial_shuffle_64b(first, remaining, 4, bound, g);
    remaining -= 4;
  }

  // Up to 2^11 elements: batches of 5, product at most 2^55.
  bound = (uint64_t)1 << 55;
  while (remaining > ((uint64_t)1 << 9)) {
    bound = partial_shuffle_64b(first, remaining, 5, bound, g);
    remaining -= 5;
  }

  // Up to 2^9 elements: batches of 6, product at most 2^54.
  bound = (uint64_t)1 << 54;
  while (remaining > 6) {
    bound = partial_shuffle_64b(first, remaining, 6, bound, g);
    remaining -= 6;
  }

  // Between 2 and 6 elements are left: finish with a single batch of
  // (remaining - 1) steps; 720 = 6! bounds every possible product here.
  if (remaining > 1) {
    partial_shuffle_64b(first, remaining, remaining - 1, 720, g);
  }
}

} // namespace batched_random
82 | 
83 | #endif // TEMPLATE_SHUFFLE_H
84 | 


--------------------------------------------------------------------------------
/src/batch_shuffle_dice.c:
--------------------------------------------------------------------------------
  1 | #include <stdint.h>
  2 | 
  3 | uint64_t random_bounded(uint64_t range, uint64_t (*rng)(void)) {
  4 |   __uint128_t random64bit, multiresult;
  5 |   uint64_t leftover;
  6 |   uint64_t threshold;
  7 |   random64bit = rng();
  8 |   multiresult = random64bit * range;
  9 |   leftover = (uint64_t)multiresult;
 10 |   if (leftover < range) {
 11 |     threshold = -range % range;
 12 |     while (leftover < threshold) {
 13 |       random64bit = rng();
 14 |       multiresult = random64bit * range;
 15 |       leftover = (uint64_t)multiresult;
 16 |     }
 17 |   }
 18 |   return (uint64_t)(multiresult >> 64); // [0, range)
 19 | }
 20 | 
 21 | // This is a naive batched shuffle. We generate a single random number r in n*(n-1)*...*(n-(k-1)).
 22 | // Then we get the random index as
 23 | // r % n -> pos1
 24 | // r = (r / n)
 25 | // r % (n-1) -> pos2
 26 | // r = (r / (n-1))
 27 | // ...
 28 | // r % (n-k+1) -> posk (can omit the modulo here)
 29 | 
 30 | inline void naive_partial_shuffle_64b(uint64_t *storage, uint64_t n, uint64_t k, uint64_t (*rng)(void)) {
 31 |   uint64_t pos1, pos2;
 32 |   uint64_t val1, val2;
 33 |   uint64_t bound = n;
 34 |   for (uint64_t i = 1; i < k; i++) {
 35 |     bound *= n - i;
 36 |   }
 37 |   // Next we generate a random integer in [0, bound)
 38 |   uint64_t r = random_bounded(bound, rng);
 39 |   for (uint64_t i = 0; i < k - 1; i++) {
 40 |     pos2 = r % (n - i);
 41 |     r /= (n - i);
 42 |     pos1 = n - i - 1;
 43 |     val1 = storage[pos1];
 44 |     val2 = storage[pos2];
 45 |     storage[pos1] = val2;
 46 |     storage[pos2] = val1;
 47 |   }
 48 |   // the last one does not need a modulo
 49 |   pos2 = r;
 50 |   pos1 = n - k;
 51 |   val1 = storage[pos1];
 52 |   val2 = storage[pos2];
 53 |   storage[pos1] = val2;
 54 |   storage[pos2] = val1;
 55 | }
 56 | 
 57 | 
 58 | // Performs k steps of a Fisher-Yates shuffle on n elements, in the array
 59 | // `storage`.
 60 | //
 61 | // Preconditions:
 62 | //   n >= k >= 1
 63 | //   bound >= n*(n-1)*...*(n-(k-1)), which must not overflow
 64 | //   rng() produces uniformly random 64-bit values
 65 | //
 66 | // The return value is usable as `bound` for smaller batches of size k.
 67 | static inline uint64_t partial_shuffle_64b(uint64_t *storage, uint64_t n, uint64_t k,
 68 |                                     uint64_t bound, uint64_t (*rng)(void)) {
 69 |   __uint128_t x;
 70 |   uint64_t r = rng();
 71 |   uint64_t pos1, pos2;
 72 |   uint64_t val1, val2;
 73 |   uint64_t indexes[7]; // We know that k <= 7
 74 | 
 75 |   for (uint64_t i = 0; i < k; i++) {
 76 |     x = (__uint128_t)(n - i) * (__uint128_t)r;
 77 |     r = (uint64_t)x;
 78 |     indexes[i] = (uint64_t)(x >> 64);
 79 |   }
 80 | 
 81 |   if (r < bound) {
 82 |     bound = n;
 83 |     for (uint64_t i = 1; i < k; i++) {
 84 |       bound *= n - i;
 85 |     }
 86 |     uint64_t t = -bound % bound;
 87 | 
 88 |     while (r < t) {
 89 |       r = rng();
 90 |       for (uint64_t i = 0; i < k; i++) {
 91 |         x = (__uint128_t)(n - i) * (__uint128_t)r;
 92 |         r = (uint64_t)x;
 93 |         indexes[i] = (uint64_t)(x >> 64);
 94 |       }
 95 |     }
 96 |   }
 97 |   for (uint64_t i = 0; i < k; i++) {
 98 |     pos1 = n - i - 1;
 99 |     pos2 = indexes[i];
100 |     val1 = storage[pos1]; // should be in cache
101 |     val2 = storage[pos2]; // might not be in cache
102 |     storage[pos1] = val2;
103 |     storage[pos2] = val1; // will be read later
104 |   }
105 |   return bound;
106 | }
107 | 
108 | // Rolls a batch of fair dice with sizes n, n-1, ..., n-(k-1)
109 | //
110 | // Preconditions:
111 | //   n >= k
112 | //   bound >= n*(n-1)*...*(n-(k-1)), which must not overflow
113 | //   rng() produces uniformly random 64-bit values
114 | //   result has length at least k
115 | //
116 | // The dice rolls are put in the `result` array:
117 | //   result[i] is an (n-i) sided die roll
118 | //
119 | // The return value is usable as `bound` for smaller batches of size k.
120 | inline uint64_t partial_shuffle_dice_64b(uint64_t n, uint64_t k, uint64_t bound,
121 |                                          uint64_t (*rng)(void),
122 |                                          uint64_t *result) {
123 |   __uint128_t x;
124 |   uint64_t r = rng();
125 | 
126 |   for (uint64_t i = 0; i < k; i++) {
127 |     x = (__uint128_t)(n - i) * (__uint128_t)r;
128 |     r = (uint64_t)x;
129 |     result[i] = (uint64_t)(x >> 64);
130 |   }
131 | 
132 |   if (r < bound) {
133 |     bound = n;
134 |     for (uint64_t i = 1; i < k; i++) {
135 |       bound *= n - i;
136 |     }
137 |     uint64_t t = -bound % bound;
138 |     while (r < t) {
139 |       r = rng();
140 |       for (uint64_t i = 0; i < k; i++) {
141 |         x = (__uint128_t)(n - i) * (__uint128_t)r;
142 |         r = (uint64_t)x;
143 |         result[i] = (uint64_t)(x >> 64);
144 |       }
145 |     }
146 |   }
147 | 
148 |   return bound;
149 | }
150 | 
151 | // Rolls fair dice with sizes n, n-1, ..., n - (4*k - 1)
152 | // in four interleaved batches. The first die in batch j
153 | // has size n-j, and each subsequent die is smaller by 4
154 | //
155 | // Preconditions:
156 | //   n >= 4*k
157 | //   bound >= n*(n-4)*...*(n - 4*(k-1)), which must not overflow
158 | //   rng() produces uniformly random 64-bit values
159 | //   result has length at least 4*k
160 | //
161 | // The dice rolls are put in the `result` array:
162 | //   result[i] is an (n-i) sided die roll
163 | //
164 | // The return value is usable as `bound` with the same k and smaller n
165 | inline uint64_t partial_shuffle_dice_64b_interleaved_4x(uint64_t n, uint64_t k,
166 |                                                         uint64_t bound,
167 |                                                         uint64_t (*rng)(void),
168 |                                                         uint64_t *result) {
169 |   __uint128_t x;
170 |   uint64_t r[4];
171 | 
172 |   for (int j = 0; j < 4; j++) {
173 |     r[j] = rng();
174 |   }
175 | 
176 |   for (uint64_t i = 0; i < k; i++) {
177 |     for (uint64_t j = 0; j < 4; j++) {
178 |       x = (__uint128_t)(n - 4 * i - j) * (__uint128_t)r[j];
179 |       r[j] = (uint64_t)x;
180 |       result[4 * i + j] = (uint64_t)(x >> 64);
181 |     }
182 |   }
183 | 
184 |   for (uint64_t j = 0; j < 4; j++) {
185 |     if (r[j] < bound) {
186 |       uint64_t m = n - j;
187 |       bound = m;
188 |       for (uint64_t i = 1; i < k; i++) {
189 |         bound *= m - 4 * i;
190 |       }
191 |       uint64_t t = -bound % bound;
192 |       while (r[j] < t) {
193 |         r[j] = rng();
194 |         for (uint64_t i = 0; i < k; i++) {
195 |           x = (__uint128_t)(m - 4 * i) * (__uint128_t)r[j];
196 |           r[j] = (uint64_t)x;
197 |           result[4 * i + j] = (uint64_t)(x >> 64);
198 |         }
199 |       }
200 |     }
201 |   }
202 | 
203 |   return bound;
204 | }
205 | 
206 | // Rolls a batch of fair dice with sizes 2, 3, ..., 17
207 | //
208 | // Preconditions:
209 | //   rng() produces uniformly random 64-bit values
210 | //   result has length at least 16
211 | //
212 | // The dice rolls are put in the `result` array:
213 | //   result[i] is an (i+2) sided die roll
214 | inline void shuffle_17_dice_16b_interleaved(uint64_t (*rng)(void),
215 |                                             uint16_t *result) {
216 |   uint16_t r[4];
217 |   uint16_t m[4] = {(1 << 10) - 1, (1 << 8) - 1, (1 << 12) - 1, (1 << 12) - 1};
218 | 
219 |   do {
220 |     uint64_t bits = rng();
221 |     for (int i = 0; i < 4; i++) {
222 |       r[i] = (uint16_t)(bits >> (16 * i));
223 |     }
224 |   } while (((r[0] & m[0]) == 0) || ((r[1] & m[1]) == 0) ||
225 |            ((r[2] & m[2]) == 0) || ((r[3] & m[3]) == 0));
226 | 
227 |   // Each column of n is a batch.
228 |   uint16_t n[4][4] = {
229 |       {2, 5, 7, 12}, {3, 6, 8, 13}, {4, 16, 9, 14}, {11, 17, 10, 15}};
230 |   uint32_t x[4];
231 | 
232 |   for (int i = 0; i < 4; i++) {
233 |     for (int j = 0; j < 4; j++) {
234 |       x[j] = (uint32_t)n[i][j] * (uint32_t)r[j];
235 |     }
236 |     // These are separate loops so the above multiplication
237 |     // can take advantage of instruction-level parallelism.
238 |     for (int j = 0; j < 4; j++) {
239 |       result[n[i][j] - 2] = (uint16_t)(x[j] >> 16);
240 |       r[j] = (uint16_t)x[j];
241 |     }
242 |   }
243 | }
244 | 
245 | // Rolls a batch of fair dice with sizes 2, 3, ..., 17
246 | //
247 | // Preconditions:
248 | //   rng() produces uniformly random 64-bit values
249 | //   result has length at least 16
250 | //
251 | // The dice rolls are put in the `result` array:
252 | //   result[i] is an (i+2) sided die roll
253 | inline void shuffle_17_dice_16b_linear(uint64_t (*rng)(void),
254 |                                        uint16_t *result) {
255 |   uint16_t r[4];
256 |   uint16_t m[4] = {(1 << 10) - 1, (1 << 8) - 1, (1 << 12) - 1, (1 << 12) - 1};
257 | 
258 |   do {
259 |     uint64_t bits = rng();
260 |     for (int i = 0; i < 4; i++) {
261 |       r[i] = (uint16_t)(bits >> (16 * i));
262 |     }
263 |   } while (((r[0] & m[0]) == 0) || ((r[1] & m[1]) == 0) ||
264 |            ((r[2] & m[2]) == 0) || ((r[3] & m[3]) == 0));
265 | 
266 |   uint16_t p[16] = {r[0], r[0], r[0], r[1], r[1], r[2], r[2], r[2],
267 |                     r[2], r[0], r[3], r[3], r[3], r[3], r[1], r[1]};
268 |   uint16_t d[16] = {1,   2,  6, 1,  5,   1,    7,  56,
269 |                     504, 24, 1, 12, 156, 2184, 30, 480};
270 | 
271 |   for (int i = 0; i < 16; i++) {
272 |     p[i] *= d[i];
273 |   }
274 | 
275 |   uint16_t n[16] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
276 | 
277 |   for (int i = 0; i < 16; i++) {
278 |     uint32_t x = (uint32_t)p[i] * (uint32_t)n[i];
279 |     result[i] = (uint16_t)(x >> 16);
280 |   }
281 | }
282 | 


--------------------------------------------------------------------------------
/src/chacha.c:
--------------------------------------------------------------------------------
  1 | // copyright: https://github.com/nixberg/chacha-rng-c (MIT License)
  2 | // with some modifications by D. Lemire
  3 | #include <stddef.h>
  4 | #include <assert.h>
  5 | #include <stdlib.h>
  6 | 
  7 | #include "chacha.h"
  8 | 
  9 | static void chacha_init(ChaCha *rng, size_t rounds, const uint32_t seed[8], uint64_t stream) {
 10 |     rng->state[ 0] = 0x61707865;
 11 |     rng->state[ 1] = 0x3320646e;
 12 |     rng->state[ 2] = 0x79622d32;
 13 |     rng->state[ 3] = 0x6b206574;
 14 | 
 15 |     rng->state[ 4] = seed[0];
 16 |     rng->state[ 5] = seed[1];
 17 |     rng->state[ 6] = seed[2];
 18 |     rng->state[ 7] = seed[3];
 19 |     rng->state[ 8] = seed[4];
 20 |     rng->state[ 9] = seed[5];
 21 |     rng->state[10] = seed[6];
 22 |     rng->state[11] = seed[7];
 23 | 
 24 |     rng->state[12] = 0;
 25 |     rng->state[13] = 0;
 26 |     rng->state[14] = (uint32_t)stream;
 27 |     rng->state[15] = (uint32_t)(stream >> 32);
 28 | 
 29 |     rng->rounds = rounds;
 30 | 
 31 |     rng->word_index = 16;
 32 | }
 33 | 
 34 | void chacha8_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream) {
 35 |     chacha_init(rng, 8, seed, stream);
 36 | }
 37 | 
 38 | void chacha8_zero(ChaCha *rng, uint64_t stream) {
 39 |     uint32_t seed[8] = { 0 };
 40 |     chacha_init(rng, 8, seed, stream);
 41 | }
 42 | 
 43 | void chacha12_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream) {
 44 |     chacha_init(rng, 12, seed, stream);
 45 | }
 46 | 
 47 | void chacha12_zero(ChaCha *rng, uint64_t stream) {
 48 |     uint32_t seed[8] = { 0 };
 49 |     chacha_init(rng, 12, seed, stream);
 50 | }
 51 | 
 52 | void chacha20_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream) {
 53 |     chacha_init(rng, 20, seed, stream);
 54 | }
 55 | 
 56 | void chacha20_zero(ChaCha *rng, uint64_t stream) {
 57 |     uint32_t seed[8] = { 0 };
 58 |     chacha_init(rng, 20, seed, stream);
 59 | }
 60 | 
 61 | uint8_t chacha_u8(ChaCha *rng) {
 62 |     return (uint8_t)chacha_u32(rng);
 63 | }
 64 | 
 65 | void chacha_fill_u8(ChaCha *rng, uint8_t *array, size_t count) {
 66 |     size_t tail_count = count % 4;
 67 | 
 68 |     for (size_t i = 0; i < (count - tail_count); i += 4) {
 69 |         uint32_t word = chacha_u32(rng);
 70 |         array[i + 0] = (uint8_t)word;
 71 |         array[i + 1] = (uint8_t)(word >> 8);
 72 |         array[i + 2] = (uint8_t)(word >> 16);
 73 |         array[i + 3] = (uint8_t)(word >> 24);
 74 |     }
 75 | 
 76 |     if (tail_count > 0) {
 77 |         uint32_t word = chacha_u32(rng);
 78 |         for (size_t i = tail_count; i > 0; i--) {
 79 |             array[count - i] = (uint8_t)word;
 80 |             word >>= 8;
 81 |         }
 82 |     }
 83 | }
 84 | 
 85 | uint16_t chacha_u16(ChaCha *rng) {
 86 |     return (uint16_t)chacha_u32(rng);
 87 | }
 88 | 
 89 | void chacha_fill_u16(ChaCha *rng, uint16_t *array, size_t count) {
 90 |     size_t tail_count = count % 2;
 91 | 
 92 |     for (size_t i = 0; i < (count - tail_count); i += 2) {
 93 |         uint32_t word = chacha_u32(rng);
 94 |         array[i + 0] = (uint16_t)word;
 95 |         array[i + 1] = (uint16_t)word >> 16;
 96 |     }
 97 | 
 98 |     if (tail_count > 0) {
 99 |         uint32_t word = chacha_u32(rng);
100 |         array[count - 1] = (uint16_t)word;
101 |     }
102 | }
103 | 
104 | static void double_round(uint32_t state[16]);
105 | 
106 | static inline void increment_counter(ChaCha *rng) {
107 |     rng->state[12]++;
108 |     if (rng->state[12] == 0) {
109 |         rng->state[13]++;
110 |         if (rng->state[13] == 0) {
111 |             exit(EXIT_FAILURE);
112 |         }
113 |     }
114 | }
115 | 
116 | uint32_t chacha_u32(ChaCha *rng) {
117 |     assert(rng->word_index <= 16);
118 | 
119 |     if (rng->word_index == 16) {
120 |         for (size_t i = 0; i < 16; i++) {
121 |             rng->working_state[i] = rng->state[i];
122 |         }
123 | 
124 |         for (size_t i = 0; i < rng->rounds; i += 2) {
125 |             double_round(rng->working_state);
126 |         }
127 | 
128 |         for (size_t i = 0; i < 16; i++) {
129 |             rng->working_state[i] += rng->state[i];
130 |         }
131 | 
132 |         increment_counter(rng);
133 |         rng->word_index = 0;
134 |     }
135 | 
136 |     uint32_t result = rng->working_state[rng->word_index];
137 | 
138 |     rng->word_index++;
139 | 
140 |     return result;
141 | }
142 | 
143 | void chacha_fill_u32(ChaCha *rng, uint32_t *array, size_t count) {
144 |     for (size_t i = 0; i < count; i++) {
145 |         array[i] = chacha_u32(rng);
146 |     }
147 | }
148 | 
149 | uint64_t chacha_u64(ChaCha *rng) {
150 |     uint64_t lo = chacha_u32(rng);
151 |     uint64_t hi = chacha_u32(rng);
152 |     return (hi << 32) | lo;
153 | }
154 | 
155 | uint64_t chacha_u64_global() {
156 |     return chacha_u64(&chacha_rng);
157 | }
158 | 
159 | void chacha_fill_u64(ChaCha *rng, uint64_t *array, size_t count) {
160 |     for (size_t i = 0; i < count; i++) {
161 |         array[i] = chacha_u64(rng);
162 |     }
163 | }
164 | 
165 | float chacha_f32(ChaCha *rng) {
166 |     return (float)(chacha_u32(rng) >> 8) * 0x1p-24f;
167 | }
168 | 
169 | void chacha_fill_f32(ChaCha *rng, float *array, size_t count) {
170 |     for (size_t i = 0; i < count; i++) {
171 |         array[i] = chacha_f32(rng);
172 |     }
173 | }
174 | 
175 | double chacha_f64(ChaCha *rng) {
176 |     return (float)(chacha_u64(rng) >> 11) * 0x1p-53;
177 | }
178 | 
179 | void chacha_fill_f64(ChaCha *rng, double *array, size_t count) {
180 |     for (size_t i = 0; i < count; i++) {
181 |         array[i] = chacha_f64(rng);
182 |     }
183 | }
184 | 
// Rotates `value` left by `count` bits. `count` must be in 1..31 (the
// callers in this file use 16, 12, 8 and 7).
static inline uint32_t rotated_left(uint32_t value, uint32_t count) {
    const uint32_t shifted_out = value >> (32 - count); // bits that wrap around
    const uint32_t shifted_up = value << count;
    return shifted_up | shifted_out;
}
188 | 
// One quarter round on the four state words at indices a, b, c, d:
// four add / xor / rotate-left steps with rotation amounts 16, 12, 8, 7.
// Expands to plain statements that read and write the array named `state`
// in the enclosing scope, so it is only usable inside double_round.
#define QUARTER_ROUND(a, b, c, d) \
    state[a] += state[b]; state[d] = rotated_left(state[d] ^ state[a], 16); \
    state[c] += state[d]; state[b] = rotated_left(state[b] ^ state[c], 12); \
    state[a] += state[b]; state[d] = rotated_left(state[d] ^ state[a],  8); \
    state[c] += state[d]; state[b] = rotated_left(state[b] ^ state[c],  7);

// Applies two rounds to the 16-word block, in place. Viewing `state` as a
// row-major 4x4 matrix, the first four quarter rounds work on its columns
// and the last four on its diagonals.
static inline void double_round(uint32_t state[16]) {
    // Column round.
    QUARTER_ROUND(0, 4,  8, 12)
    QUARTER_ROUND(1, 5,  9, 13)
    QUARTER_ROUND(2, 6, 10, 14)
    QUARTER_ROUND(3, 7, 11, 15)

    // Diagonal round.
    QUARTER_ROUND(0, 5, 10, 15)
    QUARTER_ROUND(1, 6, 11, 12)
    QUARTER_ROUND(2, 7,  8, 13)
    QUARTER_ROUND(3, 4,  9, 14)
}
206 | 


--------------------------------------------------------------------------------
/src/chacha.h:
--------------------------------------------------------------------------------
 1 | // copyright: https://github.com/nixberg/chacha-rng-c (MIT License)
 2 | // with some modifications by D. Lemire
 3 | #ifndef chacha_h
 4 | #define chacha_h
 5 | 
 6 | #include <stdint.h>
 7 | 
// State of one ChaCha-based random number generator.
typedef struct {
    // Input block: words 0-3 constants, 4-11 seed, 12-13 block counter,
    // 14-15 stream identifier.
    uint32_t state[16];
    // Most recently generated output block, handed out word by word.
    uint32_t working_state[16];
    // Number of rounds per block (8, 12 or 20 via the public initializers).
    size_t rounds;
    // Index of the next unread word in working_state; 16 means the block is
    // exhausted and a new one must be generated.
    size_t word_index;
} ChaCha;
// Global generator instance used by chacha_u64_global().
// NOTE(review): this is a variable definition in a header, so every
// translation unit that includes chacha.h defines its own `chacha_rng` —
// confirm the header is only ever included from a single translation unit.
// NOTE(review): size_t is used above but only <stdint.h> is included here.
ChaCha chacha_rng;
15 | 
16 | void chacha8_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream);
17 | 
18 | void chacha8_zero(ChaCha *rng, uint64_t stream);
19 | 
20 | void chacha12_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream);
21 | 
22 | void chacha12_zero(ChaCha *rng, uint64_t stream);
23 | 
24 | void chacha20_init(ChaCha *rng, const uint32_t seed[8], uint64_t stream);
25 | 
26 | void chacha20_zero(ChaCha *rng, uint64_t stream);
27 | 
28 | uint8_t chacha_u8(ChaCha *rng);
29 | 
30 | uint16_t chacha_u16(ChaCha *rng);
31 | 
32 | uint32_t chacha_u32(ChaCha *rng);
33 | 
34 | uint64_t chacha_u64(ChaCha *rng);
35 | 
36 | float chacha_f32(ChaCha *rng);
37 | 
38 | double chacha_f64(ChaCha *rng);
39 | 
40 | void chacha_fill_u8(ChaCha *rng, uint8_t *array, size_t count);
41 | 
42 | void chacha_fill_u16(ChaCha *rng, uint16_t *array, size_t count);
43 | 
44 | void chacha_fill_u32(ChaCha *rng, uint32_t *array, size_t count);
45 | 
46 | void chacha_fill_u64(ChaCha *rng, uint64_t *array, size_t count);
47 | 
48 | void chacha_fill_f32(ChaCha *rng, float *array, size_t count);
49 | 
50 | void chacha_fill_f64(ChaCha *rng, double *array, size_t count);
51 | 
52 | #endif /* chacha_h */
53 | 


--------------------------------------------------------------------------------
/src/lehmer64.h:
--------------------------------------------------------------------------------
 1 | #ifndef LEHMER64_H
 2 | #define LEHMER64_H
 3 | #include <stdint.h>
 4 | 
 5 | #include "splitmix64.h"
 6 | 
 7 | __uint128_t g_lehmer64_state = UINT64_C(0x853c49e6748fea9b);
 8 | 
 9 | /**
10 |  * D. H. Lehmer, Mathematical methods in large-scale computing units.
11 |  * Proceedings of a Second Symposium on Large Scale Digital Calculating
12 |  * Machinery;
13 |  * Annals of the Computation Laboratory, Harvard Univ. 26 (1951), pp. 141-146.
14 |  */
15 | 
16 | static inline void lehmer64_seed(uint64_t seed) {
17 |   g_lehmer64_state = (((__uint128_t)splitmix64_stateless(seed)) << 64) +
18 |                      splitmix64_stateless(seed + 1);
19 | }
20 | 
21 | static inline uint64_t lehmer64() {
22 |   g_lehmer64_state *= UINT64_C(0xda942042e4dd58b5);
23 |   return (uint64_t)(g_lehmer64_state >> 64);
24 | }
25 | 
26 | #endif
27 | 


--------------------------------------------------------------------------------
/src/pcg64.h:
--------------------------------------------------------------------------------
 1 | #ifndef PCG64_H
 2 | #define PCG64_H
 3 | 
 4 | /* Modified by D. Lemire based on original code by M. O'Neill, August 2017 */
 5 | #include "splitmix64.h" // we are going to leverage splitmix64 to generate the seed
 6 | #include <stdint.h>
 7 | 
// 128-bit unsigned integer used for the generator state.
typedef __uint128_t pcg128_t;
// Builds a 128-bit value from two 64-bit halves.
// NOTE(review): the arguments are not parenthesized, so pass simple
// expressions (literals, calls) only.
#define PCG_128BIT_CONSTANT(high, low) ((((pcg128_t)high) << 64) + low)
// Multiplier of the 128-bit linear congruential step.
#define PCG_DEFAULT_MULTIPLIER_128                                             \
  PCG_128BIT_CONSTANT(2549297995355413924ULL, 4865540595714422341ULL)
// Default increment; not referenced elsewhere in this header.
#define PCG_DEFAULT_INCREMENT_128                                              \
  PCG_128BIT_CONSTANT(6364136223846793005ULL, 1442695040888963407ULL)

// Generator state: the current LCG value and the per-sequence increment
// added at every step.
struct pcg_state_setseq_128 {
  pcg128_t state;
  pcg128_t inc;
};

typedef struct pcg_state_setseq_128 pcg64_random_t;
21 | 
22 | inline void pcg_setseq_128_step_r(struct pcg_state_setseq_128 *rng) {
23 |   rng->state = rng->state * PCG_DEFAULT_MULTIPLIER_128 + rng->inc;
24 | }
25 | 
26 | inline void pcg_setseq_128_srandom_r(struct pcg_state_setseq_128 *rng,
27 |                                      pcg128_t initstate, pcg128_t initseq) {
28 |   rng->state = 0U;
29 |   rng->inc = (initseq << 1u) | 1u;
30 |   pcg_setseq_128_step_r(rng);
31 |   rng->state += initstate;
32 |   pcg_setseq_128_step_r(rng);
33 | }
34 | 
// verbatim from O'Neill's except that we skip her assembly:
// Rotates `value` right by `rot` bits (rot in 0..63). Masking the left-shift
// amount with 63 keeps rot == 0 well defined.
//
// `static inline`: a plain `inline` definition in a C header provides no
// external definition, so a call that is not inlined would fail to link.
static inline uint64_t pcg_rotr_64(uint64_t value, unsigned int rot) {
  return (value >> rot) | (value << ((-rot) & 63));
}
39 | 
40 | inline uint64_t pcg_output_xsl_rr_128_64(pcg128_t state) {
41 |   return pcg_rotr_64(((uint64_t)(state >> 64u)) ^ (uint64_t)state,
42 |                      (unsigned int)(state >> 122u));
43 | }
44 | 
45 | inline uint64_t
46 | pcg_setseq_128_xsl_rr_64_random_r(struct pcg_state_setseq_128 *rng) {
47 |   pcg_setseq_128_step_r(rng);
48 |   return pcg_output_xsl_rr_128_64(rng->state);
49 | }
50 | 
// We use a single global generator state, seeded by pcg64_seed() and read by
// pcg64().
// NOTE(review): this is a variable definition in a header — confirm the
// header is only included from a single translation unit.
pcg64_random_t pcg64_global; // global state
53 | 
54 | // call this once before calling pcg64_random_r
55 | inline void pcg64_seed(uint64_t seed) {
56 |   pcg128_t initstate =
57 |       PCG_128BIT_CONSTANT(splitmix64_stateless_offset(seed, 0),
58 |                           splitmix64_stateless_offset(seed, 1));
59 |   // we pick a sequence at random
60 |   pcg128_t initseq = PCG_128BIT_CONSTANT(splitmix64_stateless_offset(seed, 2),
61 |                                          splitmix64_stateless_offset(seed, 3));
62 |   initseq |= 1; // should not be necessary, but let us be careful.
63 | 
64 |   pcg_setseq_128_srandom_r(&pcg64_global, initstate, initseq);
65 | }
66 | 
67 | #define pcg64_random_r pcg_setseq_128_xsl_rr_64_random_r
68 | 
69 | static inline uint64_t pcg64(void) { return pcg64_random_r(&pcg64_global); }
70 | 
71 | #endif
72 | 


--------------------------------------------------------------------------------
/src/random_bounded.c:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <stdint.h>
  3 | #include <stdlib.h>
  4 | 
  5 | #include "chacha.c"
  6 | #include "batch_shuffle_dice.c"
  7 | #include "lehmer64.h"
  8 | #include "pcg64.h"
  9 | 
 10 | void seed(uint64_t s) {
 11 |   lehmer64_seed(s);
 12 |   pcg64_seed(s);
 13 |   chacha8_zero(&chacha_rng, s); 
 14 | }
 15 | 
 16 | 
 17 | 
 18 | // Fisher-Yates shuffle, rolling one die at a time
 19 | void shuffle(uint64_t *storage, uint64_t size, uint64_t (*rng)(void)) {
 20 |   uint64_t i;
 21 |   for (i = size; i > 1; i--) {
 22 |     uint64_t nextpos = random_bounded(i, rng);
 23 |     uint64_t tmp = storage[i - 1];   // likely in cache
 24 |     uint64_t val = storage[nextpos]; // could be costly
 25 |     storage[i - 1] = val;
 26 |     storage[nextpos] = tmp; // you might have to read this store later
 27 |   }
 28 | }
 29 | 
 30 | // Fisher-Yates shuffle, rolling up to two dice at a time
 31 | void shuffle_batch_2(uint64_t *storage, uint64_t size, uint64_t (*rng)(void)) {
 32 |   uint64_t i = size;
 33 |   for (; i > 1 << 30; i--) {
 34 |     partial_shuffle_64b(storage, i, 1, i, rng);
 35 |   }
 36 | 
 37 |   // Batches of 2 for sizes up to 2^30 elements
 38 |   uint64_t bound = (uint64_t)1 << 60;
 39 |   for (; i > 1; i -= 2) {
 40 |     bound = partial_shuffle_64b(storage, i, 2, bound, rng);
 41 |   }
 42 | }
 43 | 
 44 | // Fisher-Yates shuffle, rolling up to six dice at a time
 45 | void shuffle_batch_23456(uint64_t *storage, uint64_t size,
 46 |                          uint64_t (*rng)(void)) {
 47 |   uint64_t i = size;
 48 |   for (; i > 1 << 30; i--) {
 49 |     partial_shuffle_64b(storage, i, 1, i, rng);
 50 |   }
 51 | 
 52 |   // Batches of 2 for sizes up to 2^30 elements
 53 |   uint64_t bound = (uint64_t)1 << 60;
 54 |   for (; i > 1 << 19; i -= 2) {
 55 |     bound = partial_shuffle_64b(storage, i, 2, bound, rng);
 56 |   }
 57 | 
 58 |   // Batches of 3 for sizes up to 2^19 elements
 59 |   bound = (uint64_t)1 << 57;
 60 |   for (; i > 1 << 14; i -= 3) {
 61 |     bound = partial_shuffle_64b(storage, i, 3, bound, rng);
 62 |   }
 63 | 
 64 |   // Batches of 4 for sizes up to 2^14 elements
 65 |   bound = (uint64_t)1 << 56;
 66 |   for (; i > 1 << 11; i -= 4) {
 67 |     bound = partial_shuffle_64b(storage, i, 4, bound, rng);
 68 |   }
 69 | 
 70 |   // Batches of 5 for sizes up to 2^11 elements
 71 |   bound = (uint64_t)1 << 55;
 72 |   for (; i > 1 << 9; i -= 5) {
 73 |     bound = partial_shuffle_64b(storage, i, 5, bound, rng);
 74 |   }
 75 | 
 76 |   // Batches of 6 for sizes up to 2^9 elements
 77 |   bound = (uint64_t)1 << 54;
 78 |   for (; i > 6; i -= 6) {
 79 |     bound = partial_shuffle_64b(storage, i, 6, bound, rng);
 80 |   }
 81 | 
 82 |   if (i > 1) {
 83 |     partial_shuffle_64b(storage, i, i - 1, 720, rng);
 84 |   }
 85 | }
 86 | 
 87 | 
 88 | // Fisher-Yates shuffle, rolling up to two dice at a time
 89 | void naive_shuffle_batch_2(uint64_t *storage, uint64_t size, uint64_t (*rng)(void)) {
 90 |   uint64_t i = size;
 91 |   for (; i > (UINT64_C(1) << 32); i--) {
 92 |     naive_partial_shuffle_64b(storage, i, 1, rng);
 93 |   }
 94 |   for (; i > 1; i -= 2) {
 95 |     naive_partial_shuffle_64b(storage, i, 2, rng);
 96 |   }
 97 | }
 98 | 
 99 | 
100 | // Shuffle with Lehmer RNG
101 | 
102 | void shuffle_lehmer(uint64_t *storage, uint64_t size) {
103 |   shuffle(storage, size, lehmer64);
104 | }
105 | 
106 | void shuffle_lehmer_2(uint64_t *storage, uint64_t size) {
107 |   shuffle_batch_2(storage, size, lehmer64);
108 | }
109 | 
110 | void shuffle_lehmer_23456(uint64_t *storage, uint64_t size) {
111 |   shuffle_batch_23456(storage, size, lehmer64);
112 | }
113 | 
114 | void naive_shuffle_lehmer_2(uint64_t *storage, uint64_t size) {
115 |   naive_shuffle_batch_2(storage, size, lehmer64);
116 | }
117 | 
118 | // Shuffle with PCG RNG
119 | 
120 | void shuffle_pcg(uint64_t *storage, uint64_t size) {
121 |   shuffle(storage, size, pcg64);
122 | }
123 | 
124 | void shuffle_pcg_2(uint64_t *storage, uint64_t size) {
125 |   shuffle_batch_2(storage, size, pcg64);
126 | }
127 | 
128 | void shuffle_pcg_23456(uint64_t *storage, uint64_t size) {
129 |   shuffle_batch_23456(storage, size, pcg64);
130 | }
131 | 
132 | void naive_shuffle_pcg_2(uint64_t *storage, uint64_t size) {
133 |   naive_shuffle_batch_2(storage, size, pcg64);
134 | }
135 | 
136 | // Shuffle with ChaCha RNG
137 | void shuffle_chacha(uint64_t *storage, uint64_t size) {
138 |   shuffle(storage, size, chacha_u64_global);
139 | }
140 | 
141 | void shuffle_chacha_2(uint64_t *storage, uint64_t size) {
142 |   shuffle_batch_2(storage, size, chacha_u64_global);
143 | }
144 | 
145 | void shuffle_chacha_23456(uint64_t *storage, uint64_t size) {
146 |   shuffle_batch_23456(storage, size, chacha_u64_global);
147 | }
148 | 
149 | void naive_shuffle_chacha_2(uint64_t *storage, uint64_t size) {
150 |   naive_shuffle_batch_2(storage, size, chacha_u64_global);
151 | }
152 | // Random bounded Lehmer
153 | 
154 | uint64_t random_bounded_lehmer(uint64_t range) {
155 |   return random_bounded(range, lehmer64);
156 | }
157 | 


--------------------------------------------------------------------------------
/src/splitmix64.h:
--------------------------------------------------------------------------------
 1 | #ifndef SPLITMIX64_H
 2 | #define SPLITMIX64_H
 3 | 
 4 | #include <stdint.h>
 5 | 
 6 | inline uint64_t splitmix64_stateless(uint64_t index) {
 7 |   uint64_t z = (index * UINT64_C(0x9E3779B97F4A7C15));
 8 |   z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9);
 9 |   z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB);
10 |   return z ^ (z >> 31);
11 | }
12 | 
// floor( ( (1+sqrt(5))/2 ) * 2**64 MOD 2**64)
#define GOLDEN_GAMMA UINT64_C(0x9E3779B97F4A7C15)

// Advances *seed by GOLDEN_GAMMA and returns the mixed value of the new
// state. Repeated calls with the same pointer yield the splitmix64 sequence.
//
// `static inline`: a plain `inline` definition in a C header provides no
// external definition, so a call that is not inlined would fail to link.
static inline uint64_t splitmix64_r(uint64_t *seed) {
  uint64_t z = (*seed += GOLDEN_GAMMA);
  // David Stafford's Mix13 for MurmurHash3's 64-bit finalizer
  z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9);
  z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB);
  return z ^ (z >> 31);
}
23 | 
24 | // returns the value of splitmix64 "offset" steps from seed
25 | inline uint64_t splitmix64_stateless_offset(uint64_t seed, uint64_t offset) {
26 |   seed += offset * GOLDEN_GAMMA;
27 |   return splitmix64_r(&seed);
28 | }
29 | 
30 | #endif // SPLITMIX64_H
31 | 


--------------------------------------------------------------------------------
/tests/basic.cpp:
--------------------------------------------------------------------------------
  1 | #include <array>
  2 | #include <bitset>
  3 | #include <iomanip>
  4 | #include <iostream>
  5 | #include <numeric>
  6 | #include <limits>
  7 | 
  8 | extern "C" {
  9 | #include "random_bounded.h"
 10 | }
 11 | #include "template_shuffle.h"
 12 | 
 13 | /***
 14 |  * How do we test a shuffle function?
 15 |  * There are many tests that one could apply to a shuffle function.
 16 |  */
 17 | using shuffle_function = void (*)(uint64_t *, uint64_t);
 18 | 
 19 | template <class function_type>
 20 | bool everyone_can_move_everywhere(const function_type &function) {
 21 |   constexpr size_t size = 512;
 22 |   uint64_t input[size];
 23 |   std::bitset<size> bits[size];
 24 |   for (size_t trial = 0; trial < size * size; trial++) {
 25 |     // We always start from the same input.
 26 |     std::iota(input, input + size, 0);
 27 |     // We shuffle:
 28 |     function(input, size);
 29 |     // Mark that at position i we found value input[i].
 30 |     for (size_t i = 0; i < size; i++) {
 31 |       bits[i][input[i]] = 1;
 32 |     }
 33 |   }
 34 |   for (const std::bitset<size> &b : bits) {
 35 |     if (!b.all()) {
 36 |       return false;
 37 |     }
 38 |   }
 39 |   return true;
 40 | }
 41 | 
 42 | template <class function_type>
 43 | bool uniformity_test(const function_type &function) {
 44 |   constexpr size_t size = 512;
 45 |   uint64_t input[size];
 46 |   std::array<size_t, size> bits[size]{};
 47 |   size_t volume = size * size;
 48 |   for (size_t trial = 0; trial < volume; trial++) {
 49 |     // We always start from the same input.
 50 |     std::iota(input, input + size, 0);
 51 |     // We shuffle:
 52 |     function(input, size);
 53 |     // Mark that at position i we found value input[i].
 54 |     for (size_t i = 0; i < size; i++) {
 55 |       bits[i][input[i]] += 1;
 56 |     }
 57 |   }
 58 |   size_t overall_min {std::numeric_limits<size_t>::max() };
 59 |   size_t overall_max = 0;
 60 |   size_t average = 0;
 61 | 
 62 |   for (const std::array<size_t, size> &b : bits) {
 63 |     average += std::accumulate(b.begin(), b.end(), 0);
 64 |     size_t max_value = *std::max_element(b.begin(), b.end());
 65 |     size_t min_value = *std::min_element(b.begin(), b.end());
 66 |     if (max_value > overall_max) {
 67 |       overall_max = max_value;
 68 |     }
 69 |     if (min_value < overall_min) {
 70 |       overall_min = min_value;
 71 |     }
 72 |   }
 73 |   size_t gap = overall_max - overall_min;
 74 |   double mean = (double)average / volume;
 75 |   double relative_gap = (double)gap / mean;
 76 | 
 77 |   printf("relative gap: %f, ", relative_gap);
 78 | 
 79 |   return relative_gap < 0.6;
 80 | }
 81 | 
 82 | template <class function_type>
 83 | bool any_possible_pair_at_the_start(const function_type &function) {
 84 |   constexpr size_t size = 64;
 85 |   uint64_t input[size];
 86 |   std::bitset<size * size> bits;
 87 |   for (size_t trial = 0; trial < size * size * size; trial++) {
 88 |     // We always start from the same input.
 89 |     std::iota(input, input + size, 0);
 90 |     // We shuffle:
 91 |     function(input, size);
 92 |     bits[input[0] * size + input[1]] = 1;
 93 |   }
 94 |   for (size_t i = 0; i < size; i++) {
 95 |     for (size_t j = 0; j < size; j++) {
 96 |       if (i == j) {
 97 |         if (bits[i * size + j]) {
 98 |           return false;
 99 |         }
100 |       } else {
101 |         if (!bits[i * size + j]) {
102 |           return false;
103 |         }
104 |       }
105 |     }
106 |   }
107 |   return true;
108 | }
109 | 
// Checks that, over size^3 shuffles of the identity array of 64 elements,
// every ordered pair of distinct values shows up in the LAST two positions
// at least once, and that no pair of equal values ever does.
// `function(array, length)` must shuffle `array` in place.
template <class function_type>
bool any_possible_pair_at_the_end(const function_type &function) {
  constexpr size_t size = 64;
  uint64_t input[size];
  // bits[a * size + b] is set once (a, b) was observed at positions
  // size-2 and size-1.
  std::bitset<size * size> bits;
  for (size_t trial = 0; trial < size * size * size; trial++) {
    // We always start from the same input.
    std::iota(input, input + size, 0);
    // We shuffle:
    function(input, size);
    // Look at the end of the array (the start is covered by
    // any_possible_pair_at_the_start).
    bits[input[size - 2] * size + input[size - 1]] = 1;
  }
  for (size_t i = 0; i < size; i++) {
    for (size_t j = 0; j < size; j++) {
      if (i == j) {
        // The same value cannot occupy both positions.
        if (bits[i * size + j]) {
          return false;
        }
      } else {
        // Every pair of distinct values must have been seen.
        if (!bits[i * size + j]) {
          return false;
        }
      }
    }
  }
  return true;
}
137 | 
138 | struct named_function {
139 |   std::string name;
140 |   shuffle_function function;
141 | };
142 | 
143 | named_function func[] = {
144 |     {"shuffle_lehmer", shuffle_lehmer},
145 |     {"shuffle_lehmer_2", shuffle_lehmer_2},
146 |     {"shuffle_lehmer_23456", shuffle_lehmer_23456},
147 |     {"shuffle_pcg", shuffle_pcg},
148 |     {"shuffle_pcg_2", shuffle_pcg_2},
149 |     {"shuffle_pcg_23456", shuffle_pcg_23456}
150 | };
151 | 
152 | bool test_everyone_can_move_everywhere() {
153 |   std::cout << __FUNCTION__ << std::endl;
154 |   for (const auto &f : func) {
155 |     std::cout << std::setw(40) << f.name << ": ";
156 |     std::cout.flush();
157 |     if (!everyone_can_move_everywhere(f.function)) {
158 |       std::cerr << "!!!Test failed for " << f.name << std::endl;
159 |       return false;
160 |     } else {
161 |       std::cout << "passed" << std::endl;
162 |     }
163 |   }
164 |   return true;
165 | }
166 | 
167 | bool test_uniformity_test() {
168 |   std::cout << __FUNCTION__ << std::endl;
169 |   for (const auto &f : func) {
170 |     std::cout << std::setw(40) << f.name << ": ";
171 |     std::cout.flush();
172 |     if (!uniformity_test(f.function)) {
173 |       std::cerr << "!!!Test failed for " << f.name << std::endl;
174 |       return false;
175 |     } else {
176 |       std::cout << "passed" << std::endl;
177 |     }
178 |   }
179 |   return true;
180 | }
181 | 
182 | bool test_any_possible_pair_at_the_start() {
183 |   std::cout << __FUNCTION__ << std::endl;
184 |   for (const auto &f : func) {
185 |     std::cout << std::setw(40) << f.name << ": ";
186 |     std::cout.flush();
187 |     if (!any_possible_pair_at_the_start(f.function)) {
188 |       std::cerr << "!!!Test failed for " << f.name << std::endl;
189 |       return false;
190 |     } else {
191 |       std::cout << "passed" << std::endl;
192 |     }
193 |   }
194 |   return true;
195 | }
196 | 
197 | bool test_any_possible_pair_at_the_end() {
198 |   std::cout << __FUNCTION__ << std::endl;
199 |   for (const auto &f : func) {
200 |     std::cout << std::setw(40) << f.name << ": ";
201 |     std::cout.flush();
202 |     if (!any_possible_pair_at_the_end(f.function)) {
203 |       std::cerr << "!!!Test failed for " << f.name << std::endl;
204 |       return false;
205 |     } else {
206 |       std::cout << "passed" << std::endl;
207 |     }
208 |   }
209 |   return true;
210 | }
211 | 
212 | int main() {
213 |   seed(1234);
214 |   bool success = true;
215 |   success &= test_uniformity_test();
216 |   success &= test_any_possible_pair_at_the_end();
217 |   success &= test_any_possible_pair_at_the_start();
218 |   success &= test_everyone_can_move_everywhere();
219 |   if (success) {
220 |     std::cout << "All tests passed" << std::endl;
221 |   } else {
222 |     std::cerr << "Some tests failed" << std::endl;
223 |   }
224 |   return success ? EXIT_SUCCESS : EXIT_FAILURE;
225 | }
226 | 


--------------------------------------------------------------------------------