├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── contrib └── rust │ ├── Cargo.toml │ └── src │ └── lib.rs ├── engine_chacha.h ├── engine_os.h ├── nanobenchmark.cc ├── nanobenchmark.h ├── nanobenchmark_test.cc ├── randen.cc ├── randen.h ├── randen_benchmark.cc ├── randen_test.cc ├── third_party └── pcg_random │ ├── LICENSE │ └── include │ ├── pcg_extras.hpp │ └── pcg_random.hpp ├── util.h ├── vector128.h └── vector128_test.cc /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | 3 | dist: trusty 4 | 5 | compiler: 6 | - clang 7 | - gcc 8 | 9 | script: 10 | - make 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution, 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 
24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | override CPPFLAGS += -I. -I../ 2 | override CXXFLAGS += -std=c++11 -Wall -O3 -fno-pic -mavx2 -maes 3 | override LDFLAGS += $(CXXFLAGS) 4 | override CXX = clang++ 5 | 6 | all: $(addprefix bin/, nanobenchmark_test randen_test randen_benchmark vector128_test) 7 | 8 | obj/%.o: %.cc 9 | @mkdir -p -- $(dir $@) 10 | $(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@ 11 | 12 | bin/%: obj/%.o obj/nanobenchmark.o obj/randen.o 13 | @mkdir -p bin 14 | $(CXX) $(LDFLAGS) $^ -o $@ 15 | 16 | .DELETE_ON_ERROR: 17 | deps.mk: $(wildcard *.cc) $(wildcard *.h) Makefile 18 | set -eu; for file in *.cc; do \ 19 | target=obj/$${file##*/}; target=$${target%.*}.o; \ 20 | $(CXX) -c $(CPPFLAGS) $(CXXFLAGS) -MM -MT \ 21 | "$$target" "$$file"; \ 22 | done >$@ 23 | -include deps.mk 24 | 25 | clean: 26 | [ ! -d obj ] || $(RM) -r -- obj/ 27 | [ ! -d bin ] || $(RM) -r -- bin/ 28 | [ ! 
-d lib ] || $(RM) -r -- lib/ 29 | 30 | .PHONY: clean all 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | What if we could default to attack-resistant random generators without excessive 4 | CPU cost? We introduce 'Randen', a new generator with security guarantees; it 5 | outperforms MT19937, pcg64_c32, Philox, ISAAC and ChaCha8 in real-world 6 | benchmarks. This is made possible by AES hardware acceleration and a large 7 | Feistel permutation. 8 | 9 | ## Related work 10 | 11 | AES-CTR (encrypting a counter) is a well-known and easy to implement generator. 12 | It has two known weaknesses: 13 | 14 | - A known-key distinguisher on 10-round, 128-bit AES [https://goo.gl/3xReB9]. 15 | 16 | - No forward security/backtracking resistance: compromising the current state 17 | lets attackers distinguish prior outputs from random. 18 | 19 | NIST 800-90a r1 [https://goo.gl/68Fwmv] is a standardized generator that ensures 20 | backtracking resistance, but is not fast enough for a general-purpose generator 21 | (5-10x slower than AES). 22 | 23 | ## Algorithm 24 | 25 | The Randen generator is based upon three existing components: 26 | 27 | 1) Reverie [https://eprint.iacr.org/2016/886.pdf] is a sponge-like generator 28 | that requires a cryptographic permutation. It improves upon "Provably Robust 29 | Sponge-Based PRNGs and KDFs" by achieving backtracking resistance with only 30 | a single permutation per buffer. 31 | 32 | 2) Simpira v2 [https://eprint.iacr.org/2016/122.pdf] constructs up to 1024-bit 33 | permutations using an improved Generalized Feistel network with 2-round 34 | AES-128 functions. This Feistel block shuffle achieves diffusion sooner and 35 | is less vulnerable to sliced-biclique attacks than a Type-2 cyclic shuffle. 
36 | 37 | 3) "New criterion for diffusion property" [https://goo.gl/mLXH4f] shows that 38 | the same kind of improved Feistel block shuffle can be extended to 16 39 | branches, which enables a more efficient 2048-bit permutation. 40 | 41 | We combine these by plugging the larger Simpira-like permutation into Reverie. 42 | 43 | ## Performance 44 | 45 | The implementation targets x86 (Westmere), POWER 8 and ARM64. 46 | 47 | x86 microbenchmark: generating random bits in a tight loop 48 | (cpb=cycles per byte, MAD=median absolute deviation): 49 | 50 | RNG | cpb | MAD 51 | --- | --- | --- 52 | Randen | 1.54 | 0.002 53 | pcg64_c32 | 0.78 | 0.003 54 | mt19937_64 | 1.79 | 0.001 55 | ChaCha8 | 3.02 | 0.003 56 | ISAAC | 4.08 | 0.006 57 | Philox | 4.70 | 0.003 58 | /dev/urandom (ChaCha20) | 15.27 | 0.018 59 | BCryptGenRandom (CTR-DRBG) | 16.80 | 0.009 60 | 61 | x86 real-world benchmark (reservoir sampling): 62 | 63 | RNG | cpb | MAD 64 | --- | --- | --- 65 | Randen | 2.60 | 0.008 66 | pcg64_c32 | 3.03 | 0.009 67 | mt19937_64| 2.82 | 0.009 68 | ChaCha8 | 3.75 | 0.008 69 | ISAAC | 4.46 | 0.014 70 | Philox | 4.95 | 0.009 71 | /dev/urandom (ChaCha20) | 13.46 | 0.017 72 | BCryptGenRandom (CTR-DRBG) | 16.41 | 0.015 73 | 74 | ## Security 75 | 76 | Randen is indistinguishable from random and backtracking-resistant. For more 77 | details and benchmarks, please see ["Randen - fast backtracking-resistant random 78 | generator with AES+Feistel+Reverie"](https://arxiv.org/abs/1810.02227). 79 | 80 | ## Usage 81 | 82 | `make && bin/randen_benchmark` 83 | 84 | Note that the code relies on compiler optimizations. Cycles per byte may 85 | increase by factors of 1.6 when compiled with GCC 7.3, and 1.3 with 86 | Clang 4.0.1. This can be mitigated by manually unrolling the loops. 87 | 88 | ## Third-party implementations / bindings 89 | 90 | Thanks to Frank Denis for making us aware of these third-party implementations 91 | or bindings. 
Note that the algorithm is still under review and subject to 92 | change, but please feel free to get in touch or raise an issue and we'll 93 | add yours as well. 94 | 95 | By | Language | URL 96 | --- | --- | --- 97 | Frank Denis | C | https://github.com/jedisct1/randen-rng 98 | 99 | 100 | This is not an official Google product. 101 | -------------------------------------------------------------------------------- /contrib/rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "randen" 3 | version = "0.0.0" 4 | authors = ["Ruud van Asseldonk ", "Jan Wassenberg ", "Brendan Hickey "] 5 | license = "Apache-2.0" 6 | description = "Randen is a fast, backtracking resistant CSPRNG." 7 | repository = "https://github.com/google/randen" 8 | keywords = [ "Crypto", "rng", "random" ] 9 | 10 | [dependencies.rand] 11 | version = "0.5" 12 | features = ["i128_support"] 13 | -------------------------------------------------------------------------------- /contrib/rust/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! The Randen pseudorandom number generator. 2 | 3 | extern crate rand; 4 | 5 | use std::mem; 6 | use std::ops::BitXorAssign; 7 | 8 | use rand::{Error, FromEntropy, RngCore, SeedableRng}; 9 | use std::arch::x86_64::{__m128i, _mm_aesenc_si128}; 10 | 11 | /// Size of the entire sponge / state for the Randen PRNG. 12 | const STATE_LEN: usize = 16; // 256 bytes, 16x16 bytes. 13 | 14 | /// Size of the "inner" (inaccessible) part of the sponge. 15 | /// 16 | /// Larger values would require more frequent calls to `randen_generate`. 17 | const CAPACITY: usize = 1; // 1x16 bytes. 18 | 19 | /// Size of the default seed consumed by the sponge. 
20 | const SEED_LEN: usize = STATE_LEN - CAPACITY; 21 | const SEED_BYTES: usize = SEED_LEN * 16; 22 | 23 | const STATE_BYTES: usize = STATE_LEN * 16; 24 | const CAPACITY_BYTES: usize = CAPACITY * 16; 25 | 26 | const FEISTEL_ROUNDS: usize = 17; 27 | const FEISTEL_FUNCTIONS: usize = 8; 28 | const ROUND_KEYS_LEN: usize = FEISTEL_ROUNDS * FEISTEL_FUNCTIONS; 29 | 30 | /// Aligned 128 bits wrapper. 31 | #[derive(Copy, Clone, Debug, Eq, PartialEq)] 32 | #[repr(align(16))] 33 | pub struct U128A(u128); 34 | 35 | impl U128A { 36 | #[inline(always)] 37 | fn from(m128i: __m128i) -> U128A { 38 | unsafe { mem::transmute(m128i) } 39 | } 40 | 41 | #[inline(always)] 42 | fn m128i(self) -> __m128i { 43 | unsafe { mem::transmute(self) } 44 | } 45 | } 46 | 47 | impl BitXorAssign for U128A { 48 | fn bitxor_assign(&mut self, rhs: U128A) { 49 | self.0 ^= rhs.0; 50 | } 51 | } 52 | 53 | // "Nothing up my sleeve" numbers from the first hex digits of pi. 54 | // 55 | // Obtained from http://hexpi.sourceforge.net/. 
The array was generated by the 56 | // following Python script: 57 | /* 58 | python3 << EOF 59 | """Generates Randen round keys array from pi-hex.62500.txt file.""" 60 | KEYS = 136 61 | 62 | def chunks(l, n): 63 | """Yield successive n-sized chunks from l.""" 64 | for i in range(0, len(l), n): 65 | yield l[i:i + n] 66 | 67 | with open("pi-hex.62500.txt") as file: 68 | for key in chunks(file.read(KEYS * 32), 32): 69 | print(' U128A(0x{}),'.format(key[16:], key[:16])) 70 | EOF 71 | */ 72 | const ROUND_KEYS: [U128A; ROUND_KEYS_LEN] = [ 73 | U128A(0x13198A2E03707344243F6A8885A308D3), 74 | U128A(0x082EFA98EC4E6C89A4093822299F31D0), 75 | U128A(0xBE5466CF34E90C6C452821E638D01377), 76 | U128A(0x3F84D5B5B5470917C0AC29B7C97C50DD), 77 | U128A(0xD1310BA698DFB5AC9216D5D98979FB1B), 78 | U128A(0xB8E1AFED6A267E962FFD72DBD01ADFB7), 79 | U128A(0x24A19947B3916CF7BA7C9045F12C7F99), 80 | U128A(0x636920D871574E690801F2E2858EFC16), 81 | U128A(0x0D95748F728EB658A458FEA3F4933D7E), 82 | U128A(0x7B54A41DC25A59B5718BCD5882154AEE), 83 | U128A(0xC5D1B023286085F09C30D5392AF26013), 84 | U128A(0x8E79DCB0603A180ECA417918B8DB38EF), 85 | U128A(0xD71577C1BD314B276C9E0E8BB01E8A3E), 86 | U128A(0xE65525F3AA55AB9478AF2FDA55605C60), 87 | U128A(0x55CA396A2AAB10B65748986263E81440), 88 | U128A(0xA15486AF7C72E993B4CC5C341141E8CE), 89 | U128A(0x2BA9C55D741831F6B3EE1411636FBC2A), 90 | U128A(0xAFD6BA336C24CF5CCE5C3E169B87931E), 91 | U128A(0x3B8F48986B4BB9AF7A32538128958677), 92 | U128A(0x61D809CCFB21A991C4BFE81B66282193), 93 | U128A(0xEF845D5DE98575B1487CAC605DEC8032), 94 | U128A(0x23893E81D396ACC5DC262302EB651B88), 95 | U128A(0x2E0B4482A48420040F6D6FF383F44239), 96 | U128A(0x21C66842F6E96C9A69C8F04A9E1F9B5E), 97 | U128A(0x6A51A0D2D8542F68670C9C61ABD388F0), 98 | U128A(0x6EEF0B6C137A3BE4960FA728AB5133A3), 99 | U128A(0xA1F1651D39AF0176BA3BF0507EFB2A98), 100 | U128A(0x8CEE8619456F9FB466CA593E82430E88), 101 | U128A(0xE06F75D885C120737D84A5C33B8B5EBE), 102 | U128A(0x4ED3AA62363F7706401A449F56C16AA6), 103 | 
U128A(0x37D0D724D00A12481BFEDF72429B023D), 104 | U128A(0x075372C980991B7BDB0FEAD349F1C09B), 105 | U128A(0xE3FE501AB6794C3B25D479D8F6E8DEF7), 106 | U128A(0xC1A94FB6409F60C4976CE0BD04C006BA), 107 | U128A(0x68FB6FAF3E6C53B55E5C9EC2196A2463), 108 | U128A(0x6DFC511F9B30952C1339B2EB3B52EC6F), 109 | U128A(0xBEE3D004DE334AFDCC814544AF5EBD09), 110 | U128A(0xC0CBA85745C8740F660F2807192E4BB3), 111 | U128A(0x5579C0BD1A60320AD20B5F39B9D3FBDB), 112 | U128A(0x679F25FEFB1FA3CCD6A100C6402C7279), 113 | U128A(0x3C7516DFFD616B158EA5E9F8DB3222F8), 114 | U128A(0x323DB5FAFD2387602F501EC8AD0552AB), 115 | U128A(0x9E5C57BBCA6F8CA053317B483E00DF82), 116 | U128A(0xD542A8F6287EFFC31A87562EDF1769DB), 117 | U128A(0x695B27B0BBCA58C8AC6732C68C4F5573), 118 | U128A(0x10FA3D98FD2183B8E1FFA35DB8F011A0), 119 | U128A(0x9A53E479B6F845654AFCB56C2DD1D35B), 120 | U128A(0xE1DDF2DAA4CB7E33D28E49BC4BFB9790), 121 | U128A(0xEF20CADA36774C0162FB1341CEE4C6E8), 122 | U128A(0x95DBDA4DAE909198D07E9EFE2BF11FB4), 123 | U128A(0xD08ED1D0AFC725E0EAAD8E716B93D5A0), 124 | U128A(0x8FF6E2FBF2122B648E3C5B2F8E7594B7), 125 | U128A(0x4FAD5EA0688FC31C8888B812900DF01C), 126 | U128A(0x2F2F2218BE0E1777D1CFF191B3A8C1AD), 127 | U128A(0xE5A0CC0FB56F74E8EA752DFE8B021FA1), 128 | U128A(0xB4A84FE0FD13E0B718ACF3D6CE89E299), 129 | U128A(0x165FA266809577057CC43B81D2ADA8D9), 130 | U128A(0xE6AD206577B5FA8693CC7314211A1477), 131 | U128A(0xEBCDAF0C7B3E89A0C75442F5FB9D35CF), 132 | U128A(0x00250E2D2071B35ED6411BD3AE1E7E49), 133 | U128A(0x2464369BF009B91E226800BB57B8E0AF), 134 | U128A(0x78C14389D95A537F5563911D59DFA6AA), 135 | U128A(0x832603766295CFA9207D5BA202E5B9C5), 136 | U128A(0xB3472DCA7B14A94A11C819684E734A41), 137 | U128A(0xD60F573FBC9BC6E41B5100529A532915), 138 | U128A(0x08BA6FB5571BE91F2B60A47681E67400), 139 | U128A(0xB6636521E7B9F9B6F296EC6B2A0DD915), 140 | U128A(0x53B02D5DA99F8FA1FF34052EC5855664), 141 | U128A(0x4B7A70E9B5B3294408BA47996E85076A), 142 | U128A(0xAD6EA6B049A7DF7DDB75092EC4192623), 143 | 
U128A(0xECAA8C71699A18FF9CEE60B88FEDB266), 144 | U128A(0x193602A575094C295664526CC2B19EE1), 145 | U128A(0x3F54989A5B429D65A0591340E4183A3E), 146 | U128A(0xA1D29C07EFE830F56B8FE4D699F73FD6), 147 | U128A(0x4CDD20868470EB264D2D38E6F0255DC1), 148 | U128A(0x09686B3F3EBAEFC96382E9C6021ECC5E), 149 | U128A(0x687F358452A0E2863C9718146B6A70A1), 150 | U128A(0x3E07841C7FDEAE5CB79C5305AA500737), 151 | U128A(0xB03ADA37F0500C0D8E7D44EC5716F2B8), 152 | U128A(0xAE0CF51A3CB574B2F01C1F040200B3FF), 153 | U128A(0xD19113F97CA92FF625837A58DC0921BD), 154 | U128A(0x3AE5E58137C2DADC9432477322F54701), 155 | U128A(0xA94461460FD0030EC8B576349AF3DDA7), 156 | U128A(0xE238CD993BEA0E2FECC8C73EA4751E41), 157 | U128A(0x4E548B384F6DB9083280BBA1183EB331), 158 | U128A(0x2CB8129024977C796F420D03F60A04BF), 159 | U128A(0xDE9A771FD99308105679B072BCAF89AF), 160 | U128A(0x5512721F2E6B7124B38BAE12DCCF3F2E), 161 | U128A(0x7A5847187408DA17501ADDE69F84CD87), 162 | U128A(0xEC7AEC3ADB851DFABC9F9ABCE94B7D8C), 163 | U128A(0xEF1C18473215D80863094366C464C3D2), 164 | U128A(0x12A14D432A65C451DD433B3724C2BA16), 165 | U128A(0x71DFF89E10314E5550940002133AE4DD), 166 | U128A(0x043556F1D7A3C76B81AC77D65F11199B), 167 | U128A(0xF28FE6ED97F1FBFA3C11183B5924A509), 168 | U128A(0x86E34570EAE96FB19EBABF2C1E153C6E), 169 | U128A(0x771FE71C4E3D06FA860E5E0A5A3E2AB3), 170 | U128A(0x803E89D65266C8252965DCB999E71D0F), 171 | U128A(0xC6150EBA94E2EA782E4CC9789C10B36A), 172 | U128A(0xF2F74EA7361D2B3DA6FC3C531E0A2DF4), 173 | U128A(0x5223A708F71312B61939260F19C27960), 174 | U128A(0xE3BC4595A67BC883EBADFE6EEAC31F66), 175 | U128A(0xC332DDEFBE6C5AA5B17F37D1018CFF28), 176 | U128A(0xEECEA50FDB2F953B6558218568AB9702), 177 | U128A(0x1521B628290761702AEF7DAD5B6E2F84), 178 | U128A(0x13CCA830EB61BD96ECDD4775619F1510), 179 | U128A(0xB5735C904C70A2390334FE1EAA0363CF), 180 | U128A(0xEECC86BC60622CA7D59E9E0BCBAADE14), 181 | U128A(0x648B1EAF19BDF0CA9CAB5CABB2F3846E), 182 | U128A(0x40685A323C2AB4B3A02369B9655ABB50), 183 | 
U128A(0x9B540B19875FA099319EE9D5C021B8F7), 184 | U128A(0xF837889A97E32D7795F7997E623D7DA8), 185 | U128A(0x0E358829C7E61FD611ED935F16681281), 186 | U128A(0x57F584A51B22726396DEDFA17858BA99), 187 | U128A(0xCDB30AEB532E30549B83C3FF1AC24696), 188 | U128A(0x58EBF2EF34C6FFEA8FD948E46DBC3128), 189 | U128A(0x5D4A14D9E864B7E3FE28ED61EE7C3C73), 190 | U128A(0x45EEE2B6A3AAABEA42105D14203E13E0), 191 | U128A(0xC742F442EF6ABBB5DB6C4F15FACB4FD0), 192 | U128A(0xD81E799E86854DC7654F3B1D41CD2105), 193 | U128A(0xCF62A1F25B8D2646E44B476A3D816250), 194 | U128A(0x7F1524C369CB7492FC8883A0C1C7B6A3), 195 | U128A(0x095BBF00AD19489D47848A0B5692B285), 196 | U128A(0x58428D2A0C55F5EA1462B17423820D00), 197 | U128A(0x3372F0928D937E411DADF43E233F7061), 198 | U128A(0x7CDE3759CBEE7460D65FECF16C223BDB), 199 | U128A(0xA607808419F8509E4085F2A7CE77326E), 200 | U128A(0xA969A7AAC50C06C2E8EFD85561D99735), 201 | U128A(0x9E447A2EC34534845A04ABFC800BCADC), 202 | U128A(0xDB73DBD3105588CDFDD567050E1E9EC9), 203 | U128A(0xC5C43465713E38D8675FDA79E3674340), 204 | U128A(0x153E21E78FB03D4A3D28F89EF16DFF20), 205 | U128A(0xE93D5A68948140F7E6E39F2BDB83ADF7), 206 | U128A(0x411520F77602D4F7F64C261C94692934), 207 | U128A(0xD40824713320F46ABCF46B2ED4A10068), 208 | U128A(0x1E39F62E9724454643B7D4B7500061AF), 209 | ]; 210 | 211 | pub type State = [U128A; STATE_LEN]; 212 | 213 | #[inline(always)] 214 | fn aes_round(state: U128A, round_key: U128A) -> U128A { 215 | unsafe { U128A::from(_mm_aesenc_si128(state.m128i(), round_key.m128i())) } 216 | } 217 | 218 | /// Improved odd-even shuffle from "New criterion for diffusion property". 219 | #[inline(always)] 220 | fn block_shuffle(source: State) -> State { 221 | let shuffle = [7, 2, 13, 4, 11, 8, 3, 6, 15, 0, 9, 10, 1, 14, 5, 12]; 222 | // TODO: Check if the zeros get generated; if so, use mem::uninitialized. 
223 | let mut new_state = [U128A(0); STATE_LEN]; 224 | for (i, shuf) in shuffle.iter().enumerate() { 225 | new_state[i] = source[*shuf]; 226 | } 227 | new_state 228 | } 229 | 230 | /// Cryptographic permutation based on type-2 Generalized Feistel Network. 231 | /// 232 | /// An adversary who can query a permutation for a chosen ciphertext cannot 233 | /// distinguish the permutation from a truly random permutation in less than 234 | /// 2^64 queries, if the round function is a pseudorandom function. This is 235 | /// similar to the b=8 case of Simpira v2, but more efficient than Simpira's 236 | /// generic construction from b=16. 237 | #[inline(always)] 238 | fn permute(state: &mut State) { 239 | let mut keys = ROUND_KEYS.iter(); 240 | for _ in 0..FEISTEL_ROUNDS { 241 | for branch in 0..FEISTEL_FUNCTIONS { 242 | let even = state[branch * 2]; 243 | let odd = state[branch * 2 + 1]; 244 | // Feistel round function using two AES subrounds. Very similar to 245 | // F() from Simpira v2, but with independent subround keys. Uses 17 246 | // AES rounds per 16 bytes (vs. 10 for AES-CTR). Computing eight 247 | // round functions in parallel hides the 7-cycle AESNI latency on 248 | // HSW. Note that the Feistel XORs are 'free' (included in the 249 | // second AES instruction). 250 | let f1 = aes_round(even, *keys.next().unwrap()); 251 | let f2 = aes_round(f1, odd); 252 | state[branch * 2 + 1] = f2; 253 | } 254 | *state = block_shuffle(*state); 255 | } 256 | } 257 | 258 | /// Generate updates the Randen sponge. 259 | /// 260 | /// The outer portion of the sponge (`CAPACITY_BYTES..STATE_BYTES`) may be 261 | /// consumed as PRNG output after applying this function. 262 | #[cfg(target_endian = "little")] 263 | pub fn randen_generate(state: &mut State) { 264 | let prev_inner = state[0]; 265 | // Note: for a big-endian architecture, the endianness of the state and 266 | // round keys needs to be converted first. 
But as this currently relies on 267 | // an x86-only instruction, we don't deal with this at the moment. 268 | permute(state); 269 | 270 | // Ensure backtracking resistance. 271 | state[0] ^= prev_inner; 272 | } 273 | 274 | #[cfg(target_endian = "big")] 275 | pub fn randen_generate(state: &mut State) { 276 | unimplemented!("Big endian requires swapping the bytes in the state and round keys."); 277 | } 278 | 279 | pub fn randen_absorb(state: &mut State, seed: &[U128A; SEED_LEN]) { 280 | for (seed_elem, state_elem) in seed.iter().zip(&mut state[1..]) { 281 | *state_elem ^= *seed_elem; 282 | } 283 | } 284 | 285 | // Note: do not derive Copy, to avoid accidental reuse of the state. 286 | #[derive(Clone, Debug)] 287 | pub struct RandenRng { 288 | /// The current state. 289 | state: State, 290 | /// Index of the next unconsumed byte of the state. 291 | /// 292 | /// The value is least `CAPACITY_BYTES`. The value may exceed `STATE_BYTES - 293 | /// 1`. In that case a generate is required before consuming bytes. 294 | cursor: usize, 295 | } 296 | 297 | impl RandenRng { 298 | /// Create a Randen random number generator using a fixed default seed. 299 | pub fn new_unseeded() -> RandenRng { 300 | RandenRng { 301 | state: [U128A(0); STATE_LEN], 302 | // Set the cursor to indicate that the state is fully consumed, to 303 | // enforce a generate before returning any bytes. This way the 304 | // initial zeros are not exposed as random numbers. 305 | cursor: STATE_BYTES, 306 | } 307 | } 308 | } 309 | 310 | // The implementations of `next_u32` and `next_u64` are similar apart from the 311 | // types and size constants, use a macro so we only have to write it once. 312 | macro_rules! impl_next { 313 | ($func: ident, $t: ty, $size: expr) => { 314 | fn $func(&mut self) -> $t { 315 | // If we don't have enough bytes left in the state, generate new 316 | // random bytes. 
317 | if self.cursor > STATE_BYTES - $size { 318 | randen_generate(&mut self.state); 319 | self.cursor = CAPACITY_BYTES; 320 | } 321 | 322 | // Round the cursor up to the next multiple of $size, so we can 323 | // pretend that the state is an array of $ts and load one from 324 | // there. It means we discard some bytes if the cursor was not at 325 | // a multiple of $size, but the advantage is that we don't need to 326 | // worry about carrying over bytes between generations, when there 327 | // are < $size bytes available. 328 | let index = (self.cursor + $size - 1) / $size; 329 | self.cursor = (index + 1) * $size; 330 | let ts: [$t; STATE_BYTES / $size] = 331 | unsafe { mem::transmute(self.state) }; 332 | ts[index] 333 | } 334 | } 335 | } 336 | 337 | impl RngCore for RandenRng { 338 | impl_next!(next_u32, u32, 4); 339 | impl_next!(next_u64, u64, 8); 340 | 341 | fn fill_bytes(&mut self, dest: &mut [u8]) { 342 | let mut i = 0; 343 | let len = dest.len(); 344 | while i < len { 345 | if self.cursor >= STATE_BYTES { 346 | randen_generate(&mut self.state); 347 | self.cursor = CAPACITY_BYTES; 348 | } 349 | 350 | let bytes: [u8; STATE_BYTES] = unsafe { mem::transmute(self.state) }; 351 | 352 | // This iteration we will consume as many bytes as there are left 353 | // to fill, or as many bytes as are available for consumption, 354 | // whichever is less. 
355 | let consume_bytes = (len - i).min(STATE_BYTES - self.cursor); 356 | let source = &bytes[self.cursor..self.cursor + consume_bytes]; 357 | dest[i..i + consume_bytes].copy_from_slice(source); 358 | self.cursor += consume_bytes; 359 | i += consume_bytes; 360 | } 361 | } 362 | 363 | fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> { 364 | Ok(self.fill_bytes(dest)) 365 | } 366 | } 367 | 368 | pub struct RandenSeed(pub [u8; SEED_BYTES]); 369 | 370 | impl Default for RandenSeed { 371 | fn default() -> RandenSeed { 372 | RandenSeed([0; SEED_BYTES]) 373 | } 374 | } 375 | 376 | impl AsMut<[u8]> for RandenSeed { 377 | fn as_mut(&mut self) -> &mut [u8] { 378 | &mut self.0 379 | } 380 | } 381 | 382 | impl SeedableRng for RandenRng { 383 | type Seed = RandenSeed; 384 | 385 | fn from_seed(seed: RandenSeed) -> RandenRng { 386 | let mut rng = RandenRng::new_unseeded(); 387 | unsafe { 388 | // [u8] isn't necessarily 16 byte aligned. Transmuting it to [U128] 389 | // won't fix the alignment, but a subsequent clone should work. 
390 | let unaligned_seed = std::mem::transmute::<[u8; SEED_BYTES], [U128A; SEED_LEN]>(seed.0); 391 | let aligned_seed = unaligned_seed.clone(); 392 | randen_absorb(&mut rng.state, &aligned_seed); 393 | rng 394 | } 395 | } 396 | } 397 | 398 | #[cfg(test)] 399 | mod test { 400 | use super::{RandenRng, U128A}; 401 | use rand::{RngCore, SeedableRng}; 402 | 403 | #[test] 404 | fn randen_rng_next_u64_test_vectors() { 405 | // These test vectors were generated from the reference C++ 406 | // implementation with the following program: 407 | // 408 | // int main(int, char**) { 409 | // randen::Randen rng; 410 | // for (int i = 0; i < 33; i++) { 411 | // std::cout << " assert_eq!(rng.next_u64(), 0x"; 412 | // std::cout << std::setbase(16) << std::setw(16) 413 | // << std::setfill('0') << rng(); 414 | // std::cout << ");\n"; 415 | // } 416 | // std::cout << std::endl; 417 | // return 0; 418 | // } 419 | 420 | // Note that there are more bytes consumed than the size of the state, 421 | // forcing a `randen_generate()`. 
422 | let mut rng = RandenRng::new_unseeded(); 423 | assert_eq!(rng.next_u64(), 0xdda9f47cd90410ee); 424 | assert_eq!(rng.next_u64(), 0xc3c14f134e433977); 425 | assert_eq!(rng.next_u64(), 0xf0b780f545c72912); 426 | assert_eq!(rng.next_u64(), 0x887bf3087fd8ca10); 427 | assert_eq!(rng.next_u64(), 0x30ec63baff3c6d59); 428 | assert_eq!(rng.next_u64(), 0x15dbb1d37696599f); 429 | assert_eq!(rng.next_u64(), 0x2808a316f49a54c); 430 | assert_eq!(rng.next_u64(), 0xb29f73606f7f20a6); 431 | assert_eq!(rng.next_u64(), 0x9cbf605e3fd9de8a); 432 | assert_eq!(rng.next_u64(), 0x3b8feaf9d5c8e50e); 433 | assert_eq!(rng.next_u64(), 0xd8b2ffd356301ed5); 434 | assert_eq!(rng.next_u64(), 0xc970ae1a78183bbb); 435 | assert_eq!(rng.next_u64(), 0xcdfd8d76eb8f9a19); 436 | assert_eq!(rng.next_u64(), 0xf4b327fe0fc73c37); 437 | assert_eq!(rng.next_u64(), 0xd5af05dd3eff9556); 438 | assert_eq!(rng.next_u64(), 0xc3a506eb91420c9d); 439 | assert_eq!(rng.next_u64(), 0x7023920e0d6bfe8c); 440 | assert_eq!(rng.next_u64(), 0x48db1bb78f83c4a1); 441 | assert_eq!(rng.next_u64(), 0xed1ef4c26b87b840); 442 | assert_eq!(rng.next_u64(), 0x58d3575834956d42); 443 | assert_eq!(rng.next_u64(), 0x497cabf3431154fc); 444 | assert_eq!(rng.next_u64(), 0x8eef32a23e0b2df3); 445 | assert_eq!(rng.next_u64(), 0xd88b5749f090e5ea); 446 | assert_eq!(rng.next_u64(), 0x4e24370570029a8b); 447 | assert_eq!(rng.next_u64(), 0x78fcec2cbb6342f5); 448 | assert_eq!(rng.next_u64(), 0xc651a582a970692f); 449 | assert_eq!(rng.next_u64(), 0x352ee4ad1816afe3); 450 | assert_eq!(rng.next_u64(), 0x463cb745612f55db); 451 | assert_eq!(rng.next_u64(), 0x811ef0821c3de851); 452 | assert_eq!(rng.next_u64(), 0x26ff374c101da7e); 453 | assert_eq!(rng.next_u64(), 0xa0660379992d58fc); 454 | assert_eq!(rng.next_u64(), 0x6f7e616704c4fa59); 455 | assert_eq!(rng.next_u64(), 0x915f3445685da798); 456 | } 457 | 458 | #[test] 459 | fn randen_rng_next_u32_test_vectors() { 460 | // Same test as `randen_rng_next_u64_test_vectors()`, generated from the 461 | // same C++ 
program, but adapted to produce 32-bit integers. 462 | 463 | // Note that there are more bytes consumed than the size of the state, 464 | // forcing a `randen_generate()`. 465 | let mut rng = RandenRng::new_unseeded(); 466 | assert_eq!(rng.next_u32(), 0xd90410ee); 467 | assert_eq!(rng.next_u32(), 0xdda9f47c); 468 | assert_eq!(rng.next_u32(), 0x4e433977); 469 | assert_eq!(rng.next_u32(), 0xc3c14f13); 470 | assert_eq!(rng.next_u32(), 0x45c72912); 471 | assert_eq!(rng.next_u32(), 0xf0b780f5); 472 | assert_eq!(rng.next_u32(), 0x7fd8ca10); 473 | assert_eq!(rng.next_u32(), 0x887bf308); 474 | assert_eq!(rng.next_u32(), 0xff3c6d59); 475 | assert_eq!(rng.next_u32(), 0x30ec63ba); 476 | assert_eq!(rng.next_u32(), 0x7696599f); 477 | assert_eq!(rng.next_u32(), 0x15dbb1d3); 478 | assert_eq!(rng.next_u32(), 0x6f49a54c); 479 | assert_eq!(rng.next_u32(), 0x02808a31); 480 | assert_eq!(rng.next_u32(), 0x6f7f20a6); 481 | assert_eq!(rng.next_u32(), 0xb29f7360); 482 | assert_eq!(rng.next_u32(), 0x3fd9de8a); 483 | assert_eq!(rng.next_u32(), 0x9cbf605e); 484 | assert_eq!(rng.next_u32(), 0xd5c8e50e); 485 | assert_eq!(rng.next_u32(), 0x3b8feaf9); 486 | assert_eq!(rng.next_u32(), 0x56301ed5); 487 | assert_eq!(rng.next_u32(), 0xd8b2ffd3); 488 | assert_eq!(rng.next_u32(), 0x78183bbb); 489 | assert_eq!(rng.next_u32(), 0xc970ae1a); 490 | assert_eq!(rng.next_u32(), 0xeb8f9a19); 491 | assert_eq!(rng.next_u32(), 0xcdfd8d76); 492 | assert_eq!(rng.next_u32(), 0x0fc73c37); 493 | assert_eq!(rng.next_u32(), 0xf4b327fe); 494 | assert_eq!(rng.next_u32(), 0x3eff9556); 495 | assert_eq!(rng.next_u32(), 0xd5af05dd); 496 | assert_eq!(rng.next_u32(), 0x91420c9d); 497 | assert_eq!(rng.next_u32(), 0xc3a506eb); 498 | assert_eq!(rng.next_u32(), 0x0d6bfe8c); 499 | assert_eq!(rng.next_u32(), 0x7023920e); 500 | assert_eq!(rng.next_u32(), 0x8f83c4a1); 501 | assert_eq!(rng.next_u32(), 0x48db1bb7); 502 | assert_eq!(rng.next_u32(), 0x6b87b840); 503 | assert_eq!(rng.next_u32(), 0xed1ef4c2); 504 | assert_eq!(rng.next_u32(), 
0x34956d42); 505 | assert_eq!(rng.next_u32(), 0x58d35758); 506 | assert_eq!(rng.next_u32(), 0x431154fc); 507 | assert_eq!(rng.next_u32(), 0x497cabf3); 508 | assert_eq!(rng.next_u32(), 0x3e0b2df3); 509 | assert_eq!(rng.next_u32(), 0x8eef32a2); 510 | assert_eq!(rng.next_u32(), 0xf090e5ea); 511 | assert_eq!(rng.next_u32(), 0xd88b5749); 512 | assert_eq!(rng.next_u32(), 0x70029a8b); 513 | assert_eq!(rng.next_u32(), 0x4e243705); 514 | assert_eq!(rng.next_u32(), 0xbb6342f5); 515 | assert_eq!(rng.next_u32(), 0x78fcec2c); 516 | assert_eq!(rng.next_u32(), 0xa970692f); 517 | assert_eq!(rng.next_u32(), 0xc651a582); 518 | assert_eq!(rng.next_u32(), 0x1816afe3); 519 | assert_eq!(rng.next_u32(), 0x352ee4ad); 520 | assert_eq!(rng.next_u32(), 0x612f55db); 521 | assert_eq!(rng.next_u32(), 0x463cb745); 522 | assert_eq!(rng.next_u32(), 0x1c3de851); 523 | assert_eq!(rng.next_u32(), 0x811ef082); 524 | assert_eq!(rng.next_u32(), 0xc101da7e); 525 | assert_eq!(rng.next_u32(), 0x026ff374); 526 | assert_eq!(rng.next_u32(), 0x992d58fc); 527 | assert_eq!(rng.next_u32(), 0xa0660379); 528 | assert_eq!(rng.next_u32(), 0x04c4fa59); 529 | assert_eq!(rng.next_u32(), 0x6f7e6167); 530 | assert_eq!(rng.next_u32(), 0x685da798); 531 | } 532 | 533 | #[test] 534 | fn randen_rng_fill_bytes_test_vectors() { 535 | // The expected values were generated from the reference C++ 536 | // implementation using the following program: 537 | // 538 | // int main(int, char**) { 539 | // randen::Randen rng; 540 | // std::uint8_t seq_1[37] = {0}; 541 | // std::uint8_t seq_2[151] = {0}; 542 | // std::uint8_t seq_3[233] = {0}; 543 | // for (std::uint8_t& x : seq_1) x = rng(); 544 | // for (std::uint8_t& x : seq_2) x = rng(); 545 | // for (std::uint8_t& x : seq_3) x = rng(); 546 | // std::cout << " assert_eq!(seq_1[36], " 547 | // << static_cast(seq_1[36]) << ");\n"; 548 | // std::cout << " assert_eq!(seq_2[150], " 549 | // << static_cast(seq_2[150]) << ");\n"; 550 | // std::cout << " assert_eq!(seq_3[232], " 551 | // << 
static_cast(seq_3[232]) << ");\n"; 552 | // std::cout << std::endl; 553 | // return 0; 554 | // } 555 | 556 | let mut seq_1 = [0_u8; 37]; 557 | let mut seq_2 = [0_u8; 151]; 558 | let mut seq_3 = [0_u8; 233]; 559 | let mut rng = RandenRng::new_unseeded(); 560 | rng.fill_bytes(&mut seq_1); 561 | rng.fill_bytes(&mut seq_2); 562 | rng.fill_bytes(&mut seq_3); 563 | assert_eq!(seq_1[36], 186); 564 | assert_eq!(seq_2[150], 112); 565 | assert_eq!(seq_3[232], 24); 566 | } 567 | } 568 | -------------------------------------------------------------------------------- /engine_chacha.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_CHACHA_H_ 16 | #define ENGINE_CHACHA_H_ 17 | #if defined(__SSE2__) && defined(__AES__) 18 | 19 | #include 20 | #include 21 | #include "tmmintrin.h" 22 | 23 | namespace randen { 24 | 25 | // Modified from https://gist.github.com/orlp/32f5d1b631ab092608b1: 26 | /* 27 | Copyright (c) 2015 Orson Peters 28 | 29 | This software is provided 'as-is', without any express or implied warranty. 30 | In no event will the authors be held liable for any damages arising from the 31 | use of this software. 
32 | 33 | Permission is granted to anyone to use this software for any purpose, 34 | including commercial applications, and to alter it and redistribute it 35 | freely, subject to the following restrictions: 36 | 37 | 1. The origin of this software must not be misrepresented; you must not 38 | claim that you wrote the original software. If you use this software in a 39 | product, an acknowledgment in the product documentation would be appreciated 40 | but is not required. 41 | 42 | 2. Altered source versions must be plainly marked as such, and must not be 43 | misrepresented as being the original software. 44 | 45 | 3. This notice may not be removed or altered from any source distribution. 46 | */ 47 | 48 | template 49 | class ChaCha { 50 | public: 51 | static constexpr size_t R = 8; 52 | typedef T result_type; 53 | 54 | static constexpr result_type min() { 55 | return std::numeric_limits::min(); 56 | } 57 | static constexpr result_type max() { 58 | return std::numeric_limits::max(); 59 | } 60 | 61 | explicit ChaCha(uint64_t seedval, uint64_t stream = 0) { 62 | seed(seedval, stream); 63 | } 64 | template 65 | explicit ChaCha(Sseq& seq) { 66 | seed(seq); 67 | } 68 | 69 | void seed(uint64_t seedval, uint64_t stream = 0) { 70 | ctr = 0; 71 | keysetup[0] = seedval & 0xffffffffu; 72 | keysetup[1] = seedval >> 32; 73 | keysetup[2] = keysetup[3] = 0xdeadbeef; // Could use 128-bit seed. 74 | keysetup[4] = stream & 0xffffffffu; 75 | keysetup[5] = stream >> 32; 76 | keysetup[6] = keysetup[7] = 0xdeadbeef; // Could use 128-bit stream. 
77 | } 78 | 79 | template 80 | void seed(Sseq& seq) { 81 | ctr = 0; 82 | seq.generate(keysetup, keysetup + 8); 83 | } 84 | 85 | result_type operator()() { 86 | int idx = ctr % 16; 87 | if (idx == 0) generate_block(); 88 | 89 | result_type ret; 90 | memcpy(&ret, block + idx, sizeof(ret)); 91 | ctr += sizeof(ret) / sizeof(uint32_t); 92 | 93 | return ret; 94 | } 95 | 96 | private: 97 | void generate_block() { 98 | uint32_t constants[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; 99 | 100 | uint32_t input[16]; 101 | for (int i = 0; i < 4; ++i) input[i] = constants[i]; 102 | for (int i = 0; i < 8; ++i) input[4 + i] = keysetup[i]; 103 | input[12] = (ctr / 16) & 0xffffffffu; 104 | input[13] = (ctr / 16) >> 32; 105 | input[14] = input[15] = 0xdeadbeef; // Could use 128-bit counter. 106 | 107 | for (int i = 0; i < 16; ++i) block[i] = input[i]; 108 | chacha_core(); 109 | for (int i = 0; i < 16; ++i) block[i] += input[i]; 110 | } 111 | 112 | // Get an efficient _mm_roti_epi32 based on enabled features. 113 | #define _mm_roti_epi32(r, c) \ 114 | (((c) == 8) \ 115 | ? _mm_shuffle_epi8((r), _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, \ 116 | 5, 4, 7, 2, 1, 0, 3)) \ 117 | : ((c) == 16) \ 118 | ? _mm_shuffle_epi8((r), _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, \ 119 | 10, 5, 4, 7, 6, 1, 0, 3, 2)) \ 120 | : ((c) == 24) ? _mm_shuffle_epi8( \ 121 | (r), _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, \ 122 | 9, 4, 7, 6, 5, 0, 3, 2, 1)) \ 123 | : _mm_xor_si128(_mm_slli_epi32((r), (c)), \ 124 | _mm_srli_epi32((r), 32 - (c)))) 125 | 126 | void chacha_core() { 127 | // ROTVn rotates the elements in the given vector n places to the left. 
128 | #define CHACHA_ROTV1(x) _mm_shuffle_epi32((__m128i)x, 0x39) 129 | #define CHACHA_ROTV2(x) _mm_shuffle_epi32((__m128i)x, 0x4e) 130 | #define CHACHA_ROTV3(x) _mm_shuffle_epi32((__m128i)x, 0x93) 131 | 132 | __m128i a = _mm_load_si128((__m128i*)(block)); 133 | __m128i b = _mm_load_si128((__m128i*)(block + 4)); 134 | __m128i c = _mm_load_si128((__m128i*)(block + 8)); 135 | __m128i d = _mm_load_si128((__m128i*)(block + 12)); 136 | 137 | for (int i = 0; i < R; i += 2) { 138 | a = _mm_add_epi32(a, b); 139 | d = _mm_xor_si128(d, a); 140 | d = _mm_roti_epi32(d, 16); 141 | c = _mm_add_epi32(c, d); 142 | b = _mm_xor_si128(b, c); 143 | b = _mm_roti_epi32(b, 12); 144 | a = _mm_add_epi32(a, b); 145 | d = _mm_xor_si128(d, a); 146 | d = _mm_roti_epi32(d, 8); 147 | c = _mm_add_epi32(c, d); 148 | b = _mm_xor_si128(b, c); 149 | b = _mm_roti_epi32(b, 7); 150 | 151 | b = CHACHA_ROTV1(b); 152 | c = CHACHA_ROTV2(c); 153 | d = CHACHA_ROTV3(d); 154 | 155 | a = _mm_add_epi32(a, b); 156 | d = _mm_xor_si128(d, a); 157 | d = _mm_roti_epi32(d, 16); 158 | c = _mm_add_epi32(c, d); 159 | b = _mm_xor_si128(b, c); 160 | b = _mm_roti_epi32(b, 12); 161 | a = _mm_add_epi32(a, b); 162 | d = _mm_xor_si128(d, a); 163 | d = _mm_roti_epi32(d, 8); 164 | c = _mm_add_epi32(c, d); 165 | b = _mm_xor_si128(b, c); 166 | b = _mm_roti_epi32(b, 7); 167 | 168 | b = CHACHA_ROTV3(b); 169 | c = CHACHA_ROTV2(c); 170 | d = CHACHA_ROTV1(d); 171 | } 172 | 173 | _mm_store_si128((__m128i*)(block), a); 174 | _mm_store_si128((__m128i*)(block + 4), b); 175 | _mm_store_si128((__m128i*)(block + 8), c); 176 | _mm_store_si128((__m128i*)(block + 12), d); 177 | 178 | #undef CHACHA_ROTV3 179 | #undef CHACHA_ROTV2 180 | #undef CHACHA_ROTV1 181 | } 182 | 183 | alignas(16) uint32_t block[16]; 184 | uint32_t keysetup[8]; 185 | uint64_t ctr; 186 | }; 187 | 188 | } // namespace randen 189 | 190 | #endif // defined(__SSE2__) && defined(__AES__) 191 | #endif // ENGINE_CHACHA_H_ 192 | 
-------------------------------------------------------------------------------- /engine_os.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_OS_H_ 16 | #define ENGINE_OS_H_ 17 | 18 | #ifdef _WIN64 19 | #define NOMINMAX 20 | #include 21 | // Must come after windows.h; this comment ensures that. 22 | #include 23 | #pragma comment(lib, "bcrypt") 24 | #endif 25 | 26 | #include "util.h" 27 | 28 | namespace randen { 29 | 30 | // Buffered, uses OS CSPRNG. 31 | template 32 | class alignas(32) EngineOS { 33 | public: 34 | // C++11 URBG interface: 35 | using result_type = T; 36 | static constexpr T min() { return T(0); } 37 | static constexpr T max() { return ~T(0); } 38 | 39 | EngineOS() { 40 | // The first call to operator() will trigger a refill. 41 | next_ = kStateT; 42 | 43 | #ifdef _WIN32 44 | RANDEN_CHECK(0 == BCryptOpenAlgorithmProvider( 45 | &provider_, BCRYPT_RNG_ALGORITHM, nullptr, 0)); 46 | #else 47 | dev_ = fopen("/dev/urandom", "r"); 48 | RANDEN_CHECK(dev_ != nullptr); 49 | #endif 50 | } 51 | 52 | ~EngineOS() { 53 | #ifdef _WIN32 54 | RANDEN_CHECK(0 == BCryptCloseAlgorithmProvider(provider_, 0)); 55 | #else 56 | RANDEN_CHECK(fclose(dev_) == 0); 57 | #endif 58 | } 59 | 60 | // Returns random bits from the buffer in units of T. 
61 | T operator()() { 62 | // (Local copy ensures compiler knows this is not aliased.) 63 | size_t next = next_; 64 | 65 | // Refill the buffer if needed (unlikely). 66 | if (next >= kStateT) { 67 | #ifdef _WIN32 68 | RANDEN_CHECK(0 == BCryptGenRandom(provider_, 69 | reinterpret_cast(&state_[0]), 70 | sizeof(state_), 0)); 71 | #else 72 | const size_t bytes_read = fread(&state_[0], 1, sizeof(state_), dev_); 73 | RANDEN_CHECK(bytes_read == sizeof(state_)); 74 | #endif 75 | next = 0; 76 | } 77 | 78 | const T ret = state_[next]; 79 | next_ = next + 1; 80 | return ret; 81 | } 82 | 83 | private: 84 | static constexpr size_t kStateT = 256 / sizeof(T); // same as Randen 85 | 86 | alignas(32) T state_[kStateT]; 87 | size_t next_; // index within state_ 88 | #ifdef _WIN32 89 | BCRYPT_ALG_HANDLE provider_; 90 | #else 91 | FILE* dev_; 92 | #endif 93 | }; 94 | 95 | } // namespace randen 96 | 97 | #endif // ENGINE_OS_H_ 98 | -------------------------------------------------------------------------------- /nanobenchmark.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "nanobenchmark.h" 16 | #include "randen.h" 17 | 18 | #include 19 | #include 20 | #include // abort 21 | #include // memcpy 22 | #include // clock_gettime 23 | #include // sort 24 | #include 25 | #include 26 | #include // iota 27 | #include 28 | #include 29 | 30 | // Architecture 31 | #if defined(__x86_64__) || defined(_M_X64) 32 | #define NB_ARCH_X86 33 | #if defined(_MSC_VER) 34 | #include 35 | #else 36 | #include // NOLINT 37 | #endif 38 | #elif defined(__powerpc64__) || defined(_M_PPC) 39 | #define NB_ARCH_PPC 40 | #include // NOLINT __ppc_get_timebase_freq 41 | #elif defined(__aarch64__) || defined(__arm__) 42 | #define NB_ARCH_ARM 43 | #else 44 | #error "Please add support for this architecture" 45 | #endif 46 | 47 | // OS 48 | #if defined(_WIN32) || defined(_WIN64) 49 | #define NB_OS_WIN 50 | #define NOMINMAX 51 | #include // NOLINT 52 | #elif defined(__linux__) 53 | #define NB_OS_LINUX 54 | #include // NOLINT 55 | #else 56 | #error "Please add support for this OS" 57 | #endif 58 | 59 | namespace randen { 60 | namespace platform { 61 | namespace { 62 | 63 | // Enables sanity checks that verify correct operation at the cost of 64 | // longer benchmark runs. 
65 | #ifndef NANOBENCHMARK_ENABLE_CHECKS 66 | #define NANOBENCHMARK_ENABLE_CHECKS 0 67 | #endif 68 | 69 | #define NANOBENCHMARK_CHECK_ALWAYS(condition) \ 70 | while (!(condition)) { \ 71 | fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \ 72 | abort(); \ 73 | } 74 | 75 | #if NANOBENCHMARK_ENABLE_CHECKS 76 | #define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition) 77 | #else 78 | #define NANOBENCHMARK_CHECK(condition) 79 | #endif 80 | 81 | // Compiler-specific 82 | #ifdef _MSC_VER 83 | #define NB_RESTRICT __restrict 84 | #define NB_INLINE __forceinline 85 | #define NB_NOINLINE __declspec(noinline) 86 | 87 | #elif defined(__GNUC__) || defined(__clang__) 88 | #define NB_RESTRICT __restrict__ 89 | #define NB_INLINE inline __attribute__((always_inline)) 90 | #define NB_NOINLINE inline __attribute__((noinline)) 91 | 92 | #else 93 | #error "Unsupported compiler" 94 | #endif 95 | 96 | #ifdef NB_ARCH_X86 97 | 98 | void Cpuid(const uint32_t level, const uint32_t count, 99 | uint32_t* NB_RESTRICT abcd) { 100 | #ifdef _MSC_VER 101 | int regs[4]; 102 | __cpuidex(regs, level, count); 103 | for (int i = 0; i < 4; ++i) { 104 | abcd[i] = regs[i]; 105 | } 106 | #else 107 | uint32_t a, b, c, d; 108 | __cpuid_count(level, count, a, b, c, d); 109 | abcd[0] = a; 110 | abcd[1] = b; 111 | abcd[2] = c; 112 | abcd[3] = d; 113 | #endif 114 | } 115 | 116 | std::string BrandString() { 117 | char brand_string[49]; 118 | uint32_t abcd[4]; 119 | 120 | // Check if brand string is supported (it is on all reasonable Intel/AMD) 121 | Cpuid(0x80000000U, 0, abcd); 122 | if (abcd[0] < 0x80000004U) { 123 | return std::string(); 124 | } 125 | 126 | for (int i = 0; i < 3; ++i) { 127 | Cpuid(0x80000002U + i, 0, abcd); 128 | memcpy(brand_string + i * 16, &abcd, sizeof(abcd)); 129 | } 130 | brand_string[48] = 0; 131 | return brand_string; 132 | } 133 | 134 | // Returns the frequency quoted inside the brand string. 
This does not 135 | // account for throttling nor Turbo Boost. 136 | double NominalClockRate() { 137 | const std::string& brand_string = BrandString(); 138 | // Brand strings include the maximum configured frequency. These prefixes are 139 | // defined by Intel CPUID documentation. 140 | const char* prefixes[3] = {"MHz", "GHz", "THz"}; 141 | const double multipliers[3] = {1E6, 1E9, 1E12}; 142 | for (size_t i = 0; i < 3; ++i) { 143 | const size_t pos_prefix = brand_string.find(prefixes[i]); 144 | if (pos_prefix != std::string::npos) { 145 | const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); 146 | if (pos_space != std::string::npos) { 147 | const std::string digits = 148 | brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); 149 | return std::stod(digits) * multipliers[i]; 150 | } 151 | } 152 | } 153 | 154 | return 0.0; 155 | } 156 | 157 | #endif // NB_ARCH_X86 158 | 159 | } // namespace 160 | 161 | void PinThreadToCPU(int cpu) { 162 | if (cpu < 0) { 163 | // We might migrate to another CPU before pinning below, but at least cpu 164 | // will be one of the CPUs on which this thread ran. 165 | #if defined(NB_OS_WIN) 166 | cpu = static_cast(GetCurrentProcessorNumber()); 167 | #elif defined(NB_OS_LINUX) 168 | cpu = sched_getcpu(); 169 | #else 170 | #error "Please add support for this OS" 171 | #endif 172 | NANOBENCHMARK_CHECK_ALWAYS(cpu >= 0); 173 | } 174 | 175 | #if defined(NB_OS_WIN) 176 | const HANDLE hThread = GetCurrentThread(); 177 | const DWORD_PTR prev = SetThreadAffinityMask(hThread, 1ULL << cpu); 178 | NANOBENCHMARK_CHECK_ALWAYS(prev != 0); 179 | #elif defined(NB_OS_LINUX) 180 | const pid_t pid = 0; // current thread 181 | cpu_set_t set; 182 | CPU_ZERO(&set); 183 | CPU_SET(cpu, &set); 184 | const int err = sched_setaffinity(pid, sizeof(set), &set); 185 | NANOBENCHMARK_CHECK_ALWAYS(err == 0); 186 | #else 187 | #error "Please add support for this OS" 188 | #endif 189 | } 190 | 191 | // Returns tick rate. 
Invariant means the tick counter frequency is independent 192 | // of CPU throttling or sleep. May be expensive, caller should cache the result. 193 | double InvariantTicksPerSecond() { 194 | #if defined(NB_ARCH_PPC) 195 | return __ppc_get_timebase_freq(); 196 | #elif defined(NB_ARCH_X86) 197 | // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. 198 | return NominalClockRate(); 199 | #else 200 | // Fall back to clock_gettime nanoseconds. 201 | return 1E9; 202 | #endif 203 | } 204 | 205 | } // namespace platform 206 | namespace { 207 | 208 | // Prevents the compiler from eliding the computations that led to "output". 209 | template 210 | inline void PreventElision(T&& output) { 211 | #ifndef _MSC_VER 212 | // Works by indicating to the compiler that "output" is being read and 213 | // modified. The +r constraint avoids unnecessary writes to memory, but only 214 | // works for built-in types (typically FuncOutput). 215 | asm volatile("" : "+r"(output) : : "memory"); 216 | #else 217 | // MSVC does not support inline assembly anymore (and never supported GCC's 218 | // RTL constraints). Self-assignment with #pragma optimize("off") might be 219 | // expected to prevent elision, but it does not with MSVC 2015. Type-punning 220 | // with volatile pointers generates inefficient code on MSVC 2017. 221 | static std::atomic dummy(T{}); 222 | dummy.store(output, std::memory_order_relaxed); 223 | #endif 224 | } 225 | 226 | namespace timer { 227 | 228 | // Start/Stop return absolute timestamps and must be placed immediately before 229 | // and after the region to measure. We provide separate Start/Stop functions 230 | // because they use different fences. 231 | // 232 | // Background: RDTSC is not 'serializing'; earlier instructions may complete 233 | // after it, and/or later instructions may complete before it. 'Fences' ensure 234 | // regions' elapsed times are independent of such reordering. 
The only 235 | // documented unprivileged serializing instruction is CPUID, which acts as a 236 | // full fence (no reordering across it in either direction). Unfortunately 237 | // the latency of CPUID varies wildly (perhaps made worse by not initializing 238 | // its EAX input). Because it cannot reliably be deducted from the region's 239 | // elapsed time, it must not be included in the region to measure (i.e. 240 | // between the two RDTSC). 241 | // 242 | // The newer RDTSCP is sometimes described as serializing, but it actually 243 | // only serves as a half-fence with release semantics. Although all 244 | // instructions in the region will complete before the final timestamp is 245 | // captured, subsequent instructions may leak into the region and increase the 246 | // elapsed time. Inserting another fence after the final RDTSCP would prevent 247 | // such reordering without affecting the measured region. 248 | // 249 | // Fortunately, such a fence exists. The LFENCE instruction is only documented 250 | // to delay later loads until earlier loads are visible. However, Intel's 251 | // reference manual says it acts as a full fence (waiting until all earlier 252 | // instructions have completed, and delaying later instructions until it 253 | // completes). AMD assigns the same behavior to MFENCE. 254 | // 255 | // We need a fence before the initial RDTSC to prevent earlier instructions 256 | // from leaking into the region, and arguably another after RDTSC to avoid 257 | // region instructions from completing before the timestamp is recorded. 258 | // When surrounded by fences, the additional RDTSCP half-fence provides no 259 | // benefit, so the initial timestamp can be recorded via RDTSC, which has 260 | // lower overhead than RDTSCP because it does not read TSC_AUX. In summary, 261 | // we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE. 262 | // 263 | // Using Start+Start leads to higher variance and overhead than Stop+Stop. 
264 | // However, Stop+Stop includes an LFENCE in the region measurements, which 265 | // adds a delay dependent on earlier loads. The combination of Start+Stop 266 | // is faster than Start+Start and more consistent than Stop+Stop because 267 | // the first LFENCE already delayed subsequent loads before the measured 268 | // region. This combination seems not to have been considered in prior work: 269 | // http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c 270 | // 271 | // Note: performance counters can measure 'exact' instructions-retired or 272 | // (unhalted) cycle counts. The RDPMC instruction is not serializing and also 273 | // requires fences. Unfortunately, it is not accessible on all OSes and we 274 | // prefer to avoid kernel-mode drivers. Performance counters are also affected 275 | // by several under/over-count errata, so we use the TSC instead. 276 | 277 | // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, 278 | // divide by InvariantTicksPerSecond. 279 | inline uint64_t Start64() { 280 | uint64_t t; 281 | #if defined(NB_ARCH_PPC) 282 | asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); 283 | #elif defined(NB_ARCH_X86) 284 | #if defined(_MSC_VER) 285 | _ReadWriteBarrier(); 286 | _mm_lfence(); 287 | _ReadWriteBarrier(); 288 | t = __rdtsc(); 289 | _ReadWriteBarrier(); 290 | _mm_lfence(); 291 | _ReadWriteBarrier(); 292 | #else 293 | asm volatile( 294 | "lfence\n\t" 295 | "rdtsc\n\t" 296 | "shl $32, %%rdx\n\t" 297 | "or %%rdx, %0\n\t" 298 | "lfence" 299 | : "=a"(t) 300 | : 301 | // "memory" avoids reordering. rdx = TSC >> 32. 302 | // "cc" = flags modified by SHL. 303 | : "rdx", "memory", "cc"); 304 | #endif 305 | #else 306 | // Fall back to OS - unsure how to reliably query cntvct_el0 frequency. 
307 | timespec ts; 308 | clock_gettime(CLOCK_MONOTONIC, &ts); 309 | t = ts.tv_sec * 1000000000LL + ts.tv_nsec; 310 | #endif 311 | return t; 312 | } 313 | 314 | inline uint64_t Stop64() { 315 | uint64_t t; 316 | #if defined(NB_ARCH_PPC) 317 | asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); 318 | #elif defined(NB_ARCH_X86) 319 | #if defined(_MSC_VER) 320 | _ReadWriteBarrier(); 321 | unsigned aux; 322 | t = __rdtscp(&aux); 323 | _ReadWriteBarrier(); 324 | _mm_lfence(); 325 | _ReadWriteBarrier(); 326 | #else 327 | // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). 328 | asm volatile( 329 | "rdtscp\n\t" 330 | "shl $32, %%rdx\n\t" 331 | "or %%rdx, %0\n\t" 332 | "lfence" 333 | : "=a"(t) 334 | : 335 | // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. 336 | // "cc" = flags modified by SHL. 337 | : "rcx", "rdx", "memory", "cc"); 338 | #endif 339 | #else 340 | t = Start64(); 341 | #endif 342 | return t; 343 | } 344 | 345 | // Returns a 32-bit timestamp with about 4 cycles less overhead than 346 | // Start64. Only suitable for measuring very short regions because the 347 | // timestamp overflows about once a second. 348 | inline uint32_t Start32() { 349 | uint32_t t; 350 | #if defined(NB_ARCH_X86) 351 | #if defined(_MSC_VER) 352 | _ReadWriteBarrier(); 353 | _mm_lfence(); 354 | _ReadWriteBarrier(); 355 | t = static_cast(__rdtsc()); 356 | _ReadWriteBarrier(); 357 | _mm_lfence(); 358 | _ReadWriteBarrier(); 359 | #else 360 | asm volatile( 361 | "lfence\n\t" 362 | "rdtsc\n\t" 363 | "lfence" 364 | : "=a"(t) 365 | : 366 | // "memory" avoids reordering. rdx = TSC >> 32. 
367 | : "rdx", "memory"); 368 | #endif 369 | #else 370 | t = static_cast(Start64()); 371 | #endif 372 | return t; 373 | } 374 | 375 | inline uint32_t Stop32() { 376 | uint32_t t; 377 | #if defined(NB_ARCH_X86) 378 | #if defined(_MSC_VER) 379 | _ReadWriteBarrier(); 380 | unsigned aux; 381 | t = static_cast(__rdtscp(&aux)); 382 | _ReadWriteBarrier(); 383 | _mm_lfence(); 384 | _ReadWriteBarrier(); 385 | #else 386 | // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). 387 | asm volatile( 388 | "rdtscp\n\t" 389 | "lfence" 390 | : "=a"(t) 391 | : 392 | // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. 393 | : "rcx", "rdx", "memory"); 394 | #endif 395 | #else 396 | t = static_cast(Stop64()); 397 | #endif 398 | return t; 399 | } 400 | 401 | } // namespace timer 402 | 403 | namespace robust_statistics { 404 | 405 | // Sorts integral values in ascending order (e.g. for Mode). About 3x faster 406 | // than std::sort for input distributions with very few unique values. 407 | template 408 | void CountingSort(T* values, size_t num_values) { 409 | // Unique values and their frequency (similar to flat_map). 410 | using Unique = std::pair; 411 | std::vector unique; 412 | for (size_t i = 0; i < num_values; ++i) { 413 | const T value = values[i]; 414 | const auto pos = 415 | std::find_if(unique.begin(), unique.end(), 416 | [value](const Unique u) { return u.first == value; }); 417 | if (pos == unique.end()) { 418 | unique.push_back(std::make_pair(value, 1)); 419 | } else { 420 | ++pos->second; 421 | } 422 | } 423 | 424 | // Sort in ascending order of value (pair.first). 425 | std::sort(unique.begin(), unique.end()); 426 | 427 | // Write that many copies of each unique value to the array. 
428 | T* NB_RESTRICT p = values; 429 | for (const auto& value_count : unique) { 430 | std::fill(p, p + value_count.second, value_count.first); 431 | p += value_count.second; 432 | } 433 | NANOBENCHMARK_CHECK(p == values + num_values); 434 | } 435 | 436 | // @return i in [idx_begin, idx_begin + half_count) that minimizes 437 | // sorted[i + half_count] - sorted[i]. 438 | template 439 | size_t MinRange(const T* const NB_RESTRICT sorted, const size_t idx_begin, 440 | const size_t half_count) { 441 | T min_range = std::numeric_limits::max(); 442 | size_t min_idx = 0; 443 | 444 | for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) { 445 | NANOBENCHMARK_CHECK(sorted[idx] <= sorted[idx + half_count]); 446 | const T range = sorted[idx + half_count] - sorted[idx]; 447 | if (range < min_range) { 448 | min_range = range; 449 | min_idx = idx; 450 | } 451 | } 452 | 453 | return min_idx; 454 | } 455 | 456 | // Returns an estimate of the mode by calling MinRange on successively 457 | // halved intervals. "sorted" must be in ascending order. This is the 458 | // Half Sample Mode estimator proposed by Bickel in "On a fast, robust 459 | // estimator of the mode", with complexity O(N log N). The mode is less 460 | // affected by outliers in highly-skewed distributions than the median. 461 | // The averaging operation below assumes "T" is an unsigned integer type. 462 | template 463 | T ModeOfSorted(const T* const NB_RESTRICT sorted, const size_t num_values) { 464 | size_t idx_begin = 0; 465 | size_t half_count = num_values / 2; 466 | while (half_count > 1) { 467 | idx_begin = MinRange(sorted, idx_begin, half_count); 468 | half_count >>= 1; 469 | } 470 | 471 | const T x = sorted[idx_begin + 0]; 472 | if (half_count == 0) { 473 | return x; 474 | } 475 | NANOBENCHMARK_CHECK(half_count == 1); 476 | const T average = (x + sorted[idx_begin + 1] + 1) / 2; 477 | return average; 478 | } 479 | 480 | // Returns the mode. Side effect: sorts "values". 
481 | template 482 | T Mode(T* values, const size_t num_values) { 483 | CountingSort(values, num_values); 484 | return ModeOfSorted(values, num_values); 485 | } 486 | 487 | template 488 | T Mode(T (&values)[N]) { 489 | return Mode(&values[0], N); 490 | } 491 | 492 | // Returns the median value. Side effect: sorts "values". 493 | template 494 | T Median(T* values, const size_t num_values) { 495 | NANOBENCHMARK_CHECK(!values->empty()); 496 | std::sort(values, values + num_values); 497 | const size_t half = num_values / 2; 498 | // Odd count: return middle 499 | if (num_values % 2) { 500 | return values[half]; 501 | } 502 | // Even count: return average of middle two. 503 | return (values[half] + values[half - 1] + 1) / 2; 504 | } 505 | 506 | // Returns a robust measure of variability. 507 | template 508 | T MedianAbsoluteDeviation(const T* values, const size_t num_values, 509 | const T median) { 510 | NANOBENCHMARK_CHECK(num_values != 0); 511 | std::vector abs_deviations; 512 | abs_deviations.reserve(num_values); 513 | for (size_t i = 0; i < num_values; ++i) { 514 | const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median)); 515 | abs_deviations.push_back(static_cast(abs)); 516 | } 517 | return Median(abs_deviations.data(), num_values); 518 | } 519 | 520 | } // namespace robust_statistics 521 | 522 | // Ticks := platform-specific timer values (CPU cycles on x86). Must be 523 | // unsigned to guarantee wraparound on overflow. 32 bit timers are faster to 524 | // read than 64 bit. 525 | using Ticks = uint32_t; 526 | 527 | // Returns timer overhead / minimum measurable difference. 528 | Ticks TimerResolution() { 529 | // Nested loop avoids exceeding stack/L1 capacity. 
530 | Ticks repetitions[Params::kTimerSamples]; 531 | for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) { 532 | Ticks samples[Params::kTimerSamples]; 533 | for (size_t i = 0; i < Params::kTimerSamples; ++i) { 534 | const Ticks t0 = timer::Start32(); 535 | const Ticks t1 = timer::Stop32(); 536 | samples[i] = t1 - t0; 537 | } 538 | repetitions[rep] = robust_statistics::Mode(samples); 539 | } 540 | return robust_statistics::Mode(repetitions); 541 | } 542 | 543 | static const Ticks timer_resolution = TimerResolution(); 544 | 545 | // Estimates the expected value of "lambda" values with a variable number of 546 | // samples until the variability "rel_mad" is less than "max_rel_mad". 547 | template 548 | Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, 549 | const Params& p, const Lambda& lambda) { 550 | // Choose initial samples_per_eval based on a single estimated duration. 551 | Ticks t0 = timer::Start32(); 552 | lambda(); 553 | Ticks t1 = timer::Stop32(); 554 | Ticks est = t1 - t0; 555 | static const double ticks_per_second = platform::InvariantTicksPerSecond(); 556 | const size_t ticks_per_eval = 557 | static_cast(ticks_per_second * p.seconds_per_eval); 558 | size_t samples_per_eval = ticks_per_eval / est; 559 | samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval); 560 | 561 | std::vector samples; 562 | samples.reserve(1 + samples_per_eval); 563 | samples.push_back(est); 564 | 565 | // Percentage is too strict for tiny differences, so also allow a small 566 | // absolute "median absolute deviation". 
567 | const Ticks max_abs_mad = (timer_resolution + 99) / 100; 568 | *rel_mad = 0.0; // ensure initialized 569 | 570 | for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) { 571 | samples.reserve(samples.size() + samples_per_eval); 572 | for (size_t i = 0; i < samples_per_eval; ++i) { 573 | t0 = timer::Start32(); 574 | lambda(); 575 | t1 = timer::Stop32(); 576 | samples.push_back(t1 - t0); 577 | } 578 | 579 | if (samples.size() >= p.min_mode_samples) { 580 | est = robust_statistics::Mode(samples.data(), samples.size()); 581 | } else { 582 | // For "few" (depends also on the variance) samples, Median is safer. 583 | est = robust_statistics::Median(samples.data(), samples.size()); 584 | } 585 | NANOBENCHMARK_CHECK(est != 0); 586 | 587 | // Median absolute deviation (mad) is a robust measure of 'variability'. 588 | const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( 589 | samples.data(), samples.size(), est); 590 | *rel_mad = static_cast(int(abs_mad)) / est; 591 | 592 | if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) { 593 | if (p.verbose) { 594 | printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n", 595 | samples.size(), est, abs_mad, *rel_mad * 100.0); 596 | } 597 | return est; 598 | } 599 | } 600 | 601 | if (p.verbose) { 602 | printf( 603 | "WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n", 604 | *rel_mad * 100.0, max_rel_mad * 100.0, samples.size()); 605 | } 606 | return est; 607 | } 608 | 609 | using InputVec = std::vector; 610 | 611 | // Returns vector of unique input values. 612 | InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) { 613 | InputVec unique(inputs, inputs + num_inputs); 614 | std::sort(unique.begin(), unique.end()); 615 | unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); 616 | return unique; 617 | } 618 | 619 | // Returns how often we need to call func for sufficient precision, or zero 620 | // on failure (e.g. 
the elapsed time is too long for a 32-bit tick count). 621 | size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, 622 | const Params& p) { 623 | // Min elapsed ticks for any input. 624 | Ticks min_duration = ~0u; 625 | 626 | for (const FuncInput input : unique) { 627 | // Make sure a 32-bit timer is sufficient. 628 | const uint64_t t0 = timer::Start64(); 629 | PreventElision(func(arg, input)); 630 | const uint64_t t1 = timer::Stop64(); 631 | const uint64_t elapsed = t1 - t0; 632 | if (elapsed >= (1ULL << 30)) { 633 | fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n", 634 | input); 635 | return 0; 636 | } 637 | 638 | double rel_mad; 639 | const Ticks total = SampleUntilStable( 640 | p.target_rel_mad, &rel_mad, p, 641 | [func, arg, input]() { PreventElision(func(arg, input)); }); 642 | min_duration = std::min(min_duration, total - timer_resolution); 643 | } 644 | 645 | // Number of repetitions required to reach the target resolution. 646 | const size_t max_skip = p.precision_divisor; 647 | // Number of repetitions given the estimated duration. 648 | const size_t num_skip = 649 | min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration; 650 | if (p.verbose) { 651 | printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution, 652 | max_skip, min_duration, num_skip); 653 | } 654 | return num_skip; 655 | } 656 | 657 | // Replicates inputs until we can omit "num_skip" occurrences of an input. 
658 | InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs, 659 | const size_t num_unique, const size_t num_skip, 660 | const Params& p) { 661 | InputVec full; 662 | if (num_unique == 1) { 663 | full.assign(p.subset_ratio * num_skip, inputs[0]); 664 | return full; 665 | } 666 | 667 | full.reserve(p.subset_ratio * num_skip * num_inputs); 668 | for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) { 669 | full.insert(full.end(), inputs, inputs + num_inputs); 670 | } 671 | randen::Randen rng; 672 | std::shuffle(full.begin(), full.end(), rng); 673 | return full; 674 | } 675 | 676 | // Copies the "full" to "subset" in the same order, but with "num_skip" 677 | // randomly selected occurrences of "input_to_skip" removed. 678 | void FillSubset(const InputVec& full, const FuncInput input_to_skip, 679 | const size_t num_skip, InputVec* subset) { 680 | const size_t count = std::count(full.begin(), full.end(), input_to_skip); 681 | // Generate num_skip random indices: which occurrence to skip. 682 | std::vector omit(count); 683 | std::iota(omit.begin(), omit.end(), 0); 684 | // omit[] is the same on every call, but that's OK because they identify the 685 | // Nth instance of input_to_skip, so the position within full[] differs. 
686 | randen::Randen rng; 687 | std::shuffle(omit.begin(), omit.end(), rng); 688 | omit.resize(num_skip); 689 | std::sort(omit.begin(), omit.end()); 690 | 691 | uint32_t occurrence = ~0u; // 0 after preincrement 692 | size_t idx_omit = 0; // cursor within omit[] 693 | size_t idx_subset = 0; // cursor within *subset 694 | for (const FuncInput next : full) { 695 | if (next == input_to_skip) { 696 | ++occurrence; 697 | // Haven't removed enough already 698 | if (idx_omit < num_skip) { 699 | // This one is up for removal 700 | if (occurrence == omit[idx_omit]) { 701 | ++idx_omit; 702 | continue; 703 | } 704 | } 705 | } 706 | if (idx_subset < subset->size()) { 707 | (*subset)[idx_subset++] = next; 708 | } 709 | } 710 | NANOBENCHMARK_CHECK(idx_subset == subset->size()); 711 | NANOBENCHMARK_CHECK(idx_omit == omit.size()); 712 | NANOBENCHMARK_CHECK(occurrence == count - 1); 713 | } 714 | 715 | // Returns total ticks elapsed for all inputs. 716 | Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs, 717 | const Params& p, double* max_rel_mad) { 718 | double rel_mad; 719 | const Ticks duration = 720 | SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() { 721 | for (const FuncInput input : *inputs) { 722 | PreventElision(func(arg, input)); 723 | } 724 | }); 725 | *max_rel_mad = std::max(*max_rel_mad, rel_mad); 726 | return duration; 727 | } 728 | 729 | // (Nearly) empty Func for measuring timer overhead/resolution. 730 | NB_NOINLINE FuncOutput EmptyFunc(const void* arg, const FuncInput input) { 731 | return input; 732 | } 733 | 734 | // Returns overhead of accessing inputs[] and calling a function; this will 735 | // be deducted from future TotalDuration return values. 736 | Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) { 737 | double rel_mad; 738 | // Zero tolerance because repeatability is crucial and EmptyFunc is fast. 
739 | return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() { 740 | for (const FuncInput input : *inputs) { 741 | PreventElision(EmptyFunc(arg, input)); 742 | } 743 | }); 744 | } 745 | 746 | } // namespace 747 | 748 | size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, 749 | const size_t num_inputs, Result* results, const Params& p) { 750 | NANOBENCHMARK_CHECK(num_inputs != 0); 751 | const InputVec& unique = UniqueInputs(inputs, num_inputs); 752 | 753 | const size_t num_skip = NumSkip(func, arg, unique, p); // never 0 754 | if (num_skip == 0) return 0; // NumSkip already printed error message 755 | const float mul = 1.0f / static_cast(num_skip); 756 | 757 | const InputVec& full = 758 | ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p); 759 | InputVec subset(full.size() - num_skip); 760 | 761 | const Ticks overhead = Overhead(arg, &full, p); 762 | const Ticks overhead_skip = Overhead(arg, &subset, p); 763 | if (overhead < overhead_skip) { 764 | fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead, 765 | overhead_skip); 766 | return 0; 767 | } 768 | 769 | if (p.verbose) { 770 | printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(), 771 | overhead, overhead_skip); 772 | } 773 | 774 | double max_rel_mad = 0.0; 775 | const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); 776 | 777 | for (size_t i = 0; i < unique.size(); ++i) { 778 | FillSubset(full, unique[i], num_skip, &subset); 779 | const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad); 780 | 781 | if (total < total_skip) { 782 | fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip); 783 | return 0; 784 | } 785 | 786 | const Ticks duration = (total - overhead) - (total_skip - overhead_skip); 787 | results[i].input = unique[i]; 788 | results[i].ticks = duration * mul; 789 | results[i].variability = static_cast(max_rel_mad); 790 | } 791 | 792 | return unique.size(); 793 | } 794 | 795 | 
} // namespace randen 796 | -------------------------------------------------------------------------------- /nanobenchmark.h: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef NANOBENCHMARK_H_ 16 | #define NANOBENCHMARK_H_ 17 | 18 | // Benchmarks functions of a single integer argument with realistic branch 19 | // prediction hit rates. Uses a robust estimator to summarize the measurements. 20 | // The precision is about 0.2%. 21 | // 22 | // Examples: see nanobenchmark_test.cc. 23 | // 24 | // Background: Microbenchmarks such as http://github.com/google/benchmark 25 | // can measure elapsed times on the order of a microsecond. Shorter functions 26 | // are typically measured by repeating them thousands of times and dividing 27 | // the total elapsed time by this count. Unfortunately, repetition (especially 28 | // with the same input parameter!) influences the runtime. In time-critical 29 | // code, it is reasonable to expect warm instruction/data caches and TLBs, 30 | // but a perfect record of which branches will be taken is unrealistic. 
31 | // Unless the application also repeatedly invokes the measured function with 32 | // the same parameter, the benchmark is measuring something very different - 33 | // a best-case result, almost as if the parameter were made a compile-time 34 | // constant. This may lead to erroneous conclusions about branch-heavy 35 | // algorithms outperforming branch-free alternatives. 36 | // 37 | // Our approach differs in three ways. Adding fences to the timer functions 38 | // reduces variability due to instruction reordering, improving the timer 39 | // resolution to about 40 CPU cycles. However, shorter functions must still 40 | // be invoked repeatedly. For more realistic branch prediction performance, 41 | // we vary the input parameter according to a user-specified distribution. 42 | // Thus, instead of VaryInputs(Measure(Repeat(func))), we change the 43 | // loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the 44 | // central tendency of the measurement samples with the "half sample mode", 45 | // which is more robust to outliers and skewed data than the mean or median. 46 | 47 | // WARNING if included from multiple translation units compiled with distinct 48 | // flags: this header requires textual inclusion and a predefined NB_NAMESPACE 49 | // macro that is unique to the current compile flags. We must also avoid 50 | // standard library headers such as vector and functional that define functions. 51 | 52 | #include 53 | #include 54 | 55 | namespace randen { 56 | 57 | namespace platform { 58 | 59 | // Ensures the thread is running on the specified cpu, and no others. 60 | // Reduces caused by desynchronized socket RDTSC and context switches. 61 | // If "cpu" is negative, pin to the currently running core. 62 | void PinThreadToCPU(const int cpu = -1); 63 | 64 | // Returns tick rate, useful for converting measurements to seconds. Invariant 65 | // means the tick counter frequency is independent of CPU throttling or sleep. 
66 | // This call may be expensive, callers should cache the result. 67 | double InvariantTicksPerSecond(); 68 | 69 | } // namespace platform 70 | 71 | // Input influencing the function being measured (e.g. number of bytes to copy). 72 | using FuncInput = size_t; 73 | 74 | // "Proof of work" returned by Func to ensure the compiler does not elide it. 75 | using FuncOutput = uint64_t; 76 | 77 | // Function to measure: either 1) a captureless lambda or function with two 78 | // arguments or 2) a lambda with capture, in which case the first argument 79 | // is reserved for use by MeasureClosure. 80 | using Func = FuncOutput (*)(const void*, FuncInput); 81 | 82 | // Internal parameters that determine precision/resolution/measuring time. 83 | struct Params { 84 | // For measuring timer overhead/resolution. Used in a nested loop => 85 | // quadratic time, acceptable because we know timer overhead is "low". 86 | // constexpr because this is used to define array bounds. 87 | static constexpr size_t kTimerSamples = 256; 88 | 89 | // Best-case precision, expressed as a divisor of the timer resolution. 90 | // Larger => more calls to Func and higher precision. 91 | size_t precision_divisor = 1024; 92 | 93 | // Ratio between full and subset input distribution sizes. Cannot be less 94 | // than 2; larger values increase measurement time but more faithfully 95 | // model the given input distribution. 96 | size_t subset_ratio = 2; 97 | 98 | // Together with the estimated Func duration, determines how many times to 99 | // call Func before checking the sample variability. Larger values increase 100 | // measurement time, memory/cache use and precision. 101 | double seconds_per_eval = 4E-3; 102 | 103 | // The minimum number of samples before estimating the central tendency. 
104 | size_t min_samples_per_eval = 7; 105 | 106 | // The mode is better than median for estimating the central tendency of 107 | // skewed/fat-tailed distributions, but it requires sufficient samples 108 | // relative to the width of half-ranges. 109 | size_t min_mode_samples = 64; 110 | 111 | // Maximum permissible variability (= median absolute deviation / center). 112 | double target_rel_mad = 0.002; 113 | 114 | // Abort after this many evals without reaching target_rel_mad. This 115 | // prevents infinite loops. 116 | size_t max_evals = 9; 117 | 118 | // Whether to print additional statistics to stdout. 119 | bool verbose = true; 120 | }; 121 | 122 | // Measurement result for each unique input. 123 | struct Result { 124 | FuncInput input; 125 | 126 | // Robust estimate (mode or median) of duration. 127 | float ticks; 128 | 129 | // Measure of variability (median absolute deviation relative to "ticks"). 130 | float variability; 131 | }; 132 | 133 | // Precisely measures the number of ticks elapsed when calling "func" with the 134 | // given inputs, shuffled to ensure realistic branch prediction hit rates. 135 | // 136 | // "func" returns a 'proof of work' to ensure its computations are not elided. 137 | // "arg" is passed to Func, or reserved for internal use by MeasureClosure. 138 | // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to 139 | // "func". The values should be chosen to maximize coverage of "func". This 140 | // represents a distribution, so a value's frequency should reflect its 141 | // probability in the real application. Order does not matter; for example, a 142 | // uniform distribution over [0, 4) could be represented as {3,0,2,1}. 143 | // Returns how many Result were written to "results": one per unique input, or 144 | // zero if the measurement failed (an error message goes to stderr). 
145 | size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, 146 | const size_t num_inputs, Result* results, 147 | const Params& p = Params()); 148 | 149 | // Per-copt namespace prevents leaking generated code into other modules. 150 | namespace NB_NAMESPACE { 151 | 152 | // Calls operator() of the given closure (lambda function). 153 | template 154 | static FuncOutput CallClosure(const Closure* f, const FuncInput input) { 155 | return (*f)(input); 156 | } 157 | 158 | } // namespace NB_NAMESPACE 159 | 160 | // Same as Measure, except "closure" is typically a lambda function of 161 | // FuncInput -> FuncOutput with a capture list. 162 | template 163 | static inline size_t MeasureClosure(const Closure& closure, 164 | const FuncInput* inputs, 165 | const size_t num_inputs, Result* results, 166 | const Params& p = Params()) { 167 | return Measure(reinterpret_cast(&NB_NAMESPACE::CallClosure), 168 | reinterpret_cast(&closure), inputs, num_inputs, 169 | results, p); 170 | } 171 | 172 | } // namespace randen 173 | 174 | #endif // NANOBENCHMARK_H_ 175 | -------------------------------------------------------------------------------- /nanobenchmark_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include // sleep 17 | 18 | #include "nanobenchmark.h" 19 | #include "randen.h" 20 | #include "util.h" 21 | #include "vector128.h" 22 | 23 | namespace randen { 24 | namespace { 25 | 26 | uint64_t AES(const void*, const FuncInput num_rounds) { 27 | // Ensures multiple invocations are serially dependent, otherwise we're 28 | // measuring the throughput rather than latency. 29 | static V prev; 30 | V m = prev; 31 | for (size_t i = 0; i < num_rounds; ++i) { 32 | m = AES(m, m); 33 | } 34 | prev = m; 35 | alignas(16) uint64_t lanes[2]; 36 | Store(m, lanes, 0); 37 | return lanes[0]; 38 | } 39 | 40 | template 41 | void MeasureAES(const FuncInput (&inputs)[N]) { 42 | Result results[N]; 43 | Params params; 44 | params.max_evals = 4; // avoid test timeout 45 | const size_t num_results = Measure(&AES, nullptr, inputs, N, results, params); 46 | for (size_t i = 0; i < num_results; ++i) { 47 | printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input, 48 | results[i].ticks, results[i].variability * 100.0); 49 | } 50 | } 51 | 52 | uint64_t Div(const void*, FuncInput in) { 53 | // Here we're measuring the throughput because benchmark invocations are 54 | // independent. 55 | const int64_t d1 = 0xFFFFFFFFFFll / int64_t(in); // IDIV 56 | return d1; 57 | } 58 | 59 | template 60 | void MeasureDiv(const FuncInput (&inputs)[N]) { 61 | Result results[N]; 62 | Params params; 63 | params.max_evals = 4; // avoid test timeout 64 | const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params); 65 | for (size_t i = 0; i < num_results; ++i) { 66 | printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input, 67 | results[i].ticks, results[i].variability * 100.0); 68 | } 69 | } 70 | 71 | Randen rng; 72 | 73 | // A function whose runtime depends on rng. 74 | uint64_t Random(const void* arg, FuncInput in) { 75 | const uint32_t r = rng() & 0xF; 76 | return AES(arg, r * r); 77 | } 78 | 79 | // Ensure the measured variability is high. 
80 | template 81 | void MeasureRandom(const FuncInput (&inputs)[N]) { 82 | Result results[N]; 83 | Params p; 84 | p.max_evals = 4; // avoid test timeout 85 | p.verbose = false; 86 | const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p); 87 | for (size_t i = 0; i < num_results; ++i) { 88 | RANDEN_CHECK(results[i].variability > 1E-3); 89 | } 90 | } 91 | 92 | template 93 | void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) { 94 | printf("Expect a 'measurement failed' below:\n"); 95 | Result results[N]; 96 | const size_t num_results = MeasureClosure( 97 | [](const FuncInput input) { 98 | // Loop until the sleep succeeds (not interrupted by signal). We assume 99 | // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit. 100 | while (sleep(2) != 0) { 101 | } 102 | return input; 103 | }, 104 | inputs, N, results); 105 | RANDEN_CHECK(num_results == 0); 106 | } 107 | 108 | void RunAll(const int argc, char* argv[]) { 109 | // Avoid migrating between cores - important on multi-socket systems. 110 | int cpu = -1; 111 | if (argc == 2) { 112 | cpu = strtol(argv[1], nullptr, 10); 113 | } 114 | platform::PinThreadToCPU(cpu); 115 | 116 | // unpredictable == 1 but the compiler doesn't know that. 117 | const int unpredictable = argc != 999; 118 | static const FuncInput inputs[] = {static_cast(unpredictable) + 2, 119 | static_cast(unpredictable + 9)}; 120 | 121 | MeasureAES(inputs); 122 | MeasureDiv(inputs); 123 | MeasureRandom(inputs); 124 | EnsureLongMeasurementFails(inputs); 125 | } 126 | 127 | } // namespace 128 | } // namespace randen 129 | 130 | int main(int argc, char* argv[]) { 131 | randen::RunAll(argc, argv); 132 | return 0; 133 | } 134 | -------------------------------------------------------------------------------- /randen.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "randen.h" 16 | 17 | #include // memcpy 18 | 19 | #include "vector128.h" 20 | 21 | namespace randen { 22 | namespace { 23 | 24 | // High-level summary: 25 | // 1) Reverie (see "A Robust and Sponge-Like PRNG with Improved Efficiency") is 26 | // a sponge-like random generator that requires a cryptographic permutation. 27 | // It improves upon "Provably Robust Sponge-Based PRNGs and KDFs" by 28 | // achieving backtracking resistance with only one Permute() per buffer. 29 | // 30 | // 2) "Simpira v2: A Family of Efficient Permutations Using the AES Round 31 | // Function" constructs up to 1024-bit permutations using an improved 32 | // Generalized Feistel network with 2-round AES-128 functions. This Feistel 33 | // block shuffle achieves diffusion faster and is less vulnerable to 34 | // sliced-biclique attacks than the Type-2 cyclic shuffle. 35 | // 36 | // 3) "Improving the Generalized Feistel" and "New criterion for diffusion 37 | // property" extends the same kind of improved Feistel block shuffle to 16 38 | // branches, which enables a 2048-bit permutation. 39 | // 40 | // We combine these three ideas and also change Simpira's subround keys from 41 | // structured/low-entropy counters to digits of Pi. 42 | 43 | // Largest size for which security proofs are known. 
// Number of 128-bit branches in the generalized Feistel network. Together
// with 16-byte blocks this matches Internal::kStateBytes (256 bytes).
constexpr int kFeistelBlocks = 16;

// Type-2 generalized Feistel => one round function for every two blocks.
constexpr int kFeistelFunctions = kFeistelBlocks / 2;  // = 8

// Ensures SPRP security and two full subblock diffusions.
constexpr int kFeistelRounds = 16 + 1;  // > 4 * log2(kFeistelBlocks)

// Independent keys (272 = 2.1 KiB) for the first AES subround of each function.
constexpr int kKeys = kFeistelRounds * kFeistelFunctions;

// Returns a pointer to the kKeys round keys (kKeys * kLanes u64 in total;
// the table below holds 136 pairs, i.e. one 128-bit key per Feistel round
// and round function). The constants are fixed, so outputs are reproducible.
const uint64_t* RANDEN_RESTRICT Keys() {
  // "Nothing up my sleeve" numbers from the first hex digits of Pi, obtained
  // from http://hexpi.sourceforge.net/. Native byte order.
  alignas(32) static constexpr uint64_t pi_digits[kKeys * kLanes] = {
      RANDEN_LE(0x243F6A8885A308D3ull, 0x13198A2E03707344ull),
      RANDEN_LE(0xA4093822299F31D0ull, 0x082EFA98EC4E6C89ull),
      RANDEN_LE(0x452821E638D01377ull, 0xBE5466CF34E90C6Cull),
      RANDEN_LE(0xC0AC29B7C97C50DDull, 0x3F84D5B5B5470917ull),
      RANDEN_LE(0x9216D5D98979FB1Bull, 0xD1310BA698DFB5ACull),
      RANDEN_LE(0x2FFD72DBD01ADFB7ull, 0xB8E1AFED6A267E96ull),
      RANDEN_LE(0xBA7C9045F12C7F99ull, 0x24A19947B3916CF7ull),
      RANDEN_LE(0x0801F2E2858EFC16ull, 0x636920D871574E69ull),
      RANDEN_LE(0xA458FEA3F4933D7Eull, 0x0D95748F728EB658ull),
      RANDEN_LE(0x718BCD5882154AEEull, 0x7B54A41DC25A59B5ull),
      RANDEN_LE(0x9C30D5392AF26013ull, 0xC5D1B023286085F0ull),
      RANDEN_LE(0xCA417918B8DB38EFull, 0x8E79DCB0603A180Eull),
      RANDEN_LE(0x6C9E0E8BB01E8A3Eull, 0xD71577C1BD314B27ull),
      RANDEN_LE(0x78AF2FDA55605C60ull, 0xE65525F3AA55AB94ull),
      RANDEN_LE(0x5748986263E81440ull, 0x55CA396A2AAB10B6ull),
      RANDEN_LE(0xB4CC5C341141E8CEull, 0xA15486AF7C72E993ull),
      RANDEN_LE(0xB3EE1411636FBC2Aull, 0x2BA9C55D741831F6ull),
      RANDEN_LE(0xCE5C3E169B87931Eull, 0xAFD6BA336C24CF5Cull),
      RANDEN_LE(0x7A32538128958677ull, 0x3B8F48986B4BB9AFull),
      RANDEN_LE(0xC4BFE81B66282193ull, 0x61D809CCFB21A991ull),
      RANDEN_LE(0x487CAC605DEC8032ull, 0xEF845D5DE98575B1ull),
      RANDEN_LE(0xDC262302EB651B88ull, 0x23893E81D396ACC5ull),
      RANDEN_LE(0x0F6D6FF383F44239ull, 0x2E0B4482A4842004ull),
      RANDEN_LE(0x69C8F04A9E1F9B5Eull, 0x21C66842F6E96C9Aull),
      RANDEN_LE(0x670C9C61ABD388F0ull, 0x6A51A0D2D8542F68ull),
      RANDEN_LE(0x960FA728AB5133A3ull, 0x6EEF0B6C137A3BE4ull),
      RANDEN_LE(0xBA3BF0507EFB2A98ull, 0xA1F1651D39AF0176ull),
      RANDEN_LE(0x66CA593E82430E88ull, 0x8CEE8619456F9FB4ull),
      RANDEN_LE(0x7D84A5C33B8B5EBEull, 0xE06F75D885C12073ull),
      RANDEN_LE(0x401A449F56C16AA6ull, 0x4ED3AA62363F7706ull),
      RANDEN_LE(0x1BFEDF72429B023Dull, 0x37D0D724D00A1248ull),
      RANDEN_LE(0xDB0FEAD349F1C09Bull, 0x075372C980991B7Bull),
      RANDEN_LE(0x25D479D8F6E8DEF7ull, 0xE3FE501AB6794C3Bull),
      RANDEN_LE(0x976CE0BD04C006BAull, 0xC1A94FB6409F60C4ull),
      RANDEN_LE(0x5E5C9EC2196A2463ull, 0x68FB6FAF3E6C53B5ull),
      RANDEN_LE(0x1339B2EB3B52EC6Full, 0x6DFC511F9B30952Cull),
      RANDEN_LE(0xCC814544AF5EBD09ull, 0xBEE3D004DE334AFDull),
      RANDEN_LE(0x660F2807192E4BB3ull, 0xC0CBA85745C8740Full),
      RANDEN_LE(0xD20B5F39B9D3FBDBull, 0x5579C0BD1A60320Aull),
      RANDEN_LE(0xD6A100C6402C7279ull, 0x679F25FEFB1FA3CCull),
      RANDEN_LE(0x8EA5E9F8DB3222F8ull, 0x3C7516DFFD616B15ull),
      RANDEN_LE(0x2F501EC8AD0552ABull, 0x323DB5FAFD238760ull),
      RANDEN_LE(0x53317B483E00DF82ull, 0x9E5C57BBCA6F8CA0ull),
      RANDEN_LE(0x1A87562EDF1769DBull, 0xD542A8F6287EFFC3ull),
      RANDEN_LE(0xAC6732C68C4F5573ull, 0x695B27B0BBCA58C8ull),
      RANDEN_LE(0xE1FFA35DB8F011A0ull, 0x10FA3D98FD2183B8ull),
      RANDEN_LE(0x4AFCB56C2DD1D35Bull, 0x9A53E479B6F84565ull),
      RANDEN_LE(0xD28E49BC4BFB9790ull, 0xE1DDF2DAA4CB7E33ull),
      RANDEN_LE(0x62FB1341CEE4C6E8ull, 0xEF20CADA36774C01ull),
      RANDEN_LE(0xD07E9EFE2BF11FB4ull, 0x95DBDA4DAE909198ull),
      RANDEN_LE(0xEAAD8E716B93D5A0ull, 0xD08ED1D0AFC725E0ull),
      RANDEN_LE(0x8E3C5B2F8E7594B7ull, 0x8FF6E2FBF2122B64ull),
      RANDEN_LE(0x8888B812900DF01Cull, 0x4FAD5EA0688FC31Cull),
      RANDEN_LE(0xD1CFF191B3A8C1ADull, 0x2F2F2218BE0E1777ull),
      RANDEN_LE(0xEA752DFE8B021FA1ull, 0xE5A0CC0FB56F74E8ull),
      RANDEN_LE(0x18ACF3D6CE89E299ull, 0xB4A84FE0FD13E0B7ull),
      RANDEN_LE(0x7CC43B81D2ADA8D9ull, 0x165FA26680957705ull),
      RANDEN_LE(0x93CC7314211A1477ull, 0xE6AD206577B5FA86ull),
      RANDEN_LE(0xC75442F5FB9D35CFull, 0xEBCDAF0C7B3E89A0ull),
      RANDEN_LE(0xD6411BD3AE1E7E49ull, 0x00250E2D2071B35Eull),
      RANDEN_LE(0x226800BB57B8E0AFull, 0x2464369BF009B91Eull),
      RANDEN_LE(0x5563911D59DFA6AAull, 0x78C14389D95A537Full),
      RANDEN_LE(0x207D5BA202E5B9C5ull, 0x832603766295CFA9ull),
      RANDEN_LE(0x11C819684E734A41ull, 0xB3472DCA7B14A94Aull),
      RANDEN_LE(0x1B5100529A532915ull, 0xD60F573FBC9BC6E4ull),
      RANDEN_LE(0x2B60A47681E67400ull, 0x08BA6FB5571BE91Full),
      RANDEN_LE(0xF296EC6B2A0DD915ull, 0xB6636521E7B9F9B6ull),
      RANDEN_LE(0xFF34052EC5855664ull, 0x53B02D5DA99F8FA1ull),
      RANDEN_LE(0x08BA47996E85076Aull, 0x4B7A70E9B5B32944ull),
      RANDEN_LE(0xDB75092EC4192623ull, 0xAD6EA6B049A7DF7Dull),
      RANDEN_LE(0x9CEE60B88FEDB266ull, 0xECAA8C71699A18FFull),
      RANDEN_LE(0x5664526CC2B19EE1ull, 0x193602A575094C29ull),
      RANDEN_LE(0xA0591340E4183A3Eull, 0x3F54989A5B429D65ull),
      RANDEN_LE(0x6B8FE4D699F73FD6ull, 0xA1D29C07EFE830F5ull),
      RANDEN_LE(0x4D2D38E6F0255DC1ull, 0x4CDD20868470EB26ull),
      RANDEN_LE(0x6382E9C6021ECC5Eull, 0x09686B3F3EBAEFC9ull),
      RANDEN_LE(0x3C9718146B6A70A1ull, 0x687F358452A0E286ull),
      RANDEN_LE(0xB79C5305AA500737ull, 0x3E07841C7FDEAE5Cull),
      RANDEN_LE(0x8E7D44EC5716F2B8ull, 0xB03ADA37F0500C0Dull),
      RANDEN_LE(0xF01C1F040200B3FFull, 0xAE0CF51A3CB574B2ull),
      RANDEN_LE(0x25837A58DC0921BDull, 0xD19113F97CA92FF6ull),
      RANDEN_LE(0x9432477322F54701ull, 0x3AE5E58137C2DADCull),
      RANDEN_LE(0xC8B576349AF3DDA7ull, 0xA94461460FD0030Eull),
      RANDEN_LE(0xECC8C73EA4751E41ull, 0xE238CD993BEA0E2Full),
      RANDEN_LE(0x3280BBA1183EB331ull, 0x4E548B384F6DB908ull),
      RANDEN_LE(0x6F420D03F60A04BFull, 0x2CB8129024977C79ull),
      RANDEN_LE(0x5679B072BCAF89AFull, 0xDE9A771FD9930810ull),
      RANDEN_LE(0xB38BAE12DCCF3F2Eull, 0x5512721F2E6B7124ull),
      RANDEN_LE(0x501ADDE69F84CD87ull, 0x7A5847187408DA17ull),
      RANDEN_LE(0xBC9F9ABCE94B7D8Cull, 0xEC7AEC3ADB851DFAull),
      RANDEN_LE(0x63094366C464C3D2ull, 0xEF1C18473215D808ull),
      RANDEN_LE(0xDD433B3724C2BA16ull, 0x12A14D432A65C451ull),
      RANDEN_LE(0x50940002133AE4DDull, 0x71DFF89E10314E55ull),
      RANDEN_LE(0x81AC77D65F11199Bull, 0x043556F1D7A3C76Bull),
      RANDEN_LE(0x3C11183B5924A509ull, 0xF28FE6ED97F1FBFAull),
      RANDEN_LE(0x9EBABF2C1E153C6Eull, 0x86E34570EAE96FB1ull),
      RANDEN_LE(0x860E5E0A5A3E2AB3ull, 0x771FE71C4E3D06FAull),
      RANDEN_LE(0x2965DCB999E71D0Full, 0x803E89D65266C825ull),
      RANDEN_LE(0x2E4CC9789C10B36Aull, 0xC6150EBA94E2EA78ull),
      RANDEN_LE(0xA6FC3C531E0A2DF4ull, 0xF2F74EA7361D2B3Dull),
      RANDEN_LE(0x1939260F19C27960ull, 0x5223A708F71312B6ull),
      RANDEN_LE(0xEBADFE6EEAC31F66ull, 0xE3BC4595A67BC883ull),
      RANDEN_LE(0xB17F37D1018CFF28ull, 0xC332DDEFBE6C5AA5ull),
      RANDEN_LE(0x6558218568AB9702ull, 0xEECEA50FDB2F953Bull),
      RANDEN_LE(0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull),
      RANDEN_LE(0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull),
      RANDEN_LE(0x0334FE1EAA0363CFull, 0xB5735C904C70A239ull),
      RANDEN_LE(0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull),
      RANDEN_LE(0x9CAB5CABB2F3846Eull, 0x648B1EAF19BDF0CAull),
      RANDEN_LE(0xA02369B9655ABB50ull, 0x40685A323C2AB4B3ull),
      RANDEN_LE(0x319EE9D5C021B8F7ull, 0x9B540B19875FA099ull),
      RANDEN_LE(0x95F7997E623D7DA8ull, 0xF837889A97E32D77ull),
      RANDEN_LE(0x11ED935F16681281ull, 0x0E358829C7E61FD6ull),
      RANDEN_LE(0x96DEDFA17858BA99ull, 0x57F584A51B227263ull),
      RANDEN_LE(0x9B83C3FF1AC24696ull, 0xCDB30AEB532E3054ull),
      RANDEN_LE(0x8FD948E46DBC3128ull, 0x58EBF2EF34C6FFEAull),
      RANDEN_LE(0xFE28ED61EE7C3C73ull, 0x5D4A14D9E864B7E3ull),
      RANDEN_LE(0x42105D14203E13E0ull, 0x45EEE2B6A3AAABEAull),
      RANDEN_LE(0xDB6C4F15FACB4FD0ull, 0xC742F442EF6ABBB5ull),
      RANDEN_LE(0x654F3B1D41CD2105ull, 0xD81E799E86854DC7ull),
      RANDEN_LE(0xE44B476A3D816250ull, 0xCF62A1F25B8D2646ull),
      RANDEN_LE(0xFC8883A0C1C7B6A3ull, 0x7F1524C369CB7492ull),
      RANDEN_LE(0x47848A0B5692B285ull, 0x095BBF00AD19489Dull),
      RANDEN_LE(0x1462B17423820D00ull, 0x58428D2A0C55F5EAull),
      RANDEN_LE(0x1DADF43E233F7061ull, 0x3372F0928D937E41ull),
      RANDEN_LE(0xD65FECF16C223BDBull, 0x7CDE3759CBEE7460ull),
      RANDEN_LE(0x4085F2A7CE77326Eull, 0xA607808419F8509Eull),
      RANDEN_LE(0xE8EFD85561D99735ull, 0xA969A7AAC50C06C2ull),
      RANDEN_LE(0x5A04ABFC800BCADCull, 0x9E447A2EC3453484ull),
      RANDEN_LE(0xFDD567050E1E9EC9ull, 0xDB73DBD3105588CDull),
      RANDEN_LE(0x675FDA79E3674340ull, 0xC5C43465713E38D8ull),
      RANDEN_LE(0x3D28F89EF16DFF20ull, 0x153E21E78FB03D4Aull),
      RANDEN_LE(0xE6E39F2BDB83ADF7ull, 0xE93D5A68948140F7ull),
      RANDEN_LE(0xF64C261C94692934ull, 0x411520F77602D4F7ull),
      RANDEN_LE(0xBCF46B2ED4A10068ull, 0xD40824713320F46Aull),
      RANDEN_LE(0x43B7D4B7500061AFull, 0x1E39F62E97244546ull)};
  // Cheap guard against an accidentally truncated initializer list above.
  static_assert(pi_digits[kKeys * kLanes - 1] != 0, "Too few initializers");
  return pi_digits;
}

// Improved odd-even shuffle from "New criterion for diffusion property".
RANDEN_INLINE void BlockShuffle(uint64_t* RANDEN_RESTRICT state) {
  // First make a copy (optimized out).
  uint64_t source[kFeistelBlocks * kLanes];
  memcpy(source, state, sizeof(source));

  // Fixed permutation of the 16 branches, chosen for fast diffusion.
  constexpr int shuffle[kFeistelBlocks] = {7, 2, 13, 4, 11, 8, 3, 6,
                                           15, 0, 9, 10, 1, 14, 5, 12};
  for (int branch = 0; branch < kFeistelBlocks; ++branch) {
    const V v = Load(source, shuffle[branch]);
    Store(v, state, branch);
  }
}

// Cryptographic permutation based on a type-2 Generalized Feistel Network.
// Indistinguishable from ideal by chosen-ciphertext adversaries using less than
// 2^64 queries if the round function is a PRF. This is similar to the b=8 case
// of Simpira v2, but more efficient than its generic construction for b=16.
RANDEN_INLINE void Permute(uint64_t* RANDEN_RESTRICT state) {
  // Round keys for one AES per Feistel round and branch: first digits of Pi.
  const uint64_t* RANDEN_RESTRICT keys = Keys();

  // (Successfully unrolled; the first iteration jumps into the second half)
#ifdef __clang__
#pragma clang loop unroll_count(2)
#endif
  for (int round = 0; round < kFeistelRounds; ++round) {
    for (int branch = 0; branch < kFeistelBlocks; branch += 2) {
      const V even = Load(state, branch);
      const V odd = Load(state, branch + 1);
      // Feistel round function using two AES subrounds. Very similar to F()
      // from Simpira v2, but with independent subround keys. Uses 17 AES rounds
      // per 16 bytes (vs. 10 for AES-CTR). Computing eight round functions in
      // parallel hides the 7-cycle AESNI latency on HSW. Note that the Feistel
      // XORs are 'free' (included in the second AES instruction).
      const V f1 = AES(even, Load(keys, 0));
      keys += kLanes;  // advance to the next round function's key
      const V f2 = AES(f1, odd);
      Store(f2, state, branch + 1);
    }

    BlockShuffle(state);
  }
}

// Enables native loads in the round loop by pre-swapping.
245 | RANDEN_INLINE void SwapIfBigEndian(uint64_t* RANDEN_RESTRICT state) { 246 | #ifdef RANDEN_BIG_ENDIAN 247 | for (int branch = 0; branch < kFeistelBlocks; ++branch) { 248 | const V v = ReverseBytes(Load(state, branch)); 249 | Store(v, state, branch); 250 | } 251 | #endif 252 | } 253 | 254 | } // namespace 255 | 256 | void Internal::Absorb(const void* seed_void, void* state_void) { 257 | uint64_t* RANDEN_RESTRICT state = reinterpret_cast(state_void); 258 | const uint64_t* RANDEN_RESTRICT seed = 259 | reinterpret_cast(seed_void); 260 | 261 | constexpr int kCapacityBlocks = kCapacityBytes / sizeof(V); 262 | static_assert(kCapacityBlocks * sizeof(V) == kCapacityBytes, "Not i*V"); 263 | for (size_t i = kCapacityBlocks; i < kStateBytes / sizeof(V); ++i) { 264 | V block = Load(state, i); 265 | block ^= Load(seed, i - kCapacityBlocks); 266 | Store(block, state, i); 267 | } 268 | } 269 | 270 | void Internal::Generate(void* state_void) { 271 | uint64_t* RANDEN_RESTRICT state = reinterpret_cast(state_void); 272 | 273 | static_assert(kCapacityBytes == sizeof(V), "Capacity mismatch"); 274 | const V prev_inner = Load(state, 0); 275 | 276 | SwapIfBigEndian(state); 277 | 278 | Permute(state); 279 | 280 | SwapIfBigEndian(state); 281 | 282 | // Ensure backtracking resistance. 283 | V inner = Load(state, 0); 284 | inner ^= prev_inner; 285 | Store(inner, state, 0); 286 | } 287 | 288 | } // namespace randen 289 | -------------------------------------------------------------------------------- /randen.h: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 'Strong' (indistinguishable from random, backtracking-resistant) random 16 | // generator, faster in some benchmarks than std::mt19937_64 and pcg64_c32. 17 | // Accompanying paper: https://arxiv.org/abs/1810.02227 18 | 19 | #ifndef RANDEN_H_ 20 | #define RANDEN_H_ 21 | 22 | #include 23 | #include // memcpy 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | // RANDen = RANDom generator or beetroots in Swiss German. 33 | namespace randen { 34 | 35 | struct Internal { 36 | static void Absorb(const void* seed, void* state); 37 | static void Generate(void* state); 38 | 39 | static constexpr int kStateBytes = 256; // 2048-bit 40 | 41 | // Size of the 'inner' (inaccessible) part of the sponge. Larger values would 42 | // require more frequent calls to Generate. 43 | static constexpr int kCapacityBytes = 16; // 128-bit 44 | }; 45 | 46 | // Deterministic pseudorandom byte generator with backtracking resistance 47 | // (leaking the state does not compromise prior outputs). Based on Reverie 48 | // (see "A Robust and Sponge-Like PRNG with Improved Efficiency") instantiated 49 | // with an improved Simpira-like permutation. 50 | // Returns values of type "T" (must be a built-in unsigned integer type). 
51 | template 52 | class alignas(32) Randen { 53 | static_assert(std::is_unsigned::value, 54 | "Randen must be parameterized by a built-in unsigned integer"); 55 | 56 | public: 57 | // C++11 URBG interface: 58 | using result_type = T; 59 | 60 | static constexpr result_type min() { 61 | return std::numeric_limits::min(); 62 | } 63 | 64 | static constexpr result_type max() { 65 | return std::numeric_limits::max(); 66 | } 67 | 68 | explicit Randen(result_type seed_value = 0) { seed(seed_value); } 69 | 70 | template ::value>::type> 73 | explicit Randen(SeedSequence&& seq) { 74 | seed(seq); 75 | } 76 | 77 | // Default copy and move operators. 78 | Randen(const Randen&) = default; 79 | Randen& operator=(const Randen&) = default; 80 | 81 | Randen(Randen&&) = default; 82 | Randen& operator=(Randen&&) = default; 83 | 84 | // Returns random bits from the buffer in units of T. 85 | result_type operator()() { 86 | // (Local copy ensures compiler knows this is not aliased.) 87 | size_t next = next_; 88 | 89 | // Refill the buffer if needed (unlikely). 90 | if (next >= kStateT) { 91 | Internal::Generate(state_); 92 | next = kCapacityT; 93 | } 94 | 95 | const result_type ret = state_[next]; 96 | next_ = next + 1; 97 | return ret; 98 | } 99 | 100 | template 101 | typename std::enable_if< 102 | !std::is_convertible::value, void>::type 103 | seed(SeedSequence& seq) { 104 | seed(); 105 | reseed(seq); 106 | } 107 | 108 | void seed(result_type seed_value = 0) { 109 | next_ = kStateT; 110 | std::fill(std::begin(state_), std::begin(state_) + kCapacityT, 0); 111 | std::fill(std::begin(state_) + kCapacityT, std::end(state_), seed_value); 112 | } 113 | 114 | // Inserts entropy into (part of) the state. Calling this periodically with 115 | // sufficient entropy ensures prediction resistance (attackers cannot predict 116 | // future outputs even if state is compromised). 
117 | template 118 | void reseed(SeedSequence& seq) { 119 | using U32 = typename SeedSequence::result_type; 120 | constexpr int kRate32 = 121 | (Internal::kStateBytes - Internal::kCapacityBytes) / sizeof(U32); 122 | U32 buffer[kRate32]; 123 | seq.generate(buffer, buffer + kRate32); 124 | Internal::Absorb(buffer, state_); 125 | next_ = kStateT; // Generate will be called by operator() 126 | } 127 | 128 | void discard(unsigned long long count) { 129 | using ull_t = unsigned long long; 130 | const ull_t remaining = kStateT - next_; 131 | if (count <= remaining) { 132 | next_ += count; 133 | return; 134 | } 135 | count -= remaining; 136 | 137 | const ull_t kRateT = kStateT - kCapacityT; 138 | while (count > kRateT) { 139 | Internal::Generate(state_); 140 | next_ = kCapacityT; 141 | count -= kRateT; 142 | } 143 | 144 | if (count != 0) { 145 | Internal::Generate(state_); 146 | next_ = kCapacityT + count; 147 | } 148 | } 149 | 150 | bool operator==(const Randen& other) const { 151 | return next_ == other.next_ && 152 | std::equal(std::begin(state_), std::end(state_), 153 | std::begin(other.state_)); 154 | } 155 | 156 | bool operator!=(const Randen& other) const { return !(*this == other); } 157 | 158 | template 159 | friend std::basic_ostream& operator<<( 160 | std::basic_ostream& os, // NOLINT(runtime/references) 161 | const Randen& engine) { // NOLINT(runtime/references) 162 | const auto flags = os.flags(std::ios_base::dec | std::ios_base::left); 163 | const auto fill = os.fill(os.widen(' ')); 164 | 165 | for (auto x : engine.state_) { 166 | os << x << os.fill(); 167 | } 168 | os << engine.next_; 169 | 170 | os.flags(flags); 171 | os.fill(fill); 172 | return os; 173 | } 174 | 175 | template 176 | friend std::basic_istream& operator>>( 177 | std::basic_istream& is, // NOLINT(runtime/references) 178 | Randen& engine) { // NOLINT(runtime/references) 179 | const auto flags = is.flags(std::ios_base::dec | std::ios_base::skipws); 180 | const auto fill = is.fill(is.widen(' 
')); 181 | 182 | T state[kStateT]; 183 | size_t next; 184 | for (auto& x : state) { 185 | is >> x; 186 | } 187 | is >> next; 188 | if (!is.fail()) { 189 | memcpy(engine.state_, state, sizeof(engine.state_)); 190 | engine.next_ = next; 191 | } 192 | is.flags(flags); 193 | is.fill(fill); 194 | return is; 195 | } 196 | 197 | private: 198 | static constexpr size_t kStateT = Internal::kStateBytes / sizeof(T); 199 | static constexpr size_t kCapacityT = Internal::kCapacityBytes / sizeof(T); 200 | 201 | // First kCapacityT are `inner', the others are accessible random bits. 202 | alignas(32) result_type state_[kStateT]; 203 | size_t next_; // index within state_ 204 | }; 205 | 206 | } // namespace randen 207 | 208 | #endif // RANDEN_H_ 209 | -------------------------------------------------------------------------------- /randen_benchmark.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Please disable Turbo Boost and CPU throttling! 16 | 17 | #include "randen.h" 18 | 19 | // std::uniform_*_distribution are slow due to division/log2; we provide 20 | // faster variants if this is 0. 21 | #define USE_STD_DISTRIBUTIONS 0 22 | 23 | // Which engines to benchmark. 
24 | #define ENABLE_RANDEN 1 25 | #define ENABLE_PCG 1 26 | #define ENABLE_MT 1 27 | #if defined(__SSE2__) && defined(__AES__) 28 | #define ENABLE_CHACHA 1 29 | #else 30 | #define ENABLE_CHACHA 0 31 | #endif 32 | #define ENABLE_OS 1 33 | 34 | #if ENABLE_PCG 35 | #include "third_party/pcg_random/include/pcg_random.hpp" 36 | #endif 37 | 38 | #if ENABLE_MT 39 | #include 40 | #endif 41 | 42 | #if ENABLE_CHACHA 43 | #include "engine_chacha.h" 44 | #endif 45 | 46 | #if ENABLE_OS 47 | #include "engine_os.h" 48 | #endif 49 | 50 | 51 | #ifdef _MSC_VER 52 | #include 53 | #endif 54 | #include 55 | #include 56 | #include // iota 57 | 58 | #include "nanobenchmark.h" 59 | #include "util.h" 60 | 61 | namespace randen { 62 | namespace { 63 | 64 | #if USE_STD_DISTRIBUTIONS 65 | using UniformInt = std::uniform_int_distribution; 66 | using UniformDouble = std::uniform_real_distribution; 67 | #else 68 | // These are subsets of std::uniform_*_distribution. 69 | 70 | class UniformInt { 71 | public: 72 | // (To support u64, add a Multiply overload and GetU64 as below.) 73 | using result_type = uint32_t; 74 | 75 | struct param_type { 76 | using distribution_type = UniformInt; 77 | 78 | param_type(const result_type begin, const result_type end) 79 | : begin(begin), end(end) {} 80 | 81 | // Half-open interval. 82 | result_type begin; 83 | result_type end; 84 | }; 85 | 86 | // Engine is a C++11 UniformRandomBitGenerator returning >= 32 bits. 87 | template 88 | result_type operator()(Engine& engine, const param_type param) const { 89 | using Bits = decltype(engine()); // == typename Engine::result_type 90 | static_assert(std::is_same::value || 91 | std::is_same::value, 92 | "Need u32 or u64"); 93 | 94 | // We assume range < pow(2, sizeof(decltype(engine()))*8). 95 | const result_type range = param.end - param.begin; 96 | 97 | // Division-free with high probability. Algorithm and variable names are 98 | // from https://arxiv.org/pdf/1805.10941.pdf. 
99 | result_type x = engine(); // (possibly a narrowing conversion from Bits) 100 | result_type hi, lo; 101 | Multiply(x, range, &hi, &lo); 102 | // Rejected, try again (unlikely for small ranges). 103 | if (lo < range) { 104 | const result_type t = Negate(range) % range; 105 | while (hi < t) { 106 | x = engine(); 107 | Multiply(x, range, &hi, &lo); 108 | } 109 | } 110 | 111 | return hi + param.begin; 112 | } 113 | 114 | private: 115 | static constexpr result_type Negate(result_type x) { 116 | return ~x + 1; // assumes two's complement. 117 | } 118 | 119 | static void Multiply(const uint32_t x, const uint32_t y, uint32_t* hi, 120 | uint32_t* lo) { 121 | const uint64_t wide = static_cast(x) * y; 122 | *hi = wide >> 32; 123 | *lo = static_cast(wide & 0xFFFFFFFFu); 124 | } 125 | }; 126 | 127 | class UniformDouble { 128 | public: 129 | // (Can also be float - we would just cast from double.) 130 | using result_type = double; 131 | 132 | // Engine is a C++11 UniformRandomBitGenerator returning either u32 or u64. 133 | template 134 | result_type operator()(Engine& engine) const { 135 | uint64_t bits = GetU64(decltype(engine())(), engine); 136 | if (bits == 0) return static_cast(0.0); 137 | const int leading_zeros = NumZeroBitsAboveMSBNonzero(bits); 138 | bits <<= leading_zeros; // shift out leading zeros 139 | bits >>= (64 - 53); // zero exponent 140 | const uint64_t exp = 1022 - leading_zeros; 141 | const uint64_t ieee = (exp << 52) | bits; 142 | double ret; 143 | memcpy(&ret, &ieee, sizeof(ret)); 144 | return static_cast(ret); 145 | } 146 | 147 | private: 148 | template 149 | static uint64_t GetU64(uint64_t, Engine& engine) { 150 | return engine(); 151 | } 152 | 153 | // Adapter for generating u64 from u32 engine. 
154 | template 155 | static uint64_t GetU64(uint32_t, Engine& engine) { 156 | uint64_t ret = engine(); 157 | ret <<= 32; 158 | ret |= engine(); 159 | return ret; 160 | } 161 | }; 162 | #endif // !USE_STD_DISTRIBUTIONS 163 | 164 | // Benchmark::Num64() is passed to its constructor and operator() after 165 | // multiplying with a (non-compile-time-constant) 1 to prevent constant folding. 166 | // It is also used to compute cycles per byte. 167 | 168 | // Microbenchmark: generates N numbers in a tight loop. 169 | struct BenchmarkLoop { 170 | // Large enough that we can ignore size % buffer size. 171 | static size_t Num64() { return 100000; } 172 | 173 | explicit BenchmarkLoop(const uint64_t num_64) {} 174 | 175 | template 176 | uint64_t operator()(const uint64_t num_64, Engine& engine) const { 177 | for (size_t i = 0; i < num_64 - 1; ++i) { 178 | (void)engine(); 179 | } 180 | return engine(); 181 | } 182 | }; 183 | 184 | // Real-world benchmark: shuffles a vector. 185 | class BenchmarkShuffle { 186 | public: 187 | static size_t Num64() { return 50000; } 188 | 189 | explicit BenchmarkShuffle(const uint64_t num_64) : ints_to_shuffle_(num_64) {} 190 | 191 | template 192 | uint64_t operator()(const uint64_t num_64, Engine& engine) const { 193 | ints_to_shuffle_[0] = static_cast(num_64 & 0xFFFF); 194 | #if USE_STD_DISTRIBUTIONS 195 | std::shuffle(ints_to_shuffle_.begin(), ints_to_shuffle_.end(), engine); 196 | #else 197 | // Similar algorithm, but UniformInt instead of std::u_i_d => 2-3x speedup. 198 | UniformInt dist; 199 | for (size_t i = num_64 - 1; i != 0; --i) { 200 | const UniformInt::param_type param(0, i); 201 | std::swap(ints_to_shuffle_[i], ints_to_shuffle_[dist(engine, param)]); 202 | } 203 | #endif 204 | return ints_to_shuffle_[0]; 205 | } 206 | 207 | private: 208 | mutable std::vector ints_to_shuffle_; 209 | }; 210 | 211 | // Reservoir sampling. 
212 | class BenchmarkSample { 213 | public: 214 | static size_t Num64() { return 50000; } 215 | 216 | explicit BenchmarkSample(const uint64_t num_64) 217 | : population_(num_64), chosen_(kNumChosen) { 218 | std::iota(population_.begin(), population_.end(), 0); 219 | } 220 | 221 | template 222 | uint64_t operator()(const uint64_t num_64, Engine& engine) const { 223 | // Can replace with std::sample after C++17. 224 | std::copy(population_.begin(), population_.begin() + kNumChosen, 225 | chosen_.begin()); 226 | UniformInt dist; 227 | for (size_t i = kNumChosen; i < num_64; ++i) { 228 | const UniformInt::param_type param(0, i); 229 | const size_t index = dist(engine, param); 230 | if (index < kNumChosen) { 231 | chosen_[index] = population_[i]; 232 | } 233 | } 234 | 235 | return chosen_.front(); 236 | } 237 | 238 | private: 239 | static constexpr size_t kNumChosen = 10000; 240 | 241 | std::vector population_; 242 | mutable std::vector chosen_; 243 | }; 244 | 245 | // Actual application: Monte Carlo estimation of Pi * 1E6. 
246 | class BenchmarkMonteCarlo { 247 | public: 248 | static size_t Num64() { return 200000; } 249 | 250 | explicit BenchmarkMonteCarlo(const uint64_t num_64) {} 251 | 252 | template 253 | uint64_t operator()(const uint64_t num_64, Engine& engine) const { 254 | int64_t in_circle = 0; 255 | for (size_t i = 0; i < num_64; i += 2) { 256 | const double x = dist_(engine); 257 | const double y = dist_(engine); 258 | in_circle += (x * x + y * y) < 1.0; 259 | } 260 | return 8 * 1000 * 1000 * in_circle / num_64; 261 | } 262 | 263 | private: 264 | mutable UniformDouble dist_; 265 | }; 266 | 267 | template 268 | void RunBenchmark(const char* caption, Engine& engine, const int unpredictable1, 269 | const Benchmark& benchmark) { 270 | printf("%8s: ", caption); 271 | const size_t kNumInputs = 1; 272 | const FuncInput inputs[kNumInputs] = { 273 | static_cast(Benchmark::Num64() * unpredictable1)}; 274 | Result results[kNumInputs]; 275 | 276 | Params p; 277 | p.verbose = false; 278 | #if defined(__powerpc__) 279 | p.max_evals = 7; 280 | #else 281 | p.max_evals = 8; 282 | #endif 283 | p.target_rel_mad = 0.002; 284 | const size_t num_results = MeasureClosure( 285 | [&benchmark, &engine](const FuncInput input) { 286 | return benchmark(input, engine); 287 | }, 288 | inputs, kNumInputs, results, p); 289 | RANDEN_CHECK(num_results == kNumInputs); 290 | for (size_t i = 0; i < num_results; ++i) { 291 | const double cycles_per_byte = 292 | results[i].ticks / (results[i].input * sizeof(uint64_t)); 293 | const double mad = results[i].variability * cycles_per_byte; 294 | printf("%6zu: %5.2f (+/- %5.3f)\n", results[i].input, cycles_per_byte, mad); 295 | } 296 | } 297 | 298 | // Calls RunBenchmark for each (enabled) engine. 299 | template 300 | void ForeachEngine(const int unpredictable1) { 301 | using T = uint64_t; // WARNING: keep in sync with MT/PCG. 
302 | 303 | const Benchmark benchmark( 304 | static_cast(Benchmark::Num64() * unpredictable1)); 305 | 306 | #if ENABLE_RANDEN 307 | Randen eng_randen; 308 | RunBenchmark("Randen", eng_randen, unpredictable1, benchmark); 309 | #endif 310 | 311 | #if ENABLE_PCG 312 | // Quoting from pcg_random.hpp: "the c variants offer better crypographic 313 | // security (just how good the cryptographic security is is an open 314 | // question)". 315 | pcg64_c32 eng_pcg; 316 | RunBenchmark("PCG", eng_pcg, unpredictable1, benchmark); 317 | #endif 318 | 319 | #if ENABLE_MT 320 | std::mt19937_64 eng_mt; 321 | RunBenchmark("MT", eng_mt, unpredictable1, benchmark); 322 | #endif 323 | 324 | 325 | #if ENABLE_CHACHA 326 | ChaCha eng_chacha(0x243f6a8885a308d3ull, 0x243F6A8885A308D3ull); 327 | RunBenchmark("ChaCha8", eng_chacha, unpredictable1, benchmark); 328 | #endif 329 | 330 | #if ENABLE_OS 331 | EngineOS eng_os; 332 | RunBenchmark("OS", eng_os, unpredictable1, benchmark); 333 | #endif 334 | 335 | printf("\n"); 336 | } 337 | 338 | void RunAll(int argc, char* argv[]) { 339 | // Immediately output any results (for non-local runs). 340 | setvbuf(stdout, nullptr, _IONBF, 0); 341 | 342 | printf("Config: enable std=%d\n", USE_STD_DISTRIBUTIONS); 343 | 344 | // Avoid migrating between cores - important on multi-socket systems. 345 | int cpu = -1; 346 | if (argc == 2) { 347 | cpu = strtol(argv[1], nullptr, 10); 348 | } 349 | platform::PinThreadToCPU(cpu); 350 | 351 | // Ensures the iteration counts are not compile-time constants. 
352 | const int unpredictable1 = argc != 999; 353 | 354 | ForeachEngine(unpredictable1); 355 | ForeachEngine(unpredictable1); 356 | ForeachEngine(unpredictable1); 357 | ForeachEngine(unpredictable1); 358 | } 359 | 360 | } // namespace 361 | } // namespace randen 362 | 363 | int main(int argc, char* argv[]) { 364 | randen::RunAll(argc, argv); 365 | return 0; 366 | } 367 | -------------------------------------------------------------------------------- /randen_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "randen.h" 16 | 17 | #include 18 | #include 19 | #include // seed_seq 20 | #include 21 | 22 | #define UPDATE_GOLDEN 0 23 | #define ENABLE_VERIFY 1 24 | #define ENABLE_DUMP 0 25 | 26 | namespace randen { 27 | namespace { 28 | 29 | #define STR(x) #x 30 | 31 | #define ASSERT_TRUE(condition) \ 32 | do { \ 33 | if (!(condition)) { \ 34 | printf("Assertion [" STR(condition) "] failed on line %d\n", __LINE__); \ 35 | abort(); \ 36 | } \ 37 | } while (false) 38 | 39 | using EngRanden = Randen; 40 | 41 | #if ENABLE_VERIFY 42 | 43 | void VerifyReseedChangesAllValues() { 44 | const size_t kNumOutputs = 127; 45 | EngRanden engine; 46 | 47 | std::seed_seq seq1{1, 2, 3, 4, 5, 6, 7}; 48 | engine.seed(seq1); 49 | uint64_t out1[kNumOutputs]; 50 | for (size_t i = 0; i < kNumOutputs; ++i) { 51 | out1[i] = engine(); 52 | } 53 | 54 | std::seed_seq seq2{127, 255, 511}; 55 | engine.seed(seq2); 56 | uint64_t out2[kNumOutputs]; 57 | engine.seed(seq2); 58 | 59 | for (size_t i = 0; i < kNumOutputs; ++i) { 60 | out2[i] = engine(); 61 | ASSERT_TRUE(out2[i] != out1[i]); 62 | } 63 | } 64 | 65 | void VerifyDiscard() { 66 | const int N = 56; // two buffer's worth 67 | for (int num_used = 0; num_used < N; ++num_used) { 68 | EngRanden engine_used; 69 | for (int i = 0; i < num_used; ++i) { 70 | (void)engine_used(); 71 | } 72 | 73 | for (int num_discard = 0; num_discard < N; ++num_discard) { 74 | EngRanden engine1 = engine_used; 75 | EngRanden engine2 = engine_used; 76 | for (int i = 0; i < num_discard; ++i) { 77 | (void)engine1(); 78 | } 79 | engine2.discard(num_discard); 80 | for (int i = 0; i < N; ++i) { 81 | const uint64_t r1 = engine1(); 82 | const uint64_t r2 = engine2(); 83 | ASSERT_TRUE(r1 == r2); 84 | } 85 | } 86 | } 87 | } 88 | 89 | void VerifyGolden() { 90 | // prime number => some buffer values unused. 
91 | const size_t kNumOutputs = 127; 92 | #if UPDATE_GOLDEN 93 | EngRanden engine; 94 | for (size_t i = 0; i < kNumOutputs; ++i) { 95 | printf("0x%016lx,\n", engine()); 96 | } 97 | printf("\n"); 98 | #else 99 | const uint64_t golden[kNumOutputs] = { 100 | 0xdda9f47cd90410ee, 0xc3c14f134e433977, 0xf0b780f545c72912, 101 | 0x887bf3087fd8ca10, 0x30ec63baff3c6d59, 0x15dbb1d37696599f, 102 | 0x02808a316f49a54c, 0xb29f73606f7f20a6, 0x9cbf605e3fd9de8a, 103 | 0x3b8feaf9d5c8e50e, 0xd8b2ffd356301ed5, 0xc970ae1a78183bbb, 104 | 0xcdfd8d76eb8f9a19, 0xf4b327fe0fc73c37, 0xd5af05dd3eff9556, 105 | 0xc3a506eb91420c9d, 0x7023920e0d6bfe8c, 0x48db1bb78f83c4a1, 106 | 0xed1ef4c26b87b840, 0x58d3575834956d42, 0x497cabf3431154fc, 107 | 0x8eef32a23e0b2df3, 0xd88b5749f090e5ea, 0x4e24370570029a8b, 108 | 0x78fcec2cbb6342f5, 0xc651a582a970692f, 0x352ee4ad1816afe3, 109 | 0x463cb745612f55db, 0x811ef0821c3de851, 0x026ff374c101da7e, 110 | 0xa0660379992d58fc, 0x6f7e616704c4fa59, 0x915f3445685da798, 111 | 0x04b0a374a3b795c7, 0x4663352533ce1882, 0x26802a8ac76571ce, 112 | 0x5588ba3a4d6e6c51, 0xb9fdefb4a24dc738, 0x607195a5e200f5fd, 113 | 0xa2101a42d35f1956, 0xe1e5e03c759c0709, 0x7e100308f3290764, 114 | 0xcbcf585399e432f1, 0x082572cc5da6606f, 0x0904469acbfee8f2, 115 | 0xe8a2be4f8335d8f1, 0x08e8a1f1a69da69a, 0xf08bd31b6daecd51, 116 | 0x2e9705bb053d6b46, 0x6542a20aad57bff5, 0x78e3a810213b6ffb, 117 | 0xda2fc9db0713c391, 0xc0932718cd55781f, 0xdc16a59cdd85f8a6, 118 | 0xb97289c1be0f2f9c, 0xb9bfb29c2b20bfe5, 0x5524bb834771435b, 119 | 0xc0a2a0e403a892d4, 0xff4af3ab8d1b78c5, 0x8265da3d39d1a750, 120 | 0x66e455f627495189, 0xf0ec5f424bcad77f, 0x3424e47dc22596e3, 121 | 0xc82d3120b57e3270, 0xc191c595afc4dcbf, 0xbc0c95129ccedcdd, 122 | 0x7f90650ea6cd6ab4, 0x120392bd2bb70939, 0xa7c8fac5a7917eb0, 123 | 0x7287491832695ad3, 0x7c1bf9839c7c1ce5, 0xd088cb9418be0361, 124 | 0x78565cdefd28c4ad, 0xe2e991fa58e1e79e, 0x2a9eac28b08c96bf, 125 | 0x7351b9fef98bafad, 0x13a685861bab87e0, 0x6c4f179696cb2225, 126 | 0x30537425cac70991, 
0x64c6de5aa0501971, 0x7e05e3aa8ec720dc, 127 | 0x01590d9dc6c532b7, 0x738184388f3bc1d2, 0x74a07d9c54e3e63f, 128 | 0x6bcdf185561f255f, 0x26ffdc5067be3acb, 0x171df81934f68604, 129 | 0xa0eaf2e1cf99b1c6, 0x5d1cb02075ba1cea, 0x7ea5a21665683e5a, 130 | 0xba6364eff80de02f, 0x957f38cbd2123fdf, 0x892d8317de82f7a2, 131 | 0x606e0a0e41d452ee, 0x4eb28826766fcf5b, 0xe707b1db50f7b43e, 132 | 0x6ee217df16527d78, 0x5a362d56e80a0951, 0x443e63857d4076ca, 133 | 0xf6737962ba6b23dd, 0xd796b052151ee94d, 0x790d9a5f048adfeb, 134 | 0x8b833ff84893da5d, 0x033ed95c12b04a03, 0x9877c4225061ca76, 135 | 0x3d6724b1bb15eab9, 0x42e5352fe30ce989, 0xd68d6810adf74fb3, 136 | 0x3cdbf7e358df4b8b, 0x265b565a7431fde7, 0x52d2242f65b37f88, 137 | 0x2922a47f6d3e8779, 0x29d40f00566d5e26, 0x5d836d6e2958d6b5, 138 | 0x6c056608b7d9c1b6, 0x288db0e1124b14a0, 0x8fb946504faa6c9d, 139 | 0x0b9471bdb8f19d32, 0xfd1fe27d144a09e0, 0x8943a9464540251c, 140 | 0x8048f217633fce36, 0xea6ac458da141bda, 0x4334b8b02ff7612f, 141 | 0xfeda1384ade74d31, 0x096d119a3605c85b, 0xdbc8441f5227e216, 142 | 0x541ad7efa6ddc1d3}; 143 | EngRanden engine; 144 | for (size_t i = 0; i < kNumOutputs; ++i) { 145 | ASSERT_TRUE(golden[i] == engine()); 146 | } 147 | #endif 148 | } 149 | 150 | #endif // ENABLE_VERIFY 151 | 152 | void VerifyRandReqEngine() { 153 | // Validates that Randen satisfies [rand.req.engine]. 154 | // Names after definition of [rand.req.engine] in C++ standard. 
155 | // e is a value of E 156 | // v is a lvalue of E 157 | // x, y are possibly const values of E 158 | // s is a value of T 159 | // q is a value satisfying requirements of seed_sequence 160 | // z is a value of type unsigned long long 161 | // os is a some specialization of basic_ostream 162 | // is is a some specialization of basic_istream 163 | 164 | using E = EngRanden; 165 | using T = typename EngRanden::result_type; 166 | 167 | static_assert(std::is_copy_constructible::value, 168 | "Randen must be copy constructible"); 169 | 170 | static_assert(std::is_copy_assignable::value, 171 | "Randen must be copy assignable"); 172 | 173 | E e, v; 174 | const E x, y; 175 | T s = 1; 176 | std::seed_seq q{1, 2, 3}; 177 | unsigned long long z = 1; // NOLINT(runtime/int) 178 | std::wostringstream os; 179 | std::wistringstream is; 180 | 181 | E{}; 182 | E{x}; 183 | E{s}; 184 | E{q}; 185 | 186 | // Verify that seed() and default-construct is identical. 187 | e.seed(); 188 | { 189 | E f; 190 | ASSERT_TRUE(e == f); 191 | } 192 | 193 | // Verify the seed() result type. 194 | static_assert(std::is_same::value, 195 | "return type of seed() must be void"); 196 | 197 | static_assert(std::is_same::value, 198 | "return type of seed() must be void"); 199 | 200 | // verify that seed via seed_sequence and construct via seed_sequence 201 | // is identical. 202 | e.seed(q); 203 | { 204 | E f{q}; 205 | ASSERT_TRUE(e == f); 206 | } 207 | 208 | // Verify the operator() result type. 209 | static_assert(std::is_same::value, 210 | "return type of operator() must be result_type"); 211 | 212 | // Verify that once the state has advanced that the engines 213 | // are no longer equal. 214 | e(); 215 | { 216 | E f{q}; 217 | ASSERT_TRUE(e != f); 218 | } 219 | 220 | { 221 | E f; 222 | ASSERT_TRUE(e != f); 223 | } 224 | 225 | // Verify discard. 226 | e.discard(z); 227 | { 228 | // The state equivalence should change. 
229 | E f, g; 230 | f.discard(2); 231 | ASSERT_TRUE(f != g); 232 | 233 | g(); 234 | g(); 235 | ASSERT_TRUE(f == g); 236 | } 237 | 238 | // Verify operator == result types. 239 | static_assert(std::is_same::value, 240 | "return type of operator== must be bool"); 241 | 242 | static_assert(std::is_same::value, 243 | "return type of operator!= must be bool"); 244 | 245 | // Verify operator<<() result. 246 | { 247 | auto& os2 = (os << e); 248 | ASSERT_TRUE(&os2 == &os); 249 | } 250 | 251 | // Verify operator>>() result. 252 | { 253 | auto& is2 = (is >> e); 254 | ASSERT_TRUE(&is2 == &is); 255 | } 256 | } 257 | 258 | void VerifyStreamOperators() { 259 | EngRanden engine1(171); 260 | EngRanden engine2; 261 | 262 | { 263 | std::stringstream stream; 264 | stream << engine1; 265 | stream >> engine2; 266 | } 267 | 268 | const int N = 56; // two buffer's worth 269 | for (int i = 0; i < N; ++i) { 270 | const uint64_t r1 = engine1(); 271 | const uint64_t r2 = engine2(); 272 | ASSERT_TRUE(r1 == r2); 273 | } 274 | } 275 | 276 | void Verify() { 277 | #if ENABLE_VERIFY 278 | VerifyReseedChangesAllValues(); 279 | VerifyDiscard(); 280 | VerifyGolden(); 281 | VerifyRandReqEngine(); 282 | VerifyStreamOperators(); 283 | #endif 284 | } 285 | 286 | void DumpOutput() { 287 | #if ENABLE_DUMP 288 | const size_t kNumOutputs = 1500 * 1000 * 1000; 289 | std::vector outputs(kNumOutputs); 290 | EngRanden engine; 291 | for (size_t i = 0; i < kNumOutputs; ++i) { 292 | outputs[i] = engine(); 293 | } 294 | 295 | FILE* f = fopen("/tmp/randen.bin", "wb"); 296 | if (f != nullptr) { 297 | fwrite(outputs.data(), kNumOutputs, 8, f); 298 | fclose(f); 299 | } 300 | #endif // ENABLE_DUMP 301 | } 302 | 303 | void RunAll() { 304 | // Immediately output any results (for non-local runs). 
305 | setvbuf(stdout, nullptr, _IONBF, 0); 306 | 307 | Verify(); 308 | DumpOutput(); 309 | } 310 | 311 | } // namespace 312 | } // namespace randen 313 | 314 | int main(int argc, char* argv[]) { 315 | randen::RunAll(); 316 | return 0; 317 | } 318 | -------------------------------------------------------------------------------- /third_party/pcg_random/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /third_party/pcg_random/include/pcg_extras.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * PCG Random Number Generation for C++ 3 | * 4 | * Copyright 2014 Melissa O'Neill 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | * 18 | * For additional information about the PCG random number generation scheme, 19 | * including its license and other licensing options, visit 20 | * 21 | * http://www.pcg-random.org 22 | */ 23 | 24 | /* 25 | * This file provides support code that is useful for random-number generation 26 | * but not specific to the PCG generation scheme, including: 27 | * - 128-bit int support for platforms where it isn't available natively 28 | * - bit twiddling operations 29 | * - I/O of 128-bit and 8-bit integers 30 | * - Handling the evilness of SeedSeq 31 | * - Support for efficiently producing random numbers less than a given 32 | * bound 33 | */ 34 | 35 | #ifndef PCG_EXTRAS_HPP_INCLUDED 36 | #define PCG_EXTRAS_HPP_INCLUDED 1 37 | 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #ifdef __GNUC__ 52 | #include 53 | #endif 54 | 55 | /* 56 | * Abstractions for compiler-specific directives 57 | */ 58 | 59 | #ifdef __GNUC__ 60 | #define PCG_NOINLINE __attribute__((noinline)) 61 | #define PCG_INLINE __attribute__((always_inline)) 62 | #else 63 | #define PCG_NOINLINE 64 | #define PCG_INLINE 65 | #endif 66 | 67 | /* 68 | * Some members of the PCG library use 128-bit math. When compiling on 64-bit 69 | * platforms, both GCC and Clang provide 128-bit integer types that are ideal 70 | * for the job. 71 | * 72 | * On 32-bit platforms (or with other compilers), we fall back to a C++ 73 | * class that provides 128-bit unsigned integers instead. It may seem 74 | * like we're reinventing the wheel here, because libraries already exist 75 | * that support large integers, but most existing libraries provide a very 76 | * generic multiprecision code, but here we're operating at a fixed size. 77 | * Also, most other libraries are fairly heavyweight. So we use a direct 78 | * implementation. 
Sadly, it's much slower than hand-coded assembly or 79 | * direct CPU support. 80 | * 81 | */ 82 | #if __SIZEOF_INT128__ 83 | namespace pcg_extras { 84 | typedef __uint128_t pcg128_t; 85 | } 86 | #define PCG_128BIT_CONSTANT(high,low) \ 87 | ((pcg128_t(high) << 64) + low) 88 | #else 89 | #include "pcg_uint128.hpp" 90 | namespace pcg_extras { 91 | typedef pcg_extras::uint_x4 pcg128_t; 92 | } 93 | #define PCG_128BIT_CONSTANT(high,low) \ 94 | pcg128_t(high,low) 95 | #define PCG_EMULATED_128BIT_MATH 1 96 | #endif 97 | 98 | 99 | // google3 crosstool consistently fails to recognize rotr / rotl methods as 100 | // hardware rotations, so force it to use inlined assembly. 101 | // TODO(ahh): switch *everything* to wg21.link/P0553 when that's an option. 102 | #define PCG_USE_INLINE_ASM 1 103 | 104 | namespace pcg_extras { 105 | 106 | /* 107 | * We often need to represent a "number of bits". When used normally, these 108 | * numbers are never greater than 128, so an unsigned char is plenty. 109 | * If you're using a nonstandard generator of a larger size, you can set 110 | * PCG_BITCOUNT_T to have it define it as a larger size. (Some compilers 111 | * might produce faster code if you set it to an unsigned int.) 112 | */ 113 | 114 | #ifndef PCG_BITCOUNT_T 115 | typedef uint8_t bitcount_t; 116 | #else 117 | typedef PCG_BITCOUNT_T bitcount_t; 118 | #endif 119 | 120 | /* 121 | * C++ requires us to be able to serialize RNG state by printing or reading 122 | * it from a stream. Because we use 128-bit ints, we also need to be able 123 | * ot print them, so here is code to do so. 124 | * 125 | * This code provides enough functionality to print 128-bit ints in decimal 126 | * and zero-padded in hex. It's not a full-featured implementation. 
127 | */ 128 | 129 | template 130 | std::basic_ostream& 131 | operator<<(std::basic_ostream& out, pcg128_t value) 132 | { 133 | auto desired_base = out.flags() & out.basefield; 134 | bool want_hex = desired_base == out.hex; 135 | 136 | if (want_hex) { 137 | uint64_t highpart = uint64_t(value >> 64); 138 | uint64_t lowpart = uint64_t(value); 139 | auto desired_width = out.width(); 140 | if (desired_width > 16) { 141 | out.width(desired_width - 16); 142 | } 143 | if (highpart != 0 || desired_width > 16) 144 | out << highpart; 145 | CharT oldfill = '\0'; 146 | if (highpart != 0) { 147 | out.width(16); 148 | oldfill = out.fill('0'); 149 | } 150 | auto oldflags = out.setf(decltype(desired_base){}, out.showbase); 151 | out << lowpart; 152 | out.setf(oldflags); 153 | if (highpart != 0) { 154 | out.fill(oldfill); 155 | } 156 | return out; 157 | } 158 | constexpr size_t MAX_CHARS_128BIT = 40; 159 | 160 | char buffer[MAX_CHARS_128BIT]; 161 | char* pos = buffer+sizeof(buffer); 162 | *(--pos) = '\0'; 163 | constexpr auto BASE = pcg128_t(10ULL); 164 | do { 165 | auto div = value / BASE; 166 | auto mod = uint32_t(value - (div * BASE)); 167 | *(--pos) = '0' + char(mod); 168 | value = div; 169 | } while(value != pcg128_t(0ULL)); 170 | return out << pos; 171 | } 172 | 173 | template 174 | std::basic_istream& 175 | operator>>(std::basic_istream& in, pcg128_t& value) 176 | { 177 | typename std::basic_istream::sentry s(in); 178 | 179 | if (!s) 180 | return in; 181 | 182 | constexpr auto BASE = pcg128_t(10ULL); 183 | pcg128_t current(0ULL); 184 | bool did_nothing = true; 185 | bool overflow = false; 186 | for(;;) { 187 | CharT wide_ch = in.get(); 188 | if (!in.good()) 189 | break; 190 | auto ch = in.narrow(wide_ch, '\0'); 191 | if (ch < '0' || ch > '9') { 192 | in.unget(); 193 | break; 194 | } 195 | did_nothing = false; 196 | pcg128_t digit(uint32_t(ch - '0')); 197 | pcg128_t timesbase = current*BASE; 198 | overflow = overflow || timesbase < current; 199 | current = timesbase + digit; 
200 | overflow = overflow || current < digit; 201 | } 202 | 203 | if (did_nothing || overflow) { 204 | in.setstate(std::ios::failbit); 205 | if (overflow) 206 | current = ~pcg128_t(0ULL); 207 | } 208 | 209 | value = current; 210 | 211 | return in; 212 | } 213 | 214 | /* 215 | * Likewise, if people use tiny rngs, we'll be serializing uint8_t. 216 | * If we just used the provided IO operators, they'd read/write chars, 217 | * not ints, so we need to define our own. We *can* redefine this operator 218 | * here because we're in our own namespace. 219 | */ 220 | 221 | template 222 | std::basic_ostream& 223 | operator<<(std::basic_ostream&out, uint8_t value) 224 | { 225 | return out << uint32_t(value); 226 | } 227 | 228 | template 229 | std::basic_istream& 230 | operator>>(std::basic_istream& in, uint8_t& target) 231 | { 232 | uint32_t value = 0xdecea5edU; 233 | in >> value; 234 | if (!in && value == 0xdecea5edU) 235 | return in; 236 | if (value > uint8_t(~0)) { 237 | in.setstate(std::ios::failbit); 238 | value = ~0U; 239 | } 240 | target = uint8_t(value); 241 | return in; 242 | } 243 | 244 | /* Unfortunately, the above functions don't get found in preference to the 245 | * built in ones, so we create some more specific overloads that will. 246 | * Ugh. 247 | */ 248 | 249 | inline std::ostream& operator<<(std::ostream& out, uint8_t value) 250 | { 251 | return pcg_extras::operator<< (out, value); 252 | } 253 | 254 | inline std::istream& operator>>(std::istream& in, uint8_t& value) 255 | { 256 | return pcg_extras::operator>> (in, value); 257 | } 258 | 259 | 260 | 261 | /* 262 | * Useful bitwise operations. 263 | */ 264 | 265 | /* 266 | * XorShifts are invertable, but they are someting of a pain to invert. 267 | * This function backs them out. It's used by the whacky "inside out" 268 | * generator defined later. 
269 | */ 270 | 271 | template 272 | inline itype unxorshift(itype x, bitcount_t bits, bitcount_t shift) 273 | { 274 | if (2*shift >= bits) { 275 | return x ^ (x >> shift); 276 | } 277 | itype lowmask1 = (itype(1U) << (bits - shift*2)) - 1; 278 | itype highmask1 = ~lowmask1; 279 | itype top1 = x; 280 | itype bottom1 = x & lowmask1; 281 | top1 ^= top1 >> shift; 282 | top1 &= highmask1; 283 | x = top1 | bottom1; 284 | itype lowmask2 = (itype(1U) << (bits - shift)) - 1; 285 | itype bottom2 = x & lowmask2; 286 | bottom2 = unxorshift(bottom2, bits - shift, shift); 287 | bottom2 &= lowmask1; 288 | return top1 | bottom2; 289 | } 290 | 291 | /* 292 | * Rotate left and right. 293 | * 294 | * In ideal world, compilers would spot idiomatic rotate code and convert it 295 | * to a rotate instruction. Of course, opinions vary on what the correct 296 | * idiom is and how to spot it. For clang, sometimes it generates better 297 | * (but still crappy) code if you define PCG_USE_ZEROCHECK_ROTATE_IDIOM. 298 | */ 299 | 300 | template 301 | inline itype rotl(itype value, bitcount_t rot) 302 | { 303 | constexpr bitcount_t bits = sizeof(itype) * 8; 304 | constexpr bitcount_t mask = bits - 1; 305 | #if PCG_USE_ZEROCHECK_ROTATE_IDIOM 306 | return rot ? (value << rot) | (value >> (bits - rot)) : value; 307 | #else 308 | return (value << rot) | (value >> ((- rot) & mask)); 309 | #endif 310 | } 311 | 312 | template 313 | inline itype rotr(itype value, bitcount_t rot) 314 | { 315 | constexpr bitcount_t bits = sizeof(itype) * 8; 316 | constexpr bitcount_t mask = bits - 1; 317 | #if PCG_USE_ZEROCHECK_ROTATE_IDIOM 318 | return rot ? (value >> rot) | (value << (bits - rot)) : value; 319 | #else 320 | return (value >> rot) | (value << ((- rot) & mask)); 321 | #endif 322 | } 323 | 324 | /* Unfortunately, both Clang and GCC sometimes perform poorly when it comes 325 | * to properly recognizing idiomatic rotate code, so for we also provide 326 | * assembler directives (enabled with PCG_USE_INLINE_ASM). 
Boo, hiss. 327 | * (I hope that these compilers get better so that this code can die.) 328 | * 329 | * These overloads will be preferred over the general template code above. 330 | */ 331 | 332 | #if PCG_USE_INLINE_ASM && __GNUC__ && (__x86_64__ || __i386__) 333 | 334 | inline uint8_t rotr(uint8_t value, bitcount_t rot) 335 | { 336 | asm ("rorb %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); 337 | return value; 338 | } 339 | 340 | inline uint16_t rotr(uint16_t value, bitcount_t rot) 341 | { 342 | asm ("rorw %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); 343 | return value; 344 | } 345 | 346 | inline uint32_t rotr(uint32_t value, bitcount_t rot) 347 | { 348 | asm ("rorl %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); 349 | return value; 350 | } 351 | 352 | #if __x86_64__ 353 | inline uint64_t rotr(uint64_t value, bitcount_t rot) 354 | { 355 | asm ("rorq %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); 356 | return value; 357 | } 358 | #endif // __x86_64__ 359 | 360 | #endif // PCG_USE_INLINE_ASM 361 | 362 | 363 | /* 364 | * The C++ SeedSeq concept (modelled by seed_seq) can fill an array of 365 | * 32-bit integers with seed data, but sometimes we want to produce 366 | * larger or smaller integers. 367 | * 368 | * The following code handles this annoyance. 369 | * 370 | * uneven_copy will copy an array of 32-bit ints to an array of larger or 371 | * smaller ints (actually, the code is general it only needing forward 372 | * iterators). The copy is identical to the one that would be performed if 373 | * we just did memcpy on a standard little-endian machine, but works 374 | * regardless of the endian of the machine (or the weirdness of the ints 375 | * involved). 376 | * 377 | * generate_to initializes an array of integers using a SeedSeq 378 | * object. It is given the size as a static constant at compile time and 379 | * tries to avoid memory allocation. If we're filling in 32-bit constants 380 | * we just do it directly. 
If we need a separate buffer and it's small, 381 | * we allocate it on the stack. Otherwise, we fall back to heap allocation. 382 | * Ugh. 383 | * 384 | * generate_one produces a single value of some integral type using a 385 | * SeedSeq object. 386 | */ 387 | 388 | /* uneven_copy helper, case where destination ints are less than 32 bit. */ 389 | 390 | template 391 | SrcIter uneven_copy_impl( 392 | SrcIter src_first, DestIter dest_first, DestIter dest_last, 393 | std::true_type) 394 | { 395 | typedef typename std::iterator_traits::value_type src_t; 396 | typedef typename std::iterator_traits::value_type dest_t; 397 | 398 | constexpr bitcount_t SRC_SIZE = sizeof(src_t); 399 | constexpr bitcount_t DEST_SIZE = sizeof(dest_t); 400 | constexpr bitcount_t DEST_BITS = DEST_SIZE * 8; 401 | constexpr bitcount_t SCALE = SRC_SIZE / DEST_SIZE; 402 | 403 | size_t count = 0; 404 | src_t value = 0; 405 | 406 | while (dest_first != dest_last) { 407 | if ((count++ % SCALE) == 0) 408 | value = *src_first++; // Get more bits 409 | else 410 | value >>= DEST_BITS; // Move down bits 411 | 412 | *dest_first++ = dest_t(value); // Truncates, ignores high bits. 413 | } 414 | return src_first; 415 | } 416 | 417 | /* uneven_copy helper, case where destination ints are more than 32 bit. 
*/ 418 | 419 | template 420 | SrcIter uneven_copy_impl( 421 | SrcIter src_first, DestIter dest_first, DestIter dest_last, 422 | std::false_type) 423 | { 424 | typedef typename std::iterator_traits::value_type src_t; 425 | typedef typename std::iterator_traits::value_type dest_t; 426 | 427 | constexpr auto SRC_SIZE = sizeof(src_t); 428 | constexpr auto SRC_BITS = SRC_SIZE * 8; 429 | constexpr auto DEST_SIZE = sizeof(dest_t); 430 | constexpr auto SCALE = (DEST_SIZE+SRC_SIZE-1) / SRC_SIZE; 431 | 432 | while (dest_first != dest_last) { 433 | dest_t value(0UL); 434 | unsigned int shift = 0; 435 | 436 | for (size_t i = 0; i < SCALE; ++i) { 437 | value |= dest_t(*src_first++) << shift; 438 | shift += SRC_BITS; 439 | } 440 | 441 | *dest_first++ = value; 442 | } 443 | return src_first; 444 | } 445 | 446 | /* uneven_copy, call the right code for larger vs. smaller */ 447 | 448 | template 449 | inline SrcIter uneven_copy(SrcIter src_first, 450 | DestIter dest_first, DestIter dest_last) 451 | { 452 | typedef typename std::iterator_traits::value_type src_t; 453 | typedef typename std::iterator_traits::value_type dest_t; 454 | 455 | constexpr bool DEST_IS_SMALLER = sizeof(dest_t) < sizeof(src_t); 456 | 457 | return uneven_copy_impl(src_first, dest_first, dest_last, 458 | std::integral_constant{}); 459 | } 460 | 461 | /* generate_to, fill in a fixed-size array of integral type using a SeedSeq 462 | * (actually works for any random-access iterator) 463 | */ 464 | 465 | template 466 | inline void generate_to_impl(SeedSeq&& generator, DestIter dest, 467 | std::true_type) 468 | { 469 | generator.generate(dest, dest+size); 470 | } 471 | 472 | template 473 | void generate_to_impl(SeedSeq&& generator, DestIter dest, 474 | std::false_type) 475 | { 476 | typedef typename std::iterator_traits::value_type dest_t; 477 | constexpr auto DEST_SIZE = sizeof(dest_t); 478 | constexpr auto GEN_SIZE = sizeof(uint32_t); 479 | 480 | constexpr bool GEN_IS_SMALLER = GEN_SIZE < DEST_SIZE; 481 | constexpr 
size_t FROM_ELEMS = 482 | GEN_IS_SMALLER 483 | ? size * ((DEST_SIZE+GEN_SIZE-1) / GEN_SIZE) 484 | : (size + (GEN_SIZE / DEST_SIZE) - 1) 485 | / ((GEN_SIZE / DEST_SIZE) + GEN_IS_SMALLER); 486 | // this odd code ^^^^^^^^^^^^^^^^^ is work-around for 487 | // a bug: http://llvm.org/bugs/show_bug.cgi?id=21287 488 | 489 | if (FROM_ELEMS <= 1024) { 490 | uint32_t buffer[FROM_ELEMS]; 491 | generator.generate(buffer, buffer+FROM_ELEMS); 492 | uneven_copy(buffer, dest, dest+size); 493 | } else { 494 | uint32_t* buffer = static_cast(malloc(GEN_SIZE * FROM_ELEMS)); 495 | generator.generate(buffer, buffer+FROM_ELEMS); 496 | uneven_copy(buffer, dest, dest+size); 497 | free(static_cast(buffer)); 498 | } 499 | } 500 | 501 | template 502 | inline void generate_to(SeedSeq&& generator, DestIter dest) 503 | { 504 | typedef typename std::iterator_traits::value_type dest_t; 505 | constexpr bool IS_32BIT = sizeof(dest_t) == sizeof(uint32_t); 506 | 507 | generate_to_impl(std::forward(generator), dest, 508 | std::integral_constant{}); 509 | } 510 | 511 | /* generate_one, produce a value of integral type using a SeedSeq 512 | * (optionally, we can have it produce more than one and pick which one 513 | * we want) 514 | */ 515 | 516 | template 517 | inline UInt generate_one(SeedSeq&& generator) 518 | { 519 | UInt result[N]; 520 | generate_to(std::forward(generator), result); 521 | return result[i]; 522 | } 523 | 524 | template 525 | auto bounded_rand(RngType& rng, typename RngType::result_type upper_bound) 526 | -> typename RngType::result_type 527 | { 528 | typedef typename RngType::result_type rtype; 529 | rtype threshold = (RngType::max() - RngType::min() + rtype(1) - upper_bound) 530 | % upper_bound; 531 | for (;;) { 532 | rtype r = rng() - RngType::min(); 533 | if (r >= threshold) 534 | return r % upper_bound; 535 | } 536 | } 537 | 538 | template 539 | void shuffle(Iter from, Iter to, RandType&& rng) 540 | { 541 | typedef typename std::iterator_traits::difference_type delta_t; 542 | 
typedef typename std::remove_reference::type::result_type result_t; 543 | auto count = to - from; 544 | while (count > 1) { 545 | delta_t chosen = delta_t(bounded_rand(rng, result_t(count))); 546 | --count; 547 | --to; 548 | using std::swap; 549 | swap(*(from + chosen), *to); 550 | } 551 | } 552 | 553 | /* 554 | * Although std::seed_seq is useful, it isn't everything. Often we want to 555 | * initialize a random-number generator some other way, such as from a random 556 | * device. 557 | * 558 | * Technically, it does not meet the requirements of a SeedSequence because 559 | * it lacks some of the rarely-used member functions (some of which would 560 | * be impossible to provide). However the C++ standard is quite specific 561 | * that actual engines only called the generate method, so it ought not to be 562 | * a problem in practice. 563 | */ 564 | 565 | template 566 | class seed_seq_from { 567 | private: 568 | RngType rng_; 569 | 570 | typedef uint_least32_t result_type; 571 | 572 | public: 573 | template 574 | seed_seq_from(Args&&... args) : 575 | rng_(std::forward(args)...) 576 | { 577 | // Nothing (else) to do... 578 | } 579 | 580 | template 581 | void generate(Iter start, Iter finish) 582 | { 583 | for (auto i = start; i != finish; ++i) 584 | *i = result_type(rng_()); 585 | } 586 | 587 | constexpr size_t size() const 588 | { 589 | return (sizeof(typename RngType::result_type) > sizeof(result_type) 590 | && RngType::max() > ~size_t(0UL)) 591 | ? ~size_t(0UL) 592 | : size_t(RngType::max()); 593 | } 594 | }; 595 | 596 | /* 597 | * Sometimes you might want a distinct seed based on when the program 598 | * was compiled. That way, a particular instance of the program will 599 | * behave the same way, but when recompiled it'll produce a different 600 | * value. 601 | */ 602 | 603 | template 604 | struct static_arbitrary_seed { 605 | private: 606 | static constexpr IntType fnv(IntType hash, const char* pos) { 607 | return *pos == '\0' 608 | ? 
hash 609 | : fnv((hash * IntType(16777619U)) ^ *pos, (pos+1)); 610 | } 611 | 612 | public: 613 | static constexpr IntType value = fnv(IntType(2166136261U ^ sizeof(IntType)), 614 | __DATE__ __TIME__ __FILE__); 615 | }; 616 | 617 | // Sometimes, when debugging or testing, it's handy to be able print the name 618 | // of a (in human-readable form). This code allows the idiom: 619 | // 620 | // cout << printable_typename() 621 | // 622 | // to print out my_foo_type_t (or its concrete type if it is a synonym) 623 | 624 | template 625 | struct printable_typename {}; 626 | 627 | template 628 | std::ostream& operator<<(std::ostream& out, printable_typename) { 629 | const char *implementation_typename = typeid(T).name(); 630 | #ifdef __GNUC__ 631 | int status; 632 | char* pretty_name = 633 | abi::__cxa_demangle(implementation_typename, NULL, NULL, &status); 634 | if (status == 0) 635 | out << pretty_name; 636 | free(static_cast(pretty_name)); 637 | if (status == 0) 638 | return out; 639 | #endif 640 | out << implementation_typename; 641 | return out; 642 | } 643 | 644 | } // namespace pcg_extras 645 | 646 | #endif // PCG_EXTRAS_HPP_INCLUDED 647 | -------------------------------------------------------------------------------- /util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 

#ifndef UTIL_H_
#define UTIL_H_

#include <stdint.h>  // uint64_t
#include <stdio.h>   // printf
#include <stdlib.h>  // abort

#ifdef _MSC_VER
#include <intrin.h>  // __lzcnt64
#endif

// Prints the failing line and aborts if "condition" is false.
#define RANDEN_CHECK(condition)                            \
  do {                                                     \
    if (!(condition)) {                                    \
      printf("Assertion failed on line %d\n", __LINE__);   \
      abort();                                             \
    }                                                      \
  } while (false)

namespace randen {

// Returns the number of leading zero bits of "x"; precondition: "x" != 0.
static inline int NumZeroBitsAboveMSBNonzero(const uint64_t x) {
#ifdef _MSC_VER
  return static_cast<int>(__lzcnt64(x));  // WARNING: requires BMI2
#else
  return __builtin_clzll(x);
#endif
}

}  // namespace randen

#endif  // UTIL_H_

// ----------------------------------------------------------------------------
// /vector128.h:
// ----------------------------------------------------------------------------

// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Wrappers for platform-specific 128-bit vectors.
16 | #ifndef VECTOR128_H_ 17 | #define VECTOR128_H_ 18 | 19 | #include // uint64_t 20 | 21 | #if defined(__SSE2__) && defined(__AES__) 22 | 23 | #define RANDEN_AESNI 1 24 | #include 25 | 26 | #elif defined(__powerpc__) && defined(__VSX__) 27 | 28 | #define RANDEN_PPC 1 29 | #define RANDEN_BIG_ENDIAN 1 30 | #include 31 | 32 | #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_CRYPTO) 33 | 34 | #define RANDEN_ARM 1 35 | #include 36 | 37 | #else 38 | #error "Port" 39 | #endif 40 | 41 | #if defined(__clang__) || defined(__GNUC__) 42 | #define RANDEN_INLINE inline __attribute__((always_inline)) 43 | #define RANDEN_RESTRICT __restrict__ 44 | #else 45 | #define RANDEN_INLINE 46 | #define RANDEN_RESTRICT 47 | #endif 48 | 49 | namespace randen { 50 | 51 | #ifdef RANDEN_AESNI 52 | 53 | class V { 54 | public: 55 | RANDEN_INLINE V() {} // Leaves v_ uninitialized. 56 | RANDEN_INLINE V& operator=(const V other) { 57 | raw_ = other.raw_; 58 | return *this; 59 | } 60 | 61 | // Convert from/to intrinsics. 62 | RANDEN_INLINE explicit V(const __m128i raw) : raw_(raw) {} 63 | __m128i raw() const { return raw_; } 64 | 65 | RANDEN_INLINE V& operator^=(const V other) { 66 | raw_ = _mm_xor_si128(raw_, other.raw_); 67 | return *this; 68 | } 69 | 70 | private: 71 | // Note: this wrapper is faster than using __m128i directly. 72 | __m128i raw_; 73 | }; 74 | 75 | #elif defined(RANDEN_PPC) 76 | 77 | // Already provides operator^=. 78 | using V = vector unsigned long long; 79 | 80 | #elif defined(RANDEN_ARM) 81 | 82 | // Already provides operator^=. 83 | using V = uint8x16_t; 84 | 85 | #else 86 | #error "Port" 87 | #endif 88 | 89 | constexpr int kLanes = sizeof(V) / sizeof(uint64_t); 90 | 91 | // On big-endian platforms, byte-swap constants (e.g. round keys) to ensure 92 | // results match little-endian platforms. 
93 | #ifdef RANDEN_BIG_ENDIAN 94 | #define RANDEN_LE(a, b) __builtin_bswap64(b), __builtin_bswap64(a) 95 | #else 96 | #define RANDEN_LE(a, b) a, b 97 | #endif 98 | 99 | #ifdef RANDEN_BIG_ENDIAN 100 | static RANDEN_INLINE V ReverseBytes(const V v) { 101 | // Reverses the bytes of the vector. 102 | const vector unsigned char perm = {15, 14, 13, 12, 11, 10, 9, 8, 103 | 7, 6, 5, 4, 3, 2, 1, 0}; 104 | return vec_perm(v, v, perm); 105 | } 106 | #endif 107 | 108 | // WARNING: these load/store in native byte order. It is OK to load and then 109 | // store an unchanged vector, but interpreting the bits as a number or input 110 | // to AES will have platform-dependent results. Call ReverseBytes after load 111 | // and/or before store #ifdef RANDEN_BIG_ENDIAN. 112 | 113 | static RANDEN_INLINE V Load(const uint64_t* RANDEN_RESTRICT lanes, 114 | const int block) { 115 | #ifdef RANDEN_AESNI 116 | const uint64_t* RANDEN_RESTRICT from = lanes + block * kLanes; 117 | return V(_mm_load_si128(reinterpret_cast(from))); 118 | #elif defined(RANDEN_PPC) 119 | const V* RANDEN_RESTRICT from = 120 | reinterpret_cast(lanes + block * kLanes); 121 | return vec_vsx_ld(0, from); 122 | #elif defined(RANDEN_ARM) 123 | const uint8_t* RANDEN_RESTRICT from = 124 | reinterpret_cast(lanes + block * kLanes); 125 | return vld1q_u8(from); 126 | #else 127 | #error "Port" 128 | #endif 129 | } 130 | 131 | static RANDEN_INLINE void Store(const V v, uint64_t* RANDEN_RESTRICT lanes, 132 | const int block) { 133 | #ifdef RANDEN_AESNI 134 | uint64_t* RANDEN_RESTRICT to = lanes + block * kLanes; 135 | _mm_store_si128(reinterpret_cast<__m128i * RANDEN_RESTRICT>(to), v.raw()); 136 | #elif defined(RANDEN_PPC) 137 | V* RANDEN_RESTRICT to = reinterpret_cast(lanes + block * kLanes); 138 | vec_vsx_st(v, 0, to); 139 | #elif defined(RANDEN_ARM) 140 | uint8_t* RANDEN_RESTRICT to = 141 | reinterpret_cast(lanes + block * kLanes); 142 | vst1q_u8(to, v); 143 | #else 144 | #error "Port" 145 | #endif 146 | } 147 | 148 | // One 
round of AES. "round_key" is a public constant for breaking the 149 | // symmetry of AES (ensures previously equal columns differ afterwards). 150 | static RANDEN_INLINE V AES(const V state, const V round_key) { 151 | #ifdef RANDEN_AESNI 152 | // It is important to always use the full round function - omitting the 153 | // final MixColumns reduces security [https://eprint.iacr.org/2010/041.pdf] 154 | // and does not help because we never decrypt. 155 | return V(_mm_aesenc_si128(state.raw(), round_key.raw())); 156 | #elif defined(RANDEN_PPC) 157 | return V(__builtin_crypto_vcipher(state, round_key)); 158 | #elif defined(RANDEN_ARM) 159 | return vaesmcq_u8(vaeseq_u8(state, round_key)); 160 | #else 161 | #error "Port" 162 | #endif 163 | } 164 | 165 | } // namespace randen 166 | 167 | #endif // VECTOR128_H_ 168 | -------------------------------------------------------------------------------- /vector128_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "vector128.h" 16 | 17 | #include 18 | #include 19 | 20 | namespace randen { 21 | namespace { 22 | 23 | #define ASSERT_TRUE(condition) \ 24 | while (!(condition)) { \ 25 | printf("Check failed at line %d\n", __LINE__); \ 26 | abort(); \ 27 | } 28 | 29 | void TestLoadStore() { 30 | const int N = 4; 31 | alignas(16) uint64_t test_cases[N * 2] = { 32 | 1, 2, 3, 4, 0x1234567890ABCDEFuLL, 0x2143658709BADCFEuLL}; 33 | 34 | alignas(16) uint64_t stored[N * 2]; 35 | for (int i = 0; i < N; ++i) { 36 | V v = Load(test_cases, i); 37 | Store(v, stored, i); 38 | 39 | ASSERT_TRUE(test_cases[2 * i + 0] == stored[2 * i + 0]); 40 | ASSERT_TRUE(test_cases[2 * i + 1] == stored[2 * i + 1]); 41 | } 42 | } 43 | 44 | void TestXor() { 45 | alignas(16) uint64_t test_cases[][3][2] = { 46 | {{1, 2}, {3, 4}, {2, 6}}, 47 | {{0x1234567890ABCDEFuLL, 0x2143658709BADCFEuLL}, 48 | {0x2143658709BADCFEuLL, 0x1234567890ABCDEFuLL}, 49 | {0x337733ff99111111uLL, 0x337733ff99111111uLL}}}; 50 | 51 | for (const auto& test_case : test_cases) { 52 | V v1 = Load(test_case[0], 0); 53 | V v2 = Load(test_case[1], 0); 54 | 55 | v1 ^= v2; 56 | alignas(16) uint64_t data_stored[2]; 57 | Store(v1, data_stored, 0); 58 | 59 | ASSERT_TRUE(test_case[2][0] == data_stored[0]); 60 | ASSERT_TRUE(test_case[2][1] == data_stored[1]); 61 | } 62 | } 63 | 64 | void TestAes() { 65 | // This test also catches byte-order bugs in Load/Store functions 66 | alignas(16) uint64_t message[2] = { 67 | RANDEN_LE(0x8899AABBCCDDEEFFuLL, 0x0123456789ABCDEFuLL)}; 68 | alignas(16) uint64_t key[2] = { 69 | RANDEN_LE(0x0022446688AACCEEuLL, 0x1133557799BBDDFFuLL)}; 70 | alignas(16) uint64_t expected_result[2] = { 71 | RANDEN_LE(0x28E4EE1884504333uLL, 0x16AB0E57DFC442EDuLL)}; 72 | 73 | V v_message = Load(message, 0); 74 | V v_key = Load(key, 0); 75 | V v_result = AES(v_message, v_key); 76 | 77 | alignas(16) uint64_t result[2]; 78 | Store(v_result, result, 0); 79 | 80 | ASSERT_TRUE(expected_result[0] == result[0]); 81 | 
ASSERT_TRUE(expected_result[1] == result[1]); 82 | } 83 | 84 | void RunAll() { 85 | // Immediately output any results (for non-local runs). 86 | setvbuf(stdout, nullptr, _IONBF, 0); 87 | 88 | TestLoadStore(); 89 | TestXor(); 90 | TestAes(); 91 | } 92 | 93 | } // namespace 94 | } // namespace randen 95 | 96 | int main(int argc, char* argv[]) { 97 | randen::RunAll(); 98 | return 0; 99 | } 100 | --------------------------------------------------------------------------------