├── .travis.yml
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── contrib
└── rust
│ ├── Cargo.toml
│ └── src
│ └── lib.rs
├── engine_chacha.h
├── engine_os.h
├── nanobenchmark.cc
├── nanobenchmark.h
├── nanobenchmark_test.cc
├── randen.cc
├── randen.h
├── randen_benchmark.cc
├── randen_test.cc
├── third_party
└── pcg_random
│ ├── LICENSE
│ └── include
│ ├── pcg_extras.hpp
│ └── pcg_random.hpp
├── util.h
├── vector128.h
└── vector128_test.cc
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: cpp
2 |
3 | dist: trusty
4 |
5 | compiler:
6 | - clang
7 | - gcc
8 |
9 | script:
10 | - make
11 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement. You (or your employer) retain the copyright to your contribution,
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 |
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 |
18 | ## Code reviews
19 |
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
204 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Build configuration. -maes/-mavx2 enable the AES-NI and AVX2 instructions
# presumably required by the vectorized implementation (vector128.h) — confirm
# against the minimum target (README mentions Westmere).
override CPPFLAGS += -I. -I../
override CXXFLAGS += -std=c++11 -Wall -O3 -fno-pic -mavx2 -maes
override LDFLAGS += $(CXXFLAGS)
override CXX = clang++

# Default target: every test and benchmark binary under bin/.
all: $(addprefix bin/, nanobenchmark_test randen_test randen_benchmark vector128_test)

# Compile one translation unit into obj/.
obj/%.o: %.cc
	@mkdir -p -- $(dir $@)
	$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@

# Link each binary against the shared nanobenchmark + randen objects.
bin/%: obj/%.o obj/nanobenchmark.o obj/randen.o
	@mkdir -p bin
	$(CXX) $(LDFLAGS) $^ -o $@

# Auto-generated header dependencies; delete partial output on failure.
.DELETE_ON_ERROR:
deps.mk: $(wildcard *.cc) $(wildcard *.h) Makefile
	set -eu; for file in *.cc; do \
	target=obj/$${file##*/}; target=$${target%.*}.o; \
	$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) -MM -MT \
	"$$target" "$$file"; \
	done >$@
# Leading '-': do not fail on a fresh checkout where deps.mk does not exist yet.
-include deps.mk

clean:
	[ ! -d obj ] || $(RM) -r -- obj/
	[ ! -d bin ] || $(RM) -r -- bin/
	[ ! -d lib ] || $(RM) -r -- lib/

.PHONY: clean all
31 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 |
3 | What if we could default to attack-resistant random generators without excessive
4 | CPU cost? We introduce 'Randen', a new generator with security guarantees; it
5 | outperforms MT19937, pcg64_c32, Philox, ISAAC and ChaCha8 in real-world
6 | benchmarks. This is made possible by AES hardware acceleration and a large
7 | Feistel permutation.
8 |
9 | ## Related work
10 |
11 | AES-CTR (encrypting a counter) is a well-known and easy to implement generator.
12 | It has two known weaknesses:
13 |
14 | - A known-key distinguisher on 10-round, 128-bit AES [https://goo.gl/3xReB9].
15 |
16 | - No forward security/backtracking resistance: compromising the current state
17 | lets attackers distinguish prior outputs from random.
18 |
19 | NIST 800-90a r1 [https://goo.gl/68Fwmv] is a standardized generator that ensures
20 | backtracking resistance, but is not fast enough for a general-purpose generator
21 | (5-10x slower than AES).
22 |
23 | ## Algorithm
24 |
25 | The Randen generator is based upon three existing components:
26 |
27 | 1) Reverie [https://eprint.iacr.org/2016/886.pdf] is a sponge-like generator
28 | that requires a cryptographic permutation. It improves upon "Provably Robust
29 | Sponge-Based PRNGs and KDFs" by achieving backtracking resistance with only
30 | a single permutation per buffer.
31 |
32 | 2) Simpira v2 [https://eprint.iacr.org/2016/122.pdf] constructs up to 1024-bit
33 | permutations using an improved Generalized Feistel network with 2-round
34 | AES-128 functions. This Feistel block shuffle achieves diffusion sooner and
35 | is less vulnerable to sliced-biclique attacks than a Type-2 cyclic shuffle.
36 |
37 | 3) "New criterion for diffusion property" [https://goo.gl/mLXH4f] shows that
38 | the same kind of improved Feistel block shuffle can be extended to 16
39 | branches, which enables a more efficient 2048-bit permutation.
40 |
41 | We combine these by plugging the larger Simpira-like permutation into Reverie.
42 |
43 | ## Performance
44 |
45 | The implementation targets x86 (Westmere), POWER 8 and ARM64.
46 |
47 | x86 microbenchmark: generating random bits in a tight loop
48 | (cpb=cycles per byte, MAD=median absolute deviation):
49 |
50 | RNG | cpb | MAD
51 | --- | --- | ---
52 | Randen | 1.54 | 0.002
53 | pcg64_c32 | 0.78 | 0.003
54 | mt19937_64 | 1.79 | 0.001
55 | ChaCha8 | 3.02 | 0.003
56 | ISAAC | 4.08 | 0.006
57 | Philox | 4.70 | 0.003
58 | /dev/urandom (ChaCha20) | 15.27 | 0.018
59 | BCryptGenRandom (CTR-DRBG) | 16.80 | 0.009
60 |
61 | x86 real-world benchmark (reservoir sampling):
62 |
63 | RNG | cpb | MAD
64 | --- | --- | ---
65 | Randen | 2.60 | 0.008
66 | pcg64_c32 | 3.03 | 0.009
67 | mt19937_64 | 2.82 | 0.009
68 | ChaCha8 | 3.75 | 0.008
69 | ISAAC | 4.46 | 0.014
70 | Philox | 4.95 | 0.009
71 | /dev/urandom (ChaCha20) | 13.46 | 0.017
72 | BCryptGenRandom (CTR-DRBG) | 16.41 | 0.015
73 |
74 | ## Security
75 |
76 | Randen is indistinguishable from random and backtracking-resistant. For more
77 | details and benchmarks, please see ["Randen - fast backtracking-resistant random
78 | generator with AES+Feistel+Reverie"](https://arxiv.org/abs/1810.02227).
79 |
80 | ## Usage
81 |
82 | `make && bin/randen_benchmark`
83 |
84 | Note that the code relies on compiler optimizations. Cycles per byte may
85 | increase by factors of 1.6 when compiled with GCC 7.3, and 1.3 with
86 | Clang 4.0.1. This can be mitigated by manually unrolling the loops.
87 |
88 | ## Third-party implementations / bindings
89 |
90 | Thanks to Frank Denis for making us aware of these third-party implementations
91 | or bindings. Note that the algorithm is still under review and subject to
92 | change, but please feel free to get in touch or raise an issue and we'll
93 | add yours as well.
94 |
95 | By | Language | URL
96 | --- | --- | ---
97 | Frank Denis | C | https://github.com/jedisct1/randen-rng
98 |
99 |
100 | This is not an official Google product.
101 |
--------------------------------------------------------------------------------
/contrib/rust/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "randen"
3 | version = "0.0.0"
4 | authors = ["Ruud van Asseldonk ", "Jan Wassenberg ", "Brendan Hickey "]
5 | license = "Apache-2.0"
6 | description = "Randen is a fast, backtracking resistant CSPRNG."
7 | repository = "https://github.com/google/randen"
8 | keywords = [ "Crypto", "rng", "random" ]
9 |
10 | [dependencies.rand]
11 | version = "0.5"
12 | features = ["i128_support"]
13 |
--------------------------------------------------------------------------------
/contrib/rust/src/lib.rs:
--------------------------------------------------------------------------------
1 | //! The Randen pseudorandom number generator.
2 |
3 | extern crate rand;
4 |
5 | use std::mem;
6 | use std::ops::BitXorAssign;
7 |
8 | use rand::{Error, FromEntropy, RngCore, SeedableRng};
9 | use std::arch::x86_64::{__m128i, _mm_aesenc_si128};
10 |
/// Size of the entire sponge / state for the Randen PRNG.
const STATE_LEN: usize = 16; // 256 bytes, 16x16 bytes.

/// Size of the "inner" (inaccessible) part of the sponge.
///
/// Larger values would require more frequent calls to `randen_generate`.
const CAPACITY: usize = 1; // 1x16 bytes.

/// Size of the default seed consumed by the sponge.
const SEED_LEN: usize = STATE_LEN - CAPACITY;
/// Seed size in bytes (15 lanes x 16 = 240).
const SEED_BYTES: usize = SEED_LEN * 16;

/// Total state size in bytes (256); PRNG output is read from byte
/// `CAPACITY_BYTES` up to this bound.
const STATE_BYTES: usize = STATE_LEN * 16;
/// Size in bytes of the inner (never exposed) part of the sponge.
const CAPACITY_BYTES: usize = CAPACITY * 16;

/// Number of rounds of the Feistel permutation in `permute`.
const FEISTEL_ROUNDS: usize = 17;
/// Feistel round functions computed per round; each consumes a pair of
/// lanes, so 8 functions cover all 16 state lanes.
const FEISTEL_FUNCTIONS: usize = 8;
/// Total round keys consumed per permutation (one per round function).
const ROUND_KEYS_LEN: usize = FEISTEL_ROUNDS * FEISTEL_FUNCTIONS;
29 |
/// Aligned 128 bits wrapper.
///
/// The forced 16-byte alignment matches `__m128i`, which this type is
/// transmuted to/from (see `U128A::from` / `U128A::m128i`) for the AES-NI
/// intrinsic.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[repr(align(16))]
pub struct U128A(u128);
34 |
35 | impl U128A {
36 | #[inline(always)]
37 | fn from(m128i: __m128i) -> U128A {
38 | unsafe { mem::transmute(m128i) }
39 | }
40 |
41 | #[inline(always)]
42 | fn m128i(self) -> __m128i {
43 | unsafe { mem::transmute(self) }
44 | }
45 | }
46 |
47 | impl BitXorAssign for U128A {
48 | fn bitxor_assign(&mut self, rhs: U128A) {
49 | self.0 ^= rhs.0;
50 | }
51 | }
52 |
53 | // "Nothing up my sleeve" numbers from the first hex digits of pi.
54 | //
55 | // Obtained from http://hexpi.sourceforge.net/. The array was generated by the
56 | // following Python script:
57 | /*
58 | python3 << EOF
59 | """Generates Randen round keys array from pi-hex.62500.txt file."""
60 | KEYS = 136
61 |
62 | def chunks(l, n):
63 | """Yield successive n-sized chunks from l."""
64 | for i in range(0, len(l), n):
65 | yield l[i:i + n]
66 |
67 | with open("pi-hex.62500.txt") as file:
68 | for key in chunks(file.read(KEYS * 32), 32):
69 | print(' U128A(0x{}{}),'.format(key[16:], key[:16]))
70 | EOF
71 | */
/// AES round keys for the Feistel permutation: 136 "nothing up my sleeve"
/// constants (one per round function, `FEISTEL_ROUNDS * FEISTEL_FUNCTIONS`)
/// derived from the hexadecimal digits of pi by the script in the comment
/// above. Each entry packs a 32-hex-digit chunk with its two 64-bit halves
/// swapped (the chunk's first 16 digits become the low half).
const ROUND_KEYS: [U128A; ROUND_KEYS_LEN] = [
    U128A(0x13198A2E03707344243F6A8885A308D3),
    U128A(0x082EFA98EC4E6C89A4093822299F31D0),
    U128A(0xBE5466CF34E90C6C452821E638D01377),
    U128A(0x3F84D5B5B5470917C0AC29B7C97C50DD),
    U128A(0xD1310BA698DFB5AC9216D5D98979FB1B),
    U128A(0xB8E1AFED6A267E962FFD72DBD01ADFB7),
    U128A(0x24A19947B3916CF7BA7C9045F12C7F99),
    U128A(0x636920D871574E690801F2E2858EFC16),
    U128A(0x0D95748F728EB658A458FEA3F4933D7E),
    U128A(0x7B54A41DC25A59B5718BCD5882154AEE),
    U128A(0xC5D1B023286085F09C30D5392AF26013),
    U128A(0x8E79DCB0603A180ECA417918B8DB38EF),
    U128A(0xD71577C1BD314B276C9E0E8BB01E8A3E),
    U128A(0xE65525F3AA55AB9478AF2FDA55605C60),
    U128A(0x55CA396A2AAB10B65748986263E81440),
    U128A(0xA15486AF7C72E993B4CC5C341141E8CE),
    U128A(0x2BA9C55D741831F6B3EE1411636FBC2A),
    U128A(0xAFD6BA336C24CF5CCE5C3E169B87931E),
    U128A(0x3B8F48986B4BB9AF7A32538128958677),
    U128A(0x61D809CCFB21A991C4BFE81B66282193),
    U128A(0xEF845D5DE98575B1487CAC605DEC8032),
    U128A(0x23893E81D396ACC5DC262302EB651B88),
    U128A(0x2E0B4482A48420040F6D6FF383F44239),
    U128A(0x21C66842F6E96C9A69C8F04A9E1F9B5E),
    U128A(0x6A51A0D2D8542F68670C9C61ABD388F0),
    U128A(0x6EEF0B6C137A3BE4960FA728AB5133A3),
    U128A(0xA1F1651D39AF0176BA3BF0507EFB2A98),
    U128A(0x8CEE8619456F9FB466CA593E82430E88),
    U128A(0xE06F75D885C120737D84A5C33B8B5EBE),
    U128A(0x4ED3AA62363F7706401A449F56C16AA6),
    U128A(0x37D0D724D00A12481BFEDF72429B023D),
    U128A(0x075372C980991B7BDB0FEAD349F1C09B),
    U128A(0xE3FE501AB6794C3B25D479D8F6E8DEF7),
    U128A(0xC1A94FB6409F60C4976CE0BD04C006BA),
    U128A(0x68FB6FAF3E6C53B55E5C9EC2196A2463),
    U128A(0x6DFC511F9B30952C1339B2EB3B52EC6F),
    U128A(0xBEE3D004DE334AFDCC814544AF5EBD09),
    U128A(0xC0CBA85745C8740F660F2807192E4BB3),
    U128A(0x5579C0BD1A60320AD20B5F39B9D3FBDB),
    U128A(0x679F25FEFB1FA3CCD6A100C6402C7279),
    U128A(0x3C7516DFFD616B158EA5E9F8DB3222F8),
    U128A(0x323DB5FAFD2387602F501EC8AD0552AB),
    U128A(0x9E5C57BBCA6F8CA053317B483E00DF82),
    U128A(0xD542A8F6287EFFC31A87562EDF1769DB),
    U128A(0x695B27B0BBCA58C8AC6732C68C4F5573),
    U128A(0x10FA3D98FD2183B8E1FFA35DB8F011A0),
    U128A(0x9A53E479B6F845654AFCB56C2DD1D35B),
    U128A(0xE1DDF2DAA4CB7E33D28E49BC4BFB9790),
    U128A(0xEF20CADA36774C0162FB1341CEE4C6E8),
    U128A(0x95DBDA4DAE909198D07E9EFE2BF11FB4),
    U128A(0xD08ED1D0AFC725E0EAAD8E716B93D5A0),
    U128A(0x8FF6E2FBF2122B648E3C5B2F8E7594B7),
    U128A(0x4FAD5EA0688FC31C8888B812900DF01C),
    U128A(0x2F2F2218BE0E1777D1CFF191B3A8C1AD),
    U128A(0xE5A0CC0FB56F74E8EA752DFE8B021FA1),
    U128A(0xB4A84FE0FD13E0B718ACF3D6CE89E299),
    U128A(0x165FA266809577057CC43B81D2ADA8D9),
    U128A(0xE6AD206577B5FA8693CC7314211A1477),
    U128A(0xEBCDAF0C7B3E89A0C75442F5FB9D35CF),
    U128A(0x00250E2D2071B35ED6411BD3AE1E7E49),
    U128A(0x2464369BF009B91E226800BB57B8E0AF),
    U128A(0x78C14389D95A537F5563911D59DFA6AA),
    U128A(0x832603766295CFA9207D5BA202E5B9C5),
    U128A(0xB3472DCA7B14A94A11C819684E734A41),
    U128A(0xD60F573FBC9BC6E41B5100529A532915),
    U128A(0x08BA6FB5571BE91F2B60A47681E67400),
    U128A(0xB6636521E7B9F9B6F296EC6B2A0DD915),
    U128A(0x53B02D5DA99F8FA1FF34052EC5855664),
    U128A(0x4B7A70E9B5B3294408BA47996E85076A),
    U128A(0xAD6EA6B049A7DF7DDB75092EC4192623),
    U128A(0xECAA8C71699A18FF9CEE60B88FEDB266),
    U128A(0x193602A575094C295664526CC2B19EE1),
    U128A(0x3F54989A5B429D65A0591340E4183A3E),
    U128A(0xA1D29C07EFE830F56B8FE4D699F73FD6),
    U128A(0x4CDD20868470EB264D2D38E6F0255DC1),
    U128A(0x09686B3F3EBAEFC96382E9C6021ECC5E),
    U128A(0x687F358452A0E2863C9718146B6A70A1),
    U128A(0x3E07841C7FDEAE5CB79C5305AA500737),
    U128A(0xB03ADA37F0500C0D8E7D44EC5716F2B8),
    U128A(0xAE0CF51A3CB574B2F01C1F040200B3FF),
    U128A(0xD19113F97CA92FF625837A58DC0921BD),
    U128A(0x3AE5E58137C2DADC9432477322F54701),
    U128A(0xA94461460FD0030EC8B576349AF3DDA7),
    U128A(0xE238CD993BEA0E2FECC8C73EA4751E41),
    U128A(0x4E548B384F6DB9083280BBA1183EB331),
    U128A(0x2CB8129024977C796F420D03F60A04BF),
    U128A(0xDE9A771FD99308105679B072BCAF89AF),
    U128A(0x5512721F2E6B7124B38BAE12DCCF3F2E),
    U128A(0x7A5847187408DA17501ADDE69F84CD87),
    U128A(0xEC7AEC3ADB851DFABC9F9ABCE94B7D8C),
    U128A(0xEF1C18473215D80863094366C464C3D2),
    U128A(0x12A14D432A65C451DD433B3724C2BA16),
    U128A(0x71DFF89E10314E5550940002133AE4DD),
    U128A(0x043556F1D7A3C76B81AC77D65F11199B),
    U128A(0xF28FE6ED97F1FBFA3C11183B5924A509),
    U128A(0x86E34570EAE96FB19EBABF2C1E153C6E),
    U128A(0x771FE71C4E3D06FA860E5E0A5A3E2AB3),
    U128A(0x803E89D65266C8252965DCB999E71D0F),
    U128A(0xC6150EBA94E2EA782E4CC9789C10B36A),
    U128A(0xF2F74EA7361D2B3DA6FC3C531E0A2DF4),
    U128A(0x5223A708F71312B61939260F19C27960),
    U128A(0xE3BC4595A67BC883EBADFE6EEAC31F66),
    U128A(0xC332DDEFBE6C5AA5B17F37D1018CFF28),
    U128A(0xEECEA50FDB2F953B6558218568AB9702),
    U128A(0x1521B628290761702AEF7DAD5B6E2F84),
    U128A(0x13CCA830EB61BD96ECDD4775619F1510),
    U128A(0xB5735C904C70A2390334FE1EAA0363CF),
    U128A(0xEECC86BC60622CA7D59E9E0BCBAADE14),
    U128A(0x648B1EAF19BDF0CA9CAB5CABB2F3846E),
    U128A(0x40685A323C2AB4B3A02369B9655ABB50),
    U128A(0x9B540B19875FA099319EE9D5C021B8F7),
    U128A(0xF837889A97E32D7795F7997E623D7DA8),
    U128A(0x0E358829C7E61FD611ED935F16681281),
    U128A(0x57F584A51B22726396DEDFA17858BA99),
    U128A(0xCDB30AEB532E30549B83C3FF1AC24696),
    U128A(0x58EBF2EF34C6FFEA8FD948E46DBC3128),
    U128A(0x5D4A14D9E864B7E3FE28ED61EE7C3C73),
    U128A(0x45EEE2B6A3AAABEA42105D14203E13E0),
    U128A(0xC742F442EF6ABBB5DB6C4F15FACB4FD0),
    U128A(0xD81E799E86854DC7654F3B1D41CD2105),
    U128A(0xCF62A1F25B8D2646E44B476A3D816250),
    U128A(0x7F1524C369CB7492FC8883A0C1C7B6A3),
    U128A(0x095BBF00AD19489D47848A0B5692B285),
    U128A(0x58428D2A0C55F5EA1462B17423820D00),
    U128A(0x3372F0928D937E411DADF43E233F7061),
    U128A(0x7CDE3759CBEE7460D65FECF16C223BDB),
    U128A(0xA607808419F8509E4085F2A7CE77326E),
    U128A(0xA969A7AAC50C06C2E8EFD85561D99735),
    U128A(0x9E447A2EC34534845A04ABFC800BCADC),
    U128A(0xDB73DBD3105588CDFDD567050E1E9EC9),
    U128A(0xC5C43465713E38D8675FDA79E3674340),
    U128A(0x153E21E78FB03D4A3D28F89EF16DFF20),
    U128A(0xE93D5A68948140F7E6E39F2BDB83ADF7),
    U128A(0x411520F77602D4F7F64C261C94692934),
    U128A(0xD40824713320F46ABCF46B2ED4A10068),
    U128A(0x1E39F62E9724454643B7D4B7500061AF),
];
210 |
/// The full 256-byte Randen sponge state: 16 aligned 128-bit lanes.
pub type State = [U128A; STATE_LEN];
212 |
/// One AES encryption round: `AESENC(state, round_key)`.
///
/// NOTE(review): the intrinsic requires AES-NI; no runtime CPU-feature check
/// is performed here, so this presumably relies on target-feature flags at
/// build time — confirm the crate's build configuration.
#[inline(always)]
fn aes_round(state: U128A, round_key: U128A) -> U128A {
    unsafe { U128A::from(_mm_aesenc_si128(state.m128i(), round_key.m128i())) }
}
217 |
218 | /// Improved odd-even shuffle from "New criterion for diffusion property".
219 | #[inline(always)]
220 | fn block_shuffle(source: State) -> State {
221 | let shuffle = [7, 2, 13, 4, 11, 8, 3, 6, 15, 0, 9, 10, 1, 14, 5, 12];
222 | // TODO: Check if the zeros get generated; if so, use mem::uninitialized.
223 | let mut new_state = [U128A(0); STATE_LEN];
224 | for (i, shuf) in shuffle.iter().enumerate() {
225 | new_state[i] = source[*shuf];
226 | }
227 | new_state
228 | }
229 |
/// Cryptographic permutation based on type-2 Generalized Feistel Network.
///
/// An adversary who can query a permutation for a chosen ciphertext cannot
/// distinguish the permutation from a truly random permutation in less than
/// 2^64 queries, if the round function is a pseudorandom function. This is
/// similar to the b=8 case of Simpira v2, but more efficient than Simpira's
/// generic construction from b=16.
#[inline(always)]
fn permute(state: &mut State) {
    // Keys are consumed in a fixed order: branch-major within a round,
    // round-major overall. Exactly FEISTEL_ROUNDS * FEISTEL_FUNCTIONS ==
    // ROUND_KEYS_LEN keys are used, so the `unwrap` below cannot panic.
    let mut keys = ROUND_KEYS.iter();
    for _ in 0..FEISTEL_ROUNDS {
        for branch in 0..FEISTEL_FUNCTIONS {
            let even = state[branch * 2];
            let odd = state[branch * 2 + 1];
            // Feistel round function using two AES subrounds. Very similar to
            // F() from Simpira v2, but with independent subround keys. Uses 17
            // AES rounds per 16 bytes (vs. 10 for AES-CTR). Computing eight
            // round functions in parallel hides the 7-cycle AESNI latency on
            // HSW. Note that the Feistel XORs are 'free' (included in the
            // second AES instruction).
            let f1 = aes_round(even, *keys.next().unwrap());
            let f2 = aes_round(f1, odd);
            // Only the odd lanes are replaced; even lanes pass through and
            // are mixed by the shuffle below.
            state[branch * 2 + 1] = f2;
        }
        *state = block_shuffle(*state);
    }
}
257 |
258 | /// Generate updates the Randen sponge.
259 | ///
260 | /// The outer portion of the sponge (`CAPACITY_BYTES..STATE_BYTES`) may be
261 | /// consumed as PRNG output after applying this function.
262 | #[cfg(target_endian = "little")]
263 | pub fn randen_generate(state: &mut State) {
264 | let prev_inner = state[0];
265 | // Note: for a big-endian architecture, the endianness of the state and
266 | // round keys needs to be converted first. But as this currently relies on
267 | // an x86-only instruction, we don't deal with this at the moment.
268 | permute(state);
269 |
270 | // Ensure backtracking resistance.
271 | state[0] ^= prev_inner;
272 | }
273 |
/// Big-endian stub: the state and round keys are laid out little-endian and
/// the implementation currently depends on an x86-only intrinsic, so this
/// always panics (see the note in the little-endian version).
#[cfg(target_endian = "big")]
pub fn randen_generate(state: &mut State) {
    unimplemented!("Big endian requires swapping the bytes in the state and round keys.");
}
278 |
279 | pub fn randen_absorb(state: &mut State, seed: &[U128A; SEED_LEN]) {
280 | for (seed_elem, state_elem) in seed.iter().zip(&mut state[1..]) {
281 | *state_elem ^= *seed_elem;
282 | }
283 | }
284 |
// Note: do not derive Copy, to avoid accidental reuse of the state.
#[derive(Clone, Debug)]
pub struct RandenRng {
    /// The current state.
    state: State,
    /// Index of the next unconsumed byte of the state.
    ///
    /// The value is at least `CAPACITY_BYTES` (the inner lanes are never
    /// handed out). The value may exceed `STATE_BYTES - 1`. In that case a
    /// generate is required before consuming bytes.
    cursor: usize,
}
296 |
impl RandenRng {
    /// Create a Randen random number generator using a fixed default seed.
    ///
    /// The state starts as all zeros; the cursor setting below forces a
    /// generate before the first output, so the zeros are never returned.
    pub fn new_unseeded() -> RandenRng {
        RandenRng {
            state: [U128A(0); STATE_LEN],
            // Set the cursor to indicate that the state is fully consumed, to
            // enforce a generate before returning any bytes. This way the
            // initial zeros are not exposed as random numbers.
            cursor: STATE_BYTES,
        }
    }
}
309 |
// The implementations of `next_u32` and `next_u64` are similar apart from the
// types and size constants, use a macro so we only have to write it once.
//
// `$func` is the method name, `$t` the integer type returned, and `$size`
// the size of `$t` in bytes.
macro_rules! impl_next {
    ($func: ident, $t: ty, $size: expr) => {
        fn $func(&mut self) -> $t {
            // If we don't have enough bytes left in the state, generate new
            // random bytes. The reset to `CAPACITY_BYTES` keeps the inner
            // state hidden from consumers.
            if self.cursor > STATE_BYTES - $size {
                randen_generate(&mut self.state);
                self.cursor = CAPACITY_BYTES;
            }

            // Round the cursor up to the next multiple of $size, so we can
            // pretend that the state is an array of $ts and load one from
            // there. It means we discard some bytes if the cursor was not at
            // a multiple of $size, but the advantage is that we don't need to
            // worry about carrying over bytes between generations, when there
            // are < $size bytes available.
            let index = (self.cursor + $size - 1) / $size;
            self.cursor = (index + 1) * $size;
            // Reinterpret the state as an array of $t. This copies the state;
            // assumes the state array has no padding (TODO confirm from the
            // U128A definition).
            let ts: [$t; STATE_BYTES / $size] =
                unsafe { mem::transmute(self.state) };
            ts[index]
        }
    }
}
336 |
impl RngCore for RandenRng {
    impl_next!(next_u32, u32, 4);
    impl_next!(next_u64, u64, 8);

    /// Fill `dest` with random bytes, regenerating the sponge as needed.
    fn fill_bytes(&mut self, dest: &mut [u8]) {
        let mut i = 0;
        let len = dest.len();
        while i < len {
            // Regenerate when the outer state is exhausted; resetting to
            // `CAPACITY_BYTES` keeps the inner state hidden.
            if self.cursor >= STATE_BYTES {
                randen_generate(&mut self.state);
                self.cursor = CAPACITY_BYTES;
            }

            // View the state as plain bytes (a copy); assumes the state has
            // no padding (TODO confirm from the U128A definition).
            let bytes: [u8; STATE_BYTES] = unsafe { mem::transmute(self.state) };

            // This iteration we will consume as many bytes as there are left
            // to fill, or as many bytes as are available for consumption,
            // whichever is less.
            let consume_bytes = (len - i).min(STATE_BYTES - self.cursor);
            let source = &bytes[self.cursor..self.cursor + consume_bytes];
            dest[i..i + consume_bytes].copy_from_slice(source);
            self.cursor += consume_bytes;
            i += consume_bytes;
        }
    }

    /// Infallible wrapper around `fill_bytes`; never returns an error.
    fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> {
        Ok(self.fill_bytes(dest))
    }
}
367 |
/// Seed type for `RandenRng`: a fixed-size byte array.
pub struct RandenSeed(pub [u8; SEED_BYTES]);

impl Default for RandenSeed {
    /// Returns an all-zero seed.
    fn default() -> RandenSeed {
        RandenSeed([0; SEED_BYTES])
    }
}

impl AsMut<[u8]> for RandenSeed {
    // Required by the `SeedableRng` seed contract so callers can fill the
    // seed bytes in place.
    fn as_mut(&mut self) -> &mut [u8] {
        &mut self.0
    }
}
381 |
impl SeedableRng for RandenRng {
    type Seed = RandenSeed;

    /// Construct a generator and absorb `seed` into its outer state.
    fn from_seed(seed: RandenRng::Seed) -> RandenRng {
        let mut rng = RandenRng::new_unseeded();
        unsafe {
            // [u8] isn't necessarily 16 byte aligned. Transmuting it to [U128]
            // won't fix the alignment, but a subsequent clone should work.
            // NOTE(review): the transmute is by value, so `unaligned_seed` is
            // already a fresh, properly aligned local; the extra clone looks
            // redundant — confirm before removing.
            let unaligned_seed = std::mem::transmute::<[u8; SEED_BYTES], [U128A; SEED_LEN]>(seed.0);
            let aligned_seed = unaligned_seed.clone();
            randen_absorb(&mut rng.state, &aligned_seed);
            rng
        }
    }
}
397 |
#[cfg(test)]
mod test {
    use super::{RandenRng, U128A};
    use rand::{RngCore, SeedableRng};

    /// Verifies `next_u64` against vectors from the reference C++
    /// implementation.
    #[test]
    fn randen_rng_next_u64_test_vectors() {
        // These test vectors were generated from the reference C++
        // implementation with the following program:
        //
        // int main(int, char**) {
        // randen::Randen rng;
        // for (int i = 0; i < 33; i++) {
        // std::cout << " assert_eq!(rng.next_u64(), 0x";
        // std::cout << std::setbase(16) << std::setw(16)
        // << std::setfill('0') << rng();
        // std::cout << ");\n";
        // }
        // std::cout << std::endl;
        // return 0;
        // }

        // Note that there are more bytes consumed than the size of the state,
        // forcing a `randen_generate()`.
        let mut rng = RandenRng::new_unseeded();
        assert_eq!(rng.next_u64(), 0xdda9f47cd90410ee);
        assert_eq!(rng.next_u64(), 0xc3c14f134e433977);
        assert_eq!(rng.next_u64(), 0xf0b780f545c72912);
        assert_eq!(rng.next_u64(), 0x887bf3087fd8ca10);
        assert_eq!(rng.next_u64(), 0x30ec63baff3c6d59);
        assert_eq!(rng.next_u64(), 0x15dbb1d37696599f);
        assert_eq!(rng.next_u64(), 0x2808a316f49a54c);
        assert_eq!(rng.next_u64(), 0xb29f73606f7f20a6);
        assert_eq!(rng.next_u64(), 0x9cbf605e3fd9de8a);
        assert_eq!(rng.next_u64(), 0x3b8feaf9d5c8e50e);
        assert_eq!(rng.next_u64(), 0xd8b2ffd356301ed5);
        assert_eq!(rng.next_u64(), 0xc970ae1a78183bbb);
        assert_eq!(rng.next_u64(), 0xcdfd8d76eb8f9a19);
        assert_eq!(rng.next_u64(), 0xf4b327fe0fc73c37);
        assert_eq!(rng.next_u64(), 0xd5af05dd3eff9556);
        assert_eq!(rng.next_u64(), 0xc3a506eb91420c9d);
        assert_eq!(rng.next_u64(), 0x7023920e0d6bfe8c);
        assert_eq!(rng.next_u64(), 0x48db1bb78f83c4a1);
        assert_eq!(rng.next_u64(), 0xed1ef4c26b87b840);
        assert_eq!(rng.next_u64(), 0x58d3575834956d42);
        assert_eq!(rng.next_u64(), 0x497cabf3431154fc);
        assert_eq!(rng.next_u64(), 0x8eef32a23e0b2df3);
        assert_eq!(rng.next_u64(), 0xd88b5749f090e5ea);
        assert_eq!(rng.next_u64(), 0x4e24370570029a8b);
        assert_eq!(rng.next_u64(), 0x78fcec2cbb6342f5);
        assert_eq!(rng.next_u64(), 0xc651a582a970692f);
        assert_eq!(rng.next_u64(), 0x352ee4ad1816afe3);
        assert_eq!(rng.next_u64(), 0x463cb745612f55db);
        assert_eq!(rng.next_u64(), 0x811ef0821c3de851);
        assert_eq!(rng.next_u64(), 0x26ff374c101da7e);
        assert_eq!(rng.next_u64(), 0xa0660379992d58fc);
        assert_eq!(rng.next_u64(), 0x6f7e616704c4fa59);
        assert_eq!(rng.next_u64(), 0x915f3445685da798);
    }

    /// Verifies `next_u32` against vectors from the reference C++
    /// implementation.
    #[test]
    fn randen_rng_next_u32_test_vectors() {
        // Same test as `randen_rng_next_u64_test_vectors()`, generated from the
        // same C++ program, but adapted to produce 32-bit integers.

        // Note that there are more bytes consumed than the size of the state,
        // forcing a `randen_generate()`.
        let mut rng = RandenRng::new_unseeded();
        assert_eq!(rng.next_u32(), 0xd90410ee);
        assert_eq!(rng.next_u32(), 0xdda9f47c);
        assert_eq!(rng.next_u32(), 0x4e433977);
        assert_eq!(rng.next_u32(), 0xc3c14f13);
        assert_eq!(rng.next_u32(), 0x45c72912);
        assert_eq!(rng.next_u32(), 0xf0b780f5);
        assert_eq!(rng.next_u32(), 0x7fd8ca10);
        assert_eq!(rng.next_u32(), 0x887bf308);
        assert_eq!(rng.next_u32(), 0xff3c6d59);
        assert_eq!(rng.next_u32(), 0x30ec63ba);
        assert_eq!(rng.next_u32(), 0x7696599f);
        assert_eq!(rng.next_u32(), 0x15dbb1d3);
        assert_eq!(rng.next_u32(), 0x6f49a54c);
        assert_eq!(rng.next_u32(), 0x02808a31);
        assert_eq!(rng.next_u32(), 0x6f7f20a6);
        assert_eq!(rng.next_u32(), 0xb29f7360);
        assert_eq!(rng.next_u32(), 0x3fd9de8a);
        assert_eq!(rng.next_u32(), 0x9cbf605e);
        assert_eq!(rng.next_u32(), 0xd5c8e50e);
        assert_eq!(rng.next_u32(), 0x3b8feaf9);
        assert_eq!(rng.next_u32(), 0x56301ed5);
        assert_eq!(rng.next_u32(), 0xd8b2ffd3);
        assert_eq!(rng.next_u32(), 0x78183bbb);
        assert_eq!(rng.next_u32(), 0xc970ae1a);
        assert_eq!(rng.next_u32(), 0xeb8f9a19);
        assert_eq!(rng.next_u32(), 0xcdfd8d76);
        assert_eq!(rng.next_u32(), 0x0fc73c37);
        assert_eq!(rng.next_u32(), 0xf4b327fe);
        assert_eq!(rng.next_u32(), 0x3eff9556);
        assert_eq!(rng.next_u32(), 0xd5af05dd);
        assert_eq!(rng.next_u32(), 0x91420c9d);
        assert_eq!(rng.next_u32(), 0xc3a506eb);
        assert_eq!(rng.next_u32(), 0x0d6bfe8c);
        assert_eq!(rng.next_u32(), 0x7023920e);
        assert_eq!(rng.next_u32(), 0x8f83c4a1);
        assert_eq!(rng.next_u32(), 0x48db1bb7);
        assert_eq!(rng.next_u32(), 0x6b87b840);
        assert_eq!(rng.next_u32(), 0xed1ef4c2);
        assert_eq!(rng.next_u32(), 0x34956d42);
        assert_eq!(rng.next_u32(), 0x58d35758);
        assert_eq!(rng.next_u32(), 0x431154fc);
        assert_eq!(rng.next_u32(), 0x497cabf3);
        assert_eq!(rng.next_u32(), 0x3e0b2df3);
        assert_eq!(rng.next_u32(), 0x8eef32a2);
        assert_eq!(rng.next_u32(), 0xf090e5ea);
        assert_eq!(rng.next_u32(), 0xd88b5749);
        assert_eq!(rng.next_u32(), 0x70029a8b);
        assert_eq!(rng.next_u32(), 0x4e243705);
        assert_eq!(rng.next_u32(), 0xbb6342f5);
        assert_eq!(rng.next_u32(), 0x78fcec2c);
        assert_eq!(rng.next_u32(), 0xa970692f);
        assert_eq!(rng.next_u32(), 0xc651a582);
        assert_eq!(rng.next_u32(), 0x1816afe3);
        assert_eq!(rng.next_u32(), 0x352ee4ad);
        assert_eq!(rng.next_u32(), 0x612f55db);
        assert_eq!(rng.next_u32(), 0x463cb745);
        assert_eq!(rng.next_u32(), 0x1c3de851);
        assert_eq!(rng.next_u32(), 0x811ef082);
        assert_eq!(rng.next_u32(), 0xc101da7e);
        assert_eq!(rng.next_u32(), 0x026ff374);
        assert_eq!(rng.next_u32(), 0x992d58fc);
        assert_eq!(rng.next_u32(), 0xa0660379);
        assert_eq!(rng.next_u32(), 0x04c4fa59);
        assert_eq!(rng.next_u32(), 0x6f7e6167);
        assert_eq!(rng.next_u32(), 0x685da798);
    }

    /// Verifies `fill_bytes` (spot checks on the last byte of each buffer)
    /// against the reference C++ implementation, across multiple refills.
    #[test]
    fn randen_rng_fill_bytes_test_vectors() {
        // The expected values were generated from the reference C++
        // implementation using the following program:
        //
        // int main(int, char**) {
        // randen::Randen rng;
        // std::uint8_t seq_1[37] = {0};
        // std::uint8_t seq_2[151] = {0};
        // std::uint8_t seq_3[233] = {0};
        // for (std::uint8_t& x : seq_1) x = rng();
        // for (std::uint8_t& x : seq_2) x = rng();
        // for (std::uint8_t& x : seq_3) x = rng();
        // std::cout << " assert_eq!(seq_1[36], "
        // << static_cast(seq_1[36]) << ");\n";
        // std::cout << " assert_eq!(seq_2[150], "
        // << static_cast(seq_2[150]) << ");\n";
        // std::cout << " assert_eq!(seq_3[232], "
        // << static_cast(seq_3[232]) << ");\n";
        // std::cout << std::endl;
        // return 0;
        // }

        let mut seq_1 = [0_u8; 37];
        let mut seq_2 = [0_u8; 151];
        let mut seq_3 = [0_u8; 233];
        let mut rng = RandenRng::new_unseeded();
        rng.fill_bytes(&mut seq_1);
        rng.fill_bytes(&mut seq_2);
        rng.fill_bytes(&mut seq_3);
        assert_eq!(seq_1[36], 186);
        assert_eq!(seq_2[150], 112);
        assert_eq!(seq_3[232], 24);
    }
}
568 |
--------------------------------------------------------------------------------
/engine_chacha.h:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef ENGINE_CHACHA_H_
16 | #define ENGINE_CHACHA_H_
17 | #if defined(__SSE2__) && defined(__AES__)
18 |
19 | #include
20 | #include
21 | #include "tmmintrin.h"
22 |
23 | namespace randen {
24 |
25 | // Modified from https://gist.github.com/orlp/32f5d1b631ab092608b1:
26 | /*
27 | Copyright (c) 2015 Orson Peters
28 |
29 | This software is provided 'as-is', without any express or implied warranty.
30 | In no event will the authors be held liable for any damages arising from the
31 | use of this software.
32 |
33 | Permission is granted to anyone to use this software for any purpose,
34 | including commercial applications, and to alter it and redistribute it
35 | freely, subject to the following restrictions:
36 |
37 | 1. The origin of this software must not be misrepresented; you must not
38 | claim that you wrote the original software. If you use this software in a
39 | product, an acknowledgment in the product documentation would be appreciated
40 | but is not required.
41 |
42 | 2. Altered source versions must be plainly marked as such, and must not be
43 | misrepresented as being the original software.
44 |
45 | 3. This notice may not be removed or altered from any source distribution.
46 | */
47 |
// ChaCha stream cipher used as a URBG. `R` is the number of rounds (8 here,
// i.e. ChaCha8); `result_type` is the integer type produced per call.
template
class ChaCha {
 public:
  static constexpr size_t R = 8;
  typedef T result_type;

  static constexpr result_type min() {
    return std::numeric_limits::min();
  }
  static constexpr result_type max() {
    return std::numeric_limits::max();
  }

  // Seeds from a 64-bit seed plus optional 64-bit stream selector.
  explicit ChaCha(uint64_t seedval, uint64_t stream = 0) {
    seed(seedval, stream);
  }
  // Seeds from a C++11 SeedSequence.
  template
  explicit ChaCha(Sseq& seq) {
    seed(seq);
  }

  void seed(uint64_t seedval, uint64_t stream = 0) {
    ctr = 0;
    keysetup[0] = seedval & 0xffffffffu;
    keysetup[1] = seedval >> 32;
    keysetup[2] = keysetup[3] = 0xdeadbeef;  // Could use 128-bit seed.
    keysetup[4] = stream & 0xffffffffu;
    keysetup[5] = stream >> 32;
    keysetup[6] = keysetup[7] = 0xdeadbeef;  // Could use 128-bit stream.
  }

  template
  void seed(Sseq& seq) {
    ctr = 0;
    seq.generate(keysetup, keysetup + 8);
  }

  // Returns the next result_type's worth of bytes from the 16-word block,
  // regenerating when the block is exhausted. `ctr` counts 32-bit words.
  // NOTE(review): `ctr += sizeof(ret) / sizeof(uint32_t)` is 0 for a
  // result_type smaller than uint32_t, which would stall the counter —
  // presumably only 32/64-bit result types are intended; confirm.
  result_type operator()() {
    int idx = ctr % 16;
    if (idx == 0) generate_block();

    result_type ret;
    memcpy(&ret, block + idx, sizeof(ret));
    ctr += sizeof(ret) / sizeof(uint32_t);

    return ret;
  }

 private:
  // Builds the 16-word input (constants, key, 64-bit block counter), runs the
  // core permutation and applies the final feed-forward addition.
  void generate_block() {
    uint32_t constants[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};

    uint32_t input[16];
    for (int i = 0; i < 4; ++i) input[i] = constants[i];
    for (int i = 0; i < 8; ++i) input[4 + i] = keysetup[i];
    input[12] = (ctr / 16) & 0xffffffffu;
    input[13] = (ctr / 16) >> 32;
    input[14] = input[15] = 0xdeadbeef;  // Could use 128-bit counter.

    for (int i = 0; i < 16; ++i) block[i] = input[i];
    chacha_core();
    for (int i = 0; i < 16; ++i) block[i] += input[i];
  }

// Get an efficient _mm_roti_epi32 based on enabled features.
// Byte-aligned rotations (8/16/24) use a single PSHUFB; other amounts fall
// back to shift+shift+xor.
#define _mm_roti_epi32(r, c)                                                 \
  (((c) == 8)                                                                \
       ? _mm_shuffle_epi8((r), _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, \
                                            5, 4, 7, 2, 1, 0, 3))            \
       : ((c) == 16)                                                         \
             ? _mm_shuffle_epi8((r), _mm_set_epi8(13, 12, 15, 14, 9, 8, 11,  \
                                                  10, 5, 4, 7, 6, 1, 0, 3, 2)) \
             : ((c) == 24) ? _mm_shuffle_epi8(                               \
                                 (r), _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, \
                                                   9, 4, 7, 6, 5, 0, 3, 2, 1)) \
                           : _mm_xor_si128(_mm_slli_epi32((r), (c)),         \
                                           _mm_srli_epi32((r), 32 - (c))))

  // SIMD implementation of the ChaCha quarter-round network: each loop
  // iteration performs a column round and a diagonal round (hence i += 2).
  void chacha_core() {
// ROTVn rotates the elements in the given vector n places to the left.
#define CHACHA_ROTV1(x) _mm_shuffle_epi32((__m128i)x, 0x39)
#define CHACHA_ROTV2(x) _mm_shuffle_epi32((__m128i)x, 0x4e)
#define CHACHA_ROTV3(x) _mm_shuffle_epi32((__m128i)x, 0x93)

    __m128i a = _mm_load_si128((__m128i*)(block));
    __m128i b = _mm_load_si128((__m128i*)(block + 4));
    __m128i c = _mm_load_si128((__m128i*)(block + 8));
    __m128i d = _mm_load_si128((__m128i*)(block + 12));

    for (int i = 0; i < R; i += 2) {
      a = _mm_add_epi32(a, b);
      d = _mm_xor_si128(d, a);
      d = _mm_roti_epi32(d, 16);
      c = _mm_add_epi32(c, d);
      b = _mm_xor_si128(b, c);
      b = _mm_roti_epi32(b, 12);
      a = _mm_add_epi32(a, b);
      d = _mm_xor_si128(d, a);
      d = _mm_roti_epi32(d, 8);
      c = _mm_add_epi32(c, d);
      b = _mm_xor_si128(b, c);
      b = _mm_roti_epi32(b, 7);

      b = CHACHA_ROTV1(b);
      c = CHACHA_ROTV2(c);
      d = CHACHA_ROTV3(d);

      a = _mm_add_epi32(a, b);
      d = _mm_xor_si128(d, a);
      d = _mm_roti_epi32(d, 16);
      c = _mm_add_epi32(c, d);
      b = _mm_xor_si128(b, c);
      b = _mm_roti_epi32(b, 12);
      a = _mm_add_epi32(a, b);
      d = _mm_xor_si128(d, a);
      d = _mm_roti_epi32(d, 8);
      c = _mm_add_epi32(c, d);
      b = _mm_xor_si128(b, c);
      b = _mm_roti_epi32(b, 7);

      b = CHACHA_ROTV3(b);
      c = CHACHA_ROTV2(c);
      d = CHACHA_ROTV1(d);
    }

    _mm_store_si128((__m128i*)(block), a);
    _mm_store_si128((__m128i*)(block + 4), b);
    _mm_store_si128((__m128i*)(block + 8), c);
    _mm_store_si128((__m128i*)(block + 12), d);

#undef CHACHA_ROTV3
#undef CHACHA_ROTV2
#undef CHACHA_ROTV1
  }

  // 16-byte alignment required by the aligned _mm_load/_mm_store above.
  alignas(16) uint32_t block[16];
  uint32_t keysetup[8];
  uint64_t ctr;  // in units of 32-bit words
};
187 |
188 | } // namespace randen
189 |
190 | #endif // defined(__SSE2__) && defined(__AES__)
191 | #endif // ENGINE_CHACHA_H_
192 |
--------------------------------------------------------------------------------
/engine_os.h:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef ENGINE_OS_H_
16 | #define ENGINE_OS_H_
17 |
18 | #ifdef _WIN64
19 | #define NOMINMAX
20 | #include
21 | // Must come after windows.h; this comment ensures that.
22 | #include
23 | #pragma comment(lib, "bcrypt")
24 | #endif
25 |
26 | #include "util.h"
27 |
28 | namespace randen {
29 |
// Buffered, uses OS CSPRNG.
// On Windows this draws from BCryptGenRandom; elsewhere from /dev/urandom.
// NOTE(review): the bcrypt.h include above is guarded by _WIN64 while this
// class tests _WIN32 — a 32-bit Windows build would appear to miss the
// declarations; confirm intended platforms.
template
class alignas(32) EngineOS {
 public:
  // C++11 URBG interface:
  using result_type = T;
  static constexpr T min() { return T(0); }
  static constexpr T max() { return ~T(0); }

  EngineOS() {
    // The first call to operator() will trigger a refill.
    next_ = kStateT;

#ifdef _WIN32
    RANDEN_CHECK(0 == BCryptOpenAlgorithmProvider(
                          &provider_, BCRYPT_RNG_ALGORITHM, nullptr, 0));
#else
    dev_ = fopen("/dev/urandom", "r");
    RANDEN_CHECK(dev_ != nullptr);
#endif
  }

  // Releases the OS handle/stream acquired by the constructor (RAII).
  ~EngineOS() {
#ifdef _WIN32
    RANDEN_CHECK(0 == BCryptCloseAlgorithmProvider(provider_, 0));
#else
    RANDEN_CHECK(fclose(dev_) == 0);
#endif
  }

  // Returns random bits from the buffer in units of T.
  T operator()() {
    // (Local copy ensures compiler knows this is not aliased.)
    size_t next = next_;

    // Refill the buffer if needed (unlikely).
    if (next >= kStateT) {
#ifdef _WIN32
      RANDEN_CHECK(0 == BCryptGenRandom(provider_,
                                        reinterpret_cast(&state_[0]),
                                        sizeof(state_), 0));
#else
      const size_t bytes_read = fread(&state_[0], 1, sizeof(state_), dev_);
      RANDEN_CHECK(bytes_read == sizeof(state_));
#endif
      next = 0;
    }

    const T ret = state_[next];
    next_ = next + 1;
    return ret;
  }

 private:
  // 256-byte buffer, expressed in units of T.
  static constexpr size_t kStateT = 256 / sizeof(T);  // same as Randen

  alignas(32) T state_[kStateT];
  size_t next_;  // index within state_
#ifdef _WIN32
  BCRYPT_ALG_HANDLE provider_;
#else
  FILE* dev_;
#endif
};
94 |
95 | } // namespace randen
96 |
97 | #endif // ENGINE_OS_H_
98 |
--------------------------------------------------------------------------------
/nanobenchmark.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2017 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "nanobenchmark.h"
16 | #include "randen.h"
17 |
18 | #include
19 | #include
20 | #include // abort
21 | #include // memcpy
22 | #include // clock_gettime
23 | #include // sort
24 | #include
25 | #include
26 | #include // iota
27 | #include
28 | #include
29 |
30 | // Architecture
31 | #if defined(__x86_64__) || defined(_M_X64)
32 | #define NB_ARCH_X86
33 | #if defined(_MSC_VER)
34 | #include
35 | #else
36 | #include // NOLINT
37 | #endif
38 | #elif defined(__powerpc64__) || defined(_M_PPC)
39 | #define NB_ARCH_PPC
40 | #include // NOLINT __ppc_get_timebase_freq
41 | #elif defined(__aarch64__) || defined(__arm__)
42 | #define NB_ARCH_ARM
43 | #else
44 | #error "Please add support for this architecture"
45 | #endif
46 |
47 | // OS
48 | #if defined(_WIN32) || defined(_WIN64)
49 | #define NB_OS_WIN
50 | #define NOMINMAX
51 | #include // NOLINT
52 | #elif defined(__linux__)
53 | #define NB_OS_LINUX
54 | #include // NOLINT
55 | #else
56 | #error "Please add support for this OS"
57 | #endif
58 |
59 | namespace randen {
60 | namespace platform {
61 | namespace {
62 |
63 | // Enables sanity checks that verify correct operation at the cost of
64 | // longer benchmark runs.
65 | #ifndef NANOBENCHMARK_ENABLE_CHECKS
66 | #define NANOBENCHMARK_ENABLE_CHECKS 0
67 | #endif
68 |
69 | #define NANOBENCHMARK_CHECK_ALWAYS(condition) \
70 | while (!(condition)) { \
71 | fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
72 | abort(); \
73 | }
74 |
75 | #if NANOBENCHMARK_ENABLE_CHECKS
76 | #define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
77 | #else
78 | #define NANOBENCHMARK_CHECK(condition)
79 | #endif
80 |
81 | // Compiler-specific
82 | #ifdef _MSC_VER
83 | #define NB_RESTRICT __restrict
84 | #define NB_INLINE __forceinline
85 | #define NB_NOINLINE __declspec(noinline)
86 |
87 | #elif defined(__GNUC__) || defined(__clang__)
88 | #define NB_RESTRICT __restrict__
89 | #define NB_INLINE inline __attribute__((always_inline))
90 | #define NB_NOINLINE inline __attribute__((noinline))
91 |
92 | #else
93 | #error "Unsupported compiler"
94 | #endif
95 |
96 | #ifdef NB_ARCH_X86
97 |
// Executes CPUID with the given leaf (`level`) and subleaf (`count`), storing
// EAX/EBX/ECX/EDX into abcd[0..3]. Portable wrapper over the MSVC and
// GCC/Clang intrinsics.
void Cpuid(const uint32_t level, const uint32_t count,
           uint32_t* NB_RESTRICT abcd) {
#ifdef _MSC_VER
  int regs[4];
  __cpuidex(regs, level, count);
  for (int i = 0; i < 4; ++i) {
    abcd[i] = regs[i];
  }
#else
  uint32_t a, b, c, d;
  __cpuid_count(level, count, a, b, c, d);
  abcd[0] = a;
  abcd[1] = b;
  abcd[2] = c;
  abcd[3] = d;
#endif
}
115 |
// Returns the CPU brand string (e.g. including a "@ x.yzGHz" suffix) from
// CPUID leaves 0x80000002..4, or an empty string if unsupported.
std::string BrandString() {
  // 3 leaves x 16 bytes = 48 characters plus terminating NUL.
  char brand_string[49];
  uint32_t abcd[4];

  // Check if brand string is supported (it is on all reasonable Intel/AMD)
  Cpuid(0x80000000U, 0, abcd);
  if (abcd[0] < 0x80000004U) {
    return std::string();
  }

  for (int i = 0; i < 3; ++i) {
    Cpuid(0x80000002U + i, 0, abcd);
    memcpy(brand_string + i * 16, &abcd, sizeof(abcd));
  }
  brand_string[48] = 0;
  return brand_string;
}
133 |
134 | // Returns the frequency quoted inside the brand string. This does not
135 | // account for throttling nor Turbo Boost.
136 | double NominalClockRate() {
137 | const std::string& brand_string = BrandString();
138 | // Brand strings include the maximum configured frequency. These prefixes are
139 | // defined by Intel CPUID documentation.
140 | const char* prefixes[3] = {"MHz", "GHz", "THz"};
141 | const double multipliers[3] = {1E6, 1E9, 1E12};
142 | for (size_t i = 0; i < 3; ++i) {
143 | const size_t pos_prefix = brand_string.find(prefixes[i]);
144 | if (pos_prefix != std::string::npos) {
145 | const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
146 | if (pos_space != std::string::npos) {
147 | const std::string digits =
148 | brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
149 | return std::stod(digits) * multipliers[i];
150 | }
151 | }
152 | }
153 |
154 | return 0.0;
155 | }
156 |
157 | #endif // NB_ARCH_X86
158 |
159 | } // namespace
160 |
// Pins the calling thread to the given logical CPU; a negative `cpu` means
// "pin to whichever CPU the thread is currently running on". Aborts (via
// NANOBENCHMARK_CHECK_ALWAYS) if the OS call fails.
void PinThreadToCPU(int cpu) {
  if (cpu < 0) {
    // We might migrate to another CPU before pinning below, but at least cpu
    // will be one of the CPUs on which this thread ran.
#if defined(NB_OS_WIN)
    cpu = static_cast(GetCurrentProcessorNumber());
#elif defined(NB_OS_LINUX)
    cpu = sched_getcpu();
#else
#error "Please add support for this OS"
#endif
    NANOBENCHMARK_CHECK_ALWAYS(cpu >= 0);
  }

#if defined(NB_OS_WIN)
  const HANDLE hThread = GetCurrentThread();
  const DWORD_PTR prev = SetThreadAffinityMask(hThread, 1ULL << cpu);
  NANOBENCHMARK_CHECK_ALWAYS(prev != 0);
#elif defined(NB_OS_LINUX)
  const pid_t pid = 0;  // current thread
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(cpu, &set);
  const int err = sched_setaffinity(pid, sizeof(set), &set);
  NANOBENCHMARK_CHECK_ALWAYS(err == 0);
#else
#error "Please add support for this OS"
#endif
}
190 |
// Returns tick rate. Invariant means the tick counter frequency is independent
// of CPU throttling or sleep. May be expensive, caller should cache the result.
// (On x86 this may return 0.0 if the brand string holds no frequency.)
double InvariantTicksPerSecond() {
#if defined(NB_ARCH_PPC)
  return __ppc_get_timebase_freq();
#elif defined(NB_ARCH_X86)
  // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
  return NominalClockRate();
#else
  // Fall back to clock_gettime nanoseconds.
  return 1E9;
#endif
}
204 |
205 | } // namespace platform
206 | namespace {
207 |
// Prevents the compiler from eliding the computations that led to "output".
// Call with each benchmark result so dead-code elimination cannot remove the
// measured work.
template
inline void PreventElision(T&& output) {
#ifndef _MSC_VER
  // Works by indicating to the compiler that "output" is being read and
  // modified. The +r constraint avoids unnecessary writes to memory, but only
  // works for built-in types (typically FuncOutput).
  asm volatile("" : "+r"(output) : : "memory");
#else
  // MSVC does not support inline assembly anymore (and never supported GCC's
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
  // with volatile pointers generates inefficient code on MSVC 2017.
  static std::atomic dummy(T{});
  dummy.store(output, std::memory_order_relaxed);
#endif
}
225 |
226 | namespace timer {
227 |
228 | // Start/Stop return absolute timestamps and must be placed immediately before
229 | // and after the region to measure. We provide separate Start/Stop functions
230 | // because they use different fences.
231 | //
232 | // Background: RDTSC is not 'serializing'; earlier instructions may complete
233 | // after it, and/or later instructions may complete before it. 'Fences' ensure
234 | // regions' elapsed times are independent of such reordering. The only
235 | // documented unprivileged serializing instruction is CPUID, which acts as a
236 | // full fence (no reordering across it in either direction). Unfortunately
237 | // the latency of CPUID varies wildly (perhaps made worse by not initializing
238 | // its EAX input). Because it cannot reliably be deducted from the region's
239 | // elapsed time, it must not be included in the region to measure (i.e.
240 | // between the two RDTSC).
241 | //
242 | // The newer RDTSCP is sometimes described as serializing, but it actually
243 | // only serves as a half-fence with release semantics. Although all
244 | // instructions in the region will complete before the final timestamp is
245 | // captured, subsequent instructions may leak into the region and increase the
246 | // elapsed time. Inserting another fence after the final RDTSCP would prevent
247 | // such reordering without affecting the measured region.
248 | //
249 | // Fortunately, such a fence exists. The LFENCE instruction is only documented
250 | // to delay later loads until earlier loads are visible. However, Intel's
251 | // reference manual says it acts as a full fence (waiting until all earlier
252 | // instructions have completed, and delaying later instructions until it
253 | // completes). AMD assigns the same behavior to MFENCE.
254 | //
255 | // We need a fence before the initial RDTSC to prevent earlier instructions
256 | // from leaking into the region, and arguably another after RDTSC to avoid
257 | // region instructions from completing before the timestamp is recorded.
258 | // When surrounded by fences, the additional RDTSCP half-fence provides no
259 | // benefit, so the initial timestamp can be recorded via RDTSC, which has
260 | // lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
261 | // we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
262 | //
263 | // Using Start+Start leads to higher variance and overhead than Stop+Stop.
264 | // However, Stop+Stop includes an LFENCE in the region measurements, which
265 | // adds a delay dependent on earlier loads. The combination of Start+Stop
266 | // is faster than Start+Start and more consistent than Stop+Stop because
267 | // the first LFENCE already delayed subsequent loads before the measured
268 | // region. This combination seems not to have been considered in prior work:
269 | // http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
270 | //
271 | // Note: performance counters can measure 'exact' instructions-retired or
272 | // (unhalted) cycle counts. The RDPMC instruction is not serializing and also
273 | // requires fences. Unfortunately, it is not accessible on all OSes and we
274 | // prefer to avoid kernel-mode drivers. Performance counters are also affected
275 | // by several under/over-count errata, so we use the TSC instead.
276 |
// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
// divide by InvariantTicksPerSecond. Implements the "Start" fencing scheme
// documented above: LFENCE/RDTSC/LFENCE on x86.
inline uint64_t Start64() {
  uint64_t t;
#if defined(NB_ARCH_PPC)
  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif defined(NB_ARCH_X86)
#if defined(_MSC_VER)
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
  t = __rdtsc();
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#else
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rdx", "memory", "cc");
#endif
#else
  // Fall back to OS - unsure how to reliably query cntvct_el0 frequency.
  timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
#endif
  return t;
}
313 |
// Returns the final 64-bit timestamp of a measured region ("Stop" scheme
// documented above: RDTSCP/LFENCE on x86). Pair with Start64.
inline uint64_t Stop64() {
  uint64_t t;
#if defined(NB_ARCH_PPC)
  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif defined(NB_ARCH_X86)
#if defined(_MSC_VER)
  _ReadWriteBarrier();
  unsigned aux;
  t = __rdtscp(&aux);
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#else
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rcx", "rdx", "memory", "cc");
#endif
#else
  t = Start64();
#endif
  return t;
}
344 |
345 | // Returns a 32-bit timestamp with about 4 cycles less overhead than
346 | // Start64. Only suitable for measuring very short regions because the
347 | // timestamp overflows about once a second.
348 | inline uint32_t Start32() {
349 | uint32_t t;
350 | #if defined(NB_ARCH_X86)
351 | #if defined(_MSC_VER)
352 | _ReadWriteBarrier();
353 | _mm_lfence();
354 | _ReadWriteBarrier();
355 | t = static_cast(__rdtsc());
356 | _ReadWriteBarrier();
357 | _mm_lfence();
358 | _ReadWriteBarrier();
359 | #else
360 | asm volatile(
361 | "lfence\n\t"
362 | "rdtsc\n\t"
363 | "lfence"
364 | : "=a"(t)
365 | :
366 | // "memory" avoids reordering. rdx = TSC >> 32.
367 | : "rdx", "memory");
368 | #endif
369 | #else
370 | t = static_cast(Start64());
371 | #endif
372 | return t;
373 | }
374 |
375 | inline uint32_t Stop32() {
376 | uint32_t t;
377 | #if defined(NB_ARCH_X86)
378 | #if defined(_MSC_VER)
379 | _ReadWriteBarrier();
380 | unsigned aux;
381 | t = static_cast(__rdtscp(&aux));
382 | _ReadWriteBarrier();
383 | _mm_lfence();
384 | _ReadWriteBarrier();
385 | #else
386 | // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
387 | asm volatile(
388 | "rdtscp\n\t"
389 | "lfence"
390 | : "=a"(t)
391 | :
392 | // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
393 | : "rcx", "rdx", "memory");
394 | #endif
395 | #else
396 | t = static_cast(Stop64());
397 | #endif
398 | return t;
399 | }
400 |
401 | } // namespace timer
402 |
403 | namespace robust_statistics {
404 |
405 | // Sorts integral values in ascending order (e.g. for Mode). About 3x faster
406 | // than std::sort for input distributions with very few unique values.
407 | template
408 | void CountingSort(T* values, size_t num_values) {
409 | // Unique values and their frequency (similar to flat_map).
410 | using Unique = std::pair;
411 | std::vector unique;
412 | for (size_t i = 0; i < num_values; ++i) {
413 | const T value = values[i];
414 | const auto pos =
415 | std::find_if(unique.begin(), unique.end(),
416 | [value](const Unique u) { return u.first == value; });
417 | if (pos == unique.end()) {
418 | unique.push_back(std::make_pair(value, 1));
419 | } else {
420 | ++pos->second;
421 | }
422 | }
423 |
424 | // Sort in ascending order of value (pair.first).
425 | std::sort(unique.begin(), unique.end());
426 |
427 | // Write that many copies of each unique value to the array.
428 | T* NB_RESTRICT p = values;
429 | for (const auto& value_count : unique) {
430 | std::fill(p, p + value_count.second, value_count.first);
431 | p += value_count.second;
432 | }
433 | NANOBENCHMARK_CHECK(p == values + num_values);
434 | }
435 |
436 | // @return i in [idx_begin, idx_begin + half_count) that minimizes
437 | // sorted[i + half_count] - sorted[i].
438 | template
439 | size_t MinRange(const T* const NB_RESTRICT sorted, const size_t idx_begin,
440 | const size_t half_count) {
441 | T min_range = std::numeric_limits::max();
442 | size_t min_idx = 0;
443 |
444 | for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
445 | NANOBENCHMARK_CHECK(sorted[idx] <= sorted[idx + half_count]);
446 | const T range = sorted[idx + half_count] - sorted[idx];
447 | if (range < min_range) {
448 | min_range = range;
449 | min_idx = idx;
450 | }
451 | }
452 |
453 | return min_idx;
454 | }
455 |
456 | // Returns an estimate of the mode by calling MinRange on successively
457 | // halved intervals. "sorted" must be in ascending order. This is the
458 | // Half Sample Mode estimator proposed by Bickel in "On a fast, robust
459 | // estimator of the mode", with complexity O(N log N). The mode is less
460 | // affected by outliers in highly-skewed distributions than the median.
461 | // The averaging operation below assumes "T" is an unsigned integer type.
462 | template
463 | T ModeOfSorted(const T* const NB_RESTRICT sorted, const size_t num_values) {
464 | size_t idx_begin = 0;
465 | size_t half_count = num_values / 2;
466 | while (half_count > 1) {
467 | idx_begin = MinRange(sorted, idx_begin, half_count);
468 | half_count >>= 1;
469 | }
470 |
471 | const T x = sorted[idx_begin + 0];
472 | if (half_count == 0) {
473 | return x;
474 | }
475 | NANOBENCHMARK_CHECK(half_count == 1);
476 | const T average = (x + sorted[idx_begin + 1] + 1) / 2;
477 | return average;
478 | }
479 |
// Returns the mode. Side effect: sorts "values".
template <typename T>
T Mode(T* values, const size_t num_values) {
  CountingSort(values, num_values);
  return ModeOfSorted(values, num_values);
}
486 |
// Convenience overload for fixed-size arrays; forwards to Mode(T*, size_t).
template <typename T, size_t N>
T Mode(T (&values)[N]) {
  return Mode(&values[0], N);
}
491 |
492 | // Returns the median value. Side effect: sorts "values".
493 | template
494 | T Median(T* values, const size_t num_values) {
495 | NANOBENCHMARK_CHECK(!values->empty());
496 | std::sort(values, values + num_values);
497 | const size_t half = num_values / 2;
498 | // Odd count: return middle
499 | if (num_values % 2) {
500 | return values[half];
501 | }
502 | // Even count: return average of middle two.
503 | return (values[half] + values[half - 1] + 1) / 2;
504 | }
505 |
506 | // Returns a robust measure of variability.
507 | template
508 | T MedianAbsoluteDeviation(const T* values, const size_t num_values,
509 | const T median) {
510 | NANOBENCHMARK_CHECK(num_values != 0);
511 | std::vector abs_deviations;
512 | abs_deviations.reserve(num_values);
513 | for (size_t i = 0; i < num_values; ++i) {
514 | const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median));
515 | abs_deviations.push_back(static_cast(abs));
516 | }
517 | return Median(abs_deviations.data(), num_values);
518 | }
519 |
520 | } // namespace robust_statistics
521 |
522 | // Ticks := platform-specific timer values (CPU cycles on x86). Must be
523 | // unsigned to guarantee wraparound on overflow. 32 bit timers are faster to
524 | // read than 64 bit.
525 | using Ticks = uint32_t;
526 |
527 | // Returns timer overhead / minimum measurable difference.
528 | Ticks TimerResolution() {
529 | // Nested loop avoids exceeding stack/L1 capacity.
530 | Ticks repetitions[Params::kTimerSamples];
531 | for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
532 | Ticks samples[Params::kTimerSamples];
533 | for (size_t i = 0; i < Params::kTimerSamples; ++i) {
534 | const Ticks t0 = timer::Start32();
535 | const Ticks t1 = timer::Stop32();
536 | samples[i] = t1 - t0;
537 | }
538 | repetitions[rep] = robust_statistics::Mode(samples);
539 | }
540 | return robust_statistics::Mode(repetitions);
541 | }
542 |
// Computed once during static initialization; used by SampleUntilStable and
// NumSkip as the minimum measurable tick difference.
static const Ticks timer_resolution = TimerResolution();
544 |
545 | // Estimates the expected value of "lambda" values with a variable number of
546 | // samples until the variability "rel_mad" is less than "max_rel_mad".
547 | template
548 | Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
549 | const Params& p, const Lambda& lambda) {
550 | // Choose initial samples_per_eval based on a single estimated duration.
551 | Ticks t0 = timer::Start32();
552 | lambda();
553 | Ticks t1 = timer::Stop32();
554 | Ticks est = t1 - t0;
555 | static const double ticks_per_second = platform::InvariantTicksPerSecond();
556 | const size_t ticks_per_eval =
557 | static_cast(ticks_per_second * p.seconds_per_eval);
558 | size_t samples_per_eval = ticks_per_eval / est;
559 | samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval);
560 |
561 | std::vector samples;
562 | samples.reserve(1 + samples_per_eval);
563 | samples.push_back(est);
564 |
565 | // Percentage is too strict for tiny differences, so also allow a small
566 | // absolute "median absolute deviation".
567 | const Ticks max_abs_mad = (timer_resolution + 99) / 100;
568 | *rel_mad = 0.0; // ensure initialized
569 |
570 | for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
571 | samples.reserve(samples.size() + samples_per_eval);
572 | for (size_t i = 0; i < samples_per_eval; ++i) {
573 | t0 = timer::Start32();
574 | lambda();
575 | t1 = timer::Stop32();
576 | samples.push_back(t1 - t0);
577 | }
578 |
579 | if (samples.size() >= p.min_mode_samples) {
580 | est = robust_statistics::Mode(samples.data(), samples.size());
581 | } else {
582 | // For "few" (depends also on the variance) samples, Median is safer.
583 | est = robust_statistics::Median(samples.data(), samples.size());
584 | }
585 | NANOBENCHMARK_CHECK(est != 0);
586 |
587 | // Median absolute deviation (mad) is a robust measure of 'variability'.
588 | const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
589 | samples.data(), samples.size(), est);
590 | *rel_mad = static_cast(int(abs_mad)) / est;
591 |
592 | if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
593 | if (p.verbose) {
594 | printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
595 | samples.size(), est, abs_mad, *rel_mad * 100.0);
596 | }
597 | return est;
598 | }
599 | }
600 |
601 | if (p.verbose) {
602 | printf(
603 | "WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n",
604 | *rel_mad * 100.0, max_rel_mad * 100.0, samples.size());
605 | }
606 | return est;
607 | }
608 |
609 | using InputVec = std::vector;
610 |
611 | // Returns vector of unique input values.
612 | InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
613 | InputVec unique(inputs, inputs + num_inputs);
614 | std::sort(unique.begin(), unique.end());
615 | unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
616 | return unique;
617 | }
618 |
// Returns how often we need to call func for sufficient precision, or zero
// on failure (e.g. the elapsed time is too long for a 32-bit tick count).
size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
               const Params& p) {
  // Min elapsed ticks for any input.
  Ticks min_duration = ~0u;

  for (const FuncInput input : unique) {
    // Make sure a 32-bit timer is sufficient.
    const uint64_t t0 = timer::Start64();
    PreventElision(func(arg, input));
    const uint64_t t1 = timer::Stop64();
    const uint64_t elapsed = t1 - t0;
    if (elapsed >= (1ULL << 30)) {
      fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n",
              input);
      return 0;
    }

    double rel_mad;
    // Estimate the stable per-call duration for this input.
    const Ticks total = SampleUntilStable(
        p.target_rel_mad, &rel_mad, p,
        [func, arg, input]() { PreventElision(func(arg, input)); });
    // NOTE(review): assumes total >= timer_resolution; otherwise this
    // unsigned subtraction wraps around - confirm.
    min_duration = std::min(min_duration, total - timer_resolution);
  }

  // Number of repetitions required to reach the target resolution.
  const size_t max_skip = p.precision_divisor;
  // Number of repetitions given the estimated duration.
  const size_t num_skip =
      min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
  if (p.verbose) {
    printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution,
           max_skip, min_duration, num_skip);
  }
  return num_skip;
}
656 |
657 | // Replicates inputs until we can omit "num_skip" occurrences of an input.
658 | InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
659 | const size_t num_unique, const size_t num_skip,
660 | const Params& p) {
661 | InputVec full;
662 | if (num_unique == 1) {
663 | full.assign(p.subset_ratio * num_skip, inputs[0]);
664 | return full;
665 | }
666 |
667 | full.reserve(p.subset_ratio * num_skip * num_inputs);
668 | for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) {
669 | full.insert(full.end(), inputs, inputs + num_inputs);
670 | }
671 | randen::Randen rng;
672 | std::shuffle(full.begin(), full.end(), rng);
673 | return full;
674 | }
675 |
676 | // Copies the "full" to "subset" in the same order, but with "num_skip"
677 | // randomly selected occurrences of "input_to_skip" removed.
678 | void FillSubset(const InputVec& full, const FuncInput input_to_skip,
679 | const size_t num_skip, InputVec* subset) {
680 | const size_t count = std::count(full.begin(), full.end(), input_to_skip);
681 | // Generate num_skip random indices: which occurrence to skip.
682 | std::vector omit(count);
683 | std::iota(omit.begin(), omit.end(), 0);
684 | // omit[] is the same on every call, but that's OK because they identify the
685 | // Nth instance of input_to_skip, so the position within full[] differs.
686 | randen::Randen rng;
687 | std::shuffle(omit.begin(), omit.end(), rng);
688 | omit.resize(num_skip);
689 | std::sort(omit.begin(), omit.end());
690 |
691 | uint32_t occurrence = ~0u; // 0 after preincrement
692 | size_t idx_omit = 0; // cursor within omit[]
693 | size_t idx_subset = 0; // cursor within *subset
694 | for (const FuncInput next : full) {
695 | if (next == input_to_skip) {
696 | ++occurrence;
697 | // Haven't removed enough already
698 | if (idx_omit < num_skip) {
699 | // This one is up for removal
700 | if (occurrence == omit[idx_omit]) {
701 | ++idx_omit;
702 | continue;
703 | }
704 | }
705 | }
706 | if (idx_subset < subset->size()) {
707 | (*subset)[idx_subset++] = next;
708 | }
709 | }
710 | NANOBENCHMARK_CHECK(idx_subset == subset->size());
711 | NANOBENCHMARK_CHECK(idx_omit == omit.size());
712 | NANOBENCHMARK_CHECK(occurrence == count - 1);
713 | }
714 |
// Returns total ticks elapsed for all inputs. Accumulates the worst observed
// variability into *max_rel_mad (callers pass it across multiple calls).
Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs,
                    const Params& p, double* max_rel_mad) {
  double rel_mad;
  const Ticks duration =
      SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
        for (const FuncInput input : *inputs) {
          PreventElision(func(arg, input));
        }
      });
  *max_rel_mad = std::max(*max_rel_mad, rel_mad);
  return duration;
}
728 |
729 | // (Nearly) empty Func for measuring timer overhead/resolution.
730 | NB_NOINLINE FuncOutput EmptyFunc(const void* arg, const FuncInput input) {
731 | return input;
732 | }
733 |
// Returns overhead of accessing inputs[] and calling a function; this will
// be deducted from future TotalDuration return values.
Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) {
  double rel_mad;  // discarded: only the estimate matters here.
  // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
  return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
    for (const FuncInput input : *inputs) {
      PreventElision(EmptyFunc(arg, input));
    }
  });
}
745 |
746 | } // namespace
747 |
748 | size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
749 | const size_t num_inputs, Result* results, const Params& p) {
750 | NANOBENCHMARK_CHECK(num_inputs != 0);
751 | const InputVec& unique = UniqueInputs(inputs, num_inputs);
752 |
753 | const size_t num_skip = NumSkip(func, arg, unique, p); // never 0
754 | if (num_skip == 0) return 0; // NumSkip already printed error message
755 | const float mul = 1.0f / static_cast(num_skip);
756 |
757 | const InputVec& full =
758 | ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
759 | InputVec subset(full.size() - num_skip);
760 |
761 | const Ticks overhead = Overhead(arg, &full, p);
762 | const Ticks overhead_skip = Overhead(arg, &subset, p);
763 | if (overhead < overhead_skip) {
764 | fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead,
765 | overhead_skip);
766 | return 0;
767 | }
768 |
769 | if (p.verbose) {
770 | printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(),
771 | overhead, overhead_skip);
772 | }
773 |
774 | double max_rel_mad = 0.0;
775 | const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
776 |
777 | for (size_t i = 0; i < unique.size(); ++i) {
778 | FillSubset(full, unique[i], num_skip, &subset);
779 | const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad);
780 |
781 | if (total < total_skip) {
782 | fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip);
783 | return 0;
784 | }
785 |
786 | const Ticks duration = (total - overhead) - (total_skip - overhead_skip);
787 | results[i].input = unique[i];
788 | results[i].ticks = duration * mul;
789 | results[i].variability = static_cast(max_rel_mad);
790 | }
791 |
792 | return unique.size();
793 | }
794 |
795 | } // namespace randen
796 |
--------------------------------------------------------------------------------
/nanobenchmark.h:
--------------------------------------------------------------------------------
1 | // Copyright 2017 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef NANOBENCHMARK_H_
16 | #define NANOBENCHMARK_H_
17 |
18 | // Benchmarks functions of a single integer argument with realistic branch
19 | // prediction hit rates. Uses a robust estimator to summarize the measurements.
20 | // The precision is about 0.2%.
21 | //
22 | // Examples: see nanobenchmark_test.cc.
23 | //
24 | // Background: Microbenchmarks such as http://github.com/google/benchmark
25 | // can measure elapsed times on the order of a microsecond. Shorter functions
26 | // are typically measured by repeating them thousands of times and dividing
27 | // the total elapsed time by this count. Unfortunately, repetition (especially
28 | // with the same input parameter!) influences the runtime. In time-critical
29 | // code, it is reasonable to expect warm instruction/data caches and TLBs,
30 | // but a perfect record of which branches will be taken is unrealistic.
31 | // Unless the application also repeatedly invokes the measured function with
32 | // the same parameter, the benchmark is measuring something very different -
33 | // a best-case result, almost as if the parameter were made a compile-time
34 | // constant. This may lead to erroneous conclusions about branch-heavy
35 | // algorithms outperforming branch-free alternatives.
36 | //
37 | // Our approach differs in three ways. Adding fences to the timer functions
38 | // reduces variability due to instruction reordering, improving the timer
39 | // resolution to about 40 CPU cycles. However, shorter functions must still
40 | // be invoked repeatedly. For more realistic branch prediction performance,
41 | // we vary the input parameter according to a user-specified distribution.
42 | // Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
43 | // loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
44 | // central tendency of the measurement samples with the "half sample mode",
45 | // which is more robust to outliers and skewed data than the mean or median.
46 |
47 | // WARNING if included from multiple translation units compiled with distinct
48 | // flags: this header requires textual inclusion and a predefined NB_NAMESPACE
49 | // macro that is unique to the current compile flags. We must also avoid
50 | // standard library headers such as vector and functional that define functions.
51 |
#include <stddef.h>
#include <stdint.h>
54 |
55 | namespace randen {
56 |
57 | namespace platform {
58 |
59 | // Ensures the thread is running on the specified cpu, and no others.
// Reduces variability caused by desynchronized socket RDTSC and context
// switches.
61 | // If "cpu" is negative, pin to the currently running core.
62 | void PinThreadToCPU(const int cpu = -1);
63 |
64 | // Returns tick rate, useful for converting measurements to seconds. Invariant
65 | // means the tick counter frequency is independent of CPU throttling or sleep.
66 | // This call may be expensive, callers should cache the result.
67 | double InvariantTicksPerSecond();
68 |
69 | } // namespace platform
70 |
71 | // Input influencing the function being measured (e.g. number of bytes to copy).
72 | using FuncInput = size_t;
73 |
74 | // "Proof of work" returned by Func to ensure the compiler does not elide it.
75 | using FuncOutput = uint64_t;
76 |
77 | // Function to measure: either 1) a captureless lambda or function with two
78 | // arguments or 2) a lambda with capture, in which case the first argument
79 | // is reserved for use by MeasureClosure.
80 | using Func = FuncOutput (*)(const void*, FuncInput);
81 |
// Internal parameters that determine precision/resolution/measuring time.
// Plain aggregate: callers default-construct and override individual fields.
struct Params {
  // For measuring timer overhead/resolution. Used in a nested loop =>
  // quadratic time, acceptable because we know timer overhead is "low".
  // constexpr because this is used to define array bounds.
  static constexpr size_t kTimerSamples = 256;

  // Best-case precision, expressed as a divisor of the timer resolution.
  // Larger => more calls to Func and higher precision.
  size_t precision_divisor = 1024;

  // Ratio between full and subset input distribution sizes. Cannot be less
  // than 2; larger values increase measurement time but more faithfully
  // model the given input distribution.
  size_t subset_ratio = 2;

  // Together with the estimated Func duration, determines how many times to
  // call Func before checking the sample variability. Larger values increase
  // measurement time, memory/cache use and precision.
  double seconds_per_eval = 4E-3;

  // The minimum number of samples before estimating the central tendency.
  size_t min_samples_per_eval = 7;

  // The mode is better than median for estimating the central tendency of
  // skewed/fat-tailed distributions, but it requires sufficient samples
  // relative to the width of half-ranges.
  size_t min_mode_samples = 64;

  // Maximum permissible variability (= median absolute deviation / center).
  double target_rel_mad = 0.002;

  // Abort after this many evals without reaching target_rel_mad. This
  // prevents infinite loops.
  size_t max_evals = 9;

  // Whether to print additional statistics to stdout.
  bool verbose = true;
};
121 |
// Measurement result for each unique input.
struct Result {
  // The unique input value this result refers to.
  FuncInput input;

  // Robust estimate (mode or median) of duration, per call.
  float ticks;

  // Measure of variability (median absolute deviation relative to "ticks").
  float variability;
};
132 |
133 | // Precisely measures the number of ticks elapsed when calling "func" with the
134 | // given inputs, shuffled to ensure realistic branch prediction hit rates.
135 | //
136 | // "func" returns a 'proof of work' to ensure its computations are not elided.
137 | // "arg" is passed to Func, or reserved for internal use by MeasureClosure.
138 | // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
139 | // "func". The values should be chosen to maximize coverage of "func". This
140 | // represents a distribution, so a value's frequency should reflect its
141 | // probability in the real application. Order does not matter; for example, a
142 | // uniform distribution over [0, 4) could be represented as {3,0,2,1}.
143 | // Returns how many Result were written to "results": one per unique input, or
144 | // zero if the measurement failed (an error message goes to stderr).
145 | size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
146 | const size_t num_inputs, Result* results,
147 | const Params& p = Params());
148 |
149 | // Per-copt namespace prevents leaking generated code into other modules.
150 | namespace NB_NAMESPACE {
151 |
152 | // Calls operator() of the given closure (lambda function).
153 | template
154 | static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
155 | return (*f)(input);
156 | }
157 |
158 | } // namespace NB_NAMESPACE
159 |
160 | // Same as Measure, except "closure" is typically a lambda function of
161 | // FuncInput -> FuncOutput with a capture list.
162 | template
163 | static inline size_t MeasureClosure(const Closure& closure,
164 | const FuncInput* inputs,
165 | const size_t num_inputs, Result* results,
166 | const Params& p = Params()) {
167 | return Measure(reinterpret_cast(&NB_NAMESPACE::CallClosure),
168 | reinterpret_cast(&closure), inputs, num_inputs,
169 | results, p);
170 | }
171 |
172 | } // namespace randen
173 |
174 | #endif // NANOBENCHMARK_H_
175 |
--------------------------------------------------------------------------------
/nanobenchmark_test.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2017 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
#include <stdio.h>
#include <unistd.h>  // sleep
17 |
18 | #include "nanobenchmark.h"
19 | #include "randen.h"
20 | #include "util.h"
21 | #include "vector128.h"
22 |
23 | namespace randen {
24 | namespace {
25 |
// Runs "num_rounds" AES rounds and returns one lane as proof of work.
uint64_t AES(const void*, const FuncInput num_rounds) {
  // Ensures multiple invocations are serially dependent, otherwise we're
  // measuring the throughput rather than latency.
  static V prev;
  V m = prev;
  for (size_t i = 0; i < num_rounds; ++i) {
    m = AES(m, m);  // presumably the V overload from vector128.h - confirm
  }
  prev = m;
  alignas(16) uint64_t lanes[2];
  Store(m, lanes, 0);
  return lanes[0];  // proof of work, prevents elision
}
39 |
40 | template
41 | void MeasureAES(const FuncInput (&inputs)[N]) {
42 | Result results[N];
43 | Params params;
44 | params.max_evals = 4; // avoid test timeout
45 | const size_t num_results = Measure(&AES, nullptr, inputs, N, results, params);
46 | for (size_t i = 0; i < num_results; ++i) {
47 | printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input,
48 | results[i].ticks, results[i].variability * 100.0);
49 | }
50 | }
51 |
52 | uint64_t Div(const void*, FuncInput in) {
53 | // Here we're measuring the throughput because benchmark invocations are
54 | // independent.
55 | const int64_t d1 = 0xFFFFFFFFFFll / int64_t(in); // IDIV
56 | return d1;
57 | }
58 |
59 | template
60 | void MeasureDiv(const FuncInput (&inputs)[N]) {
61 | Result results[N];
62 | Params params;
63 | params.max_evals = 4; // avoid test timeout
64 | const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params);
65 | for (size_t i = 0; i < num_results; ++i) {
66 | printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input,
67 | results[i].ticks, results[i].variability * 100.0);
68 | }
69 | }
70 |
Randen<uint32_t> rng;
72 |
// A function whose runtime depends on rng.
uint64_t Random(const void* arg, FuncInput in) {
  // 0..15 rounds, squared => 0..225 AES rounds: deliberately variable
  // runtime so MeasureRandom can verify high reported variability.
  const uint32_t r = rng() & 0xF;
  return AES(arg, r * r);
}
78 |
79 | // Ensure the measured variability is high.
80 | template
81 | void MeasureRandom(const FuncInput (&inputs)[N]) {
82 | Result results[N];
83 | Params p;
84 | p.max_evals = 4; // avoid test timeout
85 | p.verbose = false;
86 | const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
87 | for (size_t i = 0; i < num_results; ++i) {
88 | RANDEN_CHECK(results[i].variability > 1E-3);
89 | }
90 | }
91 |
92 | template
93 | void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) {
94 | printf("Expect a 'measurement failed' below:\n");
95 | Result results[N];
96 | const size_t num_results = MeasureClosure(
97 | [](const FuncInput input) {
98 | // Loop until the sleep succeeds (not interrupted by signal). We assume
99 | // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit.
100 | while (sleep(2) != 0) {
101 | }
102 | return input;
103 | },
104 | inputs, N, results);
105 | RANDEN_CHECK(num_results == 0);
106 | }
107 |
108 | void RunAll(const int argc, char* argv[]) {
109 | // Avoid migrating between cores - important on multi-socket systems.
110 | int cpu = -1;
111 | if (argc == 2) {
112 | cpu = strtol(argv[1], nullptr, 10);
113 | }
114 | platform::PinThreadToCPU(cpu);
115 |
116 | // unpredictable == 1 but the compiler doesn't know that.
117 | const int unpredictable = argc != 999;
118 | static const FuncInput inputs[] = {static_cast(unpredictable) + 2,
119 | static_cast(unpredictable + 9)};
120 |
121 | MeasureAES(inputs);
122 | MeasureDiv(inputs);
123 | MeasureRandom(inputs);
124 | EnsureLongMeasurementFails(inputs);
125 | }
126 |
127 | } // namespace
128 | } // namespace randen
129 |
// Entry point; optional argv[1] selects the CPU to pin to (see RunAll).
int main(int argc, char* argv[]) {
  randen::RunAll(argc, argv);
  return 0;
}
134 |
--------------------------------------------------------------------------------
/randen.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2017 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "randen.h"
16 |
#include <string.h>  // memcpy
18 |
19 | #include "vector128.h"
20 |
21 | namespace randen {
22 | namespace {
23 |
24 | // High-level summary:
25 | // 1) Reverie (see "A Robust and Sponge-Like PRNG with Improved Efficiency") is
26 | // a sponge-like random generator that requires a cryptographic permutation.
27 | // It improves upon "Provably Robust Sponge-Based PRNGs and KDFs" by
28 | // achieving backtracking resistance with only one Permute() per buffer.
29 | //
30 | // 2) "Simpira v2: A Family of Efficient Permutations Using the AES Round
31 | // Function" constructs up to 1024-bit permutations using an improved
32 | // Generalized Feistel network with 2-round AES-128 functions. This Feistel
33 | // block shuffle achieves diffusion faster and is less vulnerable to
34 | // sliced-biclique attacks than the Type-2 cyclic shuffle.
35 | //
36 | // 3) "Improving the Generalized Feistel" and "New criterion for diffusion
37 | // property" extends the same kind of improved Feistel block shuffle to 16
38 | // branches, which enables a 2048-bit permutation.
39 | //
40 | // We combine these three ideas and also change Simpira's subround keys from
41 | // structured/low-entropy counters to digits of Pi.
42 |
// Largest size for which security proofs are known.
constexpr int kFeistelBlocks = 16;

// Type-2 generalized Feistel => one round function for every two blocks.
constexpr int kFeistelFunctions = kFeistelBlocks / 2;  // = 8

// Ensures SPRP security and two full subblock diffusions.
constexpr int kFeistelRounds = 16 + 1;  // > 4 * log2(kFeistelBlocks)

// Independent keys for the first AES subround of each function:
// kKeys = 17 * 8 = 136 keys of 16 bytes each (272 u64 = 2.1 KiB).
constexpr int kKeys = kFeistelRounds * kFeistelFunctions;
54 |
// Returns a pointer to the kKeys * kLanes u64 lanes of static round-key
// material consumed sequentially by Permute() (one 128-bit key per AES
// subround). The data is constexpr, 32-byte aligned and read-only.
const uint64_t* RANDEN_RESTRICT Keys() {
  // "Nothing up my sleeve" numbers from the first hex digits of Pi, obtained
  // from http://hexpi.sourceforge.net/. Native byte order.
  alignas(32) static constexpr uint64_t pi_digits[kKeys * kLanes] = {
      RANDEN_LE(0x243F6A8885A308D3ull, 0x13198A2E03707344ull),
      RANDEN_LE(0xA4093822299F31D0ull, 0x082EFA98EC4E6C89ull),
      RANDEN_LE(0x452821E638D01377ull, 0xBE5466CF34E90C6Cull),
      RANDEN_LE(0xC0AC29B7C97C50DDull, 0x3F84D5B5B5470917ull),
      RANDEN_LE(0x9216D5D98979FB1Bull, 0xD1310BA698DFB5ACull),
      RANDEN_LE(0x2FFD72DBD01ADFB7ull, 0xB8E1AFED6A267E96ull),
      RANDEN_LE(0xBA7C9045F12C7F99ull, 0x24A19947B3916CF7ull),
      RANDEN_LE(0x0801F2E2858EFC16ull, 0x636920D871574E69ull),
      RANDEN_LE(0xA458FEA3F4933D7Eull, 0x0D95748F728EB658ull),
      RANDEN_LE(0x718BCD5882154AEEull, 0x7B54A41DC25A59B5ull),
      RANDEN_LE(0x9C30D5392AF26013ull, 0xC5D1B023286085F0ull),
      RANDEN_LE(0xCA417918B8DB38EFull, 0x8E79DCB0603A180Eull),
      RANDEN_LE(0x6C9E0E8BB01E8A3Eull, 0xD71577C1BD314B27ull),
      RANDEN_LE(0x78AF2FDA55605C60ull, 0xE65525F3AA55AB94ull),
      RANDEN_LE(0x5748986263E81440ull, 0x55CA396A2AAB10B6ull),
      RANDEN_LE(0xB4CC5C341141E8CEull, 0xA15486AF7C72E993ull),
      RANDEN_LE(0xB3EE1411636FBC2Aull, 0x2BA9C55D741831F6ull),
      RANDEN_LE(0xCE5C3E169B87931Eull, 0xAFD6BA336C24CF5Cull),
      RANDEN_LE(0x7A32538128958677ull, 0x3B8F48986B4BB9AFull),
      RANDEN_LE(0xC4BFE81B66282193ull, 0x61D809CCFB21A991ull),
      RANDEN_LE(0x487CAC605DEC8032ull, 0xEF845D5DE98575B1ull),
      RANDEN_LE(0xDC262302EB651B88ull, 0x23893E81D396ACC5ull),
      RANDEN_LE(0x0F6D6FF383F44239ull, 0x2E0B4482A4842004ull),
      RANDEN_LE(0x69C8F04A9E1F9B5Eull, 0x21C66842F6E96C9Aull),
      RANDEN_LE(0x670C9C61ABD388F0ull, 0x6A51A0D2D8542F68ull),
      RANDEN_LE(0x960FA728AB5133A3ull, 0x6EEF0B6C137A3BE4ull),
      RANDEN_LE(0xBA3BF0507EFB2A98ull, 0xA1F1651D39AF0176ull),
      RANDEN_LE(0x66CA593E82430E88ull, 0x8CEE8619456F9FB4ull),
      RANDEN_LE(0x7D84A5C33B8B5EBEull, 0xE06F75D885C12073ull),
      RANDEN_LE(0x401A449F56C16AA6ull, 0x4ED3AA62363F7706ull),
      RANDEN_LE(0x1BFEDF72429B023Dull, 0x37D0D724D00A1248ull),
      RANDEN_LE(0xDB0FEAD349F1C09Bull, 0x075372C980991B7Bull),
      RANDEN_LE(0x25D479D8F6E8DEF7ull, 0xE3FE501AB6794C3Bull),
      RANDEN_LE(0x976CE0BD04C006BAull, 0xC1A94FB6409F60C4ull),
      RANDEN_LE(0x5E5C9EC2196A2463ull, 0x68FB6FAF3E6C53B5ull),
      RANDEN_LE(0x1339B2EB3B52EC6Full, 0x6DFC511F9B30952Cull),
      RANDEN_LE(0xCC814544AF5EBD09ull, 0xBEE3D004DE334AFDull),
      RANDEN_LE(0x660F2807192E4BB3ull, 0xC0CBA85745C8740Full),
      RANDEN_LE(0xD20B5F39B9D3FBDBull, 0x5579C0BD1A60320Aull),
      RANDEN_LE(0xD6A100C6402C7279ull, 0x679F25FEFB1FA3CCull),
      RANDEN_LE(0x8EA5E9F8DB3222F8ull, 0x3C7516DFFD616B15ull),
      RANDEN_LE(0x2F501EC8AD0552ABull, 0x323DB5FAFD238760ull),
      RANDEN_LE(0x53317B483E00DF82ull, 0x9E5C57BBCA6F8CA0ull),
      RANDEN_LE(0x1A87562EDF1769DBull, 0xD542A8F6287EFFC3ull),
      RANDEN_LE(0xAC6732C68C4F5573ull, 0x695B27B0BBCA58C8ull),
      RANDEN_LE(0xE1FFA35DB8F011A0ull, 0x10FA3D98FD2183B8ull),
      RANDEN_LE(0x4AFCB56C2DD1D35Bull, 0x9A53E479B6F84565ull),
      RANDEN_LE(0xD28E49BC4BFB9790ull, 0xE1DDF2DAA4CB7E33ull),
      RANDEN_LE(0x62FB1341CEE4C6E8ull, 0xEF20CADA36774C01ull),
      RANDEN_LE(0xD07E9EFE2BF11FB4ull, 0x95DBDA4DAE909198ull),
      RANDEN_LE(0xEAAD8E716B93D5A0ull, 0xD08ED1D0AFC725E0ull),
      RANDEN_LE(0x8E3C5B2F8E7594B7ull, 0x8FF6E2FBF2122B64ull),
      RANDEN_LE(0x8888B812900DF01Cull, 0x4FAD5EA0688FC31Cull),
      RANDEN_LE(0xD1CFF191B3A8C1ADull, 0x2F2F2218BE0E1777ull),
      RANDEN_LE(0xEA752DFE8B021FA1ull, 0xE5A0CC0FB56F74E8ull),
      RANDEN_LE(0x18ACF3D6CE89E299ull, 0xB4A84FE0FD13E0B7ull),
      RANDEN_LE(0x7CC43B81D2ADA8D9ull, 0x165FA26680957705ull),
      RANDEN_LE(0x93CC7314211A1477ull, 0xE6AD206577B5FA86ull),
      RANDEN_LE(0xC75442F5FB9D35CFull, 0xEBCDAF0C7B3E89A0ull),
      RANDEN_LE(0xD6411BD3AE1E7E49ull, 0x00250E2D2071B35Eull),
      RANDEN_LE(0x226800BB57B8E0AFull, 0x2464369BF009B91Eull),
      RANDEN_LE(0x5563911D59DFA6AAull, 0x78C14389D95A537Full),
      RANDEN_LE(0x207D5BA202E5B9C5ull, 0x832603766295CFA9ull),
      RANDEN_LE(0x11C819684E734A41ull, 0xB3472DCA7B14A94Aull),
      RANDEN_LE(0x1B5100529A532915ull, 0xD60F573FBC9BC6E4ull),
      RANDEN_LE(0x2B60A47681E67400ull, 0x08BA6FB5571BE91Full),
      RANDEN_LE(0xF296EC6B2A0DD915ull, 0xB6636521E7B9F9B6ull),
      RANDEN_LE(0xFF34052EC5855664ull, 0x53B02D5DA99F8FA1ull),
      RANDEN_LE(0x08BA47996E85076Aull, 0x4B7A70E9B5B32944ull),
      RANDEN_LE(0xDB75092EC4192623ull, 0xAD6EA6B049A7DF7Dull),
      RANDEN_LE(0x9CEE60B88FEDB266ull, 0xECAA8C71699A18FFull),
      RANDEN_LE(0x5664526CC2B19EE1ull, 0x193602A575094C29ull),
      RANDEN_LE(0xA0591340E4183A3Eull, 0x3F54989A5B429D65ull),
      RANDEN_LE(0x6B8FE4D699F73FD6ull, 0xA1D29C07EFE830F5ull),
      RANDEN_LE(0x4D2D38E6F0255DC1ull, 0x4CDD20868470EB26ull),
      RANDEN_LE(0x6382E9C6021ECC5Eull, 0x09686B3F3EBAEFC9ull),
      RANDEN_LE(0x3C9718146B6A70A1ull, 0x687F358452A0E286ull),
      RANDEN_LE(0xB79C5305AA500737ull, 0x3E07841C7FDEAE5Cull),
      RANDEN_LE(0x8E7D44EC5716F2B8ull, 0xB03ADA37F0500C0Dull),
      RANDEN_LE(0xF01C1F040200B3FFull, 0xAE0CF51A3CB574B2ull),
      RANDEN_LE(0x25837A58DC0921BDull, 0xD19113F97CA92FF6ull),
      RANDEN_LE(0x9432477322F54701ull, 0x3AE5E58137C2DADCull),
      RANDEN_LE(0xC8B576349AF3DDA7ull, 0xA94461460FD0030Eull),
      RANDEN_LE(0xECC8C73EA4751E41ull, 0xE238CD993BEA0E2Full),
      RANDEN_LE(0x3280BBA1183EB331ull, 0x4E548B384F6DB908ull),
      RANDEN_LE(0x6F420D03F60A04BFull, 0x2CB8129024977C79ull),
      RANDEN_LE(0x5679B072BCAF89AFull, 0xDE9A771FD9930810ull),
      RANDEN_LE(0xB38BAE12DCCF3F2Eull, 0x5512721F2E6B7124ull),
      RANDEN_LE(0x501ADDE69F84CD87ull, 0x7A5847187408DA17ull),
      RANDEN_LE(0xBC9F9ABCE94B7D8Cull, 0xEC7AEC3ADB851DFAull),
      RANDEN_LE(0x63094366C464C3D2ull, 0xEF1C18473215D808ull),
      RANDEN_LE(0xDD433B3724C2BA16ull, 0x12A14D432A65C451ull),
      RANDEN_LE(0x50940002133AE4DDull, 0x71DFF89E10314E55ull),
      RANDEN_LE(0x81AC77D65F11199Bull, 0x043556F1D7A3C76Bull),
      RANDEN_LE(0x3C11183B5924A509ull, 0xF28FE6ED97F1FBFAull),
      RANDEN_LE(0x9EBABF2C1E153C6Eull, 0x86E34570EAE96FB1ull),
      RANDEN_LE(0x860E5E0A5A3E2AB3ull, 0x771FE71C4E3D06FAull),
      RANDEN_LE(0x2965DCB999E71D0Full, 0x803E89D65266C825ull),
      RANDEN_LE(0x2E4CC9789C10B36Aull, 0xC6150EBA94E2EA78ull),
      RANDEN_LE(0xA6FC3C531E0A2DF4ull, 0xF2F74EA7361D2B3Dull),
      RANDEN_LE(0x1939260F19C27960ull, 0x5223A708F71312B6ull),
      RANDEN_LE(0xEBADFE6EEAC31F66ull, 0xE3BC4595A67BC883ull),
      RANDEN_LE(0xB17F37D1018CFF28ull, 0xC332DDEFBE6C5AA5ull),
      RANDEN_LE(0x6558218568AB9702ull, 0xEECEA50FDB2F953Bull),
      RANDEN_LE(0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull),
      RANDEN_LE(0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull),
      RANDEN_LE(0x0334FE1EAA0363CFull, 0xB5735C904C70A239ull),
      RANDEN_LE(0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull),
      RANDEN_LE(0x9CAB5CABB2F3846Eull, 0x648B1EAF19BDF0CAull),
      RANDEN_LE(0xA02369B9655ABB50ull, 0x40685A323C2AB4B3ull),
      RANDEN_LE(0x319EE9D5C021B8F7ull, 0x9B540B19875FA099ull),
      RANDEN_LE(0x95F7997E623D7DA8ull, 0xF837889A97E32D77ull),
      RANDEN_LE(0x11ED935F16681281ull, 0x0E358829C7E61FD6ull),
      RANDEN_LE(0x96DEDFA17858BA99ull, 0x57F584A51B227263ull),
      RANDEN_LE(0x9B83C3FF1AC24696ull, 0xCDB30AEB532E3054ull),
      RANDEN_LE(0x8FD948E46DBC3128ull, 0x58EBF2EF34C6FFEAull),
      RANDEN_LE(0xFE28ED61EE7C3C73ull, 0x5D4A14D9E864B7E3ull),
      RANDEN_LE(0x42105D14203E13E0ull, 0x45EEE2B6A3AAABEAull),
      RANDEN_LE(0xDB6C4F15FACB4FD0ull, 0xC742F442EF6ABBB5ull),
      RANDEN_LE(0x654F3B1D41CD2105ull, 0xD81E799E86854DC7ull),
      RANDEN_LE(0xE44B476A3D816250ull, 0xCF62A1F25B8D2646ull),
      RANDEN_LE(0xFC8883A0C1C7B6A3ull, 0x7F1524C369CB7492ull),
      RANDEN_LE(0x47848A0B5692B285ull, 0x095BBF00AD19489Dull),
      RANDEN_LE(0x1462B17423820D00ull, 0x58428D2A0C55F5EAull),
      RANDEN_LE(0x1DADF43E233F7061ull, 0x3372F0928D937E41ull),
      RANDEN_LE(0xD65FECF16C223BDBull, 0x7CDE3759CBEE7460ull),
      RANDEN_LE(0x4085F2A7CE77326Eull, 0xA607808419F8509Eull),
      RANDEN_LE(0xE8EFD85561D99735ull, 0xA969A7AAC50C06C2ull),
      RANDEN_LE(0x5A04ABFC800BCADCull, 0x9E447A2EC3453484ull),
      RANDEN_LE(0xFDD567050E1E9EC9ull, 0xDB73DBD3105588CDull),
      RANDEN_LE(0x675FDA79E3674340ull, 0xC5C43465713E38D8ull),
      RANDEN_LE(0x3D28F89EF16DFF20ull, 0x153E21E78FB03D4Aull),
      RANDEN_LE(0xE6E39F2BDB83ADF7ull, 0xE93D5A68948140F7ull),
      RANDEN_LE(0xF64C261C94692934ull, 0x411520F77602D4F7ull),
      RANDEN_LE(0xBCF46B2ED4A10068ull, 0xD40824713320F46Aull),
      RANDEN_LE(0x43B7D4B7500061AFull, 0x1E39F62E97244546ull)};
  // Guards against silently truncating the initializer list above.
  static_assert(pi_digits[kKeys * kLanes - 1] != 0, "Too few initializers");
  return pi_digits;
}
198 |
199 | // Improved odd-even shuffle from "New criterion for diffusion property".
200 | RANDEN_INLINE void BlockShuffle(uint64_t* RANDEN_RESTRICT state) {
201 | // First make a copy (optimized out).
202 | uint64_t source[kFeistelBlocks * kLanes];
203 | memcpy(source, state, sizeof(source));
204 |
205 | constexpr int shuffle[kFeistelBlocks] = {7, 2, 13, 4, 11, 8, 3, 6,
206 | 15, 0, 9, 10, 1, 14, 5, 12};
207 | for (int branch = 0; branch < kFeistelBlocks; ++branch) {
208 | const V v = Load(source, shuffle[branch]);
209 | Store(v, state, branch);
210 | }
211 | }
212 |
// Cryptographic permutation based via type-2 Generalized Feistel Network.
// Indistinguishable from ideal by chosen-ciphertext adversaries using less than
// 2^64 queries if the round function is a PRF. This is similar to the b=8 case
// of Simpira v2, but more efficient than its generic construction for b=16.
RANDEN_INLINE void Permute(uint64_t* RANDEN_RESTRICT state) {
  // Round keys for one AES per Feistel round and branch: first digits of Pi.
  const uint64_t* RANDEN_RESTRICT keys = Keys();

  // (Successfully unrolled; the first iteration jumps into the second half)
#ifdef __clang__
#pragma clang loop unroll_count(2)
#endif
  for (int round = 0; round < kFeistelRounds; ++round) {
    // Each iteration processes one even/odd branch pair; the odd branch is
    // overwritten with F(even) XOR odd, the even branch is left unchanged.
    for (int branch = 0; branch < kFeistelBlocks; branch += 2) {
      const V even = Load(state, branch);
      const V odd = Load(state, branch + 1);
      // Feistel round function using two AES subrounds. Very similar to F()
      // from Simpira v2, but with independent subround keys. Uses 17 AES rounds
      // per 16 bytes (vs. 10 for AES-CTR). Computing eight round functions in
      // parallel hides the 7-cycle AESNI latency on HSW. Note that the Feistel
      // XORs are 'free' (included in the second AES instruction).
      const V f1 = AES(even, Load(keys, 0));
      // Advance by one 128-bit key per round function (kLanes u64 lanes).
      keys += kLanes;
      const V f2 = AES(f1, odd);
      Store(f2, state, branch + 1);
    }

    // Spread diffusion across branch pairs before the next round.
    BlockShuffle(state);
  }
}
243 |
244 | // Enables native loads in the round loop by pre-swapping.
245 | RANDEN_INLINE void SwapIfBigEndian(uint64_t* RANDEN_RESTRICT state) {
246 | #ifdef RANDEN_BIG_ENDIAN
247 | for (int branch = 0; branch < kFeistelBlocks; ++branch) {
248 | const V v = ReverseBytes(Load(state, branch));
249 | Store(v, state, branch);
250 | }
251 | #endif
252 | }
253 |
254 | } // namespace
255 |
256 | void Internal::Absorb(const void* seed_void, void* state_void) {
257 | uint64_t* RANDEN_RESTRICT state = reinterpret_cast(state_void);
258 | const uint64_t* RANDEN_RESTRICT seed =
259 | reinterpret_cast(seed_void);
260 |
261 | constexpr int kCapacityBlocks = kCapacityBytes / sizeof(V);
262 | static_assert(kCapacityBlocks * sizeof(V) == kCapacityBytes, "Not i*V");
263 | for (size_t i = kCapacityBlocks; i < kStateBytes / sizeof(V); ++i) {
264 | V block = Load(state, i);
265 | block ^= Load(seed, i - kCapacityBlocks);
266 | Store(block, state, i);
267 | }
268 | }
269 |
270 | void Internal::Generate(void* state_void) {
271 | uint64_t* RANDEN_RESTRICT state = reinterpret_cast(state_void);
272 |
273 | static_assert(kCapacityBytes == sizeof(V), "Capacity mismatch");
274 | const V prev_inner = Load(state, 0);
275 |
276 | SwapIfBigEndian(state);
277 |
278 | Permute(state);
279 |
280 | SwapIfBigEndian(state);
281 |
282 | // Ensure backtracking resistance.
283 | V inner = Load(state, 0);
284 | inner ^= prev_inner;
285 | Store(inner, state, 0);
286 | }
287 |
288 | } // namespace randen
289 |
--------------------------------------------------------------------------------
/randen.h:
--------------------------------------------------------------------------------
1 | // Copyright 2017 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // 'Strong' (indistinguishable from random, backtracking-resistant) random
16 | // generator, faster in some benchmarks than std::mt19937_64 and pcg64_c32.
17 | // Accompanying paper: https://arxiv.org/abs/1810.02227
18 |
19 | #ifndef RANDEN_H_
20 | #define RANDEN_H_
21 |
22 | #include
23 | #include // memcpy
24 | #include
25 | #include
26 | #include
27 | #include
28 | #include
29 | #include
30 | #include
31 |
32 | // RANDen = RANDom generator or beetroots in Swiss German.
33 | namespace randen {
34 |
struct Internal {
  // XORs a (kStateBytes - kCapacityBytes)-byte seed into the outer state.
  static void Absorb(const void* seed, void* state);
  // Permutes the state buffer in-place and re-hides the inner portion.
  static void Generate(void* state);

  static constexpr int kStateBytes = 256;  // 2048-bit

  // Size of the 'inner' (inaccessible) part of the sponge. Larger values would
  // require more frequent calls to Generate.
  static constexpr int kCapacityBytes = 16;  // 128-bit
};
45 |
46 | // Deterministic pseudorandom byte generator with backtracking resistance
47 | // (leaking the state does not compromise prior outputs). Based on Reverie
48 | // (see "A Robust and Sponge-Like PRNG with Improved Efficiency") instantiated
49 | // with an improved Simpira-like permutation.
50 | // Returns values of type "T" (must be a built-in unsigned integer type).
51 | template
52 | class alignas(32) Randen {
53 | static_assert(std::is_unsigned::value,
54 | "Randen must be parameterized by a built-in unsigned integer");
55 |
56 | public:
57 | // C++11 URBG interface:
58 | using result_type = T;
59 |
60 | static constexpr result_type min() {
61 | return std::numeric_limits::min();
62 | }
63 |
64 | static constexpr result_type max() {
65 | return std::numeric_limits::max();
66 | }
67 |
68 | explicit Randen(result_type seed_value = 0) { seed(seed_value); }
69 |
70 | template ::value>::type>
73 | explicit Randen(SeedSequence&& seq) {
74 | seed(seq);
75 | }
76 |
77 | // Default copy and move operators.
78 | Randen(const Randen&) = default;
79 | Randen& operator=(const Randen&) = default;
80 |
81 | Randen(Randen&&) = default;
82 | Randen& operator=(Randen&&) = default;
83 |
84 | // Returns random bits from the buffer in units of T.
85 | result_type operator()() {
86 | // (Local copy ensures compiler knows this is not aliased.)
87 | size_t next = next_;
88 |
89 | // Refill the buffer if needed (unlikely).
90 | if (next >= kStateT) {
91 | Internal::Generate(state_);
92 | next = kCapacityT;
93 | }
94 |
95 | const result_type ret = state_[next];
96 | next_ = next + 1;
97 | return ret;
98 | }
99 |
100 | template
101 | typename std::enable_if<
102 | !std::is_convertible::value, void>::type
103 | seed(SeedSequence& seq) {
104 | seed();
105 | reseed(seq);
106 | }
107 |
108 | void seed(result_type seed_value = 0) {
109 | next_ = kStateT;
110 | std::fill(std::begin(state_), std::begin(state_) + kCapacityT, 0);
111 | std::fill(std::begin(state_) + kCapacityT, std::end(state_), seed_value);
112 | }
113 |
114 | // Inserts entropy into (part of) the state. Calling this periodically with
115 | // sufficient entropy ensures prediction resistance (attackers cannot predict
116 | // future outputs even if state is compromised).
117 | template
118 | void reseed(SeedSequence& seq) {
119 | using U32 = typename SeedSequence::result_type;
120 | constexpr int kRate32 =
121 | (Internal::kStateBytes - Internal::kCapacityBytes) / sizeof(U32);
122 | U32 buffer[kRate32];
123 | seq.generate(buffer, buffer + kRate32);
124 | Internal::Absorb(buffer, state_);
125 | next_ = kStateT; // Generate will be called by operator()
126 | }
127 |
128 | void discard(unsigned long long count) {
129 | using ull_t = unsigned long long;
130 | const ull_t remaining = kStateT - next_;
131 | if (count <= remaining) {
132 | next_ += count;
133 | return;
134 | }
135 | count -= remaining;
136 |
137 | const ull_t kRateT = kStateT - kCapacityT;
138 | while (count > kRateT) {
139 | Internal::Generate(state_);
140 | next_ = kCapacityT;
141 | count -= kRateT;
142 | }
143 |
144 | if (count != 0) {
145 | Internal::Generate(state_);
146 | next_ = kCapacityT + count;
147 | }
148 | }
149 |
150 | bool operator==(const Randen& other) const {
151 | return next_ == other.next_ &&
152 | std::equal(std::begin(state_), std::end(state_),
153 | std::begin(other.state_));
154 | }
155 |
156 | bool operator!=(const Randen& other) const { return !(*this == other); }
157 |
158 | template
159 | friend std::basic_ostream& operator<<(
160 | std::basic_ostream& os, // NOLINT(runtime/references)
161 | const Randen& engine) { // NOLINT(runtime/references)
162 | const auto flags = os.flags(std::ios_base::dec | std::ios_base::left);
163 | const auto fill = os.fill(os.widen(' '));
164 |
165 | for (auto x : engine.state_) {
166 | os << x << os.fill();
167 | }
168 | os << engine.next_;
169 |
170 | os.flags(flags);
171 | os.fill(fill);
172 | return os;
173 | }
174 |
175 | template
176 | friend std::basic_istream& operator>>(
177 | std::basic_istream& is, // NOLINT(runtime/references)
178 | Randen& engine) { // NOLINT(runtime/references)
179 | const auto flags = is.flags(std::ios_base::dec | std::ios_base::skipws);
180 | const auto fill = is.fill(is.widen(' '));
181 |
182 | T state[kStateT];
183 | size_t next;
184 | for (auto& x : state) {
185 | is >> x;
186 | }
187 | is >> next;
188 | if (!is.fail()) {
189 | memcpy(engine.state_, state, sizeof(engine.state_));
190 | engine.next_ = next;
191 | }
192 | is.flags(flags);
193 | is.fill(fill);
194 | return is;
195 | }
196 |
197 | private:
198 | static constexpr size_t kStateT = Internal::kStateBytes / sizeof(T);
199 | static constexpr size_t kCapacityT = Internal::kCapacityBytes / sizeof(T);
200 |
201 | // First kCapacityT are `inner', the others are accessible random bits.
202 | alignas(32) result_type state_[kStateT];
203 | size_t next_; // index within state_
204 | };
205 |
206 | } // namespace randen
207 |
208 | #endif // RANDEN_H_
209 |
--------------------------------------------------------------------------------
/randen_benchmark.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // Please disable Turbo Boost and CPU throttling!
16 |
17 | #include "randen.h"
18 |
19 | // std::uniform_*_distribution are slow due to division/log2; we provide
20 | // faster variants if this is 0.
21 | #define USE_STD_DISTRIBUTIONS 0
22 |
23 | // Which engines to benchmark.
24 | #define ENABLE_RANDEN 1
25 | #define ENABLE_PCG 1
26 | #define ENABLE_MT 1
27 | #if defined(__SSE2__) && defined(__AES__)
28 | #define ENABLE_CHACHA 1
29 | #else
30 | #define ENABLE_CHACHA 0
31 | #endif
32 | #define ENABLE_OS 1
33 |
34 | #if ENABLE_PCG
35 | #include "third_party/pcg_random/include/pcg_random.hpp"
36 | #endif
37 |
38 | #if ENABLE_MT
39 | #include
40 | #endif
41 |
42 | #if ENABLE_CHACHA
43 | #include "engine_chacha.h"
44 | #endif
45 |
46 | #if ENABLE_OS
47 | #include "engine_os.h"
48 | #endif
49 |
50 |
51 | #ifdef _MSC_VER
52 | #include
53 | #endif
54 | #include
55 | #include
56 | #include // iota
57 |
58 | #include "nanobenchmark.h"
59 | #include "util.h"
60 |
61 | namespace randen {
62 | namespace {
63 |
64 | #if USE_STD_DISTRIBUTIONS
65 | using UniformInt = std::uniform_int_distribution;
66 | using UniformDouble = std::uniform_real_distribution;
67 | #else
68 | // These are subsets of std::uniform_*_distribution.
69 |
// Uniform draw from a half-open 32-bit interval using Lemire's
// multiply-shift rejection method (division-free on the common path).
class UniformInt {
 public:
  // (To support u64, add a Multiply overload and GetU64 as below.)
  using result_type = uint32_t;

  struct param_type {
    using distribution_type = UniformInt;

    param_type(const result_type begin, const result_type end)
        : begin(begin), end(end) {}

    // Half-open interval.
    result_type begin;
    result_type end;
  };

  // Engine is a C++11 UniformRandomBitGenerator returning >= 32 bits.
  // Returns a value uniformly distributed in [param.begin, param.end).
  template <class Engine>
  result_type operator()(Engine& engine, const param_type param) const {
    using Bits = decltype(engine());  // == typename Engine::result_type
    static_assert(std::is_same<Bits, uint32_t>::value ||
                      std::is_same<Bits, uint64_t>::value,
                  "Need u32 or u64");

    // We assume range < pow(2, sizeof(decltype(engine()))*8).
    const result_type range = param.end - param.begin;

    // Division-free with high probability. Algorithm and variable names are
    // from https://arxiv.org/pdf/1805.10941.pdf.
    result_type x = engine();  // (possibly a narrowing conversion from Bits)
    result_type hi, lo;
    Multiply(x, range, &hi, &lo);
    // Rejected, try again (unlikely for small ranges).
    if (lo < range) {
      const result_type t = Negate(range) % range;
      // BUG FIX: the paper rejects while the LOW half l < t; testing `hi`
      // here biased the output distribution (and could loop on valid draws).
      while (lo < t) {
        x = engine();
        Multiply(x, range, &hi, &lo);
      }
    }

    return hi + param.begin;
  }

 private:
  static constexpr result_type Negate(result_type x) {
    return ~x + 1;  // assumes two's complement.
  }

  // 32x32 -> 64-bit multiply split into high/low 32-bit halves.
  static void Multiply(const uint32_t x, const uint32_t y, uint32_t* hi,
                       uint32_t* lo) {
    const uint64_t wide = static_cast<uint64_t>(x) * y;
    *hi = wide >> 32;
    *lo = static_cast<uint32_t>(wide & 0xFFFFFFFFu);
  }
};
126 |
127 | class UniformDouble {
128 | public:
129 | // (Can also be float - we would just cast from double.)
130 | using result_type = double;
131 |
132 | // Engine is a C++11 UniformRandomBitGenerator returning either u32 or u64.
133 | template
134 | result_type operator()(Engine& engine) const {
135 | uint64_t bits = GetU64(decltype(engine())(), engine);
136 | if (bits == 0) return static_cast(0.0);
137 | const int leading_zeros = NumZeroBitsAboveMSBNonzero(bits);
138 | bits <<= leading_zeros; // shift out leading zeros
139 | bits >>= (64 - 53); // zero exponent
140 | const uint64_t exp = 1022 - leading_zeros;
141 | const uint64_t ieee = (exp << 52) | bits;
142 | double ret;
143 | memcpy(&ret, &ieee, sizeof(ret));
144 | return static_cast(ret);
145 | }
146 |
147 | private:
148 | template
149 | static uint64_t GetU64(uint64_t, Engine& engine) {
150 | return engine();
151 | }
152 |
153 | // Adapter for generating u64 from u32 engine.
154 | template
155 | static uint64_t GetU64(uint32_t, Engine& engine) {
156 | uint64_t ret = engine();
157 | ret <<= 32;
158 | ret |= engine();
159 | return ret;
160 | }
161 | };
162 | #endif // !USE_STD_DISTRIBUTIONS
163 |
164 | // Benchmark::Num64() is passed to its constructor and operator() after
165 | // multiplying with a (non-compile-time-constant) 1 to prevent constant folding.
166 | // It is also used to compute cycles per byte.
167 |
// Microbenchmark: generates N numbers in a tight loop.
// (Fix: the `template <class Engine>` header was lost in extraction.)
struct BenchmarkLoop {
  // Large enough that we can ignore size % buffer size.
  static size_t Num64() { return 100000; }

  explicit BenchmarkLoop(const uint64_t num_64) {}

  // Draws num_64 values (num_64 must be >= 1); returns the last so the
  // engine calls cannot be optimized away.
  template <class Engine>
  uint64_t operator()(const uint64_t num_64, Engine& engine) const {
    for (size_t i = 0; i < num_64 - 1; ++i) {
      (void)engine();
    }
    return engine();
  }
};
183 |
184 | // Real-world benchmark: shuffles a vector.
185 | class BenchmarkShuffle {
186 | public:
187 | static size_t Num64() { return 50000; }
188 |
189 | explicit BenchmarkShuffle(const uint64_t num_64) : ints_to_shuffle_(num_64) {}
190 |
191 | template
192 | uint64_t operator()(const uint64_t num_64, Engine& engine) const {
193 | ints_to_shuffle_[0] = static_cast(num_64 & 0xFFFF);
194 | #if USE_STD_DISTRIBUTIONS
195 | std::shuffle(ints_to_shuffle_.begin(), ints_to_shuffle_.end(), engine);
196 | #else
197 | // Similar algorithm, but UniformInt instead of std::u_i_d => 2-3x speedup.
198 | UniformInt dist;
199 | for (size_t i = num_64 - 1; i != 0; --i) {
200 | const UniformInt::param_type param(0, i);
201 | std::swap(ints_to_shuffle_[i], ints_to_shuffle_[dist(engine, param)]);
202 | }
203 | #endif
204 | return ints_to_shuffle_[0];
205 | }
206 |
207 | private:
208 | mutable std::vector ints_to_shuffle_;
209 | };
210 |
211 | // Reservoir sampling.
212 | class BenchmarkSample {
213 | public:
214 | static size_t Num64() { return 50000; }
215 |
216 | explicit BenchmarkSample(const uint64_t num_64)
217 | : population_(num_64), chosen_(kNumChosen) {
218 | std::iota(population_.begin(), population_.end(), 0);
219 | }
220 |
221 | template
222 | uint64_t operator()(const uint64_t num_64, Engine& engine) const {
223 | // Can replace with std::sample after C++17.
224 | std::copy(population_.begin(), population_.begin() + kNumChosen,
225 | chosen_.begin());
226 | UniformInt dist;
227 | for (size_t i = kNumChosen; i < num_64; ++i) {
228 | const UniformInt::param_type param(0, i);
229 | const size_t index = dist(engine, param);
230 | if (index < kNumChosen) {
231 | chosen_[index] = population_[i];
232 | }
233 | }
234 |
235 | return chosen_.front();
236 | }
237 |
238 | private:
239 | static constexpr size_t kNumChosen = 10000;
240 |
241 | std::vector population_;
242 | mutable std::vector chosen_;
243 | };
244 |
245 | // Actual application: Monte Carlo estimation of Pi * 1E6.
246 | class BenchmarkMonteCarlo {
247 | public:
248 | static size_t Num64() { return 200000; }
249 |
250 | explicit BenchmarkMonteCarlo(const uint64_t num_64) {}
251 |
252 | template
253 | uint64_t operator()(const uint64_t num_64, Engine& engine) const {
254 | int64_t in_circle = 0;
255 | for (size_t i = 0; i < num_64; i += 2) {
256 | const double x = dist_(engine);
257 | const double y = dist_(engine);
258 | in_circle += (x * x + y * y) < 1.0;
259 | }
260 | return 8 * 1000 * 1000 * in_circle / num_64;
261 | }
262 |
263 | private:
264 | mutable UniformDouble dist_;
265 | };
266 |
267 | template
268 | void RunBenchmark(const char* caption, Engine& engine, const int unpredictable1,
269 | const Benchmark& benchmark) {
270 | printf("%8s: ", caption);
271 | const size_t kNumInputs = 1;
272 | const FuncInput inputs[kNumInputs] = {
273 | static_cast(Benchmark::Num64() * unpredictable1)};
274 | Result results[kNumInputs];
275 |
276 | Params p;
277 | p.verbose = false;
278 | #if defined(__powerpc__)
279 | p.max_evals = 7;
280 | #else
281 | p.max_evals = 8;
282 | #endif
283 | p.target_rel_mad = 0.002;
284 | const size_t num_results = MeasureClosure(
285 | [&benchmark, &engine](const FuncInput input) {
286 | return benchmark(input, engine);
287 | },
288 | inputs, kNumInputs, results, p);
289 | RANDEN_CHECK(num_results == kNumInputs);
290 | for (size_t i = 0; i < num_results; ++i) {
291 | const double cycles_per_byte =
292 | results[i].ticks / (results[i].input * sizeof(uint64_t));
293 | const double mad = results[i].variability * cycles_per_byte;
294 | printf("%6zu: %5.2f (+/- %5.3f)\n", results[i].input, cycles_per_byte, mad);
295 | }
296 | }
297 |
// Calls RunBenchmark for each (enabled) engine.
// (Fix: template header and engine template arguments were lost in
// extraction; reconstructed ones are marked below.)
template <class Benchmark>
void ForeachEngine(const int unpredictable1) {
  using T = uint64_t;  // WARNING: keep in sync with MT/PCG.

  const Benchmark benchmark(
      static_cast<uint64_t>(Benchmark::Num64() * unpredictable1));

#if ENABLE_RANDEN
  Randen<T> eng_randen;
  RunBenchmark("Randen", eng_randen, unpredictable1, benchmark);
#endif

#if ENABLE_PCG
  // Quoting from pcg_random.hpp: "the c variants offer better crypographic
  // security (just how good the cryptographic security is is an open
  // question)".
  pcg64_c32 eng_pcg;
  RunBenchmark("PCG", eng_pcg, unpredictable1, benchmark);
#endif

#if ENABLE_MT
  std::mt19937_64 eng_mt;
  RunBenchmark("MT", eng_mt, unpredictable1, benchmark);
#endif

#if ENABLE_CHACHA
  // NOTE(review): round count reconstructed from the "ChaCha8" caption -
  // confirm against engine_chacha.h.
  ChaCha<8> eng_chacha(0x243f6a8885a308d3ull, 0x243F6A8885A308D3ull);
  RunBenchmark("ChaCha8", eng_chacha, unpredictable1, benchmark);
#endif

#if ENABLE_OS
  // NOTE(review): template argument reconstructed - verify engine_os.h.
  EngineOS<T> eng_os;
  RunBenchmark("OS", eng_os, unpredictable1, benchmark);
#endif

  printf("\n");
}
337 |
338 | void RunAll(int argc, char* argv[]) {
339 | // Immediately output any results (for non-local runs).
340 | setvbuf(stdout, nullptr, _IONBF, 0);
341 |
342 | printf("Config: enable std=%d\n", USE_STD_DISTRIBUTIONS);
343 |
344 | // Avoid migrating between cores - important on multi-socket systems.
345 | int cpu = -1;
346 | if (argc == 2) {
347 | cpu = strtol(argv[1], nullptr, 10);
348 | }
349 | platform::PinThreadToCPU(cpu);
350 |
351 | // Ensures the iteration counts are not compile-time constants.
352 | const int unpredictable1 = argc != 999;
353 |
354 | ForeachEngine(unpredictable1);
355 | ForeachEngine(unpredictable1);
356 | ForeachEngine(unpredictable1);
357 | ForeachEngine(unpredictable1);
358 | }
359 |
360 | } // namespace
361 | } // namespace randen
362 |
int main(int argc, char* argv[]) {
  // Optional argv[1]: CPU index to pin the benchmark thread to.
  randen::RunAll(argc, argv);
  return 0;
}
367 |
--------------------------------------------------------------------------------
/randen_test.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2017 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "randen.h"
16 |
17 | #include
18 | #include
19 | #include // seed_seq
20 | #include
21 |
22 | #define UPDATE_GOLDEN 0
23 | #define ENABLE_VERIFY 1
24 | #define ENABLE_DUMP 0
25 |
26 | namespace randen {
27 | namespace {
28 |
// Stringifies its argument so the failed condition can be printed verbatim.
#define STR(x) #x

// Minimal stand-alone assertion macro: on failure, prints the condition
// text and source line, then aborts. Avoids depending on a test framework.
#define ASSERT_TRUE(condition) \
  do { \
    if (!(condition)) { \
      printf("Assertion [" STR(condition) "] failed on line %d\n", __LINE__); \
      abort(); \
    } \
  } while (false)
38 |
39 | using EngRanden = Randen;
40 |
41 | #if ENABLE_VERIFY
42 |
43 | void VerifyReseedChangesAllValues() {
44 | const size_t kNumOutputs = 127;
45 | EngRanden engine;
46 |
47 | std::seed_seq seq1{1, 2, 3, 4, 5, 6, 7};
48 | engine.seed(seq1);
49 | uint64_t out1[kNumOutputs];
50 | for (size_t i = 0; i < kNumOutputs; ++i) {
51 | out1[i] = engine();
52 | }
53 |
54 | std::seed_seq seq2{127, 255, 511};
55 | engine.seed(seq2);
56 | uint64_t out2[kNumOutputs];
57 | engine.seed(seq2);
58 |
59 | for (size_t i = 0; i < kNumOutputs; ++i) {
60 | out2[i] = engine();
61 | ASSERT_TRUE(out2[i] != out1[i]);
62 | }
63 | }
64 |
65 | void VerifyDiscard() {
66 | const int N = 56; // two buffer's worth
67 | for (int num_used = 0; num_used < N; ++num_used) {
68 | EngRanden engine_used;
69 | for (int i = 0; i < num_used; ++i) {
70 | (void)engine_used();
71 | }
72 |
73 | for (int num_discard = 0; num_discard < N; ++num_discard) {
74 | EngRanden engine1 = engine_used;
75 | EngRanden engine2 = engine_used;
76 | for (int i = 0; i < num_discard; ++i) {
77 | (void)engine1();
78 | }
79 | engine2.discard(num_discard);
80 | for (int i = 0; i < N; ++i) {
81 | const uint64_t r1 = engine1();
82 | const uint64_t r2 = engine2();
83 | ASSERT_TRUE(r1 == r2);
84 | }
85 | }
86 | }
87 | }
88 |
89 | void VerifyGolden() {
90 | // prime number => some buffer values unused.
91 | const size_t kNumOutputs = 127;
92 | #if UPDATE_GOLDEN
93 | EngRanden engine;
94 | for (size_t i = 0; i < kNumOutputs; ++i) {
95 | printf("0x%016lx,\n", engine());
96 | }
97 | printf("\n");
98 | #else
99 | const uint64_t golden[kNumOutputs] = {
100 | 0xdda9f47cd90410ee, 0xc3c14f134e433977, 0xf0b780f545c72912,
101 | 0x887bf3087fd8ca10, 0x30ec63baff3c6d59, 0x15dbb1d37696599f,
102 | 0x02808a316f49a54c, 0xb29f73606f7f20a6, 0x9cbf605e3fd9de8a,
103 | 0x3b8feaf9d5c8e50e, 0xd8b2ffd356301ed5, 0xc970ae1a78183bbb,
104 | 0xcdfd8d76eb8f9a19, 0xf4b327fe0fc73c37, 0xd5af05dd3eff9556,
105 | 0xc3a506eb91420c9d, 0x7023920e0d6bfe8c, 0x48db1bb78f83c4a1,
106 | 0xed1ef4c26b87b840, 0x58d3575834956d42, 0x497cabf3431154fc,
107 | 0x8eef32a23e0b2df3, 0xd88b5749f090e5ea, 0x4e24370570029a8b,
108 | 0x78fcec2cbb6342f5, 0xc651a582a970692f, 0x352ee4ad1816afe3,
109 | 0x463cb745612f55db, 0x811ef0821c3de851, 0x026ff374c101da7e,
110 | 0xa0660379992d58fc, 0x6f7e616704c4fa59, 0x915f3445685da798,
111 | 0x04b0a374a3b795c7, 0x4663352533ce1882, 0x26802a8ac76571ce,
112 | 0x5588ba3a4d6e6c51, 0xb9fdefb4a24dc738, 0x607195a5e200f5fd,
113 | 0xa2101a42d35f1956, 0xe1e5e03c759c0709, 0x7e100308f3290764,
114 | 0xcbcf585399e432f1, 0x082572cc5da6606f, 0x0904469acbfee8f2,
115 | 0xe8a2be4f8335d8f1, 0x08e8a1f1a69da69a, 0xf08bd31b6daecd51,
116 | 0x2e9705bb053d6b46, 0x6542a20aad57bff5, 0x78e3a810213b6ffb,
117 | 0xda2fc9db0713c391, 0xc0932718cd55781f, 0xdc16a59cdd85f8a6,
118 | 0xb97289c1be0f2f9c, 0xb9bfb29c2b20bfe5, 0x5524bb834771435b,
119 | 0xc0a2a0e403a892d4, 0xff4af3ab8d1b78c5, 0x8265da3d39d1a750,
120 | 0x66e455f627495189, 0xf0ec5f424bcad77f, 0x3424e47dc22596e3,
121 | 0xc82d3120b57e3270, 0xc191c595afc4dcbf, 0xbc0c95129ccedcdd,
122 | 0x7f90650ea6cd6ab4, 0x120392bd2bb70939, 0xa7c8fac5a7917eb0,
123 | 0x7287491832695ad3, 0x7c1bf9839c7c1ce5, 0xd088cb9418be0361,
124 | 0x78565cdefd28c4ad, 0xe2e991fa58e1e79e, 0x2a9eac28b08c96bf,
125 | 0x7351b9fef98bafad, 0x13a685861bab87e0, 0x6c4f179696cb2225,
126 | 0x30537425cac70991, 0x64c6de5aa0501971, 0x7e05e3aa8ec720dc,
127 | 0x01590d9dc6c532b7, 0x738184388f3bc1d2, 0x74a07d9c54e3e63f,
128 | 0x6bcdf185561f255f, 0x26ffdc5067be3acb, 0x171df81934f68604,
129 | 0xa0eaf2e1cf99b1c6, 0x5d1cb02075ba1cea, 0x7ea5a21665683e5a,
130 | 0xba6364eff80de02f, 0x957f38cbd2123fdf, 0x892d8317de82f7a2,
131 | 0x606e0a0e41d452ee, 0x4eb28826766fcf5b, 0xe707b1db50f7b43e,
132 | 0x6ee217df16527d78, 0x5a362d56e80a0951, 0x443e63857d4076ca,
133 | 0xf6737962ba6b23dd, 0xd796b052151ee94d, 0x790d9a5f048adfeb,
134 | 0x8b833ff84893da5d, 0x033ed95c12b04a03, 0x9877c4225061ca76,
135 | 0x3d6724b1bb15eab9, 0x42e5352fe30ce989, 0xd68d6810adf74fb3,
136 | 0x3cdbf7e358df4b8b, 0x265b565a7431fde7, 0x52d2242f65b37f88,
137 | 0x2922a47f6d3e8779, 0x29d40f00566d5e26, 0x5d836d6e2958d6b5,
138 | 0x6c056608b7d9c1b6, 0x288db0e1124b14a0, 0x8fb946504faa6c9d,
139 | 0x0b9471bdb8f19d32, 0xfd1fe27d144a09e0, 0x8943a9464540251c,
140 | 0x8048f217633fce36, 0xea6ac458da141bda, 0x4334b8b02ff7612f,
141 | 0xfeda1384ade74d31, 0x096d119a3605c85b, 0xdbc8441f5227e216,
142 | 0x541ad7efa6ddc1d3};
143 | EngRanden engine;
144 | for (size_t i = 0; i < kNumOutputs; ++i) {
145 | ASSERT_TRUE(golden[i] == engine());
146 | }
147 | #endif
148 | }
149 |
150 | #endif // ENABLE_VERIFY
151 |
152 | void VerifyRandReqEngine() {
153 | // Validates that Randen satisfies [rand.req.engine].
154 | // Names after definition of [rand.req.engine] in C++ standard.
155 | // e is a value of E
156 | // v is a lvalue of E
157 | // x, y are possibly const values of E
158 | // s is a value of T
159 | // q is a value satisfying requirements of seed_sequence
160 | // z is a value of type unsigned long long
161 | // os is a some specialization of basic_ostream
162 | // is is a some specialization of basic_istream
163 |
164 | using E = EngRanden;
165 | using T = typename EngRanden::result_type;
166 |
167 | static_assert(std::is_copy_constructible::value,
168 | "Randen must be copy constructible");
169 |
170 | static_assert(std::is_copy_assignable::value,
171 | "Randen must be copy assignable");
172 |
173 | E e, v;
174 | const E x, y;
175 | T s = 1;
176 | std::seed_seq q{1, 2, 3};
177 | unsigned long long z = 1; // NOLINT(runtime/int)
178 | std::wostringstream os;
179 | std::wistringstream is;
180 |
181 | E{};
182 | E{x};
183 | E{s};
184 | E{q};
185 |
186 | // Verify that seed() and default-construct is identical.
187 | e.seed();
188 | {
189 | E f;
190 | ASSERT_TRUE(e == f);
191 | }
192 |
193 | // Verify the seed() result type.
194 | static_assert(std::is_same::value,
195 | "return type of seed() must be void");
196 |
197 | static_assert(std::is_same::value,
198 | "return type of seed() must be void");
199 |
200 | // verify that seed via seed_sequence and construct via seed_sequence
201 | // is identical.
202 | e.seed(q);
203 | {
204 | E f{q};
205 | ASSERT_TRUE(e == f);
206 | }
207 |
208 | // Verify the operator() result type.
209 | static_assert(std::is_same::value,
210 | "return type of operator() must be result_type");
211 |
212 | // Verify that once the state has advanced that the engines
213 | // are no longer equal.
214 | e();
215 | {
216 | E f{q};
217 | ASSERT_TRUE(e != f);
218 | }
219 |
220 | {
221 | E f;
222 | ASSERT_TRUE(e != f);
223 | }
224 |
225 | // Verify discard.
226 | e.discard(z);
227 | {
228 | // The state equivalence should change.
229 | E f, g;
230 | f.discard(2);
231 | ASSERT_TRUE(f != g);
232 |
233 | g();
234 | g();
235 | ASSERT_TRUE(f == g);
236 | }
237 |
238 | // Verify operator == result types.
239 | static_assert(std::is_same::value,
240 | "return type of operator== must be bool");
241 |
242 | static_assert(std::is_same::value,
243 | "return type of operator!= must be bool");
244 |
245 | // Verify operator<<() result.
246 | {
247 | auto& os2 = (os << e);
248 | ASSERT_TRUE(&os2 == &os);
249 | }
250 |
251 | // Verify operator>>() result.
252 | {
253 | auto& is2 = (is >> e);
254 | ASSERT_TRUE(&is2 == &is);
255 | }
256 | }
257 |
258 | void VerifyStreamOperators() {
259 | EngRanden engine1(171);
260 | EngRanden engine2;
261 |
262 | {
263 | std::stringstream stream;
264 | stream << engine1;
265 | stream >> engine2;
266 | }
267 |
268 | const int N = 56; // two buffer's worth
269 | for (int i = 0; i < N; ++i) {
270 | const uint64_t r1 = engine1();
271 | const uint64_t r2 = engine2();
272 | ASSERT_TRUE(r1 == r2);
273 | }
274 | }
275 |
// Runs all correctness checks; compiled out when ENABLE_VERIFY is 0
// (e.g. when only dumping raw output via ENABLE_DUMP).
void Verify() {
#if ENABLE_VERIFY
  VerifyReseedChangesAllValues();
  VerifyDiscard();
  VerifyGolden();
  VerifyRandReqEngine();
  VerifyStreamOperators();
#endif
}
285 |
// Writes raw engine output to /tmp/randen.bin for external statistical test
// suites. Disabled by default (ENABLE_DUMP=0): 1.5e9 values require ~12 GiB
// of memory and disk.
void DumpOutput() {
#if ENABLE_DUMP
  const size_t kNumOutputs = 1500 * 1000 * 1000;
  std::vector<uint64_t> outputs(kNumOutputs);
  EngRanden engine;
  for (size_t i = 0; i < kNumOutputs; ++i) {
    outputs[i] = engine();
  }

  FILE* f = fopen("/tmp/randen.bin", "wb");
  if (f != nullptr) {
    // fwrite takes (ptr, element size, element count, stream); the previous
    // call had size and count swapped — same byte total, but a meaningless
    // return value. Also verify that everything was actually written.
    const size_t written =
        fwrite(outputs.data(), sizeof(outputs[0]), kNumOutputs, f);
    ASSERT_TRUE(written == kNumOutputs);
    fclose(f);
  }
#endif  // ENABLE_DUMP
}
302 |
303 | void RunAll() {
304 | // Immediately output any results (for non-local runs).
305 | setvbuf(stdout, nullptr, _IONBF, 0);
306 |
307 | Verify();
308 | DumpOutput();
309 | }
310 |
311 | } // namespace
312 | } // namespace randen
313 |
314 | int main(int argc, char* argv[]) {
315 | randen::RunAll();
316 | return 0;
317 | }
318 |
--------------------------------------------------------------------------------
/third_party/pcg_random/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/third_party/pcg_random/include/pcg_extras.hpp:
--------------------------------------------------------------------------------
1 | /*
2 | * PCG Random Number Generation for C++
3 | *
4 | * Copyright 2014 Melissa O'Neill
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | *
18 | * For additional information about the PCG random number generation scheme,
19 | * including its license and other licensing options, visit
20 | *
21 | * http://www.pcg-random.org
22 | */
23 |
24 | /*
25 | * This file provides support code that is useful for random-number generation
26 | * but not specific to the PCG generation scheme, including:
27 | * - 128-bit int support for platforms where it isn't available natively
28 | * - bit twiddling operations
29 | * - I/O of 128-bit and 8-bit integers
30 | * - Handling the evilness of SeedSeq
31 | * - Support for efficiently producing random numbers less than a given
32 | * bound
33 | */
34 |
35 | #ifndef PCG_EXTRAS_HPP_INCLUDED
36 | #define PCG_EXTRAS_HPP_INCLUDED 1
37 |
#include <cassert>
#include <cinttypes>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iterator>
#include <limits>
#include <locale>
#include <type_traits>
#include <utility>

#ifdef __GNUC__
#include <cxxabi.h>
#endif
54 |
55 | /*
56 | * Abstractions for compiler-specific directives
57 | */
58 |
59 | #ifdef __GNUC__
60 | #define PCG_NOINLINE __attribute__((noinline))
61 | #define PCG_INLINE __attribute__((always_inline))
62 | #else
63 | #define PCG_NOINLINE
64 | #define PCG_INLINE
65 | #endif
66 |
67 | /*
68 | * Some members of the PCG library use 128-bit math. When compiling on 64-bit
69 | * platforms, both GCC and Clang provide 128-bit integer types that are ideal
70 | * for the job.
71 | *
72 | * On 32-bit platforms (or with other compilers), we fall back to a C++
73 | * class that provides 128-bit unsigned integers instead. It may seem
74 | * like we're reinventing the wheel here, because libraries already exist
75 | * that support large integers, but most existing libraries provide a very
76 | * generic multiprecision code, but here we're operating at a fixed size.
77 | * Also, most other libraries are fairly heavyweight. So we use a direct
78 | * implementation. Sadly, it's much slower than hand-coded assembly or
79 | * direct CPU support.
80 | *
81 | */
// Use the native __uint128_t when the compiler provides one; otherwise fall
// back to the library's emulated 128-bit type. (The uint_x4 template
// arguments were lost in an HTML-escaping pass and are restored here.)
#if __SIZEOF_INT128__
namespace pcg_extras {
    typedef __uint128_t pcg128_t;
}
#define PCG_128BIT_CONSTANT(high,low) \
        ((pcg128_t(high) << 64) + low)
#else
#include "pcg_uint128.hpp"
namespace pcg_extras {
    typedef pcg_extras::uint_x4<uint32_t,uint64_t> pcg128_t;
}
#define PCG_128BIT_CONSTANT(high,low) \
        pcg128_t(high,low)
#define PCG_EMULATED_128BIT_MATH 1
#endif
97 |
98 |
99 | // google3 crosstool consistently fails to recognize rotr / rotl methods as
100 | // hardware rotations, so force it to use inlined assembly.
101 | // TODO(ahh): switch *everything* to wg21.link/P0553 when that's an option.
102 | #define PCG_USE_INLINE_ASM 1
103 |
104 | namespace pcg_extras {
105 |
106 | /*
107 | * We often need to represent a "number of bits". When used normally, these
108 | * numbers are never greater than 128, so an unsigned char is plenty.
109 | * If you're using a nonstandard generator of a larger size, you can set
110 | * PCG_BITCOUNT_T to have it define it as a larger size. (Some compilers
111 | * might produce faster code if you set it to an unsigned int.)
112 | */
113 |
114 | #ifndef PCG_BITCOUNT_T
115 | typedef uint8_t bitcount_t;
116 | #else
117 | typedef PCG_BITCOUNT_T bitcount_t;
118 | #endif
119 |
120 | /*
121 | * C++ requires us to be able to serialize RNG state by printing or reading
122 | * it from a stream. Because we use 128-bit ints, we also need to be able
 * to print them, so here is code to do so.
124 | *
125 | * This code provides enough functionality to print 128-bit ints in decimal
126 | * and zero-padded in hex. It's not a full-featured implementation.
127 | */
128 |
129 | template
130 | std::basic_ostream&
131 | operator<<(std::basic_ostream& out, pcg128_t value)
132 | {
133 | auto desired_base = out.flags() & out.basefield;
134 | bool want_hex = desired_base == out.hex;
135 |
136 | if (want_hex) {
137 | uint64_t highpart = uint64_t(value >> 64);
138 | uint64_t lowpart = uint64_t(value);
139 | auto desired_width = out.width();
140 | if (desired_width > 16) {
141 | out.width(desired_width - 16);
142 | }
143 | if (highpart != 0 || desired_width > 16)
144 | out << highpart;
145 | CharT oldfill = '\0';
146 | if (highpart != 0) {
147 | out.width(16);
148 | oldfill = out.fill('0');
149 | }
150 | auto oldflags = out.setf(decltype(desired_base){}, out.showbase);
151 | out << lowpart;
152 | out.setf(oldflags);
153 | if (highpart != 0) {
154 | out.fill(oldfill);
155 | }
156 | return out;
157 | }
158 | constexpr size_t MAX_CHARS_128BIT = 40;
159 |
160 | char buffer[MAX_CHARS_128BIT];
161 | char* pos = buffer+sizeof(buffer);
162 | *(--pos) = '\0';
163 | constexpr auto BASE = pcg128_t(10ULL);
164 | do {
165 | auto div = value / BASE;
166 | auto mod = uint32_t(value - (div * BASE));
167 | *(--pos) = '0' + char(mod);
168 | value = div;
169 | } while(value != pcg128_t(0ULL));
170 | return out << pos;
171 | }
172 |
173 | template
174 | std::basic_istream&
175 | operator>>(std::basic_istream& in, pcg128_t& value)
176 | {
177 | typename std::basic_istream::sentry s(in);
178 |
179 | if (!s)
180 | return in;
181 |
182 | constexpr auto BASE = pcg128_t(10ULL);
183 | pcg128_t current(0ULL);
184 | bool did_nothing = true;
185 | bool overflow = false;
186 | for(;;) {
187 | CharT wide_ch = in.get();
188 | if (!in.good())
189 | break;
190 | auto ch = in.narrow(wide_ch, '\0');
191 | if (ch < '0' || ch > '9') {
192 | in.unget();
193 | break;
194 | }
195 | did_nothing = false;
196 | pcg128_t digit(uint32_t(ch - '0'));
197 | pcg128_t timesbase = current*BASE;
198 | overflow = overflow || timesbase < current;
199 | current = timesbase + digit;
200 | overflow = overflow || current < digit;
201 | }
202 |
203 | if (did_nothing || overflow) {
204 | in.setstate(std::ios::failbit);
205 | if (overflow)
206 | current = ~pcg128_t(0ULL);
207 | }
208 |
209 | value = current;
210 |
211 | return in;
212 | }
213 |
214 | /*
215 | * Likewise, if people use tiny rngs, we'll be serializing uint8_t.
216 | * If we just used the provided IO operators, they'd read/write chars,
217 | * not ints, so we need to define our own. We *can* redefine this operator
218 | * here because we're in our own namespace.
219 | */
220 |
/*
 * Serializes uint8_t as an integer, not a character, so tiny RNG state can
 * round-trip through text streams. (Stripped template header restored.)
 */
template <typename CharT, typename Traits>
std::basic_ostream<CharT, Traits>&
operator<<(std::basic_ostream<CharT, Traits>& out, uint8_t value)
{
    return out << uint32_t(value);
}
227 |
/*
 * Reads a uint8_t written as an integer. Sets failbit (and saturates to
 * 255) if the parsed value does not fit. (Stripped template header
 * restored.)
 */
template <typename CharT, typename Traits>
std::basic_istream<CharT, Traits>&
operator>>(std::basic_istream<CharT, Traits>& in, uint8_t& target)
{
    // Sentinel detects "stream failed without writing value".
    uint32_t value = 0xdecea5edU;
    in >> value;
    if (!in && value == 0xdecea5edU)
        return in;
    if (value > uint8_t(~0)) {
        in.setstate(std::ios::failbit);
        value = ~0U;
    }
    target = uint8_t(value);
    return in;
}
243 |
244 | /* Unfortunately, the above functions don't get found in preference to the
245 | * built in ones, so we create some more specific overloads that will.
246 | * Ugh.
247 | */
248 |
249 | inline std::ostream& operator<<(std::ostream& out, uint8_t value)
250 | {
251 | return pcg_extras::operator<< (out, value);
252 | }
253 |
254 | inline std::istream& operator>>(std::istream& in, uint8_t& value)
255 | {
256 | return pcg_extras::operator>> (in, value);
257 | }
258 |
259 |
260 |
261 | /*
262 | * Useful bitwise operations.
263 | */
264 |
265 | /*
 * XorShifts are invertible, but they are something of a pain to invert.
267 | * This function backs them out. It's used by the whacky "inside out"
268 | * generator defined later.
269 | */
270 |
271 | template
272 | inline itype unxorshift(itype x, bitcount_t bits, bitcount_t shift)
273 | {
274 | if (2*shift >= bits) {
275 | return x ^ (x >> shift);
276 | }
277 | itype lowmask1 = (itype(1U) << (bits - shift*2)) - 1;
278 | itype highmask1 = ~lowmask1;
279 | itype top1 = x;
280 | itype bottom1 = x & lowmask1;
281 | top1 ^= top1 >> shift;
282 | top1 &= highmask1;
283 | x = top1 | bottom1;
284 | itype lowmask2 = (itype(1U) << (bits - shift)) - 1;
285 | itype bottom2 = x & lowmask2;
286 | bottom2 = unxorshift(bottom2, bits - shift, shift);
287 | bottom2 &= lowmask1;
288 | return top1 | bottom2;
289 | }
290 |
291 | /*
292 | * Rotate left and right.
293 | *
294 | * In ideal world, compilers would spot idiomatic rotate code and convert it
295 | * to a rotate instruction. Of course, opinions vary on what the correct
296 | * idiom is and how to spot it. For clang, sometimes it generates better
297 | * (but still crappy) code if you define PCG_USE_ZEROCHECK_ROTATE_IDIOM.
298 | */
299 |
300 | template
301 | inline itype rotl(itype value, bitcount_t rot)
302 | {
303 | constexpr bitcount_t bits = sizeof(itype) * 8;
304 | constexpr bitcount_t mask = bits - 1;
305 | #if PCG_USE_ZEROCHECK_ROTATE_IDIOM
306 | return rot ? (value << rot) | (value >> (bits - rot)) : value;
307 | #else
308 | return (value << rot) | (value >> ((- rot) & mask));
309 | #endif
310 | }
311 |
312 | template
313 | inline itype rotr(itype value, bitcount_t rot)
314 | {
315 | constexpr bitcount_t bits = sizeof(itype) * 8;
316 | constexpr bitcount_t mask = bits - 1;
317 | #if PCG_USE_ZEROCHECK_ROTATE_IDIOM
318 | return rot ? (value >> rot) | (value << (bits - rot)) : value;
319 | #else
320 | return (value >> rot) | (value << ((- rot) & mask));
321 | #endif
322 | }
323 |
324 | /* Unfortunately, both Clang and GCC sometimes perform poorly when it comes
325 | * to properly recognizing idiomatic rotate code, so for we also provide
326 | * assembler directives (enabled with PCG_USE_INLINE_ASM). Boo, hiss.
327 | * (I hope that these compilers get better so that this code can die.)
328 | *
329 | * These overloads will be preferred over the general template code above.
330 | */
331 |
#if PCG_USE_INLINE_ASM && __GNUC__ && (__x86_64__ || __i386__)

// x86 / x86-64 only: emit the hardware rotate-right instruction directly,
// since compilers do not reliably pattern-match the portable rotr() above.
// The `ror` family takes its count in CL, hence the "c" constraint.
inline uint8_t rotr(uint8_t value, bitcount_t rot)
{
    asm ("rorb %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
    return value;
}

inline uint16_t rotr(uint16_t value, bitcount_t rot)
{
    asm ("rorw %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
    return value;
}

inline uint32_t rotr(uint32_t value, bitcount_t rot)
{
    asm ("rorl %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
    return value;
}

#if __x86_64__
// 64-bit rotate is only available on x86-64.
inline uint64_t rotr(uint64_t value, bitcount_t rot)
{
    asm ("rorq %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
    return value;
}
#endif // __x86_64__

#endif // PCG_USE_INLINE_ASM
361 |
362 |
363 | /*
364 | * The C++ SeedSeq concept (modelled by seed_seq) can fill an array of
365 | * 32-bit integers with seed data, but sometimes we want to produce
366 | * larger or smaller integers.
367 | *
368 | * The following code handles this annoyance.
369 | *
370 | * uneven_copy will copy an array of 32-bit ints to an array of larger or
371 |  * smaller ints (actually, the code is general, needing only forward
372 | * iterators). The copy is identical to the one that would be performed if
373 | * we just did memcpy on a standard little-endian machine, but works
374 | * regardless of the endian of the machine (or the weirdness of the ints
375 | * involved).
376 | *
377 | * generate_to initializes an array of integers using a SeedSeq
378 | * object. It is given the size as a static constant at compile time and
379 | * tries to avoid memory allocation. If we're filling in 32-bit constants
380 | * we just do it directly. If we need a separate buffer and it's small,
381 | * we allocate it on the stack. Otherwise, we fall back to heap allocation.
382 | * Ugh.
383 | *
384 | * generate_one produces a single value of some integral type using a
385 | * SeedSeq object.
386 | */
387 |
388 | /* uneven_copy helper, case where destination ints are less than 32 bit. */
389 |
/* uneven_copy helper, case where destination ints are less than 32 bit.
 * Each source element is consumed low-order chunk first, which matches a
 * memcpy on a standard little-endian machine regardless of the host's
 * actual endianness.
 */
template <typename SrcIter, typename DestIter>
SrcIter uneven_copy_impl(
    SrcIter src_first, DestIter dest_first, DestIter dest_last,
    std::true_type)
{
    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
    typedef typename std::iterator_traits<DestIter>::value_type dest_t;

    constexpr size_t SRC_SIZE  = sizeof(src_t);
    constexpr size_t DEST_SIZE = sizeof(dest_t);
    constexpr size_t DEST_BITS = DEST_SIZE * 8;
    constexpr size_t SCALE     = SRC_SIZE / DEST_SIZE;  // dest elems per src elem

    size_t count = 0;
    src_t value = 0;

    while (dest_first != dest_last) {
        if ((count++ % SCALE) == 0)
            value = *src_first++;       // Get more bits
        else
            value >>= DEST_BITS;        // Move down bits

        *dest_first++ = dest_t(value);  // Truncates, ignores high bits.
    }
    return src_first;
}
416 |
417 | /* uneven_copy helper, case where destination ints are more than 32 bit. */
418 |
/* uneven_copy helper, case where destination ints are more than 32 bit.
 * SCALE source elements are packed into each destination element,
 * low-order first (little-endian order, independent of host endianness).
 */
template <typename SrcIter, typename DestIter>
SrcIter uneven_copy_impl(
    SrcIter src_first, DestIter dest_first, DestIter dest_last,
    std::false_type)
{
    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
    typedef typename std::iterator_traits<DestIter>::value_type dest_t;

    constexpr auto SRC_SIZE  = sizeof(src_t);
    constexpr auto SRC_BITS  = SRC_SIZE * 8;
    constexpr auto DEST_SIZE = sizeof(dest_t);
    constexpr auto SCALE     = (DEST_SIZE+SRC_SIZE-1) / SRC_SIZE;

    while (dest_first != dest_last) {
        dest_t value(0UL);
        unsigned int shift = 0;

        for (size_t i = 0; i < SCALE; ++i) {
            value |= dest_t(*src_first++) << shift;
            shift += SRC_BITS;
        }

        *dest_first++ = value;
    }
    return src_first;
}
445 |
446 | /* uneven_copy, call the right code for larger vs. smaller */
447 |
/* uneven_copy: dispatch to the shrinking or widening implementation based on
 * whether the destination value type is smaller than the source's.
 */
template <typename SrcIter, typename DestIter>
inline SrcIter uneven_copy(SrcIter src_first,
                           DestIter dest_first, DestIter dest_last)
{
    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
    typedef typename std::iterator_traits<DestIter>::value_type dest_t;

    constexpr bool DEST_IS_SMALLER = sizeof(dest_t) < sizeof(src_t);

    return uneven_copy_impl(src_first, dest_first, dest_last,
                            std::integral_constant<bool, DEST_IS_SMALLER>{});
}
460 |
461 | /* generate_to, fill in a fixed-size array of integral type using a SeedSeq
462 | * (actually works for any random-access iterator)
463 | */
464 |
/* generate_to_impl, 32-bit destination case: the SeedSeq can fill the
 * destination range directly, no intermediate buffer needed.
 */
template <size_t size, typename SeedSeq, typename DestIter>
inline void generate_to_impl(SeedSeq&& generator, DestIter dest,
                             std::true_type)
{
    generator.generate(dest, dest+size);
}
471 |
/* generate_to_impl, non-32-bit destination case: generate into a 32-bit
 * buffer and repack with uneven_copy.  Small buffers live on the stack;
 * larger ones fall back to heap allocation.
 */
template <size_t size, typename SeedSeq, typename DestIter>
void generate_to_impl(SeedSeq&& generator, DestIter dest,
                      std::false_type)
{
    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
    constexpr auto DEST_SIZE = sizeof(dest_t);
    constexpr auto GEN_SIZE  = sizeof(uint32_t);

    constexpr bool GEN_IS_SMALLER = GEN_SIZE < DEST_SIZE;
    constexpr size_t FROM_ELEMS =
        GEN_IS_SMALLER
            ? size * ((DEST_SIZE+GEN_SIZE-1) / GEN_SIZE)
            : (size + (GEN_SIZE / DEST_SIZE) - 1)
                / ((GEN_SIZE / DEST_SIZE) + GEN_IS_SMALLER);
                     //  this odd code ^^^^^^^^^^^^^^^^^ is work-around for
                     //  a bug: http://llvm.org/bugs/show_bug.cgi?id=21287

    if (FROM_ELEMS <= 1024) {
        // FROM_ELEMS is a compile-time constant, so this is a plain array.
        uint32_t buffer[FROM_ELEMS];
        generator.generate(buffer, buffer+FROM_ELEMS);
        uneven_copy(buffer, dest, dest+size);
    } else {
        // NOTE(review): the malloc result is not checked; a null return
        // would crash inside generate -- confirm this is acceptable.
        uint32_t* buffer = static_cast<uint32_t*>(malloc(GEN_SIZE * FROM_ELEMS));
        generator.generate(buffer, buffer+FROM_ELEMS);
        uneven_copy(buffer, dest, dest+size);
        free(static_cast<void*>(buffer));
    }
}
500 |
501 | template
502 | inline void generate_to(SeedSeq&& generator, DestIter dest)
503 | {
504 | typedef typename std::iterator_traits::value_type dest_t;
505 | constexpr bool IS_32BIT = sizeof(dest_t) == sizeof(uint32_t);
506 |
507 | generate_to_impl(std::forward(generator), dest,
508 | std::integral_constant{});
509 | }
510 |
511 | /* generate_one, produce a value of integral type using a SeedSeq
512 | * (optionally, we can have it produce more than one and pick which one
513 | * we want)
514 | */
515 |
516 | template
517 | inline UInt generate_one(SeedSeq&& generator)
518 | {
519 | UInt result[N];
520 | generate_to(std::forward(generator), result);
521 | return result[i];
522 | }
523 |
/* bounded_rand: uniform value in [0, upper_bound) drawn from rng, using
 * rejection sampling to eliminate modulo bias.  `threshold` is the number
 * of low outputs that must be rejected so that every residue class mod
 * upper_bound is equally likely.
 */
template <typename RngType>
auto bounded_rand(RngType& rng, typename RngType::result_type upper_bound)
        -> typename RngType::result_type
{
    typedef typename RngType::result_type rtype;
    rtype threshold = (RngType::max() - RngType::min() + rtype(1) - upper_bound)
                    % upper_bound;
    for (;;) {
        rtype r = rng() - RngType::min();
        if (r >= threshold)
            return r % upper_bound;
    }
}
537 |
/* Fisher-Yates shuffle of [from, to) driven by bounded_rand, making the
 * result reproducible for a given RNG state.
 */
template <typename Iter, typename RandType>
void shuffle(Iter from, Iter to, RandType&& rng)
{
    typedef typename std::iterator_traits<Iter>::difference_type delta_t;
    typedef typename std::remove_reference<RandType>::type::result_type result_t;
    auto count = to - from;
    while (count > 1) {
        delta_t chosen = delta_t(bounded_rand(rng, result_t(count)));
        --count;
        --to;
        using std::swap;          // enable ADL swap for user types
        swap(*(from + chosen), *to);
    }
}
552 |
553 | /*
554 | * Although std::seed_seq is useful, it isn't everything. Often we want to
555 | * initialize a random-number generator some other way, such as from a random
556 | * device.
557 | *
558 | * Technically, it does not meet the requirements of a SeedSequence because
559 | * it lacks some of the rarely-used member functions (some of which would
560 | * be impossible to provide). However the C++ standard is quite specific
561 |  * that actual engines only call the generate method, so it ought not to be
562 | * a problem in practice.
563 | */
564 |
/* seed_seq_from<RngType>: adapter that lets a random-number generator
 * (e.g. a random device) be used where a seed sequence is expected.  It
 * provides generate() and size(), which is what engines actually use,
 * even though it does not satisfy every formal SeedSequence requirement.
 */
template <typename RngType>
class seed_seq_from {
private:
    RngType rng_;

    typedef uint_least32_t result_type;

public:
    // Forward any constructor arguments to the wrapped generator.
    template<typename... Args>
    seed_seq_from(Args&&... args) :
          rng_(std::forward<Args>(args)...)
    {
        // Nothing (else) to do...
    }

    // Fill [start, finish) with fresh 32-bit outputs from the generator.
    template<typename Iter>
    void generate(Iter start, Iter finish)
    {
        for (auto i = start; i != finish; ++i)
            *i = result_type(rng_());
    }

    // Number of 32-bit values available: the generator's range, clamped to
    // what size_t can represent.
    constexpr size_t size() const
    {
        return (sizeof(typename RngType::result_type) > sizeof(result_type)
                && RngType::max() > ~size_t(0UL))
            ? ~size_t(0UL)
            : size_t(RngType::max());
    }
};
595 |
596 | /*
597 | * Sometimes you might want a distinct seed based on when the program
598 | * was compiled. That way, a particular instance of the program will
599 | * behave the same way, but when recompiled it'll produce a different
600 | * value.
601 | */
602 |
/* static_arbitrary_seed<IntType>::value: a compile-time seed produced by
 * FNV-style hashing of __DATE__, __TIME__ and __FILE__ (concatenated), so
 * each rebuild yields a different — but run-to-run stable — constant.
 */
template <typename IntType>
struct static_arbitrary_seed {
private:
    // Recursive constexpr FNV-1-style hash over a NUL-terminated string.
    static constexpr IntType fnv(IntType hash, const char* pos) {
        return *pos == '\0'
             ? hash
             : fnv((hash * IntType(16777619U)) ^ *pos, (pos+1));
    }

public:
    static constexpr IntType value = fnv(IntType(2166136261U ^ sizeof(IntType)),
                        __DATE__ __TIME__ __FILE__);
};
616 |
617 | // Sometimes, when debugging or testing, it's handy to be able to print the
618 | // name of a type (in human-readable form). This code allows the idiom:
619 | //
620 | //     cout << printable_typename<my_foo_type_t>()
621 | //
622 | // to print out my_foo_type_t (or its concrete type if it is a synonym)
623 |
// Streamable tag type: `out << printable_typename<T>{}` prints T's name,
// demangled when the GNU ABI makes that possible.
template <typename T>
struct printable_typename {};

template <typename T>
std::ostream& operator<<(std::ostream& out, printable_typename<T>) {
    const char *implementation_typename = typeid(T).name();
#ifdef __GNUC__
    // Try to demangle; on success print the human-readable name instead.
    int status;
    char* pretty_name =
        abi::__cxa_demangle(implementation_typename, NULL, NULL, &status);
    if (status == 0)
        out << pretty_name;
    free(static_cast<void*>(pretty_name));   // free even on failure (NULL-safe)
    if (status == 0)
        return out;
#endif
    // Fallback: the raw (possibly mangled) implementation name.
    out << implementation_typename;
    return out;
}
643 |
644 | } // namespace pcg_extras
645 |
646 | #endif // PCG_EXTRAS_HPP_INCLUDED
647 |
--------------------------------------------------------------------------------
/util.h:
--------------------------------------------------------------------------------
1 | // Copyright 2018 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef UTIL_H_
16 | #define UTIL_H_
17 |
#include <stdint.h>  // uint64_t
#include <stdio.h>   // printf
#include <stdlib.h>  // abort

#ifdef _MSC_VER
#include <intrin.h>  // __lzcnt64
#endif

// Minimal assert-like check that stays active in release builds: prints the
// failing line number and aborts the process.
#define RANDEN_CHECK(condition)                          \
  do {                                                   \
    if (!(condition)) {                                  \
      printf("Assertion failed on line %d\n", __LINE__); \
      abort();                                           \
    }                                                    \
  } while (false)
33 |
34 | namespace randen {
35 |
36 | // "x" != 0.
37 | static inline int NumZeroBitsAboveMSBNonzero(const uint64_t x) {
38 | #ifdef _MSC_VER
39 | return static_cast(__lzcnt64(x)); // WARNING: requires BMI2
40 | #else
41 | return __builtin_clzll(x);
42 | #endif
43 | }
44 |
45 | } // namespace randen
46 |
47 | #endif // UTIL_H_
48 |
--------------------------------------------------------------------------------
/vector128.h:
--------------------------------------------------------------------------------
1 | // Copyright 2017 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // Wrappers for platform-specific 128-bit vectors.
16 | #ifndef VECTOR128_H_
17 | #define VECTOR128_H_
18 |
19 | #include // uint64_t
20 |
21 | #if defined(__SSE2__) && defined(__AES__)
22 |
23 | #define RANDEN_AESNI 1
24 | #include
25 |
26 | #elif defined(__powerpc__) && defined(__VSX__)
27 |
28 | #define RANDEN_PPC 1
29 | #define RANDEN_BIG_ENDIAN 1
30 | #include
31 |
32 | #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_CRYPTO)
33 |
34 | #define RANDEN_ARM 1
35 | #include
36 |
37 | #else
38 | #error "Port"
39 | #endif
40 |
41 | #if defined(__clang__) || defined(__GNUC__)
42 | #define RANDEN_INLINE inline __attribute__((always_inline))
43 | #define RANDEN_RESTRICT __restrict__
44 | #else
45 | #define RANDEN_INLINE
46 | #define RANDEN_RESTRICT
47 | #endif
48 |
49 | namespace randen {
50 |
51 | #ifdef RANDEN_AESNI
52 |
53 | class V {
54 | public:
55 | RANDEN_INLINE V() {} // Leaves v_ uninitialized.
56 | RANDEN_INLINE V& operator=(const V other) {
57 | raw_ = other.raw_;
58 | return *this;
59 | }
60 |
61 | // Convert from/to intrinsics.
62 | RANDEN_INLINE explicit V(const __m128i raw) : raw_(raw) {}
63 | __m128i raw() const { return raw_; }
64 |
65 | RANDEN_INLINE V& operator^=(const V other) {
66 | raw_ = _mm_xor_si128(raw_, other.raw_);
67 | return *this;
68 | }
69 |
70 | private:
71 | // Note: this wrapper is faster than using __m128i directly.
72 | __m128i raw_;
73 | };
74 |
75 | #elif defined(RANDEN_PPC)
76 |
77 | // Already provides operator^=.
78 | using V = vector unsigned long long;
79 |
80 | #elif defined(RANDEN_ARM)
81 |
82 | // Already provides operator^=.
83 | using V = uint8x16_t;
84 |
85 | #else
86 | #error "Port"
87 | #endif
88 |
89 | constexpr int kLanes = sizeof(V) / sizeof(uint64_t);
90 |
91 | // On big-endian platforms, byte-swap constants (e.g. round keys) to ensure
92 | // results match little-endian platforms.
93 | #ifdef RANDEN_BIG_ENDIAN
94 | #define RANDEN_LE(a, b) __builtin_bswap64(b), __builtin_bswap64(a)
95 | #else
96 | #define RANDEN_LE(a, b) a, b
97 | #endif
98 |
99 | #ifdef RANDEN_BIG_ENDIAN
100 | static RANDEN_INLINE V ReverseBytes(const V v) {
101 | // Reverses the bytes of the vector.
102 | const vector unsigned char perm = {15, 14, 13, 12, 11, 10, 9, 8,
103 | 7, 6, 5, 4, 3, 2, 1, 0};
104 | return vec_perm(v, v, perm);
105 | }
106 | #endif
107 |
108 | // WARNING: these load/store in native byte order. It is OK to load and then
109 | // store an unchanged vector, but interpreting the bits as a number or input
110 | // to AES will have platform-dependent results. Call ReverseBytes after load
111 | // and/or before store #ifdef RANDEN_BIG_ENDIAN.
112 |
113 | static RANDEN_INLINE V Load(const uint64_t* RANDEN_RESTRICT lanes,
114 | const int block) {
115 | #ifdef RANDEN_AESNI
116 | const uint64_t* RANDEN_RESTRICT from = lanes + block * kLanes;
117 | return V(_mm_load_si128(reinterpret_cast(from)));
118 | #elif defined(RANDEN_PPC)
119 | const V* RANDEN_RESTRICT from =
120 | reinterpret_cast(lanes + block * kLanes);
121 | return vec_vsx_ld(0, from);
122 | #elif defined(RANDEN_ARM)
123 | const uint8_t* RANDEN_RESTRICT from =
124 | reinterpret_cast(lanes + block * kLanes);
125 | return vld1q_u8(from);
126 | #else
127 | #error "Port"
128 | #endif
129 | }
130 |
131 | static RANDEN_INLINE void Store(const V v, uint64_t* RANDEN_RESTRICT lanes,
132 | const int block) {
133 | #ifdef RANDEN_AESNI
134 | uint64_t* RANDEN_RESTRICT to = lanes + block * kLanes;
135 | _mm_store_si128(reinterpret_cast<__m128i * RANDEN_RESTRICT>(to), v.raw());
136 | #elif defined(RANDEN_PPC)
137 | V* RANDEN_RESTRICT to = reinterpret_cast(lanes + block * kLanes);
138 | vec_vsx_st(v, 0, to);
139 | #elif defined(RANDEN_ARM)
140 | uint8_t* RANDEN_RESTRICT to =
141 | reinterpret_cast(lanes + block * kLanes);
142 | vst1q_u8(to, v);
143 | #else
144 | #error "Port"
145 | #endif
146 | }
147 |
148 | // One round of AES. "round_key" is a public constant for breaking the
149 | // symmetry of AES (ensures previously equal columns differ afterwards).
150 | static RANDEN_INLINE V AES(const V state, const V round_key) {
151 | #ifdef RANDEN_AESNI
152 | // It is important to always use the full round function - omitting the
153 | // final MixColumns reduces security [https://eprint.iacr.org/2010/041.pdf]
154 | // and does not help because we never decrypt.
155 | return V(_mm_aesenc_si128(state.raw(), round_key.raw()));
156 | #elif defined(RANDEN_PPC)
157 | return V(__builtin_crypto_vcipher(state, round_key));
158 | #elif defined(RANDEN_ARM)
159 | return vaesmcq_u8(vaeseq_u8(state, round_key));
160 | #else
161 | #error "Port"
162 | #endif
163 | }
164 |
165 | } // namespace randen
166 |
167 | #endif // VECTOR128_H_
168 |
--------------------------------------------------------------------------------
/vector128_test.cc:
--------------------------------------------------------------------------------
1 | // Copyright 2017 Google Inc. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "vector128.h"
16 |
17 | #include
18 | #include
19 |
20 | namespace randen {
21 | namespace {
22 |
// Test-local check: prints the failing line number and aborts.  Written as a
// while-loop so the macro behaves like a single statement; the body runs
// zero times when the condition holds.
#define ASSERT_TRUE(condition)                     \
  while (!(condition)) {                           \
    printf("Check failed at line %d\n", __LINE__); \
    abort();                                       \
  }
28 |
29 | void TestLoadStore() {
30 | const int N = 4;
31 | alignas(16) uint64_t test_cases[N * 2] = {
32 | 1, 2, 3, 4, 0x1234567890ABCDEFuLL, 0x2143658709BADCFEuLL};
33 |
34 | alignas(16) uint64_t stored[N * 2];
35 | for (int i = 0; i < N; ++i) {
36 | V v = Load(test_cases, i);
37 | Store(v, stored, i);
38 |
39 | ASSERT_TRUE(test_cases[2 * i + 0] == stored[2 * i + 0]);
40 | ASSERT_TRUE(test_cases[2 * i + 1] == stored[2 * i + 1]);
41 | }
42 | }
43 |
44 | void TestXor() {
45 | alignas(16) uint64_t test_cases[][3][2] = {
46 | {{1, 2}, {3, 4}, {2, 6}},
47 | {{0x1234567890ABCDEFuLL, 0x2143658709BADCFEuLL},
48 | {0x2143658709BADCFEuLL, 0x1234567890ABCDEFuLL},
49 | {0x337733ff99111111uLL, 0x337733ff99111111uLL}}};
50 |
51 | for (const auto& test_case : test_cases) {
52 | V v1 = Load(test_case[0], 0);
53 | V v2 = Load(test_case[1], 0);
54 |
55 | v1 ^= v2;
56 | alignas(16) uint64_t data_stored[2];
57 | Store(v1, data_stored, 0);
58 |
59 | ASSERT_TRUE(test_case[2][0] == data_stored[0]);
60 | ASSERT_TRUE(test_case[2][1] == data_stored[1]);
61 | }
62 | }
63 |
64 | void TestAes() {
65 | // This test also catches byte-order bugs in Load/Store functions
66 | alignas(16) uint64_t message[2] = {
67 | RANDEN_LE(0x8899AABBCCDDEEFFuLL, 0x0123456789ABCDEFuLL)};
68 | alignas(16) uint64_t key[2] = {
69 | RANDEN_LE(0x0022446688AACCEEuLL, 0x1133557799BBDDFFuLL)};
70 | alignas(16) uint64_t expected_result[2] = {
71 | RANDEN_LE(0x28E4EE1884504333uLL, 0x16AB0E57DFC442EDuLL)};
72 |
73 | V v_message = Load(message, 0);
74 | V v_key = Load(key, 0);
75 | V v_result = AES(v_message, v_key);
76 |
77 | alignas(16) uint64_t result[2];
78 | Store(v_result, result, 0);
79 |
80 | ASSERT_TRUE(expected_result[0] == result[0]);
81 | ASSERT_TRUE(expected_result[1] == result[1]);
82 | }
83 |
84 | void RunAll() {
85 | // Immediately output any results (for non-local runs).
86 | setvbuf(stdout, nullptr, _IONBF, 0);
87 |
88 | TestLoadStore();
89 | TestXor();
90 | TestAes();
91 | }
92 |
93 | } // namespace
94 | } // namespace randen
95 |
96 | int main(int argc, char* argv[]) {
97 | randen::RunAll();
98 | return 0;
99 | }
100 |
--------------------------------------------------------------------------------