├── .travis.yml ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── contrib └── rust │ ├── Cargo.toml │ └── src │ └── lib.rs ├── engine_chacha.h ├── engine_os.h ├── nanobenchmark.cc ├── nanobenchmark.h ├── nanobenchmark_test.cc ├── randen.cc ├── randen.h ├── randen_benchmark.cc ├── randen_test.cc ├── third_party └── pcg_random │ ├── LICENSE │ └── include │ ├── pcg_extras.hpp │ └── pcg_random.hpp ├── util.h ├── vector128.h └── vector128_test.cc /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | 3 | dist: trusty 4 | 5 | compiler: 6 | - clang 7 | - gcc 8 | 9 | script: 10 | - make 11 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution, 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 
24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | override CPPFLAGS += -I. -I../ 2 | override CXXFLAGS += -std=c++11 -Wall -O3 -fno-pic -mavx2 -maes 3 | override LDFLAGS += $(CXXFLAGS) 4 | override CXX = clang++ 5 | 6 | all: $(addprefix bin/, nanobenchmark_test randen_test randen_benchmark vector128_test) 7 | 8 | obj/%.o: %.cc 9 | @mkdir -p -- $(dir $@) 10 | $(CXX) -c $(CPPFLAGS) $(CXXFLAGS) $< -o $@ 11 | 12 | bin/%: obj/%.o obj/nanobenchmark.o obj/randen.o 13 | @mkdir -p bin 14 | $(CXX) $(LDFLAGS) $^ -o $@ 15 | 16 | .DELETE_ON_ERROR: 17 | deps.mk: $(wildcard *.cc) $(wildcard *.h) Makefile 18 | set -eu; for file in *.cc; do \ 19 | target=obj/$${file##*/}; target=$${target%.*}.o; \ 20 | $(CXX) -c $(CPPFLAGS) $(CXXFLAGS) -MM -MT \ 21 | "$$target" "$$file"; \ 22 | done >$@ 23 | -include deps.mk 24 | 25 | clean: 26 | [ ! -d obj ] || $(RM) -r -- obj/ 27 | [ ! -d bin ] || $(RM) -r -- bin/ 28 | [ ! 
-d lib ] || $(RM) -r -- lib/ 29 | 30 | .PHONY: clean all 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | What if we could default to attack-resistant random generators without excessive 4 | CPU cost? We introduce 'Randen', a new generator with security guarantees; it 5 | outperforms MT19937, pcg64_c32, Philox, ISAAC and ChaCha8 in real-world 6 | benchmarks. This is made possible by AES hardware acceleration and a large 7 | Feistel permutation. 8 | 9 | ## Related work 10 | 11 | AES-CTR (encrypting a counter) is a well-known and easy to implement generator. 12 | It has two known weaknesses: 13 | 14 | - A known-key distinguisher on 10-round, 128-bit AES [https://goo.gl/3xReB9]. 15 | 16 | - No forward security/backtracking resistance: compromising the current state 17 | lets attackers distinguish prior outputs from random. 18 | 19 | NIST 800-90a r1 [https://goo.gl/68Fwmv] is a standardized generator that ensures 20 | backtracking resistance, but is not fast enough for a general-purpose generator 21 | (5-10x slower than AES). 22 | 23 | ## Algorithm 24 | 25 | The Randen generator is based upon three existing components: 26 | 27 | 1) Reverie [https://eprint.iacr.org/2016/886.pdf] is a sponge-like generator 28 | that requires a cryptographic permutation. It improves upon "Provably Robust 29 | Sponge-Based PRNGs and KDFs" by achieving backtracking resistance with only 30 | a single permutation per buffer. 31 | 32 | 2) Simpira v2 [https://eprint.iacr.org/2016/122.pdf] constructs up to 1024-bit 33 | permutations using an improved Generalized Feistel network with 2-round 34 | AES-128 functions. This Feistel block shuffle achieves diffusion sooner and 35 | is less vulnerable to sliced-biclique attacks than a Type-2 cyclic shuffle. 
36 | 37 | 3) "New criterion for diffusion property" [https://goo.gl/mLXH4f] shows that 38 | the same kind of improved Feistel block shuffle can be extended to 16 39 | branches, which enables a more efficient 2048-bit permutation. 40 | 41 | We combine these by plugging the larger Simpira-like permutation into Reverie. 42 | 43 | ## Performance 44 | 45 | The implementation targets x86 (Westmere), POWER 8 and ARM64. 46 | 47 | x86 microbenchmark: generating random bits in a tight loop 48 | (cpb=cycles per byte, MAD=median absolute deviation): 49 | 50 | RNG | cpb | MAD 51 | --- | --- | --- 52 | Randen | 1.54 | 0.002 53 | pcg64_c32 | 0.78 | 0.003 54 | mt19937_64 | 1.79 | 0.001 55 | ChaCha8 | 3.02 | 0.003 56 | ISAAC | 4.08 | 0.006 57 | Philox | 4.70 | 0.003 58 | /dev/urandom (ChaCha20) | 15.27 | 0.018 59 | BCryptGenRandom (CTR-DRBG) | 16.80 | 0.009 60 | 61 | x86 real-world benchmark (reservoir sampling): 62 | 63 | RNG | cpb | MAD 64 | --- | --- | --- 65 | Randen | 2.60 | 0.008 66 | pcg64_c32 | 3.03 | 0.009 67 | mt19937_64| 2.82 | 0.009 68 | ChaCha8 | 3.75 | 0.008 69 | ISAAC | 4.46 | 0.014 70 | Philox | 4.95 | 0.009 71 | /dev/urandom (ChaCha20) | 13.46 | 0.017 72 | BCryptGenRandom (CTR-DRBG) | 16.41 | 0.015 73 | 74 | ## Security 75 | 76 | Randen is indistinguishable from random and backtracking-resistant. For more 77 | details and benchmarks, please see ["Randen - fast backtracking-resistant random 78 | generator with AES+Feistel+Reverie"](https://arxiv.org/abs/1810.02227). 79 | 80 | ## Usage 81 | 82 | `make && bin/randen_benchmark` 83 | 84 | Note that the code relies on compiler optimizations. Cycles per byte may 85 | increase by factors of 1.6 when compiled with GCC 7.3, and 1.3 with 86 | Clang 4.0.1. This can be mitigated by manually unrolling the loops. 87 | 88 | ## Third-party implementations / bindings 89 | 90 | Thanks to Frank Denis for making us aware of these third-party implementations 91 | or bindings. 
Note that the algorithm is still under review and subject to 92 | change, but please feel free to get in touch or raise an issue and we'll 93 | add yours as well. 94 | 95 | By | Language | URL 96 | --- | --- | --- 97 | Frank Denis | C | https://github.com/jedisct1/randen-rng 98 | 99 | 100 | This is not an official Google product. 101 | -------------------------------------------------------------------------------- /contrib/rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "randen" 3 | version = "0.0.0" 4 | authors = ["Ruud van Asseldonk ", "Jan Wassenberg ", "Brendan Hickey "] 5 | license = "Apache-2.0" 6 | description = "Randen is a fast, backtracking resistant CSPRNG." 7 | repository = "https://github.com/google/randen" 8 | keywords = [ "Crypto", "rng", "random" ] 9 | 10 | [dependencies.rand] 11 | version = "0.5" 12 | features = ["i128_support"] 13 | -------------------------------------------------------------------------------- /contrib/rust/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! The Randen pseudorandom number generator. 2 | 3 | extern crate rand; 4 | 5 | use std::mem; 6 | use std::ops::BitXorAssign; 7 | 8 | use rand::{Error, FromEntropy, RngCore, SeedableRng}; 9 | use std::arch::x86_64::{__m128i, _mm_aesenc_si128}; 10 | 11 | /// Size of the entire sponge / state for the Randen PRNG. 12 | const STATE_LEN: usize = 16; // 256 bytes, 16x16 bytes. 13 | 14 | /// Size of the "inner" (inaccessible) part of the sponge. 15 | /// 16 | /// Larger values would require more frequent calls to `randen_generate`. 17 | const CAPACITY: usize = 1; // 1x16 bytes. 18 | 19 | /// Size of the default seed consumed by the sponge. 
20 | const SEED_LEN: usize = STATE_LEN - CAPACITY; 21 | const SEED_BYTES: usize = SEED_LEN * 16; 22 | 23 | const STATE_BYTES: usize = STATE_LEN * 16; 24 | const CAPACITY_BYTES: usize = CAPACITY * 16; 25 | 26 | const FEISTEL_ROUNDS: usize = 17; 27 | const FEISTEL_FUNCTIONS: usize = 8; 28 | const ROUND_KEYS_LEN: usize = FEISTEL_ROUNDS * FEISTEL_FUNCTIONS; 29 | 30 | /// Aligned 128 bits wrapper. 31 | #[derive(Copy, Clone, Debug, Eq, PartialEq)] 32 | #[repr(align(16))] 33 | pub struct U128A(u128); 34 | 35 | impl U128A { 36 | #[inline(always)] 37 | fn from(m128i: __m128i) -> U128A { 38 | unsafe { mem::transmute(m128i) } 39 | } 40 | 41 | #[inline(always)] 42 | fn m128i(self) -> __m128i { 43 | unsafe { mem::transmute(self) } 44 | } 45 | } 46 | 47 | impl BitXorAssign for U128A { 48 | fn bitxor_assign(&mut self, rhs: U128A) { 49 | self.0 ^= rhs.0; 50 | } 51 | } 52 | 53 | // "Nothing up my sleeve" numbers from the first hex digits of pi. 54 | // 55 | // Obtained from http://hexpi.sourceforge.net/. 
The array was generated by the 56 | // following Python script: 57 | /* 58 | python3 << EOF 59 | """Generates Randen round keys array from pi-hex.62500.txt file.""" 60 | KEYS = 136 61 | 62 | def chunks(l, n): 63 | """Yield successive n-sized chunks from l.""" 64 | for i in range(0, len(l), n): 65 | yield l[i:i + n] 66 | 67 | with open("pi-hex.62500.txt") as file: 68 | for key in chunks(file.read(KEYS * 32), 32): 69 | print(' U128A(0x{}),'.format(key[16:], key[:16])) 70 | EOF 71 | */ 72 | const ROUND_KEYS: [U128A; ROUND_KEYS_LEN] = [ 73 | U128A(0x13198A2E03707344243F6A8885A308D3), 74 | U128A(0x082EFA98EC4E6C89A4093822299F31D0), 75 | U128A(0xBE5466CF34E90C6C452821E638D01377), 76 | U128A(0x3F84D5B5B5470917C0AC29B7C97C50DD), 77 | U128A(0xD1310BA698DFB5AC9216D5D98979FB1B), 78 | U128A(0xB8E1AFED6A267E962FFD72DBD01ADFB7), 79 | U128A(0x24A19947B3916CF7BA7C9045F12C7F99), 80 | U128A(0x636920D871574E690801F2E2858EFC16), 81 | U128A(0x0D95748F728EB658A458FEA3F4933D7E), 82 | U128A(0x7B54A41DC25A59B5718BCD5882154AEE), 83 | U128A(0xC5D1B023286085F09C30D5392AF26013), 84 | U128A(0x8E79DCB0603A180ECA417918B8DB38EF), 85 | U128A(0xD71577C1BD314B276C9E0E8BB01E8A3E), 86 | U128A(0xE65525F3AA55AB9478AF2FDA55605C60), 87 | U128A(0x55CA396A2AAB10B65748986263E81440), 88 | U128A(0xA15486AF7C72E993B4CC5C341141E8CE), 89 | U128A(0x2BA9C55D741831F6B3EE1411636FBC2A), 90 | U128A(0xAFD6BA336C24CF5CCE5C3E169B87931E), 91 | U128A(0x3B8F48986B4BB9AF7A32538128958677), 92 | U128A(0x61D809CCFB21A991C4BFE81B66282193), 93 | U128A(0xEF845D5DE98575B1487CAC605DEC8032), 94 | U128A(0x23893E81D396ACC5DC262302EB651B88), 95 | U128A(0x2E0B4482A48420040F6D6FF383F44239), 96 | U128A(0x21C66842F6E96C9A69C8F04A9E1F9B5E), 97 | U128A(0x6A51A0D2D8542F68670C9C61ABD388F0), 98 | U128A(0x6EEF0B6C137A3BE4960FA728AB5133A3), 99 | U128A(0xA1F1651D39AF0176BA3BF0507EFB2A98), 100 | U128A(0x8CEE8619456F9FB466CA593E82430E88), 101 | U128A(0xE06F75D885C120737D84A5C33B8B5EBE), 102 | U128A(0x4ED3AA62363F7706401A449F56C16AA6), 103 | 
U128A(0x37D0D724D00A12481BFEDF72429B023D), 104 | U128A(0x075372C980991B7BDB0FEAD349F1C09B), 105 | U128A(0xE3FE501AB6794C3B25D479D8F6E8DEF7), 106 | U128A(0xC1A94FB6409F60C4976CE0BD04C006BA), 107 | U128A(0x68FB6FAF3E6C53B55E5C9EC2196A2463), 108 | U128A(0x6DFC511F9B30952C1339B2EB3B52EC6F), 109 | U128A(0xBEE3D004DE334AFDCC814544AF5EBD09), 110 | U128A(0xC0CBA85745C8740F660F2807192E4BB3), 111 | U128A(0x5579C0BD1A60320AD20B5F39B9D3FBDB), 112 | U128A(0x679F25FEFB1FA3CCD6A100C6402C7279), 113 | U128A(0x3C7516DFFD616B158EA5E9F8DB3222F8), 114 | U128A(0x323DB5FAFD2387602F501EC8AD0552AB), 115 | U128A(0x9E5C57BBCA6F8CA053317B483E00DF82), 116 | U128A(0xD542A8F6287EFFC31A87562EDF1769DB), 117 | U128A(0x695B27B0BBCA58C8AC6732C68C4F5573), 118 | U128A(0x10FA3D98FD2183B8E1FFA35DB8F011A0), 119 | U128A(0x9A53E479B6F845654AFCB56C2DD1D35B), 120 | U128A(0xE1DDF2DAA4CB7E33D28E49BC4BFB9790), 121 | U128A(0xEF20CADA36774C0162FB1341CEE4C6E8), 122 | U128A(0x95DBDA4DAE909198D07E9EFE2BF11FB4), 123 | U128A(0xD08ED1D0AFC725E0EAAD8E716B93D5A0), 124 | U128A(0x8FF6E2FBF2122B648E3C5B2F8E7594B7), 125 | U128A(0x4FAD5EA0688FC31C8888B812900DF01C), 126 | U128A(0x2F2F2218BE0E1777D1CFF191B3A8C1AD), 127 | U128A(0xE5A0CC0FB56F74E8EA752DFE8B021FA1), 128 | U128A(0xB4A84FE0FD13E0B718ACF3D6CE89E299), 129 | U128A(0x165FA266809577057CC43B81D2ADA8D9), 130 | U128A(0xE6AD206577B5FA8693CC7314211A1477), 131 | U128A(0xEBCDAF0C7B3E89A0C75442F5FB9D35CF), 132 | U128A(0x00250E2D2071B35ED6411BD3AE1E7E49), 133 | U128A(0x2464369BF009B91E226800BB57B8E0AF), 134 | U128A(0x78C14389D95A537F5563911D59DFA6AA), 135 | U128A(0x832603766295CFA9207D5BA202E5B9C5), 136 | U128A(0xB3472DCA7B14A94A11C819684E734A41), 137 | U128A(0xD60F573FBC9BC6E41B5100529A532915), 138 | U128A(0x08BA6FB5571BE91F2B60A47681E67400), 139 | U128A(0xB6636521E7B9F9B6F296EC6B2A0DD915), 140 | U128A(0x53B02D5DA99F8FA1FF34052EC5855664), 141 | U128A(0x4B7A70E9B5B3294408BA47996E85076A), 142 | U128A(0xAD6EA6B049A7DF7DDB75092EC4192623), 143 | 
U128A(0xECAA8C71699A18FF9CEE60B88FEDB266), 144 | U128A(0x193602A575094C295664526CC2B19EE1), 145 | U128A(0x3F54989A5B429D65A0591340E4183A3E), 146 | U128A(0xA1D29C07EFE830F56B8FE4D699F73FD6), 147 | U128A(0x4CDD20868470EB264D2D38E6F0255DC1), 148 | U128A(0x09686B3F3EBAEFC96382E9C6021ECC5E), 149 | U128A(0x687F358452A0E2863C9718146B6A70A1), 150 | U128A(0x3E07841C7FDEAE5CB79C5305AA500737), 151 | U128A(0xB03ADA37F0500C0D8E7D44EC5716F2B8), 152 | U128A(0xAE0CF51A3CB574B2F01C1F040200B3FF), 153 | U128A(0xD19113F97CA92FF625837A58DC0921BD), 154 | U128A(0x3AE5E58137C2DADC9432477322F54701), 155 | U128A(0xA94461460FD0030EC8B576349AF3DDA7), 156 | U128A(0xE238CD993BEA0E2FECC8C73EA4751E41), 157 | U128A(0x4E548B384F6DB9083280BBA1183EB331), 158 | U128A(0x2CB8129024977C796F420D03F60A04BF), 159 | U128A(0xDE9A771FD99308105679B072BCAF89AF), 160 | U128A(0x5512721F2E6B7124B38BAE12DCCF3F2E), 161 | U128A(0x7A5847187408DA17501ADDE69F84CD87), 162 | U128A(0xEC7AEC3ADB851DFABC9F9ABCE94B7D8C), 163 | U128A(0xEF1C18473215D80863094366C464C3D2), 164 | U128A(0x12A14D432A65C451DD433B3724C2BA16), 165 | U128A(0x71DFF89E10314E5550940002133AE4DD), 166 | U128A(0x043556F1D7A3C76B81AC77D65F11199B), 167 | U128A(0xF28FE6ED97F1FBFA3C11183B5924A509), 168 | U128A(0x86E34570EAE96FB19EBABF2C1E153C6E), 169 | U128A(0x771FE71C4E3D06FA860E5E0A5A3E2AB3), 170 | U128A(0x803E89D65266C8252965DCB999E71D0F), 171 | U128A(0xC6150EBA94E2EA782E4CC9789C10B36A), 172 | U128A(0xF2F74EA7361D2B3DA6FC3C531E0A2DF4), 173 | U128A(0x5223A708F71312B61939260F19C27960), 174 | U128A(0xE3BC4595A67BC883EBADFE6EEAC31F66), 175 | U128A(0xC332DDEFBE6C5AA5B17F37D1018CFF28), 176 | U128A(0xEECEA50FDB2F953B6558218568AB9702), 177 | U128A(0x1521B628290761702AEF7DAD5B6E2F84), 178 | U128A(0x13CCA830EB61BD96ECDD4775619F1510), 179 | U128A(0xB5735C904C70A2390334FE1EAA0363CF), 180 | U128A(0xEECC86BC60622CA7D59E9E0BCBAADE14), 181 | U128A(0x648B1EAF19BDF0CA9CAB5CABB2F3846E), 182 | U128A(0x40685A323C2AB4B3A02369B9655ABB50), 183 | 
U128A(0x9B540B19875FA099319EE9D5C021B8F7), 184 | U128A(0xF837889A97E32D7795F7997E623D7DA8), 185 | U128A(0x0E358829C7E61FD611ED935F16681281), 186 | U128A(0x57F584A51B22726396DEDFA17858BA99), 187 | U128A(0xCDB30AEB532E30549B83C3FF1AC24696), 188 | U128A(0x58EBF2EF34C6FFEA8FD948E46DBC3128), 189 | U128A(0x5D4A14D9E864B7E3FE28ED61EE7C3C73), 190 | U128A(0x45EEE2B6A3AAABEA42105D14203E13E0), 191 | U128A(0xC742F442EF6ABBB5DB6C4F15FACB4FD0), 192 | U128A(0xD81E799E86854DC7654F3B1D41CD2105), 193 | U128A(0xCF62A1F25B8D2646E44B476A3D816250), 194 | U128A(0x7F1524C369CB7492FC8883A0C1C7B6A3), 195 | U128A(0x095BBF00AD19489D47848A0B5692B285), 196 | U128A(0x58428D2A0C55F5EA1462B17423820D00), 197 | U128A(0x3372F0928D937E411DADF43E233F7061), 198 | U128A(0x7CDE3759CBEE7460D65FECF16C223BDB), 199 | U128A(0xA607808419F8509E4085F2A7CE77326E), 200 | U128A(0xA969A7AAC50C06C2E8EFD85561D99735), 201 | U128A(0x9E447A2EC34534845A04ABFC800BCADC), 202 | U128A(0xDB73DBD3105588CDFDD567050E1E9EC9), 203 | U128A(0xC5C43465713E38D8675FDA79E3674340), 204 | U128A(0x153E21E78FB03D4A3D28F89EF16DFF20), 205 | U128A(0xE93D5A68948140F7E6E39F2BDB83ADF7), 206 | U128A(0x411520F77602D4F7F64C261C94692934), 207 | U128A(0xD40824713320F46ABCF46B2ED4A10068), 208 | U128A(0x1E39F62E9724454643B7D4B7500061AF), 209 | ]; 210 | 211 | pub type State = [U128A; STATE_LEN]; 212 | 213 | #[inline(always)] 214 | fn aes_round(state: U128A, round_key: U128A) -> U128A { 215 | unsafe { U128A::from(_mm_aesenc_si128(state.m128i(), round_key.m128i())) } 216 | } 217 | 218 | /// Improved odd-even shuffle from "New criterion for diffusion property". 219 | #[inline(always)] 220 | fn block_shuffle(source: State) -> State { 221 | let shuffle = [7, 2, 13, 4, 11, 8, 3, 6, 15, 0, 9, 10, 1, 14, 5, 12]; 222 | // TODO: Check if the zeros get generated; if so, use mem::uninitialized. 
223 | let mut new_state = [U128A(0); STATE_LEN]; 224 | for (i, shuf) in shuffle.iter().enumerate() { 225 | new_state[i] = source[*shuf]; 226 | } 227 | new_state 228 | } 229 | 230 | /// Cryptographic permutation based on type-2 Generalized Feistel Network. 231 | /// 232 | /// An adversary who can query a permutation for a chosen ciphertext cannot 233 | /// distinguish the permutation from a truly random permutation in less than 234 | /// 2^64 queries, if the round function is a pseudorandom function. This is 235 | /// similar to the b=8 case of Simpira v2, but more efficient than Simpira's 236 | /// generic construction from b=16. 237 | #[inline(always)] 238 | fn permute(state: &mut State) { 239 | let mut keys = ROUND_KEYS.iter(); 240 | for _ in 0..FEISTEL_ROUNDS { 241 | for branch in 0..FEISTEL_FUNCTIONS { 242 | let even = state[branch * 2]; 243 | let odd = state[branch * 2 + 1]; 244 | // Feistel round function using two AES subrounds. Very similar to 245 | // F() from Simpira v2, but with independent subround keys. Uses 17 246 | // AES rounds per 16 bytes (vs. 10 for AES-CTR). Computing eight 247 | // round functions in parallel hides the 7-cycle AESNI latency on 248 | // HSW. Note that the Feistel XORs are 'free' (included in the 249 | // second AES instruction). 250 | let f1 = aes_round(even, *keys.next().unwrap()); 251 | let f2 = aes_round(f1, odd); 252 | state[branch * 2 + 1] = f2; 253 | } 254 | *state = block_shuffle(*state); 255 | } 256 | } 257 | 258 | /// Generate updates the Randen sponge. 259 | /// 260 | /// The outer portion of the sponge (`CAPACITY_BYTES..STATE_BYTES`) may be 261 | /// consumed as PRNG output after applying this function. 262 | #[cfg(target_endian = "little")] 263 | pub fn randen_generate(state: &mut State) { 264 | let prev_inner = state[0]; 265 | // Note: for a big-endian architecture, the endianness of the state and 266 | // round keys needs to be converted first. 
But as this currently relies on 267 | // an x86-only instruction, we don't deal with this at the moment. 268 | permute(state); 269 | 270 | // Ensure backtracking resistance. 271 | state[0] ^= prev_inner; 272 | } 273 | 274 | #[cfg(target_endian = "big")] 275 | pub fn randen_generate(state: &mut State) { 276 | unimplemented!("Big endian requires swapping the bytes in the state and round keys."); 277 | } 278 | 279 | pub fn randen_absorb(state: &mut State, seed: &[U128A; SEED_LEN]) { 280 | for (seed_elem, state_elem) in seed.iter().zip(&mut state[1..]) { 281 | *state_elem ^= *seed_elem; 282 | } 283 | } 284 | 285 | // Note: do not derive Copy, to avoid accidental reuse of the state. 286 | #[derive(Clone, Debug)] 287 | pub struct RandenRng { 288 | /// The current state. 289 | state: State, 290 | /// Index of the next unconsumed byte of the state. 291 | /// 292 | /// The value is least `CAPACITY_BYTES`. The value may exceed `STATE_BYTES - 293 | /// 1`. In that case a generate is required before consuming bytes. 294 | cursor: usize, 295 | } 296 | 297 | impl RandenRng { 298 | /// Create a Randen random number generator using a fixed default seed. 299 | pub fn new_unseeded() -> RandenRng { 300 | RandenRng { 301 | state: [U128A(0); STATE_LEN], 302 | // Set the cursor to indicate that the state is fully consumed, to 303 | // enforce a generate before returning any bytes. This way the 304 | // initial zeros are not exposed as random numbers. 305 | cursor: STATE_BYTES, 306 | } 307 | } 308 | } 309 | 310 | // The implementations of `next_u32` and `next_u64` are similar apart from the 311 | // types and size constants, use a macro so we only have to write it once. 312 | macro_rules! impl_next { 313 | ($func: ident, $t: ty, $size: expr) => { 314 | fn $func(&mut self) -> $t { 315 | // If we don't have enough bytes left in the state, generate new 316 | // random bytes. 
317 | if self.cursor > STATE_BYTES - $size { 318 | randen_generate(&mut self.state); 319 | self.cursor = CAPACITY_BYTES; 320 | } 321 | 322 | // Round the cursor up to the next multiple of $size, so we can 323 | // pretend that the state is an array of $ts and load one from 324 | // there. It means we discard some bytes if the cursor was not at 325 | // a multiple of $size, but the advantage is that we don't need to 326 | // worry about carrying over bytes between generations, when there 327 | // are < $size bytes available. 328 | let index = (self.cursor + $size - 1) / $size; 329 | self.cursor = (index + 1) * $size; 330 | let ts: [$t; STATE_BYTES / $size] = 331 | unsafe { mem::transmute(self.state) }; 332 | ts[index] 333 | } 334 | } 335 | } 336 | 337 | impl RngCore for RandenRng { 338 | impl_next!(next_u32, u32, 4); 339 | impl_next!(next_u64, u64, 8); 340 | 341 | fn fill_bytes(&mut self, dest: &mut [u8]) { 342 | let mut i = 0; 343 | let len = dest.len(); 344 | while i < len { 345 | if self.cursor >= STATE_BYTES { 346 | randen_generate(&mut self.state); 347 | self.cursor = CAPACITY_BYTES; 348 | } 349 | 350 | let bytes: [u8; STATE_BYTES] = unsafe { mem::transmute(self.state) }; 351 | 352 | // This iteration we will consume as many bytes as there are left 353 | // to fill, or as many bytes as are available for consumption, 354 | // whichever is less. 
355 | let consume_bytes = (len - i).min(STATE_BYTES - self.cursor); 356 | let source = &bytes[self.cursor..self.cursor + consume_bytes]; 357 | dest[i..i + consume_bytes].copy_from_slice(source); 358 | self.cursor += consume_bytes; 359 | i += consume_bytes; 360 | } 361 | } 362 | 363 | fn try_fill_bytes(&mut self, dest: &mut [u8]) -> Result<(), Error> { 364 | Ok(self.fill_bytes(dest)) 365 | } 366 | } 367 | 368 | pub struct RandenSeed(pub [u8; SEED_BYTES]); 369 | 370 | impl Default for RandenSeed { 371 | fn default() -> RandenSeed { 372 | RandenSeed([0; SEED_BYTES]) 373 | } 374 | } 375 | 376 | impl AsMut<[u8]> for RandenSeed { 377 | fn as_mut(&mut self) -> &mut [u8] { 378 | &mut self.0 379 | } 380 | } 381 | 382 | impl SeedableRng for RandenRng { 383 | type Seed = RandenSeed; 384 | 385 | fn from_seed(seed: RandenSeed) -> RandenRng { 386 | let mut rng = RandenRng::new_unseeded(); 387 | unsafe { 388 | // [u8] isn't necessarily 16 byte aligned. Transmuting it to [U128] 389 | // won't fix the alignment, but a subsequent clone should work. 
390 | let unaligned_seed = std::mem::transmute::<[u8; SEED_BYTES], [U128A; SEED_LEN]>(seed.0); 391 | let aligned_seed = unaligned_seed.clone(); 392 | randen_absorb(&mut rng.state, &aligned_seed); 393 | rng 394 | } 395 | } 396 | } 397 | 398 | #[cfg(test)] 399 | mod test { 400 | use super::{RandenRng, U128A}; 401 | use rand::{RngCore, SeedableRng}; 402 | 403 | #[test] 404 | fn randen_rng_next_u64_test_vectors() { 405 | // These test vectors were generated from the reference C++ 406 | // implementation with the following program: 407 | // 408 | // int main(int, char**) { 409 | // randen::Randen rng; 410 | // for (int i = 0; i < 33; i++) { 411 | // std::cout << " assert_eq!(rng.next_u64(), 0x"; 412 | // std::cout << std::setbase(16) << std::setw(16) 413 | // << std::setfill('0') << rng(); 414 | // std::cout << ");\n"; 415 | // } 416 | // std::cout << std::endl; 417 | // return 0; 418 | // } 419 | 420 | // Note that there are more bytes consumed than the size of the state, 421 | // forcing a `randen_generate()`. 
422 | let mut rng = RandenRng::new_unseeded(); 423 | assert_eq!(rng.next_u64(), 0xdda9f47cd90410ee); 424 | assert_eq!(rng.next_u64(), 0xc3c14f134e433977); 425 | assert_eq!(rng.next_u64(), 0xf0b780f545c72912); 426 | assert_eq!(rng.next_u64(), 0x887bf3087fd8ca10); 427 | assert_eq!(rng.next_u64(), 0x30ec63baff3c6d59); 428 | assert_eq!(rng.next_u64(), 0x15dbb1d37696599f); 429 | assert_eq!(rng.next_u64(), 0x2808a316f49a54c); 430 | assert_eq!(rng.next_u64(), 0xb29f73606f7f20a6); 431 | assert_eq!(rng.next_u64(), 0x9cbf605e3fd9de8a); 432 | assert_eq!(rng.next_u64(), 0x3b8feaf9d5c8e50e); 433 | assert_eq!(rng.next_u64(), 0xd8b2ffd356301ed5); 434 | assert_eq!(rng.next_u64(), 0xc970ae1a78183bbb); 435 | assert_eq!(rng.next_u64(), 0xcdfd8d76eb8f9a19); 436 | assert_eq!(rng.next_u64(), 0xf4b327fe0fc73c37); 437 | assert_eq!(rng.next_u64(), 0xd5af05dd3eff9556); 438 | assert_eq!(rng.next_u64(), 0xc3a506eb91420c9d); 439 | assert_eq!(rng.next_u64(), 0x7023920e0d6bfe8c); 440 | assert_eq!(rng.next_u64(), 0x48db1bb78f83c4a1); 441 | assert_eq!(rng.next_u64(), 0xed1ef4c26b87b840); 442 | assert_eq!(rng.next_u64(), 0x58d3575834956d42); 443 | assert_eq!(rng.next_u64(), 0x497cabf3431154fc); 444 | assert_eq!(rng.next_u64(), 0x8eef32a23e0b2df3); 445 | assert_eq!(rng.next_u64(), 0xd88b5749f090e5ea); 446 | assert_eq!(rng.next_u64(), 0x4e24370570029a8b); 447 | assert_eq!(rng.next_u64(), 0x78fcec2cbb6342f5); 448 | assert_eq!(rng.next_u64(), 0xc651a582a970692f); 449 | assert_eq!(rng.next_u64(), 0x352ee4ad1816afe3); 450 | assert_eq!(rng.next_u64(), 0x463cb745612f55db); 451 | assert_eq!(rng.next_u64(), 0x811ef0821c3de851); 452 | assert_eq!(rng.next_u64(), 0x26ff374c101da7e); 453 | assert_eq!(rng.next_u64(), 0xa0660379992d58fc); 454 | assert_eq!(rng.next_u64(), 0x6f7e616704c4fa59); 455 | assert_eq!(rng.next_u64(), 0x915f3445685da798); 456 | } 457 | 458 | #[test] 459 | fn randen_rng_next_u32_test_vectors() { 460 | // Same test as `randen_rng_next_u64_test_vectors()`, generated from the 461 | // same C++ 
program, but adapted to produce 32-bit integers. 462 | 463 | // Note that there are more bytes consumed than the size of the state, 464 | // forcing a `randen_generate()`. 465 | let mut rng = RandenRng::new_unseeded(); 466 | assert_eq!(rng.next_u32(), 0xd90410ee); 467 | assert_eq!(rng.next_u32(), 0xdda9f47c); 468 | assert_eq!(rng.next_u32(), 0x4e433977); 469 | assert_eq!(rng.next_u32(), 0xc3c14f13); 470 | assert_eq!(rng.next_u32(), 0x45c72912); 471 | assert_eq!(rng.next_u32(), 0xf0b780f5); 472 | assert_eq!(rng.next_u32(), 0x7fd8ca10); 473 | assert_eq!(rng.next_u32(), 0x887bf308); 474 | assert_eq!(rng.next_u32(), 0xff3c6d59); 475 | assert_eq!(rng.next_u32(), 0x30ec63ba); 476 | assert_eq!(rng.next_u32(), 0x7696599f); 477 | assert_eq!(rng.next_u32(), 0x15dbb1d3); 478 | assert_eq!(rng.next_u32(), 0x6f49a54c); 479 | assert_eq!(rng.next_u32(), 0x02808a31); 480 | assert_eq!(rng.next_u32(), 0x6f7f20a6); 481 | assert_eq!(rng.next_u32(), 0xb29f7360); 482 | assert_eq!(rng.next_u32(), 0x3fd9de8a); 483 | assert_eq!(rng.next_u32(), 0x9cbf605e); 484 | assert_eq!(rng.next_u32(), 0xd5c8e50e); 485 | assert_eq!(rng.next_u32(), 0x3b8feaf9); 486 | assert_eq!(rng.next_u32(), 0x56301ed5); 487 | assert_eq!(rng.next_u32(), 0xd8b2ffd3); 488 | assert_eq!(rng.next_u32(), 0x78183bbb); 489 | assert_eq!(rng.next_u32(), 0xc970ae1a); 490 | assert_eq!(rng.next_u32(), 0xeb8f9a19); 491 | assert_eq!(rng.next_u32(), 0xcdfd8d76); 492 | assert_eq!(rng.next_u32(), 0x0fc73c37); 493 | assert_eq!(rng.next_u32(), 0xf4b327fe); 494 | assert_eq!(rng.next_u32(), 0x3eff9556); 495 | assert_eq!(rng.next_u32(), 0xd5af05dd); 496 | assert_eq!(rng.next_u32(), 0x91420c9d); 497 | assert_eq!(rng.next_u32(), 0xc3a506eb); 498 | assert_eq!(rng.next_u32(), 0x0d6bfe8c); 499 | assert_eq!(rng.next_u32(), 0x7023920e); 500 | assert_eq!(rng.next_u32(), 0x8f83c4a1); 501 | assert_eq!(rng.next_u32(), 0x48db1bb7); 502 | assert_eq!(rng.next_u32(), 0x6b87b840); 503 | assert_eq!(rng.next_u32(), 0xed1ef4c2); 504 | assert_eq!(rng.next_u32(), 
0x34956d42); 505 | assert_eq!(rng.next_u32(), 0x58d35758); 506 | assert_eq!(rng.next_u32(), 0x431154fc); 507 | assert_eq!(rng.next_u32(), 0x497cabf3); 508 | assert_eq!(rng.next_u32(), 0x3e0b2df3); 509 | assert_eq!(rng.next_u32(), 0x8eef32a2); 510 | assert_eq!(rng.next_u32(), 0xf090e5ea); 511 | assert_eq!(rng.next_u32(), 0xd88b5749); 512 | assert_eq!(rng.next_u32(), 0x70029a8b); 513 | assert_eq!(rng.next_u32(), 0x4e243705); 514 | assert_eq!(rng.next_u32(), 0xbb6342f5); 515 | assert_eq!(rng.next_u32(), 0x78fcec2c); 516 | assert_eq!(rng.next_u32(), 0xa970692f); 517 | assert_eq!(rng.next_u32(), 0xc651a582); 518 | assert_eq!(rng.next_u32(), 0x1816afe3); 519 | assert_eq!(rng.next_u32(), 0x352ee4ad); 520 | assert_eq!(rng.next_u32(), 0x612f55db); 521 | assert_eq!(rng.next_u32(), 0x463cb745); 522 | assert_eq!(rng.next_u32(), 0x1c3de851); 523 | assert_eq!(rng.next_u32(), 0x811ef082); 524 | assert_eq!(rng.next_u32(), 0xc101da7e); 525 | assert_eq!(rng.next_u32(), 0x026ff374); 526 | assert_eq!(rng.next_u32(), 0x992d58fc); 527 | assert_eq!(rng.next_u32(), 0xa0660379); 528 | assert_eq!(rng.next_u32(), 0x04c4fa59); 529 | assert_eq!(rng.next_u32(), 0x6f7e6167); 530 | assert_eq!(rng.next_u32(), 0x685da798); 531 | } 532 | 533 | #[test] 534 | fn randen_rng_fill_bytes_test_vectors() { 535 | // The expected values were generated from the reference C++ 536 | // implementation using the following program: 537 | // 538 | // int main(int, char**) { 539 | // randen::Randen rng; 540 | // std::uint8_t seq_1[37] = {0}; 541 | // std::uint8_t seq_2[151] = {0}; 542 | // std::uint8_t seq_3[233] = {0}; 543 | // for (std::uint8_t& x : seq_1) x = rng(); 544 | // for (std::uint8_t& x : seq_2) x = rng(); 545 | // for (std::uint8_t& x : seq_3) x = rng(); 546 | // std::cout << " assert_eq!(seq_1[36], " 547 | // << static_cast(seq_1[36]) << ");\n"; 548 | // std::cout << " assert_eq!(seq_2[150], " 549 | // << static_cast(seq_2[150]) << ");\n"; 550 | // std::cout << " assert_eq!(seq_3[232], " 551 | // << 
static_cast(seq_3[232]) << ");\n"; 552 | // std::cout << std::endl; 553 | // return 0; 554 | // } 555 | 556 | let mut seq_1 = [0_u8; 37]; 557 | let mut seq_2 = [0_u8; 151]; 558 | let mut seq_3 = [0_u8; 233]; 559 | let mut rng = RandenRng::new_unseeded(); 560 | rng.fill_bytes(&mut seq_1); 561 | rng.fill_bytes(&mut seq_2); 562 | rng.fill_bytes(&mut seq_3); 563 | assert_eq!(seq_1[36], 186); 564 | assert_eq!(seq_2[150], 112); 565 | assert_eq!(seq_3[232], 24); 566 | } 567 | } 568 | -------------------------------------------------------------------------------- /engine_chacha.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_CHACHA_H_ 16 | #define ENGINE_CHACHA_H_ 17 | #if defined(__SSE2__) && defined(__AES__) 18 | 19 | #include 20 | #include 21 | #include "tmmintrin.h" 22 | 23 | namespace randen { 24 | 25 | // Modified from https://gist.github.com/orlp/32f5d1b631ab092608b1: 26 | /* 27 | Copyright (c) 2015 Orson Peters 28 | 29 | This software is provided 'as-is', without any express or implied warranty. 30 | In no event will the authors be held liable for any damages arising from the 31 | use of this software. 
32 | 33 | Permission is granted to anyone to use this software for any purpose, 34 | including commercial applications, and to alter it and redistribute it 35 | freely, subject to the following restrictions: 36 | 37 | 1. The origin of this software must not be misrepresented; you must not 38 | claim that you wrote the original software. If you use this software in a 39 | product, an acknowledgment in the product documentation would be appreciated 40 | but is not required. 41 | 42 | 2. Altered source versions must be plainly marked as such, and must not be 43 | misrepresented as being the original software. 44 | 45 | 3. This notice may not be removed or altered from any source distribution. 46 | */ 47 | 48 | template 49 | class ChaCha { 50 | public: 51 | static constexpr size_t R = 8; 52 | typedef T result_type; 53 | 54 | static constexpr result_type min() { 55 | return std::numeric_limits::min(); 56 | } 57 | static constexpr result_type max() { 58 | return std::numeric_limits::max(); 59 | } 60 | 61 | explicit ChaCha(uint64_t seedval, uint64_t stream = 0) { 62 | seed(seedval, stream); 63 | } 64 | template 65 | explicit ChaCha(Sseq& seq) { 66 | seed(seq); 67 | } 68 | 69 | void seed(uint64_t seedval, uint64_t stream = 0) { 70 | ctr = 0; 71 | keysetup[0] = seedval & 0xffffffffu; 72 | keysetup[1] = seedval >> 32; 73 | keysetup[2] = keysetup[3] = 0xdeadbeef; // Could use 128-bit seed. 74 | keysetup[4] = stream & 0xffffffffu; 75 | keysetup[5] = stream >> 32; 76 | keysetup[6] = keysetup[7] = 0xdeadbeef; // Could use 128-bit stream. 
77 | } 78 | 79 | template 80 | void seed(Sseq& seq) { 81 | ctr = 0; 82 | seq.generate(keysetup, keysetup + 8); 83 | } 84 | 85 | result_type operator()() { 86 | int idx = ctr % 16; 87 | if (idx == 0) generate_block(); 88 | 89 | result_type ret; 90 | memcpy(&ret, block + idx, sizeof(ret)); 91 | ctr += sizeof(ret) / sizeof(uint32_t); 92 | 93 | return ret; 94 | } 95 | 96 | private: 97 | void generate_block() { 98 | uint32_t constants[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; 99 | 100 | uint32_t input[16]; 101 | for (int i = 0; i < 4; ++i) input[i] = constants[i]; 102 | for (int i = 0; i < 8; ++i) input[4 + i] = keysetup[i]; 103 | input[12] = (ctr / 16) & 0xffffffffu; 104 | input[13] = (ctr / 16) >> 32; 105 | input[14] = input[15] = 0xdeadbeef; // Could use 128-bit counter. 106 | 107 | for (int i = 0; i < 16; ++i) block[i] = input[i]; 108 | chacha_core(); 109 | for (int i = 0; i < 16; ++i) block[i] += input[i]; 110 | } 111 | 112 | // Get an efficient _mm_roti_epi32 based on enabled features. 113 | #define _mm_roti_epi32(r, c) \ 114 | (((c) == 8) \ 115 | ? _mm_shuffle_epi8((r), _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, \ 116 | 5, 4, 7, 2, 1, 0, 3)) \ 117 | : ((c) == 16) \ 118 | ? _mm_shuffle_epi8((r), _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, \ 119 | 10, 5, 4, 7, 6, 1, 0, 3, 2)) \ 120 | : ((c) == 24) ? _mm_shuffle_epi8( \ 121 | (r), _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, \ 122 | 9, 4, 7, 6, 5, 0, 3, 2, 1)) \ 123 | : _mm_xor_si128(_mm_slli_epi32((r), (c)), \ 124 | _mm_srli_epi32((r), 32 - (c)))) 125 | 126 | void chacha_core() { 127 | // ROTVn rotates the elements in the given vector n places to the left. 
128 | #define CHACHA_ROTV1(x) _mm_shuffle_epi32((__m128i)x, 0x39) 129 | #define CHACHA_ROTV2(x) _mm_shuffle_epi32((__m128i)x, 0x4e) 130 | #define CHACHA_ROTV3(x) _mm_shuffle_epi32((__m128i)x, 0x93) 131 | 132 | __m128i a = _mm_load_si128((__m128i*)(block)); 133 | __m128i b = _mm_load_si128((__m128i*)(block + 4)); 134 | __m128i c = _mm_load_si128((__m128i*)(block + 8)); 135 | __m128i d = _mm_load_si128((__m128i*)(block + 12)); 136 | 137 | for (int i = 0; i < R; i += 2) { 138 | a = _mm_add_epi32(a, b); 139 | d = _mm_xor_si128(d, a); 140 | d = _mm_roti_epi32(d, 16); 141 | c = _mm_add_epi32(c, d); 142 | b = _mm_xor_si128(b, c); 143 | b = _mm_roti_epi32(b, 12); 144 | a = _mm_add_epi32(a, b); 145 | d = _mm_xor_si128(d, a); 146 | d = _mm_roti_epi32(d, 8); 147 | c = _mm_add_epi32(c, d); 148 | b = _mm_xor_si128(b, c); 149 | b = _mm_roti_epi32(b, 7); 150 | 151 | b = CHACHA_ROTV1(b); 152 | c = CHACHA_ROTV2(c); 153 | d = CHACHA_ROTV3(d); 154 | 155 | a = _mm_add_epi32(a, b); 156 | d = _mm_xor_si128(d, a); 157 | d = _mm_roti_epi32(d, 16); 158 | c = _mm_add_epi32(c, d); 159 | b = _mm_xor_si128(b, c); 160 | b = _mm_roti_epi32(b, 12); 161 | a = _mm_add_epi32(a, b); 162 | d = _mm_xor_si128(d, a); 163 | d = _mm_roti_epi32(d, 8); 164 | c = _mm_add_epi32(c, d); 165 | b = _mm_xor_si128(b, c); 166 | b = _mm_roti_epi32(b, 7); 167 | 168 | b = CHACHA_ROTV3(b); 169 | c = CHACHA_ROTV2(c); 170 | d = CHACHA_ROTV1(d); 171 | } 172 | 173 | _mm_store_si128((__m128i*)(block), a); 174 | _mm_store_si128((__m128i*)(block + 4), b); 175 | _mm_store_si128((__m128i*)(block + 8), c); 176 | _mm_store_si128((__m128i*)(block + 12), d); 177 | 178 | #undef CHACHA_ROTV3 179 | #undef CHACHA_ROTV2 180 | #undef CHACHA_ROTV1 181 | } 182 | 183 | alignas(16) uint32_t block[16]; 184 | uint32_t keysetup[8]; 185 | uint64_t ctr; 186 | }; 187 | 188 | } // namespace randen 189 | 190 | #endif // defined(__SSE2__) && defined(__AES__) 191 | #endif // ENGINE_CHACHA_H_ 192 | 
-------------------------------------------------------------------------------- /engine_os.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef ENGINE_OS_H_ 16 | #define ENGINE_OS_H_ 17 | 18 | #ifdef _WIN64 19 | #define NOMINMAX 20 | #include 21 | // Must come after windows.h; this comment ensures that. 22 | #include 23 | #pragma comment(lib, "bcrypt") 24 | #endif 25 | 26 | #include "util.h" 27 | 28 | namespace randen { 29 | 30 | // Buffered, uses OS CSPRNG. 31 | template 32 | class alignas(32) EngineOS { 33 | public: 34 | // C++11 URBG interface: 35 | using result_type = T; 36 | static constexpr T min() { return T(0); } 37 | static constexpr T max() { return ~T(0); } 38 | 39 | EngineOS() { 40 | // The first call to operator() will trigger a refill. 41 | next_ = kStateT; 42 | 43 | #ifdef _WIN32 44 | RANDEN_CHECK(0 == BCryptOpenAlgorithmProvider( 45 | &provider_, BCRYPT_RNG_ALGORITHM, nullptr, 0)); 46 | #else 47 | dev_ = fopen("/dev/urandom", "r"); 48 | RANDEN_CHECK(dev_ != nullptr); 49 | #endif 50 | } 51 | 52 | ~EngineOS() { 53 | #ifdef _WIN32 54 | RANDEN_CHECK(0 == BCryptCloseAlgorithmProvider(provider_, 0)); 55 | #else 56 | RANDEN_CHECK(fclose(dev_) == 0); 57 | #endif 58 | } 59 | 60 | // Returns random bits from the buffer in units of T. 
61 | T operator()() { 62 | // (Local copy ensures compiler knows this is not aliased.) 63 | size_t next = next_; 64 | 65 | // Refill the buffer if needed (unlikely). 66 | if (next >= kStateT) { 67 | #ifdef _WIN32 68 | RANDEN_CHECK(0 == BCryptGenRandom(provider_, 69 | reinterpret_cast(&state_[0]), 70 | sizeof(state_), 0)); 71 | #else 72 | const size_t bytes_read = fread(&state_[0], 1, sizeof(state_), dev_); 73 | RANDEN_CHECK(bytes_read == sizeof(state_)); 74 | #endif 75 | next = 0; 76 | } 77 | 78 | const T ret = state_[next]; 79 | next_ = next + 1; 80 | return ret; 81 | } 82 | 83 | private: 84 | static constexpr size_t kStateT = 256 / sizeof(T); // same as Randen 85 | 86 | alignas(32) T state_[kStateT]; 87 | size_t next_; // index within state_ 88 | #ifdef _WIN32 89 | BCRYPT_ALG_HANDLE provider_; 90 | #else 91 | FILE* dev_; 92 | #endif 93 | }; 94 | 95 | } // namespace randen 96 | 97 | #endif // ENGINE_OS_H_ 98 | -------------------------------------------------------------------------------- /nanobenchmark.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "nanobenchmark.h" 16 | #include "randen.h" 17 | 18 | #include 19 | #include 20 | #include // abort 21 | #include // memcpy 22 | #include // clock_gettime 23 | #include // sort 24 | #include 25 | #include 26 | #include // iota 27 | #include 28 | #include 29 | 30 | // Architecture 31 | #if defined(__x86_64__) || defined(_M_X64) 32 | #define NB_ARCH_X86 33 | #if defined(_MSC_VER) 34 | #include 35 | #else 36 | #include // NOLINT 37 | #endif 38 | #elif defined(__powerpc64__) || defined(_M_PPC) 39 | #define NB_ARCH_PPC 40 | #include // NOLINT __ppc_get_timebase_freq 41 | #elif defined(__aarch64__) || defined(__arm__) 42 | #define NB_ARCH_ARM 43 | #else 44 | #error "Please add support for this architecture" 45 | #endif 46 | 47 | // OS 48 | #if defined(_WIN32) || defined(_WIN64) 49 | #define NB_OS_WIN 50 | #define NOMINMAX 51 | #include // NOLINT 52 | #elif defined(__linux__) 53 | #define NB_OS_LINUX 54 | #include // NOLINT 55 | #else 56 | #error "Please add support for this OS" 57 | #endif 58 | 59 | namespace randen { 60 | namespace platform { 61 | namespace { 62 | 63 | // Enables sanity checks that verify correct operation at the cost of 64 | // longer benchmark runs. 
65 | #ifndef NANOBENCHMARK_ENABLE_CHECKS 66 | #define NANOBENCHMARK_ENABLE_CHECKS 0 67 | #endif 68 | 69 | #define NANOBENCHMARK_CHECK_ALWAYS(condition) \ 70 | while (!(condition)) { \ 71 | fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \ 72 | abort(); \ 73 | } 74 | 75 | #if NANOBENCHMARK_ENABLE_CHECKS 76 | #define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition) 77 | #else 78 | #define NANOBENCHMARK_CHECK(condition) 79 | #endif 80 | 81 | // Compiler-specific 82 | #ifdef _MSC_VER 83 | #define NB_RESTRICT __restrict 84 | #define NB_INLINE __forceinline 85 | #define NB_NOINLINE __declspec(noinline) 86 | 87 | #elif defined(__GNUC__) || defined(__clang__) 88 | #define NB_RESTRICT __restrict__ 89 | #define NB_INLINE inline __attribute__((always_inline)) 90 | #define NB_NOINLINE inline __attribute__((noinline)) 91 | 92 | #else 93 | #error "Unsupported compiler" 94 | #endif 95 | 96 | #ifdef NB_ARCH_X86 97 | 98 | void Cpuid(const uint32_t level, const uint32_t count, 99 | uint32_t* NB_RESTRICT abcd) { 100 | #ifdef _MSC_VER 101 | int regs[4]; 102 | __cpuidex(regs, level, count); 103 | for (int i = 0; i < 4; ++i) { 104 | abcd[i] = regs[i]; 105 | } 106 | #else 107 | uint32_t a, b, c, d; 108 | __cpuid_count(level, count, a, b, c, d); 109 | abcd[0] = a; 110 | abcd[1] = b; 111 | abcd[2] = c; 112 | abcd[3] = d; 113 | #endif 114 | } 115 | 116 | std::string BrandString() { 117 | char brand_string[49]; 118 | uint32_t abcd[4]; 119 | 120 | // Check if brand string is supported (it is on all reasonable Intel/AMD) 121 | Cpuid(0x80000000U, 0, abcd); 122 | if (abcd[0] < 0x80000004U) { 123 | return std::string(); 124 | } 125 | 126 | for (int i = 0; i < 3; ++i) { 127 | Cpuid(0x80000002U + i, 0, abcd); 128 | memcpy(brand_string + i * 16, &abcd, sizeof(abcd)); 129 | } 130 | brand_string[48] = 0; 131 | return brand_string; 132 | } 133 | 134 | // Returns the frequency quoted inside the brand string. 
This does not 135 | // account for throttling nor Turbo Boost. 136 | double NominalClockRate() { 137 | const std::string& brand_string = BrandString(); 138 | // Brand strings include the maximum configured frequency. These prefixes are 139 | // defined by Intel CPUID documentation. 140 | const char* prefixes[3] = {"MHz", "GHz", "THz"}; 141 | const double multipliers[3] = {1E6, 1E9, 1E12}; 142 | for (size_t i = 0; i < 3; ++i) { 143 | const size_t pos_prefix = brand_string.find(prefixes[i]); 144 | if (pos_prefix != std::string::npos) { 145 | const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1); 146 | if (pos_space != std::string::npos) { 147 | const std::string digits = 148 | brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1); 149 | return std::stod(digits) * multipliers[i]; 150 | } 151 | } 152 | } 153 | 154 | return 0.0; 155 | } 156 | 157 | #endif // NB_ARCH_X86 158 | 159 | } // namespace 160 | 161 | void PinThreadToCPU(int cpu) { 162 | if (cpu < 0) { 163 | // We might migrate to another CPU before pinning below, but at least cpu 164 | // will be one of the CPUs on which this thread ran. 165 | #if defined(NB_OS_WIN) 166 | cpu = static_cast(GetCurrentProcessorNumber()); 167 | #elif defined(NB_OS_LINUX) 168 | cpu = sched_getcpu(); 169 | #else 170 | #error "Please add support for this OS" 171 | #endif 172 | NANOBENCHMARK_CHECK_ALWAYS(cpu >= 0); 173 | } 174 | 175 | #if defined(NB_OS_WIN) 176 | const HANDLE hThread = GetCurrentThread(); 177 | const DWORD_PTR prev = SetThreadAffinityMask(hThread, 1ULL << cpu); 178 | NANOBENCHMARK_CHECK_ALWAYS(prev != 0); 179 | #elif defined(NB_OS_LINUX) 180 | const pid_t pid = 0; // current thread 181 | cpu_set_t set; 182 | CPU_ZERO(&set); 183 | CPU_SET(cpu, &set); 184 | const int err = sched_setaffinity(pid, sizeof(set), &set); 185 | NANOBENCHMARK_CHECK_ALWAYS(err == 0); 186 | #else 187 | #error "Please add support for this OS" 188 | #endif 189 | } 190 | 191 | // Returns tick rate. 
Invariant means the tick counter frequency is independent 192 | // of CPU throttling or sleep. May be expensive, caller should cache the result. 193 | double InvariantTicksPerSecond() { 194 | #if defined(NB_ARCH_PPC) 195 | return __ppc_get_timebase_freq(); 196 | #elif defined(NB_ARCH_X86) 197 | // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs. 198 | return NominalClockRate(); 199 | #else 200 | // Fall back to clock_gettime nanoseconds. 201 | return 1E9; 202 | #endif 203 | } 204 | 205 | } // namespace platform 206 | namespace { 207 | 208 | // Prevents the compiler from eliding the computations that led to "output". 209 | template 210 | inline void PreventElision(T&& output) { 211 | #ifndef _MSC_VER 212 | // Works by indicating to the compiler that "output" is being read and 213 | // modified. The +r constraint avoids unnecessary writes to memory, but only 214 | // works for built-in types (typically FuncOutput). 215 | asm volatile("" : "+r"(output) : : "memory"); 216 | #else 217 | // MSVC does not support inline assembly anymore (and never supported GCC's 218 | // RTL constraints). Self-assignment with #pragma optimize("off") might be 219 | // expected to prevent elision, but it does not with MSVC 2015. Type-punning 220 | // with volatile pointers generates inefficient code on MSVC 2017. 221 | static std::atomic dummy(T{}); 222 | dummy.store(output, std::memory_order_relaxed); 223 | #endif 224 | } 225 | 226 | namespace timer { 227 | 228 | // Start/Stop return absolute timestamps and must be placed immediately before 229 | // and after the region to measure. We provide separate Start/Stop functions 230 | // because they use different fences. 231 | // 232 | // Background: RDTSC is not 'serializing'; earlier instructions may complete 233 | // after it, and/or later instructions may complete before it. 'Fences' ensure 234 | // regions' elapsed times are independent of such reordering. 
The only 235 | // documented unprivileged serializing instruction is CPUID, which acts as a 236 | // full fence (no reordering across it in either direction). Unfortunately 237 | // the latency of CPUID varies wildly (perhaps made worse by not initializing 238 | // its EAX input). Because it cannot reliably be deducted from the region's 239 | // elapsed time, it must not be included in the region to measure (i.e. 240 | // between the two RDTSC). 241 | // 242 | // The newer RDTSCP is sometimes described as serializing, but it actually 243 | // only serves as a half-fence with release semantics. Although all 244 | // instructions in the region will complete before the final timestamp is 245 | // captured, subsequent instructions may leak into the region and increase the 246 | // elapsed time. Inserting another fence after the final RDTSCP would prevent 247 | // such reordering without affecting the measured region. 248 | // 249 | // Fortunately, such a fence exists. The LFENCE instruction is only documented 250 | // to delay later loads until earlier loads are visible. However, Intel's 251 | // reference manual says it acts as a full fence (waiting until all earlier 252 | // instructions have completed, and delaying later instructions until it 253 | // completes). AMD assigns the same behavior to MFENCE. 254 | // 255 | // We need a fence before the initial RDTSC to prevent earlier instructions 256 | // from leaking into the region, and arguably another after RDTSC to avoid 257 | // region instructions from completing before the timestamp is recorded. 258 | // When surrounded by fences, the additional RDTSCP half-fence provides no 259 | // benefit, so the initial timestamp can be recorded via RDTSC, which has 260 | // lower overhead than RDTSCP because it does not read TSC_AUX. In summary, 261 | // we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE. 262 | // 263 | // Using Start+Start leads to higher variance and overhead than Stop+Stop. 
264 | // However, Stop+Stop includes an LFENCE in the region measurements, which 265 | // adds a delay dependent on earlier loads. The combination of Start+Stop 266 | // is faster than Start+Start and more consistent than Stop+Stop because 267 | // the first LFENCE already delayed subsequent loads before the measured 268 | // region. This combination seems not to have been considered in prior work: 269 | // http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c 270 | // 271 | // Note: performance counters can measure 'exact' instructions-retired or 272 | // (unhalted) cycle counts. The RDPMC instruction is not serializing and also 273 | // requires fences. Unfortunately, it is not accessible on all OSes and we 274 | // prefer to avoid kernel-mode drivers. Performance counters are also affected 275 | // by several under/over-count errata, so we use the TSC instead. 276 | 277 | // Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds, 278 | // divide by InvariantTicksPerSecond. 279 | inline uint64_t Start64() { 280 | uint64_t t; 281 | #if defined(NB_ARCH_PPC) 282 | asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); 283 | #elif defined(NB_ARCH_X86) 284 | #if defined(_MSC_VER) 285 | _ReadWriteBarrier(); 286 | _mm_lfence(); 287 | _ReadWriteBarrier(); 288 | t = __rdtsc(); 289 | _ReadWriteBarrier(); 290 | _mm_lfence(); 291 | _ReadWriteBarrier(); 292 | #else 293 | asm volatile( 294 | "lfence\n\t" 295 | "rdtsc\n\t" 296 | "shl $32, %%rdx\n\t" 297 | "or %%rdx, %0\n\t" 298 | "lfence" 299 | : "=a"(t) 300 | : 301 | // "memory" avoids reordering. rdx = TSC >> 32. 302 | // "cc" = flags modified by SHL. 303 | : "rdx", "memory", "cc"); 304 | #endif 305 | #else 306 | // Fall back to OS - unsure how to reliably query cntvct_el0 frequency. 
307 | timespec ts; 308 | clock_gettime(CLOCK_MONOTONIC, &ts); 309 | t = ts.tv_sec * 1000000000LL + ts.tv_nsec; 310 | #endif 311 | return t; 312 | } 313 | 314 | inline uint64_t Stop64() { 315 | uint64_t t; 316 | #if defined(NB_ARCH_PPC) 317 | asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268)); 318 | #elif defined(NB_ARCH_X86) 319 | #if defined(_MSC_VER) 320 | _ReadWriteBarrier(); 321 | unsigned aux; 322 | t = __rdtscp(&aux); 323 | _ReadWriteBarrier(); 324 | _mm_lfence(); 325 | _ReadWriteBarrier(); 326 | #else 327 | // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). 328 | asm volatile( 329 | "rdtscp\n\t" 330 | "shl $32, %%rdx\n\t" 331 | "or %%rdx, %0\n\t" 332 | "lfence" 333 | : "=a"(t) 334 | : 335 | // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. 336 | // "cc" = flags modified by SHL. 337 | : "rcx", "rdx", "memory", "cc"); 338 | #endif 339 | #else 340 | t = Start64(); 341 | #endif 342 | return t; 343 | } 344 | 345 | // Returns a 32-bit timestamp with about 4 cycles less overhead than 346 | // Start64. Only suitable for measuring very short regions because the 347 | // timestamp overflows about once a second. 348 | inline uint32_t Start32() { 349 | uint32_t t; 350 | #if defined(NB_ARCH_X86) 351 | #if defined(_MSC_VER) 352 | _ReadWriteBarrier(); 353 | _mm_lfence(); 354 | _ReadWriteBarrier(); 355 | t = static_cast(__rdtsc()); 356 | _ReadWriteBarrier(); 357 | _mm_lfence(); 358 | _ReadWriteBarrier(); 359 | #else 360 | asm volatile( 361 | "lfence\n\t" 362 | "rdtsc\n\t" 363 | "lfence" 364 | : "=a"(t) 365 | : 366 | // "memory" avoids reordering. rdx = TSC >> 32. 
367 | : "rdx", "memory"); 368 | #endif 369 | #else 370 | t = static_cast(Start64()); 371 | #endif 372 | return t; 373 | } 374 | 375 | inline uint32_t Stop32() { 376 | uint32_t t; 377 | #if defined(NB_ARCH_X86) 378 | #if defined(_MSC_VER) 379 | _ReadWriteBarrier(); 380 | unsigned aux; 381 | t = static_cast(__rdtscp(&aux)); 382 | _ReadWriteBarrier(); 383 | _mm_lfence(); 384 | _ReadWriteBarrier(); 385 | #else 386 | // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx). 387 | asm volatile( 388 | "rdtscp\n\t" 389 | "lfence" 390 | : "=a"(t) 391 | : 392 | // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32. 393 | : "rcx", "rdx", "memory"); 394 | #endif 395 | #else 396 | t = static_cast(Stop64()); 397 | #endif 398 | return t; 399 | } 400 | 401 | } // namespace timer 402 | 403 | namespace robust_statistics { 404 | 405 | // Sorts integral values in ascending order (e.g. for Mode). About 3x faster 406 | // than std::sort for input distributions with very few unique values. 407 | template 408 | void CountingSort(T* values, size_t num_values) { 409 | // Unique values and their frequency (similar to flat_map). 410 | using Unique = std::pair; 411 | std::vector unique; 412 | for (size_t i = 0; i < num_values; ++i) { 413 | const T value = values[i]; 414 | const auto pos = 415 | std::find_if(unique.begin(), unique.end(), 416 | [value](const Unique u) { return u.first == value; }); 417 | if (pos == unique.end()) { 418 | unique.push_back(std::make_pair(value, 1)); 419 | } else { 420 | ++pos->second; 421 | } 422 | } 423 | 424 | // Sort in ascending order of value (pair.first). 425 | std::sort(unique.begin(), unique.end()); 426 | 427 | // Write that many copies of each unique value to the array. 
428 | T* NB_RESTRICT p = values; 429 | for (const auto& value_count : unique) { 430 | std::fill(p, p + value_count.second, value_count.first); 431 | p += value_count.second; 432 | } 433 | NANOBENCHMARK_CHECK(p == values + num_values); 434 | } 435 | 436 | // @return i in [idx_begin, idx_begin + half_count) that minimizes 437 | // sorted[i + half_count] - sorted[i]. 438 | template 439 | size_t MinRange(const T* const NB_RESTRICT sorted, const size_t idx_begin, 440 | const size_t half_count) { 441 | T min_range = std::numeric_limits::max(); 442 | size_t min_idx = 0; 443 | 444 | for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) { 445 | NANOBENCHMARK_CHECK(sorted[idx] <= sorted[idx + half_count]); 446 | const T range = sorted[idx + half_count] - sorted[idx]; 447 | if (range < min_range) { 448 | min_range = range; 449 | min_idx = idx; 450 | } 451 | } 452 | 453 | return min_idx; 454 | } 455 | 456 | // Returns an estimate of the mode by calling MinRange on successively 457 | // halved intervals. "sorted" must be in ascending order. This is the 458 | // Half Sample Mode estimator proposed by Bickel in "On a fast, robust 459 | // estimator of the mode", with complexity O(N log N). The mode is less 460 | // affected by outliers in highly-skewed distributions than the median. 461 | // The averaging operation below assumes "T" is an unsigned integer type. 462 | template 463 | T ModeOfSorted(const T* const NB_RESTRICT sorted, const size_t num_values) { 464 | size_t idx_begin = 0; 465 | size_t half_count = num_values / 2; 466 | while (half_count > 1) { 467 | idx_begin = MinRange(sorted, idx_begin, half_count); 468 | half_count >>= 1; 469 | } 470 | 471 | const T x = sorted[idx_begin + 0]; 472 | if (half_count == 0) { 473 | return x; 474 | } 475 | NANOBENCHMARK_CHECK(half_count == 1); 476 | const T average = (x + sorted[idx_begin + 1] + 1) / 2; 477 | return average; 478 | } 479 | 480 | // Returns the mode. Side effect: sorts "values". 
481 | template 482 | T Mode(T* values, const size_t num_values) { 483 | CountingSort(values, num_values); 484 | return ModeOfSorted(values, num_values); 485 | } 486 | 487 | template 488 | T Mode(T (&values)[N]) { 489 | return Mode(&values[0], N); 490 | } 491 | 492 | // Returns the median value. Side effect: sorts "values". 493 | template 494 | T Median(T* values, const size_t num_values) { 495 | NANOBENCHMARK_CHECK(!values->empty()); 496 | std::sort(values, values + num_values); 497 | const size_t half = num_values / 2; 498 | // Odd count: return middle 499 | if (num_values % 2) { 500 | return values[half]; 501 | } 502 | // Even count: return average of middle two. 503 | return (values[half] + values[half - 1] + 1) / 2; 504 | } 505 | 506 | // Returns a robust measure of variability. 507 | template 508 | T MedianAbsoluteDeviation(const T* values, const size_t num_values, 509 | const T median) { 510 | NANOBENCHMARK_CHECK(num_values != 0); 511 | std::vector abs_deviations; 512 | abs_deviations.reserve(num_values); 513 | for (size_t i = 0; i < num_values; ++i) { 514 | const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median)); 515 | abs_deviations.push_back(static_cast(abs)); 516 | } 517 | return Median(abs_deviations.data(), num_values); 518 | } 519 | 520 | } // namespace robust_statistics 521 | 522 | // Ticks := platform-specific timer values (CPU cycles on x86). Must be 523 | // unsigned to guarantee wraparound on overflow. 32 bit timers are faster to 524 | // read than 64 bit. 525 | using Ticks = uint32_t; 526 | 527 | // Returns timer overhead / minimum measurable difference. 528 | Ticks TimerResolution() { 529 | // Nested loop avoids exceeding stack/L1 capacity. 
530 | Ticks repetitions[Params::kTimerSamples]; 531 | for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) { 532 | Ticks samples[Params::kTimerSamples]; 533 | for (size_t i = 0; i < Params::kTimerSamples; ++i) { 534 | const Ticks t0 = timer::Start32(); 535 | const Ticks t1 = timer::Stop32(); 536 | samples[i] = t1 - t0; 537 | } 538 | repetitions[rep] = robust_statistics::Mode(samples); 539 | } 540 | return robust_statistics::Mode(repetitions); 541 | } 542 | 543 | static const Ticks timer_resolution = TimerResolution(); 544 | 545 | // Estimates the expected value of "lambda" values with a variable number of 546 | // samples until the variability "rel_mad" is less than "max_rel_mad". 547 | template 548 | Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad, 549 | const Params& p, const Lambda& lambda) { 550 | // Choose initial samples_per_eval based on a single estimated duration. 551 | Ticks t0 = timer::Start32(); 552 | lambda(); 553 | Ticks t1 = timer::Stop32(); 554 | Ticks est = t1 - t0; 555 | static const double ticks_per_second = platform::InvariantTicksPerSecond(); 556 | const size_t ticks_per_eval = 557 | static_cast(ticks_per_second * p.seconds_per_eval); 558 | size_t samples_per_eval = ticks_per_eval / est; 559 | samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval); 560 | 561 | std::vector samples; 562 | samples.reserve(1 + samples_per_eval); 563 | samples.push_back(est); 564 | 565 | // Percentage is too strict for tiny differences, so also allow a small 566 | // absolute "median absolute deviation". 
567 | const Ticks max_abs_mad = (timer_resolution + 99) / 100; 568 | *rel_mad = 0.0; // ensure initialized 569 | 570 | for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) { 571 | samples.reserve(samples.size() + samples_per_eval); 572 | for (size_t i = 0; i < samples_per_eval; ++i) { 573 | t0 = timer::Start32(); 574 | lambda(); 575 | t1 = timer::Stop32(); 576 | samples.push_back(t1 - t0); 577 | } 578 | 579 | if (samples.size() >= p.min_mode_samples) { 580 | est = robust_statistics::Mode(samples.data(), samples.size()); 581 | } else { 582 | // For "few" (depends also on the variance) samples, Median is safer. 583 | est = robust_statistics::Median(samples.data(), samples.size()); 584 | } 585 | NANOBENCHMARK_CHECK(est != 0); 586 | 587 | // Median absolute deviation (mad) is a robust measure of 'variability'. 588 | const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation( 589 | samples.data(), samples.size(), est); 590 | *rel_mad = static_cast(int(abs_mad)) / est; 591 | 592 | if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) { 593 | if (p.verbose) { 594 | printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n", 595 | samples.size(), est, abs_mad, *rel_mad * 100.0); 596 | } 597 | return est; 598 | } 599 | } 600 | 601 | if (p.verbose) { 602 | printf( 603 | "WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n", 604 | *rel_mad * 100.0, max_rel_mad * 100.0, samples.size()); 605 | } 606 | return est; 607 | } 608 | 609 | using InputVec = std::vector; 610 | 611 | // Returns vector of unique input values. 612 | InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) { 613 | InputVec unique(inputs, inputs + num_inputs); 614 | std::sort(unique.begin(), unique.end()); 615 | unique.erase(std::unique(unique.begin(), unique.end()), unique.end()); 616 | return unique; 617 | } 618 | 619 | // Returns how often we need to call func for sufficient precision, or zero 620 | // on failure (e.g. 
the elapsed time is too long for a 32-bit tick count). 621 | size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique, 622 | const Params& p) { 623 | // Min elapsed ticks for any input. 624 | Ticks min_duration = ~0u; 625 | 626 | for (const FuncInput input : unique) { 627 | // Make sure a 32-bit timer is sufficient. 628 | const uint64_t t0 = timer::Start64(); 629 | PreventElision(func(arg, input)); 630 | const uint64_t t1 = timer::Stop64(); 631 | const uint64_t elapsed = t1 - t0; 632 | if (elapsed >= (1ULL << 30)) { 633 | fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n", 634 | input); 635 | return 0; 636 | } 637 | 638 | double rel_mad; 639 | const Ticks total = SampleUntilStable( 640 | p.target_rel_mad, &rel_mad, p, 641 | [func, arg, input]() { PreventElision(func(arg, input)); }); 642 | min_duration = std::min(min_duration, total - timer_resolution); 643 | } 644 | 645 | // Number of repetitions required to reach the target resolution. 646 | const size_t max_skip = p.precision_divisor; 647 | // Number of repetitions given the estimated duration. 648 | const size_t num_skip = 649 | min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration; 650 | if (p.verbose) { 651 | printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution, 652 | max_skip, min_duration, num_skip); 653 | } 654 | return num_skip; 655 | } 656 | 657 | // Replicates inputs until we can omit "num_skip" occurrences of an input. 
658 | InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs, 659 | const size_t num_unique, const size_t num_skip, 660 | const Params& p) { 661 | InputVec full; 662 | if (num_unique == 1) { 663 | full.assign(p.subset_ratio * num_skip, inputs[0]); 664 | return full; 665 | } 666 | 667 | full.reserve(p.subset_ratio * num_skip * num_inputs); 668 | for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) { 669 | full.insert(full.end(), inputs, inputs + num_inputs); 670 | } 671 | randen::Randen rng; 672 | std::shuffle(full.begin(), full.end(), rng); 673 | return full; 674 | } 675 | 676 | // Copies the "full" to "subset" in the same order, but with "num_skip" 677 | // randomly selected occurrences of "input_to_skip" removed. 678 | void FillSubset(const InputVec& full, const FuncInput input_to_skip, 679 | const size_t num_skip, InputVec* subset) { 680 | const size_t count = std::count(full.begin(), full.end(), input_to_skip); 681 | // Generate num_skip random indices: which occurrence to skip. 682 | std::vector omit(count); 683 | std::iota(omit.begin(), omit.end(), 0); 684 | // omit[] is the same on every call, but that's OK because they identify the 685 | // Nth instance of input_to_skip, so the position within full[] differs. 
686 | randen::Randen rng; 687 | std::shuffle(omit.begin(), omit.end(), rng); 688 | omit.resize(num_skip); 689 | std::sort(omit.begin(), omit.end()); 690 | 691 | uint32_t occurrence = ~0u; // 0 after preincrement 692 | size_t idx_omit = 0; // cursor within omit[] 693 | size_t idx_subset = 0; // cursor within *subset 694 | for (const FuncInput next : full) { 695 | if (next == input_to_skip) { 696 | ++occurrence; 697 | // Haven't removed enough already 698 | if (idx_omit < num_skip) { 699 | // This one is up for removal 700 | if (occurrence == omit[idx_omit]) { 701 | ++idx_omit; 702 | continue; 703 | } 704 | } 705 | } 706 | if (idx_subset < subset->size()) { 707 | (*subset)[idx_subset++] = next; 708 | } 709 | } 710 | NANOBENCHMARK_CHECK(idx_subset == subset->size()); 711 | NANOBENCHMARK_CHECK(idx_omit == omit.size()); 712 | NANOBENCHMARK_CHECK(occurrence == count - 1); 713 | } 714 | 715 | // Returns total ticks elapsed for all inputs. 716 | Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs, 717 | const Params& p, double* max_rel_mad) { 718 | double rel_mad; 719 | const Ticks duration = 720 | SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() { 721 | for (const FuncInput input : *inputs) { 722 | PreventElision(func(arg, input)); 723 | } 724 | }); 725 | *max_rel_mad = std::max(*max_rel_mad, rel_mad); 726 | return duration; 727 | } 728 | 729 | // (Nearly) empty Func for measuring timer overhead/resolution. 730 | NB_NOINLINE FuncOutput EmptyFunc(const void* arg, const FuncInput input) { 731 | return input; 732 | } 733 | 734 | // Returns overhead of accessing inputs[] and calling a function; this will 735 | // be deducted from future TotalDuration return values. 736 | Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) { 737 | double rel_mad; 738 | // Zero tolerance because repeatability is crucial and EmptyFunc is fast. 
739 | return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() { 740 | for (const FuncInput input : *inputs) { 741 | PreventElision(EmptyFunc(arg, input)); 742 | } 743 | }); 744 | } 745 | 746 | } // namespace 747 | 748 | size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, 749 | const size_t num_inputs, Result* results, const Params& p) { 750 | NANOBENCHMARK_CHECK(num_inputs != 0); 751 | const InputVec& unique = UniqueInputs(inputs, num_inputs); 752 | 753 | const size_t num_skip = NumSkip(func, arg, unique, p); // never 0 754 | if (num_skip == 0) return 0; // NumSkip already printed error message 755 | const float mul = 1.0f / static_cast(num_skip); 756 | 757 | const InputVec& full = 758 | ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p); 759 | InputVec subset(full.size() - num_skip); 760 | 761 | const Ticks overhead = Overhead(arg, &full, p); 762 | const Ticks overhead_skip = Overhead(arg, &subset, p); 763 | if (overhead < overhead_skip) { 764 | fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead, 765 | overhead_skip); 766 | return 0; 767 | } 768 | 769 | if (p.verbose) { 770 | printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(), 771 | overhead, overhead_skip); 772 | } 773 | 774 | double max_rel_mad = 0.0; 775 | const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad); 776 | 777 | for (size_t i = 0; i < unique.size(); ++i) { 778 | FillSubset(full, unique[i], num_skip, &subset); 779 | const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad); 780 | 781 | if (total < total_skip) { 782 | fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip); 783 | return 0; 784 | } 785 | 786 | const Ticks duration = (total - overhead) - (total_skip - overhead_skip); 787 | results[i].input = unique[i]; 788 | results[i].ticks = duration * mul; 789 | results[i].variability = static_cast(max_rel_mad); 790 | } 791 | 792 | return unique.size(); 793 | } 794 | 795 | 
} // namespace randen 796 | -------------------------------------------------------------------------------- /nanobenchmark.h: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef NANOBENCHMARK_H_ 16 | #define NANOBENCHMARK_H_ 17 | 18 | // Benchmarks functions of a single integer argument with realistic branch 19 | // prediction hit rates. Uses a robust estimator to summarize the measurements. 20 | // The precision is about 0.2%. 21 | // 22 | // Examples: see nanobenchmark_test.cc. 23 | // 24 | // Background: Microbenchmarks such as http://github.com/google/benchmark 25 | // can measure elapsed times on the order of a microsecond. Shorter functions 26 | // are typically measured by repeating them thousands of times and dividing 27 | // the total elapsed time by this count. Unfortunately, repetition (especially 28 | // with the same input parameter!) influences the runtime. In time-critical 29 | // code, it is reasonable to expect warm instruction/data caches and TLBs, 30 | // but a perfect record of which branches will be taken is unrealistic. 
31 | // Unless the application also repeatedly invokes the measured function with 32 | // the same parameter, the benchmark is measuring something very different - 33 | // a best-case result, almost as if the parameter were made a compile-time 34 | // constant. This may lead to erroneous conclusions about branch-heavy 35 | // algorithms outperforming branch-free alternatives. 36 | // 37 | // Our approach differs in three ways. Adding fences to the timer functions 38 | // reduces variability due to instruction reordering, improving the timer 39 | // resolution to about 40 CPU cycles. However, shorter functions must still 40 | // be invoked repeatedly. For more realistic branch prediction performance, 41 | // we vary the input parameter according to a user-specified distribution. 42 | // Thus, instead of VaryInputs(Measure(Repeat(func))), we change the 43 | // loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the 44 | // central tendency of the measurement samples with the "half sample mode", 45 | // which is more robust to outliers and skewed data than the mean or median. 46 | 47 | // WARNING if included from multiple translation units compiled with distinct 48 | // flags: this header requires textual inclusion and a predefined NB_NAMESPACE 49 | // macro that is unique to the current compile flags. We must also avoid 50 | // standard library headers such as vector and functional that define functions. 51 | 52 | #include 53 | #include 54 | 55 | namespace randen { 56 | 57 | namespace platform { 58 | 59 | // Ensures the thread is running on the specified cpu, and no others. 60 | // Reduces caused by desynchronized socket RDTSC and context switches. 61 | // If "cpu" is negative, pin to the currently running core. 62 | void PinThreadToCPU(const int cpu = -1); 63 | 64 | // Returns tick rate, useful for converting measurements to seconds. Invariant 65 | // means the tick counter frequency is independent of CPU throttling or sleep. 
66 | // This call may be expensive, callers should cache the result. 67 | double InvariantTicksPerSecond(); 68 | 69 | } // namespace platform 70 | 71 | // Input influencing the function being measured (e.g. number of bytes to copy). 72 | using FuncInput = size_t; 73 | 74 | // "Proof of work" returned by Func to ensure the compiler does not elide it. 75 | using FuncOutput = uint64_t; 76 | 77 | // Function to measure: either 1) a captureless lambda or function with two 78 | // arguments or 2) a lambda with capture, in which case the first argument 79 | // is reserved for use by MeasureClosure. 80 | using Func = FuncOutput (*)(const void*, FuncInput); 81 | 82 | // Internal parameters that determine precision/resolution/measuring time. 83 | struct Params { 84 | // For measuring timer overhead/resolution. Used in a nested loop => 85 | // quadratic time, acceptable because we know timer overhead is "low". 86 | // constexpr because this is used to define array bounds. 87 | static constexpr size_t kTimerSamples = 256; 88 | 89 | // Best-case precision, expressed as a divisor of the timer resolution. 90 | // Larger => more calls to Func and higher precision. 91 | size_t precision_divisor = 1024; 92 | 93 | // Ratio between full and subset input distribution sizes. Cannot be less 94 | // than 2; larger values increase measurement time but more faithfully 95 | // model the given input distribution. 96 | size_t subset_ratio = 2; 97 | 98 | // Together with the estimated Func duration, determines how many times to 99 | // call Func before checking the sample variability. Larger values increase 100 | // measurement time, memory/cache use and precision. 101 | double seconds_per_eval = 4E-3; 102 | 103 | // The minimum number of samples before estimating the central tendency. 
104 | size_t min_samples_per_eval = 7; 105 | 106 | // The mode is better than median for estimating the central tendency of 107 | // skewed/fat-tailed distributions, but it requires sufficient samples 108 | // relative to the width of half-ranges. 109 | size_t min_mode_samples = 64; 110 | 111 | // Maximum permissible variability (= median absolute deviation / center). 112 | double target_rel_mad = 0.002; 113 | 114 | // Abort after this many evals without reaching target_rel_mad. This 115 | // prevents infinite loops. 116 | size_t max_evals = 9; 117 | 118 | // Whether to print additional statistics to stdout. 119 | bool verbose = true; 120 | }; 121 | 122 | // Measurement result for each unique input. 123 | struct Result { 124 | FuncInput input; 125 | 126 | // Robust estimate (mode or median) of duration. 127 | float ticks; 128 | 129 | // Measure of variability (median absolute deviation relative to "ticks"). 130 | float variability; 131 | }; 132 | 133 | // Precisely measures the number of ticks elapsed when calling "func" with the 134 | // given inputs, shuffled to ensure realistic branch prediction hit rates. 135 | // 136 | // "func" returns a 'proof of work' to ensure its computations are not elided. 137 | // "arg" is passed to Func, or reserved for internal use by MeasureClosure. 138 | // "inputs" is an array of "num_inputs" (not necessarily unique) arguments to 139 | // "func". The values should be chosen to maximize coverage of "func". This 140 | // represents a distribution, so a value's frequency should reflect its 141 | // probability in the real application. Order does not matter; for example, a 142 | // uniform distribution over [0, 4) could be represented as {3,0,2,1}. 143 | // Returns how many Result were written to "results": one per unique input, or 144 | // zero if the measurement failed (an error message goes to stderr). 
145 | size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, 146 | const size_t num_inputs, Result* results, 147 | const Params& p = Params()); 148 | 149 | // Per-copt namespace prevents leaking generated code into other modules. 150 | namespace NB_NAMESPACE { 151 | 152 | // Calls operator() of the given closure (lambda function). 153 | template 154 | static FuncOutput CallClosure(const Closure* f, const FuncInput input) { 155 | return (*f)(input); 156 | } 157 | 158 | } // namespace NB_NAMESPACE 159 | 160 | // Same as Measure, except "closure" is typically a lambda function of 161 | // FuncInput -> FuncOutput with a capture list. 162 | template 163 | static inline size_t MeasureClosure(const Closure& closure, 164 | const FuncInput* inputs, 165 | const size_t num_inputs, Result* results, 166 | const Params& p = Params()) { 167 | return Measure(reinterpret_cast(&NB_NAMESPACE::CallClosure), 168 | reinterpret_cast(&closure), inputs, num_inputs, 169 | results, p); 170 | } 171 | 172 | } // namespace randen 173 | 174 | #endif // NANOBENCHMARK_H_ 175 | -------------------------------------------------------------------------------- /nanobenchmark_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | #include // sleep 17 | 18 | #include "nanobenchmark.h" 19 | #include "randen.h" 20 | #include "util.h" 21 | #include "vector128.h" 22 | 23 | namespace randen { 24 | namespace { 25 | 26 | uint64_t AES(const void*, const FuncInput num_rounds) { 27 | // Ensures multiple invocations are serially dependent, otherwise we're 28 | // measuring the throughput rather than latency. 29 | static V prev; 30 | V m = prev; 31 | for (size_t i = 0; i < num_rounds; ++i) { 32 | m = AES(m, m); 33 | } 34 | prev = m; 35 | alignas(16) uint64_t lanes[2]; 36 | Store(m, lanes, 0); 37 | return lanes[0]; 38 | } 39 | 40 | template 41 | void MeasureAES(const FuncInput (&inputs)[N]) { 42 | Result results[N]; 43 | Params params; 44 | params.max_evals = 4; // avoid test timeout 45 | const size_t num_results = Measure(&AES, nullptr, inputs, N, results, params); 46 | for (size_t i = 0; i < num_results; ++i) { 47 | printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input, 48 | results[i].ticks, results[i].variability * 100.0); 49 | } 50 | } 51 | 52 | uint64_t Div(const void*, FuncInput in) { 53 | // Here we're measuring the throughput because benchmark invocations are 54 | // independent. 55 | const int64_t d1 = 0xFFFFFFFFFFll / int64_t(in); // IDIV 56 | return d1; 57 | } 58 | 59 | template 60 | void MeasureDiv(const FuncInput (&inputs)[N]) { 61 | Result results[N]; 62 | Params params; 63 | params.max_evals = 4; // avoid test timeout 64 | const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params); 65 | for (size_t i = 0; i < num_results; ++i) { 66 | printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input, 67 | results[i].ticks, results[i].variability * 100.0); 68 | } 69 | } 70 | 71 | Randen rng; 72 | 73 | // A function whose runtime depends on rng. 74 | uint64_t Random(const void* arg, FuncInput in) { 75 | const uint32_t r = rng() & 0xF; 76 | return AES(arg, r * r); 77 | } 78 | 79 | // Ensure the measured variability is high. 
80 | template 81 | void MeasureRandom(const FuncInput (&inputs)[N]) { 82 | Result results[N]; 83 | Params p; 84 | p.max_evals = 4; // avoid test timeout 85 | p.verbose = false; 86 | const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p); 87 | for (size_t i = 0; i < num_results; ++i) { 88 | RANDEN_CHECK(results[i].variability > 1E-3); 89 | } 90 | } 91 | 92 | template 93 | void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) { 94 | printf("Expect a 'measurement failed' below:\n"); 95 | Result results[N]; 96 | const size_t num_results = MeasureClosure( 97 | [](const FuncInput input) { 98 | // Loop until the sleep succeeds (not interrupted by signal). We assume 99 | // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit. 100 | while (sleep(2) != 0) { 101 | } 102 | return input; 103 | }, 104 | inputs, N, results); 105 | RANDEN_CHECK(num_results == 0); 106 | } 107 | 108 | void RunAll(const int argc, char* argv[]) { 109 | // Avoid migrating between cores - important on multi-socket systems. 110 | int cpu = -1; 111 | if (argc == 2) { 112 | cpu = strtol(argv[1], nullptr, 10); 113 | } 114 | platform::PinThreadToCPU(cpu); 115 | 116 | // unpredictable == 1 but the compiler doesn't know that. 117 | const int unpredictable = argc != 999; 118 | static const FuncInput inputs[] = {static_cast(unpredictable) + 2, 119 | static_cast(unpredictable + 9)}; 120 | 121 | MeasureAES(inputs); 122 | MeasureDiv(inputs); 123 | MeasureRandom(inputs); 124 | EnsureLongMeasurementFails(inputs); 125 | } 126 | 127 | } // namespace 128 | } // namespace randen 129 | 130 | int main(int argc, char* argv[]) { 131 | randen::RunAll(argc, argv); 132 | return 0; 133 | } 134 | -------------------------------------------------------------------------------- /randen.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "randen.h" 16 | 17 | #include // memcpy 18 | 19 | #include "vector128.h" 20 | 21 | namespace randen { 22 | namespace { 23 | 24 | // High-level summary: 25 | // 1) Reverie (see "A Robust and Sponge-Like PRNG with Improved Efficiency") is 26 | // a sponge-like random generator that requires a cryptographic permutation. 27 | // It improves upon "Provably Robust Sponge-Based PRNGs and KDFs" by 28 | // achieving backtracking resistance with only one Permute() per buffer. 29 | // 30 | // 2) "Simpira v2: A Family of Efficient Permutations Using the AES Round 31 | // Function" constructs up to 1024-bit permutations using an improved 32 | // Generalized Feistel network with 2-round AES-128 functions. This Feistel 33 | // block shuffle achieves diffusion faster and is less vulnerable to 34 | // sliced-biclique attacks than the Type-2 cyclic shuffle. 35 | // 36 | // 3) "Improving the Generalized Feistel" and "New criterion for diffusion 37 | // property" extends the same kind of improved Feistel block shuffle to 16 38 | // branches, which enables a 2048-bit permutation. 39 | // 40 | // We combine these three ideas and also change Simpira's subround keys from 41 | // structured/low-entropy counters to digits of Pi. 42 | 43 | // Largest size for which security proofs are known. 
// Number of 128-bit branches in the generalized Feistel network. Together
// with 16-byte blocks this matches Internal::kStateBytes (256 bytes).
constexpr int kFeistelBlocks = 16;

// Type-2 generalized Feistel => one round function for every two blocks.
constexpr int kFeistelFunctions = kFeistelBlocks / 2;  // = 8

// Ensures SPRP security and two full subblock diffusions.
constexpr int kFeistelRounds = 16 + 1;  // > 4 * log2(kFeistelBlocks)

// Independent keys (272 = 2.1 KiB) for the first AES subround of each function.
constexpr int kKeys = kFeistelRounds * kFeistelFunctions;

// Returns a pointer to the kKeys round keys (kKeys * kLanes u64 in total;
// the table below holds 136 pairs, i.e. one 128-bit key per Feistel round
// and round function). The constants are fixed, so outputs are reproducible.
const uint64_t* RANDEN_RESTRICT Keys() {
  // "Nothing up my sleeve" numbers from the first hex digits of Pi, obtained
  // from http://hexpi.sourceforge.net/. Native byte order.
  alignas(32) static constexpr uint64_t pi_digits[kKeys * kLanes] = {
      RANDEN_LE(0x243F6A8885A308D3ull, 0x13198A2E03707344ull),
      RANDEN_LE(0xA4093822299F31D0ull, 0x082EFA98EC4E6C89ull),
      RANDEN_LE(0x452821E638D01377ull, 0xBE5466CF34E90C6Cull),
      RANDEN_LE(0xC0AC29B7C97C50DDull, 0x3F84D5B5B5470917ull),
      RANDEN_LE(0x9216D5D98979FB1Bull, 0xD1310BA698DFB5ACull),
      RANDEN_LE(0x2FFD72DBD01ADFB7ull, 0xB8E1AFED6A267E96ull),
      RANDEN_LE(0xBA7C9045F12C7F99ull, 0x24A19947B3916CF7ull),
      RANDEN_LE(0x0801F2E2858EFC16ull, 0x636920D871574E69ull),
      RANDEN_LE(0xA458FEA3F4933D7Eull, 0x0D95748F728EB658ull),
      RANDEN_LE(0x718BCD5882154AEEull, 0x7B54A41DC25A59B5ull),
      RANDEN_LE(0x9C30D5392AF26013ull, 0xC5D1B023286085F0ull),
      RANDEN_LE(0xCA417918B8DB38EFull, 0x8E79DCB0603A180Eull),
      RANDEN_LE(0x6C9E0E8BB01E8A3Eull, 0xD71577C1BD314B27ull),
      RANDEN_LE(0x78AF2FDA55605C60ull, 0xE65525F3AA55AB94ull),
      RANDEN_LE(0x5748986263E81440ull, 0x55CA396A2AAB10B6ull),
      RANDEN_LE(0xB4CC5C341141E8CEull, 0xA15486AF7C72E993ull),
      RANDEN_LE(0xB3EE1411636FBC2Aull, 0x2BA9C55D741831F6ull),
      RANDEN_LE(0xCE5C3E169B87931Eull, 0xAFD6BA336C24CF5Cull),
      RANDEN_LE(0x7A32538128958677ull, 0x3B8F48986B4BB9AFull),
      RANDEN_LE(0xC4BFE81B66282193ull, 0x61D809CCFB21A991ull),
      RANDEN_LE(0x487CAC605DEC8032ull, 0xEF845D5DE98575B1ull),
      RANDEN_LE(0xDC262302EB651B88ull, 0x23893E81D396ACC5ull),
      RANDEN_LE(0x0F6D6FF383F44239ull, 0x2E0B4482A4842004ull),
      RANDEN_LE(0x69C8F04A9E1F9B5Eull, 0x21C66842F6E96C9Aull),
      RANDEN_LE(0x670C9C61ABD388F0ull, 0x6A51A0D2D8542F68ull),
      RANDEN_LE(0x960FA728AB5133A3ull, 0x6EEF0B6C137A3BE4ull),
      RANDEN_LE(0xBA3BF0507EFB2A98ull, 0xA1F1651D39AF0176ull),
      RANDEN_LE(0x66CA593E82430E88ull, 0x8CEE8619456F9FB4ull),
      RANDEN_LE(0x7D84A5C33B8B5EBEull, 0xE06F75D885C12073ull),
      RANDEN_LE(0x401A449F56C16AA6ull, 0x4ED3AA62363F7706ull),
      RANDEN_LE(0x1BFEDF72429B023Dull, 0x37D0D724D00A1248ull),
      RANDEN_LE(0xDB0FEAD349F1C09Bull, 0x075372C980991B7Bull),
      RANDEN_LE(0x25D479D8F6E8DEF7ull, 0xE3FE501AB6794C3Bull),
      RANDEN_LE(0x976CE0BD04C006BAull, 0xC1A94FB6409F60C4ull),
      RANDEN_LE(0x5E5C9EC2196A2463ull, 0x68FB6FAF3E6C53B5ull),
      RANDEN_LE(0x1339B2EB3B52EC6Full, 0x6DFC511F9B30952Cull),
      RANDEN_LE(0xCC814544AF5EBD09ull, 0xBEE3D004DE334AFDull),
      RANDEN_LE(0x660F2807192E4BB3ull, 0xC0CBA85745C8740Full),
      RANDEN_LE(0xD20B5F39B9D3FBDBull, 0x5579C0BD1A60320Aull),
      RANDEN_LE(0xD6A100C6402C7279ull, 0x679F25FEFB1FA3CCull),
      RANDEN_LE(0x8EA5E9F8DB3222F8ull, 0x3C7516DFFD616B15ull),
      RANDEN_LE(0x2F501EC8AD0552ABull, 0x323DB5FAFD238760ull),
      RANDEN_LE(0x53317B483E00DF82ull, 0x9E5C57BBCA6F8CA0ull),
      RANDEN_LE(0x1A87562EDF1769DBull, 0xD542A8F6287EFFC3ull),
      RANDEN_LE(0xAC6732C68C4F5573ull, 0x695B27B0BBCA58C8ull),
      RANDEN_LE(0xE1FFA35DB8F011A0ull, 0x10FA3D98FD2183B8ull),
      RANDEN_LE(0x4AFCB56C2DD1D35Bull, 0x9A53E479B6F84565ull),
      RANDEN_LE(0xD28E49BC4BFB9790ull, 0xE1DDF2DAA4CB7E33ull),
      RANDEN_LE(0x62FB1341CEE4C6E8ull, 0xEF20CADA36774C01ull),
      RANDEN_LE(0xD07E9EFE2BF11FB4ull, 0x95DBDA4DAE909198ull),
      RANDEN_LE(0xEAAD8E716B93D5A0ull, 0xD08ED1D0AFC725E0ull),
      RANDEN_LE(0x8E3C5B2F8E7594B7ull, 0x8FF6E2FBF2122B64ull),
      RANDEN_LE(0x8888B812900DF01Cull, 0x4FAD5EA0688FC31Cull),
      RANDEN_LE(0xD1CFF191B3A8C1ADull, 0x2F2F2218BE0E1777ull),
      RANDEN_LE(0xEA752DFE8B021FA1ull, 0xE5A0CC0FB56F74E8ull),
      RANDEN_LE(0x18ACF3D6CE89E299ull, 0xB4A84FE0FD13E0B7ull),
      RANDEN_LE(0x7CC43B81D2ADA8D9ull, 0x165FA26680957705ull),
      RANDEN_LE(0x93CC7314211A1477ull, 0xE6AD206577B5FA86ull),
      RANDEN_LE(0xC75442F5FB9D35CFull, 0xEBCDAF0C7B3E89A0ull),
      RANDEN_LE(0xD6411BD3AE1E7E49ull, 0x00250E2D2071B35Eull),
      RANDEN_LE(0x226800BB57B8E0AFull, 0x2464369BF009B91Eull),
      RANDEN_LE(0x5563911D59DFA6AAull, 0x78C14389D95A537Full),
      RANDEN_LE(0x207D5BA202E5B9C5ull, 0x832603766295CFA9ull),
      RANDEN_LE(0x11C819684E734A41ull, 0xB3472DCA7B14A94Aull),
      RANDEN_LE(0x1B5100529A532915ull, 0xD60F573FBC9BC6E4ull),
      RANDEN_LE(0x2B60A47681E67400ull, 0x08BA6FB5571BE91Full),
      RANDEN_LE(0xF296EC6B2A0DD915ull, 0xB6636521E7B9F9B6ull),
      RANDEN_LE(0xFF34052EC5855664ull, 0x53B02D5DA99F8FA1ull),
      RANDEN_LE(0x08BA47996E85076Aull, 0x4B7A70E9B5B32944ull),
      RANDEN_LE(0xDB75092EC4192623ull, 0xAD6EA6B049A7DF7Dull),
      RANDEN_LE(0x9CEE60B88FEDB266ull, 0xECAA8C71699A18FFull),
      RANDEN_LE(0x5664526CC2B19EE1ull, 0x193602A575094C29ull),
      RANDEN_LE(0xA0591340E4183A3Eull, 0x3F54989A5B429D65ull),
      RANDEN_LE(0x6B8FE4D699F73FD6ull, 0xA1D29C07EFE830F5ull),
      RANDEN_LE(0x4D2D38E6F0255DC1ull, 0x4CDD20868470EB26ull),
      RANDEN_LE(0x6382E9C6021ECC5Eull, 0x09686B3F3EBAEFC9ull),
      RANDEN_LE(0x3C9718146B6A70A1ull, 0x687F358452A0E286ull),
      RANDEN_LE(0xB79C5305AA500737ull, 0x3E07841C7FDEAE5Cull),
      RANDEN_LE(0x8E7D44EC5716F2B8ull, 0xB03ADA37F0500C0Dull),
      RANDEN_LE(0xF01C1F040200B3FFull, 0xAE0CF51A3CB574B2ull),
      RANDEN_LE(0x25837A58DC0921BDull, 0xD19113F97CA92FF6ull),
      RANDEN_LE(0x9432477322F54701ull, 0x3AE5E58137C2DADCull),
      RANDEN_LE(0xC8B576349AF3DDA7ull, 0xA94461460FD0030Eull),
      RANDEN_LE(0xECC8C73EA4751E41ull, 0xE238CD993BEA0E2Full),
      RANDEN_LE(0x3280BBA1183EB331ull, 0x4E548B384F6DB908ull),
      RANDEN_LE(0x6F420D03F60A04BFull, 0x2CB8129024977C79ull),
      RANDEN_LE(0x5679B072BCAF89AFull, 0xDE9A771FD9930810ull),
      RANDEN_LE(0xB38BAE12DCCF3F2Eull, 0x5512721F2E6B7124ull),
      RANDEN_LE(0x501ADDE69F84CD87ull, 0x7A5847187408DA17ull),
      RANDEN_LE(0xBC9F9ABCE94B7D8Cull, 0xEC7AEC3ADB851DFAull),
      RANDEN_LE(0x63094366C464C3D2ull, 0xEF1C18473215D808ull),
      RANDEN_LE(0xDD433B3724C2BA16ull, 0x12A14D432A65C451ull),
      RANDEN_LE(0x50940002133AE4DDull, 0x71DFF89E10314E55ull),
      RANDEN_LE(0x81AC77D65F11199Bull, 0x043556F1D7A3C76Bull),
      RANDEN_LE(0x3C11183B5924A509ull, 0xF28FE6ED97F1FBFAull),
      RANDEN_LE(0x9EBABF2C1E153C6Eull, 0x86E34570EAE96FB1ull),
      RANDEN_LE(0x860E5E0A5A3E2AB3ull, 0x771FE71C4E3D06FAull),
      RANDEN_LE(0x2965DCB999E71D0Full, 0x803E89D65266C825ull),
      RANDEN_LE(0x2E4CC9789C10B36Aull, 0xC6150EBA94E2EA78ull),
      RANDEN_LE(0xA6FC3C531E0A2DF4ull, 0xF2F74EA7361D2B3Dull),
      RANDEN_LE(0x1939260F19C27960ull, 0x5223A708F71312B6ull),
      RANDEN_LE(0xEBADFE6EEAC31F66ull, 0xE3BC4595A67BC883ull),
      RANDEN_LE(0xB17F37D1018CFF28ull, 0xC332DDEFBE6C5AA5ull),
      RANDEN_LE(0x6558218568AB9702ull, 0xEECEA50FDB2F953Bull),
      RANDEN_LE(0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull),
      RANDEN_LE(0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull),
      RANDEN_LE(0x0334FE1EAA0363CFull, 0xB5735C904C70A239ull),
      RANDEN_LE(0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull),
      RANDEN_LE(0x9CAB5CABB2F3846Eull, 0x648B1EAF19BDF0CAull),
      RANDEN_LE(0xA02369B9655ABB50ull, 0x40685A323C2AB4B3ull),
      RANDEN_LE(0x319EE9D5C021B8F7ull, 0x9B540B19875FA099ull),
      RANDEN_LE(0x95F7997E623D7DA8ull, 0xF837889A97E32D77ull),
      RANDEN_LE(0x11ED935F16681281ull, 0x0E358829C7E61FD6ull),
      RANDEN_LE(0x96DEDFA17858BA99ull, 0x57F584A51B227263ull),
      RANDEN_LE(0x9B83C3FF1AC24696ull, 0xCDB30AEB532E3054ull),
      RANDEN_LE(0x8FD948E46DBC3128ull, 0x58EBF2EF34C6FFEAull),
      RANDEN_LE(0xFE28ED61EE7C3C73ull, 0x5D4A14D9E864B7E3ull),
      RANDEN_LE(0x42105D14203E13E0ull, 0x45EEE2B6A3AAABEAull),
      RANDEN_LE(0xDB6C4F15FACB4FD0ull, 0xC742F442EF6ABBB5ull),
      RANDEN_LE(0x654F3B1D41CD2105ull, 0xD81E799E86854DC7ull),
      RANDEN_LE(0xE44B476A3D816250ull, 0xCF62A1F25B8D2646ull),
      RANDEN_LE(0xFC8883A0C1C7B6A3ull, 0x7F1524C369CB7492ull),
      RANDEN_LE(0x47848A0B5692B285ull, 0x095BBF00AD19489Dull),
      RANDEN_LE(0x1462B17423820D00ull, 0x58428D2A0C55F5EAull),
      RANDEN_LE(0x1DADF43E233F7061ull, 0x3372F0928D937E41ull),
      RANDEN_LE(0xD65FECF16C223BDBull, 0x7CDE3759CBEE7460ull),
      RANDEN_LE(0x4085F2A7CE77326Eull, 0xA607808419F8509Eull),
      RANDEN_LE(0xE8EFD85561D99735ull, 0xA969A7AAC50C06C2ull),
      RANDEN_LE(0x5A04ABFC800BCADCull, 0x9E447A2EC3453484ull),
      RANDEN_LE(0xFDD567050E1E9EC9ull, 0xDB73DBD3105588CDull),
      RANDEN_LE(0x675FDA79E3674340ull, 0xC5C43465713E38D8ull),
      RANDEN_LE(0x3D28F89EF16DFF20ull, 0x153E21E78FB03D4Aull),
      RANDEN_LE(0xE6E39F2BDB83ADF7ull, 0xE93D5A68948140F7ull),
      RANDEN_LE(0xF64C261C94692934ull, 0x411520F77602D4F7ull),
      RANDEN_LE(0xBCF46B2ED4A10068ull, 0xD40824713320F46Aull),
      RANDEN_LE(0x43B7D4B7500061AFull, 0x1E39F62E97244546ull)};
  // Cheap guard against an accidentally truncated initializer list above.
  static_assert(pi_digits[kKeys * kLanes - 1] != 0, "Too few initializers");
  return pi_digits;
}

// Improved odd-even shuffle from "New criterion for diffusion property".
RANDEN_INLINE void BlockShuffle(uint64_t* RANDEN_RESTRICT state) {
  // First make a copy (optimized out).
  uint64_t source[kFeistelBlocks * kLanes];
  memcpy(source, state, sizeof(source));

  // Fixed permutation of the 16 branches, chosen for fast diffusion.
  constexpr int shuffle[kFeistelBlocks] = {7, 2, 13, 4, 11, 8, 3, 6,
                                           15, 0, 9, 10, 1, 14, 5, 12};
  for (int branch = 0; branch < kFeistelBlocks; ++branch) {
    const V v = Load(source, shuffle[branch]);
    Store(v, state, branch);
  }
}

// Cryptographic permutation based on a type-2 Generalized Feistel Network.
// Indistinguishable from ideal by chosen-ciphertext adversaries using less than
// 2^64 queries if the round function is a PRF. This is similar to the b=8 case
// of Simpira v2, but more efficient than its generic construction for b=16.
RANDEN_INLINE void Permute(uint64_t* RANDEN_RESTRICT state) {
  // Round keys for one AES per Feistel round and branch: first digits of Pi.
  const uint64_t* RANDEN_RESTRICT keys = Keys();

  // (Successfully unrolled; the first iteration jumps into the second half)
#ifdef __clang__
#pragma clang loop unroll_count(2)
#endif
  for (int round = 0; round < kFeistelRounds; ++round) {
    for (int branch = 0; branch < kFeistelBlocks; branch += 2) {
      const V even = Load(state, branch);
      const V odd = Load(state, branch + 1);
      // Feistel round function using two AES subrounds. Very similar to F()
      // from Simpira v2, but with independent subround keys. Uses 17 AES rounds
      // per 16 bytes (vs. 10 for AES-CTR). Computing eight round functions in
      // parallel hides the 7-cycle AESNI latency on HSW. Note that the Feistel
      // XORs are 'free' (included in the second AES instruction).
      const V f1 = AES(even, Load(keys, 0));
      keys += kLanes;  // advance to the next round function's key
      const V f2 = AES(f1, odd);
      Store(f2, state, branch + 1);
    }

    BlockShuffle(state);
  }
}

// Enables native loads in the round loop by pre-swapping.
245 | RANDEN_INLINE void SwapIfBigEndian(uint64_t* RANDEN_RESTRICT state) { 246 | #ifdef RANDEN_BIG_ENDIAN 247 | for (int branch = 0; branch < kFeistelBlocks; ++branch) { 248 | const V v = ReverseBytes(Load(state, branch)); 249 | Store(v, state, branch); 250 | } 251 | #endif 252 | } 253 | 254 | } // namespace 255 | 256 | void Internal::Absorb(const void* seed_void, void* state_void) { 257 | uint64_t* RANDEN_RESTRICT state = reinterpret_cast(state_void); 258 | const uint64_t* RANDEN_RESTRICT seed = 259 | reinterpret_cast(seed_void); 260 | 261 | constexpr int kCapacityBlocks = kCapacityBytes / sizeof(V); 262 | static_assert(kCapacityBlocks * sizeof(V) == kCapacityBytes, "Not i*V"); 263 | for (size_t i = kCapacityBlocks; i < kStateBytes / sizeof(V); ++i) { 264 | V block = Load(state, i); 265 | block ^= Load(seed, i - kCapacityBlocks); 266 | Store(block, state, i); 267 | } 268 | } 269 | 270 | void Internal::Generate(void* state_void) { 271 | uint64_t* RANDEN_RESTRICT state = reinterpret_cast(state_void); 272 | 273 | static_assert(kCapacityBytes == sizeof(V), "Capacity mismatch"); 274 | const V prev_inner = Load(state, 0); 275 | 276 | SwapIfBigEndian(state); 277 | 278 | Permute(state); 279 | 280 | SwapIfBigEndian(state); 281 | 282 | // Ensure backtracking resistance. 283 | V inner = Load(state, 0); 284 | inner ^= prev_inner; 285 | Store(inner, state, 0); 286 | } 287 | 288 | } // namespace randen 289 | -------------------------------------------------------------------------------- /randen.h: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // 'Strong' (indistinguishable from random, backtracking-resistant) random 16 | // generator, faster in some benchmarks than std::mt19937_64 and pcg64_c32. 17 | // Accompanying paper: https://arxiv.org/abs/1810.02227 18 | 19 | #ifndef RANDEN_H_ 20 | #define RANDEN_H_ 21 | 22 | #include 23 | #include // memcpy 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | 32 | // RANDen = RANDom generator or beetroots in Swiss German. 33 | namespace randen { 34 | 35 | struct Internal { 36 | static void Absorb(const void* seed, void* state); 37 | static void Generate(void* state); 38 | 39 | static constexpr int kStateBytes = 256; // 2048-bit 40 | 41 | // Size of the 'inner' (inaccessible) part of the sponge. Larger values would 42 | // require more frequent calls to Generate. 43 | static constexpr int kCapacityBytes = 16; // 128-bit 44 | }; 45 | 46 | // Deterministic pseudorandom byte generator with backtracking resistance 47 | // (leaking the state does not compromise prior outputs). Based on Reverie 48 | // (see "A Robust and Sponge-Like PRNG with Improved Efficiency") instantiated 49 | // with an improved Simpira-like permutation. 50 | // Returns values of type "T" (must be a built-in unsigned integer type). 
51 | template 52 | class alignas(32) Randen { 53 | static_assert(std::is_unsigned::value, 54 | "Randen must be parameterized by a built-in unsigned integer"); 55 | 56 | public: 57 | // C++11 URBG interface: 58 | using result_type = T; 59 | 60 | static constexpr result_type min() { 61 | return std::numeric_limits::min(); 62 | } 63 | 64 | static constexpr result_type max() { 65 | return std::numeric_limits::max(); 66 | } 67 | 68 | explicit Randen(result_type seed_value = 0) { seed(seed_value); } 69 | 70 | template ::value>::type> 73 | explicit Randen(SeedSequence&& seq) { 74 | seed(seq); 75 | } 76 | 77 | // Default copy and move operators. 78 | Randen(const Randen&) = default; 79 | Randen& operator=(const Randen&) = default; 80 | 81 | Randen(Randen&&) = default; 82 | Randen& operator=(Randen&&) = default; 83 | 84 | // Returns random bits from the buffer in units of T. 85 | result_type operator()() { 86 | // (Local copy ensures compiler knows this is not aliased.) 87 | size_t next = next_; 88 | 89 | // Refill the buffer if needed (unlikely). 90 | if (next >= kStateT) { 91 | Internal::Generate(state_); 92 | next = kCapacityT; 93 | } 94 | 95 | const result_type ret = state_[next]; 96 | next_ = next + 1; 97 | return ret; 98 | } 99 | 100 | template 101 | typename std::enable_if< 102 | !std::is_convertible::value, void>::type 103 | seed(SeedSequence& seq) { 104 | seed(); 105 | reseed(seq); 106 | } 107 | 108 | void seed(result_type seed_value = 0) { 109 | next_ = kStateT; 110 | std::fill(std::begin(state_), std::begin(state_) + kCapacityT, 0); 111 | std::fill(std::begin(state_) + kCapacityT, std::end(state_), seed_value); 112 | } 113 | 114 | // Inserts entropy into (part of) the state. Calling this periodically with 115 | // sufficient entropy ensures prediction resistance (attackers cannot predict 116 | // future outputs even if state is compromised). 
117 | template 118 | void reseed(SeedSequence& seq) { 119 | using U32 = typename SeedSequence::result_type; 120 | constexpr int kRate32 = 121 | (Internal::kStateBytes - Internal::kCapacityBytes) / sizeof(U32); 122 | U32 buffer[kRate32]; 123 | seq.generate(buffer, buffer + kRate32); 124 | Internal::Absorb(buffer, state_); 125 | next_ = kStateT; // Generate will be called by operator() 126 | } 127 | 128 | void discard(unsigned long long count) { 129 | using ull_t = unsigned long long; 130 | const ull_t remaining = kStateT - next_; 131 | if (count <= remaining) { 132 | next_ += count; 133 | return; 134 | } 135 | count -= remaining; 136 | 137 | const ull_t kRateT = kStateT - kCapacityT; 138 | while (count > kRateT) { 139 | Internal::Generate(state_); 140 | next_ = kCapacityT; 141 | count -= kRateT; 142 | } 143 | 144 | if (count != 0) { 145 | Internal::Generate(state_); 146 | next_ = kCapacityT + count; 147 | } 148 | } 149 | 150 | bool operator==(const Randen& other) const { 151 | return next_ == other.next_ && 152 | std::equal(std::begin(state_), std::end(state_), 153 | std::begin(other.state_)); 154 | } 155 | 156 | bool operator!=(const Randen& other) const { return !(*this == other); } 157 | 158 | template 159 | friend std::basic_ostream& operator<<( 160 | std::basic_ostream& os, // NOLINT(runtime/references) 161 | const Randen& engine) { // NOLINT(runtime/references) 162 | const auto flags = os.flags(std::ios_base::dec | std::ios_base::left); 163 | const auto fill = os.fill(os.widen(' ')); 164 | 165 | for (auto x : engine.state_) { 166 | os << x << os.fill(); 167 | } 168 | os << engine.next_; 169 | 170 | os.flags(flags); 171 | os.fill(fill); 172 | return os; 173 | } 174 | 175 | template 176 | friend std::basic_istream& operator>>( 177 | std::basic_istream& is, // NOLINT(runtime/references) 178 | Randen& engine) { // NOLINT(runtime/references) 179 | const auto flags = is.flags(std::ios_base::dec | std::ios_base::skipws); 180 | const auto fill = is.fill(is.widen(' 
')); 181 | 182 | T state[kStateT]; 183 | size_t next; 184 | for (auto& x : state) { 185 | is >> x; 186 | } 187 | is >> next; 188 | if (!is.fail()) { 189 | memcpy(engine.state_, state, sizeof(engine.state_)); 190 | engine.next_ = next; 191 | } 192 | is.flags(flags); 193 | is.fill(fill); 194 | return is; 195 | } 196 | 197 | private: 198 | static constexpr size_t kStateT = Internal::kStateBytes / sizeof(T); 199 | static constexpr size_t kCapacityT = Internal::kCapacityBytes / sizeof(T); 200 | 201 | // First kCapacityT are `inner', the others are accessible random bits. 202 | alignas(32) result_type state_[kStateT]; 203 | size_t next_; // index within state_ 204 | }; 205 | 206 | } // namespace randen 207 | 208 | #endif // RANDEN_H_ 209 | -------------------------------------------------------------------------------- /randen_benchmark.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Please disable Turbo Boost and CPU throttling! 16 | 17 | #include "randen.h" 18 | 19 | // std::uniform_*_distribution are slow due to division/log2; we provide 20 | // faster variants if this is 0. 21 | #define USE_STD_DISTRIBUTIONS 0 22 | 23 | // Which engines to benchmark. 
24 | #define ENABLE_RANDEN 1 25 | #define ENABLE_PCG 1 26 | #define ENABLE_MT 1 27 | #if defined(__SSE2__) && defined(__AES__) 28 | #define ENABLE_CHACHA 1 29 | #else 30 | #define ENABLE_CHACHA 0 31 | #endif 32 | #define ENABLE_OS 1 33 | 34 | #if ENABLE_PCG 35 | #include "third_party/pcg_random/include/pcg_random.hpp" 36 | #endif 37 | 38 | #if ENABLE_MT 39 | #include 40 | #endif 41 | 42 | #if ENABLE_CHACHA 43 | #include "engine_chacha.h" 44 | #endif 45 | 46 | #if ENABLE_OS 47 | #include "engine_os.h" 48 | #endif 49 | 50 | 51 | #ifdef _MSC_VER 52 | #include 53 | #endif 54 | #include 55 | #include 56 | #include // iota 57 | 58 | #include "nanobenchmark.h" 59 | #include "util.h" 60 | 61 | namespace randen { 62 | namespace { 63 | 64 | #if USE_STD_DISTRIBUTIONS 65 | using UniformInt = std::uniform_int_distribution; 66 | using UniformDouble = std::uniform_real_distribution; 67 | #else 68 | // These are subsets of std::uniform_*_distribution. 69 | 70 | class UniformInt { 71 | public: 72 | // (To support u64, add a Multiply overload and GetU64 as below.) 73 | using result_type = uint32_t; 74 | 75 | struct param_type { 76 | using distribution_type = UniformInt; 77 | 78 | param_type(const result_type begin, const result_type end) 79 | : begin(begin), end(end) {} 80 | 81 | // Half-open interval. 82 | result_type begin; 83 | result_type end; 84 | }; 85 | 86 | // Engine is a C++11 UniformRandomBitGenerator returning >= 32 bits. 87 | template 88 | result_type operator()(Engine& engine, const param_type param) const { 89 | using Bits = decltype(engine()); // == typename Engine::result_type 90 | static_assert(std::is_same::value || 91 | std::is_same::value, 92 | "Need u32 or u64"); 93 | 94 | // We assume range < pow(2, sizeof(decltype(engine()))*8). 95 | const result_type range = param.end - param.begin; 96 | 97 | // Division-free with high probability. Algorithm and variable names are 98 | // from https://arxiv.org/pdf/1805.10941.pdf. 
99 | result_type x = engine(); // (possibly a narrowing conversion from Bits) 100 | result_type hi, lo; 101 | Multiply(x, range, &hi, &lo); 102 | // Rejected, try again (unlikely for small ranges). 103 | if (lo < range) { 104 | const result_type t = Negate(range) % range; 105 | while (hi < t) { 106 | x = engine(); 107 | Multiply(x, range, &hi, &lo); 108 | } 109 | } 110 | 111 | return hi + param.begin; 112 | } 113 | 114 | private: 115 | static constexpr result_type Negate(result_type x) { 116 | return ~x + 1; // assumes two's complement. 117 | } 118 | 119 | static void Multiply(const uint32_t x, const uint32_t y, uint32_t* hi, 120 | uint32_t* lo) { 121 | const uint64_t wide = static_cast(x) * y; 122 | *hi = wide >> 32; 123 | *lo = static_cast(wide & 0xFFFFFFFFu); 124 | } 125 | }; 126 | 127 | class UniformDouble { 128 | public: 129 | // (Can also be float - we would just cast from double.) 130 | using result_type = double; 131 | 132 | // Engine is a C++11 UniformRandomBitGenerator returning either u32 or u64. 133 | template 134 | result_type operator()(Engine& engine) const { 135 | uint64_t bits = GetU64(decltype(engine())(), engine); 136 | if (bits == 0) return static_cast(0.0); 137 | const int leading_zeros = NumZeroBitsAboveMSBNonzero(bits); 138 | bits <<= leading_zeros; // shift out leading zeros 139 | bits >>= (64 - 53); // zero exponent 140 | const uint64_t exp = 1022 - leading_zeros; 141 | const uint64_t ieee = (exp << 52) | bits; 142 | double ret; 143 | memcpy(&ret, &ieee, sizeof(ret)); 144 | return static_cast(ret); 145 | } 146 | 147 | private: 148 | template 149 | static uint64_t GetU64(uint64_t, Engine& engine) { 150 | return engine(); 151 | } 152 | 153 | // Adapter for generating u64 from u32 engine. 
154 | template 155 | static uint64_t GetU64(uint32_t, Engine& engine) { 156 | uint64_t ret = engine(); 157 | ret <<= 32; 158 | ret |= engine(); 159 | return ret; 160 | } 161 | }; 162 | #endif // !USE_STD_DISTRIBUTIONS 163 | 164 | // Benchmark::Num64() is passed to its constructor and operator() after 165 | // multiplying with a (non-compile-time-constant) 1 to prevent constant folding. 166 | // It is also used to compute cycles per byte. 167 | 168 | // Microbenchmark: generates N numbers in a tight loop. 169 | struct BenchmarkLoop { 170 | // Large enough that we can ignore size % buffer size. 171 | static size_t Num64() { return 100000; } 172 | 173 | explicit BenchmarkLoop(const uint64_t num_64) {} 174 | 175 | template 176 | uint64_t operator()(const uint64_t num_64, Engine& engine) const { 177 | for (size_t i = 0; i < num_64 - 1; ++i) { 178 | (void)engine(); 179 | } 180 | return engine(); 181 | } 182 | }; 183 | 184 | // Real-world benchmark: shuffles a vector. 185 | class BenchmarkShuffle { 186 | public: 187 | static size_t Num64() { return 50000; } 188 | 189 | explicit BenchmarkShuffle(const uint64_t num_64) : ints_to_shuffle_(num_64) {} 190 | 191 | template 192 | uint64_t operator()(const uint64_t num_64, Engine& engine) const { 193 | ints_to_shuffle_[0] = static_cast(num_64 & 0xFFFF); 194 | #if USE_STD_DISTRIBUTIONS 195 | std::shuffle(ints_to_shuffle_.begin(), ints_to_shuffle_.end(), engine); 196 | #else 197 | // Similar algorithm, but UniformInt instead of std::u_i_d => 2-3x speedup. 198 | UniformInt dist; 199 | for (size_t i = num_64 - 1; i != 0; --i) { 200 | const UniformInt::param_type param(0, i); 201 | std::swap(ints_to_shuffle_[i], ints_to_shuffle_[dist(engine, param)]); 202 | } 203 | #endif 204 | return ints_to_shuffle_[0]; 205 | } 206 | 207 | private: 208 | mutable std::vector ints_to_shuffle_; 209 | }; 210 | 211 | // Reservoir sampling. 
212 | class BenchmarkSample { 213 | public: 214 | static size_t Num64() { return 50000; } 215 | 216 | explicit BenchmarkSample(const uint64_t num_64) 217 | : population_(num_64), chosen_(kNumChosen) { 218 | std::iota(population_.begin(), population_.end(), 0); 219 | } 220 | 221 | template 222 | uint64_t operator()(const uint64_t num_64, Engine& engine) const { 223 | // Can replace with std::sample after C++17. 224 | std::copy(population_.begin(), population_.begin() + kNumChosen, 225 | chosen_.begin()); 226 | UniformInt dist; 227 | for (size_t i = kNumChosen; i < num_64; ++i) { 228 | const UniformInt::param_type param(0, i); 229 | const size_t index = dist(engine, param); 230 | if (index < kNumChosen) { 231 | chosen_[index] = population_[i]; 232 | } 233 | } 234 | 235 | return chosen_.front(); 236 | } 237 | 238 | private: 239 | static constexpr size_t kNumChosen = 10000; 240 | 241 | std::vector population_; 242 | mutable std::vector chosen_; 243 | }; 244 | 245 | // Actual application: Monte Carlo estimation of Pi * 1E6. 
246 | class BenchmarkMonteCarlo { 247 | public: 248 | static size_t Num64() { return 200000; } 249 | 250 | explicit BenchmarkMonteCarlo(const uint64_t num_64) {} 251 | 252 | template 253 | uint64_t operator()(const uint64_t num_64, Engine& engine) const { 254 | int64_t in_circle = 0; 255 | for (size_t i = 0; i < num_64; i += 2) { 256 | const double x = dist_(engine); 257 | const double y = dist_(engine); 258 | in_circle += (x * x + y * y) < 1.0; 259 | } 260 | return 8 * 1000 * 1000 * in_circle / num_64; 261 | } 262 | 263 | private: 264 | mutable UniformDouble dist_; 265 | }; 266 | 267 | template 268 | void RunBenchmark(const char* caption, Engine& engine, const int unpredictable1, 269 | const Benchmark& benchmark) { 270 | printf("%8s: ", caption); 271 | const size_t kNumInputs = 1; 272 | const FuncInput inputs[kNumInputs] = { 273 | static_cast(Benchmark::Num64() * unpredictable1)}; 274 | Result results[kNumInputs]; 275 | 276 | Params p; 277 | p.verbose = false; 278 | #if defined(__powerpc__) 279 | p.max_evals = 7; 280 | #else 281 | p.max_evals = 8; 282 | #endif 283 | p.target_rel_mad = 0.002; 284 | const size_t num_results = MeasureClosure( 285 | [&benchmark, &engine](const FuncInput input) { 286 | return benchmark(input, engine); 287 | }, 288 | inputs, kNumInputs, results, p); 289 | RANDEN_CHECK(num_results == kNumInputs); 290 | for (size_t i = 0; i < num_results; ++i) { 291 | const double cycles_per_byte = 292 | results[i].ticks / (results[i].input * sizeof(uint64_t)); 293 | const double mad = results[i].variability * cycles_per_byte; 294 | printf("%6zu: %5.2f (+/- %5.3f)\n", results[i].input, cycles_per_byte, mad); 295 | } 296 | } 297 | 298 | // Calls RunBenchmark for each (enabled) engine. 299 | template 300 | void ForeachEngine(const int unpredictable1) { 301 | using T = uint64_t; // WARNING: keep in sync with MT/PCG. 
302 | 303 | const Benchmark benchmark( 304 | static_cast(Benchmark::Num64() * unpredictable1)); 305 | 306 | #if ENABLE_RANDEN 307 | Randen eng_randen; 308 | RunBenchmark("Randen", eng_randen, unpredictable1, benchmark); 309 | #endif 310 | 311 | #if ENABLE_PCG 312 | // Quoting from pcg_random.hpp: "the c variants offer better crypographic 313 | // security (just how good the cryptographic security is is an open 314 | // question)". 315 | pcg64_c32 eng_pcg; 316 | RunBenchmark("PCG", eng_pcg, unpredictable1, benchmark); 317 | #endif 318 | 319 | #if ENABLE_MT 320 | std::mt19937_64 eng_mt; 321 | RunBenchmark("MT", eng_mt, unpredictable1, benchmark); 322 | #endif 323 | 324 | 325 | #if ENABLE_CHACHA 326 | ChaCha eng_chacha(0x243f6a8885a308d3ull, 0x243F6A8885A308D3ull); 327 | RunBenchmark("ChaCha8", eng_chacha, unpredictable1, benchmark); 328 | #endif 329 | 330 | #if ENABLE_OS 331 | EngineOS eng_os; 332 | RunBenchmark("OS", eng_os, unpredictable1, benchmark); 333 | #endif 334 | 335 | printf("\n"); 336 | } 337 | 338 | void RunAll(int argc, char* argv[]) { 339 | // Immediately output any results (for non-local runs). 340 | setvbuf(stdout, nullptr, _IONBF, 0); 341 | 342 | printf("Config: enable std=%d\n", USE_STD_DISTRIBUTIONS); 343 | 344 | // Avoid migrating between cores - important on multi-socket systems. 345 | int cpu = -1; 346 | if (argc == 2) { 347 | cpu = strtol(argv[1], nullptr, 10); 348 | } 349 | platform::PinThreadToCPU(cpu); 350 | 351 | // Ensures the iteration counts are not compile-time constants. 
352 | const int unpredictable1 = argc != 999; 353 | 354 | ForeachEngine(unpredictable1); 355 | ForeachEngine(unpredictable1); 356 | ForeachEngine(unpredictable1); 357 | ForeachEngine(unpredictable1); 358 | } 359 | 360 | } // namespace 361 | } // namespace randen 362 | 363 | int main(int argc, char* argv[]) { 364 | randen::RunAll(argc, argv); 365 | return 0; 366 | } 367 | -------------------------------------------------------------------------------- /randen_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "randen.h" 16 | 17 | #include 18 | #include 19 | #include // seed_seq 20 | #include 21 | 22 | #define UPDATE_GOLDEN 0 23 | #define ENABLE_VERIFY 1 24 | #define ENABLE_DUMP 0 25 | 26 | namespace randen { 27 | namespace { 28 | 29 | #define STR(x) #x 30 | 31 | #define ASSERT_TRUE(condition) \ 32 | do { \ 33 | if (!(condition)) { \ 34 | printf("Assertion [" STR(condition) "] failed on line %d\n", __LINE__); \ 35 | abort(); \ 36 | } \ 37 | } while (false) 38 | 39 | using EngRanden = Randen; 40 | 41 | #if ENABLE_VERIFY 42 | 43 | void VerifyReseedChangesAllValues() { 44 | const size_t kNumOutputs = 127; 45 | EngRanden engine; 46 | 47 | std::seed_seq seq1{1, 2, 3, 4, 5, 6, 7}; 48 | engine.seed(seq1); 49 | uint64_t out1[kNumOutputs]; 50 | for (size_t i = 0; i < kNumOutputs; ++i) { 51 | out1[i] = engine(); 52 | } 53 | 54 | std::seed_seq seq2{127, 255, 511}; 55 | engine.seed(seq2); 56 | uint64_t out2[kNumOutputs]; 57 | engine.seed(seq2); 58 | 59 | for (size_t i = 0; i < kNumOutputs; ++i) { 60 | out2[i] = engine(); 61 | ASSERT_TRUE(out2[i] != out1[i]); 62 | } 63 | } 64 | 65 | void VerifyDiscard() { 66 | const int N = 56; // two buffer's worth 67 | for (int num_used = 0; num_used < N; ++num_used) { 68 | EngRanden engine_used; 69 | for (int i = 0; i < num_used; ++i) { 70 | (void)engine_used(); 71 | } 72 | 73 | for (int num_discard = 0; num_discard < N; ++num_discard) { 74 | EngRanden engine1 = engine_used; 75 | EngRanden engine2 = engine_used; 76 | for (int i = 0; i < num_discard; ++i) { 77 | (void)engine1(); 78 | } 79 | engine2.discard(num_discard); 80 | for (int i = 0; i < N; ++i) { 81 | const uint64_t r1 = engine1(); 82 | const uint64_t r2 = engine2(); 83 | ASSERT_TRUE(r1 == r2); 84 | } 85 | } 86 | } 87 | } 88 | 89 | void VerifyGolden() { 90 | // prime number => some buffer values unused. 
91 | const size_t kNumOutputs = 127; 92 | #if UPDATE_GOLDEN 93 | EngRanden engine; 94 | for (size_t i = 0; i < kNumOutputs; ++i) { 95 | printf("0x%016lx,\n", engine()); 96 | } 97 | printf("\n"); 98 | #else 99 | const uint64_t golden[kNumOutputs] = { 100 | 0xdda9f47cd90410ee, 0xc3c14f134e433977, 0xf0b780f545c72912, 101 | 0x887bf3087fd8ca10, 0x30ec63baff3c6d59, 0x15dbb1d37696599f, 102 | 0x02808a316f49a54c, 0xb29f73606f7f20a6, 0x9cbf605e3fd9de8a, 103 | 0x3b8feaf9d5c8e50e, 0xd8b2ffd356301ed5, 0xc970ae1a78183bbb, 104 | 0xcdfd8d76eb8f9a19, 0xf4b327fe0fc73c37, 0xd5af05dd3eff9556, 105 | 0xc3a506eb91420c9d, 0x7023920e0d6bfe8c, 0x48db1bb78f83c4a1, 106 | 0xed1ef4c26b87b840, 0x58d3575834956d42, 0x497cabf3431154fc, 107 | 0x8eef32a23e0b2df3, 0xd88b5749f090e5ea, 0x4e24370570029a8b, 108 | 0x78fcec2cbb6342f5, 0xc651a582a970692f, 0x352ee4ad1816afe3, 109 | 0x463cb745612f55db, 0x811ef0821c3de851, 0x026ff374c101da7e, 110 | 0xa0660379992d58fc, 0x6f7e616704c4fa59, 0x915f3445685da798, 111 | 0x04b0a374a3b795c7, 0x4663352533ce1882, 0x26802a8ac76571ce, 112 | 0x5588ba3a4d6e6c51, 0xb9fdefb4a24dc738, 0x607195a5e200f5fd, 113 | 0xa2101a42d35f1956, 0xe1e5e03c759c0709, 0x7e100308f3290764, 114 | 0xcbcf585399e432f1, 0x082572cc5da6606f, 0x0904469acbfee8f2, 115 | 0xe8a2be4f8335d8f1, 0x08e8a1f1a69da69a, 0xf08bd31b6daecd51, 116 | 0x2e9705bb053d6b46, 0x6542a20aad57bff5, 0x78e3a810213b6ffb, 117 | 0xda2fc9db0713c391, 0xc0932718cd55781f, 0xdc16a59cdd85f8a6, 118 | 0xb97289c1be0f2f9c, 0xb9bfb29c2b20bfe5, 0x5524bb834771435b, 119 | 0xc0a2a0e403a892d4, 0xff4af3ab8d1b78c5, 0x8265da3d39d1a750, 120 | 0x66e455f627495189, 0xf0ec5f424bcad77f, 0x3424e47dc22596e3, 121 | 0xc82d3120b57e3270, 0xc191c595afc4dcbf, 0xbc0c95129ccedcdd, 122 | 0x7f90650ea6cd6ab4, 0x120392bd2bb70939, 0xa7c8fac5a7917eb0, 123 | 0x7287491832695ad3, 0x7c1bf9839c7c1ce5, 0xd088cb9418be0361, 124 | 0x78565cdefd28c4ad, 0xe2e991fa58e1e79e, 0x2a9eac28b08c96bf, 125 | 0x7351b9fef98bafad, 0x13a685861bab87e0, 0x6c4f179696cb2225, 126 | 0x30537425cac70991, 
0x64c6de5aa0501971, 0x7e05e3aa8ec720dc, 127 | 0x01590d9dc6c532b7, 0x738184388f3bc1d2, 0x74a07d9c54e3e63f, 128 | 0x6bcdf185561f255f, 0x26ffdc5067be3acb, 0x171df81934f68604, 129 | 0xa0eaf2e1cf99b1c6, 0x5d1cb02075ba1cea, 0x7ea5a21665683e5a, 130 | 0xba6364eff80de02f, 0x957f38cbd2123fdf, 0x892d8317de82f7a2, 131 | 0x606e0a0e41d452ee, 0x4eb28826766fcf5b, 0xe707b1db50f7b43e, 132 | 0x6ee217df16527d78, 0x5a362d56e80a0951, 0x443e63857d4076ca, 133 | 0xf6737962ba6b23dd, 0xd796b052151ee94d, 0x790d9a5f048adfeb, 134 | 0x8b833ff84893da5d, 0x033ed95c12b04a03, 0x9877c4225061ca76, 135 | 0x3d6724b1bb15eab9, 0x42e5352fe30ce989, 0xd68d6810adf74fb3, 136 | 0x3cdbf7e358df4b8b, 0x265b565a7431fde7, 0x52d2242f65b37f88, 137 | 0x2922a47f6d3e8779, 0x29d40f00566d5e26, 0x5d836d6e2958d6b5, 138 | 0x6c056608b7d9c1b6, 0x288db0e1124b14a0, 0x8fb946504faa6c9d, 139 | 0x0b9471bdb8f19d32, 0xfd1fe27d144a09e0, 0x8943a9464540251c, 140 | 0x8048f217633fce36, 0xea6ac458da141bda, 0x4334b8b02ff7612f, 141 | 0xfeda1384ade74d31, 0x096d119a3605c85b, 0xdbc8441f5227e216, 142 | 0x541ad7efa6ddc1d3}; 143 | EngRanden engine; 144 | for (size_t i = 0; i < kNumOutputs; ++i) { 145 | ASSERT_TRUE(golden[i] == engine()); 146 | } 147 | #endif 148 | } 149 | 150 | #endif // ENABLE_VERIFY 151 | 152 | void VerifyRandReqEngine() { 153 | // Validates that Randen satisfies [rand.req.engine]. 154 | // Names after definition of [rand.req.engine] in C++ standard. 
155 | // e is a value of E 156 | // v is a lvalue of E 157 | // x, y are possibly const values of E 158 | // s is a value of T 159 | // q is a value satisfying requirements of seed_sequence 160 | // z is a value of type unsigned long long 161 | // os is a some specialization of basic_ostream 162 | // is is a some specialization of basic_istream 163 | 164 | using E = EngRanden; 165 | using T = typename EngRanden::result_type; 166 | 167 | static_assert(std::is_copy_constructible::value, 168 | "Randen must be copy constructible"); 169 | 170 | static_assert(std::is_copy_assignable::value, 171 | "Randen must be copy assignable"); 172 | 173 | E e, v; 174 | const E x, y; 175 | T s = 1; 176 | std::seed_seq q{1, 2, 3}; 177 | unsigned long long z = 1; // NOLINT(runtime/int) 178 | std::wostringstream os; 179 | std::wistringstream is; 180 | 181 | E{}; 182 | E{x}; 183 | E{s}; 184 | E{q}; 185 | 186 | // Verify that seed() and default-construct is identical. 187 | e.seed(); 188 | { 189 | E f; 190 | ASSERT_TRUE(e == f); 191 | } 192 | 193 | // Verify the seed() result type. 194 | static_assert(std::is_same::value, 195 | "return type of seed() must be void"); 196 | 197 | static_assert(std::is_same::value, 198 | "return type of seed() must be void"); 199 | 200 | // verify that seed via seed_sequence and construct via seed_sequence 201 | // is identical. 202 | e.seed(q); 203 | { 204 | E f{q}; 205 | ASSERT_TRUE(e == f); 206 | } 207 | 208 | // Verify the operator() result type. 209 | static_assert(std::is_same::value, 210 | "return type of operator() must be result_type"); 211 | 212 | // Verify that once the state has advanced that the engines 213 | // are no longer equal. 214 | e(); 215 | { 216 | E f{q}; 217 | ASSERT_TRUE(e != f); 218 | } 219 | 220 | { 221 | E f; 222 | ASSERT_TRUE(e != f); 223 | } 224 | 225 | // Verify discard. 226 | e.discard(z); 227 | { 228 | // The state equivalence should change. 
229 | E f, g; 230 | f.discard(2); 231 | ASSERT_TRUE(f != g); 232 | 233 | g(); 234 | g(); 235 | ASSERT_TRUE(f == g); 236 | } 237 | 238 | // Verify operator == result types. 239 | static_assert(std::is_same::value, 240 | "return type of operator== must be bool"); 241 | 242 | static_assert(std::is_same::value, 243 | "return type of operator!= must be bool"); 244 | 245 | // Verify operator<<() result. 246 | { 247 | auto& os2 = (os << e); 248 | ASSERT_TRUE(&os2 == &os); 249 | } 250 | 251 | // Verify operator>>() result. 252 | { 253 | auto& is2 = (is >> e); 254 | ASSERT_TRUE(&is2 == &is); 255 | } 256 | } 257 | 258 | void VerifyStreamOperators() { 259 | EngRanden engine1(171); 260 | EngRanden engine2; 261 | 262 | { 263 | std::stringstream stream; 264 | stream << engine1; 265 | stream >> engine2; 266 | } 267 | 268 | const int N = 56; // two buffer's worth 269 | for (int i = 0; i < N; ++i) { 270 | const uint64_t r1 = engine1(); 271 | const uint64_t r2 = engine2(); 272 | ASSERT_TRUE(r1 == r2); 273 | } 274 | } 275 | 276 | void Verify() { 277 | #if ENABLE_VERIFY 278 | VerifyReseedChangesAllValues(); 279 | VerifyDiscard(); 280 | VerifyGolden(); 281 | VerifyRandReqEngine(); 282 | VerifyStreamOperators(); 283 | #endif 284 | } 285 | 286 | void DumpOutput() { 287 | #if ENABLE_DUMP 288 | const size_t kNumOutputs = 1500 * 1000 * 1000; 289 | std::vector outputs(kNumOutputs); 290 | EngRanden engine; 291 | for (size_t i = 0; i < kNumOutputs; ++i) { 292 | outputs[i] = engine(); 293 | } 294 | 295 | FILE* f = fopen("/tmp/randen.bin", "wb"); 296 | if (f != nullptr) { 297 | fwrite(outputs.data(), kNumOutputs, 8, f); 298 | fclose(f); 299 | } 300 | #endif // ENABLE_DUMP 301 | } 302 | 303 | void RunAll() { 304 | // Immediately output any results (for non-local runs). 
305 | setvbuf(stdout, nullptr, _IONBF, 0); 306 | 307 | Verify(); 308 | DumpOutput(); 309 | } 310 | 311 | } // namespace 312 | } // namespace randen 313 | 314 | int main(int argc, char* argv[]) { 315 | randen::RunAll(); 316 | return 0; 317 | } 318 | -------------------------------------------------------------------------------- /third_party/pcg_random/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /third_party/pcg_random/include/pcg_extras.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * PCG Random Number Generation for C++ 3 | * 4 | * Copyright 2014 Melissa O'Neill 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | * 18 | * For additional information about the PCG random number generation scheme, 19 | * including its license and other licensing options, visit 20 | * 21 | * http://www.pcg-random.org 22 | */ 23 | 24 | /* 25 | * This file provides support code that is useful for random-number generation 26 | * but not specific to the PCG generation scheme, including: 27 | * - 128-bit int support for platforms where it isn't available natively 28 | * - bit twiddling operations 29 | * - I/O of 128-bit and 8-bit integers 30 | * - Handling the evilness of SeedSeq 31 | * - Support for efficiently producing random numbers less than a given 32 | * bound 33 | */ 34 | 35 | #ifndef PCG_EXTRAS_HPP_INCLUDED 36 | #define PCG_EXTRAS_HPP_INCLUDED 1 37 | 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #ifdef __GNUC__ 52 | #include 53 | #endif 54 | 55 | /* 56 | * Abstractions for compiler-specific directives 57 | */ 58 | 59 | #ifdef __GNUC__ 60 | #define PCG_NOINLINE __attribute__((noinline)) 61 | #define PCG_INLINE __attribute__((always_inline)) 62 | #else 63 | #define PCG_NOINLINE 64 | #define PCG_INLINE 65 | #endif 66 | 67 | /* 68 | * Some members of the PCG library use 128-bit math. When compiling on 64-bit 69 | * platforms, both GCC and Clang provide 128-bit integer types that are ideal 70 | * for the job. 71 | * 72 | * On 32-bit platforms (or with other compilers), we fall back to a C++ 73 | * class that provides 128-bit unsigned integers instead. It may seem 74 | * like we're reinventing the wheel here, because libraries already exist 75 | * that support large integers, but most existing libraries provide a very 76 | * generic multiprecision code, but here we're operating at a fixed size. 77 | * Also, most other libraries are fairly heavyweight. So we use a direct 78 | * implementation. 
Sadly, it's much slower than hand-coded assembly or 79 | * direct CPU support. 80 | * 81 | */ 82 | #if __SIZEOF_INT128__ 83 | namespace pcg_extras { 84 | typedef __uint128_t pcg128_t; 85 | } 86 | #define PCG_128BIT_CONSTANT(high,low) \ 87 | ((pcg128_t(high) << 64) + low) 88 | #else 89 | #include "pcg_uint128.hpp" 90 | namespace pcg_extras { 91 | typedef pcg_extras::uint_x4 pcg128_t; 92 | } 93 | #define PCG_128BIT_CONSTANT(high,low) \ 94 | pcg128_t(high,low) 95 | #define PCG_EMULATED_128BIT_MATH 1 96 | #endif 97 | 98 | 99 | // google3 crosstool consistently fails to recognize rotr / rotl methods as 100 | // hardware rotations, so force it to use inlined assembly. 101 | // TODO(ahh): switch *everything* to wg21.link/P0553 when that's an option. 102 | #define PCG_USE_INLINE_ASM 1 103 | 104 | namespace pcg_extras { 105 | 106 | /* 107 | * We often need to represent a "number of bits". When used normally, these 108 | * numbers are never greater than 128, so an unsigned char is plenty. 109 | * If you're using a nonstandard generator of a larger size, you can set 110 | * PCG_BITCOUNT_T to have it define it as a larger size. (Some compilers 111 | * might produce faster code if you set it to an unsigned int.) 112 | */ 113 | 114 | #ifndef PCG_BITCOUNT_T 115 | typedef uint8_t bitcount_t; 116 | #else 117 | typedef PCG_BITCOUNT_T bitcount_t; 118 | #endif 119 | 120 | /* 121 | * C++ requires us to be able to serialize RNG state by printing or reading 122 | * it from a stream. Because we use 128-bit ints, we also need to be able 123 | * ot print them, so here is code to do so. 124 | * 125 | * This code provides enough functionality to print 128-bit ints in decimal 126 | * and zero-padded in hex. It's not a full-featured implementation. 
127 | */ 128 | 129 | template 130 | std::basic_ostream& 131 | operator<<(std::basic_ostream& out, pcg128_t value) 132 | { 133 | auto desired_base = out.flags() & out.basefield; 134 | bool want_hex = desired_base == out.hex; 135 | 136 | if (want_hex) { 137 | uint64_t highpart = uint64_t(value >> 64); 138 | uint64_t lowpart = uint64_t(value); 139 | auto desired_width = out.width(); 140 | if (desired_width > 16) { 141 | out.width(desired_width - 16); 142 | } 143 | if (highpart != 0 || desired_width > 16) 144 | out << highpart; 145 | CharT oldfill = '\0'; 146 | if (highpart != 0) { 147 | out.width(16); 148 | oldfill = out.fill('0'); 149 | } 150 | auto oldflags = out.setf(decltype(desired_base){}, out.showbase); 151 | out << lowpart; 152 | out.setf(oldflags); 153 | if (highpart != 0) { 154 | out.fill(oldfill); 155 | } 156 | return out; 157 | } 158 | constexpr size_t MAX_CHARS_128BIT = 40; 159 | 160 | char buffer[MAX_CHARS_128BIT]; 161 | char* pos = buffer+sizeof(buffer); 162 | *(--pos) = '\0'; 163 | constexpr auto BASE = pcg128_t(10ULL); 164 | do { 165 | auto div = value / BASE; 166 | auto mod = uint32_t(value - (div * BASE)); 167 | *(--pos) = '0' + char(mod); 168 | value = div; 169 | } while(value != pcg128_t(0ULL)); 170 | return out << pos; 171 | } 172 | 173 | template 174 | std::basic_istream& 175 | operator>>(std::basic_istream& in, pcg128_t& value) 176 | { 177 | typename std::basic_istream::sentry s(in); 178 | 179 | if (!s) 180 | return in; 181 | 182 | constexpr auto BASE = pcg128_t(10ULL); 183 | pcg128_t current(0ULL); 184 | bool did_nothing = true; 185 | bool overflow = false; 186 | for(;;) { 187 | CharT wide_ch = in.get(); 188 | if (!in.good()) 189 | break; 190 | auto ch = in.narrow(wide_ch, '\0'); 191 | if (ch < '0' || ch > '9') { 192 | in.unget(); 193 | break; 194 | } 195 | did_nothing = false; 196 | pcg128_t digit(uint32_t(ch - '0')); 197 | pcg128_t timesbase = current*BASE; 198 | overflow = overflow || timesbase < current; 199 | current = timesbase + digit; 
200 | overflow = overflow || current < digit; 201 | } 202 | 203 | if (did_nothing || overflow) { 204 | in.setstate(std::ios::failbit); 205 | if (overflow) 206 | current = ~pcg128_t(0ULL); 207 | } 208 | 209 | value = current; 210 | 211 | return in; 212 | } 213 | 214 | /* 215 | * Likewise, if people use tiny rngs, we'll be serializing uint8_t. 216 | * If we just used the provided IO operators, they'd read/write chars, 217 | * not ints, so we need to define our own. We *can* redefine this operator 218 | * here because we're in our own namespace. 219 | */ 220 | 221 | template 222 | std::basic_ostream& 223 | operator<<(std::basic_ostream&out, uint8_t value) 224 | { 225 | return out << uint32_t(value); 226 | } 227 | 228 | template 229 | std::basic_istream& 230 | operator>>(std::basic_istream& in, uint8_t& target) 231 | { 232 | uint32_t value = 0xdecea5edU; 233 | in >> value; 234 | if (!in && value == 0xdecea5edU) 235 | return in; 236 | if (value > uint8_t(~0)) { 237 | in.setstate(std::ios::failbit); 238 | value = ~0U; 239 | } 240 | target = uint8_t(value); 241 | return in; 242 | } 243 | 244 | /* Unfortunately, the above functions don't get found in preference to the 245 | * built in ones, so we create some more specific overloads that will. 246 | * Ugh. 247 | */ 248 | 249 | inline std::ostream& operator<<(std::ostream& out, uint8_t value) 250 | { 251 | return pcg_extras::operator<< (out, value); 252 | } 253 | 254 | inline std::istream& operator>>(std::istream& in, uint8_t& value) 255 | { 256 | return pcg_extras::operator>> (in, value); 257 | } 258 | 259 | 260 | 261 | /* 262 | * Useful bitwise operations. 263 | */ 264 | 265 | /* 266 | * XorShifts are invertable, but they are someting of a pain to invert. 267 | * This function backs them out. It's used by the whacky "inside out" 268 | * generator defined later. 
269 | */ 270 | 271 | template 272 | inline itype unxorshift(itype x, bitcount_t bits, bitcount_t shift) 273 | { 274 | if (2*shift >= bits) { 275 | return x ^ (x >> shift); 276 | } 277 | itype lowmask1 = (itype(1U) << (bits - shift*2)) - 1; 278 | itype highmask1 = ~lowmask1; 279 | itype top1 = x; 280 | itype bottom1 = x & lowmask1; 281 | top1 ^= top1 >> shift; 282 | top1 &= highmask1; 283 | x = top1 | bottom1; 284 | itype lowmask2 = (itype(1U) << (bits - shift)) - 1; 285 | itype bottom2 = x & lowmask2; 286 | bottom2 = unxorshift(bottom2, bits - shift, shift); 287 | bottom2 &= lowmask1; 288 | return top1 | bottom2; 289 | } 290 | 291 | /* 292 | * Rotate left and right. 293 | * 294 | * In ideal world, compilers would spot idiomatic rotate code and convert it 295 | * to a rotate instruction. Of course, opinions vary on what the correct 296 | * idiom is and how to spot it. For clang, sometimes it generates better 297 | * (but still crappy) code if you define PCG_USE_ZEROCHECK_ROTATE_IDIOM. 298 | */ 299 | 300 | template 301 | inline itype rotl(itype value, bitcount_t rot) 302 | { 303 | constexpr bitcount_t bits = sizeof(itype) * 8; 304 | constexpr bitcount_t mask = bits - 1; 305 | #if PCG_USE_ZEROCHECK_ROTATE_IDIOM 306 | return rot ? (value << rot) | (value >> (bits - rot)) : value; 307 | #else 308 | return (value << rot) | (value >> ((- rot) & mask)); 309 | #endif 310 | } 311 | 312 | template 313 | inline itype rotr(itype value, bitcount_t rot) 314 | { 315 | constexpr bitcount_t bits = sizeof(itype) * 8; 316 | constexpr bitcount_t mask = bits - 1; 317 | #if PCG_USE_ZEROCHECK_ROTATE_IDIOM 318 | return rot ? (value >> rot) | (value << (bits - rot)) : value; 319 | #else 320 | return (value >> rot) | (value << ((- rot) & mask)); 321 | #endif 322 | } 323 | 324 | /* Unfortunately, both Clang and GCC sometimes perform poorly when it comes 325 | * to properly recognizing idiomatic rotate code, so for we also provide 326 | * assembler directives (enabled with PCG_USE_INLINE_ASM). 
Boo, hiss. 327 | * (I hope that these compilers get better so that this code can die.) 328 | * 329 | * These overloads will be preferred over the general template code above. 330 | */ 331 | 332 | #if PCG_USE_INLINE_ASM && __GNUC__ && (__x86_64__ || __i386__) 333 | 334 | inline uint8_t rotr(uint8_t value, bitcount_t rot) 335 | { 336 | asm ("rorb %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); 337 | return value; 338 | } 339 | 340 | inline uint16_t rotr(uint16_t value, bitcount_t rot) 341 | { 342 | asm ("rorw %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); 343 | return value; 344 | } 345 | 346 | inline uint32_t rotr(uint32_t value, bitcount_t rot) 347 | { 348 | asm ("rorl %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); 349 | return value; 350 | } 351 | 352 | #if __x86_64__ 353 | inline uint64_t rotr(uint64_t value, bitcount_t rot) 354 | { 355 | asm ("rorq %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); 356 | return value; 357 | } 358 | #endif // __x86_64__ 359 | 360 | #endif // PCG_USE_INLINE_ASM 361 | 362 | 363 | /* 364 | * The C++ SeedSeq concept (modelled by seed_seq) can fill an array of 365 | * 32-bit integers with seed data, but sometimes we want to produce 366 | * larger or smaller integers. 367 | * 368 | * The following code handles this annoyance. 369 | * 370 | * uneven_copy will copy an array of 32-bit ints to an array of larger or 371 | * smaller ints (actually, the code is general it only needing forward 372 | * iterators). The copy is identical to the one that would be performed if 373 | * we just did memcpy on a standard little-endian machine, but works 374 | * regardless of the endian of the machine (or the weirdness of the ints 375 | * involved). 376 | * 377 | * generate_to initializes an array of integers using a SeedSeq 378 | * object. It is given the size as a static constant at compile time and 379 | * tries to avoid memory allocation. If we're filling in 32-bit constants 380 | * we just do it directly. 
If we need a separate buffer and it's small, 381 | * we allocate it on the stack. Otherwise, we fall back to heap allocation. 382 | * Ugh. 383 | * 384 | * generate_one produces a single value of some integral type using a 385 | * SeedSeq object. 386 | */ 387 | 388 | /* uneven_copy helper, case where destination ints are less than 32 bit. */ 389 | 390 | template 391 | SrcIter uneven_copy_impl( 392 | SrcIter src_first, DestIter dest_first, DestIter dest_last, 393 | std::true_type) 394 | { 395 | typedef typename std::iterator_traits::value_type src_t; 396 | typedef typename std::iterator_traits::value_type dest_t; 397 | 398 | constexpr bitcount_t SRC_SIZE = sizeof(src_t); 399 | constexpr bitcount_t DEST_SIZE = sizeof(dest_t); 400 | constexpr bitcount_t DEST_BITS = DEST_SIZE * 8; 401 | constexpr bitcount_t SCALE = SRC_SIZE / DEST_SIZE; 402 | 403 | size_t count = 0; 404 | src_t value = 0; 405 | 406 | while (dest_first != dest_last) { 407 | if ((count++ % SCALE) == 0) 408 | value = *src_first++; // Get more bits 409 | else 410 | value >>= DEST_BITS; // Move down bits 411 | 412 | *dest_first++ = dest_t(value); // Truncates, ignores high bits. 413 | } 414 | return src_first; 415 | } 416 | 417 | /* uneven_copy helper, case where destination ints are more than 32 bit. 
*/ 418 | 419 | template 420 | SrcIter uneven_copy_impl( 421 | SrcIter src_first, DestIter dest_first, DestIter dest_last, 422 | std::false_type) 423 | { 424 | typedef typename std::iterator_traits::value_type src_t; 425 | typedef typename std::iterator_traits::value_type dest_t; 426 | 427 | constexpr auto SRC_SIZE = sizeof(src_t); 428 | constexpr auto SRC_BITS = SRC_SIZE * 8; 429 | constexpr auto DEST_SIZE = sizeof(dest_t); 430 | constexpr auto SCALE = (DEST_SIZE+SRC_SIZE-1) / SRC_SIZE; 431 | 432 | while (dest_first != dest_last) { 433 | dest_t value(0UL); 434 | unsigned int shift = 0; 435 | 436 | for (size_t i = 0; i < SCALE; ++i) { 437 | value |= dest_t(*src_first++) << shift; 438 | shift += SRC_BITS; 439 | } 440 | 441 | *dest_first++ = value; 442 | } 443 | return src_first; 444 | } 445 | 446 | /* uneven_copy, call the right code for larger vs. smaller */ 447 | 448 | template 449 | inline SrcIter uneven_copy(SrcIter src_first, 450 | DestIter dest_first, DestIter dest_last) 451 | { 452 | typedef typename std::iterator_traits::value_type src_t; 453 | typedef typename std::iterator_traits::value_type dest_t; 454 | 455 | constexpr bool DEST_IS_SMALLER = sizeof(dest_t) < sizeof(src_t); 456 | 457 | return uneven_copy_impl(src_first, dest_first, dest_last, 458 | std::integral_constant{}); 459 | } 460 | 461 | /* generate_to, fill in a fixed-size array of integral type using a SeedSeq 462 | * (actually works for any random-access iterator) 463 | */ 464 | 465 | template 466 | inline void generate_to_impl(SeedSeq&& generator, DestIter dest, 467 | std::true_type) 468 | { 469 | generator.generate(dest, dest+size); 470 | } 471 | 472 | template 473 | void generate_to_impl(SeedSeq&& generator, DestIter dest, 474 | std::false_type) 475 | { 476 | typedef typename std::iterator_traits::value_type dest_t; 477 | constexpr auto DEST_SIZE = sizeof(dest_t); 478 | constexpr auto GEN_SIZE = sizeof(uint32_t); 479 | 480 | constexpr bool GEN_IS_SMALLER = GEN_SIZE < DEST_SIZE; 481 | constexpr 
size_t FROM_ELEMS = 482 | GEN_IS_SMALLER 483 | ? size * ((DEST_SIZE+GEN_SIZE-1) / GEN_SIZE) 484 | : (size + (GEN_SIZE / DEST_SIZE) - 1) 485 | / ((GEN_SIZE / DEST_SIZE) + GEN_IS_SMALLER); 486 | // this odd code ^^^^^^^^^^^^^^^^^ is work-around for 487 | // a bug: http://llvm.org/bugs/show_bug.cgi?id=21287 488 | 489 | if (FROM_ELEMS <= 1024) { 490 | uint32_t buffer[FROM_ELEMS]; 491 | generator.generate(buffer, buffer+FROM_ELEMS); 492 | uneven_copy(buffer, dest, dest+size); 493 | } else { 494 | uint32_t* buffer = static_cast(malloc(GEN_SIZE * FROM_ELEMS)); 495 | generator.generate(buffer, buffer+FROM_ELEMS); 496 | uneven_copy(buffer, dest, dest+size); 497 | free(static_cast(buffer)); 498 | } 499 | } 500 | 501 | template 502 | inline void generate_to(SeedSeq&& generator, DestIter dest) 503 | { 504 | typedef typename std::iterator_traits::value_type dest_t; 505 | constexpr bool IS_32BIT = sizeof(dest_t) == sizeof(uint32_t); 506 | 507 | generate_to_impl(std::forward(generator), dest, 508 | std::integral_constant{}); 509 | } 510 | 511 | /* generate_one, produce a value of integral type using a SeedSeq 512 | * (optionally, we can have it produce more than one and pick which one 513 | * we want) 514 | */ 515 | 516 | template 517 | inline UInt generate_one(SeedSeq&& generator) 518 | { 519 | UInt result[N]; 520 | generate_to(std::forward(generator), result); 521 | return result[i]; 522 | } 523 | 524 | template 525 | auto bounded_rand(RngType& rng, typename RngType::result_type upper_bound) 526 | -> typename RngType::result_type 527 | { 528 | typedef typename RngType::result_type rtype; 529 | rtype threshold = (RngType::max() - RngType::min() + rtype(1) - upper_bound) 530 | % upper_bound; 531 | for (;;) { 532 | rtype r = rng() - RngType::min(); 533 | if (r >= threshold) 534 | return r % upper_bound; 535 | } 536 | } 537 | 538 | template 539 | void shuffle(Iter from, Iter to, RandType&& rng) 540 | { 541 | typedef typename std::iterator_traits::difference_type delta_t; 542 | 
typedef typename std::remove_reference::type::result_type result_t; 543 | auto count = to - from; 544 | while (count > 1) { 545 | delta_t chosen = delta_t(bounded_rand(rng, result_t(count))); 546 | --count; 547 | --to; 548 | using std::swap; 549 | swap(*(from + chosen), *to); 550 | } 551 | } 552 | 553 | /* 554 | * Although std::seed_seq is useful, it isn't everything. Often we want to 555 | * initialize a random-number generator some other way, such as from a random 556 | * device. 557 | * 558 | * Technically, it does not meet the requirements of a SeedSequence because 559 | * it lacks some of the rarely-used member functions (some of which would 560 | * be impossible to provide). However the C++ standard is quite specific 561 | * that actual engines only called the generate method, so it ought not to be 562 | * a problem in practice. 563 | */ 564 | 565 | template 566 | class seed_seq_from { 567 | private: 568 | RngType rng_; 569 | 570 | typedef uint_least32_t result_type; 571 | 572 | public: 573 | template 574 | seed_seq_from(Args&&... args) : 575 | rng_(std::forward(args)...) 576 | { 577 | // Nothing (else) to do... 578 | } 579 | 580 | template 581 | void generate(Iter start, Iter finish) 582 | { 583 | for (auto i = start; i != finish; ++i) 584 | *i = result_type(rng_()); 585 | } 586 | 587 | constexpr size_t size() const 588 | { 589 | return (sizeof(typename RngType::result_type) > sizeof(result_type) 590 | && RngType::max() > ~size_t(0UL)) 591 | ? ~size_t(0UL) 592 | : size_t(RngType::max()); 593 | } 594 | }; 595 | 596 | /* 597 | * Sometimes you might want a distinct seed based on when the program 598 | * was compiled. That way, a particular instance of the program will 599 | * behave the same way, but when recompiled it'll produce a different 600 | * value. 601 | */ 602 | 603 | template 604 | struct static_arbitrary_seed { 605 | private: 606 | static constexpr IntType fnv(IntType hash, const char* pos) { 607 | return *pos == '\0' 608 | ? 
hash 609 | : fnv((hash * IntType(16777619U)) ^ *pos, (pos+1)); 610 | } 611 | 612 | public: 613 | static constexpr IntType value = fnv(IntType(2166136261U ^ sizeof(IntType)), 614 | __DATE__ __TIME__ __FILE__); 615 | }; 616 | 617 | // Sometimes, when debugging or testing, it's handy to be able print the name 618 | // of a (in human-readable form). This code allows the idiom: 619 | // 620 | // cout << printable_typename() 621 | // 622 | // to print out my_foo_type_t (or its concrete type if it is a synonym) 623 | 624 | template 625 | struct printable_typename {}; 626 | 627 | template 628 | std::ostream& operator<<(std::ostream& out, printable_typename) { 629 | const char *implementation_typename = typeid(T).name(); 630 | #ifdef __GNUC__ 631 | int status; 632 | char* pretty_name = 633 | abi::__cxa_demangle(implementation_typename, NULL, NULL, &status); 634 | if (status == 0) 635 | out << pretty_name; 636 | free(static_cast(pretty_name)); 637 | if (status == 0) 638 | return out; 639 | #endif 640 | out << implementation_typename; 641 | return out; 642 | } 643 | 644 | } // namespace pcg_extras 645 | 646 | #endif // PCG_EXTRAS_HPP_INCLUDED 647 | -------------------------------------------------------------------------------- /util.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 

#ifndef UTIL_H_
#define UTIL_H_

#include <stdint.h>  // uint64_t
#include <stdio.h>   // printf
#include <stdlib.h>  // abort

#ifdef _MSC_VER
#include <intrin.h>  // __lzcnt64
#endif

// Prints the failing line and aborts if "condition" is false.
#define RANDEN_CHECK(condition)                            \
  do {                                                     \
    if (!(condition)) {                                    \
      printf("Assertion failed on line %d\n", __LINE__);   \
      abort();                                             \
    }                                                      \
  } while (false)

namespace randen {

// Returns the number of leading zero bits of "x"; precondition: "x" != 0.
static inline int NumZeroBitsAboveMSBNonzero(const uint64_t x) {
#ifdef _MSC_VER
  return static_cast<int>(__lzcnt64(x));  // WARNING: requires BMI2
#else
  return __builtin_clzll(x);
#endif
}

}  // namespace randen

#endif  // UTIL_H_

// ----------------------------------------------------------------------------
// /vector128.h:
// ----------------------------------------------------------------------------

// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Wrappers for platform-specific 128-bit vectors.
16 | #ifndef VECTOR128_H_ 17 | #define VECTOR128_H_ 18 | 19 | #include // uint64_t 20 | 21 | #if defined(__SSE2__) && defined(__AES__) 22 | 23 | #define RANDEN_AESNI 1 24 | #include 25 | 26 | #elif defined(__powerpc__) && defined(__VSX__) 27 | 28 | #define RANDEN_PPC 1 29 | #define RANDEN_BIG_ENDIAN 1 30 | #include 31 | 32 | #elif defined(__ARM_NEON) && defined(__ARM_FEATURE_CRYPTO) 33 | 34 | #define RANDEN_ARM 1 35 | #include 36 | 37 | #else 38 | #error "Port" 39 | #endif 40 | 41 | #if defined(__clang__) || defined(__GNUC__) 42 | #define RANDEN_INLINE inline __attribute__((always_inline)) 43 | #define RANDEN_RESTRICT __restrict__ 44 | #else 45 | #define RANDEN_INLINE 46 | #define RANDEN_RESTRICT 47 | #endif 48 | 49 | namespace randen { 50 | 51 | #ifdef RANDEN_AESNI 52 | 53 | class V { 54 | public: 55 | RANDEN_INLINE V() {} // Leaves v_ uninitialized. 56 | RANDEN_INLINE V& operator=(const V other) { 57 | raw_ = other.raw_; 58 | return *this; 59 | } 60 | 61 | // Convert from/to intrinsics. 62 | RANDEN_INLINE explicit V(const __m128i raw) : raw_(raw) {} 63 | __m128i raw() const { return raw_; } 64 | 65 | RANDEN_INLINE V& operator^=(const V other) { 66 | raw_ = _mm_xor_si128(raw_, other.raw_); 67 | return *this; 68 | } 69 | 70 | private: 71 | // Note: this wrapper is faster than using __m128i directly. 72 | __m128i raw_; 73 | }; 74 | 75 | #elif defined(RANDEN_PPC) 76 | 77 | // Already provides operator^=. 78 | using V = vector unsigned long long; 79 | 80 | #elif defined(RANDEN_ARM) 81 | 82 | // Already provides operator^=. 83 | using V = uint8x16_t; 84 | 85 | #else 86 | #error "Port" 87 | #endif 88 | 89 | constexpr int kLanes = sizeof(V) / sizeof(uint64_t); 90 | 91 | // On big-endian platforms, byte-swap constants (e.g. round keys) to ensure 92 | // results match little-endian platforms. 
93 | #ifdef RANDEN_BIG_ENDIAN 94 | #define RANDEN_LE(a, b) __builtin_bswap64(b), __builtin_bswap64(a) 95 | #else 96 | #define RANDEN_LE(a, b) a, b 97 | #endif 98 | 99 | #ifdef RANDEN_BIG_ENDIAN 100 | static RANDEN_INLINE V ReverseBytes(const V v) { 101 | // Reverses the bytes of the vector. 102 | const vector unsigned char perm = {15, 14, 13, 12, 11, 10, 9, 8, 103 | 7, 6, 5, 4, 3, 2, 1, 0}; 104 | return vec_perm(v, v, perm); 105 | } 106 | #endif 107 | 108 | // WARNING: these load/store in native byte order. It is OK to load and then 109 | // store an unchanged vector, but interpreting the bits as a number or input 110 | // to AES will have platform-dependent results. Call ReverseBytes after load 111 | // and/or before store #ifdef RANDEN_BIG_ENDIAN. 112 | 113 | static RANDEN_INLINE V Load(const uint64_t* RANDEN_RESTRICT lanes, 114 | const int block) { 115 | #ifdef RANDEN_AESNI 116 | const uint64_t* RANDEN_RESTRICT from = lanes + block * kLanes; 117 | return V(_mm_load_si128(reinterpret_cast(from))); 118 | #elif defined(RANDEN_PPC) 119 | const V* RANDEN_RESTRICT from = 120 | reinterpret_cast(lanes + block * kLanes); 121 | return vec_vsx_ld(0, from); 122 | #elif defined(RANDEN_ARM) 123 | const uint8_t* RANDEN_RESTRICT from = 124 | reinterpret_cast(lanes + block * kLanes); 125 | return vld1q_u8(from); 126 | #else 127 | #error "Port" 128 | #endif 129 | } 130 | 131 | static RANDEN_INLINE void Store(const V v, uint64_t* RANDEN_RESTRICT lanes, 132 | const int block) { 133 | #ifdef RANDEN_AESNI 134 | uint64_t* RANDEN_RESTRICT to = lanes + block * kLanes; 135 | _mm_store_si128(reinterpret_cast<__m128i * RANDEN_RESTRICT>(to), v.raw()); 136 | #elif defined(RANDEN_PPC) 137 | V* RANDEN_RESTRICT to = reinterpret_cast(lanes + block * kLanes); 138 | vec_vsx_st(v, 0, to); 139 | #elif defined(RANDEN_ARM) 140 | uint8_t* RANDEN_RESTRICT to = 141 | reinterpret_cast(lanes + block * kLanes); 142 | vst1q_u8(to, v); 143 | #else 144 | #error "Port" 145 | #endif 146 | } 147 | 148 | // One 
round of AES. "round_key" is a public constant for breaking the 149 | // symmetry of AES (ensures previously equal columns differ afterwards). 150 | static RANDEN_INLINE V AES(const V state, const V round_key) { 151 | #ifdef RANDEN_AESNI 152 | // It is important to always use the full round function - omitting the 153 | // final MixColumns reduces security [https://eprint.iacr.org/2010/041.pdf] 154 | // and does not help because we never decrypt. 155 | return V(_mm_aesenc_si128(state.raw(), round_key.raw())); 156 | #elif defined(RANDEN_PPC) 157 | return V(__builtin_crypto_vcipher(state, round_key)); 158 | #elif defined(RANDEN_ARM) 159 | return vaesmcq_u8(vaeseq_u8(state, round_key)); 160 | #else 161 | #error "Port" 162 | #endif 163 | } 164 | 165 | } // namespace randen 166 | 167 | #endif // VECTOR128_H_ 168 | -------------------------------------------------------------------------------- /vector128_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2017 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "vector128.h" 16 | 17 | #include 18 | #include 19 | 20 | namespace randen { 21 | namespace { 22 | 23 | #define ASSERT_TRUE(condition) \ 24 | while (!(condition)) { \ 25 | printf("Check failed at line %d\n", __LINE__); \ 26 | abort(); \ 27 | } 28 | 29 | void TestLoadStore() { 30 | const int N = 4; 31 | alignas(16) uint64_t test_cases[N * 2] = { 32 | 1, 2, 3, 4, 0x1234567890ABCDEFuLL, 0x2143658709BADCFEuLL}; 33 | 34 | alignas(16) uint64_t stored[N * 2]; 35 | for (int i = 0; i < N; ++i) { 36 | V v = Load(test_cases, i); 37 | Store(v, stored, i); 38 | 39 | ASSERT_TRUE(test_cases[2 * i + 0] == stored[2 * i + 0]); 40 | ASSERT_TRUE(test_cases[2 * i + 1] == stored[2 * i + 1]); 41 | } 42 | } 43 | 44 | void TestXor() { 45 | alignas(16) uint64_t test_cases[][3][2] = { 46 | {{1, 2}, {3, 4}, {2, 6}}, 47 | {{0x1234567890ABCDEFuLL, 0x2143658709BADCFEuLL}, 48 | {0x2143658709BADCFEuLL, 0x1234567890ABCDEFuLL}, 49 | {0x337733ff99111111uLL, 0x337733ff99111111uLL}}}; 50 | 51 | for (const auto& test_case : test_cases) { 52 | V v1 = Load(test_case[0], 0); 53 | V v2 = Load(test_case[1], 0); 54 | 55 | v1 ^= v2; 56 | alignas(16) uint64_t data_stored[2]; 57 | Store(v1, data_stored, 0); 58 | 59 | ASSERT_TRUE(test_case[2][0] == data_stored[0]); 60 | ASSERT_TRUE(test_case[2][1] == data_stored[1]); 61 | } 62 | } 63 | 64 | void TestAes() { 65 | // This test also catches byte-order bugs in Load/Store functions 66 | alignas(16) uint64_t message[2] = { 67 | RANDEN_LE(0x8899AABBCCDDEEFFuLL, 0x0123456789ABCDEFuLL)}; 68 | alignas(16) uint64_t key[2] = { 69 | RANDEN_LE(0x0022446688AACCEEuLL, 0x1133557799BBDDFFuLL)}; 70 | alignas(16) uint64_t expected_result[2] = { 71 | RANDEN_LE(0x28E4EE1884504333uLL, 0x16AB0E57DFC442EDuLL)}; 72 | 73 | V v_message = Load(message, 0); 74 | V v_key = Load(key, 0); 75 | V v_result = AES(v_message, v_key); 76 | 77 | alignas(16) uint64_t result[2]; 78 | Store(v_result, result, 0); 79 | 80 | ASSERT_TRUE(expected_result[0] == result[0]); 81 | 
ASSERT_TRUE(expected_result[1] == result[1]); 82 | } 83 | 84 | void RunAll() { 85 | // Immediately output any results (for non-local runs). 86 | setvbuf(stdout, nullptr, _IONBF, 0); 87 | 88 | TestLoadStore(); 89 | TestXor(); 90 | TestAes(); 91 | } 92 | 93 | } // namespace 94 | } // namespace randen 95 | 96 | int main(int argc, char* argv[]) { 97 | randen::RunAll(); 98 | return 0; 99 | } 100 | --------------------------------------------------------------------------------