├── .gitignore ├── images ├── sobol.png ├── random.png └── sobol_owen.png ├── Cargo.toml ├── licenses ├── MIT.txt ├── JOE_KUO.txt └── APACHE-2.0.txt ├── examples └── plots.rs ├── benches └── bench.rs ├── CHANGELOG.md ├── README.md ├── src ├── parts.rs ├── lib.rs └── wide.rs └── direction_numbers └── new-joe-kuo-6.256.txt /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | **/*.rs.bk 3 | Cargo.lock 4 | 5 | -------------------------------------------------------------------------------- /images/sobol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cessen/sobol_burley/HEAD/images/sobol.png -------------------------------------------------------------------------------- /images/random.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cessen/sobol_burley/HEAD/images/random.png -------------------------------------------------------------------------------- /images/sobol_owen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cessen/sobol_burley/HEAD/images/sobol_owen.png -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "sobol_burley" 3 | version = "0.5.0" 4 | description = "A seedable Owen-scrambled Sobol sequence." 
5 | categories = ["algorithms", "mathematics", "no-std"] 6 | keywords = ["sobol", "low-discrepancy", "monte-carlo", "quasirandom", "sampling"] 7 | documentation = "https://docs.rs/sobol_burley" 8 | repository = "https://github.com/cessen/sobol_burley" 9 | readme = "README.md" 10 | authors = ["Nathan Vegdahl "] 11 | edition = "2018" 12 | license = "MIT OR Apache-2.0" 13 | build = "build.rs" 14 | exclude = ["/images"] 15 | 16 | [lib] 17 | name = "sobol_burley" 18 | path = "src/lib.rs" 19 | 20 | [features] 21 | default = ["simd"] 22 | simd = [] 23 | 24 | [dev-dependencies] 25 | rand = "0.8" 26 | bencher = "0.1.5" 27 | 28 | [[bench]] 29 | name = "bench" 30 | harness = false -------------------------------------------------------------------------------- /licenses/MIT.txt: -------------------------------------------------------------------------------- 1 | Copyright 2021 Nathan Vegdahl 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /examples/plots.rs: -------------------------------------------------------------------------------- 1 | use std::fs::File; 2 | use std::io::Write; 3 | 4 | use sobol_burley::parts::{sobol_rev, u32_to_f32_norm}; 5 | 6 | const X_RES: usize = 512; 7 | const Y_RES: usize = 512; 8 | const POINTS: u32 = 1 << 10; 9 | 10 | fn main() { 11 | // Plain sobol. 12 | plot(|i| sobol(i, 0), |i| sobol(i, 1), "sobol.pbm"); 13 | for d in 0..20 { 14 | plot( 15 | |i| sobol(i, d), 16 | |i| sobol(i, d + 1), 17 | &format!("sobol_{}.pbm", d), 18 | ); 19 | } 20 | 21 | // Scrambled shuffled sobol. 22 | for d in 0..20 { 23 | plot( 24 | |i| sobol_burley::sample(i, d, 0), 25 | |i| sobol_burley::sample(i, d + 1, 0), 26 | &format!("sobol_burley_{}.pbm", d), 27 | ); 28 | } 29 | } 30 | 31 | fn sobol(i: u32, dimension: u32) -> f32 { 32 | let sobol_int = sobol_rev(i.reverse_bits(), dimension).reverse_bits(); 33 | u32_to_f32_norm(sobol_int) 34 | } 35 | 36 | fn plot(x_fn: F1, y_fn: F2, filename: &str) 37 | where 38 | F1: Fn(u32) -> f32, 39 | F2: Fn(u32) -> f32, 40 | { 41 | let mut image = vec![1u8; X_RES * Y_RES]; 42 | for i in 0..POINTS { 43 | let x = (x_fn(i) * (X_RES - 1) as f32) as usize; 44 | let y = (y_fn(i) * (Y_RES - 1) as f32) as usize; 45 | image[y * X_RES + x] = 0; 46 | } 47 | 48 | let mut f = File::create(filename).unwrap(); 49 | f.write(format!("P1\n{} {}\n\n", X_RES, Y_RES).as_bytes()) 50 | .unwrap(); 51 | for chunk in image.chunks(80) { 52 | for pixel in chunk.iter() { 53 | f.write(if *pixel == 0 { b"0" } else { b"1" }).unwrap(); 54 | } 55 | f.write(b"\n").unwrap(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /benches/bench.rs: -------------------------------------------------------------------------------- 1 | use bencher::{benchmark_group, benchmark_main, black_box, Bencher}; 2 | use rand::prelude::*; 3 | use sobol_burley::{sample, sample_4d}; 4 | 
5 | //---- 6 | 7 | fn gen_1000_samples_4d(bench: &mut Bencher) { 8 | bench.iter(|| { 9 | for i in 0..250u32 { 10 | black_box(sample_4d(i, 0, 1234567890)); 11 | } 12 | }); 13 | } 14 | 15 | fn gen_1000_samples_incoherent_4d(bench: &mut Bencher) { 16 | let mut rng = rand::thread_rng(); 17 | bench.iter(|| { 18 | let s = rng.gen::(); 19 | let d = rng.gen::(); 20 | let seed = rng.gen::(); 21 | for i in 0..250u32 { 22 | black_box(sample_4d( 23 | s.wrapping_add(i).wrapping_mul(512), 24 | d.wrapping_add(i).wrapping_mul(97) % 32, 25 | seed, 26 | )); 27 | } 28 | }); 29 | } 30 | 31 | fn gen_1000_samples(bench: &mut Bencher) { 32 | bench.iter(|| { 33 | for i in 0..1000u32 { 34 | black_box(sample(i, 0, 1234567890)); 35 | } 36 | }); 37 | } 38 | 39 | fn gen_1000_samples_incoherent(bench: &mut Bencher) { 40 | let mut rng = rand::thread_rng(); 41 | bench.iter(|| { 42 | let s = rng.gen::(); 43 | let d = rng.gen::(); 44 | let seed = rng.gen::(); 45 | for i in 0..1000u32 { 46 | black_box(sample( 47 | s.wrapping_add(i).wrapping_mul(512), 48 | d.wrapping_add(i).wrapping_mul(97) % 128, 49 | seed, 50 | )); 51 | } 52 | }); 53 | } 54 | 55 | //---- 56 | 57 | benchmark_group!( 58 | benches, 59 | gen_1000_samples, 60 | gen_1000_samples_incoherent, 61 | gen_1000_samples_4d, 62 | gen_1000_samples_incoherent_4d, 63 | ); 64 | benchmark_main!(benches); 65 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 4 | ## [Unreleased] 5 | 6 | 7 | ## [0.5.0] - 2023-07-05 8 | 9 | - Fleshed out the `Int4` impls with subtraction and conversion to `[u32; 4]`. 10 | - Cleaned up the function names in the `parts` module to be more consistent. 11 | 12 | 13 | ## [0.4.0] - 2022-07-16 14 | 15 | - Expose sampler internals so people can build their own custom samplers if needed for specific situations. 
16 | 17 | 18 | ## [0.3.1] - 2021-05-16 19 | 20 | - Add a `simd` feature flag, to allow people to disable simd (and thus unsafe code). 21 | - Documentation improvements. 22 | 23 | 24 | ## [0.3.0] - 2021-05-14 25 | 26 | - Add support for computing just one dimension at a time. This makes usage 27 | easier when performance isn't critical, and makes the documentation a bit 28 | easier to follow. 29 | - Panic in debug when the max sample count is exceeded. 30 | - Reduce memory footprint, by storing only the half of the direction vector data that we actually use. 31 | - Documentation improvements. 32 | 33 | 34 | ## [0.2.0] - 2021-05-11 35 | 36 | - Renamed MAX_DIMENSION_SET to NUM_DIMENSION_SETS to better reflect its meaning. 37 | - Some documentation improvements and cleanups. 38 | - Very tiny performance improvements due to better u32-to-f32 conversion and 39 | leaner SSE bit reversal code. 40 | 41 | 42 | ## [0.1.0] - 2021-05-11 43 | 44 | - First release. 45 | 46 | 47 | [Unreleased]: https://github.com/cessen/sobol_burley/compare/v0.5.0...HEAD 48 | [0.5.0]: https://github.com/cessen/sobol_burley/compare/v0.4.0...v0.5.0 49 | [0.4.0]: https://github.com/cessen/sobol_burley/compare/v0.3.1...v0.4.0 50 | [0.3.1]: https://github.com/cessen/sobol_burley/compare/v0.3.0...v0.3.1 51 | [0.3.0]: https://github.com/cessen/sobol_burley/compare/v0.2.0...v0.3.0 52 | [0.2.0]: https://github.com/cessen/sobol_burley/compare/v0.1.0...v0.2.0 53 | [0.1.0]: https://github.com/cessen/sobol_burley/releases/tag/v0.1.0 54 | -------------------------------------------------------------------------------- /licenses/JOE_KUO.txt: -------------------------------------------------------------------------------- 1 | The Sobol direction numbers in `direction_numbers/` and some of the code in `build.rs` (demarcated by comments) is from the website 2 | http://web.maths.unsw.edu.au/~fkuo/sobol/ 3 | 4 | From these papers: 5 | * S. Joe and F. Y. 
Kuo, Remark on Algorithm 659: Implementing 6 | Sobol's quasirandom sequence generator, ACM Trans. Math. Softw. 29, 7 | 49-57 (2003) 8 | * S. Joe and F. Y. Kuo, Constructing Sobol sequences with better 9 | two-dimensional projections, SIAM J. Sci. Comput. 30, 2635-2654 10 | (2008) 11 | 12 | Copyright (c) 2008, Frances Y. Kuo and Stephen Joe 13 | All rights reserved. 14 | 15 | Redistribution and use in source and binary forms, with or without 16 | modification, are permitted provided that the following conditions are 17 | met: 18 | 19 | * Redistributions of source code must retain the above copyright 20 | notice, this list of conditions and the following disclaimer. 21 | 22 | * Redistributions in binary form must reproduce the above copyright 23 | notice, this list of conditions and the following disclaimer in the 24 | documentation and/or other materials provided with the 25 | distribution. 26 | 27 | * Neither the names of the copyright holders nor the names of the 28 | University of New South Wales and the University of Waikato 29 | and its contributors may be used to endorse or promote products 30 | derived from this software without specific prior written 31 | permission. 32 | 33 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 34 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 35 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 36 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE 37 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 38 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 39 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 40 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 41 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 42 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 43 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sobol-Burley 2 | 3 | [![Latest Release][crates-io-badge]][crates-io-url] 4 | [![Documentation][docs-rs-img]][docs-rs-url] 5 | 6 | A seedable Owen-scrambled Sobol sequence based on the paper [Practical Hash-based Owen Scrambling](http://www.jcgt.org/published/0009/04/01/) by Brent Burley, but with an improved hash from [Building a Better LK Hash](https://psychopath.io/post/2021_01_30_building_a_better_lk_hash) and more dimensions due to [Kuo et al](http://web.maths.unsw.edu.au/~fkuo/sobol/). 7 | 8 | This crate is geared towards practical graphics applications, and as such has some limitations: 9 | 10 | * The maximum sequence length is 2^16. 11 | * The maximum number of dimensions is 256 (although this can be worked around with seeding). 12 | * Only `f32` output is supported. 13 | 14 | These are all trade-offs for the sake of better performance and a smaller memory footprint. 15 | 16 | Expanding this crate to be more suitable for a wider range of applications is a tentative goal for the future. However, efficient execution for graphics applications will always be the top priority. 17 | 18 | 19 | ## Basic usage 20 | 21 | Basic usage is pretty straightforward: 22 | 23 | ```rust 24 | use sobol_burley::sample; 25 | 26 | // Print 1024 3-dimensional points. 27 | for i in 0..1024 { 28 | let x = sample(i, 0, 0); 29 | let y = sample(i, 1, 0); 30 | let z = sample(i, 2, 0); 31 | println!("({}, {}, {})", x, y, z); 32 | } 33 | ``` 34 | 35 | The first parameter of `sample()` is the index of the sample you want, and the second parameter is the index of the dimension you want. The parameters are zero-indexed, and outputs are in the interval [0, 1). 36 | 37 | If all you want is a single Owen-scrambled Sobol sequence, then this is all you need. For more advanced usage, see the crate documentation. 
38 | 39 | 40 | ## Why Owen-scrambled Sobol? 41 | 42 | There are other resources that explain this properly and in-depth, including Brent Burley's paper linked above. But here's the short version just to give some intuition: 43 | 44 | If you use random points, you get this: 45 | 46 | ![1024 random points](https://raw.githubusercontent.com/cessen/sobol_burley/master/images/random.png) 47 | 48 | If you use plain Sobol, you get this: 49 | 50 | ![1024 plain Sobol points](https://raw.githubusercontent.com/cessen/sobol_burley/master/images/sobol.png) 51 | 52 | But if you use Owen-scrambled Sobol, you get this: 53 | 54 | ![1024 Owen-scrambled Sobol points](https://raw.githubusercontent.com/cessen/sobol_burley/master/images/sobol_owen.png) 55 | 56 | Random points have an uneven distribution, and plain Sobol exhibits a strong structure that can result in bias and artifacts. But Owen-scrambled Sobol in some sense gets the best of both worlds: the even distribution of Sobol, but randomized to minimize structure. 57 | 58 | 59 | ## Unsafe code 60 | 61 | This crate uses unsafe code for SIMD acceleration. For 100% safe code, you can disable SIMD support via the `simd` feature flag (enabled by default). 62 | 63 | 64 | ## License 65 | 66 | The main code in this project is licensed under either of 67 | 68 | * MIT license (licenses/MIT.txt or http://opensource.org/licenses/MIT) 69 | * Apache License, Version 2.0, (licenses/APACHE-2.0.txt or http://www.apache.org/licenses/LICENSE-2.0) 70 | 71 | at your option. 72 | 73 | The Sobol direction numbers under `direction_numbers/` and some of the code in `build.rs` (demarcated by comments) is adapted from work by Stephen Joe and Frances Y. Kuo, and is under the 3-clause BSD license. See `licenses/JOE_KUO.txt` for details. 74 | 75 | 76 | ## Contributing 77 | 78 | Contributions are absolutely welcome! Please keep in mind that this crate aims to be: 79 | 80 | * no-std and allocation-free. PRs that use allocation, etc. are very likely to be rejected. 
81 | * As small as it reasonably can be, including transitive dependencies. PRs that pull in dependencies--especially deep dependency trees--are likely to be rejected unless they really pull their weight. 82 | 83 | Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in this project by you will be licensed as above (MIT/Apache dual-license), without any additional terms or conditions. 84 | 85 | 86 | [crates-io-badge]: https://img.shields.io/crates/v/sobol_burley.svg 87 | [crates-io-url]: https://crates.io/crates/sobol_burley 88 | [docs-rs-img]: https://docs.rs/sobol_burley/badge.svg 89 | [docs-rs-url]: https://docs.rs/sobol_burley 90 | -------------------------------------------------------------------------------- /src/parts.rs: -------------------------------------------------------------------------------- 1 | //! The building blocks for making a Sobol sampler. 2 | //! 3 | //! This module contains the internal components of the main samplers 4 | //! in this crate. You can use these to build alternative Sobol 5 | //! samplers. However, it is easy to mess things up in subtle ways. 6 | //! So unless you have unique requirements it is recommended to stick 7 | //! with the main samplers in the crate root. 8 | //! 9 | //! **Note:** many of the functions in this module return reversed-bit 10 | //! integers, and take some of their parameters as reversed-bit 11 | //! integers as well. This is always indicated by a `_rev` postfix 12 | //! in the function name (for return values) and parameter names 13 | //! (for parameter values). 14 | //! 15 | //! # Examples 16 | //! 17 | //! A simple, non-scrambled Sobol sequence function: 18 | //! 19 | //! ```rust 20 | //! # use sobol_burley::parts::{sobol_rev, u32_to_f32_norm}; 21 | //! fn sobol(i: u32, dimension: u32) -> f32 { 22 | //! let sobol_int = sobol_rev(i.reverse_bits(), dimension).reverse_bits(); 23 | //! 24 | //! u32_to_f32_norm(sobol_int) 25 | //! } 26 | //! ``` 27 | //! 28 | //! 
A basic Owen-scrambled Sobol sequence function: 29 | //! 30 | //! ```rust 31 | //! # use sobol_burley::parts::{sobol_rev, u32_to_f32_norm, owen_scramble_rev, hash}; 32 | //! fn sobol_owen(i: u32, dimension: u32) -> f32 { 33 | //! let sobol_int_rev = sobol_rev(i.reverse_bits(), dimension); 34 | //! 35 | //! let sobol_owen_int = owen_scramble_rev( 36 | //! sobol_int_rev, 37 | //! hash(dimension), 38 | //! ).reverse_bits(); 39 | //! 40 | //! u32_to_f32_norm(sobol_owen_int) 41 | //! } 42 | //! ``` 43 | 44 | pub use crate::wide::Int4; 45 | 46 | use crate::{NUM_DIMENSIONS, NUM_DIMENSION_SETS_4D, REV_VECTORS}; 47 | 48 | /// Compute one dimension of a single sample in the Sobol sequence. 49 | #[inline] 50 | pub fn sobol_rev(sample_index_rev: u32, dimension: u32) -> u32 { 51 | assert!(dimension < NUM_DIMENSIONS); 52 | 53 | // The direction vectors are organized for SIMD, so we 54 | // need to access them this way. 55 | let dimension_set = (dimension >> 2) as usize; 56 | let sub_dimension = (dimension & 0b11) as usize; 57 | 58 | // Compute the Sobol sample with reversed bits. 59 | let vecs = &REV_VECTORS[dimension_set]; 60 | let mut sobol = 0u32; 61 | let mut index = sample_index_rev & 0xffff0000; // Only use the top 16 bits. 62 | let mut i = 0; 63 | while index != 0 { 64 | let j = index.leading_zeros(); 65 | // Note: using `get_unchecked()` here instead gives about a 3% 66 | // performance boost. I'm opting to leave that on the table for now, 67 | // for the sake of keeping the main code entirely safe. 68 | sobol ^= vecs[(i + j) as usize][sub_dimension]; 69 | i += j + 1; 70 | index <<= j; 71 | index <<= 1; 72 | } 73 | 74 | sobol 75 | } 76 | 77 | /// Same as [`sobol_rev()`] except returns four dimensions at once. 78 | /// 79 | /// **Note:** `dimension_set` indexes into sets of four dimensions: 80 | /// 81 | /// * `0` -> `[dim0, dim1, dim2, dim3]` 82 | /// * `1` -> `[dim4, dim5, dim6, dim7]` 83 | /// * etc. 
84 | #[inline] 85 | pub fn sobol_int4_rev(sample_index_rev: u32, dimension_set: u32) -> Int4 { 86 | assert!(dimension_set < NUM_DIMENSION_SETS_4D); 87 | 88 | // Compute the Sobol sample with reversed bits. 89 | let vecs = &REV_VECTORS[dimension_set as usize]; 90 | let mut sobol = Int4::zero(); 91 | let mut index = sample_index_rev & 0xffff0000; // Only use the top 16 bits. 92 | let mut i = 0; 93 | while index != 0 { 94 | let j = index.leading_zeros(); 95 | // Note: using `get_unchecked()` here instead gives about a 3% 96 | // performance boost. I'm opting to leave that on the table for now, 97 | // for the sake of keeping the main code entirely safe. 98 | sobol ^= vecs[(i + j) as usize].into(); 99 | i += j + 1; 100 | index <<= j; 101 | index <<= 1; 102 | } 103 | 104 | sobol 105 | } 106 | 107 | /// Scramble `n` using a hash function that closely approximates a 108 | /// reverse-bit Owen scramble. 109 | /// 110 | /// Passing a different random `scramble` parameter results in a different 111 | /// random Owen scramble. 112 | /// 113 | /// Uses the hash function from 114 | /// 115 | /// 116 | /// **IMPORTANT:** `scramble` must already be well randomized! For 117 | /// example, incrementing integers will not work. In general, you should 118 | /// either: 119 | /// 120 | /// * Get `scramble` from a random source, or 121 | /// * First pass `scramble` through a hash function like [`hash_u32()`] 122 | /// to randomize it before passing it to this function. 123 | #[inline(always)] 124 | pub fn owen_scramble_rev(mut n_rev: u32, scramble: u32) -> u32 { 125 | n_rev ^= n_rev.wrapping_mul(0x3d20adea); 126 | n_rev = n_rev.wrapping_add(scramble); 127 | n_rev = n_rev.wrapping_mul((scramble >> 16) | 1); 128 | n_rev ^= n_rev.wrapping_mul(0x05526c56); 129 | n_rev ^= n_rev.wrapping_mul(0x53a22864); 130 | 131 | n_rev 132 | } 133 | 134 | /// Same as [`owen_scramble_rev()`], except on 4 integers at a time. 
135 | /// 136 | /// You can (and probably should) put a different random scramble value 137 | /// in each lane of `scramble` to scramble each lane differently. 138 | #[inline(always)] 139 | pub fn owen_scramble_int4_rev(mut n_rev: Int4, scramble: Int4) -> Int4 { 140 | n_rev ^= n_rev * [0x3d20adea; 4].into(); 141 | n_rev += scramble; 142 | n_rev *= (scramble >> 16) | [1; 4].into(); 143 | n_rev ^= n_rev * [0x05526c56; 4].into(); 144 | n_rev ^= n_rev * [0x53a22864; 4].into(); 145 | 146 | n_rev 147 | } 148 | 149 | /// A fast 32-bit hash function. 150 | /// 151 | /// From 152 | #[inline(always)] 153 | pub fn hash(mut n: u32) -> u32 { 154 | n ^= 0xe6fe3beb; // So zero doesn't map to zero. 155 | 156 | n ^= n >> 16; 157 | n = n.wrapping_mul(0x7feb352d); 158 | n ^= n >> 15; 159 | n = n.wrapping_mul(0x846ca68b); 160 | n ^= n >> 16; 161 | 162 | n 163 | } 164 | 165 | /// Same as [`hash_u32()`] except on four numbers at once. 166 | #[inline(always)] 167 | pub fn hash_int4(mut n: Int4) -> Int4 { 168 | n ^= [0xe6fe3beb; 4].into(); // So zero doesn't map to zero. 169 | 170 | n ^= n >> 16; 171 | n *= [0x7feb352d; 4].into(); 172 | n ^= n >> 15; 173 | n *= [0x846ca68b; 4].into(); 174 | n ^= n >> 16; 175 | 176 | n 177 | } 178 | 179 | /// Convert a `u32` to a float in [0.0, 1.0). 180 | /// 181 | /// This maps the full range of `u32` to the [0, 1) range. 182 | #[inline(always)] 183 | pub fn u32_to_f32_norm(n: u32) -> f32 { 184 | f32::from_bits((n >> 9) | 0x3f800000) - 1.0 185 | } 186 | 187 | #[cfg(test)] 188 | mod tests { 189 | use super::*; 190 | 191 | #[test] 192 | pub fn to_norm_f32() { 193 | assert_eq!(u32_to_f32_norm(0), 0.0); 194 | assert!(u32_to_f32_norm(core::u32::MAX) < 1.0); 195 | } 196 | } 197 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! A seedable Owen-scrambled Sobol sequence. 2 | //! 3 | //! 
This crate is based on the paper [Practical Hash-based Owen 4 | //! Scrambling](http://www.jcgt.org/published/0009/04/01/) by Brent Burley, 5 | //! but with an improved hash from [Building a Better LK 6 | //! Hash](https://psychopath.io/post/2021_01_30_building_a_better_lk_hash) 7 | //! and more dimensions due to 8 | //! [Kuo et al.](http://web.maths.unsw.edu.au/~fkuo/sobol/) 9 | //! 10 | //! This crate is geared towards practical graphics applications, and 11 | //! as such has some limitations: 12 | //! 13 | //! * The maximum sequence length is 2^16. 14 | //! * The maximum number of dimensions is 256 (although this can be worked 15 | //! around with seeding). 16 | //! * Only `f32` output is supported. 17 | //! 18 | //! These are all trade-offs for the sake of better performance and a smaller 19 | //! memory footprint. 20 | //! 21 | //! 22 | //! ## Basic usage 23 | //! 24 | //! Basic usage is pretty straightforward: 25 | //! 26 | //! ```rust 27 | //! use sobol_burley::sample; 28 | //! 29 | //! // Print 1024 3-dimensional points. 30 | //! for i in 0..1024 { 31 | //! let x = sample(i, 0, 0); 32 | //! let y = sample(i, 1, 0); 33 | //! let z = sample(i, 2, 0); 34 | //! println!("({}, {}, {})", x, y, z); 35 | //! } 36 | //! ``` 37 | //! 38 | //! The first parameter of `sample()` is the index of the sample you want, 39 | //! and the second parameter is the index of the dimension you want. The 40 | //! parameters are zero-indexed, and outputs are in the interval [0, 1). 41 | //! 42 | //! If all you want is a single Owen-scrambled Sobol sequence, then this is 43 | //! all you need. You can ignore the third parameter. 44 | //! 45 | //! 46 | //! ## Seeding 47 | //! 48 | //! *(Note: the `sample()` function automatically uses a different Owen 49 | //! scramble for each dimension, so seeding is unnecessary if you just want 50 | //! a single Sobol sequence.)* 51 | //! 52 | //! The third parameter of `sample()` is a seed that produces statistically 53 | //! 
independent Sobol sequences via the scrambling+shuffling technique from 54 | //! Brent Burley's paper. 55 | //! 56 | //! One of the applications for this is to decorrelate the error between 57 | //! related integral estimates. For example, in a 3d renderer you might 58 | //! pass a different seed to each pixel so that error in the pixel colors 59 | //! shows up as noise instead of as structured artifacts. 60 | //! 61 | //! Another important application is "padding" the dimensions of a Sobol 62 | //! sequence. By changing the seed we can re-use the same dimensions over 63 | //! and over to create an arbitrarily high-dimensional sequence. For example: 64 | //! 65 | //! ```rust 66 | //! # use sobol_burley::sample; 67 | //! // Print 10000 dimensions of a single sample. 68 | //! for dimension in 0..10000 { 69 | //! let seed = dimension / 4; 70 | //! let n = sample(0, dimension % 4, seed); 71 | //! println!("{}", n); 72 | //! } 73 | //!``` 74 | //! 75 | //! In this example we change seeds every 4 dimensions. This allows us to 76 | //! re-use the same 4 dimensions over and over, extending the sequence to as 77 | //! many dimensions as we like. Each set of 4 dimensions is stratified within 78 | //! itself, but is randomly decorrelated from the other sets. 79 | //! 80 | //! See Burley's paper for justification of this padding approach as well as 81 | //! recommendations about its use. 82 | //! 83 | //! 84 | //! # SIMD 85 | //! 86 | //! You can use `sample_4d()` to compute four dimensions at once, returned as 87 | //! an array of floats. 88 | //! 89 | //! On x86-64 architectures `sample_4d()` utilizes SIMD for a roughly 4x 90 | //! speed-up. On other architectures it still computes correct results, but 91 | //! SIMD isn't supported yet. 92 | //! 93 | //! Importantly, `sample()` and `sample_4d()` always compute identical results: 94 | //! 95 | //! ```rust 96 | //! # use sobol_burley::{sample, sample_4d}; 97 | //! for dimension_set in 0..10 { 98 | //! let a = [ 99 | //! 
sample(0, dimension_set * 4, 0), 100 | //! sample(0, dimension_set * 4 + 1, 0), 101 | //! sample(0, dimension_set * 4 + 2, 0), 102 | //! sample(0, dimension_set * 4 + 3, 0) 103 | //! ]; 104 | //! let b = sample_4d(0, dimension_set, 0); 105 | //! 106 | //! assert_eq!(a, b); 107 | //! } 108 | //! ``` 109 | //! 110 | //! The difference is only in performance and how the dimensions are indexed. 111 | 112 | #![no_std] 113 | #![allow(clippy::unreadable_literal)] 114 | 115 | pub mod parts; 116 | mod wide; 117 | 118 | // This `include` provides `NUM_DIMENSIONS` and `REV_VECTORS`. 119 | // See the build.rs file for how this included file is generated. 120 | include!(concat!(env!("OUT_DIR"), "/vectors.inc")); 121 | 122 | /// The number of available 4d dimension sets. 123 | /// 124 | /// This is just `NUM_DIMENSIONS / 4`, for convenience. 125 | pub const NUM_DIMENSION_SETS_4D: u32 = NUM_DIMENSIONS / 4; 126 | 127 | /// Compute one dimension of a single sample in the Sobol sequence. 128 | /// 129 | /// `sample_index` specifies which sample in the Sobol sequence to compute. 130 | /// A maximum of 2^16 samples is supported. 131 | /// 132 | /// `dimension` specifies which dimension to compute. 133 | /// 134 | /// `seed` produces statistically independent Sobol sequences. Passing two 135 | /// different seeds will produce two different sequences that are only randomly 136 | /// associated, with no stratification or correlation between them. 137 | /// 138 | /// Returns a number in the interval [0, 1). 139 | /// 140 | /// # Panics 141 | /// 142 | /// * Panics if `dimension` is greater than or equal to [`NUM_DIMENSIONS`]. 143 | /// * In debug, panics if `sample_index` is greater than or equal to 2^16. 144 | /// In release, returns unspecified floats in the interval [0, 1). 
145 | #[inline] 146 | pub fn sample(sample_index: u32, dimension: u32, seed: u32) -> f32 { 147 | use parts::*; 148 | debug_assert!(sample_index < (1 << 16)); 149 | 150 | // Shuffle the index using the given seed to produce a unique statistically 151 | // independent Sobol sequence. 152 | let shuffled_rev_index = 153 | owen_scramble_rev(sample_index.reverse_bits(), hash(seed ^ 0x79c68e4a)); 154 | 155 | let sobol = sobol_rev(shuffled_rev_index, dimension); 156 | 157 | // Compute the scramble value for doing Owen scrambling. 158 | // The multiply on `seed` is to avoid accidental cancellation 159 | // with `dimension` on an incrementing or otherwise structured 160 | // seed. 161 | let scramble = { 162 | let seed = seed.wrapping_mul(0x9c8f2d3b); 163 | let ds = dimension >> 2; 164 | ds ^ seed ^ [0x912f69ba, 0x174f18ab, 0x691e72ca, 0xb40cc1b8][dimension as usize & 0b11] 165 | }; 166 | 167 | let sobol_owen_rev = owen_scramble_rev(sobol, hash(scramble)); 168 | 169 | u32_to_f32_norm(sobol_owen_rev.reverse_bits()) 170 | } 171 | 172 | /// Compute four dimensions of a single sample in the Sobol sequence. 173 | /// 174 | /// This is identical to [`sample()`], but computes four dimensions at once. 175 | /// On x86-64 architectures it utilizes SIMD for a roughly 4x speed-up. 176 | /// On other architectures it still computes correct results, but doesn't 177 | /// utilize SIMD. 178 | /// 179 | /// `dimension_set` specifies which four dimensions to compute. `0` yields the 180 | /// first four dimensions, `1` the second four dimensions, and so on. 181 | /// 182 | /// # Panics 183 | /// 184 | /// * Panics if `dimension_set` is greater than or equal to 185 | /// [`NUM_DIMENSION_SETS_4D`]. 186 | /// * In debug, panics if `sample_index` is greater than or equal to 2^16. 187 | /// In release, returns unspecified floats in the interval [0, 1). 
188 | #[inline] 189 | pub fn sample_4d(sample_index: u32, dimension_set: u32, seed: u32) -> [f32; 4] { 190 | use parts::*; 191 | debug_assert!(sample_index < (1 << 16)); 192 | 193 | // Shuffle the index using the given seed to produce a unique statistically 194 | // independent Sobol sequence. 195 | let shuffled_rev_index = 196 | owen_scramble_rev(sample_index.reverse_bits(), hash(seed ^ 0x79c68e4a)); 197 | 198 | let sobol = sobol_int4_rev(shuffled_rev_index, dimension_set); 199 | 200 | // Compute the scramble values for doing Owen scrambling. 201 | // The multiply on `seed` is to avoid accidental cancellation 202 | // with `dimension` on an incrementing or otherwise structured 203 | // seed. 204 | let scramble = { 205 | let seed: Int4 = [seed.wrapping_mul(0x9c8f2d3b); 4].into(); 206 | let ds: Int4 = [dimension_set; 4].into(); 207 | seed ^ ds ^ [0x912f69ba, 0x174f18ab, 0x691e72ca, 0xb40cc1b8].into() 208 | }; 209 | 210 | let sobol_owen_rev = owen_scramble_int4_rev(sobol, hash_int4(scramble)); 211 | 212 | // Un-reverse the bits and convert to floating point in [0, 1). 
213 | sobol_owen_rev.reverse_bits().to_f32_norm() 214 | } 215 | 216 | //---------------------------------------------------------------- 217 | 218 | #[cfg(test)] 219 | mod tests { 220 | use super::*; 221 | 222 | #[test] 223 | fn check_1d_and_4d_match() { 224 | for s in 0..4 { 225 | for d in 0..8 { 226 | for n in 0..256 { 227 | let a1 = sample(n, d * 4, s); 228 | let b1 = sample(n, d * 4 + 1, s); 229 | let c1 = sample(n, d * 4 + 2, s); 230 | let d1 = sample(n, d * 4 + 3, s); 231 | 232 | let [a2, b2, c2, d2] = sample_4d(n, d, s); 233 | 234 | assert_eq!(a1, a2); 235 | assert_eq!(b1, b2); 236 | assert_eq!(c1, c2); 237 | assert_eq!(d1, d2); 238 | } 239 | } 240 | } 241 | } 242 | } 243 | -------------------------------------------------------------------------------- /licenses/APACHE-2.0.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. 
You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. 
(Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /direction_numbers/new-joe-kuo-6.256.txt: -------------------------------------------------------------------------------- 1 | d s a m_i 2 | 2 1 0 1 3 | 3 2 1 1 3 4 | 4 3 1 1 3 1 5 | 5 3 2 1 1 1 6 | 6 4 1 1 1 3 3 7 | 7 4 4 1 3 5 13 8 | 8 5 2 1 1 5 5 17 9 | 9 5 4 1 1 5 5 5 10 | 10 5 7 1 1 7 11 19 11 | 11 5 11 1 1 5 1 1 12 | 12 5 13 1 1 1 3 11 13 | 13 5 14 1 3 5 5 31 14 | 14 6 1 1 3 3 9 7 49 15 | 15 6 13 1 1 1 15 21 21 16 | 16 6 16 1 3 1 13 27 49 17 | 17 6 19 1 1 1 15 7 5 18 | 18 6 22 1 3 1 15 13 25 19 | 19 6 25 1 1 5 5 19 61 20 | 20 7 1 1 3 7 11 23 15 103 21 | 21 7 4 1 3 7 13 13 15 69 22 | 22 7 7 1 1 3 13 7 35 63 23 | 23 7 8 1 3 5 9 1 25 53 24 | 24 7 14 1 3 1 13 9 35 107 25 | 25 7 19 1 3 1 5 27 61 31 26 | 26 7 21 1 1 5 11 19 41 61 27 | 27 7 28 1 3 5 3 3 13 69 28 | 28 7 31 1 1 7 13 1 19 1 29 | 29 7 32 1 3 7 5 13 19 59 30 | 30 7 37 1 1 3 9 25 29 41 31 | 31 7 41 1 3 5 13 23 1 55 32 | 32 7 42 1 3 7 3 13 59 17 33 | 33 7 50 1 3 
1 3 5 53 69 34 | 34 7 55 1 1 5 5 23 33 13 35 | 35 7 56 1 1 7 7 1 61 123 36 | 36 7 59 1 1 7 9 13 61 49 37 | 37 7 62 1 3 3 5 3 55 33 38 | 38 8 14 1 3 1 15 31 13 49 245 39 | 39 8 21 1 3 5 15 31 59 63 97 40 | 40 8 22 1 3 1 11 11 11 77 249 41 | 41 8 38 1 3 1 11 27 43 71 9 42 | 42 8 47 1 1 7 15 21 11 81 45 43 | 43 8 49 1 3 7 3 25 31 65 79 44 | 44 8 50 1 3 1 1 19 11 3 205 45 | 45 8 52 1 1 5 9 19 21 29 157 46 | 46 8 56 1 3 7 11 1 33 89 185 47 | 47 8 67 1 3 3 3 15 9 79 71 48 | 48 8 70 1 3 7 11 15 39 119 27 49 | 49 8 84 1 1 3 1 11 31 97 225 50 | 50 8 97 1 1 1 3 23 43 57 177 51 | 51 8 103 1 3 7 7 17 17 37 71 52 | 52 8 115 1 3 1 5 27 63 123 213 53 | 53 8 122 1 1 3 5 11 43 53 133 54 | 54 9 8 1 3 5 5 29 17 47 173 479 55 | 55 9 13 1 3 3 11 3 1 109 9 69 56 | 56 9 16 1 1 1 5 17 39 23 5 343 57 | 57 9 22 1 3 1 5 25 15 31 103 499 58 | 58 9 25 1 1 1 11 11 17 63 105 183 59 | 59 9 44 1 1 5 11 9 29 97 231 363 60 | 60 9 47 1 1 5 15 19 45 41 7 383 61 | 61 9 52 1 3 7 7 31 19 83 137 221 62 | 62 9 55 1 1 1 3 23 15 111 223 83 63 | 63 9 59 1 1 5 13 31 15 55 25 161 64 | 64 9 62 1 1 3 13 25 47 39 87 257 65 | 65 9 67 1 1 1 11 21 53 125 249 293 66 | 66 9 74 1 1 7 11 11 7 57 79 323 67 | 67 9 81 1 1 5 5 17 13 81 3 131 68 | 68 9 82 1 1 7 13 23 7 65 251 475 69 | 69 9 87 1 3 5 1 9 43 3 149 11 70 | 70 9 91 1 1 3 13 31 13 13 255 487 71 | 71 9 94 1 3 3 1 5 63 89 91 127 72 | 72 9 103 1 1 3 3 1 19 123 127 237 73 | 73 9 104 1 1 5 7 23 31 37 243 289 74 | 74 9 109 1 1 5 11 17 53 117 183 491 75 | 75 9 122 1 1 1 5 1 13 13 209 345 76 | 76 9 124 1 1 3 15 1 57 115 7 33 77 | 77 9 137 1 3 1 11 7 43 81 207 175 78 | 78 9 138 1 3 1 1 15 27 63 255 49 79 | 79 9 143 1 3 5 3 27 61 105 171 305 80 | 80 9 145 1 1 5 3 1 3 57 249 149 81 | 81 9 152 1 1 3 5 5 57 15 13 159 82 | 82 9 157 1 1 1 11 7 11 105 141 225 83 | 83 9 167 1 3 3 5 27 59 121 101 271 84 | 84 9 173 1 3 5 9 11 49 51 59 115 85 | 85 9 176 1 1 7 1 23 45 125 71 419 86 | 86 9 181 1 1 3 5 23 5 105 109 75 87 | 87 9 182 1 1 7 15 7 11 67 121 453 88 | 88 9 185 1 3 7 3 9 13 31 
27 449 89 | 89 9 191 1 3 1 15 19 39 39 89 15 90 | 90 9 194 1 1 1 1 1 33 73 145 379 91 | 91 9 199 1 3 1 15 15 43 29 13 483 92 | 92 9 218 1 1 7 3 19 27 85 131 431 93 | 93 9 220 1 3 3 3 5 35 23 195 349 94 | 94 9 227 1 3 3 7 9 27 39 59 297 95 | 95 9 229 1 1 3 9 11 17 13 241 157 96 | 96 9 230 1 3 7 15 25 57 33 189 213 97 | 97 9 234 1 1 7 1 9 55 73 83 217 98 | 98 9 236 1 3 3 13 19 27 23 113 249 99 | 99 9 241 1 3 5 3 23 43 3 253 479 100 | 100 9 244 1 1 5 5 11 5 45 117 217 101 | 101 9 253 1 3 3 7 29 37 33 123 147 102 | 102 10 4 1 3 1 15 5 5 37 227 223 459 103 | 103 10 13 1 1 7 5 5 39 63 255 135 487 104 | 104 10 19 1 3 1 7 9 7 87 249 217 599 105 | 105 10 22 1 1 3 13 9 47 7 225 363 247 106 | 106 10 50 1 3 7 13 19 13 9 67 9 737 107 | 107 10 55 1 3 5 5 19 59 7 41 319 677 108 | 108 10 64 1 1 5 3 31 63 15 43 207 789 109 | 109 10 69 1 1 7 9 13 39 3 47 497 169 110 | 110 10 98 1 3 1 7 21 17 97 19 415 905 111 | 111 10 107 1 3 7 1 3 31 71 111 165 127 112 | 112 10 115 1 1 5 11 1 61 83 119 203 847 113 | 113 10 121 1 3 3 13 9 61 19 97 47 35 114 | 114 10 127 1 1 7 7 15 29 63 95 417 469 115 | 115 10 134 1 3 1 9 25 9 71 57 213 385 116 | 116 10 140 1 3 5 13 31 47 101 57 39 341 117 | 117 10 145 1 1 3 3 31 57 125 173 365 551 118 | 118 10 152 1 3 7 1 13 57 67 157 451 707 119 | 119 10 158 1 1 1 7 21 13 105 89 429 965 120 | 120 10 161 1 1 5 9 17 51 45 119 157 141 121 | 121 10 171 1 3 7 7 13 45 91 9 129 741 122 | 122 10 181 1 3 7 1 23 57 67 141 151 571 123 | 123 10 194 1 1 3 11 17 47 93 107 375 157 124 | 124 10 199 1 3 3 5 11 21 43 51 169 915 125 | 125 10 203 1 1 5 3 15 55 101 67 455 625 126 | 126 10 208 1 3 5 9 1 23 29 47 345 595 127 | 127 10 227 1 3 7 7 5 49 29 155 323 589 128 | 128 10 242 1 3 3 7 5 41 127 61 261 717 129 | 129 10 251 1 3 7 7 17 23 117 67 129 1009 130 | 130 10 253 1 1 3 13 11 39 21 207 123 305 131 | 131 10 265 1 1 3 9 29 3 95 47 231 73 132 | 132 10 266 1 3 1 9 1 29 117 21 441 259 133 | 133 10 274 1 3 1 13 21 39 125 211 439 723 134 | 134 10 283 1 1 7 3 17 63 115 89 49 773 135 | 
135 10 289 1 3 7 13 11 33 101 107 63 73 136 | 136 10 295 1 1 5 5 13 57 63 135 437 177 137 | 137 10 301 1 1 3 7 27 63 93 47 417 483 138 | 138 10 316 1 1 3 1 23 29 1 191 49 23 139 | 139 10 319 1 1 3 15 25 55 9 101 219 607 140 | 140 10 324 1 3 1 7 7 19 51 251 393 307 141 | 141 10 346 1 3 3 3 25 55 17 75 337 3 142 | 142 10 352 1 1 1 13 25 17 65 45 479 413 143 | 143 10 361 1 1 7 7 27 49 99 161 213 727 144 | 144 10 367 1 3 5 1 23 5 43 41 251 857 145 | 145 10 382 1 3 3 7 11 61 39 87 383 835 146 | 146 10 395 1 1 3 15 13 7 29 7 505 923 147 | 147 10 398 1 3 7 1 5 31 47 157 445 501 148 | 148 10 400 1 1 3 7 1 43 9 147 115 605 149 | 149 10 412 1 3 3 13 5 1 119 211 455 1001 150 | 150 10 419 1 1 3 5 13 19 3 243 75 843 151 | 151 10 422 1 3 7 7 1 19 91 249 357 589 152 | 152 10 426 1 1 1 9 1 25 109 197 279 411 153 | 153 10 428 1 3 1 15 23 57 59 135 191 75 154 | 154 10 433 1 1 5 15 29 21 39 253 383 349 155 | 155 10 446 1 3 3 5 19 45 61 151 199 981 156 | 156 10 454 1 3 5 13 9 61 107 141 141 1 157 | 157 10 457 1 3 1 11 27 25 85 105 309 979 158 | 158 10 472 1 3 3 11 19 7 115 223 349 43 159 | 159 10 493 1 1 7 9 21 39 123 21 275 927 160 | 160 10 505 1 1 7 13 15 41 47 243 303 437 161 | 161 10 508 1 1 1 7 7 3 15 99 409 719 162 | 162 11 2 1 3 3 15 27 49 113 123 113 67 469 163 | 163 11 11 1 3 7 11 3 23 87 169 119 483 199 164 | 164 11 21 1 1 5 15 7 17 109 229 179 213 741 165 | 165 11 22 1 1 5 13 11 17 25 135 403 557 1433 166 | 166 11 35 1 3 1 1 1 61 67 215 189 945 1243 167 | 167 11 49 1 1 7 13 17 33 9 221 429 217 1679 168 | 168 11 50 1 1 3 11 27 3 15 93 93 865 1049 169 | 169 11 56 1 3 7 7 25 41 121 35 373 379 1547 170 | 170 11 61 1 3 3 9 11 35 45 205 241 9 59 171 | 171 11 70 1 3 1 7 3 51 7 177 53 975 89 172 | 172 11 74 1 1 3 5 27 1 113 231 299 759 861 173 | 173 11 79 1 3 3 15 25 29 5 255 139 891 2031 174 | 174 11 84 1 3 1 1 13 9 109 193 419 95 17 175 | 175 11 88 1 1 7 9 3 7 29 41 135 839 867 176 | 176 11 103 1 1 7 9 25 49 123 217 113 909 215 177 | 177 11 104 1 1 7 3 23 15 43 133 217 327 901 
178 | 178 11 112 1 1 3 3 13 53 63 123 477 711 1387 179 | 179 11 115 1 1 3 15 7 29 75 119 181 957 247 180 | 180 11 117 1 1 1 11 27 25 109 151 267 99 1461 181 | 181 11 122 1 3 7 15 5 5 53 145 11 725 1501 182 | 182 11 134 1 3 7 1 9 43 71 229 157 607 1835 183 | 183 11 137 1 3 3 13 25 1 5 27 471 349 127 184 | 184 11 146 1 1 1 1 23 37 9 221 269 897 1685 185 | 185 11 148 1 1 3 3 31 29 51 19 311 553 1969 186 | 186 11 157 1 3 7 5 5 55 17 39 475 671 1529 187 | 187 11 158 1 1 7 1 1 35 47 27 437 395 1635 188 | 188 11 162 1 1 7 3 13 23 43 135 327 139 389 189 | 189 11 164 1 3 7 3 9 25 91 25 429 219 513 190 | 190 11 168 1 1 3 5 13 29 119 201 277 157 2043 191 | 191 11 173 1 3 5 3 29 57 13 17 167 739 1031 192 | 192 11 185 1 3 3 5 29 21 95 27 255 679 1531 193 | 193 11 186 1 3 7 15 9 5 21 71 61 961 1201 194 | 194 11 191 1 3 5 13 15 57 33 93 459 867 223 195 | 195 11 193 1 1 1 15 17 43 127 191 67 177 1073 196 | 196 11 199 1 1 1 15 23 7 21 199 75 293 1611 197 | 197 11 213 1 3 7 13 15 39 21 149 65 741 319 198 | 198 11 214 1 3 7 11 23 13 101 89 277 519 711 199 | 199 11 220 1 3 7 15 19 27 85 203 441 97 1895 200 | 200 11 227 1 3 1 3 29 25 21 155 11 191 197 201 | 201 11 236 1 1 7 5 27 11 81 101 457 675 1687 202 | 202 11 242 1 3 1 5 25 5 65 193 41 567 781 203 | 203 11 251 1 3 1 5 11 15 113 77 411 695 1111 204 | 204 11 256 1 1 3 9 11 53 119 171 55 297 509 205 | 205 11 259 1 1 1 1 11 39 113 139 165 347 595 206 | 206 11 265 1 3 7 11 9 17 101 13 81 325 1733 207 | 207 11 266 1 3 1 1 21 43 115 9 113 907 645 208 | 208 11 276 1 1 7 3 9 25 117 197 159 471 475 209 | 209 11 292 1 3 1 9 11 21 57 207 485 613 1661 210 | 210 11 304 1 1 7 7 27 55 49 223 89 85 1523 211 | 211 11 310 1 1 5 3 19 41 45 51 447 299 1355 212 | 212 11 316 1 3 1 13 1 33 117 143 313 187 1073 213 | 213 11 319 1 1 7 7 5 11 65 97 377 377 1501 214 | 214 11 322 1 3 1 1 21 35 95 65 99 23 1239 215 | 215 11 328 1 1 5 9 3 37 95 167 115 425 867 216 | 216 11 334 1 3 3 13 1 37 27 189 81 679 773 217 | 217 11 339 1 1 3 11 1 61 99 233 429 969 49 218 
| 218 11 341 1 1 1 7 25 63 99 165 245 793 1143 219 | 219 11 345 1 1 5 11 11 43 55 65 71 283 273 220 | 220 11 346 1 1 5 5 9 3 101 251 355 379 1611 221 | 221 11 362 1 1 1 15 21 63 85 99 49 749 1335 222 | 222 11 367 1 1 5 13 27 9 121 43 255 715 289 223 | 223 11 372 1 3 1 5 27 19 17 223 77 571 1415 224 | 224 11 375 1 1 5 3 13 59 125 251 195 551 1737 225 | 225 11 376 1 3 3 15 13 27 49 105 389 971 755 226 | 226 11 381 1 3 5 15 23 43 35 107 447 763 253 227 | 227 11 385 1 3 5 11 21 3 17 39 497 407 611 228 | 228 11 388 1 1 7 13 15 31 113 17 23 507 1995 229 | 229 11 392 1 1 7 15 3 15 31 153 423 79 503 230 | 230 11 409 1 1 7 9 19 25 23 171 505 923 1989 231 | 231 11 415 1 1 5 9 21 27 121 223 133 87 697 232 | 232 11 416 1 1 5 5 9 19 107 99 319 765 1461 233 | 233 11 421 1 1 3 3 19 25 3 101 171 729 187 234 | 234 11 428 1 1 3 1 13 23 85 93 291 209 37 235 | 235 11 431 1 1 1 15 25 25 77 253 333 947 1073 236 | 236 11 434 1 1 3 9 17 29 55 47 255 305 2037 237 | 237 11 439 1 3 3 9 29 63 9 103 489 939 1523 238 | 238 11 446 1 3 7 15 7 31 89 175 369 339 595 239 | 239 11 451 1 3 7 13 25 5 71 207 251 367 665 240 | 240 11 453 1 3 3 3 21 25 75 35 31 321 1603 241 | 241 11 457 1 1 1 9 11 1 65 5 11 329 535 242 | 242 11 458 1 1 5 3 19 13 17 43 379 485 383 243 | 243 11 471 1 3 5 13 13 9 85 147 489 787 1133 244 | 244 11 475 1 3 1 1 5 51 37 129 195 297 1783 245 | 245 11 478 1 1 3 15 19 57 59 181 455 697 2033 246 | 246 11 484 1 3 7 1 27 9 65 145 325 189 201 247 | 247 11 493 1 3 1 15 31 23 19 5 485 581 539 248 | 248 11 494 1 1 7 13 11 15 65 83 185 847 831 249 | 249 11 499 1 3 5 7 7 55 73 15 303 511 1905 250 | 250 11 502 1 3 5 9 7 21 45 15 397 385 597 251 | 251 11 517 1 3 7 3 23 13 73 221 511 883 1265 252 | 252 11 518 1 1 3 11 1 51 73 185 33 975 1441 253 | 253 11 524 1 3 3 9 19 59 21 39 339 37 143 254 | 254 11 527 1 1 7 1 31 33 19 167 117 635 639 255 | 255 11 555 1 1 1 3 5 13 59 83 355 349 1967 256 | 256 11 560 1 1 1 5 19 3 53 133 97 863 983 257 | 
// File: /src/wide.rs
//
// A four-lane `u32` vector type, `Int4`, with two interchangeable back-ends:
// an SSE implementation for x86-64 (when the `simd` feature is enabled) and a
// portable scalar fallback for everything else.  Both back-ends expose the
// same API and are intended to produce identical results.

//--------------------------------------------------------------------------
// x86/64 SSE
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
pub(crate) mod sse {
    use core::arch::x86_64::{
        __m128i, _mm_add_epi32, _mm_and_si128, _mm_or_si128, _mm_set1_epi32, _mm_set1_ps,
        _mm_set_epi32, _mm_setzero_si128, _mm_sll_epi32, _mm_slli_epi32, _mm_srl_epi32,
        _mm_srli_epi32, _mm_sub_epi32, _mm_sub_ps, _mm_xor_si128,
    };

    // NOTE on `unsafe` in this module: every intrinsic used outside of the
    // `sse4.1`-gated multiply is an SSE/SSE2 instruction, and SSE2 is part of
    // the x86-64 baseline, so the instructions are always available on the
    // targets this module is compiled for.

    /// A packed set of four `u32`s.
    ///
    /// Addition, subtraction, and multiplication are all wrapping.
    ///
    /// Uses SIMD for computation on supported platforms.
    #[derive(Debug, Copy, Clone)]
    pub struct Int4 {
        v: __m128i,
    }

    impl Int4 {
        /// Returns an `Int4` with all four lanes set to zero.
        #[inline(always)]
        pub(crate) fn zero() -> Int4 {
            Int4 {
                // SAFETY: SSE2 is baseline on x86-64 (see module note).
                v: unsafe { _mm_setzero_si128() },
            }
        }

        /// For testing.
        ///
        /// Returns lane `i`; panics if `i >= 4`.
        #[allow(dead_code)]
        fn get(self, i: usize) -> u32 {
            // SAFETY: `__m128i` and `[u32; 4]` are both 16 bytes and every
            // bit pattern is valid for both.
            let n: [u32; 4] = unsafe { core::mem::transmute(self.v) };
            n[i]
        }

        /// Convert each integer to a float in [0.0, 1.0).
        ///
        /// Same behavior as
        /// [`parts::u32_to_f32_norm()`](`crate::parts::u32_to_f32_norm()`),
        /// applied to each integer individually.
        #[inline(always)]
        pub fn to_f32_norm(self) -> [f32; 4] {
            // SAFETY: SSE2 is baseline on x86-64; the transmutes are between
            // 16-byte plain-data vector/array types.
            unsafe {
                // Keep the top 23 bits as the mantissa and OR in the bit
                // pattern of 1.0f32, giving a float in [1.0, 2.0)...
                let a = _mm_srli_epi32(self.v, 9);
                let b = _mm_or_si128(a, _mm_set1_epi32(0x3f800000u32 as i32));
                // ...then subtract 1.0 to land in [0.0, 1.0).
                let n4 = _mm_sub_ps(core::mem::transmute(b), _mm_set1_ps(1.0));
                core::mem::transmute(n4)
            }
        }

        /// Reverse the order of the bits in each integer.
        ///
        /// Same behavior as `reverse_bits()` in the Rust standard
        /// library, applied to each integer individually.
        #[inline]
        pub fn reverse_bits(self) -> Int4 {
            let mut n = self.v;
            // SAFETY: SSE2 is baseline on x86-64 (see module note).
            unsafe {
                // From http://aggregate.org/MAGIC/#Bit%20Reversal but SIMD
                // on four numbers at once.

                // Swap adjacent bits.
                let y0 = _mm_set1_epi32(0x55555555u32 as i32);
                n = _mm_or_si128(
                    _mm_and_si128(_mm_srli_epi32(n, 1), y0),
                    _mm_slli_epi32(_mm_and_si128(n, y0), 1),
                );

                // Swap adjacent bit pairs.
                let y1 = _mm_set1_epi32(0x33333333u32 as i32);
                n = _mm_or_si128(
                    _mm_and_si128(_mm_srli_epi32(n, 2), y1),
                    _mm_slli_epi32(_mm_and_si128(n, y1), 2),
                );

                // Swap adjacent nibbles.
                let y2 = _mm_set1_epi32(0x0f0f0f0fu32 as i32);
                n = _mm_or_si128(
                    _mm_and_si128(_mm_srli_epi32(n, 4), y2),
                    _mm_slli_epi32(_mm_and_si128(n, y2), 4),
                );

                // Swap adjacent bytes.
                let y3 = _mm_set1_epi32(0x00ff00ffu32 as i32);
                n = _mm_or_si128(
                    _mm_and_si128(_mm_srli_epi32(n, 8), y3),
                    _mm_slli_epi32(_mm_and_si128(n, y3), 8),
                );

                // Swap the two 16-bit halves.
                n = _mm_or_si128(_mm_srli_epi32(n, 16), _mm_slli_epi32(n, 16));

                Int4 { v: n }
            }
        }
    }

    impl core::ops::Mul for Int4 {
        type Output = Self;
        /// Lane-wise wrapping multiplication.
        #[inline(always)]
        fn mul(self, other: Self) -> Int4 {
            // This only works with SSE 4.1 support.
            // SAFETY: guarded by `target_feature = "sse4.1"`.
            #[cfg(target_feature = "sse4.1")]
            unsafe {
                use core::arch::x86_64::_mm_mullo_epi32;
                Int4 {
                    v: _mm_mullo_epi32(self.v, other.v),
                }
            }

            // This works on all x86-64 chips.
            // SAFETY: SSE2 is baseline on x86-64 (see module note).
            #[cfg(not(target_feature = "sse4.1"))]
            unsafe {
                use core::arch::x86_64::{_mm_mul_epu32, _mm_shuffle_epi32};
                // `_mm_mul_epu32` multiplies lanes 0 and 2 into 64-bit
                // results; keep only the low 32 bits of each.
                let a = _mm_and_si128(
                    _mm_mul_epu32(self.v, other.v),
                    _mm_set_epi32(0, 0xffffffffu32 as i32, 0, 0xffffffffu32 as i32),
                );
                // Move lanes 1 and 3 into positions 0 and 2 and do the same.
                let b = _mm_and_si128(
                    _mm_mul_epu32(
                        _mm_shuffle_epi32(self.v, 0b11_11_01_01),
                        _mm_shuffle_epi32(other.v, 0b11_11_01_01),
                    ),
                    _mm_set_epi32(0, 0xffffffffu32 as i32, 0, 0xffffffffu32 as i32),
                );
                // Shuffle the odd-lane products back into lanes 1 and 3 and
                // merge with the even-lane products.
                Int4 {
                    v: _mm_or_si128(a, _mm_shuffle_epi32(b, 0b10_11_00_01)),
                }
            }
        }
    }

    impl core::ops::MulAssign for Int4 {
        #[inline(always)]
        fn mul_assign(&mut self, other: Self) {
            *self = *self * other;
        }
    }

    impl core::ops::Add for Int4 {
        type Output = Self;
        /// Lane-wise wrapping addition.
        #[inline(always)]
        fn add(self, other: Self) -> Self {
            Int4 {
                // SAFETY: SSE2 is baseline on x86-64 (see module note).
                v: unsafe { _mm_add_epi32(self.v, other.v) },
            }
        }
    }

    impl core::ops::AddAssign for Int4 {
        #[inline(always)]
        fn add_assign(&mut self, other: Self) {
            *self = *self + other;
        }
    }

    impl core::ops::Sub for Int4 {
        type Output = Self;
        /// Lane-wise wrapping subtraction.
        #[inline(always)]
        fn sub(self, other: Self) -> Self {
            Int4 {
                // SAFETY: SSE2 is baseline on x86-64 (see module note).
                v: unsafe { _mm_sub_epi32(self.v, other.v) },
            }
        }
    }

    impl core::ops::SubAssign for Int4 {
        #[inline(always)]
        fn sub_assign(&mut self, other: Self) {
            *self = *self - other;
        }
    }

    impl core::ops::BitAnd for Int4 {
        type Output = Self;
        #[inline(always)]
        fn bitand(self, other: Self) -> Int4 {
            Int4 {
                // SAFETY: SSE2 is baseline on x86-64 (see module note).
                v: unsafe { _mm_and_si128(self.v, other.v) },
            }
        }
    }

    impl core::ops::BitAndAssign for Int4 {
        #[inline(always)]
        fn bitand_assign(&mut self, other: Self) {
            *self = *self & other;
        }
    }

    impl core::ops::BitOr for Int4 {
        type Output = Self;
        #[inline(always)]
        fn bitor(self, other: Self) -> Int4 {
            Int4 {
                // SAFETY: SSE2 is baseline on x86-64 (see module note).
                v: unsafe { _mm_or_si128(self.v, other.v) },
            }
        }
    }

    impl core::ops::BitOrAssign for Int4 {
        #[inline(always)]
        fn bitor_assign(&mut self, other: Self) {
            *self = *self | other;
        }
    }

    impl core::ops::BitXor for Int4 {
        type Output = Self;
        #[inline(always)]
        fn bitxor(self, other: Self) -> Int4 {
            Int4 {
                // SAFETY: SSE2 is baseline on x86-64 (see module note).
                v: unsafe { _mm_xor_si128(self.v, other.v) },
            }
        }
    }

    impl core::ops::BitXorAssign for Int4 {
        #[inline(always)]
        fn bitxor_assign(&mut self, other: Self) {
            *self = *self ^ other;
        }
    }

    // The shift traits must name `i32` as the right-hand-side type; without
    // it `Rhs` defaults to `Self` and the method signatures below would not
    // match the trait.
    impl core::ops::Shl<i32> for Int4 {
        type Output = Self;
        /// Shifts every lane left by `other` bits, shifting in zeros.
        ///
        /// Counts outside `0..=31` clear every lane.
        #[inline(always)]
        fn shl(self, other: i32) -> Int4 {
            Int4 {
                // SAFETY: SSE2 is baseline on x86-64 (see module note).
                v: unsafe { _mm_sll_epi32(self.v, _mm_set_epi32(0, 0, 0, other)) },
            }
        }
    }

    impl core::ops::Shr<i32> for Int4 {
        type Output = Self;
        /// Shifts every lane right (logically) by `other` bits.
        ///
        /// Counts outside `0..=31` clear every lane.
        #[inline(always)]
        fn shr(self, other: i32) -> Int4 {
            Int4 {
                // SAFETY: SSE2 is baseline on x86-64 (see module note).
                v: unsafe { _mm_srl_epi32(self.v, _mm_set_epi32(0, 0, 0, other)) },
            }
        }
    }

    impl From<[u32; 4]> for Int4 {
        #[inline(always)]
        fn from(v: [u32; 4]) -> Self {
            Int4 {
                // SAFETY: `[u32; 4]` and `__m128i` are both 16 bytes and
                // every bit pattern is valid for both.
                v: unsafe { core::mem::transmute(v) },
            }
        }
    }

    impl From<Int4> for [u32; 4] {
        #[inline(always)]
        fn from(i: Int4) -> [u32; 4] {
            // SAFETY: see `From<[u32; 4]> for Int4`.
            unsafe { core::mem::transmute(i.v) }
        }
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn from_array() {
            let a = Int4::from([1, 2, 3, 4]);
            assert_eq!(a.get(0), 1);
            assert_eq!(a.get(1), 2);
            assert_eq!(a.get(2), 3);
            assert_eq!(a.get(3), 4);
        }

        #[test]
        fn shr() {
            let a = Int4::from([0xffffffff; 4]) >> 16;
            assert_eq!(a.get(0), 0x0000ffff);
            assert_eq!(a.get(1), 0x0000ffff);
            assert_eq!(a.get(2), 0x0000ffff);
            assert_eq!(a.get(3), 0x0000ffff);
        }

        #[test]
        fn shl() {
            let a = Int4::from([0xffffffff; 4]) << 16;
            assert_eq!(a.get(0), 0xffff0000);
            assert_eq!(a.get(1), 0xffff0000);
            assert_eq!(a.get(2), 0xffff0000);
            assert_eq!(a.get(3), 0xffff0000);
        }

        #[test]
        fn to_f32_norm() {
            let a = Int4::from([0x00000000; 4]);
            let b = Int4::from([0x80000000; 4]);
            let c = Int4::from([0xffffffff; 4]);

            let a2 = a.to_f32_norm();
            let b2 = b.to_f32_norm();
            let c2 = c.to_f32_norm();

            assert_eq!(a2, [0.0, 0.0, 0.0, 0.0]);
            assert_eq!(b2, [0.5, 0.5, 0.5, 0.5]);
            assert!(c2[0] > 0.99999 && c2[0] < 1.0);
            assert!(c2[1] > 0.99999 && c2[1] < 1.0);
            assert!(c2[2] > 0.99999 && c2[2] < 1.0);
            assert!(c2[3] > 0.99999 && c2[3] < 1.0);
        }

        #[test]
        fn reverse_bits() {
            let a = 0xcde7a64e_u32;
            let b = 0xdc69fbd9_u32;
            let c = 0x3238fec6_u32;
            let d = 0x1fb9ba8f_u32;

            assert_eq!(Int4::from([a; 4]).reverse_bits().get(0), a.reverse_bits());
            assert_eq!(Int4::from([b; 4]).reverse_bits().get(0), b.reverse_bits());
            assert_eq!(Int4::from([c; 4]).reverse_bits().get(0), c.reverse_bits());
            assert_eq!(Int4::from([d; 4]).reverse_bits().get(0), d.reverse_bits());
        }
    }
}
#[cfg(all(target_arch = "x86_64", feature = "simd"))]
pub use sse::Int4;

//--------------------------------------------------------------------------
// Fallback
#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
pub(crate) mod fallback {
    /// A packed set of four `u32`s.
    ///
    /// Addition, subtraction, and multiplication are all wrapping.
    ///
    /// Uses SIMD for computation on supported platforms.  (This is the
    /// scalar implementation used when SIMD is unavailable or disabled.)
    #[derive(Debug, Copy, Clone)]
    #[repr(align(16))]
    pub struct Int4 {
        v: [u32; 4],
    }

    /// Left-shifts one lane, yielding 0 for counts outside `0..=31`.
    ///
    /// A plain `<<` would panic in debug builds and mask the count in
    /// release builds for such counts; the SSE back-end yields 0 instead,
    /// so this keeps the two back-ends in agreement.
    #[inline(always)]
    fn shl_lane(x: u32, count: i32) -> u32 {
        // The `as u32` cast deliberately maps negative counts to huge
        // values, which `checked_shl` rejects.
        x.checked_shl(count as u32).unwrap_or(0)
    }

    /// Logically right-shifts one lane, yielding 0 for counts outside
    /// `0..=31` (see [`shl_lane`] for the rationale).
    #[inline(always)]
    fn shr_lane(x: u32, count: i32) -> u32 {
        x.checked_shr(count as u32).unwrap_or(0)
    }

    impl Int4 {
        /// Returns an `Int4` with all four lanes set to zero.
        #[inline(always)]
        pub(crate) fn zero() -> Int4 {
            Int4 { v: [0, 0, 0, 0] }
        }

        /// Convert each integer to a float in [0.0, 1.0).
        ///
        /// Same behavior as
        /// [`parts::u32_to_f32_norm()`](`crate::parts::u32_to_f32_norm()`),
        /// applied to each integer individually.
        #[inline(always)]
        pub fn to_f32_norm(self) -> [f32; 4] {
            // Use the top 23 bits as the mantissa of a float in [1.0, 2.0)
            // (0x3f800000 is the bit pattern of 1.0f32), then subtract 1.0.
            [
                f32::from_bits((self.v[0] >> 9) | 0x3f800000) - 1.0,
                f32::from_bits((self.v[1] >> 9) | 0x3f800000) - 1.0,
                f32::from_bits((self.v[2] >> 9) | 0x3f800000) - 1.0,
                f32::from_bits((self.v[3] >> 9) | 0x3f800000) - 1.0,
            ]
        }

        /// Reverse the order of the bits in each integer.
        ///
        /// Same behavior as `reverse_bits()` in the Rust standard
        /// library, applied to each integer individually.
        #[inline(always)]
        pub fn reverse_bits(self) -> Int4 {
            Int4 {
                v: [
                    self.v[0].reverse_bits(),
                    self.v[1].reverse_bits(),
                    self.v[2].reverse_bits(),
                    self.v[3].reverse_bits(),
                ],
            }
        }
    }

    impl core::ops::Mul for Int4 {
        type Output = Self;
        /// Lane-wise wrapping multiplication.
        #[inline(always)]
        fn mul(self, other: Self) -> Int4 {
            Int4 {
                v: [
                    self.v[0].wrapping_mul(other.v[0]),
                    self.v[1].wrapping_mul(other.v[1]),
                    self.v[2].wrapping_mul(other.v[2]),
                    self.v[3].wrapping_mul(other.v[3]),
                ],
            }
        }
    }

    impl core::ops::MulAssign for Int4 {
        #[inline(always)]
        fn mul_assign(&mut self, other: Self) {
            *self = *self * other;
        }
    }

    impl core::ops::Add for Int4 {
        type Output = Self;
        /// Lane-wise wrapping addition.
        #[inline(always)]
        fn add(self, other: Self) -> Self {
            Int4 {
                v: [
                    self.v[0].wrapping_add(other.v[0]),
                    self.v[1].wrapping_add(other.v[1]),
                    self.v[2].wrapping_add(other.v[2]),
                    self.v[3].wrapping_add(other.v[3]),
                ],
            }
        }
    }

    impl core::ops::AddAssign for Int4 {
        #[inline(always)]
        fn add_assign(&mut self, other: Self) {
            *self = *self + other;
        }
    }

    impl core::ops::Sub for Int4 {
        type Output = Self;
        /// Lane-wise wrapping subtraction.
        #[inline(always)]
        fn sub(self, other: Self) -> Self {
            Int4 {
                v: [
                    self.v[0].wrapping_sub(other.v[0]),
                    self.v[1].wrapping_sub(other.v[1]),
                    self.v[2].wrapping_sub(other.v[2]),
                    self.v[3].wrapping_sub(other.v[3]),
                ],
            }
        }
    }

    impl core::ops::SubAssign for Int4 {
        #[inline(always)]
        fn sub_assign(&mut self, other: Self) {
            *self = *self - other;
        }
    }

    impl core::ops::BitAnd for Int4 {
        type Output = Self;
        #[inline(always)]
        fn bitand(self, other: Self) -> Int4 {
            Int4 {
                v: [
                    self.v[0] & other.v[0],
                    self.v[1] & other.v[1],
                    self.v[2] & other.v[2],
                    self.v[3] & other.v[3],
                ],
            }
        }
    }

    impl core::ops::BitAndAssign for Int4 {
        #[inline(always)]
        fn bitand_assign(&mut self, other: Self) {
            *self = *self & other;
        }
    }

    impl core::ops::BitOr for Int4 {
        type Output = Self;
        #[inline(always)]
        fn bitor(self, other: Self) -> Int4 {
            Int4 {
                v: [
                    self.v[0] | other.v[0],
                    self.v[1] | other.v[1],
                    self.v[2] | other.v[2],
                    self.v[3] | other.v[3],
                ],
            }
        }
    }

    impl core::ops::BitOrAssign for Int4 {
        #[inline(always)]
        fn bitor_assign(&mut self, other: Self) {
            *self = *self | other;
        }
    }

    impl core::ops::BitXor for Int4 {
        type Output = Self;
        #[inline(always)]
        fn bitxor(self, other: Self) -> Int4 {
            Int4 {
                v: [
                    self.v[0] ^ other.v[0],
                    self.v[1] ^ other.v[1],
                    self.v[2] ^ other.v[2],
                    self.v[3] ^ other.v[3],
                ],
            }
        }
    }

    impl core::ops::BitXorAssign for Int4 {
        #[inline(always)]
        fn bitxor_assign(&mut self, other: Self) {
            *self = *self ^ other;
        }
    }

    // The shift traits must name `i32` as the right-hand-side type; without
    // it `Rhs` defaults to `Self` and the method signatures below would not
    // match the trait.
    impl core::ops::Shl<i32> for Int4 {
        type Output = Self;
        /// Shifts every lane left by `other` bits, shifting in zeros.
        ///
        /// Counts outside `0..=31` clear every lane.
        #[inline(always)]
        fn shl(self, other: i32) -> Int4 {
            Int4 {
                v: [
                    shl_lane(self.v[0], other),
                    shl_lane(self.v[1], other),
                    shl_lane(self.v[2], other),
                    shl_lane(self.v[3], other),
                ],
            }
        }
    }

    impl core::ops::Shr<i32> for Int4 {
        type Output = Self;
        /// Shifts every lane right (logically) by `other` bits.
        ///
        /// Counts outside `0..=31` clear every lane.
        #[inline(always)]
        fn shr(self, other: i32) -> Int4 {
            Int4 {
                v: [
                    shr_lane(self.v[0], other),
                    shr_lane(self.v[1], other),
                    shr_lane(self.v[2], other),
                    shr_lane(self.v[3], other),
                ],
            }
        }
    }

    impl From<[u32; 4]> for Int4 {
        #[inline(always)]
        fn from(v: [u32; 4]) -> Self {
            Int4 { v }
        }
    }

    impl From<Int4> for [u32; 4] {
        #[inline(always)]
        fn from(i: Int4) -> [u32; 4] {
            i.v
        }
    }
}
#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
pub use fallback::Int4;