├── .gitignore ├── .travis.yml ├── CHANGELOG.org ├── Cargo.toml ├── LICENSE ├── README.org ├── benches ├── destride.rs ├── intrin.rs └── usage.rs ├── examples └── main.rs ├── scripts └── vec_patterns_gen.py ├── src ├── arch │ ├── mod.rs │ ├── unknown │ │ ├── intrin │ │ │ ├── abs.rs │ │ │ ├── cmp.rs │ │ │ ├── destride.rs │ │ │ ├── downcast.rs │ │ │ ├── endian.rs │ │ │ ├── eq.rs │ │ │ ├── hadd.rs │ │ │ ├── hsub.rs │ │ │ ├── merge.rs │ │ │ ├── mod.rs │ │ │ ├── recip.rs │ │ │ ├── round.rs │ │ │ ├── rsqrt.rs │ │ │ ├── saturating_add.rs │ │ │ ├── saturating_hadd.rs │ │ │ ├── saturating_hsub.rs │ │ │ ├── saturating_sub.rs │ │ │ ├── sqrt.rs │ │ │ ├── sum.rs │ │ │ ├── transmute.rs │ │ │ └── upcast.rs │ │ ├── mod.rs │ │ ├── vec_patterns.rs │ │ └── vecs.rs │ └── x86 │ │ ├── intrin │ │ ├── abs.rs │ │ ├── addsub.rs │ │ ├── cast.rs │ │ ├── cmp.rs │ │ ├── destride.rs │ │ ├── downcast.rs │ │ ├── endian.rs │ │ ├── eq.rs │ │ ├── hadd.rs │ │ ├── hsub.rs │ │ ├── merge.rs │ │ ├── mod.rs │ │ ├── popcnt.rs │ │ ├── recip.rs │ │ ├── round.rs │ │ ├── rsqrt.rs │ │ ├── saturating_add.rs │ │ ├── saturating_hadd.rs │ │ ├── saturating_hsub.rs │ │ ├── saturating_sub.rs │ │ ├── sqrt.rs │ │ ├── sum.rs │ │ ├── swizzle.rs │ │ ├── transmute.rs │ │ └── upcast.rs │ │ ├── mod.rs │ │ ├── vec_patterns.rs │ │ └── vecs.rs ├── debug.rs ├── into_iters.rs ├── intrin │ ├── abs.rs │ ├── addsub.rs │ ├── cast.rs │ ├── cmp.rs │ ├── destride.rs │ ├── downcast.rs │ ├── endian.rs │ ├── eq.rs │ ├── hadd.rs │ ├── hsub.rs │ ├── macros.rs │ ├── merge.rs │ ├── mod.rs │ ├── popcnt.rs │ ├── recip.rs │ ├── round.rs │ ├── rsqrt.rs │ ├── saturating_add.rs │ ├── saturating_hadd.rs │ ├── saturating_hsub.rs │ ├── saturating_sub.rs │ ├── sqrt.rs │ ├── sum.rs │ ├── swizzle.rs │ ├── transmute.rs │ └── upcast.rs ├── iters.rs ├── lib.rs ├── prelude.rs ├── stride.rs ├── stride_zip.rs ├── vec_patterns.rs ├── vecs.rs └── zip.rs └── tests ├── iters.rs ├── kernel.rs └── zip.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | **/*.rs.bk 3 | #* 4 | *# 5 | *~ 6 | ~* 7 | *.bak 8 | *.sav 9 | kek.* 10 | kek-* 11 | Cargo.lock 12 | .idea 13 | *.rlib 14 | *.d 15 | *.s 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | sudo: false 3 | rust: 4 | - nightly 5 | env: 6 | - RUSTFLAGS="-C target-feature=-sse,+x87" RUN="build --all --examples --tests --benches --bins" 7 | - RUSTFLAGS="-C target-feature=+sse" RUN="build --all --examples --tests --benches --bins" 8 | - RUSTFLAGS="-C target-feature=+sse2" RUN="build --all --examples --tests --benches --bins" 9 | - RUSTFLAGS="-C target-feature=+sse3" RUN="build --all --examples --tests --benches --bins" 10 | - RUSTFLAGS="-C target-feature=+ssse3" RUN="build --all --examples --tests --benches --bins" 11 | - RUSTFLAGS="-C target-feature=+sse4a" RUN="build --all --examples --tests --benches --bins" 12 | - RUSTFLAGS="-C target-feature=+sse4.1" RUN="build --all --examples --tests --benches --bins" 13 | - RUSTFLAGS="-C target-feature=+sse4.2" RUN="build --all --examples --tests --benches --bins" 14 | - RUSTFLAGS="-C target-feature=+avx" RUN="build --all --examples --tests --benches --bins" 15 | - RUSTFLAGS="-C target-feature=+avx2" RUN="build --all --examples --tests --benches --bins" 16 | - RUSTFLAGS="-C target-feature=+avx512f" RUN="build --all --examples --tests --benches --bins" 17 | - RUSTFLAGS="-C 
target-cpu=x86-64" RUN="test --lib --tests --benches --examples --doc" 18 | - RUSTFLAGS="-C target-cpu=pentium" RUN="test --lib --tests --benches --examples" 19 | - RUSTFLAGS="-C target-cpu=pentium3" RUN="test --lib --tests --benches --examples" 20 | - RUSTFLAGS="-C target-cpu=pentium4" RUN="test --lib --tests --benches --examples" 21 | - RUSTFLAGS="-C target-cpu=core2" RUN="test --lib --tests --benches --examples" 22 | - RUSTFLAGS="-C target-cpu=nehalem" RUN="test --lib --tests --benches --examples" 23 | - RUSTFLAGS="-C target-cpu=sandybridge" RUN="test --lib --tests --benches --examples" 24 | - RUSTFLAGS="-C target-cpu=native" RUN="test --lib --tests --benches --examples" 25 | matrix: 26 | fast_finish: true 27 | install: 28 | script: 29 | - cat /proc/cpuinfo 30 | - bash -c "cargo $RUN --verbose" 31 | - bash -c "cargo $RUN --verbose --features \"std\"" 32 | notifications: 33 | email: false 34 | -------------------------------------------------------------------------------- /CHANGELOG.org: -------------------------------------------------------------------------------- 1 | * 0.4.3 2 | ** Features 3 | - Significantly speed up automatic iterators (huge thanks to Osveron!) 4 | * 0.4.2 5 | ** Features 6 | - Add ~simd_for_each~ 7 | - Add equality comparison via ~PackedEq~ 8 | * 0.4.1 9 | ** Bugfixes & Minor Improvements 10 | - Fix an issue with zipping even collections 11 | - Fix an upcast on AVX2 machines 12 | * 0.4.0 13 | Announcing faster 0.4.0 - a 4,500+ line diff from 0.3.0 14 | ** Big Changes 15 | - Support ~#![no_std]~ 16 | - Add striping, gathers, and scatters. 17 | - Add vector merging 18 | - Add vectorized endianness operations 19 | - Add limited vector swizzling 20 | - Add lockstep packed iterators 21 | ** Features 22 | - Add tons of docstrings 23 | - Allow ~FnMut~ closures in ~simd_map~ and ~simd_reduce~ 24 | - Vectorize operations on last elements of an uneven collection 25 | - Implement compound assignment operators for architectures without hardware SIMD 26 | - Add large vectors for architectures without hardware SIMD 27 | ** Bugfixes & Minor Improvements 28 | - Add a changelog 29 | - Fix a correctness issue when mapping over uneven collections 30 | - Vectorize min/max for SSE4.1 31 | - Vectorize ~Upcast~ for SSE4.1 32 | - Implement ~Downcast~ polyfills on many more vector types 33 | - Implement and test ~[saturating_]{hadd,hsub}~ on more vector types. 
34 | - Undeprecate ~halfs~ and ~interleave~ 35 | * 0.3.0 36 | Announcing Faster 0.3.0, a 3,500+ line diff from 0.1.1 37 | ** Big Changes 38 | - Support for targets without hardware SIMD 39 | - Support for architectures other than x86 40 | - Documentation & examples for most objects 41 | - Intuitive support for uneven collections 42 | - Support SIMD-accelerated reductive operations 43 | - Add upcasting, casting, and downcasting 44 | ** Features 45 | - Add summation and product calculations for all vectors 46 | - Add default initializer for vectors 47 | - Add many more polyfills and feature gates 48 | - Allow scalar iteration of SIMD iterators with `map` and `fold` 49 | - Add vector constructors for interleaved and half-and-half patterns 50 | - Expose saturating addition and subtraction intrinsics 51 | ** Bugfixes & Minor Improvements 52 | - Add tests for sqrt, transmute, abs, recip, upcast, downcast, cast, and more 53 | - Make ~[saturating_]{hadd,hsub}~ portable 54 | * 0.2.0 55 | Announcing Faster 0.2.0, a 2,500+ line diff from 0.1.1 56 | ** Big Changes 57 | - Change license to MPL 2.0 58 | - Support for uneven collections 59 | - Add packed transmutations 60 | ** Features 61 | - Implement many more operations and polyfills 62 | ** Maintenance & Bugfixes 63 | - Require only SSE for 64 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "faster" 3 | description = "Explicit SIMD for humans" 4 | authors = ["Adam Niederer "] 5 | license = "MPL-2.0" 6 | version = "0.5.2" 7 | edition = "2018" 8 | 9 | keywords = ["simd"] 10 | categories = ["no-std", "hardware-support", "api-bindings"] 11 | documentation = "https://docs.adamniederer.com/faster/index.html" 12 | repository = "https://github.com/AdamNiederer/faster" 13 | readme = "README.org" 14 | 15 | [dependencies] 16 | vektor = "0.2.1" 17 | packed_simd = {version = "0.3.4", package = "packed_simd_2"} 18 | 19 | [features] 20 | default = ["std"] 21 | std = [] 22 | trace = [] # When enabled, the `FASTER_DEBUG_FILE` environment variable configures the log file. 
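# A plausible invocation of the trace feature (only the feature name and the
# FASTER_DEBUG_FILE variable come from this file; the log path and the exact
# logging behavior shown here are illustrative assumptions):
#   FASTER_DEBUG_FILE=/tmp/faster.log cargo test --features "std trace"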
23 | 24 | [dev-dependencies] 25 | 26 | [profile.release] 27 | opt-level = 3 28 | -------------------------------------------------------------------------------- /benches/destride.rs: -------------------------------------------------------------------------------- 1 | #![feature(stdsimd, test)] 2 | 3 | #[cfg(test)] extern crate test; 4 | #[macro_use] extern crate faster; 5 | 6 | #[cfg(test)] 7 | mod destride { 8 | use faster::prelude::*; 9 | use test::{Bencher, black_box}; 10 | 11 | #[bench] 12 | #[cfg(feature = "std")] 13 | fn destride_two(b: &mut Bencher) { 14 | let a = [0u8; 4096]; 15 | b.iter(|| { 16 | for v in a.simd_iter(u8s(0)).unroll(2) { 17 | let _ = black_box(v[0].destride_two(v[1])); 18 | } 19 | }) 20 | } 21 | 22 | #[bench] 23 | #[cfg(feature = "std")] 24 | fn destride_four(b: &mut Bencher) { 25 | let a = [0u8; 4096]; 26 | b.iter(|| { 27 | for v in a.simd_iter(u8s(0)).unroll(4) { 28 | let _ = black_box(v[0].destride_four(v[1], v[2], v[3])); 29 | } 30 | }) 31 | } 32 | 33 | #[bench] 34 | #[cfg(feature = "std")] 35 | fn destride_two_16(b: &mut Bencher) { 36 | let a = [0u16; 4096]; 37 | b.iter(|| { 38 | for v in a.simd_iter(u16s(0)).unroll(2) { 39 | let _ = black_box(v[0].destride_two(v[1])); 40 | } 41 | }) 42 | } 43 | 44 | #[bench] 45 | #[cfg(feature = "std")] 46 | fn destride_four_16(b: &mut Bencher) { 47 | let a = [0u16; 4096]; 48 | b.iter(|| { 49 | for v in a.simd_iter(u16s(0)).unroll(4) { 50 | let _ = v[0].destride_four(v[1], v[2], v[3]); 51 | } 52 | }) 53 | } 54 | 55 | #[bench] 56 | #[cfg(feature = "std")] 57 | fn destride_two_32(b: &mut Bencher) { 58 | let a = [0u32; 4096]; 59 | b.iter(|| { 60 | for v in a.simd_iter(u32s(0)).unroll(2) { 61 | let _ = black_box(v[0].destride_two(v[1])); 62 | } 63 | }) 64 | } 65 | 66 | #[bench] 67 | #[cfg(feature = "std")] 68 | fn destride_four_32(b: &mut Bencher) { 69 | let a = [0u32; 4096]; 70 | b.iter(|| { 71 | for v in a.simd_iter(u32s(0)).unroll(4) { 72 | let _ = v[0].destride_four(v[1], v[2], v[3]); 73 | } 74 | }) 75 | } 76 | 77 | #[bench] 78 | #[cfg(feature = "std")] 79 | fn destride_four_naiive(b: &mut Bencher) { 80 | let a = [0u8; 4096]; 81 | b.iter(|| { 82 | (&a[..]).stride_four(tuplify!(4, u8s(0))).zip() 83 | .simd_do_each(|x| { black_box(x); }); 84 | }) 85 | } 86 | 87 | #[bench] 88 | #[cfg(feature = "std")] 89 | fn destride_two_naiive(b: &mut Bencher) { 90 | let a = [0u8; 4096]; 91 | b.iter(|| { 92 | (&a[..]).stride_two(tuplify!(2, u8s(0))).zip() 93 | .simd_do_each(|x| { black_box(x); }); 94 | }) 95 | } 96 | 97 | #[bench] 98 | #[cfg(feature = "std")] 99 | fn destride_four_naiive_16(b: &mut Bencher) { 100 | let a = [0u16; 4096]; 101 | b.iter(|| { 102 | (&a[..]).stride_four(tuplify!(4, u16s(0))).zip() 103 | .simd_do_each(|x| { black_box(x); }); 104 | }) 105 | } 106 | 107 | #[bench] 108 | #[cfg(feature = "std")] 109 | fn destride_two_naiive_16(b: &mut Bencher) { 110 | let a = [0u16; 4096]; 111 | b.iter(|| { 112 | (&a[..]).stride_two(tuplify!(2, u16s(0))).zip() 113 | .simd_do_each(|x| { black_box(x); }); 114 | }) 115 | } 116 | 117 | #[bench] 118 | #[cfg(feature = "std")] 119 | fn destride_four_naiive_32(b: &mut Bencher) { 120 | let a = [0u32; 4096]; 121 | b.iter(|| { 122 | (&a[..]).stride_four(tuplify!(4, u32s(0))).zip() 123 | .simd_do_each(|x| { black_box(x); }); 124 | }) 125 | } 126 | 127 | #[bench] 128 | #[cfg(feature = "std")] 129 | fn destride_two_naiive_32(b: &mut Bencher) { 130 | let a = [0u32; 4096]; 131 | b.iter(|| { 132 | (&a[..]).stride_two(tuplify!(2, u32s(0))).zip() 133 | .simd_do_each(|x| { black_box(x); }); 134 | }) 135 | 
} 136 | } 137 | -------------------------------------------------------------------------------- /benches/intrin.rs: -------------------------------------------------------------------------------- 1 | #![feature(test, stdsimd)] 2 | 3 | #[cfg(test)] extern crate test; 4 | extern crate faster; 5 | 6 | const ARRAY_F32: &[f32] = &[-123.456f32; 1024]; 7 | 8 | macro_rules! bench_intrin_1 { 9 | ($simd_name:ident, $simd_fn:expr, $scalar_name:ident, $scalar_fn:expr) => { 10 | #[bench] 11 | #[cfg(feature = "std")] 12 | fn $scalar_name(b: &mut Bencher) { 13 | b.iter(|| { black_box( 14 | crate::ARRAY_F32.iter().map(|v| { $scalar_fn(*v) }).collect::<Vec<f32>>() 15 | )}) 16 | } 17 | 18 | #[bench] 19 | #[cfg(feature = "std")] 20 | fn $simd_name(b: &mut Bencher) { 21 | b.iter(|| { black_box( 22 | crate::ARRAY_F32.simd_iter(f32s(0.0)).simd_map(|v| { $simd_fn(v) }).scalar_collect() 23 | )}); 24 | } 25 | } 26 | } 27 | 28 | macro_rules! bench_intrin_2 { 29 | ($simd_name:ident, $simd_fn:ident, $scalar_name:ident, $scalar_fn:ident) => { 30 | #[bench] 31 | #[cfg(feature = "std")] 32 | fn $scalar_name(b: &mut Bencher) { 33 | b.iter(|| { black_box( 34 | crate::ARRAY_F32.iter().map(|v| { v.$scalar_fn(*v) }).collect::<Vec<f32>>() 35 | )}) 36 | } 37 | 38 | #[bench] 39 | #[cfg(feature = "std")] 40 | fn $simd_name(b: &mut Bencher) { 41 | b.iter(|| { black_box( 42 | crate::ARRAY_F32.simd_iter(f32s(0.0)).simd_map(|v| {v.$simd_fn(v) }).scalar_collect() 43 | )}); 44 | } 45 | } 46 | } 47 | 48 | 49 | #[cfg(test)] 50 | mod intrin { 51 | use faster::prelude::*; 52 | use test::{Bencher, black_box}; 53 | 54 | bench_intrin_1!(abs_simd, |x: f32s| x.abs(), abs_scala, |x: f32| x.abs()); 55 | bench_intrin_1!(ceil_simd, |x: f32s| x.ceil(), ceil_scala, |x: f32| x.ceil()); 56 | bench_intrin_1!(floor_simd, |x: f32s| x.floor(), floor_scala, |x: f32| x.floor()); 57 | bench_intrin_2!(min_simd, min, min_scala, min); 58 | bench_intrin_2!(max_simd, max, max_scala, max); 59 | bench_intrin_1!(recip_simd, |x: f32s| x.recip(), recip_scala, |x: f32| 1.0f32 / x); 60 | bench_intrin_1!(round_simd, |x: f32s| x.round(), round_scala, |x: f32| x.round()); 61 | bench_intrin_1!(sqrt_simd, |x: f32s| x.sqrt(), sqrt_scala, |x: f32| x.sqrt()); 62 | bench_intrin_1!(trunc_simd, |x: f32s| x.trunc(), trunc_scala, |x: f32| x.trunc()); 63 | } 64 | -------------------------------------------------------------------------------- /examples/main.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
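// Overview of what this example exercises: element-wise pipelines with
// `simd_map`, collecting results with `scalar_collect` and `scalar_fill`,
// reductions with `simd_reduce`, and de-interleaved iteration with
// `stride_two`/`zip`/`tuplify!`.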
7 | #![feature(stdsimd)] 8 | 9 | extern crate faster; 10 | use faster::*; 11 | 12 | #[cfg(feature = "std")] 13 | fn main() { 14 | let lots_of_84s = (&[-10i8; 33][..]).simd_iter(i8s(0)) 15 | .simd_map(|v| i8s(9) * v.abs().be_i8s() - i8s(4) - i8s(2)) 16 | .simd_map(|v| v) 17 | .scalar_collect(); 18 | 19 | let lots_of_3s = (&[-123.456f32; 128][..]).simd_iter(f32s(0.0)) 20 | .simd_map(|v| { f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - 21 | f32s(4.0) - f32s(2.0) }) 22 | .scalar_collect(); 23 | 24 | let lots_of_3s_sc = (&[-123.456f32; 128][..]).iter() 25 | .map(|v| { 9.0 * v.abs().sqrt().sqrt().recip().ceil().sqrt() - 26 | 4.0 - 2.0 }) 27 | .collect::<Vec<f32>>(); 28 | 29 | let mut some_u8s = [0u8; 100]; 30 | let filled_u8s = (&[5u8; 100][..]).simd_iter(u8s(0)) 31 | .simd_map(|vector| vector * u8s(2)) 32 | .scalar_fill(&mut some_u8s); 33 | 34 | let reduced = (&[-1.0f32; 128][..]).simd_iter(f32s(0.0)) 35 | .simd_reduce(f32s(0.0), |a, v| a + v.abs().sqrt().sqrt().floor()).sum(); 36 | 37 | let strided = (0..20u32).collect::<Vec<u32>>().as_slice() 38 | .stride_two(tuplify!(2, u32s(99))).zip().simd_map(|(a, b)| a + b) 39 | .scalar_collect(); 40 | 41 | println!("{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n", lots_of_84s, lots_of_3s, lots_of_3s_sc, filled_u8s, filled_u8s.len(), reduced, strided); 42 | } 43 | 44 | #[cfg(not(feature = "std"))] 45 | fn main() {} 46 | -------------------------------------------------------------------------------- /src/arch/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 2 | pub mod x86; 3 | 4 | #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] 5 | pub mod unknown; 6 | 7 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 8 | pub use self::x86 as current; 9 | 10 | #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] 11 | pub use self::unknown as current; 12 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/cmp.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::cmp::*; 11 | 12 | rust_fallback_impl_binary! { 13 | impl Cmp for u8x16 where "__undefined" { 14 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 15 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 16 | } 17 | } 18 | 19 | rust_fallback_impl_binary! { 20 | impl Cmp for i8x16 where "__undefined" { 21 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 22 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 23 | } 24 | } 25 | 26 | rust_fallback_impl_binary! { 27 | impl Cmp for u16x8 where "__undefined" { 28 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 29 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 30 | } 31 | } 32 | 33 | rust_fallback_impl_binary! { 34 | impl Cmp for i16x8 where "__undefined" { 35 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 36 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 37 | } 38 | } 39 | 40 | rust_fallback_impl_binary! 
{ 41 | impl Cmp for u32x4 where "__undefined" { 42 | min => __undefined(), [0, 1, 2, 3]; 43 | max => __undefined(), [0, 1, 2, 3]; 44 | } 45 | } 46 | 47 | rust_fallback_impl_binary! { 48 | impl Cmp for i32x4 where "__undefined" { 49 | min => __undefined(), [0, 1, 2, 3]; 50 | max => __undefined(), [0, 1, 2, 3]; 51 | } 52 | } 53 | 54 | rust_fallback_impl_binary! { 55 | impl Cmp for f32x4 where "__undefined" { 56 | min => __undefined(), [0, 1, 2, 3]; 57 | max => __undefined(), [0, 1, 2, 3]; 58 | } 59 | } 60 | 61 | rust_fallback_impl_binary! { 62 | impl Cmp for f64x2 where "__undefined" { 63 | min => __undefined(), [0, 1]; 64 | max => __undefined(), [0, 1]; 65 | } 66 | } 67 | 68 | rust_fallback_impl_binary! { 69 | impl Cmp for u8x32 where "__undefined" { 70 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 71 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 72 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 73 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 74 | } 75 | } 76 | 77 | rust_fallback_impl_binary! { 78 | impl Cmp for i8x32 where "__undefined" { 79 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 80 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 81 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 82 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 83 | } 84 | } 85 | 86 | rust_fallback_impl_binary! { 87 | impl Cmp for u16x16 where "__undefined" { 88 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 89 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 90 | } 91 | } 92 | 93 | rust_fallback_impl_binary! { 94 | impl Cmp for i16x16 where "__undefined" { 95 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 96 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 97 | } 98 | } 99 | 100 | rust_fallback_impl_binary! { 101 | impl Cmp for u32x8 where "__undefined" { 102 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 103 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 104 | } 105 | } 106 | 107 | rust_fallback_impl_binary! { 108 | impl Cmp for i32x8 where "__undefined" { 109 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 110 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 111 | } 112 | } 113 | 114 | rust_fallback_impl_binary! { 115 | impl Cmp for f32x8 where "__undefined" { 116 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 117 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 118 | } 119 | } 120 | 121 | rust_fallback_impl_binary! { 122 | impl Cmp for f64x4 where "__undefined" { 123 | min => __undefined(), [0, 1, 2, 3]; 124 | max => __undefined(), [0, 1, 2, 3]; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/destride.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
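// Destriding de-interleaves data laid out as [a0, b0, a1, b1, ...]. As an
// illustrative sketch of the contract the polyfills below implement (values
// are hypothetical, not from this file): if x and y together hold the stream
// [a0, b0, a1, b1, ...], then x.destride_two(y) returns ([a0, a1, ...],
// [b0, b1, ...]), and destride_four does the same for four interleaved streams.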
7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::intrin::destride::*; 10 | 11 | impl Destride for u8x16 { 12 | #[inline(always)] 13 | fn destride_two(self, other: Self) -> (Self, Self) { 14 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14) 15 | } 16 | 17 | #[inline(always)] 18 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 19 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12) 20 | } 21 | } 22 | 23 | impl Destride for u8x32 { 24 | #[inline(always)] 25 | fn destride_two(self, other: Self) -> (Self, Self) { 26 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) 27 | } 28 | 29 | #[inline(always)] 30 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 31 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12, 16, 20, 24, 28) 32 | } 33 | } 34 | 35 | impl Destride for i8x16 { 36 | #[inline(always)] 37 | fn destride_two(self, other: Self) -> (Self, Self) { 38 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14) 39 | } 40 | 41 | #[inline(always)] 42 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 43 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12) 44 | } 45 | } 46 | 47 | impl Destride for i8x32 { 48 | #[inline(always)] 49 | fn destride_two(self, other: Self) -> (Self, Self) { 50 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) 51 | } 52 | 53 | #[inline(always)] 54 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 55 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12, 16, 20, 24, 28) 56 | } 57 | } 58 | 59 | macro_rules! impl_destride { 60 | ($t:ty, $($two:expr, $four:expr),*) => { 61 | impl Destride for $t { 62 | #[inline(always)] 63 | fn destride_two(self, other: Self) -> (Self, Self) { 64 | destride_two_polyfill!(self, other, $($two, $four),*) 65 | } 66 | 67 | #[inline(always)] 68 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 69 | destride_four_polyfill!(self, b, c, d, $($two),*) 70 | } 71 | } 72 | } 73 | } 74 | 75 | impl_destride!(u16x16, 0, 2, 4, 6, 8, 10, 12, 14); 76 | impl_destride!(u16x8, 0, 2, 4, 6); 77 | impl_destride!(i16x16, 0, 2, 4, 6, 8, 10, 12, 14); 78 | impl_destride!(i16x8, 0, 2, 4, 6); 79 | 80 | impl_destride!(u32x8, 0, 2, 4, 6); 81 | impl_destride!(u32x4, 0, 2); 82 | impl_destride!(i32x8, 0, 2, 4, 6); 83 | impl_destride!(i32x4, 0, 2); 84 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/endian.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
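// swap_bytes reverses byte order within each lane. In the macro calls below,
// the first index tuple appears to be the per-byte shuffle that performs the
// swap (for example, (1, 0, 3, 2, ...) exchanges the two bytes of each 16-bit
// lane) and the second lists the lanes; this reading is inferred from the
// u16x8/u32x4/u64x2 patterns rather than stated in the source.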
7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::endian::*; 11 | 12 | impl_packed_swap_bytes!(u8x16, u8x16, "__undefined", __undefined, 13 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), 14 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); 15 | impl_packed_swap_bytes!(i8x16, u8x16, "__undefined", __undefined, 16 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), 17 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); 18 | impl_packed_swap_bytes!(u16x8, u8x16, "__undefined", __undefined, 19 | (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), 20 | (0, 1, 2, 3, 4, 5, 6, 7)); 21 | impl_packed_swap_bytes!(i16x8, u8x16, "__undefined", __undefined, 22 | (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), 23 | (0, 1, 2, 3, 4, 5, 6, 7)); 24 | impl_packed_swap_bytes!(u32x4, u8x16, "__undefined", __undefined, 25 | (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), 26 | (0, 1, 2, 3)); 27 | impl_packed_swap_bytes!(i32x4, u8x16, "__undefined", __undefined, 28 | (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), 29 | (0, 1, 2, 3)); 30 | impl_packed_swap_bytes!(u64x2, u8x16, "__undefined", __undefined, 31 | (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), 32 | (0, 1)); 33 | impl_packed_swap_bytes!(i64x2, u8x16, "__undefined", __undefined, 34 | (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), 35 | (0, 1)); 36 | 37 | mod tests { 38 | #![allow(unused_imports)] 39 | 40 | use crate::prelude::*; 41 | use crate::arch::current::vecs::*; 42 | 43 | test_packed_swap_bytes!((u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2), 44 | (swap_bytes_u8x16, swap_bytes_i8x16, swap_bytes_u16x8, swap_bytes_i16x8, swap_bytes_u32x4, swap_bytes_i32x4, swap_bytes_u64x2, swap_bytes_i64x2)); 45 | } 46 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/eq.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::eq::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | rust_fallback_eq! { 13 | impl Eq for u8x16 where "__undefined" { 14 | eq_mask, eq => u8x16, u8, __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 15 | } 16 | } 17 | 18 | rust_fallback_eq! { 19 | impl Eq for i8x16 where "__undefined" { 20 | eq_mask, eq => u8x16, u8, __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 21 | } 22 | } 23 | 24 | rust_fallback_eq! { 25 | impl Eq for u16x8 where "__undefined" { 26 | eq_mask, eq => u16x8, u16, __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 27 | } 28 | } 29 | 30 | rust_fallback_eq! { 31 | impl Eq for i16x8 where "__undefined" { 32 | eq_mask, eq => u16x8, u16, __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 33 | } 34 | } 35 | 36 | rust_fallback_eq! { 37 | impl Eq for u32x4 where "__undefined" { 38 | eq_mask, eq => u32x4, u32, __undefined(), [0, 1, 2, 3]; 39 | } 40 | } 41 | 42 | rust_fallback_eq! { 43 | impl Eq for i32x4 where "__undefined" { 44 | eq_mask, eq => u32x4, u32, __undefined(), [0, 1, 2, 3]; 45 | } 46 | } 47 | 48 | rust_fallback_eq! 
{ 49 | impl Eq for f32x4 where "__undefined" { 50 | eq_mask, eq => u32x4, u32, __undefined(), [0, 1, 2, 3]; 51 | } 52 | } 53 | 54 | rust_fallback_eq! { 55 | impl Eq for f64x2 where "__undefined" { 56 | eq_mask, eq => u64x2, u64, __undefined(), [0, 1]; 57 | } 58 | } 59 | 60 | rust_fallback_eq! { 61 | impl Eq for u64x2 where "__undefined" { 62 | eq_mask, eq => u64x2, u64, __undefined(), [0, 1]; 63 | } 64 | } 65 | 66 | rust_fallback_eq! { 67 | impl Eq for i64x2 where "__undefined" { 68 | eq_mask, eq => u64x2, u64, __undefined(), [0, 1]; 69 | } 70 | } 71 | 72 | mod tests { 73 | #![allow(unused_imports)] 74 | use crate::prelude::*; 75 | use crate::arch::current::vecs::*; 76 | 77 | // test_packed_eq!(u8x64, u8, u8x64, u8, test_eq_u8x64); 78 | // test_packed_eq!(u8x32, u8, u8x32, u8, test_eq_u8x32); 79 | test_packed_eq!(u8x16, u8, u8x16, u8, test_eq_u8x16); 80 | // test_packed_eq!(i8x64, i8, u8x64, u8, test_eq_i8x64); 81 | // test_packed_eq!(i8x32, i8, u8x32, u8, test_eq_i8x32); 82 | test_packed_eq!(i8x16, i8, u8x16, u8, test_eq_i8x16); 83 | // test_packed_eq!(u16x32, u16, u16x32, u16, test_eq_u16x32); 84 | // test_packed_eq!(u16x16, u16, u16x16, u16, test_eq_u16x16); 85 | test_packed_eq!(u16x8, u16, u16x8, u16, test_eq_u16x8); 86 | // test_packed_eq!(i16x32, i16, u16x32, u16, test_eq_i16x32); 87 | // test_packed_eq!(i16x16, i16, u16x16, u16, test_eq_i16x16); 88 | test_packed_eq!(i16x8, i16, u16x8, u16, test_eq_i16x8); 89 | // test_packed_eq!(u32x16, u32, u32x16, u32, test_eq_u32x16); 90 | // test_packed_eq!(u32x8, u32, u32x8, u32, test_eq_u32x8); 91 | test_packed_eq!(u32x4, u32, u32x4, u32, test_eq_u32x4); 92 | // test_packed_eq!(i32x16, i32, u32x16, u32, test_eq_i32x16); 93 | // test_packed_eq!(i32x8, i32, u32x8, u32, test_eq_i32x8); 94 | test_packed_eq!(i32x4, i32, u32x4, u32, test_eq_i32x4); 95 | // test_packed_eq!(f32x16, f32, u32x16, u32, test_eq_f32x16); 96 | // test_packed_eq!(f32x8, f32, u32x8, u32, test_eq_f32x8); 97 | test_packed_eq!(f32x4, f32, u32x4, u32, test_eq_f32x4); 98 | // test_packed_eq!(u64x8, u64, u64x8, u64, test_eq_u64x8); 99 | // test_packed_eq!(u64x4, u64, u64x4, u64, test_eq_u64x4); 100 | test_packed_eq!(u64x2, u64, u64x2, u64, test_eq_u64x2); 101 | // test_packed_eq!(i64x8, i64, u64x8, u64, test_eq_i64x8); 102 | // test_packed_eq!(i64x4, i64, u64x4, u64, test_eq_i64x4); 103 | test_packed_eq!(i64x2, i64, u64x2, u64, test_eq_i64x2); 104 | // test_packed_eq!(f64x8, f64, u64x8, u64, test_eq_f64x8); 105 | // test_packed_eq!(f64x4, f64, u64x4, u64, test_eq_f64x4); 106 | test_packed_eq!(f64x2, f64, u64x2, u64, test_eq_f64x2); 107 | } 108 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
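// Horizontal add sums adjacent lane pairs across both operands, following the
// x86 hadd convention (an assumption based on the hop! index lists below):
//   u32x4(a0, a1, a2, a3).hadd(u32x4(b0, b1, b2, b3))
//     == u32x4(a0 + a1, a2 + a3, b0 + b1, b2 + b3)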
7 | 8 | use crate::intrin::hadd::*; 9 | use crate::core::ops::Add; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | 13 | impl HAdd for u64x2 { hop!(hadd, Add::add, 0, 1); } 14 | impl HAdd for u32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 15 | impl HAdd for u16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 16 | impl HAdd for u8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 17 | impl HAdd for i64x2 { hop!(hadd, Add::add, 0, 1); } 18 | impl HAdd for i32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 19 | impl HAdd for i16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 20 | impl HAdd for i8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 21 | impl HAdd for f64x2 { hop!(hadd, Add::add, 0, 1); } 22 | impl HAdd for f32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 23 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::hsub::*; 11 | use crate::core::ops::Sub; 12 | 13 | impl HSub for u64x2 { hop!(hsub, Sub::sub, 0, 1); } 14 | impl HSub for u32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 15 | impl HSub for u16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 16 | impl HSub for u8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 17 | impl HSub for i64x2 { hop!(hsub, Sub::sub, 0, 1); } 18 | impl HSub for i32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 19 | impl HSub for i16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 20 | impl HSub for i8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 21 | impl HSub for f64x2 { hop!(hsub, Sub::sub, 0, 1); } 22 | impl HSub for f32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 23 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/merge.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::merge::*; 11 | 12 | // Will produce fallback implementations only, so we get away with __undefined. 
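// Sketch of the merge_halves contract (values are illustrative; the half
// assignment is inferred from how the x86 destride code uses it):
//   u32x4(1, 2, 3, 4).merge_halves(u32x4(5, 6, 7, 8)) == u32x4(1, 2, 7, 8)
// i.e. the low half comes from self and the high half from other.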
13 | impl_packed_merge!(u8x16, u8x16, u8, __undefined, "__undefined", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 14 | impl_packed_merge!(u16x8, u16x8, u16, __undefined, "__undefined", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 15 | impl_packed_merge!(u32x4, u32x4, u32, __undefined, "__undefined", (0, 1), (2, 3), 0, 1, 2, 3); 16 | impl_packed_merge!(u64x2, u64x2, u64, __undefined, "__undefined", (0), (1), 0, 1); 17 | impl_packed_merge!(i8x16, u8x16, u8, __undefined, "__undefined", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 18 | impl_packed_merge!(i16x8, u16x8, u16, __undefined, "__undefined", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 19 | impl_packed_merge!(i32x4, u32x4, u32, __undefined, "__undefined", (0, 1), (2, 3), 0, 1, 2, 3); 20 | impl_packed_merge!(i64x2, u64x2, u64, __undefined, "__undefined", (0), (1), 0, 1); 21 | impl_packed_merge!(f32x4, u32x4, u32, __undefined, "__undefined", (0, 1), (2, 3), 0, 1, 2, 3); 22 | impl_packed_merge!(f64x2, u64x2, u64, __undefined, "__undefined", (0), (1), 0, 1); 23 | 24 | mod tests { 25 | #![allow(unused_imports)] 26 | 27 | use crate::prelude::*; 28 | use crate::arch::current::vecs::*; 29 | 30 | // TODO: Which ones do we really need? 31 | test_packed_merge!( 32 | (u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, f32x4, u64x2, i64x2, f64x2), 33 | (merge_u8x16, merge_i8x16, merge_u16x8, merge_i16x8, merge_u32x4, merge_i32x4, merge_f32x4, merge_u64x2, merge_i64x2, merge_f64x2) 34 | ); 35 | } 36 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/mod.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | mod abs; 9 | mod cmp; 10 | mod destride; 11 | mod downcast; 12 | mod endian; 13 | mod eq; 14 | mod hadd; 15 | mod hsub; 16 | mod merge; 17 | mod recip; 18 | mod round; 19 | mod rsqrt; 20 | mod saturating_add; 21 | mod saturating_hadd; 22 | mod saturating_sub; 23 | mod saturating_hsub; 24 | mod sum; 25 | mod sqrt; 26 | mod transmute; 27 | mod upcast; 28 | 29 | pub mod prelude { 30 | pub use super::abs::*; 31 | pub use super::cmp::*; 32 | pub use super::destride::*; 33 | pub use super::downcast::*; 34 | pub use super::endian::*; 35 | pub use super::eq::*; 36 | pub use super::hadd::*; 37 | pub use super::hsub::*; 38 | pub use super::merge::*; 39 | pub use super::recip::*; 40 | pub use super::round::*; 41 | pub use super::rsqrt::*; 42 | pub use super::saturating_add::*; 43 | pub use super::saturating_hadd::*; 44 | pub use super::saturating_hsub::*; 45 | pub use super::saturating_sub::*; 46 | pub use super::sum::*; 47 | pub use super::sqrt::*; 48 | pub use super::transmute::*; 49 | pub use super::upcast::*; 50 | } 51 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/recip.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 
2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::recip::Recip; 11 | 12 | rust_fallback_impl! { 13 | impl Recip for f32x4 where "__undefined" { 14 | recip => __undefined(), [0, 1, 2, 3]; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/round.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::round::Round; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | rust_fallback_impl! { 13 | impl Round for f32x4 where "__undefined" { 14 | round => __undefined(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3]; 15 | ceil => __undefined(), [0, 1, 2, 3]; 16 | floor => __undefined(), [0, 1, 2, 3]; 17 | trunc => __undefined(_MM_FROUND_TRUNC), [0, 1, 2, 3]; 18 | } 19 | } 20 | 21 | rust_fallback_impl! { 22 | impl Round for f64x2 where "__undefined" { 23 | round => __undefined(_MM_FROUND_TO_NEAREST_INT), [0, 1]; 24 | ceil => __undefined(), [0, 1]; 25 | floor => __undefined(), [0, 1]; 26 | trunc => __undefined(_MM_FROUND_TRUNC), [0, 1]; 27 | } 28 | } 29 | 30 | rust_fallback_impl! { 31 | impl Round for f32x8 where "__undefined" { 32 | round => __undefined(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3, 4, 5, 6, 7]; 33 | ceil => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 34 | floor => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 35 | trunc => __undefined(_MM_FROUND_TRUNC), [0, 1, 2, 3, 4, 5, 6, 7]; 36 | } 37 | } 38 | 39 | rust_fallback_impl! { 40 | impl Round for f64x4 where "__undefined" { 41 | round => __undefined(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3]; 42 | ceil => __undefined(), [0, 1, 2, 3]; 43 | floor => __undefined(), [0, 1, 2, 3]; 44 | trunc => __undefined(_MM_FROUND_TRUNC), [0, 1, 2, 3]; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/rsqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::rsqrt::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | // TODO: Guards and non-simd 13 | // 14 | //rust_fallback_impl! { 15 | // impl Rsqrt for f32x8 where "__undefined" { 16 | // rsqrt => _mm256_rsqrt_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 17 | // } 18 | //} 19 | 20 | rust_fallback_impl! { 21 | impl Rsqrt for f32x4 where "__undefined" { 22 | rsqrt => __undefined(), [0, 1, 2, 3]; 23 | } 24 | } 25 | 26 | rust_fallback_impl! 
{ 27 | impl Rsqrt for f64x2 where "__undefined" { 28 | rsqrt => __undefined(), [0, 1]; 29 | } 30 | } 31 | 32 | impl Rsqrt for f32 { 33 | #[inline(always)] 34 | fn rsqrt(&self) -> Self { 35 | self.sqrt().recip() 36 | } 37 | } 38 | 39 | impl Rsqrt for f64 { 40 | #[inline(always)] 41 | fn rsqrt(&self) -> Self { 42 | self.sqrt().recip() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/saturating_add.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::saturating_add::*; 11 | 12 | rust_fallback_impl_binary! { 13 | impl SaturatingAdd for u8x16 where "__undefined" { 14 | saturating_add => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 15 | } 16 | } 17 | 18 | rust_fallback_impl_binary! { 19 | impl SaturatingAdd for i8x16 where "__undefined" { 20 | saturating_add => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 21 | } 22 | } 23 | 24 | rust_fallback_impl_binary! { 25 | impl SaturatingAdd for u16x8 where "__undefined" { 26 | saturating_add => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 27 | } 28 | } 29 | 30 | rust_fallback_impl_binary! { 31 | impl SaturatingAdd for i16x8 where "__undefined" { 32 | saturating_add => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 33 | } 34 | } 35 | 36 | rust_fallback_impl_binary! { 37 | impl SaturatingAdd for u32x4 where "__undefined" { 38 | saturating_add => __undefined(), [0, 1, 2, 3]; 39 | } 40 | } 41 | 42 | rust_fallback_impl_binary! { 43 | impl SaturatingAdd for i32x4 where "__undefined" { 44 | saturating_add => __undefined(), [0, 1, 2, 3]; 45 | } 46 | } 47 | 48 | rust_fallback_impl_binary! { 49 | impl SaturatingAdd for u64x2 where "__undefined" { 50 | saturating_add => __undefined(), [0, 1]; 51 | } 52 | } 53 | 54 | rust_fallback_impl_binary! { 55 | impl SaturatingAdd for i64x2 where "__undefined" { 56 | saturating_add => __undefined(), [0, 1]; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/saturating_hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
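// Saturating horizontal add: the same pairwise layout as HAdd, but each pair
// sum clamps at the lane type's bounds instead of wrapping. Illustrative
// sketch: for u8, a hadd pair of (200, 100) yields 255, since
// 200u8.saturating_add(100) == 255.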
7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::saturating_hadd::*; 11 | 12 | impl SaturatingHAdd for u64x2 { hop!(saturating_hadd, u64::saturating_add, 0, 1); } 13 | impl SaturatingHAdd for u32x4 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3); } 14 | impl SaturatingHAdd for u16x8 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 15 | impl SaturatingHAdd for u8x16 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 16 | impl SaturatingHAdd for i64x2 { hop!(saturating_hadd, i64::saturating_add, 0, 1); } 17 | impl SaturatingHAdd for i32x4 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3); } 18 | impl SaturatingHAdd for i16x8 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 19 | impl SaturatingHAdd for i8x16 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 20 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/saturating_hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::saturating_hsub::*; 11 | 12 | impl SaturatingHSub for u64x2 { hop!(saturating_hsub, u64::saturating_sub, 0, 1); } 13 | impl SaturatingHSub for u32x4 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3); } 14 | impl SaturatingHSub for u16x8 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 15 | impl SaturatingHSub for u8x16 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 16 | impl SaturatingHSub for i64x2 { hop!(saturating_hsub, i64::saturating_sub, 0, 1); } 17 | impl SaturatingHSub for i32x4 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3); } 18 | impl SaturatingHSub for i16x8 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 19 | impl SaturatingHSub for i8x16 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 20 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/saturating_sub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::saturating_sub::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | rust_fallback_impl_binary! { 13 | impl SaturatingSub for u8x16 where "__undefined" { 14 | saturating_sub => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 15 | } 16 | } 17 | 18 | rust_fallback_impl_binary! { 19 | impl SaturatingSub for i8x16 where "__undefined" { 20 | saturating_sub => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 21 | } 22 | } 23 | 24 | rust_fallback_impl_binary! 
{ 25 | impl SaturatingSub for u16x8 where "__undefined" { 26 | saturating_sub => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 27 | } 28 | } 29 | 30 | rust_fallback_impl_binary! { 31 | impl SaturatingSub for i16x8 where "__undefined" { 32 | saturating_sub => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/sqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::sqrt::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | rust_fallback_impl! { 13 | impl Sqrt for f32x4 where "__undefined" { 14 | sqrt => __undefined(), [0, 1, 2, 3]; 15 | } 16 | } 17 | 18 | rust_fallback_impl! { 19 | impl Sqrt for f64x2 where "__undefined" { 20 | sqrt => __undefined(), [0, 1]; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/sum.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::sum::{Sum,UpcastSum}; 11 | 12 | impl_packed_sum!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, f32x4, f64x2); 13 | impl_packed_upcast_sum!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, f32x4, f64x2); 14 | 15 | mod tests { 16 | #![allow(unused_imports)] 17 | 18 | use crate::prelude::*; 19 | use crate::arch::current::vecs::*; 20 | 21 | test_packed_sum_int!(u8x16, u8, test_packed_sum_u8x16); 22 | test_packed_sum_int!(i8x16, i8, test_packed_sum_i8x16); 23 | test_packed_sum_int!(u16x8, u16, test_packed_sum_u16x8); 24 | test_packed_sum_int!(i16x8, i16, test_packed_sum_i16x8); 25 | test_packed_sum_int!(u32x4, u32, test_packed_sum_u32x4); 26 | test_packed_sum_int!(i32x4, i32, test_packed_sum_i32x4); 27 | test_packed_sum_int!(u64x2, u64, test_packed_sum_u64x2); 28 | test_packed_sum_int!(i64x2, i64, test_packed_sum_i64x2); 29 | 30 | test_packed_sum!(f32x4, f32, test_packed_sum_f32x4); 31 | test_packed_sum!(f64x2, f64, test_packed_sum_f64x2); 32 | } 33 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/transmute.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::transmute::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::core::mem::transmute; 11 | 12 | impl_packed_transmute!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, f32x4, 13 | u64x2, i64x2, f64x2, ... 
14 | u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, 15 | f32x4, u64x2, i64x2, f64x2, 16 | "__undefined", "__undefined"); 17 | -------------------------------------------------------------------------------- /src/arch/unknown/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod intrin; 2 | pub mod vecs; 3 | pub mod vec_patterns; 4 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/addsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | 10 | // impl AddSub for f32x4 { 11 | // #[inline(always)] 12 | // fn addsub(&self, other: Self) -> Self { 13 | // unsafe { _mm_addsub_ps(*self, other) } 14 | // } 15 | // } 16 | 17 | // impl AddSub for f64x2 { 18 | // #[inline(always)] 19 | // fn addsub(&self, other: Self) -> Self { 20 | // unsafe { _mm_addsub_pd(*self, other) } 21 | // } 22 | // } 23 | 24 | // impl AddSub for f32x8 { 25 | // #[inline(always)] 26 | // fn addsub(&self, other: Self) -> Self { 27 | // unsafe { _mm256_addsub_ps(*self, other) } 28 | // } 29 | // } 30 | 31 | // impl AddSub for f64x4 { 32 | // #[inline(always)] 33 | // fn addsub(&self, other: Self) -> Self { 34 | // unsafe { _mm256_addsub_pd(*self, other) } 35 | // } 36 | // } 37 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/cast.rs: -------------------------------------------------------------------------------- 1 | 2 | // impl_cast!(Asu8s, i8x16, u8x16, as_u8s, as_u8x16); 3 | // impl_cast!(Asi8s, u8x16, i8x16, as_i8s, as_i8x16); 4 | 5 | // impl_cast!(Asu8s, i8x32, u8x32, as_u8s, as_u8x32); 6 | // impl_cast!(Asi8s, u8x32, i8x32, as_i8s, as_i8x32); 7 | 8 | // impl_cast!(Asu8s, i8x64, u8x64, as_u8s, as_u8x64); 9 | // impl_cast!(Asi8s, u8x64, i8x64, as_i8s, as_i8x64); 10 | 11 | // impl_cast!(Asu16s, i16x8, u16x8, as_u16s, as_u16x8); 12 | // impl_cast!(Asi16s, u16x8, i16x8, as_i16s, as_i16x8); 13 | 14 | // impl_cast!(Asu16s, i16x16, u16x16, as_u16s, as_u16x16); 15 | // impl_cast!(Asi16s, u16x16, i16x16, as_i16s, as_i16x16); 16 | 17 | // impl_cast!(Asu16s, i16x32, u16x32, as_u16s, as_u16x32); 18 | // impl_cast!(Asi16s, u16x32, i16x32, as_i16s, as_i16x32); 19 | 20 | // impl_cast!(Asu32s, i32x4, u32x4, as_u32s, as_u32x4); 21 | // impl_cast!(Asu32s, f32x4, u32x4, as_u32s, as_u32x4); 22 | // impl_cast!(Asi32s, f32x4, i32x4, as_i32s, as_i32x4); 23 | // impl_cast!(Asi32s, u32x4, i32x4, as_i32s, as_i32x4); 24 | // impl_cast!(Asf32s, u32x4, f32x4, as_f32s, as_f32x4); 25 | // impl_cast!(Asf32s, i32x4, f32x4, as_f32s, as_f32x4); 26 | 27 | // impl_cast!(Asu32s, i32x8, u32x8, as_u32s, as_u32x8); 28 | // impl_cast!(Asu32s, f32x8, u32x8, as_u32s, as_u32x8); 29 | // impl_cast!(Asi32s, f32x8, i32x8, as_i32s, as_i32x8); 30 | // impl_cast!(Asi32s, u32x8, i32x8, as_i32s, as_i32x8); 31 | // impl_cast!(Asf32s, u32x8, f32x8, as_f32s, as_f32x8); 32 | // impl_cast!(Asf32s, i32x8, f32x8, as_f32s, as_f32x8); 33 | 34 | // impl_cast!(Asu32s, i32x16, u32x16, as_u32s, as_u32x16); 35 | // impl_cast!(Asu32s, f32x16, u32x16, as_u32s, as_u32x16); 36 | // impl_cast!(Asi32s, f32x16, i32x16, as_i32s, as_i32x16); 37 | // 
impl_cast!(Asi32s, u32x16, i32x16, as_i32s, as_i32x16); 38 | // impl_cast!(Asf32s, u32x16, f32x16, as_f32s, as_f32x16); 39 | // impl_cast!(Asf32s, i32x16, f32x16, as_f32s, as_f32x16); 40 | 41 | // impl_cast!(Asu64s, i64x2, u64x2, as_u64s, as_u64x2); 42 | // impl_cast!(Asu64s, f64x2, u64x2, as_u64s, as_u64x2); 43 | // impl_cast!(Asi64s, f64x2, i64x2, as_i64s, as_i64x2); 44 | // impl_cast!(Asi64s, u64x2, i64x2, as_i64s, as_i64x2); 45 | // impl_cast!(Asf64s, u64x2, f64x2, as_f64s, as_f64x2); 46 | // impl_cast!(Asf64s, i64x2, f64x2, as_f64s, as_f64x2); 47 | 48 | // impl_cast!(Asu64s, i64x4, u64x4, as_u64s, as_u64x4); 49 | // impl_cast!(Asu64s, f64x4, u64x4, as_u64s, as_u64x4); 50 | // impl_cast!(Asi64s, f64x4, i64x4, as_i64s, as_i64x4); 51 | // impl_cast!(Asi64s, u64x4, i64x4, as_i64s, as_i64x4); 52 | // impl_cast!(Asf64s, u64x4, f64x4, as_f64s, as_f64x4); 53 | // impl_cast!(Asf64s, i64x4, f64x4, as_f64s, as_f64x4); 54 | 55 | // impl_cast!(Asu64s, i64x8, u64x8, as_u64s, as_u64x8); 56 | // impl_cast!(Asu64s, f64x8, u64x8, as_u64s, as_u64x8); 57 | // impl_cast!(Asi64s, f64x8, i64x8, as_i64s, as_i64x8); 58 | // impl_cast!(Asi64s, u64x8, i64x8, as_i64s, as_i64x8); 59 | // impl_cast!(Asf64s, u64x8, f64x8, as_f64s, as_f64x8); 60 | // impl_cast!(Asf64s, i64x8, f64x8, as_f64s, as_f64x8); 61 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/cmp.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::cmp::*; 13 | 14 | rust_fallback_impl_binary! { 15 | impl Cmp for u8x16 where "sse2" { 16 | min => _mm_min_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 17 | max => _mm_max_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 18 | } 19 | } 20 | 21 | rust_fallback_impl_binary! { 22 | impl Cmp for i8x16 where "sse4.1" { 23 | min => _mm_min_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 24 | max => _mm_max_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 25 | } 26 | } 27 | 28 | rust_fallback_impl_binary! { 29 | impl Cmp for u16x8 where "sse4.1" { 30 | min => _mm_min_epu16(), [0, 1, 2, 3, 4, 5, 6, 7]; 31 | max => _mm_max_epu16(), [0, 1, 2, 3, 4, 5, 6, 7]; 32 | } 33 | } 34 | 35 | rust_fallback_impl_binary! { 36 | impl Cmp for i16x8 where "sse4.1" { 37 | min => _mm_min_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 38 | max => _mm_max_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 39 | } 40 | } 41 | 42 | rust_fallback_impl_binary! { 43 | impl Cmp for u32x4 where "sse4.1" { 44 | min => _mm_min_epu32(), [0, 1, 2, 3]; 45 | max => _mm_max_epu32(), [0, 1, 2, 3]; 46 | } 47 | } 48 | 49 | rust_fallback_impl_binary! { 50 | impl Cmp for i32x4 where "sse4.1" { 51 | min => _mm_min_epi32(), [0, 1, 2, 3]; 52 | max => _mm_max_epi32(), [0, 1, 2, 3]; 53 | } 54 | } 55 | 56 | rust_fallback_impl_binary! { 57 | impl Cmp for f32x4 where "sse" { 58 | min => _mm_min_ps(), [0, 1, 2, 3]; 59 | max => _mm_max_ps(), [0, 1, 2, 3]; 60 | } 61 | } 62 | 63 | rust_fallback_impl_binary! 
{ 64 | impl Cmp for f64x2 where "sse2" { 65 | min => _mm_min_pd(), [0, 1]; 66 | max => _mm_max_pd(), [0, 1]; 67 | } 68 | } 69 | 70 | rust_fallback_impl_binary! { 71 | impl Cmp for u8x32 where "avx2" { 72 | min => _mm256_min_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 73 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 74 | max => _mm256_max_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 75 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 76 | } 77 | } 78 | 79 | rust_fallback_impl_binary! { 80 | impl Cmp for i8x32 where "avx2" { 81 | min => _mm256_min_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 82 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 83 | max => _mm256_max_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 84 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 85 | } 86 | } 87 | 88 | rust_fallback_impl_binary! { 89 | impl Cmp for u16x16 where "avx2" { 90 | min => _mm256_min_epu16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 91 | max => _mm256_max_epu16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 92 | } 93 | } 94 | 95 | rust_fallback_impl_binary! { 96 | impl Cmp for i16x16 where "avx2" { 97 | min => _mm256_min_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 98 | max => _mm256_max_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 99 | } 100 | } 101 | 102 | rust_fallback_impl_binary! { 103 | impl Cmp for u32x8 where "avx2" { 104 | min => _mm256_min_epu32(), [0, 1, 2, 3, 4, 5, 6, 7]; 105 | max => _mm256_max_epu32(), [0, 1, 2, 3, 4, 5, 6, 7]; 106 | } 107 | } 108 | 109 | rust_fallback_impl_binary! { 110 | impl Cmp for i32x8 where "avx2" { 111 | min => _mm256_min_epi32(), [0, 1, 2, 3, 4, 5, 6, 7]; 112 | max => _mm256_max_epi32(), [0, 1, 2, 3, 4, 5, 6, 7]; 113 | } 114 | } 115 | 116 | rust_fallback_impl_binary! { 117 | impl Cmp for f32x8 where "avx" { 118 | min => _mm256_min_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 119 | max => _mm256_max_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 120 | } 121 | } 122 | 123 | rust_fallback_impl_binary!
{ 124 | impl Cmp for f64x4 where "avx" { 125 | min => _mm256_min_pd(), [0, 1, 2, 3]; 126 | max => _mm256_max_pd(), [0, 1, 2, 3]; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/destride.rs: -------------------------------------------------------------------------------- 1 | use crate::arch::current::vecs::*; 2 | use crate::vecs::*; 3 | use crate::vektor::x86_64::*; 4 | use crate::vektor::x86::*; 5 | use crate::intrin::merge::*; 6 | use crate::intrin::transmute::*; 7 | use crate::intrin::destride::*; 8 | use crate::core::mem::transmute; 9 | 10 | impl Destride for u8x16 { 11 | #[inline(always)] 12 | #[cfg(target_feature = "ssse3")] 13 | fn destride_two(self, other: Self) -> (Self, Self) { 14 | optimized!(); 15 | unsafe { 16 | let a = _mm_shuffle_epi8(self.be_i8s(), Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15).be_i8s()); 17 | let b = _mm_shuffle_epi8(other.be_i8s(), Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14).be_i8s()); 18 | // Backwards merge of a and b (keeps elements at the same indices) 19 | let c = _mm_shuffle_epi8(b.merge_halves(a), Self::new(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7).be_i8s()); 20 | (a.merge_halves(b).be_u8s(), c.be_u8s()) 21 | } 22 | } 23 | 24 | #[inline(always)] 25 | #[cfg(not(target_feature = "ssse3"))] 26 | fn destride_two(self, other: Self) -> (Self, Self) { 27 | fallback!(); 28 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14) 29 | } 30 | 31 | #[inline(always)] 32 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 33 | fallback!(); 34 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12) 35 | } 36 | } 37 | 38 | impl Destride for u8x32 { 39 | #[inline(always)] 40 | #[cfg(target_feature = "avx2")] 41 | fn destride_two(self, other: Self) -> (Self, Self) { 42 | optimized!(); 43 | unsafe { 44 | // In-lane destrided vectors 45 | let a = _mm256_shuffle_epi8(self.be_i8s(), Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15).be_i8s()); 46 | let b = _mm256_shuffle_epi8(other.be_i8s(), Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14).be_i8s()); 47 | // Cross-lane destrided vectors 48 | let aa = _mm256_permute4x64_epi64(a.be_i64s(), 0xD8).be_u8s(); 49 | let bb = _mm256_permute4x64_epi64(b.be_i64s(), 0xD8).be_u8s(); 50 | // Backwards merge of aa and bb (keeps elements at the same indices) 51 | let c = _mm256_permute4x64_epi64(aa.merge_halves(bb).be_i64s(), 0x4E).be_u8s(); 52 | (aa.merge_halves(bb), c) 53 | } 54 | } 55 | 56 | #[inline(always)] 57 | #[cfg(not(target_feature = "avx2"))] 58 | fn destride_two(self, other: Self) -> (Self, Self) { 59 | fallback!(); 60 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) 61 | } 62 | 63 | #[inline(always)] 64 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 65 | fallback!(); 66 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12, 16, 20, 24, 28) 67 | } 68 | } 69 | 70 | impl Destride for i8x16 { 71 | #[inline(always)] 72 | #[cfg(target_feature = "ssse3")] 73 | fn destride_two(self, other: Self) -> (Self, Self) { 74 | optimized!(); 75 | unsafe { 76 | let a = _mm_shuffle_epi8(transmute(self), transmute(Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15))); 77 | let b = _mm_shuffle_epi8(transmute(other), transmute(Self::new(1, 3, 5, 7, 9, 11, 13, 
15, 0, 2, 4, 6, 8, 10, 12, 14))); 78 | // Backwards merge of a and b (keeps elements at the same indices) 79 | let c = _mm_shuffle_epi8(b.merge_halves(a), transmute(Self::new(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7))); 80 | (a.be_i8s().merge_halves(b.be_i8s()), c.be_i8s()) 81 | } 82 | } 83 | 84 | #[inline(always)] 85 | #[cfg(not(target_feature = "ssse3"))] 86 | fn destride_two(self, other: Self) -> (Self, Self) { 87 | fallback!(); 88 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14) 89 | } 90 | 91 | #[inline(always)] 92 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 93 | fallback!(); 94 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12) 95 | } 96 | } 97 | 98 | impl Destride for i8x32 { 99 | #[inline(always)] 100 | #[cfg(target_feature = "avx2")] 101 | fn destride_two(self, other: Self) -> (Self, Self) { 102 | optimized!(); 103 | unsafe { 104 | // In-lane destrided vectors 105 | let a = _mm256_shuffle_epi8(transmute(self), transmute(Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15))); 106 | let b = _mm256_shuffle_epi8(transmute(other), transmute(Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14))); 107 | // Cross-lane destrided vectors 108 | let aa = _mm256_permute4x64_epi64(a.be_i64s(), 0xD8).be_i8s(); 109 | let bb = _mm256_permute4x64_epi64(b.be_i64s(), 0xD8).be_i8s(); 110 | // Backwards merge of aa and bb (keeps elements at the same indices) 111 | let c = _mm256_permute4x64_epi64(aa.merge_halves(bb).be_i64s(), 0x4E).be_i8s(); 112 | (aa.merge_halves(bb), c) 113 | } 114 | } 115 | 116 | #[inline(always)] 117 | #[cfg(not(target_feature = "avx2"))] 118 | fn destride_two(self, other: Self) -> (Self, Self) { 119 | fallback!(); 120 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) 121 | } 122 | 123 | #[inline(always)] 124 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 125 | fallback!(); 126 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12, 16, 20, 24, 28) 127 | } 128 | } 129 | 130 | macro_rules! impl_destride { 131 | ($t:ty, $($two:expr, $four:expr),*) => { 132 | impl Destride for $t { 133 | #[inline(always)] 134 | fn destride_two(self, other: Self) -> (Self, Self) { 135 | fallback!(); 136 | destride_two_polyfill!(self, other, $($two, $four),*) 137 | } 138 | 139 | #[inline(always)] 140 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 141 | fallback!(); 142 | destride_four_polyfill!(self, b, c, d, $($two),*) 143 | } 144 | } 145 | } 146 | } 147 | 148 | impl_destride!(u16x16, 0, 2, 4, 6, 8, 10, 12, 14); 149 | impl_destride!(u16x8, 0, 2, 4, 6); 150 | impl_destride!(i16x16, 0, 2, 4, 6, 8, 10, 12, 14); 151 | impl_destride!(i16x8, 0, 2, 4, 6); 152 | 153 | impl_destride!(u32x8, 0, 2, 4, 6); 154 | impl_destride!(u32x4, 0, 2); 155 | impl_destride!(i32x8, 0, 2, 4, 6); 156 | impl_destride!(i32x4, 0, 2); 157 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/eq.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::core::ops::BitXor; 11 | use crate::intrin::eq::*; 12 | use crate::arch::current::vecs::*; 13 | use crate::vecs::*; 14 | 15 | rust_fallback_eq! { 16 | impl Eq for u8x16 where "sse2" { 17 | eq_mask, eq => u8x16, u8, _mm_cmpeq_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 18 | } 19 | } 20 | 21 | rust_fallback_eq! { 22 | impl Eq for i8x16 where "sse4.1" { 23 | eq_mask, eq => u8x16, u8, _mm_cmpeq_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 24 | } 25 | } 26 | 27 | rust_fallback_eq! { 28 | impl Eq for u16x8 where "sse4.1" { 29 | eq_mask, eq => u16x8, u16, _mm_cmpeq_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 30 | } 31 | } 32 | 33 | rust_fallback_eq! { 34 | impl Eq for i16x8 where "sse4.1" { 35 | eq_mask, eq => u16x8, u16, _mm_cmpeq_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 36 | } 37 | } 38 | 39 | rust_fallback_eq! { 40 | impl Eq for u32x4 where "sse4.1" { 41 | eq_mask, eq => u32x4, u32, _mm_cmpeq_epi32(), [0, 1, 2, 3]; 42 | } 43 | } 44 | 45 | rust_fallback_eq! { 46 | impl Eq for i32x4 where "sse4.1" { 47 | eq_mask, eq => u32x4, u32, _mm_cmpeq_epi32(), [0, 1, 2, 3]; 48 | } 49 | } 50 | 51 | rust_fallback_eq! { 52 | impl Eq for f32x4 where "sse" { 53 | eq_mask, eq => u32x4, u32, _mm_cmpeq_ps(), [0, 1, 2, 3]; 54 | } 55 | } 56 | 57 | rust_fallback_eq! { 58 | impl Eq for f64x2 where "sse2" { 59 | eq_mask, eq => u64x2, u64, _mm_cmpeq_pd(), [0, 1]; 60 | } 61 | } 62 | 63 | rust_fallback_eq! { 64 | impl Eq for u64x2 where "sse4.1" { 65 | eq_mask, eq => u64x2, u64, _mm_cmpeq_epi64(), [0, 1]; 66 | } 67 | } 68 | 69 | rust_fallback_eq! { 70 | impl Eq for i64x2 where "sse4.1" { 71 | eq_mask, eq => u64x2, u64, _mm_cmpeq_epi64(), [0, 1]; 72 | } 73 | } 74 | 75 | rust_fallback_eq! { 76 | impl Eq for u8x32 where "avx2" { 77 | eq_mask, eq => u8x32, u8, _mm256_cmpeq_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 78 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 79 | } 80 | } 81 | 82 | rust_fallback_eq! { 83 | impl Eq for i8x32 where "avx2" { 84 | eq_mask, eq => u8x32, u8, _mm256_cmpeq_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 85 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 86 | } 87 | } 88 | 89 | rust_fallback_eq! { 90 | impl Eq for u16x16 where "avx2" { 91 | eq_mask, eq => u16x16, u16, _mm256_cmpeq_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 92 | } 93 | } 94 | 95 | rust_fallback_eq! { 96 | impl Eq for i16x16 where "avx2" { 97 | eq_mask, eq => u16x16, u16, _mm256_cmpeq_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 98 | } 99 | } 100 | 101 | rust_fallback_eq! { 102 | impl Eq for u32x8 where "avx2" { 103 | eq_mask, eq => u32x8, u32, _mm256_cmpeq_epi32(), [0, 1, 2, 3, 4, 5, 6, 7]; 104 | } 105 | } 106 | 107 | rust_fallback_eq! { 108 | impl Eq for i32x8 where "avx2" { 109 | eq_mask, eq => u32x8, u32, _mm256_cmpeq_epi32(), [0, 1, 2, 3, 4, 5, 6, 7]; 110 | } 111 | } 112 | 113 | rust_fallback_eq! { 114 | impl Eq for f32x8 where "avx" { 115 | eq_mask, eq => u32x8, u32, _mm256_cmp_ps(0x00), [0, 1, 2, 3, 4, 5, 6, 7]; 116 | } 117 | } 118 | 119 | rust_fallback_eq! { 120 | impl Eq for f64x4 where "avx" { 121 | eq_mask, eq => u64x4, u64, _mm256_cmp_pd(0x00), [0, 1, 2, 3]; 122 | } 123 | } 124 | 125 | rust_fallback_eq! 
{ 126 | impl Eq for u64x4 where "avx2" { 127 | eq_mask, eq => u64x4, u64, _mm256_cmpeq_epi64(), [0, 1, 2, 3]; 128 | } 129 | } 130 | 131 | rust_fallback_eq! { 132 | impl Eq for i64x4 where "avx2" { 133 | eq_mask, eq => u64x4, u64, _mm256_cmpeq_epi64(), [0, 1, 2, 3]; 134 | } 135 | } 136 | 137 | #[cfg(test)] mod tests { 138 | use crate::prelude::*; 139 | use crate::arch::current::vecs::*; 140 | 141 | // test_packed_eq!(u8x64, u8, u8x64, u8, test_eq_u8x64); 142 | test_packed_eq!(u8x32, u8, u8x32, u8, test_eq_u8x32); 143 | test_packed_eq!(u8x16, u8, u8x16, u8, test_eq_u8x16); 144 | // test_packed_eq!(i8x64, i8, u8x64, u8, test_eq_i8x64); 145 | test_packed_eq!(i8x32, i8, u8x32, u8, test_eq_i8x32); 146 | test_packed_eq!(i8x16, i8, u8x16, u8, test_eq_i8x16); 147 | // test_packed_eq!(u16x32, u16, u16x32, u16, test_eq_u16x32); 148 | test_packed_eq!(u16x16, u16, u16x16, u16, test_eq_u16x16); 149 | test_packed_eq!(u16x8, u16, u16x8, u16, test_eq_u16x8); 150 | // test_packed_eq!(i16x32, i16, u16x32, u16, test_eq_i16x32); 151 | test_packed_eq!(i16x16, i16, u16x16, u16, test_eq_i16x16); 152 | test_packed_eq!(i16x8, i16, u16x8, u16, test_eq_i16x8); 153 | // test_packed_eq!(u32x16, u32, u32x16, u32, test_eq_u32x16); 154 | test_packed_eq!(u32x8, u32, u32x8, u32, test_eq_u32x8); 155 | test_packed_eq!(u32x4, u32, u32x4, u32, test_eq_u32x4); 156 | // test_packed_eq!(i32x16, i32, u32x16, u32, test_eq_i32x16); 157 | test_packed_eq!(i32x8, i32, u32x8, u32, test_eq_i32x8); 158 | test_packed_eq!(i32x4, i32, u32x4, u32, test_eq_i32x4); 159 | // test_packed_eq!(f32x16, f32, u32x16, u32, test_eq_f32x16); 160 | test_packed_eq!(f32x8, f32, u32x8, u32, test_eq_f32x8); 161 | test_packed_eq!(f32x4, f32, u32x4, u32, test_eq_f32x4); 162 | // test_packed_eq!(u64x8, u64, u64x8, u64, test_eq_u64x8); 163 | test_packed_eq!(u64x4, u64, u64x4, u64, test_eq_u64x4); 164 | test_packed_eq!(u64x2, u64, u64x2, u64, test_eq_u64x2); 165 | // test_packed_eq!(i64x8, i64, u64x8, u64, test_eq_i64x8); 166 | test_packed_eq!(i64x4, i64, u64x4, u64, test_eq_i64x4); 167 | test_packed_eq!(i64x2, i64, u64x2, u64, test_eq_i64x2); 168 | // test_packed_eq!(f64x8, f64, u64x8, u64, test_eq_f64x8); 169 | test_packed_eq!(f64x4, f64, u64x4, u64, test_eq_f64x4); 170 | test_packed_eq!(f64x2, f64, u64x2, u64, test_eq_f64x2); 171 | } 172 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
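// (Editor's sketch, not part of the original source.) `HAdd` is a horizontal
// add: each output lane holds the sum of an adjacent pair of input lanes, with
// pair sums from `self` and `other` interleaved. This layout is inferred from
// the scalar `hop!` polyfill at the bottom of this file; the lane shuffles in
// the SSE3/AVX2 paths exist only to make the hardware intrinsics produce the
// same lane order as that polyfill. Assuming the crate's `f32x4` and the
// `HAdd` trait:
//
//     let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
//     let b = f32x4::new(10.0, 20.0, 30.0, 40.0);
//     // pair sums, interleaved: [a0+a1, b0+b1, a2+a3, b2+b3]
//     assert_eq!(a.hadd(b), f32x4::new(3.0, 30.0, 7.0, 70.0));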
7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::transmute::*; 11 | use crate::intrin::hadd::*; 12 | use crate::core::ops::Add; 13 | use crate::arch::current::vecs::*; 14 | use crate::vecs::*; 15 | 16 | #[cfg(target_feature = "sse3")] 17 | impl HAdd for f32x4 { 18 | #[inline(always)] 19 | fn hadd(&self, other: Self) -> Self { 20 | optimized!(); 21 | unsafe { _mm_hadd_ps(_mm_shuffle_ps(*self, other, 0b01000100), 22 | _mm_shuffle_ps(*self, other, 0b11101110)) } 23 | } 24 | } 25 | 26 | #[cfg(target_feature = "sse3")] 27 | impl HAdd for f64x2 { 28 | #[inline(always)] 29 | #[cfg(target_feature = "sse3")] 30 | fn hadd(&self, other: Self) -> Self { 31 | optimized!(); 32 | unsafe { _mm_hadd_pd(*self, other) } 33 | } 34 | } 35 | 36 | #[cfg(target_feature = "avx2")] 37 | impl HAdd for f32x8 { 38 | #[inline(always)] 39 | fn hadd(&self, other: Self) -> Self { 40 | optimized!(); 41 | unsafe { _mm256_hadd_ps(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), 42 | _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked()) } 43 | } 44 | } 45 | 46 | #[cfg(target_feature = "avx")] 47 | impl HAdd for f64x4 { 48 | #[inline(always)] 49 | fn hadd(&self, other: Self) -> Self { 50 | optimized!(); 51 | unsafe { _mm256_hadd_pd(*self, other) } 52 | } 53 | } 54 | 55 | #[cfg(target_feature = "ssse3")] 56 | impl HAdd for i16x8 { 57 | #[inline(always)] 58 | fn hadd(&self, other: Self) -> Self { 59 | optimized!(); 60 | unsafe { _mm_hadd_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 61 | _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 62 | } 63 | } 64 | 65 | #[cfg(target_feature = "ssse3")] 66 | impl HAdd for i32x4 { 67 | #[inline(always)] 68 | fn hadd(&self, other: Self) -> Self { 69 | optimized!(); 70 | unsafe { _mm_hadd_epi32(_mm_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), 71 | _mm_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } 72 | } 73 | } 74 | 75 | #[cfg(target_feature = "avx2")] 76 | impl HAdd for i16x16 { 77 | #[inline(always)] 78 | fn hadd(&self, other: Self) -> Self { 79 | optimized!(); 80 | unsafe { _mm256_hadd_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 81 | _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 82 | } 83 | } 84 | 85 | #[cfg(target_feature = "avx2")] 86 | impl HAdd for i32x8 { 87 | #[inline(always)] 88 | fn hadd(&self, other: Self) -> Self { 89 | optimized!(); 90 | unsafe { _mm256_hadd_epi32(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), 91 | _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } 92 | } 93 | } 94 | 95 | impl HAdd for u64x2 { hop!(hadd, Add::add, 0, 1); } 96 | impl HAdd for u64x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 97 | impl HAdd for u64x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 98 | impl HAdd for u32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 99 | impl HAdd for u32x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 100 | impl HAdd for u32x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 101 | impl HAdd for u16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 102 | impl HAdd for u16x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 103 | impl HAdd for u16x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 104 | impl HAdd for u8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15); } 105 | impl HAdd for u8x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 106 | impl HAdd for u8x64 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 107 | impl HAdd for i64x2 { hop!(hadd, Add::add, 0, 1); } 108 | impl HAdd for i64x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 109 | impl HAdd for i64x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 110 | #[cfg(not(target_feature = "ssse3"))] 111 | impl HAdd for i32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 112 | #[cfg(not(target_feature = "avx2"))] 113 | impl HAdd for i32x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 114 | impl HAdd for i32x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 115 | #[cfg(not(target_feature = "ssse3"))] 116 | impl HAdd for i16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 117 | #[cfg(not(target_feature = "avx2"))] 118 | impl HAdd for i16x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 119 | impl HAdd for i16x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 120 | impl HAdd for i8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 121 | impl HAdd for i8x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 122 | impl HAdd for i8x64 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 123 | #[cfg(not(target_feature = "sse3"))] 124 | impl HAdd for f64x2 { hop!(hadd, Add::add, 0, 1); } 125 | #[cfg(not(target_feature = "avx"))] 126 | impl HAdd for f64x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 127 | impl HAdd for f64x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 128 | #[cfg(not(target_feature = "sse3"))] 129 | impl HAdd for f32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 130 | #[cfg(not(target_feature = "avx2"))] 131 | impl HAdd for f32x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 132 | impl HAdd for f32x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 133 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
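// (Editor's sketch, not part of the original source.) `HSub` mirrors `HAdd`:
// each output lane is the difference of an adjacent input pair (even lane
// minus odd lane), again interleaving results from `self` and `other`.
// Assuming the crate's `f32x4` and the `HSub` trait:
//
//     let a = f32x4::new(5.0, 1.0, 9.0, 2.0);
//     let b = f32x4::new(40.0, 10.0, 80.0, 20.0);
//     // pair differences, interleaved: [a0-a1, b0-b1, a2-a3, b2-b3]
//     assert_eq!(a.hsub(b), f32x4::new(4.0, 30.0, 7.0, 60.0));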
7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::transmute::*; 13 | use crate::intrin::hsub::*; 14 | use crate::core::ops::Sub; 15 | 16 | #[cfg(target_feature = "sse3")] 17 | impl HSub for f32x4 { 18 | #[inline(always)] 19 | fn hsub(&self, other: Self) -> Self { 20 | optimized!(); 21 | unsafe { _mm_hsub_ps(_mm_shuffle_ps(*self, other, 0b01000100), 22 | _mm_shuffle_ps(*self, other, 0b11101110)) } 23 | } 24 | } 25 | 26 | #[cfg(target_feature = "sse3")] 27 | impl HSub for f64x2 { 28 | #[inline(always)] 29 | fn hsub(&self, other: Self) -> Self { 30 | optimized!(); 31 | unsafe { _mm_hsub_pd(*self, other) } 32 | } 33 | } 34 | 35 | #[cfg(target_feature = "avx2")] 36 | impl HSub for f32x8 { 37 | #[inline(always)] 38 | fn hsub(&self, other: Self) -> Self { 39 | optimized!(); 40 | unsafe { _mm256_hsub_ps(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), 41 | _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked()) } 42 | } 43 | } 44 | 45 | #[cfg(target_feature = "avx")] 46 | impl HSub for f64x4 { 47 | #[inline(always)] 48 | fn hsub(&self, other: Self) -> Self { 49 | optimized!(); 50 | unsafe { _mm256_hsub_pd(*self, other) } 51 | } 52 | } 53 | 54 | #[cfg(target_feature = "ssse3")] 55 | impl HSub for i16x8 { 56 | #[inline(always)] 57 | fn hsub(&self, other: Self) -> Self { 58 | optimized!(); 59 | unsafe { _mm_hsub_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 60 | _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 61 | } 62 | } 63 | 64 | #[cfg(target_feature = "ssse3")] 65 | impl HSub for i32x4 { 66 | #[inline(always)] 67 | fn hsub(&self, other: Self) -> Self { 68 | optimized!(); 69 | unsafe { _mm_hsub_epi32(_mm_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), 70 | _mm_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } 71 | } 72 | } 73 | 74 | #[cfg(target_feature = "avx2")] 75 | impl HSub for i16x16 { 76 | #[inline(always)] 77 | fn hsub(&self, other: Self) -> Self { 78 | optimized!(); 79 | unsafe { _mm256_hsub_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 80 | _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 81 | } 82 | } 83 | 84 | #[cfg(target_feature = "avx2")] 85 | impl HSub for i32x8 { 86 | #[inline(always)] 87 | fn hsub(&self, other: Self) -> Self { 88 | optimized!(); 89 | unsafe { _mm256_hsub_epi32(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), 90 | _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } 91 | } 92 | } 93 | 94 | impl HSub for u64x2 { hop!(hsub, Sub::sub, 0, 1); } 95 | impl HSub for u64x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 96 | impl HSub for u64x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 97 | impl HSub for u32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 98 | impl HSub for u32x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 99 | impl HSub for u32x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 100 | impl HSub for u16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 101 | impl HSub for u16x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 102 | impl HSub for u16x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 103 | impl HSub for u8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 104 | 
impl HSub for u8x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 105 | impl HSub for u8x64 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 106 | impl HSub for i64x2 { hop!(hsub, Sub::sub, 0, 1); } 107 | impl HSub for i64x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 108 | impl HSub for i64x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 109 | #[cfg(not(target_feature = "ssse3"))] 110 | impl HSub for i32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 111 | #[cfg(not(target_feature = "avx2"))] 112 | impl HSub for i32x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 113 | impl HSub for i32x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 114 | #[cfg(not(target_feature = "ssse3"))] 115 | impl HSub for i16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 116 | #[cfg(not(target_feature = "avx2"))] 117 | impl HSub for i16x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 118 | impl HSub for i16x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 119 | impl HSub for i8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 120 | impl HSub for i8x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 121 | impl HSub for i8x64 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 122 | #[cfg(not(target_feature = "sse3"))] 123 | impl HSub for f64x2 { hop!(hsub, Sub::sub, 0, 1); } 124 | #[cfg(not(target_feature = "avx"))] 125 | impl HSub for f64x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 126 | impl HSub for f64x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 127 | #[cfg(not(target_feature = "sse3"))] 128 | impl HSub for f32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 129 | #[cfg(not(target_feature = "avx2"))] 130 | impl HSub for f32x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 131 | impl HSub for f32x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 132 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/merge.rs: -------------------------------------------------------------------------------- 1 | use crate::arch::current::vecs::*; 2 | use crate::vecs::*; 3 | use crate::vec_patterns::*; 4 | use crate::vektor::x86_64::*; 5 | use crate::vektor::x86::*; 6 | use crate::intrin::transmute::*; 7 | use crate::intrin::merge::*; 8 | use crate::core::mem::transmute; 9 | 10 | // TODO: The AVX-512 version of this macro doesn't work; impl when stdsimd gets 11 | // around to it (and when I have some hardware to test it on). 
12 | impl_packed_merge!(u8x16, u8x16, u8, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 13 | impl_packed_merge!(u8x32, u8x32, u8, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 14 | impl_packed_merge!(u8x64, u8x64, u8, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), (32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); 15 | 16 | impl_packed_merge!(u16x8, u16x8, u16, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 17 | impl_packed_merge!(u16x16, u16x16, u16, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 18 | impl_packed_merge!(u16x32, u16x32, u16, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 19 | 20 | impl_packed_merge!(u32x4, u32x4, u32, _mm_blendv_epi8, "sse4.1", (0, 1), (2, 3), 0, 1, 2, 3); 21 | impl_packed_merge!(u32x8, u32x8, u32, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 22 | impl_packed_merge!(u32x16, u32x16, u32, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 23 | 24 | impl_packed_merge!(u64x2, u64x2, u64, _mm_blendv_epi8, "sse4.1", (0), (1), 0, 1); 25 | impl_packed_merge!(u64x4, u64x4, u64, _mm256_blendv_epi8, "avx2", (0, 1), (2, 3), 0, 1, 2, 3); 26 | impl_packed_merge!(u64x8, u64x8, u64, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 27 | 28 | impl_packed_merge!(i8x16, u8x16, u8, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 29 | impl_packed_merge!(i8x32, u8x32, u8, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 30 | impl_packed_merge!(i8x64, u8x64, u8, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), (32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); 
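// (Editor's note, not part of the original source.) The two index tuples in
// each invocation above and below are the lower-half and upper-half lane
// lists for the generated merge. Judging by its use in destride.rs,
// `merge_halves` keeps every element at its original index: the low half
// comes from `self` and the high half from `other`. A sketch of the expected
// behavior, assuming the crate's `u8x16`:
//
//     let a = u8x16::splat(1);
//     let b = u8x16::splat(2);
//     // low eight lanes from `a`, high eight lanes from `b`
//     assert_eq!(a.merge_halves(b),
//                u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2));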
31 | 32 | impl_packed_merge!(i16x8, u16x8, u16, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 33 | impl_packed_merge!(i16x16, u16x16, u16, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 34 | impl_packed_merge!(i16x32, u16x32, u16, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 35 | 36 | impl_packed_merge!(i32x4, u32x4, u32, _mm_blendv_epi8, "sse4.1", (0, 1), (2, 3), 0, 1, 2, 3); 37 | impl_packed_merge!(i32x8, u32x8, u32, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 38 | impl_packed_merge!(i32x16, u32x16, u32, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 39 | 40 | impl_packed_merge!(i64x2, u64x2, u64, _mm_blendv_epi8, "sse4.1", (0), (1), 0, 1); 41 | impl_packed_merge!(i64x4, u64x4, u64, _mm256_blendv_epi8, "avx2", (0, 1), (2, 3), 0, 1, 2, 3); 42 | impl_packed_merge!(i64x8, u64x8, u64, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 43 | 44 | impl_packed_merge!(f32x4, u32x4, u32, _mm_blendv_epi8, "sse4.1", (0, 1), (2, 3), 0, 1, 2, 3); 45 | impl_packed_merge!(f32x8, u32x8, u32, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 46 | impl_packed_merge!(f32x16, u32x16, u32, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 47 | 48 | impl_packed_merge!(f64x2, u64x2, u64, _mm_blendv_epi8, "sse4.1", (0), (1), 0, 1); 49 | impl_packed_merge!(f64x4, u64x4, u64, _mm256_blendv_epi8, "avx2", (0, 1), (2, 3), 0, 1, 2, 3); 50 | impl_packed_merge!(f64x8, u64x8, u64, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 51 | 52 | #[cfg(test)] mod tests { 53 | use crate::prelude::*; 54 | use crate::arch::current::vecs::*; 55 | 56 | test_packed_merge!( 57 | (u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2), 58 | (merge_u8x64, merge_u8x32, merge_u8x16, merge_i8x64, merge_i8x32, merge_i8x16, merge_u16x32, merge_u16x16, merge_u16x8, merge_i16x32, merge_i16x16, merge_i16x8, merge_u32x16, merge_u32x8, merge_u32x4, merge_i32x16, merge_i32x8, merge_i32x4, merge_f32x16, merge_f32x8, merge_f32x4, merge_u64x8, merge_u64x4, merge_u64x2, merge_i64x8, merge_i64x4, merge_i64x2, merge_f64x8, merge_f64x4, merge_f64x2)); 59 | } 60 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/mod.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
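// (Editor's note, not part of the original source.) This module is pure
// plumbing: each intrinsic trait impl lives in its own file, and the internal
// `prelude` below re-exports them so the crate's public prelude can expose
// every x86 implementation with a single glob import. Downstream code would
// typically write (assuming the crate name `faster` from the file headers):
//
//     use faster::prelude::*;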
7 | 8 | #![allow(unused_imports)] 9 | 10 | mod abs; 11 | mod addsub; 12 | mod cmp; 13 | mod destride; 14 | mod downcast; 15 | mod endian; 16 | mod eq; 17 | mod hadd; 18 | mod hsub; 19 | mod merge; 20 | mod popcnt; 21 | mod recip; 22 | mod round; 23 | mod sum; 24 | mod rsqrt; 25 | mod saturating_add; 26 | mod saturating_hadd; 27 | mod saturating_sub; 28 | mod saturating_hsub; 29 | mod sqrt; 30 | mod transmute; 31 | mod upcast; 32 | 33 | // We use an internal prelude not to clutter the namespace when we import 34 | // from the actual prelude. 35 | pub mod prelude { 36 | pub use super::abs::*; 37 | pub use super::addsub::*; 38 | pub use super::cmp::*; 39 | pub use super::destride::*; 40 | pub use super::downcast::*; 41 | pub use super::endian::*; 42 | pub use super::eq::*; 43 | pub use super::hadd::*; 44 | pub use super::hsub::*; 45 | pub use super::merge::*; 46 | pub use super::popcnt::*; 47 | pub use super::recip::*; 48 | pub use super::round::*; 49 | pub use super::rsqrt::*; 50 | pub use super::sum::*; 51 | pub use super::saturating_add::*; 52 | pub use super::saturating_hadd::*; 53 | pub use super::saturating_hsub::*; 54 | pub use super::saturating_sub::*; 55 | pub use super::transmute::*; 56 | pub use super::upcast::*; 57 | } 58 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/popcnt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::sum::*; 11 | use crate::intrin::transmute::*; 12 | use crate::intrin::popcnt::*; 13 | use crate::arch::current::intrin::upcast::*; 14 | use crate::intrin::sum::UpcastSum; 15 | use crate::arch::current::vecs::*; 16 | use crate::intrin::upcast::*; 17 | use crate::vecs::*; 18 | 19 | #[inline(always)] 20 | #[cfg(target_feature = "ssse3")] 21 | unsafe fn popcnt128(v: u8x16) -> usize { 22 | // SSSE3 popcnt algorithm by Wojciech Muła 23 | // http://wm.ite.pl/articles/sse-popcount.html 24 | optimized!(); 25 | let lookup = i8x16::new(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); 26 | let lo = v.be_u8s() & 0x0f; 27 | let hi: u8x16 = v.be_u8s() >> 4; 28 | (_mm_shuffle_epi8(lookup, hi.be_i8s()).be_u8s() 29 | + _mm_shuffle_epi8(lookup, lo.be_i8s()).be_u8s()) 30 | .sum_upcast() as usize 31 | } 32 | 33 | #[inline(always)] 34 | #[cfg(not(target_feature = "ssse3"))] 35 | #[allow(unused_unsafe)] 36 | unsafe fn popcnt128(v: u8x16) -> usize { 37 | fallback!(); 38 | v.be_u64s().
scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) 39 | } 40 | 41 | #[inline(always)] 42 | #[cfg(target_feature = "avx2")] 43 | unsafe fn popcnt256(v: u8x32) -> usize { 44 | // AVX2 popcnt algorithm by Wojciech Muła, Nathan Kurz, and Daniel Lemire 45 | // https://arxiv.org/abs/1611.07612 46 | optimized!(); 47 | let lookup = i8x32::new(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 48 | 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); 49 | let lo = v.be_u8s() & 0x0f; 50 | let hi: u8x32 = v.be_u8s() >> 4; 51 | (_mm256_shuffle_epi8(lookup, hi.be_i8s()).be_u8s() 52 | + _mm256_shuffle_epi8(lookup, lo.be_i8s()).be_u8s()) 53 | .sum_upcast() as usize 54 | } 55 | 56 | #[inline(always)] 57 | #[cfg(not(target_feature = "avx2"))] 58 | #[allow(unused_unsafe)] 59 | unsafe fn popcnt256(v: u8x32) -> usize { 60 | fallback!(); 61 | v.be_u64s().scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) 62 | } 63 | 64 | #[inline(always)] 65 | // #[cfg(not(target_feature = "avx512"))] 66 | unsafe fn popcnt512(v: u8x64) -> usize { 67 | fallback!(); 68 | v.be_u64s().scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) 69 | } 70 | 71 | impl_popcnt!(u8x64, popcnt512, u8x32, popcnt256, u8x16, popcnt128); 72 | impl_popcnt!(i8x64, popcnt512, i8x32, popcnt256, i8x16, popcnt128); 73 | impl_popcnt!(u16x32, popcnt512, u16x16, popcnt256, u16x8, popcnt128); 74 | impl_popcnt!(i16x32, popcnt512, i16x16, popcnt256, i16x8, popcnt128); 75 | impl_popcnt!(u32x16, popcnt512, u32x8, popcnt256, u32x4, popcnt128); 76 | impl_popcnt!(i32x16, popcnt512, i32x8, popcnt256, i32x4, popcnt128); 77 | impl_popcnt!(u64x8, popcnt512, u64x4, popcnt256, u64x2, popcnt128); 78 | impl_popcnt!(i64x8, popcnt512, i64x4, popcnt256, i64x2, popcnt128); 79 | 80 | #[cfg(test)] 81 | mod tests { 82 | use crate::prelude::*; 83 | use crate::arch::current::vecs::*; 84 | 85 | test_popcnt!((u8, u8, u8, i8, i8, i8, u16, u16, u16, i16, i16, i16, u32, u32, u32, i32, i32, i32, u64, u64, u64, i64, i64, i64), 86 | (u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2), 87 | (popcnt_u8x64, popcnt_u8x32, popcnt_u8x16, popcnt_i8x64, popcnt_i8x32, popcnt_i8x16, popcnt_u16x32, popcnt_u16x16, popcnt_u16x8, popcnt_i16x32, popcnt_i16x16, popcnt_i16x8, popcnt_u32x16, popcnt_u32x8, popcnt_u32x4, popcnt_i32x16, popcnt_i32x8, popcnt_i32x4, popcnt_u64x8, popcnt_u64x4, popcnt_u64x2, popcnt_i64x8, popcnt_i64x4, popcnt_i64x2)); 88 | } 89 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/recip.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::recip::Recip; 13 | 14 | rust_fallback_impl! { 15 | impl Recip for f32x8 where "avx" { 16 | recip => _mm256_rcp_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 17 | } 18 | } 19 | 20 | rust_fallback_impl! 
{ 21 | impl Recip for f32x4 where "sse" { 22 | recip => _mm_rcp_ps(), [0, 1, 2, 3]; 23 | } 24 | } 25 | 26 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/round.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::round::Round; 11 | use crate::core::arch::x86_64::{_MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TRUNC}; 12 | use crate::arch::current::vecs::*; 13 | use crate::vecs::*; 14 | 15 | rust_fallback_impl! { 16 | impl Round for f32x4 where "sse4.1" { 17 | round => _mm_round_ps(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3]; 18 | ceil => _mm_ceil_ps(), [0, 1, 2, 3]; 19 | floor => _mm_floor_ps(), [0, 1, 2, 3]; 20 | trunc => _mm_round_ps(_MM_FROUND_TRUNC), [0, 1, 2, 3]; 21 | } 22 | } 23 | 24 | rust_fallback_impl! { 25 | impl Round for f64x2 where "sse4.1" { 26 | round => _mm_round_pd(_MM_FROUND_TO_NEAREST_INT), [0, 1]; 27 | ceil => _mm_ceil_pd(), [0, 1]; 28 | floor => _mm_floor_pd(), [0, 1]; 29 | trunc => _mm_round_pd(_MM_FROUND_TRUNC), [0, 1]; 30 | } 31 | } 32 | 33 | rust_fallback_impl! { 34 | impl Round for f32x8 where "avx" { 35 | round => _mm256_round_ps(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3, 4, 5, 6, 7]; 36 | ceil => _mm256_ceil_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 37 | floor => _mm256_floor_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 38 | trunc => _mm256_round_ps(_MM_FROUND_TRUNC), [0, 1, 2, 3, 4, 5, 6, 7]; 39 | } 40 | } 41 | 42 | rust_fallback_impl! { 43 | impl Round for f64x4 where "avx" { 44 | round => _mm256_round_pd(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3]; 45 | ceil => _mm256_ceil_pd(), [0, 1, 2, 3]; 46 | floor => _mm256_floor_pd(), [0, 1, 2, 3]; 47 | trunc => _mm256_round_pd(_MM_FROUND_TRUNC), [0, 1, 2, 3]; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/rsqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::rsqrt::*; 11 | use crate::arch::current::vecs::*; 12 | use crate::vecs::*; 13 | 14 | // TODO: Guards and non-simd 15 | 16 | rust_fallback_impl! { 17 | impl Rsqrt for f32x8 where "avx" { 18 | rsqrt => _mm256_rsqrt_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 19 | } 20 | } 21 | 22 | rust_fallback_impl!
{ 23 | impl Rsqrt for f32x4 where "sse" { 24 | rsqrt => _mm_rsqrt_ps(), [0, 1, 2, 3]; 25 | } 26 | } 27 | 28 | impl Rsqrt for f32 { 29 | #[inline(always)] 30 | fn rsqrt(&self) -> Self { 31 | self.sqrt().recip() 32 | } 33 | } 34 | 35 | impl Rsqrt for f64 { 36 | #[inline(always)] 37 | fn rsqrt(&self) -> Self { 38 | self.sqrt().recip() 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/saturating_add.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::saturating_add::*; 13 | 14 | rust_fallback_impl_binary! { 15 | impl SaturatingAdd for u8x16 where "sse2" { 16 | saturating_add => _mm_adds_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 17 | } 18 | } 19 | 20 | rust_fallback_impl_binary! { 21 | impl SaturatingAdd for i8x16 where "sse2" { 22 | saturating_add => _mm_adds_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 23 | } 24 | } 25 | 26 | rust_fallback_impl_binary! { 27 | impl SaturatingAdd for u16x8 where "sse2" { 28 | saturating_add => _mm_adds_epu16(), [0, 1, 2, 3, 4, 5, 6, 7]; 29 | } 30 | } 31 | 32 | rust_fallback_impl_binary! { 33 | impl SaturatingAdd for i16x8 where "sse2" { 34 | saturating_add => _mm_adds_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 35 | } 36 | } 37 | 38 | rust_fallback_impl_binary! { 39 | impl SaturatingAdd for u8x32 where "avx2" { 40 | saturating_add => _mm256_adds_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 41 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 42 | 43 | } 44 | } 45 | 46 | rust_fallback_impl_binary! { 47 | impl SaturatingAdd for i8x32 where "avx2" { 48 | saturating_add => _mm256_adds_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 49 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 50 | } 51 | } 52 | 53 | rust_fallback_impl_binary! { 54 | impl SaturatingAdd for u16x16 where "avx2" { 55 | saturating_add => _mm256_adds_epu16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 56 | } 57 | } 58 | 59 | rust_fallback_impl_binary! { 60 | impl SaturatingAdd for i16x16 where "avx2" { 61 | saturating_add => _mm256_adds_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/saturating_hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
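// (Editor's sketch, not part of the original source.) `SaturatingHAdd` uses
// the same interleaved pair-sum layout as `HAdd`, but each pair sum clamps to
// the lane type's bounds instead of wrapping. Assuming the crate's `i16x8`,
// the `SaturatingHAdd` trait, and `extract` from the underlying packed_simd
// vector type:
//
//     let a = i16x8::new(30_000, 10_000, 0, 0, 0, 0, 0, 0);
//     let b = i16x8::splat(0);
//     // 30_000 + 10_000 saturates to i16::MAX (32_767) instead of wrapping
//     assert_eq!(a.saturating_hadd(b).extract(0), 32_767);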
7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::transmute::*; 13 | use crate::intrin::saturating_hadd::*; 14 | 15 | #[cfg(target_feature = "ssse3")] 16 | impl SaturatingHAdd for i16x8 { 17 | #[inline(always)] 18 | fn saturating_hadd(&self, other: Self) -> Self { 19 | optimized!(); 20 | unsafe { _mm_hadds_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 21 | _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 22 | } 23 | } 24 | 25 | #[cfg(target_feature = "avx2")] 26 | impl SaturatingHAdd for i16x16 { 27 | #[inline(always)] 28 | fn saturating_hadd(&self, other: Self) -> Self { 29 | optimized!(); 30 | unsafe { _mm256_hadds_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 31 | _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 32 | } 33 | } 34 | 35 | impl SaturatingHAdd for u64x2 { hop!(saturating_hadd, u64::saturating_add, 0, 1); } 36 | impl SaturatingHAdd for u64x4 { hop!(saturating_hadd, u64::saturating_add, 0, 1, 2, 3); } 37 | impl SaturatingHAdd for u64x8 { hop!(saturating_hadd, u64::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 38 | impl SaturatingHAdd for u32x4 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3); } 39 | impl SaturatingHAdd for u32x8 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 40 | impl SaturatingHAdd for u32x16 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 41 | impl SaturatingHAdd for u16x8 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 42 | impl SaturatingHAdd for u16x16 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 43 | impl SaturatingHAdd for u16x32 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 44 | impl SaturatingHAdd for u8x16 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 45 | impl SaturatingHAdd for u8x32 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 46 | impl SaturatingHAdd for u8x64 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 47 | impl SaturatingHAdd for i64x2 { hop!(saturating_hadd, i64::saturating_add, 0, 1); } 48 | impl SaturatingHAdd for i64x4 { hop!(saturating_hadd, i64::saturating_add, 0, 1, 2, 3); } 49 | impl SaturatingHAdd for i64x8 { hop!(saturating_hadd, i64::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 50 | impl SaturatingHAdd for i32x4 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3); } 51 | impl SaturatingHAdd for i32x8 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 52 | impl SaturatingHAdd for i32x16 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 53 | #[cfg(not(target_feature = "ssse3"))] 54 | impl SaturatingHAdd for i16x8 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 55 | #[cfg(not(target_feature = "avx2"))] 56 | impl SaturatingHAdd for i16x16 { 
hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 57 | impl SaturatingHAdd for i16x32 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 58 | impl SaturatingHAdd for i8x16 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 59 | impl SaturatingHAdd for i8x32 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 60 | impl SaturatingHAdd for i8x64 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 61 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/saturating_hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::transmute::*; 13 | use crate::intrin::saturating_hsub::*; 14 | 15 | #[cfg(target_feature = "ssse3")] 16 | impl SaturatingHSub for i16x8 { 17 | #[inline(always)] 18 | fn saturating_hsub(&self, other: Self) -> Self { 19 | optimized!(); 20 | unsafe { _mm_hsubs_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 21 | _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 22 | } 23 | } 24 | 25 | #[cfg(target_feature = "avx2")] 26 | impl SaturatingHSub for i16x16 { 27 | #[inline(always)] 28 | fn saturating_hsub(&self, other: Self) -> Self { 29 | optimized!(); 30 | unsafe { _mm256_hsubs_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 31 | _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 32 | } 33 | } 34 | 35 | impl SaturatingHSub for u64x2 { hop!(saturating_hsub, u64::saturating_sub, 0, 1); } 36 | impl SaturatingHSub for u64x4 { hop!(saturating_hsub, u64::saturating_sub, 0, 1, 2, 3); } 37 | impl SaturatingHSub for u64x8 { hop!(saturating_hsub, u64::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 38 | impl SaturatingHSub for u32x4 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3); } 39 | impl SaturatingHSub for u32x8 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 40 | impl SaturatingHSub for u32x16 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 41 | impl SaturatingHSub for u16x8 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 42 | impl SaturatingHSub for u16x16 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 43 | impl SaturatingHSub for u16x32 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 44 | impl SaturatingHSub for u8x16 { 
hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 45 | impl SaturatingHSub for u8x32 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 46 | impl SaturatingHSub for u8x64 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 47 | impl SaturatingHSub for i64x2 { hop!(saturating_hsub, i64::saturating_sub, 0, 1); } 48 | impl SaturatingHSub for i64x4 { hop!(saturating_hsub, i64::saturating_sub, 0, 1, 2, 3); } 49 | impl SaturatingHSub for i64x8 { hop!(saturating_hsub, i64::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 50 | impl SaturatingHSub for i32x4 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3); } 51 | impl SaturatingHSub for i32x8 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 52 | impl SaturatingHSub for i32x16 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 53 | #[cfg(not(target_feature = "ssse3"))] 54 | impl SaturatingHSub for i16x8 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 55 | #[cfg(not(target_feature = "avx2"))] 56 | impl SaturatingHSub for i16x16 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 57 | impl SaturatingHSub for i16x32 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 58 | impl SaturatingHSub for i8x16 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 59 | impl SaturatingHSub for i8x32 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 60 | impl SaturatingHSub for i8x64 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 61 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/saturating_sub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::saturating_sub::*; 11 | use crate::arch::current::vecs::*; 12 | use crate::vecs::*; 13 | 14 | rust_fallback_impl_binary! { 15 | impl SaturatingSub for u8x16 where "sse2" { 16 | saturating_sub => _mm_subs_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 17 | } 18 | } 19 | 20 | rust_fallback_impl_binary! 
{ 21 | impl SaturatingSub for i8x16 where "sse2" { 22 | saturating_sub => _mm_subs_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 23 | } 24 | } 25 | 26 | rust_fallback_impl_binary! { 27 | impl SaturatingSub for u16x8 where "sse2" { 28 | saturating_sub => _mm_subs_epu16(), [0, 1, 2, 3, 4, 5, 6, 7]; 29 | } 30 | } 31 | 32 | rust_fallback_impl_binary! { 33 | impl SaturatingSub for i16x8 where "sse2" { 34 | saturating_sub => _mm_subs_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 35 | } 36 | } 37 | 38 | rust_fallback_impl_binary! { 39 | impl SaturatingSub for u8x32 where "avx2" { 40 | saturating_sub => _mm256_subs_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 41 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 42 | } 43 | } 44 | 45 | rust_fallback_impl_binary! { 46 | impl SaturatingSub for i8x32 where "avx2" { 47 | saturating_sub => _mm256_subs_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 48 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 49 | } 50 | } 51 | 52 | rust_fallback_impl_binary! { 53 | impl SaturatingSub for u16x16 where "avx2" { 54 | saturating_sub => _mm256_subs_epu16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 55 | } 56 | } 57 | 58 | rust_fallback_impl_binary! { 59 | impl SaturatingSub for i16x16 where "avx2" { 60 | saturating_sub => _mm256_subs_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/sqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::sqrt::*; 11 | use crate::arch::current::vecs::*; 12 | use crate::vecs::*; 13 | 14 | rust_fallback_impl! { 15 | impl Sqrt for f32x8 where "avx" { 16 | sqrt => _mm256_sqrt_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 17 | } 18 | } 19 | 20 | rust_fallback_impl! { 21 | impl Sqrt for f64x4 where "avx" { 22 | sqrt => _mm256_sqrt_pd(), [0, 1, 2, 3]; 23 | } 24 | } 25 | 26 | rust_fallback_impl! { 27 | impl Sqrt for f32x4 where "sse" { 28 | sqrt => _mm_sqrt_ps(), [0, 1, 2, 3]; 29 | } 30 | } 31 | 32 | rust_fallback_impl! { 33 | impl Sqrt for f64x2 where "sse2" { 34 | sqrt => _mm_sqrt_pd(), [0, 1]; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/transmute.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::transmute::*; 11 | use crate::arch::current::vecs::*; 12 | use crate::vecs::*; 13 | use crate::core::mem::transmute; 14 | 15 | impl_packed_transmute!(u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, f32x8, 16 | u64x4, i64x4, f64x4, ...
17 | u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, 18 | f32x8, u64x4, i64x4, f64x4, 19 | "avx", "avx512"); 20 | impl_packed_transmute!(u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, f32x16, 21 | u64x8, i64x8, f64x8, ... 22 | u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, 23 | f32x16, u64x8, i64x8, f64x8, 24 | "avx512", "avx1024"); 25 | impl_packed_transmute!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, f32x4, 26 | u64x2, i64x2, f64x2, ... 27 | u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, 28 | f32x4, u64x2, i64x2, f64x2, 29 | "sse", "avx"); 30 | -------------------------------------------------------------------------------- /src/arch/x86/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod intrin; 2 | pub mod vecs; 3 | pub mod vec_patterns; 4 | -------------------------------------------------------------------------------- /src/arch/x86/vecs.rs: -------------------------------------------------------------------------------- 1 | pub use crate::vecs::*; 2 | pub use packed_simd::{u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2}; 3 | 4 | impl_packed!(u8, u8s, u8x64, 1, 64, ["avx512"], ["avx1024"]); 5 | impl_packed!(u8, u8s, u8x32, 1, 32, ["avx2"], ["avx512"]); 6 | impl_packed!(u8, u8s, u8x16, 1, 16, [], ["avx2"]); 7 | impl_packed!(i8, i8s, i8x64, 1, 64, ["avx512"], ["avx1024"]); 8 | impl_packed!(i8, i8s, i8x32, 1, 32, ["avx2"], ["avx512"]); 9 | impl_packed!(i8, i8s, i8x16, 1, 16, [], ["avx2"]); 10 | impl_packed!(u16, u16s, u16x32, 2, 32, ["avx512"], ["avx1024"]); 11 | impl_packed!(u16, u16s, u16x16, 2, 16, ["avx2"], ["avx512"]); 12 | impl_packed!(u16, u16s, u16x8, 2, 8, [], ["avx2"]); 13 | impl_packed!(i16, i16s, i16x32, 2, 32, ["avx512"], ["avx1024"]); 14 | impl_packed!(i16, i16s, i16x16, 2, 16, ["avx2"], ["avx512"]); 15 | impl_packed!(i16, i16s, i16x8, 2, 8, [], ["avx2"]); 16 | impl_packed!(u32, u32s, u32x16, 4, 16, ["avx512"], ["avx1024"]); 17 | impl_packed!(u32, u32s, u32x8, 4, 8, ["avx2"], ["avx512"]); 18 | impl_packed!(u32, u32s, u32x4, 4, 4, [], ["avx2"]); 19 | impl_packed!(i32, i32s, i32x16, 4, 16, ["avx512"], ["avx1024"]); 20 | impl_packed!(i32, i32s, i32x8, 4, 8, ["avx2"], ["avx512"]); 21 | impl_packed!(i32, i32s, i32x4, 4, 4, [], ["avx2"]); 22 | impl_packed!(f32, f32s, f32x16, 4, 16, ["avx512"], ["avx1024"]); 23 | impl_packed!(f32, f32s, f32x8, 4, 8, ["avx2"], ["avx512"]); 24 | impl_packed!(f32, f32s, f32x4, 4, 4, [], ["avx2"]); 25 | impl_packed!(u64, u64s, u64x8, 8, 8, ["avx512"], ["avx1024"]); 26 | impl_packed!(u64, u64s, u64x4, 8, 4, ["avx2"], ["avx512"]); 27 | impl_packed!(u64, u64s, u64x2, 8, 2, [], ["avx2"]); 28 | impl_packed!(i64, i64s, i64x8, 8, 8, ["avx512"], ["avx1024"]); 29 | impl_packed!(i64, i64s, i64x4, 8, 4, ["avx2"], ["avx512"]); 30 | impl_packed!(i64, i64s, i64x2, 8, 2, [], ["avx2"]); 31 | impl_packed!(f64, f64s, f64x8, 8, 8, ["avx512"], ["avx1024"]); 32 | impl_packed!(f64, f64s, f64x4, 8, 4, ["avx2"], ["avx512"]); 33 | impl_packed!(f64, f64s, f64x2, 8, 2, [], ["avx2"]); 34 | 35 | #[cfg(test)] 36 | mod tests { 37 | use super::Packed; 38 | use super::*; 39 | 40 | macro_rules! test_product { 41 | (($($el:tt),*), ($($vec:tt),*), ($($fn:tt),*), ($($sum:tt),*)) => ( 42 | $( 43 | #[test] 44 | fn $fn() { 45 | assert_eq!($vec::splat(1i8 as $el).product(), $sum as $el); 46 | } 47 | )* 48 | ) 49 | } 50 | 51 | // TODO: Do we need better test cases for this? 
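// One concrete answer to the TODO above, sketched as a suggestion rather than
// as part of the original suite: a product over non-uniform lanes catches
// lane-ordering and accumulation bugs that a splat of 1 cannot. It assumes
// only `splat`/`replace` from packed_simd and the `product` reduction already
// exercised by `test_product!` below.
#[test]
fn scalar_product_nonuniform_f32x4() {
    // Lanes are [1.0, 2.0, 3.0, 4.0], so the product must be 24.0.
    let v = f32x4::splat(1.0)
        .replace(1, 2.0)
        .replace(2, 3.0)
        .replace(3, 4.0);
    assert_eq!(v.product(), 24.0);
}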
52 | test_product!((u8, u8, u8, i8, i8, i8, u16, u16, u16, i16, i16, i16, u32, u32, u32, i32, i32, i32, f32, f32, f32, u64, u64, u64, i64, i64, i64, f64, f64, f64), 53 | (u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2), 54 | (scalar_product_u8x64, scalar_product_u8x32, scalar_product_u8x16, scalar_product_i8x64, scalar_product_i8x32, scalar_product_i8x16, scalar_product_u16x32, scalar_product_u16x16, scalar_product_u16x8, scalar_product_i16x32, scalar_product_i16x16, scalar_product_i16x8, scalar_product_u32x16, scalar_product_u32x8, scalar_product_u32x4, scalar_product_i32x16, scalar_product_i32x8, scalar_product_i32x4, scalar_product_f32x16, scalar_product_f32x8, scalar_product_f32x4, scalar_product_u64x8, scalar_product_u64x4, scalar_product_u64x2, scalar_product_i64x8, scalar_product_i64x4, scalar_product_i64x2, scalar_product_f64x8, scalar_product_f64x4, scalar_product_f64x2), 55 | (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)); 56 | } 57 | -------------------------------------------------------------------------------- /src/debug.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_macros, dead_code)] 2 | 3 | use std::collections::HashSet; 4 | use std::cell::RefCell; 5 | 6 | thread_local! { 7 | // Not perfect as it might print multiple times (once per thread), 8 | // but better than a `global` hack to prevent multiple prints of the 9 | // same warning. 10 | pub(crate) static OUTPUT_GUARD: RefCell<HashSet<String>> = RefCell::new(HashSet::new()); 11 | } 12 | 13 | 14 | macro_rules! debug_append_log { 15 | ($str:expr) => { 16 | use std::io::Write; 17 | 18 | // Allows the user to configure the debug file path at compile time, 19 | // e.g., when building for embedded / Android. 20 | let file_name = option_env!("FASTER_DEBUG_FILE").unwrap_or("faster-debug.txt"); 21 | 22 | std::fs::OpenOptions::new() 23 | .write(true) 24 | .create(true) 25 | .append(true) 26 | .open(file_name).and_then(|mut file| { 27 | writeln!(file, "{}", $str) 28 | }).ok(); // `ok` suppresses the warning about unused results, about which we don't care. 29 | } 30 | } 31 | 32 | 33 | /// Prints the given string once (for the current thread). 34 | /// Useful for not spamming the console. 35 | macro_rules! debug_output_once { 36 | ($str:expr) => { 37 | let output = $str; 38 | 39 | crate::debug::OUTPUT_GUARD.with(|f| { 40 | let mut output_guard = f.borrow_mut(); 41 | 42 | if output_guard.contains(&output) { 43 | return; 44 | } 45 | 46 | // Also print to file (if enabled). 47 | debug_append_log!(output); 48 | println!("{}", output); 49 | 50 | output_guard.insert(output); 51 | }); 52 | } 53 | } 54 | 55 | 56 | /// Signal that a software fallback is executed. 57 | #[cfg(feature="trace")] 58 | macro_rules! fallback { 59 | () => { 60 | debug_output_once!(format!("⛔ faster is using SOFTWARE emulation here ({}:{}).", file!(), line!())); 61 | } 62 | } 63 | 64 | /// Signal that an optimized SIMD intrinsic is executed. 65 | #[cfg(feature="trace")] 66 | macro_rules! optimized { 67 | () => { 68 | debug_output_once!(format!("🚄 faster is using HARDWARE acceleration here ({}:{}).", file!(), line!())); 69 | } 70 | } 71 | 72 | #[cfg(not(feature="trace"))] 73 | macro_rules! fallback { 74 | () => { } 75 | } 76 | 77 | #[cfg(not(feature="trace"))] 78 | macro_rules!
optimized { 79 | () => { } 80 | } 81 | 82 | -------------------------------------------------------------------------------- /src/into_iters.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::iters::{SIMDIter, SIMDIterator, SIMDObject}; 9 | #[allow(unused_imports)] // Remove for specialization 10 | use crate::iters::SIMDAdapter; 11 | use crate::arch::current::vecs::*; 12 | 13 | /// A trait which transforms a contiguous collection into an owned stream of 14 | /// vectors. 15 | pub trait IntoSIMDIterator { 16 | type Iter : SIMDIterator; 17 | 18 | /// Return an iterator over this data which will automatically pack 19 | /// values into SIMD vectors. See `SIMDIterator::simd_map` and 20 | /// `SIMDIterator::simd_reduce` for more information. 21 | fn into_simd_iter(self, default: <Self::Iter as SIMDObject>::Vector) -> Self::Iter; 22 | } 23 | 24 | /// A trait which transforms a contiguous collection into a slice-backed stream 25 | /// of vectors. 26 | pub trait IntoSIMDRefIterator<'a> { 27 | type Iter : SIMDIterator; 28 | 29 | /// Return an iterator over this data which will automatically pack 30 | /// values into SIMD vectors. See `SIMDIterator::simd_map` and 31 | /// `SIMDIterator::simd_reduce` for more information. 32 | fn simd_iter(&'a self, default: <Self::Iter as SIMDObject>::Vector) -> Self::Iter; 33 | } 34 | 35 | /// A trait which transforms a contiguous collection into a mutable slice-backed 36 | /// stream of vectors. 37 | pub trait IntoSIMDRefMutIterator<'a> { 38 | type Iter : SIMDIterator; 39 | 40 | /// Return an iterator over this data which will automatically pack 41 | /// values into SIMD vectors. See `SIMDIterator::simd_map` and 42 | /// `SIMDIterator::simd_reduce` for more information. 43 | fn simd_iter_mut(&'a mut self, default: <Self::Iter as SIMDObject>::Vector) -> Self::Iter; 44 | } 45 | 46 | macro_rules!
impl_array_intos { 47 | ($($el:ty, $vec:ty),*) => { 48 | $( 49 | #[cfg(feature = "std")] 50 | impl IntoSIMDIterator for Vec<$el> { 51 | type Iter = SIMDIter<Self>; 52 | 53 | #[inline(always)] 54 | fn into_simd_iter(self, default: $vec) -> Self::Iter { 55 | SIMDIter { 56 | data: self, 57 | position: 0, 58 | default: default, 59 | } 60 | } 61 | } 62 | 63 | impl<'a> IntoSIMDRefIterator<'a> for &'a [$el] { 64 | type Iter = SIMDIter<Self>; 65 | 66 | #[inline(always)] 67 | fn simd_iter(&'a self, default: $vec) -> Self::Iter { 68 | SIMDIter { 69 | data: self, 70 | position: 0, 71 | default: default, 72 | } 73 | } 74 | } 75 | 76 | impl<'a> IntoSIMDRefMutIterator<'a> for &'a mut [$el] { 77 | type Iter = SIMDIter<Self>; 78 | 79 | #[inline(always)] 80 | fn simd_iter_mut(&'a mut self, default: $vec) -> Self::Iter { 81 | SIMDIter { 82 | data: self, 83 | position: 0, 84 | default: default, 85 | } 86 | } 87 | } 88 | 89 | impl<'a> IntoSIMDRefMutIterator<'a> for [$el] { 90 | type Iter = SIMDIter<&'a mut Self>; 91 | 92 | #[inline(always)] 93 | fn simd_iter_mut(&'a mut self, default: $vec) -> Self::Iter { 94 | SIMDIter { 95 | data: self, 96 | position: 0, 97 | default: default, 98 | } 99 | } 100 | } 101 | 102 | impl<'a> IntoSIMDRefIterator<'a> for [$el] { 103 | type Iter = SIMDIter<&'a Self>; 104 | 105 | #[inline(always)] 106 | fn simd_iter(&'a self, default: $vec) -> Self::Iter { 107 | SIMDIter { 108 | data: self, 109 | position: 0, 110 | default: default, 111 | } 112 | } 113 | } 114 | )* 115 | } 116 | } 117 | 118 | impl_array_intos!(u8, u8s, 119 | i8, i8s, 120 | u16, u16s, 121 | i16, i16s, 122 | u32, u32s, 123 | i32, i32s, 124 | f32, f32s, 125 | u64, u64s, 126 | i64, i64s, 127 | f64, f64s); 128 | 129 | // TODO: Specialization 130 | // impl<I, S> IntoSIMDIterator for I where I : ExactSizeIterator + Iterator<Item = S>, S : Packable { 131 | // type Iter = SIMDAdapter<I, S>; 132 | 133 | // #[inline(always)] 134 | // fn into_simd_iter(self, default: S::Vector) -> Self::Iter { 135 | // SIMDAdapter { 136 | // iter: self, 137 | // position: 0, 138 | // default: default, 139 | // scratch: default 140 | // } 141 | // } 142 | // } 143 | -------------------------------------------------------------------------------- /src/intrin/abs.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Abs { 9 | type Out; 10 | /// Return a vector containing the absolute values of the elements of `self`.
11 | /// 12 | /// # Examples 13 | /// 14 | /// ``` 15 | /// extern crate faster; 16 | /// use faster::*; 17 | /// 18 | /// # fn main() { 19 | /// assert_eq!(i32s(-2).abs(), u32s(2)); 20 | /// assert_eq!(i8s(-128).abs(), u8s(128)); 21 | /// # } 22 | /// ``` 23 | fn abs(&self) -> Self::Out; 24 | } 25 | 26 | #[cfg(test)] 27 | mod tests { 28 | use crate::prelude::*; 29 | 30 | #[test] 31 | fn abs_i8s() { 32 | for i in -128..=127 { 33 | assert_eq!(i8s(i).abs().extract(0), (i as i64).abs() as u8); 34 | } 35 | } 36 | 37 | #[test] 38 | fn abs_i16s() { 39 | for i in -32768..=32767 { 40 | assert_eq!(i16s(i).abs().extract(0), (i as i64).abs() as u16); 41 | } 42 | } 43 | 44 | #[test] 45 | fn abs_i32s() { 46 | for i in -65536..65536 { 47 | assert_eq!(i32s(i).abs().extract(0), (i as i64).abs() as u32); 48 | } 49 | } 50 | 51 | #[test] 52 | fn abs_i64s() { 53 | for i in -65536..65536 { 54 | assert_eq!(i64s(i).abs().extract(0), (i as i64).abs() as u64); 55 | } 56 | } 57 | 58 | #[test] 59 | fn abs_f32s() { 60 | let mut i = -1024.0; 61 | while i < 1024.0 { 62 | // This test has some pretty significant float error if done on x86 63 | assert_eq!(f32s(i).abs().extract(0), i.abs()); 64 | i += 1.0 65 | } 66 | } 67 | 68 | #[test] 69 | fn abs_f64s() { 70 | let mut i = -1024.0; 71 | while i < 1024.0 { 72 | // This test has some pretty significant float error if done on x86 73 | assert_eq!(f64s(i).abs().extract(0), i.abs()); 74 | i += 1.0 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/intrin/addsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait AddSub { 9 | fn addsub(&self, other: Self) -> Self; 10 | } 11 | -------------------------------------------------------------------------------- /src/intrin/cast.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Asi8s { 9 | type Cast; 10 | 11 | /// Return a vector containing all elements of `self` cast to i8s. 12 | fn as_i8s(self) -> Self::Cast; 13 | } 14 | 15 | pub trait Asu8s { 16 | type Cast; 17 | 18 | /// Return a vector containing all elements of `self` cast to u8s. 19 | fn as_u8s(self) -> Self::Cast; 20 | } 21 | 22 | pub trait Asi16s { 23 | type Cast; 24 | 25 | /// Return a vector containing all elements of `self` cast to i16s. 26 | fn as_i16s(self) -> Self::Cast; 27 | } 28 | 29 | pub trait Asu16s { 30 | type Cast; 31 | 32 | /// Return a vector containing all elements of `self` cast to u16s. 33 | fn as_u16s(self) -> Self::Cast; 34 | } 35 | 36 | pub trait Asf32s { 37 | type Cast; 38 | 39 | /// Return a vector containing all elements of `self` cast to f32s. 40 | fn as_f32s(self) -> Self::Cast; 41 | } 42 | 43 | pub trait Asi32s { 44 | type Cast; 45 | 46 | /// Return a vector containing all elements of `self` cast to i32s.
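/// (A suggested doctest, hedged: it mirrors the doctest style used elsewhere
/// in this crate and assumes ordinary truncating `as`-cast semantics; it is
/// marked `ignore` because the concrete `Cast` type is architecture-defined.)
///
/// ```ignore
/// extern crate faster;
/// use faster::*;
///
/// # fn main() {
/// assert_eq!(f32s(2.7).as_i32s(), i32s(2)); // fractional part truncated
/// # }
/// ```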
47 | fn as_i32s(self) -> Self::Cast; 48 | } 49 | 50 | pub trait Asu32s { 51 | type Cast; 52 | 53 | /// Return a vector containing all elements of `self` cast to u32s. 54 | fn as_u32s(self) -> Self::Cast; 55 | } 56 | 57 | pub trait Asf64s { 58 | type Cast; 59 | 60 | /// Return a vector containing all elements of `self` cast to f64s. 61 | fn as_f64s(self) -> Self::Cast; 62 | } 63 | 64 | pub trait Asi64s { 65 | type Cast; 66 | 67 | /// Return a vector containing all elements of `self` cast to i64s. 68 | fn as_i64s(self) -> Self::Cast; 69 | } 70 | 71 | pub trait Asu64s { 72 | type Cast; 73 | 74 | /// Return a vector containing all elements of `self` cast to u64s. 75 | fn as_u64s(self) -> Self::Cast; 76 | } 77 | 78 | // macro_rules! impl_cast { 79 | // ($trait:path, $from:ty, $to:ty, $name:ident, $rsname:ident) => ( 80 | // impl $trait for $from { 81 | // type Cast = $to; 82 | 83 | // #[inline(always)] 84 | // fn $name(self) -> Self::Cast { 85 | // self.$rsname() 86 | // } 87 | // } 88 | // ); 89 | // } 90 | -------------------------------------------------------------------------------- /src/intrin/cmp.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Cmp { 9 | /// Return a vector where each element at an index i is the maximum of the 10 | /// elements at index i in `self` and `other`. 11 | /// 12 | /// ```ignore 13 | /// use faster::*; 14 | /// 15 | /// # fn main() { 16 | /// assert_eq!(i8s(0).max(i8s(2)), i8s(2)); 17 | /// assert_eq!(i8s::halfs(1, 0).max(i8s::halfs(2, -1)), i8s::halfs(2, 0)); 18 | /// # } 19 | /// ``` 20 | fn max(&self, other: Self) -> Self; 21 | 22 | /// Return a vector where each element at an index i is the minimum of the 23 | /// elements at index i in `self` and `other`. 24 | /// 25 | /// ```ignore 26 | /// use faster::*; 27 | /// 28 | /// # fn main() { 29 | /// assert_eq!(i8s(0).min(i8s(2)), i8s(0)); 30 | /// assert_eq!(i8s::halfs(1, 0).min(i8s::halfs(2, -1)), i8s::halfs(1, -1)); 31 | /// # } 32 | /// ``` 33 | fn min(&self, other: Self) -> Self; 34 | } 35 | -------------------------------------------------------------------------------- /src/intrin/destride.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Destride : Sized { 9 | fn destride_two(self, other: Self) -> (Self, Self); 10 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self); 11 | } 12 | 13 | // TODO: LLVM actually autovectorizes our polyfills, but we should still have an 14 | // explicit implementation for everything 15 | 16 | macro_rules! destride_two_polyfill { 17 | ($self:expr, $other:expr, $($n:expr),*) => { 18 | (Self::new($($self.extract($n)),*, 19 | $($other.extract($n)),*), 20 | Self::new($($self.extract($n + 1)),*, 21 | $($other.extract($n + 1)),*)) 22 | } 23 | } 24 | 25 | macro_rules! 
destride_four_polyfill { 26 | ($self:expr, $b:expr, $c:expr, $d:expr, $($n:expr),*) => { 27 | (Self::new($($self.extract($n)),*, 28 | $($b.extract($n)),*, 29 | $($c.extract($n)),*, 30 | $($d.extract($n)),*), 31 | Self::new($($self.extract($n + 1)),*, 32 | $($b.extract($n + 1)),*, 33 | $($c.extract($n + 1)),*, 34 | $($d.extract($n + 1)),*), 35 | Self::new($($self.extract($n + 2)),*, 36 | $($b.extract($n + 2)),*, 37 | $($c.extract($n + 2)),*, 38 | $($d.extract($n + 2)),*), 39 | Self::new($($self.extract($n + 3)),*, 40 | $($b.extract($n + 3)),*, 41 | $($c.extract($n + 3)),*, 42 | $($d.extract($n + 3)),*)) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/intrin/downcast.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Downcast<T> { 9 | /// Return a vector containing elements of the same value as `self` and 10 | /// `other`, but different type. The first half of the returned vector 11 | /// contains the downcast values of `self`, whereas the second half of the 12 | /// returned vector contains the downcast values of `other`. The returned 13 | /// vector is equal in size to `self` and `other`. If an element exceeds 14 | /// the maximum or minimum value of the downcast type, it is saturated. 15 | /// 16 | /// # Examples 17 | /// 18 | /// ``` 19 | /// extern crate faster; 20 | /// use faster::*; 21 | /// 22 | /// # fn main() { 23 | /// assert_eq!(i32s(2).saturating_downcast(i32s(3)), i16s::halfs(2, 3)); 24 | /// assert_eq!(i16s(128).saturating_downcast(i16s(-129)), i8s::halfs(127, -128)); 25 | /// # } 26 | /// ``` 27 | fn saturating_downcast(self, other: Self) -> T; 28 | } 29 | -------------------------------------------------------------------------------- /src/intrin/endian.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Reendianize : Sized + Copy { 9 | /// Return a vector containing elements of `self` with switched endianness.
10 | /// 11 | /// ``` 12 | /// extern crate faster; 13 | /// use faster::*; 14 | /// 15 | /// # fn main() { 16 | /// assert_eq!(u32s(0xDEADBEEF).swap_bytes(), u32s(0xEFBEADDE)); 17 | /// # } 18 | /// ``` 19 | fn swap_bytes(&self) -> Self; 20 | 21 | #[cfg(target_endian = "big")] 22 | #[inline(always)] 23 | fn to_be(&self) -> Self { 24 | *self 25 | } 26 | 27 | #[cfg(target_endian = "little")] 28 | #[inline(always)] 29 | fn to_be(&self) -> Self { 30 | self.swap_bytes() 31 | } 32 | 33 | #[cfg(target_endian = "big")] 34 | #[inline(always)] 35 | fn to_le(&self) -> Self { 36 | self.swap_bytes() 37 | } 38 | 39 | #[cfg(target_endian = "little")] 40 | #[inline(always)] 41 | fn to_le(&self) -> Self { 42 | *self 43 | } 44 | 45 | #[cfg(target_endian = "big")] 46 | #[inline(always)] 47 | fn from_be(&self) -> Self { 48 | *self 49 | } 50 | 51 | #[cfg(target_endian = "little")] 52 | #[inline(always)] 53 | fn from_be(&self) -> Self { 54 | self.swap_bytes() 55 | } 56 | 57 | #[cfg(target_endian = "big")] 58 | #[inline(always)] 59 | fn from_le(&self) -> Self { 60 | self.swap_bytes() 61 | } 62 | 63 | #[cfg(target_endian = "little")] 64 | #[inline(always)] 65 | fn from_le(&self) -> Self { 66 | *self 67 | } 68 | } 69 | 70 | macro_rules! impl_packed_swap_bytes { 71 | ($vec:tt, $uvec:tt, $feat:expr, $mmfn:tt, ($($c:expr),*), ($($a:expr, $b:expr),*)) => { 72 | impl Reendianize for $vec { 73 | #[cfg(not(target_feature = $feat))] 74 | #[inline(always)] 75 | fn swap_bytes(&self) -> Self { 76 | fallback!(); 77 | $vec::new($(self.extract($a).swap_bytes(), 78 | self.extract($b).swap_bytes()),*) 79 | } 80 | 81 | #[cfg(target_feature = $feat)] 82 | #[inline(always)] 83 | fn swap_bytes(&self) -> Self { 84 | optimized!(); 85 | unsafe { 86 | transmute($mmfn(self.be_i8s(), $uvec::new($($c),*).be_i8s())) 87 | } 88 | } 89 | } 90 | } 91 | } 92 | 93 | macro_rules! test_packed_swap_bytes { 94 | (($($vec:tt),*), ($($fn:tt),*)) => { 95 | $( 96 | #[test] 97 | fn $fn() { 98 | let a = $vec::interleave(33u8 as <$vec as Packed>::Scalar, 99 | 92u8 as <$vec as Packed>::Scalar); 100 | let b = $vec::interleave((33u8 as <$vec as Packed>::Scalar).swap_bytes(), 101 | (92u8 as <$vec as Packed>::Scalar).swap_bytes()); 102 | assert_eq!(a.swap_bytes(), b); 103 | } 104 | )* 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/intrin/eq.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::core::ops::BitXor; 9 | use crate::vecs::*; 10 | 11 | pub trait Eq : Packed { 12 | type Out : Pattern + BitXor<Output = Self::Out>; 13 | 14 | /// Return a vector where each element at an index i is filled with 1s if 15 | /// the elements of `self` and `other` at index i are equal, and filled with 16 | /// zeroes otherwise.
17 | /// 18 | /// ``` 19 | /// extern crate faster; 20 | /// use faster::*; 21 | /// 22 | /// # fn main() { 23 | /// assert_eq!(u8s::interleave(0, 2).eq_mask(u8s(0)).be_u8s(), u8s::interleave(0xFF, 0).be_u8s()); 24 | /// assert_eq!(u32s::halfs(1, 0).eq_mask(u32s(0)), u32s::halfs(0, 0xFFFFFFFF)); 25 | /// # } 26 | /// ``` 27 | fn eq_mask(&self, other: Self) -> Self::Out; 28 | 29 | /// Return a vector where each element at an index i is filled with 1s if 30 | /// the elements of `self` and `other` at index i are not equal, and filled 31 | /// with zeroes otherwise. 32 | /// 33 | /// ``` 34 | /// extern crate faster; 35 | /// use faster::*; 36 | /// 37 | /// # fn main() { 38 | /// assert_eq!(u8s::interleave(0, 2).ne_mask(u8s(0)), u8s::interleave(0, 0xFF)); 39 | /// assert_eq!(u32s::halfs(1, 0).ne_mask(u32s(0)), u32s::halfs(0xFFFFFFFF, 0)); 40 | /// # } 41 | /// ``` 42 | #[inline(always)] 43 | fn ne_mask(&self, other: Self) -> Self::Out { self.eq_mask(other) ^ Self::Out::ones() } 44 | } 45 | 46 | macro_rules! rust_fallback_eq { 47 | (impl $trait:tt for $type:tt where $feat:tt { 48 | $($newfn:ident, $rustfn:ident => $mask:tt, $maskel:tt, $mmfn:tt ( $($mmfnargs:expr),* ), [$($n:expr),+]);*;}) => ( 49 | impl $trait for $type { 50 | $( 51 | type Out = $mask; 52 | 53 | #[inline(always)] 54 | #[cfg(target_feature = $feat)] 55 | fn $newfn(&self, other: Self) -> $mask { 56 | use crate::core::mem::transmute; 57 | unsafe { transmute($mmfn(transmute(*self), transmute(other), $($mmfnargs),*)) } 58 | } 59 | 60 | #[inline(always)] 61 | #[cfg(not(target_feature = $feat))] 62 | fn $newfn(&self, other: Self) -> Self::Out { 63 | fallback!(); 64 | use crate::core::mem::transmute; 65 | unsafe { 66 | Self::Out::new($(transmute(if self.extract($n).$rustfn(&other.extract($n)) { 67 | $maskel::max_value() 68 | } else { 69 | $maskel::min_value() 70 | })),*) 71 | } 72 | } 73 | )* 74 | } 75 | ); 76 | } 77 | 78 | macro_rules! test_packed_eq { 79 | ($vec:tt, $el:tt, $mask:tt, $maskel:tt, $name:tt) => { 80 | #[test] 81 | fn $name() { 82 | assert_eq!($vec::halfs(1 as $el, 0 as $el).eq_mask($vec::splat(0 as $el)), 83 | $mask::halfs(0, $maskel::max_value())); 84 | 85 | assert_eq!($vec::interleave(1 as $el, 0 as $el).eq_mask($vec::splat(1 as $el)), 86 | $mask::interleave($maskel::max_value(), 0)); 87 | 88 | assert_eq!($vec::halfs(1 as $el, 0 as $el).ne_mask($vec::splat(0 as $el)), 89 | $mask::halfs($maskel::max_value(), 0)); 90 | 91 | assert_eq!($vec::interleave(1 as $el, 0 as $el).ne_mask($vec::splat(1 as $el)), 92 | $mask::interleave(0, $maskel::max_value())); 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/intrin/hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait HAdd { 9 | /// Return a vector containing the interleaved sums of elements in `self` 10 | /// and `other`. 
The returned vector will begin with the sum of the first 11 | /// two elements in `self`, and end with the sum of the last two elements in 12 | /// `other` 13 | fn hadd(&self, other: Self) -> Self; 14 | } 15 | 16 | #[cfg(test)] 17 | mod tests { 18 | use crate::prelude::*; 19 | 20 | #[test] 21 | fn hadd_i8s() { 22 | assert_eq!(i8s(1).hadd(i8s(2)), i8s::interleave(2, 4)); 23 | assert_eq!(i8s::interleave(1, 2).hadd(i8s::interleave(3, 4)), i8s::interleave(3, 7)); 24 | } 25 | 26 | #[test] 27 | fn hadd_i16s() { 28 | assert_eq!(i16s(1).hadd(i16s(2)), i16s::interleave(2, 4)); 29 | assert_eq!(i16s::interleave(1, 2).hadd(i16s::interleave(3, 4)), i16s::interleave(3, 7)); 30 | } 31 | 32 | #[test] 33 | fn hadd_i32s() { 34 | assert_eq!(i32s(1).hadd(i32s(2)), i32s::interleave(2, 4)); 35 | assert_eq!(i32s::interleave(1, 2).hadd(i32s::interleave(3, 4)), i32s::interleave(3, 7)); 36 | } 37 | 38 | #[test] 39 | fn hadd_i64s() { 40 | assert_eq!(i64s(1).hadd(i64s(2)), i64s::interleave(2, 4)); 41 | assert_eq!(i64s::interleave(1, 2).hadd(i64s::interleave(3, 4)), i64s::interleave(3, 7)); 42 | } 43 | 44 | #[test] 45 | fn hadd_u8s() { 46 | assert_eq!(u8s(1).hadd(u8s(2)), u8s::interleave(2, 4)); 47 | assert_eq!(u8s::interleave(1, 2).hadd(u8s::interleave(3, 4)), u8s::interleave(3, 7)); 48 | } 49 | 50 | #[test] 51 | fn hadd_u16s() { 52 | assert_eq!(u16s(1).hadd(u16s(2)), u16s::interleave(2, 4)); 53 | assert_eq!(u16s::interleave(1, 2).hadd(u16s::interleave(3, 4)), u16s::interleave(3, 7)); 54 | } 55 | 56 | #[test] 57 | fn hadd_u32s() { 58 | assert_eq!(u32s(1).hadd(u32s(2)), u32s::interleave(2, 4)); 59 | assert_eq!(u32s::interleave(1, 2).hadd(u32s::interleave(3, 4)), u32s::interleave(3, 7)); 60 | } 61 | 62 | #[test] 63 | fn hadd_u64s() { 64 | assert_eq!(u64s(1).hadd(u64s(2)), u64s::interleave(2, 4)); 65 | assert_eq!(u64s::interleave(1, 2).hadd(u64s::interleave(3, 4)), u64s::interleave(3, 7)); 66 | } 67 | 68 | #[test] 69 | fn hadd_f32s() { 70 | assert_eq!(f32s(1.0).hadd(f32s(2.0)), f32s::interleave(2.0, 4.0)); 71 | assert_eq!(f32s::interleave(1.0, 2.0).hadd(f32s::interleave(3.0, 4.0)), f32s::interleave(3.0, 7.0)); 72 | } 73 | 74 | #[test] 75 | fn hadd_f64s() { 76 | assert_eq!(f64s(1.0).hadd(f64s(2.0)), f64s::interleave(2.0, 4.0)); 77 | assert_eq!(f64s::interleave(1.0, 2.0).hadd(f64s::interleave(3.0, 4.0)), f64s::interleave(3.0, 7.0)); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/intrin/hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait HSub { 9 | /// Return a vector containing the interleaved differences of elements in 10 | /// `self` and `other`. 
The returned vector will begin with the difference 11 | /// of the first two elements in `self`, and end with the difference of the 12 | /// last two elements in `other` 13 | fn hsub(&self, other: Self) -> Self; 14 | } 15 | 16 | #[cfg(test)] 17 | mod tests { 18 | use crate::prelude::*; 19 | 20 | #[test] 21 | fn hsub_i8s() { 22 | assert_eq!(i8s(1).hsub(i8s(2)), i8s::interleave(0, 0)); 23 | assert_eq!(i8s::interleave(1, 2).hsub(i8s::interleave(3, 4)), i8s::interleave(-1, -1)); 24 | } 25 | 26 | #[test] 27 | fn hsub_i16s() { 28 | assert_eq!(i16s(1).hsub(i16s(2)), i16s::interleave(0, 0)); 29 | assert_eq!(i16s::interleave(1, 2).hsub(i16s::interleave(3, 4)), i16s::interleave(-1, -1)); 30 | } 31 | 32 | #[test] 33 | fn hsub_i32s() { 34 | assert_eq!(i32s(1).hsub(i32s(2)), i32s::interleave(0, 0)); 35 | assert_eq!(i32s::interleave(1, 2).hsub(i32s::interleave(3, 4)), i32s::interleave(-1, -1)); 36 | } 37 | 38 | #[test] 39 | fn hsub_i64s() { 40 | assert_eq!(i64s(1).hsub(i64s(2)), i64s::interleave(0, 0)); 41 | assert_eq!(i64s::interleave(1, 2).hsub(i64s::interleave(3, 4)), i64s::interleave(-1, -1)); 42 | } 43 | 44 | #[test] 45 | fn hsub_u8s() { 46 | assert_eq!(u8s(1).hsub(u8s(2)), u8s::interleave(0, 0)); 47 | assert_eq!(u8s::interleave(2, 1).hsub(u8s::interleave(4, 3)), u8s::interleave(1, 1)); 48 | } 49 | 50 | #[test] 51 | fn hsub_u16s() { 52 | assert_eq!(u16s(1).hsub(u16s(2)), u16s::interleave(0, 0)); 53 | assert_eq!(u16s::interleave(2, 1).hsub(u16s::interleave(4, 3)), u16s::interleave(1, 1)); 54 | } 55 | 56 | #[test] 57 | fn hsub_u32s() { 58 | assert_eq!(u32s(1).hsub(u32s(2)), u32s::interleave(0, 0)); 59 | assert_eq!(u32s::interleave(2, 1).hsub(u32s::interleave(4, 3)), u32s::interleave(1, 1)); 60 | } 61 | 62 | #[test] 63 | fn hsub_u64s() { 64 | assert_eq!(u64s(1).hsub(u64s(2)), u64s::interleave(0, 0)); 65 | assert_eq!(u64s::interleave(2, 1).hsub(u64s::interleave(4, 3)), u64s::interleave(1, 1)); 66 | } 67 | 68 | #[test] 69 | fn hsub_f32s() { 70 | assert_eq!(f32s(1.0).hsub(f32s(2.0)), f32s::interleave(0.0, 0.0)); 71 | assert_eq!(f32s::interleave(1.0, 2.0).hsub(f32s::interleave(3.0, 4.0)), f32s::interleave(-1.0, -1.0)); 72 | } 73 | 74 | #[test] 75 | fn hsub_f64s() { 76 | assert_eq!(f64s(1.0).hsub(f64s(2.0)), f64s::interleave(0.0, 0.0)); 77 | assert_eq!(f64s::interleave(1.0, 2.0).hsub(f64s::interleave(3.0, 4.0)), f64s::interleave(-1.0, -1.0)); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/intrin/macros.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | macro_rules! rust_fallback_impl { 10 | (impl $trait:tt for $type:tt where $feat:tt { 11 | $($rustfn:ident => $mmfn:tt ( $($mmfnargs:expr),* ), [$($n:expr),+]);*;}) => ( 12 | impl $trait for $type { 13 | $( 14 | #[inline(always)] 15 | #[cfg(target_feature = $feat)] 16 | fn $rustfn(&self) -> Self { 17 | optimized!(); 18 | unsafe { $mmfn(*self, $($mmfnargs),*) } 19 | } 20 | 21 | #[inline(always)] 22 | #[cfg(not(target_feature = $feat))] 23 | fn $rustfn(&self) -> Self { 24 | fallback!(); 25 | Self::new($(self.extract($n).$rustfn(),)*) 26 | } 27 | )* 28 | } 29 | ); 30 | } 31 | 32 | macro_rules! 
rust_fallback_impl_binary { 33 | (impl $trait:tt for $type:tt where $feat:tt { 34 | $($rustfn:ident => $mmfn:tt ( $($mmfnargs:expr),* ), [$($n:expr),+]);*;}) => ( 35 | impl $trait for $type { 36 | $( 37 | #[inline(always)] 38 | #[cfg(target_feature = $feat)] 39 | fn $rustfn(&self, other: Self) -> Self { 40 | use crate::core::mem::transmute; 41 | optimized!(); 42 | unsafe { transmute($mmfn(transmute(*self), transmute(other), $($mmfnargs),*)) } 43 | } 44 | 45 | #[inline(always)] 46 | #[cfg(not(target_feature = $feat))] 47 | fn $rustfn(&self, other: Self) -> Self { 48 | fallback!(); 49 | Self::new($(self.extract($n).$rustfn(other.extract($n)),)*) 50 | } 51 | )* 52 | } 53 | ); 54 | } 55 | 56 | macro_rules! hop { 57 | ($name:ident, $fn:path, $($a:expr, $b:expr),*) => { 58 | #[inline(always)] 59 | fn $name(&self, other: Self) -> Self { 60 | fallback!(); 61 | Self::new($($fn(self.extract($a), self.extract($b)), 62 | $fn(other.extract($a), other.extract($b))),*) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/intrin/merge.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Merge { 9 | /// Return a vector with the first half populated by the first half of 10 | /// `self`, and the second half populated by the second half of `other`. 11 | /// 12 | /// ``` 13 | /// extern crate faster; 14 | /// use faster::*; 15 | /// 16 | /// # fn main() { 17 | /// assert_eq!(u8s(2).merge_halves(u8s(3)), u8s::halfs(2, 3)); 18 | /// # } 19 | /// ``` 20 | fn merge_halves(&self, other: Self) -> Self; 21 | 22 | /// Return a vector containing the even elements of `self` interleaved with 23 | /// the odd elements of other, starting with the first element of `self`. 24 | /// 25 | /// ``` 26 | /// extern crate faster; 27 | /// use faster::*; 28 | /// 29 | /// # fn main() { 30 | /// assert_eq!(u8s(2).merge_interleaved(u8s(3)), u8s::interleave(2, 3)); 31 | /// # } 32 | /// ``` 33 | fn merge_interleaved(&self, other: Self) -> Self; 34 | 35 | /// Return a vector containing the first `offset` elements of `self`, then 36 | /// the last `(Self::WIDTH - offset)` elements of `other`. 37 | /// 38 | /// ``` 39 | /// extern crate faster; 40 | /// use faster::*; 41 | /// 42 | /// # fn main() { 43 | /// assert_eq!(u8s(2).merge_partitioned(u8s(3), 2), u8s::partition(2u8, 3u8, 2)); 44 | /// # } 45 | /// ``` 46 | fn merge_partitioned(&self, other: Self, offset: usize) -> Self; 47 | } 48 | 49 | macro_rules! 
impl_packed_merge { 50 | ($vec:ty, $uvec:tt, $uscl:tt, $mmfn:expr, $feat:expr, ($($a:expr),*), ($($b:expr),*), $($na:expr, $nb:expr),*) => { 51 | #[cfg(not(target_feature = $feat))] 52 | impl Merge for $vec { 53 | 54 | #[inline(always)] 55 | fn merge_halves(&self, other: Self) -> Self { 56 | fallback!(); 57 | unsafe { 58 | Self::new($(self.extract_unchecked($a)),*, 59 | $(other.extract_unchecked($b)),*) 60 | } 61 | } 62 | 63 | #[inline(always)] 64 | fn merge_interleaved(&self, other: Self) -> Self { 65 | fallback!(); 66 | unsafe { 67 | Self::new($(self.extract_unchecked($na), other.extract_unchecked($nb)),*) 68 | } 69 | } 70 | 71 | #[inline(always)] 72 | fn merge_partitioned(&self, other: Self, offset: usize) -> Self { 73 | fallback!(); 74 | assert!(offset < Self::WIDTH); 75 | let mut ret = self.clone(); 76 | for i in offset..Self::WIDTH { 77 | unsafe { 78 | ret = ret.replace_unchecked(i, other.extract_unchecked(i)); 79 | } 80 | } 81 | ret 82 | } 83 | } 84 | 85 | #[cfg(target_feature = $feat)] 86 | impl Merge for $vec { 87 | 88 | #[inline(always)] 89 | fn merge_halves(&self, other: Self) -> Self { 90 | unsafe { 91 | transmute($mmfn( 92 | self.be_i8s(), other.be_i8s(), 93 | transmute($uvec::halfs($uscl::min_value(), $uscl::max_value())))) 94 | } 95 | } 96 | 97 | #[inline(always)] 98 | fn merge_interleaved(&self, other: Self) -> Self { 99 | unsafe { 100 | transmute($mmfn( 101 | self.be_i8s(), other.be_i8s(), 102 | transmute($uvec::interleave($uscl::min_value(), $uscl::max_value())))) 103 | } 104 | } 105 | 106 | #[inline(always)] 107 | fn merge_partitioned(&self, other: Self, offset: usize) -> Self { 108 | unsafe { 109 | transmute($mmfn( 110 | self.be_i8s(), other.be_i8s(), 111 | transmute(Self::partition_mask(offset)))) 112 | } 113 | } 114 | } 115 | } 116 | } 117 | 118 | macro_rules! test_packed_merge { 119 | (($($vec:tt),*), ($($fn:ident),*)) => { 120 | $( 121 | #[test] 122 | fn $fn() { 123 | let asc = 30i32 as <$vec as Packed>::Scalar; 124 | let bsc = 5i32 as <$vec as Packed>::Scalar; 125 | let a = $vec::splat(asc); 126 | let b = $vec::splat(bsc); 127 | assert_eq!(a.merge_interleaved(b), $vec::interleave(asc, bsc)); 128 | assert_eq!(b.merge_interleaved(a), $vec::interleave(bsc, asc)); 129 | 130 | assert_eq!(a.merge_halves(b), $vec::halfs(asc, bsc)); 131 | assert_eq!(b.merge_halves(a), $vec::halfs(bsc, asc)); 132 | 133 | for i in 0..$vec::WIDTH { 134 | assert_eq!(a.merge_partitioned(b, i), $vec::partition(asc, bsc, i)); 135 | assert_eq!(b.merge_partitioned(a, i), $vec::partition(bsc, asc, i)); 136 | } 137 | } 138 | )* 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/intrin/mod.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
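// Each module declared below defines one portable SIMD trait (those marked
// #[macro_use] also export polyfill and test macros), and `prelude` at the
// bottom re-exports them all. A minimal sketch of how they compose from user
// code through the crate prelude (hypothetical snippet, not part of this
// module):
//
//     use faster::*;
//     let x = u32s(0xDEADBEEF).swap_bytes(); // `Reendianize`, from `endian`
//     let n = x.count_ones();                // `Popcnt`, from `popcnt`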
7 | 8 | pub mod abs; 9 | pub mod addsub; 10 | pub mod cast; 11 | pub mod cmp; 12 | #[macro_use] pub mod destride; 13 | pub mod downcast; 14 | #[macro_use] pub mod endian; 15 | #[macro_use] pub mod eq; 16 | pub mod hadd; 17 | pub mod hsub; 18 | #[macro_use] pub mod macros; 19 | #[macro_use] pub mod merge; 20 | #[macro_use] pub mod popcnt; 21 | pub mod recip; 22 | pub mod round; 23 | pub mod rsqrt; 24 | #[macro_use] pub mod sum; 25 | pub mod saturating_add; 26 | pub mod saturating_hadd; 27 | pub mod saturating_hsub; 28 | pub mod saturating_sub; 29 | pub mod sqrt; 30 | #[macro_use] pub mod swizzle; 31 | #[macro_use] pub mod transmute; 32 | pub mod upcast; 33 | 34 | // We use an internal prelude so as not to clutter the namespace when we import 35 | // from the actual prelude. 36 | pub(crate) mod prelude { 37 | pub use super::abs::*; 38 | pub use super::addsub::*; 39 | pub use super::cast::*; 40 | pub use super::cmp::*; 41 | pub use super::destride::*; 42 | pub use super::downcast::*; 43 | pub use super::endian::*; 44 | pub use super::eq::*; 45 | pub use super::hadd::*; 46 | pub use super::hsub::*; 47 | pub use super::merge::*; 48 | pub use super::popcnt::*; 49 | pub use super::recip::*; 50 | pub use super::round::*; 51 | pub use super::rsqrt::*; 52 | pub use super::sum::*; 53 | pub use super::saturating_add::*; 54 | pub use super::saturating_hadd::*; 55 | pub use super::saturating_hsub::*; 56 | pub use super::saturating_sub::*; 57 | pub use super::sqrt::*; 58 | pub use super::swizzle::*; 59 | pub use super::transmute::*; 60 | pub use super::upcast::*; 61 | } 62 | -------------------------------------------------------------------------------- /src/intrin/popcnt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vecs::*; 9 | 10 | pub trait Popcnt : Packed { 11 | fn count_ones(&self) -> usize; 12 | 13 | #[inline(always)] 14 | fn count_zeroes(&self) -> usize { 15 | (Self::WIDTH * Self::Scalar::SIZE * 8) - self.count_ones() 16 | } 17 | } 18 | 19 | // Only used in some architectures. Might produce `unused` warning on others. 20 | #[allow(unused_macros)] 21 | macro_rules! impl_popcnt { 22 | ($($vec:ty, $fn:ident),*) => { 23 | $( 24 | impl Popcnt for $vec { 25 | #[inline(always)] 26 | #[allow(unused_unsafe)] 27 | fn count_ones(&self) -> usize { 28 | fallback!(); 29 | unsafe { $fn(self.be_u8s()) } 30 | } 31 | } 32 | )* 33 | } 34 | } 35 | 36 | // Only used in some architectures. Might produce `unused` warning on others. 37 | #[allow(unused_macros)] 38 | macro_rules!
test_popcnt { 39 | (($($el:tt),*), ($($vec:tt),*), ($($fn:tt),*)) => ( 40 | $( 41 | #[test] 42 | fn $fn() { 43 | assert_eq!($vec::splat(1i8 as $el).count_ones(), $vec::WIDTH); 44 | assert_eq!($vec::splat(1i8 as $el).count_zeroes() 45 | + $vec::splat(1i8 as $el).count_ones(), 46 | $vec::WIDTH * <<$vec as Packed>::Scalar as Packable>::SIZE * 8); 47 | assert_eq!($vec::splat(!(0 as $el)).count_ones(), 48 | $vec::WIDTH * <<$vec as Packed>::Scalar as Packable>::SIZE * 8); 49 | assert_eq!($vec::splat(!(0 as $el)).count_zeroes(), 0); 50 | } 51 | )* 52 | ) 53 | } 54 | -------------------------------------------------------------------------------- /src/intrin/recip.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Recip { 9 | /// Return a vector containing an estimation of the reciprocal of the 10 | /// corresponding elements of `self`. 11 | /// 12 | /// # Examples 13 | /// 14 | /// ``` 15 | /// extern crate faster; 16 | /// use faster::*; 17 | /// 18 | /// # fn main() { 19 | /// assert!(0.25 - 0.01 < f32s(4.0).recip().coalesce() && 20 | /// 0.25 + 0.01 > f32s(4.0).recip().coalesce()); 21 | /// # } 22 | /// ``` 23 | fn recip(&self) -> Self; 24 | } 25 | 26 | #[cfg(test)] 27 | mod tests { 28 | use crate::prelude::*; 29 | use std::f32::INFINITY; 30 | 31 | #[test] 32 | fn recip_f32s() { 33 | let mut i = -1024.0; 34 | while i < 1024.0 { 35 | // This test has some pretty significant float error if done on x86 36 | let ans = f32s(i).recip().extract(0); 37 | let real = f32s(1.0 / i).extract(0); 38 | assert!((real == INFINITY && ans == INFINITY) || (ans - real).abs() < 0.0005); 39 | i += 1.0 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/intrin/rsqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | // TODO: Guards and non-simd 9 | 10 | pub trait Rsqrt { 11 | /// Return a vector containing an approximation of the reciprocals of the 12 | /// square-roots of elements in `self`. May contain significant float error 13 | /// past 10^-3. 14 | /// 15 | /// ``` 16 | /// extern crate faster; 17 | /// use faster::*; 18 | /// 19 | /// # fn main() { 20 | /// assert!(0.33333333 - 0.01 < f32s(9.0).rsqrt().coalesce() && 21 | /// 0.33333333 + 0.01 > f32s(9.0).rsqrt().coalesce()); 22 | /// # } 23 | /// ``` 24 | fn rsqrt(&self) -> Self; 25 | } 26 | -------------------------------------------------------------------------------- /src/intrin/saturating_add.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait SaturatingAdd { 9 | fn saturating_add(&self, other: Self) -> Self; 10 | } 11 | -------------------------------------------------------------------------------- /src/intrin/saturating_hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait SaturatingHAdd { 9 | /// Return a vector containing the interleaved sums of elements in `self` 10 | /// and `other`, using saturating addition. The returned vector will begin 11 | /// with the sum of the first two elements in `self`, and end with the sum 12 | /// of the last two elements in `other` 13 | fn saturating_hadd(&self, other: Self) -> Self; 14 | } 15 | 16 | #[cfg(test)] 17 | mod tests { 18 | use crate::prelude::*; 19 | 20 | #[test] 21 | fn saturating_hadd_i8s() { 22 | assert_eq!(i8s(1).saturating_hadd(i8s(2)), i8s::interleave(2, 4)); 23 | assert_eq!(i8s::interleave(1, 2).saturating_hadd(i8s::interleave(3, 4)), i8s::interleave(3, 7)); 24 | assert_eq!(i8s::interleave(-100, -100).saturating_hadd(i8s::interleave(100, 100)), i8s::interleave(i8::min_value(), i8::max_value())); 25 | } 26 | 27 | #[test] 28 | fn saturating_hadd_i16s() { 29 | assert_eq!(i16s(1).saturating_hadd(i16s(2)), i16s::interleave(2, 4)); 30 | assert_eq!(i16s::interleave(1, 2).saturating_hadd(i16s::interleave(3, 4)), i16s::interleave(3, 7)); 31 | assert_eq!(i16s::interleave(-30000, -30000).saturating_hadd(i16s::interleave(30000, 30000)), i16s::interleave(i16::min_value(), i16::max_value())); 32 | } 33 | 34 | #[test] 35 | fn saturating_hadd_i32s() { 36 | assert_eq!(i32s(1).saturating_hadd(i32s(2)), i32s::interleave(2, 4)); 37 | assert_eq!(i32s::interleave(1, 2).saturating_hadd(i32s::interleave(3, 4)), i32s::interleave(3, 7)); 38 | assert_eq!(i32s::interleave(-2_000_000_000, -2_000_000_000).saturating_hadd(i32s::interleave(2_000_000_000, 2_000_000_000)), i32s::interleave(i32::min_value(), i32::max_value())); 39 | } 40 | 41 | #[test] 42 | fn saturating_hadd_i64s() { 43 | assert_eq!(i64s(1).saturating_hadd(i64s(2)), i64s::interleave(2, 4)); 44 | assert_eq!(i64s::interleave(1, 2).saturating_hadd(i64s::interleave(3, 4)), i64s::interleave(3, 7)); 45 | assert_eq!(i64s::interleave(-9_000_000_000_000_000_000, -9_000_000_000_000_000_000).saturating_hadd(i64s::interleave(9_000_000_000_000_000_000, 9_000_000_000_000_000_000)), i64s::interleave(i64::min_value(), i64::max_value())); 46 | } 47 | 48 | #[test] 49 | fn saturating_hadd_u8s() { 50 | assert_eq!(u8s(1).saturating_hadd(u8s(2)), u8s::interleave(2, 4)); 51 | assert_eq!(u8s::interleave(1, 2).saturating_hadd(u8s::interleave(3, 4)), u8s::interleave(3, 7)); 52 | assert_eq!(u8s(200).saturating_hadd(u8s(200)), u8s(u8::max_value())); 53 | } 54 | 55 | #[test] 56 | fn saturating_hadd_u16s() { 57 | assert_eq!(u16s(1).saturating_hadd(u16s(2)), u16s::interleave(2, 4)); 58 | assert_eq!(u16s::interleave(1, 2).saturating_hadd(u16s::interleave(3, 4)), u16s::interleave(3, 7)); 59 | assert_eq!(u16s(60000).saturating_hadd(u16s(60000)), u16s(u16::max_value())); 60 | } 61 | 62 | #[test] 63 | fn saturating_hadd_u32s() { 64 | assert_eq!(u32s(1).saturating_hadd(u32s(2)), 
u32s::interleave(2, 4)); 65 | assert_eq!(u32s::interleave(1, 2).saturating_hadd(u32s::interleave(3, 4)), u32s::interleave(3, 7)); 66 | assert_eq!(u32s(4_000_000_000).saturating_hadd(u32s(4_000_000_000)), u32s(u32::max_value())); 67 | } 68 | 69 | #[test] 70 | fn saturating_hadd_u64s() { 71 | assert_eq!(u64s(1).saturating_hadd(u64s(2)), u64s::interleave(2, 4)); 72 | assert_eq!(u64s::interleave(1, 2).saturating_hadd(u64s::interleave(3, 4)), u64s::interleave(3, 7)); 73 | assert_eq!(u64s(18_000_000_000_000_000_000).saturating_hadd(u64s(18_000_000_000_000_000_000)), u64s(u64::max_value())); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/intrin/saturating_hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait SaturatingHSub { 9 | /// Return a vector containing the interleaved differences of elements in 10 | /// `self` and `other`, using saturating subtraction. The returned vector 11 | /// will begin with the difference of the first two elements in `self`, and 12 | /// end with the difference of the last two elements in `other` 13 | fn saturating_hsub(&self, other: Self) -> Self; 14 | } 15 | 16 | #[cfg(test)] 17 | mod tests { 18 | use crate::prelude::*; 19 | 20 | #[test] 21 | fn saturating_hsub_i8s() { 22 | assert_eq!(i8s(1).saturating_hsub(i8s(2)), i8s::interleave(0, 0)); 23 | assert_eq!(i8s::interleave(1, 2).saturating_hsub(i8s::interleave(3, 4)), i8s::interleave(-1, -1)); 24 | assert_eq!(i8s::interleave(-100, 100).saturating_hsub(i8s::interleave(100, -100)), i8s::interleave(i8::min_value(), i8::max_value())); 25 | } 26 | 27 | #[test] 28 | fn saturating_hsub_i16s() { 29 | assert_eq!(i16s(1).saturating_hsub(i16s(2)), i16s::interleave(0, 0)); 30 | assert_eq!(i16s::interleave(1, 2).saturating_hsub(i16s::interleave(3, 4)), i16s::interleave(-1, -1)); 31 | assert_eq!(i16s::interleave(-30000, 30000).saturating_hsub(i16s::interleave(30000, -30000)), i16s::interleave(i16::min_value(), i16::max_value())); 32 | } 33 | 34 | #[test] 35 | fn saturating_hsub_i32s() { 36 | assert_eq!(i32s(1).saturating_hsub(i32s(2)), i32s::interleave(0, 0)); 37 | assert_eq!(i32s::interleave(1, 2).saturating_hsub(i32s::interleave(3, 4)), i32s::interleave(-1, -1)); 38 | assert_eq!(i32s::interleave(-2_000_000_000, 2_000_000_000).saturating_hsub(i32s::interleave(2_000_000_000, -2_000_000_000)), i32s::interleave(i32::min_value(), i32::max_value())); 39 | } 40 | 41 | #[test] 42 | fn saturating_hsub_i64s() { 43 | assert_eq!(i64s(1).saturating_hsub(i64s(2)), i64s::interleave(0, 0)); 44 | assert_eq!(i64s::interleave(1, 2).saturating_hsub(i64s::interleave(3, 4)), i64s::interleave(-1, -1)); 45 | assert_eq!(i64s::interleave(-9_000_000_000_000_000_000, 9_000_000_000_000_000_000).saturating_hsub(i64s::interleave(9_000_000_000_000_000_000, -9_000_000_000_000_000_000)), i64s::interleave(i64::min_value(), i64::max_value())); 46 | } 47 | 48 | #[test] 49 | fn saturating_hsub_u8s() { 50 | assert_eq!(u8s(1).saturating_hsub(u8s(2)), u8s::interleave(0, 0)); 51 | assert_eq!(u8s::interleave(1, 2).saturating_hsub(u8s::interleave(3, 4)), u8s::interleave(0, 0)); 52 | assert_eq!(u8s::interleave(2, 1).saturating_hsub(u8s::interleave(4, 
53 | }
54 | 
55 | #[test]
56 | fn saturating_hsub_u16s() {
57 | assert_eq!(u16s(1).saturating_hsub(u16s(2)), u16s::interleave(0, 0));
58 | assert_eq!(u16s::interleave(1, 2).saturating_hsub(u16s::interleave(3, 4)), u16s::interleave(0, 0));
59 | assert_eq!(u16s::interleave(2, 1).saturating_hsub(u16s::interleave(4, 3)), u16s::interleave(1, 1));
60 | }
61 | 
62 | #[test]
63 | fn saturating_hsub_u32s() {
64 | assert_eq!(u32s(1).saturating_hsub(u32s(2)), u32s::interleave(0, 0));
65 | assert_eq!(u32s::interleave(1, 2).saturating_hsub(u32s::interleave(3, 4)), u32s::interleave(0, 0));
66 | assert_eq!(u32s::interleave(2, 1).saturating_hsub(u32s::interleave(4, 3)), u32s::interleave(1, 1));
67 | }
68 | 
69 | #[test]
70 | fn saturating_hsub_u64s() {
71 | assert_eq!(u64s(1).saturating_hsub(u64s(2)), u64s::interleave(0, 0));
72 | assert_eq!(u64s::interleave(1, 2).saturating_hsub(u64s::interleave(3, 4)), u64s::interleave(0, 0));
73 | assert_eq!(u64s::interleave(2, 1).saturating_hsub(u64s::interleave(4, 3)), u64s::interleave(1, 1));
74 | }
75 | }
76 | -------------------------------------------------------------------------------- /src/intrin/saturating_sub.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub trait SaturatingSub {
9 | /// Return a vector containing the differences of corresponding elements in
10 | /// `self` and `other`, saturating at the numeric bounds rather than
11 | /// wrapping on overflow.
12 | fn saturating_sub(&self, other: Self) -> Self;
13 | }
14 | -------------------------------------------------------------------------------- /src/intrin/sqrt.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub trait Sqrt {
9 | /// Return a vector containing the square roots of the elements of `self`.
10 | ///
11 | /// # Examples
12 | ///
13 | /// ```
14 | /// #![feature(rust_2018_preview, stdsimd)]
15 | /// extern crate faster;
16 | /// use faster::*;
17 | ///
18 | /// # fn main() {
19 | /// assert_eq!(f32s(4.0).sqrt(), f32s(2.0));
20 | /// assert_eq!(f64s(9.0).sqrt(), f64s(3.0));
21 | /// # }
22 | /// ```
23 | fn sqrt(&self) -> Self;
24 | }
25 | 
26 | #[cfg(test)]
27 | mod tests {
28 | use crate::prelude::*;
29 | 
30 | #[test]
31 | fn sqrt_f64s() {
32 | assert_eq!(f64s(1.0).sqrt(), f64s(1.0));
33 | assert!(f64s(9.0).sqrt().max(f64s(2.999)) == f64s(9.0).sqrt());
34 | assert!(f64s(9.0).sqrt().min(f64s(3.001)) == f64s(9.0).sqrt());
35 | }
36 | 
37 | #[test]
38 | fn sqrt_f32s() {
39 | assert_eq!(f32s(1.0).sqrt(), f32s(1.0));
40 | assert!(f32s(9.0).sqrt().max(f32s(2.999)) == f32s(9.0).sqrt());
41 | assert!(f32s(9.0).sqrt().min(f32s(3.001)) == f32s(9.0).sqrt());
42 | }
43 | }
44 | -------------------------------------------------------------------------------- /src/intrin/sum.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | use crate::vecs::*;
9 | 
10 | pub trait Sum : Packed {
11 | /// Return a scalar equivalent to the sum of all elements of this vector.
12 | fn sum(&self) -> Self::Scalar;
13 | }
14 | 
15 | pub trait UpcastSum {
16 | /// Return a scalar equivalent to the sum of all elements of this vector,
17 | /// but collect the result in an i64 rather than the vector's type.
18 | fn sum_upcast(&self) -> i64;
19 | }
20 | 
21 | macro_rules! impl_packed_sum {
22 | ($($vec:tt),*) => {
23 | $(
24 | impl Sum for $vec {
25 | #[inline(always)]
26 | fn sum(&self) -> Self::Scalar {
27 | fallback!();
28 | self.scalar_reduce(0 as Self::Scalar, |acc, s| acc + s)
29 | }
30 | }
31 | )*
32 | }
33 | }
34 | 
35 | macro_rules! impl_packed_upcast_sum {
36 | ($($vec:tt),*) => {
37 | $(
38 | impl UpcastSum for $vec {
39 | #[inline(always)]
40 | fn sum_upcast(&self) -> i64 {
41 | fallback!();
42 | self.scalar_reduce(0i64, |acc, s| acc + (s as i64))
43 | }
44 | }
45 | )*
46 | }
47 | }
48 | 
49 | macro_rules! test_packed_sum_int {
50 | ($vec:tt, $el:tt, $name:ident) => {
51 | #[test]
52 | fn $name() {
53 | // Try not to overflow
54 | let mut i = $el::min_value() / 64 + 1;
55 | 
56 | while i < $el::max_value() / 64 - 1 {
57 | let v = $vec::splat(i);
58 | assert_eq!(v.sum(),
59 | v.scalar_reduce(0 as $el, |acc, v| acc + v));
60 | assert_eq!(v.sum_upcast(),
61 | v.scalar_reduce(0 as i64, |acc, v| acc + (v as i64)));
62 | i += $el::max_value() / 20;
63 | }
64 | }
65 | };
66 | }
67 | 
68 | macro_rules! test_packed_sum {
69 | ($vec:tt, $el:tt, $name:ident) => {
70 | #[test]
71 | fn $name() {
72 | for i in -100..100 {
73 | let v = $vec::splat(i as $el);
74 | assert_eq!(v.sum(),
75 | v.scalar_reduce(0 as $el, |acc, v| acc + v));
76 | assert_eq!(v.sum_upcast(),
77 | v.scalar_reduce(0 as i64, |acc, v| acc + (v as i64)));
78 | }
79 | }
80 | };
81 | }
82 | -------------------------------------------------------------------------------- /src/intrin/swizzle.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub trait Swizzle {
9 | /// Return a vector containing the elements of `self`, but with each pair
10 | /// of even- and odd-indexed elements swapped. For n = 0, 2, ...,
11 | /// Self::WIDTH - 2, elements at indices n and n + 1 are swapped.
12 | ///
13 | /// ```
14 | /// extern crate faster;
15 | /// use faster::*;
16 | ///
17 | /// # fn main() {
18 | /// assert_eq!(u8s::interleave(2, 1).flip(), u8s::interleave(1, 2));
19 | /// assert_eq!(u64s::interleave(2, 1).flip(), u64s::interleave(1, 2));
20 | /// # }
21 | /// ```
22 | fn flip(&self) -> Self;
23 | }
24 | 
25 | macro_rules! impl_packed_swizzle {
26 | ($vec:tt, $uvec:tt, $feat:expr, $mmfn:tt, ($($c:expr),*), ($($a:expr, $b:expr),*)) => {
27 | impl Swizzle for $vec {
28 | #[cfg(not(target_feature = $feat))]
29 | #[inline(always)]
30 | fn flip(&self) -> Self {
31 | fallback!();
32 | $vec::new($(self.extract($b), self.extract($a)),*)
33 | }
34 | 
35 | #[cfg(target_feature = $feat)]
36 | #[inline(always)]
37 | fn flip(&self) -> Self {
38 | optimized!();
39 | unsafe {
40 | transmute($mmfn(self.be_i8s(), $uvec::new($($c),*).be_i8s()))
41 | }
42 | }
43 | }
44 | }
45 | }
46 | -------------------------------------------------------------------------------- /src/intrin/transmute.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub trait Transmute {
9 | type i8s;
10 | type u8s;
11 | type i16s;
12 | type u16s;
13 | type i32s;
14 | type u32s;
15 | type f32s;
16 | type i64s;
17 | type u64s;
18 | type f64s;
19 | 
20 | fn be_i8s(&self) -> Self::i8s;
21 | fn be_u8s(&self) -> Self::u8s;
22 | fn be_i16s(&self) -> Self::i16s;
23 | fn be_u16s(&self) -> Self::u16s;
24 | fn be_i32s(&self) -> Self::i32s;
25 | fn be_u32s(&self) -> Self::u32s;
26 | // TODO: Remove possibility of signalling NaNs
27 | unsafe fn be_f32s_unchecked(&self) -> Self::f32s;
28 | fn be_i64s(&self) -> Self::i64s;
29 | fn be_u64s(&self) -> Self::u64s;
30 | // TODO: Remove possibility of signalling NaNs
31 | unsafe fn be_f64s_unchecked(&self) -> Self::f64s;
32 | }
33 | 
34 | macro_rules! impl_packed_transmute {
35 | ($($t:ty,)* ... $u8s:ty, $i8s:ty, $u16s:ty, $i16s:ty, $u32s:ty, $i32s:ty,
36 | $f32s:ty, $u64s:ty, $i64s:ty, $f64s:ty, $feat:expr, $nfeat:expr) => (
37 | $(
38 | impl Transmute for $t {
39 | type i8s = $i8s;
40 | type u8s = $u8s;
41 | type i16s = $i16s;
42 | type u16s = $u16s;
43 | type i32s = $i32s;
44 | type u32s = $u32s;
45 | type f32s = $f32s;
46 | type i64s = $i64s;
47 | type u64s = $u64s;
48 | type f64s = $f64s;
49 | 
50 | #[inline(always)]
51 | fn be_i8s(&self) -> Self::i8s {
52 | unsafe { transmute::<Self, Self::i8s>(*self) }
53 | }
54 | 
55 | #[inline(always)]
56 | fn be_u8s(&self) -> Self::u8s {
57 | unsafe { transmute::<Self, Self::u8s>(*self) }
58 | }
59 | 
60 | #[inline(always)]
61 | fn be_i16s(&self) -> Self::i16s {
62 | unsafe { transmute::<Self, Self::i16s>(*self) }
63 | }
64 | 
65 | #[inline(always)]
66 | fn be_u16s(&self) -> Self::u16s {
67 | unsafe { transmute::<Self, Self::u16s>(*self) }
68 | }
69 | 
70 | #[inline(always)]
71 | fn be_i32s(&self) -> Self::i32s {
72 | unsafe { transmute::<Self, Self::i32s>(*self) }
73 | }
74 | 
75 | #[inline(always)]
76 | fn be_u32s(&self) -> Self::u32s {
77 | unsafe { transmute::<Self, Self::u32s>(*self) }
78 | }
79 | 
80 | #[inline(always)]
81 | unsafe fn be_f32s_unchecked(&self) -> Self::f32s {
82 | transmute::<Self, Self::f32s>(*self)
83 | }
84 | 
85 | #[inline(always)]
86 | fn be_i64s(&self) -> Self::i64s {
87 | unsafe { transmute::<Self, Self::i64s>(*self) }
88 | }
89 | 
90 | #[inline(always)]
91 | fn be_u64s(&self) -> Self::u64s {
92 | unsafe { transmute::<Self, Self::u64s>(*self) }
93 | }
94 | 
95 | #[inline(always)]
96 | unsafe fn be_f64s_unchecked(&self) -> Self::f64s {
97 | transmute::<Self, Self::f64s>(*self)
98 | }
99 | }
100 | )*
101 | );
102 | }
103 | 
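// Illustrative note (editor's addition): each generated `be_*` method is a
// zero-cost bit reinterpretation, not a value conversion. For example,
// `u8s(1).be_i16s()` reuses the same bits as i16 lanes, so each lane reads
// 0x0101 (257) rather than 1.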
104 | #[cfg(test)]
105 | mod tests {
106 | use crate::prelude::*;
107 | 
108 | macro_rules! test_transmute {
109 | ($name:ident, $val:expr, $xmute:ident) => (
110 | #[test]
111 | fn $name() {
112 | #![allow(unused_unsafe)]
113 | assert_eq!(unsafe { $val.be_i8s().$xmute() }, $val);
114 | assert_eq!(unsafe { $val.be_u8s().$xmute() }, $val);
115 | assert_eq!(unsafe { $val.be_i16s().$xmute() }, $val);
116 | assert_eq!(unsafe { $val.be_u16s().$xmute() }, $val);
117 | assert_eq!(unsafe { $val.be_i32s().$xmute() }, $val);
118 | assert_eq!(unsafe { $val.be_u32s().$xmute() }, $val);
119 | assert_eq!(unsafe { $val.be_i64s().$xmute() }, $val);
120 | assert_eq!(unsafe { $val.be_u64s().$xmute() }, $val);
121 | }
122 | )
123 | }
124 | 
125 | test_transmute!(transmute_u8s, u8s(1), be_u8s);
126 | test_transmute!(transmute_i8s, i8s(1), be_i8s);
127 | test_transmute!(transmute_u16s, u16s(1), be_u16s);
128 | test_transmute!(transmute_i16s, i16s(1), be_i16s);
129 | test_transmute!(transmute_u32s, u32s(1), be_u32s);
130 | test_transmute!(transmute_i32s, i32s(1), be_i32s);
131 | test_transmute!(transmute_f32s, f32s(1.0), be_f32s_unchecked);
132 | test_transmute!(transmute_u64s, u64s(1), be_u64s);
133 | test_transmute!(transmute_i64s, i64s(1), be_i64s);
134 | test_transmute!(transmute_f64s, f64s(1.0), be_f64s_unchecked);
135 | }
136 | -------------------------------------------------------------------------------- /src/intrin/upcast.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | // TODO: Upcast for u..
9 | 
10 | pub trait Upcast<T> {
11 | /// Return two vectors containing elements of the same value, but different
12 | /// type. The first vector contains the first half of `self`, and the second
13 | /// vector contains the second half. Both returned vectors are equal in size
14 | /// to `self`.
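/// (Both are equal in byte size, that is: each returned vector holds half as
/// many elements as `self`, with each element twice as wide.)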
15 | ///
16 | /// # Examples
17 | ///
18 | /// ```
19 | /// extern crate faster;
20 | /// use faster::*;
21 | ///
22 | /// # fn main() {
23 | /// assert_eq!(i8s::halfs(2, 3).upcast(), (i16s(2), i16s(3)))
24 | /// # }
25 | /// ```
26 | fn upcast(self) -> (T, T);
27 | }
28 | 
29 | #[cfg(test)]
30 | mod tests {
31 | use crate::prelude::*;
32 | 
33 | #[test]
34 | fn upcast_i8s() {
35 | assert_eq!(i8s::interleave(1, 2).upcast().0, i16s::interleave(1, 2));
36 | assert_eq!(i8s::interleave(1, 2).upcast().1, i16s::interleave(1, 2));
37 | }
38 | 
39 | #[test]
40 | fn upcast_u8s() {
41 | assert_eq!(u8s::interleave(1, 2).upcast().0, u16s::interleave(1, 2));
42 | assert_eq!(u8s::interleave(1, 2).upcast().1, u16s::interleave(1, 2));
43 | }
44 | 
45 | #[test]
46 | fn upcast_i16s() {
47 | assert_eq!(i16s::interleave(1, 2).upcast().0, i32s::interleave(1, 2));
48 | assert_eq!(i16s::interleave(1, 2).upcast().1, i32s::interleave(1, 2));
49 | }
50 | 
51 | #[test]
52 | fn upcast_u16s() {
53 | assert_eq!(u16s::interleave(1, 2).upcast().0, u32s::interleave(1, 2));
54 | assert_eq!(u16s::interleave(1, 2).upcast().1, u32s::interleave(1, 2));
55 | }
56 | 
57 | #[test]
58 | fn upcast_i32s_i64s() {
59 | // TODO: Fix ugliness
60 | assert_eq!(Upcast::<i64s>::upcast(i32s::interleave(1, 2)).0, i64s::interleave(1, 2));
61 | assert_eq!(Upcast::<i64s>::upcast(i32s::interleave(1, 2)).1, i64s::interleave(1, 2));
62 | }
63 | 
64 | #[test]
65 | fn upcast_i32s_f64s() {
66 | // TODO: Fix ugliness
67 | assert_eq!(Upcast::<f64s>::upcast(i32s::interleave(1, 2)).0, f64s::interleave(1.0, 2.0));
68 | assert_eq!(Upcast::<f64s>::upcast(i32s::interleave(1, 2)).1, f64s::interleave(1.0, 2.0));
69 | }
70 | 
71 | #[test]
72 | fn upcast_u32s() {
73 | assert_eq!(u32s::interleave(1, 2).upcast().0, u64s::interleave(1, 2));
74 | assert_eq!(u32s::interleave(1, 2).upcast().1, u64s::interleave(1, 2));
75 | }
76 | 
77 | #[test]
78 | fn upcast_f32s() {
79 | assert_eq!(f32s::interleave(1.0, 2.0).upcast(), (f64s::interleave(1.0, 2.0), f64s::interleave(1.0, 2.0)));
80 | assert_eq!(f32s::interleave(1.0, 2.0).upcast().0, f64s::interleave(1.0, 2.0));
81 | assert_eq!(f32s::interleave(1.0, 2.0).upcast().1, f64s::interleave(1.0, 2.0));
82 | }
83 | }
84 | -------------------------------------------------------------------------------- /src/lib.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | //! The SIMD library for humans.
9 | 
10 | //! Faster allows convenient application of explicit SIMD to existing code. It
11 | //! allows you to write explicit SIMD code once and compile it for any target,
12 | //! regardless of architecture, SIMD capability, or age.
13 | 
14 | //! # SIMD Iterators
15 | //!
16 | //! SIMD iterators are formed using [`simd_iter`], [`simd_iter_mut`], and
17 | //! [`into_simd_iter`], which return types that allow the use of the
18 | //! [`simd_map`] and [`simd_reduce`] functions. These functions automatically
19 | //! pack your iterator's data into SIMD vectors and allow you to transparently
20 | //! operate on them in a closure.
21 | //!
22 | //! [`simd_iter`]: iters/trait.IntoSIMDRefIterator.html#tymethod.simd_iter
23 | //! [`simd_iter_mut`]: iters/trait.IntoSIMDRefMutIterator.html#tymethod.simd_iter_mut
24 | //! [`into_simd_iter`]: iters/trait.IntoSIMDIterator.html#tymethod.into_simd_iter
25 | //! [`simd_map`]: iters/trait.SIMDIterator.html#tymethod.simd_map
26 | //! [`simd_reduce`]: iters/trait.SIMDIterator.html#tymethod.simd_reduce
27 | //!
28 | //! # SIMD Polyfills
29 | //!
30 | //! Once your data is packed into a SIMD vector, you may perform many common
31 | //! SIMD operations on it. These operations have names and behavior independent
32 | //! of any vendor-specific ISA, and have non-SIMD polyfills for machines which
33 | //! cannot perform these operations in a single cycle. See the [`intrin`] module
34 | //! for all available operations.
35 | //!
36 | //! [`intrin`]: intrin/index.html
37 | //!
38 | //! # Examples
39 | //!
40 | //! Faster is currently capable of mapping and reductive operations in SIMD.
41 | //!
42 | //! ## Mapping
43 | //!
44 | //! The simplest example of a computation with `faster` is a single map
45 | //! operation.
46 | //!
47 | //! ```
48 | //! extern crate faster;
49 | //! use faster::*;
50 | //!
51 | //! # #[cfg(not(feature = "std"))]
52 | //! # fn main() { }
53 | //!
54 | //! # #[cfg(feature = "std")]
55 | //! # fn main() {
56 | //! let lots_of_10s = [-10i8; 3000].simd_iter(i8s(0))
57 | //!     .simd_map(|v| v.abs())
58 | //!     .scalar_collect();
59 | //! assert_eq!(lots_of_10s, vec![10u8; 3000]);
60 | //! # }
61 | //! ```
62 | //!
63 | //! In this example, a vector of type [`i8s`] is passed into the closure. The
64 | //! exact type of [`i8s`] is dependent on compilation target, but it will always
65 | //! implement the same operations. Because taking the absolute value of a vector
66 | //! converts it to [`u8s`], the closure will return [`u8s`].
67 | //!
68 | //! [`scalar_collect`] takes the iterator of [`u8s`] and converts it into a
69 | //! `Vec<u8>`.
70 | //!
71 | //! [`i8s`]: vecs/type.i8s.html
72 | //! [`u8s`]: vecs/type.u8s.html
73 | //! [`scalar_collect`]: iters/trait.IntoScalar.html#tymethod.scalar_collect
74 | //!
75 | //! ## Reduction
76 | //!
77 | //! Faster can perform reductive operations with similar power to mapping
78 | //! operations:
79 | //!
80 | //! ```
81 | //! #![feature(stdsimd)]
82 | //! extern crate faster;
83 | //! use faster::*;
84 | //!
85 | //! # fn main() {
86 | //! let two_hundred = [2.0f32; 100].simd_iter(f32s(0.0))
87 | //!     .simd_reduce(f32s(0.0), |acc, v| acc + v)
88 | //!     .sum();
89 | //! assert_eq!(two_hundred, 200.0f32);
90 | //! # }
91 | //! ```
92 | //!
93 | //! This example sums every number in the collection. The first parameter to
94 | //! simd_reduce is the initial value of the accumulator, just like any
95 | //! other reduction. The vector passed to `simd_iter` is used if the collection
96 | //! being reduced over doesn't fit evenly into your system's vectors - it pads
97 | //! out the last vector, and each of its elements is used only if it isn't
98 | //! filled by an element of the collection. Typically, a value of 0 or 1 is a
99 | //! suitable default.
100 | //!
101 | //! Minding portability is very important when performing reductive
102 | //! operations. See below for some tips on keeping your code portable across all
103 | //! architectures.
104 | //!
105 | //! ## Multiple collections
106 | //!
107 | //! Faster supports vectorized lockstep iteration over multiple collections.
108 | //! Simply [`zip`] them up, and proceed as normal.
109 | //!
110 | //! [`zip`]: zip/trait.IntoSIMDZip.html
111 | //!
112 | //! ```
113 | //! extern crate faster;
114 | //! use faster::*;
115 | //!
116 | //! 
# #[cfg(not(feature = "std"))] 117 | //! # fn main() { } 118 | //! 119 | //! # #[cfg(feature = "std")] 120 | //! # fn main() { 121 | //! let sevens = ([4i32; 200].simd_iter(i32s(0)), [3i32; 200].simd_iter(i32s(0))) 122 | //! .zip() 123 | //! .simd_map(|(a, b)| a + b) 124 | //! .scalar_collect(); 125 | //! # } 126 | //! ``` 127 | //! 128 | //! ## Striping Collections 129 | //! 130 | //! Reading every nth element of a collection can be vectorized on most 131 | //! machines. Simply call [`stride`], or one of the slightly-faster tuple-based 132 | //! functions, such as [`stride_two`]. 133 | //! 134 | //! [`stride`]: iters/struct.SIMDRefIter.html#method.stride 135 | //! [`stride_two`]: iters/struct.SIMDRefIter.html#method.stride_two 136 | //! 137 | //! ``` 138 | //! extern crate faster; 139 | //! use faster::*; 140 | //! 141 | //! # #[cfg(not(feature = "std"))] 142 | //! # fn main() { } 143 | //! 144 | //! # #[cfg(feature = "std")] 145 | //! # fn main() { 146 | //! // Computes the determinant of matrices arranged as [a, b, c, d, a, b, c...] 147 | //! let slice: &[f32] = &[1.0f32; 1024]; 148 | //! let determinant = slice.stride_four(tuplify!(4, f32s(0.0))).zip() 149 | //! .simd_map(|(a, b, c, d)| a * d - b * c) 150 | //! .scalar_collect(); 151 | //! # } 152 | //! ``` 153 | //! 154 | //! # Portability 155 | //! 156 | //! While `faster` does most of the work ensuring your code stays portable 157 | //! across platforms, a user of this library must still understand that it is 158 | //! very possible to write non-portable algorithms using this library. Anything 159 | //! which relies on vector width, anything which is impure, and anything which 160 | //! uses constants in reductive operations is inherently nonportable. Some 161 | //! examples below: 162 | //! 163 | //! ``` 164 | //! extern crate faster; 165 | //! use faster::*; 166 | //! 167 | //! # #[cfg(not(feature = "std"))] 168 | //! # fn main() { } 169 | //! 170 | //! # #[cfg(feature = "std")] 171 | //! # fn main() { 172 | //! let mut flip = true; 173 | //! let impure = [1i8; 3000].simd_iter(i8s(0)) 174 | //! .simd_map(|v| { flip = !flip; if flip { v + i8s(1) } else { v } }) 175 | //! .scalar_collect(); 176 | //! // Depending on the width of your target's SIMD vectors, `impure` could be 177 | //! // [1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, ...] or 178 | //! // [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...], etc. 179 | //! # } 180 | //! ``` 181 | //! 182 | //! ``` 183 | //! extern crate faster; 184 | //! use faster::*; 185 | //! 186 | //! # fn main() { 187 | //! let length_dependent = [0i8; 10].simd_iter(i8s(0)) 188 | //! .simd_reduce(i8s(0), |acc, v| acc + v + i8s(1)).sum(); 189 | //! // `length_dependent` could be a different number on a different target! 190 | //! # } 191 | //! ``` 192 | //! 193 | //! As a precaution, it is best practice to keep all functions pure, and only 194 | //! operate on SIMD vectors in your SIMD-enabled closures unless you know 195 | //! exactly what is happening under the hood. It's also important to remember 196 | //! that these problems will crop up even if you only support x86; the width 197 | //! difference between AVX and SSE is the primary source of these issues! 
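//!
//! For contrast, a minimal sketch of the same kind of computation written
//! portably, using only the APIs shown above: the closure is pure and never
//! depends on vector width, so the result is identical on every target.
//!
//! ```
//! extern crate faster;
//! use faster::*;
//!
//! # #[cfg(not(feature = "std"))]
//! # fn main() { }
//!
//! # #[cfg(feature = "std")]
//! # fn main() {
//! let twos = [1i8; 3000].simd_iter(i8s(0))
//!     .simd_map(|v| v + i8s(1))
//!     .scalar_collect();
//! assert_eq!(twos, vec![2i8; 3000]);
//! # }
//! ```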
198 | 
199 | #![cfg_attr(not(feature = "std"), no_std)]
200 | #![cfg_attr(test, feature(test))]
201 | #![feature(stdsimd)]
202 | // , mmx_target_feature, sse4a_target_feature, tbm_target_feature
203 | 
204 | mod core {
205 | #[cfg(not(feature = "std"))]
206 | pub use core::*;
207 | #[cfg(feature = "std")]
208 | pub use std::*;
209 | }
210 | 
211 | extern crate packed_simd;
212 | extern crate vektor;
213 | 
214 | #[macro_use] pub(crate) mod debug;
215 | #[macro_use] pub mod zip;
216 | #[macro_use] pub mod vecs;
217 | pub mod vec_patterns;
218 | pub mod iters;
219 | pub mod into_iters;
220 | #[macro_use] pub mod intrin;
221 | #[macro_use] pub mod arch;
222 | pub mod prelude;
223 | pub mod stride_zip;
224 | pub mod stride;
225 | 
226 | pub use crate::prelude::*;
227 | -------------------------------------------------------------------------------- /src/prelude.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub use crate::iters::*;
9 | pub use crate::into_iters::*;
10 | pub use crate::vecs::{Packed, Pattern};
11 | pub use crate::arch::current::vecs::{u8s, i8s, u16s, i16s, u32s, i32s, f32s, u64s, i64s, f64s};
12 | pub use crate::arch::current::intrin::prelude::*;
13 | pub use crate::intrin::prelude::*;
14 | pub use crate::zip::*;
15 | pub use crate::stride_zip::*;
16 | pub use crate::stride::*;
17 | -------------------------------------------------------------------------------- /src/stride_zip.rs: --------------------------------------------------------------------------------
1 | use crate::iters::{SIMDIterator};
2 | use crate::vecs::{Packed, Packable};
3 | use crate::intrin::destride::*;
4 | use crate::zip::{SIMDZippedIterable, SIMDZippedIterator, SIMDZippedObject};
5 | 
6 | pub struct StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
7 | base: usize,
8 | peek: Option<T::Vector>,
9 | iter: T
10 | }
11 | 
12 | /// A trait which can transform a SIMD iterator into a `StrideZip`
13 | pub trait IntoStrideZip : Sized {
14 | /// Return an iterator which iterates over destrided pairs of vectors from `self`.
15 | fn stride_zip(self) -> StrideZip<Self>
16 | where Self : SIMDIterator, Self::Vector : Destride;
17 | }
18 | 
19 | impl<T> IntoStrideZip for T where T : SIMDIterator, T::Vector : Destride {
20 | fn stride_zip(self) -> StrideZip<T> {
21 | StrideZip {
22 | base: self.scalar_pos(),
23 | peek: None,
24 | iter: self
25 | }
26 | }
27 | }
28 | 
29 | impl<T> SIMDZippedObject for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
30 | type Scalars = (T::Scalar, T::Scalar);
31 | type Vectors = (T::Vector, T::Vector);
32 | 
33 | /// Return the vector length of this object.
34 | #[inline(always)]
35 | fn width(&self) -> usize {
36 | T::Vector::WIDTH
37 | }
38 | 
39 | /// Return the scalar length of this object.
40 | #[inline(always)]
41 | fn size(&self) -> usize {
42 | T::Scalar::SIZE
43 | }
44 | }
45 | 
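// Illustrative note (editor's addition): `StrideZip` pulls two vectors of
// stride-2 data from the underlying iterator per step and destrides them, so
// each item pairs the even-indexed lanes in one vector with the odd-indexed
// lanes in the other.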
46 | impl<T> ExactSizeIterator for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
47 | #[inline(always)]
48 | fn len(&self) -> usize {
49 | self.iter.len() / 2
50 | }
51 | }
52 | 
53 | impl<T> SIMDZippedIterable for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
54 | #[inline(always)]
55 | fn scalar_pos(&self) -> usize {
56 | (self.iter.scalar_pos() - self.base) / 2
57 | }
58 | 
59 | #[inline(always)]
60 | fn vector_pos(&self) -> usize {
61 | (self.iter.vector_pos() - (self.base / self.width())) / 2
62 | }
63 | 
64 | #[inline(always)]
65 | fn scalar_len(&self) -> usize {
66 | self.iter.scalar_len() / 2
67 | }
68 | 
69 | #[inline(always)]
70 | fn advance(&mut self, amount: usize) {
71 | self.iter.advance(2 * amount);
72 | }
73 | 
74 | #[inline(always)]
75 | fn default(&self) -> Self::Vectors {
76 | (T::Vector::default(), T::Vector::default())
77 | }
78 | }
79 | 
80 | impl<T> Iterator for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
81 | type Item = <Self as SIMDZippedObject>::Vectors;
82 | 
83 | fn next(&mut self) -> Option<Self::Item> {
84 | let first = self.iter.next()?;
85 | let second = self.iter.next();
86 | if let Some(second) = second {
87 | Some(first.destride_two(second))
88 | } else {
89 | self.peek = Some(first);
90 | None
91 | }
92 | }
93 | }
94 | 
95 | impl<T> SIMDZippedIterator for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
96 | fn end(&mut self) -> Option<(Self::Vectors, usize)> {
97 | let first = self.iter.next();
98 | let (end, n) = self.iter.end().unwrap_or((self.iter.default(), 0));
99 | if let Some(first) = first {
100 | Some((first.destride_two(end), (self.width() + n) / 2))
101 | } else {
102 | if let Some(v) = self.peek {
103 | self.peek = None;
104 | Some((v.destride_two(end), (self.width() + n) / 2))
105 | } else if n > 0 {
106 | Some((end.destride_two(self.iter.default()), n / 2))
107 | } else {
108 | None
109 | }
110 | }
111 | }
112 | }
113 | -------------------------------------------------------------------------------- /src/vec_patterns.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | // This file is machine-generated. See vec_patterns_gen.py for more info.
9 | 
10 | use crate::vecs::*;
11 | 
12 | /// Constructors which may be used to instantiate vectors with patterned data.
13 | pub trait Pattern : Packed {
14 | /// Return a vector whose first `Self::WIDTH / 2` elements are `hi`, and
15 | /// whose last `Self::WIDTH / 2` elements are `lo`.
16 | fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self;
17 | 
18 | /// Return a vector containing `hi` at every even index, and `lo` at every
19 | /// odd index.
20 | fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self;
21 | 
22 | /// Return a vector whose first `off` elements are `hi`, and whose last
23 | /// `Self::WIDTH - off` elements are `lo`.
24 | fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self;
25 | 
26 | /// Return a vector whose first `off` elements are memset to 0x00, and whose
27 | /// last `Self::WIDTH - off` elements are memset to 0xFF.
28 | fn partition_mask(off: usize) -> Self;
29 | 
30 | /// Return a vector made entirely of ones.
31 | fn ones() -> Self;
32 | 
33 | /// Return a vector made entirely of zeroes.
34 | fn zeroes() -> Self;
35 | }
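
// Illustrative sketch (editor's addition, not part of the generated file):
// exercises the patterned constructors declared above, following their
// documented semantics.
#[cfg(test)]
mod pattern_sketch_tests {
    use crate::prelude::*;

    #[test]
    fn halfs_interleave_partition() {
        // `interleave` alternates hi/lo, starting with `hi` at index 0.
        let v = u8s::interleave(1, 2);
        assert_eq!(v.extract(0), 1);
        assert_eq!(v.extract(1), 2);
        // `halfs` fills the first half with `hi` and the second half with `lo`.
        let h = u8s::halfs(3, 4);
        assert_eq!(h.extract(0), 3);
        assert_eq!(h.extract(u8s::WIDTH - 1), 4);
        // `partition` switches from `hi` to `lo` at the given offset.
        let p = u8s::partition(5, 6, 1);
        assert_eq!(p.extract(0), 5);
        assert_eq!(p.extract(1), 6);
    }
}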
36 | -------------------------------------------------------------------------------- /src/vecs.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | #![allow(dead_code)]
8 | 
9 | pub use crate::vec_patterns::Pattern;
10 | use crate::core::fmt::Debug;
11 | use crate::intrin::merge::*;
12 | 
13 | /// A SIMD vector of some type.
14 | pub trait Packed : Sized + Copy + Debug + Merge {
15 | /// The type which fits into this SIMD vector
16 | type Scalar : Packable;
17 | 
18 | /// The number of elements in this vector
19 | const WIDTH: usize;
20 | 
21 | #[inline(always)]
22 | /// Return the number of elements in this vector
23 | fn width(&self) -> usize {
24 | Self::WIDTH
25 | }
26 | 
27 | /// Create a new vector with `Self::WIDTH` elements from `data`, beginning
28 | /// at `offset`.
29 | fn load(data: &[Self::Scalar], offset: usize) -> Self;
30 | 
31 | /// Create a new vector with `Self::WIDTH` elements from `data`, beginning
32 | /// at `offset`, without asserting length of data.
33 | unsafe fn load_unchecked(data: &[Self::Scalar], offset: usize) -> Self;
34 | 
35 | /// Write `Self::WIDTH` elements from this vector to `data`, beginning at
36 | /// `offset`.
37 | fn store(self, data: &mut [Self::Scalar], offset: usize);
38 | 
39 | /// Write `Self::WIDTH` elements from this vector to `data`, beginning at
40 | /// `offset`, without asserting length of data.
41 | unsafe fn store_unchecked(self, data: &mut [Self::Scalar], offset: usize);
42 | 
43 | /// Assert all elements of the vector are equal, then return the
44 | /// element. Opposite operation of `Self::splat`.
45 | fn coalesce(self) -> Self::Scalar;
46 | 
47 | /// Return a vector with all elements initialized to `data`. Opposite
48 | /// operation for `Self::coalesce`.
49 | fn splat(data: Self::Scalar) -> Self;
50 | 
51 | /// Return a vector with all elements initialized to the default
52 | /// value for the underlying element type.
53 | fn default() -> Self;
54 | 
55 | /// Return the `idx`th element of this vector.
56 | fn extract(&self, idx: usize) -> Self::Scalar;
57 | 
58 | /// Return the `idx`th element of this vector, without asserting that
59 | /// `idx` is within bounds.
60 | unsafe fn extract_unchecked(&self, idx: usize) -> Self::Scalar;
61 | 
62 | /// Replace the `idx`th element of this vector with `data`.
63 | fn replace(&mut self, idx: usize, data: Self::Scalar) -> Self;
64 | 
65 | /// Replace the `idx`th element of this vector with `data`, without
66 | /// asserting that `idx` is within bounds.
67 | unsafe fn replace_unchecked(&mut self, idx: usize, data: Self::Scalar) -> Self;
68 | 
69 | /// Return a scalar equivalent to the product of all elements of this
70 | /// vector.
71 | fn product(&self) -> Self::Scalar;
72 | 
73 | /// Return the result of a scalar reduction over this vector
74 | fn scalar_reduce<T, F>(&self, acc: T, func: F) -> T
75 | where F: FnMut(T, Self::Scalar) -> T;
76 | }
77 | 
78 | /// A type that may be packed into a SIMD vector.
79 | pub trait Packable where Self : Sized + Copy + Debug {
80 | type Vector : Packed + Clone;
81 | const SIZE: usize;
82 | }
83 | 
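// Illustrative sketch (editor's addition): exercises the `Packed` surface
// defined above; `u32s` is the width-dependent alias from the prelude.
#[cfg(test)]
mod packed_api_sketch {
    use crate::prelude::*;

    #[test]
    fn splat_extract_coalesce_reduce() {
        // `splat` broadcasts one scalar to every lane...
        let v = u32s::splat(7);
        for i in 0..u32s::WIDTH {
            assert_eq!(v.extract(i), 7);
        }
        // ...and `coalesce` is its inverse for uniform vectors.
        assert_eq!(v.coalesce(), 7);
        // `scalar_reduce` folds over every lane; counting lanes recovers WIDTH.
        assert_eq!(v.scalar_reduce(0usize, |acc, _| acc + 1), u32s::WIDTH);
    }
}
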
84 | // Vector types which aren't interpreted as SIMD vectors, for systems which
85 | // don't have SIMD support.
86 | 
87 | macro_rules! impl_packed {
88 | ($el:tt, $pvec:tt, $vec:tt, $sz:expr, $width:expr, [$($feat:expr),*], [$($nfeat:expr),*]) => (
89 | 
90 | /// A SIMD vector of this primitive type.
91 | #[allow(non_camel_case_types)]
92 | #[cfg(all($(target_feature = $feat,)* not($(target_feature = $nfeat)*)))]
93 | pub type $pvec = $vec;
94 | 
95 | /// Return a vector of this type with all elements initialized to
96 | /// `data`.
97 | #[inline(always)]
98 | #[cfg(all($(target_feature = $feat,)* not($(target_feature = $nfeat)*)))]
99 | pub fn $pvec(data: $el) -> $pvec {
100 | $vec::splat(data)
101 | }
102 | 
103 | #[cfg(all($(target_feature = $feat,)* not($(target_feature = $nfeat)*)))]
104 | impl Packable for $el {
105 | type Vector = $vec;
106 | const SIZE: usize = $sz;
107 | }
108 | 
109 | impl Packed for $vec {
110 | type Scalar = $el;
111 | 
112 | const WIDTH: usize = $width;
113 | 
114 | #[inline(always)]
115 | fn load(data: &[$el], offset: usize) -> $vec {
116 | $vec::from_slice_unaligned(&data[offset..])
117 | }
118 | 
119 | #[inline(always)]
120 | unsafe fn load_unchecked(data: &[$el], offset: usize) -> $vec {
121 | debug_assert!(data[offset..].len() >= Self::WIDTH);
122 | $vec::from_slice_unaligned_unchecked(&data[offset..])
123 | }
124 | 
125 | #[inline(always)]
126 | fn store(self, data: &mut [$el], offset: usize) {
127 | $vec::write_to_slice_unaligned(self, &mut data[offset..]);
128 | }
129 | 
130 | #[inline(always)]
131 | unsafe fn store_unchecked(self, data: &mut [$el], offset: usize) {
132 | debug_assert!(data[offset..].len() >= Self::WIDTH);
133 | $vec::write_to_slice_unaligned_unchecked(self, &mut data[offset..]);
134 | }
135 | 
136 | #[inline(always)]
137 | fn coalesce(self) -> Self::Scalar {
138 | for i in 1..Self::WIDTH {
139 | debug_assert!(self.extract(i - 1) == self.extract(i));
140 | }
141 | self.extract(0)
142 | }
143 | 
144 | #[inline(always)]
145 | fn extract(&self, idx: usize) -> Self::Scalar {
146 | $vec::extract(*self, idx)
147 | }
148 | 
149 | #[inline(always)]
150 | unsafe fn extract_unchecked(&self, idx: usize) -> Self::Scalar {
151 | debug_assert!(idx < Self::WIDTH);
152 | $vec::extract_unchecked(*self, idx)
153 | }
154 | 
155 | #[inline(always)]
156 | fn replace(&mut self, idx: usize, data: Self::Scalar) -> Self {
157 | $vec::replace(*self, idx, data)
158 | }
159 | 
160 | #[inline(always)]
161 | unsafe fn replace_unchecked(&mut self, idx: usize, data: Self::Scalar) -> Self {
162 | debug_assert!(idx < Self::WIDTH);
163 | $vec::replace_unchecked(*self, idx, data)
164 | }
165 | 
166 | #[inline(always)]
167 | fn splat(data: $el) -> Self {
168 | $vec::splat(data)
169 | }
170 | 
171 | #[inline(always)]
172 | fn default() -> Self {
173 | $vec::splat($el::default())
174 | }
175 | 
176 | #[inline(always)]
177 | fn product(&self) -> Self::Scalar {
178 | let mut acc = 1 as $el;
179 | for i in 0..Self::WIDTH {
180 | acc *= self.extract(i)
181 | }
182 | acc
183 | }
184 | 
185 | #[inline(always)]
186 | fn scalar_reduce<T, F>(&self, mut acc: T, mut func: F) -> T
187 | where F: FnMut(T, Self::Scalar) -> T {
188 | for i in 0..Self::WIDTH {
189 | acc = func(acc, self.extract(i))
190 | }
191 | acc
192 | }
193 | }
194 | );
195 | }
196 | 
197 | -------------------------------------------------------------------------------- /tests/iters.rs: --------------------------------------------------------------------------------
1 | #![feature(test)]
2 | 
3 | extern crate faster;
4 | 
5 | #[cfg(test)]
6 | mod tests {
7 | use faster::*;
8 | 
9 | #[test]
10 | #[cfg(feature = "std")]
11 | fn in_place_mutation() {
12 | let test = |mut vec: Vec<f32>| {
13 | let mut scl = vec.clone();
14 | vec.simd_iter_mut(f32s(0.0))
15 | .simd_for_each(|x| *x /= f32s(2f32));
16 | 
17 | scl.iter_mut()
18 | .for_each(|x| *x /= 2f32);
19 | 
20 | assert_eq!(vec, scl);
21 | };
22 | 
23 | let vec: Vec<f32> = (0..(f32s::WIDTH - 1)).map(|x| x as f32).collect();
24 | test(vec);
25 | 
26 | let vec: Vec<f32> = (0..f32s::WIDTH).map(|x| x as f32).collect();
27 | test(vec);
28 | 
29 | let vec: Vec<f32> = (0..(f32s::WIDTH + 1)).map(|x| x as f32).collect();
30 | test(vec);
31 | }
32 | 
33 | #[test]
34 | fn simd_reduce() {
35 | let vec = [2u32; 129];
36 | let sum = vec.simd_iter(u32s(0u32)).simd_reduce(u32s(0u32), |acc, x| acc + x).sum();
37 | assert_eq!(sum, 2 * 129);
38 | }
39 | }
40 | -------------------------------------------------------------------------------- /tests/kernel.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | #![feature(test,stdsimd)]
9 | 
10 | extern crate faster;
11 | 
12 | #[cfg(test)]
13 | mod tests {
14 | use faster::*;
15 | 
16 | macro_rules! kernel_definite {
17 | ($name:ident, $native_type:ty, $simd_type:ident) => (
18 | 
19 | /// Tests a number of simple kernel computations with integer values.
20 | #[test]
21 | fn $name() {
22 | for n in 0 .. 16 {
23 | 
24 | let vec_of_1 = vec![1 as $native_type; n];
25 | let vec_of_3 = vec![3 as $native_type; n];
26 | let mut out_vec = vec![0 as $native_type; n];
27 | 
28 | // Each element should be (3 - 1) * (3 - 1) == 4, so the sum is n * 4
29 | let sum: $native_type = ((&vec_of_3[..]).simd_iter($simd_type(0)), (&vec_of_1[..]).simd_iter($simd_type(0))).zip()
30 | .simd_map(|(a, b)| (a - b) * (a - b))
31 | .scalar_fill(&mut out_vec)
32 | .iter()
33 | .sum();
34 | 
35 | assert_eq!(sum, (n * 4) as $native_type);
36 | 
37 | // Same as above, but this time we reduce with simd_reduce
38 | let sum: $native_type = ((&vec_of_3[..]).simd_iter($simd_type(0)), (&vec_of_1[..]).simd_iter($simd_type(0))).zip()
39 | .simd_map(|(a, b)| (a - b) * (a - b))
40 | .simd_reduce($simd_type(0), |a, v| a + v)
41 | .sum();
42 | 
43 | assert_eq!(sum, (n * 4) as $native_type);
44 | }
45 | }
46 | )
47 | }
48 | 
49 | kernel_definite!(kernel_i64, i64, i64s);
50 | kernel_definite!(kernel_i32, i32, i32s);
51 | kernel_definite!(kernel_i16, i16, i16s);
52 | kernel_definite!(kernel_i8, i8, i8s);
53 | 
54 | kernel_definite!(kernel_u64, u64, u64s);
55 | kernel_definite!(kernel_u32, u32, u32s);
56 | kernel_definite!(kernel_u16, u16, u16s);
57 | kernel_definite!(kernel_u8, u8, u8s);
58 | 
59 | macro_rules! kernel_relative {
60 | ($name:ident, $native_type:ty, $simd_type:ident) => (
61 | 
62 | /// Tests a number of simple kernel computations with float values.
63 | #[test]
64 | fn $name() {
65 | for n in 0 .. 16 {
66 | let vec_of_1 = vec![1 as $native_type; n];
67 | let vec_of_3 = vec![3 as $native_type; n];
68 | 
69 | // Each element should be (1 - 3) * (1 - 3) == 4, so the sum is n * 4
70 | let sum_scalar: $native_type = vec_of_1.iter()
71 | .zip(vec_of_3.iter())
72 | .map(|(a, b)| (a - b) * (a - b))
73 | .sum();
74 | 
75 | // The same computation again, this time vectorized and reduced with simd_reduce
76 | let sum_simd: $native_type = (vec_of_1.simd_iter($simd_type(0.0 as $native_type)),
77 | vec_of_3.simd_iter($simd_type(0.0 as $native_type)))
78 | .zip()
79 | .simd_map(|(a, b)| (a - b) * (a - b))
80 | .simd_reduce($simd_type(0.0 as $native_type), |a, v| a + v)
81 | .sum();
82 | 
83 | // Ensure both ways produce the same result
84 | assert_eq!(sum_scalar, sum_simd);
85 | 
86 | // Make sure the result is equal to our target within a certain limit.
87 | assert!((sum_simd - (n * 4) as $native_type).abs() < 0.0001);
88 | }
89 | }
90 | )
91 | }
92 | 
93 | kernel_relative!(kernel_f32, f32, f32s);
94 | kernel_relative!(kernel_f64, f64, f64s);
95 | }
96 | -------------------------------------------------------------------------------- /tests/zip.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | #![feature(test)]
9 | #![feature(stdsimd)]
10 | 
11 | extern crate faster;
12 | 
13 | #[cfg(test)]
14 | mod tests {
15 | use faster::*;
16 | 
17 | #[test]
18 | #[cfg(feature = "std")]
19 | fn zipped_stride_iters() {
20 | let matrices = [1i16, 2, 3, 4, 5, 6, 7, 8, 9][..].iter().cycle().take(9 * 100).map(|i| i.clone()).collect::<Vec<i16>>();
21 | let determinants = (&matrices[..]).stride_nine(tuplify!(9, i16s(0))).zip()
22 | .simd_map(|(a, b, c, d, e, f, g, h, i)| {
23 | assert_eq!(a.extract(a.width() - 1), 1);
24 | assert_eq!(b.extract(b.width() - 1), 2);
25 | assert_eq!(c.extract(c.width() - 1), 3);
26 | assert_eq!(d.extract(d.width() - 1), 4);
27 | assert_eq!(e.extract(e.width() - 1), 5);
28 | assert_eq!(f.extract(f.width() - 1), 6);
29 | assert_eq!(g.extract(g.width() - 1), 7);
30 | assert_eq!(h.extract(h.width() - 1), 8);
31 | assert_eq!(i.extract(i.width() - 1), 9);
32 | (a * e * i) + (b * f * g) + (c * d * h) - (c * e * g) - (b * d * i) - (a * f * h)
33 | }).scalar_collect();
34 | assert!(determinants.iter().fold(true, |acc, x| acc && x == &0));
35 | 
36 | let matrices = [1i64, 0, 0, 0, 5, 4, 2, 3, 0][..].iter().cycle().take(9 * 100).map(|i| i.clone()).collect::<Vec<i64>>();
37 | let determinants = (&matrices[..]).stride_nine(tuplify!(9, i64s(0))).zip()
38 | .simd_map(|(a, b, c, d, e, f, g, h, i)| {
39 | (a * e * i) + (b * f * g) + (c * d * h) - (c * e * g) - (b * d * i) - (a * f * h)
40 | }).scalar_collect();
41 | assert!(determinants.iter().fold(true, |acc, x| { acc && x == &-12 }));
42 | }
43 | 
44 | #[test]
45 | #[cfg(feature = "std")]
46 | fn zipped_heterogeneous_iters() {
47 | let to_stride = [1i8, 2, 3, 4, 5, 6, 7, 8][..].iter().cycle().take(512).map(|i| i.clone()).collect::<Vec<i8>>();
48 | let (a, b) = to_stride.stride_two(tuplify!(2, i8s(0)));
49 | let standard_iter_a = vec!(3i8; 256).into_simd_iter(i8s(0));
50 | let standard_iter_b = vec!(7i8; 256).into_simd_iter(i8s(0));
51 | 
52 | let a_times_three = (a, standard_iter_a).zip()
53 | .simd_map(|(s, c)| s * c)
54 | .scalar_collect();
55 | 
56 | let b_times_seven = (b, standard_iter_b).zip()
57 | .simd_map(|(s, c)| s * c)
58 | .scalar_collect();
59 | 
60 | let a_times_three_check = to_stride.chunks(2).map(|c| c[0] * 3);
61 | let b_times_seven_check = to_stride.chunks(2).map(|c| c[1] * 7);
62 | 
63 | assert!(a_times_three_check.zip(a_times_three)
64 | .fold(true, |acc, (a, b)| acc && a == b));
65 | 
66 | assert!(b_times_seven_check.zip(b_times_seven)
67 | .fold(true, |acc, (a, b)| acc && a == b));
68 | }
69 | 
70 | #[test]
71 | fn zip_simd_reduce() {
72 | let vec1 = [2u32; 129];
73 | let vec2 = [3u32; 129];
74 | let result = (vec1.simd_iter(u32s(0u32)), vec2.simd_iter(u32s(0u32))).zip().simd_reduce(u32s(0u32), |acc, (x, y)| acc + x * y).sum();
75 | assert_eq!(result, 2 * 3 * 129);
76 | }
77 | 
78 | }
79 | --------------------------------------------------------------------------------