├── .gitignore ├── .travis.yml ├── CHANGELOG.org ├── Cargo.toml ├── LICENSE ├── README.org ├── benches ├── destride.rs ├── intrin.rs └── usage.rs ├── examples └── main.rs ├── scripts └── vec_patterns_gen.py ├── src ├── arch │ ├── mod.rs │ ├── unknown │ │ ├── intrin │ │ │ ├── abs.rs │ │ │ ├── cmp.rs │ │ │ ├── destride.rs │ │ │ ├── downcast.rs │ │ │ ├── endian.rs │ │ │ ├── eq.rs │ │ │ ├── hadd.rs │ │ │ ├── hsub.rs │ │ │ ├── merge.rs │ │ │ ├── mod.rs │ │ │ ├── recip.rs │ │ │ ├── round.rs │ │ │ ├── rsqrt.rs │ │ │ ├── saturating_add.rs │ │ │ ├── saturating_hadd.rs │ │ │ ├── saturating_hsub.rs │ │ │ ├── saturating_sub.rs │ │ │ ├── sqrt.rs │ │ │ ├── sum.rs │ │ │ ├── transmute.rs │ │ │ └── upcast.rs │ │ ├── mod.rs │ │ ├── vec_patterns.rs │ │ └── vecs.rs │ └── x86 │ │ ├── intrin │ │ ├── abs.rs │ │ ├── addsub.rs │ │ ├── cast.rs │ │ ├── cmp.rs │ │ ├── destride.rs │ │ ├── downcast.rs │ │ ├── endian.rs │ │ ├── eq.rs │ │ ├── hadd.rs │ │ ├── hsub.rs │ │ ├── merge.rs │ │ ├── mod.rs │ │ ├── popcnt.rs │ │ ├── recip.rs │ │ ├── round.rs │ │ ├── rsqrt.rs │ │ ├── saturating_add.rs │ │ ├── saturating_hadd.rs │ │ ├── saturating_hsub.rs │ │ ├── saturating_sub.rs │ │ ├── sqrt.rs │ │ ├── sum.rs │ │ ├── swizzle.rs │ │ ├── transmute.rs │ │ └── upcast.rs │ │ ├── mod.rs │ │ ├── vec_patterns.rs │ │ └── vecs.rs ├── debug.rs ├── into_iters.rs ├── intrin │ ├── abs.rs │ ├── addsub.rs │ ├── cast.rs │ ├── cmp.rs │ ├── destride.rs │ ├── downcast.rs │ ├── endian.rs │ ├── eq.rs │ ├── hadd.rs │ ├── hsub.rs │ ├── macros.rs │ ├── merge.rs │ ├── mod.rs │ ├── popcnt.rs │ ├── recip.rs │ ├── round.rs │ ├── rsqrt.rs │ ├── saturating_add.rs │ ├── saturating_hadd.rs │ ├── saturating_hsub.rs │ ├── saturating_sub.rs │ ├── sqrt.rs │ ├── sum.rs │ ├── swizzle.rs │ ├── transmute.rs │ └── upcast.rs ├── iters.rs ├── lib.rs ├── prelude.rs ├── stride.rs ├── stride_zip.rs ├── vec_patterns.rs ├── vecs.rs └── zip.rs └── tests ├── iters.rs ├── kernel.rs └── zip.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | **/*.rs.bk 3 | #* 4 | *# 5 | *~ 6 | ~* 7 | *.bak 8 | *.sav 9 | kek.* 10 | kek-* 11 | Cargo.lock 12 | .idea 13 | *.rlib 14 | *.d 15 | *.s 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | sudo: false 3 | rust: 4 | - nightly 5 | env: 6 | - RUSTFLAGS="-C target-feature=-sse,+x87" RUN="build --all --examples --tests --benches --bins" 7 | - RUSTFLAGS="-C target-feature=+sse" RUN="build --all --examples --tests --benches --bins" 8 | - RUSTFLAGS="-C target-feature=+sse2" RUN="build --all --examples --tests --benches --bins" 9 | - RUSTFLAGS="-C target-feature=+sse3" RUN="build --all --examples --tests --benches --bins" 10 | - RUSTFLAGS="-C target-feature=+ssse3" RUN="build --all --examples --tests --benches --bins" 11 | - RUSTFLAGS="-C target-feature=+sse4a" RUN="build --all --examples --tests --benches --bins" 12 | - RUSTFLAGS="-C target-feature=+sse4.1" RUN="build --all --examples --tests --benches --bins" 13 | - RUSTFLAGS="-C target-feature=+sse4.2" RUN="build --all --examples --tests --benches --bins" 14 | - RUSTFLAGS="-C target-feature=+avx" RUN="build --all --examples --tests --benches --bins" 15 | - RUSTFLAGS="-C target-feature=+avx2" RUN="build --all --examples --tests --benches --bins" 16 | - RUSTFLAGS="-C target-feature=+avx512f" RUN="build --all --examples --tests --benches --bins" 17 | - RUSTFLAGS="-C 
target-cpu=x86-64" RUN="test --lib --tests --benches --examples --doc" 18 | - RUSTFLAGS="-C target-cpu=pentium" RUN="test --lib --tests --benches --examples" 19 | - RUSTFLAGS="-C target-cpu=pentium3" RUN="test --lib --tests --benches --examples" 20 | - RUSTFLAGS="-C target-cpu=pentium4" RUN="test --lib --tests --benches --examples" 21 | - RUSTFLAGS="-C target-cpu=core2" RUN="test --lib --tests --benches --examples" 22 | - RUSTFLAGS="-C target-cpu=nehalem" RUN="test --lib --tests --benches --examples" 23 | - RUSTFLAGS="-C target-cpu=sandybridge" RUN="test --lib --tests --benches --examples" 24 | - RUSTFLAGS="-C target-cpu=native" RUN="test --lib --tests --benches --examples" 25 | matrix: 26 | fast_finish: true 27 | install: 28 | script: 29 | - cat /proc/cpuinfo 30 | - bash -c "cargo $RUN --verbose" 31 | - bash -c "cargo $RUN --verbose --features \"std\"" 32 | notifications: 33 | email: false 34 | -------------------------------------------------------------------------------- /CHANGELOG.org: -------------------------------------------------------------------------------- 1 | * 0.4.3 2 | ** Features 3 | - Significantly speed up automatic iterators (huge thanks to Osveron!) 4 | * 0.4.2 5 | ** Features 6 | - Add ~simd_for_each~ 7 | - Add equality comparison via ~PackedEq~ 8 | * 0.4.1 9 | ** Bugfixes & Minor Improvements 10 | - Fix an issue with zipping even collections 11 | - Fix an upcast on AVX2 machines 12 | * 0.4.0 13 | Announcing faster 0.4.0 - a 4,500+ line diff from 0.3.0 14 | ** Big Changes 15 | - Support ~#![no_std]~ 16 | - Add striping, gathers, and scatters. 17 | - Add vector merging 18 | - Add vectorized endianness operations 19 | - Add limited vector swizzling 20 | - Add lockstep packed iterators 21 | ** Features 22 | - Add tons of docstrings 23 | - Allow ~FnMut~ closures in ~simd_map~ and ~simd_reduce~ 24 | - Vectorize operations on last elements of an uneven collection 25 | - Implement compound assignment operators for architectures without hardware SIMD 26 | - Add large vectors for architectures without hardware SIMD 27 | ** Bugfixes & Minor Improvements 28 | - Add a changelog 29 | - Fix a correctness issue when mapping over uneven collections 30 | - Vectorize min/max for SSE4.1 31 | - Vectorize ~Upcast~ for SSE4.1 32 | - Implement ~Downcast~ polyfills on many more vector types 33 | - Implement and test ~[saturating_]{hadd,hsub}~ on more vector types. 
34 | - Undeprecate ~halfs~ and ~interleave~ 35 | * 0.3.0 36 | Announcing Faster 0.3.0, a 3,500+ line diff from 0.1.1 37 | ** Big Changes 38 | - Support for targets without hardware SIMD 39 | - Support for architectures other than x86 40 | - Documentation & examples for most objects 41 | - Intuitive support for uneven collections 42 | - Support SIMD-accelerated reductive operations 43 | - Add upcasting, casting, and downcasting 44 | ** Features 45 | - Add summation and product calculations for all vectors 46 | - Add default initializer for vectors 47 | - Add many more polyfills and feature gates 48 | - Allow scalar iteration of SIMD iterators with `map` and `fold` 49 | - Add vector constructors for interleaved and half-and-half patterns 50 | - Expose saturating addition and subtraction intrinsics 51 | ** Bugfixes & Minor Improvements 52 | - Add tests for sqrt, transmute, abs, recip, upcast, downcast, cast, and more 53 | - Make ~[saturating_]{hadd,hsub}~ portable 54 | * 0.2.0 55 | Announcing Faster 0.2.0, a 2,500+ line diff from 0.1.1 56 | ** Big Changes 57 | - Change license to MPL 2.0 58 | - Support for uneven collections 59 | - Add packed transmutations 60 | ** Features 61 | - Implement many more operations and polyfills 62 | ** Maintenance & Bugfixes 63 | - Require only SSE for 64 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "faster" 3 | description = "Explicit SIMD for humans" 4 | authors = ["Adam Niederer "] 5 | license = "MPL-2.0" 6 | version = "0.5.2" 7 | edition = "2018" 8 | 9 | keywords = ["simd"] 10 | categories = ["no-std", "hardware-support", "api-bindings"] 11 | documentation = "https://docs.adamniederer.com/faster/index.html" 12 | repository = "https://github.com/AdamNiederer/faster" 13 | readme = "README.org" 14 | 15 | [dependencies] 16 | vektor = "0.2.1" 17 | packed_simd = {version = "0.3.4", package = "packed_simd_2"} 18 | 19 | [features] 20 | default = ["std"] 21 | std = [] 22 | trace = [] # When enabled, the `FASTER_DEBUG_FILE` environment variable configures the log file. 
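# A plausible invocation of the trace feature (only the feature name and the
# FASTER_DEBUG_FILE variable come from this file; the log path and the exact
# logging behavior shown here are illustrative assumptions):
#   FASTER_DEBUG_FILE=/tmp/faster.log cargo test --features "std trace"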
23 | 24 | [dev-dependencies] 25 | 26 | [profile.release] 27 | opt-level = 3 28 | -------------------------------------------------------------------------------- /benches/destride.rs: -------------------------------------------------------------------------------- 1 | #![feature(stdsimd, test)] 2 | 3 | #[cfg(test)] extern crate test; 4 | #[macro_use] extern crate faster; 5 | 6 | #[cfg(test)] 7 | mod destride { 8 | use faster::prelude::*; 9 | use test::{Bencher, black_box}; 10 | 11 | #[bench] 12 | #[cfg(feature = "std")] 13 | fn destride_two(b: &mut Bencher) { 14 | let a = [0u8; 4096]; 15 | b.iter(|| { 16 | for v in a.simd_iter(u8s(0)).unroll(2) { 17 | let _ = black_box(v[0].destride_two(v[1])); 18 | } 19 | }) 20 | } 21 | 22 | #[bench] 23 | #[cfg(feature = "std")] 24 | fn destride_four(b: &mut Bencher) { 25 | let a = [0u8; 4096]; 26 | b.iter(|| { 27 | for v in a.simd_iter(u8s(0)).unroll(4) { 28 | let _ = black_box(v[0].destride_four(v[1], v[2], v[3])); 29 | } 30 | }) 31 | } 32 | 33 | #[bench] 34 | #[cfg(feature = "std")] 35 | fn destride_two_16(b: &mut Bencher) { 36 | let a = [0u16; 4096]; 37 | b.iter(|| { 38 | for v in a.simd_iter(u16s(0)).unroll(2) { 39 | let _ = black_box(v[0].destride_two(v[1])); 40 | } 41 | }) 42 | } 43 | 44 | #[bench] 45 | #[cfg(feature = "std")] 46 | fn destride_four_16(b: &mut Bencher) { 47 | let a = [0u16; 4096]; 48 | b.iter(|| { 49 | for v in a.simd_iter(u16s(0)).unroll(4) { 50 | let _ = v[0].destride_four(v[1], v[2], v[3]); 51 | } 52 | }) 53 | } 54 | 55 | #[bench] 56 | #[cfg(feature = "std")] 57 | fn destride_two_32(b: &mut Bencher) { 58 | let a = [0u32; 4096]; 59 | b.iter(|| { 60 | for v in a.simd_iter(u32s(0)).unroll(2) { 61 | let _ = black_box(v[0].destride_two(v[1])); 62 | } 63 | }) 64 | } 65 | 66 | #[bench] 67 | #[cfg(feature = "std")] 68 | fn destride_four_32(b: &mut Bencher) { 69 | let a = [0u32; 4096]; 70 | b.iter(|| { 71 | for v in a.simd_iter(u32s(0)).unroll(4) { 72 | let _ = v[0].destride_four(v[1], v[2], v[3]); 73 | } 74 | }) 75 | } 76 | 77 | #[bench] 78 | #[cfg(feature = "std")] 79 | fn destride_four_naiive(b: &mut Bencher) { 80 | let a = [0u8; 4096]; 81 | b.iter(|| { 82 | (&a[..]).stride_four(tuplify!(4, u8s(0))).zip() 83 | .simd_do_each(|x| { black_box(x); }); 84 | }) 85 | } 86 | 87 | #[bench] 88 | #[cfg(feature = "std")] 89 | fn destride_two_naiive(b: &mut Bencher) { 90 | let a = [0u8; 4096]; 91 | b.iter(|| { 92 | (&a[..]).stride_two(tuplify!(2, u8s(0))).zip() 93 | .simd_do_each(|x| { black_box(x); }); 94 | }) 95 | } 96 | 97 | #[bench] 98 | #[cfg(feature = "std")] 99 | fn destride_four_naiive_16(b: &mut Bencher) { 100 | let a = [0u16; 4096]; 101 | b.iter(|| { 102 | (&a[..]).stride_four(tuplify!(4, u16s(0))).zip() 103 | .simd_do_each(|x| { black_box(x); }); 104 | }) 105 | } 106 | 107 | #[bench] 108 | #[cfg(feature = "std")] 109 | fn destride_two_naiive_16(b: &mut Bencher) { 110 | let a = [0u16; 4096]; 111 | b.iter(|| { 112 | (&a[..]).stride_two(tuplify!(2, u16s(0))).zip() 113 | .simd_do_each(|x| { black_box(x); }); 114 | }) 115 | } 116 | 117 | #[bench] 118 | #[cfg(feature = "std")] 119 | fn destride_four_naiive_32(b: &mut Bencher) { 120 | let a = [0u32; 4096]; 121 | b.iter(|| { 122 | (&a[..]).stride_four(tuplify!(4, u32s(0))).zip() 123 | .simd_do_each(|x| { black_box(x); }); 124 | }) 125 | } 126 | 127 | #[bench] 128 | #[cfg(feature = "std")] 129 | fn destride_two_naiive_32(b: &mut Bencher) { 130 | let a = [0u32; 4096]; 131 | b.iter(|| { 132 | (&a[..]).stride_two(tuplify!(2, u32s(0))).zip() 133 | .simd_do_each(|x| { black_box(x); }); 134 | }) 135 | 
} 136 | } 137 | -------------------------------------------------------------------------------- /benches/intrin.rs: -------------------------------------------------------------------------------- 1 | #![feature(test, stdsimd)] 2 | 3 | #[cfg(test)] extern crate test; 4 | extern crate faster; 5 | 6 | const ARRAY_F32: &[f32] = &[-123.456f32; 1024]; 7 | 8 | macro_rules! bench_intrin_1 { 9 | ($simd_name:ident, $simd_fn:expr, $scalar_name:ident, $scalar_fn:expr) => { 10 | #[bench] 11 | #[cfg(feature = "std")] 12 | fn $scalar_name(b: &mut Bencher) { 13 | b.iter(|| { black_box( 14 | crate::ARRAY_F32.iter().map(|v| { $scalar_fn(*v) }).collect::<Vec<f32>>() 15 | )}) 16 | } 17 | 18 | #[bench] 19 | #[cfg(feature = "std")] 20 | fn $simd_name(b: &mut Bencher) { 21 | b.iter(|| { black_box( 22 | crate::ARRAY_F32.simd_iter(f32s(0.0)).simd_map(|v| { $simd_fn(v) }).scalar_collect() 23 | )}); 24 | } 25 | } 26 | } 27 | 28 | macro_rules! bench_intrin_2 { 29 | ($simd_name:ident, $simd_fn:ident, $scalar_name:ident, $scalar_fn:ident) => { 30 | #[bench] 31 | #[cfg(feature = "std")] 32 | fn $scalar_name(b: &mut Bencher) { 33 | b.iter(|| { black_box( 34 | crate::ARRAY_F32.iter().map(|v| { v.$scalar_fn(*v) }).collect::<Vec<f32>>() 35 | )}) 36 | } 37 | 38 | #[bench] 39 | #[cfg(feature = "std")] 40 | fn $simd_name(b: &mut Bencher) { 41 | b.iter(|| { black_box( 42 | crate::ARRAY_F32.simd_iter(f32s(0.0)).simd_map(|v| {v.$simd_fn(v) }).scalar_collect() 43 | )}); 44 | } 45 | } 46 | } 47 | 48 | 49 | #[cfg(test)] 50 | mod intrin { 51 | use faster::prelude::*; 52 | use test::{Bencher, black_box}; 53 | 54 | bench_intrin_1!(abs_simd, |x: f32s| x.abs(), abs_scala, |x: f32| x.abs()); 55 | bench_intrin_1!(ceil_simd, |x: f32s| x.ceil(), ceil_scala, |x: f32| x.ceil()); 56 | bench_intrin_1!(floor_simd, |x: f32s| x.floor(), floor_scala, |x: f32| x.floor()); 57 | bench_intrin_2!(min_simd, min, min_scala, min); 58 | bench_intrin_2!(max_simd, max, max_scala, max); 59 | bench_intrin_1!(recip_simd, |x: f32s| x.recip(), recip_scala, |x: f32| 1.0f32 / x); 60 | bench_intrin_1!(round_simd, |x: f32s| x.round(), round_scala, |x: f32| x.round()); 61 | bench_intrin_1!(sqrt_simd, |x: f32s| x.sqrt(), sqrt_scala, |x: f32| x.sqrt()); 62 | bench_intrin_1!(trunc_simd, |x: f32s| x.trunc(), trunc_scala, |x: f32| x.trunc()); 63 | } 64 | -------------------------------------------------------------------------------- /examples/main.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
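// Overview of what this example exercises: element-wise pipelines with
// `simd_map`, collecting results with `scalar_collect` and `scalar_fill`,
// reductions with `simd_reduce`, and de-interleaved iteration with
// `stride_two`/`zip`/`tuplify!`.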
7 | #![feature(stdsimd)] 8 | 9 | extern crate faster; 10 | use faster::*; 11 | 12 | #[cfg(feature = "std")] 13 | fn main() { 14 | let lots_of_84s = (&[-10i8; 33][..]).simd_iter(i8s(0)) 15 | .simd_map(|v| i8s(9) * v.abs().be_i8s() - i8s(4) - i8s(2)) 16 | .simd_map(|v| v) 17 | .scalar_collect(); 18 | 19 | let lots_of_3s = (&[-123.456f32; 128][..]).simd_iter(f32s(0.0)) 20 | .simd_map(|v| { f32s(9.0) * v.abs().sqrt().rsqrt().ceil().sqrt() - 21 | f32s(4.0) - f32s(2.0) }) 22 | .scalar_collect(); 23 | 24 | let lots_of_3s_sc = (&[-123.456f32; 128][..]).iter() 25 | .map(|v| { 9.0 * v.abs().sqrt().sqrt().recip().ceil().sqrt() - 26 | 4.0 - 2.0 }) 27 | .collect::<Vec<f32>>(); 28 | 29 | let mut some_u8s = [0u8; 100]; 30 | let filled_u8s = (&[5u8; 100][..]).simd_iter(u8s(0)) 31 | .simd_map(|vector| vector * u8s(2)) 32 | .scalar_fill(&mut some_u8s); 33 | 34 | let reduced = (&[-1.0f32; 128][..]).simd_iter(f32s(0.0)) 35 | .simd_reduce(f32s(0.0), |a, v| a + v.abs().sqrt().sqrt().floor()).sum(); 36 | 37 | let strided = (0..20u32).collect::<Vec<u32>>().as_slice() 38 | .stride_two(tuplify!(2, u32s(99))).zip().simd_map(|(a, b)| a + b) 39 | .scalar_collect(); 40 | 41 | println!("{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n{:?}\n", lots_of_84s, lots_of_3s, lots_of_3s_sc, filled_u8s, filled_u8s.len(), reduced, strided); 42 | } 43 | 44 | #[cfg(not(feature = "std"))] 45 | fn main() {} 46 | -------------------------------------------------------------------------------- /src/arch/mod.rs: -------------------------------------------------------------------------------- 1 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 2 | pub mod x86; 3 | 4 | #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] 5 | pub mod unknown; 6 | 7 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 8 | pub use self::x86 as current; 9 | 10 | #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] 11 | pub use self::unknown as current; 12 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/cmp.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::cmp::*; 11 | 12 | rust_fallback_impl_binary! { 13 | impl Cmp for u8x16 where "__undefined" { 14 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 15 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 16 | } 17 | } 18 | 19 | rust_fallback_impl_binary! { 20 | impl Cmp for i8x16 where "__undefined" { 21 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 22 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 23 | } 24 | } 25 | 26 | rust_fallback_impl_binary! { 27 | impl Cmp for u16x8 where "__undefined" { 28 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 29 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 30 | } 31 | } 32 | 33 | rust_fallback_impl_binary! { 34 | impl Cmp for i16x8 where "__undefined" { 35 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 36 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 37 | } 38 | } 39 | 40 | rust_fallback_impl_binary! 
{ 41 | impl Cmp for u32x4 where "__undefined" { 42 | min => __undefined(), [0, 1, 2, 3]; 43 | max => __undefined(), [0, 1, 2, 3]; 44 | } 45 | } 46 | 47 | rust_fallback_impl_binary! { 48 | impl Cmp for i32x4 where "__undefined" { 49 | min => __undefined(), [0, 1, 2, 3]; 50 | max => __undefined(), [0, 1, 2, 3]; 51 | } 52 | } 53 | 54 | rust_fallback_impl_binary! { 55 | impl Cmp for f32x4 where "__undefined" { 56 | min => __undefined(), [0, 1, 2, 3]; 57 | max => __undefined(), [0, 1, 2, 3]; 58 | } 59 | } 60 | 61 | rust_fallback_impl_binary! { 62 | impl Cmp for f64x2 where "__undefined" { 63 | min => __undefined(), [0, 1]; 64 | max => __undefined(), [0, 1]; 65 | } 66 | } 67 | 68 | rust_fallback_impl_binary! { 69 | impl Cmp for u8x32 where "__undefined" { 70 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 71 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 72 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 73 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 74 | } 75 | } 76 | 77 | rust_fallback_impl_binary! { 78 | impl Cmp for i8x32 where "__undefined" { 79 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 80 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 81 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 82 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 83 | } 84 | } 85 | 86 | rust_fallback_impl_binary! { 87 | impl Cmp for u16x16 where "__undefined" { 88 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 89 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 90 | } 91 | } 92 | 93 | rust_fallback_impl_binary! { 94 | impl Cmp for i16x16 where "__undefined" { 95 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 96 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 97 | } 98 | } 99 | 100 | rust_fallback_impl_binary! { 101 | impl Cmp for u32x8 where "__undefined" { 102 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 103 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 104 | } 105 | } 106 | 107 | rust_fallback_impl_binary! { 108 | impl Cmp for i32x8 where "__undefined" { 109 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 110 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 111 | } 112 | } 113 | 114 | rust_fallback_impl_binary! { 115 | impl Cmp for f32x8 where "__undefined" { 116 | min => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 117 | max => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 118 | } 119 | } 120 | 121 | rust_fallback_impl_binary! { 122 | impl Cmp for f64x4 where "__undefined" { 123 | min => __undefined(), [0, 1, 2, 3]; 124 | max => __undefined(), [0, 1, 2, 3]; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/destride.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
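// Destriding de-interleaves data laid out as [a0, b0, a1, b1, ...]. As an
// illustrative sketch of the contract the polyfills below implement (values
// are hypothetical, not from this file): if x and y together hold the stream
// [a0, b0, a1, b1, ...], then x.destride_two(y) returns ([a0, a1, ...],
// [b0, b1, ...]), and destride_four does the same for four interleaved streams.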
7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::intrin::destride::*; 10 | 11 | impl Destride for u8x16 { 12 | #[inline(always)] 13 | fn destride_two(self, other: Self) -> (Self, Self) { 14 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14) 15 | } 16 | 17 | #[inline(always)] 18 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 19 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12) 20 | } 21 | } 22 | 23 | impl Destride for u8x32 { 24 | #[inline(always)] 25 | fn destride_two(self, other: Self) -> (Self, Self) { 26 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) 27 | } 28 | 29 | #[inline(always)] 30 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 31 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12, 16, 20, 24, 28) 32 | } 33 | } 34 | 35 | impl Destride for i8x16 { 36 | #[inline(always)] 37 | fn destride_two(self, other: Self) -> (Self, Self) { 38 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14) 39 | } 40 | 41 | #[inline(always)] 42 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 43 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12) 44 | } 45 | } 46 | 47 | impl Destride for i8x32 { 48 | #[inline(always)] 49 | fn destride_two(self, other: Self) -> (Self, Self) { 50 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) 51 | } 52 | 53 | #[inline(always)] 54 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 55 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12, 16, 20, 24, 28) 56 | } 57 | } 58 | 59 | macro_rules! impl_destride { 60 | ($t:ty, $($two:expr, $four:expr),*) => { 61 | impl Destride for $t { 62 | #[inline(always)] 63 | fn destride_two(self, other: Self) -> (Self, Self) { 64 | destride_two_polyfill!(self, other, $($two, $four),*) 65 | } 66 | 67 | #[inline(always)] 68 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 69 | destride_four_polyfill!(self, b, c, d, $($two),*) 70 | } 71 | } 72 | } 73 | } 74 | 75 | impl_destride!(u16x16, 0, 2, 4, 6, 8, 10, 12, 14); 76 | impl_destride!(u16x8, 0, 2, 4, 6); 77 | impl_destride!(i16x16, 0, 2, 4, 6, 8, 10, 12, 14); 78 | impl_destride!(i16x8, 0, 2, 4, 6); 79 | 80 | impl_destride!(u32x8, 0, 2, 4, 6); 81 | impl_destride!(u32x4, 0, 2); 82 | impl_destride!(i32x8, 0, 2, 4, 6); 83 | impl_destride!(i32x4, 0, 2); 84 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/endian.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
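// swap_bytes reverses byte order within each lane. In the macro calls below,
// the first index tuple appears to be the per-byte shuffle that performs the
// swap (for example, (1, 0, 3, 2, ...) exchanges the two bytes of each 16-bit
// lane) and the second lists the lanes; this reading is inferred from the
// u16x8/u32x4/u64x2 patterns rather than stated in the source.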
7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::endian::*; 11 | 12 | impl_packed_swap_bytes!(u8x16, u8x16, "__undefined", __undefined, 13 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), 14 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); 15 | impl_packed_swap_bytes!(i8x16, u8x16, "__undefined", __undefined, 16 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), 17 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)); 18 | impl_packed_swap_bytes!(u16x8, u8x16, "__undefined", __undefined, 19 | (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), 20 | (0, 1, 2, 3, 4, 5, 6, 7)); 21 | impl_packed_swap_bytes!(i16x8, u8x16, "__undefined", __undefined, 22 | (1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), 23 | (0, 1, 2, 3, 4, 5, 6, 7)); 24 | impl_packed_swap_bytes!(u32x4, u8x16, "__undefined", __undefined, 25 | (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), 26 | (0, 1, 2, 3)); 27 | impl_packed_swap_bytes!(i32x4, u8x16, "__undefined", __undefined, 28 | (3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), 29 | (0, 1, 2, 3)); 30 | impl_packed_swap_bytes!(u64x2, u8x16, "__undefined", __undefined, 31 | (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), 32 | (0, 1)); 33 | impl_packed_swap_bytes!(i64x2, u8x16, "__undefined", __undefined, 34 | (7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8), 35 | (0, 1)); 36 | 37 | mod tests { 38 | #![allow(unused_imports)] 39 | 40 | use crate::prelude::*; 41 | use crate::arch::current::vecs::*; 42 | 43 | test_packed_swap_bytes!((u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2), 44 | (swap_bytes_u8x16, swap_bytes_i8x16, swap_bytes_u16x8, swap_bytes_i16x8, swap_bytes_u32x4, swap_bytes_i32x4, swap_bytes_u64x2, swap_bytes_i64x2)); 45 | } 46 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/eq.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::eq::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | rust_fallback_eq! { 13 | impl Eq for u8x16 where "__undefined" { 14 | eq_mask, eq => u8x16, u8, __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 15 | } 16 | } 17 | 18 | rust_fallback_eq! { 19 | impl Eq for i8x16 where "__undefined" { 20 | eq_mask, eq => u8x16, u8, __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 21 | } 22 | } 23 | 24 | rust_fallback_eq! { 25 | impl Eq for u16x8 where "__undefined" { 26 | eq_mask, eq => u16x8, u16, __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 27 | } 28 | } 29 | 30 | rust_fallback_eq! { 31 | impl Eq for i16x8 where "__undefined" { 32 | eq_mask, eq => u16x8, u16, __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 33 | } 34 | } 35 | 36 | rust_fallback_eq! { 37 | impl Eq for u32x4 where "__undefined" { 38 | eq_mask, eq => u32x4, u32, __undefined(), [0, 1, 2, 3]; 39 | } 40 | } 41 | 42 | rust_fallback_eq! { 43 | impl Eq for i32x4 where "__undefined" { 44 | eq_mask, eq => u32x4, u32, __undefined(), [0, 1, 2, 3]; 45 | } 46 | } 47 | 48 | rust_fallback_eq! 
{ 49 | impl Eq for f32x4 where "__undefined" { 50 | eq_mask, eq => u32x4, u32, __undefined(), [0, 1, 2, 3]; 51 | } 52 | } 53 | 54 | rust_fallback_eq! { 55 | impl Eq for f64x2 where "__undefined" { 56 | eq_mask, eq => u64x2, u64, __undefined(), [0, 1]; 57 | } 58 | } 59 | 60 | rust_fallback_eq! { 61 | impl Eq for u64x2 where "__undefined" { 62 | eq_mask, eq => u64x2, u64, __undefined(), [0, 1]; 63 | } 64 | } 65 | 66 | rust_fallback_eq! { 67 | impl Eq for i64x2 where "__undefined" { 68 | eq_mask, eq => u64x2, u64, __undefined(), [0, 1]; 69 | } 70 | } 71 | 72 | mod tests { 73 | #![allow(unused_imports)] 74 | use crate::prelude::*; 75 | use crate::arch::current::vecs::*; 76 | 77 | // test_packed_eq!(u8x64, u8, u8x64, u8, test_eq_u8x64); 78 | // test_packed_eq!(u8x32, u8, u8x32, u8, test_eq_u8x32); 79 | test_packed_eq!(u8x16, u8, u8x16, u8, test_eq_u8x16); 80 | // test_packed_eq!(i8x64, i8, u8x64, u8, test_eq_i8x64); 81 | // test_packed_eq!(i8x32, i8, u8x32, u8, test_eq_i8x32); 82 | test_packed_eq!(i8x16, i8, u8x16, u8, test_eq_i8x16); 83 | // test_packed_eq!(u16x32, u16, u16x32, u16, test_eq_u16x32); 84 | // test_packed_eq!(u16x16, u16, u16x16, u16, test_eq_u16x16); 85 | test_packed_eq!(u16x8, u16, u16x8, u16, test_eq_u16x8); 86 | // test_packed_eq!(i16x32, i16, u16x32, u16, test_eq_i16x32); 87 | // test_packed_eq!(i16x16, i16, u16x16, u16, test_eq_i16x16); 88 | test_packed_eq!(i16x8, i16, u16x8, u16, test_eq_i16x8); 89 | // test_packed_eq!(u32x16, u32, u32x16, u32, test_eq_u32x16); 90 | // test_packed_eq!(u32x8, u32, u32x8, u32, test_eq_u32x8); 91 | test_packed_eq!(u32x4, u32, u32x4, u32, test_eq_u32x4); 92 | // test_packed_eq!(i32x16, i32, u32x16, u32, test_eq_i32x16); 93 | // test_packed_eq!(i32x8, i32, u32x8, u32, test_eq_i32x8); 94 | test_packed_eq!(i32x4, i32, u32x4, u32, test_eq_i32x4); 95 | // test_packed_eq!(f32x16, f32, u32x16, u32, test_eq_f32x16); 96 | // test_packed_eq!(f32x8, f32, u32x8, u32, test_eq_f32x8); 97 | test_packed_eq!(f32x4, f32, u32x4, u32, test_eq_f32x4); 98 | // test_packed_eq!(u64x8, u64, u64x8, u64, test_eq_u64x8); 99 | // test_packed_eq!(u64x4, u64, u64x4, u64, test_eq_u64x4); 100 | test_packed_eq!(u64x2, u64, u64x2, u64, test_eq_u64x2); 101 | // test_packed_eq!(i64x8, i64, u64x8, u64, test_eq_i64x8); 102 | // test_packed_eq!(i64x4, i64, u64x4, u64, test_eq_i64x4); 103 | test_packed_eq!(i64x2, i64, u64x2, u64, test_eq_i64x2); 104 | // test_packed_eq!(f64x8, f64, u64x8, u64, test_eq_f64x8); 105 | // test_packed_eq!(f64x4, f64, u64x4, u64, test_eq_f64x4); 106 | test_packed_eq!(f64x2, f64, u64x2, u64, test_eq_f64x2); 107 | } 108 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
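// Horizontal add sums adjacent lane pairs across both operands, following the
// x86 hadd convention (an assumption based on the hop! index lists below):
//   u32x4(a0, a1, a2, a3).hadd(u32x4(b0, b1, b2, b3))
//     == u32x4(a0 + a1, a2 + a3, b0 + b1, b2 + b3)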
7 | 8 | use crate::intrin::hadd::*; 9 | use crate::core::ops::Add; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | 13 | impl HAdd for u64x2 { hop!(hadd, Add::add, 0, 1); } 14 | impl HAdd for u32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 15 | impl HAdd for u16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 16 | impl HAdd for u8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 17 | impl HAdd for i64x2 { hop!(hadd, Add::add, 0, 1); } 18 | impl HAdd for i32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 19 | impl HAdd for i16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 20 | impl HAdd for i8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 21 | impl HAdd for f64x2 { hop!(hadd, Add::add, 0, 1); } 22 | impl HAdd for f32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 23 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::hsub::*; 11 | use crate::core::ops::Sub; 12 | 13 | impl HSub for u64x2 { hop!(hsub, Sub::sub, 0, 1); } 14 | impl HSub for u32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 15 | impl HSub for u16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 16 | impl HSub for u8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 17 | impl HSub for i64x2 { hop!(hsub, Sub::sub, 0, 1); } 18 | impl HSub for i32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 19 | impl HSub for i16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 20 | impl HSub for i8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 21 | impl HSub for f64x2 { hop!(hsub, Sub::sub, 0, 1); } 22 | impl HSub for f32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 23 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/merge.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::merge::*; 11 | 12 | // Will produce fallback implementations only, so we get away with __undefined. 
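// Sketch of the merge_halves contract (values are illustrative; the half
// assignment is inferred from how the x86 destride code uses it):
//   u32x4(1, 2, 3, 4).merge_halves(u32x4(5, 6, 7, 8)) == u32x4(1, 2, 7, 8)
// i.e. the low half comes from self and the high half from other.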
13 | impl_packed_merge!(u8x16, u8x16, u8, __undefined, "__undefined", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 14 | impl_packed_merge!(u16x8, u16x8, u16, __undefined, "__undefined", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 15 | impl_packed_merge!(u32x4, u32x4, u32, __undefined, "__undefined", (0, 1), (2, 3), 0, 1, 2, 3); 16 | impl_packed_merge!(u64x2, u64x2, u64, __undefined, "__undefined", (0), (1), 0, 1); 17 | impl_packed_merge!(i8x16, u8x16, u8, __undefined, "__undefined", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 18 | impl_packed_merge!(i16x8, u16x8, u16, __undefined, "__undefined", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 19 | impl_packed_merge!(i32x4, u32x4, u32, __undefined, "__undefined", (0, 1), (2, 3), 0, 1, 2, 3); 20 | impl_packed_merge!(i64x2, u64x2, u64, __undefined, "__undefined", (0), (1), 0, 1); 21 | impl_packed_merge!(f32x4, u32x4, u32, __undefined, "__undefined", (0, 1), (2, 3), 0, 1, 2, 3); 22 | impl_packed_merge!(f64x2, u64x2, u64, __undefined, "__undefined", (0), (1), 0, 1); 23 | 24 | mod tests { 25 | #![allow(unused_imports)] 26 | 27 | use crate::prelude::*; 28 | use crate::arch::current::vecs::*; 29 | 30 | // TODO: Which ones do we really need? 31 | test_packed_merge!( 32 | (u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, f32x4, u64x2, i64x2, f64x2), 33 | (merge_u8x16, merge_i8x16, merge_u16x8, merge_i16x8, merge_u32x4, merge_i32x4, merge_f32x4, merge_u64x2, merge_i64x2, merge_f64x2) 34 | ); 35 | } 36 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/mod.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | mod abs; 9 | mod cmp; 10 | mod destride; 11 | mod downcast; 12 | mod endian; 13 | mod eq; 14 | mod hadd; 15 | mod hsub; 16 | mod merge; 17 | mod recip; 18 | mod round; 19 | mod rsqrt; 20 | mod saturating_add; 21 | mod saturating_hadd; 22 | mod saturating_sub; 23 | mod saturating_hsub; 24 | mod sum; 25 | mod sqrt; 26 | mod transmute; 27 | mod upcast; 28 | 29 | pub mod prelude { 30 | pub use super::abs::*; 31 | pub use super::cmp::*; 32 | pub use super::destride::*; 33 | pub use super::downcast::*; 34 | pub use super::endian::*; 35 | pub use super::eq::*; 36 | pub use super::hadd::*; 37 | pub use super::hsub::*; 38 | pub use super::merge::*; 39 | pub use super::recip::*; 40 | pub use super::round::*; 41 | pub use super::rsqrt::*; 42 | pub use super::saturating_add::*; 43 | pub use super::saturating_hadd::*; 44 | pub use super::saturating_hsub::*; 45 | pub use super::saturating_sub::*; 46 | pub use super::sum::*; 47 | pub use super::sqrt::*; 48 | pub use super::transmute::*; 49 | pub use super::upcast::*; 50 | } 51 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/recip.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 
2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::recip::Recip; 11 | 12 | rust_fallback_impl! { 13 | impl Recip for f32x4 where "__undefined" { 14 | recip => __undefined(), [0, 1, 2, 3]; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/round.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::round::Round; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | rust_fallback_impl! { 13 | impl Round for f32x4 where "__undefined" { 14 | round => __undefined(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3]; 15 | ceil => __undefined(), [0, 1, 2, 3]; 16 | floor => __undefined(), [0, 1, 2, 3]; 17 | trunc => __undefined(_MM_FROUND_TRUNC), [0, 1, 2, 3]; 18 | } 19 | } 20 | 21 | rust_fallback_impl! { 22 | impl Round for f64x2 where "__undefined" { 23 | round => __undefined(_MM_FROUND_TO_NEAREST_INT), [0, 1]; 24 | ceil => __undefined(), [0, 1]; 25 | floor => __undefined(), [0, 1]; 26 | trunc => __undefined(_MM_FROUND_TRUNC), [0, 1]; 27 | } 28 | } 29 | 30 | rust_fallback_impl! { 31 | impl Round for f32x8 where "__undefined" { 32 | round => __undefined(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3, 4, 5, 6, 7]; 33 | ceil => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 34 | floor => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 35 | trunc => __undefined(_MM_FROUND_TRUNC), [0, 1, 2, 3, 4, 5, 6, 7]; 36 | } 37 | } 38 | 39 | rust_fallback_impl! { 40 | impl Round for f64x4 where "__undefined" { 41 | round => __undefined(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3]; 42 | ceil => __undefined(), [0, 1, 2, 3]; 43 | floor => __undefined(), [0, 1, 2, 3]; 44 | trunc => __undefined(_MM_FROUND_TRUNC), [0, 1, 2, 3]; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/rsqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::rsqrt::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | // TODO: Guards and non-simd 13 | // 14 | //rust_fallback_impl! { 15 | // impl Rsqrt for f32x8 where "__undefined" { 16 | // rsqrt => _mm256_rsqrt_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 17 | // } 18 | //} 19 | 20 | rust_fallback_impl! { 21 | impl Rsqrt for f32x4 where "__undefined" { 22 | rsqrt => __undefined(), [0, 1, 2, 3]; 23 | } 24 | } 25 | 26 | rust_fallback_impl! 
{ 27 | impl Rsqrt for f64x2 where "__undefined" { 28 | rsqrt => __undefined(), [0, 1]; 29 | } 30 | } 31 | 32 | impl Rsqrt for f32 { 33 | #[inline(always)] 34 | fn rsqrt(&self) -> Self { 35 | self.sqrt().recip() 36 | } 37 | } 38 | 39 | impl Rsqrt for f64 { 40 | #[inline(always)] 41 | fn rsqrt(&self) -> Self { 42 | self.sqrt().recip() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/saturating_add.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::saturating_add::*; 11 | 12 | rust_fallback_impl_binary! { 13 | impl SaturatingAdd for u8x16 where "__undefined" { 14 | saturating_add => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 15 | } 16 | } 17 | 18 | rust_fallback_impl_binary! { 19 | impl SaturatingAdd for i8x16 where "__undefined" { 20 | saturating_add => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 21 | } 22 | } 23 | 24 | rust_fallback_impl_binary! { 25 | impl SaturatingAdd for u16x8 where "__undefined" { 26 | saturating_add => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 27 | } 28 | } 29 | 30 | rust_fallback_impl_binary! { 31 | impl SaturatingAdd for i16x8 where "__undefined" { 32 | saturating_add => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 33 | } 34 | } 35 | 36 | rust_fallback_impl_binary! { 37 | impl SaturatingAdd for u32x4 where "__undefined" { 38 | saturating_add => __undefined(), [0, 1, 2, 3]; 39 | } 40 | } 41 | 42 | rust_fallback_impl_binary! { 43 | impl SaturatingAdd for i32x4 where "__undefined" { 44 | saturating_add => __undefined(), [0, 1, 2, 3]; 45 | } 46 | } 47 | 48 | rust_fallback_impl_binary! { 49 | impl SaturatingAdd for u64x2 where "__undefined" { 50 | saturating_add => __undefined(), [0, 1]; 51 | } 52 | } 53 | 54 | rust_fallback_impl_binary! { 55 | impl SaturatingAdd for i64x2 where "__undefined" { 56 | saturating_add => __undefined(), [0, 1]; 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/saturating_hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
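// Saturating horizontal add: the same pairwise layout as HAdd, but each pair
// sum clamps at the lane type's bounds instead of wrapping. Illustrative
// sketch: for u8, a hadd pair of (200, 100) yields 255, since
// 200u8.saturating_add(100) == 255.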
7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::saturating_hadd::*; 11 | 12 | impl SaturatingHAdd for u64x2 { hop!(saturating_hadd, u64::saturating_add, 0, 1); } 13 | impl SaturatingHAdd for u32x4 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3); } 14 | impl SaturatingHAdd for u16x8 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 15 | impl SaturatingHAdd for u8x16 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 16 | impl SaturatingHAdd for i64x2 { hop!(saturating_hadd, i64::saturating_add, 0, 1); } 17 | impl SaturatingHAdd for i32x4 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3); } 18 | impl SaturatingHAdd for i16x8 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 19 | impl SaturatingHAdd for i8x16 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 20 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/saturating_hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::saturating_hsub::*; 11 | 12 | impl SaturatingHSub for u64x2 { hop!(saturating_hsub, u64::saturating_sub, 0, 1); } 13 | impl SaturatingHSub for u32x4 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3); } 14 | impl SaturatingHSub for u16x8 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 15 | impl SaturatingHSub for u8x16 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 16 | impl SaturatingHSub for i64x2 { hop!(saturating_hsub, i64::saturating_sub, 0, 1); } 17 | impl SaturatingHSub for i32x4 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3); } 18 | impl SaturatingHSub for i16x8 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 19 | impl SaturatingHSub for i8x16 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 20 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/saturating_sub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::saturating_sub::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | rust_fallback_impl_binary! { 13 | impl SaturatingSub for u8x16 where "__undefined" { 14 | saturating_sub => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 15 | } 16 | } 17 | 18 | rust_fallback_impl_binary! { 19 | impl SaturatingSub for i8x16 where "__undefined" { 20 | saturating_sub => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 21 | } 22 | } 23 | 24 | rust_fallback_impl_binary! 
{ 25 | impl SaturatingSub for u16x8 where "__undefined" { 26 | saturating_sub => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 27 | } 28 | } 29 | 30 | rust_fallback_impl_binary! { 31 | impl SaturatingSub for i16x8 where "__undefined" { 32 | saturating_sub => __undefined(), [0, 1, 2, 3, 4, 5, 6, 7]; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/sqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::sqrt::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::vecs::*; 11 | 12 | rust_fallback_impl! { 13 | impl Sqrt for f32x4 where "__undefined" { 14 | sqrt => __undefined(), [0, 1, 2, 3]; 15 | } 16 | } 17 | 18 | rust_fallback_impl! { 19 | impl Sqrt for f64x2 where "__undefined" { 20 | sqrt => __undefined(), [0, 1]; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/sum.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | use crate::vecs::*; 10 | use crate::intrin::sum::{Sum,UpcastSum}; 11 | 12 | impl_packed_sum!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, f32x4, f64x2); 13 | impl_packed_upcast_sum!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2, f32x4, f64x2); 14 | 15 | mod tests { 16 | #![allow(unused_imports)] 17 | 18 | use crate::prelude::*; 19 | use crate::arch::current::vecs::*; 20 | 21 | test_packed_sum_int!(u8x16, u8, test_packed_sum_u8x16); 22 | test_packed_sum_int!(i8x16, i8, test_packed_sum_i8x16); 23 | test_packed_sum_int!(u16x8, u16, test_packed_sum_u16x8); 24 | test_packed_sum_int!(i16x8, i16, test_packed_sum_i16x8); 25 | test_packed_sum_int!(u32x4, u32, test_packed_sum_u32x4); 26 | test_packed_sum_int!(i32x4, i32, test_packed_sum_i32x4); 27 | test_packed_sum_int!(u64x2, u64, test_packed_sum_u64x2); 28 | test_packed_sum_int!(i64x2, i64, test_packed_sum_i64x2); 29 | 30 | test_packed_sum!(f32x4, f32, test_packed_sum_f32x4); 31 | test_packed_sum!(f64x2, f64, test_packed_sum_f64x2); 32 | } 33 | -------------------------------------------------------------------------------- /src/arch/unknown/intrin/transmute.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::intrin::transmute::*; 9 | use crate::arch::current::vecs::*; 10 | use crate::core::mem::transmute; 11 | 12 | impl_packed_transmute!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, f32x4, 13 | u64x2, i64x2, f64x2, ... 
14 | u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, 15 | f32x4, u64x2, i64x2, f64x2, 16 | "__undefined", "__undefined"); 17 | -------------------------------------------------------------------------------- /src/arch/unknown/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod intrin; 2 | pub mod vecs; 3 | pub mod vec_patterns; 4 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/addsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::arch::current::vecs::*; 9 | 10 | // impl AddSub for f32x4 { 11 | // #[inline(always)] 12 | // fn addsub(&self, other: Self) -> Self { 13 | // unsafe { _mm_addsub_ps(*self, other) } 14 | // } 15 | // } 16 | 17 | // impl AddSub for f64x2 { 18 | // #[inline(always)] 19 | // fn addsub(&self, other: Self) -> Self { 20 | // unsafe { _mm_addsub_pd(*self, other) } 21 | // } 22 | // } 23 | 24 | // impl AddSub for f32x8 { 25 | // #[inline(always)] 26 | // fn addsub(&self, other: Self) -> Self { 27 | // unsafe { _mm256_addsub_ps(*self, other) } 28 | // } 29 | // } 30 | 31 | // impl AddSub for f64x4 { 32 | // #[inline(always)] 33 | // fn addsub(&self, other: Self) -> Self { 34 | // unsafe { _mm256_addsub_pd(*self, other) } 35 | // } 36 | // } 37 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/cast.rs: -------------------------------------------------------------------------------- 1 | 2 | // impl_cast!(Asu8s, i8x16, u8x16, as_u8s, as_u8x16); 3 | // impl_cast!(Asi8s, u8x16, i8x16, as_i8s, as_i8x16); 4 | 5 | // impl_cast!(Asu8s, i8x32, u8x32, as_u8s, as_u8x32); 6 | // impl_cast!(Asi8s, u8x32, i8x32, as_i8s, as_i8x32); 7 | 8 | // impl_cast!(Asu8s, i8x64, u8x64, as_u8s, as_u8x64); 9 | // impl_cast!(Asi8s, u8x64, i8x64, as_i8s, as_i8x64); 10 | 11 | // impl_cast!(Asu16s, i16x8, u16x8, as_u16s, as_u16x8); 12 | // impl_cast!(Asi16s, u16x8, i16x8, as_i16s, as_i16x8); 13 | 14 | // impl_cast!(Asu16s, i16x16, u16x16, as_u16s, as_u16x16); 15 | // impl_cast!(Asi16s, u16x16, i16x16, as_i16s, as_i16x16); 16 | 17 | // impl_cast!(Asu16s, i16x32, u16x32, as_u16s, as_u16x32); 18 | // impl_cast!(Asi16s, u16x32, i16x32, as_i16s, as_i16x32); 19 | 20 | // impl_cast!(Asu32s, i32x4, u32x4, as_u32s, as_u32x4); 21 | // impl_cast!(Asu32s, f32x4, u32x4, as_u32s, as_u32x4); 22 | // impl_cast!(Asi32s, f32x4, i32x4, as_i32s, as_i32x4); 23 | // impl_cast!(Asi32s, u32x4, i32x4, as_i32s, as_i32x4); 24 | // impl_cast!(Asf32s, u32x4, f32x4, as_f32s, as_f32x4); 25 | // impl_cast!(Asf32s, i32x4, f32x4, as_f32s, as_f32x4); 26 | 27 | // impl_cast!(Asu32s, i32x8, u32x8, as_u32s, as_u32x8); 28 | // impl_cast!(Asu32s, f32x8, u32x8, as_u32s, as_u32x8); 29 | // impl_cast!(Asi32s, f32x8, i32x8, as_i32s, as_i32x8); 30 | // impl_cast!(Asi32s, u32x8, i32x8, as_i32s, as_i32x8); 31 | // impl_cast!(Asf32s, u32x8, f32x8, as_f32s, as_f32x8); 32 | // impl_cast!(Asf32s, i32x8, f32x8, as_f32s, as_f32x8); 33 | 34 | // impl_cast!(Asu32s, i32x16, u32x16, as_u32s, as_u32x16); 35 | // impl_cast!(Asu32s, f32x16, u32x16, as_u32s, as_u32x16); 36 | // impl_cast!(Asi32s, f32x16, i32x16, as_i32s, as_i32x16); 37 | // 
impl_cast!(Asi32s, u32x16, i32x16, as_i32s, as_i32x16); 38 | // impl_cast!(Asf32s, u32x16, f32x16, as_f32s, as_f32x16); 39 | // impl_cast!(Asf32s, i32x16, f32x16, as_f32s, as_f32x16); 40 | 41 | // impl_cast!(Asu64s, i64x2, u64x2, as_u64s, as_u64x2); 42 | // impl_cast!(Asu64s, f64x2, u64x2, as_u64s, as_u64x2); 43 | // impl_cast!(Asi64s, f64x2, i64x2, as_i64s, as_i64x2); 44 | // impl_cast!(Asi64s, u64x2, i64x2, as_i64s, as_i64x2); 45 | // impl_cast!(Asf64s, u64x2, f64x2, as_f64s, as_f64x2); 46 | // impl_cast!(Asf64s, i64x2, f64x2, as_f64s, as_f64x2); 47 | 48 | // impl_cast!(Asu64s, i64x4, u64x4, as_u64s, as_u64x4); 49 | // impl_cast!(Asu64s, f64x4, u64x4, as_u64s, as_u64x4); 50 | // impl_cast!(Asi64s, f64x4, i64x4, as_i64s, as_i64x4); 51 | // impl_cast!(Asi64s, u64x4, i64x4, as_i64s, as_i64x4); 52 | // impl_cast!(Asf64s, u64x4, f64x4, as_f64s, as_f64x4); 53 | // impl_cast!(Asf64s, i64x4, f64x4, as_f64s, as_f64x4); 54 | 55 | // impl_cast!(Asu64s, i64x8, u64x8, as_u64s, as_u64x8); 56 | // impl_cast!(Asu64s, f64x8, u64x8, as_u64s, as_u64x8); 57 | // impl_cast!(Asi64s, f64x8, i64x8, as_i64s, as_i64x8); 58 | // impl_cast!(Asi64s, u64x8, i64x8, as_i64s, as_i64x8); 59 | // impl_cast!(Asf64s, u64x8, f64x8, as_f64s, as_f64x8); 60 | // impl_cast!(Asf64s, i64x8, f64x8, as_f64s, as_f64x8); 61 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/cmp.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::cmp::*; 13 | 14 | rust_fallback_impl_binary! { 15 | impl Cmp for u8x16 where "sse2" { 16 | min => _mm_min_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 17 | max => _mm_max_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 18 | } 19 | } 20 | 21 | rust_fallback_impl_binary! { 22 | impl Cmp for i8x16 where "sse4.1" { 23 | min => _mm_min_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 24 | max => _mm_max_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 25 | } 26 | } 27 | 28 | rust_fallback_impl_binary! { 29 | impl Cmp for u16x8 where "sse4.1" { 30 | min => _mm_min_epu16(), [0, 1, 2, 3, 4, 5, 6, 7]; 31 | max => _mm_max_epu16(), [0, 1, 2, 3, 4, 5, 6, 7]; 32 | } 33 | } 34 | 35 | rust_fallback_impl_binary! { 36 | impl Cmp for i16x8 where "sse4.1" { 37 | min => _mm_min_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 38 | max => _mm_max_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 39 | } 40 | } 41 | 42 | rust_fallback_impl_binary! { 43 | impl Cmp for u32x4 where "sse4.1" { 44 | min => _mm_min_epu32(), [0, 1, 2, 3]; 45 | max => _mm_max_epu32(), [0, 1, 2, 3]; 46 | } 47 | } 48 | 49 | rust_fallback_impl_binary! { 50 | impl Cmp for i32x4 where "sse4.1" { 51 | min => _mm_min_epi32(), [0, 1, 2, 3]; 52 | max => _mm_max_epi32(), [0, 1, 2, 3]; 53 | } 54 | } 55 | 56 | rust_fallback_impl_binary! { 57 | impl Cmp for f32x4 where "sse" { 58 | min => _mm_min_ps(), [0, 1, 2, 3]; 59 | max => _mm_max_ps(), [0, 1, 2, 3]; 60 | } 61 | } 62 | 63 | rust_fallback_impl_binary! 
{ 64 | impl Cmp for f64x2 where "sse2" { 65 | min => _mm_min_pd(), [0, 1]; 66 | max => _mm_max_pd(), [0, 1]; 67 | } 68 | } 69 | 70 | rust_fallback_impl_binary! { 71 | impl Cmp for u8x32 where "avx2" { 72 | min => _mm256_min_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 73 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 74 | max => _mm256_max_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 75 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 76 | } 77 | } 78 | 79 | rust_fallback_impl_binary! { 80 | impl Cmp for i8x32 where "avx2" { 81 | min => _mm256_min_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 82 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 83 | max => _mm256_max_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 84 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 85 | } 86 | } 87 | 88 | rust_fallback_impl_binary! { 89 | impl Cmp for u16x16 where "avx2" { 90 | min => _mm256_min_epu16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 91 | max => _mm256_max_epu16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 92 | } 93 | } 94 | 95 | rust_fallback_impl_binary! { 96 | impl Cmp for i16x16 where "avx2" { 97 | min => _mm256_min_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 98 | max => _mm256_max_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 99 | } 100 | } 101 | 102 | rust_fallback_impl_binary! { 103 | impl Cmp for u32x8 where "avx2" { 104 | min => _mm256_min_epu32(), [0, 1, 2, 3, 4, 5, 6, 7]; 105 | max => _mm256_max_epu32(), [0, 1, 2, 3, 4, 5, 6, 7]; 106 | } 107 | } 108 | 109 | rust_fallback_impl_binary! { 110 | impl Cmp for i32x8 where "avx2" { 111 | min => _mm256_min_epi32(), [0, 1, 2, 3, 4, 5, 6, 7]; 112 | max => _mm256_max_epi32(), [0, 1, 2, 3, 4, 5, 6, 7]; 113 | } 114 | } 115 | 116 | rust_fallback_impl_binary! { 117 | impl Cmp for f32x8 where "avx" { 118 | min => _mm256_min_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 119 | max => _mm256_max_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 120 | } 121 | } 122 | 123 | rust_fallback_impl_binary!
{ 124 | impl Cmp for f64x4 where "avx" { 125 | min => _mm256_min_pd(), [0, 1, 2, 3]; 126 | max => _mm256_max_pd(), [0, 1, 2, 3]; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/destride.rs: -------------------------------------------------------------------------------- 1 | use crate::arch::current::vecs::*; 2 | use crate::vecs::*; 3 | use crate::vektor::x86_64::*; 4 | use crate::vektor::x86::*; 5 | use crate::intrin::merge::*; 6 | use crate::intrin::transmute::*; 7 | use crate::intrin::destride::*; 8 | use crate::core::mem::transmute; 9 | 10 | impl Destride for u8x16 { 11 | #[inline(always)] 12 | #[cfg(target_feature = "ssse3")] 13 | fn destride_two(self, other: Self) -> (Self, Self) { 14 | optimized!(); 15 | unsafe { 16 | let a = _mm_shuffle_epi8(self.be_i8s(), Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15).be_i8s()); 17 | let b = _mm_shuffle_epi8(other.be_i8s(), Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14).be_i8s()); 18 | // Backwards merge of a and b (keeps elements at the same indices) 19 | let c = _mm_shuffle_epi8(b.merge_halves(a), Self::new(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7).be_i8s()); 20 | (a.merge_halves(b).be_u8s(), c.be_u8s()) 21 | } 22 | } 23 | 24 | #[inline(always)] 25 | #[cfg(not(target_feature = "ssse3"))] 26 | fn destride_two(self, other: Self) -> (Self, Self) { 27 | fallback!(); 28 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14) 29 | } 30 | 31 | #[inline(always)] 32 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 33 | fallback!(); 34 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12) 35 | } 36 | } 37 | 38 | impl Destride for u8x32 { 39 | #[inline(always)] 40 | #[cfg(target_feature = "avx2")] 41 | fn destride_two(self, other: Self) -> (Self, Self) { 42 | optimized!(); 43 | unsafe { 44 | // In-lane destrided vectors 45 | let a = _mm256_shuffle_epi8(self.be_i8s(), Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15).be_i8s()); 46 | let b = _mm256_shuffle_epi8(other.be_i8s(), Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14).be_i8s()); 47 | // Cross-lane destrided vectors 48 | let aa = _mm256_permute4x64_epi64(a.be_i64s(), 0xD8).be_u8s(); 49 | let bb = _mm256_permute4x64_epi64(b.be_i64s(), 0xD8).be_u8s(); 50 | // Backwards merge of aa and bb (keeps elements at the same indices) 51 | let c = _mm256_permute4x64_epi64(aa.merge_halves(bb).be_i64s(), 0x4E).be_u8s(); 52 | (aa.merge_halves(bb), c) 53 | } 54 | } 55 | 56 | #[inline(always)] 57 | #[cfg(not(target_feature = "avx2"))] 58 | fn destride_two(self, other: Self) -> (Self, Self) { 59 | fallback!(); 60 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) 61 | } 62 | 63 | #[inline(always)] 64 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 65 | fallback!(); 66 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12, 16, 20, 24, 28) 67 | } 68 | } 69 | 70 | impl Destride for i8x16 { 71 | #[inline(always)] 72 | #[cfg(target_feature = "ssse3")] 73 | fn destride_two(self, other: Self) -> (Self, Self) { 74 | optimized!(); 75 | unsafe { 76 | let a = _mm_shuffle_epi8(transmute(self), transmute(Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15))); 77 | let b = _mm_shuffle_epi8(transmute(other), transmute(Self::new(1, 3, 5, 7, 9, 11, 13, 
15, 0, 2, 4, 6, 8, 10, 12, 14))); 78 | // Backwards merge of a and b (keeps elements at the same indices) 79 | let c = _mm_shuffle_epi8(b.merge_halves(a), transmute(Self::new(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7))); 80 | (a.be_i8s().merge_halves(b.be_i8s()), c.be_i8s()) 81 | } 82 | } 83 | 84 | #[inline(always)] 85 | #[cfg(not(target_feature = "ssse3"))] 86 | fn destride_two(self, other: Self) -> (Self, Self) { 87 | fallback!(); 88 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14) 89 | } 90 | 91 | #[inline(always)] 92 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 93 | fallback!(); 94 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12) 95 | } 96 | } 97 | 98 | impl Destride for i8x32 { 99 | #[inline(always)] 100 | #[cfg(target_feature = "avx2")] 101 | fn destride_two(self, other: Self) -> (Self, Self) { 102 | optimized!(); 103 | unsafe { 104 | // In-lane destrided vectors 105 | let a = _mm256_shuffle_epi8(transmute(self), transmute(Self::new(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15))); 106 | let b = _mm256_shuffle_epi8(transmute(other), transmute(Self::new(1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14))); 107 | // Cross-lane destrided vectors 108 | let aa = _mm256_permute4x64_epi64(a.be_i64s(), 0xD8).be_i8s(); 109 | let bb = _mm256_permute4x64_epi64(b.be_i64s(), 0xD8).be_i8s(); 110 | // Backwards merge of aa and bb (keeps elements at the same indices) 111 | let c = _mm256_permute4x64_epi64(aa.merge_halves(bb).be_i64s(), 0x4E).be_i8s(); 112 | (aa.merge_halves(bb), c) 113 | } 114 | } 115 | 116 | #[inline(always)] 117 | #[cfg(not(target_feature = "avx2"))] 118 | fn destride_two(self, other: Self) -> (Self, Self) { 119 | fallback!(); 120 | destride_two_polyfill!(self, other, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30) 121 | } 122 | 123 | #[inline(always)] 124 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 125 | fallback!(); 126 | destride_four_polyfill!(self, b, c, d, 0, 4, 8, 12, 16, 20, 24, 28) 127 | } 128 | } 129 | 130 | macro_rules! impl_destride { 131 | ($t:ty, $($two:expr, $four:expr),*) => { 132 | impl Destride for $t { 133 | #[inline(always)] 134 | fn destride_two(self, other: Self) -> (Self, Self) { 135 | fallback!(); 136 | destride_two_polyfill!(self, other, $($two, $four),*) 137 | } 138 | 139 | #[inline(always)] 140 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { 141 | fallback!(); 142 | destride_four_polyfill!(self, b, c, d, $($two),*) 143 | } 144 | } 145 | } 146 | } 147 | 148 | impl_destride!(u16x16, 0, 2, 4, 6, 8, 10, 12, 14); 149 | impl_destride!(u16x8, 0, 2, 4, 6); 150 | impl_destride!(i16x16, 0, 2, 4, 6, 8, 10, 12, 14); 151 | impl_destride!(i16x8, 0, 2, 4, 6); 152 | 153 | impl_destride!(u32x8, 0, 2, 4, 6); 154 | impl_destride!(u32x4, 0, 2); 155 | impl_destride!(i32x8, 0, 2, 4, 6); 156 | impl_destride!(i32x4, 0, 2); 157 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/eq.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::core::ops::BitXor; 11 | use crate::intrin::eq::*; 12 | use crate::arch::current::vecs::*; 13 | use crate::vecs::*; 14 | 15 | rust_fallback_eq! { 16 | impl Eq for u8x16 where "sse2" { 17 | eq_mask, eq => u8x16, u8, _mm_cmpeq_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 18 | } 19 | } 20 | 21 | rust_fallback_eq! { 22 | impl Eq for i8x16 where "sse4.1" { 23 | eq_mask, eq => u8x16, u8, _mm_cmpeq_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 24 | } 25 | } 26 | 27 | rust_fallback_eq! { 28 | impl Eq for u16x8 where "sse4.1" { 29 | eq_mask, eq => u16x8, u16, _mm_cmpeq_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 30 | } 31 | } 32 | 33 | rust_fallback_eq! { 34 | impl Eq for i16x8 where "sse4.1" { 35 | eq_mask, eq => u16x8, u16, _mm_cmpeq_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 36 | } 37 | } 38 | 39 | rust_fallback_eq! { 40 | impl Eq for u32x4 where "sse4.1" { 41 | eq_mask, eq => u32x4, u32, _mm_cmpeq_epi32(), [0, 1, 2, 3]; 42 | } 43 | } 44 | 45 | rust_fallback_eq! { 46 | impl Eq for i32x4 where "sse4.1" { 47 | eq_mask, eq => u32x4, u32, _mm_cmpeq_epi32(), [0, 1, 2, 3]; 48 | } 49 | } 50 | 51 | rust_fallback_eq! { 52 | impl Eq for f32x4 where "sse" { 53 | eq_mask, eq => u32x4, u32, _mm_cmpeq_ps(), [0, 1, 2, 3]; 54 | } 55 | } 56 | 57 | rust_fallback_eq! { 58 | impl Eq for f64x2 where "sse2" { 59 | eq_mask, eq => u64x2, u64, _mm_cmpeq_pd(), [0, 1]; 60 | } 61 | } 62 | 63 | rust_fallback_eq! { 64 | impl Eq for u64x2 where "sse4.1" { 65 | eq_mask, eq => u64x2, u64, _mm_cmpeq_epi64(), [0, 1]; 66 | } 67 | } 68 | 69 | rust_fallback_eq! { 70 | impl Eq for i64x2 where "sse4.1" { 71 | eq_mask, eq => u64x2, u64, _mm_cmpeq_epi64(), [0, 1]; 72 | } 73 | } 74 | 75 | rust_fallback_eq! { 76 | impl Eq for u8x32 where "avx2" { 77 | eq_mask, eq => u8x32, u8, _mm256_cmpeq_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 78 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 79 | } 80 | } 81 | 82 | rust_fallback_eq! { 83 | impl Eq for i8x32 where "avx2" { 84 | eq_mask, eq => u8x32, u8, _mm256_cmpeq_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 85 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 86 | } 87 | } 88 | 89 | rust_fallback_eq! { 90 | impl Eq for u16x16 where "avx2" { 91 | eq_mask, eq => u16x16, u16, _mm256_cmpeq_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 92 | } 93 | } 94 | 95 | rust_fallback_eq! { 96 | impl Eq for i16x16 where "avx2" { 97 | eq_mask, eq => u16x16, u16, _mm256_cmpeq_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 98 | } 99 | } 100 | 101 | rust_fallback_eq! { 102 | impl Eq for u32x8 where "avx2" { 103 | eq_mask, eq => u32x8, u32, _mm256_cmpeq_epi32(), [0, 1, 2, 3, 4, 5, 6, 7]; 104 | } 105 | } 106 | 107 | rust_fallback_eq! { 108 | impl Eq for i32x8 where "avx2" { 109 | eq_mask, eq => u32x8, u32, _mm256_cmpeq_epi32(), [0, 1, 2, 3, 4, 5, 6, 7]; 110 | } 111 | } 112 | 113 | rust_fallback_eq! { 114 | impl Eq for f32x8 where "avx" { 115 | eq_mask, eq => u32x8, u32, _mm256_cmp_ps(0x00), [0, 1, 2, 3, 4, 5, 6, 7]; 116 | } 117 | } 118 | 119 | rust_fallback_eq! { 120 | impl Eq for f64x4 where "avx" { 121 | eq_mask, eq => u64x4, u64, _mm256_cmp_pd(0x00), [0, 1, 2, 3]; 122 | } 123 | } 124 | 125 | rust_fallback_eq! 
{ 126 | impl Eq for u64x4 where "avx2" { 127 | eq_mask, eq => u64x4, u64, _mm256_cmpeq_epi64(), [0, 1, 2, 3]; 128 | } 129 | } 130 | 131 | rust_fallback_eq! { 132 | impl Eq for i64x4 where "avx2" { 133 | eq_mask, eq => u64x4, u64, _mm256_cmpeq_epi64(), [0, 1, 2, 3]; 134 | } 135 | } 136 | 137 | #[cfg(test)] mod tests { 138 | use crate::prelude::*; 139 | use crate::arch::current::vecs::*; 140 | 141 | // test_packed_eq!(u8x64, u8, u8x64, u8, test_eq_u8x64); 142 | test_packed_eq!(u8x32, u8, u8x32, u8, test_eq_u8x32); 143 | test_packed_eq!(u8x16, u8, u8x16, u8, test_eq_u8x16); 144 | // test_packed_eq!(i8x64, i8, u8x64, u8, test_eq_i8x64); 145 | test_packed_eq!(i8x32, i8, u8x32, u8, test_eq_i8x32); 146 | test_packed_eq!(i8x16, i8, u8x16, u8, test_eq_i8x16); 147 | // test_packed_eq!(u16x32, u16, u16x32, u16, test_eq_u16x32); 148 | test_packed_eq!(u16x16, u16, u16x16, u16, test_eq_u16x16); 149 | test_packed_eq!(u16x8, u16, u16x8, u16, test_eq_u16x8); 150 | // test_packed_eq!(i16x32, i16, u16x32, u16, test_eq_i16x32); 151 | test_packed_eq!(i16x16, i16, u16x16, u16, test_eq_i16x16); 152 | test_packed_eq!(i16x8, i16, u16x8, u16, test_eq_i16x8); 153 | // test_packed_eq!(u32x16, u32, u32x16, u32, test_eq_u32x16); 154 | test_packed_eq!(u32x8, u32, u32x8, u32, test_eq_u32x8); 155 | test_packed_eq!(u32x4, u32, u32x4, u32, test_eq_u32x4); 156 | // test_packed_eq!(i32x16, i32, u32x16, u32, test_eq_i32x16); 157 | test_packed_eq!(i32x8, i32, u32x8, u32, test_eq_i32x8); 158 | test_packed_eq!(i32x4, i32, u32x4, u32, test_eq_i32x4); 159 | // test_packed_eq!(f32x16, f32, u32x16, u32, test_eq_f32x16); 160 | test_packed_eq!(f32x8, f32, u32x8, u32, test_eq_f32x8); 161 | test_packed_eq!(f32x4, f32, u32x4, u32, test_eq_f32x4); 162 | // test_packed_eq!(u64x8, u64, u64x8, u64, test_eq_u64x8); 163 | test_packed_eq!(u64x4, u64, u64x4, u64, test_eq_u64x4); 164 | test_packed_eq!(u64x2, u64, u64x2, u64, test_eq_u64x2); 165 | // test_packed_eq!(i64x8, i64, u64x8, u64, test_eq_i64x8); 166 | test_packed_eq!(i64x4, i64, u64x4, u64, test_eq_i64x4); 167 | test_packed_eq!(i64x2, i64, u64x2, u64, test_eq_i64x2); 168 | // test_packed_eq!(f64x8, f64, u64x8, u64, test_eq_f64x8); 169 | test_packed_eq!(f64x4, f64, u64x4, u64, test_eq_f64x4); 170 | test_packed_eq!(f64x2, f64, u64x2, u64, test_eq_f64x2); 171 | } 172 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
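// (Editor's sketch, not part of the original source.) `HAdd` is a horizontal
// add: each output lane holds the sum of an adjacent pair of input lanes, with
// pair sums from `self` and `other` interleaved. This layout is inferred from
// the scalar `hop!` polyfill at the bottom of this file; the lane shuffles in
// the SSE3/AVX2 paths exist only to make the hardware intrinsics produce the
// same lane order as that polyfill. Assuming the crate's `f32x4` and the
// `HAdd` trait:
//
//     let a = f32x4::new(1.0, 2.0, 3.0, 4.0);
//     let b = f32x4::new(10.0, 20.0, 30.0, 40.0);
//     // pair sums, interleaved: [a0+a1, b0+b1, a2+a3, b2+b3]
//     assert_eq!(a.hadd(b), f32x4::new(3.0, 30.0, 7.0, 70.0));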
7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::transmute::*; 11 | use crate::intrin::hadd::*; 12 | use crate::core::ops::Add; 13 | use crate::arch::current::vecs::*; 14 | use crate::vecs::*; 15 | 16 | #[cfg(target_feature = "sse3")] 17 | impl HAdd for f32x4 { 18 | #[inline(always)] 19 | fn hadd(&self, other: Self) -> Self { 20 | optimized!(); 21 | unsafe { _mm_hadd_ps(_mm_shuffle_ps(*self, other, 0b01000100), 22 | _mm_shuffle_ps(*self, other, 0b11101110)) } 23 | } 24 | } 25 | 26 | #[cfg(target_feature = "sse3")] 27 | impl HAdd for f64x2 { 28 | #[inline(always)] 29 | #[cfg(target_feature = "sse3")] 30 | fn hadd(&self, other: Self) -> Self { 31 | optimized!(); 32 | unsafe { _mm_hadd_pd(*self, other) } 33 | } 34 | } 35 | 36 | #[cfg(target_feature = "avx2")] 37 | impl HAdd for f32x8 { 38 | #[inline(always)] 39 | fn hadd(&self, other: Self) -> Self { 40 | optimized!(); 41 | unsafe { _mm256_hadd_ps(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), 42 | _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked()) } 43 | } 44 | } 45 | 46 | #[cfg(target_feature = "avx")] 47 | impl HAdd for f64x4 { 48 | #[inline(always)] 49 | fn hadd(&self, other: Self) -> Self { 50 | optimized!(); 51 | unsafe { _mm256_hadd_pd(*self, other) } 52 | } 53 | } 54 | 55 | #[cfg(target_feature = "ssse3")] 56 | impl HAdd for i16x8 { 57 | #[inline(always)] 58 | fn hadd(&self, other: Self) -> Self { 59 | optimized!(); 60 | unsafe { _mm_hadd_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 61 | _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 62 | } 63 | } 64 | 65 | #[cfg(target_feature = "ssse3")] 66 | impl HAdd for i32x4 { 67 | #[inline(always)] 68 | fn hadd(&self, other: Self) -> Self { 69 | optimized!(); 70 | unsafe { _mm_hadd_epi32(_mm_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), 71 | _mm_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } 72 | } 73 | } 74 | 75 | #[cfg(target_feature = "avx2")] 76 | impl HAdd for i16x16 { 77 | #[inline(always)] 78 | fn hadd(&self, other: Self) -> Self { 79 | optimized!(); 80 | unsafe { _mm256_hadd_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 81 | _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 82 | } 83 | } 84 | 85 | #[cfg(target_feature = "avx2")] 86 | impl HAdd for i32x8 { 87 | #[inline(always)] 88 | fn hadd(&self, other: Self) -> Self { 89 | optimized!(); 90 | unsafe { _mm256_hadd_epi32(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), 91 | _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } 92 | } 93 | } 94 | 95 | impl HAdd for u64x2 { hop!(hadd, Add::add, 0, 1); } 96 | impl HAdd for u64x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 97 | impl HAdd for u64x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 98 | impl HAdd for u32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 99 | impl HAdd for u32x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 100 | impl HAdd for u32x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 101 | impl HAdd for u16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 102 | impl HAdd for u16x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 103 | impl HAdd for u16x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 104 | impl HAdd for u8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15); } 105 | impl HAdd for u8x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 106 | impl HAdd for u8x64 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 107 | impl HAdd for i64x2 { hop!(hadd, Add::add, 0, 1); } 108 | impl HAdd for i64x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 109 | impl HAdd for i64x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 110 | #[cfg(not(target_feature = "ssse3"))] 111 | impl HAdd for i32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 112 | #[cfg(not(target_feature = "avx2"))] 113 | impl HAdd for i32x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 114 | impl HAdd for i32x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 115 | #[cfg(not(target_feature = "ssse3"))] 116 | impl HAdd for i16x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 117 | #[cfg(not(target_feature = "avx2"))] 118 | impl HAdd for i16x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 119 | impl HAdd for i16x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 120 | impl HAdd for i8x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 121 | impl HAdd for i8x32 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 122 | impl HAdd for i8x64 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 123 | #[cfg(not(target_feature = "sse3"))] 124 | impl HAdd for f64x2 { hop!(hadd, Add::add, 0, 1); } 125 | #[cfg(not(target_feature = "avx"))] 126 | impl HAdd for f64x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 127 | impl HAdd for f64x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 128 | #[cfg(not(target_feature = "sse3"))] 129 | impl HAdd for f32x4 { hop!(hadd, Add::add, 0, 1, 2, 3); } 130 | #[cfg(not(target_feature = "avx2"))] 131 | impl HAdd for f32x8 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7); } 132 | impl HAdd for f32x16 { hop!(hadd, Add::add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 133 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
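// (Editor's sketch, not part of the original source.) `HSub` mirrors `HAdd`:
// each output lane is the difference of an adjacent input pair (even lane
// minus odd lane), again interleaving results from `self` and `other`.
// Assuming the crate's `f32x4` and the `HSub` trait:
//
//     let a = f32x4::new(5.0, 1.0, 9.0, 2.0);
//     let b = f32x4::new(40.0, 10.0, 80.0, 20.0);
//     // pair differences, interleaved: [a0-a1, b0-b1, a2-a3, b2-b3]
//     assert_eq!(a.hsub(b), f32x4::new(4.0, 30.0, 7.0, 60.0));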
7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::transmute::*; 13 | use crate::intrin::hsub::*; 14 | use crate::core::ops::Sub; 15 | 16 | #[cfg(target_feature = "sse3")] 17 | impl HSub for f32x4 { 18 | #[inline(always)] 19 | fn hsub(&self, other: Self) -> Self { 20 | optimized!(); 21 | unsafe { _mm_hsub_ps(_mm_shuffle_ps(*self, other, 0b01000100), 22 | _mm_shuffle_ps(*self, other, 0b11101110)) } 23 | } 24 | } 25 | 26 | #[cfg(target_feature = "sse3")] 27 | impl HSub for f64x2 { 28 | #[inline(always)] 29 | fn hsub(&self, other: Self) -> Self { 30 | optimized!(); 31 | unsafe { _mm_hsub_pd(*self, other) } 32 | } 33 | } 34 | 35 | #[cfg(target_feature = "avx2")] 36 | impl HSub for f32x8 { 37 | #[inline(always)] 38 | fn hsub(&self, other: Self) -> Self { 39 | optimized!(); 40 | unsafe { _mm256_hsub_ps(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked(), 41 | _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_f32s_unchecked()) } 42 | } 43 | } 44 | 45 | #[cfg(target_feature = "avx")] 46 | impl HSub for f64x4 { 47 | #[inline(always)] 48 | fn hsub(&self, other: Self) -> Self { 49 | optimized!(); 50 | unsafe { _mm256_hsub_pd(*self, other) } 51 | } 52 | } 53 | 54 | #[cfg(target_feature = "ssse3")] 55 | impl HSub for i16x8 { 56 | #[inline(always)] 57 | fn hsub(&self, other: Self) -> Self { 58 | optimized!(); 59 | unsafe { _mm_hsub_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 60 | _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 61 | } 62 | } 63 | 64 | #[cfg(target_feature = "ssse3")] 65 | impl HSub for i32x4 { 66 | #[inline(always)] 67 | fn hsub(&self, other: Self) -> Self { 68 | optimized!(); 69 | unsafe { _mm_hsub_epi32(_mm_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), 70 | _mm_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } 71 | } 72 | } 73 | 74 | #[cfg(target_feature = "avx2")] 75 | impl HSub for i16x16 { 76 | #[inline(always)] 77 | fn hsub(&self, other: Self) -> Self { 78 | optimized!(); 79 | unsafe { _mm256_hsub_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 80 | _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 81 | } 82 | } 83 | 84 | #[cfg(target_feature = "avx2")] 85 | impl HSub for i32x8 { 86 | #[inline(always)] 87 | fn hsub(&self, other: Self) -> Self { 88 | optimized!(); 89 | unsafe { _mm256_hsub_epi32(_mm256_unpacklo_epi64(self.be_i64s(), other.be_i64s()).be_i32s(), 90 | _mm256_unpackhi_epi64(self.be_i64s(), other.be_i64s()).be_i32s()) } 91 | } 92 | } 93 | 94 | impl HSub for u64x2 { hop!(hsub, Sub::sub, 0, 1); } 95 | impl HSub for u64x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 96 | impl HSub for u64x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 97 | impl HSub for u32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 98 | impl HSub for u32x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 99 | impl HSub for u32x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 100 | impl HSub for u16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 101 | impl HSub for u16x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 102 | impl HSub for u16x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 103 | impl HSub for u8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 104 | 
impl HSub for u8x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 105 | impl HSub for u8x64 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 106 | impl HSub for i64x2 { hop!(hsub, Sub::sub, 0, 1); } 107 | impl HSub for i64x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 108 | impl HSub for i64x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 109 | #[cfg(not(target_feature = "ssse3"))] 110 | impl HSub for i32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 111 | #[cfg(not(target_feature = "avx2"))] 112 | impl HSub for i32x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 113 | impl HSub for i32x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 114 | #[cfg(not(target_feature = "ssse3"))] 115 | impl HSub for i16x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 116 | #[cfg(not(target_feature = "avx2"))] 117 | impl HSub for i16x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 118 | impl HSub for i16x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 119 | impl HSub for i8x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 120 | impl HSub for i8x32 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 121 | impl HSub for i8x64 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 122 | #[cfg(not(target_feature = "sse3"))] 123 | impl HSub for f64x2 { hop!(hsub, Sub::sub, 0, 1); } 124 | #[cfg(not(target_feature = "avx"))] 125 | impl HSub for f64x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 126 | impl HSub for f64x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 127 | #[cfg(not(target_feature = "sse3"))] 128 | impl HSub for f32x4 { hop!(hsub, Sub::sub, 0, 1, 2, 3); } 129 | #[cfg(not(target_feature = "avx2"))] 130 | impl HSub for f32x8 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7); } 131 | impl HSub for f32x16 { hop!(hsub, Sub::sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 132 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/merge.rs: -------------------------------------------------------------------------------- 1 | use crate::arch::current::vecs::*; 2 | use crate::vecs::*; 3 | use crate::vec_patterns::*; 4 | use crate::vektor::x86_64::*; 5 | use crate::vektor::x86::*; 6 | use crate::intrin::transmute::*; 7 | use crate::intrin::merge::*; 8 | use crate::core::mem::transmute; 9 | 10 | // TODO: The AVX-512 version of this macro doesn't work; impl when stdsimd gets 11 | // around to it (and when I have some hardware to test it on). 
12 | impl_packed_merge!(u8x16, u8x16, u8, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 13 | impl_packed_merge!(u8x32, u8x32, u8, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 14 | impl_packed_merge!(u8x64, u8x64, u8, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), (32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); 15 | 16 | impl_packed_merge!(u16x8, u16x8, u16, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 17 | impl_packed_merge!(u16x16, u16x16, u16, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 18 | impl_packed_merge!(u16x32, u16x32, u16, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 19 | 20 | impl_packed_merge!(u32x4, u32x4, u32, _mm_blendv_epi8, "sse4.1", (0, 1), (2, 3), 0, 1, 2, 3); 21 | impl_packed_merge!(u32x8, u32x8, u32, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 22 | impl_packed_merge!(u32x16, u32x16, u32, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 23 | 24 | impl_packed_merge!(u64x2, u64x2, u64, _mm_blendv_epi8, "sse4.1", (0), (1), 0, 1); 25 | impl_packed_merge!(u64x4, u64x4, u64, _mm256_blendv_epi8, "avx2", (0, 1), (2, 3), 0, 1, 2, 3); 26 | impl_packed_merge!(u64x8, u64x8, u64, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 27 | 28 | impl_packed_merge!(i8x16, u8x16, u8, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 29 | impl_packed_merge!(i8x32, u8x32, u8, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 30 | impl_packed_merge!(i8x64, u8x64, u8, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), (32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); 
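// (Editor's note, not part of the original source.) The two index tuples in
// each invocation above and below are the lower-half and upper-half lane
// lists for the generated merge. Judging by its use in destride.rs,
// `merge_halves` keeps every element at its original index: the low half
// comes from `self` and the high half from `other`. A sketch of the expected
// behavior, assuming the crate's `u8x16`:
//
//     let a = u8x16::splat(1);
//     let b = u8x16::splat(2);
//     // low eight lanes from `a`, high eight lanes from `b`
//     assert_eq!(a.merge_halves(b),
//                u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2));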
31 | 32 | impl_packed_merge!(i16x8, u16x8, u16, _mm_blendv_epi8, "sse4.1", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 33 | impl_packed_merge!(i16x16, u16x16, u16, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 34 | impl_packed_merge!(i16x32, u16x32, u16, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); 35 | 36 | impl_packed_merge!(i32x4, u32x4, u32, _mm_blendv_epi8, "sse4.1", (0, 1), (2, 3), 0, 1, 2, 3); 37 | impl_packed_merge!(i32x8, u32x8, u32, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 38 | impl_packed_merge!(i32x16, u32x16, u32, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 39 | 40 | impl_packed_merge!(i64x2, u64x2, u64, _mm_blendv_epi8, "sse4.1", (0), (1), 0, 1); 41 | impl_packed_merge!(i64x4, u64x4, u64, _mm256_blendv_epi8, "avx2", (0, 1), (2, 3), 0, 1, 2, 3); 42 | impl_packed_merge!(i64x8, u64x8, u64, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 43 | 44 | impl_packed_merge!(f32x4, u32x4, u32, _mm_blendv_epi8, "sse4.1", (0, 1), (2, 3), 0, 1, 2, 3); 45 | impl_packed_merge!(f32x8, u32x8, u32, _mm256_blendv_epi8, "avx2", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 46 | impl_packed_merge!(f32x16, u32x16, u32, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 47 | 48 | impl_packed_merge!(f64x2, u64x2, u64, _mm_blendv_epi8, "sse4.1", (0), (1), 0, 1); 49 | impl_packed_merge!(f64x4, u64x4, u64, _mm256_blendv_epi8, "avx2", (0, 1), (2, 3), 0, 1, 2, 3); 50 | impl_packed_merge!(f64x8, u64x8, u64, _mm512_mask_mov_epi8, "avx512-butnotyet", (0, 1, 2, 3), (4, 5, 6, 7), 0, 1, 2, 3, 4, 5, 6, 7); 51 | 52 | #[cfg(test)] mod tests { 53 | use crate::prelude::*; 54 | use crate::arch::current::vecs::*; 55 | 56 | test_packed_merge!( 57 | (u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2), 58 | (merge_u8x64, merge_u8x32, merge_u8x16, merge_i8x64, merge_i8x32, merge_i8x16, merge_u16x32, merge_u16x16, merge_u16x8, merge_i16x32, merge_i16x16, merge_i16x8, merge_u32x16, merge_u32x8, merge_u32x4, merge_i32x16, merge_i32x8, merge_i32x4, merge_f32x16, merge_f32x8, merge_f32x4, merge_u64x8, merge_u64x4, merge_u64x2, merge_i64x8, merge_i64x4, merge_i64x2, merge_f64x8, merge_f64x4, merge_f64x2)); 59 | } 60 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/mod.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
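// (Editor's note, not part of the original source.) This module is pure
// plumbing: each intrinsic trait impl lives in its own file, and the internal
// `prelude` below re-exports them so the crate's public prelude can expose
// every x86 implementation with a single glob import. Downstream code would
// typically write (assuming the crate name `faster` from the file headers):
//
//     use faster::prelude::*;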
7 | 8 | #![allow(unused_imports)] 9 | 10 | mod abs; 11 | mod addsub; 12 | mod cmp; 13 | mod destride; 14 | mod downcast; 15 | mod endian; 16 | mod eq; 17 | mod hadd; 18 | mod hsub; 19 | mod merge; 20 | mod popcnt; 21 | mod recip; 22 | mod round; 23 | mod sum; 24 | mod rsqrt; 25 | mod saturating_add; 26 | mod saturating_hadd; 27 | mod saturating_sub; 28 | mod saturating_hsub; 29 | mod sqrt; 30 | mod transmute; 31 | mod upcast; 32 | 33 | // We use an internal prelude not to clutter the namespace when we import 34 | // from the actual prelude. 35 | pub mod prelude { 36 | pub use super::abs::*; 37 | pub use super::addsub::*; 38 | pub use super::cmp::*; 39 | pub use super::destride::*; 40 | pub use super::downcast::*; 41 | pub use super::endian::*; 42 | pub use super::eq::*; 43 | pub use super::hadd::*; 44 | pub use super::hsub::*; 45 | pub use super::merge::*; 46 | pub use super::popcnt::*; 47 | pub use super::recip::*; 48 | pub use super::round::*; 49 | pub use super::rsqrt::*; 50 | pub use super::sum::*; 51 | pub use super::saturating_add::*; 52 | pub use super::saturating_hadd::*; 53 | pub use super::saturating_hsub::*; 54 | pub use super::saturating_sub::*; 55 | pub use super::transmute::*; 56 | pub use super::upcast::*; 57 | } 58 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/popcnt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::sum::*; 11 | use crate::intrin::transmute::*; 12 | use crate::intrin::popcnt::*; 13 | use crate::arch::current::intrin::upcast::*; 14 | use crate::intrin::sum::UpcastSum; 15 | use crate::arch::current::vecs::*; 16 | use crate::intrin::upcast::*; 17 | use crate::vecs::*; 18 | 19 | #[inline(always)] 20 | #[cfg(target_feature = "ssse3")] 21 | unsafe fn popcnt128(v: u8x16) -> usize { 22 | // SSSE3 popcnt algorithm by Wojciech Muła 23 | // http://wm.ite.pl/articles/sse-popcount.html 24 | optimized!(); 25 | let lookup = i8x16::new(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); 26 | let lo = v.be_u8s() & 0x0f; 27 | let hi: u8x16 = v.be_u8s() >> 4; 28 | (_mm_shuffle_epi8(lookup, hi.be_i8s()).be_u8s() 29 | + _mm_shuffle_epi8(lookup, lo.be_i8s()).be_u8s()) 30 | .sum_upcast() as usize 31 | } 32 | 33 | #[inline(always)] 34 | #[cfg(not(target_feature = "ssse3"))] 35 | #[allow(unused_unsafe)] 36 | unsafe fn popcnt128(v: u8x16) -> usize { 37 | fallback!(); 38 | v.be_u64s().
scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) 39 | } 40 | 41 | #[inline(always)] 42 | #[cfg(target_feature = "avx2")] 43 | unsafe fn popcnt256(v: u8x32) -> usize { 44 | // AVX2 popcnt algorithm by Wojciech Muła, Nathan Kurz, and Daniel Lemire 45 | // https://arxiv.org/abs/1611.07612 46 | optimized!(); 47 | let lookup = i8x32::new(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 48 | 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); 49 | let lo = v.be_u8s() & 0x0f; 50 | let hi: u8x32 = v.be_u8s() >> 4; 51 | (_mm256_shuffle_epi8(lookup, hi.be_i8s()).be_u8s() 52 | + _mm256_shuffle_epi8(lookup, lo.be_i8s()).be_u8s()) 53 | .sum_upcast() as usize 54 | } 55 | 56 | #[inline(always)] 57 | #[cfg(not(target_feature = "avx2"))] 58 | #[allow(unused_unsafe)] 59 | unsafe fn popcnt256(v: u8x32) -> usize { 60 | fallback!(); 61 | v.be_u64s().scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) 62 | } 63 | 64 | #[inline(always)] 65 | // #[cfg(not(target_feature = "avx512"))] 66 | unsafe fn popcnt512(v: u8x64) -> usize { 67 | fallback!(); 68 | v.be_u64s().scalar_reduce(0, |acc, s| acc + (s.count_ones() as usize)) 69 | } 70 | 71 | impl_popcnt!(u8x64, popcnt512, u8x32, popcnt256, u8x16, popcnt128); 72 | impl_popcnt!(i8x64, popcnt512, i8x32, popcnt256, i8x16, popcnt128); 73 | impl_popcnt!(u16x32, popcnt512, u16x16, popcnt256, u16x8, popcnt128); 74 | impl_popcnt!(i16x32, popcnt512, i16x16, popcnt256, i16x8, popcnt128); 75 | impl_popcnt!(u32x16, popcnt512, u32x8, popcnt256, u32x4, popcnt128); 76 | impl_popcnt!(i32x16, popcnt512, i32x8, popcnt256, i32x4, popcnt128); 77 | impl_popcnt!(u64x8, popcnt512, u64x4, popcnt256, u64x2, popcnt128); 78 | impl_popcnt!(i64x8, popcnt512, i64x4, popcnt256, i64x2, popcnt128); 79 | 80 | #[cfg(test)] 81 | mod tests { 82 | use crate::prelude::*; 83 | use crate::arch::current::vecs::*; 84 | 85 | test_popcnt!((u8, u8, u8, i8, i8, i8, u16, u16, u16, i16, i16, i16, u32, u32, u32, i32, i32, i32, u64, u64, u64, i64, i64, i64), 86 | (u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2), 87 | (popcnt_u8x64, popcnt_u8x32, popcnt_u8x16, popcnt_i8x64, popcnt_i8x32, popcnt_i8x16, popcnt_u16x32, popcnt_u16x16, popcnt_u16x8, popcnt_i16x32, popcnt_i16x16, popcnt_i16x8, popcnt_u32x16, popcnt_u32x8, popcnt_u32x4, popcnt_i32x16, popcnt_i32x8, popcnt_i32x4, popcnt_u64x8, popcnt_u64x4, popcnt_u64x2, popcnt_i64x8, popcnt_i64x4, popcnt_i64x2)); 88 | } 89 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/recip.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::recip::Recip; 13 | 14 | rust_fallback_impl! { 15 | impl Recip for f32x8 where "avx" { 16 | recip => _mm256_rcp_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 17 | } 18 | } 19 | 20 | rust_fallback_impl! 
{ 21 | impl Recip for f32x4 where "sse" { 22 | recip => _mm_rcp_ps(), [0, 1, 2, 3]; 23 | } 24 | } 25 | 26 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/round.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::round::Round; 11 | use crate::core::arch::x86_64::{_MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TRUNC}; 12 | use crate::arch::current::vecs::*; 13 | use crate::vecs::*; 14 | 15 | rust_fallback_impl! { 16 | impl Round for f32x4 where "sse4.1" { 17 | round => _mm_round_ps(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3]; 18 | ceil => _mm_ceil_ps(), [0, 1, 2, 3]; 19 | floor => _mm_floor_ps(), [0, 1, 2, 3]; 20 | trunc => _mm_round_ps(_MM_FROUND_TRUNC), [0, 1, 2, 3]; 21 | } 22 | } 23 | 24 | rust_fallback_impl! { 25 | impl Round for f64x2 where "sse4.1" { 26 | round => _mm_round_pd(_MM_FROUND_TO_NEAREST_INT), [0, 1]; 27 | ceil => _mm_ceil_pd(), [0, 1]; 28 | floor => _mm_floor_pd(), [0, 1]; 29 | trunc => _mm_round_pd(_MM_FROUND_TRUNC), [0, 1]; 30 | } 31 | } 32 | 33 | rust_fallback_impl! { 34 | impl Round for f32x8 where "avx" { 35 | round => _mm256_round_ps(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3, 4, 5, 6, 7]; 36 | ceil => _mm256_ceil_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 37 | floor => _mm256_floor_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 38 | trunc => _mm256_round_ps(_MM_FROUND_TRUNC), [0, 1, 2, 3, 4, 5, 6, 7]; 39 | } 40 | } 41 | 42 | rust_fallback_impl! { 43 | impl Round for f64x4 where "avx" { 44 | round => _mm256_round_pd(_MM_FROUND_TO_NEAREST_INT), [0, 1, 2, 3]; 45 | ceil => _mm256_ceil_pd(), [0, 1, 2, 3]; 46 | floor => _mm256_floor_pd(), [0, 1, 2, 3]; 47 | trunc => _mm256_round_pd(_MM_FROUND_TRUNC), [0, 1, 2, 3]; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/rsqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::rsqrt::*; 11 | use crate::arch::current::vecs::*; 12 | use crate::vecs::*; 13 | 14 | // TODO: Guards and non-simd 15 | 16 | rust_fallback_impl! { 17 | impl Rsqrt for f32x8 where "avx" { 18 | rsqrt => _mm256_rsqrt_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 19 | } 20 | } 21 | 22 | rust_fallback_impl!
{ 23 | impl Rsqrt for f32x4 where "sse" { 24 | rsqrt => _mm_rsqrt_ps(), [0, 1, 2, 3]; 25 | } 26 | } 27 | 28 | impl Rsqrt for f32 { 29 | #[inline(always)] 30 | fn rsqrt(&self) -> Self { 31 | self.sqrt().recip() 32 | } 33 | } 34 | 35 | impl Rsqrt for f64 { 36 | #[inline(always)] 37 | fn rsqrt(&self) -> Self { 38 | self.sqrt().recip() 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/saturating_add.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::saturating_add::*; 13 | 14 | rust_fallback_impl_binary! { 15 | impl SaturatingAdd for u8x16 where "sse2" { 16 | saturating_add => _mm_adds_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 17 | } 18 | } 19 | 20 | rust_fallback_impl_binary! { 21 | impl SaturatingAdd for i8x16 where "sse2" { 22 | saturating_add => _mm_adds_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 23 | } 24 | } 25 | 26 | rust_fallback_impl_binary! { 27 | impl SaturatingAdd for u16x8 where "sse2" { 28 | saturating_add => _mm_adds_epu16(), [0, 1, 2, 3, 4, 5, 6, 7]; 29 | } 30 | } 31 | 32 | rust_fallback_impl_binary! { 33 | impl SaturatingAdd for i16x8 where "sse2" { 34 | saturating_add => _mm_adds_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 35 | } 36 | } 37 | 38 | rust_fallback_impl_binary! { 39 | impl SaturatingAdd for u8x32 where "avx2" { 40 | saturating_add => _mm256_adds_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 41 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 42 | 43 | } 44 | } 45 | 46 | rust_fallback_impl_binary! { 47 | impl SaturatingAdd for i8x32 where "avx2" { 48 | saturating_add => _mm256_adds_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 49 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 50 | } 51 | } 52 | 53 | rust_fallback_impl_binary! { 54 | impl SaturatingAdd for u16x16 where "avx2" { 55 | saturating_add => _mm256_adds_epu16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 56 | } 57 | } 58 | 59 | rust_fallback_impl_binary! { 60 | impl SaturatingAdd for i16x16 where "avx2" { 61 | saturating_add => _mm256_adds_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/saturating_hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
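// (Editor's sketch, not part of the original source.) `SaturatingHAdd` uses
// the same interleaved pair-sum layout as `HAdd`, but each pair sum clamps to
// the lane type's bounds instead of wrapping. Assuming the crate's `i16x8`,
// the `SaturatingHAdd` trait, and `extract` from the underlying packed_simd
// vector type:
//
//     let a = i16x8::new(30_000, 10_000, 0, 0, 0, 0, 0, 0);
//     let b = i16x8::splat(0);
//     // 30_000 + 10_000 saturates to i16::MAX (32_767) instead of wrapping
//     assert_eq!(a.saturating_hadd(b).extract(0), 32_767);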
7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::transmute::*; 13 | use crate::intrin::saturating_hadd::*; 14 | 15 | #[cfg(target_feature = "ssse3")] 16 | impl SaturatingHAdd for i16x8 { 17 | #[inline(always)] 18 | fn saturating_hadd(&self, other: Self) -> Self { 19 | optimized!(); 20 | unsafe { _mm_hadds_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 21 | _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 22 | } 23 | } 24 | 25 | #[cfg(target_feature = "avx2")] 26 | impl SaturatingHAdd for i16x16 { 27 | #[inline(always)] 28 | fn saturating_hadd(&self, other: Self) -> Self { 29 | optimized!(); 30 | unsafe { _mm256_hadds_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 31 | _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 32 | } 33 | } 34 | 35 | impl SaturatingHAdd for u64x2 { hop!(saturating_hadd, u64::saturating_add, 0, 1); } 36 | impl SaturatingHAdd for u64x4 { hop!(saturating_hadd, u64::saturating_add, 0, 1, 2, 3); } 37 | impl SaturatingHAdd for u64x8 { hop!(saturating_hadd, u64::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 38 | impl SaturatingHAdd for u32x4 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3); } 39 | impl SaturatingHAdd for u32x8 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 40 | impl SaturatingHAdd for u32x16 { hop!(saturating_hadd, u32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 41 | impl SaturatingHAdd for u16x8 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 42 | impl SaturatingHAdd for u16x16 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 43 | impl SaturatingHAdd for u16x32 { hop!(saturating_hadd, u16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 44 | impl SaturatingHAdd for u8x16 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 45 | impl SaturatingHAdd for u8x32 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 46 | impl SaturatingHAdd for u8x64 { hop!(saturating_hadd, u8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 47 | impl SaturatingHAdd for i64x2 { hop!(saturating_hadd, i64::saturating_add, 0, 1); } 48 | impl SaturatingHAdd for i64x4 { hop!(saturating_hadd, i64::saturating_add, 0, 1, 2, 3); } 49 | impl SaturatingHAdd for i64x8 { hop!(saturating_hadd, i64::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 50 | impl SaturatingHAdd for i32x4 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3); } 51 | impl SaturatingHAdd for i32x8 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 52 | impl SaturatingHAdd for i32x16 { hop!(saturating_hadd, i32::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 53 | #[cfg(not(target_feature = "ssse3"))] 54 | impl SaturatingHAdd for i16x8 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7); } 55 | #[cfg(not(target_feature = "avx2"))] 56 | impl SaturatingHAdd for i16x16 { 
hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 57 | impl SaturatingHAdd for i16x32 { hop!(saturating_hadd, i16::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 58 | impl SaturatingHAdd for i8x16 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 59 | impl SaturatingHAdd for i8x32 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 60 | impl SaturatingHAdd for i8x64 { hop!(saturating_hadd, i8::saturating_add, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 61 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/saturating_hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::arch::current::vecs::*; 11 | use crate::vecs::*; 12 | use crate::intrin::transmute::*; 13 | use crate::intrin::saturating_hsub::*; 14 | 15 | #[cfg(target_feature = "ssse3")] 16 | impl SaturatingHSub for i16x8 { 17 | #[inline(always)] 18 | fn saturating_hsub(&self, other: Self) -> Self { 19 | optimized!(); 20 | unsafe { _mm_hsubs_epi16(_mm_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 21 | _mm_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 22 | } 23 | } 24 | 25 | #[cfg(target_feature = "avx2")] 26 | impl SaturatingHSub for i16x16 { 27 | #[inline(always)] 28 | fn saturating_hsub(&self, other: Self) -> Self { 29 | optimized!(); 30 | unsafe { _mm256_hsubs_epi16(_mm256_unpacklo_epi32(self.be_i32s(), other.be_i32s()).be_i16s(), 31 | _mm256_unpackhi_epi32(self.be_i32s(), other.be_i32s()).be_i16s()) } 32 | } 33 | } 34 | 35 | impl SaturatingHSub for u64x2 { hop!(saturating_hsub, u64::saturating_sub, 0, 1); } 36 | impl SaturatingHSub for u64x4 { hop!(saturating_hsub, u64::saturating_sub, 0, 1, 2, 3); } 37 | impl SaturatingHSub for u64x8 { hop!(saturating_hsub, u64::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 38 | impl SaturatingHSub for u32x4 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3); } 39 | impl SaturatingHSub for u32x8 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 40 | impl SaturatingHSub for u32x16 { hop!(saturating_hsub, u32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 41 | impl SaturatingHSub for u16x8 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 42 | impl SaturatingHSub for u16x16 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 43 | impl SaturatingHSub for u16x32 { hop!(saturating_hsub, u16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 44 | impl SaturatingHSub for u8x16 { 
hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 45 | impl SaturatingHSub for u8x32 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 46 | impl SaturatingHSub for u8x64 { hop!(saturating_hsub, u8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 47 | impl SaturatingHSub for i64x2 { hop!(saturating_hsub, i64::saturating_sub, 0, 1); } 48 | impl SaturatingHSub for i64x4 { hop!(saturating_hsub, i64::saturating_sub, 0, 1, 2, 3); } 49 | impl SaturatingHSub for i64x8 { hop!(saturating_hsub, i64::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 50 | impl SaturatingHSub for i32x4 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3); } 51 | impl SaturatingHSub for i32x8 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 52 | impl SaturatingHSub for i32x16 { hop!(saturating_hsub, i32::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 53 | #[cfg(not(target_feature = "ssse3"))] 54 | impl SaturatingHSub for i16x8 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7); } 55 | #[cfg(not(target_feature = "avx2"))] 56 | impl SaturatingHSub for i16x16 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 57 | impl SaturatingHSub for i16x32 { hop!(saturating_hsub, i16::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 58 | impl SaturatingHSub for i8x16 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } 59 | impl SaturatingHSub for i8x32 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); } 60 | impl SaturatingHSub for i8x64 { hop!(saturating_hsub, i8::saturating_sub, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63); } 61 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/saturating_sub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::saturating_sub::*; 11 | use crate::arch::current::vecs::*; 12 | use crate::vecs::*; 13 | 14 | rust_fallback_impl_binary! { 15 | impl SaturatingSub for u8x16 where "sse2" { 16 | saturating_sub => _mm_subs_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 17 | } 18 | } 19 | 20 | rust_fallback_impl_binary! 
{ 21 | impl SaturatingSub for i8x16 where "sse2" { 22 | saturating_sub => _mm_subs_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 23 | } 24 | } 25 | 26 | rust_fallback_impl_binary! { 27 | impl SaturatingSub for u16x8 where "sse2" { 28 | saturating_sub => _mm_subs_epu16(), [0, 1, 2, 3, 4, 5, 6, 7]; 29 | } 30 | } 31 | 32 | rust_fallback_impl_binary! { 33 | impl SaturatingSub for i16x8 where "sse2" { 34 | saturating_sub => _mm_subs_epi16(), [0, 1, 2, 3, 4, 5, 6, 7]; 35 | } 36 | } 37 | 38 | rust_fallback_impl_binary! { 39 | impl SaturatingSub for u8x32 where "avx2" { 40 | saturating_sub => _mm256_subs_epu8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 41 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 42 | } 43 | } 44 | 45 | rust_fallback_impl_binary! { 46 | impl SaturatingSub for i8x32 where "avx2" { 47 | saturating_sub => _mm256_subs_epi8(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 48 | 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]; 49 | } 50 | } 51 | 52 | rust_fallback_impl_binary! { 53 | impl SaturatingSub for u16x16 where "avx2" { 54 | saturating_sub => _mm256_subs_epu16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 55 | } 56 | } 57 | 58 | rust_fallback_impl_binary! { 59 | impl SaturatingSub for i16x16 where "avx2" { 60 | saturating_sub => _mm256_subs_epi16(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/sqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::sqrt::*; 11 | use crate::arch::current::vecs::*; 12 | use crate::vecs::*; 13 | 14 | rust_fallback_impl! { 15 | impl Sqrt for f32x8 where "avx" { 16 | sqrt => _mm256_sqrt_ps(), [0, 1, 2, 3, 4, 5, 6, 7]; 17 | } 18 | } 19 | 20 | rust_fallback_impl! { 21 | impl Sqrt for f64x4 where "avx" { 22 | sqrt => _mm256_sqrt_pd(), [0, 1, 2, 3]; 23 | } 24 | } 25 | 26 | rust_fallback_impl! { 27 | impl Sqrt for f32x4 where "sse" { 28 | sqrt => _mm_sqrt_ps(), [0, 1, 2, 3]; 29 | } 30 | } 31 | 32 | rust_fallback_impl! { 33 | impl Sqrt for f64x2 where "sse2" { 34 | sqrt => _mm_sqrt_pd(), [0, 1]; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/arch/x86/intrin/transmute.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vektor::x86_64::*; 9 | use crate::vektor::x86::*; 10 | use crate::intrin::transmute::*; 11 | use crate::arch::current::vecs::*; 12 | use crate::vecs::*; 13 | use crate::core::mem::transmute; 14 | 15 | impl_packed_transmute!(u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, f32x8, 16 | u64x4, i64x4, f64x4, ...
17 | u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, 18 | f32x8, u64x4, i64x4, f64x4, 19 | "avx", "avx512"); 20 | impl_packed_transmute!(u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, f32x16, 21 | u64x8, i64x8, f64x8, ... 22 | u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, 23 | f32x16, u64x8, i64x8, f64x8, 24 | "avx512", "avx1024"); 25 | impl_packed_transmute!(u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, f32x4, 26 | u64x2, i64x2, f64x2, ... 27 | u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, 28 | f32x4, u64x2, i64x2, f64x2, 29 | "sse", "avx"); 30 | -------------------------------------------------------------------------------- /src/arch/x86/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod intrin; 2 | pub mod vecs; 3 | pub mod vec_patterns; 4 | -------------------------------------------------------------------------------- /src/arch/x86/vecs.rs: -------------------------------------------------------------------------------- 1 | pub use crate::vecs::*; 2 | pub use packed_simd::{u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2}; 3 | 4 | impl_packed!(u8, u8s, u8x64, 1, 64, ["avx512"], ["avx1024"]); 5 | impl_packed!(u8, u8s, u8x32, 1, 32, ["avx2"], ["avx512"]); 6 | impl_packed!(u8, u8s, u8x16, 1, 16, [], ["avx2"]); 7 | impl_packed!(i8, i8s, i8x64, 1, 64, ["avx512"], ["avx1024"]); 8 | impl_packed!(i8, i8s, i8x32, 1, 32, ["avx2"], ["avx512"]); 9 | impl_packed!(i8, i8s, i8x16, 1, 16, [], ["avx2"]); 10 | impl_packed!(u16, u16s, u16x32, 2, 32, ["avx512"], ["avx1024"]); 11 | impl_packed!(u16, u16s, u16x16, 2, 16, ["avx2"], ["avx512"]); 12 | impl_packed!(u16, u16s, u16x8, 2, 8, [], ["avx2"]); 13 | impl_packed!(i16, i16s, i16x32, 2, 32, ["avx512"], ["avx1024"]); 14 | impl_packed!(i16, i16s, i16x16, 2, 16, ["avx2"], ["avx512"]); 15 | impl_packed!(i16, i16s, i16x8, 2, 8, [], ["avx2"]); 16 | impl_packed!(u32, u32s, u32x16, 4, 16, ["avx512"], ["avx1024"]); 17 | impl_packed!(u32, u32s, u32x8, 4, 8, ["avx2"], ["avx512"]); 18 | impl_packed!(u32, u32s, u32x4, 4, 4, [], ["avx2"]); 19 | impl_packed!(i32, i32s, i32x16, 4, 16, ["avx512"], ["avx1024"]); 20 | impl_packed!(i32, i32s, i32x8, 4, 8, ["avx2"], ["avx512"]); 21 | impl_packed!(i32, i32s, i32x4, 4, 4, [], ["avx2"]); 22 | impl_packed!(f32, f32s, f32x16, 4, 16, ["avx512"], ["avx1024"]); 23 | impl_packed!(f32, f32s, f32x8, 4, 8, ["avx2"], ["avx512"]); 24 | impl_packed!(f32, f32s, f32x4, 4, 4, [], ["avx2"]); 25 | impl_packed!(u64, u64s, u64x8, 8, 8, ["avx512"], ["avx1024"]); 26 | impl_packed!(u64, u64s, u64x4, 8, 4, ["avx2"], ["avx512"]); 27 | impl_packed!(u64, u64s, u64x2, 8, 2, [], ["avx2"]); 28 | impl_packed!(i64, i64s, i64x8, 8, 8, ["avx512"], ["avx1024"]); 29 | impl_packed!(i64, i64s, i64x4, 8, 4, ["avx2"], ["avx512"]); 30 | impl_packed!(i64, i64s, i64x2, 8, 2, [], ["avx2"]); 31 | impl_packed!(f64, f64s, f64x8, 8, 8, ["avx512"], ["avx1024"]); 32 | impl_packed!(f64, f64s, f64x4, 8, 4, ["avx2"], ["avx512"]); 33 | impl_packed!(f64, f64s, f64x2, 8, 2, [], ["avx2"]); 34 | 35 | #[cfg(test)] 36 | mod tests { 37 | use super::Packed; 38 | use super::*; 39 | 40 | macro_rules! test_product { 41 | (($($el:tt),*), ($($vec:tt),*), ($($fn:tt),*), ($($sum:tt),*)) => ( 42 | $( 43 | #[test] 44 | fn $fn() { 45 | assert_eq!($vec::splat(1i8 as $el).product(), $sum as $el); 46 | } 47 | )* 48 | ) 49 | } 50 | 51 | // TODO: Do we need better test cases for this? 
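// One concrete answer to the TODO above, sketched as a suggestion rather than
// as part of the original suite: a product over non-uniform lanes catches
// lane-ordering and accumulation bugs that a splat of 1 cannot. It assumes
// only `splat`/`replace` from packed_simd and the `product` reduction already
// exercised by `test_product!` below.
#[test]
fn scalar_product_nonuniform_f32x4() {
    // Lanes are [1.0, 2.0, 3.0, 4.0], so the product must be 24.0.
    let v = f32x4::splat(1.0)
        .replace(1, 2.0)
        .replace(2, 3.0)
        .replace(3, 4.0);
    assert_eq!(v.product(), 24.0);
}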
52 | test_product!((u8, u8, u8, i8, i8, i8, u16, u16, u16, i16, i16, i16, u32, u32, u32, i32, i32, i32, f32, f32, f32, u64, u64, u64, i64, i64, i64, f64, f64, f64), 53 | (u8x64, u8x32, u8x16, i8x64, i8x32, i8x16, u16x32, u16x16, u16x8, i16x32, i16x16, i16x8, u32x16, u32x8, u32x4, i32x16, i32x8, i32x4, f32x16, f32x8, f32x4, u64x8, u64x4, u64x2, i64x8, i64x4, i64x2, f64x8, f64x4, f64x2), 54 | (scalar_product_u8x64, scalar_product_u8x32, scalar_product_u8x16, scalar_product_i8x64, scalar_product_i8x32, scalar_product_i8x16, scalar_product_u16x32, scalar_product_u16x16, scalar_product_u16x8, scalar_product_i16x32, scalar_product_i16x16, scalar_product_i16x8, scalar_product_u32x16, scalar_product_u32x8, scalar_product_u32x4, scalar_product_i32x16, scalar_product_i32x8, scalar_product_i32x4, scalar_product_f32x16, scalar_product_f32x8, scalar_product_f32x4, scalar_product_u64x8, scalar_product_u64x4, scalar_product_u64x2, scalar_product_i64x8, scalar_product_i64x4, scalar_product_i64x2, scalar_product_f64x8, scalar_product_f64x4, scalar_product_f64x2), 55 | (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)); 56 | } 57 | -------------------------------------------------------------------------------- /src/debug.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused_macros, dead_code)] 2 | 3 | use std::collections::HashSet; 4 | use std::cell::RefCell; 5 | 6 | thread_local! { 7 | // Not perfect as it might print multiple times (once per thread), 8 | // but better than a `global` hack to prevent multiple prints of the 9 | // same warning. 10 | pub(crate) static OUTPUT_GUARD: RefCell<HashSet<String>> = RefCell::new(HashSet::new()); 11 | } 12 | 13 | 14 | macro_rules! debug_append_log { 15 | ($str:expr) => { 16 | use std::io::Write; 17 | 18 | // Allows the user to configure the debug file path at compile time, 19 | // e.g., when building for embedded / Android. 20 | let file_name = option_env!("FASTER_DEBUG_FILE").unwrap_or("faster-debug.txt"); 21 | 22 | std::fs::OpenOptions::new() 23 | .write(true) 24 | .create(true) 25 | .append(true) 26 | .open(file_name).and_then(|mut file| { 27 | writeln!(file, "{}", $str) 28 | }).ok(); // `ok` suppresses the warning about unused results, about which we don't care. 29 | } 30 | } 31 | 32 | 33 | /// Prints the given string once (for the current thread). 34 | /// Useful for not spamming the console. 35 | macro_rules! debug_output_once { 36 | ($str:expr) => { 37 | let output = $str; 38 | 39 | crate::debug::OUTPUT_GUARD.with(|f| { 40 | let mut output_guard = f.borrow_mut(); 41 | 42 | if output_guard.contains(&output) { 43 | return; 44 | } 45 | 46 | // Also print to file (if enabled). 47 | debug_append_log!(output); 48 | println!("{}", output); 49 | 50 | output_guard.insert(output); 51 | }); 52 | } 53 | } 54 | 55 | 56 | /// Signal that a software fallback is executed. 57 | #[cfg(feature="trace")] 58 | macro_rules! fallback { 59 | () => { 60 | debug_output_once!(format!("⛔ faster is using SOFTWARE emulation here ({}:{}).", file!(), line!())); 61 | } 62 | } 63 | 64 | /// Signal that an optimized SIMD intrinsic is executed. 65 | #[cfg(feature="trace")] 66 | macro_rules! optimized { 67 | () => { 68 | debug_output_once!(format!("🚄 faster is using HARDWARE acceleration here ({}:{}).", file!(), line!())); 69 | } 70 | } 71 | 72 | #[cfg(not(feature="trace"))] 73 | macro_rules! fallback { 74 | () => { } 75 | } 76 | 77 | #[cfg(not(feature="trace"))] 78 | macro_rules!
optimized { 79 | () => { } 80 | } 81 | 82 | -------------------------------------------------------------------------------- /src/into_iters.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::iters::{SIMDIter, SIMDIterator, SIMDObject}; 9 | #[allow(unused_imports)] // Remove for specialization 10 | use crate::iters::SIMDAdapter; 11 | use crate::arch::current::vecs::*; 12 | 13 | /// A trait which transforms a contiguous collection into an owned stream of 14 | /// vectors. 15 | pub trait IntoSIMDIterator { 16 | type Iter : SIMDIterator; 17 | 18 | /// Return an iterator over this data which will automatically pack 19 | /// values into SIMD vectors. See `SIMDIterator::simd_map` and 20 | /// `SIMDIterator::simd_reduce` for more information. 21 | fn into_simd_iter(self, default: <Self::Iter as SIMDObject>::Vector) -> Self::Iter; 22 | } 23 | 24 | /// A trait which transforms a contiguous collection into a slice-backed stream 25 | /// of vectors. 26 | pub trait IntoSIMDRefIterator<'a> { 27 | type Iter : SIMDIterator; 28 | 29 | /// Return an iterator over this data which will automatically pack 30 | /// values into SIMD vectors. See `SIMDIterator::simd_map` and 31 | /// `SIMDIterator::simd_reduce` for more information. 32 | fn simd_iter(&'a self, default: <Self::Iter as SIMDObject>::Vector) -> Self::Iter; 33 | } 34 | 35 | /// A trait which transforms a contiguous collection into a mutable slice-backed 36 | /// stream of vectors. 37 | pub trait IntoSIMDRefMutIterator<'a> { 38 | type Iter : SIMDIterator; 39 | 40 | /// Return an iterator over this data which will automatically pack 41 | /// values into SIMD vectors. See `SIMDIterator::simd_map` and 42 | /// `SIMDIterator::simd_reduce` for more information. 43 | fn simd_iter_mut(&'a mut self, default: <Self::Iter as SIMDObject>::Vector) -> Self::Iter; 44 | } 45 | 46 | macro_rules!
impl_array_intos { 47 | ($($el:ty, $vec:ty),*) => { 48 | $( 49 | #[cfg(feature = "std")] 50 | impl IntoSIMDIterator for Vec<$el> { 51 | type Iter = SIMDIter<Self>; 52 | 53 | #[inline(always)] 54 | fn into_simd_iter(self, default: $vec) -> Self::Iter { 55 | SIMDIter { 56 | data: self, 57 | position: 0, 58 | default: default, 59 | } 60 | } 61 | } 62 | 63 | impl<'a> IntoSIMDRefIterator<'a> for &'a [$el] { 64 | type Iter = SIMDIter<Self>; 65 | 66 | #[inline(always)] 67 | fn simd_iter(&'a self, default: $vec) -> Self::Iter { 68 | SIMDIter { 69 | data: self, 70 | position: 0, 71 | default: default, 72 | } 73 | } 74 | } 75 | 76 | impl<'a> IntoSIMDRefMutIterator<'a> for &'a mut [$el] { 77 | type Iter = SIMDIter<Self>; 78 | 79 | #[inline(always)] 80 | fn simd_iter_mut(&'a mut self, default: $vec) -> Self::Iter { 81 | SIMDIter { 82 | data: self, 83 | position: 0, 84 | default: default, 85 | } 86 | } 87 | } 88 | 89 | impl<'a> IntoSIMDRefMutIterator<'a> for [$el] { 90 | type Iter = SIMDIter<&'a mut Self>; 91 | 92 | #[inline(always)] 93 | fn simd_iter_mut(&'a mut self, default: $vec) -> Self::Iter { 94 | SIMDIter { 95 | data: self, 96 | position: 0, 97 | default: default, 98 | } 99 | } 100 | } 101 | 102 | impl<'a> IntoSIMDRefIterator<'a> for [$el] { 103 | type Iter = SIMDIter<&'a Self>; 104 | 105 | #[inline(always)] 106 | fn simd_iter(&'a self, default: $vec) -> Self::Iter { 107 | SIMDIter { 108 | data: self, 109 | position: 0, 110 | default: default, 111 | } 112 | } 113 | } 114 | )* 115 | } 116 | } 117 | 118 | impl_array_intos!(u8, u8s, 119 | i8, i8s, 120 | u16, u16s, 121 | i16, i16s, 122 | u32, u32s, 123 | i32, i32s, 124 | f32, f32s, 125 | u64, u64s, 126 | i64, i64s, 127 | f64, f64s); 128 | 129 | // TODO: Specialization 130 | // impl<I, S> IntoSIMDIterator for I where I : ExactSizeIterator + Iterator<Item = S>, S : Packable { 131 | // type Iter = SIMDAdapter<I, S>; 132 | 133 | // #[inline(always)] 134 | // fn into_simd_iter(self, default: S::Vector) -> Self::Iter { 135 | // SIMDAdapter { 136 | // iter: self, 137 | // position: 0, 138 | // default: default, 139 | // scratch: default 140 | // } 141 | // } 142 | // } 143 | -------------------------------------------------------------------------------- /src/intrin/abs.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Abs { 9 | type Out; 10 | /// Return a vector containing the absolute values of the elements of `self`.
11 | /// 12 | /// # Examples 13 | /// 14 | /// ``` 15 | /// extern crate faster; 16 | /// use faster::*; 17 | /// 18 | /// # fn main() { 19 | /// assert_eq!(i32s(-2).abs(), u32s(2)); 20 | /// assert_eq!(i8s(-128).abs(), u8s(128)); 21 | /// # } 22 | /// ``` 23 | fn abs(&self) -> Self::Out; 24 | } 25 | 26 | #[cfg(test)] 27 | mod tests { 28 | use crate::prelude::*; 29 | 30 | #[test] 31 | fn abs_i8s() { 32 | for i in -128..=127 { 33 | assert_eq!(i8s(i).abs().extract(0), (i as i64).abs() as u8); 34 | } 35 | } 36 | 37 | #[test] 38 | fn abs_i16s() { 39 | for i in -32768..=32767 { 40 | assert_eq!(i16s(i).abs().extract(0), (i as i64).abs() as u16); 41 | } 42 | } 43 | 44 | #[test] 45 | fn abs_i32s() { 46 | for i in -65536..65536 { 47 | assert_eq!(i32s(i).abs().extract(0), (i as i64).abs() as u32); 48 | } 49 | } 50 | 51 | #[test] 52 | fn abs_i64s() { 53 | for i in -65536..65536 { 54 | assert_eq!(i64s(i).abs().extract(0), (i as i64).abs() as u64); 55 | } 56 | } 57 | 58 | #[test] 59 | fn abs_f32s() { 60 | let mut i = -1024.0; 61 | while i < 1024.0 { 62 | // This test has some pretty significant float error if done on x86 63 | assert_eq!(f32s(i).abs().extract(0), i.abs()); 64 | i += 1.0 65 | } 66 | } 67 | 68 | #[test] 69 | fn abs_f64s() { 70 | let mut i = -1024.0; 71 | while i < 1024.0 { 72 | // This test has some pretty significant float error if done on x86 73 | assert_eq!(f64s(i).abs().extract(0), i.abs()); 74 | i += 1.0 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/intrin/addsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait AddSub { 9 | fn addsub(&self, other: Self) -> Self; 10 | } 11 | -------------------------------------------------------------------------------- /src/intrin/cast.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Asi8s { 9 | type Cast; 10 | 11 | /// Return a vector containing all elements of `self` cast to i8s. 12 | fn as_i8s(self) -> Self::Cast; 13 | } 14 | 15 | pub trait Asu8s { 16 | type Cast; 17 | 18 | /// Return a vector containing all elements of `self` cast to u8s. 19 | fn as_u8s(self) -> Self::Cast; 20 | } 21 | 22 | pub trait Asi16s { 23 | type Cast; 24 | 25 | /// Return a vector containing all elements of `self` cast to i16s. 26 | fn as_i16s(self) -> Self::Cast; 27 | } 28 | 29 | pub trait Asu16s { 30 | type Cast; 31 | 32 | /// Return a vector containing all elements of `self` cast to u16s. 33 | fn as_u16s(self) -> Self::Cast; 34 | } 35 | 36 | pub trait Asf32s { 37 | type Cast; 38 | 39 | /// Return a vector containing all elements of `self` cast to f32s. 40 | fn as_f32s(self) -> Self::Cast; 41 | } 42 | 43 | pub trait Asi32s { 44 | type Cast; 45 | 46 | /// Return a vector containing all elements of `self` cast to i32s.
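/// (A suggested doctest, hedged: it mirrors the doctest style used elsewhere
/// in this crate and assumes ordinary truncating `as`-cast semantics; it is
/// marked `ignore` because the concrete `Cast` type is architecture-defined.)
///
/// ```ignore
/// extern crate faster;
/// use faster::*;
///
/// # fn main() {
/// assert_eq!(f32s(2.7).as_i32s(), i32s(2)); // fractional part truncated
/// # }
/// ```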
47 | fn as_i32s(self) -> Self::Cast; 48 | } 49 | 50 | pub trait Asu32s { 51 | type Cast; 52 | 53 | /// Return a vector containing all elements of `self` cast to u32s. 54 | fn as_u32s(self) -> Self::Cast; 55 | } 56 | 57 | pub trait Asf64s { 58 | type Cast; 59 | 60 | /// Return a vector containing all elements of `self` cast to f64s. 61 | fn as_f64s(self) -> Self::Cast; 62 | } 63 | 64 | pub trait Asi64s { 65 | type Cast; 66 | 67 | /// Return a vector containing all elements of `self` cast to i64s. 68 | fn as_i64s(self) -> Self::Cast; 69 | } 70 | 71 | pub trait Asu64s { 72 | type Cast; 73 | 74 | /// Return a vector containing all elements of `self` cast to u64s. 75 | fn as_u64s(self) -> Self::Cast; 76 | } 77 | 78 | // macro_rules! impl_cast { 79 | // ($trait:path, $from:ty, $to:ty, $name:ident, $rsname:ident) => ( 80 | // impl $trait for $from { 81 | // type Cast = $to; 82 | 83 | // #[inline(always)] 84 | // fn $name(self) -> Self::Cast { 85 | // self.$rsname() 86 | // } 87 | // } 88 | // ); 89 | // } 90 | -------------------------------------------------------------------------------- /src/intrin/cmp.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Cmp { 9 | /// Return a vector where each element at an index i is the maximum of the 10 | /// elements at index i in `self` and `other`. 11 | /// 12 | /// ```ignore 13 | /// use faster::*; 14 | /// 15 | /// # fn main() { 16 | /// assert_eq!(i8s(0).max(i8s(2)), i8s(2)); 17 | /// assert_eq!(i8s::halfs(1, 0).max(i8s::halfs(2, -1)), i8s::halfs(2, 0)); 18 | /// # } 19 | /// ``` 20 | fn max(&self, other: Self) -> Self; 21 | 22 | /// Return a vector where each element at an index i is the minimum of the 23 | /// elements at index i in `self` and `other`. 24 | /// 25 | /// ```ignore 26 | /// use faster::*; 27 | /// 28 | /// # fn main() { 29 | /// assert_eq!(i8s(0).min(i8s(2)), i8s(0)); 30 | /// assert_eq!(i8s::halfs(1, 0).min(i8s::halfs(2, -1)), i8s::halfs(1, -1)); 31 | /// # } 32 | /// ``` 33 | fn min(&self, other: Self) -> Self; 34 | } 35 | -------------------------------------------------------------------------------- /src/intrin/destride.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Destride : Sized { 9 | fn destride_two(self, other: Self) -> (Self, Self); 10 | fn destride_four(self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self); 11 | } 12 | 13 | // TODO: LLVM actually autovectorizes our polyfills, but we should still have an 14 | // explicit implementation for everything 15 | 16 | macro_rules! destride_two_polyfill { 17 | ($self:expr, $other:expr, $($n:expr),*) => { 18 | (Self::new($($self.extract($n)),*, 19 | $($other.extract($n)),*), 20 | Self::new($($self.extract($n + 1)),*, 21 | $($other.extract($n + 1)),*)) 22 | } 23 | } 24 | 25 | macro_rules! 
destride_four_polyfill { 26 | ($self:expr, $b:expr, $c:expr, $d:expr, $($n:expr),*) => { 27 | (Self::new($($self.extract($n)),*, 28 | $($b.extract($n)),*, 29 | $($c.extract($n)),*, 30 | $($d.extract($n)),*), 31 | Self::new($($self.extract($n + 1)),*, 32 | $($b.extract($n + 1)),*, 33 | $($c.extract($n + 1)),*, 34 | $($d.extract($n + 1)),*), 35 | Self::new($($self.extract($n + 2)),*, 36 | $($b.extract($n + 2)),*, 37 | $($c.extract($n + 2)),*, 38 | $($d.extract($n + 2)),*), 39 | Self::new($($self.extract($n + 3)),*, 40 | $($b.extract($n + 3)),*, 41 | $($c.extract($n + 3)),*, 42 | $($d.extract($n + 3)),*)) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/intrin/downcast.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Downcast<T> { 9 | /// Return a vector containing elements of the same value as `self` and 10 | /// `other`, but different type. The first half of the returned vector 11 | /// contains the downcast values of `self`, whereas the second half of the 12 | /// returned vector contains the downcast values of `other`. The returned 13 | /// vector is equal in size to `self` and `other`. If an element exceeds 14 | /// the maximum or minimum value of the downcast type, it is saturated. 15 | /// 16 | /// # Examples 17 | /// 18 | /// ``` 19 | /// extern crate faster; 20 | /// use faster::*; 21 | /// 22 | /// # fn main() { 23 | /// assert_eq!(i32s(2).saturating_downcast(i32s(3)), i16s::halfs(2, 3)); 24 | /// assert_eq!(i16s(128).saturating_downcast(i16s(-129)), i8s::halfs(127, -128)); 25 | /// # } 26 | /// ``` 27 | fn saturating_downcast(self, other: Self) -> T; 28 | } 29 | -------------------------------------------------------------------------------- /src/intrin/endian.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Reendianize : Sized + Copy { 9 | /// Return a vector containing elements of `self` with switched endianness.
10 | /// 11 | /// ``` 12 | /// extern crate faster; 13 | /// use faster::*; 14 | /// 15 | /// # fn main() { 16 | /// assert_eq!(u32s(0xDEADBEEF).swap_bytes(), u32s(0xEFBEADDE)); 17 | /// # } 18 | /// ``` 19 | fn swap_bytes(&self) -> Self; 20 | 21 | #[cfg(target_endian = "big")] 22 | #[inline(always)] 23 | fn to_be(&self) -> Self { 24 | *self 25 | } 26 | 27 | #[cfg(target_endian = "little")] 28 | #[inline(always)] 29 | fn to_be(&self) -> Self { 30 | self.swap_bytes() 31 | } 32 | 33 | #[cfg(target_endian = "big")] 34 | #[inline(always)] 35 | fn to_le(&self) -> Self { 36 | self.swap_bytes() 37 | } 38 | 39 | #[cfg(target_endian = "little")] 40 | #[inline(always)] 41 | fn to_le(&self) -> Self { 42 | *self 43 | } 44 | 45 | #[cfg(target_endian = "big")] 46 | #[inline(always)] 47 | fn from_be(&self) -> Self { 48 | *self 49 | } 50 | 51 | #[cfg(target_endian = "little")] 52 | #[inline(always)] 53 | fn from_be(&self) -> Self { 54 | self.swap_bytes() 55 | } 56 | 57 | #[cfg(target_endian = "big")] 58 | #[inline(always)] 59 | fn from_le(&self) -> Self { 60 | self.swap_bytes() 61 | } 62 | 63 | #[cfg(target_endian = "little")] 64 | #[inline(always)] 65 | fn from_le(&self) -> Self { 66 | *self 67 | } 68 | } 69 | 70 | macro_rules! impl_packed_swap_bytes { 71 | ($vec:tt, $uvec:tt, $feat:expr, $mmfn:tt, ($($c:expr),*), ($($a:expr, $b:expr),*)) => { 72 | impl Reendianize for $vec { 73 | #[cfg(not(target_feature = $feat))] 74 | #[inline(always)] 75 | fn swap_bytes(&self) -> Self { 76 | fallback!(); 77 | $vec::new($(self.extract($a).swap_bytes(), 78 | self.extract($b).swap_bytes()),*) 79 | } 80 | 81 | #[cfg(target_feature = $feat)] 82 | #[inline(always)] 83 | fn swap_bytes(&self) -> Self { 84 | optimized!(); 85 | unsafe { 86 | transmute($mmfn(self.be_i8s(), $uvec::new($($c),*).be_i8s())) 87 | } 88 | } 89 | } 90 | } 91 | } 92 | 93 | macro_rules! test_packed_swap_bytes { 94 | (($($vec:tt),*), ($($fn:tt),*)) => { 95 | $( 96 | #[test] 97 | fn $fn() { 98 | let a = $vec::interleave(33u8 as <$vec as Packed>::Scalar, 99 | 92u8 as <$vec as Packed>::Scalar); 100 | let b = $vec::interleave((33u8 as <$vec as Packed>::Scalar).swap_bytes(), 101 | (92u8 as <$vec as Packed>::Scalar).swap_bytes()); 102 | assert_eq!(a.swap_bytes(), b); 103 | } 104 | )* 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/intrin/eq.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::core::ops::BitXor; 9 | use crate::vecs::*; 10 | 11 | pub trait Eq : Packed { 12 | type Out : Pattern + BitXor<Output = Self::Out>; 13 | 14 | /// Return a vector where each element at an index i is filled with 1s if 15 | /// the elements of `self` and `other` at index i are equal, and filled with 16 | /// zeroes otherwise.
17 | /// 18 | /// ``` 19 | /// extern crate faster; 20 | /// use faster::*; 21 | /// 22 | /// # fn main() { 23 | /// assert_eq!(u8s::interleave(0, 2).eq_mask(u8s(0)).be_u8s(), u8s::interleave(0xFF, 0).be_u8s()); 24 | /// assert_eq!(u32s::halfs(1, 0).eq_mask(u32s(0)), u32s::halfs(0, 0xFFFFFFFF)); 25 | /// # } 26 | /// ``` 27 | fn eq_mask(&self, other: Self) -> Self::Out; 28 | 29 | /// Return a vector where each element at an index i is filled with 1s if 30 | /// the elements of `self` and `other` at index i are not equal, and filled 31 | /// with zeroes otherwise. 32 | /// 33 | /// ``` 34 | /// extern crate faster; 35 | /// use faster::*; 36 | /// 37 | /// # fn main() { 38 | /// assert_eq!(u8s::interleave(0, 2).ne_mask(u8s(0)), u8s::interleave(0, 0xFF)); 39 | /// assert_eq!(u32s::halfs(1, 0).ne_mask(u32s(0)), u32s::halfs(0xFFFFFFFF, 0)); 40 | /// # } 41 | /// ``` 42 | #[inline(always)] 43 | fn ne_mask(&self, other: Self) -> Self::Out { self.eq_mask(other) ^ Self::Out::ones() } 44 | } 45 | 46 | macro_rules! rust_fallback_eq { 47 | (impl $trait:tt for $type:tt where $feat:tt { 48 | $($newfn:ident, $rustfn:ident => $mask:tt, $maskel:tt, $mmfn:tt ( $($mmfnargs:expr),* ), [$($n:expr),+]);*;}) => ( 49 | impl $trait for $type { 50 | $( 51 | type Out = $mask; 52 | 53 | #[inline(always)] 54 | #[cfg(target_feature = $feat)] 55 | fn $newfn(&self, other: Self) -> $mask { 56 | use crate::core::mem::transmute; 57 | unsafe { transmute($mmfn(transmute(*self), transmute(other), $($mmfnargs),*)) } 58 | } 59 | 60 | #[inline(always)] 61 | #[cfg(not(target_feature = $feat))] 62 | fn $newfn(&self, other: Self) -> Self::Out { 63 | fallback!(); 64 | use crate::core::mem::transmute; 65 | unsafe { 66 | Self::Out::new($(transmute(if self.extract($n).$rustfn(&other.extract($n)) { 67 | $maskel::max_value() 68 | } else { 69 | $maskel::min_value() 70 | })),*) 71 | } 72 | } 73 | )* 74 | } 75 | ); 76 | } 77 | 78 | macro_rules! test_packed_eq { 79 | ($vec:tt, $el:tt, $mask:tt, $maskel:tt, $name:tt) => { 80 | #[test] 81 | fn $name() { 82 | assert_eq!($vec::halfs(1 as $el, 0 as $el).eq_mask($vec::splat(0 as $el)), 83 | $mask::halfs(0, $maskel::max_value())); 84 | 85 | assert_eq!($vec::interleave(1 as $el, 0 as $el).eq_mask($vec::splat(1 as $el)), 86 | $mask::interleave($maskel::max_value(), 0)); 87 | 88 | assert_eq!($vec::halfs(1 as $el, 0 as $el).ne_mask($vec::splat(0 as $el)), 89 | $mask::halfs($maskel::max_value(), 0)); 90 | 91 | assert_eq!($vec::interleave(1 as $el, 0 as $el).ne_mask($vec::splat(1 as $el)), 92 | $mask::interleave(0, $maskel::max_value())); 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/intrin/hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait HAdd { 9 | /// Return a vector containing the interleaved sums of elements in `self` 10 | /// and `other`. 
The returned vector will begin with the sum of the first 11 | /// two elements in `self`, and end with the sum of the last two elements in 12 | /// `other` 13 | fn hadd(&self, other: Self) -> Self; 14 | } 15 | 16 | #[cfg(test)] 17 | mod tests { 18 | use crate::prelude::*; 19 | 20 | #[test] 21 | fn hadd_i8s() { 22 | assert_eq!(i8s(1).hadd(i8s(2)), i8s::interleave(2, 4)); 23 | assert_eq!(i8s::interleave(1, 2).hadd(i8s::interleave(3, 4)), i8s::interleave(3, 7)); 24 | } 25 | 26 | #[test] 27 | fn hadd_i16s() { 28 | assert_eq!(i16s(1).hadd(i16s(2)), i16s::interleave(2, 4)); 29 | assert_eq!(i16s::interleave(1, 2).hadd(i16s::interleave(3, 4)), i16s::interleave(3, 7)); 30 | } 31 | 32 | #[test] 33 | fn hadd_i32s() { 34 | assert_eq!(i32s(1).hadd(i32s(2)), i32s::interleave(2, 4)); 35 | assert_eq!(i32s::interleave(1, 2).hadd(i32s::interleave(3, 4)), i32s::interleave(3, 7)); 36 | } 37 | 38 | #[test] 39 | fn hadd_i64s() { 40 | assert_eq!(i64s(1).hadd(i64s(2)), i64s::interleave(2, 4)); 41 | assert_eq!(i64s::interleave(1, 2).hadd(i64s::interleave(3, 4)), i64s::interleave(3, 7)); 42 | } 43 | 44 | #[test] 45 | fn hadd_u8s() { 46 | assert_eq!(u8s(1).hadd(u8s(2)), u8s::interleave(2, 4)); 47 | assert_eq!(u8s::interleave(1, 2).hadd(u8s::interleave(3, 4)), u8s::interleave(3, 7)); 48 | } 49 | 50 | #[test] 51 | fn hadd_u16s() { 52 | assert_eq!(u16s(1).hadd(u16s(2)), u16s::interleave(2, 4)); 53 | assert_eq!(u16s::interleave(1, 2).hadd(u16s::interleave(3, 4)), u16s::interleave(3, 7)); 54 | } 55 | 56 | #[test] 57 | fn hadd_u32s() { 58 | assert_eq!(u32s(1).hadd(u32s(2)), u32s::interleave(2, 4)); 59 | assert_eq!(u32s::interleave(1, 2).hadd(u32s::interleave(3, 4)), u32s::interleave(3, 7)); 60 | } 61 | 62 | #[test] 63 | fn hadd_u64s() { 64 | assert_eq!(u64s(1).hadd(u64s(2)), u64s::interleave(2, 4)); 65 | assert_eq!(u64s::interleave(1, 2).hadd(u64s::interleave(3, 4)), u64s::interleave(3, 7)); 66 | } 67 | 68 | #[test] 69 | fn hadd_f32s() { 70 | assert_eq!(f32s(1.0).hadd(f32s(2.0)), f32s::interleave(2.0, 4.0)); 71 | assert_eq!(f32s::interleave(1.0, 2.0).hadd(f32s::interleave(3.0, 4.0)), f32s::interleave(3.0, 7.0)); 72 | } 73 | 74 | #[test] 75 | fn hadd_f64s() { 76 | assert_eq!(f64s(1.0).hadd(f64s(2.0)), f64s::interleave(2.0, 4.0)); 77 | assert_eq!(f64s::interleave(1.0, 2.0).hadd(f64s::interleave(3.0, 4.0)), f64s::interleave(3.0, 7.0)); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/intrin/hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait HSub { 9 | /// Return a vector containing the interleaved differences of elements in 10 | /// `self` and `other`. 
The returned vector will begin with the difference 11 | /// of the first two elements in `self`, and end with the difference of the 12 | /// last two elements in `other` 13 | fn hsub(&self, other: Self) -> Self; 14 | } 15 | 16 | #[cfg(test)] 17 | mod tests { 18 | use crate::prelude::*; 19 | 20 | #[test] 21 | fn hsub_i8s() { 22 | assert_eq!(i8s(1).hsub(i8s(2)), i8s::interleave(0, 0)); 23 | assert_eq!(i8s::interleave(1, 2).hsub(i8s::interleave(3, 4)), i8s::interleave(-1, -1)); 24 | } 25 | 26 | #[test] 27 | fn hsub_i16s() { 28 | assert_eq!(i16s(1).hsub(i16s(2)), i16s::interleave(0, 0)); 29 | assert_eq!(i16s::interleave(1, 2).hsub(i16s::interleave(3, 4)), i16s::interleave(-1, -1)); 30 | } 31 | 32 | #[test] 33 | fn hsub_i32s() { 34 | assert_eq!(i32s(1).hsub(i32s(2)), i32s::interleave(0, 0)); 35 | assert_eq!(i32s::interleave(1, 2).hsub(i32s::interleave(3, 4)), i32s::interleave(-1, -1)); 36 | } 37 | 38 | #[test] 39 | fn hsub_i64s() { 40 | assert_eq!(i64s(1).hsub(i64s(2)), i64s::interleave(0, 0)); 41 | assert_eq!(i64s::interleave(1, 2).hsub(i64s::interleave(3, 4)), i64s::interleave(-1, -1)); 42 | } 43 | 44 | #[test] 45 | fn hsub_u8s() { 46 | assert_eq!(u8s(1).hsub(u8s(2)), u8s::interleave(0, 0)); 47 | assert_eq!(u8s::interleave(2, 1).hsub(u8s::interleave(4, 3)), u8s::interleave(1, 1)); 48 | } 49 | 50 | #[test] 51 | fn hsub_u16s() { 52 | assert_eq!(u16s(1).hsub(u16s(2)), u16s::interleave(0, 0)); 53 | assert_eq!(u16s::interleave(2, 1).hsub(u16s::interleave(4, 3)), u16s::interleave(1, 1)); 54 | } 55 | 56 | #[test] 57 | fn hsub_u32s() { 58 | assert_eq!(u32s(1).hsub(u32s(2)), u32s::interleave(0, 0)); 59 | assert_eq!(u32s::interleave(2, 1).hsub(u32s::interleave(4, 3)), u32s::interleave(1, 1)); 60 | } 61 | 62 | #[test] 63 | fn hsub_u64s() { 64 | assert_eq!(u64s(1).hsub(u64s(2)), u64s::interleave(0, 0)); 65 | assert_eq!(u64s::interleave(2, 1).hsub(u64s::interleave(4, 3)), u64s::interleave(1, 1)); 66 | } 67 | 68 | #[test] 69 | fn hsub_f32s() { 70 | assert_eq!(f32s(1.0).hsub(f32s(2.0)), f32s::interleave(0.0, 0.0)); 71 | assert_eq!(f32s::interleave(1.0, 2.0).hsub(f32s::interleave(3.0, 4.0)), f32s::interleave(-1.0, -1.0)); 72 | } 73 | 74 | #[test] 75 | fn hsub_f64s() { 76 | assert_eq!(f64s(1.0).hsub(f64s(2.0)), f64s::interleave(0.0, 0.0)); 77 | assert_eq!(f64s::interleave(1.0, 2.0).hsub(f64s::interleave(3.0, 4.0)), f64s::interleave(-1.0, -1.0)); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/intrin/macros.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | 9 | macro_rules! rust_fallback_impl { 10 | (impl $trait:tt for $type:tt where $feat:tt { 11 | $($rustfn:ident => $mmfn:tt ( $($mmfnargs:expr),* ), [$($n:expr),+]);*;}) => ( 12 | impl $trait for $type { 13 | $( 14 | #[inline(always)] 15 | #[cfg(target_feature = $feat)] 16 | fn $rustfn(&self) -> Self { 17 | optimized!(); 18 | unsafe { $mmfn(*self, $($mmfnargs),*) } 19 | } 20 | 21 | #[inline(always)] 22 | #[cfg(not(target_feature = $feat))] 23 | fn $rustfn(&self) -> Self { 24 | fallback!(); 25 | Self::new($(self.extract($n).$rustfn(),)*) 26 | } 27 | )* 28 | } 29 | ); 30 | } 31 | 32 | macro_rules! 
rust_fallback_impl_binary { 33 | (impl $trait:tt for $type:tt where $feat:tt { 34 | $($rustfn:ident => $mmfn:tt ( $($mmfnargs:expr),* ), [$($n:expr),+]);*;}) => ( 35 | impl $trait for $type { 36 | $( 37 | #[inline(always)] 38 | #[cfg(target_feature = $feat)] 39 | fn $rustfn(&self, other: Self) -> Self { 40 | use crate::core::mem::transmute; 41 | optimized!(); 42 | unsafe { transmute($mmfn(transmute(*self), transmute(other), $($mmfnargs),*)) } 43 | } 44 | 45 | #[inline(always)] 46 | #[cfg(not(target_feature = $feat))] 47 | fn $rustfn(&self, other: Self) -> Self { 48 | fallback!(); 49 | Self::new($(self.extract($n).$rustfn(other.extract($n)),)*) 50 | } 51 | )* 52 | } 53 | ); 54 | } 55 | 56 | macro_rules! hop { 57 | ($name:ident, $fn:path, $($a:expr, $b:expr),*) => { 58 | #[inline(always)] 59 | fn $name(&self, other: Self) -> Self { 60 | fallback!(); 61 | Self::new($($fn(self.extract($a), self.extract($b)), 62 | $fn(other.extract($a), other.extract($b))),*) 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/intrin/merge.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Merge { 9 | /// Return a vector with the first half populated by the first half of 10 | /// `self`, and the second half populated by the second half of `other`. 11 | /// 12 | /// ``` 13 | /// extern crate faster; 14 | /// use faster::*; 15 | /// 16 | /// # fn main() { 17 | /// assert_eq!(u8s(2).merge_halves(u8s(3)), u8s::halfs(2, 3)); 18 | /// # } 19 | /// ``` 20 | fn merge_halves(&self, other: Self) -> Self; 21 | 22 | /// Return a vector containing the even elements of `self` interleaved with 23 | /// the odd elements of other, starting with the first element of `self`. 24 | /// 25 | /// ``` 26 | /// extern crate faster; 27 | /// use faster::*; 28 | /// 29 | /// # fn main() { 30 | /// assert_eq!(u8s(2).merge_interleaved(u8s(3)), u8s::interleave(2, 3)); 31 | /// # } 32 | /// ``` 33 | fn merge_interleaved(&self, other: Self) -> Self; 34 | 35 | /// Return a vector containing the first `offset` elements of `self`, then 36 | /// the last `(Self::WIDTH - offset)` elements of `other`. 37 | /// 38 | /// ``` 39 | /// extern crate faster; 40 | /// use faster::*; 41 | /// 42 | /// # fn main() { 43 | /// assert_eq!(u8s(2).merge_partitioned(u8s(3), 2), u8s::partition(2u8, 3u8, 2)); 44 | /// # } 45 | /// ``` 46 | fn merge_partitioned(&self, other: Self, offset: usize) -> Self; 47 | } 48 | 49 | macro_rules! 
impl_packed_merge { 50 | ($vec:ty, $uvec:tt, $uscl:tt, $mmfn:expr, $feat:expr, ($($a:expr),*), ($($b:expr),*), $($na:expr, $nb:expr),*) => { 51 | #[cfg(not(target_feature = $feat))] 52 | impl Merge for $vec { 53 | 54 | #[inline(always)] 55 | fn merge_halves(&self, other: Self) -> Self { 56 | fallback!(); 57 | unsafe { 58 | Self::new($(self.extract_unchecked($a)),*, 59 | $(other.extract_unchecked($b)),*) 60 | } 61 | } 62 | 63 | #[inline(always)] 64 | fn merge_interleaved(&self, other: Self) -> Self { 65 | fallback!(); 66 | unsafe { 67 | Self::new($(self.extract_unchecked($na), other.extract_unchecked($nb)),*) 68 | } 69 | } 70 | 71 | #[inline(always)] 72 | fn merge_partitioned(&self, other: Self, offset: usize) -> Self { 73 | fallback!(); 74 | assert!(offset < Self::WIDTH); 75 | let mut ret = self.clone(); 76 | for i in offset..Self::WIDTH { 77 | unsafe { 78 | ret = ret.replace_unchecked(i, other.extract_unchecked(i)); 79 | } 80 | } 81 | ret 82 | } 83 | } 84 | 85 | #[cfg(target_feature = $feat)] 86 | impl Merge for $vec { 87 | 88 | #[inline(always)] 89 | fn merge_halves(&self, other: Self) -> Self { 90 | unsafe { 91 | transmute($mmfn( 92 | self.be_i8s(), other.be_i8s(), 93 | transmute($uvec::halfs($uscl::min_value(), $uscl::max_value())))) 94 | } 95 | } 96 | 97 | #[inline(always)] 98 | fn merge_interleaved(&self, other: Self) -> Self { 99 | unsafe { 100 | transmute($mmfn( 101 | self.be_i8s(), other.be_i8s(), 102 | transmute($uvec::interleave($uscl::min_value(), $uscl::max_value())))) 103 | } 104 | } 105 | 106 | #[inline(always)] 107 | fn merge_partitioned(&self, other: Self, offset: usize) -> Self { 108 | unsafe { 109 | transmute($mmfn( 110 | self.be_i8s(), other.be_i8s(), 111 | transmute(Self::partition_mask(offset)))) 112 | } 113 | } 114 | } 115 | } 116 | } 117 | 118 | macro_rules! test_packed_merge { 119 | (($($vec:tt),*), ($($fn:ident),*)) => { 120 | $( 121 | #[test] 122 | fn $fn() { 123 | let asc = 30i32 as <$vec as Packed>::Scalar; 124 | let bsc = 5i32 as <$vec as Packed>::Scalar; 125 | let a = $vec::splat(asc); 126 | let b = $vec::splat(bsc); 127 | assert_eq!(a.merge_interleaved(b), $vec::interleave(asc, bsc)); 128 | assert_eq!(b.merge_interleaved(a), $vec::interleave(bsc, asc)); 129 | 130 | assert_eq!(a.merge_halves(b), $vec::halfs(asc, bsc)); 131 | assert_eq!(b.merge_halves(a), $vec::halfs(bsc, asc)); 132 | 133 | for i in 0..$vec::WIDTH { 134 | assert_eq!(a.merge_partitioned(b, i), $vec::partition(asc, bsc, i)); 135 | assert_eq!(b.merge_partitioned(a, i), $vec::partition(bsc, asc, i)); 136 | } 137 | } 138 | )* 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/intrin/mod.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 
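// Each module declared below defines one portable SIMD trait (those marked
// #[macro_use] also export polyfill and test macros), and `prelude` at the
// bottom re-exports them all. A minimal sketch of how they compose from user
// code through the crate prelude (hypothetical snippet, not part of this
// module):
//
//     use faster::*;
//     let x = u32s(0xDEADBEEF).swap_bytes(); // `Reendianize`, from `endian`
//     let n = x.count_ones();                // `Popcnt`, from `popcnt`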
7 | 8 | pub mod abs; 9 | pub mod addsub; 10 | pub mod cast; 11 | pub mod cmp; 12 | #[macro_use] pub mod destride; 13 | pub mod downcast; 14 | #[macro_use] pub mod endian; 15 | #[macro_use] pub mod eq; 16 | pub mod hadd; 17 | pub mod hsub; 18 | #[macro_use] pub mod macros; 19 | #[macro_use] pub mod merge; 20 | #[macro_use] pub mod popcnt; 21 | pub mod recip; 22 | pub mod round; 23 | pub mod rsqrt; 24 | #[macro_use] pub mod sum; 25 | pub mod saturating_add; 26 | pub mod saturating_hadd; 27 | pub mod saturating_hsub; 28 | pub mod saturating_sub; 29 | pub mod sqrt; 30 | #[macro_use] pub mod swizzle; 31 | #[macro_use] pub mod transmute; 32 | pub mod upcast; 33 | 34 | // We use an internal prelude so as not to clutter the namespace when we import 35 | // from the actual prelude. 36 | pub(crate) mod prelude { 37 | pub use super::abs::*; 38 | pub use super::addsub::*; 39 | pub use super::cast::*; 40 | pub use super::cmp::*; 41 | pub use super::destride::*; 42 | pub use super::downcast::*; 43 | pub use super::endian::*; 44 | pub use super::eq::*; 45 | pub use super::hadd::*; 46 | pub use super::hsub::*; 47 | pub use super::merge::*; 48 | pub use super::popcnt::*; 49 | pub use super::recip::*; 50 | pub use super::round::*; 51 | pub use super::rsqrt::*; 52 | pub use super::sum::*; 53 | pub use super::saturating_add::*; 54 | pub use super::saturating_hadd::*; 55 | pub use super::saturating_hsub::*; 56 | pub use super::saturating_sub::*; 57 | pub use super::sqrt::*; 58 | pub use super::swizzle::*; 59 | pub use super::transmute::*; 60 | pub use super::upcast::*; 61 | } 62 | -------------------------------------------------------------------------------- /src/intrin/popcnt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | use crate::vecs::*; 9 | 10 | pub trait Popcnt : Packed { 11 | fn count_ones(&self) -> usize; 12 | 13 | #[inline(always)] 14 | fn count_zeroes(&self) -> usize { 15 | (Self::WIDTH * Self::Scalar::SIZE * 8) - self.count_ones() 16 | } 17 | } 18 | 19 | // Only used in some architectures. Might produce `unused` warning on others. 20 | #[allow(unused_macros)] 21 | macro_rules! impl_popcnt { 22 | ($($vec:ty, $fn:ident),*) => { 23 | $( 24 | impl Popcnt for $vec { 25 | #[inline(always)] 26 | #[allow(unused_unsafe)] 27 | fn count_ones(&self) -> usize { 28 | fallback!(); 29 | unsafe { $fn(self.be_u8s()) } 30 | } 31 | } 32 | )* 33 | } 34 | } 35 | 36 | // Only used in some architectures. Might produce `unused` warning on others. 37 | #[allow(unused_macros)] 38 | macro_rules!
test_popcnt { 39 | (($($el:tt),*), ($($vec:tt),*), ($($fn:tt),*)) => ( 40 | $( 41 | #[test] 42 | fn $fn() { 43 | assert_eq!($vec::splat(1i8 as $el).count_ones(), $vec::WIDTH); 44 | assert_eq!($vec::splat(1i8 as $el).count_zeroes() 45 | + $vec::splat(1i8 as $el).count_ones(), 46 | $vec::WIDTH * <<$vec as Packed>::Scalar as Packable>::SIZE * 8); 47 | assert_eq!($vec::splat(!(0 as $el)).count_ones(), 48 | $vec::WIDTH * <<$vec as Packed>::Scalar as Packable>::SIZE * 8); 49 | assert_eq!($vec::splat(!(0 as $el)).count_zeroes(), 0); 50 | } 51 | )* 52 | ) 53 | } 54 | -------------------------------------------------------------------------------- /src/intrin/recip.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait Recip { 9 | /// Return a vector containing an estimation of the reciprocal of the 10 | /// corresponding elements of `self`. 11 | /// 12 | /// # Examples 13 | /// 14 | /// ``` 15 | /// extern crate faster; 16 | /// use faster::*; 17 | /// 18 | /// # fn main() { 19 | /// assert!(0.25 - 0.01 < f32s(4.0).recip().coalesce() && 20 | /// 0.25 + 0.01 > f32s(4.0).recip().coalesce()); 21 | /// # } 22 | /// ``` 23 | fn recip(&self) -> Self; 24 | } 25 | 26 | #[cfg(test)] 27 | mod tests { 28 | use crate::prelude::*; 29 | use std::f32::INFINITY; 30 | 31 | #[test] 32 | fn recip_f32s() { 33 | let mut i = -1024.0; 34 | while i < 1024.0 { 35 | // This test has some pretty significant float error if done on x86 36 | let ans = f32s(i).recip().extract(0); 37 | let real = f32s(1.0 / i).extract(0); 38 | assert!((real == INFINITY && ans == INFINITY) || (ans - real).abs() < 0.0005); 39 | i += 1.0 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/intrin/rsqrt.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | // TODO: Guards and non-simd 9 | 10 | pub trait Rsqrt { 11 | /// Return a vector containing an approximation of the reciprocals of the 12 | /// square-roots of elements in `self`. May contain significant float error 13 | /// past 10^-3. 14 | /// 15 | /// ``` 16 | /// extern crate faster; 17 | /// use faster::*; 18 | /// 19 | /// # fn main() { 20 | /// assert!(0.33333333 - 0.01 < f32s(9.0).rsqrt().coalesce() && 21 | /// 0.33333333 + 0.01 > f32s(9.0).rsqrt().coalesce()); 22 | /// # } 23 | /// ``` 24 | fn rsqrt(&self) -> Self; 25 | } 26 | -------------------------------------------------------------------------------- /src/intrin/saturating_add.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. 
If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait SaturatingAdd { 9 | fn saturating_add(&self, other: Self) -> Self; 10 | } 11 | -------------------------------------------------------------------------------- /src/intrin/saturating_hadd.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait SaturatingHAdd { 9 | /// Return a vector containing the interleaved sums of elements in `self` 10 | /// and `other`, using saturating addition. The returned vector will begin 11 | /// with the sum of the first two elements in `self`, and end with the sum 12 | /// of the last two elements in `other` 13 | fn saturating_hadd(&self, other: Self) -> Self; 14 | } 15 | 16 | #[cfg(test)] 17 | mod tests { 18 | use crate::prelude::*; 19 | 20 | #[test] 21 | fn saturating_hadd_i8s() { 22 | assert_eq!(i8s(1).saturating_hadd(i8s(2)), i8s::interleave(2, 4)); 23 | assert_eq!(i8s::interleave(1, 2).saturating_hadd(i8s::interleave(3, 4)), i8s::interleave(3, 7)); 24 | assert_eq!(i8s::interleave(-100, -100).saturating_hadd(i8s::interleave(100, 100)), i8s::interleave(i8::min_value(), i8::max_value())); 25 | } 26 | 27 | #[test] 28 | fn saturating_hadd_i16s() { 29 | assert_eq!(i16s(1).saturating_hadd(i16s(2)), i16s::interleave(2, 4)); 30 | assert_eq!(i16s::interleave(1, 2).saturating_hadd(i16s::interleave(3, 4)), i16s::interleave(3, 7)); 31 | assert_eq!(i16s::interleave(-30000, -30000).saturating_hadd(i16s::interleave(30000, 30000)), i16s::interleave(i16::min_value(), i16::max_value())); 32 | } 33 | 34 | #[test] 35 | fn saturating_hadd_i32s() { 36 | assert_eq!(i32s(1).saturating_hadd(i32s(2)), i32s::interleave(2, 4)); 37 | assert_eq!(i32s::interleave(1, 2).saturating_hadd(i32s::interleave(3, 4)), i32s::interleave(3, 7)); 38 | assert_eq!(i32s::interleave(-2_000_000_000, -2_000_000_000).saturating_hadd(i32s::interleave(2_000_000_000, 2_000_000_000)), i32s::interleave(i32::min_value(), i32::max_value())); 39 | } 40 | 41 | #[test] 42 | fn saturating_hadd_i64s() { 43 | assert_eq!(i64s(1).saturating_hadd(i64s(2)), i64s::interleave(2, 4)); 44 | assert_eq!(i64s::interleave(1, 2).saturating_hadd(i64s::interleave(3, 4)), i64s::interleave(3, 7)); 45 | assert_eq!(i64s::interleave(-9_000_000_000_000_000_000, -9_000_000_000_000_000_000).saturating_hadd(i64s::interleave(9_000_000_000_000_000_000, 9_000_000_000_000_000_000)), i64s::interleave(i64::min_value(), i64::max_value())); 46 | } 47 | 48 | #[test] 49 | fn saturating_hadd_u8s() { 50 | assert_eq!(u8s(1).saturating_hadd(u8s(2)), u8s::interleave(2, 4)); 51 | assert_eq!(u8s::interleave(1, 2).saturating_hadd(u8s::interleave(3, 4)), u8s::interleave(3, 7)); 52 | assert_eq!(u8s(200).saturating_hadd(u8s(200)), u8s(u8::max_value())); 53 | } 54 | 55 | #[test] 56 | fn saturating_hadd_u16s() { 57 | assert_eq!(u16s(1).saturating_hadd(u16s(2)), u16s::interleave(2, 4)); 58 | assert_eq!(u16s::interleave(1, 2).saturating_hadd(u16s::interleave(3, 4)), u16s::interleave(3, 7)); 59 | assert_eq!(u16s(60000).saturating_hadd(u16s(60000)), u16s(u16::max_value())); 60 | } 61 | 62 | #[test] 63 | fn saturating_hadd_u32s() { 64 | assert_eq!(u32s(1).saturating_hadd(u32s(2)), 
u32s::interleave(2, 4)); 65 | assert_eq!(u32s::interleave(1, 2).saturating_hadd(u32s::interleave(3, 4)), u32s::interleave(3, 7)); 66 | assert_eq!(u32s(4_000_000_000).saturating_hadd(u32s(4_000_000_000)), u32s(u32::max_value())); 67 | } 68 | 69 | #[test] 70 | fn saturating_hadd_u64s() { 71 | assert_eq!(u64s(1).saturating_hadd(u64s(2)), u64s::interleave(2, 4)); 72 | assert_eq!(u64s::interleave(1, 2).saturating_hadd(u64s::interleave(3, 4)), u64s::interleave(3, 7)); 73 | assert_eq!(u64s(18_000_000_000_000_000_000).saturating_hadd(u64s(18_000_000_000_000_000_000)), u64s(u64::max_value())); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/intrin/saturating_hsub.rs: -------------------------------------------------------------------------------- 1 | // This file is part of faster, the SIMD library for humans. 2 | // Copyright 2017 Adam Niederer 3 | 4 | // This Source Code Form is subject to the terms of the Mozilla Public 5 | // License, v. 2.0. If a copy of the MPL was not distributed with this 6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | pub trait SaturatingHSub { 9 | /// Return a vector containing the interleaved differences of elements in 10 | /// `self` and `other`, using saturating subtraction. The returned vector 11 | /// will begin with the difference of the first two elements in `self`, and 12 | /// end with the difference of the last two elements in `other` 13 | fn saturating_hsub(&self, other: Self) -> Self; 14 | } 15 | 16 | #[cfg(test)] 17 | mod tests { 18 | use crate::prelude::*; 19 | 20 | #[test] 21 | fn saturating_hsub_i8s() { 22 | assert_eq!(i8s(1).saturating_hsub(i8s(2)), i8s::interleave(0, 0)); 23 | assert_eq!(i8s::interleave(1, 2).saturating_hsub(i8s::interleave(3, 4)), i8s::interleave(-1, -1)); 24 | assert_eq!(i8s::interleave(-100, 100).saturating_hsub(i8s::interleave(100, -100)), i8s::interleave(i8::min_value(), i8::max_value())); 25 | } 26 | 27 | #[test] 28 | fn saturating_hsub_i16s() { 29 | assert_eq!(i16s(1).saturating_hsub(i16s(2)), i16s::interleave(0, 0)); 30 | assert_eq!(i16s::interleave(1, 2).saturating_hsub(i16s::interleave(3, 4)), i16s::interleave(-1, -1)); 31 | assert_eq!(i16s::interleave(-30000, 30000).saturating_hsub(i16s::interleave(30000, -30000)), i16s::interleave(i16::min_value(), i16::max_value())); 32 | } 33 | 34 | #[test] 35 | fn saturating_hsub_i32s() { 36 | assert_eq!(i32s(1).saturating_hsub(i32s(2)), i32s::interleave(0, 0)); 37 | assert_eq!(i32s::interleave(1, 2).saturating_hsub(i32s::interleave(3, 4)), i32s::interleave(-1, -1)); 38 | assert_eq!(i32s::interleave(-2_000_000_000, 2_000_000_000).saturating_hsub(i32s::interleave(2_000_000_000, -2_000_000_000)), i32s::interleave(i32::min_value(), i32::max_value())); 39 | } 40 | 41 | #[test] 42 | fn saturating_hsub_i64s() { 43 | assert_eq!(i64s(1).saturating_hsub(i64s(2)), i64s::interleave(0, 0)); 44 | assert_eq!(i64s::interleave(1, 2).saturating_hsub(i64s::interleave(3, 4)), i64s::interleave(-1, -1)); 45 | assert_eq!(i64s::interleave(-9_000_000_000_000_000_000, 9_000_000_000_000_000_000).saturating_hsub(i64s::interleave(9_000_000_000_000_000_000, -9_000_000_000_000_000_000)), i64s::interleave(i64::min_value(), i64::max_value())); 46 | } 47 | 48 | #[test] 49 | fn saturating_hsub_u8s() { 50 | assert_eq!(u8s(1).saturating_hsub(u8s(2)), u8s::interleave(0, 0)); 51 | assert_eq!(u8s::interleave(1, 2).saturating_hsub(u8s::interleave(3, 4)), u8s::interleave(0, 0)); 52 | assert_eq!(u8s::interleave(2, 1).saturating_hsub(u8s::interleave(4, 
53 | }
54 | 
55 | #[test]
56 | fn saturating_hsub_u16s() {
57 | assert_eq!(u16s(1).saturating_hsub(u16s(2)), u16s::interleave(0, 0));
58 | assert_eq!(u16s::interleave(1, 2).saturating_hsub(u16s::interleave(3, 4)), u16s::interleave(0, 0));
59 | assert_eq!(u16s::interleave(2, 1).saturating_hsub(u16s::interleave(4, 3)), u16s::interleave(1, 1));
60 | }
61 | 
62 | #[test]
63 | fn saturating_hsub_u32s() {
64 | assert_eq!(u32s(1).saturating_hsub(u32s(2)), u32s::interleave(0, 0));
65 | assert_eq!(u32s::interleave(1, 2).saturating_hsub(u32s::interleave(3, 4)), u32s::interleave(0, 0));
66 | assert_eq!(u32s::interleave(2, 1).saturating_hsub(u32s::interleave(4, 3)), u32s::interleave(1, 1));
67 | }
68 | 
69 | #[test]
70 | fn saturating_hsub_u64s() {
71 | assert_eq!(u64s(1).saturating_hsub(u64s(2)), u64s::interleave(0, 0));
72 | assert_eq!(u64s::interleave(1, 2).saturating_hsub(u64s::interleave(3, 4)), u64s::interleave(0, 0));
73 | assert_eq!(u64s::interleave(2, 1).saturating_hsub(u64s::interleave(4, 3)), u64s::interleave(1, 1));
74 | }
75 | }
76 | -------------------------------------------------------------------------------- /src/intrin/saturating_sub.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub trait SaturatingSub {
9 | /// Return a vector containing the differences of corresponding elements in
10 | /// `self` and `other`, saturating at the numeric bounds rather than
11 | /// wrapping on overflow.
12 | fn saturating_sub(&self, other: Self) -> Self;
13 | }
14 | -------------------------------------------------------------------------------- /src/intrin/sqrt.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub trait Sqrt {
9 | /// Return a vector containing the square roots of the elements of `self`.
10 | ///
11 | /// # Examples
12 | ///
13 | /// ```
14 | /// #![feature(rust_2018_preview, stdsimd)]
15 | /// extern crate faster;
16 | /// use faster::*;
17 | ///
18 | /// # fn main() {
19 | /// assert_eq!(f32s(4.0).sqrt(), f32s(2.0));
20 | /// assert_eq!(f64s(9.0).sqrt(), f64s(3.0));
21 | /// # }
22 | /// ```
23 | fn sqrt(&self) -> Self;
24 | }
25 | 
26 | #[cfg(test)]
27 | mod tests {
28 | use crate::prelude::*;
29 | 
30 | #[test]
31 | fn sqrt_f64s() {
32 | assert_eq!(f64s(1.0).sqrt(), f64s(1.0));
33 | assert!(f64s(9.0).sqrt().max(f64s(2.999)) == f64s(9.0).sqrt());
34 | assert!(f64s(9.0).sqrt().min(f64s(3.001)) == f64s(9.0).sqrt());
35 | }
36 | 
37 | #[test]
38 | fn sqrt_f32s() {
39 | assert_eq!(f32s(1.0).sqrt(), f32s(1.0));
40 | assert!(f32s(9.0).sqrt().max(f32s(2.999)) == f32s(9.0).sqrt());
41 | assert!(f32s(9.0).sqrt().min(f32s(3.001)) == f32s(9.0).sqrt());
42 | }
43 | }
44 | -------------------------------------------------------------------------------- /src/intrin/sum.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | use crate::vecs::*;
9 | 
10 | pub trait Sum : Packed {
11 | /// Return a scalar equivalent to the sum of all elements of this vector.
12 | fn sum(&self) -> Self::Scalar;
13 | }
14 | 
15 | pub trait UpcastSum {
16 | /// Return a scalar equivalent to the sum of all elements of this vector,
17 | /// but collect the result in an i64 rather than the vector's type.
18 | fn sum_upcast(&self) -> i64;
19 | }
20 | 
21 | macro_rules! impl_packed_sum {
22 | ($($vec:tt),*) => {
23 | $(
24 | impl Sum for $vec {
25 | #[inline(always)]
26 | fn sum(&self) -> Self::Scalar {
27 | fallback!();
28 | self.scalar_reduce(0 as Self::Scalar, |acc, s| acc + s)
29 | }
30 | }
31 | )*
32 | }
33 | }
34 | 
35 | macro_rules! impl_packed_upcast_sum {
36 | ($($vec:tt),*) => {
37 | $(
38 | impl UpcastSum for $vec {
39 | #[inline(always)]
40 | fn sum_upcast(&self) -> i64 {
41 | fallback!();
42 | self.scalar_reduce(0i64, |acc, s| acc + (s as i64))
43 | }
44 | }
45 | )*
46 | }
47 | }
48 | 
49 | macro_rules! test_packed_sum_int {
50 | ($vec:tt, $el:tt, $name:ident) => {
51 | #[test]
52 | fn $name() {
53 | // Try not to overflow
54 | let mut i = $el::min_value() / 64 + 1;
55 | 
56 | while i < $el::max_value() / 64 - 1 {
57 | let v = $vec::splat(i);
58 | assert_eq!(v.sum(),
59 | v.scalar_reduce(0 as $el, |acc, v| acc + v));
60 | assert_eq!(v.sum_upcast(),
61 | v.scalar_reduce(0 as i64, |acc, v| acc + (v as i64)));
62 | i += $el::max_value() / 20;
63 | }
64 | }
65 | };
66 | }
67 | 
68 | macro_rules! test_packed_sum {
69 | ($vec:tt, $el:tt, $name:ident) => {
70 | #[test]
71 | fn $name() {
72 | for i in -100..100 {
73 | let v = $vec::splat(i as $el);
74 | assert_eq!(v.sum(),
75 | v.scalar_reduce(0 as $el, |acc, v| acc + v));
76 | assert_eq!(v.sum_upcast(),
77 | v.scalar_reduce(0 as i64, |acc, v| acc + (v as i64)));
78 | }
79 | }
80 | };
81 | }
82 | -------------------------------------------------------------------------------- /src/intrin/swizzle.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub trait Swizzle {
9 | /// Return a vector containing the elements of `self`, but with each pair
10 | /// of even- and odd-indexed elements swapped. For n = 0, 2, ...,
11 | /// Self::WIDTH - 2, elements at indices n and n + 1 are swapped.
12 | ///
13 | /// ```
14 | /// extern crate faster;
15 | /// use faster::*;
16 | ///
17 | /// # fn main() {
18 | /// assert_eq!(u8s::interleave(2, 1).flip(), u8s::interleave(1, 2));
19 | /// assert_eq!(u64s::interleave(2, 1).flip(), u64s::interleave(1, 2));
20 | /// # }
21 | /// ```
22 | fn flip(&self) -> Self;
23 | }
24 | 
25 | macro_rules! impl_packed_swizzle {
26 | ($vec:tt, $uvec:tt, $feat:expr, $mmfn:tt, ($($c:expr),*), ($($a:expr, $b:expr),*)) => {
27 | impl Swizzle for $vec {
28 | #[cfg(not(target_feature = $feat))]
29 | #[inline(always)]
30 | fn flip(&self) -> Self {
31 | fallback!();
32 | $vec::new($(self.extract($b), self.extract($a)),*)
33 | }
34 | 
35 | #[cfg(target_feature = $feat)]
36 | #[inline(always)]
37 | fn flip(&self) -> Self {
38 | optimized!();
39 | unsafe {
40 | transmute($mmfn(self.be_i8s(), $uvec::new($($c),*).be_i8s()))
41 | }
42 | }
43 | }
44 | }
45 | }
46 | -------------------------------------------------------------------------------- /src/intrin/transmute.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub trait Transmute {
9 | type i8s;
10 | type u8s;
11 | type i16s;
12 | type u16s;
13 | type i32s;
14 | type u32s;
15 | type f32s;
16 | type i64s;
17 | type u64s;
18 | type f64s;
19 | 
20 | fn be_i8s(&self) -> Self::i8s;
21 | fn be_u8s(&self) -> Self::u8s;
22 | fn be_i16s(&self) -> Self::i16s;
23 | fn be_u16s(&self) -> Self::u16s;
24 | fn be_i32s(&self) -> Self::i32s;
25 | fn be_u32s(&self) -> Self::u32s;
26 | // TODO: Remove possibility of signalling NaNs
27 | unsafe fn be_f32s_unchecked(&self) -> Self::f32s;
28 | fn be_i64s(&self) -> Self::i64s;
29 | fn be_u64s(&self) -> Self::u64s;
30 | // TODO: Remove possibility of signalling NaNs
31 | unsafe fn be_f64s_unchecked(&self) -> Self::f64s;
32 | }
33 | 
34 | macro_rules! impl_packed_transmute {
35 | ($($t:ty,)* ... $u8s:ty, $i8s:ty, $u16s:ty, $i16s:ty, $u32s:ty, $i32s:ty,
36 | $f32s:ty, $u64s:ty, $i64s:ty, $f64s:ty, $feat:expr, $nfeat:expr) => (
37 | $(
38 | impl Transmute for $t {
39 | type i8s = $i8s;
40 | type u8s = $u8s;
41 | type i16s = $i16s;
42 | type u16s = $u16s;
43 | type i32s = $i32s;
44 | type u32s = $u32s;
45 | type f32s = $f32s;
46 | type i64s = $i64s;
47 | type u64s = $u64s;
48 | type f64s = $f64s;
49 | 
50 | #[inline(always)]
51 | fn be_i8s(&self) -> Self::i8s {
52 | unsafe { transmute::<Self, Self::i8s>(*self) }
53 | }
54 | 
55 | #[inline(always)]
56 | fn be_u8s(&self) -> Self::u8s {
57 | unsafe { transmute::<Self, Self::u8s>(*self) }
58 | }
59 | 
60 | #[inline(always)]
61 | fn be_i16s(&self) -> Self::i16s {
62 | unsafe { transmute::<Self, Self::i16s>(*self) }
63 | }
64 | 
65 | #[inline(always)]
66 | fn be_u16s(&self) -> Self::u16s {
67 | unsafe { transmute::<Self, Self::u16s>(*self) }
68 | }
69 | 
70 | #[inline(always)]
71 | fn be_i32s(&self) -> Self::i32s {
72 | unsafe { transmute::<Self, Self::i32s>(*self) }
73 | }
74 | 
75 | #[inline(always)]
76 | fn be_u32s(&self) -> Self::u32s {
77 | unsafe { transmute::<Self, Self::u32s>(*self) }
78 | }
79 | 
80 | #[inline(always)]
81 | unsafe fn be_f32s_unchecked(&self) -> Self::f32s {
82 | transmute::<Self, Self::f32s>(*self)
83 | }
84 | 
85 | #[inline(always)]
86 | fn be_i64s(&self) -> Self::i64s {
87 | unsafe { transmute::<Self, Self::i64s>(*self) }
88 | }
89 | 
90 | #[inline(always)]
91 | fn be_u64s(&self) -> Self::u64s {
92 | unsafe { transmute::<Self, Self::u64s>(*self) }
93 | }
94 | 
95 | #[inline(always)]
96 | unsafe fn be_f64s_unchecked(&self) -> Self::f64s {
97 | transmute::<Self, Self::f64s>(*self)
98 | }
99 | }
100 | )*
101 | );
102 | }
103 | 
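// Illustrative note (editor's addition): each generated `be_*` method is a
// zero-cost bit reinterpretation, not a value conversion. For example,
// `u8s(1).be_i16s()` reuses the same bits as i16 lanes, so each lane reads
// 0x0101 (257) rather than 1.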
104 | #[cfg(test)]
105 | mod tests {
106 | use crate::prelude::*;
107 | 
108 | macro_rules! test_transmute {
109 | ($name:ident, $val:expr, $xmute:ident) => (
110 | #[test]
111 | fn $name() {
112 | #![allow(unused_unsafe)]
113 | assert_eq!(unsafe { $val.be_i8s().$xmute() }, $val);
114 | assert_eq!(unsafe { $val.be_u8s().$xmute() }, $val);
115 | assert_eq!(unsafe { $val.be_i16s().$xmute() }, $val);
116 | assert_eq!(unsafe { $val.be_u16s().$xmute() }, $val);
117 | assert_eq!(unsafe { $val.be_i32s().$xmute() }, $val);
118 | assert_eq!(unsafe { $val.be_u32s().$xmute() }, $val);
119 | assert_eq!(unsafe { $val.be_i64s().$xmute() }, $val);
120 | assert_eq!(unsafe { $val.be_u64s().$xmute() }, $val);
121 | }
122 | )
123 | }
124 | 
125 | test_transmute!(transmute_u8s, u8s(1), be_u8s);
126 | test_transmute!(transmute_i8s, i8s(1), be_i8s);
127 | test_transmute!(transmute_u16s, u16s(1), be_u16s);
128 | test_transmute!(transmute_i16s, i16s(1), be_i16s);
129 | test_transmute!(transmute_u32s, u32s(1), be_u32s);
130 | test_transmute!(transmute_i32s, i32s(1), be_i32s);
131 | test_transmute!(transmute_f32s, f32s(1.0), be_f32s_unchecked);
132 | test_transmute!(transmute_u64s, u64s(1), be_u64s);
133 | test_transmute!(transmute_i64s, i64s(1), be_i64s);
134 | test_transmute!(transmute_f64s, f64s(1.0), be_f64s_unchecked);
135 | }
136 | -------------------------------------------------------------------------------- /src/intrin/upcast.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | // TODO: Upcast for u..
9 | 
10 | pub trait Upcast<T> {
11 | /// Return two vectors containing elements of the same value, but different
12 | /// type. The first vector contains the first half of `self`, and the second
13 | /// vector contains the second half. Both returned vectors are equal in size
14 | /// to `self`.
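/// (Both are equal in byte size, that is: each returned vector holds half as
/// many elements as `self`, with each element twice as wide.)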
15 | ///
16 | /// # Examples
17 | ///
18 | /// ```
19 | /// extern crate faster;
20 | /// use faster::*;
21 | ///
22 | /// # fn main() {
23 | /// assert_eq!(i8s::halfs(2, 3).upcast(), (i16s(2), i16s(3)))
24 | /// # }
25 | /// ```
26 | fn upcast(self) -> (T, T);
27 | }
28 | 
29 | #[cfg(test)]
30 | mod tests {
31 | use crate::prelude::*;
32 | 
33 | #[test]
34 | fn upcast_i8s() {
35 | assert_eq!(i8s::interleave(1, 2).upcast().0, i16s::interleave(1, 2));
36 | assert_eq!(i8s::interleave(1, 2).upcast().1, i16s::interleave(1, 2));
37 | }
38 | 
39 | #[test]
40 | fn upcast_u8s() {
41 | assert_eq!(u8s::interleave(1, 2).upcast().0, u16s::interleave(1, 2));
42 | assert_eq!(u8s::interleave(1, 2).upcast().1, u16s::interleave(1, 2));
43 | }
44 | 
45 | #[test]
46 | fn upcast_i16s() {
47 | assert_eq!(i16s::interleave(1, 2).upcast().0, i32s::interleave(1, 2));
48 | assert_eq!(i16s::interleave(1, 2).upcast().1, i32s::interleave(1, 2));
49 | }
50 | 
51 | #[test]
52 | fn upcast_u16s() {
53 | assert_eq!(u16s::interleave(1, 2).upcast().0, u32s::interleave(1, 2));
54 | assert_eq!(u16s::interleave(1, 2).upcast().1, u32s::interleave(1, 2));
55 | }
56 | 
57 | #[test]
58 | fn upcast_i32s_i64s() {
59 | // TODO: Fix ugliness
60 | assert_eq!(Upcast::<i64s>::upcast(i32s::interleave(1, 2)).0, i64s::interleave(1, 2));
61 | assert_eq!(Upcast::<i64s>::upcast(i32s::interleave(1, 2)).1, i64s::interleave(1, 2));
62 | }
63 | 
64 | #[test]
65 | fn upcast_i32s_f64s() {
66 | // TODO: Fix ugliness
67 | assert_eq!(Upcast::<f64s>::upcast(i32s::interleave(1, 2)).0, f64s::interleave(1.0, 2.0));
68 | assert_eq!(Upcast::<f64s>::upcast(i32s::interleave(1, 2)).1, f64s::interleave(1.0, 2.0));
69 | }
70 | 
71 | #[test]
72 | fn upcast_u32s() {
73 | assert_eq!(u32s::interleave(1, 2).upcast().0, u64s::interleave(1, 2));
74 | assert_eq!(u32s::interleave(1, 2).upcast().1, u64s::interleave(1, 2));
75 | }
76 | 
77 | #[test]
78 | fn upcast_f32s() {
79 | assert_eq!(f32s::interleave(1.0, 2.0).upcast(), (f64s::interleave(1.0, 2.0), f64s::interleave(1.0, 2.0)));
80 | assert_eq!(f32s::interleave(1.0, 2.0).upcast().0, f64s::interleave(1.0, 2.0));
81 | assert_eq!(f32s::interleave(1.0, 2.0).upcast().1, f64s::interleave(1.0, 2.0));
82 | }
83 | }
84 | -------------------------------------------------------------------------------- /src/lib.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | //! The SIMD library for humans.
9 | 
10 | //! Faster allows convenient application of explicit SIMD to existing code. It
11 | //! allows you to write explicit SIMD code once and compile it for any target,
12 | //! regardless of architecture, SIMD capability, or age.
13 | 
14 | //! # SIMD Iterators
15 | //!
16 | //! SIMD iterators are formed using [`simd_iter`], [`simd_iter_mut`], and
17 | //! [`into_simd_iter`], which return types that allow the use of the
18 | //! [`simd_map`] and [`simd_reduce`] functions. These functions automatically
19 | //! pack your iterator's data into SIMD vectors and allow you to transparently
20 | //! operate on them in a closure.
21 | //!
22 | //! [`simd_iter`]: iters/trait.IntoSIMDRefIterator.html#tymethod.simd_iter
23 | //! [`simd_iter_mut`]: iters/trait.IntoSIMDRefMutIterator.html#tymethod.simd_iter_mut
24 | //! [`into_simd_iter`]: iters/trait.IntoSIMDIterator.html#tymethod.into_simd_iter
25 | //! [`simd_map`]: iters/trait.SIMDIterator.html#tymethod.simd_map
26 | //! [`simd_reduce`]: iters/trait.SIMDIterator.html#tymethod.simd_reduce
27 | //!
28 | //! # SIMD Polyfills
29 | //!
30 | //! Once your data is packed into a SIMD vector, you may perform many common
31 | //! SIMD operations on it. These operations have names and behavior independent
32 | //! of any vendor-specific ISA, and have non-SIMD polyfills for machines which
33 | //! cannot perform these operations in a single cycle. See the [`intrin`] module
34 | //! for all available operations.
35 | //!
36 | //! [`intrin`]: intrin/index.html
37 | //!
38 | //! # Examples
39 | //!
40 | //! Faster is currently capable of mapping and reductive operations in SIMD.
41 | //!
42 | //! ## Mapping
43 | //!
44 | //! The simplest example of a computation with `faster` is a single map
45 | //! operation.
46 | //!
47 | //! ```
48 | //! extern crate faster;
49 | //! use faster::*;
50 | //!
51 | //! # #[cfg(not(feature = "std"))]
52 | //! # fn main() { }
53 | //!
54 | //! # #[cfg(feature = "std")]
55 | //! # fn main() {
56 | //! let lots_of_10s = [-10i8; 3000].simd_iter(i8s(0))
57 | //!     .simd_map(|v| v.abs())
58 | //!     .scalar_collect();
59 | //! assert_eq!(lots_of_10s, vec![10u8; 3000]);
60 | //! # }
61 | //! ```
62 | //!
63 | //! In this example, a vector of type [`i8s`] is passed into the closure. The
64 | //! exact type of [`i8s`] is dependent on compilation target, but it will always
65 | //! implement the same operations. Because taking the absolute value of a vector
66 | //! converts it to [`u8s`], the closure will return [`u8s`].
67 | //!
68 | //! [`scalar_collect`] takes the iterator of [`u8s`] and converts it into a
69 | //! `Vec<u8>`.
70 | //!
71 | //! [`i8s`]: vecs/type.i8s.html
72 | //! [`u8s`]: vecs/type.u8s.html
73 | //! [`scalar_collect`]: iters/trait.IntoScalar.html#tymethod.scalar_collect
74 | //!
75 | //! ## Reduction
76 | //!
77 | //! Faster can perform reductive operations with similar power to mapping
78 | //! operations:
79 | //!
80 | //! ```
81 | //! #![feature(stdsimd)]
82 | //! extern crate faster;
83 | //! use faster::*;
84 | //!
85 | //! # fn main() {
86 | //! let two_hundred = [2.0f32; 100].simd_iter(f32s(0.0))
87 | //!     .simd_reduce(f32s(0.0), |acc, v| acc + v)
88 | //!     .sum();
89 | //! assert_eq!(two_hundred, 200.0f32);
90 | //! # }
91 | //! ```
92 | //!
93 | //! This example sums every number in the collection. The first parameter to
94 | //! simd_reduce is the initial value of the accumulator, just like any
95 | //! other reduction. The vector passed to `simd_iter` is used if the collection
96 | //! being reduced over doesn't fit evenly into your system's vectors - it pads
97 | //! out the last vector, and each of its elements is used only if it isn't
98 | //! filled by an element of the collection. Typically, a value of 0 or 1 is a
99 | //! suitable default.
100 | //!
101 | //! Minding portability is very important when performing reductive
102 | //! operations. See below for some tips on keeping your code portable across all
103 | //! architectures.
104 | //!
105 | //! ## Multiple collections
106 | //!
107 | //! Faster supports vectorized lockstep iteration over multiple collections.
108 | //! Simply [`zip`] them up, and proceed as normal.
109 | //!
110 | //! [`zip`]: zip/trait.IntoSIMDZip.html
111 | //!
112 | //! ```
113 | //! extern crate faster;
114 | //! use faster::*;
115 | //!
116 | //! 
# #[cfg(not(feature = "std"))] 117 | //! # fn main() { } 118 | //! 119 | //! # #[cfg(feature = "std")] 120 | //! # fn main() { 121 | //! let sevens = ([4i32; 200].simd_iter(i32s(0)), [3i32; 200].simd_iter(i32s(0))) 122 | //! .zip() 123 | //! .simd_map(|(a, b)| a + b) 124 | //! .scalar_collect(); 125 | //! # } 126 | //! ``` 127 | //! 128 | //! ## Striping Collections 129 | //! 130 | //! Reading every nth element of a collection can be vectorized on most 131 | //! machines. Simply call [`stride`], or one of the slightly-faster tuple-based 132 | //! functions, such as [`stride_two`]. 133 | //! 134 | //! [`stride`]: iters/struct.SIMDRefIter.html#method.stride 135 | //! [`stride_two`]: iters/struct.SIMDRefIter.html#method.stride_two 136 | //! 137 | //! ``` 138 | //! extern crate faster; 139 | //! use faster::*; 140 | //! 141 | //! # #[cfg(not(feature = "std"))] 142 | //! # fn main() { } 143 | //! 144 | //! # #[cfg(feature = "std")] 145 | //! # fn main() { 146 | //! // Computes the determinant of matrices arranged as [a, b, c, d, a, b, c...] 147 | //! let slice: &[f32] = &[1.0f32; 1024]; 148 | //! let determinant = slice.stride_four(tuplify!(4, f32s(0.0))).zip() 149 | //! .simd_map(|(a, b, c, d)| a * d - b * c) 150 | //! .scalar_collect(); 151 | //! # } 152 | //! ``` 153 | //! 154 | //! # Portability 155 | //! 156 | //! While `faster` does most of the work ensuring your code stays portable 157 | //! across platforms, a user of this library must still understand that it is 158 | //! very possible to write non-portable algorithms using this library. Anything 159 | //! which relies on vector width, anything which is impure, and anything which 160 | //! uses constants in reductive operations is inherently nonportable. Some 161 | //! examples below: 162 | //! 163 | //! ``` 164 | //! extern crate faster; 165 | //! use faster::*; 166 | //! 167 | //! # #[cfg(not(feature = "std"))] 168 | //! # fn main() { } 169 | //! 170 | //! # #[cfg(feature = "std")] 171 | //! # fn main() { 172 | //! let mut flip = true; 173 | //! let impure = [1i8; 3000].simd_iter(i8s(0)) 174 | //! .simd_map(|v| { flip = !flip; if flip { v + i8s(1) } else { v } }) 175 | //! .scalar_collect(); 176 | //! // Depending on the width of your target's SIMD vectors, `impure` could be 177 | //! // [1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, ...] or 178 | //! // [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, ...], etc. 179 | //! # } 180 | //! ``` 181 | //! 182 | //! ``` 183 | //! extern crate faster; 184 | //! use faster::*; 185 | //! 186 | //! # fn main() { 187 | //! let length_dependent = [0i8; 10].simd_iter(i8s(0)) 188 | //! .simd_reduce(i8s(0), |acc, v| acc + v + i8s(1)).sum(); 189 | //! // `length_dependent` could be a different number on a different target! 190 | //! # } 191 | //! ``` 192 | //! 193 | //! As a precaution, it is best practice to keep all functions pure, and only 194 | //! operate on SIMD vectors in your SIMD-enabled closures unless you know 195 | //! exactly what is happening under the hood. It's also important to remember 196 | //! that these problems will crop up even if you only support x86; the width 197 | //! difference between AVX and SSE is the primary source of these issues! 
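//!
//! For contrast, a minimal sketch of the same kind of computation written
//! portably, using only the APIs shown above: the closure is pure and never
//! depends on vector width, so the result is identical on every target.
//!
//! ```
//! extern crate faster;
//! use faster::*;
//!
//! # #[cfg(not(feature = "std"))]
//! # fn main() { }
//!
//! # #[cfg(feature = "std")]
//! # fn main() {
//! let twos = [1i8; 3000].simd_iter(i8s(0))
//!     .simd_map(|v| v + i8s(1))
//!     .scalar_collect();
//! assert_eq!(twos, vec![2i8; 3000]);
//! # }
//! ```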
198 | 
199 | #![cfg_attr(not(feature = "std"), no_std)]
200 | #![cfg_attr(test, feature(test))]
201 | #![feature(stdsimd)]
202 | // , mmx_target_feature, sse4a_target_feature, tbm_target_feature
203 | 
204 | mod core {
205 | #[cfg(not(feature = "std"))]
206 | pub use core::*;
207 | #[cfg(feature = "std")]
208 | pub use std::*;
209 | }
210 | 
211 | extern crate packed_simd;
212 | extern crate vektor;
213 | 
214 | #[macro_use] pub(crate) mod debug;
215 | #[macro_use] pub mod zip;
216 | #[macro_use] pub mod vecs;
217 | pub mod vec_patterns;
218 | pub mod iters;
219 | pub mod into_iters;
220 | #[macro_use] pub mod intrin;
221 | #[macro_use] pub mod arch;
222 | pub mod prelude;
223 | pub mod stride_zip;
224 | pub mod stride;
225 | 
226 | pub use crate::prelude::*;
227 | -------------------------------------------------------------------------------- /src/prelude.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | pub use crate::iters::*;
9 | pub use crate::into_iters::*;
10 | pub use crate::vecs::{Packed, Pattern};
11 | pub use crate::arch::current::vecs::{u8s, i8s, u16s, i16s, u32s, i32s, f32s, u64s, i64s, f64s};
12 | pub use crate::arch::current::intrin::prelude::*;
13 | pub use crate::intrin::prelude::*;
14 | pub use crate::zip::*;
15 | pub use crate::stride_zip::*;
16 | pub use crate::stride::*;
17 | -------------------------------------------------------------------------------- /src/stride_zip.rs: --------------------------------------------------------------------------------
1 | use crate::iters::{SIMDIterator};
2 | use crate::vecs::{Packed, Packable};
3 | use crate::intrin::destride::*;
4 | use crate::zip::{SIMDZippedIterable, SIMDZippedIterator, SIMDZippedObject};
5 | 
6 | pub struct StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
7 | base: usize,
8 | peek: Option<T::Vector>,
9 | iter: T
10 | }
11 | 
12 | /// A trait which can transform a SIMD iterator into a `StrideZip`
13 | pub trait IntoStrideZip : Sized {
14 | /// Return an iterator which iterates over destrided pairs of vectors from `self`.
15 | fn stride_zip(self) -> StrideZip<Self>
16 | where Self : SIMDIterator, Self::Vector : Destride;
17 | }
18 | 
19 | impl<T> IntoStrideZip for T where T : SIMDIterator, T::Vector : Destride {
20 | fn stride_zip(self) -> StrideZip<T> {
21 | StrideZip {
22 | base: self.scalar_pos(),
23 | peek: None,
24 | iter: self
25 | }
26 | }
27 | }
28 | 
29 | impl<T> SIMDZippedObject for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
30 | type Scalars = (T::Scalar, T::Scalar);
31 | type Vectors = (T::Vector, T::Vector);
32 | 
33 | /// Return the vector length of this object.
34 | #[inline(always)]
35 | fn width(&self) -> usize {
36 | T::Vector::WIDTH
37 | }
38 | 
39 | /// Return the scalar length of this object.
40 | #[inline(always)]
41 | fn size(&self) -> usize {
42 | T::Scalar::SIZE
43 | }
44 | }
45 | 
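// Illustrative note (editor's addition): `StrideZip` pulls two vectors of
// stride-2 data from the underlying iterator per step and destrides them, so
// each item pairs the even-indexed lanes in one vector with the odd-indexed
// lanes in the other.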
46 | impl<T> ExactSizeIterator for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
47 | #[inline(always)]
48 | fn len(&self) -> usize {
49 | self.iter.len() / 2
50 | }
51 | }
52 | 
53 | impl<T> SIMDZippedIterable for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
54 | #[inline(always)]
55 | fn scalar_pos(&self) -> usize {
56 | (self.iter.scalar_pos() - self.base) / 2
57 | }
58 | 
59 | #[inline(always)]
60 | fn vector_pos(&self) -> usize {
61 | (self.iter.vector_pos() - (self.base / self.width())) / 2
62 | }
63 | 
64 | #[inline(always)]
65 | fn scalar_len(&self) -> usize {
66 | self.iter.scalar_len() / 2
67 | }
68 | 
69 | #[inline(always)]
70 | fn advance(&mut self, amount: usize) {
71 | self.iter.advance(2 * amount);
72 | }
73 | 
74 | #[inline(always)]
75 | fn default(&self) -> Self::Vectors {
76 | (T::Vector::default(), T::Vector::default())
77 | }
78 | }
79 | 
80 | impl<T> Iterator for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
81 | type Item = <Self as SIMDZippedObject>::Vectors;
82 | 
83 | fn next(&mut self) -> Option<Self::Item> {
84 | let first = self.iter.next()?;
85 | let second = self.iter.next();
86 | if let Some(second) = second {
87 | Some(first.destride_two(second))
88 | } else {
89 | self.peek = Some(first);
90 | None
91 | }
92 | }
93 | }
94 | 
95 | impl<T> SIMDZippedIterator for StrideZip<T> where T : SIMDIterator, T::Vector : Destride {
96 | fn end(&mut self) -> Option<(Self::Vectors, usize)> {
97 | let first = self.iter.next();
98 | let (end, n) = self.iter.end().unwrap_or((self.iter.default(), 0));
99 | if let Some(first) = first {
100 | Some((first.destride_two(end), (self.width() + n) / 2))
101 | } else {
102 | if let Some(v) = self.peek {
103 | self.peek = None;
104 | Some((v.destride_two(end), (self.width() + n) / 2))
105 | } else if n > 0 {
106 | Some((end.destride_two(self.iter.default()), n / 2))
107 | } else {
108 | None
109 | }
110 | }
111 | }
112 | }
113 | -------------------------------------------------------------------------------- /src/vec_patterns.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | // This file is machine-generated. See vec_patterns_gen.py for more info.
9 | 
10 | use crate::vecs::*;
11 | 
12 | /// Constructors which may be used to instantiate vectors with patterned data.
13 | pub trait Pattern : Packed {
14 | /// Return a vector whose first `Self::WIDTH / 2` elements are `hi`, and
15 | /// whose last `Self::WIDTH / 2` elements are `lo`.
16 | fn halfs(hi: Self::Scalar, lo: Self::Scalar) -> Self;
17 | 
18 | /// Return a vector containing `hi` at every even index, and `lo` at every
19 | /// odd index.
20 | fn interleave(hi: Self::Scalar, lo: Self::Scalar) -> Self;
21 | 
22 | /// Return a vector whose first `off` elements are `hi`, and whose last
23 | /// `Self::WIDTH - off` elements are `lo`.
24 | fn partition(hi: Self::Scalar, lo: Self::Scalar, off: usize) -> Self;
25 | 
26 | /// Return a vector whose first `off` elements are memset to 0x00, and whose
27 | /// last `Self::WIDTH - off` elements are memset to 0xFF.
28 | fn partition_mask(off: usize) -> Self;
29 | 
30 | /// Return a vector made entirely of ones.
31 | fn ones() -> Self;
32 | 
33 | /// Return a vector made entirely of zeroes.
34 | fn zeroes() -> Self;
35 | }
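
// Illustrative sketch (editor's addition, not part of the generated file):
// exercises the patterned constructors declared above, following their
// documented semantics.
#[cfg(test)]
mod pattern_sketch_tests {
    use crate::prelude::*;

    #[test]
    fn halfs_interleave_partition() {
        // `interleave` alternates hi/lo, starting with `hi` at index 0.
        let v = u8s::interleave(1, 2);
        assert_eq!(v.extract(0), 1);
        assert_eq!(v.extract(1), 2);
        // `halfs` fills the first half with `hi` and the second half with `lo`.
        let h = u8s::halfs(3, 4);
        assert_eq!(h.extract(0), 3);
        assert_eq!(h.extract(u8s::WIDTH - 1), 4);
        // `partition` switches from `hi` to `lo` at the given offset.
        let p = u8s::partition(5, 6, 1);
        assert_eq!(p.extract(0), 5);
        assert_eq!(p.extract(1), 6);
    }
}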
36 | -------------------------------------------------------------------------------- /src/vecs.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | #![allow(dead_code)]
8 | 
9 | pub use crate::vec_patterns::Pattern;
10 | use crate::core::fmt::Debug;
11 | use crate::intrin::merge::*;
12 | 
13 | /// A SIMD vector of some type.
14 | pub trait Packed : Sized + Copy + Debug + Merge {
15 | /// The type which fits into this SIMD vector
16 | type Scalar : Packable;
17 | 
18 | /// The number of elements in this vector
19 | const WIDTH: usize;
20 | 
21 | #[inline(always)]
22 | /// Return the number of elements in this vector
23 | fn width(&self) -> usize {
24 | Self::WIDTH
25 | }
26 | 
27 | /// Create a new vector with `Self::WIDTH` elements from `data`, beginning
28 | /// at `offset`.
29 | fn load(data: &[Self::Scalar], offset: usize) -> Self;
30 | 
31 | /// Create a new vector with `Self::WIDTH` elements from `data`, beginning
32 | /// at `offset`, without asserting length of data.
33 | unsafe fn load_unchecked(data: &[Self::Scalar], offset: usize) -> Self;
34 | 
35 | /// Write `Self::WIDTH` elements from this vector to `data`, beginning at
36 | /// `offset`.
37 | fn store(self, data: &mut [Self::Scalar], offset: usize);
38 | 
39 | /// Write `Self::WIDTH` elements from this vector to `data`, beginning at
40 | /// `offset`, without asserting length of data.
41 | unsafe fn store_unchecked(self, data: &mut [Self::Scalar], offset: usize);
42 | 
43 | /// Assert all elements of the vector are equal, then return the
44 | /// element. Opposite operation of `Self::splat`.
45 | fn coalesce(self) -> Self::Scalar;
46 | 
47 | /// Return a vector with all elements initialized to `data`. Opposite
48 | /// operation for `Self::coalesce`.
49 | fn splat(data: Self::Scalar) -> Self;
50 | 
51 | /// Return a vector with all elements initialized to the default
52 | /// value for the underlying element type.
53 | fn default() -> Self;
54 | 
55 | /// Return the `idx`th element of this vector.
56 | fn extract(&self, idx: usize) -> Self::Scalar;
57 | 
58 | /// Return the `idx`th element of this vector, without asserting that
59 | /// `idx` is within bounds.
60 | unsafe fn extract_unchecked(&self, idx: usize) -> Self::Scalar;
61 | 
62 | /// Replace the `idx`th element of this vector with `data`.
63 | fn replace(&mut self, idx: usize, data: Self::Scalar) -> Self;
64 | 
65 | /// Replace the `idx`th element of this vector with `data`, without
66 | /// asserting that `idx` is within bounds.
67 | unsafe fn replace_unchecked(&mut self, idx: usize, data: Self::Scalar) -> Self;
68 | 
69 | /// Return a scalar equivalent to the product of all elements of this
70 | /// vector.
71 | fn product(&self) -> Self::Scalar;
72 | 
73 | /// Return the result of a scalar reduction over this vector
74 | fn scalar_reduce<T, F>(&self, acc: T, func: F) -> T
75 | where F: FnMut(T, Self::Scalar) -> T;
76 | }
77 | 
78 | /// A type that may be packed into a SIMD vector.
79 | pub trait Packable where Self : Sized + Copy + Debug {
80 | type Vector : Packed + Clone;
81 | const SIZE: usize;
82 | }
83 | 
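// Illustrative sketch (editor's addition): exercises the `Packed` surface
// defined above; `u32s` is the width-dependent alias from the prelude.
#[cfg(test)]
mod packed_api_sketch {
    use crate::prelude::*;

    #[test]
    fn splat_extract_coalesce_reduce() {
        // `splat` broadcasts one scalar to every lane...
        let v = u32s::splat(7);
        for i in 0..u32s::WIDTH {
            assert_eq!(v.extract(i), 7);
        }
        // ...and `coalesce` is its inverse for uniform vectors.
        assert_eq!(v.coalesce(), 7);
        // `scalar_reduce` folds over every lane; counting lanes recovers WIDTH.
        assert_eq!(v.scalar_reduce(0usize, |acc, _| acc + 1), u32s::WIDTH);
    }
}
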
84 | // Vector types which aren't interpreted as SIMD vectors, for systems which
85 | // don't have SIMD support.
86 | 
87 | macro_rules! impl_packed {
88 | ($el:tt, $pvec:tt, $vec:tt, $sz:expr, $width:expr, [$($feat:expr),*], [$($nfeat:expr),*]) => (
89 | 
90 | /// A SIMD vector of this primitive type.
91 | #[allow(non_camel_case_types)]
92 | #[cfg(all($(target_feature = $feat,)* not($(target_feature = $nfeat)*)))]
93 | pub type $pvec = $vec;
94 | 
95 | /// Return a vector of this type with all elements initialized to
96 | /// `data`.
97 | #[inline(always)]
98 | #[cfg(all($(target_feature = $feat,)* not($(target_feature = $nfeat)*)))]
99 | pub fn $pvec(data: $el) -> $pvec {
100 | $vec::splat(data)
101 | }
102 | 
103 | #[cfg(all($(target_feature = $feat,)* not($(target_feature = $nfeat)*)))]
104 | impl Packable for $el {
105 | type Vector = $vec;
106 | const SIZE: usize = $sz;
107 | }
108 | 
109 | impl Packed for $vec {
110 | type Scalar = $el;
111 | 
112 | const WIDTH: usize = $width;
113 | 
114 | #[inline(always)]
115 | fn load(data: &[$el], offset: usize) -> $vec {
116 | $vec::from_slice_unaligned(&data[offset..])
117 | }
118 | 
119 | #[inline(always)]
120 | unsafe fn load_unchecked(data: &[$el], offset: usize) -> $vec {
121 | debug_assert!(data[offset..].len() >= Self::WIDTH);
122 | $vec::from_slice_unaligned_unchecked(&data[offset..])
123 | }
124 | 
125 | #[inline(always)]
126 | fn store(self, data: &mut [$el], offset: usize) {
127 | $vec::write_to_slice_unaligned(self, &mut data[offset..]);
128 | }
129 | 
130 | #[inline(always)]
131 | unsafe fn store_unchecked(self, data: &mut [$el], offset: usize) {
132 | debug_assert!(data[offset..].len() >= Self::WIDTH);
133 | $vec::write_to_slice_unaligned_unchecked(self, &mut data[offset..]);
134 | }
135 | 
136 | #[inline(always)]
137 | fn coalesce(self) -> Self::Scalar {
138 | for i in 1..Self::WIDTH {
139 | debug_assert!(self.extract(i - 1) == self.extract(i));
140 | }
141 | self.extract(0)
142 | }
143 | 
144 | #[inline(always)]
145 | fn extract(&self, idx: usize) -> Self::Scalar {
146 | $vec::extract(*self, idx)
147 | }
148 | 
149 | #[inline(always)]
150 | unsafe fn extract_unchecked(&self, idx: usize) -> Self::Scalar {
151 | debug_assert!(idx < Self::WIDTH);
152 | $vec::extract_unchecked(*self, idx)
153 | }
154 | 
155 | #[inline(always)]
156 | fn replace(&mut self, idx: usize, data: Self::Scalar) -> Self {
157 | $vec::replace(*self, idx, data)
158 | }
159 | 
160 | #[inline(always)]
161 | unsafe fn replace_unchecked(&mut self, idx: usize, data: Self::Scalar) -> Self {
162 | debug_assert!(idx < Self::WIDTH);
163 | $vec::replace_unchecked(*self, idx, data)
164 | }
165 | 
166 | #[inline(always)]
167 | fn splat(data: $el) -> Self {
168 | $vec::splat(data)
169 | }
170 | 
171 | #[inline(always)]
172 | fn default() -> Self {
173 | $vec::splat($el::default())
174 | }
175 | 
176 | #[inline(always)]
177 | fn product(&self) -> Self::Scalar {
178 | let mut acc = 1 as $el;
179 | for i in 0..Self::WIDTH {
180 | acc *= self.extract(i)
181 | }
182 | acc
183 | }
184 | 
185 | #[inline(always)]
186 | fn scalar_reduce<T, F>(&self, mut acc: T, mut func: F) -> T
187 | where F: FnMut(T, Self::Scalar) -> T {
188 | for i in 0..Self::WIDTH {
189 | acc = func(acc, self.extract(i))
190 | }
191 | acc
192 | }
193 | }
194 | );
195 | }
196 | 
197 | -------------------------------------------------------------------------------- /tests/iters.rs: --------------------------------------------------------------------------------
1 | #![feature(test)]
2 | 
3 | extern crate faster;
4 | 
5 | #[cfg(test)]
6 | mod tests {
7 | use faster::*;
8 | 
9 | #[test]
10 | #[cfg(feature = "std")]
11 | fn in_place_mutation() {
12 | let test = |mut vec: Vec<f32>| {
13 | let mut scl = vec.clone();
14 | vec.simd_iter_mut(f32s(0.0))
15 | .simd_for_each(|x| *x /= f32s(2f32));
16 | 
17 | scl.iter_mut()
18 | .for_each(|x| *x /= 2f32);
19 | 
20 | assert_eq!(vec, scl);
21 | };
22 | 
23 | let vec: Vec<f32> = (0..(f32s::WIDTH - 1)).map(|x| x as f32).collect();
24 | test(vec);
25 | 
26 | let vec: Vec<f32> = (0..f32s::WIDTH).map(|x| x as f32).collect();
27 | test(vec);
28 | 
29 | let vec: Vec<f32> = (0..(f32s::WIDTH + 1)).map(|x| x as f32).collect();
30 | test(vec);
31 | }
32 | 
33 | #[test]
34 | fn simd_reduce() {
35 | let vec = [2u32; 129];
36 | let sum = vec.simd_iter(u32s(0u32)).simd_reduce(u32s(0u32), |acc, x| acc + x).sum();
37 | assert_eq!(sum, 2 * 129);
38 | }
39 | }
40 | -------------------------------------------------------------------------------- /tests/kernel.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | #![feature(test,stdsimd)]
9 | 
10 | extern crate faster;
11 | 
12 | #[cfg(test)]
13 | mod tests {
14 | use faster::*;
15 | 
16 | macro_rules! kernel_definite {
17 | ($name:ident, $native_type:ty, $simd_type:ident) => (
18 | 
19 | /// Tests a number of simple kernel computations with integer values.
20 | #[test]
21 | fn $name() {
22 | for n in 0 .. 16 {
23 | 
24 | let vec_of_1 = vec![1 as $native_type; n];
25 | let vec_of_3 = vec![3 as $native_type; n];
26 | let mut out_vec = vec![0 as $native_type; n];
27 | 
28 | // Each element should be (3 - 1) * (3 - 1) == 4, so the sum is n * 4
29 | let sum: $native_type = ((&vec_of_3[..]).simd_iter($simd_type(0)), (&vec_of_1[..]).simd_iter($simd_type(0))).zip()
30 | .simd_map(|(a, b)| (a - b) * (a - b))
31 | .scalar_fill(&mut out_vec)
32 | .iter()
33 | .sum();
34 | 
35 | assert_eq!(sum, (n * 4) as $native_type);
36 | 
37 | // Same as above, but this time we reduce with simd_reduce
38 | let sum: $native_type = ((&vec_of_3[..]).simd_iter($simd_type(0)), (&vec_of_1[..]).simd_iter($simd_type(0))).zip()
39 | .simd_map(|(a, b)| (a - b) * (a - b))
40 | .simd_reduce($simd_type(0), |a, v| a + v)
41 | .sum();
42 | 
43 | assert_eq!(sum, (n * 4) as $native_type);
44 | }
45 | }
46 | )
47 | }
48 | 
49 | kernel_definite!(kernel_i64, i64, i64s);
50 | kernel_definite!(kernel_i32, i32, i32s);
51 | kernel_definite!(kernel_i16, i16, i16s);
52 | kernel_definite!(kernel_i8, i8, i8s);
53 | 
54 | kernel_definite!(kernel_u64, u64, u64s);
55 | kernel_definite!(kernel_u32, u32, u32s);
56 | kernel_definite!(kernel_u16, u16, u16s);
57 | kernel_definite!(kernel_u8, u8, u8s);
58 | 
59 | macro_rules! kernel_relative {
60 | ($name:ident, $native_type:ty, $simd_type:ident) => (
61 | 
62 | /// Tests a number of simple kernel computations with float values.
63 | #[test]
64 | fn $name() {
65 | for n in 0 .. 16 {
66 | let vec_of_1 = vec![1 as $native_type; n];
67 | let vec_of_3 = vec![3 as $native_type; n];
68 | 
69 | // Each element should be (1 - 3) * (1 - 3) == 4, so the sum is n * 4
70 | let sum_scalar: $native_type = vec_of_1.iter()
71 | .zip(vec_of_3.iter())
72 | .map(|(a, b)| (a - b) * (a - b))
73 | .sum();
74 | 
75 | // The same computation again, this time vectorized and reduced with simd_reduce
76 | let sum_simd: $native_type = (vec_of_1.simd_iter($simd_type(0.0 as $native_type)),
77 | vec_of_3.simd_iter($simd_type(0.0 as $native_type)))
78 | .zip()
79 | .simd_map(|(a, b)| (a - b) * (a - b))
80 | .simd_reduce($simd_type(0.0 as $native_type), |a, v| a + v)
81 | .sum();
82 | 
83 | // Ensure both ways produce the same result
84 | assert_eq!(sum_scalar, sum_simd);
85 | 
86 | // Make sure the result is equal to our target within a certain limit.
87 | assert!((sum_simd - (n * 4) as $native_type).abs() < 0.0001);
88 | }
89 | }
90 | )
91 | }
92 | 
93 | kernel_relative!(kernel_f32, f32, f32s);
94 | kernel_relative!(kernel_f64, f64, f64s);
95 | }
96 | -------------------------------------------------------------------------------- /tests/zip.rs: --------------------------------------------------------------------------------
1 | // This file is part of faster, the SIMD library for humans.
2 | // Copyright 2017 Adam Niederer
3 | 
4 | // This Source Code Form is subject to the terms of the Mozilla Public
5 | // License, v. 2.0. If a copy of the MPL was not distributed with this
6 | // file, You can obtain one at http://mozilla.org/MPL/2.0/.
7 | 
8 | #![feature(test)]
9 | #![feature(stdsimd)]
10 | 
11 | extern crate faster;
12 | 
13 | #[cfg(test)]
14 | mod tests {
15 | use faster::*;
16 | 
17 | #[test]
18 | #[cfg(feature = "std")]
19 | fn zipped_stride_iters() {
20 | let matrices = [1i16, 2, 3, 4, 5, 6, 7, 8, 9][..].iter().cycle().take(9 * 100).map(|i| i.clone()).collect::<Vec<i16>>();
21 | let determinants = (&matrices[..]).stride_nine(tuplify!(9, i16s(0))).zip()
22 | .simd_map(|(a, b, c, d, e, f, g, h, i)| {
23 | assert_eq!(a.extract(a.width() - 1), 1);
24 | assert_eq!(b.extract(b.width() - 1), 2);
25 | assert_eq!(c.extract(c.width() - 1), 3);
26 | assert_eq!(d.extract(d.width() - 1), 4);
27 | assert_eq!(e.extract(e.width() - 1), 5);
28 | assert_eq!(f.extract(f.width() - 1), 6);
29 | assert_eq!(g.extract(g.width() - 1), 7);
30 | assert_eq!(h.extract(h.width() - 1), 8);
31 | assert_eq!(i.extract(i.width() - 1), 9);
32 | (a * e * i) + (b * f * g) + (c * d * h) - (c * e * g) - (b * d * i) - (a * f * h)
33 | }).scalar_collect();
34 | assert!(determinants.iter().fold(true, |acc, x| acc && x == &0));
35 | 
36 | let matrices = [1i64, 0, 0, 0, 5, 4, 2, 3, 0][..].iter().cycle().take(9 * 100).map(|i| i.clone()).collect::<Vec<i64>>();
37 | let determinants = (&matrices[..]).stride_nine(tuplify!(9, i64s(0))).zip()
38 | .simd_map(|(a, b, c, d, e, f, g, h, i)| {
39 | (a * e * i) + (b * f * g) + (c * d * h) - (c * e * g) - (b * d * i) - (a * f * h)
40 | }).scalar_collect();
41 | assert!(determinants.iter().fold(true, |acc, x| { acc && x == &-12 }));
42 | }
43 | 
44 | #[test]
45 | #[cfg(feature = "std")]
46 | fn zipped_heterogeneous_iters() {
47 | let to_stride = [1i8, 2, 3, 4, 5, 6, 7, 8][..].iter().cycle().take(512).map(|i| i.clone()).collect::<Vec<i8>>();
48 | let (a, b) = to_stride.stride_two(tuplify!(2, i8s(0)));
49 | let standard_iter_a = vec!(3i8; 256).into_simd_iter(i8s(0));
50 | let standard_iter_b = vec!(7i8; 256).into_simd_iter(i8s(0));
51 | 
52 | let a_times_three = (a, standard_iter_a).zip()
53 | .simd_map(|(s, c)| s * c)
54 | .scalar_collect();
55 | 
56 | let b_times_seven = (b, standard_iter_b).zip()
57 | .simd_map(|(s, c)| s * c)
58 | .scalar_collect();
59 | 
60 | let a_times_three_check = to_stride.chunks(2).map(|c| c[0] * 3);
61 | let b_times_seven_check = to_stride.chunks(2).map(|c| c[1] * 7);
62 | 
63 | assert!(a_times_three_check.zip(a_times_three)
64 | .fold(true, |acc, (a, b)| acc && a == b));
65 | 
66 | assert!(b_times_seven_check.zip(b_times_seven)
67 | .fold(true, |acc, (a, b)| acc && a == b));
68 | }
69 | 
70 | #[test]
71 | fn zip_simd_reduce() {
72 | let vec1 = [2u32; 129];
73 | let vec2 = [3u32; 129];
74 | let result = (vec1.simd_iter(u32s(0u32)), vec2.simd_iter(u32s(0u32))).zip().simd_reduce(u32s(0u32), |acc, (x, y)| acc + x * y).sum();
75 | assert_eq!(result, 2 * 3 * 129);
76 | }
77 | 
78 | }
79 | --------------------------------------------------------------------------------