├── .github ├── codecov.yml └── workflows │ ├── audit.yaml │ ├── benchmarks.yaml │ ├── coverage.yaml │ └── test.yaml ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches └── track.rs ├── benchmarks ├── Cargo.toml └── benches │ ├── dot_product.rs │ ├── life.rs │ ├── simple.rs │ ├── sum.rs │ └── utils.rs ├── examples └── matrix.rs ├── proptest-regressions └── sse.txt └── src ├── iterators.rs ├── lib.rs ├── mask.rs ├── types.rs └── vector.rs /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | comment: 2 | layout: "diff, flags, files" 3 | require_changes: true 4 | 5 | coverage: 6 | status: 7 | project: 8 | default: 9 | informational: true 10 | -------------------------------------------------------------------------------- /.github/workflows/audit.yaml: -------------------------------------------------------------------------------- 1 | name: Security audit 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | schedule: 8 | - cron: '0 0 * * 0' 9 | 10 | jobs: 11 | security_audit: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - uses: actions-rs/audit-check@35b7b53b1e25b55642157ac01b4adceb5b9ebef3 16 | with: 17 | token: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /.github/workflows/benchmarks.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | push: 4 | branches: 5 | - master 6 | # Run once a week to preserve the cache 7 | # (even though it still feels the cache gets lost sometimes?) 8 | # FIXME: Doesn't seem to be working. Using the GH pages thing for now. 9 | #schedule: 10 | # - cron: '0 0 * * 0' 11 | 12 | name: benchmark pull requests 13 | 14 | jobs: 15 | runBenchmark: 16 | name: run benchmark 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Install Rust 25 | uses: actions-rs/toolchain@v1 26 | with: 27 | toolchain: stable 28 | default: true 29 | profile: minimal 30 | 31 | - name: Restore compile cache 32 | uses: Swatinem/rust-cache@v1 33 | 34 | - name: Restore previous benchmark data 35 | uses: actions/cache@v2 36 | with: 37 | path: ./bench-cache 38 | key: ${{ runner.os }}-benchmark 39 | 40 | - name: Run benchmarks 41 | # We choose just the tracking ones. There's a whole fleet that we check 42 | # that compile, but they are too heavy both to run in CI and to show in 43 | # the PRs. And they mostly compare us to other methods. 44 | # 45 | # Provide the bencher output, as the following tool knows how to read that. 46 | run: cargo bench --bench track -- --output-format bencher | grep -v 'Gnuplot not found' | tee benches.out 47 | 48 | - name: Compare benchmarks 49 | uses: rhysd/github-action-benchmark@4eed2c2f4cd0d374720c4b913f79faa8aafcfa6b 50 | with: 51 | name: Track benchmarks 52 | tool: cargo 53 | output-file-path: benches.out 54 | github-token: ${{ secrets.GITHUB_TOKEN }} 55 | auto-push: true 56 | alert-threshold: '150%' 57 | comment-on-alert: true 58 | comment-always: true 59 | # We don't want that to fail. Both our benchmarks and the CI are a 60 | # bit noisy and we have quite a few measurements, so the chance of 61 | # one failing at random is quite high. It's still nice to have it 62 | # measured and available as a comment. 
63 | fail-on-alert: false 64 | #external-data-json-path: ./bench-cache/benchmark-data.json 65 | # Because it doesn't put it into the PR, it puts it into the commit :-| 66 | alert-comment-cc-users: '@vorner' 67 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yaml: -------------------------------------------------------------------------------- 1 | name: Test coverage 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | RUST_BACKTRACE: full 12 | 13 | jobs: 14 | coverage: 15 | name: Coverage 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v2 20 | 21 | - name: Install Rust 22 | uses: actions-rs/toolchain@v1 23 | with: 24 | toolchain: nightly 25 | profile: minimal 26 | default: true 27 | 28 | - name: Restore cache 29 | uses: Swatinem/rust-cache@v1 30 | 31 | - name: Run cargo-tarpaulin 32 | uses: actions-rs/tarpaulin@v0.1 33 | with: 34 | args: '--all-features --run-types Doctests,Tests' 35 | timeout: 120 36 | 37 | - name: Upload to codecov.io 38 | uses: codecov/codecov-action@5a8bb4701eca7ba3673f21664b887f652c58d0a3 39 | with: 40 | token: ${{ secrets.CODECOV_TOKEN }} 41 | 42 | - name: Archive code coverage results 43 | uses: actions/upload-artifact@v2 44 | with: 45 | name: code-coverage-report 46 | path: cobertura.xml 47 | retention-days: 30 48 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | 6 | env: 7 | CARGO_TERM_COLOR: always 8 | RUST_BACKTRACE: full 9 | 10 | jobs: 11 | test: 12 | name: Build & test 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: 17 | - ubuntu-latest 18 | - macos-latest 19 | - windows-latest 20 | rust: 21 | - stable 22 | - beta 23 | - nightly 24 | 25 | runs-on: ${{ matrix.os }} 26 | 27 | steps: 28 | - name: checkout 29 | uses: actions/checkout@v2 30 | 31 | - name: Install Rust 32 | uses: actions-rs/toolchain@v1 33 | with: 34 | toolchain: ${{ matrix.rust }} 35 | default: true 36 | profile: minimal 37 | 38 | - name: Restore cache 39 | uses: Swatinem/rust-cache@v1 40 | 41 | - name: Build & test 42 | env: 43 | RUST_VERSION: ${{ matrix.rust }} 44 | OS: ${{ matrix.os }} 45 | RUSTFLAGS: -D warnings 46 | run: cargo test --all-features 47 | 48 | rustfmt: 49 | name: Check formatting 50 | runs-on: ubuntu-latest 51 | steps: 52 | - name: checkout 53 | uses: actions/checkout@v2 54 | 55 | - name: Install Rust 56 | uses: actions-rs/toolchain@v1 57 | with: 58 | profile: minimal 59 | toolchain: stable 60 | default: true 61 | components: rustfmt 62 | 63 | - run: cargo fmt --all -- --check 64 | 65 | links: 66 | name: Check documentation links 67 | runs-on: ubuntu-latest 68 | steps: 69 | - name: checkout 70 | uses: actions/checkout@v2 71 | 72 | - name: Install Rust 73 | uses: actions-rs/toolchain@v1 74 | with: 75 | toolchain: stable 76 | default: true 77 | 78 | - name: Restore cache 79 | uses: Swatinem/rust-cache@v1 80 | 81 | - name: Check links 82 | run: cargo rustdoc --all-features -- -D warnings 83 | 84 | clippy: 85 | name: Clippy lints 86 | runs-on: ubuntu-latest 87 | steps: 88 | - name: Checkout repository 89 | uses: actions/checkout@v2 90 | 91 | - name: Install Rust 92 | uses: actions-rs/toolchain@v1 93 | with: 94 | toolchain: stable 95 | profile: minimal 96 | default: true 97 | components: clippy 98 | 99 | - name: 
Restore cache 100 | uses: Swatinem/rust-cache@v1 101 | 102 | - name: Run clippy linter 103 | run: cargo clippy --all --all-features --tests -- -D clippy::all -D warnings 104 | 105 | # miri: 106 | # name: Miri checks 107 | # runs-on: ubuntu-latest 108 | # steps: 109 | # - name: Checkout repository 110 | # uses: actions/checkout@v2 111 | # 112 | # - name: Install Rust 113 | # uses: actions-rs/toolchain@v1 114 | # with: 115 | # toolchain: nightly 116 | # profile: minimal 117 | # default: true 118 | # components: "miri" 119 | # 120 | # - name: Restore cache 121 | # uses: Swatinem/rust-cache@v1 122 | # 123 | # - name: Run miri 124 | # env: 125 | # PROPTEST_CASES: "10" 126 | # MIRIFLAGS: "-Zmiri-disable-isolation" 127 | # run: cargo miri test --all-features 128 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | tags 4 | perf.data 5 | perf.data.old 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | cache: cargo 3 | rust: 4 | - stable 5 | - beta 6 | - nightly 7 | os: 8 | - windows 9 | - linux 10 | - osx 11 | 12 | before_script: 13 | - | 14 | (travis_wait rustup component add rustfmt-preview || true) && 15 | (travis_wait rustup component add clippy-preview || true) 16 | 17 | script: 18 | - | 19 | ./ci-check.sh 20 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.2.1 2 | 3 | * `From` implementations for relevant arrays. 4 | * `mul_add` support. 5 | 6 | # 0.2.0 7 | 8 | * Refactorings to use const generics instead of `generic_array`. 9 | * Few more operators (eg. `vector *= scalar`). 10 | 11 | # 0.1.1 12 | 13 | * Free-standing versions of `vectorize` and `vectorize_pad`, to have a place to 14 | put a turbofish. 15 | 16 | # 0.1.0 17 | 18 | * Initial release. 19 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "slipstream" 3 | version = "0.2.1" 4 | authors = ["Michal 'vorner' Vaner "] 5 | edition = "2018" 6 | description = "SIMD library usable by the masses" 7 | repository = "https://github.com/vorner/splitstream" 8 | readme = "README.md" 9 | keywords = ["simd", "performance"] 10 | categories = ["hardware-support"] 11 | license = "Apache-2.0 OR MIT" 12 | autobenches = false 13 | 14 | [badges] 15 | travis-ci = { repository = "vorner/arc-swap" } 16 | maintenance = { status = "actively-developed" } 17 | 18 | [workspace] 19 | members = ["benchmarks"] 20 | 21 | [dependencies] 22 | num-traits = "0.2" 23 | 24 | [dev-dependencies] 25 | criterion = "~0.3" 26 | multiversion = "~0.6" 27 | proptest = "~0.10" 28 | rand = "~0.8" 29 | 30 | [profile.release] 31 | debug = 2 32 | 33 | [profile.test] 34 | # Some tests are slow to run. Even slower than it takes to compile them properly. 
35 | opt-level = 1 36 | 37 | [[bench]] 38 | name = "track" 39 | harness = false 40 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 tokio-jsonrpc developers 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Slipstream 2 | 3 | [![Actions Status](https://github.com/vorner/slipstream/workflows/test/badge.svg)](https://github.com/vorner/slipstream/actions) 4 | [![codecov](https://codecov.io/gh/vorner/slipstream/branch/master/graph/badge.svg?token=RG02T39PJZ)](https://codecov.io/gh/vorner/slipstream) 5 | [![docs](https://docs.rs/slipstream/badge.svg)](https://docs.rs/slipstream) 6 | 7 | 8 | This library helps writing code in a way that incentives the compiler to 9 | optimize the results better (without really doing anything itself). 10 | 11 | Modern compilers, including `rustc`, are able to come up with impressive ways to 12 | speed up the resulting code, using techniques like loop unrolling and 13 | autovectorization, routinely outperforming what one would hand-craft. 
14 | Nevertheless, each optimisation has some assumptions that must be proven to hold
15 | before it can be applied.
16 | 
17 | This library offers „vector“ types, like `u16x8`, which act in a very similar
18 | way to small fixed-size arrays (in this case it would be `[u16; 8]`), but with
19 | arithmetic defined for them. They also enforce alignment of the whole vectors.
20 | Therefore, one can write the algorithm in a way that works on these groups of
21 | data and make it easier for the compiler to prove the assumptions. This can
22 | result in multiple-factor speedups by giving the compiler these proofs „for
23 | free“ and allowing it to apply aggressive optimizations.
24 | 
25 | The API is inspired by the [`packed_simd`] and [`faster`] crates, but as it
26 | relies on the autovectorizer instead of using explicit SIMD instructions, it
27 | works on stable Rust and allows speedups even on platforms that don't have
28 | explicit SIMD support in the Rust standard library (or no SIMD support at
29 | all).
30 | 
31 | The downside is that the optimizations are not *guaranteed*. While it oftentimes
32 | produces results competitive with or even better than hand-crafted vectorized
33 | code, a small change to the surrounding code can also lead to much worse results.
34 | You're advised to apply this only to tight loops with enough data to crunch, and
35 | to measure the performance.
36 | 
37 | It goes well together with function multiversioning; see, for example, the
38 | [`multiversion`] crate.
39 | 
40 | More details can be found in the [documentation], including tips for effective
41 | use and what to try if the performance isn't as good as expected.
42 | 
43 | ## Example
44 | 
45 | As a very simple example, imagine that the crux of the application's performance
46 | is summing a huge array of floats and we have this code:
47 | 
48 | ```rust
49 | fn compute(d: &[f32]) -> f32 {
50 |     d.iter().sum()
51 | }
52 | ```
53 | 
54 | Now, one could rewrite it to something like this, using manual vectorization:
55 | 
56 | ```rust
57 | use core::arch::x86_64 as arch;
58 | 
59 | unsafe fn compute_sse(d: &[f32]) -> f32 {
60 |     let mut result = arch::_mm_setzero_ps();
61 |     let iter = d.chunks_exact(4);
62 |     let remainder = iter.remainder().iter().sum::<f32>();
63 |     for v in iter {
64 |         result = arch::_mm_add_ps(result, arch::_mm_loadu_ps(v.as_ptr()));
65 |     }
66 | 
67 |     let result: [f32; 4] = std::mem::transmute(result);
68 |     result.iter().sum::<f32>() + remainder
69 | }
70 | ```
71 | 
72 | And while this does result in a significant speedup, it's also much less
73 | readable, it forces `unsafe` into the application logic, and it is not portable
74 | (it won't run on anything that's not x86_64 and it won't take advantage of newer
75 | and better vector instructions even there). These downsides usually make it not
76 | worth pursuing for more complex algorithms.
77 | 
78 | Using `slipstream` (with `use slipstream::prelude::*;` in scope), one can also write this:
79 | 
80 | ```rust
81 | fn compute_slipstream(d: &[f32]) -> f32 {
82 |     // Will split the data into vectors of 4 lanes, padding the last one with
83 |     // the lanes from the provided parameter.
84 |     d.vectorize_pad(f32x4::default())
85 |         // Sum the vectors into a final vector
86 |         .sum::<f32x4>()
87 |         // Sum the lanes of the vectors together.
88 |         .horizontal_sum()
89 | }
90 | ```
91 | 
92 | This is still longer and more complex than the original, but seems much more
93 | manageable than the manual version. It's also portable and might provide some
94 | speedup on platforms that don't have any vector instructions.
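The example above also combines naturally with the [`multiversion`] crate mentioned earlier. The snippet below is only a sketch, not an excerpt from the crate's documentation: it borrows the `multiversion` 0.6 attribute syntax that this repository's own benchmarks use, the function name `compute_multiversioned` is made up, and the target strings are merely examples.

```rust
use multiversion::multiversion;
use slipstream::prelude::*;

#[multiversion]
#[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")]
#[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")]
#[clone(target = "[arm|aarch64]+neon")]
fn compute_multiversioned(d: &[f32]) -> f32 {
    // Same body as above: each clone is compiled with its own target features
    // enabled, and the generated dispatcher picks the best supported one.
    d.vectorize_pad(f32x4::default())
        .sum::<f32x4>()
        .horizontal_sum()
}
```

Calling `compute_multiversioned` detects the available CPU features and dispatches to the matching clone; the macro also generates a `*_default_version` variant (used throughout the benchmarks in this repository) that sticks to the compile-time target features.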
95 | With annotations like those in the sketch above, one is also able to generate
96 | multiple versions of the function and dispatch, at runtime, the one that takes
97 | advantage of the newest and shiniest instructions the CPU supports.
98 | 
99 | Corresponding benchmarks on an i5-8265U suggest that this version comes close to
100 | the manual one. Indeed, there are similar variants that are even faster.
101 | 
102 | ```
103 | test sum::basic                 ... bench:  11,707,693 ns/iter (+/- 261,428)
104 | test sum::manual_sse_convert    ... bench:   3,000,906 ns/iter (+/- 535,041)
105 | test sum::vectorize_pad_default ... bench:   3,141,834 ns/iter (+/- 81,376)
106 | ```
107 | 
108 | Note: to re-run the benchmarks as above, use `type V = f32x4` in
109 | `benchmarks/benches/utils.rs`.
110 | 
111 | Warning: floating-point addition is not associative, so the vectorized versions
112 | (manual or not) may produce slightly different results than the scalar sum.
113 | 
114 | ## Help wanted
115 | 
116 | It is an open source library and help in developing it is welcome. There are
117 | some areas where your contribution would be especially appreciated:
118 | 
119 | * Feedback about the API, the documentation and generally how usable it is.
120 | * Implementing missing APIs: While a lot is covered already, there are areas
121 |   that are still missing. I know of:
122 |   - Some way to convert between different sizes of the base type (e.g. `f32x4 ->
123 |     f64x4`).
124 |   - Various methods on the vector types that are present on the base types ‒
125 |     trigonometric functions on floats, rounding, absolute values, number of
126 |     set/unset bits on unsigned integers...
127 |   - Vector-scalar multiplications. It is currently possible to do e.g.
128 |     `f32x2::splat(-1.0) * f32x2::new([1.0, 2.0])`, but it would be more
129 |     comfortable if it could be written as just `-1.0 * f32x2::new([1.0, 2.0])`.
130 | * Use cases and benchmarks: if you can come up with a simple, well-vectorizable
131 |   problem and submit it as a benchmark, it helps keep and improve the
132 |   performance of the library. Both cases where the library performs well and
133 |   where it *doesn't* are good to have (the latter could be considered bugs of a
134 |   kind). Optimally, such a benchmark contains a naïve implementation (without
135 |   this library), an implementation using this library (possibly in multiple
136 |   variations) and hand-written vectorized code with the platform-specific
137 |   intrinsics. But if any of these are missing (for example because it would be
138 |   too much work to write the manually vectorized code), it's still better than
139 |   nothing.
140 | * Improving performance: While it is the compiler that makes the program go
141 |   fast, how well it does that job depends highly on whether it can „see
142 |   through“ the code. If you can tweak the implementation of some method in a
143 |   way that's more understandable and transparent to the compiler, it is great.
144 |   Most of the code was written as fast as possible and only some tweaking was
145 |   done for now. For example, the `vectorize_pad` method seems surprisingly slow;
146 |   ideally it would produce code of comparable speed to `vectorize`.
147 | * Dealing with unsafe: In many places, the library uses `unsafe` code. This was
148 |   oftentimes written that way because of performance ‒ for example, initializing
149 |   the `GenericArray` from an iterator prevented a lot of optimisations and led
150 |   to significantly inferior performance.
Optimally, each such `unsafe` code 151 | would get replaced by safe code, or would get a comment explaining/proving 152 | that it is indeed safe. 153 | 154 | If you want to work on anything bigger, it's a good idea to open an issue on the 155 | repository to both discuss it first and to reserve the task. 156 | 157 | ## License 158 | 159 | Licensed under either of 160 | 161 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 162 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 163 | 164 | at your option. 165 | 166 | ### Contribution 167 | 168 | Unless you explicitly state otherwise, any contribution intentionally 169 | submitted for inclusion in the work by you, as defined in the Apache-2.0 170 | license, shall be dual licensed as above, without any additional terms 171 | or conditions. 172 | 173 | [`packed_simd`]: https://crates.io/crates/packed_simd 174 | [`faster`]: https://crates.io/crates/faster 175 | [`multiversion`]: https://crates.io/crates/multiversion 176 | [documentation]: https://docs.rs/slipstream 177 | -------------------------------------------------------------------------------- /benches/track.rs: -------------------------------------------------------------------------------- 1 | use std::iter; 2 | 3 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 4 | use multiversion::multiversion; 5 | 6 | use slipstream::prelude::*; 7 | 8 | type V = f32x8; 9 | 10 | const SIZE: usize = 4096 * 100; 11 | 12 | #[multiversion] 13 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 14 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 15 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 16 | #[clone(target = "[arm|aarch64]+neon")] 17 | fn sum(data: &[V]) -> f32 { 18 | data.iter().copied().sum::().horizontal_sum() 19 | } 20 | 21 | fn sum_scalar(data: &[f32]) -> f32 { 22 | data.iter().copied().sum() 23 | } 24 | 25 | #[multiversion] 26 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 27 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 28 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 29 | fn dot_product(l: &[f32], r: &[f32]) -> f32 { 30 | (l, r) 31 | .vectorize() 32 | .map(|(l, r): (V, V)| l * r) 33 | .sum::() 34 | .horizontal_sum() 35 | } 36 | 37 | fn dot_product_scalar(l: &[f32], r: &[f32]) -> f32 { 38 | l.iter().zip(r).map(|(l, r)| l * r).sum() 39 | } 40 | 41 | fn benchmark(c: &mut Criterion) { 42 | let vecs = iter::repeat_with(rand::random) 43 | .map(|v: [f32; V::LANES]| V::new(&v)) 44 | .take(SIZE / V::LANES) 45 | .collect::>(); 46 | 47 | let scalars_a = iter::repeat_with(rand::random) 48 | .take(SIZE) 49 | .collect::>(); 50 | 51 | let scalars_b = iter::repeat_with(rand::random) 52 | .take(SIZE) 53 | .collect::>(); 54 | 55 | c.bench_function("sum_vec", |b| { 56 | b.iter(|| black_box(sum(&vecs))); 57 | }); 58 | 59 | c.bench_function("sum_scalar", |b| { 60 | b.iter(|| black_box(sum_scalar(&scalars_a))); 61 | }); 62 | 63 | c.bench_function("dot_product_vec", |b| { 64 | b.iter(|| black_box(dot_product(&scalars_a, &scalars_b))); 65 | }); 66 | 67 | c.bench_function("dot_product_scalar", |b| { 68 | b.iter(|| black_box(dot_product_scalar(&scalars_a, &scalars_b))); 69 | }); 70 | } 71 | 72 | criterion_group!(benches, benchmark); 73 | criterion_main!(benches); 74 | -------------------------------------------------------------------------------- /benchmarks/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "benchmarks" 3 | version = "0.1.0" 4 | authors = ["Michal 'vorner' Vaner "] 5 | edition = "2018" 6 | publish = false 7 | autobenches = false 8 | 9 | [dependencies] 10 | 11 | [dev-dependencies] 12 | slipstream = { path = ".." } 13 | multiversion = "~0.6" 14 | once_cell = "~1" 15 | proptest = "~0.10" 16 | rand = "~0.8" 17 | packed_simd_2 = "~0.3" 18 | 19 | [[bench]] 20 | name = "simple" 21 | path = "benches/simple.rs" 22 | -------------------------------------------------------------------------------- /benchmarks/benches/dot_product.rs: -------------------------------------------------------------------------------- 1 | use multiversion::multiversion; 2 | use test::Bencher; 3 | 4 | use crate::mv; 5 | use crate::utils::{gen_data, gen_vecs, V}; 6 | use slipstream::prelude::*; 7 | 8 | mv! { 9 | fn vectorized_idx(l: &[V], r: &[V]) -> f32 { 10 | assert_eq!(l.len(), r.len()); 11 | let mut result = V::default(); 12 | for i in 0..l.len() { 13 | result += l[i] * r[i]; 14 | } 15 | 16 | result.horizontal_sum() 17 | } 18 | 19 | fn vectorized(l: &[V], r: &[V]) -> f32 { 20 | (l, r).vectorize() 21 | .map(|(l, r)| l * r) 22 | .sum::() 23 | .horizontal_sum() 24 | } 25 | 26 | fn vectorize_zip(l: &[f32], r: &[f32]) -> f32 { 27 | let l = l.vectorize(); 28 | let r = r.vectorize(); 29 | l.zip(r) 30 | .map(|(l, r): (V, V)| l * r) 31 | .sum::() 32 | .horizontal_sum() 33 | } 34 | 35 | fn vectorize_tuple(l: &[f32], r: &[f32]) -> f32 { 36 | (l, r).vectorize() 37 | .map(|(l, r): (V, V)| l * r) 38 | .sum::() 39 | .horizontal_sum() 40 | } 41 | 42 | fn vectorize_tuple_for(l: &[f32], r: &[f32]) -> f32 { 43 | let mut result = V::default(); 44 | for (l, r) in (l, r).vectorize() { 45 | let (l, r): (V, V) = (l, r); 46 | result += l * r; 47 | } 48 | result.horizontal_sum() 49 | } 50 | 51 | fn packed(l: &[f32], r: &[f32]) -> f32 { 52 | type V = packed_simd_2::f32x16; 53 | let l = l.chunks_exact(16); 54 | let r = r.chunks_exact(16); 55 | let mut result = V::default(); 56 | for (l, r) in l.zip(r) { 57 | let l = V::from_slice_unaligned(l); 58 | let r = V::from_slice_unaligned(r); 59 | result = l.mul_adde(r, result); 60 | } 61 | result.sum() 62 | } 63 | } 64 | 65 | #[bench] 66 | fn simple(b: &mut Bencher) { 67 | let (l, r) = gen_data(); 68 | 69 | b.iter(|| { 70 | let result: f32 = l.iter().zip(r.iter()).map(|(&l, &r)| l * r).sum(); 71 | test::black_box(result); 72 | }); 73 | } 74 | 75 | #[bench] 76 | fn vectorized_default(b: &mut Bencher) { 77 | let (l, r) = gen_vecs(); 78 | b.iter(|| { 79 | test::black_box(vectorized_default_version(l, r)); 80 | }); 81 | } 82 | 83 | #[bench] 84 | fn vectorized_detect(b: &mut Bencher) { 85 | let (l, r) = gen_vecs(); 86 | b.iter(|| { 87 | test::black_box(vectorized(l, r)); 88 | }); 89 | } 90 | 91 | #[bench] 92 | fn vectorized_idx_default(b: &mut Bencher) { 93 | let (l, r) = gen_vecs(); 94 | b.iter(|| { 95 | test::black_box(vectorized_idx_default_version(l, r)); 96 | }); 97 | } 98 | 99 | #[bench] 100 | fn vectorized_idx_detect(b: &mut Bencher) { 101 | let (l, r) = gen_vecs(); 102 | b.iter(|| { 103 | test::black_box(vectorized_idx(l, r)); 104 | }); 105 | } 106 | 107 | #[bench] 108 | fn vectorize_zip_default(b: &mut Bencher) { 109 | let (l, r) = gen_data(); 110 | b.iter(|| { 111 | test::black_box(vectorize_zip_default_version(l, r)); 112 | }); 113 | } 114 | 115 | #[bench] 116 | fn vectorize_zip_detect(b: &mut Bencher) { 117 | let (l, r) = gen_data(); 118 | b.iter(|| { 119 | 
test::black_box(vectorize_zip(l, r)); 120 | }); 121 | } 122 | 123 | #[bench] 124 | fn vectorize_tuple_default(b: &mut Bencher) { 125 | let (l, r) = gen_data(); 126 | b.iter(|| { 127 | test::black_box(vectorize_tuple_default_version(l, r)); 128 | }); 129 | } 130 | 131 | #[bench] 132 | fn vectorize_tuple_detect(b: &mut Bencher) { 133 | let (l, r) = gen_data(); 134 | b.iter(|| { 135 | test::black_box(vectorize_tuple(l, r)); 136 | }); 137 | } 138 | 139 | #[bench] 140 | fn packed_default(b: &mut Bencher) { 141 | let (l, r) = gen_data(); 142 | b.iter(|| { 143 | test::black_box(packed_default_version(l, r)); 144 | }); 145 | } 146 | 147 | #[bench] 148 | fn packed_detect(b: &mut Bencher) { 149 | let (l, r) = gen_data(); 150 | b.iter(|| { 151 | test::black_box(packed(l, r)); 152 | }); 153 | } 154 | 155 | #[bench] 156 | fn vectorize_tuple_for_default(b: &mut Bencher) { 157 | let (l, r) = gen_data(); 158 | b.iter(|| { 159 | test::black_box(vectorize_tuple_for_default_version(l, r)); 160 | }); 161 | } 162 | 163 | #[bench] 164 | fn vectorize_tuple_for_detect(b: &mut Bencher) { 165 | let (l, r) = gen_data(); 166 | b.iter(|| { 167 | test::black_box(vectorize_tuple_for(l, r)); 168 | }); 169 | } 170 | 171 | #[bench] 172 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 173 | fn manual_sse(b: &mut Bencher) { 174 | use std::mem; 175 | 176 | use crate::utils::arch::{self, __m128}; 177 | use crate::utils::gen_arch_vecs; 178 | 179 | let (l, r) = gen_arch_vecs(); 180 | 181 | #[target_feature(enable = "fma", enable = "sse")] 182 | unsafe fn inner(l: &[__m128], r: &[__m128]) -> f32 { 183 | let mut result = arch::_mm_setzero_ps(); 184 | for (&l, &r) in l.iter().zip(r.iter()) { 185 | result = arch::_mm_add_ps(result, arch::_mm_mul_ps(l, r)); 186 | } 187 | 188 | let result: [f32; 4] = mem::transmute(result); 189 | result.iter().sum() 190 | } 191 | 192 | b.iter(|| test::black_box(unsafe { inner(l, r) })); 193 | } 194 | 195 | #[bench] 196 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 197 | fn manual_sse_fmadd(b: &mut Bencher) { 198 | use std::mem; 199 | 200 | use crate::utils::arch::{self, __m128}; 201 | use crate::utils::gen_arch_vecs; 202 | 203 | let (l, r) = gen_arch_vecs(); 204 | 205 | #[target_feature(enable = "fma", enable = "sse")] 206 | unsafe fn inner(l: &[__m128], r: &[__m128]) -> f32 { 207 | let mut result = arch::_mm_setzero_ps(); 208 | for (&l, &r) in l.iter().zip(r.iter()) { 209 | result = arch::_mm_fmadd_ps(l, r, result); 210 | } 211 | 212 | let result: [f32; 4] = mem::transmute(result); 213 | result.iter().sum() 214 | } 215 | 216 | if is_x86_feature_detected!("fma") { 217 | b.iter(|| unsafe { test::black_box(inner(l, r)) }); 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /benchmarks/benches/life.rs: -------------------------------------------------------------------------------- 1 | use std::iter; 2 | use std::mem; 3 | 4 | use multiversion::multiversion; 5 | use once_cell::sync::Lazy; 6 | use test::Bencher; 7 | 8 | use crate::mv; 9 | use slipstream::prelude::*; 10 | 11 | type Bools = bx32; 12 | type Counts = u8x32; 13 | 14 | #[derive(Clone, Debug, PartialEq)] 15 | struct Life { 16 | edge: usize, 17 | cells: Vec, 18 | next: Vec, 19 | } 20 | 21 | const NEIGHS: [(isize, isize); 8] = [ 22 | (-1, -1), 23 | (-1, 0), 24 | (-1, 1), 25 | (0, 1), 26 | (1, 1), 27 | (1, 0), 28 | (1, -1), 29 | (0, -1), 30 | ]; 31 | 32 | const SIZE: usize = 1026; 33 | 34 | impl Life { 35 | fn at(&self, x: usize, y: usize) -> usize { 36 | y * self.edge + x 37 
| } 38 | fn set(&mut self, x: usize, y: usize, val: bool) { 39 | let idx = self.at(x, y); 40 | self.cells[idx] = val; 41 | } 42 | fn set_next(&mut self, x: usize, y: usize, val: bool) { 43 | let idx = self.at(x, y); 44 | self.next[idx] = val; 45 | } 46 | fn get(&self, x: usize, y: usize) -> bool { 47 | self.cells[self.at(x, y)] 48 | } 49 | 50 | /// Place a frame of always dead cells which won't participate in the game. 51 | /// 52 | /// These just solve the issue what to do with edges of the game plan. 53 | fn frame(&mut self) { 54 | for i in 0..self.edge { 55 | self.set(0, i, false); 56 | self.set(self.edge - 1, i, false); 57 | self.set(i, 0, false); 58 | self.set(i, self.edge - 1, false); 59 | } 60 | } 61 | fn gen() -> Self { 62 | fn inner() -> Life { 63 | let cells = iter::repeat_with(rand::random).take(SIZE * SIZE).collect(); 64 | let mut me = Life { 65 | edge: SIZE, 66 | cells, 67 | next: Vec::new(), 68 | }; 69 | me.frame(); 70 | me.next = me.cells.clone(); 71 | me 72 | } 73 | 74 | static CACHED: Lazy = Lazy::new(inner); 75 | CACHED.clone() 76 | } 77 | 78 | fn step(&mut self) { 79 | for y in 1..self.edge - 1 { 80 | for x in 1..self.edge - 1 { 81 | let cnt = NEIGHS 82 | .iter() 83 | .filter(|&&(xd, yd)| { 84 | self.get(((x as isize) + xd) as usize, ((y as isize) + yd) as usize) 85 | }) 86 | .count(); 87 | let alive = match cnt { 88 | 2 if self.get(x, y) => true, 89 | 3 => true, 90 | _ => false, 91 | }; 92 | self.set_next(x, y, alive); 93 | } 94 | } 95 | mem::swap(&mut self.cells, &mut self.next); 96 | } 97 | 98 | mv! { 99 | fn step_vectorized(&mut self) { 100 | assert_eq!(mem::align_of::(), mem::align_of::()); 101 | assert_eq!(mem::size_of::(), mem::size_of::()); 102 | let twos = Counts::splat(2); 103 | let threes = Counts::splat(3); 104 | let dead = Bools::default(); 105 | let alive = Bools::splat(true); 106 | 107 | let mut neighs: [_; 8] = Default::default(); 108 | for y in 1..self.edge - 1 { 109 | let cells = &self.cells; 110 | for (ndest, &(xd, yd)) in neighs.iter_mut().zip(&NEIGHS) { 111 | let idx = self.at((1 + xd) as usize, ((y as isize) + yd) as usize); 112 | *ndest = &cells[idx..idx + self.edge - 2]; 113 | } 114 | 115 | let center_idx = self.at(1, y); 116 | let center = &cells[center_idx..center_idx + self.edge - 2]; 117 | let dst = &mut self.next[center_idx..center_idx + self.edge - 2]; 118 | 119 | let iter = slipstream::vectorize::<([Bools; 8], Bools, _), _>((neighs, center, dst)); 120 | 121 | for (neighs, center, mut dst) in iter { 122 | let mut live_neigh_cnt = Counts::default(); 123 | // FIXME: Using sum here unfortunately prevents inlining, which leads to 124 | // performance drop *and* barrier across which we don't get the AVX 125 | // instructions. So manually expanding the loop. 126 | for n in &neighs { 127 | // TODO: We want some safe transforms in here. 
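                        // Why the transmute below should be sound: the `Bools` vector is
                        // loaded from the plan's `bool` cells, Rust stores a `bool` as one
                        // byte holding 0 or 1, and the asserts at the top of this function
                        // check that `Bools` and `Counts` match in size and alignment, so
                        // summing the reinterpreted lanes counts the live neighbours.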
128 | live_neigh_cnt += unsafe { mem::transmute::<_, Counts>(*n) }; 129 | } 130 | let survive = live_neigh_cnt.eq(twos); 131 | *dst = dead.blend(alive, survive) & center; 132 | let born = live_neigh_cnt.eq(threes); 133 | *dst |= dead.blend(alive, born); 134 | } 135 | } 136 | mem::swap(&mut self.cells, &mut self.next); 137 | } 138 | } 139 | } 140 | 141 | #[bench] 142 | fn basic(b: &mut Bencher) { 143 | let mut life = Life::gen(); 144 | 145 | b.iter(|| { 146 | life.step(); 147 | }); 148 | } 149 | 150 | #[bench] 151 | fn vectorize_detect(b: &mut Bencher) { 152 | let mut life = Life::gen(); 153 | 154 | b.iter(|| { 155 | life.step_vectorized(); 156 | }); 157 | } 158 | 159 | #[bench] 160 | fn vectorize_default(b: &mut Bencher) { 161 | let mut life = Life::gen(); 162 | 163 | b.iter(|| { 164 | life.step_vectorized_default_version(); 165 | }); 166 | } 167 | 168 | #[test] 169 | fn same_results() { 170 | let mut l1 = Life::gen(); 171 | let mut l2 = l1.clone(); 172 | 173 | for i in 0..100 { 174 | assert_eq!(l1, l2, "Lifes differ in step {}", i); 175 | l1.step(); 176 | l2.step_vectorized(); 177 | } 178 | } 179 | 180 | // TODO: Anyone wants to volunteer and write a manually-vectorized version? 181 | -------------------------------------------------------------------------------- /benchmarks/benches/simple.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | // These two are needed when benchmarking for arm 3 | #![feature(aarch64_target_feature)] 4 | #![feature(arm_target_feature)] 5 | #![feature(stdsimd)] 6 | // The lint comes from somewhere inside macros, no idea why :-( 7 | #![allow(unused_braces)] 8 | 9 | extern crate test; 10 | 11 | mod utils; 12 | 13 | mod dot_product; 14 | mod life; 15 | mod sum; 16 | -------------------------------------------------------------------------------- /benchmarks/benches/sum.rs: -------------------------------------------------------------------------------- 1 | use multiversion::multiversion; 2 | use test::Bencher; 3 | 4 | use slipstream::prelude::*; 5 | 6 | use crate::mv; 7 | use crate::utils::{gen_data, gen_vecs, V}; 8 | 9 | #[bench] 10 | fn basic(b: &mut Bencher) { 11 | let (data, _) = gen_data(); 12 | 13 | b.iter(|| { 14 | test::black_box(data.iter().sum::()); 15 | }) 16 | } 17 | 18 | mv! { 19 | fn vectorized(data: &[V]) -> f32 { 20 | let mut result = V::default(); 21 | 22 | for v in data { 23 | result += *v; 24 | } 25 | 26 | result.iter().sum() 27 | } 28 | 29 | fn vectorized_rev(data: &[V]) -> f32 { 30 | let mut result = V::default(); 31 | 32 | for v in data { 33 | result += *v; 34 | } 35 | 36 | // Any idea why this rev makes it run faster? 
37 | result.iter().rev().sum() 38 | } 39 | 40 | fn vectorized_horizontal(data: &[V]) -> f32 { 41 | let mut result = V::default(); 42 | 43 | for v in data { 44 | result += *v; 45 | } 46 | 47 | result.horizontal_sum() 48 | } 49 | 50 | fn vectorized_tree(data: &[V]) -> f32 { 51 | let mut result = V::default(); 52 | 53 | for v in data { 54 | result += *v; 55 | } 56 | 57 | #[inline] 58 | fn sum_up(d: &[f32]) -> f32 { 59 | if d.len() == 1 { 60 | d[0] 61 | } else { 62 | let mid = d.len() / 2; 63 | sum_up(&d[..mid]) + sum_up(&d[mid..]) 64 | } 65 | } 66 | 67 | sum_up(&result[..]) 68 | } 69 | 70 | fn vectorize(data: &[f32]) -> f32 { 71 | let mut result = V::default(); 72 | 73 | for v in data.vectorize() { 74 | result += v; 75 | } 76 | 77 | result.iter().rev().sum() 78 | } 79 | 80 | fn vectorize_horizontal(data: &[f32]) -> f32 { 81 | let mut result = V::default(); 82 | 83 | for v in data.vectorize() { 84 | result += v; 85 | } 86 | 87 | result.horizontal_sum() 88 | } 89 | 90 | fn sum(data: &[V]) -> f32 { 91 | data.iter() 92 | .copied() 93 | .sum::() 94 | .horizontal_sum() 95 | } 96 | 97 | fn sum_vectorize(data: &[f32]) -> f32 { 98 | data.vectorize() 99 | .sum::() 100 | .horizontal_sum() 101 | } 102 | 103 | // Testing what happens performance wise if we get mutable iteration in play 104 | fn vectorize_mut(data: &mut [f32]) -> f32 { 105 | let mut result = V::default(); 106 | 107 | for v in data.vectorize() { 108 | result += *v; 109 | } 110 | 111 | result.horizontal_sum() 112 | } 113 | 114 | fn vectorize_pad(data: &[f32]) -> f32 { 115 | data[1..].vectorize_pad(V::default()) 116 | .sum::() 117 | .horizontal_sum() 118 | } 119 | 120 | fn vectorize_split(data: &[f32]) -> f32 { 121 | let len = data.len(); 122 | let rem = len % V::LANES; 123 | let main = data[..len - rem].vectorize().sum::().horizontal_sum(); 124 | let rem = data[len - rem..].iter().sum::(); 125 | main + rem 126 | } 127 | } 128 | 129 | #[bench] 130 | fn vectorized_default(b: &mut Bencher) { 131 | let (data, _) = gen_vecs(); 132 | 133 | b.iter(|| { 134 | test::black_box(vectorized_default_version(data)); 135 | }) 136 | } 137 | 138 | #[bench] 139 | fn vectorized_detect(b: &mut Bencher) { 140 | let (data, _) = gen_vecs(); 141 | 142 | b.iter(|| { 143 | test::black_box(vectorized(data)); 144 | }) 145 | } 146 | 147 | #[bench] 148 | fn vectorized_rev_default(b: &mut Bencher) { 149 | let (data, _) = gen_vecs(); 150 | 151 | b.iter(|| { 152 | test::black_box(vectorized_rev_default_version(data)); 153 | }) 154 | } 155 | 156 | #[bench] 157 | fn vectorized_rev_detect(b: &mut Bencher) { 158 | let (data, _) = gen_vecs(); 159 | 160 | b.iter(|| { 161 | test::black_box(vectorized_rev(data)); 162 | }) 163 | } 164 | 165 | #[bench] 166 | fn vectorized_tree_default(b: &mut Bencher) { 167 | let (data, _) = gen_vecs(); 168 | 169 | b.iter(|| { 170 | test::black_box(vectorized_tree_default_version(data)); 171 | }) 172 | } 173 | 174 | #[bench] 175 | fn vectorized_tree_detect(b: &mut Bencher) { 176 | let (data, _) = gen_vecs(); 177 | 178 | b.iter(|| { 179 | test::black_box(vectorized_tree(data)); 180 | }) 181 | } 182 | 183 | #[bench] 184 | fn vectorize_default(b: &mut Bencher) { 185 | let (data, _) = gen_data(); 186 | 187 | b.iter(|| { 188 | test::black_box(vectorize_default_version(data)); 189 | }); 190 | } 191 | 192 | #[bench] 193 | fn vectorize_detect(b: &mut Bencher) { 194 | let (data, _) = gen_data(); 195 | 196 | b.iter(|| { 197 | test::black_box(vectorize(data)); 198 | }); 199 | } 200 | 201 | #[bench] 202 | fn vectorize_horizontal_default(b: &mut Bencher) { 203 | 
let (data, _) = gen_data(); 204 | 205 | b.iter(|| { 206 | test::black_box(vectorize_horizontal_default_version(data)); 207 | }); 208 | } 209 | 210 | #[bench] 211 | fn vectorize_horizontal_detect(b: &mut Bencher) { 212 | let (data, _) = gen_data(); 213 | 214 | b.iter(|| { 215 | test::black_box(vectorize_horizontal(data)); 216 | }); 217 | } 218 | 219 | #[bench] 220 | fn sum_vectorize_default(b: &mut Bencher) { 221 | let (data, _) = gen_data(); 222 | 223 | b.iter(|| { 224 | test::black_box(sum_vectorize_default_version(data)); 225 | }) 226 | } 227 | 228 | #[bench] 229 | fn sum_vectorize_detect(b: &mut Bencher) { 230 | let (data, _) = gen_data(); 231 | 232 | b.iter(|| { 233 | test::black_box(sum_vectorize(data)); 234 | }) 235 | } 236 | 237 | #[bench] 238 | fn vectorize_mut_default(b: &mut Bencher) { 239 | let (data, _) = gen_data(); 240 | let mut data = data.to_vec(); 241 | 242 | b.iter(|| { 243 | test::black_box(vectorize_mut_default_version(&mut data)); 244 | }) 245 | } 246 | 247 | #[bench] 248 | fn vectorize_mut_detect(b: &mut Bencher) { 249 | let (data, _) = gen_data(); 250 | let mut data = data.to_vec(); 251 | 252 | b.iter(|| { 253 | test::black_box(vectorize_mut(&mut data)); 254 | }) 255 | } 256 | 257 | #[bench] 258 | fn sum_default(b: &mut Bencher) { 259 | let (data, _) = gen_vecs(); 260 | 261 | b.iter(|| { 262 | test::black_box(sum_default_version(data)); 263 | }) 264 | } 265 | 266 | #[bench] 267 | fn sum_detect(b: &mut Bencher) { 268 | let (data, _) = gen_vecs(); 269 | 270 | b.iter(|| { 271 | test::black_box(sum(data)); 272 | }) 273 | } 274 | 275 | #[bench] 276 | fn vectorized_horizontal_default(b: &mut Bencher) { 277 | let (data, _) = gen_vecs(); 278 | 279 | b.iter(|| { 280 | test::black_box(vectorized_horizontal_default_version(data)); 281 | }) 282 | } 283 | 284 | #[bench] 285 | fn vectorized_horizontal_detect(b: &mut Bencher) { 286 | let (data, _) = gen_vecs(); 287 | 288 | b.iter(|| { 289 | test::black_box(vectorized_horizontal(data)); 290 | }) 291 | } 292 | 293 | #[bench] 294 | fn vectorize_pad_default(b: &mut Bencher) { 295 | let (data, _) = gen_data(); 296 | 297 | b.iter(|| { 298 | test::black_box(vectorize_pad_default_version(data)); 299 | }) 300 | } 301 | 302 | #[bench] 303 | fn vectorize_pad_detect(b: &mut Bencher) { 304 | let (data, _) = gen_data(); 305 | 306 | b.iter(|| { 307 | test::black_box(vectorize_pad(data)); 308 | }) 309 | } 310 | 311 | #[bench] 312 | fn vectorize_split_default(b: &mut Bencher) { 313 | let (data, _) = gen_data(); 314 | 315 | b.iter(|| { 316 | test::black_box(vectorize_split_default_version(data)); 317 | }) 318 | } 319 | 320 | #[bench] 321 | fn vectorize_split_detect(b: &mut Bencher) { 322 | let (data, _) = gen_data(); 323 | 324 | b.iter(|| { 325 | test::black_box(vectorize_split(data)); 326 | }) 327 | } 328 | 329 | #[bench] 330 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 331 | fn manual_sse(b: &mut Bencher) { 332 | use core::mem; 333 | 334 | use crate::utils::arch::{self, __m128}; 335 | use crate::utils::gen_arch_vecs; 336 | 337 | let (data, _) = gen_arch_vecs(); 338 | 339 | // Note: this is technically not correct on the x86 target, we should check first, but who 340 | // cares in benchmarks. 
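    // ("Check first" would mean guarding the call with `is_x86_feature_detected!("sse")`,
    // the way the fma variant in dot_product.rs does; x86_64 includes SSE in its baseline,
    // but a plain x86 target does not guarantee it.)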
341 | #[target_feature(enable = "sse")] 342 | unsafe fn inner(d: &[__m128]) -> f32 { 343 | let mut result = arch::_mm_setzero_ps(); 344 | for v in d { 345 | result = arch::_mm_add_ps(result, *v); 346 | } 347 | 348 | let result: [f32; 4] = mem::transmute(result); 349 | result.iter().sum::() 350 | } 351 | 352 | b.iter(|| test::black_box(unsafe { inner(data) })) 353 | } 354 | 355 | #[bench] 356 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 357 | fn manual_sse_convert(b: &mut Bencher) { 358 | use core::mem; 359 | 360 | use crate::utils::arch; 361 | 362 | let (data, _) = gen_data(); 363 | 364 | // Note: this is technically not correct on the x86 target, we should check first, but who 365 | // cares in benchmarks. 366 | #[target_feature(enable = "sse")] 367 | unsafe fn inner(d: &[f32]) -> f32 { 368 | let mut result = arch::_mm_setzero_ps(); 369 | let iter = d.chunks_exact(4); 370 | let remainder = iter.remainder().iter().sum::(); 371 | for v in iter { 372 | result = arch::_mm_add_ps(result, arch::_mm_loadu_ps(v.as_ptr())); 373 | } 374 | 375 | let result: [f32; 4] = mem::transmute(result); 376 | result.iter().sum::() + remainder 377 | } 378 | 379 | b.iter(|| test::black_box(unsafe { inner(data) })) 380 | } 381 | -------------------------------------------------------------------------------- /benchmarks/benches/utils.rs: -------------------------------------------------------------------------------- 1 | #[cfg(target_arch = "x86")] 2 | pub use core::arch::x86::{self as arch, __m128}; 3 | #[cfg(target_arch = "x86_64")] 4 | pub use core::arch::x86_64::{self as arch, __m128}; 5 | use std::iter; 6 | 7 | use once_cell::sync::Lazy; 8 | 9 | #[macro_export] 10 | macro_rules! mv { 11 | ($(fn $name: ident($($params: tt)*) $(-> $res: ty)? $body: block)*) => { 12 | $( 13 | #[multiversion] 14 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 15 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 16 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 17 | #[clone(target = "[arm|aarch64]+neon")] 18 | fn $name($($params)*) $(-> $res)? 
$body 19 | )* 20 | }; 21 | } 22 | 23 | pub(crate) const SIZE: usize = 10 * 1024 * 1024; 24 | pub(crate) type V = slipstream::f32x4; 25 | 26 | pub(crate) fn gen_data() -> (&'static [f32], &'static [f32]) { 27 | fn inner() -> Vec { 28 | iter::repeat_with(rand::random).take(SIZE).collect() 29 | } 30 | static CACHED: Lazy<(Vec, Vec)> = Lazy::new(|| (inner(), inner())); 31 | (&CACHED.0, &CACHED.1) 32 | } 33 | 34 | pub(crate) fn gen_vecs() -> (&'static [V], &'static [V]) { 35 | fn inner() -> Vec { 36 | iter::repeat_with(rand::random) 37 | .map(|v: [f32; V::LANES]| V::new(&v)) 38 | .take(SIZE / V::LANES) 39 | .collect() 40 | } 41 | static CACHED: Lazy<(Vec, Vec)> = Lazy::new(|| (inner(), inner())); 42 | (&CACHED.0, &CACHED.1) 43 | } 44 | 45 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 46 | pub(crate) fn gen_arch_vecs() -> (&'static [__m128], &'static [__m128]) { 47 | fn inner() -> Vec<__m128> { 48 | iter::repeat_with(|| { 49 | let v: [f32; 4] = rand::random(); 50 | unsafe { arch::_mm_loadu_ps(v.as_ptr()) } 51 | }) 52 | .take(SIZE / 4) 53 | .collect() 54 | } 55 | 56 | static CACHED: Lazy<(Vec<__m128>, Vec<__m128>)> = Lazy::new(|| (inner(), inner())); 57 | (&CACHED.0, &CACHED.1) 58 | } 59 | -------------------------------------------------------------------------------- /examples/matrix.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Display; 2 | use std::iter; 3 | use std::num::Wrapping; 4 | use std::ops::Mul; 5 | use std::time::Instant; 6 | 7 | use multiversion::{multiversion, target}; 8 | use rand::random; 9 | use slipstream::prelude::*; 10 | 11 | const SIZE: usize = 1024; 12 | type V = wu32x8; 13 | type O = usizex8; 14 | const L: usize = V::LANES; 15 | 16 | #[derive(Debug, PartialEq)] 17 | struct Matrix(Vec>); 18 | 19 | #[inline] 20 | fn at(x: usize, y: usize) -> usize { 21 | y * SIZE + x 22 | } 23 | 24 | impl Matrix { 25 | fn random() -> Self { 26 | Self( 27 | iter::repeat_with(random) 28 | .map(Wrapping) 29 | .take(SIZE * SIZE) 30 | .collect(), 31 | ) 32 | } 33 | 34 | #[multiversion] 35 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 36 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 37 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 38 | fn mult_simd(&self, rhs: &Matrix) -> Matrix { 39 | let mut output = vec![Wrapping(0); SIZE * SIZE]; 40 | 41 | // Pre-compute offsets when gathering the column 42 | let mut column: [V; SIZE / L] = [Default::default(); SIZE / L]; 43 | let offsets = (0..L).collect::>(); 44 | let base_offsets = O::new(offsets) * SIZE; 45 | let mut offsets: [O; SIZE / L] = [Default::default(); SIZE / L]; 46 | for i in 0..SIZE / L { 47 | offsets[i] = base_offsets + i * L * SIZE; 48 | } 49 | 50 | // Across columns 51 | for x in 0..SIZE { 52 | // The gather_load is likely slower than just vectorizing the row, so we do this less 53 | // often and just once for each column instead of each time. 
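            // Each gather_load below pulls the x-th element of L consecutive rows (using
            // the row offsets precomputed above), so after this loop `column` holds
            // column x as contiguous vectors that the per-row dot products can stream over.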
54 | for (col, off) in (&mut column[..], &offsets[..]).vectorize() { 55 | *col = V::gather_load(&rhs.0, off + x); 56 | } 57 | 58 | // Across rows 59 | for y in 0..SIZE { 60 | let row_start = at(0, y); 61 | output[at(x, y)] = 62 | dispatch!(dot_prod(&self.0[row_start..row_start + SIZE], &column)); 63 | } 64 | } 65 | Matrix(output) 66 | } 67 | } 68 | 69 | #[multiversion] 70 | #[specialize( 71 | target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma", 72 | fn = "dot_prod_avx", 73 | unsafe = true 74 | )] 75 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 76 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 77 | fn dot_prod(row: &[Wrapping], column: &[V]) -> Wrapping { 78 | (row, column) 79 | .vectorize() 80 | .map(|(r, c): (V, V)| r * c) 81 | .sum::() 82 | .horizontal_sum() 83 | } 84 | 85 | #[target("[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 86 | unsafe fn dot_prod_avx(row: &[Wrapping], column: &[V]) -> Wrapping { 87 | let mut result = V::default(); 88 | for (r, c) in (row, column).vectorize() { 89 | let r: V = r; 90 | result += r * c; 91 | } 92 | result.horizontal_sum() 93 | } 94 | 95 | impl Mul for &'_ Matrix { 96 | type Output = Matrix; 97 | fn mul(self, rhs: &Matrix) -> Matrix { 98 | let mut output = vec![Wrapping(0); SIZE * SIZE]; 99 | for x in 0..SIZE { 100 | for y in 0..SIZE { 101 | for z in 0..SIZE { 102 | output[at(x, y)] += self.0[at(z, y)] * rhs.0[at(x, z)]; 103 | } 104 | } 105 | } 106 | Matrix(output) 107 | } 108 | } 109 | 110 | fn timed R>(name: N, f: F) -> R { 111 | let now = Instant::now(); 112 | let result = f(); 113 | println!("{} took:\t{:?}", name, now.elapsed()); 114 | result 115 | } 116 | 117 | fn main() { 118 | let a = Matrix::random(); 119 | let b = Matrix::random(); 120 | let z = timed("Scalar multiplication", || &a * &b); 121 | let x = timed("Compile-time detected", || a.mult_simd_default_version(&b)); 122 | let w = timed("Run-time detected", || a.mult_simd(&b)); 123 | assert_eq!(z, x); 124 | assert_eq!(z, w); 125 | } 126 | -------------------------------------------------------------------------------- /proptest-regressions/sse.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc 9ba8d6f5bd1318e956010646a5b78d9b06c736ca22b061ddea8b6bd63b2b8d77 # shrinks to a = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], b = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] 8 | -------------------------------------------------------------------------------- /src/iterators.rs: -------------------------------------------------------------------------------- 1 | //! The [`Vectorizable`] trait and a lot of its service types. 2 | //! 3 | //! The [`Vectorizable`] trait allows to turning slices of base types to iterators of vectors, both 4 | //! in separation and in tandem. The rest of this module provides the related types and traits. 5 | //! 6 | //! Usually, it is enough to bring in the [`prelude`][crate::prelude], which already contains the 7 | //! trait. It is seldom necessary to interact with this module directly. 8 | //! 9 | //! # Examples 10 | //! 11 | //! ```rust 12 | //! use slipstream::prelude::*; 13 | //! 14 | //! fn double(input: &[u32], output: &mut [u32]) { 15 | //! 
let two = u32x8::splat(2); 16 | //! for (i, mut o) in (input, output).vectorize() { 17 | //! *o = two * i; 18 | //! } 19 | //! } 20 | //! # double(&[], &mut []) 21 | //! ``` 22 | 23 | use core::iter::FusedIterator; 24 | use core::marker::PhantomData; 25 | use core::mem::{self, MaybeUninit}; 26 | use core::ops::*; 27 | use core::ptr; 28 | use core::slice; 29 | 30 | use crate::inner::Repr; 31 | use crate::vector::align::Align; 32 | use crate::Vector; 33 | 34 | // TODO: Deref to arrays, not slices 35 | /// A proxy object for iterating over mutable slices. 36 | /// 37 | /// For technical reasons (mostly alignment and padding), it's not possible to return a simple 38 | /// reference. This type is returned instead and it can be used to both read and write the vectors 39 | /// a slice is turned into. 40 | /// 41 | /// Note that the data are written in the destructor. Usually, this should not matter, but if you 42 | /// [`forget`][mem::forget], the changes will be lost (this is meant as a warning, not as a way to 43 | /// implement poor-man's transactions). 44 | #[derive(Debug)] 45 | pub struct MutProxy<'a, B, V> 46 | where 47 | V: AsRef<[B]>, 48 | B: Copy, 49 | { 50 | data: V, 51 | restore: &'a mut [B], 52 | } 53 | 54 | impl Deref for MutProxy<'_, B, V> 55 | where 56 | V: AsRef<[B]>, 57 | B: Copy, 58 | { 59 | type Target = V; 60 | #[inline] 61 | fn deref(&self) -> &V { 62 | &self.data 63 | } 64 | } 65 | 66 | impl DerefMut for MutProxy<'_, B, V> 67 | where 68 | V: AsRef<[B]>, 69 | B: Copy, 70 | { 71 | #[inline] 72 | fn deref_mut(&mut self) -> &mut V { 73 | &mut self.data 74 | } 75 | } 76 | 77 | impl Drop for MutProxy<'_, B, V> 78 | where 79 | V: AsRef<[B]>, 80 | B: Copy, 81 | { 82 | #[inline] 83 | fn drop(&mut self) { 84 | self.restore 85 | .copy_from_slice(&self.data.as_ref()[..self.restore.len()]); 86 | } 87 | } 88 | 89 | #[doc(hidden)] 90 | pub trait Partial { 91 | fn take_partial(&mut self) -> Option; 92 | fn size(&self) -> usize; 93 | } 94 | 95 | impl Partial for () { 96 | #[inline] 97 | fn take_partial(&mut self) -> Option { 98 | None 99 | } 100 | #[inline] 101 | fn size(&self) -> usize { 102 | 0 103 | } 104 | } 105 | 106 | impl Partial for Option { 107 | #[inline] 108 | fn take_partial(&mut self) -> Option { 109 | Option::take(self) 110 | } 111 | fn size(&self) -> usize { 112 | self.is_some() as usize 113 | } 114 | } 115 | 116 | #[doc(hidden)] 117 | pub trait Vectorizer { 118 | /// Get the nth vector. 119 | /// 120 | /// # Safety 121 | /// 122 | /// * idx must be in range (as declared on creation). 123 | /// * It may be called at most once per each index. 124 | unsafe fn get(&mut self, idx: usize) -> R; 125 | } 126 | 127 | /// The iterator returned by methods on [`Vectorizable`]. 128 | /// 129 | /// While it's unusual to need to *name* the type, this is the thing that is returned from 130 | /// [`Vectorizable::vectorize`] and [`Vectorizable::vectorize_pad`]. It might be of interest to 131 | /// know that it implements several iterator „extensions“ ([`DoubleEndedIterator`], 132 | /// [`ExactSizeIterator`] and [`FusedIterator`]). Also, several methods are optimized ‒ for 133 | /// example, the `count` is constant time operation, while the generic is linear. 
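///
/// A short sketch of typical use (a plain shared slice here, with the vector type pinned
/// through a hint on the item):
///
/// ```rust
/// # use slipstream::prelude::*;
/// let data = [1u32, 2, 3, 4];
/// let mut it = data.vectorize();
/// let first: u32x2 = it.next().unwrap(); // the hint fixes the vector width
/// assert_eq!(first, u32x2::new([1, 2]));
/// assert_eq!(it.count(), 1); // constant time, no actual iteration happens
/// ```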
134 | #[derive(Copy, Clone, Debug)] 135 | pub struct VectorizedIter { 136 | partial: P, 137 | vectorizer: V, 138 | left: usize, 139 | right: usize, 140 | _result: PhantomData, 141 | } 142 | 143 | impl Iterator for VectorizedIter 144 | where 145 | V: Vectorizer, 146 | P: Partial, 147 | { 148 | type Item = R; 149 | 150 | #[inline] 151 | fn next(&mut self) -> Option { 152 | if self.left < self.right { 153 | let idx = self.left; 154 | self.left += 1; 155 | Some(unsafe { self.vectorizer.get(idx) }) 156 | } else { 157 | self.partial.take_partial() 158 | } 159 | } 160 | 161 | #[inline] 162 | fn size_hint(&self) -> (usize, Option) { 163 | let len = self.right - self.left + self.partial.size(); 164 | (len, Some(len)) 165 | } 166 | 167 | // Overriden for performance… these things have no side effects, so we can avoid calling next 168 | 169 | #[inline] 170 | fn count(self) -> usize { 171 | self.size_hint().0 172 | } 173 | 174 | #[inline] 175 | fn last(mut self) -> Option { 176 | self.next_back() 177 | } 178 | 179 | // TODO: This wants some tests 180 | #[inline] 181 | fn nth(&mut self, n: usize) -> Option { 182 | let main_len = self.right - self.left; 183 | if main_len >= n { 184 | self.left += n; 185 | self.next() 186 | } else { 187 | self.left = self.right; 188 | self.partial.take_partial(); 189 | None 190 | } 191 | } 192 | } 193 | 194 | impl DoubleEndedIterator for VectorizedIter 195 | where 196 | V: Vectorizer, 197 | P: Partial, 198 | { 199 | // TODO: Tests 200 | #[inline] 201 | fn next_back(&mut self) -> Option { 202 | if let Some(partial) = self.partial.take_partial() { 203 | Some(partial) 204 | } else if self.left < self.right { 205 | self.right -= 1; 206 | Some(unsafe { self.vectorizer.get(self.right) }) 207 | } else { 208 | None 209 | } 210 | } 211 | } 212 | 213 | impl ExactSizeIterator for VectorizedIter 214 | where 215 | V: Vectorizer, 216 | P: Partial, 217 | { 218 | } 219 | 220 | impl FusedIterator for VectorizedIter 221 | where 222 | V: Vectorizer, 223 | P: Partial, 224 | { 225 | } 226 | 227 | /// A trait describing things with direct support for splitting into vectors. 228 | /// 229 | /// This supports vectorized iteration over shared and mutable slices as well as types composed of 230 | /// them (tuples and short fixed-sized arrays). 231 | /// 232 | /// Note that, unlike normal iterators, shared slices return owned values (vectors) and mutable 233 | /// slices return [proxy objects][MutProxy] that allow writing the data back. It is not possible to 234 | /// directly borrow from the slice because of alignment. The tuples and arrays return tuples and 235 | /// arrays of the inner values. 236 | /// 237 | /// Already pre-vectorized inputs are also supported (this is useful in combination with other not 238 | /// vectorized inputs). 239 | /// 240 | /// # Type hints 241 | /// 242 | /// Oftentimes, the compiler can infer the type of the base type, but not the length of the vector. 243 | /// It is therefore needed to provide a type hint. 244 | /// 245 | /// Furthermore, for tuples and arrays, the inner type really needs to be the slice, not something 246 | /// that can coerce into it (eg. vec or array). 247 | /// 248 | /// Alternatively, you can use the free-standing functions [`vectorize`][crate::vectorize] and 249 | /// [`vectorize_pad`][crate::vectorize_pad]. It allows using the turbofish to provide the hint. 
250 | /// 251 | /// # Examples 252 | /// 253 | /// ```rust 254 | /// # use slipstream::prelude::*; 255 | /// let data = [1, 2, 3, 4]; 256 | /// let v = data.vectorize().collect::>(); 257 | /// assert_eq!(vec![u32x2::new([1, 2]), u32x2::new([3, 4])], v); 258 | /// ``` 259 | /// 260 | /// ```rust 261 | /// # use slipstream::prelude::*; 262 | /// let data = [1, 2, 3, 4]; 263 | /// for v in data.vectorize() { 264 | /// let v: u32x2 = v; // Type hint 265 | /// println!("{:?}", v); 266 | /// } 267 | /// ``` 268 | /// 269 | /// ```rust 270 | /// # use slipstream::prelude::*; 271 | /// let input = [1, 2, 3, 4]; 272 | /// let mut output = [0; 4]; 273 | /// let mul = u32x2::splat(2); 274 | /// // We have to force the coercion to slice by [..] 275 | /// for (i, mut o) in (&input[..], &mut output[..]).vectorize() { 276 | /// *o = mul * i; 277 | /// } 278 | /// assert_eq!(output, [2, 4, 6, 8]); 279 | /// ``` 280 | /// 281 | /// ```rust 282 | /// # use slipstream::prelude::*; 283 | /// let vectorized = [u32x2::new([1, 2]), u32x2::new([3, 4])]; 284 | /// let not_vectorized = [1, 2, 3, 4]; 285 | /// for (v, n) in (&vectorized[..], ¬_vectorized[..]).vectorize() { 286 | /// assert_eq!(v, n); 287 | /// } 288 | /// ``` 289 | pub trait Vectorizable: Sized { 290 | /// The input type provided by user to fill in the padding/uneven end. 291 | /// 292 | /// Note that this doesn't necessarily have to be the same type as the type returned by the 293 | /// resulting iterator. For example, in case of mutable slices, the input is the vector, while 294 | /// the output is [`MutProxy`]. 295 | type Padding; 296 | 297 | /// An internal type managing the splitting into vectors. 298 | /// 299 | /// Not of direct interest of the users of this crate. 300 | type Vectorizer: Vectorizer; 301 | 302 | /// Internal method to create the vectorizer and kick of the iteration. 303 | fn create(self, pad: Option) -> (Self::Vectorizer, usize, Option); 304 | 305 | /// Vectorize a slice or composite of slices 306 | /// 307 | /// This variant assumes the input is divisible by the size of the vector. Prefer this if 308 | /// possible over [`vectorize_pad`][Vectorizable::vectorize_pad], as it is usually 309 | /// significantly faster. 310 | /// 311 | /// # Panics 312 | /// 313 | /// * If the slice length isn't divisible by the vector size. 314 | /// * If the parts of the composite produce different number of vectors. It is not mandated for 315 | /// the slices to be of equal length, only to produce the same number of vectors. 316 | /// 317 | /// # Examples 318 | /// 319 | /// ```rust 320 | /// # use slipstream::prelude::*; 321 | /// let longer = [1, 2, 3, 4, 5, 6, 7, 8]; 322 | /// let shorter = [1, 2, 3, 4]; 323 | /// for i in (&shorter[..], &longer[..]).vectorize() { 324 | /// let (s, l): (u32x2, u32x4) = i; 325 | /// println!("s: {:?}, l: {:?})", s, l); 326 | /// } 327 | /// ``` 328 | #[inline(always)] 329 | fn vectorize(self) -> VectorizedIter { 330 | let (vectorizer, len, partial) = self.create(None); 331 | assert!(partial.is_none()); 332 | VectorizedIter { 333 | partial: (), 334 | vectorizer, 335 | left: 0, 336 | right: len, 337 | _result: PhantomData, 338 | } 339 | } 340 | 341 | /// Vectorizes a slice or composite of slices, padding the odd end if needed. 342 | /// 343 | /// While the [`vectorize`][Vectorizable::vectorize] assumes the input can be split into 344 | /// vectors without leftover, this version deals with the uneven rest by producing a padding 345 | /// vector (if needed). The unused lanes are taken from the `pad` parameter. 
This is at the 346 | /// cost of some performance (TODO: figure out why it is so much slower). 347 | /// 348 | /// For mutable slices, padding is used as usual, but the added lanes are not stored anywhere. 349 | /// 350 | /// The padding is produced at the end. 351 | /// 352 | /// In case of composites, this still assumes they produce the same number of full vectors and 353 | /// that they all either do or don't need a padding. 354 | /// 355 | /// # Panics 356 | /// 357 | /// If the above assumption about number of vectors and same padding behaviour is violated. 358 | /// 359 | /// ```rust 360 | /// # use slipstream::prelude::*; 361 | /// let data = [1, 2, 3, 4, 5, 6]; 362 | /// let v = data.vectorize_pad(i32x4::splat(-1)).collect::>(); 363 | /// assert_eq!(v, vec![i32x4::new([1, 2, 3, 4]), i32x4::new([5, 6, -1, -1])]); 364 | /// ``` 365 | #[inline(always)] 366 | fn vectorize_pad(self, pad: Self::Padding) -> VectorizedIter, V> { 367 | let (vectorizer, len, partial) = self.create(Some(pad)); 368 | VectorizedIter { 369 | partial, 370 | vectorizer, 371 | left: 0, 372 | right: len, 373 | _result: PhantomData, 374 | } 375 | } 376 | } 377 | 378 | #[doc(hidden)] 379 | #[derive(Copy, Clone, Debug)] 380 | pub struct ReadVectorizer<'a, A: Align, B: Repr, const S: usize> { 381 | start: *const B, 382 | _vector: PhantomData>, 383 | _slice: PhantomData<&'a [B]>, // To hold the lifetime 384 | } 385 | 386 | // Note: The impls here assume V, B, P are Sync and Send, which they are. Nobody is able to create 387 | // this directly and we do have the limits on Vector, the allowed implementations, etc. 388 | unsafe impl Send for ReadVectorizer<'_, A, B, S> {} 389 | unsafe impl Sync for ReadVectorizer<'_, A, B, S> {} 390 | 391 | impl Vectorizer> 392 | for ReadVectorizer<'_, A, B, S> 393 | { 394 | #[inline(always)] 395 | unsafe fn get(&mut self, idx: usize) -> Vector { 396 | Vector::new_unchecked(self.start.add(S * idx)) 397 | } 398 | } 399 | 400 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizable> for &'a [B] { 401 | type Vectorizer = ReadVectorizer<'a, A, B, S>; 402 | type Padding = Vector; 403 | #[inline] 404 | fn create( 405 | self, 406 | pad: Option>, 407 | ) -> (Self::Vectorizer, usize, Option>) { 408 | let len = self.len(); 409 | assert!( 410 | len * mem::size_of::() <= isize::MAX as usize, 411 | "Slice too huge" 412 | ); 413 | let rest = len % S; 414 | let main = len - rest; 415 | let start = self.as_ptr(); 416 | let partial = match (rest, pad) { 417 | (0, _) => None, 418 | (_, Some(mut pad)) => { 419 | pad[..rest].copy_from_slice(&self[main..]); 420 | Some(pad) 421 | } 422 | _ => panic!( 423 | "Data to vectorize not divisible by lanes ({} vs {})", 424 | S, len, 425 | ), 426 | }; 427 | let me = ReadVectorizer { 428 | start, 429 | _vector: PhantomData, 430 | _slice: PhantomData, 431 | }; 432 | (me, main / S, partial) 433 | } 434 | } 435 | 436 | #[doc(hidden)] 437 | #[derive(Copy, Clone, Debug)] 438 | pub struct WriteVectorizer<'a, A: Align, B: Repr, const S: usize> { 439 | start: *mut B, 440 | _vector: PhantomData>, 441 | _slice: PhantomData<&'a mut [B]>, // To hold the lifetime 442 | } 443 | 444 | // Note: The impls here assume V, B, P are Sync and Send, which they are. Nobody is able to create 445 | // this directly and we do have the limits on Vector, the allowed implementations, etc. 
446 | unsafe impl Send for WriteVectorizer<'_, A, B, S> {} 447 | unsafe impl Sync for WriteVectorizer<'_, A, B, S> {} 448 | 449 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizer>> 450 | for WriteVectorizer<'a, A, B, S> 451 | { 452 | #[inline(always)] 453 | unsafe fn get(&mut self, idx: usize) -> MutProxy<'a, B, Vector> { 454 | // FIXME: Technically, we extend the lifetime in the from_raw_parts_mut beyond what rust 455 | // would allow us to normally do. But is this OK? As we are guaranteed never to give any 456 | // chunk twice, this should act similar to IterMut from slice or similar. 457 | let ptr = self.start.add(S * idx); 458 | MutProxy { 459 | data: Vector::new_unchecked(ptr), 460 | restore: slice::from_raw_parts_mut(ptr, S), 461 | } 462 | } 463 | } 464 | 465 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizable>> 466 | for &'a mut [B] 467 | { 468 | type Vectorizer = WriteVectorizer<'a, A, B, S>; 469 | type Padding = Vector; 470 | #[inline] 471 | #[allow(clippy::type_complexity)] 472 | fn create( 473 | self, 474 | pad: Option>, 475 | ) -> ( 476 | Self::Vectorizer, 477 | usize, 478 | Option>>, 479 | ) { 480 | let len = self.len(); 481 | assert!( 482 | len * mem::size_of::() <= isize::MAX as usize, 483 | "Slice too huge" 484 | ); 485 | let rest = len % S; 486 | let main = len - rest; 487 | let start = self.as_mut_ptr(); 488 | let partial = match (rest, pad) { 489 | (0, _) => None, 490 | (_, Some(mut pad)) => { 491 | let restore = &mut self[main..]; 492 | pad[..rest].copy_from_slice(restore); 493 | Some(MutProxy { data: pad, restore }) 494 | } 495 | _ => panic!( 496 | "Data to vectorize not divisible by lanes ({} vs {})", 497 | S, len, 498 | ), 499 | }; 500 | let me = WriteVectorizer { 501 | start, 502 | _vector: PhantomData, 503 | _slice: PhantomData, 504 | }; 505 | (me, main / S, partial) 506 | } 507 | } 508 | 509 | macro_rules! vectorizable_tuple { 510 | ($(($X: ident, $XR: ident, $X0: tt)),*) => { 511 | impl<$($X, $XR),*> Vectorizer<($($XR),*)> for ($($X),*) 512 | where 513 | $($X: Vectorizer<$XR>,)* 514 | { 515 | #[inline(always)] 516 | unsafe fn get(&mut self, idx: usize) -> ($($XR),*) { 517 | ($(self.$X0.get(idx)),*) 518 | } 519 | } 520 | 521 | impl<$($X, $XR),*> Vectorizable<($($XR),*)> for ($($X),*) 522 | where 523 | $($X: Vectorizable<$XR>,)* 524 | { 525 | type Vectorizer = ($($X::Vectorizer),*); 526 | type Padding = ($($X::Padding),*); 527 | #[inline] 528 | #[allow(clippy::eq_op)] 529 | fn create(self, pad: Option) 530 | -> (Self::Vectorizer, usize, Option<($($XR),*)>) 531 | { 532 | let pad = match pad { 533 | Some(pad) => ($(Some(pad.$X0)),*), 534 | None => Default::default(), // Bunch of Nones in a tuple.. (None, None, None)... 
535 | }; 536 | let created = ($(self.$X0.create(pad.$X0)),*); 537 | $( 538 | // TODO: We may want to support this in the padded mode eventually by 539 | // creating more paddings 540 | assert_eq!( 541 | (created.0).1, 542 | created.$X0.1, 543 | "Vectorizing data of different lengths" 544 | ); 545 | // TODO: We could also handle this in the padded mode by doing empty pads 546 | assert_eq!( 547 | (created.0).2.is_some(), 548 | created.$X0.2.is_some(), 549 | "Paddings are not the same for all vectorized data", 550 | ); 551 | )* 552 | let vectorizer = ($(created.$X0.0),*); 553 | let pad = if (created.0).2.is_some() { 554 | Some(($(created.$X0.2.unwrap()),*)) 555 | } else { 556 | None 557 | }; 558 | (vectorizer, (created.0).1, pad) 559 | } 560 | } 561 | } 562 | } 563 | 564 | vectorizable_tuple!((A, AR, 0), (B, BR, 1)); 565 | vectorizable_tuple!((A, AR, 0), (B, BR, 1), (C, CR, 2)); 566 | vectorizable_tuple!((A, AR, 0), (B, BR, 1), (C, CR, 2), (D, DR, 3)); 567 | vectorizable_tuple!((A, AR, 0), (B, BR, 1), (C, CR, 2), (D, DR, 3), (E, ER, 4)); 568 | vectorizable_tuple!( 569 | (A, AR, 0), 570 | (B, BR, 1), 571 | (C, CR, 2), 572 | (D, DR, 3), 573 | (E, ER, 4), 574 | (F, FR, 5) 575 | ); 576 | vectorizable_tuple!( 577 | (A, AR, 0), 578 | (B, BR, 1), 579 | (C, CR, 2), 580 | (D, DR, 3), 581 | (E, ER, 4), 582 | (F, FR, 5), 583 | (G, GR, 6) 584 | ); 585 | vectorizable_tuple!( 586 | (A, AR, 0), 587 | (B, BR, 1), 588 | (C, CR, 2), 589 | (D, DR, 3), 590 | (E, ER, 4), 591 | (F, FR, 5), 592 | (G, GR, 6), 593 | (H, HR, 7) 594 | ); 595 | 596 | impl Vectorizer<[TR; S]> for [T; S] 597 | where 598 | T: Vectorizer, 599 | { 600 | #[inline(always)] 601 | unsafe fn get(&mut self, idx: usize) -> [TR; S] { 602 | let mut res = MaybeUninit::<[TR; S]>::uninit(); 603 | for (i, v) in self.iter_mut().enumerate() { 604 | ptr::write(res.as_mut_ptr().cast::().add(i), v.get(idx)); 605 | } 606 | res.assume_init() 607 | } 608 | } 609 | 610 | impl Vectorizable<[TR; S]> for [T; S] 611 | where 612 | T: Vectorizable + Copy, 613 | T::Padding: Copy, 614 | { 615 | type Vectorizer = [T::Vectorizer; S]; 616 | type Padding = [T::Padding; S]; 617 | #[inline] 618 | fn create(self, pad: Option) -> (Self::Vectorizer, usize, Option<[TR; S]>) { 619 | let mut vectorizer = MaybeUninit::::uninit(); 620 | let mut size = 0; 621 | let mut padding = MaybeUninit::<[TR; S]>::uninit(); 622 | let mut seen_some_pad = false; 623 | let mut seen_none_pad = false; 624 | unsafe { 625 | for i in 0..S { 626 | let (v, s, p) = self[i].create(pad.map(|p| p[i])); 627 | ptr::write(vectorizer.as_mut_ptr().cast::().add(i), v); 628 | if i == 0 { 629 | size = s; 630 | } else { 631 | assert_eq!(size, s, "Vectorized lengths inconsistent across the array",); 632 | } 633 | match p { 634 | Some(p) => { 635 | seen_some_pad = true; 636 | ptr::write(padding.as_mut_ptr().cast::().add(i), p); 637 | } 638 | None => seen_none_pad = true, 639 | } 640 | } 641 | assert!( 642 | !seen_some_pad || !seen_none_pad, 643 | "Paddings inconsistent across the array", 644 | ); 645 | let padding = if seen_some_pad { 646 | Some(padding.assume_init()) 647 | } else { 648 | None 649 | }; 650 | (vectorizer.assume_init(), size, padding) 651 | } 652 | } 653 | } 654 | 655 | impl<'a, T> Vectorizer for &'a [T] 656 | where 657 | T: Copy, 658 | { 659 | unsafe fn get(&mut self, idx: usize) -> T { 660 | *self.get_unchecked(idx) 661 | } 662 | } 663 | 664 | impl<'a, T> Vectorizer<&'a mut T> for &'a mut [T] { 665 | unsafe fn get(&mut self, idx: usize) -> &'a mut T { 666 | // FIXME: Why do we have to extend the lifetime 
here? Is it safe? Intuitively, it should, 667 | // because we hand out each chunk only once and this is what IterMut does too. 668 | let ptr = self.get_unchecked_mut(idx) as *mut T; 669 | &mut *ptr 670 | } 671 | } 672 | 673 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizable> 674 | for &'a [Vector] 675 | { 676 | type Padding = (); 677 | type Vectorizer = &'a [Vector]; 678 | fn create(self, _pad: Option<()>) -> (Self::Vectorizer, usize, Option>) { 679 | (self, self.len(), None) 680 | } 681 | } 682 | 683 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizable<&'a mut Vector> 684 | for &'a mut [Vector] 685 | { 686 | type Padding = (); 687 | type Vectorizer = &'a mut [Vector]; 688 | fn create( 689 | self, 690 | _pad: Option<()>, 691 | ) -> (Self::Vectorizer, usize, Option<&'a mut Vector>) { 692 | let len = self.len(); 693 | (self, len, None) 694 | } 695 | } 696 | 697 | #[cfg(test)] 698 | mod tests { 699 | use super::*; 700 | use crate::prelude::*; 701 | 702 | #[test] 703 | fn iter() { 704 | let data = (0..=10u16).collect::>(); 705 | let vtotal: u16x8 = data.vectorize_pad(u16x8::default()).sum(); 706 | let total: u16 = vtotal.horizontal_sum(); 707 | assert_eq!(total, 55); 708 | } 709 | 710 | #[test] 711 | fn iter_mut() { 712 | let data = (0..33u32).collect::>(); 713 | let mut dst = [0u32; 33]; 714 | let ones = u32x4::splat(1); 715 | for (mut d, s) in 716 | (&mut dst[..], &data[..]).vectorize_pad((u32x4::default(), u32x4::default())) 717 | { 718 | *d = ones + s; 719 | } 720 | 721 | for (l, r) in data.iter().zip(dst.iter()) { 722 | assert_eq!(*l + 1, *r); 723 | } 724 | } 725 | 726 | // Here, one of the inputs is already vectorized 727 | #[test] 728 | fn iter_prevec() { 729 | let src = [0, 1, 2, 3, 4, 5, 6, 7]; 730 | let mut dst = [u16x4::default(); 2]; 731 | 732 | for (dst, src) in (&mut dst[..], &src[..]).vectorize() { 733 | *dst = src; 734 | } 735 | 736 | assert_eq!(dst, [u16x4::new([0, 1, 2, 3]), u16x4::new([4, 5, 6, 7])]); 737 | } 738 | } 739 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc(test(attr(deny(warnings))))] 2 | #![warn(missing_docs)] 3 | #![allow(non_camel_case_types)] 4 | #![cfg_attr(not(test), no_std)] 5 | 6 | //! This library helps writing code in a way that incentives the compiler to 7 | //! optimize the results better (without really doing anything itself). 8 | //! 9 | //! Modern compilers, including `rustc`, are able to come up with impressive ways to 10 | //! speed up the resulting code, using techniques like loop unrolling and 11 | //! autovectorization, routinely outperforming what one would hand-craft. 12 | //! Nevertheless, each optimisation has some assumptions that must be proven to hold 13 | //! before it can be applied. 14 | //! 15 | //! This library offers „vector“ types, like [`u16x8`], which act in a very similar 16 | //! way as little fixed-sized arrays (in this case it would be `[u16; 8]`), but with 17 | //! arithmetics defined for them. They also enforce alignment of the whole vectors. 18 | //! Therefore, one can write the algorithm in a way that works on these groups of 19 | //! data and make it easier for the compiler to prove the assumptions. This can 20 | //! result in multiple factor speed ups by giving the compiler these proofs „for 21 | //! free“ and allowing it to apply aggressive optimizations. 22 | //! 23 | //! Unlike several other SIMD libraries, this one doesn't do any actual explicit SIMD. 
That results 24 | //! in a relatively simpler interface while still working on a stable compiler. It also works in a no-std 25 | //! environment. However, the optimisations are not guaranteed. In particular, while the crate may 26 | //! allow for significant speed-ups, it can *also make your code slower*. When using the crate, 27 | //! you're strongly advised to benchmark. 28 | //!
29 | //! # Anatomy of the crate 30 | //! 31 | //! ## Vector types 32 | //!
33 | //! On the surface, there are types like [`u16x8`], which is just a wrapper around `[u16; 8]`. 34 | //! These wrappers act a bit like arrays (they can be dereferenced to a slice, they can be indexed) 35 | //! and have **common arithmetic traits** implemented. The arithmetic is applied to each index 36 | //! separately, eg: 37 | //!
38 | //! ``` 39 | //! # use slipstream::prelude::*; 40 | //! let a = u8x2::new([1, 2]); 41 | //! let b = u8x2::new([3, 4]); 42 | //! assert_eq!(a + b, u8x2::new([4, 6])); 43 | //! ``` 44 | //!
45 | //! All these types are backed by the generic [`Vector`] type. See the 46 | //! methods there to see how they can be created and how they interact. 47 | //!
48 | //! All of these can be imported by importing the prelude: 49 | //!
50 | //! ``` 51 | //! # #[allow(unused_imports)] 52 | //! use slipstream::prelude::*; 53 | //! ``` 54 | //!
55 | //! The names are based on the primitive types, therefore there are types like [`u8x2`], [`i8x2`], 56 | //! [`f32x4`], [`f64x2`]. 57 | //!
58 | //! There are some more types: 59 | //!
60 | //! * [`wu8x2`] is based on [`Wrapping<u8>`][core::num::Wrapping], [`wi8x2`] is based on 61 | //! [`Wrapping<i8>`][core::num::Wrapping]. 62 | //! * [`bx2`] are vectors of [`bool`]s. 63 | //! * [`m8x2`] are mask vectors. They act *a bit* like booleans, but they have a width and use all 64 | //! bits set to `1` for `true`. These can be used to [`blend`][Vector::blend] vectors together, to 65 | //! mask loads and stores, and are the results of comparisons. The representation is inspired by what 66 | //! the vector instructions actually use, so they should be possible for the compiler to 67 | //! autovectorize. The widths match the types they work with ‒ comparing two [`u32x2`]s will 68 | //! result in [`m32x2`]. The lanes can be converted to/from [`bool`] with methods on the [`Mask`] 69 | //! trait, but usually these are just fed back into some other vector operations. 70 | //!
71 | //! ## Vectorization of slices 72 | //!
73 | //! While it might be better for performance to store all data already in the vector types, it 74 | //! oftentimes happens that the input is in the form of a slice or multiple slices of the primitive 75 | //! types. It would be possible to chunk the input and load the chunks into vectors one at a time, 76 | //! either manually or by using something like [`chunks_exact`][core::slice::ChunksExact] 77 | //! and [`zip`][core::iter::Iterator::zip]. Nevertheless, this turns out to be inconvenient and often 78 | //! too complex for the compiler to make sense of and vectorize properly. 79 | //!
80 | //! Therefore, the crate provides its own means for splitting the data into vectors, using the 81 | //! [`Vectorizable`] trait. This is implemented on shared and mutable slices as well as on tuples and 82 | //! small (fixed-sized) arrays of these. The trait adds the [`vectorize`][Vectorizable::vectorize] 83 | //! and [`vectorize_pad`][Vectorizable::vectorize_pad] methods. 84 | //!
85 | //! As the methods can't know how wide the vectors should be, it is often necessary 86 | //! to provide a type hint somewhere.
87 | //! 88 | //! ```rust 89 | //! # use slipstream::prelude::*; 90 | //! fn dot_product(l: &[f32], r: &[f32]) -> f32 { 91 | //! let mut result = f32x8::default(); 92 | //! // This assumes l and r are of the same length and divisible by 8 93 | //! for (l, r) in (l, r).vectorize() { 94 | //! // Force the exact type of l and r vectors 95 | //! let (l, r): (f32x8, f32x8) = (l, r); 96 | //! result += l * r; 97 | //! } 98 | //! // Sum the 8 lanes together 99 | //! result.horizontal_sum() 100 | //! } 101 | //! # dot_product(&[], &[]); 102 | //! ``` 103 | //! 104 | //! # Multiversioning and dynamic instruction set selection 105 | //! 106 | //! If used as in the examples above, the compiler chooses an instruction set at compile time, 107 | //! based on the command line arguments. By default these are conservative, to run on arbitrary 108 | //! (old) CPU. It is possible to either enable newer instructions at compile time (at the cost of 109 | //! not being able to run the program on the older CPUs) or compile multiple versions of the same 110 | //! function and choose the right one at runtime, depending on what the CPU actually supports. 111 | //! 112 | //! While this library doesn't provide any direct support for multiversioning, it has been observed 113 | //! to work reasonably well in combination with the [`multiversion`] crate. 114 | //! 115 | //! Note that using a newer and richer instruction set is not always a win. In some cases it can 116 | //! even lead to performance degradation. In particular: 117 | //! 118 | //! * Wide enough vectors must be used to take advantage of the 256 or more bits of the newer 119 | //! instruction set (using these with older instruction set is not a problem; the vector 120 | //! operations will simply translate to multiple narrower instructions). This might create larger 121 | //! „leftovers“ on the ends of slices that need to be handled in non-vectorized manner. 122 | //! * The CPU may need to switch state, possibly negotiate a higher power supply. This might lead 123 | //! to slow down before that happens and might degrade performance of neighboring cores. 124 | //! * Some AMD processors (Buldozers) know the instructions, but simulate them by dispatching the 125 | //! narrower instructions internally (at least it seems so, one 256bit instruction takes a bit 126 | //! longer than two 128bit ones). 127 | //! 128 | //! Depending on the workload, both slowdowns and full 2* speedups were observed. The chances of 129 | //! speedups are higher when there's a lot of data to crunch „in one go“ (so the CPU has time to 130 | //! „warm up“, the leftovers don't matter that much, etc). 131 | //! 132 | //! # Performance tuning tips 133 | //! 134 | //! The sole purpose of this library is to get faster programs, so here are few things to keep in 135 | //! mind when trying. 136 | //! 137 | //! This library (or SIMD in general) is not a silver bullet. It's good to tackle a lot of data 138 | //! crunching by sheer force (the hammer style approach), but can yield only multiplicative 139 | //! speedups (depending on the width of the instructions, on the size of the base type, etc, one 140 | //! can't expect more than 10 or 20 times speedup, usually less). Oftentimes, more high level 141 | //! optimizations bring significantly better results ‒ choosing a better algorithm, reordering the 142 | //! data in memory to avoid cache misses. These can give you orders of magnitude in some cases. 143 | //! 
Also, besides instruction-level parallelism, one can try using threads to parallelize across 144 | //! cores (for example using [`rayon`]). Therefore, vectorization should be used in the latter 145 | //! stages of performance tuning. 146 | //!
147 | //! Also note that when used on a platform without any SIMD support, it can lead to both speed-ups 148 | //! (due to loop unrolling) and slowdowns (probably due to exhaustion of available CPU registers). 149 | //!
150 | //! It is important to measure and profile. Not only because you want to spend the time optimizing 151 | //! the hot parts of the program which actually take a significant amount of time, but also because the 152 | //! autovectorizer and compiler optimizations sometimes produce surprising results. 153 | //!
154 | //! ## Performance characteristics 155 | //!
156 | //! In general, simple lane-wise operations are significantly faster than horizontal operations 157 | //! (when neighboring lanes may interact) and complex ones. Therefore, adding two vectors using the 158 | //! `+` operator is likely to end up being faster than the 159 | //! [`horizontal_sum`][Vector::horizontal_sum] or the [`gather_load`][Vector::gather_load] 160 | //! constructor. 161 | //!
162 | //! It is advisable to keep as much in vectors as possible instead of operating on separate lanes. 163 | //!
164 | //! Therefore, to compute a sum of a bunch of numbers, split the input into vectors, sum these up and 165 | //! do a single `horizontal_sum` at the very end. 166 | //!
167 | //! ```rust 168 | //! # use slipstream::prelude::*; 169 | //! fn sum(data: &[f32x8]) -> f32 { 170 | //! data 171 | //! .iter() 172 | //! .copied() 173 | //! .sum::<f32x8>() // Summing up whole f32x8 vectors, result is also f32x8 174 | //! .horizontal_sum() // Summing individual lanes of that vector 175 | //! } 176 | //! # assert_eq!(0.0, sum(&[])); 177 | //! ``` 178 | //!
179 | //! Also keep in mind that there's usually some „warm up“ for the vectorized part of the code. This partly 180 | //! comes from the need to somehow deal with uneven ends (if the input is not divisible by the 181 | //! vector size). Also, some instructions require the CPU to switch state, possibly lower its frequency 182 | //! and negotiate a higher power supply, which may even hinder performance of neighboring cores (this 183 | //! is more of a problem for „newer“ instruction sets like AVX-512 than eg. SSE). 184 | //!
185 | //! Therefore, there's little advantage in interspersing otherwise non-vectorized code with 186 | //! an occasional vector variable. The best results come from crunching big inputs all at once. 187 | //!
188 | //! ## Suggested process 189 | //!
190 | //! * Write the non-vectorized version first. Make sure to use the correct algorithm, avoid 191 | //! unnecessary work, etc. 192 | //! * Parallelize it across threads where it makes sense. 193 | //! * Prepare a micro-benchmark exercising the hot part. 194 | //! * Try rewriting it using the vector types in this crate, but keep the non-vectorized version 195 | //! around for comparison. Make sure to run the benchmark for both. 196 | //! * If the vectorized version doesn't meet the expectations (or even makes things slower), you can 197 | //! check these things: 198 | //! - If using the [`multiversion`] crate, watch out for (not) inlining. The detected instruction 199 | //! set is not propagated to other functions called from the multiversioned one, only to the 200 | //! inlined ones. 201 | //! - Make sure to use a reasonably sized vector type.
On one side, it needs to be large enough to 202 | //! fill the whole SIMD register (128 bit for SSE and NEON, 256 for AVX, 512 bits for AVX-512). 203 | //! On the other side, it should not be too large ‒ while wider vectors can be simulated by 204 | //! executing multiple narrower instructions, they also take multiple registers and that may 205 | //! lead to unnecessary „juggling“. 206 | //! - See the profiler output if any particular part stands out. Oftentimes, some constructs like 207 | //! the [`zip`][core::iter::Iterator::zip] iterator adaptor were found to be problematic. If a 208 | //! construct is too complex for rustc to „see through“, it can be helped by rewriting that 209 | //! particular part manually in a simpler way. Pulling slice range checks before the loop might 210 | //! help too, as rustc no longer has to ensure a panic from the violation would happen at the 211 | //! right time in the middle of processing. 212 | //! - Check the assembler output if it looks sane. Seeing if it looks vectorized can be done 213 | //! without extensive assembler knowledge ‒ SIMD instructions have longer names and use 214 | //! different named registers (`xmm?` ones for SSE, `ymm?` ones for AVX). 215 | //! 216 | //! See if the profiler can be configured to show inlined functions instead of counting the whole 217 | //! runtime to the whole function. Some profilers can even show annotated assembler code, 218 | //! pinpointing the instruction or area that takes long time. In such case, be aware that an 219 | //! instruction might take a long time because it waits on a data dependency (some preceding 220 | //! instruction still being executed in the pipeline) or data from memory. 221 | //! 222 | //! For the `perf` profile, this can be done with `perf record --call-graph=dwarf `, 223 | //! `perf report` and `perf annotate`. Make sure to profile with both optimizations *and* debug 224 | //! symbols enabled (but if developing a proprietary thing, make sure to ship *without* the debug 225 | //! symbols). 226 | //! 227 | //! ```toml 228 | //! [profile.release] 229 | //! debug = 2 230 | //! ``` 231 | //! 232 | //! When all else fails, you can always rewrite only parts of the algorithm using the explicit 233 | //! intrinsics in [`core::arch`] and leave the rest for autovectorizer. The vector types should be 234 | //! compatible for transmuting to the low-level vectors (eg. `__m128`). 235 | //! 236 | //! # Alternatives 237 | //! 238 | //! There are other crates that try to help with SIMD: 239 | //! 240 | //! * [`packed_simd`]: This is *the* official SIMD library. The downside is, this works only on 241 | //! nighty compiler and the timeline when this could get stabilized is unclear. 242 | //! * [`faster`]: Works only on nightly and looks abandoned. 243 | //! * [`simdeez`]: Doesn't have unsigned ints. Works on stable, but is unsound (can lead to UB 244 | //! without writing a single line of user `unsafe` code). 245 | //! * [`safe_simd`]: It has somewhat more complex API than this library, because it deals with 246 | //! instruction sets explicitly. It supports explicit vectorization (doesn't rely on 247 | //! autovectorizer). It is not yet released. 248 | //! 249 | //! [`multiversion`]: https://crates.io/crates/multiversion 250 | //! [`rayon`]: https://crates.io/crates/rayon 251 | //! [`packed_simd`]: https://crates.io/crates/packed_simd 252 | //! [`faster`]: https://crates.io/crates/faster 253 | //! [`simdeez`]: https://crates.io/crates/simdeez 254 | //! 
[`safe_simd`]: https://github.com/calebzulawski/safe_simd/ 255 | 256 | pub mod iterators; 257 | pub mod mask; 258 | pub mod types; 259 | pub mod vector; 260 | 261 | pub use iterators::Vectorizable; 262 | pub use mask::Mask; 263 | pub use types::*; 264 | pub use vector::Vector; 265 | 266 | /// Commonly used imports 267 | /// 268 | /// This can be imported to get all the vector types and all the relevant user-facing traits of the 269 | /// crate. 270 | pub mod prelude { 271 | pub use crate::types::*; 272 | pub use crate::vector::Masked as _; 273 | pub use crate::Mask as _; 274 | pub use crate::Vectorizable as _; 275 | } 276 | 277 | mod inner { 278 | use core::num::Wrapping; 279 | 280 | use crate::mask::{m128, m16, m32, m64, m8, msize, Mask}; 281 | 282 | /// A trait to enable vectors to use this type as the base type. 283 | /// 284 | /// # Safety 285 | /// 286 | /// This is in a private module to prevent users creating their own „crazy“ vector 287 | /// implementations. We make some non-trivial assumptions about the inner types and be are 288 | /// conservative at least until we figure out what *exact* assumptions these are and formalize 289 | /// them. 290 | pub unsafe trait Repr: Send + Sync + Copy + 'static { 291 | type Mask: Mask; 292 | const ONE: Self; 293 | } 294 | 295 | unsafe impl Repr for Wrapping { 296 | type Mask = m8; 297 | const ONE: Wrapping = Wrapping(1); 298 | } 299 | unsafe impl Repr for Wrapping { 300 | type Mask = m16; 301 | const ONE: Wrapping = Wrapping(1); 302 | } 303 | unsafe impl Repr for Wrapping { 304 | type Mask = m32; 305 | const ONE: Wrapping = Wrapping(1); 306 | } 307 | unsafe impl Repr for Wrapping { 308 | type Mask = m64; 309 | const ONE: Wrapping = Wrapping(1); 310 | } 311 | unsafe impl Repr for Wrapping { 312 | type Mask = m128; 313 | const ONE: Wrapping = Wrapping(1); 314 | } 315 | unsafe impl Repr for Wrapping { 316 | type Mask = msize; 317 | const ONE: Wrapping = Wrapping(1); 318 | } 319 | unsafe impl Repr for u8 { 320 | type Mask = m8; 321 | const ONE: u8 = 1; 322 | } 323 | unsafe impl Repr for u16 { 324 | type Mask = m16; 325 | const ONE: u16 = 1; 326 | } 327 | unsafe impl Repr for u32 { 328 | type Mask = m32; 329 | const ONE: u32 = 1; 330 | } 331 | unsafe impl Repr for u64 { 332 | type Mask = m64; 333 | const ONE: u64 = 1; 334 | } 335 | unsafe impl Repr for u128 { 336 | type Mask = m128; 337 | const ONE: u128 = 1; 338 | } 339 | unsafe impl Repr for usize { 340 | type Mask = msize; 341 | const ONE: usize = 1; 342 | } 343 | 344 | unsafe impl Repr for Wrapping { 345 | type Mask = m8; 346 | const ONE: Wrapping = Wrapping(1); 347 | } 348 | unsafe impl Repr for Wrapping { 349 | type Mask = m16; 350 | const ONE: Wrapping = Wrapping(1); 351 | } 352 | unsafe impl Repr for Wrapping { 353 | type Mask = m32; 354 | const ONE: Wrapping = Wrapping(1); 355 | } 356 | unsafe impl Repr for Wrapping { 357 | type Mask = m64; 358 | const ONE: Wrapping = Wrapping(1); 359 | } 360 | unsafe impl Repr for Wrapping { 361 | type Mask = m128; 362 | const ONE: Wrapping = Wrapping(1); 363 | } 364 | unsafe impl Repr for Wrapping { 365 | type Mask = msize; 366 | const ONE: Wrapping = Wrapping(1); 367 | } 368 | unsafe impl Repr for i8 { 369 | type Mask = m8; 370 | const ONE: i8 = 1; 371 | } 372 | unsafe impl Repr for i16 { 373 | type Mask = m16; 374 | const ONE: i16 = 1; 375 | } 376 | unsafe impl Repr for i32 { 377 | type Mask = m32; 378 | const ONE: i32 = 1; 379 | } 380 | unsafe impl Repr for i64 { 381 | type Mask = m64; 382 | const ONE: i64 = 1; 383 | } 384 | unsafe impl Repr for i128 
{ 385 | type Mask = m128; 386 | const ONE: i128 = 1; 387 | } 388 | unsafe impl Repr for isize { 389 | type Mask = msize; 390 | const ONE: isize = 1; 391 | } 392 | 393 | unsafe impl Repr for f32 { 394 | type Mask = m32; 395 | const ONE: f32 = 1.0; 396 | } 397 | unsafe impl Repr for f64 { 398 | type Mask = m64; 399 | const ONE: f64 = 1.0; 400 | } 401 | unsafe impl Repr for M { 402 | type Mask = Self; 403 | const ONE: M = M::TRUE; 404 | } 405 | } 406 | 407 | /// Free-standing version of [`Vectorizable::vectorize`]. 408 | /// 409 | /// This is the same as `a.vectorize()`. Nevertheless, this version might be more convenient as it 410 | /// allows hinting the result vector type with turbofish. 411 | /// 412 | /// ```rust 413 | /// # use slipstream::prelude::*; 414 | /// let data = [1, 2, 3, 4]; 415 | /// for v in slipstream::vectorize::(&data[..]) { 416 | /// println!("{:?}", v); 417 | /// } 418 | /// ``` 419 | #[inline(always)] 420 | pub fn vectorize(a: A) -> impl Iterator 421 | where 422 | A: Vectorizable, 423 | { 424 | a.vectorize() 425 | } 426 | 427 | /// Free-standing version of [`Vectorizable::vectorize_pad`]. 428 | /// 429 | /// Equivalent to `a.vectorize_pad(pad)`, but may be more convenient or readable in certain cases. 430 | /// 431 | /// ```rust 432 | /// # use slipstream::prelude::*; 433 | /// let data = [1, 2, 3, 4, 5, 6]; 434 | /// let v = slipstream::vectorize_pad(&data[..], i32x4::splat(-1)).collect::>(); 435 | /// assert_eq!(v, vec![i32x4::new([1, 2, 3, 4]), i32x4::new([5, 6, -1, -1])]); 436 | /// ``` 437 | #[inline(always)] 438 | pub fn vectorize_pad(a: A, pad: A::Padding) -> impl Iterator 439 | where 440 | A: Vectorizable, 441 | { 442 | a.vectorize_pad(pad) 443 | } 444 | 445 | #[cfg(test)] 446 | mod tests { 447 | use crate::prelude::*; 448 | 449 | #[test] 450 | fn minmax() { 451 | let a = u32x4::new([1, 4, 8, 9]); 452 | let b = u32x4::new([3, 3, 5, 11]); 453 | 454 | assert_eq!(a.minimum(b), u32x4::new([1, 3, 5, 9])); 455 | assert_eq!(a.maximum(b), u32x4::new([3, 4, 8, 11])); 456 | assert_eq!(a.minimum(b), b.minimum(a)); 457 | assert_eq!(a.maximum(b), b.maximum(a)); 458 | assert_eq!(a.maximum(b).ge(a.minimum(b)), m32x4::splat(m32::TRUE)); 459 | } 460 | } 461 | -------------------------------------------------------------------------------- /src/mask.rs: -------------------------------------------------------------------------------- 1 | //! Bool-like types used for masked operations. 2 | //! 3 | //! With multi-lane vectors, it is sometimes useful to do a lane-wise comparison or to disable some 4 | //! of the lanes for a given operation. Naturally, one would express this using a correctly sized 5 | //! `bool` array. 6 | //! 7 | //! Nevertheless, the CPU SIMD instructions don't use bools, but signal `true`/`false` with a 8 | //! full-sized type with either all bits set to 1 or 0 (TODO: this is not true for AVX-512, what do 9 | //! we want to do about it?). Therefore, we define our own types that act like bools, but are 10 | //! represented in the above way. The comparison operators return vectors of these base mask types. 11 | //! The selection operations accept whatever mask vector with the same number of lanes, but they 12 | //! are expected to act fastest with the correct sized ones. 13 | //! 14 | //! For the purpose of input, `bool` is also considered a mask type. 15 | //! 16 | //! The interesting operations are: 17 | //! * Comparisons ([`lt`][crate::Vector::lt], [`le`][crate::Vector::le], [`eq`][crate::Vector::eq], 18 | //! 
[`ge`][crate::Vector::ge], [`gt`][crate::Vector::gt]) 19 | //! * The [`blend`][crate::Vector::blend] method. 20 | //! * Masked [loading][crate::Vector::gather_load_masked] and 21 | //! [storing][crate::Vector::scatter_store_masked] of vectors. 22 | //! 23 | //! The number in the type name specifies the number of bits. Therefore, for the 24 | //! [`u16x4`][crate::u16x4], the natural mask type is a vector of 4 [`m16`], which is 25 | //! [`m16x4`][crate::m16x4]. 26 | //! 27 | //! While it is possible to operate with the bools (by converting them), it is more common to 28 | //! simply pipe the masks back into the vectors. Note that they *do* implement the usual boolean 29 | //! operators (however, only the non-shortcircuiting/bitwise variants). These work lane-wise. 30 | //! 31 | //! # Examples 32 | //! 33 | //! ```rust 34 | //! # use slipstream::prelude::*; 35 | //! fn abs(vals: &mut [i32]) { 36 | //! let zeroes = i32x8::default(); 37 | //! for mut v in vals.vectorize_pad(i32x8::default()) { 38 | //! // Type of this one is m32x8 and is true whereever the lane isnegative. 39 | //! let negative = v.lt(zeroes); 40 | //! // Pick lanes from v where non-negative, pick from -v where negative. 41 | //! *v = v.blend(-*v, negative); 42 | //! } 43 | //! } 44 | //! let mut data = [1, -2, 3]; 45 | //! abs(&mut data); 46 | //! assert_eq!(data, [1, 2, 3]); 47 | //! ``` 48 | use core::ops::*; 49 | 50 | mod inner { 51 | pub trait Sealed {} 52 | } 53 | 54 | /// The trait implemented by all the mask types. 55 | /// 56 | /// Note that this trait is not implementable by downstream crates, as code in the crate assumes 57 | /// (and relies for safety on the assumption) that the type can ever hold only the two values. 58 | /// 59 | /// See the [module documentation][crate::mask]. 60 | pub trait Mask: 61 | Copy 62 | + Eq 63 | + Send 64 | + Sync 65 | + inner::Sealed 66 | + Not 67 | + BitAnd 68 | + BitAndAssign 69 | + BitOr 70 | + BitOrAssign 71 | + BitXor 72 | + BitXorAssign 73 | + 'static 74 | { 75 | /// A constant specifying the true value of the type. 76 | /// 77 | /// For bool, this is `true`. For the others, this means all bits set to `1` ‒ eg. 256 for 78 | /// [`m8]. 79 | const TRUE: Self; 80 | 81 | /// The false value of the type. 82 | /// 83 | /// For bool, this is `false`. For the others, this means 0 (all bits set to 0). 84 | const FALSE: Self; 85 | 86 | /// Converts the type to bool. 87 | #[inline] 88 | fn bool(self) -> bool { 89 | if self == Self::TRUE { 90 | true 91 | } else if self == Self::FALSE { 92 | false 93 | } else { 94 | unsafe { core::hint::unreachable_unchecked() } 95 | } 96 | } 97 | 98 | /// Converts the type from bool. 99 | #[inline] 100 | fn from_bool(v: bool) -> Self { 101 | if v { 102 | Self::TRUE 103 | } else { 104 | Self::FALSE 105 | } 106 | } 107 | } 108 | 109 | /// Inner implementation of the mask types. 110 | /// 111 | /// This is to be used through the type aliases in this module, like [`m8`], or more often through 112 | /// vectors of these, like [`m8x4`][crate::m8x4]. These are the [`mask vectors`][crate::mask]. 113 | #[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Ord, PartialOrd)] 114 | pub struct MaskWrapper(I); 115 | 116 | macro_rules! 
trait_impl { 117 | ($T: ident, $m: ident, $TA: ident, $ma: ident) => { 118 | impl> $T for MaskWrapper { 119 | type Output = Self; 120 | fn $m(self, rhs: Self) -> Self { 121 | Self((self.0).$m(rhs.0)) 122 | } 123 | } 124 | 125 | impl $TA for MaskWrapper { 126 | fn $ma(&mut self, rhs: Self) { 127 | (self.0).$ma(rhs.0) 128 | } 129 | } 130 | }; 131 | } 132 | 133 | trait_impl!(BitAnd, bitand, BitAndAssign, bitand_assign); 134 | trait_impl!(BitOr, bitor, BitOrAssign, bitor_assign); 135 | trait_impl!(BitXor, bitxor, BitXorAssign, bitxor_assign); 136 | 137 | impl> Not for MaskWrapper { 138 | type Output = Self; 139 | fn not(self) -> Self::Output { 140 | Self(self.0.not()) 141 | } 142 | } 143 | 144 | #[allow(missing_docs)] 145 | pub type m8 = MaskWrapper; 146 | 147 | impl inner::Sealed for m8 {} 148 | 149 | impl Mask for m8 { 150 | const TRUE: Self = MaskWrapper(u8::MAX); 151 | const FALSE: Self = MaskWrapper(0); 152 | } 153 | 154 | #[allow(missing_docs)] 155 | pub type m16 = MaskWrapper; 156 | 157 | impl inner::Sealed for m16 {} 158 | 159 | impl Mask for m16 { 160 | const TRUE: Self = MaskWrapper(u16::MAX); 161 | const FALSE: Self = MaskWrapper(0); 162 | } 163 | 164 | #[allow(missing_docs)] 165 | pub type m32 = MaskWrapper; 166 | 167 | impl inner::Sealed for m32 {} 168 | 169 | impl Mask for m32 { 170 | const TRUE: Self = MaskWrapper(u32::MAX); 171 | const FALSE: Self = MaskWrapper(0); 172 | } 173 | 174 | #[allow(missing_docs)] 175 | pub type m64 = MaskWrapper; 176 | 177 | impl inner::Sealed for m64 {} 178 | 179 | impl Mask for m64 { 180 | const TRUE: Self = MaskWrapper(u64::MAX); 181 | const FALSE: Self = MaskWrapper(0); 182 | } 183 | 184 | #[allow(missing_docs)] 185 | pub type m128 = MaskWrapper; 186 | 187 | impl inner::Sealed for m128 {} 188 | 189 | impl Mask for m128 { 190 | const TRUE: Self = MaskWrapper(u128::MAX); 191 | const FALSE: Self = MaskWrapper(0); 192 | } 193 | 194 | #[allow(missing_docs)] 195 | pub type msize = MaskWrapper; 196 | 197 | impl inner::Sealed for msize {} 198 | 199 | impl Mask for msize { 200 | const TRUE: Self = MaskWrapper(usize::MAX); 201 | const FALSE: Self = MaskWrapper(0); 202 | } 203 | 204 | impl inner::Sealed for bool {} 205 | 206 | impl Mask for bool { 207 | const TRUE: Self = true; 208 | const FALSE: Self = false; 209 | } 210 | -------------------------------------------------------------------------------- /src/types.rs: -------------------------------------------------------------------------------- 1 | #![allow(missing_docs)] 2 | //! Type aliases of the commonly used vector types. 3 | //! 4 | //! While the vector types are created from the [`Vector`] by setting the base type and length, 5 | //! this is seldom done in downstream code. Instead, this module provides the commonly used types 6 | //! as aliases, like [u16x8]. See the [crate introduction](crate) for further details about the 7 | //! naming convention. 8 | //! 9 | //! All these types are also exported as part of the [`prelude`][crate::prelude]. 
10 | use core::num::Wrapping; 11 | 12 | pub use crate::mask::{m16, m32, m64, m8, msize}; 13 | use crate::vector::align::*; 14 | use crate::vector::Vector; 15 | 16 | pub type bx2 = Vector; 17 | pub type bx4 = Vector; 18 | pub type bx8 = Vector; 19 | pub type bx16 = Vector; 20 | pub type bx32 = Vector; 21 | 22 | pub type m8x2 = Vector; 23 | pub type m8x4 = Vector; 24 | pub type m8x8 = Vector; 25 | pub type m8x16 = Vector; 26 | pub type m8x32 = Vector; 27 | 28 | pub type m16x2 = Vector; 29 | pub type m16x4 = Vector; 30 | pub type m16x8 = Vector; 31 | pub type m16x16 = Vector; 32 | 33 | pub type m32x2 = Vector; 34 | pub type m32x4 = Vector; 35 | pub type m32x8 = Vector; 36 | pub type m32x16 = Vector; 37 | 38 | pub type m64x2 = Vector; 39 | pub type m64x4 = Vector; 40 | pub type m64x8 = Vector; 41 | pub type m64x16 = Vector; 42 | 43 | pub type u8x2 = Vector; 44 | pub type u8x4 = Vector; 45 | pub type u8x8 = Vector; 46 | pub type u8x16 = Vector; 47 | pub type u8x32 = Vector; 48 | 49 | pub type u16x2 = Vector; 50 | pub type u16x4 = Vector; 51 | pub type u16x8 = Vector; 52 | pub type u16x16 = Vector; 53 | 54 | pub type u32x2 = Vector; 55 | pub type u32x4 = Vector; 56 | pub type u32x8 = Vector; 57 | pub type u32x16 = Vector; 58 | 59 | pub type u64x2 = Vector; 60 | pub type u64x4 = Vector; 61 | pub type u64x8 = Vector; 62 | pub type u64x16 = Vector; 63 | 64 | pub type wu8x2 = Vector, 2>; 65 | pub type wu8x4 = Vector, 4>; 66 | pub type wu8x8 = Vector, 8>; 67 | pub type wu8x16 = Vector, 16>; 68 | pub type wu8x32 = Vector, 32>; 69 | 70 | pub type wu16x2 = Vector, 2>; 71 | pub type wu16x4 = Vector, 4>; 72 | pub type wu16x8 = Vector, 8>; 73 | pub type wu16x16 = Vector, 16>; 74 | 75 | pub type wu32x2 = Vector, 2>; 76 | pub type wu32x4 = Vector, 4>; 77 | pub type wu32x8 = Vector, 8>; 78 | pub type wu32x16 = Vector, 16>; 79 | 80 | pub type wu64x2 = Vector, 2>; 81 | pub type wu64x4 = Vector, 4>; 82 | pub type wu64x8 = Vector, 8>; 83 | pub type wu64x16 = Vector, 16>; 84 | 85 | pub type i8x2 = Vector; 86 | pub type i8x4 = Vector; 87 | pub type i8x8 = Vector; 88 | pub type i8x16 = Vector; 89 | pub type i8x32 = Vector; 90 | 91 | pub type i16x2 = Vector; 92 | pub type i16x4 = Vector; 93 | pub type i16x8 = Vector; 94 | pub type i16x16 = Vector; 95 | 96 | pub type i32x2 = Vector; 97 | pub type i32x4 = Vector; 98 | pub type i32x8 = Vector; 99 | pub type i32x16 = Vector; 100 | 101 | pub type i64x2 = Vector; 102 | pub type i64x4 = Vector; 103 | pub type i64x8 = Vector; 104 | pub type i64x16 = Vector; 105 | 106 | pub type wi8x2 = Vector, 2>; 107 | pub type wi8x4 = Vector, 4>; 108 | pub type wi8x8 = Vector, 8>; 109 | pub type wi8x16 = Vector, 16>; 110 | pub type wi8x32 = Vector, 32>; 111 | 112 | pub type wi16x2 = Vector, 2>; 113 | pub type wi16x4 = Vector, 4>; 114 | pub type wi16x8 = Vector, 8>; 115 | pub type wi16x16 = Vector, 16>; 116 | 117 | pub type wi32x2 = Vector, 2>; 118 | pub type wi32x4 = Vector, 4>; 119 | pub type wi32x8 = Vector, 8>; 120 | pub type wi32x16 = Vector, 16>; 121 | 122 | pub type wi64x2 = Vector, 2>; 123 | pub type wi64x4 = Vector, 4>; 124 | pub type wi64x8 = Vector, 8>; 125 | pub type wi64x16 = Vector, 16>; 126 | 127 | pub type f32x2 = Vector; 128 | pub type f32x4 = Vector; 129 | pub type f32x8 = Vector; 130 | pub type f32x16 = Vector; 131 | 132 | pub type f64x2 = Vector; 133 | pub type f64x4 = Vector; 134 | pub type f64x8 = Vector; 135 | pub type f64x16 = Vector; 136 | 137 | // Note: the usize/isize vectors are per-pointer-width because they need a different alignment. 
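// (E.g. `usizex2` on a 64-bit target presumably wants the same alignment as `u64x2`, while on a
// 32-bit target the `u32x2` alignment is enough.)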
138 | 139 | #[cfg(target_pointer_width = "32")] 140 | mod sized { 141 | use super::*; 142 | 143 | pub type msizex2 = Vector; 144 | pub type msizex4 = Vector; 145 | pub type msizex8 = Vector; 146 | pub type msizex16 = Vector; 147 | 148 | pub type usizex2 = Vector; 149 | pub type usizex4 = Vector; 150 | pub type usizex8 = Vector; 151 | pub type usizex16 = Vector; 152 | 153 | pub type wusizex2 = Vector, 2>; 154 | pub type wusizex4 = Vector, 4>; 155 | pub type wusizex8 = Vector, 8>; 156 | pub type wusizex16 = Vector, 16>; 157 | 158 | pub type isizex2 = Vector; 159 | pub type isizex4 = Vector; 160 | pub type isizex8 = Vector; 161 | pub type isizex16 = Vector; 162 | 163 | pub type wisizex2 = Vector, 2>; 164 | pub type wisizex4 = Vector, 4>; 165 | pub type wisizex8 = Vector, 8>; 166 | pub type wisizex16 = Vector, 16>; 167 | } 168 | 169 | #[cfg(target_pointer_width = "64")] 170 | mod sized { 171 | use super::*; 172 | 173 | pub type msizex2 = Vector; 174 | pub type msizex4 = Vector; 175 | pub type msizex8 = Vector; 176 | pub type msizex16 = Vector; 177 | 178 | pub type usizex2 = Vector; 179 | pub type usizex4 = Vector; 180 | pub type usizex8 = Vector; 181 | pub type usizex16 = Vector; 182 | 183 | pub type wusizex2 = Vector, 2>; 184 | pub type wusizex4 = Vector, 4>; 185 | pub type wusizex8 = Vector, 8>; 186 | pub type wusizex16 = Vector, 16>; 187 | 188 | pub type isizex2 = Vector; 189 | pub type isizex4 = Vector; 190 | pub type isizex8 = Vector; 191 | pub type isizex16 = Vector; 192 | 193 | pub type wisizex2 = Vector, 2>; 194 | pub type wisizex4 = Vector, 4>; 195 | pub type wisizex8 = Vector, 8>; 196 | pub type wisizex16 = Vector, 16>; 197 | } 198 | 199 | pub use sized::*; 200 | -------------------------------------------------------------------------------- /src/vector.rs: -------------------------------------------------------------------------------- 1 | //! Low-level definitions of the vector types and their traits. 2 | //! 3 | //! While the user usually operates with the type aliases defined in [`types`][crate::types] (and 4 | //! exported through the [`prelude`][crate::prelude], this module provides the actual 5 | //! implementation of the types. 6 | //! 7 | //! The module defines a [`Vector`] type. This allows setting not only the base type and number of 8 | //! lanes, but also alignment (through an additional alignment marker type, available in the 9 | //! [`align`][mod@align] submodule). 10 | //! 11 | //! There are multiple alignments available. Small vectors shouldn't require bigger alignment than 12 | //! their size, while the bigger ones should require larger one to make it possible to use wider 13 | //! SIMD registers. 14 | //! 15 | //! The type aliases in [`types`][crate::types] takes this into account. 16 | //! 17 | //! These types aliases are not thoroughly documented on themselves. The documentation is on the 18 | //! [`Vector`]. A lot of its functionality is in traits it implements. 19 | 20 | use core::fmt::{Debug, Formatter, Result as FmtResult}; 21 | use core::iter::{Product, Sum}; 22 | use core::mem::{self, MaybeUninit}; 23 | use core::ops::*; 24 | use core::ptr; 25 | use num_traits::Float; 26 | 27 | use self::align::Align; 28 | use crate::inner::Repr; 29 | use crate::Mask; 30 | 31 | /// Enforcement of alignment. 32 | /// 33 | /// This is mostly an implementation detail seldom used by consumers of the crate. 34 | pub mod align { 35 | /// Marker trait for alignment enforcers. 36 | /// 37 | /// The SIMD vectors need to be properly aligned. 
Rust allows doing that by an attribute, but that 38 | /// needs another top-level vector type. We use zero-sized types to enforce it in a different way. 39 | /// 40 | /// This is just a marker type for the enforcers, to avoid people putting the wrong parameter at 41 | /// the wrong place. 42 | pub trait Align: Copy {} 43 | 44 | macro_rules! align { 45 | ($name: ident, $align: expr) => { 46 | /// Alignment marker. 47 | #[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] 48 | #[repr(align($align))] 49 | pub struct $name; 50 | impl Align for $name {} 51 | }; 52 | } 53 | 54 | align!(Align1, 1); 55 | align!(Align2, 2); 56 | align!(Align4, 4); 57 | align!(Align8, 8); 58 | align!(Align16, 16); 59 | align!(Align32, 32); 60 | align!(Align64, 64); 61 | align!(Align128, 128); 62 | } 63 | 64 | // TODO: Seal? 65 | /// Trait to look up a mask corresponding to a type. 66 | /// 67 | /// The [`Vector`] implements this and allows for finding out what the corresponding mask type for 68 | /// it is. This is not an inherent associated type because these don't yet exist in Rust. 69 | pub trait Masked { 70 | /// The mask type for this vector. 71 | /// 72 | /// Masks are vector types of boolean-like base types. They are used as results of lane-wise 73 | /// comparisons like [`eq`][Vector::eq] and for enabling subsets of lanes for certain 74 | /// operations, like [`blend`][Vector::blend] and 75 | /// [`gather_load_masked`][Vector::gather_load_masked]. 76 | /// 77 | /// This associated type describes the native mask for the given vector. For example for 78 | /// [`u32x4`][crate::u32x4] it would be [`m32x4`][crate::m32x4]. This is the type that the 79 | /// comparisons produce. While the selection methods accept any mask type of the right number 80 | /// of lanes, using this type on their input is expected to yield the best performance. 81 | type Mask; 82 | } 83 | 84 | macro_rules! bin_op_impl { 85 | ($tr: ident, $meth: ident, $tr_assign: ident, $meth_assign: ident) => { 86 | impl + Repr, const S: usize> $tr for Vector { 87 | type Output = Self; 88 | #[inline] 89 | fn $meth(self, rhs: Self) -> Self { 90 | unsafe { 91 | let mut data = MaybeUninit::::uninit(); 92 | for i in 0..S { 93 | ptr::write( 94 | data.as_mut_ptr().cast::().add(i), 95 | $tr::$meth(self.data[i], rhs.data[i]), 96 | ); 97 | } 98 | data.assume_init() 99 | } 100 | } 101 | } 102 | 103 | impl + Repr, const S: usize> $tr for Vector { 104 | type Output = Self; 105 | #[inline] 106 | fn $meth(self, rhs: B) -> Self { 107 | unsafe { 108 | let mut data = MaybeUninit::::uninit(); 109 | for i in 0..S { 110 | ptr::write( 111 | data.as_mut_ptr().cast::().add(i), 112 | $tr::$meth(self.data[i], rhs), 113 | ); 114 | } 115 | data.assume_init() 116 | } 117 | } 118 | } 119 | 120 | impl $tr_assign for Vector { 121 | #[inline] 122 | fn $meth_assign(&mut self, rhs: Self) { 123 | for i in 0..S { 124 | $tr_assign::$meth_assign(&mut self.data[i], rhs.data[i]); 125 | } 126 | } 127 | } 128 | 129 | impl $tr_assign for Vector { 130 | #[inline] 131 | fn $meth_assign(&mut self, rhs: B) { 132 | for i in 0..S { 133 | $tr_assign::$meth_assign(&mut self.data[i], rhs); 134 | } 135 | } 136 | } 137 | }; 138 | } 139 | 140 | macro_rules! 
una_op_impl { 141 | ($tr: ident, $meth: ident) => { 142 | impl + Repr, const S: usize> $tr for Vector { 143 | type Output = Self; 144 | #[inline] 145 | fn $meth(self) -> Self { 146 | unsafe { 147 | let mut data = MaybeUninit::::uninit(); 148 | for i in 0..S { 149 | ptr::write( 150 | data.as_mut_ptr().cast::().add(i), 151 | $tr::$meth(self.data[i]), 152 | ); 153 | } 154 | data.assume_init() 155 | } 156 | } 157 | } 158 | }; 159 | } 160 | 161 | macro_rules! cmp_op { 162 | ($($(#[ $meta: meta ])* $tr: ident => $op: ident;)*) => { 163 | $( 164 | $(#[ $meta ])* 165 | #[inline] 166 | pub fn $op(self, other: Self) -> ::Mask 167 | where 168 | B: $tr, 169 | { 170 | let mut data = MaybeUninit::<::Mask>::uninit(); 171 | unsafe { 172 | for i in 0..S { 173 | ptr::write( 174 | data.as_mut_ptr().cast::().add(i), 175 | B::Mask::from_bool(self.data[i].$op(&other.data[i])), 176 | ); 177 | } 178 | data.assume_init() 179 | } 180 | } 181 | )* 182 | }; 183 | } 184 | 185 | /// A vector type. 186 | /// 187 | /// Vector types are mostly well aligned fixed sized arrays. Unlike the arrays, they have the usual 188 | /// numeric operators and several helpful methods implemented on them. They perform the operations 189 | /// „per lane“ independently and allow the CPU to parallelize the computations. 190 | /// 191 | /// The types have convenient aliases ‒ for example [`u32x4`][crate::u32x4] is an alias for 192 | /// `Vector` and corresponds to `[u32; 4]` (but aligned to 16 bytes). 193 | /// 194 | /// While these can be operated as arrays (indexing, copying between slices, etc), it is better to 195 | /// perform operations on whole vectors at once. 196 | /// 197 | /// The usual comparing operators don't exist (`<=`), but there are „per lane“ comparison operators 198 | /// that return mask vectors ‒ vectors of boolean-like values. These can either be examined 199 | /// manually, or fed into other operations on vectors, like [`blend`][Vector::blend] or 200 | /// [`gather_load_masked`][Vector::gather_load_masked]. 201 | /// 202 | /// # Examples 203 | /// 204 | /// ```rust 205 | /// # use slipstream::prelude::*; 206 | /// let a = i32x4::new([1, -2, 3, -4]); 207 | /// let b = -a; // [-1, 2, -3, 4] 208 | /// let positive = a.ge(i32x4::splat(1)); // Lane-wise a >= 1 209 | /// // Will take from b where positive is true, from a otherwise 210 | /// let abs = b.blend(a, positive); 211 | /// assert_eq!(abs, i32x4::new([1, 2, 3, 4])); 212 | /// ``` 213 | #[repr(C)] 214 | #[derive(Copy, Clone)] 215 | pub struct Vector 216 | where 217 | A: Align, 218 | B: Repr, 219 | { 220 | _align: [A; 0], 221 | data: [B; S], 222 | } 223 | 224 | impl Vector 225 | where 226 | A: Align, 227 | B: Repr, 228 | { 229 | /// Number of lanes of the vector. 230 | pub const LANES: usize = S; 231 | 232 | #[inline(always)] 233 | fn assert_size() { 234 | assert!(S > 0); 235 | assert!( 236 | isize::MAX as usize > mem::size_of::(), 237 | "Vector type too huge", 238 | ); 239 | assert_eq!( 240 | mem::size_of::(), 241 | mem::size_of::<[B; S]>(), 242 | "Must not contain paddings/invalid Align parameter", 243 | ); 244 | } 245 | 246 | /// Loads the vector without doing bounds checks. 247 | /// 248 | /// # Safety 249 | /// 250 | /// The pointed to memory must be valid in `Self::LANES` consecutive cells ‒ eg. it must 251 | /// contain a full array of the base types. 
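    ///
    /// # Example
    ///
    /// A minimal sketch; the array below satisfies the safety requirement because it provides
    /// exactly `LANES` consecutive, initialized elements of the base type.
    ///
    /// ```rust
    /// # use slipstream::prelude::*;
    /// let data = [1u32, 2, 3, 4];
    /// // SAFETY: `data` holds `u32x4::LANES` consecutive `u32` values.
    /// let v = unsafe { u32x4::new_unchecked(data.as_ptr()) };
    /// assert_eq!(v, u32x4::new([1, 2, 3, 4]));
    /// ```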
252 | #[inline] 253 | pub unsafe fn new_unchecked(input: *const B) -> Self { 254 | Self::assert_size(); 255 | Self { 256 | _align: [], 257 | data: ptr::read(input.cast()), 258 | } 259 | } 260 | /// Loads the vector from correctly sized slice. 261 | /// 262 | /// This loads the vector from correctly sized slice or anything that can be converted to it ‒ 263 | /// specifically, fixed sized arrays and other vectors work. 264 | /// 265 | /// # Example 266 | /// 267 | /// ```rust 268 | /// # use slipstream::prelude::*; 269 | /// let vec = (0..10).collect::>(); 270 | /// let v1 = u32x4::new(&vec[0..4]); 271 | /// let v2 = u32x4::new(v1); 272 | /// let v3 = u32x4::new([2, 3, 4, 5]); 273 | /// assert_eq!(v1 + v2 + v3, u32x4::new([2, 5, 8, 11])); 274 | /// ``` 275 | /// 276 | /// # Panics 277 | /// 278 | /// If the provided slice is of incompatible size. 279 | #[inline] 280 | pub fn new(input: I) -> Self 281 | where 282 | I: AsRef<[B]>, 283 | { 284 | let input = input.as_ref(); 285 | assert_eq!( 286 | input.len(), 287 | S, 288 | "Creating vector from the wrong sized slice (expected {}, got {})", 289 | S, 290 | input.len(), 291 | ); 292 | unsafe { Self::new_unchecked(input.as_ptr()) } 293 | } 294 | 295 | // TODO: Can we turn it into const fn? 296 | /// Produces a vector of all lanes set to the same value. 297 | /// 298 | /// ```rust 299 | /// # use slipstream::prelude::*; 300 | /// let v = f32x4::splat(1.2); 301 | /// assert_eq!(v, f32x4::new([1.2, 1.2, 1.2, 1.2])); 302 | /// ``` 303 | #[inline] 304 | pub fn splat(value: B) -> Self { 305 | Self::assert_size(); 306 | Self { 307 | _align: [], 308 | data: [value; S], 309 | } 310 | } 311 | 312 | /// Loads the vector from a slice by indexing it. 313 | /// 314 | /// Unlike [`new`], this can load the vector from discontinuous parts of the slice, out of 315 | /// order or multiple lanes from the same location. This flexibility comes at the cost of lower 316 | /// performance (in particular, I've never seen this to get auto-vectorized even though a 317 | /// gather instruction exists), therefore prefer [`new`] where possible. 318 | /// 319 | /// # Examples 320 | /// 321 | /// ```rust 322 | /// # use slipstream::prelude::*; 323 | /// let input = (2..100).collect::>(); 324 | /// let vec = u32x4::gather_load(&input, [3, 3, 1, 32]); 325 | /// assert_eq!(vec, u32x4::new([5, 5, 3, 34])); 326 | /// ``` 327 | /// 328 | /// It is possible to use another vector as the indices: 329 | /// 330 | /// ```rust 331 | /// # use slipstream::prelude::*; 332 | /// let indices = usizex4::new([1, 2, 3, 4]) * usizex4::splat(2); 333 | /// let input = (0..10).collect::>(); 334 | /// let vec = u32x4::gather_load(&input, indices); 335 | /// assert_eq!(vec, u32x4::new([2, 4, 6, 8])); 336 | /// ``` 337 | /// 338 | /// It is possible to use another vector as an input, allowing to narrow it down or shuffle. 339 | /// 340 | /// ```rust 341 | /// # use slipstream::prelude::*; 342 | /// let a = u32x4::new([1, 2, 3, 4]); 343 | /// let b = u32x4::gather_load(a, [2, 0, 1, 3]); 344 | /// assert_eq!(b, u32x4::new([3, 1, 2, 4])); 345 | /// let c = u32x2::gather_load(a, [2, 2]); 346 | /// assert_eq!(c, u32x2::new([3, 3])); 347 | /// ``` 348 | /// 349 | /// # Panics 350 | /// 351 | /// * If the `idx` slice doesn't have the same length as the vector. 352 | /// * If any of the indices is out of bounds of the `input`. 
353 |     ///
354 |     /// [`new`]: Vector::new
355 |     #[inline]
356 |     pub fn gather_load<I, Idx>(input: I, idx: Idx) -> Self
357 |     where
358 |         I: AsRef<[B]>,
359 |         Idx: AsRef<[usize]>,
360 |     {
361 |         Self::assert_size();
362 |         let input = input.as_ref();
363 |         let idx = idx.as_ref();
364 |         assert_eq!(
365 |             S,
366 |             idx.len(),
367 |             "Gathering vector from wrong number of indexes"
368 |         );
369 |         assert!(idx.iter().all(|&l| l < input.len()), "Gather out of bounds");
370 |         let mut data = MaybeUninit::<Self>::uninit();
371 |         unsafe {
372 |             for i in 0..S {
373 |                 let idx = *idx.get_unchecked(i);
374 |                 let input = *input.get_unchecked(idx);
375 |                 ptr::write(data.as_mut_ptr().cast::<B>().add(i), input);
376 |             }
377 |             data.assume_init()
378 |         }
379 |     }
380 | 
381 |     /// Loads enabled lanes from a slice by indexing it.
382 |     ///
383 |     /// This is similar to [`gather_load`]. However, the loading of lanes is
384 |     /// enabled by a mask. If the corresponding lane mask is not set, the value is taken from
385 |     /// `self`. In other words, if the mask is all-true, it is semantically equivalent to
386 |     /// [`gather_load`], except with possibly worse performance.
387 |     ///
388 |     /// # Examples
389 |     ///
390 |     /// ```rust
391 |     /// # use slipstream::prelude::*;
392 |     /// let input = (0..100).collect::<Vec<_>>();
393 |     /// let v = u32x4::default().gather_load_masked(
394 |     ///     &input,
395 |     ///     [1, 4, 2, 2],
396 |     ///     [m32::TRUE, m32::FALSE, m32::FALSE, m32::TRUE]
397 |     /// );
398 |     /// assert_eq!(v, u32x4::new([1, 0, 0, 2]));
399 |     /// ```
400 |     ///
401 |     /// ```rust
402 |     /// # use slipstream::prelude::*;
403 |     /// let left = u32x2::new([1, 2]);
404 |     /// let right = u32x2::new([3, 4]);
405 |     /// let idx = usizex4::new([0, 1, 0, 1]);
406 |     /// let mask = m32x4::new([m32::TRUE, m32::TRUE, m32::FALSE, m32::FALSE]);
407 |     /// let v = u32x4::default()
408 |     ///     .gather_load_masked(left, idx, mask)
409 |     ///     .gather_load_masked(right, idx, !mask);
410 |     /// assert_eq!(v, u32x4::new([1, 2, 3, 4]));
411 |     /// ```
412 |     ///
413 |     /// # Panics
414 |     ///
415 |     /// * If the `mask` or the `idx` parameter has a different length than the vector.
416 |     /// * If any of the active indices are out of bounds of `input`.
417 |     ///
418 |     /// [`gather_load`]: Vector::gather_load
419 |     #[inline]
420 |     pub fn gather_load_masked<I, Idx, M, MB>(mut self, input: I, idx: Idx, mask: M) -> Self
421 |     where
422 |         I: AsRef<[B]>,
423 |         Idx: AsRef<[usize]>,
424 |         M: AsRef<[MB]>,
425 |         MB: Mask,
426 |     {
427 |         let input = input.as_ref();
428 |         let idx = idx.as_ref();
429 |         let mask = mask.as_ref();
430 |         let len = idx.len();
431 |         assert_eq!(S, len, "Gathering vector from wrong number of indexes");
432 |         assert_eq!(S, mask.len(), "Gathering with wrong sized mask");
433 |         for i in 0..S {
434 |             unsafe {
435 |                 if mask.get_unchecked(i).bool() {
436 |                     let idx = *idx.get_unchecked(i);
437 |                     self[i] = input[idx];
438 |                 }
439 |             }
440 |         }
441 |         self
442 |     }
443 | 
444 |     /// Stores the content into a contiguous slice of the correct length.
445 |     ///
446 |     /// This is less general than [`scatter_store`][Vector::scatter_store], which allows storing
447 |     /// to different parts of the slice.
448 |     ///
449 |     /// The counterpart of this is [`new`][Vector::new].
450 |     ///
451 |     /// # Panics
452 |     ///
453 |     /// If the length doesn't match.
454 |     #[inline]
455 |     pub fn store<O: AsMut<[B]>>(self, mut output: O) {
456 |         output.as_mut().copy_from_slice(&self[..])
457 |     }
458 | 
459 |     /// Store the vector into a slice by indexing it.
460 |     ///
461 |     /// This is the inverse of [`gather_load`][Vector::gather_load]. It takes the lanes of the
462 |     /// vector and stores them into the slice at the given indices.
463 |     ///
464 |     /// If you want to store it into a contiguous slice, it is potentially faster to do it using
465 |     /// the `copy_from_slice` method or by [`store`][Vector::store]:
466 |     ///
467 |     /// ```rust
468 |     /// # use slipstream::prelude::*;
469 |     /// let mut data = vec![0; 6];
470 |     /// let v = u32x4::new([1, 2, 3, 4]);
471 |     /// data[0..4].copy_from_slice(&v[..]);
472 |     /// assert_eq!(&data[..], &[1, 2, 3, 4, 0, 0]);
473 |     /// v.store(&mut data[..4]);
474 |     /// assert_eq!(&data[..], &[1, 2, 3, 4, 0, 0]);
475 |     /// ```
476 |     ///
477 |     /// # Examples
478 |     ///
479 |     /// ```rust
480 |     /// # use slipstream::prelude::*;
481 |     /// let mut data = vec![0; 6];
482 |     /// let v = u32x4::new([1, 2, 3, 4]);
483 |     /// v.scatter_store(&mut data, [2, 5, 0, 1]);
484 |     /// assert_eq!(&data[..], &[3, 4, 1, 0, 0, 2]);
485 |     /// ```
486 |     ///
487 |     /// # Warning
488 |     ///
489 |     /// If multiple lanes are to be stored into the same slice element, it is not specified which
490 |     /// of them will end up being stored. It is not UB to do so and it'll always be one of them;
491 |     /// however, which one may change between versions or even between compilation targets.
492 |     ///
493 |     /// This is to allow for potentially different behaviour on different platforms.
494 |     ///
495 |     /// # Panics
496 |     ///
497 |     /// * If the `idx` has a different length than the vector.
498 |     /// * If any of the indices are out of bounds of `output`.
499 |     #[inline]
500 |     pub fn scatter_store<O, Idx>(self, mut output: O, idx: Idx)
501 |     where
502 |         O: AsMut<[B]>,
503 |         Idx: AsRef<[usize]>,
504 |     {
505 |         let output = output.as_mut();
506 |         let idx = idx.as_ref();
507 |         assert_eq!(S, idx.len(), "Scattering vector to wrong number of indexes");
508 |         // Check prior to starting the scatter before we write anything. Might be nicer for the
509 |         // optimizer + we don't want to do a partial scatter.
510 |         assert!(
511 |             idx.iter().all(|&l| l < output.len()),
512 |             "Scatter out of bounds"
513 |         );
514 |         for i in 0..S {
515 |             unsafe {
516 |                 // get_unchecked: index checked above in bulk and we use this one in hope
517 |                 // it'll taste better to the autovectorizer and it might find a scatter
518 |                 // instruction for us.
519 |                 let idx = *idx.get_unchecked(i);
520 |                 *output.get_unchecked_mut(idx) = self[i];
521 |             }
522 |         }
523 |     }
524 | 
525 |     /// A masked version of [`scatter_store`].
526 |     ///
527 |     /// This acts in the same way as [`scatter_store`], except lanes disabled by the `mask` are not
528 |     /// stored anywhere.
529 |     ///
530 |     /// # Panics
531 |     ///
532 |     /// * If the `idx` or `mask` has a different length than the vector.
533 |     /// * If any of the active indices are out of bounds of `output`.
534 |     ///
535 |     /// [`scatter_store`]: Vector::scatter_store
536 |     #[inline]
537 |     pub fn scatter_store_masked<O, Idx, M, MB>(self, mut output: O, idx: Idx, mask: M)
538 |     where
539 |         O: AsMut<[B]>,
540 |         Idx: AsRef<[usize]>,
541 |         M: AsRef<[MB]>,
542 |         MB: Mask,
543 |     {
544 |         let output = output.as_mut();
545 |         let idx = idx.as_ref();
546 |         let mask = mask.as_ref();
547 |         assert_eq!(S, idx.len(), "Scattering vector to wrong number of indexes");
548 |         assert_eq!(S, mask.len(), "Scattering vector with wrong sized mask");
549 |         // Check prior to starting the scatter before we write anything. Might be nicer for the
550 |         // optimizer + we don't want to do a partial scatter.
551 |         let in_bounds = idx
552 |             .iter()
553 |             .enumerate()
554 |             .all(|(i, &l)| !mask[i].bool() || l < output.len());
555 |         assert!(in_bounds, "Scatter out of bounds");
556 |         for i in 0..S {
557 |             if mask[i].bool() {
558 |                 unsafe {
559 |                     // get_unchecked: index checked above in bulk and we use this one in
560 |                     // hope it'll taste better to the autovectorizer and it might find a
561 |                     // scatter instruction for us.
562 |                     let idx = *idx.get_unchecked(i);
563 |                     *output.get_unchecked_mut(idx) = self[i];
564 |                 }
565 |             }
566 |         }
567 |     }
568 | 
569 |     /// Blends `self` and `other` using `mask`.
570 |     ///
571 |     /// Imports enabled lanes from `other`, keeps disabled lanes from `self`.
572 |     ///
573 |     /// # Examples
574 |     ///
575 |     /// ```rust
576 |     /// # use slipstream::prelude::*;
577 |     /// let odd = u32x4::new([1, 3, 5, 7]);
578 |     /// let even = u32x4::new([2, 4, 6, 8]);
579 |     /// let mask = m32x4::new([m32::TRUE, m32::FALSE, m32::TRUE, m32::FALSE]);
580 |     /// assert_eq!(odd.blend(even, mask), u32x4::new([2, 3, 6, 7]));
581 |     /// ```
582 |     #[inline]
583 |     pub fn blend<M, MB>(self, other: Self, mask: M) -> Self
584 |     where
585 |         M: AsRef<[MB]>,
586 |         MB: Mask,
587 |     {
588 |         let mut data = MaybeUninit::<Self>::uninit();
589 |         let mask = mask.as_ref();
590 |         unsafe {
591 |             for i in 0..S {
592 |                 ptr::write(
593 |                     data.as_mut_ptr().cast::<B>().add(i),
594 |                     if mask[i].bool() { other[i] } else { self[i] },
595 |                 );
596 |             }
597 |             data.assume_init()
598 |         }
599 |     }
600 | 
601 |     /// A lane-wise maximum.
602 |     ///
603 |     /// # Examples
604 |     ///
605 |     /// ```rust
606 |     /// # use slipstream::prelude::*;
607 |     /// let a = u32x4::new([1, 4, 2, 5]);
608 |     /// let b = u32x4::new([2, 3, 2, 6]);
609 |     /// assert_eq!(a.maximum(b), u32x4::new([2, 4, 2, 6]));
610 |     /// ```
611 |     #[inline]
612 |     pub fn maximum(self, other: Self) -> Self
613 |     where
614 |         B: PartialOrd,
615 |     {
616 |         let m = self.lt(other);
617 |         self.blend(other, m)
618 |     }
619 | 
620 |     /// A lane-wise minimum.
621 |     ///
622 |     /// # Examples
623 |     ///
624 |     /// ```rust
625 |     /// # use slipstream::prelude::*;
626 |     /// let a = u32x4::new([1, 4, 2, 5]);
627 |     /// let b = u32x4::new([2, 3, 2, 6]);
628 |     /// assert_eq!(a.minimum(b), u32x4::new([1, 3, 2, 5]));
629 |     /// ```
630 |     #[inline]
631 |     pub fn minimum(self, other: Self) -> Self
632 |     where
633 |         B: PartialOrd,
634 |     {
635 |         let m = self.gt(other);
636 |         self.blend(other, m)
637 |     }
638 | 
639 |     // TODO: Example
640 |     /// Sums the lanes together.
641 |     ///
642 |     /// The additions are done in a tree manner: `(a[0] + a[1]) + (a[2] + a[3])`.
643 |     ///
644 |     /// Note that this is potentially a slow operation. Prefer to do as many operations as
645 |     /// possible on whole vectors and only perform the horizontal operation at the very end.
646 |     #[inline]
647 |     pub fn horizontal_sum(self) -> B
648 |     where
649 |         B: Add<Output = B>,
650 |     {
651 |         #[inline(always)]
652 |         fn inner<B: Copy + Add<Output = B>>(d: &[B]) -> B {
653 |             if d.len() == 1 {
654 |                 d[0]
655 |             } else {
656 |                 let mid = d.len() / 2;
657 |                 inner(&d[..mid]) + inner(&d[mid..])
658 |             }
659 |         }
660 |         inner(&self.data)
661 |     }
662 | 
663 |     /// Multiplies all the lanes of the vector.
664 |     ///
665 |     /// The multiplications are done in a tree manner: `(a[0] * a[1]) * (a[2] * a[3])`.
666 |     ///
667 |     /// Note that this is potentially a slow operation. Prefer to do as many operations as
668 |     /// possible on whole vectors and only perform the horizontal operation at the very end.
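    ///
    /// # Example
    ///
    /// A small sketch of the horizontal reduction:
    ///
    /// ```rust
    /// # use slipstream::prelude::*;
    /// let v = u32x4::new([1, 2, 3, 4]);
    /// assert_eq!(v.horizontal_product(), 24);
    /// ```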
669 | #[inline] 670 | pub fn horizontal_product(self) -> B 671 | where 672 | B: Mul, 673 | { 674 | #[inline(always)] 675 | fn inner>(d: &[B]) -> B { 676 | if d.len() == 1 { 677 | d[0] 678 | } else { 679 | let mid = d.len() / 2; 680 | inner(&d[..mid]) * inner(&d[mid..]) 681 | } 682 | } 683 | inner(&self.data) 684 | } 685 | 686 | cmp_op!( 687 | /// Lane-wise `==`. 688 | PartialEq => eq; 689 | 690 | /// Lane-wise `<`. 691 | PartialOrd => lt; 692 | 693 | /// Lane-wise `>`. 694 | PartialOrd => gt; 695 | 696 | /// Lane-wise `<=`. 697 | PartialOrd => le; 698 | 699 | /// Lane-wise `>=`. 700 | PartialOrd => ge; 701 | ); 702 | } 703 | 704 | impl Vector 705 | where 706 | A: Align, 707 | B: Repr + Float, 708 | { 709 | /// Fused multiply-add. Computes (self * a) + b with only one rounding 710 | /// error, yielding a more accurate result than an unfused multiply-add. 711 | /// 712 | /// Using mul_add can be more performant than an unfused multiply-add if the 713 | /// target architecture has a dedicated fma CPU instruction. 714 | #[inline] 715 | pub fn mul_add(self, a: Self, b: Self) -> Self { 716 | let mut result = Self::splat(B::zero()); 717 | for ((res, &s), (&a, &b)) in result 718 | .data 719 | .iter_mut() 720 | .zip(self.data.iter()) 721 | .zip(a.data.iter().zip(b.data.iter())) 722 | { 723 | *res = s.mul_add(a, b); 724 | } 725 | result 726 | } 727 | } 728 | 729 | impl Masked for Vector { 730 | type Mask = Vector; 731 | } 732 | 733 | impl Default for Vector { 734 | #[inline] 735 | fn default() -> Self { 736 | Self::splat(Default::default()) 737 | } 738 | } 739 | 740 | impl Debug for Vector { 741 | fn fmt(&self, fmt: &mut Formatter) -> FmtResult { 742 | fmt.debug_tuple("Vector").field(&self.data).finish() 743 | } 744 | } 745 | 746 | impl Deref for Vector { 747 | type Target = [B; S]; 748 | #[inline] 749 | fn deref(&self) -> &[B; S] { 750 | &self.data 751 | } 752 | } 753 | 754 | impl DerefMut for Vector { 755 | #[inline] 756 | fn deref_mut(&mut self) -> &mut [B; S] { 757 | &mut self.data 758 | } 759 | } 760 | 761 | impl AsRef<[B]> for Vector { 762 | #[inline] 763 | fn as_ref(&self) -> &[B] { 764 | &self.data 765 | } 766 | } 767 | 768 | impl AsRef<[B; S]> for Vector { 769 | #[inline] 770 | fn as_ref(&self) -> &[B; S] { 771 | &self.data 772 | } 773 | } 774 | 775 | impl AsMut<[B]> for Vector { 776 | #[inline] 777 | fn as_mut(&mut self) -> &mut [B] { 778 | &mut self.data 779 | } 780 | } 781 | 782 | impl AsMut<[B; S]> for Vector { 783 | #[inline] 784 | fn as_mut(&mut self) -> &mut [B; S] { 785 | &mut self.data 786 | } 787 | } 788 | 789 | impl From<[B; S]> for Vector { 790 | #[inline] 791 | fn from(data: [B; S]) -> Self { 792 | Self::assert_size(); 793 | Self { _align: [], data } 794 | } 795 | } 796 | 797 | impl From> for [B; S] { 798 | #[inline] 799 | fn from(vector: Vector) -> [B; S] { 800 | vector.data 801 | } 802 | } 803 | 804 | impl Index for Vector 805 | where 806 | A: Align, 807 | B: Repr, 808 | [B; S]: Index, 809 | { 810 | type Output = <[B; S] as Index>::Output; 811 | #[inline] 812 | fn index(&self, idx: I) -> &Self::Output { 813 | &self.data[idx] 814 | } 815 | } 816 | 817 | impl IndexMut for Vector 818 | where 819 | A: Align, 820 | B: Repr, 821 | [B; S]: IndexMut, 822 | { 823 | #[inline] 824 | fn index_mut(&mut self, idx: I) -> &mut Self::Output { 825 | &mut self.data[idx] 826 | } 827 | } 828 | 829 | impl Sum for Vector { 830 | #[inline] 831 | fn sum(iter: I) -> Self 832 | where 833 | I: Iterator, 834 | { 835 | let mut result = Self::default(); 836 | for i in iter { 837 | result += i; 838 | } 839 
| 840 | result 841 | } 842 | } 843 | 844 | impl Product for Vector { 845 | #[inline] 846 | fn product(iter: I) -> Self 847 | where 848 | I: Iterator, 849 | { 850 | let mut result = Self::splat(B::ONE); 851 | for i in iter { 852 | result *= i; 853 | } 854 | 855 | result 856 | } 857 | } 858 | 859 | bin_op_impl!(Add, add, AddAssign, add_assign); 860 | bin_op_impl!(Sub, sub, SubAssign, sub_assign); 861 | bin_op_impl!(Mul, mul, MulAssign, mul_assign); 862 | bin_op_impl!(Div, div, DivAssign, div_assign); 863 | bin_op_impl!(Rem, rem, RemAssign, rem_assign); 864 | bin_op_impl!(BitAnd, bitand, BitAndAssign, bitand_assign); 865 | bin_op_impl!(BitOr, bitor, BitOrAssign, bitor_assign); 866 | bin_op_impl!(BitXor, bitxor, BitXorAssign, bitxor_assign); 867 | bin_op_impl!(Shl, shl, ShlAssign, shl_assign); 868 | bin_op_impl!(Shr, shr, ShrAssign, shr_assign); 869 | 870 | una_op_impl!(Neg, neg); 871 | una_op_impl!(Not, not); 872 | 873 | impl PartialEq for Vector { 874 | #[inline] 875 | fn eq(&self, other: &Self) -> bool { 876 | self.data == other.data 877 | } 878 | } 879 | 880 | impl Eq for Vector {} 881 | 882 | impl PartialEq<[B; S]> for Vector { 883 | #[inline] 884 | fn eq(&self, other: &[B; S]) -> bool { 885 | self.data == *other 886 | } 887 | } 888 | 889 | impl PartialEq> for [B; S] { 890 | #[inline] 891 | fn eq(&self, other: &Vector) -> bool { 892 | *self == other.data 893 | } 894 | } 895 | 896 | #[cfg(test)] 897 | mod tests { 898 | use super::*; 899 | use crate::prelude::*; 900 | 901 | type V = u16x4; 902 | 903 | #[test] 904 | #[should_panic(expected = "Creating vector from the wrong sized slice (expected 4, got 3)")] 905 | fn wrong_size_new() { 906 | V::new([1, 2, 3]); 907 | } 908 | 909 | #[test] 910 | fn round_trip() { 911 | let orig = [1, 2, 3, 4]; 912 | assert_eq!(<[u16; 4]>::from(u16x4::from(orig)), orig); 913 | } 914 | 915 | #[test] 916 | fn shuffle() { 917 | let v1 = V::new([1, 2, 3, 4]); 918 | let v2 = V::gather_load(v1, [3, 1, 2, 0]); 919 | assert_eq!(v2.deref(), &[4, 2, 3, 1]); 920 | let v3 = V::gather_load(v2, [0, 0, 2, 2]); 921 | assert_eq!(v3.deref(), &[4, 4, 3, 3]); 922 | } 923 | 924 | #[test] 925 | fn gather() { 926 | let data = (1..=10).collect::>(); 927 | let v = V::gather_load(data, [0, 2, 4, 6]); 928 | assert_eq!(v, [1, 3, 5, 7]); 929 | } 930 | 931 | #[test] 932 | fn scatter() { 933 | let v = V::new([1, 2, 3, 4]); 934 | let mut output = [0; 10]; 935 | v.scatter_store(&mut output, [1, 3, 5, 7]); 936 | assert_eq!(output, [0, 1, 0, 2, 0, 3, 0, 4, 0, 0]); 937 | } 938 | 939 | #[test] 940 | #[should_panic(expected = "Gather out of bounds")] 941 | fn gather_oob() { 942 | V::gather_load([1, 2, 3], [0, 1, 2, 3]); 943 | } 944 | 945 | #[test] 946 | #[should_panic(expected = "Gathering vector from wrong number of indexes")] 947 | fn gather_idx_cnt() { 948 | V::gather_load([0, 1, 2, 3, 4], [0, 1]); 949 | } 950 | 951 | #[test] 952 | #[should_panic(expected = "Scatter out of bounds")] 953 | fn scatter_oob() { 954 | let mut out = [0; 10]; 955 | V::new([1, 2, 3, 4]).scatter_store(&mut out, [0, 1, 2, 15]); 956 | } 957 | 958 | #[test] 959 | #[should_panic(expected = "Scattering vector to wrong number of indexes")] 960 | fn scatter_idx_cnt() { 961 | let mut out = [0; 10]; 962 | V::new([1, 2, 3, 4]).scatter_store(&mut out, [0, 1, 2]); 963 | } 964 | 965 | // TODO: Tests for out of bounds index on masked loads/stores + tests for index out of bound 966 | // but disabled by the mask 967 | 968 | const T: m32 = m32::TRUE; 969 | const F: m32 = m32::FALSE; 970 | 971 | #[test] 972 | fn cmp() { 973 | let v1 = 
u32x4::new([1, 3, 5, 7]); 974 | let v2 = u32x4::new([2, 3, 4, 5]); 975 | 976 | assert_eq!(v1.eq(v2), m32x4::new([F, T, F, F])); 977 | assert_eq!(v1.le(v2), m32x4::new([T, T, F, F])); 978 | assert_eq!(v1.ge(v2), m32x4::new([F, T, T, T])); 979 | } 980 | 981 | #[test] 982 | fn blend() { 983 | let v1 = u32x4::new([1, 2, 3, 4]); 984 | let v2 = u32x4::new([5, 6, 7, 8]); 985 | 986 | let b1 = v1.blend(v2, m32x4::new([F, T, F, T])); 987 | assert_eq!(b1, u32x4::new([1, 6, 3, 8])); 988 | 989 | let b2 = v1.blend(v2, [false, true, false, true]); 990 | assert_eq!(b1, b2); 991 | } 992 | 993 | #[test] 994 | fn fma() { 995 | let a = f32x4::new([1.0, 2.0, 3.0, 4.0]); 996 | let b = f32x4::new([5.0, 6.0, 7.0, 8.0]); 997 | let c = f32x4::new([9.0, 10.0, 11.0, 12.0]); 998 | 999 | assert_eq!(a.mul_add(b, c), f32x4::new([14.0, 22.0, 32.0, 44.0])); 1000 | } 1001 | } 1002 | --------------------------------------------------------------------------------