├── .github ├── codecov.yml └── workflows │ ├── audit.yaml │ ├── benchmarks.yaml │ ├── coverage.yaml │ └── test.yaml ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── benches └── track.rs ├── benchmarks ├── Cargo.toml └── benches │ ├── dot_product.rs │ ├── life.rs │ ├── simple.rs │ ├── sum.rs │ └── utils.rs ├── examples └── matrix.rs ├── proptest-regressions └── sse.txt └── src ├── iterators.rs ├── lib.rs ├── mask.rs ├── types.rs └── vector.rs /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | comment: 2 | layout: "diff, flags, files" 3 | require_changes: true 4 | 5 | coverage: 6 | status: 7 | project: 8 | default: 9 | informational: true 10 | -------------------------------------------------------------------------------- /.github/workflows/audit.yaml: -------------------------------------------------------------------------------- 1 | name: Security audit 2 | on: 3 | pull_request: 4 | push: 5 | branches: 6 | - master 7 | schedule: 8 | - cron: '0 0 * * 0' 9 | 10 | jobs: 11 | security_audit: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - uses: actions-rs/audit-check@35b7b53b1e25b55642157ac01b4adceb5b9ebef3 16 | with: 17 | token: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /.github/workflows/benchmarks.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | push: 4 | branches: 5 | - master 6 | # Run once a week to preserve the cache 7 | # (even though it still feels the cache gets lost sometimes?) 8 | # FIXME: Doesn't seem to be working. Using the GH pages thing for now. 9 | #schedule: 10 | # - cron: '0 0 * * 0' 11 | 12 | name: benchmark pull requests 13 | 14 | jobs: 15 | runBenchmark: 16 | name: run benchmark 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v2 21 | with: 22 | fetch-depth: 0 23 | 24 | - name: Install Rust 25 | uses: actions-rs/toolchain@v1 26 | with: 27 | toolchain: stable 28 | default: true 29 | profile: minimal 30 | 31 | - name: Restore compile cache 32 | uses: Swatinem/rust-cache@v1 33 | 34 | - name: Restore previous benchmark data 35 | uses: actions/cache@v2 36 | with: 37 | path: ./bench-cache 38 | key: ${{ runner.os }}-benchmark 39 | 40 | - name: Run benchmarks 41 | # We choose just the tracking ones. There's a whole fleet that we check 42 | # that compile, but they are too heavy both to run in CI and to show in 43 | # the PRs. And they mostly compare us to other methods. 44 | # 45 | # Provide the bencher output, as the following tool knows how to read that. 46 | run: cargo bench --bench track -- --output-format bencher | grep -v 'Gnuplot not found' | tee benches.out 47 | 48 | - name: Compare benchmarks 49 | uses: rhysd/github-action-benchmark@4eed2c2f4cd0d374720c4b913f79faa8aafcfa6b 50 | with: 51 | name: Track benchmarks 52 | tool: cargo 53 | output-file-path: benches.out 54 | github-token: ${{ secrets.GITHUB_TOKEN }} 55 | auto-push: true 56 | alert-threshold: '150%' 57 | comment-on-alert: true 58 | comment-always: true 59 | # We don't want that to fail. Both our benchmarks and the CI are a 60 | # bit noisy and we have quite a few measurements, so the chance of 61 | # one failing at random is quite high. It's still nice to have it 62 | # measured and available as a comment. 
63 | fail-on-alert: false 64 | #external-data-json-path: ./bench-cache/benchmark-data.json 65 | # Because it doesn't put it into the PR, it puts it into the commit :-| 66 | alert-comment-cc-users: '@vorner' 67 | -------------------------------------------------------------------------------- /.github/workflows/coverage.yaml: -------------------------------------------------------------------------------- 1 | name: Test coverage 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | RUST_BACKTRACE: full 12 | 13 | jobs: 14 | coverage: 15 | name: Coverage 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v2 20 | 21 | - name: Install Rust 22 | uses: actions-rs/toolchain@v1 23 | with: 24 | toolchain: nightly 25 | profile: minimal 26 | default: true 27 | 28 | - name: Restore cache 29 | uses: Swatinem/rust-cache@v1 30 | 31 | - name: Run cargo-tarpaulin 32 | uses: actions-rs/tarpaulin@v0.1 33 | with: 34 | args: '--all-features --run-types Doctests,Tests' 35 | timeout: 120 36 | 37 | - name: Upload to codecov.io 38 | uses: codecov/codecov-action@5a8bb4701eca7ba3673f21664b887f652c58d0a3 39 | with: 40 | token: ${{ secrets.CODECOV_TOKEN }} 41 | 42 | - name: Archive code coverage results 43 | uses: actions/upload-artifact@v2 44 | with: 45 | name: code-coverage-report 46 | path: cobertura.xml 47 | retention-days: 30 48 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | 3 | on: 4 | push: 5 | 6 | env: 7 | CARGO_TERM_COLOR: always 8 | RUST_BACKTRACE: full 9 | 10 | jobs: 11 | test: 12 | name: Build & test 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | os: 17 | - ubuntu-latest 18 | - macos-latest 19 | - windows-latest 20 | rust: 21 | - stable 22 | - beta 23 | - nightly 24 | 25 | runs-on: ${{ matrix.os }} 26 | 27 | steps: 28 | - name: checkout 29 | uses: actions/checkout@v2 30 | 31 | - name: Install Rust 32 | uses: actions-rs/toolchain@v1 33 | with: 34 | toolchain: ${{ matrix.rust }} 35 | default: true 36 | profile: minimal 37 | 38 | - name: Restore cache 39 | uses: Swatinem/rust-cache@v1 40 | 41 | - name: Build & test 42 | env: 43 | RUST_VERSION: ${{ matrix.rust }} 44 | OS: ${{ matrix.os }} 45 | RUSTFLAGS: -D warnings 46 | run: cargo test --all-features 47 | 48 | rustfmt: 49 | name: Check formatting 50 | runs-on: ubuntu-latest 51 | steps: 52 | - name: checkout 53 | uses: actions/checkout@v2 54 | 55 | - name: Install Rust 56 | uses: actions-rs/toolchain@v1 57 | with: 58 | profile: minimal 59 | toolchain: stable 60 | default: true 61 | components: rustfmt 62 | 63 | - run: cargo fmt --all -- --check 64 | 65 | links: 66 | name: Check documentation links 67 | runs-on: ubuntu-latest 68 | steps: 69 | - name: checkout 70 | uses: actions/checkout@v2 71 | 72 | - name: Install Rust 73 | uses: actions-rs/toolchain@v1 74 | with: 75 | toolchain: stable 76 | default: true 77 | 78 | - name: Restore cache 79 | uses: Swatinem/rust-cache@v1 80 | 81 | - name: Check links 82 | run: cargo rustdoc --all-features -- -D warnings 83 | 84 | clippy: 85 | name: Clippy lints 86 | runs-on: ubuntu-latest 87 | steps: 88 | - name: Checkout repository 89 | uses: actions/checkout@v2 90 | 91 | - name: Install Rust 92 | uses: actions-rs/toolchain@v1 93 | with: 94 | toolchain: stable 95 | profile: minimal 96 | default: true 97 | components: clippy 98 | 99 | - name: 
Restore cache 100 | uses: Swatinem/rust-cache@v1 101 | 102 | - name: Run clippy linter 103 | run: cargo clippy --all --all-features --tests -- -D clippy::all -D warnings 104 | 105 | # miri: 106 | # name: Miri checks 107 | # runs-on: ubuntu-latest 108 | # steps: 109 | # - name: Checkout repository 110 | # uses: actions/checkout@v2 111 | # 112 | # - name: Install Rust 113 | # uses: actions-rs/toolchain@v1 114 | # with: 115 | # toolchain: nightly 116 | # profile: minimal 117 | # default: true 118 | # components: "miri" 119 | # 120 | # - name: Restore cache 121 | # uses: Swatinem/rust-cache@v1 122 | # 123 | # - name: Run miri 124 | # env: 125 | # PROPTEST_CASES: "10" 126 | # MIRIFLAGS: "-Zmiri-disable-isolation" 127 | # run: cargo miri test --all-features 128 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | tags 4 | perf.data 5 | perf.data.old 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | cache: cargo 3 | rust: 4 | - stable 5 | - beta 6 | - nightly 7 | os: 8 | - windows 9 | - linux 10 | - osx 11 | 12 | before_script: 13 | - | 14 | (travis_wait rustup component add rustfmt-preview || true) && 15 | (travis_wait rustup component add clippy-preview || true) 16 | 17 | script: 18 | - | 19 | ./ci-check.sh 20 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.2.1 2 | 3 | * `From` implementations for relevant arrays. 4 | * `mul_add` support. 5 | 6 | # 0.2.0 7 | 8 | * Refactorings to use const generics instead of `generic_array`. 9 | * Few more operators (eg. `vector *= scalar`). 10 | 11 | # 0.1.1 12 | 13 | * Free-standing versions of `vectorize` and `vectorize_pad`, to have a place to 14 | put a turbofish. 15 | 16 | # 0.1.0 17 | 18 | * Initial release. 19 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "slipstream" 3 | version = "0.2.1" 4 | authors = ["Michal 'vorner' Vaner "] 5 | edition = "2018" 6 | description = "SIMD library usable by the masses" 7 | repository = "https://github.com/vorner/splitstream" 8 | readme = "README.md" 9 | keywords = ["simd", "performance"] 10 | categories = ["hardware-support"] 11 | license = "Apache-2.0 OR MIT" 12 | autobenches = false 13 | 14 | [badges] 15 | travis-ci = { repository = "vorner/arc-swap" } 16 | maintenance = { status = "actively-developed" } 17 | 18 | [workspace] 19 | members = ["benchmarks"] 20 | 21 | [dependencies] 22 | num-traits = "0.2" 23 | 24 | [dev-dependencies] 25 | criterion = "~0.3" 26 | multiversion = "~0.6" 27 | proptest = "~0.10" 28 | rand = "~0.8" 29 | 30 | [profile.release] 31 | debug = 2 32 | 33 | [profile.test] 34 | # Some tests are slow to run. Even slower than it takes to compile them properly. 
35 | opt-level = 1 36 | 37 | [[bench]] 38 | name = "track" 39 | harness = false 40 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 tokio-jsonrpc developers 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Slipstream 2 | 3 | [![Actions Status](https://github.com/vorner/slipstream/workflows/test/badge.svg)](https://github.com/vorner/slipstream/actions) 4 | [![codecov](https://codecov.io/gh/vorner/slipstream/branch/master/graph/badge.svg?token=RG02T39PJZ)](https://codecov.io/gh/vorner/slipstream) 5 | [![docs](https://docs.rs/slipstream/badge.svg)](https://docs.rs/slipstream) 6 | 7 | 8 | This library helps writing code in a way that incentives the compiler to 9 | optimize the results better (without really doing anything itself). 10 | 11 | Modern compilers, including `rustc`, are able to come up with impressive ways to 12 | speed up the resulting code, using techniques like loop unrolling and 13 | autovectorization, routinely outperforming what one would hand-craft. 
14 | Nevertheless, each optimisation has some assumptions that must be proven to hold
15 | before it can be applied.
16 | 
17 | This library offers „vector“ types, like `u16x8`, which act in a very similar
18 | way to small fixed-size arrays (in this case it would be `[u16; 8]`), but with
19 | arithmetic defined for them. They also enforce alignment of the whole vectors.
20 | Therefore, one can write the algorithm in a way that works on these groups of
21 | data and make it easier for the compiler to prove the assumptions. This can
22 | result in multiple-factor speedups by giving the compiler these proofs „for
23 | free“ and allowing it to apply aggressive optimizations.
24 | 
25 | The API is inspired by the [`packed_simd`] and [`faster`] crates, but as it
26 | relies on the autovectorizer instead of using explicit SIMD instructions, it
27 | works on stable Rust and allows speedups even on platforms that don't have
28 | explicit SIMD support in the Rust standard library (or no SIMD support at
29 | all).
30 | 
31 | The downside is that the optimizations are not *guaranteed*. While it oftentimes
32 | produces results competitive with or even better than hand-crafted vectorized
33 | code, a small change to the surrounding code can also lead to much worse results.
34 | You're advised to apply this only to tight loops with enough data to crunch, and
35 | to measure the performance.
36 | 
37 | It goes well together with function multiversioning; see, for example, the
38 | [`multiversion`] crate.
39 | 
40 | More details can be found in the [documentation], including tips for effective
41 | use and what to try if the performance isn't as good as expected.
42 | 
43 | ## Example
44 | 
45 | As a very simple example, imagine that the crux of the application's performance
46 | is summing a huge array of floats and we have this code:
47 | 
48 | ```rust
49 | fn compute(d: &[f32]) -> f32 {
50 |     d.iter().sum()
51 | }
52 | ```
53 | 
54 | Now, one could rewrite it to something like this, using manual vectorization:
55 | 
56 | ```rust
57 | use core::arch::x86_64 as arch;
58 | 
59 | unsafe fn compute_sse(d: &[f32]) -> f32 {
60 |     let mut result = arch::_mm_setzero_ps();
61 |     let iter = d.chunks_exact(4);
62 |     let remainder = iter.remainder().iter().sum::<f32>();
63 |     for v in iter {
64 |         result = arch::_mm_add_ps(result, arch::_mm_loadu_ps(v.as_ptr()));
65 |     }
66 | 
67 |     let result: [f32; 4] = std::mem::transmute(result);
68 |     result.iter().sum::<f32>() + remainder
69 | }
70 | ```
71 | 
72 | And while this does result in a significant speedup, it's also much less
73 | readable, it forces `unsafe` into the application logic, and it is not portable
74 | (it won't run on anything that's not x86_64 and it won't take advantage of newer
75 | and better vector instructions even there). These downsides usually make it not
76 | worth pursuing for more complex algorithms.
77 | 
78 | Using `slipstream` (with `use slipstream::prelude::*;` in scope), one can also write this:
79 | 
80 | ```rust
81 | fn compute_slipstream(d: &[f32]) -> f32 {
82 |     // Will split the data into vectors of 4 lanes, padding the last one with
83 |     // the lanes from the provided parameter.
84 |     d.vectorize_pad(f32x4::default())
85 |         // Sum the vectors into a final vector
86 |         .sum::<f32x4>()
87 |         // Sum the lanes of the vectors together.
88 |         .horizontal_sum()
89 | }
90 | ```
91 | 
92 | This is still longer and more complex than the original, but seems much more
93 | manageable than the manual version. It's also portable and might provide some
94 | speedup on platforms that don't have any vector instructions.
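The example above also combines naturally with the [`multiversion`] crate mentioned earlier. The snippet below is only a sketch, not an excerpt from the crate's documentation: it borrows the `multiversion` 0.6 attribute syntax that this repository's own benchmarks use, the function name `compute_multiversioned` is made up, and the target strings are merely examples.

```rust
use multiversion::multiversion;
use slipstream::prelude::*;

#[multiversion]
#[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")]
#[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")]
#[clone(target = "[arm|aarch64]+neon")]
fn compute_multiversioned(d: &[f32]) -> f32 {
    // Same body as above: each clone is compiled with its own target features
    // enabled, and the generated dispatcher picks the best supported one.
    d.vectorize_pad(f32x4::default())
        .sum::<f32x4>()
        .horizontal_sum()
}
```

Calling `compute_multiversioned` detects the available CPU features and dispatches to the matching clone; the macro also generates a `*_default_version` variant (used throughout the benchmarks in this repository) that sticks to the compile-time target features.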
95 | With annotations like those in the sketch above, one is also able to generate
96 | multiple versions of the function and dispatch, at runtime, the one that takes
97 | advantage of the newest and shiniest instructions the CPU supports.
98 | 
99 | Corresponding benchmarks on an i5-8265U suggest that this version comes close to
100 | the manual one. Indeed, there are similar variants that are even faster.
101 | 
102 | ```
103 | test sum::basic                 ... bench:  11,707,693 ns/iter (+/- 261,428)
104 | test sum::manual_sse_convert    ... bench:   3,000,906 ns/iter (+/- 535,041)
105 | test sum::vectorize_pad_default ... bench:   3,141,834 ns/iter (+/- 81,376)
106 | ```
107 | 
108 | Note: to re-run the benchmarks as above, use `type V = f32x4` in
109 | `benchmarks/benches/utils.rs`.
110 | 
111 | Warning: floating-point addition is not associative, so the vectorized versions
112 | (manual or not) may produce slightly different results than the scalar sum.
113 | 
114 | ## Help wanted
115 | 
116 | It is an open source library and help in developing it is welcome. There are
117 | some areas where your contribution would be especially appreciated:
118 | 
119 | * Feedback about the API, the documentation and generally how usable it is.
120 | * Implementing missing APIs: While a lot is covered already, there are areas
121 |   that are still missing. I know of:
122 |   - Some way to convert between different sizes of the base type (e.g. `f32x4 ->
123 |     f64x4`).
124 |   - Various methods on the vector types that are present on the base types ‒
125 |     trigonometric functions on floats, rounding, absolute values, number of
126 |     set/unset bits on unsigned integers...
127 |   - Vector-scalar multiplications. It is currently possible to do e.g.
128 |     `f32x2::splat(-1.0) * f32x2::new([1.0, 2.0])`, but it would be more
129 |     comfortable if it could be written as just `-1.0 * f32x2::new([1.0, 2.0])`.
130 | * Use cases and benchmarks: if you can come up with a simple, well-vectorizable
131 |   problem and submit it as a benchmark, it helps keep and improve the
132 |   performance of the library. Both cases where the library performs well and
133 |   where it *doesn't* are good to have (the latter could be considered bugs of a
134 |   kind). Optimally, such a benchmark contains a naïve implementation (without
135 |   this library), an implementation using this library (possibly in multiple
136 |   variations) and hand-written vectorized code with the platform-specific
137 |   intrinsics. But if any of these are missing (for example because it would be
138 |   too much work to write the manually vectorized code), it's still better than
139 |   nothing.
140 | * Improving performance: While it is the compiler that makes the program go
141 |   fast, how well it does that job depends highly on whether it can „see
142 |   through“ the code. If you can tweak the implementation of some method in a
143 |   way that's more understandable and transparent to the compiler, it is great.
144 |   Most of the code was written as fast as possible and only some tweaking was
145 |   done for now. For example, the `vectorize_pad` method seems surprisingly slow;
146 |   ideally it would produce code of comparable speed to `vectorize`.
147 | * Dealing with unsafe: In many places, the library uses `unsafe` code. This was
148 |   oftentimes written that way because of performance ‒ for example, initializing
149 |   the `GenericArray` from an iterator prevented a lot of optimisations and led
150 |   to significantly inferior performance.
Optimally, each such `unsafe` code 151 | would get replaced by safe code, or would get a comment explaining/proving 152 | that it is indeed safe. 153 | 154 | If you want to work on anything bigger, it's a good idea to open an issue on the 155 | repository to both discuss it first and to reserve the task. 156 | 157 | ## License 158 | 159 | Licensed under either of 160 | 161 | * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 162 | * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) 163 | 164 | at your option. 165 | 166 | ### Contribution 167 | 168 | Unless you explicitly state otherwise, any contribution intentionally 169 | submitted for inclusion in the work by you, as defined in the Apache-2.0 170 | license, shall be dual licensed as above, without any additional terms 171 | or conditions. 172 | 173 | [`packed_simd`]: https://crates.io/crates/packed_simd 174 | [`faster`]: https://crates.io/crates/faster 175 | [`multiversion`]: https://crates.io/crates/multiversion 176 | [documentation]: https://docs.rs/slipstream 177 | -------------------------------------------------------------------------------- /benches/track.rs: -------------------------------------------------------------------------------- 1 | use std::iter; 2 | 3 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 4 | use multiversion::multiversion; 5 | 6 | use slipstream::prelude::*; 7 | 8 | type V = f32x8; 9 | 10 | const SIZE: usize = 4096 * 100; 11 | 12 | #[multiversion] 13 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 14 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 15 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 16 | #[clone(target = "[arm|aarch64]+neon")] 17 | fn sum(data: &[V]) -> f32 { 18 | data.iter().copied().sum::().horizontal_sum() 19 | } 20 | 21 | fn sum_scalar(data: &[f32]) -> f32 { 22 | data.iter().copied().sum() 23 | } 24 | 25 | #[multiversion] 26 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 27 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 28 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 29 | fn dot_product(l: &[f32], r: &[f32]) -> f32 { 30 | (l, r) 31 | .vectorize() 32 | .map(|(l, r): (V, V)| l * r) 33 | .sum::() 34 | .horizontal_sum() 35 | } 36 | 37 | fn dot_product_scalar(l: &[f32], r: &[f32]) -> f32 { 38 | l.iter().zip(r).map(|(l, r)| l * r).sum() 39 | } 40 | 41 | fn benchmark(c: &mut Criterion) { 42 | let vecs = iter::repeat_with(rand::random) 43 | .map(|v: [f32; V::LANES]| V::new(&v)) 44 | .take(SIZE / V::LANES) 45 | .collect::>(); 46 | 47 | let scalars_a = iter::repeat_with(rand::random) 48 | .take(SIZE) 49 | .collect::>(); 50 | 51 | let scalars_b = iter::repeat_with(rand::random) 52 | .take(SIZE) 53 | .collect::>(); 54 | 55 | c.bench_function("sum_vec", |b| { 56 | b.iter(|| black_box(sum(&vecs))); 57 | }); 58 | 59 | c.bench_function("sum_scalar", |b| { 60 | b.iter(|| black_box(sum_scalar(&scalars_a))); 61 | }); 62 | 63 | c.bench_function("dot_product_vec", |b| { 64 | b.iter(|| black_box(dot_product(&scalars_a, &scalars_b))); 65 | }); 66 | 67 | c.bench_function("dot_product_scalar", |b| { 68 | b.iter(|| black_box(dot_product_scalar(&scalars_a, &scalars_b))); 69 | }); 70 | } 71 | 72 | criterion_group!(benches, benchmark); 73 | criterion_main!(benches); 74 | -------------------------------------------------------------------------------- /benchmarks/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "benchmarks" 3 | version = "0.1.0" 4 | authors = ["Michal 'vorner' Vaner "] 5 | edition = "2018" 6 | publish = false 7 | autobenches = false 8 | 9 | [dependencies] 10 | 11 | [dev-dependencies] 12 | slipstream = { path = ".." } 13 | multiversion = "~0.6" 14 | once_cell = "~1" 15 | proptest = "~0.10" 16 | rand = "~0.8" 17 | packed_simd_2 = "~0.3" 18 | 19 | [[bench]] 20 | name = "simple" 21 | path = "benches/simple.rs" 22 | -------------------------------------------------------------------------------- /benchmarks/benches/dot_product.rs: -------------------------------------------------------------------------------- 1 | use multiversion::multiversion; 2 | use test::Bencher; 3 | 4 | use crate::mv; 5 | use crate::utils::{gen_data, gen_vecs, V}; 6 | use slipstream::prelude::*; 7 | 8 | mv! { 9 | fn vectorized_idx(l: &[V], r: &[V]) -> f32 { 10 | assert_eq!(l.len(), r.len()); 11 | let mut result = V::default(); 12 | for i in 0..l.len() { 13 | result += l[i] * r[i]; 14 | } 15 | 16 | result.horizontal_sum() 17 | } 18 | 19 | fn vectorized(l: &[V], r: &[V]) -> f32 { 20 | (l, r).vectorize() 21 | .map(|(l, r)| l * r) 22 | .sum::() 23 | .horizontal_sum() 24 | } 25 | 26 | fn vectorize_zip(l: &[f32], r: &[f32]) -> f32 { 27 | let l = l.vectorize(); 28 | let r = r.vectorize(); 29 | l.zip(r) 30 | .map(|(l, r): (V, V)| l * r) 31 | .sum::() 32 | .horizontal_sum() 33 | } 34 | 35 | fn vectorize_tuple(l: &[f32], r: &[f32]) -> f32 { 36 | (l, r).vectorize() 37 | .map(|(l, r): (V, V)| l * r) 38 | .sum::() 39 | .horizontal_sum() 40 | } 41 | 42 | fn vectorize_tuple_for(l: &[f32], r: &[f32]) -> f32 { 43 | let mut result = V::default(); 44 | for (l, r) in (l, r).vectorize() { 45 | let (l, r): (V, V) = (l, r); 46 | result += l * r; 47 | } 48 | result.horizontal_sum() 49 | } 50 | 51 | fn packed(l: &[f32], r: &[f32]) -> f32 { 52 | type V = packed_simd_2::f32x16; 53 | let l = l.chunks_exact(16); 54 | let r = r.chunks_exact(16); 55 | let mut result = V::default(); 56 | for (l, r) in l.zip(r) { 57 | let l = V::from_slice_unaligned(l); 58 | let r = V::from_slice_unaligned(r); 59 | result = l.mul_adde(r, result); 60 | } 61 | result.sum() 62 | } 63 | } 64 | 65 | #[bench] 66 | fn simple(b: &mut Bencher) { 67 | let (l, r) = gen_data(); 68 | 69 | b.iter(|| { 70 | let result: f32 = l.iter().zip(r.iter()).map(|(&l, &r)| l * r).sum(); 71 | test::black_box(result); 72 | }); 73 | } 74 | 75 | #[bench] 76 | fn vectorized_default(b: &mut Bencher) { 77 | let (l, r) = gen_vecs(); 78 | b.iter(|| { 79 | test::black_box(vectorized_default_version(l, r)); 80 | }); 81 | } 82 | 83 | #[bench] 84 | fn vectorized_detect(b: &mut Bencher) { 85 | let (l, r) = gen_vecs(); 86 | b.iter(|| { 87 | test::black_box(vectorized(l, r)); 88 | }); 89 | } 90 | 91 | #[bench] 92 | fn vectorized_idx_default(b: &mut Bencher) { 93 | let (l, r) = gen_vecs(); 94 | b.iter(|| { 95 | test::black_box(vectorized_idx_default_version(l, r)); 96 | }); 97 | } 98 | 99 | #[bench] 100 | fn vectorized_idx_detect(b: &mut Bencher) { 101 | let (l, r) = gen_vecs(); 102 | b.iter(|| { 103 | test::black_box(vectorized_idx(l, r)); 104 | }); 105 | } 106 | 107 | #[bench] 108 | fn vectorize_zip_default(b: &mut Bencher) { 109 | let (l, r) = gen_data(); 110 | b.iter(|| { 111 | test::black_box(vectorize_zip_default_version(l, r)); 112 | }); 113 | } 114 | 115 | #[bench] 116 | fn vectorize_zip_detect(b: &mut Bencher) { 117 | let (l, r) = gen_data(); 118 | b.iter(|| { 119 | 
test::black_box(vectorize_zip(l, r)); 120 | }); 121 | } 122 | 123 | #[bench] 124 | fn vectorize_tuple_default(b: &mut Bencher) { 125 | let (l, r) = gen_data(); 126 | b.iter(|| { 127 | test::black_box(vectorize_tuple_default_version(l, r)); 128 | }); 129 | } 130 | 131 | #[bench] 132 | fn vectorize_tuple_detect(b: &mut Bencher) { 133 | let (l, r) = gen_data(); 134 | b.iter(|| { 135 | test::black_box(vectorize_tuple(l, r)); 136 | }); 137 | } 138 | 139 | #[bench] 140 | fn packed_default(b: &mut Bencher) { 141 | let (l, r) = gen_data(); 142 | b.iter(|| { 143 | test::black_box(packed_default_version(l, r)); 144 | }); 145 | } 146 | 147 | #[bench] 148 | fn packed_detect(b: &mut Bencher) { 149 | let (l, r) = gen_data(); 150 | b.iter(|| { 151 | test::black_box(packed(l, r)); 152 | }); 153 | } 154 | 155 | #[bench] 156 | fn vectorize_tuple_for_default(b: &mut Bencher) { 157 | let (l, r) = gen_data(); 158 | b.iter(|| { 159 | test::black_box(vectorize_tuple_for_default_version(l, r)); 160 | }); 161 | } 162 | 163 | #[bench] 164 | fn vectorize_tuple_for_detect(b: &mut Bencher) { 165 | let (l, r) = gen_data(); 166 | b.iter(|| { 167 | test::black_box(vectorize_tuple_for(l, r)); 168 | }); 169 | } 170 | 171 | #[bench] 172 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 173 | fn manual_sse(b: &mut Bencher) { 174 | use std::mem; 175 | 176 | use crate::utils::arch::{self, __m128}; 177 | use crate::utils::gen_arch_vecs; 178 | 179 | let (l, r) = gen_arch_vecs(); 180 | 181 | #[target_feature(enable = "fma", enable = "sse")] 182 | unsafe fn inner(l: &[__m128], r: &[__m128]) -> f32 { 183 | let mut result = arch::_mm_setzero_ps(); 184 | for (&l, &r) in l.iter().zip(r.iter()) { 185 | result = arch::_mm_add_ps(result, arch::_mm_mul_ps(l, r)); 186 | } 187 | 188 | let result: [f32; 4] = mem::transmute(result); 189 | result.iter().sum() 190 | } 191 | 192 | b.iter(|| test::black_box(unsafe { inner(l, r) })); 193 | } 194 | 195 | #[bench] 196 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 197 | fn manual_sse_fmadd(b: &mut Bencher) { 198 | use std::mem; 199 | 200 | use crate::utils::arch::{self, __m128}; 201 | use crate::utils::gen_arch_vecs; 202 | 203 | let (l, r) = gen_arch_vecs(); 204 | 205 | #[target_feature(enable = "fma", enable = "sse")] 206 | unsafe fn inner(l: &[__m128], r: &[__m128]) -> f32 { 207 | let mut result = arch::_mm_setzero_ps(); 208 | for (&l, &r) in l.iter().zip(r.iter()) { 209 | result = arch::_mm_fmadd_ps(l, r, result); 210 | } 211 | 212 | let result: [f32; 4] = mem::transmute(result); 213 | result.iter().sum() 214 | } 215 | 216 | if is_x86_feature_detected!("fma") { 217 | b.iter(|| unsafe { test::black_box(inner(l, r)) }); 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /benchmarks/benches/life.rs: -------------------------------------------------------------------------------- 1 | use std::iter; 2 | use std::mem; 3 | 4 | use multiversion::multiversion; 5 | use once_cell::sync::Lazy; 6 | use test::Bencher; 7 | 8 | use crate::mv; 9 | use slipstream::prelude::*; 10 | 11 | type Bools = bx32; 12 | type Counts = u8x32; 13 | 14 | #[derive(Clone, Debug, PartialEq)] 15 | struct Life { 16 | edge: usize, 17 | cells: Vec, 18 | next: Vec, 19 | } 20 | 21 | const NEIGHS: [(isize, isize); 8] = [ 22 | (-1, -1), 23 | (-1, 0), 24 | (-1, 1), 25 | (0, 1), 26 | (1, 1), 27 | (1, 0), 28 | (1, -1), 29 | (0, -1), 30 | ]; 31 | 32 | const SIZE: usize = 1026; 33 | 34 | impl Life { 35 | fn at(&self, x: usize, y: usize) -> usize { 36 | y * self.edge + x 37 
| } 38 | fn set(&mut self, x: usize, y: usize, val: bool) { 39 | let idx = self.at(x, y); 40 | self.cells[idx] = val; 41 | } 42 | fn set_next(&mut self, x: usize, y: usize, val: bool) { 43 | let idx = self.at(x, y); 44 | self.next[idx] = val; 45 | } 46 | fn get(&self, x: usize, y: usize) -> bool { 47 | self.cells[self.at(x, y)] 48 | } 49 | 50 | /// Place a frame of always dead cells which won't participate in the game. 51 | /// 52 | /// These just solve the issue what to do with edges of the game plan. 53 | fn frame(&mut self) { 54 | for i in 0..self.edge { 55 | self.set(0, i, false); 56 | self.set(self.edge - 1, i, false); 57 | self.set(i, 0, false); 58 | self.set(i, self.edge - 1, false); 59 | } 60 | } 61 | fn gen() -> Self { 62 | fn inner() -> Life { 63 | let cells = iter::repeat_with(rand::random).take(SIZE * SIZE).collect(); 64 | let mut me = Life { 65 | edge: SIZE, 66 | cells, 67 | next: Vec::new(), 68 | }; 69 | me.frame(); 70 | me.next = me.cells.clone(); 71 | me 72 | } 73 | 74 | static CACHED: Lazy = Lazy::new(inner); 75 | CACHED.clone() 76 | } 77 | 78 | fn step(&mut self) { 79 | for y in 1..self.edge - 1 { 80 | for x in 1..self.edge - 1 { 81 | let cnt = NEIGHS 82 | .iter() 83 | .filter(|&&(xd, yd)| { 84 | self.get(((x as isize) + xd) as usize, ((y as isize) + yd) as usize) 85 | }) 86 | .count(); 87 | let alive = match cnt { 88 | 2 if self.get(x, y) => true, 89 | 3 => true, 90 | _ => false, 91 | }; 92 | self.set_next(x, y, alive); 93 | } 94 | } 95 | mem::swap(&mut self.cells, &mut self.next); 96 | } 97 | 98 | mv! { 99 | fn step_vectorized(&mut self) { 100 | assert_eq!(mem::align_of::(), mem::align_of::()); 101 | assert_eq!(mem::size_of::(), mem::size_of::()); 102 | let twos = Counts::splat(2); 103 | let threes = Counts::splat(3); 104 | let dead = Bools::default(); 105 | let alive = Bools::splat(true); 106 | 107 | let mut neighs: [_; 8] = Default::default(); 108 | for y in 1..self.edge - 1 { 109 | let cells = &self.cells; 110 | for (ndest, &(xd, yd)) in neighs.iter_mut().zip(&NEIGHS) { 111 | let idx = self.at((1 + xd) as usize, ((y as isize) + yd) as usize); 112 | *ndest = &cells[idx..idx + self.edge - 2]; 113 | } 114 | 115 | let center_idx = self.at(1, y); 116 | let center = &cells[center_idx..center_idx + self.edge - 2]; 117 | let dst = &mut self.next[center_idx..center_idx + self.edge - 2]; 118 | 119 | let iter = slipstream::vectorize::<([Bools; 8], Bools, _), _>((neighs, center, dst)); 120 | 121 | for (neighs, center, mut dst) in iter { 122 | let mut live_neigh_cnt = Counts::default(); 123 | // FIXME: Using sum here unfortunately prevents inlining, which leads to 124 | // performance drop *and* barrier across which we don't get the AVX 125 | // instructions. So manually expanding the loop. 126 | for n in &neighs { 127 | // TODO: We want some safe transforms in here. 
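                        // Why the transmute below should be sound: the `Bools` vector is
                        // loaded from the plan's `bool` cells, Rust stores a `bool` as one
                        // byte holding 0 or 1, and the asserts at the top of this function
                        // check that `Bools` and `Counts` match in size and alignment, so
                        // summing the reinterpreted lanes counts the live neighbours.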
128 | live_neigh_cnt += unsafe { mem::transmute::<_, Counts>(*n) }; 129 | } 130 | let survive = live_neigh_cnt.eq(twos); 131 | *dst = dead.blend(alive, survive) & center; 132 | let born = live_neigh_cnt.eq(threes); 133 | *dst |= dead.blend(alive, born); 134 | } 135 | } 136 | mem::swap(&mut self.cells, &mut self.next); 137 | } 138 | } 139 | } 140 | 141 | #[bench] 142 | fn basic(b: &mut Bencher) { 143 | let mut life = Life::gen(); 144 | 145 | b.iter(|| { 146 | life.step(); 147 | }); 148 | } 149 | 150 | #[bench] 151 | fn vectorize_detect(b: &mut Bencher) { 152 | let mut life = Life::gen(); 153 | 154 | b.iter(|| { 155 | life.step_vectorized(); 156 | }); 157 | } 158 | 159 | #[bench] 160 | fn vectorize_default(b: &mut Bencher) { 161 | let mut life = Life::gen(); 162 | 163 | b.iter(|| { 164 | life.step_vectorized_default_version(); 165 | }); 166 | } 167 | 168 | #[test] 169 | fn same_results() { 170 | let mut l1 = Life::gen(); 171 | let mut l2 = l1.clone(); 172 | 173 | for i in 0..100 { 174 | assert_eq!(l1, l2, "Lifes differ in step {}", i); 175 | l1.step(); 176 | l2.step_vectorized(); 177 | } 178 | } 179 | 180 | // TODO: Anyone wants to volunteer and write a manually-vectorized version? 181 | -------------------------------------------------------------------------------- /benchmarks/benches/simple.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | // These two are needed when benchmarking for arm 3 | #![feature(aarch64_target_feature)] 4 | #![feature(arm_target_feature)] 5 | #![feature(stdsimd)] 6 | // The lint comes from somewhere inside macros, no idea why :-( 7 | #![allow(unused_braces)] 8 | 9 | extern crate test; 10 | 11 | mod utils; 12 | 13 | mod dot_product; 14 | mod life; 15 | mod sum; 16 | -------------------------------------------------------------------------------- /benchmarks/benches/sum.rs: -------------------------------------------------------------------------------- 1 | use multiversion::multiversion; 2 | use test::Bencher; 3 | 4 | use slipstream::prelude::*; 5 | 6 | use crate::mv; 7 | use crate::utils::{gen_data, gen_vecs, V}; 8 | 9 | #[bench] 10 | fn basic(b: &mut Bencher) { 11 | let (data, _) = gen_data(); 12 | 13 | b.iter(|| { 14 | test::black_box(data.iter().sum::()); 15 | }) 16 | } 17 | 18 | mv! { 19 | fn vectorized(data: &[V]) -> f32 { 20 | let mut result = V::default(); 21 | 22 | for v in data { 23 | result += *v; 24 | } 25 | 26 | result.iter().sum() 27 | } 28 | 29 | fn vectorized_rev(data: &[V]) -> f32 { 30 | let mut result = V::default(); 31 | 32 | for v in data { 33 | result += *v; 34 | } 35 | 36 | // Any idea why this rev makes it run faster? 
37 | result.iter().rev().sum() 38 | } 39 | 40 | fn vectorized_horizontal(data: &[V]) -> f32 { 41 | let mut result = V::default(); 42 | 43 | for v in data { 44 | result += *v; 45 | } 46 | 47 | result.horizontal_sum() 48 | } 49 | 50 | fn vectorized_tree(data: &[V]) -> f32 { 51 | let mut result = V::default(); 52 | 53 | for v in data { 54 | result += *v; 55 | } 56 | 57 | #[inline] 58 | fn sum_up(d: &[f32]) -> f32 { 59 | if d.len() == 1 { 60 | d[0] 61 | } else { 62 | let mid = d.len() / 2; 63 | sum_up(&d[..mid]) + sum_up(&d[mid..]) 64 | } 65 | } 66 | 67 | sum_up(&result[..]) 68 | } 69 | 70 | fn vectorize(data: &[f32]) -> f32 { 71 | let mut result = V::default(); 72 | 73 | for v in data.vectorize() { 74 | result += v; 75 | } 76 | 77 | result.iter().rev().sum() 78 | } 79 | 80 | fn vectorize_horizontal(data: &[f32]) -> f32 { 81 | let mut result = V::default(); 82 | 83 | for v in data.vectorize() { 84 | result += v; 85 | } 86 | 87 | result.horizontal_sum() 88 | } 89 | 90 | fn sum(data: &[V]) -> f32 { 91 | data.iter() 92 | .copied() 93 | .sum::() 94 | .horizontal_sum() 95 | } 96 | 97 | fn sum_vectorize(data: &[f32]) -> f32 { 98 | data.vectorize() 99 | .sum::() 100 | .horizontal_sum() 101 | } 102 | 103 | // Testing what happens performance wise if we get mutable iteration in play 104 | fn vectorize_mut(data: &mut [f32]) -> f32 { 105 | let mut result = V::default(); 106 | 107 | for v in data.vectorize() { 108 | result += *v; 109 | } 110 | 111 | result.horizontal_sum() 112 | } 113 | 114 | fn vectorize_pad(data: &[f32]) -> f32 { 115 | data[1..].vectorize_pad(V::default()) 116 | .sum::() 117 | .horizontal_sum() 118 | } 119 | 120 | fn vectorize_split(data: &[f32]) -> f32 { 121 | let len = data.len(); 122 | let rem = len % V::LANES; 123 | let main = data[..len - rem].vectorize().sum::().horizontal_sum(); 124 | let rem = data[len - rem..].iter().sum::(); 125 | main + rem 126 | } 127 | } 128 | 129 | #[bench] 130 | fn vectorized_default(b: &mut Bencher) { 131 | let (data, _) = gen_vecs(); 132 | 133 | b.iter(|| { 134 | test::black_box(vectorized_default_version(data)); 135 | }) 136 | } 137 | 138 | #[bench] 139 | fn vectorized_detect(b: &mut Bencher) { 140 | let (data, _) = gen_vecs(); 141 | 142 | b.iter(|| { 143 | test::black_box(vectorized(data)); 144 | }) 145 | } 146 | 147 | #[bench] 148 | fn vectorized_rev_default(b: &mut Bencher) { 149 | let (data, _) = gen_vecs(); 150 | 151 | b.iter(|| { 152 | test::black_box(vectorized_rev_default_version(data)); 153 | }) 154 | } 155 | 156 | #[bench] 157 | fn vectorized_rev_detect(b: &mut Bencher) { 158 | let (data, _) = gen_vecs(); 159 | 160 | b.iter(|| { 161 | test::black_box(vectorized_rev(data)); 162 | }) 163 | } 164 | 165 | #[bench] 166 | fn vectorized_tree_default(b: &mut Bencher) { 167 | let (data, _) = gen_vecs(); 168 | 169 | b.iter(|| { 170 | test::black_box(vectorized_tree_default_version(data)); 171 | }) 172 | } 173 | 174 | #[bench] 175 | fn vectorized_tree_detect(b: &mut Bencher) { 176 | let (data, _) = gen_vecs(); 177 | 178 | b.iter(|| { 179 | test::black_box(vectorized_tree(data)); 180 | }) 181 | } 182 | 183 | #[bench] 184 | fn vectorize_default(b: &mut Bencher) { 185 | let (data, _) = gen_data(); 186 | 187 | b.iter(|| { 188 | test::black_box(vectorize_default_version(data)); 189 | }); 190 | } 191 | 192 | #[bench] 193 | fn vectorize_detect(b: &mut Bencher) { 194 | let (data, _) = gen_data(); 195 | 196 | b.iter(|| { 197 | test::black_box(vectorize(data)); 198 | }); 199 | } 200 | 201 | #[bench] 202 | fn vectorize_horizontal_default(b: &mut Bencher) { 203 | 
let (data, _) = gen_data(); 204 | 205 | b.iter(|| { 206 | test::black_box(vectorize_horizontal_default_version(data)); 207 | }); 208 | } 209 | 210 | #[bench] 211 | fn vectorize_horizontal_detect(b: &mut Bencher) { 212 | let (data, _) = gen_data(); 213 | 214 | b.iter(|| { 215 | test::black_box(vectorize_horizontal(data)); 216 | }); 217 | } 218 | 219 | #[bench] 220 | fn sum_vectorize_default(b: &mut Bencher) { 221 | let (data, _) = gen_data(); 222 | 223 | b.iter(|| { 224 | test::black_box(sum_vectorize_default_version(data)); 225 | }) 226 | } 227 | 228 | #[bench] 229 | fn sum_vectorize_detect(b: &mut Bencher) { 230 | let (data, _) = gen_data(); 231 | 232 | b.iter(|| { 233 | test::black_box(sum_vectorize(data)); 234 | }) 235 | } 236 | 237 | #[bench] 238 | fn vectorize_mut_default(b: &mut Bencher) { 239 | let (data, _) = gen_data(); 240 | let mut data = data.to_vec(); 241 | 242 | b.iter(|| { 243 | test::black_box(vectorize_mut_default_version(&mut data)); 244 | }) 245 | } 246 | 247 | #[bench] 248 | fn vectorize_mut_detect(b: &mut Bencher) { 249 | let (data, _) = gen_data(); 250 | let mut data = data.to_vec(); 251 | 252 | b.iter(|| { 253 | test::black_box(vectorize_mut(&mut data)); 254 | }) 255 | } 256 | 257 | #[bench] 258 | fn sum_default(b: &mut Bencher) { 259 | let (data, _) = gen_vecs(); 260 | 261 | b.iter(|| { 262 | test::black_box(sum_default_version(data)); 263 | }) 264 | } 265 | 266 | #[bench] 267 | fn sum_detect(b: &mut Bencher) { 268 | let (data, _) = gen_vecs(); 269 | 270 | b.iter(|| { 271 | test::black_box(sum(data)); 272 | }) 273 | } 274 | 275 | #[bench] 276 | fn vectorized_horizontal_default(b: &mut Bencher) { 277 | let (data, _) = gen_vecs(); 278 | 279 | b.iter(|| { 280 | test::black_box(vectorized_horizontal_default_version(data)); 281 | }) 282 | } 283 | 284 | #[bench] 285 | fn vectorized_horizontal_detect(b: &mut Bencher) { 286 | let (data, _) = gen_vecs(); 287 | 288 | b.iter(|| { 289 | test::black_box(vectorized_horizontal(data)); 290 | }) 291 | } 292 | 293 | #[bench] 294 | fn vectorize_pad_default(b: &mut Bencher) { 295 | let (data, _) = gen_data(); 296 | 297 | b.iter(|| { 298 | test::black_box(vectorize_pad_default_version(data)); 299 | }) 300 | } 301 | 302 | #[bench] 303 | fn vectorize_pad_detect(b: &mut Bencher) { 304 | let (data, _) = gen_data(); 305 | 306 | b.iter(|| { 307 | test::black_box(vectorize_pad(data)); 308 | }) 309 | } 310 | 311 | #[bench] 312 | fn vectorize_split_default(b: &mut Bencher) { 313 | let (data, _) = gen_data(); 314 | 315 | b.iter(|| { 316 | test::black_box(vectorize_split_default_version(data)); 317 | }) 318 | } 319 | 320 | #[bench] 321 | fn vectorize_split_detect(b: &mut Bencher) { 322 | let (data, _) = gen_data(); 323 | 324 | b.iter(|| { 325 | test::black_box(vectorize_split(data)); 326 | }) 327 | } 328 | 329 | #[bench] 330 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 331 | fn manual_sse(b: &mut Bencher) { 332 | use core::mem; 333 | 334 | use crate::utils::arch::{self, __m128}; 335 | use crate::utils::gen_arch_vecs; 336 | 337 | let (data, _) = gen_arch_vecs(); 338 | 339 | // Note: this is technically not correct on the x86 target, we should check first, but who 340 | // cares in benchmarks. 
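    // ("Check first" would mean guarding the call with `is_x86_feature_detected!("sse")`,
    // the way the fma variant in dot_product.rs does; x86_64 includes SSE in its baseline,
    // but a plain x86 target does not guarantee it.)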
341 | #[target_feature(enable = "sse")] 342 | unsafe fn inner(d: &[__m128]) -> f32 { 343 | let mut result = arch::_mm_setzero_ps(); 344 | for v in d { 345 | result = arch::_mm_add_ps(result, *v); 346 | } 347 | 348 | let result: [f32; 4] = mem::transmute(result); 349 | result.iter().sum::() 350 | } 351 | 352 | b.iter(|| test::black_box(unsafe { inner(data) })) 353 | } 354 | 355 | #[bench] 356 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 357 | fn manual_sse_convert(b: &mut Bencher) { 358 | use core::mem; 359 | 360 | use crate::utils::arch; 361 | 362 | let (data, _) = gen_data(); 363 | 364 | // Note: this is technically not correct on the x86 target, we should check first, but who 365 | // cares in benchmarks. 366 | #[target_feature(enable = "sse")] 367 | unsafe fn inner(d: &[f32]) -> f32 { 368 | let mut result = arch::_mm_setzero_ps(); 369 | let iter = d.chunks_exact(4); 370 | let remainder = iter.remainder().iter().sum::(); 371 | for v in iter { 372 | result = arch::_mm_add_ps(result, arch::_mm_loadu_ps(v.as_ptr())); 373 | } 374 | 375 | let result: [f32; 4] = mem::transmute(result); 376 | result.iter().sum::() + remainder 377 | } 378 | 379 | b.iter(|| test::black_box(unsafe { inner(data) })) 380 | } 381 | -------------------------------------------------------------------------------- /benchmarks/benches/utils.rs: -------------------------------------------------------------------------------- 1 | #[cfg(target_arch = "x86")] 2 | pub use core::arch::x86::{self as arch, __m128}; 3 | #[cfg(target_arch = "x86_64")] 4 | pub use core::arch::x86_64::{self as arch, __m128}; 5 | use std::iter; 6 | 7 | use once_cell::sync::Lazy; 8 | 9 | #[macro_export] 10 | macro_rules! mv { 11 | ($(fn $name: ident($($params: tt)*) $(-> $res: ty)? $body: block)*) => { 12 | $( 13 | #[multiversion] 14 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 15 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 16 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 17 | #[clone(target = "[arm|aarch64]+neon")] 18 | fn $name($($params)*) $(-> $res)? 
$body 19 | )* 20 | }; 21 | } 22 | 23 | pub(crate) const SIZE: usize = 10 * 1024 * 1024; 24 | pub(crate) type V = slipstream::f32x4; 25 | 26 | pub(crate) fn gen_data() -> (&'static [f32], &'static [f32]) { 27 | fn inner() -> Vec { 28 | iter::repeat_with(rand::random).take(SIZE).collect() 29 | } 30 | static CACHED: Lazy<(Vec, Vec)> = Lazy::new(|| (inner(), inner())); 31 | (&CACHED.0, &CACHED.1) 32 | } 33 | 34 | pub(crate) fn gen_vecs() -> (&'static [V], &'static [V]) { 35 | fn inner() -> Vec { 36 | iter::repeat_with(rand::random) 37 | .map(|v: [f32; V::LANES]| V::new(&v)) 38 | .take(SIZE / V::LANES) 39 | .collect() 40 | } 41 | static CACHED: Lazy<(Vec, Vec)> = Lazy::new(|| (inner(), inner())); 42 | (&CACHED.0, &CACHED.1) 43 | } 44 | 45 | #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] 46 | pub(crate) fn gen_arch_vecs() -> (&'static [__m128], &'static [__m128]) { 47 | fn inner() -> Vec<__m128> { 48 | iter::repeat_with(|| { 49 | let v: [f32; 4] = rand::random(); 50 | unsafe { arch::_mm_loadu_ps(v.as_ptr()) } 51 | }) 52 | .take(SIZE / 4) 53 | .collect() 54 | } 55 | 56 | static CACHED: Lazy<(Vec<__m128>, Vec<__m128>)> = Lazy::new(|| (inner(), inner())); 57 | (&CACHED.0, &CACHED.1) 58 | } 59 | -------------------------------------------------------------------------------- /examples/matrix.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Display; 2 | use std::iter; 3 | use std::num::Wrapping; 4 | use std::ops::Mul; 5 | use std::time::Instant; 6 | 7 | use multiversion::{multiversion, target}; 8 | use rand::random; 9 | use slipstream::prelude::*; 10 | 11 | const SIZE: usize = 1024; 12 | type V = wu32x8; 13 | type O = usizex8; 14 | const L: usize = V::LANES; 15 | 16 | #[derive(Debug, PartialEq)] 17 | struct Matrix(Vec>); 18 | 19 | #[inline] 20 | fn at(x: usize, y: usize) -> usize { 21 | y * SIZE + x 22 | } 23 | 24 | impl Matrix { 25 | fn random() -> Self { 26 | Self( 27 | iter::repeat_with(random) 28 | .map(Wrapping) 29 | .take(SIZE * SIZE) 30 | .collect(), 31 | ) 32 | } 33 | 34 | #[multiversion] 35 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 36 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 37 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 38 | fn mult_simd(&self, rhs: &Matrix) -> Matrix { 39 | let mut output = vec![Wrapping(0); SIZE * SIZE]; 40 | 41 | // Pre-compute offsets when gathering the column 42 | let mut column: [V; SIZE / L] = [Default::default(); SIZE / L]; 43 | let offsets = (0..L).collect::>(); 44 | let base_offsets = O::new(offsets) * SIZE; 45 | let mut offsets: [O; SIZE / L] = [Default::default(); SIZE / L]; 46 | for i in 0..SIZE / L { 47 | offsets[i] = base_offsets + i * L * SIZE; 48 | } 49 | 50 | // Across columns 51 | for x in 0..SIZE { 52 | // The gather_load is likely slower than just vectorizing the row, so we do this less 53 | // often and just once for each column instead of each time. 
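            // Each gather_load below pulls the x-th element of L consecutive rows (using
            // the row offsets precomputed above), so after this loop `column` holds
            // column x as contiguous vectors that the per-row dot products can stream over.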
54 | for (col, off) in (&mut column[..], &offsets[..]).vectorize() { 55 | *col = V::gather_load(&rhs.0, off + x); 56 | } 57 | 58 | // Across rows 59 | for y in 0..SIZE { 60 | let row_start = at(0, y); 61 | output[at(x, y)] = 62 | dispatch!(dot_prod(&self.0[row_start..row_start + SIZE], &column)); 63 | } 64 | } 65 | Matrix(output) 66 | } 67 | } 68 | 69 | #[multiversion] 70 | #[specialize( 71 | target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma", 72 | fn = "dot_prod_avx", 73 | unsafe = true 74 | )] 75 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1+avx")] 76 | #[clone(target = "[x86|x86_64]+sse+sse2+sse3+sse4.1")] 77 | fn dot_prod(row: &[Wrapping], column: &[V]) -> Wrapping { 78 | (row, column) 79 | .vectorize() 80 | .map(|(r, c): (V, V)| r * c) 81 | .sum::() 82 | .horizontal_sum() 83 | } 84 | 85 | #[target("[x86|x86_64]+sse+sse2+sse3+sse4.1+avx+avx2+fma")] 86 | unsafe fn dot_prod_avx(row: &[Wrapping], column: &[V]) -> Wrapping { 87 | let mut result = V::default(); 88 | for (r, c) in (row, column).vectorize() { 89 | let r: V = r; 90 | result += r * c; 91 | } 92 | result.horizontal_sum() 93 | } 94 | 95 | impl Mul for &'_ Matrix { 96 | type Output = Matrix; 97 | fn mul(self, rhs: &Matrix) -> Matrix { 98 | let mut output = vec![Wrapping(0); SIZE * SIZE]; 99 | for x in 0..SIZE { 100 | for y in 0..SIZE { 101 | for z in 0..SIZE { 102 | output[at(x, y)] += self.0[at(z, y)] * rhs.0[at(x, z)]; 103 | } 104 | } 105 | } 106 | Matrix(output) 107 | } 108 | } 109 | 110 | fn timed R>(name: N, f: F) -> R { 111 | let now = Instant::now(); 112 | let result = f(); 113 | println!("{} took:\t{:?}", name, now.elapsed()); 114 | result 115 | } 116 | 117 | fn main() { 118 | let a = Matrix::random(); 119 | let b = Matrix::random(); 120 | let z = timed("Scalar multiplication", || &a * &b); 121 | let x = timed("Compile-time detected", || a.mult_simd_default_version(&b)); 122 | let w = timed("Run-time detected", || a.mult_simd(&b)); 123 | assert_eq!(z, x); 124 | assert_eq!(z, w); 125 | } 126 | -------------------------------------------------------------------------------- /proptest-regressions/sse.txt: -------------------------------------------------------------------------------- 1 | # Seeds for failure cases proptest has generated in the past. It is 2 | # automatically read and these particular cases re-run before any 3 | # novel cases are generated. 4 | # 5 | # It is recommended to check this file in to source control so that 6 | # everyone who runs the test benefits from these saved cases. 7 | cc 9ba8d6f5bd1318e956010646a5b78d9b06c736ca22b061ddea8b6bd63b2b8d77 # shrinks to a = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], b = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1] 8 | -------------------------------------------------------------------------------- /src/iterators.rs: -------------------------------------------------------------------------------- 1 | //! The [`Vectorizable`] trait and a lot of its service types. 2 | //! 3 | //! The [`Vectorizable`] trait allows to turning slices of base types to iterators of vectors, both 4 | //! in separation and in tandem. The rest of this module provides the related types and traits. 5 | //! 6 | //! Usually, it is enough to bring in the [`prelude`][crate::prelude], which already contains the 7 | //! trait. It is seldom necessary to interact with this module directly. 8 | //! 9 | //! # Examples 10 | //! 11 | //! ```rust 12 | //! use slipstream::prelude::*; 13 | //! 14 | //! fn double(input: &[u32], output: &mut [u32]) { 15 | //! 
let two = u32x8::splat(2); 16 | //! for (i, mut o) in (input, output).vectorize() { 17 | //! *o = two * i; 18 | //! } 19 | //! } 20 | //! # double(&[], &mut []) 21 | //! ``` 22 | 23 | use core::iter::FusedIterator; 24 | use core::marker::PhantomData; 25 | use core::mem::{self, MaybeUninit}; 26 | use core::ops::*; 27 | use core::ptr; 28 | use core::slice; 29 | 30 | use crate::inner::Repr; 31 | use crate::vector::align::Align; 32 | use crate::Vector; 33 | 34 | // TODO: Deref to arrays, not slices 35 | /// A proxy object for iterating over mutable slices. 36 | /// 37 | /// For technical reasons (mostly alignment and padding), it's not possible to return a simple 38 | /// reference. This type is returned instead and it can be used to both read and write the vectors 39 | /// a slice is turned into. 40 | /// 41 | /// Note that the data are written in the destructor. Usually, this should not matter, but if you 42 | /// [`forget`][mem::forget], the changes will be lost (this is meant as a warning, not as a way to 43 | /// implement poor-man's transactions). 44 | #[derive(Debug)] 45 | pub struct MutProxy<'a, B, V> 46 | where 47 | V: AsRef<[B]>, 48 | B: Copy, 49 | { 50 | data: V, 51 | restore: &'a mut [B], 52 | } 53 | 54 | impl Deref for MutProxy<'_, B, V> 55 | where 56 | V: AsRef<[B]>, 57 | B: Copy, 58 | { 59 | type Target = V; 60 | #[inline] 61 | fn deref(&self) -> &V { 62 | &self.data 63 | } 64 | } 65 | 66 | impl DerefMut for MutProxy<'_, B, V> 67 | where 68 | V: AsRef<[B]>, 69 | B: Copy, 70 | { 71 | #[inline] 72 | fn deref_mut(&mut self) -> &mut V { 73 | &mut self.data 74 | } 75 | } 76 | 77 | impl Drop for MutProxy<'_, B, V> 78 | where 79 | V: AsRef<[B]>, 80 | B: Copy, 81 | { 82 | #[inline] 83 | fn drop(&mut self) { 84 | self.restore 85 | .copy_from_slice(&self.data.as_ref()[..self.restore.len()]); 86 | } 87 | } 88 | 89 | #[doc(hidden)] 90 | pub trait Partial { 91 | fn take_partial(&mut self) -> Option; 92 | fn size(&self) -> usize; 93 | } 94 | 95 | impl Partial for () { 96 | #[inline] 97 | fn take_partial(&mut self) -> Option { 98 | None 99 | } 100 | #[inline] 101 | fn size(&self) -> usize { 102 | 0 103 | } 104 | } 105 | 106 | impl Partial for Option { 107 | #[inline] 108 | fn take_partial(&mut self) -> Option { 109 | Option::take(self) 110 | } 111 | fn size(&self) -> usize { 112 | self.is_some() as usize 113 | } 114 | } 115 | 116 | #[doc(hidden)] 117 | pub trait Vectorizer { 118 | /// Get the nth vector. 119 | /// 120 | /// # Safety 121 | /// 122 | /// * idx must be in range (as declared on creation). 123 | /// * It may be called at most once per each index. 124 | unsafe fn get(&mut self, idx: usize) -> R; 125 | } 126 | 127 | /// The iterator returned by methods on [`Vectorizable`]. 128 | /// 129 | /// While it's unusual to need to *name* the type, this is the thing that is returned from 130 | /// [`Vectorizable::vectorize`] and [`Vectorizable::vectorize_pad`]. It might be of interest to 131 | /// know that it implements several iterator „extensions“ ([`DoubleEndedIterator`], 132 | /// [`ExactSizeIterator`] and [`FusedIterator`]). Also, several methods are optimized ‒ for 133 | /// example, the `count` is constant time operation, while the generic is linear. 
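///
/// A short sketch of typical use (a plain shared slice here, with the vector type pinned
/// through a hint on the item):
///
/// ```rust
/// # use slipstream::prelude::*;
/// let data = [1u32, 2, 3, 4];
/// let mut it = data.vectorize();
/// let first: u32x2 = it.next().unwrap(); // the hint fixes the vector width
/// assert_eq!(first, u32x2::new([1, 2]));
/// assert_eq!(it.count(), 1); // constant time, no actual iteration happens
/// ```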
134 | #[derive(Copy, Clone, Debug)] 135 | pub struct VectorizedIter { 136 | partial: P, 137 | vectorizer: V, 138 | left: usize, 139 | right: usize, 140 | _result: PhantomData, 141 | } 142 | 143 | impl Iterator for VectorizedIter 144 | where 145 | V: Vectorizer, 146 | P: Partial, 147 | { 148 | type Item = R; 149 | 150 | #[inline] 151 | fn next(&mut self) -> Option { 152 | if self.left < self.right { 153 | let idx = self.left; 154 | self.left += 1; 155 | Some(unsafe { self.vectorizer.get(idx) }) 156 | } else { 157 | self.partial.take_partial() 158 | } 159 | } 160 | 161 | #[inline] 162 | fn size_hint(&self) -> (usize, Option) { 163 | let len = self.right - self.left + self.partial.size(); 164 | (len, Some(len)) 165 | } 166 | 167 | // Overriden for performance… these things have no side effects, so we can avoid calling next 168 | 169 | #[inline] 170 | fn count(self) -> usize { 171 | self.size_hint().0 172 | } 173 | 174 | #[inline] 175 | fn last(mut self) -> Option { 176 | self.next_back() 177 | } 178 | 179 | // TODO: This wants some tests 180 | #[inline] 181 | fn nth(&mut self, n: usize) -> Option { 182 | let main_len = self.right - self.left; 183 | if main_len >= n { 184 | self.left += n; 185 | self.next() 186 | } else { 187 | self.left = self.right; 188 | self.partial.take_partial(); 189 | None 190 | } 191 | } 192 | } 193 | 194 | impl DoubleEndedIterator for VectorizedIter 195 | where 196 | V: Vectorizer, 197 | P: Partial, 198 | { 199 | // TODO: Tests 200 | #[inline] 201 | fn next_back(&mut self) -> Option { 202 | if let Some(partial) = self.partial.take_partial() { 203 | Some(partial) 204 | } else if self.left < self.right { 205 | self.right -= 1; 206 | Some(unsafe { self.vectorizer.get(self.right) }) 207 | } else { 208 | None 209 | } 210 | } 211 | } 212 | 213 | impl ExactSizeIterator for VectorizedIter 214 | where 215 | V: Vectorizer, 216 | P: Partial, 217 | { 218 | } 219 | 220 | impl FusedIterator for VectorizedIter 221 | where 222 | V: Vectorizer, 223 | P: Partial, 224 | { 225 | } 226 | 227 | /// A trait describing things with direct support for splitting into vectors. 228 | /// 229 | /// This supports vectorized iteration over shared and mutable slices as well as types composed of 230 | /// them (tuples and short fixed-sized arrays). 231 | /// 232 | /// Note that, unlike normal iterators, shared slices return owned values (vectors) and mutable 233 | /// slices return [proxy objects][MutProxy] that allow writing the data back. It is not possible to 234 | /// directly borrow from the slice because of alignment. The tuples and arrays return tuples and 235 | /// arrays of the inner values. 236 | /// 237 | /// Already pre-vectorized inputs are also supported (this is useful in combination with other not 238 | /// vectorized inputs). 239 | /// 240 | /// # Type hints 241 | /// 242 | /// Oftentimes, the compiler can infer the type of the base type, but not the length of the vector. 243 | /// It is therefore needed to provide a type hint. 244 | /// 245 | /// Furthermore, for tuples and arrays, the inner type really needs to be the slice, not something 246 | /// that can coerce into it (eg. vec or array). 247 | /// 248 | /// Alternatively, you can use the free-standing functions [`vectorize`][crate::vectorize] and 249 | /// [`vectorize_pad`][crate::vectorize_pad]. It allows using the turbofish to provide the hint. 
250 | /// 251 | /// # Examples 252 | /// 253 | /// ```rust 254 | /// # use slipstream::prelude::*; 255 | /// let data = [1, 2, 3, 4]; 256 | /// let v = data.vectorize().collect::>(); 257 | /// assert_eq!(vec![u32x2::new([1, 2]), u32x2::new([3, 4])], v); 258 | /// ``` 259 | /// 260 | /// ```rust 261 | /// # use slipstream::prelude::*; 262 | /// let data = [1, 2, 3, 4]; 263 | /// for v in data.vectorize() { 264 | /// let v: u32x2 = v; // Type hint 265 | /// println!("{:?}", v); 266 | /// } 267 | /// ``` 268 | /// 269 | /// ```rust 270 | /// # use slipstream::prelude::*; 271 | /// let input = [1, 2, 3, 4]; 272 | /// let mut output = [0; 4]; 273 | /// let mul = u32x2::splat(2); 274 | /// // We have to force the coercion to slice by [..] 275 | /// for (i, mut o) in (&input[..], &mut output[..]).vectorize() { 276 | /// *o = mul * i; 277 | /// } 278 | /// assert_eq!(output, [2, 4, 6, 8]); 279 | /// ``` 280 | /// 281 | /// ```rust 282 | /// # use slipstream::prelude::*; 283 | /// let vectorized = [u32x2::new([1, 2]), u32x2::new([3, 4])]; 284 | /// let not_vectorized = [1, 2, 3, 4]; 285 | /// for (v, n) in (&vectorized[..], ¬_vectorized[..]).vectorize() { 286 | /// assert_eq!(v, n); 287 | /// } 288 | /// ``` 289 | pub trait Vectorizable: Sized { 290 | /// The input type provided by user to fill in the padding/uneven end. 291 | /// 292 | /// Note that this doesn't necessarily have to be the same type as the type returned by the 293 | /// resulting iterator. For example, in case of mutable slices, the input is the vector, while 294 | /// the output is [`MutProxy`]. 295 | type Padding; 296 | 297 | /// An internal type managing the splitting into vectors. 298 | /// 299 | /// Not of direct interest of the users of this crate. 300 | type Vectorizer: Vectorizer; 301 | 302 | /// Internal method to create the vectorizer and kick of the iteration. 303 | fn create(self, pad: Option) -> (Self::Vectorizer, usize, Option); 304 | 305 | /// Vectorize a slice or composite of slices 306 | /// 307 | /// This variant assumes the input is divisible by the size of the vector. Prefer this if 308 | /// possible over [`vectorize_pad`][Vectorizable::vectorize_pad], as it is usually 309 | /// significantly faster. 310 | /// 311 | /// # Panics 312 | /// 313 | /// * If the slice length isn't divisible by the vector size. 314 | /// * If the parts of the composite produce different number of vectors. It is not mandated for 315 | /// the slices to be of equal length, only to produce the same number of vectors. 316 | /// 317 | /// # Examples 318 | /// 319 | /// ```rust 320 | /// # use slipstream::prelude::*; 321 | /// let longer = [1, 2, 3, 4, 5, 6, 7, 8]; 322 | /// let shorter = [1, 2, 3, 4]; 323 | /// for i in (&shorter[..], &longer[..]).vectorize() { 324 | /// let (s, l): (u32x2, u32x4) = i; 325 | /// println!("s: {:?}, l: {:?})", s, l); 326 | /// } 327 | /// ``` 328 | #[inline(always)] 329 | fn vectorize(self) -> VectorizedIter { 330 | let (vectorizer, len, partial) = self.create(None); 331 | assert!(partial.is_none()); 332 | VectorizedIter { 333 | partial: (), 334 | vectorizer, 335 | left: 0, 336 | right: len, 337 | _result: PhantomData, 338 | } 339 | } 340 | 341 | /// Vectorizes a slice or composite of slices, padding the odd end if needed. 342 | /// 343 | /// While the [`vectorize`][Vectorizable::vectorize] assumes the input can be split into 344 | /// vectors without leftover, this version deals with the uneven rest by producing a padding 345 | /// vector (if needed). The unused lanes are taken from the `pad` parameter. 
This is at the 346 | /// cost of some performance (TODO: figure out why it is so much slower). 347 | /// 348 | /// For mutable slices, padding is used as usual, but the added lanes are not stored anywhere. 349 | /// 350 | /// The padding is produced at the end. 351 | /// 352 | /// In case of composites, this still assumes they produce the same number of full vectors and 353 | /// that they all either do or don't need a padding. 354 | /// 355 | /// # Panics 356 | /// 357 | /// If the above assumption about number of vectors and same padding behaviour is violated. 358 | /// 359 | /// ```rust 360 | /// # use slipstream::prelude::*; 361 | /// let data = [1, 2, 3, 4, 5, 6]; 362 | /// let v = data.vectorize_pad(i32x4::splat(-1)).collect::>(); 363 | /// assert_eq!(v, vec![i32x4::new([1, 2, 3, 4]), i32x4::new([5, 6, -1, -1])]); 364 | /// ``` 365 | #[inline(always)] 366 | fn vectorize_pad(self, pad: Self::Padding) -> VectorizedIter, V> { 367 | let (vectorizer, len, partial) = self.create(Some(pad)); 368 | VectorizedIter { 369 | partial, 370 | vectorizer, 371 | left: 0, 372 | right: len, 373 | _result: PhantomData, 374 | } 375 | } 376 | } 377 | 378 | #[doc(hidden)] 379 | #[derive(Copy, Clone, Debug)] 380 | pub struct ReadVectorizer<'a, A: Align, B: Repr, const S: usize> { 381 | start: *const B, 382 | _vector: PhantomData>, 383 | _slice: PhantomData<&'a [B]>, // To hold the lifetime 384 | } 385 | 386 | // Note: The impls here assume V, B, P are Sync and Send, which they are. Nobody is able to create 387 | // this directly and we do have the limits on Vector, the allowed implementations, etc. 388 | unsafe impl Send for ReadVectorizer<'_, A, B, S> {} 389 | unsafe impl Sync for ReadVectorizer<'_, A, B, S> {} 390 | 391 | impl Vectorizer> 392 | for ReadVectorizer<'_, A, B, S> 393 | { 394 | #[inline(always)] 395 | unsafe fn get(&mut self, idx: usize) -> Vector { 396 | Vector::new_unchecked(self.start.add(S * idx)) 397 | } 398 | } 399 | 400 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizable> for &'a [B] { 401 | type Vectorizer = ReadVectorizer<'a, A, B, S>; 402 | type Padding = Vector; 403 | #[inline] 404 | fn create( 405 | self, 406 | pad: Option>, 407 | ) -> (Self::Vectorizer, usize, Option>) { 408 | let len = self.len(); 409 | assert!( 410 | len * mem::size_of::() <= isize::MAX as usize, 411 | "Slice too huge" 412 | ); 413 | let rest = len % S; 414 | let main = len - rest; 415 | let start = self.as_ptr(); 416 | let partial = match (rest, pad) { 417 | (0, _) => None, 418 | (_, Some(mut pad)) => { 419 | pad[..rest].copy_from_slice(&self[main..]); 420 | Some(pad) 421 | } 422 | _ => panic!( 423 | "Data to vectorize not divisible by lanes ({} vs {})", 424 | S, len, 425 | ), 426 | }; 427 | let me = ReadVectorizer { 428 | start, 429 | _vector: PhantomData, 430 | _slice: PhantomData, 431 | }; 432 | (me, main / S, partial) 433 | } 434 | } 435 | 436 | #[doc(hidden)] 437 | #[derive(Copy, Clone, Debug)] 438 | pub struct WriteVectorizer<'a, A: Align, B: Repr, const S: usize> { 439 | start: *mut B, 440 | _vector: PhantomData>, 441 | _slice: PhantomData<&'a mut [B]>, // To hold the lifetime 442 | } 443 | 444 | // Note: The impls here assume V, B, P are Sync and Send, which they are. Nobody is able to create 445 | // this directly and we do have the limits on Vector, the allowed implementations, etc. 
446 | unsafe impl Send for WriteVectorizer<'_, A, B, S> {} 447 | unsafe impl Sync for WriteVectorizer<'_, A, B, S> {} 448 | 449 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizer>> 450 | for WriteVectorizer<'a, A, B, S> 451 | { 452 | #[inline(always)] 453 | unsafe fn get(&mut self, idx: usize) -> MutProxy<'a, B, Vector> { 454 | // FIXME: Technically, we extend the lifetime in the from_raw_parts_mut beyond what rust 455 | // would allow us to normally do. But is this OK? As we are guaranteed never to give any 456 | // chunk twice, this should act similar to IterMut from slice or similar. 457 | let ptr = self.start.add(S * idx); 458 | MutProxy { 459 | data: Vector::new_unchecked(ptr), 460 | restore: slice::from_raw_parts_mut(ptr, S), 461 | } 462 | } 463 | } 464 | 465 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizable>> 466 | for &'a mut [B] 467 | { 468 | type Vectorizer = WriteVectorizer<'a, A, B, S>; 469 | type Padding = Vector; 470 | #[inline] 471 | #[allow(clippy::type_complexity)] 472 | fn create( 473 | self, 474 | pad: Option>, 475 | ) -> ( 476 | Self::Vectorizer, 477 | usize, 478 | Option>>, 479 | ) { 480 | let len = self.len(); 481 | assert!( 482 | len * mem::size_of::() <= isize::MAX as usize, 483 | "Slice too huge" 484 | ); 485 | let rest = len % S; 486 | let main = len - rest; 487 | let start = self.as_mut_ptr(); 488 | let partial = match (rest, pad) { 489 | (0, _) => None, 490 | (_, Some(mut pad)) => { 491 | let restore = &mut self[main..]; 492 | pad[..rest].copy_from_slice(restore); 493 | Some(MutProxy { data: pad, restore }) 494 | } 495 | _ => panic!( 496 | "Data to vectorize not divisible by lanes ({} vs {})", 497 | S, len, 498 | ), 499 | }; 500 | let me = WriteVectorizer { 501 | start, 502 | _vector: PhantomData, 503 | _slice: PhantomData, 504 | }; 505 | (me, main / S, partial) 506 | } 507 | } 508 | 509 | macro_rules! vectorizable_tuple { 510 | ($(($X: ident, $XR: ident, $X0: tt)),*) => { 511 | impl<$($X, $XR),*> Vectorizer<($($XR),*)> for ($($X),*) 512 | where 513 | $($X: Vectorizer<$XR>,)* 514 | { 515 | #[inline(always)] 516 | unsafe fn get(&mut self, idx: usize) -> ($($XR),*) { 517 | ($(self.$X0.get(idx)),*) 518 | } 519 | } 520 | 521 | impl<$($X, $XR),*> Vectorizable<($($XR),*)> for ($($X),*) 522 | where 523 | $($X: Vectorizable<$XR>,)* 524 | { 525 | type Vectorizer = ($($X::Vectorizer),*); 526 | type Padding = ($($X::Padding),*); 527 | #[inline] 528 | #[allow(clippy::eq_op)] 529 | fn create(self, pad: Option) 530 | -> (Self::Vectorizer, usize, Option<($($XR),*)>) 531 | { 532 | let pad = match pad { 533 | Some(pad) => ($(Some(pad.$X0)),*), 534 | None => Default::default(), // Bunch of Nones in a tuple.. (None, None, None)... 
535 | }; 536 | let created = ($(self.$X0.create(pad.$X0)),*); 537 | $( 538 | // TODO: We may want to support this in the padded mode eventually by 539 | // creating more paddings 540 | assert_eq!( 541 | (created.0).1, 542 | created.$X0.1, 543 | "Vectorizing data of different lengths" 544 | ); 545 | // TODO: We could also handle this in the padded mode by doing empty pads 546 | assert_eq!( 547 | (created.0).2.is_some(), 548 | created.$X0.2.is_some(), 549 | "Paddings are not the same for all vectorized data", 550 | ); 551 | )* 552 | let vectorizer = ($(created.$X0.0),*); 553 | let pad = if (created.0).2.is_some() { 554 | Some(($(created.$X0.2.unwrap()),*)) 555 | } else { 556 | None 557 | }; 558 | (vectorizer, (created.0).1, pad) 559 | } 560 | } 561 | } 562 | } 563 | 564 | vectorizable_tuple!((A, AR, 0), (B, BR, 1)); 565 | vectorizable_tuple!((A, AR, 0), (B, BR, 1), (C, CR, 2)); 566 | vectorizable_tuple!((A, AR, 0), (B, BR, 1), (C, CR, 2), (D, DR, 3)); 567 | vectorizable_tuple!((A, AR, 0), (B, BR, 1), (C, CR, 2), (D, DR, 3), (E, ER, 4)); 568 | vectorizable_tuple!( 569 | (A, AR, 0), 570 | (B, BR, 1), 571 | (C, CR, 2), 572 | (D, DR, 3), 573 | (E, ER, 4), 574 | (F, FR, 5) 575 | ); 576 | vectorizable_tuple!( 577 | (A, AR, 0), 578 | (B, BR, 1), 579 | (C, CR, 2), 580 | (D, DR, 3), 581 | (E, ER, 4), 582 | (F, FR, 5), 583 | (G, GR, 6) 584 | ); 585 | vectorizable_tuple!( 586 | (A, AR, 0), 587 | (B, BR, 1), 588 | (C, CR, 2), 589 | (D, DR, 3), 590 | (E, ER, 4), 591 | (F, FR, 5), 592 | (G, GR, 6), 593 | (H, HR, 7) 594 | ); 595 | 596 | impl Vectorizer<[TR; S]> for [T; S] 597 | where 598 | T: Vectorizer, 599 | { 600 | #[inline(always)] 601 | unsafe fn get(&mut self, idx: usize) -> [TR; S] { 602 | let mut res = MaybeUninit::<[TR; S]>::uninit(); 603 | for (i, v) in self.iter_mut().enumerate() { 604 | ptr::write(res.as_mut_ptr().cast::().add(i), v.get(idx)); 605 | } 606 | res.assume_init() 607 | } 608 | } 609 | 610 | impl Vectorizable<[TR; S]> for [T; S] 611 | where 612 | T: Vectorizable + Copy, 613 | T::Padding: Copy, 614 | { 615 | type Vectorizer = [T::Vectorizer; S]; 616 | type Padding = [T::Padding; S]; 617 | #[inline] 618 | fn create(self, pad: Option) -> (Self::Vectorizer, usize, Option<[TR; S]>) { 619 | let mut vectorizer = MaybeUninit::::uninit(); 620 | let mut size = 0; 621 | let mut padding = MaybeUninit::<[TR; S]>::uninit(); 622 | let mut seen_some_pad = false; 623 | let mut seen_none_pad = false; 624 | unsafe { 625 | for i in 0..S { 626 | let (v, s, p) = self[i].create(pad.map(|p| p[i])); 627 | ptr::write(vectorizer.as_mut_ptr().cast::().add(i), v); 628 | if i == 0 { 629 | size = s; 630 | } else { 631 | assert_eq!(size, s, "Vectorized lengths inconsistent across the array",); 632 | } 633 | match p { 634 | Some(p) => { 635 | seen_some_pad = true; 636 | ptr::write(padding.as_mut_ptr().cast::().add(i), p); 637 | } 638 | None => seen_none_pad = true, 639 | } 640 | } 641 | assert!( 642 | !seen_some_pad || !seen_none_pad, 643 | "Paddings inconsistent across the array", 644 | ); 645 | let padding = if seen_some_pad { 646 | Some(padding.assume_init()) 647 | } else { 648 | None 649 | }; 650 | (vectorizer.assume_init(), size, padding) 651 | } 652 | } 653 | } 654 | 655 | impl<'a, T> Vectorizer for &'a [T] 656 | where 657 | T: Copy, 658 | { 659 | unsafe fn get(&mut self, idx: usize) -> T { 660 | *self.get_unchecked(idx) 661 | } 662 | } 663 | 664 | impl<'a, T> Vectorizer<&'a mut T> for &'a mut [T] { 665 | unsafe fn get(&mut self, idx: usize) -> &'a mut T { 666 | // FIXME: Why do we have to extend the lifetime 
here? Is it safe? Intuitively, it should, 667 | // because we hand out each chunk only once and this is what IterMut does too. 668 | let ptr = self.get_unchecked_mut(idx) as *mut T; 669 | &mut *ptr 670 | } 671 | } 672 | 673 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizable> 674 | for &'a [Vector] 675 | { 676 | type Padding = (); 677 | type Vectorizer = &'a [Vector]; 678 | fn create(self, _pad: Option<()>) -> (Self::Vectorizer, usize, Option>) { 679 | (self, self.len(), None) 680 | } 681 | } 682 | 683 | impl<'a, A: Align, B: Repr, const S: usize> Vectorizable<&'a mut Vector> 684 | for &'a mut [Vector] 685 | { 686 | type Padding = (); 687 | type Vectorizer = &'a mut [Vector]; 688 | fn create( 689 | self, 690 | _pad: Option<()>, 691 | ) -> (Self::Vectorizer, usize, Option<&'a mut Vector>) { 692 | let len = self.len(); 693 | (self, len, None) 694 | } 695 | } 696 | 697 | #[cfg(test)] 698 | mod tests { 699 | use super::*; 700 | use crate::prelude::*; 701 | 702 | #[test] 703 | fn iter() { 704 | let data = (0..=10u16).collect::>(); 705 | let vtotal: u16x8 = data.vectorize_pad(u16x8::default()).sum(); 706 | let total: u16 = vtotal.horizontal_sum(); 707 | assert_eq!(total, 55); 708 | } 709 | 710 | #[test] 711 | fn iter_mut() { 712 | let data = (0..33u32).collect::>(); 713 | let mut dst = [0u32; 33]; 714 | let ones = u32x4::splat(1); 715 | for (mut d, s) in 716 | (&mut dst[..], &data[..]).vectorize_pad((u32x4::default(), u32x4::default())) 717 | { 718 | *d = ones + s; 719 | } 720 | 721 | for (l, r) in data.iter().zip(dst.iter()) { 722 | assert_eq!(*l + 1, *r); 723 | } 724 | } 725 | 726 | // Here, one of the inputs is already vectorized 727 | #[test] 728 | fn iter_prevec() { 729 | let src = [0, 1, 2, 3, 4, 5, 6, 7]; 730 | let mut dst = [u16x4::default(); 2]; 731 | 732 | for (dst, src) in (&mut dst[..], &src[..]).vectorize() { 733 | *dst = src; 734 | } 735 | 736 | assert_eq!(dst, [u16x4::new([0, 1, 2, 3]), u16x4::new([4, 5, 6, 7])]); 737 | } 738 | } 739 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![doc(test(attr(deny(warnings))))] 2 | #![warn(missing_docs)] 3 | #![allow(non_camel_case_types)] 4 | #![cfg_attr(not(test), no_std)] 5 | 6 | //! This library helps writing code in a way that incentives the compiler to 7 | //! optimize the results better (without really doing anything itself). 8 | //! 9 | //! Modern compilers, including `rustc`, are able to come up with impressive ways to 10 | //! speed up the resulting code, using techniques like loop unrolling and 11 | //! autovectorization, routinely outperforming what one would hand-craft. 12 | //! Nevertheless, each optimisation has some assumptions that must be proven to hold 13 | //! before it can be applied. 14 | //! 15 | //! This library offers „vector“ types, like [`u16x8`], which act in a very similar 16 | //! way as little fixed-sized arrays (in this case it would be `[u16; 8]`), but with 17 | //! arithmetics defined for them. They also enforce alignment of the whole vectors. 18 | //! Therefore, one can write the algorithm in a way that works on these groups of 19 | //! data and make it easier for the compiler to prove the assumptions. This can 20 | //! result in multiple factor speed ups by giving the compiler these proofs „for 21 | //! free“ and allowing it to apply aggressive optimizations. 22 | //! 23 | //! Unlike several other SIMD libraries, this one doesn't do any actual explicit SIMD. 
That results 24 | //! in a relatively simpler interface while still working on a stable compiler. It also works in a no-std 25 | //! environment. However, the optimisations are not guaranteed. In particular, while the crate may 26 | //! allow for significant speed-ups, it can *also make your code slower*. When using the crate, 27 | //! you're strongly advised to benchmark. 28 | //!
29 | //! # Anatomy of the crate 30 | //! 31 | //! ## Vector types 32 | //!
33 | //! On the surface, there are types like [`u16x8`], which is just a wrapper around `[u16; 8]`. 34 | //! These wrappers act a bit like arrays (they can be dereferenced to a slice, they can be indexed) 35 | //! and have **common arithmetic traits** implemented. The arithmetic is applied to each index 36 | //! separately, eg: 37 | //!
38 | //! ``` 39 | //! # use slipstream::prelude::*; 40 | //! let a = u8x2::new([1, 2]); 41 | //! let b = u8x2::new([3, 4]); 42 | //! assert_eq!(a + b, u8x2::new([4, 6])); 43 | //! ``` 44 | //!
45 | //! All these types are backed by the generic [`Vector`] type. See the 46 | //! methods there to see how they can be created and how they interact. 47 | //!
48 | //! All of these can be imported by importing the prelude: 49 | //!
50 | //! ``` 51 | //! # #[allow(unused_imports)] 52 | //! use slipstream::prelude::*; 53 | //! ``` 54 | //!
55 | //! The names are based on the primitive types, therefore there are types like [`u8x2`], [`i8x2`], 56 | //! [`f32x4`], [`f64x2`]. 57 | //!
58 | //! There are some more types: 59 | //!
60 | //! * [`wu8x2`] is based on [`Wrapping<u8>`][core::num::Wrapping], [`wi8x2`] is based on 61 | //! [`Wrapping<i8>`][core::num::Wrapping]. 62 | //! * [`bx2`] are vectors of [`bool`]s. 63 | //! * [`m8x2`] are mask vectors. They act *a bit* like booleans, but they have a width and use all 64 | //! bits set to `1` for `true`. These can be used to [`blend`][Vector::blend] vectors together, to 65 | //! mask loads and stores, and are the results of comparisons. The representation is inspired by what 66 | //! the vector instructions actually use, so they should be possible for the compiler to 67 | //! autovectorize. The widths match the types they work with ‒ comparing two [`u32x2`]s will 68 | //! result in [`m32x2`]. The lanes can be converted to/from [`bool`] with methods on the [`Mask`] 69 | //! trait, but usually these are just fed back into some other vector operations. 70 | //!
71 | //! ## Vectorization of slices 72 | //!
73 | //! While it might be better for performance to store all data already in the vector types, it 74 | //! oftentimes happens that the input is in the form of a slice or multiple slices of the primitive 75 | //! types. It would be possible to chunk the input and load the chunks into vectors one at a time, 76 | //! either manually or by using something like [`chunks_exact`][core::slice::ChunksExact] 77 | //! and [`zip`][core::iter::Iterator::zip]. Nevertheless, this turns out to be inconvenient and often 78 | //! too complex for the compiler to make sense of and vectorize properly. 79 | //!
80 | //! Therefore, the crate provides its own means for splitting the data into vectors, using the 81 | //! [`Vectorizable`] trait. This is implemented on shared and mutable slices as well as on tuples and 82 | //! small (fixed-sized) arrays of these. The trait adds the [`vectorize`][Vectorizable::vectorize] 83 | //! and [`vectorize_pad`][Vectorizable::vectorize_pad] methods. 84 | //!
85 | //! As the methods can't know how wide the vectors should be, it is often necessary 86 | //! to provide a type hint somewhere.
87 | //! 88 | //! ```rust 89 | //! # use slipstream::prelude::*; 90 | //! fn dot_product(l: &[f32], r: &[f32]) -> f32 { 91 | //! let mut result = f32x8::default(); 92 | //! // This assumes l and r are of the same length and divisible by 8 93 | //! for (l, r) in (l, r).vectorize() { 94 | //! // Force the exact type of l and r vectors 95 | //! let (l, r): (f32x8, f32x8) = (l, r); 96 | //! result += l * r; 97 | //! } 98 | //! // Sum the 8 lanes together 99 | //! result.horizontal_sum() 100 | //! } 101 | //! # dot_product(&[], &[]); 102 | //! ``` 103 | //! 104 | //! # Multiversioning and dynamic instruction set selection 105 | //! 106 | //! If used as in the examples above, the compiler chooses an instruction set at compile time, 107 | //! based on the command line arguments. By default these are conservative, to run on arbitrary 108 | //! (old) CPU. It is possible to either enable newer instructions at compile time (at the cost of 109 | //! not being able to run the program on the older CPUs) or compile multiple versions of the same 110 | //! function and choose the right one at runtime, depending on what the CPU actually supports. 111 | //! 112 | //! While this library doesn't provide any direct support for multiversioning, it has been observed 113 | //! to work reasonably well in combination with the [`multiversion`] crate. 114 | //! 115 | //! Note that using a newer and richer instruction set is not always a win. In some cases it can 116 | //! even lead to performance degradation. In particular: 117 | //! 118 | //! * Wide enough vectors must be used to take advantage of the 256 or more bits of the newer 119 | //! instruction set (using these with older instruction set is not a problem; the vector 120 | //! operations will simply translate to multiple narrower instructions). This might create larger 121 | //! „leftovers“ on the ends of slices that need to be handled in non-vectorized manner. 122 | //! * The CPU may need to switch state, possibly negotiate a higher power supply. This might lead 123 | //! to slow down before that happens and might degrade performance of neighboring cores. 124 | //! * Some AMD processors (Buldozers) know the instructions, but simulate them by dispatching the 125 | //! narrower instructions internally (at least it seems so, one 256bit instruction takes a bit 126 | //! longer than two 128bit ones). 127 | //! 128 | //! Depending on the workload, both slowdowns and full 2* speedups were observed. The chances of 129 | //! speedups are higher when there's a lot of data to crunch „in one go“ (so the CPU has time to 130 | //! „warm up“, the leftovers don't matter that much, etc). 131 | //! 132 | //! # Performance tuning tips 133 | //! 134 | //! The sole purpose of this library is to get faster programs, so here are few things to keep in 135 | //! mind when trying. 136 | //! 137 | //! This library (or SIMD in general) is not a silver bullet. It's good to tackle a lot of data 138 | //! crunching by sheer force (the hammer style approach), but can yield only multiplicative 139 | //! speedups (depending on the width of the instructions, on the size of the base type, etc, one 140 | //! can't expect more than 10 or 20 times speedup, usually less). Oftentimes, more high level 141 | //! optimizations bring significantly better results ‒ choosing a better algorithm, reordering the 142 | //! data in memory to avoid cache misses. These can give you orders of magnitude in some cases. 143 | //! 
Also, besides instruction-level parallelism, one can try using threads to parallelize across 144 | //! cores (for example using [`rayon`]). Therefore, vectorization should be used in the latter 145 | //! stages of performance tuning. 146 | //!
147 | //! Also note that when used on a platform without any SIMD support, it can lead to both speed-ups 148 | //! (due to loop unrolling) and slowdowns (probably due to exhaustion of available CPU registers). 149 | //!
150 | //! It is important to measure and profile. Not only because you want to spend the time optimizing 151 | //! the hot parts of the program which actually take a significant amount of time, but also because the 152 | //! autovectorizer and compiler optimizations sometimes produce surprising results. 153 | //!
154 | //! ## Performance characteristics 155 | //!
156 | //! In general, simple lane-wise operations are significantly faster than horizontal operations 157 | //! (when neighboring lanes may interact) and complex ones. Therefore, adding two vectors using the 158 | //! `+` operator is likely to end up being faster than the 159 | //! [`horizontal_sum`][Vector::horizontal_sum] or the [`gather_load`][Vector::gather_load] 160 | //! constructor. 161 | //!
162 | //! It is advisable to keep as much in vectors as possible instead of operating on separate lanes. 163 | //!
164 | //! Therefore, to compute a sum of a bunch of numbers, split the input into vectors, sum these up and 165 | //! do a single `horizontal_sum` at the very end. 166 | //!
167 | //! ```rust 168 | //! # use slipstream::prelude::*; 169 | //! fn sum(data: &[f32x8]) -> f32 { 170 | //! data 171 | //! .iter() 172 | //! .copied() 173 | //! .sum::<f32x8>() // Summing up whole f32x8 vectors, result is also f32x8 174 | //! .horizontal_sum() // Summing individual lanes of that vector 175 | //! } 176 | //! # assert_eq!(0.0, sum(&[])); 177 | //! ``` 178 | //!
179 | //! Also keep in mind that there's usually some „warm up“ for the vectorized part of the code. This partly 180 | //! comes from the need to somehow deal with uneven ends (if the input is not divisible by the 181 | //! vector size). Also, some instructions require the CPU to switch state, possibly lower its frequency 182 | //! and negotiate a higher power supply, which may even hinder performance of neighboring cores (this 183 | //! is more of a problem for „newer“ instruction sets like AVX-512 than eg. SSE). 184 | //!
185 | //! Therefore, there's little advantage in interspersing otherwise non-vectorized code with 186 | //! an occasional vector variable. The best results come from crunching big inputs all at once. 187 | //!
188 | //! ## Suggested process 189 | //!
190 | //! * Write the non-vectorized version first. Make sure to use the correct algorithm, avoid 191 | //! unnecessary work, etc. 192 | //! * Parallelize it across threads where it makes sense. 193 | //! * Prepare a micro-benchmark exercising the hot part. 194 | //! * Try rewriting it using the vector types in this crate, but keep the non-vectorized version 195 | //! around for comparison. Make sure to run the benchmark for both. 196 | //! * If the vectorized version doesn't meet the expectations (or even makes things slower), you can 197 | //! check these things: 198 | //! - If using the [`multiversion`] crate, watch out for (not) inlining. The detected instruction 199 | //! set is not propagated to other functions called from the multiversioned one, only to the 200 | //! inlined ones. 201 | //! - Make sure to use a reasonably sized vector type.
On one side, it needs to be large enough to 202 | //! fill the whole SIMD register (128 bit for SSE and NEON, 256 for AVX, 512 bits for AVX-512). 203 | //! On the other side, it should not be too large ‒ while wider vectors can be simulated by 204 | //! executing multiple narrower instructions, they also take multiple registers and that may 205 | //! lead to unnecessary „juggling“. 206 | //! - See the profiler output if any particular part stands out. Oftentimes, some constructs like 207 | //! the [`zip`][core::iter::Iterator::zip] iterator adaptor were found to be problematic. If a 208 | //! construct is too complex for rustc to „see through“, it can be helped by rewriting that 209 | //! particular part manually in a simpler way. Pulling slice range checks before the loop might 210 | //! help too, as rustc no longer has to ensure a panic from the violation would happen at the 211 | //! right time in the middle of processing. 212 | //! - Check the assembler output if it looks sane. Seeing if it looks vectorized can be done 213 | //! without extensive assembler knowledge ‒ SIMD instructions have longer names and use 214 | //! different named registers (`xmm?` ones for SSE, `ymm?` ones for AVX). 215 | //! 216 | //! See if the profiler can be configured to show inlined functions instead of counting the whole 217 | //! runtime to the whole function. Some profilers can even show annotated assembler code, 218 | //! pinpointing the instruction or area that takes long time. In such case, be aware that an 219 | //! instruction might take a long time because it waits on a data dependency (some preceding 220 | //! instruction still being executed in the pipeline) or data from memory. 221 | //! 222 | //! For the `perf` profile, this can be done with `perf record --call-graph=dwarf `, 223 | //! `perf report` and `perf annotate`. Make sure to profile with both optimizations *and* debug 224 | //! symbols enabled (but if developing a proprietary thing, make sure to ship *without* the debug 225 | //! symbols). 226 | //! 227 | //! ```toml 228 | //! [profile.release] 229 | //! debug = 2 230 | //! ``` 231 | //! 232 | //! When all else fails, you can always rewrite only parts of the algorithm using the explicit 233 | //! intrinsics in [`core::arch`] and leave the rest for autovectorizer. The vector types should be 234 | //! compatible for transmuting to the low-level vectors (eg. `__m128`). 235 | //! 236 | //! # Alternatives 237 | //! 238 | //! There are other crates that try to help with SIMD: 239 | //! 240 | //! * [`packed_simd`]: This is *the* official SIMD library. The downside is, this works only on 241 | //! nighty compiler and the timeline when this could get stabilized is unclear. 242 | //! * [`faster`]: Works only on nightly and looks abandoned. 243 | //! * [`simdeez`]: Doesn't have unsigned ints. Works on stable, but is unsound (can lead to UB 244 | //! without writing a single line of user `unsafe` code). 245 | //! * [`safe_simd`]: It has somewhat more complex API than this library, because it deals with 246 | //! instruction sets explicitly. It supports explicit vectorization (doesn't rely on 247 | //! autovectorizer). It is not yet released. 248 | //! 249 | //! [`multiversion`]: https://crates.io/crates/multiversion 250 | //! [`rayon`]: https://crates.io/crates/rayon 251 | //! [`packed_simd`]: https://crates.io/crates/packed_simd 252 | //! [`faster`]: https://crates.io/crates/faster 253 | //! [`simdeez`]: https://crates.io/crates/simdeez 254 | //! 
[`safe_simd`]: https://github.com/calebzulawski/safe_simd/ 255 | 256 | pub mod iterators; 257 | pub mod mask; 258 | pub mod types; 259 | pub mod vector; 260 | 261 | pub use iterators::Vectorizable; 262 | pub use mask::Mask; 263 | pub use types::*; 264 | pub use vector::Vector; 265 | 266 | /// Commonly used imports 267 | /// 268 | /// This can be imported to get all the vector types and all the relevant user-facing traits of the 269 | /// crate. 270 | pub mod prelude { 271 | pub use crate::types::*; 272 | pub use crate::vector::Masked as _; 273 | pub use crate::Mask as _; 274 | pub use crate::Vectorizable as _; 275 | } 276 | 277 | mod inner { 278 | use core::num::Wrapping; 279 | 280 | use crate::mask::{m128, m16, m32, m64, m8, msize, Mask}; 281 | 282 | /// A trait to enable vectors to use this type as the base type. 283 | /// 284 | /// # Safety 285 | /// 286 | /// This is in a private module to prevent users creating their own „crazy“ vector 287 | /// implementations. We make some non-trivial assumptions about the inner types and be are 288 | /// conservative at least until we figure out what *exact* assumptions these are and formalize 289 | /// them. 290 | pub unsafe trait Repr: Send + Sync + Copy + 'static { 291 | type Mask: Mask; 292 | const ONE: Self; 293 | } 294 | 295 | unsafe impl Repr for Wrapping { 296 | type Mask = m8; 297 | const ONE: Wrapping = Wrapping(1); 298 | } 299 | unsafe impl Repr for Wrapping { 300 | type Mask = m16; 301 | const ONE: Wrapping = Wrapping(1); 302 | } 303 | unsafe impl Repr for Wrapping { 304 | type Mask = m32; 305 | const ONE: Wrapping = Wrapping(1); 306 | } 307 | unsafe impl Repr for Wrapping { 308 | type Mask = m64; 309 | const ONE: Wrapping = Wrapping(1); 310 | } 311 | unsafe impl Repr for Wrapping { 312 | type Mask = m128; 313 | const ONE: Wrapping = Wrapping(1); 314 | } 315 | unsafe impl Repr for Wrapping { 316 | type Mask = msize; 317 | const ONE: Wrapping = Wrapping(1); 318 | } 319 | unsafe impl Repr for u8 { 320 | type Mask = m8; 321 | const ONE: u8 = 1; 322 | } 323 | unsafe impl Repr for u16 { 324 | type Mask = m16; 325 | const ONE: u16 = 1; 326 | } 327 | unsafe impl Repr for u32 { 328 | type Mask = m32; 329 | const ONE: u32 = 1; 330 | } 331 | unsafe impl Repr for u64 { 332 | type Mask = m64; 333 | const ONE: u64 = 1; 334 | } 335 | unsafe impl Repr for u128 { 336 | type Mask = m128; 337 | const ONE: u128 = 1; 338 | } 339 | unsafe impl Repr for usize { 340 | type Mask = msize; 341 | const ONE: usize = 1; 342 | } 343 | 344 | unsafe impl Repr for Wrapping { 345 | type Mask = m8; 346 | const ONE: Wrapping = Wrapping(1); 347 | } 348 | unsafe impl Repr for Wrapping { 349 | type Mask = m16; 350 | const ONE: Wrapping = Wrapping(1); 351 | } 352 | unsafe impl Repr for Wrapping { 353 | type Mask = m32; 354 | const ONE: Wrapping = Wrapping(1); 355 | } 356 | unsafe impl Repr for Wrapping { 357 | type Mask = m64; 358 | const ONE: Wrapping = Wrapping(1); 359 | } 360 | unsafe impl Repr for Wrapping { 361 | type Mask = m128; 362 | const ONE: Wrapping = Wrapping(1); 363 | } 364 | unsafe impl Repr for Wrapping { 365 | type Mask = msize; 366 | const ONE: Wrapping = Wrapping(1); 367 | } 368 | unsafe impl Repr for i8 { 369 | type Mask = m8; 370 | const ONE: i8 = 1; 371 | } 372 | unsafe impl Repr for i16 { 373 | type Mask = m16; 374 | const ONE: i16 = 1; 375 | } 376 | unsafe impl Repr for i32 { 377 | type Mask = m32; 378 | const ONE: i32 = 1; 379 | } 380 | unsafe impl Repr for i64 { 381 | type Mask = m64; 382 | const ONE: i64 = 1; 383 | } 384 | unsafe impl Repr for i128 
{ 385 | type Mask = m128; 386 | const ONE: i128 = 1; 387 | } 388 | unsafe impl Repr for isize { 389 | type Mask = msize; 390 | const ONE: isize = 1; 391 | } 392 | 393 | unsafe impl Repr for f32 { 394 | type Mask = m32; 395 | const ONE: f32 = 1.0; 396 | } 397 | unsafe impl Repr for f64 { 398 | type Mask = m64; 399 | const ONE: f64 = 1.0; 400 | } 401 | unsafe impl Repr for M { 402 | type Mask = Self; 403 | const ONE: M = M::TRUE; 404 | } 405 | } 406 | 407 | /// Free-standing version of [`Vectorizable::vectorize`]. 408 | /// 409 | /// This is the same as `a.vectorize()`. Nevertheless, this version might be more convenient as it 410 | /// allows hinting the result vector type with turbofish. 411 | /// 412 | /// ```rust 413 | /// # use slipstream::prelude::*; 414 | /// let data = [1, 2, 3, 4]; 415 | /// for v in slipstream::vectorize::(&data[..]) { 416 | /// println!("{:?}", v); 417 | /// } 418 | /// ``` 419 | #[inline(always)] 420 | pub fn vectorize(a: A) -> impl Iterator 421 | where 422 | A: Vectorizable, 423 | { 424 | a.vectorize() 425 | } 426 | 427 | /// Free-standing version of [`Vectorizable::vectorize_pad`]. 428 | /// 429 | /// Equivalent to `a.vectorize_pad(pad)`, but may be more convenient or readable in certain cases. 430 | /// 431 | /// ```rust 432 | /// # use slipstream::prelude::*; 433 | /// let data = [1, 2, 3, 4, 5, 6]; 434 | /// let v = slipstream::vectorize_pad(&data[..], i32x4::splat(-1)).collect::>(); 435 | /// assert_eq!(v, vec![i32x4::new([1, 2, 3, 4]), i32x4::new([5, 6, -1, -1])]); 436 | /// ``` 437 | #[inline(always)] 438 | pub fn vectorize_pad(a: A, pad: A::Padding) -> impl Iterator 439 | where 440 | A: Vectorizable, 441 | { 442 | a.vectorize_pad(pad) 443 | } 444 | 445 | #[cfg(test)] 446 | mod tests { 447 | use crate::prelude::*; 448 | 449 | #[test] 450 | fn minmax() { 451 | let a = u32x4::new([1, 4, 8, 9]); 452 | let b = u32x4::new([3, 3, 5, 11]); 453 | 454 | assert_eq!(a.minimum(b), u32x4::new([1, 3, 5, 9])); 455 | assert_eq!(a.maximum(b), u32x4::new([3, 4, 8, 11])); 456 | assert_eq!(a.minimum(b), b.minimum(a)); 457 | assert_eq!(a.maximum(b), b.maximum(a)); 458 | assert_eq!(a.maximum(b).ge(a.minimum(b)), m32x4::splat(m32::TRUE)); 459 | } 460 | } 461 | -------------------------------------------------------------------------------- /src/mask.rs: -------------------------------------------------------------------------------- 1 | //! Bool-like types used for masked operations. 2 | //! 3 | //! With multi-lane vectors, it is sometimes useful to do a lane-wise comparison or to disable some 4 | //! of the lanes for a given operation. Naturally, one would express this using a correctly sized 5 | //! `bool` array. 6 | //! 7 | //! Nevertheless, the CPU SIMD instructions don't use bools, but signal `true`/`false` with a 8 | //! full-sized type with either all bits set to 1 or 0 (TODO: this is not true for AVX-512, what do 9 | //! we want to do about it?). Therefore, we define our own types that act like bools, but are 10 | //! represented in the above way. The comparison operators return vectors of these base mask types. 11 | //! The selection operations accept whatever mask vector with the same number of lanes, but they 12 | //! are expected to act fastest with the correct sized ones. 13 | //! 14 | //! For the purpose of input, `bool` is also considered a mask type. 15 | //! 16 | //! The interesting operations are: 17 | //! * Comparisons ([`lt`][crate::Vector::lt], [`le`][crate::Vector::le], [`eq`][crate::Vector::eq], 18 | //! 
[`ge`][crate::Vector::ge], [`gt`][crate::Vector::gt]) 19 | //! * The [`blend`][crate::Vector::blend] method. 20 | //! * Masked [loading][crate::Vector::gather_load_masked] and 21 | //! [storing][crate::Vector::scatter_store_masked] of vectors. 22 | //! 23 | //! The number in the type name specifies the number of bits. Therefore, for the 24 | //! [`u16x4`][crate::u16x4], the natural mask type is a vector of 4 [`m16`], which is 25 | //! [`m16x4`][crate::m16x4]. 26 | //! 27 | //! While it is possible to operate with the bools (by converting them), it is more common to 28 | //! simply pipe the masks back into the vectors. Note that they *do* implement the usual boolean 29 | //! operators (however, only the non-shortcircuiting/bitwise variants). These work lane-wise. 30 | //! 31 | //! # Examples 32 | //! 33 | //! ```rust 34 | //! # use slipstream::prelude::*; 35 | //! fn abs(vals: &mut [i32]) { 36 | //! let zeroes = i32x8::default(); 37 | //! for mut v in vals.vectorize_pad(i32x8::default()) { 38 | //! // Type of this one is m32x8 and is true whereever the lane isnegative. 39 | //! let negative = v.lt(zeroes); 40 | //! // Pick lanes from v where non-negative, pick from -v where negative. 41 | //! *v = v.blend(-*v, negative); 42 | //! } 43 | //! } 44 | //! let mut data = [1, -2, 3]; 45 | //! abs(&mut data); 46 | //! assert_eq!(data, [1, 2, 3]); 47 | //! ``` 48 | use core::ops::*; 49 | 50 | mod inner { 51 | pub trait Sealed {} 52 | } 53 | 54 | /// The trait implemented by all the mask types. 55 | /// 56 | /// Note that this trait is not implementable by downstream crates, as code in the crate assumes 57 | /// (and relies for safety on the assumption) that the type can ever hold only the two values. 58 | /// 59 | /// See the [module documentation][crate::mask]. 60 | pub trait Mask: 61 | Copy 62 | + Eq 63 | + Send 64 | + Sync 65 | + inner::Sealed 66 | + Not 67 | + BitAnd 68 | + BitAndAssign 69 | + BitOr 70 | + BitOrAssign 71 | + BitXor 72 | + BitXorAssign 73 | + 'static 74 | { 75 | /// A constant specifying the true value of the type. 76 | /// 77 | /// For bool, this is `true`. For the others, this means all bits set to `1` ‒ eg. 256 for 78 | /// [`m8]. 79 | const TRUE: Self; 80 | 81 | /// The false value of the type. 82 | /// 83 | /// For bool, this is `false`. For the others, this means 0 (all bits set to 0). 84 | const FALSE: Self; 85 | 86 | /// Converts the type to bool. 87 | #[inline] 88 | fn bool(self) -> bool { 89 | if self == Self::TRUE { 90 | true 91 | } else if self == Self::FALSE { 92 | false 93 | } else { 94 | unsafe { core::hint::unreachable_unchecked() } 95 | } 96 | } 97 | 98 | /// Converts the type from bool. 99 | #[inline] 100 | fn from_bool(v: bool) -> Self { 101 | if v { 102 | Self::TRUE 103 | } else { 104 | Self::FALSE 105 | } 106 | } 107 | } 108 | 109 | /// Inner implementation of the mask types. 110 | /// 111 | /// This is to be used through the type aliases in this module, like [`m8`], or more often through 112 | /// vectors of these, like [`m8x4`][crate::m8x4]. These are the [`mask vectors`][crate::mask]. 113 | #[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Ord, PartialOrd)] 114 | pub struct MaskWrapper(I); 115 | 116 | macro_rules! 
trait_impl { 117 | ($T: ident, $m: ident, $TA: ident, $ma: ident) => { 118 | impl> $T for MaskWrapper { 119 | type Output = Self; 120 | fn $m(self, rhs: Self) -> Self { 121 | Self((self.0).$m(rhs.0)) 122 | } 123 | } 124 | 125 | impl $TA for MaskWrapper { 126 | fn $ma(&mut self, rhs: Self) { 127 | (self.0).$ma(rhs.0) 128 | } 129 | } 130 | }; 131 | } 132 | 133 | trait_impl!(BitAnd, bitand, BitAndAssign, bitand_assign); 134 | trait_impl!(BitOr, bitor, BitOrAssign, bitor_assign); 135 | trait_impl!(BitXor, bitxor, BitXorAssign, bitxor_assign); 136 | 137 | impl> Not for MaskWrapper { 138 | type Output = Self; 139 | fn not(self) -> Self::Output { 140 | Self(self.0.not()) 141 | } 142 | } 143 | 144 | #[allow(missing_docs)] 145 | pub type m8 = MaskWrapper; 146 | 147 | impl inner::Sealed for m8 {} 148 | 149 | impl Mask for m8 { 150 | const TRUE: Self = MaskWrapper(u8::MAX); 151 | const FALSE: Self = MaskWrapper(0); 152 | } 153 | 154 | #[allow(missing_docs)] 155 | pub type m16 = MaskWrapper; 156 | 157 | impl inner::Sealed for m16 {} 158 | 159 | impl Mask for m16 { 160 | const TRUE: Self = MaskWrapper(u16::MAX); 161 | const FALSE: Self = MaskWrapper(0); 162 | } 163 | 164 | #[allow(missing_docs)] 165 | pub type m32 = MaskWrapper; 166 | 167 | impl inner::Sealed for m32 {} 168 | 169 | impl Mask for m32 { 170 | const TRUE: Self = MaskWrapper(u32::MAX); 171 | const FALSE: Self = MaskWrapper(0); 172 | } 173 | 174 | #[allow(missing_docs)] 175 | pub type m64 = MaskWrapper; 176 | 177 | impl inner::Sealed for m64 {} 178 | 179 | impl Mask for m64 { 180 | const TRUE: Self = MaskWrapper(u64::MAX); 181 | const FALSE: Self = MaskWrapper(0); 182 | } 183 | 184 | #[allow(missing_docs)] 185 | pub type m128 = MaskWrapper; 186 | 187 | impl inner::Sealed for m128 {} 188 | 189 | impl Mask for m128 { 190 | const TRUE: Self = MaskWrapper(u128::MAX); 191 | const FALSE: Self = MaskWrapper(0); 192 | } 193 | 194 | #[allow(missing_docs)] 195 | pub type msize = MaskWrapper; 196 | 197 | impl inner::Sealed for msize {} 198 | 199 | impl Mask for msize { 200 | const TRUE: Self = MaskWrapper(usize::MAX); 201 | const FALSE: Self = MaskWrapper(0); 202 | } 203 | 204 | impl inner::Sealed for bool {} 205 | 206 | impl Mask for bool { 207 | const TRUE: Self = true; 208 | const FALSE: Self = false; 209 | } 210 | -------------------------------------------------------------------------------- /src/types.rs: -------------------------------------------------------------------------------- 1 | #![allow(missing_docs)] 2 | //! Type aliases of the commonly used vector types. 3 | //! 4 | //! While the vector types are created from the [`Vector`] by setting the base type and length, 5 | //! this is seldom done in downstream code. Instead, this module provides the commonly used types 6 | //! as aliases, like [u16x8]. See the [crate introduction](crate) for further details about the 7 | //! naming convention. 8 | //! 9 | //! All these types are also exported as part of the [`prelude`][crate::prelude]. 
10 | use core::num::Wrapping; 11 | 12 | pub use crate::mask::{m16, m32, m64, m8, msize}; 13 | use crate::vector::align::*; 14 | use crate::vector::Vector; 15 | 16 | pub type bx2 = Vector; 17 | pub type bx4 = Vector; 18 | pub type bx8 = Vector; 19 | pub type bx16 = Vector; 20 | pub type bx32 = Vector; 21 | 22 | pub type m8x2 = Vector; 23 | pub type m8x4 = Vector; 24 | pub type m8x8 = Vector; 25 | pub type m8x16 = Vector; 26 | pub type m8x32 = Vector; 27 | 28 | pub type m16x2 = Vector; 29 | pub type m16x4 = Vector; 30 | pub type m16x8 = Vector; 31 | pub type m16x16 = Vector; 32 | 33 | pub type m32x2 = Vector; 34 | pub type m32x4 = Vector; 35 | pub type m32x8 = Vector; 36 | pub type m32x16 = Vector; 37 | 38 | pub type m64x2 = Vector; 39 | pub type m64x4 = Vector; 40 | pub type m64x8 = Vector; 41 | pub type m64x16 = Vector; 42 | 43 | pub type u8x2 = Vector; 44 | pub type u8x4 = Vector; 45 | pub type u8x8 = Vector; 46 | pub type u8x16 = Vector; 47 | pub type u8x32 = Vector; 48 | 49 | pub type u16x2 = Vector; 50 | pub type u16x4 = Vector; 51 | pub type u16x8 = Vector; 52 | pub type u16x16 = Vector; 53 | 54 | pub type u32x2 = Vector; 55 | pub type u32x4 = Vector; 56 | pub type u32x8 = Vector; 57 | pub type u32x16 = Vector; 58 | 59 | pub type u64x2 = Vector; 60 | pub type u64x4 = Vector; 61 | pub type u64x8 = Vector; 62 | pub type u64x16 = Vector; 63 | 64 | pub type wu8x2 = Vector, 2>; 65 | pub type wu8x4 = Vector, 4>; 66 | pub type wu8x8 = Vector, 8>; 67 | pub type wu8x16 = Vector, 16>; 68 | pub type wu8x32 = Vector, 32>; 69 | 70 | pub type wu16x2 = Vector, 2>; 71 | pub type wu16x4 = Vector, 4>; 72 | pub type wu16x8 = Vector, 8>; 73 | pub type wu16x16 = Vector, 16>; 74 | 75 | pub type wu32x2 = Vector, 2>; 76 | pub type wu32x4 = Vector, 4>; 77 | pub type wu32x8 = Vector, 8>; 78 | pub type wu32x16 = Vector, 16>; 79 | 80 | pub type wu64x2 = Vector, 2>; 81 | pub type wu64x4 = Vector, 4>; 82 | pub type wu64x8 = Vector, 8>; 83 | pub type wu64x16 = Vector, 16>; 84 | 85 | pub type i8x2 = Vector; 86 | pub type i8x4 = Vector; 87 | pub type i8x8 = Vector; 88 | pub type i8x16 = Vector; 89 | pub type i8x32 = Vector; 90 | 91 | pub type i16x2 = Vector; 92 | pub type i16x4 = Vector; 93 | pub type i16x8 = Vector; 94 | pub type i16x16 = Vector; 95 | 96 | pub type i32x2 = Vector; 97 | pub type i32x4 = Vector; 98 | pub type i32x8 = Vector; 99 | pub type i32x16 = Vector; 100 | 101 | pub type i64x2 = Vector; 102 | pub type i64x4 = Vector; 103 | pub type i64x8 = Vector; 104 | pub type i64x16 = Vector; 105 | 106 | pub type wi8x2 = Vector, 2>; 107 | pub type wi8x4 = Vector, 4>; 108 | pub type wi8x8 = Vector, 8>; 109 | pub type wi8x16 = Vector, 16>; 110 | pub type wi8x32 = Vector, 32>; 111 | 112 | pub type wi16x2 = Vector, 2>; 113 | pub type wi16x4 = Vector, 4>; 114 | pub type wi16x8 = Vector, 8>; 115 | pub type wi16x16 = Vector, 16>; 116 | 117 | pub type wi32x2 = Vector, 2>; 118 | pub type wi32x4 = Vector, 4>; 119 | pub type wi32x8 = Vector, 8>; 120 | pub type wi32x16 = Vector, 16>; 121 | 122 | pub type wi64x2 = Vector, 2>; 123 | pub type wi64x4 = Vector, 4>; 124 | pub type wi64x8 = Vector, 8>; 125 | pub type wi64x16 = Vector, 16>; 126 | 127 | pub type f32x2 = Vector; 128 | pub type f32x4 = Vector; 129 | pub type f32x8 = Vector; 130 | pub type f32x16 = Vector; 131 | 132 | pub type f64x2 = Vector; 133 | pub type f64x4 = Vector; 134 | pub type f64x8 = Vector; 135 | pub type f64x16 = Vector; 136 | 137 | // Note: the usize/isize vectors are per-pointer-width because they need a different alignment. 
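// (E.g. `usizex2` on a 64-bit target presumably wants the same alignment as `u64x2`, while on a
// 32-bit target the `u32x2` alignment is enough.)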
138 | 139 | #[cfg(target_pointer_width = "32")] 140 | mod sized { 141 | use super::*; 142 | 143 | pub type msizex2 = Vector; 144 | pub type msizex4 = Vector; 145 | pub type msizex8 = Vector; 146 | pub type msizex16 = Vector; 147 | 148 | pub type usizex2 = Vector; 149 | pub type usizex4 = Vector; 150 | pub type usizex8 = Vector; 151 | pub type usizex16 = Vector; 152 | 153 | pub type wusizex2 = Vector, 2>; 154 | pub type wusizex4 = Vector, 4>; 155 | pub type wusizex8 = Vector, 8>; 156 | pub type wusizex16 = Vector, 16>; 157 | 158 | pub type isizex2 = Vector; 159 | pub type isizex4 = Vector; 160 | pub type isizex8 = Vector; 161 | pub type isizex16 = Vector; 162 | 163 | pub type wisizex2 = Vector, 2>; 164 | pub type wisizex4 = Vector, 4>; 165 | pub type wisizex8 = Vector, 8>; 166 | pub type wisizex16 = Vector, 16>; 167 | } 168 | 169 | #[cfg(target_pointer_width = "64")] 170 | mod sized { 171 | use super::*; 172 | 173 | pub type msizex2 = Vector; 174 | pub type msizex4 = Vector; 175 | pub type msizex8 = Vector; 176 | pub type msizex16 = Vector; 177 | 178 | pub type usizex2 = Vector; 179 | pub type usizex4 = Vector; 180 | pub type usizex8 = Vector; 181 | pub type usizex16 = Vector; 182 | 183 | pub type wusizex2 = Vector, 2>; 184 | pub type wusizex4 = Vector, 4>; 185 | pub type wusizex8 = Vector, 8>; 186 | pub type wusizex16 = Vector, 16>; 187 | 188 | pub type isizex2 = Vector; 189 | pub type isizex4 = Vector; 190 | pub type isizex8 = Vector; 191 | pub type isizex16 = Vector; 192 | 193 | pub type wisizex2 = Vector, 2>; 194 | pub type wisizex4 = Vector, 4>; 195 | pub type wisizex8 = Vector, 8>; 196 | pub type wisizex16 = Vector, 16>; 197 | } 198 | 199 | pub use sized::*; 200 | -------------------------------------------------------------------------------- /src/vector.rs: -------------------------------------------------------------------------------- 1 | //! Low-level definitions of the vector types and their traits. 2 | //! 3 | //! While the user usually operates with the type aliases defined in [`types`][crate::types] (and 4 | //! exported through the [`prelude`][crate::prelude], this module provides the actual 5 | //! implementation of the types. 6 | //! 7 | //! The module defines a [`Vector`] type. This allows setting not only the base type and number of 8 | //! lanes, but also alignment (through an additional alignment marker type, available in the 9 | //! [`align`][mod@align] submodule). 10 | //! 11 | //! There are multiple alignments available. Small vectors shouldn't require bigger alignment than 12 | //! their size, while the bigger ones should require larger one to make it possible to use wider 13 | //! SIMD registers. 14 | //! 15 | //! The type aliases in [`types`][crate::types] takes this into account. 16 | //! 17 | //! These types aliases are not thoroughly documented on themselves. The documentation is on the 18 | //! [`Vector`]. A lot of its functionality is in traits it implements. 19 | 20 | use core::fmt::{Debug, Formatter, Result as FmtResult}; 21 | use core::iter::{Product, Sum}; 22 | use core::mem::{self, MaybeUninit}; 23 | use core::ops::*; 24 | use core::ptr; 25 | use num_traits::Float; 26 | 27 | use self::align::Align; 28 | use crate::inner::Repr; 29 | use crate::Mask; 30 | 31 | /// Enforcement of alignment. 32 | /// 33 | /// This is mostly an implementation detail seldom used by consumers of the crate. 34 | pub mod align { 35 | /// Marker trait for alignment enforcers. 36 | /// 37 | /// The SIMD vectors need to be properly aligned. 
Rust allows doing that by an attribute, but that 38 | /// needs another top-level vector type. We use zero-sized types to enforce it in a different way. 39 | /// 40 | /// This is just a marker type for the enforcers, to avoid people putting the wrong parameter at 41 | /// the wrong place. 42 | pub trait Align: Copy {} 43 | 44 | macro_rules! align { 45 | ($name: ident, $align: expr) => { 46 | /// Alignment marker. 47 | #[derive(Copy, Clone, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)] 48 | #[repr(align($align))] 49 | pub struct $name; 50 | impl Align for $name {} 51 | }; 52 | } 53 | 54 | align!(Align1, 1); 55 | align!(Align2, 2); 56 | align!(Align4, 4); 57 | align!(Align8, 8); 58 | align!(Align16, 16); 59 | align!(Align32, 32); 60 | align!(Align64, 64); 61 | align!(Align128, 128); 62 | } 63 | 64 | // TODO: Seal? 65 | /// Trait to look up a mask corresponding to a type. 66 | /// 67 | /// The [`Vector`] implements this and allows for finding out what the corresponding mask type for 68 | /// it is. This is not an inherent associated type because these don't yet exist in Rust. 69 | pub trait Masked { 70 | /// The mask type for this vector. 71 | /// 72 | /// Masks are vector types of boolean-like base types. They are used as results of lane-wise 73 | /// comparisons like [`eq`][Vector::eq] and for enabling subsets of lanes for certain 74 | /// operations, like [`blend`][Vector::blend] and 75 | /// [`gather_load_masked`][Vector::gather_load_masked]. 76 | /// 77 | /// This associated type describes the native mask for the given vector. For example for 78 | /// [`u32x4`][crate::u32x4] it would be [`m32x4`][crate::m32x4]. This is the type that the 79 | /// comparisons produce. While the selection methods accept any mask type of the right number 80 | /// of lanes, using this type on their input is expected to yield the best performance. 81 | type Mask; 82 | } 83 | 84 | macro_rules! bin_op_impl { 85 | ($tr: ident, $meth: ident, $tr_assign: ident, $meth_assign: ident) => { 86 | impl + Repr, const S: usize> $tr for Vector { 87 | type Output = Self; 88 | #[inline] 89 | fn $meth(self, rhs: Self) -> Self { 90 | unsafe { 91 | let mut data = MaybeUninit::::uninit(); 92 | for i in 0..S { 93 | ptr::write( 94 | data.as_mut_ptr().cast::().add(i), 95 | $tr::$meth(self.data[i], rhs.data[i]), 96 | ); 97 | } 98 | data.assume_init() 99 | } 100 | } 101 | } 102 | 103 | impl + Repr, const S: usize> $tr for Vector { 104 | type Output = Self; 105 | #[inline] 106 | fn $meth(self, rhs: B) -> Self { 107 | unsafe { 108 | let mut data = MaybeUninit::::uninit(); 109 | for i in 0..S { 110 | ptr::write( 111 | data.as_mut_ptr().cast::().add(i), 112 | $tr::$meth(self.data[i], rhs), 113 | ); 114 | } 115 | data.assume_init() 116 | } 117 | } 118 | } 119 | 120 | impl $tr_assign for Vector { 121 | #[inline] 122 | fn $meth_assign(&mut self, rhs: Self) { 123 | for i in 0..S { 124 | $tr_assign::$meth_assign(&mut self.data[i], rhs.data[i]); 125 | } 126 | } 127 | } 128 | 129 | impl $tr_assign for Vector { 130 | #[inline] 131 | fn $meth_assign(&mut self, rhs: B) { 132 | for i in 0..S { 133 | $tr_assign::$meth_assign(&mut self.data[i], rhs); 134 | } 135 | } 136 | } 137 | }; 138 | } 139 | 140 | macro_rules! 
una_op_impl { 141 | ($tr: ident, $meth: ident) => { 142 | impl + Repr, const S: usize> $tr for Vector { 143 | type Output = Self; 144 | #[inline] 145 | fn $meth(self) -> Self { 146 | unsafe { 147 | let mut data = MaybeUninit::::uninit(); 148 | for i in 0..S { 149 | ptr::write( 150 | data.as_mut_ptr().cast::().add(i), 151 | $tr::$meth(self.data[i]), 152 | ); 153 | } 154 | data.assume_init() 155 | } 156 | } 157 | } 158 | }; 159 | } 160 | 161 | macro_rules! cmp_op { 162 | ($($(#[ $meta: meta ])* $tr: ident => $op: ident;)*) => { 163 | $( 164 | $(#[ $meta ])* 165 | #[inline] 166 | pub fn $op(self, other: Self) -> ::Mask 167 | where 168 | B: $tr, 169 | { 170 | let mut data = MaybeUninit::<::Mask>::uninit(); 171 | unsafe { 172 | for i in 0..S { 173 | ptr::write( 174 | data.as_mut_ptr().cast::().add(i), 175 | B::Mask::from_bool(self.data[i].$op(&other.data[i])), 176 | ); 177 | } 178 | data.assume_init() 179 | } 180 | } 181 | )* 182 | }; 183 | } 184 | 185 | /// A vector type. 186 | /// 187 | /// Vector types are mostly well aligned fixed sized arrays. Unlike the arrays, they have the usual 188 | /// numeric operators and several helpful methods implemented on them. They perform the operations 189 | /// „per lane“ independently and allow the CPU to parallelize the computations. 190 | /// 191 | /// The types have convenient aliases ‒ for example [`u32x4`][crate::u32x4] is an alias for 192 | /// `Vector` and corresponds to `[u32; 4]` (but aligned to 16 bytes). 193 | /// 194 | /// While these can be operated as arrays (indexing, copying between slices, etc), it is better to 195 | /// perform operations on whole vectors at once. 196 | /// 197 | /// The usual comparing operators don't exist (`<=`), but there are „per lane“ comparison operators 198 | /// that return mask vectors ‒ vectors of boolean-like values. These can either be examined 199 | /// manually, or fed into other operations on vectors, like [`blend`][Vector::blend] or 200 | /// [`gather_load_masked`][Vector::gather_load_masked]. 201 | /// 202 | /// # Examples 203 | /// 204 | /// ```rust 205 | /// # use slipstream::prelude::*; 206 | /// let a = i32x4::new([1, -2, 3, -4]); 207 | /// let b = -a; // [-1, 2, -3, 4] 208 | /// let positive = a.ge(i32x4::splat(1)); // Lane-wise a >= 1 209 | /// // Will take from b where positive is true, from a otherwise 210 | /// let abs = b.blend(a, positive); 211 | /// assert_eq!(abs, i32x4::new([1, 2, 3, 4])); 212 | /// ``` 213 | #[repr(C)] 214 | #[derive(Copy, Clone)] 215 | pub struct Vector 216 | where 217 | A: Align, 218 | B: Repr, 219 | { 220 | _align: [A; 0], 221 | data: [B; S], 222 | } 223 | 224 | impl Vector 225 | where 226 | A: Align, 227 | B: Repr, 228 | { 229 | /// Number of lanes of the vector. 230 | pub const LANES: usize = S; 231 | 232 | #[inline(always)] 233 | fn assert_size() { 234 | assert!(S > 0); 235 | assert!( 236 | isize::MAX as usize > mem::size_of::(), 237 | "Vector type too huge", 238 | ); 239 | assert_eq!( 240 | mem::size_of::(), 241 | mem::size_of::<[B; S]>(), 242 | "Must not contain paddings/invalid Align parameter", 243 | ); 244 | } 245 | 246 | /// Loads the vector without doing bounds checks. 247 | /// 248 | /// # Safety 249 | /// 250 | /// The pointed to memory must be valid in `Self::LANES` consecutive cells ‒ eg. it must 251 | /// contain a full array of the base types. 
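    ///
    /// # Example
    ///
    /// A minimal sketch; the array below satisfies the safety requirement because it provides
    /// exactly `LANES` consecutive, initialized elements of the base type.
    ///
    /// ```rust
    /// # use slipstream::prelude::*;
    /// let data = [1u32, 2, 3, 4];
    /// // SAFETY: `data` holds `u32x4::LANES` consecutive `u32` values.
    /// let v = unsafe { u32x4::new_unchecked(data.as_ptr()) };
    /// assert_eq!(v, u32x4::new([1, 2, 3, 4]));
    /// ```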
252 | #[inline] 253 | pub unsafe fn new_unchecked(input: *const B) -> Self { 254 | Self::assert_size(); 255 | Self { 256 | _align: [], 257 | data: ptr::read(input.cast()), 258 | } 259 | } 260 | /// Loads the vector from correctly sized slice. 261 | /// 262 | /// This loads the vector from correctly sized slice or anything that can be converted to it ‒ 263 | /// specifically, fixed sized arrays and other vectors work. 264 | /// 265 | /// # Example 266 | /// 267 | /// ```rust 268 | /// # use slipstream::prelude::*; 269 | /// let vec = (0..10).collect::>(); 270 | /// let v1 = u32x4::new(&vec[0..4]); 271 | /// let v2 = u32x4::new(v1); 272 | /// let v3 = u32x4::new([2, 3, 4, 5]); 273 | /// assert_eq!(v1 + v2 + v3, u32x4::new([2, 5, 8, 11])); 274 | /// ``` 275 | /// 276 | /// # Panics 277 | /// 278 | /// If the provided slice is of incompatible size. 279 | #[inline] 280 | pub fn new(input: I) -> Self 281 | where 282 | I: AsRef<[B]>, 283 | { 284 | let input = input.as_ref(); 285 | assert_eq!( 286 | input.len(), 287 | S, 288 | "Creating vector from the wrong sized slice (expected {}, got {})", 289 | S, 290 | input.len(), 291 | ); 292 | unsafe { Self::new_unchecked(input.as_ptr()) } 293 | } 294 | 295 | // TODO: Can we turn it into const fn? 296 | /// Produces a vector of all lanes set to the same value. 297 | /// 298 | /// ```rust 299 | /// # use slipstream::prelude::*; 300 | /// let v = f32x4::splat(1.2); 301 | /// assert_eq!(v, f32x4::new([1.2, 1.2, 1.2, 1.2])); 302 | /// ``` 303 | #[inline] 304 | pub fn splat(value: B) -> Self { 305 | Self::assert_size(); 306 | Self { 307 | _align: [], 308 | data: [value; S], 309 | } 310 | } 311 | 312 | /// Loads the vector from a slice by indexing it. 313 | /// 314 | /// Unlike [`new`], this can load the vector from discontinuous parts of the slice, out of 315 | /// order or multiple lanes from the same location. This flexibility comes at the cost of lower 316 | /// performance (in particular, I've never seen this to get auto-vectorized even though a 317 | /// gather instruction exists), therefore prefer [`new`] where possible. 318 | /// 319 | /// # Examples 320 | /// 321 | /// ```rust 322 | /// # use slipstream::prelude::*; 323 | /// let input = (2..100).collect::>(); 324 | /// let vec = u32x4::gather_load(&input, [3, 3, 1, 32]); 325 | /// assert_eq!(vec, u32x4::new([5, 5, 3, 34])); 326 | /// ``` 327 | /// 328 | /// It is possible to use another vector as the indices: 329 | /// 330 | /// ```rust 331 | /// # use slipstream::prelude::*; 332 | /// let indices = usizex4::new([1, 2, 3, 4]) * usizex4::splat(2); 333 | /// let input = (0..10).collect::>(); 334 | /// let vec = u32x4::gather_load(&input, indices); 335 | /// assert_eq!(vec, u32x4::new([2, 4, 6, 8])); 336 | /// ``` 337 | /// 338 | /// It is possible to use another vector as an input, allowing to narrow it down or shuffle. 339 | /// 340 | /// ```rust 341 | /// # use slipstream::prelude::*; 342 | /// let a = u32x4::new([1, 2, 3, 4]); 343 | /// let b = u32x4::gather_load(a, [2, 0, 1, 3]); 344 | /// assert_eq!(b, u32x4::new([3, 1, 2, 4])); 345 | /// let c = u32x2::gather_load(a, [2, 2]); 346 | /// assert_eq!(c, u32x2::new([3, 3])); 347 | /// ``` 348 | /// 349 | /// # Panics 350 | /// 351 | /// * If the `idx` slice doesn't have the same length as the vector. 352 | /// * If any of the indices is out of bounds of the `input`. 
353 |     ///
354 |     /// [`new`]: Vector::new
355 |     #[inline]
356 |     pub fn gather_load<I, Idx>(input: I, idx: Idx) -> Self
357 |     where
358 |         I: AsRef<[B]>,
359 |         Idx: AsRef<[usize]>,
360 |     {
361 |         Self::assert_size();
362 |         let input = input.as_ref();
363 |         let idx = idx.as_ref();
364 |         assert_eq!(
365 |             S,
366 |             idx.len(),
367 |             "Gathering vector from wrong number of indexes"
368 |         );
369 |         assert!(idx.iter().all(|&l| l < input.len()), "Gather out of bounds");
370 |         let mut data = MaybeUninit::<Self>::uninit();
371 |         unsafe {
372 |             for i in 0..S {
373 |                 let idx = *idx.get_unchecked(i);
374 |                 let input = *input.get_unchecked(idx);
375 |                 ptr::write(data.as_mut_ptr().cast::<B>().add(i), input);
376 |             }
377 |             data.assume_init()
378 |         }
379 |     }
380 | 
381 |     /// Loads enabled lanes from a slice by indexing it.
382 |     ///
383 |     /// This is similar to [`gather_load`]. However, the loading of lanes is
384 |     /// enabled by a mask. If the corresponding lane mask is not set, the value is taken from
385 |     /// `self`. In other words, if the mask is all-true, it is semantically equivalent to
386 |     /// [`gather_load`], except with possibly worse performance.
387 |     ///
388 |     /// # Examples
389 |     ///
390 |     /// ```rust
391 |     /// # use slipstream::prelude::*;
392 |     /// let input = (0..100).collect::<Vec<_>>();
393 |     /// let v = u32x4::default().gather_load_masked(
394 |     ///     &input,
395 |     ///     [1, 4, 2, 2],
396 |     ///     [m32::TRUE, m32::FALSE, m32::FALSE, m32::TRUE]
397 |     /// );
398 |     /// assert_eq!(v, u32x4::new([1, 0, 0, 2]));
399 |     /// ```
400 |     ///
401 |     /// ```rust
402 |     /// # use slipstream::prelude::*;
403 |     /// let left = u32x2::new([1, 2]);
404 |     /// let right = u32x2::new([3, 4]);
405 |     /// let idx = usizex4::new([0, 1, 0, 1]);
406 |     /// let mask = m32x4::new([m32::TRUE, m32::TRUE, m32::FALSE, m32::FALSE]);
407 |     /// let v = u32x4::default()
408 |     ///     .gather_load_masked(left, idx, mask)
409 |     ///     .gather_load_masked(right, idx, !mask);
410 |     /// assert_eq!(v, u32x4::new([1, 2, 3, 4]));
411 |     /// ```
412 |     ///
413 |     /// # Panics
414 |     ///
415 |     /// * If the `mask` or the `idx` parameter has a different length than the vector.
416 |     /// * If any of the active indices are out of bounds of `input`.
417 |     ///
418 |     /// [`gather_load`]: Vector::gather_load
419 |     #[inline]
420 |     pub fn gather_load_masked<I, Idx, M, MB>(mut self, input: I, idx: Idx, mask: M) -> Self
421 |     where
422 |         I: AsRef<[B]>,
423 |         Idx: AsRef<[usize]>,
424 |         M: AsRef<[MB]>,
425 |         MB: Mask,
426 |     {
427 |         let input = input.as_ref();
428 |         let idx = idx.as_ref();
429 |         let mask = mask.as_ref();
430 |         let len = idx.len();
431 |         assert_eq!(S, len, "Gathering vector from wrong number of indexes");
432 |         assert_eq!(S, mask.len(), "Gathering with wrong sized mask");
433 |         for i in 0..S {
434 |             unsafe {
435 |                 if mask.get_unchecked(i).bool() {
436 |                     let idx = *idx.get_unchecked(i);
437 |                     self[i] = input[idx];
438 |                 }
439 |             }
440 |         }
441 |         self
442 |     }
443 | 
444 |     /// Stores the content into a contiguous slice of the correct length.
445 |     ///
446 |     /// This is less general than [`scatter_store`][Vector::scatter_store], which allows storing
447 |     /// to different parts of the slice.
448 |     ///
449 |     /// The counterpart of this is [`new`][Vector::new].
450 |     ///
451 |     /// # Panics
452 |     ///
453 |     /// If the length doesn't match.
454 |     #[inline]
455 |     pub fn store<O: AsMut<[B]>>(self, mut output: O) {
456 |         output.as_mut().copy_from_slice(&self[..])
457 |     }
458 | 
459 |     /// Store the vector into a slice by indexing it.
460 |     ///
461 |     /// This is the inverse of [`gather_load`][Vector::gather_load]. It takes the lanes of the
462 |     /// vector and stores them into the slice at the given indices.
463 |     ///
464 |     /// If you want to store it into a contiguous slice, it is potentially faster to do it using
465 |     /// the `copy_from_slice` method or by [`store`][Vector::store]:
466 |     ///
467 |     /// ```rust
468 |     /// # use slipstream::prelude::*;
469 |     /// let mut data = vec![0; 6];
470 |     /// let v = u32x4::new([1, 2, 3, 4]);
471 |     /// data[0..4].copy_from_slice(&v[..]);
472 |     /// assert_eq!(&data[..], &[1, 2, 3, 4, 0, 0]);
473 |     /// v.store(&mut data[..4]);
474 |     /// assert_eq!(&data[..], &[1, 2, 3, 4, 0, 0]);
475 |     /// ```
476 |     ///
477 |     /// # Examples
478 |     ///
479 |     /// ```rust
480 |     /// # use slipstream::prelude::*;
481 |     /// let mut data = vec![0; 6];
482 |     /// let v = u32x4::new([1, 2, 3, 4]);
483 |     /// v.scatter_store(&mut data, [2, 5, 0, 1]);
484 |     /// assert_eq!(&data[..], &[3, 4, 1, 0, 0, 2]);
485 |     /// ```
486 |     ///
487 |     /// # Warning
488 |     ///
489 |     /// If multiple lanes are to be stored into the same slice element, it is not specified which
490 |     /// of them will end up being stored. It is not UB to do so and it'll always be one of them;
491 |     /// however, which one may change between versions or even between compilation targets.
492 |     ///
493 |     /// This is to allow for potentially different behaviour on different platforms.
494 |     ///
495 |     /// # Panics
496 |     ///
497 |     /// * If the `idx` has a different length than the vector.
498 |     /// * If any of the indices are out of bounds of `output`.
499 |     #[inline]
500 |     pub fn scatter_store<O, Idx>(self, mut output: O, idx: Idx)
501 |     where
502 |         O: AsMut<[B]>,
503 |         Idx: AsRef<[usize]>,
504 |     {
505 |         let output = output.as_mut();
506 |         let idx = idx.as_ref();
507 |         assert_eq!(S, idx.len(), "Scattering vector to wrong number of indexes");
508 |         // Check prior to starting the scatter before we write anything. Might be nicer for the
509 |         // optimizer + we don't want to do a partial scatter.
510 |         assert!(
511 |             idx.iter().all(|&l| l < output.len()),
512 |             "Scatter out of bounds"
513 |         );
514 |         for i in 0..S {
515 |             unsafe {
516 |                 // get_unchecked: index checked above in bulk and we use this one in hope
517 |                 // it'll taste better to the autovectorizer and it might find a scatter
518 |                 // instruction for us.
519 |                 let idx = *idx.get_unchecked(i);
520 |                 *output.get_unchecked_mut(idx) = self[i];
521 |             }
522 |         }
523 |     }
524 | 
525 |     /// A masked version of [`scatter_store`].
526 |     ///
527 |     /// This acts in the same way as [`scatter_store`], except lanes disabled by the `mask` are not
528 |     /// stored anywhere.
529 |     ///
530 |     /// # Panics
531 |     ///
532 |     /// * If the `idx` or `mask` has a different length than the vector.
533 |     /// * If any of the active indices are out of bounds of `output`.
534 |     ///
535 |     /// [`scatter_store`]: Vector::scatter_store
536 |     #[inline]
537 |     pub fn scatter_store_masked<O, Idx, M, MB>(self, mut output: O, idx: Idx, mask: M)
538 |     where
539 |         O: AsMut<[B]>,
540 |         Idx: AsRef<[usize]>,
541 |         M: AsRef<[MB]>,
542 |         MB: Mask,
543 |     {
544 |         let output = output.as_mut();
545 |         let idx = idx.as_ref();
546 |         let mask = mask.as_ref();
547 |         assert_eq!(S, idx.len(), "Scattering vector to wrong number of indexes");
548 |         assert_eq!(S, mask.len(), "Scattering vector with wrong sized mask");
549 |         // Check prior to starting the scatter before we write anything. Might be nicer for the
550 |         // optimizer + we don't want to do a partial scatter.
551 |         let in_bounds = idx
552 |             .iter()
553 |             .enumerate()
554 |             .all(|(i, &l)| !mask[i].bool() || l < output.len());
555 |         assert!(in_bounds, "Scatter out of bounds");
556 |         for i in 0..S {
557 |             if mask[i].bool() {
558 |                 unsafe {
559 |                     // get_unchecked: index checked above in bulk and we use this one in
560 |                     // hope it'll taste better to the autovectorizer and it might find a
561 |                     // scatter instruction for us.
562 |                     let idx = *idx.get_unchecked(i);
563 |                     *output.get_unchecked_mut(idx) = self[i];
564 |                 }
565 |             }
566 |         }
567 |     }
568 | 
569 |     /// Blends `self` and `other` using `mask`.
570 |     ///
571 |     /// Imports enabled lanes from `other`, keeps disabled lanes from `self`.
572 |     ///
573 |     /// # Examples
574 |     ///
575 |     /// ```rust
576 |     /// # use slipstream::prelude::*;
577 |     /// let odd = u32x4::new([1, 3, 5, 7]);
578 |     /// let even = u32x4::new([2, 4, 6, 8]);
579 |     /// let mask = m32x4::new([m32::TRUE, m32::FALSE, m32::TRUE, m32::FALSE]);
580 |     /// assert_eq!(odd.blend(even, mask), u32x4::new([2, 3, 6, 7]));
581 |     /// ```
582 |     #[inline]
583 |     pub fn blend<M, MB>(self, other: Self, mask: M) -> Self
584 |     where
585 |         M: AsRef<[MB]>,
586 |         MB: Mask,
587 |     {
588 |         let mut data = MaybeUninit::<Self>::uninit();
589 |         let mask = mask.as_ref();
590 |         unsafe {
591 |             for i in 0..S {
592 |                 ptr::write(
593 |                     data.as_mut_ptr().cast::<B>().add(i),
594 |                     if mask[i].bool() { other[i] } else { self[i] },
595 |                 );
596 |             }
597 |             data.assume_init()
598 |         }
599 |     }
600 | 
601 |     /// A lane-wise maximum.
602 |     ///
603 |     /// # Examples
604 |     ///
605 |     /// ```rust
606 |     /// # use slipstream::prelude::*;
607 |     /// let a = u32x4::new([1, 4, 2, 5]);
608 |     /// let b = u32x4::new([2, 3, 2, 6]);
609 |     /// assert_eq!(a.maximum(b), u32x4::new([2, 4, 2, 6]));
610 |     /// ```
611 |     #[inline]
612 |     pub fn maximum(self, other: Self) -> Self
613 |     where
614 |         B: PartialOrd,
615 |     {
616 |         let m = self.lt(other);
617 |         self.blend(other, m)
618 |     }
619 | 
620 |     /// A lane-wise minimum.
621 |     ///
622 |     /// # Examples
623 |     ///
624 |     /// ```rust
625 |     /// # use slipstream::prelude::*;
626 |     /// let a = u32x4::new([1, 4, 2, 5]);
627 |     /// let b = u32x4::new([2, 3, 2, 6]);
628 |     /// assert_eq!(a.minimum(b), u32x4::new([1, 3, 2, 5]));
629 |     /// ```
630 |     #[inline]
631 |     pub fn minimum(self, other: Self) -> Self
632 |     where
633 |         B: PartialOrd,
634 |     {
635 |         let m = self.gt(other);
636 |         self.blend(other, m)
637 |     }
638 | 
639 |     // TODO: Example
640 |     /// Sums the lanes together.
641 |     ///
642 |     /// The additions are done in a tree manner: `(a[0] + a[1]) + (a[2] + a[3])`.
643 |     ///
644 |     /// Note that this is potentially a slow operation. Prefer to do as many operations as
645 |     /// possible on whole vectors and only perform the horizontal operation at the very end.
646 |     #[inline]
647 |     pub fn horizontal_sum(self) -> B
648 |     where
649 |         B: Add<Output = B>,
650 |     {
651 |         #[inline(always)]
652 |         fn inner<B: Copy + Add<Output = B>>(d: &[B]) -> B {
653 |             if d.len() == 1 {
654 |                 d[0]
655 |             } else {
656 |                 let mid = d.len() / 2;
657 |                 inner(&d[..mid]) + inner(&d[mid..])
658 |             }
659 |         }
660 |         inner(&self.data)
661 |     }
662 | 
663 |     /// Multiplies all the lanes of the vector.
664 |     ///
665 |     /// The multiplications are done in a tree manner: `(a[0] * a[1]) * (a[2] * a[3])`.
666 |     ///
667 |     /// Note that this is potentially a slow operation. Prefer to do as many operations as
668 |     /// possible on whole vectors and only perform the horizontal operation at the very end.
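    ///
    /// # Example
    ///
    /// A small sketch of the horizontal reduction:
    ///
    /// ```rust
    /// # use slipstream::prelude::*;
    /// let v = u32x4::new([1, 2, 3, 4]);
    /// assert_eq!(v.horizontal_product(), 24);
    /// ```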
669 | #[inline] 670 | pub fn horizontal_product(self) -> B 671 | where 672 | B: Mul, 673 | { 674 | #[inline(always)] 675 | fn inner>(d: &[B]) -> B { 676 | if d.len() == 1 { 677 | d[0] 678 | } else { 679 | let mid = d.len() / 2; 680 | inner(&d[..mid]) * inner(&d[mid..]) 681 | } 682 | } 683 | inner(&self.data) 684 | } 685 | 686 | cmp_op!( 687 | /// Lane-wise `==`. 688 | PartialEq => eq; 689 | 690 | /// Lane-wise `<`. 691 | PartialOrd => lt; 692 | 693 | /// Lane-wise `>`. 694 | PartialOrd => gt; 695 | 696 | /// Lane-wise `<=`. 697 | PartialOrd => le; 698 | 699 | /// Lane-wise `>=`. 700 | PartialOrd => ge; 701 | ); 702 | } 703 | 704 | impl Vector 705 | where 706 | A: Align, 707 | B: Repr + Float, 708 | { 709 | /// Fused multiply-add. Computes (self * a) + b with only one rounding 710 | /// error, yielding a more accurate result than an unfused multiply-add. 711 | /// 712 | /// Using mul_add can be more performant than an unfused multiply-add if the 713 | /// target architecture has a dedicated fma CPU instruction. 714 | #[inline] 715 | pub fn mul_add(self, a: Self, b: Self) -> Self { 716 | let mut result = Self::splat(B::zero()); 717 | for ((res, &s), (&a, &b)) in result 718 | .data 719 | .iter_mut() 720 | .zip(self.data.iter()) 721 | .zip(a.data.iter().zip(b.data.iter())) 722 | { 723 | *res = s.mul_add(a, b); 724 | } 725 | result 726 | } 727 | } 728 | 729 | impl Masked for Vector { 730 | type Mask = Vector; 731 | } 732 | 733 | impl Default for Vector { 734 | #[inline] 735 | fn default() -> Self { 736 | Self::splat(Default::default()) 737 | } 738 | } 739 | 740 | impl Debug for Vector { 741 | fn fmt(&self, fmt: &mut Formatter) -> FmtResult { 742 | fmt.debug_tuple("Vector").field(&self.data).finish() 743 | } 744 | } 745 | 746 | impl Deref for Vector { 747 | type Target = [B; S]; 748 | #[inline] 749 | fn deref(&self) -> &[B; S] { 750 | &self.data 751 | } 752 | } 753 | 754 | impl DerefMut for Vector { 755 | #[inline] 756 | fn deref_mut(&mut self) -> &mut [B; S] { 757 | &mut self.data 758 | } 759 | } 760 | 761 | impl AsRef<[B]> for Vector { 762 | #[inline] 763 | fn as_ref(&self) -> &[B] { 764 | &self.data 765 | } 766 | } 767 | 768 | impl AsRef<[B; S]> for Vector { 769 | #[inline] 770 | fn as_ref(&self) -> &[B; S] { 771 | &self.data 772 | } 773 | } 774 | 775 | impl AsMut<[B]> for Vector { 776 | #[inline] 777 | fn as_mut(&mut self) -> &mut [B] { 778 | &mut self.data 779 | } 780 | } 781 | 782 | impl AsMut<[B; S]> for Vector { 783 | #[inline] 784 | fn as_mut(&mut self) -> &mut [B; S] { 785 | &mut self.data 786 | } 787 | } 788 | 789 | impl From<[B; S]> for Vector { 790 | #[inline] 791 | fn from(data: [B; S]) -> Self { 792 | Self::assert_size(); 793 | Self { _align: [], data } 794 | } 795 | } 796 | 797 | impl From> for [B; S] { 798 | #[inline] 799 | fn from(vector: Vector) -> [B; S] { 800 | vector.data 801 | } 802 | } 803 | 804 | impl Index for Vector 805 | where 806 | A: Align, 807 | B: Repr, 808 | [B; S]: Index, 809 | { 810 | type Output = <[B; S] as Index>::Output; 811 | #[inline] 812 | fn index(&self, idx: I) -> &Self::Output { 813 | &self.data[idx] 814 | } 815 | } 816 | 817 | impl IndexMut for Vector 818 | where 819 | A: Align, 820 | B: Repr, 821 | [B; S]: IndexMut, 822 | { 823 | #[inline] 824 | fn index_mut(&mut self, idx: I) -> &mut Self::Output { 825 | &mut self.data[idx] 826 | } 827 | } 828 | 829 | impl Sum for Vector { 830 | #[inline] 831 | fn sum(iter: I) -> Self 832 | where 833 | I: Iterator, 834 | { 835 | let mut result = Self::default(); 836 | for i in iter { 837 | result += i; 838 | } 839 
| 840 | result 841 | } 842 | } 843 | 844 | impl Product for Vector { 845 | #[inline] 846 | fn product(iter: I) -> Self 847 | where 848 | I: Iterator, 849 | { 850 | let mut result = Self::splat(B::ONE); 851 | for i in iter { 852 | result *= i; 853 | } 854 | 855 | result 856 | } 857 | } 858 | 859 | bin_op_impl!(Add, add, AddAssign, add_assign); 860 | bin_op_impl!(Sub, sub, SubAssign, sub_assign); 861 | bin_op_impl!(Mul, mul, MulAssign, mul_assign); 862 | bin_op_impl!(Div, div, DivAssign, div_assign); 863 | bin_op_impl!(Rem, rem, RemAssign, rem_assign); 864 | bin_op_impl!(BitAnd, bitand, BitAndAssign, bitand_assign); 865 | bin_op_impl!(BitOr, bitor, BitOrAssign, bitor_assign); 866 | bin_op_impl!(BitXor, bitxor, BitXorAssign, bitxor_assign); 867 | bin_op_impl!(Shl, shl, ShlAssign, shl_assign); 868 | bin_op_impl!(Shr, shr, ShrAssign, shr_assign); 869 | 870 | una_op_impl!(Neg, neg); 871 | una_op_impl!(Not, not); 872 | 873 | impl PartialEq for Vector { 874 | #[inline] 875 | fn eq(&self, other: &Self) -> bool { 876 | self.data == other.data 877 | } 878 | } 879 | 880 | impl Eq for Vector {} 881 | 882 | impl PartialEq<[B; S]> for Vector { 883 | #[inline] 884 | fn eq(&self, other: &[B; S]) -> bool { 885 | self.data == *other 886 | } 887 | } 888 | 889 | impl PartialEq> for [B; S] { 890 | #[inline] 891 | fn eq(&self, other: &Vector) -> bool { 892 | *self == other.data 893 | } 894 | } 895 | 896 | #[cfg(test)] 897 | mod tests { 898 | use super::*; 899 | use crate::prelude::*; 900 | 901 | type V = u16x4; 902 | 903 | #[test] 904 | #[should_panic(expected = "Creating vector from the wrong sized slice (expected 4, got 3)")] 905 | fn wrong_size_new() { 906 | V::new([1, 2, 3]); 907 | } 908 | 909 | #[test] 910 | fn round_trip() { 911 | let orig = [1, 2, 3, 4]; 912 | assert_eq!(<[u16; 4]>::from(u16x4::from(orig)), orig); 913 | } 914 | 915 | #[test] 916 | fn shuffle() { 917 | let v1 = V::new([1, 2, 3, 4]); 918 | let v2 = V::gather_load(v1, [3, 1, 2, 0]); 919 | assert_eq!(v2.deref(), &[4, 2, 3, 1]); 920 | let v3 = V::gather_load(v2, [0, 0, 2, 2]); 921 | assert_eq!(v3.deref(), &[4, 4, 3, 3]); 922 | } 923 | 924 | #[test] 925 | fn gather() { 926 | let data = (1..=10).collect::>(); 927 | let v = V::gather_load(data, [0, 2, 4, 6]); 928 | assert_eq!(v, [1, 3, 5, 7]); 929 | } 930 | 931 | #[test] 932 | fn scatter() { 933 | let v = V::new([1, 2, 3, 4]); 934 | let mut output = [0; 10]; 935 | v.scatter_store(&mut output, [1, 3, 5, 7]); 936 | assert_eq!(output, [0, 1, 0, 2, 0, 3, 0, 4, 0, 0]); 937 | } 938 | 939 | #[test] 940 | #[should_panic(expected = "Gather out of bounds")] 941 | fn gather_oob() { 942 | V::gather_load([1, 2, 3], [0, 1, 2, 3]); 943 | } 944 | 945 | #[test] 946 | #[should_panic(expected = "Gathering vector from wrong number of indexes")] 947 | fn gather_idx_cnt() { 948 | V::gather_load([0, 1, 2, 3, 4], [0, 1]); 949 | } 950 | 951 | #[test] 952 | #[should_panic(expected = "Scatter out of bounds")] 953 | fn scatter_oob() { 954 | let mut out = [0; 10]; 955 | V::new([1, 2, 3, 4]).scatter_store(&mut out, [0, 1, 2, 15]); 956 | } 957 | 958 | #[test] 959 | #[should_panic(expected = "Scattering vector to wrong number of indexes")] 960 | fn scatter_idx_cnt() { 961 | let mut out = [0; 10]; 962 | V::new([1, 2, 3, 4]).scatter_store(&mut out, [0, 1, 2]); 963 | } 964 | 965 | // TODO: Tests for out of bounds index on masked loads/stores + tests for index out of bound 966 | // but disabled by the mask 967 | 968 | const T: m32 = m32::TRUE; 969 | const F: m32 = m32::FALSE; 970 | 971 | #[test] 972 | fn cmp() { 973 | let v1 = 
u32x4::new([1, 3, 5, 7]); 974 | let v2 = u32x4::new([2, 3, 4, 5]); 975 | 976 | assert_eq!(v1.eq(v2), m32x4::new([F, T, F, F])); 977 | assert_eq!(v1.le(v2), m32x4::new([T, T, F, F])); 978 | assert_eq!(v1.ge(v2), m32x4::new([F, T, T, T])); 979 | } 980 | 981 | #[test] 982 | fn blend() { 983 | let v1 = u32x4::new([1, 2, 3, 4]); 984 | let v2 = u32x4::new([5, 6, 7, 8]); 985 | 986 | let b1 = v1.blend(v2, m32x4::new([F, T, F, T])); 987 | assert_eq!(b1, u32x4::new([1, 6, 3, 8])); 988 | 989 | let b2 = v1.blend(v2, [false, true, false, true]); 990 | assert_eq!(b1, b2); 991 | } 992 | 993 | #[test] 994 | fn fma() { 995 | let a = f32x4::new([1.0, 2.0, 3.0, 4.0]); 996 | let b = f32x4::new([5.0, 6.0, 7.0, 8.0]); 997 | let c = f32x4::new([9.0, 10.0, 11.0, 12.0]); 998 | 999 | assert_eq!(a.mul_add(b, c), f32x4::new([14.0, 22.0, 32.0, 44.0])); 1000 | } 1001 | } 1002 | --------------------------------------------------------------------------------