├── .github ├── FUNDING.yml └── workflows │ └── rustdoc.yml ├── .gitignore ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── crates ├── dispatch │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── thermite-complex │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── thermite-hyperdual │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── thermite-special │ ├── Cargo.toml │ └── src │ │ ├── bessel.rs │ │ ├── lib.rs │ │ ├── pd.rs │ │ └── ps.rs ├── thermite │ ├── Cargo.toml │ ├── benches │ │ └── main.rs │ ├── examples │ │ ├── asm.rs │ │ ├── geo │ │ │ └── mod.rs │ │ └── plot.rs │ ├── src │ │ ├── arch.rs │ │ ├── backends │ │ │ ├── aarch64 │ │ │ │ └── mod.rs │ │ │ ├── arm │ │ │ │ └── mod.rs │ │ │ ├── avx1 │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ ├── vf32.rs │ │ │ │ ├── vf64.rs │ │ │ │ ├── vi32.rs │ │ │ │ ├── vi32_2.rs │ │ │ │ ├── vi64.rs │ │ │ │ ├── vi64_2.rs │ │ │ │ ├── vu32.rs │ │ │ │ └── vu64.rs │ │ │ ├── avx2 │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ ├── vf32.rs │ │ │ │ ├── vf64.rs │ │ │ │ ├── vi16.rs │ │ │ │ ├── vi32.rs │ │ │ │ ├── vi64.rs │ │ │ │ ├── vu32.rs │ │ │ │ └── vu64.rs │ │ │ ├── macros.rs │ │ │ ├── mod.rs │ │ │ ├── polyfills.rs │ │ │ ├── scalar │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ ├── vf32.rs │ │ │ │ ├── vf64.rs │ │ │ │ ├── vi32.rs │ │ │ │ ├── vi64.rs │ │ │ │ ├── vu32.rs │ │ │ │ └── vu64.rs │ │ │ ├── sse2 │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ └── sse2.rs │ │ │ ├── sse42 │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ ├── vf32.rs │ │ │ │ ├── vi32.rs │ │ │ │ └── vu32.rs │ │ │ └── wasm32 │ │ │ │ └── mod.rs │ │ ├── buffer.rs │ │ ├── divider.rs │ │ ├── element.rs │ │ ├── iter │ │ │ ├── aligned.rs │ │ │ ├── mod.rs │ │ │ └── slice.rs │ │ ├── lib.rs │ │ ├── macros.rs │ │ ├── mask.rs │ │ ├── math │ │ │ ├── compensated.rs │ │ │ ├── consts.rs │ │ │ ├── mod.rs │ │ │ ├── pd.rs │ │ │ ├── poly.rs │ │ │ └── ps.rs │ │ ├── pointer.rs │ │ ├── rng │ │ │ ├── mod.rs │ │ │ ├── pcg32.rs │ │ │ └── xoshiro.rs │ │ └── runtime.rs │ └── tests │ │ ├── counts.rs │ │ ├── reverse.rs │ │ └── sinh.rs └── thermite2 │ ├── Cargo.toml │ └── src │ ├── arch.rs │ ├── backends │ ├── avx2 │ │ ├── mod.rs │ │ ├── polyfills.rs │ │ └── vf32.rs │ ├── mod.rs │ ├── polyfills.rs │ ├── register.rs │ └── vector.rs │ ├── iset.rs │ ├── lib.rs │ ├── macros.rs │ └── widen.rs └── rustfmt.toml /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [novacrazy] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: raygon 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: raygon 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/workflows/rustdoc.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | # Controls when the action will run. 
Triggers the workflow on push or pull request 4 | # events but only for the master branch 5 | on: 6 | push: 7 | branches: [ master ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 13 | jobs: 14 | # This workflow contains a single job called "build" 15 | build: 16 | # The type of runner that the job will run on 17 | runs-on: ubuntu-latest 18 | 19 | # Steps represent a sequence of tasks that will be executed as part of the job 20 | steps: 21 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 22 | - uses: actions/checkout@v2 23 | 24 | - uses: actions-rs/toolchain@v1 25 | with: 26 | toolchain: stable 27 | 28 | - name: Build Documentation 29 | uses: actions-rs/cargo@v1 30 | with: 31 | command: doc 32 | toolchain: stable 33 | args: --no-deps 34 | 35 | - run: echo "" > target/doc/index.html 36 | 37 | - name: Deploy Documentation 38 | uses: peaceiris/actions-gh-pages@v3 39 | with: 40 | deploy_key: ${{ secrets.ACTIONS_DEPLOY_KEY }} 41 | publish_branch: gh-pages 42 | publish_dir: ./target/doc 43 | keep_files: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/target/criterion 3 | Cargo.lock 4 | .vscode -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["crates/*"] 3 | 4 | [profile.dev] 5 | opt-level = 2 6 | 7 | [profile.release] 8 | opt-level = 3 9 | lto = 'fat' 10 | codegen-units = 1 11 | 12 | [profile.bench] 13 | opt-level = 3 14 | lto = 'fat' 15 | codegen-units = 1 16 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright 2020 Developers of the Thermite project 2 | Copyright (c) 2014 The Rust Project Developers 3 | 4 | Permission is hereby granted, free of charge, to any 5 | person obtaining a copy of this software and associated 6 | documentation files (the "Software"), to deal in the 7 | Software without restriction, including without 8 | limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software 11 | is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice 15 | shall be included in all copies or substantial portions 16 | of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 19 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 20 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 21 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 22 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 23 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 24 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 25 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 | DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Thermite SIMD: Melt your CPU 2 | ============================ 3 | 4 | **NOTE: This crate is not yet on crates.io, but I do own the name and will publish it there when ready** 5 | 6 | Thermite is a WIP SIMD library focused on providing portable SIMD acceleration of SoA (Structure of Arrays) algorithms, using consistent-length1 SIMD vectors for lockstep iteration and computation. 7 | 8 | Thermite provides highly optimized **feature-rich backends** for SSE2, SSE4.2, AVX and AVX2, with planned support for AVX512, ARM/Aarch64 NEON, and WASM SIMD extensions. 9 | 10 | In addition to that, Thermite includes a highly optimized **vectorized math library** with many special math functions and algorithms, specialized for both single and double precision. 11 | 12 | 13 | 1 All vectors in an instruction set are the same length, regardless of size. 14 | 15 | 16 | # Current Status 17 | 18 | Refer to issue [#1](https://github.com/raygon-renderer/thermite/issues/1) 19 | 20 | # Motivation and Goals 21 | 22 | Thermite was conceived while working on Raygon renderer, when it was decided we needed a state of the art high-performance SIMD vector library focused on facilitating SoA algorithms. Using SIMD for AoS values was a nightmare, constantly shuffling vectors and performing unnecessary horizontal operations. We also weren't able to take advantage of AVX2 fully due to 3D vectors only using 3 or 4 lanes of a regular 128-bit register. 23 | 24 | Using SIMDeez, `faster`, or redesigning `packed_simd` were all considered, but each has their flaws. SIMDeez is rather limited in functionality, and their handling of `target_feature` leaves much to be desired. `faster` fits well into the SoA paradigm, but the iterator-based API is rather unwieldy, and it is lacking many features. `packed_simd` isn't bad, but it's also missing many features and relies on the Nightly-only `"platform-intrinsic"`s, which can produce suboptimal code in some cases. 
25 | 26 | Therefore, the only solution was to write my own, and thus Thermite was born. 27 | 28 | The primary goal of Thermite is to provide optimal codegen for every backend instruction set, and provide a consistent set of features on top of all of them, in such a way as to encourage using chunked SoA or AoSoA algorithms regardless of what data types you need. Furthermore, with the `#[dispatch]` macro, multiple instruction sets can be easily targeted within a single binary. 29 | 30 | # Features 31 | 32 | * SSE2, SSE4.2, AVX, AVX2 backends, with planned support for scalar, AVX512, WASM SIMD and ARM NEON backends. 33 | * Extensive built-in vectorized math library. 34 | * Compile-time policies to emphasize precision, performance or code size (useful for WASM). 35 | * Compile-time monomorphisation with runtime selection. 36 | * Aided by a `#[dispatch]` procedural macro to ensure optimal codegen. 37 | * Zero runtime overhead. 38 | * Operator overloading on vector types. 39 | * Abstracts over vector length, giving the same length to all vectors of an instruction set. 40 | * Provides fast polyfills where necessary to provide the same API across all instruction sets. 41 | * Highly optimized value cast routines between vector types where possible. 42 | * Dedicated mask wrapper type with low-cost bitwise vector conversions built-in. 43 | 44 | # Optimized Project Setup 45 | 46 | For optimal performance, ensure your `Cargo.toml` profiles look something like this: 47 | ```toml 48 | [profile.dev] 49 | opt-level = 2 # Required to inline SIMD intrinsics internally 50 | 51 | [profile.release] 52 | opt-level = 3 # Should be at least 2; level 1 will not use SIMD intrinsics 53 | lto = 'thin' # 'fat' LTO may also improve things, but will increase compile time 54 | codegen-units = 1 # Required for optimal inlining and optimizations 55 | 56 | # optional release options depending on your project and preference 57 | incremental = false # Release builds will take longer to compile, but inter-crate optimizations may work better 58 | panic = 'abort' # Very few functions in Thermite panic, but aborting will avoid the unwind mechanism overhead 59 | ``` 60 | 61 | # Misc. Usage Notes 62 | 63 | * Vectors with 64-bit elements are approximately 2-4x slower than 32-bit vectors. 64 | * Integer vectors are 2x slower on SSE2/AVX1, but nominal on SSE4.1 and AVX2. This compounds the first point. 65 | * Casting floats to signed integers is faster than to unsigned integers. 66 | * Equal-sized signed and unsigned integer vectors can be cast between each other at zero cost. 67 | * Operations mixing float and integer types can incur a 1-cycle penalty on most modern CPUs. 68 | * Integer division currently can only be done with a scalar fallback, so it's not recommended. 69 | * Dividing integer vectors by constant uniform divisors should use `SimdIntVector::div_const`. 70 | * When reusing masks for `all`/`any`/`none` queries, consider using the bitmask directly to avoid recomputing. 71 | * Avoid casting between differently-sized types in hot loops. 72 | * Avoid extracting and replacing elements. 73 | * LLVM will inline many math functions and const-eval as much as possible, but only if they are called in the same instruction-set context. 74 | 75 | # Cargo `--features` 76 | 77 | ### `alloc` (enabled by default) 78 | 79 | The `alloc` feature enables aligned allocation of buffers suitable for reading/writing with SIMD.
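As a rough illustration of how a buffer like this is consumed, the sketch below adapts `test_dynamic_dispatch` from `crates/thermite/examples/asm.rs`. It is not the canonical API surface: the `<S: Simd>` parameter and the `::<S>` turbofish are reconstructed from that example, and the chunked loop (in place of the example's fixed-length assert) is an added assumption.

```rust
use thermite::*;

// Runtime-dispatched SIMD over a slice, following `test_dynamic_dispatch`
// from `examples/asm.rs`: `#[dispatch]` monomorphizes the inner function for
// each supported instruction set, and `dispatch_dyn!` picks one at runtime.
pub fn exp2_buffer(values: &mut [f32]) {
    #[dispatch]
    fn exp2_in_place<S: Simd>(values: &mut [f32]) {
        // Assumption: `values.len()` is a multiple of the vector width
        // (the original example simply asserts a fixed length of 8).
        for chunk in values.chunks_exact_mut(Vf32::<S>::NUM_ELEMENTS) {
            Vf32::<S>::load_unaligned(chunk).exp2().store_unaligned(chunk);
        }
    }

    dispatch_dyn!({ exp2_in_place::<S>(values) })
}
```

Buffers allocated through `Vf32::alloc` (see `do_alloc` in the same example file) provide the alignment this feature exists for, so the aligned load/store paths in the backends can be used on them as well.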
80 | 81 | ### `nightly` 82 | 83 | The `nightly` feature enables nightly-only optimizations such as accelerated half-precision encoding/decoding. 84 | 85 | ### `math` (enabled by default) 86 | 87 | Enables the vectorized math modules. 88 | 89 | ### `rng` 90 | 91 | Enables the vectorized random number modules. 92 | 93 | ### `emulate_fma` 94 | 95 | Real fused multiply-add instructions are only enabled for AVX2 platforms. However, as FMA is used not only for performance but for its extended precision, falling back to a split multiply and addition will incur two rounding errors, and may be unacceptable for 96 | some applications. Therefore, the `emulate_fma` Cargo feature will enable a slower but more accurate implementation on older platforms. 97 | 98 | For single-precision floats, this is easiest done by simply casting them to double-precision, doing the multiply and addition separately, then casting back. For double-precision, it will use an infinite-precision implementation based on libm. 99 | 100 | On SSE2 platforms, double-precision may fall back to scalar ops, as the effort needed to make it branchless would cost more than the scalar fallback itself. As of writing, this has not been implemented, so benchmarks will reveal what is needed later. 101 | -------------------------------------------------------------------------------- /crates/dispatch/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite-dispatch" 3 | version = "0.1.0" 4 | authors = ["novacrazy "] 5 | edition = "2018" 6 | 7 | [lib] 8 | proc-macro = true 9 | 10 | [dependencies] 11 | quote = "1" 12 | proc-macro2 = "1" 13 | syn = { version = "1", features = ["full", "extra-traits", "visit-mut"] } 14 | 15 | [features] 16 | neon = [] 17 | wasm32 = [] -------------------------------------------------------------------------------- /crates/thermite-complex/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite-complex" 3 | version = "0.1.0" 4 | authors = ["novacrazy "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | thermite = { path = "../thermite" } -------------------------------------------------------------------------------- /crates/thermite-hyperdual/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite-hyperdual" 3 | version = "0.1.0" 4 | authors = ["novacrazy "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | thermite = { path = "../thermite" } -------------------------------------------------------------------------------- /crates/thermite-hyperdual/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![no_std] 2 | 3 | use thermite::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | ops::{Add, Div, Mul, Sub}, 9 | }; 10 | 11 | pub type Hyperdual = HyperdualP; 12 | pub type DualNumber = Hyperdual; 13 | 14 | pub struct HyperdualP, P: Policy, const N: usize> { 15 | /// Real part 16 | pub re: V, 17 | /// Dual parts 18 | pub du: [V; N], 19 | _simd: PhantomData<(S, P)>, 20 | } 21 | 22 | impl, P: Policy, const N: usize> Clone for HyperdualP { 23 | fn clone(&self) -> Self { 24 | *self 25 | } 26 | } 27 | 28 | impl, P: Policy, const N: usize> Copy for HyperdualP {} 29 | 30 |
impl, P: Policy, const N: usize> fmt::Debug for HyperdualP { 31 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 32 | f.debug_struct("HyperdualP") 33 | .field("re", &self.re) 34 | .field("du", &self.du) 35 | .finish() 36 | } 37 | } 38 | 39 | #[dispatch(S)] 40 | impl, P: Policy, const N: usize> HyperdualP { 41 | #[inline(always)] 42 | pub fn new(re: V, du: [V; N]) -> Self { 43 | Self { 44 | re, 45 | du, 46 | _simd: PhantomData, 47 | } 48 | } 49 | 50 | #[inline(always)] 51 | pub fn real(re: V) -> Self { 52 | Self::new(re, [V::zero(); N]) 53 | } 54 | 55 | #[inline(always)] 56 | pub fn one() -> Self { 57 | Self::real(V::one()) 58 | } 59 | 60 | #[inline(always)] 61 | pub fn zero() -> Self { 62 | Self::real(V::zero()) 63 | } 64 | 65 | #[inline(always)] 66 | pub fn map(mut self, f: F) -> Self 67 | where 68 | F: Fn(V) -> V, 69 | { 70 | self.map_dual(f(self.re), f) 71 | } 72 | 73 | #[inline(always)] 74 | pub fn map_dual(mut self, re: V, f: F) -> Self 75 | where 76 | F: Fn(V) -> V, 77 | { 78 | self.re = re; 79 | for dual in &mut self.du { 80 | *dual = f(*dual); 81 | } 82 | self 83 | } 84 | } 85 | 86 | #[dispatch(S)] 87 | impl, P: Policy, const N: usize> HyperdualP 88 | where 89 | V: SimdVectorizedMath, 90 | { 91 | #[inline(always)] 92 | fn div_dual(self, re: V, denom: V) -> Self { 93 | if N > 1 { 94 | let rcp = denom.reciprocal_p::
<P>
(); 95 | self.map_dual(re, |x| x * rcp) 96 | } else { 97 | self.map_dual(re, |x| x / denom) 98 | } 99 | } 100 | 101 | #[inline(always)] 102 | pub fn fract(mut self) -> Self { 103 | self.re = self.re.fract(); 104 | self 105 | } 106 | 107 | #[inline(always)] 108 | pub fn signum(self) -> Self { 109 | Self::real(self.re.signum()) 110 | } 111 | 112 | #[inline(always)] 113 | pub fn abs(self) -> Self { 114 | let signum = self.re.signum(); 115 | self.map(|x| x * signum) 116 | } 117 | 118 | #[inline(always)] 119 | pub fn select(mask: Mask, t: Self, f: Self) -> Self { 120 | let mut t = t; // Weird compiler bug 121 | for i in 0..N { 122 | t.du[i] = mask.select(t.du[i], f.du[i]); 123 | } 124 | t.re = mask.select(t.re, f.re); 125 | t 126 | } 127 | 128 | #[inline(always)] 129 | pub fn min(self, other: Self) -> Self { 130 | Self::select(self.re.lt(other.re), self, other) 131 | } 132 | 133 | #[inline(always)] 134 | pub fn max(mut self, other: Self) -> Self { 135 | Self::select(self.re.gt(other.re), self, other) 136 | } 137 | 138 | #[inline(always)] 139 | pub fn mul_add(mut self, m: Self, a: Self) -> Self { 140 | for i in 0..N { 141 | self.du[i] = self.du[i].mul_add(m.re, self.re.mul_add(m.du[i], a.du[i])); 142 | } 143 | self.re = self.re.mul_add(m.re, a.re); 144 | self 145 | } 146 | 147 | #[inline(always)] 148 | pub fn powi(self, n: i32) -> Self { 149 | let r = self.re.powi_p::
<P>
(n - 1); 150 | let nf = V::splat_as(n) * r; 151 | self.map_dual(self.re * r, |x| x * nf) 152 | } 153 | 154 | #[inline(always)] 155 | pub fn powf(mut self, n: Self) -> Self { 156 | let re_n1 = self.re.powf_p::
<P>
(n.re - V::one()); 157 | 158 | let re = re_n1 * self.re; // re^n 159 | 160 | let a = n.re * re_n1; // n * re^(n-1) 161 | let b = re * self.re.ln_p::
<P>
(); 162 | 163 | self.re = re; 164 | for i in 0..N { 165 | self.du[i] = a.mul_add(self.du[i], b * n.du[i]); 166 | } 167 | self 168 | } 169 | 170 | #[inline(always)] 171 | pub fn exp(self) -> Self { 172 | let re = self.re.exp_p::
<P>
(); 173 | self.map_dual(re, |x| re * x) 174 | } 175 | 176 | #[inline(always)] 177 | pub fn exp2(self) -> Self { 178 | let re = self.re.exp2_p::
<P>
(); 179 | let re_ln2 = V::LN_2() * re; 180 | self.map_dual(re, |x| x * re_ln2) 181 | } 182 | 183 | #[inline(always)] 184 | pub fn ln(self) -> Self { 185 | self.div_dual(self.re.ln_p::
<P>
(), self.re) 186 | } 187 | 188 | #[inline(always)] 189 | pub fn sqrt(self) -> Self { 190 | let re = self.re.sqrt(); 191 | self.div_dual(re, re + re) 192 | } 193 | 194 | #[inline(always)] 195 | pub fn cbrt(self) -> Self { 196 | let re = self.re.cbrt(); 197 | self.div_dual(re, re + re + re) 198 | } 199 | 200 | fn hypot(self, other: Self) -> Self { 201 | let c = self.re.hypot(other.re); 202 | let mut v = Self::real(c); 203 | 204 | let inv_c = c.reciprocal_p::
<P>
(); 205 | for i in 0..N { 206 | let x = self.du[i]; 207 | let y = other.du[i]; 208 | 209 | v.du[i] = self.re.mul_add(x, other.re * y); 210 | 211 | if N > 1 { 212 | v.du[i] *= inv_c; 213 | } else { 214 | v.du[i] /= c; 215 | } 216 | } 217 | 218 | v 219 | } 220 | 221 | #[inline(always)] 222 | pub fn sin_cos(self) -> (Self, Self) { 223 | let (s, c) = self.re.sin_cos_p::
<P>
(); 224 | 225 | let mut sine = self; 226 | let mut cosi = self; 227 | 228 | sine.re = s; 229 | cosi.re = c; 230 | for i in 0..N { 231 | sine.du[i] *= c; 232 | cosi.du[i] *= s; 233 | } 234 | 235 | (sine, cosi) 236 | } 237 | 238 | #[inline(always)] 239 | pub fn tan(self) -> Self { 240 | let t = self.re.tan_p::
<P>
(); 241 | let c = t.mul_add(t, V::one()); 242 | self.map_dual(t, |x| x * c) 243 | } 244 | 245 | #[inline(always)] 246 | pub fn asin(self) -> Self { 247 | let c = self.re.nmul_adde(self.re, V::one()).invsqrt_p::
<P>
(); 248 | self.map_dual(self.re.asin(), |x| x * c) 249 | } 250 | 251 | #[inline(always)] 252 | pub fn acos(self) -> Self { 253 | let c = self.re.nmul_adde(self.re, V::one()).invsqrt_p::
<P>
().neg(); 254 | self.map_dual(self.re.acos(), |x| x * c) 255 | } 256 | 257 | #[inline(always)] 258 | pub fn atan(self) -> Self { 259 | let c = self.re.mul_adde(self.re, V::one()); 260 | self.div_dual(self.re.atan(), c) 261 | } 262 | 263 | pub fn atan2(self, x: Self) -> Self { 264 | let y = self; 265 | let c = y.re.mul_add(y.re, x.re * x.re); 266 | 267 | let mut v = Self::real(y.re.atan2(x.re)); 268 | 269 | let inv_c = c.reciprocal_p::
<P>
(); 270 | for i in 0..N { 271 | v.du[i] = x.re.mul_sub(y.du[i], y.re * x.du[i]) * c; 272 | 273 | if N > 1 { 274 | v.du[i] *= inv_c; 275 | } else { 276 | v.du[i] /= c; 277 | } 278 | } 279 | 280 | v 281 | } 282 | 283 | #[inline(always)] 284 | pub fn sinh_cosh(self) -> (Self, Self) { 285 | let s = self.re.sinh_p::
<P>
(); 286 | let c = self.re.cosh_p::
<P>
(); 287 | (self.map_dual(s, |x| x * c), self.map_dual(c, |x| x * s)) 288 | } 289 | 290 | #[inline(always)] 291 | pub fn tanh(self) -> Self { 292 | let re = self.re.tanh_p::
<P>
(); 293 | let c = re.nmul_add(re, V::one()); // 1 - r^2 294 | self.map_dual(re, |x| x * c) 295 | } 296 | } 297 | 298 | #[dispatch(S)] 299 | impl, P: Policy, const N: usize> Add for HyperdualP { 300 | type Output = Self; 301 | 302 | #[inline(always)] 303 | fn add(mut self, rhs: Self) -> Self { 304 | self.re += rhs.re; 305 | for i in 0..N { 306 | self.du[i] += rhs.du[i]; 307 | } 308 | self 309 | } 310 | } 311 | 312 | #[dispatch(S)] 313 | impl, P: Policy, const N: usize> Sub for HyperdualP { 314 | type Output = Self; 315 | 316 | #[inline(always)] 317 | fn sub(mut self, rhs: Self) -> Self { 318 | self.re -= rhs.re; 319 | for i in 0..N { 320 | self.du[i] -= rhs.du[i]; 321 | } 322 | self 323 | } 324 | } 325 | 326 | #[dispatch(S)] 327 | impl, P: Policy, const N: usize> Mul for HyperdualP { 328 | type Output = Self; 329 | 330 | #[inline(always)] 331 | fn mul(mut self, rhs: Self) -> Self { 332 | for i in 0..N { 333 | self.du[i] = self.re.mul_add(rhs.du[i], rhs.re * self.du[i]); 334 | } 335 | self.re *= rhs.re; 336 | self 337 | } 338 | } 339 | 340 | #[dispatch(S)] 341 | impl, P: Policy, const N: usize> Div for HyperdualP 342 | where 343 | V: SimdVectorizedMath, 344 | { 345 | type Output = Self; 346 | 347 | #[inline(always)] 348 | fn div(mut self, rhs: Self) -> Self { 349 | let d = self.re * rhs.re; 350 | 351 | let inv_d = d.reciprocal_p::
<P>
(); 352 | for i in 0..N { 353 | self.du[i] = rhs.re.mul_sub(self.du[i], self.re * rhs.du[i]) * d; 354 | 355 | if N > 1 { 356 | self.du[i] *= inv_d; 357 | } else { 358 | self.du[i] /= d; 359 | } 360 | } 361 | self.re /= rhs.re; 362 | self 363 | } 364 | } 365 | -------------------------------------------------------------------------------- /crates/thermite-special/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite-special" 3 | version = "0.1.0" 4 | authors = ["novacrazy "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | thermite = { path = "../thermite" } 11 | thermite-complex = { path = "../thermite-complex" } -------------------------------------------------------------------------------- /crates/thermite-special/src/ps.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | const EULERS_CONSTANT: f32 = 5.772156649015328606065120900824024310e-01; 4 | 5 | impl SimdVectorizedSpecialFunctionsInternal for f32 6 | where 7 | ::Vf32: SimdFloatVector, 8 | { 9 | #[inline(always)] 10 | fn tgamma(mut z: Self::Vf) -> Self::Vf { 11 | let zero = Vf32::::zero(); 12 | let one = Vf32::::one(); 13 | let half = Vf32::::splat(0.5); 14 | let quarter = Vf32::::splat(0.25); 15 | let pi = Vf32::::PI(); 16 | 17 | let orig_z = z; 18 | 19 | let is_neg = z.is_negative(); 20 | let mut reflected = Mask::falsey(); 21 | 22 | let mut res = one; 23 | 24 | 'goto_positive: while is_neg.any() { 25 | reflected = z.le(Vf32::::splat(-20.0)); 26 | 27 | let mut refl_res = unsafe { Vf32::::undefined() }; 28 | 29 | // sine is expensive, so branch for it. 30 | if P::POLICY.avoid_precision_branches() || thermite_unlikely!(reflected.any()) { 31 | refl_res = >::sin_pix::
<P>
(z); 32 | 33 | // If not branching, all negative values are reflected 34 | if P::POLICY.avoid_precision_branches() { 35 | reflected = is_neg; 36 | 37 | res = reflected.select(refl_res, res); 38 | z = z.conditional_neg(reflected); 39 | 40 | break 'goto_positive; 41 | } 42 | 43 | // NOTE: I chose not to use a bitmask here, because some bitmasks can be 44 | // one extra instruction than the raw call to `all` again, and since z <= -20 is so rare, 45 | // that extra instruction is not worth it. 46 | if reflected.all() { 47 | res = refl_res; 48 | z = -z; 49 | 50 | break 'goto_positive; 51 | } 52 | } 53 | 54 | let mut mod_z = z; 55 | let mut is_neg = is_neg; 56 | 57 | // recursively apply Γ(z+1)/z 58 | while is_neg.any() { 59 | res = is_neg.select(res / mod_z, res); 60 | mod_z = mod_z.conditional_add(one, is_neg); 61 | is_neg = mod_z.is_negative(); 62 | } 63 | 64 | z = reflected.select(-z, mod_z); 65 | res = reflected.select(refl_res, res); 66 | 67 | break 'goto_positive; 68 | } 69 | 70 | // label 71 | //positive: 72 | 73 | // Integers 74 | 75 | let mut z_int = Mask::falsey(); 76 | let mut fact_res = one; 77 | 78 | if P::POLICY.precision > PrecisionPolicy::Worst { 79 | let zf = z.floor(); 80 | z_int = zf.eq(z); 81 | 82 | let bitmask = z_int.bitmask(); 83 | 84 | if thermite_unlikely!(bitmask.any()) { 85 | let mut j = one; 86 | let mut k = j.lt(zf); 87 | 88 | while k.any() { 89 | fact_res = k.select(fact_res * j, fact_res); 90 | j += one; 91 | k = j.lt(zf); 92 | } 93 | 94 | // Γ(-int) = NaN for poles 95 | fact_res = is_neg.select(Vf32::::nan(), fact_res); 96 | // approaching zero from either side results in +/- infinity 97 | fact_res = orig_z.eq(zero).select(Vf32::::infinity().copysign(orig_z), fact_res); 98 | 99 | if bitmask.all() { 100 | return fact_res; 101 | } 102 | } 103 | } 104 | 105 | // Full 106 | 107 | let gh = Vf32::::splat(LANCZOS_G - 0.5); 108 | 109 | let lanczos_sum = z.poly_rational_p::
<P>
(LANCZOS_P, LANCZOS_Q); 110 | 111 | let zgh = z + gh; 112 | let lzgh = zgh.ln_p::
<P>
(); 113 | 114 | // (z * lzfg) > ln(f32::MAX) 115 | let very_large = (z * lzgh).gt(Vf32::::splat( 116 | 88.722839053130621324601674778549183073943430402325230485234240247, 117 | )); 118 | 119 | // only compute powf once 120 | let h = zgh.powf_p::
<P>
(very_large.select(z.mul_sube(half, quarter), z - half)); 121 | 122 | // save a couple cycles by avoiding this division, but worst-case precision is slightly worse 123 | let denom = if P::POLICY.precision >= PrecisionPolicy::Best { 124 | lanczos_sum / zgh.exp_p::
<P>
() 125 | } else { 126 | lanczos_sum * (-zgh).exp_p::
<P>
() 127 | }; 128 | 129 | let normal_res = very_large.select(h * h, h) * denom; 130 | 131 | // Tiny 132 | if P::POLICY.precision >= PrecisionPolicy::Best { 133 | let is_tiny = z.lt(Vf32::::splat( 134 | >::__SQRT_EPSILON, 135 | )); 136 | let tiny_res = z.reciprocal_p::
<P>
() - Vf32::::splat(EULERS_CONSTANT); 137 | res *= is_tiny.select(tiny_res, normal_res); 138 | } else { 139 | res *= normal_res; 140 | } 141 | 142 | reflected.select(-pi / res, z_int.select(fact_res, res)) 143 | } 144 | 145 | #[inline(always)] 146 | fn lgamma(mut z: Self::Vf) -> Self::Vf { 147 | let one = Vf32::::one(); 148 | let zero = Vf32::::zero(); 149 | 150 | let reflect = z.lt(zero); 151 | 152 | let mut t = one; 153 | 154 | if P::POLICY.avoid_branching || reflect.any() { 155 | t = reflect.select(>::sin_pix::
<P>
(z).abs(), one); 156 | z = z.conditional_neg(reflect); 157 | } 158 | 159 | let gh = Vf32::::splat(LANCZOS_G - 0.5); 160 | 161 | let mut lanczos_sum = z.poly_rational_p::
<P>
(LANCZOS_P_EXPG_SCALED, LANCZOS_Q); 162 | 163 | // Full A 164 | let mut a = (z + gh).ln_p::
<P>
() - one; 165 | 166 | // Tiny 167 | if P::POLICY.precision >= PrecisionPolicy::Best { 168 | let is_not_tiny = z.ge(Vf32::::splat_as( 169 | >::__SQRT_EPSILON, 170 | )); 171 | let tiny_res = z.reciprocal_p::
<P>
() - Vf32::::splat(EULERS_CONSTANT); 172 | 173 | // shove the tiny result into the log down below 174 | lanczos_sum = is_not_tiny.select(lanczos_sum, tiny_res); 175 | // force multiplier to zero for tiny case, allowing the modified 176 | // lanczos sum and ln(t) to be combined for cheap 177 | a &= is_not_tiny.value(); 178 | } 179 | 180 | // Full 181 | 182 | let b = z - Vf32::::splat(0.5); 183 | let c = (lanczos_sum * t).ln_p::
<P>
(); 184 | 185 | let mut res = a.mul_adde(b, c); 186 | 187 | let ln_pi = Vf32::::LN_PI(); 188 | 189 | res = reflect.select(ln_pi - res, res); 190 | 191 | res 192 | } 193 | 194 | #[inline(always)] 195 | fn digamma(mut x: Self::Vf) -> Self::Vf { 196 | let zero = Vf32::::zero(); 197 | let one = Vf32::::one(); 198 | let half = Vf32::::splat(0.5); 199 | let pi = Vf32::::PI(); 200 | 201 | let mut result = zero; 202 | 203 | let reflect = x.le(Vf32::::neg_one()); 204 | 205 | if reflect.any() { 206 | x = reflect.select(one - x, x); 207 | 208 | let mut rem = x - x.floor(); 209 | 210 | rem = rem.conditional_sub(one, rem.gt(half)); 211 | 212 | let (s, c) = (rem * pi).sin_cos_p::
<P>
(); 213 | let refl_res = pi * c / s; 214 | 215 | result = reflect.select(refl_res, result); 216 | } 217 | 218 | let lim = Vf32::::splat( 219 | 0.5 * (10 + ((>::__DIGITS as i64 - 50) * 240) / 950) as f32, 220 | ); 221 | 222 | // Rescale to use asymptotic expansion 223 | let mut is_small = x.lt(lim); 224 | while is_small.any() { 225 | result = result.conditional_sub(x.reciprocal_p::
<P>
(), is_small); 226 | x = x.conditional_add(one, is_small); 227 | is_small = x.lt(lim); 228 | } 229 | 230 | x -= one; 231 | 232 | let inv_x = x.reciprocal_p::
<P>
(); 233 | 234 | let z = inv_x * inv_x; 235 | let a = x.ln_p::
<P>
() + (inv_x * half); 236 | 237 | let y = z.poly_p::
<P>
(&[ 238 | 0.083333333333333333333333333333333333333333333333333, 239 | -0.0083333333333333333333333333333333333333333333333333, 240 | 0.003968253968253968253968253968253968253968253968254, 241 | ]); 242 | 243 | result += z.nmul_adde(y, a); 244 | 245 | result 246 | } 247 | 248 | #[inline(always)] 249 | fn beta(a: Self::Vf, b: Self::Vf) -> Self::Vf { 250 | let zero = Vf32::::zero(); 251 | 252 | let is_valid = a.gt(zero) & b.gt(zero); 253 | 254 | if P::POLICY.check_overflow && !P::POLICY.avoid_branching { 255 | if is_valid.none() { 256 | return Vf32::::nan(); 257 | } 258 | } 259 | 260 | let c = a + b; 261 | 262 | // if a < b then swap 263 | let (a, b) = (a.max(b), a.min(b)); 264 | 265 | let mut result = a.poly_rational_p::
<P>
(LANCZOS_P_EXPG_SCALED, LANCZOS_Q) 266 | * (b.poly_rational_p::
<P>
(LANCZOS_P_EXPG_SCALED, LANCZOS_Q) 267 | / c.poly_rational_p::
<P>
(LANCZOS_P_EXPG_SCALED, LANCZOS_Q)); 268 | 269 | let gh = Vf32::::splat(LANCZOS_G - 0.5); 270 | 271 | let agh = a + gh; 272 | let bgh = b + gh; 273 | let cgh = c + gh; 274 | 275 | let agh_d_cgh = agh / cgh; 276 | let bgh_d_cgh = bgh / cgh; 277 | let agh_p_bgh = agh * bgh; 278 | let cgh_p_cgh = cgh * cgh; 279 | 280 | let base = cgh 281 | .gt(Vf32::::splat(1e10)) 282 | .select(agh_d_cgh * bgh_d_cgh, agh_p_bgh / cgh_p_cgh); 283 | 284 | let denom = if P::POLICY.precision > PrecisionPolicy::Average { 285 | Vf32::::SQRT_E() / bgh.sqrt() 286 | } else { 287 | // bump up the precision a little to improve beta function accuracy 288 | Vf32::::SQRT_E() * bgh.invsqrt_p::>() 289 | }; 290 | 291 | result *= agh_d_cgh.powf_p::
<P>
(a - Vf32::::splat(0.5) - b) * (base.powf_p::
<P>
(b) * denom); 292 | 293 | if P::POLICY.check_overflow { 294 | result = is_valid.select(result, Vf32::::nan()); 295 | } 296 | 297 | result 298 | } 299 | } 300 | 301 | const LANCZOS_G: f32 = 1.428456135094165802001953125; 302 | 303 | const LANCZOS_P: &[f32] = &[ 304 | 58.52061591769095910314047740215847630266, 305 | 182.5248962595894264831189414768236280862, 306 | 211.0971093028510041839168287718170827259, 307 | 112.2526547883668146736465390902227161763, 308 | 27.5192015197455403062503721613097825345, 309 | 2.50662858515256974113978724717473206342, 310 | ]; 311 | 312 | const LANCZOS_Q: &[f32] = &[0.0, 24.0, 50.0, 35.0, 10.0, 1.0]; 313 | 314 | const LANCZOS_P_EXPG_SCALED: &[f32] = &[ 315 | 14.0261432874996476619570577285003839357, 316 | 43.74732405540314316089531289293124360129, 317 | 50.59547402616588964511581430025589038612, 318 | 26.90456680562548195593733429204228910299, 319 | 6.595765571169314946316366571954421695196, 320 | 0.6007854010515290065101128585795542383721, 321 | ]; 322 | -------------------------------------------------------------------------------- /crates/thermite/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite" 3 | version = "0.1.1-alpha.0" 4 | license = "MIT OR Apache-2.0" 5 | readme = "README.md" 6 | authors = ["novacrazy "] 7 | repository = "https://github.com/raygon-renderer/thermite" 8 | documentation = "https://raygon-renderer.github.io/thermite/" 9 | edition = "2018" 10 | 11 | [features] 12 | default = ["alloc", "math", "rng", "emulate_fma", "static_init"] 13 | neon = ["thermite-dispatch/neon"] 14 | wasm32 = ["thermite-dispatch/wasm32"] 15 | alloc = [] 16 | nightly = [] 17 | math = [] 18 | rng = [] 19 | emulate_fma = [] 20 | 21 | [dependencies] 22 | thermite-dispatch = { path = "../dispatch" } 23 | paste = "1" 24 | half = "1.6.0" 25 | 26 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies.static_init] 27 | version = "1" 28 | optional = true 29 | default_features = false 30 | 31 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] 32 | core_detect = "0.1.0" 33 | 34 | [dev-dependencies] 35 | criterion = "0.3" 36 | libm = "0.2.1" 37 | plotly = "0.6.0" 38 | rand = "0.8" 39 | rand_xoshiro = "0.6.0" 40 | no-panic = "0.1" 41 | thermite-special = { path = "../thermite-special" } 42 | thermite-complex = { path = "../thermite-complex" } 43 | num-complex = "0.4" 44 | 45 | [[bench]] 46 | name = "main" 47 | harness = false 48 | 49 | -------------------------------------------------------------------------------- /crates/thermite/examples/asm.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | // NOTE: This example only exists to be compiled and inspected as assembly via the command: 4 | // `cargo rustc --example asm --release -- -C target-feature=+sse2 --emit asm` 5 | // It's easier to access the example output in the `target/release/examples` directory 6 | 7 | use no_panic::no_panic; 8 | 9 | use thermite::*; 10 | use thermite_special::*; 11 | 12 | pub mod geo; 13 | 14 | use thermite::backends::avx2::AVX2; 15 | use thermite::rng::SimdRng; 16 | 17 | type Vf32 = ::Vf32; 18 | type Vf64 = ::Vf64; 19 | type Vi32 = ::Vi32; 20 | type Vu64 = ::Vu64; 21 | type Vu32 = ::Vu32; 22 | type Vi64 = ::Vi64; 23 | 24 | type Vector3xN = geo::Vector3xN; 25 | 26 | type Xoshiro128Plus = thermite::rng::xoshiro::Xoshiro128Plus; 27 | 28 | #[no_mangle] 29 | #[inline(never)] 30 | pub fn 
test_dynamic_dispatch(value: &mut [f32]) { 31 | assert_eq!(value.len(), 8); 32 | 33 | #[dispatch] 34 | fn test(value: &mut [f32]) { 35 | thermite::Vf32::::load_unaligned(value).exp2().store_unaligned(value); 36 | } 37 | 38 | dispatch_dyn!({ test::(value) }) 39 | } 40 | 41 | #[no_mangle] 42 | #[inline(never)] 43 | #[target_feature(enable = "avx2,fma")] 44 | pub unsafe fn test_simdrng(rng: &mut Xoshiro128Plus) -> Vf64 { 45 | rng.next_f64() 46 | } 47 | 48 | #[no_mangle] 49 | #[inline(never)] 50 | #[target_feature(enable = "avx2,fma")] 51 | pub unsafe fn test_revbits(x: Vi32) -> Vi32 { 52 | x.reverse_bits() 53 | } 54 | 55 | #[no_mangle] 56 | #[inline(never)] 57 | #[target_feature(enable = "avx2,fma")] 58 | pub unsafe fn test_normalize(v: &mut Vector3xN) { 59 | *v = v.normalize() 60 | } 61 | 62 | #[no_mangle] 63 | #[inline(never)] 64 | #[target_feature(enable = "avx2,fma")] 65 | pub unsafe fn test_u64div(a: Vu64, b: Vu64) -> Vu64 { 66 | a / b 67 | } 68 | 69 | #[no_mangle] 70 | #[inline(never)] 71 | #[target_feature(enable = "avx2,fma")] 72 | pub unsafe fn test_bitmask(b: u16) -> Vu64 { 73 | Mask::from_bitmask(b).value() 74 | } 75 | 76 | #[no_mangle] 77 | #[inline(never)] 78 | #[target_feature(enable = "avx2,fma")] 79 | pub unsafe fn test_cross(a: Vector3xN, b: Vector3xN) -> Vector3xN { 80 | a.cross(&b) 81 | } 82 | 83 | #[no_mangle] 84 | #[inline(never)] 85 | #[target_feature(enable = "avx2,fma")] 86 | pub unsafe fn do_alloc(count: usize) -> VectorBuffer { 87 | Vf32::alloc(count) 88 | } 89 | 90 | #[no_mangle] 91 | #[inline(never)] 92 | #[target_feature(enable = "avx2,fma")] 93 | pub unsafe fn test_powf_ps(y: Vf32, x: Vf32) -> Vf32 { 94 | y.powf(x) 95 | } 96 | 97 | #[no_mangle] 98 | #[inline(never)] 99 | #[target_feature(enable = "avx2,fma")] 100 | pub unsafe fn test_powf_pd(y: Vf64, x: Vf64) -> Vf64 { 101 | y.powf(x) 102 | } 103 | 104 | #[no_mangle] 105 | #[inline(never)] 106 | #[target_feature(enable = "avx2,fma")] 107 | pub unsafe fn test_smootheststep(x: Vf32) -> Vf32 { 108 | x.smootheststep() 109 | } 110 | 111 | #[no_mangle] 112 | #[inline(never)] 113 | //#[target_feature(enable = "avx2,fma")] 114 | pub unsafe fn test_pdsin(x: Vf64) -> Vf64 { 115 | x.sin() 116 | } 117 | 118 | #[no_mangle] 119 | #[inline(never)] 120 | #[target_feature(enable = "avx2,fma")] 121 | pub unsafe fn test_pssin_cos(x: Vf32) -> (Vf32, Vf32) { 122 | x.sin_cos_p::() 123 | } 124 | 125 | #[no_mangle] 126 | #[inline(never)] 127 | #[target_feature(enable = "avx2,fma")] 128 | pub unsafe fn test_select_neg_ps(x: Vf32, a: Vf32, b: Vf32) -> Vf32 { 129 | x.is_negative().select(a, b) 130 | } 131 | 132 | #[no_mangle] 133 | #[inline(never)] 134 | #[target_feature(enable = "avx2,fma")] 135 | pub unsafe fn test_select_neg_epi32(x: Vi32, a: Vi32, b: Vi32) -> Vi32 { 136 | x.is_negative().select(a, b) 137 | } 138 | 139 | #[no_mangle] 140 | #[inline(never)] 141 | #[target_feature(enable = "avx2,fma")] 142 | #[no_panic] 143 | pub unsafe fn test_shuffle(x: Vf64, y: Vf64) -> Vf64 { 144 | match Vf64::NUM_ELEMENTS { 145 | 4 => shuffle!(x, y, [6, 2, 1, 7]), 146 | 8 => shuffle!(x, y, [5, 6, 10, 9, 2, 8, 6, 4]), 147 | _ => unimplemented!(), 148 | } 149 | } 150 | 151 | #[no_mangle] 152 | #[inline(never)] 153 | #[target_feature(enable = "avx2,fma")] 154 | pub unsafe fn test_shuffle_dyn_unchecked(a: Vf32, b: Vf32, indices: &[usize]) -> Vf32 { 155 | a.shuffle_dyn_unchecked(b, indices) 156 | } 157 | 158 | //#[no_mangle] 159 | //#[inline(never)] 160 | //#[target_feature(enable = "avx2,fma")] 161 | //pub unsafe fn test_shuffle_dyn(x: Vf32, y: Vf32, 
indices: &[usize; 8]) -> Vf32 { 162 | // x.shuffle(y, &indices[..]) 163 | //} 164 | 165 | #[no_mangle] 166 | #[inline(never)] 167 | //#[target_feature(enable = "avx2,fma")] 168 | pub unsafe fn test_pstgamma(x: Vf32) -> Vf32 { 169 | x.tgamma_p::() 170 | } 171 | 172 | #[no_mangle] 173 | #[inline(never)] 174 | //#[target_feature(enable = "avx2,fma")] 175 | pub unsafe fn test_pdtgamma(x: Vf64) -> Vf64 { 176 | x.tgamma() 177 | } 178 | 179 | #[no_mangle] 180 | #[inline(never)] 181 | #[target_feature(enable = "avx2,fma")] 182 | pub unsafe fn test_pserf(x: Vf32) -> Vf32 { 183 | x.erf() 184 | } 185 | 186 | #[no_mangle] 187 | #[inline(never)] 188 | pub unsafe fn test_psexp(x: Vf32) -> Vf32 { 189 | x.exp() 190 | } 191 | 192 | #[no_mangle] 193 | #[inline(never)] 194 | #[target_feature(enable = "avx2,fma")] 195 | pub unsafe fn test_pderfinv(x: Vf64) -> Vf64 { 196 | x.erfinv() 197 | } 198 | 199 | #[no_mangle] 200 | #[inline(never)] 201 | #[target_feature(enable = "avx2,fma")] 202 | pub unsafe fn test_pscbrt(x: Vf32) -> Vf32 { 203 | x.cbrt() 204 | } 205 | 206 | //#[no_mangle] 207 | //#[inline(never)] 208 | //#[target_feature(enable = "avx2,fma")] 209 | //pub unsafe fn test_ps_bessel_y4(x: Vf32) -> Vf32 { 210 | // x.bessel_y_p::(4) 211 | //} 212 | 213 | #[no_mangle] 214 | #[inline(never)] 215 | #[target_feature(enable = "avx2,fma")] 216 | pub unsafe fn test_poly(x: Vf32, e: &[f32]) -> Vf32 { 217 | x.poly_f(128, |i| Vf32::splat(*e.get_unchecked(i))) 218 | } 219 | 220 | #[no_mangle] 221 | #[inline(never)] 222 | #[target_feature(enable = "avx2,fma")] 223 | pub unsafe fn test_rational_poly(x: Vf32, e: &[f32], d: &[f32]) -> Vf32 { 224 | let n0 = x.poly_f(19, |i| Vf32::splat(*e.get_unchecked(i))); 225 | let n1 = x.poly_f(19, |i| Vf32::splat(*d.get_unchecked(i))); 226 | 227 | n0 / n1 228 | } 229 | 230 | #[no_mangle] 231 | #[inline(never)] 232 | #[target_feature(enable = "avx2,fma")] 233 | pub unsafe fn test_rational_poly2(x: Vf32, e: &[f32], d: &[f32]) -> Vf32 { 234 | assert!(e.len() == 19 && e.len() == d.len()); 235 | 236 | x.poly_rational_p::(e, d) 237 | } 238 | 239 | #[no_mangle] 240 | #[inline(never)] 241 | #[target_feature(enable = "avx2,fma")] 242 | pub unsafe fn test_poly2(x: Vf32) -> Vf32 { 243 | x.poly_f(128, |i| { 244 | Vf32::splat((-1.0f32).powi(i as i32) * (2f32.powi(i as i32) - i as f32)) 245 | }) 246 | } 247 | 248 | #[no_mangle] 249 | #[inline(never)] 250 | #[target_feature(enable = "avx2,fma")] 251 | pub unsafe fn test_pdcbrt(x: Vf64) -> Vf64 { 252 | x.cbrt() 253 | } 254 | 255 | #[no_mangle] 256 | #[inline(never)] 257 | #[target_feature(enable = "avx2,fma")] 258 | pub unsafe fn test_pdsinh(x: Vf64) -> Vf64 { 259 | x.sinh_p::() 260 | } 261 | 262 | #[no_mangle] 263 | #[inline(never)] 264 | #[target_feature(enable = "avx2,fma")] 265 | pub unsafe fn test_pssinh(x: Vf32) -> Vf32 { 266 | x.sinh_p::() 267 | } 268 | 269 | #[no_mangle] 270 | #[inline(never)] 271 | #[target_feature(enable = "avx2,fma")] 272 | pub unsafe fn test_jacobi(x: Vf32, alpha: Vf32, beta: Vf32, n: u32, m: u32) -> Vf32 { 273 | x.legendre(50, 0) 274 | } 275 | 276 | #[no_mangle] 277 | #[inline(never)] 278 | #[target_feature(enable = "avx2,fma")] 279 | pub unsafe fn test_cast2(x: Vf64) -> Vi64 { 280 | x.cast() 281 | } 282 | 283 | fn main() {} 284 | -------------------------------------------------------------------------------- /crates/thermite/examples/geo/mod.rs: -------------------------------------------------------------------------------- 1 | use thermite::*; 2 | 3 | #[derive(Debug, Clone, Copy)] 4 | pub struct Vector3xN { 5 | pub 
x: Vf32, 6 | pub y: Vf32, 7 | pub z: Vf32, 8 | } 9 | 10 | impl Vector3xN { 11 | pub fn dot(&self, other: &Self) -> S::Vf32 { 12 | self.x.mul_add(other.x, self.y.mul_add(other.y, self.z * other.z)) 13 | } 14 | 15 | pub fn cross(&self, other: &Self) -> Self { 16 | Self { 17 | x: self.y.mul_sub(other.z, self.z * other.y), 18 | y: self.z.mul_sub(other.x, self.x * other.z), 19 | z: self.x.mul_sub(other.y, self.y * other.x), 20 | } 21 | } 22 | 23 | pub fn norm_squared(&self) -> S::Vf32 { 24 | self.dot(self) 25 | } 26 | 27 | pub fn norm(&self) -> S::Vf32 { 28 | self.norm_squared().sqrt() 29 | } 30 | 31 | pub fn normalize(&self) -> Self { 32 | let inv_norm = self.norm_squared().invsqrt_p::(); 33 | 34 | Self { 35 | x: self.x * inv_norm, 36 | y: self.y * inv_norm, 37 | z: self.z * inv_norm, 38 | } 39 | } 40 | } 41 | 42 | #[derive(Debug, Clone, Copy)] 43 | pub struct Matrix4xN { 44 | pub m: [[S::Vf32; 4]; 4], 45 | } 46 | 47 | impl Matrix4xN { 48 | pub fn at(&self, row: usize, col: usize) -> &S::Vf32 { 49 | &self.m[col][row] 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /crates/thermite/examples/plot.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use thermite::*; 4 | 5 | pub mod geo; 6 | 7 | use thermite::backends::avx2::AVX2; 8 | 9 | type Vf32 = ::Vf32; 10 | type Vf64 = ::Vf64; 11 | type Vi32 = ::Vi32; 12 | type Vu64 = ::Vu64; 13 | type Vu32 = ::Vu32; 14 | type Vi64 = ::Vi64; 15 | 16 | use plotly::common::{ColorScale, ColorScalePalette, DashType, Fill, Font, Line, LineShape, Marker, Mode, Title}; 17 | use plotly::layout::{Axis, BarMode, Layout, Legend, TicksDirection}; 18 | use plotly::{Bar, NamedColor, Plot, Rgb, Rgba, Scatter}; 19 | 20 | fn plot_function(name: &str, x_axis: &Vec, plot: &mut Plot, mut f: F) 21 | where 22 | F: FnMut(Vf32) -> Vf32, 23 | { 24 | let mut y_axis = vec![0.0; x_axis.len()]; 25 | 26 | for (src, dst) in x_axis 27 | .chunks(Vf32::NUM_ELEMENTS) 28 | .zip(y_axis.chunks_mut(Vf32::NUM_ELEMENTS)) 29 | { 30 | f(Vf32::load_unaligned(src)) 31 | //.clamp(Vf32::splat(-400.0), Vf32::splat(400.0)) 32 | .store_unaligned(dst); 33 | } 34 | 35 | plot.add_trace(Scatter::new(x_axis.clone(), y_axis).mode(Mode::Lines).name(name)); 36 | } 37 | 38 | fn main() { 39 | let num_points = Vf32::NUM_ELEMENTS * 1000; 40 | 41 | let x_axis: Vec = (0..num_points) 42 | .into_iter() 43 | .map(|x| (x as f32 / num_points as f32) * 30.0 - 15.0) 44 | .collect(); 45 | 46 | let layout = Layout::new().title(Title::new("Gamma function")); 47 | let mut plot = Plot::new(); 48 | 49 | //for i in 0..5 { 50 | // plot_function(&format!("Y{}", i), &x_axis, &mut plot, |x| { 51 | // x.bessel_y_p::(i) 52 | // }); 53 | //} 54 | 55 | //plot_function("cos(x) [Precision]", &x_axis, &mut plot, |x| { 56 | // x.cos_p::() 57 | //}); 58 | //plot_function("cos(x) [Reference]", &x_axis, &mut plot, |x| { 59 | // x.cos_p::() 60 | //}); 61 | // 62 | //plot_function("sin(x) [Precision]", &x_axis, &mut plot, |x| { 63 | // x.sin_p::() 64 | //}); 65 | //plot_function("sin(x) [Reference]", &x_axis, &mut plot, |x| { 66 | // x.sin_p::() 67 | //}); 68 | 69 | //plot_function("tgamma(x)", &x_axis, &mut plot, |x| x.tgamma()); 70 | //plot_function("lgamma(x)", &x_axis, &mut plot, |x| x.lgamma()); 71 | //plot_function("ln(tgamma(x))", &x_axis, &mut plot, |x| x.tgamma().ln()); 72 | //plot_function("diff*1000", &x_axis, &mut plot, |x| { 73 | // (x.tgamma().ln() - x.lgamma()) * Vf32::splat(1000.0) 74 | //}); 75 | 76 | 
//plot_function("digamma(x)", &x_axis, &mut plot, |x| x.digamma()); 77 | 78 | /* 79 | plot_function("Gamma Avg", &x_axis, &mut plot, |x| x.tgamma()); 80 | plot_function("Gamma Worst", &x_axis, &mut plot, |x| { 81 | x.tgamma_p::() 82 | }); 83 | 84 | plot_function("Diffx100", &x_axis, &mut plot, |x| { 85 | (x.tgamma() - x.tgamma_p::()) * Vf32::splat(100.0) 86 | }); 87 | */ 88 | 89 | plot_function("Ln Avg", &x_axis, &mut plot, |x| x.ln()); 90 | plot_function("Ln Worst", &x_axis, &mut plot, |x| { 91 | x.ln_p::() 92 | }); 93 | 94 | plot_function("Diffx100", &x_axis, &mut plot, |x| { 95 | (x.ln() - x.ln_p::()) * Vf32::splat(100.0) 96 | }); 97 | 98 | /* 99 | for i in 0..5 { 100 | plot_function(&format!("beta(x, {}) [UP]", i), &x_axis, &mut plot, |x| { 101 | x.beta_p::(Vf32::splat_as(i + 1)) 102 | }); 103 | } 104 | 105 | for i in 0..5 { 106 | plot_function(&format!("beta(x, {}) [Precision]", i), &x_axis, &mut plot, |x| { 107 | x.beta_p::(Vf32::splat_as(i + 1)) 108 | }); 109 | } 110 | */ 111 | 112 | plot.show(); 113 | } 114 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/aarch64/mod.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/arm/mod.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx1/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | use crate::arch::avx::*; 13 | 14 | use half::f16; 15 | 16 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 17 | pub struct AVX1; 18 | 19 | #[macro_use] 20 | pub(crate) mod polyfills; 21 | 22 | use polyfills::*; 23 | 24 | /* 25 | mod vf32; 26 | mod vf64; 27 | mod vi32; 28 | mod vi32_2; 29 | mod vi64; 30 | //mod vi64_2; 31 | mod vu32; 32 | mod vu64; 33 | 34 | pub use vf32::*; 35 | pub use vf64::*; 36 | pub use vi32::*; 37 | pub use vi64::*; 38 | pub use vu32::*; 39 | pub use vu64::*; 40 | 41 | type Vi32 = i32x8; 42 | type Vi64 = i64x8; 43 | type Vu32 = u32x8; 44 | type Vu64 = u64x8; 45 | type Vf32 = f32x8; 46 | type Vf64 = f64x8; 47 | 48 | impl Simd for AVX1 { 49 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::AVX; 50 | 51 | type Vi32 = Vi32; 52 | type Vi64 = Vi64; 53 | type Vu32 = Vu32; 54 | type Vu64 = Vu64; 55 | type Vf32 = Vf32; 56 | type Vf64 = Vf64; 57 | 58 | #[cfg(target_pointer_width = "32")] 59 | type Vusize = Vu32; 60 | 61 | #[cfg(target_pointer_width = "32")] 62 | type Visize = Vi32; 63 | 64 | #[cfg(target_pointer_width = "64")] 65 | type Vusize = Vu64; 66 | 67 | #[cfg(target_pointer_width = "64")] 68 | type Visize = Vi64; 69 | } 70 | */ 71 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx1/polyfills.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx1/vi32_2.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(i32x8: i32 => [__m128i; 2]); 4 | impl Default for i32x8 { 5 | 
#[inline(always)] 6 | fn default() -> Self { 7 | Self::new([unsafe { _mm_setzero_si128() }; 2]) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for i32x8 { 12 | type Element = i32; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(unsafe { [_mm_set1_epi32(value); 2] }) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new([_mm_undefined_si128(); 2]) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new([_mm_load_si128(src as *const _), _mm_load_si128(src.add(4) as *const _)]) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | let src = src as *const _; 32 | Self::new([_mm_loadu_si128(src), _mm_loadu_si128(src.add(1))]) 33 | } 34 | 35 | #[inline(always)] 36 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 37 | let dst = dst as *mut _; 38 | _mm_store_si128(dst, self.value[0]); 39 | _mm_store_si128(dst.add(1), self.value[1]); 40 | } 41 | 42 | #[inline(always)] 43 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 44 | let dst = dst as *mut _; 45 | _mm_storeu_si128(dst, self.value[0]); 46 | _mm_storeu_si128(dst.add(1), self.value[1]); 47 | } 48 | 49 | decl_base_common!(#[target_feature(enable = "avx,fma")] i32x8: i32 => __m256i); 50 | } 51 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx1/vi64_2.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(i64x8: i64 => [__m128i; 4]); 4 | impl Default for i64x8 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new([unsafe { _mm_setzero_si128() }; 4]) 8 | } 9 | } 10 | 11 | impl i64x8 { 12 | #[inline(always)] 13 | fn mapv(mut self, f: F) -> Self 14 | where 15 | F: Fn(__m128i, usize) -> __m128i, 16 | { 17 | for i in 0..4 { 18 | self.value[i] = f(self.value[i], i); 19 | } 20 | self 21 | } 22 | 23 | #[inline(always)] 24 | fn zipv(mut self, b: Self, f: F) -> Self 25 | where 26 | F: Fn(__m128i, __m128i) -> __m128i, 27 | { 28 | self.mapv(|a, i| f(a, b.value[i])) 29 | } 30 | } 31 | 32 | impl SimdVectorBase for i64x8 { 33 | type Element = i64; 34 | 35 | #[inline(always)] 36 | fn splat(value: Self::Element) -> Self { 37 | Self::new(unsafe { [_mm_set1_epi64x(value); 4] }) 38 | } 39 | 40 | #[inline(always)] 41 | unsafe fn undefined() -> Self { 42 | Self::new([_mm_undefined_si128(); 4]) 43 | } 44 | 45 | #[inline(always)] 46 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 47 | Self::undefined().mapv(|_, i| _mm_load_si128((src as *const __m128i).add(i))) 48 | } 49 | 50 | #[inline(always)] 51 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 52 | Self::undefined().mapv(|_, i| _mm_loadu_si128((src as *const __m128i).add(i))) 53 | } 54 | 55 | #[inline(always)] 56 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 57 | for i in 0..4 { 58 | _mm_store_si128((dst as *mut __m128i).add(i), self.value[i]); 59 | } 60 | } 61 | 62 | #[inline(always)] 63 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 64 | for i in 0..4 { 65 | _mm_storeu_si128((dst as *mut __m128i).add(i), self.value[i]); 66 | } 67 | } 68 | 69 | #[inline] 70 | #[target_feature(enable = "avx")] 71 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element { 72 | *transmute::<&_, *const Self::Element>(&self).add(index) 73 | } 74 | 75 | #[inline]
76 | #[target_feature(enable = "avx")] 77 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self { 78 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value; 79 | self 80 | } 81 | } 82 | 83 | impl SimdBitwise for i64x8 { 84 | fn and_not(self, other: Self) -> Self { 85 | self.zipv(other, |a, b| unsafe { _mm_andnot_si128(a, b) }) 86 | } 87 | 88 | const FULL_BITMASK: u16 = 0b1111_1111; 89 | 90 | #[inline(always)] 91 | fn bitmask(self) -> u16 { 92 | let mut bitmask = 0; 93 | for i in 0..4 { 94 | // shift mask by 2*i as each vector has 2 64-bit lanes 95 | bitmask |= unsafe { _mm_movemask_pd(_mm_castsi128_pd(self.value[i])) } << (2 * i); 96 | } 97 | bitmask as u16 98 | } 99 | 100 | #[inline(always)] 101 | unsafe fn _mm_not(self) -> Self { 102 | self ^ Self::splat(!0) 103 | } 104 | 105 | #[inline(always)] 106 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 107 | self.zipv(rhs, |a, b| _mm_and_si128(a, b)) 108 | } 109 | 110 | #[inline(always)] 111 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 112 | self.zipv(rhs, |a, b| _mm_or_si128(a, b)) 113 | } 114 | 115 | #[inline(always)] 116 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 117 | self.zipv(rhs, |a, b| _mm_xor_si128(a, b)) 118 | } 119 | 120 | #[inline(always)] 121 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 122 | Self::zip(self, count, |x, s| x >> s) 123 | } 124 | 125 | #[inline(always)] 126 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 127 | Self::zip(self, count, |x, s| x << s) 128 | } 129 | 130 | #[inline(always)] 131 | unsafe fn _mm_shli(self, count: u32) -> Self { 132 | let count = _mm_cvtsi32_si128(count as i32); 133 | self.mapv(|a, _| _mm_sll_epi64(a, count)) 134 | } 135 | 136 | #[inline(always)] 137 | unsafe fn _mm_shri(self, count: u32) -> Self { 138 | let count = _mm_cvtsi32_si128(count as i32); 139 | self.mapv(|a, _| _mm_srl_epi64(a, count)) 140 | } 141 | } 142 | 143 | impl PartialEq for i64x8 { 144 | fn eq(&self, other: &Self) -> bool { 145 | >::eq(*self, *other).all() 146 | } 147 | 148 | fn ne(&self, other: &Self) -> bool { 149 | >::ne(*self, *other).any() 150 | } 151 | } 152 | 153 | impl Eq for i64x8 {} 154 | 155 | impl SimdMask for i64x8 { 156 | #[inline(always)] 157 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 158 | self.mapv(|m, i| _mm_blendv_epi8(f.value[i], t.value[i], m)) 159 | } 160 | } 161 | 162 | impl SimdVector for i64x8 { 163 | #[inline(always)] 164 | fn zero() -> Self { 165 | Self::new(unsafe { [_mm_setzero_si128(); 4] }) 166 | } 167 | 168 | #[inline(always)] 169 | fn one() -> Self { 170 | Self::splat(1) 171 | } 172 | 173 | #[inline(always)] 174 | fn min_value() -> Self { 175 | Self::splat(i64::MIN) 176 | } 177 | 178 | #[inline(always)] 179 | fn max_value() -> Self { 180 | Self::splat(i64::MAX) 181 | } 182 | 183 | #[inline] 184 | fn min_element(self) -> Self::Element { 185 | unsafe { self.reduce2(|a, x| a.min(x)) } 186 | } 187 | 188 | #[inline] 189 | fn max_element(self) -> Self::Element { 190 | unsafe { self.reduce2(|a, x| a.max(x)) } 191 | } 192 | 193 | #[inline(always)] 194 | fn eq(self, other: Self) -> Mask { 195 | Mask::new(self.zipv(other, |a, b| unsafe { _mm_cmpeq_epi64(a, b) })) 196 | } 197 | 198 | #[inline(always)] 199 | fn gt(self, other: Self) -> Mask { 200 | Mask::new(self.zipv(other, |a, b| unsafe { _mm_cmpgt_epi64(a, b) })) 201 | } 202 | 203 | #[inline(always)] 204 | unsafe fn _mm_add(self, rhs: Self) -> Self { 205 | self.zipv(rhs, |l, r| _mm_add_epi64(l, r)) 206 | } 207 | 208 | #[inline(always)] 209 | unsafe fn _mm_sub(self, rhs: 
Self) -> Self { 210 | self.zipv(rhs, |l, r| _mm_sub_epi64(l, r)) 211 | } 212 | 213 | #[inline(always)] 214 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 215 | self.zipv(rhs, |l, r| _mm_mullo_epi64x(l, r)) 216 | } 217 | 218 | #[inline(always)] 219 | unsafe fn _mm_div(self, rhs: Self) -> Self { 220 | Self::zip(self, rhs, Div::div) 221 | } 222 | 223 | #[inline(always)] 224 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 225 | Self::zip(self, rhs, Rem::rem) 226 | } 227 | } 228 | 229 | impl SimdSignedVector for i64x8 { 230 | #[inline(always)] 231 | fn neg_one() -> Self { 232 | Self::splat(-1) 233 | } 234 | 235 | #[inline(always)] 236 | fn min_positive() -> Self { 237 | Self::splat(0) 238 | } 239 | 240 | #[inline(always)] 241 | fn abs(self) -> Self { 242 | self.mapv(|x, _| unsafe { _mm256_abs_epi64x(x) }) 243 | } 244 | 245 | #[inline(always)] 246 | unsafe fn _mm_neg(self) -> Self { 247 | (self ^ Self::neg_one()) + Self::one() 248 | } 249 | } 250 | 251 | impl_ops!(@UNARY i64x8 AVX1 => Not::not, Neg::neg); 252 | impl_ops!(@BINARY i64x8 AVX1 => BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 253 | impl_ops!(@BINARY i64x8 AVX1 => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem); 254 | impl_ops!(@SHIFTS i64x8 AVX1 => Shr::shr, Shl::shl); 255 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx2/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | use crate::arch::avx2::*; 13 | 14 | use half::f16; 15 | 16 | pub(crate) mod polyfills; 17 | 18 | use super::polyfills::*; 19 | use polyfills::*; 20 | 21 | mod vf32; 22 | mod vf64; 23 | //mod vi16; 24 | mod vi32; 25 | mod vi64; 26 | mod vu32; 27 | mod vu64; 28 | 29 | pub use vf32::*; 30 | pub use vf64::*; 31 | //pub use vi16::*; 32 | pub use vi32::*; 33 | pub use vi64::*; 34 | pub use vu32::*; 35 | pub use vu64::*; 36 | 37 | //type Vi16 = i16x8; 38 | type Vi32 = i32x8; 39 | type Vi64 = i64x8; 40 | type Vu32 = u32x8; 41 | type Vu64 = u64x8; 42 | type Vf32 = f32x8; 43 | type Vf64 = f64x8; 44 | 45 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 46 | pub struct AVX2; 47 | 48 | impl Simd for AVX2 { 49 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::AVX2; 50 | 51 | type Vi32 = Vi32; 52 | type Vi64 = Vi64; 53 | type Vu32 = Vu32; 54 | type Vu64 = Vu64; 55 | type Vf32 = Vf32; 56 | type Vf64 = Vf64; 57 | 58 | #[cfg(target_pointer_width = "32")] 59 | type Vusize = Vu32; 60 | 61 | #[cfg(target_pointer_width = "32")] 62 | type Visize = Vi32; 63 | 64 | #[cfg(target_pointer_width = "64")] 65 | type Vusize = Vu64; 66 | 67 | #[cfg(target_pointer_width = "64")] 68 | type Visize = Vi64; 69 | } 70 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx2/vi16.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(i16x8: i16 => __m128i); 4 | impl Default for i16x8 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(unsafe { _mm_setzero_si128() }) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for i16x8 { 12 | type Element = i16; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(unsafe { _mm_set1_epi16(value) }) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(_mm_undefined_si128()) 22 | } 23 | 24 | 
#[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(_mm_load_si128(src as *const _)) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(_mm_loadu_si128(src as *const _)) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | _mm_store_si128(dst as *mut _, self.value) 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | _mm_storeu_si128(dst as *mut _, self.value) 42 | } 43 | 44 | #[inline] 45 | #[target_feature(enable = "avx2")] 46 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element { 47 | *transmute::<&_, *const Self::Element>(&self).add(index) 48 | } 49 | 50 | #[inline] 51 | #[target_feature(enable = "avx2")] 52 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self { 53 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value; 54 | self 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/macros.rs: -------------------------------------------------------------------------------- 1 | macro_rules! impl_ops { 2 | (@UNARY $name:ident $is:ident => $($op_trait:ident::$op:ident),*) => {paste::paste! {$( 3 | impl $op_trait for $name<$is> { 4 | type Output = Self; 5 | #[inline(always)] fn $op(self) -> Self { unsafe { self. [<_mm_ $op>]() } } 6 | } 7 | )*}}; 8 | 9 | (@BINARY $name:ident $is:ident => $($op_trait:ident::$op:ident),*) => {paste::paste! {$( 10 | impl $op_trait for $name<$is> { 11 | type Output = Self; 12 | #[inline(always)] fn $op(self, rhs: Self) -> Self { unsafe { self. [<_mm_ $op>](rhs) } } 13 | } 14 | //impl $op_trait<>::Element> for $name<$is> { 15 | // type Output = Self; 16 | // #[inline(always)] fn $op(self, rhs: >::Element) -> Self { 17 | // $op_trait::$op(self, Self::splat(rhs)) 18 | // } 19 | //} 20 | //impl $op_trait<$name<$is>> for <$name<$is> as SimdVectorBase<$is>>::Element { 21 | // type Output = $name<$is>; 22 | // #[inline(always)] fn $op(self, rhs: $name<$is>) -> $name<$is> { 23 | // $op_trait::$op($name::<$is>::splat(self), rhs) 24 | // } 25 | //} 26 | 27 | impl [<$op_trait Assign>] for $name<$is> { 28 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: Self) { *self = $op_trait::$op(*self, rhs); } 29 | } 30 | impl [<$op_trait Assign>]<>::Element> for $name<$is> { 31 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: >::Element) { 32 | *self = $op_trait::$op(*self, Self::splat(rhs)); 33 | } 34 | } 35 | )*}}; 36 | 37 | (@SHIFTS $name:ident $is:ident => $($op_trait:ident::$op:ident),*) => {paste::paste! {$( 38 | impl $op_trait<<$is as Simd>::Vu32> for $name<$is> { 39 | type Output = Self; 40 | #[inline(always)] fn $op(self, rhs: <$is as Simd>::Vu32) -> Self { unsafe { self. [<_mm_ $op>](rhs) } } 41 | } 42 | impl $op_trait for $name<$is> { 43 | type Output = Self; 44 | #[inline(always)] fn $op(self, rhs: u32) -> Self { unsafe { self.[<_mm_ $op i>](rhs) } } 45 | } 46 | 47 | impl [<$op_trait Assign>]<<$is as Simd>::Vu32> for $name<$is> { 48 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: <$is as Simd>::Vu32) { *self = $op_trait::$op(*self, rhs); } 49 | } 50 | impl [<$op_trait Assign>] for $name<$is> { 51 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: u32) { *self = $op_trait::$op(*self, rhs); } 52 | } 53 | )*}}; 54 | } 55 | 56 | macro_rules! 
decl_base_common { 57 | (#[$meta:meta] $name:ident: $ety:ty => $ty:ty) => { 58 | #[inline] 59 | #[$meta] 60 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element { 61 | *transmute::<&_, *const Self::Element>(&self).add(index) 62 | } 63 | 64 | #[inline] 65 | #[$meta] 66 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self { 67 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value; 68 | self 69 | } 70 | 71 | #[inline] 72 | #[$meta] 73 | unsafe fn shuffle_unchecked(self, b: Self, indices: INDICES) -> Self { 74 | let mut dst = Self::undefined(); 75 | for i in 0..Self::NUM_ELEMENTS { 76 | let idx = *INDICES::INDICES.get_unchecked(i); 77 | dst = dst.replace_unchecked( 78 | i, 79 | if idx < Self::NUM_ELEMENTS { 80 | self.extract_unchecked(idx) 81 | } else { 82 | b.extract_unchecked(idx - Self::NUM_ELEMENTS) 83 | }, 84 | ); 85 | } 86 | dst 87 | } 88 | }; 89 | } 90 | 91 | macro_rules! decl { 92 | ($($name:ident: $ety:ty => $ty:ty),*) => {$( 93 | #[derive(Clone, Copy)] 94 | #[repr(transparent)] 95 | pub struct $name { 96 | pub(crate) value: $ty, 97 | _is: PhantomData, 98 | } 99 | 100 | impl $name { 101 | #[inline(always)] 102 | pub(crate) fn new(value: $ty) -> Self { 103 | Self { value, _is: PhantomData } 104 | } 105 | } 106 | 107 | impl $name where Self: SimdVectorBase { 108 | #[inline(always)] 109 | pub(crate) unsafe fn map(mut self, f: F) -> Self 110 | where F: Fn($ety) -> $ety { 111 | for i in 0..Self::NUM_ELEMENTS { 112 | let ptr = transmute::<&mut _, *mut $ety>(&mut self).add(i); 113 | *ptr = f(*ptr); 114 | } 115 | self 116 | } 117 | 118 | #[inline(always)] 119 | pub(crate) unsafe fn zip(a: Self, b: V, f: F) -> Self 120 | where F: Fn($ety, >::Element) -> $ety, 121 | Self: SimdVectorBase, 122 | V: SimdVectorBase { 123 | let mut out = Self::default(); 124 | for i in 0..Self::NUM_ELEMENTS { 125 | *transmute::<&mut _, *mut $ety>(&mut out).add(i) = 126 | f(a.extract_unchecked(i), b.extract_unchecked(i)); 127 | } 128 | out 129 | } 130 | 131 | #[inline(always)] 132 | pub(crate) unsafe fn reduce(self, mut init: $ety, f: F) -> $ety 133 | where F: Fn($ety, $ety) -> $ety { 134 | for i in 0..Self::NUM_ELEMENTS { 135 | init = f(init, self.extract_unchecked(i)); 136 | } 137 | init 138 | } 139 | 140 | #[inline(always)] 141 | pub(crate) unsafe fn reduce2(self, f: F) -> $ety 142 | where F: Fn($ety, $ety) -> $ety { 143 | let mut accum = self.extract_unchecked(0); 144 | for i in 1..Self::NUM_ELEMENTS { 145 | accum = f(accum, self.extract_unchecked(i)); 146 | } 147 | accum 148 | } 149 | } 150 | 151 | impl fmt::Debug for $name where Self: SimdVectorBase { 152 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 153 | let mut t = f.debug_tuple(stringify!($name)); 154 | for i in 0..Self::NUM_ELEMENTS { 155 | t.field(unsafe { &*transmute::<&_, *const $ety>(self).add(i) }); 156 | } 157 | t.finish() 158 | } 159 | } 160 | )*}; 161 | } 162 | 163 | macro_rules! decl_brute_force_convert { 164 | (#[$meta:meta] $from:ty => $to:ty) => { 165 | paste::paste! 
{ 166 | #[$meta] 167 | #[inline] 168 | unsafe fn do_convert(value: []) -> [] { 169 | let mut res = mem::MaybeUninit::uninit(); 170 | for i in 0..[]::NUM_ELEMENTS { 171 | *(res.as_mut_ptr() as *mut $to).add(i) = (*transmute::<&_, *const $from>(&value).add(i)) as $to; 172 | } 173 | res.assume_init() 174 | } 175 | } 176 | }; 177 | } 178 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/mod.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | mod macros; 3 | 4 | pub mod polyfills; 5 | 6 | //pub mod scalar; 7 | 8 | #[cfg(all(feature = "neon", target_arch = "aarch64"))] 9 | pub mod aarch64; 10 | #[cfg(all(feature = "neon", target_arch = "arm"))] 11 | pub mod arm; 12 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 13 | pub mod avx1; 14 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 15 | pub mod avx2; 16 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 17 | pub mod sse2; 18 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 19 | pub mod sse42; 20 | #[cfg(all(feature = "wasm32", target_arch = "wasm32"))] 21 | pub mod wasm32; 22 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/polyfills.rs: -------------------------------------------------------------------------------- 1 | #[inline(always)] 2 | pub const fn _mm_shuffle(w: i32, z: i32, y: i32, x: i32) -> i32 { 3 | (w << 6) | (z << 4) | (y << 2) | x 4 | } 5 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | mod polyfills; 13 | use polyfills::*; 14 | 15 | use half::f16; 16 | 17 | mod vf32; 18 | mod vf64; 19 | mod vi32; 20 | mod vi64; 21 | mod vu32; 22 | mod vu64; 23 | 24 | pub use vf32::*; 25 | pub use vf64::*; 26 | pub use vi32::*; 27 | pub use vi64::*; 28 | pub use vu32::*; 29 | pub use vu64::*; 30 | 31 | type Vu32 = u32x1; 32 | type Vf32 = f32x1; 33 | type Vf64 = f64x1; 34 | 35 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 36 | pub struct Scalar; 37 | 38 | impl Simd for Scalar { 39 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::Scalar; 40 | 41 | type Vu32 = Vu32; 42 | type Vf32 = Vf32; 43 | type Vf64 = Vf64; 44 | 45 | #[cfg(target_pointer_width = "32")] 46 | type Vusize = Vu32; 47 | 48 | //#[cfg(target_pointer_width = "32")] 49 | //type Visize = Vi32; 50 | 51 | /* 52 | type Vi32 = Vi32; 53 | type Vi64 = Vi64; 54 | 55 | type Vu64 = Vu64; 56 | 57 | #[cfg(target_pointer_width = "64")] 58 | type Vusize = Vu64; 59 | 60 | #[cfg(target_pointer_width = "64")] 61 | type Visize = Vi64; 62 | */ 63 | } 64 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/polyfills.rs: -------------------------------------------------------------------------------- 1 | #[inline(always)] 2 | pub fn bool_to_u32(value: bool) -> u32 { 3 | //if value { 0xFFFF_FFFF } else { 0 } 4 | -(value as i32) as u32 5 | } 6 | 7 | #[inline(always)] 8 | pub fn bool_to_u64(value: bool) -> u32 { 9 | //if value { 0xFFFF_FFFF_FFFF_FFFF } else { 0 } 10 | -(value as i64) as u64 11 | } 12 | -------------------------------------------------------------------------------- 
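The scalar backend's mask polyfills above turn a `bool` into an all-zeros or all-ones lane mask by negating it and sign-extending. A minimal standalone sketch of that trick, for illustration only (the `mask32`/`mask64` names are hypothetical, not part of the crate):

fn mask32(b: bool) -> u32 {
    // -(1i32) == -1, whose two's-complement bit pattern is all ones
    -(b as i32) as u32
}

fn mask64(b: bool) -> u64 {
    -(b as i64) as u64
}

fn main() {
    assert_eq!(mask32(true), 0xFFFF_FFFF);
    assert_eq!(mask32(false), 0);
    assert_eq!(mask64(true), u64::MAX);
}
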
/crates/thermite/src/backends/scalar/vf32.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(f32x1: f32 => f32); 4 | impl Default for f32x1 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(0.0) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for f32x1 { 12 | type Element = f32; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(value) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(0.0) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(*src) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(src.read_unaligned()) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | *dst = self.value; 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | dst.write_unaligned(self.value) 42 | } 43 | 44 | decl_base_common!(#[target_feature()] f32x1: f32 => f32); 45 | } 46 | 47 | impl SimdBitwise for f32x1 { 48 | const FULL_BITMASK: u16 = 1; 49 | 50 | #[inline(always)] 51 | fn bitmask(self) -> u16 { 52 | self.into_bits().bitmask() 53 | } 54 | 55 | #[inline(always)] 56 | unsafe fn _mm_not(self) -> Self { 57 | self ^ Self::splat(f32::from_bits(!0)) 58 | } 59 | 60 | #[inline(always)] 61 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 62 | Self::new(f32::from_bits(self.value.to_bits() & rhs.value.to_bits())) 63 | } 64 | 65 | #[inline(always)] 66 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 67 | Self::new(f32::from_bits(self.value.to_bits() | rhs.value.to_bits())) 68 | } 69 | 70 | #[inline(always)] 71 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 72 | Self::new(f32::from_bits(self.value.to_bits() ^ rhs.value.to_bits())) 73 | } 74 | 75 | #[inline(always)] 76 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 77 | Self::new(f32::from_bits(self.value.to_bits() << count.value)) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 82 | Self::new(f32::from_bits(self.value.to_bits() >> count.value)) 83 | } 84 | 85 | #[inline(always)] 86 | unsafe fn _mm_shli(self, count: u32) -> Self { 87 | Self::new(f32::from_bits(self.value.to_bits() << count)) 88 | } 89 | 90 | #[inline(always)] 91 | unsafe fn _mm_shri(self, count: u32) -> Self { 92 | Self::new(f32::from_bits(self.value.to_bits() >> count)) 93 | } 94 | } 95 | 96 | impl PartialEq for f32x1 { 97 | #[inline(always)] 98 | fn eq(&self, other: &Self) -> bool { 99 | self.value == other.value 100 | } 101 | } 102 | 103 | impl SimdMask for f32x1 { 104 | #[inline(always)] 105 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 106 | if self.value.to_bits() != 0 { 107 | t 108 | } else { 109 | f 110 | } 111 | } 112 | } 113 | 114 | impl SimdVector for f32x1 { 115 | fn zero() -> Self { 116 | Self::splat(0.0) 117 | } 118 | 119 | fn one() -> Self { 120 | Self::splat(1.0) 121 | } 122 | 123 | fn indexed() -> Self { 124 | Self::splat(0.0) 125 | } 126 | 127 | #[inline(always)] 128 | fn min_value() -> Self { 129 | Self::splat(f32::MIN) 130 | } 131 | 132 | #[inline(always)] 133 | fn max_value() -> Self { 134 | Self::splat(f32::MAX) 135 | } 136 | 137 | #[inline(always)] 138 | fn min(self, other: Self) -> Self { 139 | Self::new(self.value.min(other.value)) 140 | } 141 | 142 | #[inline(always)] 143 | fn max(self, other: Self) -> Self { 
144 | Self::new(self.value.max(other.value)) 145 | } 146 | 147 | #[inline(always)] 148 | fn min_element(self) -> Self::Element { 149 | self.value 150 | } 151 | 152 | #[inline(always)] 153 | fn max_element(self) -> Self::Element { 154 | self.value 155 | } 156 | 157 | #[inline(always)] 158 | fn eq(self, other: Self) -> Mask { 159 | Self::new(f32::from_bits(bool_to_u32(self.value == other.value))) 160 | } 161 | 162 | #[inline(always)] 163 | fn lt(self, other: Self) -> Mask { 164 | Self::new(f32::from_bits(bool_to_u32(self.value < other.value))) 165 | } 166 | 167 | #[inline(always)] 168 | fn le(self, other: Self) -> Mask { 169 | Self::new(f32::from_bits(bool_to_u32(self.value <= other.value))) 170 | } 171 | 172 | #[inline(always)] 173 | fn gt(self, other: Self) -> Mask { 174 | Self::new(f32::from_bits(bool_to_u32(self.value > other.value))) 175 | } 176 | 177 | #[inline(always)] 178 | fn ge(self, other: Self) -> Mask { 179 | Self::new(f32::from_bits(bool_to_u32(self.value >= other.value))) 180 | } 181 | 182 | #[inline(always)] 183 | unsafe fn _mm_add(self, rhs: Self) -> Self { 184 | Self::new(Add::add(self.value, rhs.value)) 185 | } 186 | 187 | #[inline(always)] 188 | unsafe fn _mm_sub(self, rhs: Self) -> Self { 189 | Self::new(Sub::sub(self.value, rhs.value)) 190 | } 191 | 192 | #[inline(always)] 193 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 194 | Self::new(Mul::mul(self.value, rhs.value)) 195 | } 196 | 197 | #[inline(always)] 198 | unsafe fn _mm_div(self, rhs: Self) -> Self { 199 | Self::new(Div::div(self.value, rhs.value)) 200 | } 201 | 202 | #[inline(always)] 203 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 204 | Self::new(Rem::rem(self.value, rhs.value)) 205 | } 206 | } 207 | 208 | impl SimdIntoBits for f32x1 { 209 | fn into_bits(self) -> Vu32 { 210 | u32x1::new(self.value.to_bits()) 211 | } 212 | } 213 | 214 | impl SimdFromBits for f32x1 { 215 | fn from_bits(bits: Vu32) -> Self { 216 | Self::new(f32::from_bits(bits.value)) 217 | } 218 | } 219 | 220 | impl_ops!(@UNARY f32x1 Scalar => Not::not, Neg::neg); 221 | impl_ops!(@BINARY f32x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 222 | impl_ops!(@SHIFTS f32x1 Scalar => Shr::shr, Shl::shl); 223 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vf64.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(f64x1: f64 => f64); 4 | impl Default for f64x1 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(0.0) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for f64x1 { 12 | type Element = f64; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(value) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(0.0) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(*src) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(src.read_unaligned()) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | *dst = self.value; 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | dst.write_unaligned(self.value) 42 | } 43 | 44 | decl_base_common!(#[target_feature()] f64x1: f64 => f64); 45 | } 46 | 47 | impl SimdBitwise for f64x1 { 
48 | const FULL_BITMASK: u16 = 1; 49 | 50 | #[inline(always)] 51 | fn bitmask(self) -> u16 { 52 | self.into_bits().bitmask() 53 | } 54 | 55 | #[inline(always)] 56 | unsafe fn _mm_not(self) -> Self { 57 | self ^ Self::splat(f64::from_bits(!0)) 58 | } 59 | 60 | #[inline(always)] 61 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 62 | Self::new(f64::from_bits(self.value.to_bits() & rhs.value.to_bits())) 63 | } 64 | 65 | #[inline(always)] 66 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 67 | Self::new(f64::from_bits(self.value.to_bits() | rhs.value.to_bits())) 68 | } 69 | 70 | #[inline(always)] 71 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 72 | Self::new(f64::from_bits(self.value.to_bits() ^ rhs.value.to_bits())) 73 | } 74 | 75 | #[inline(always)] 76 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 77 | Self::new(f64::from_bits(self.value.to_bits() << count.value)) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 82 | Self::new(f64::from_bits(self.value.to_bits() >> count.value)) 83 | } 84 | 85 | #[inline(always)] 86 | unsafe fn _mm_shli(self, count: u32) -> Self { 87 | Self::new(f64::from_bits(self.value.to_bits() << count)) 88 | } 89 | 90 | #[inline(always)] 91 | unsafe fn _mm_shri(self, count: u32) -> Self { 92 | Self::new(f64::from_bits(self.value.to_bits() >> count)) 93 | } 94 | } 95 | 96 | impl PartialEq for f64x1 { 97 | #[inline(always)] 98 | fn eq(&self, other: &Self) -> bool { 99 | self.value == other.value 100 | } 101 | } 102 | 103 | impl SimdMask for f64x1 { 104 | #[inline(always)] 105 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 106 | if self.value.to_bits() != 0 { 107 | t 108 | } else { 109 | f 110 | } 111 | } 112 | } 113 | 114 | impl SimdVector for f64x1 { 115 | fn zero() -> Self { 116 | Self::splat(0.0) 117 | } 118 | 119 | fn one() -> Self { 120 | Self::splat(1.0) 121 | } 122 | 123 | fn indexed() -> Self { 124 | Self::splat(0.0) 125 | } 126 | 127 | #[inline(always)] 128 | fn min_value() -> Self { 129 | Self::splat(f64::MIN) 130 | } 131 | 132 | #[inline(always)] 133 | fn max_value() -> Self { 134 | Self::splat(f64::MAX) 135 | } 136 | 137 | #[inline(always)] 138 | fn min(self, other: Self) -> Self { 139 | Self::new(self.value.min(other.value)) 140 | } 141 | 142 | #[inline(always)] 143 | fn max(self, other: Self) -> Self { 144 | Self::new(self.value.max(other.value)) 145 | } 146 | 147 | #[inline(always)] 148 | fn min_element(self) -> Self::Element { 149 | self.value 150 | } 151 | 152 | #[inline(always)] 153 | fn max_element(self) -> Self::Element { 154 | self.value 155 | } 156 | 157 | #[inline(always)] 158 | fn eq(self, other: Self) -> Mask { 159 | Self::new(f64::from_bits(bool_to_u32(self.value == other.value))) 160 | } 161 | 162 | #[inline(always)] 163 | fn lt(self, other: Self) -> Mask { 164 | Self::new(f64::from_bits(bool_to_u32(self.value < other.value))) 165 | } 166 | 167 | #[inline(always)] 168 | fn le(self, other: Self) -> Mask { 169 | Self::new(f64::from_bits(bool_to_u32(self.value <= other.value))) 170 | } 171 | 172 | #[inline(always)] 173 | fn gt(self, other: Self) -> Mask { 174 | Self::new(f64::from_bits(bool_to_u32(self.value > other.value))) 175 | } 176 | 177 | #[inline(always)] 178 | fn ge(self, other: Self) -> Mask { 179 | Self::new(f64::from_bits(bool_to_u32(self.value >= other.value))) 180 | } 181 | 182 | #[inline(always)] 183 | unsafe fn _mm_add(self, rhs: Self) -> Self { 184 | Self::new(Add::add(self.value, rhs.value)) 185 | } 186 | 187 | #[inline(always)] 188 | unsafe fn _mm_sub(self, rhs: Self) -> 
Self { 189 | Self::new(Sub::sub(self.value, rhs.value)) 190 | } 191 | 192 | #[inline(always)] 193 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 194 | Self::new(Mul::mul(self.value, rhs.value)) 195 | } 196 | 197 | #[inline(always)] 198 | unsafe fn _mm_div(self, rhs: Self) -> Self { 199 | Self::new(Div::div(self.value, rhs.value)) 200 | } 201 | 202 | #[inline(always)] 203 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 204 | Self::new(Rem::rem(self.value, rhs.value)) 205 | } 206 | } 207 | 208 | impl SimdIntoBits for f64x1 { 209 | fn into_bits(self) -> Vu32 { 210 | u32x1::new(self.value.to_bits()) 211 | } 212 | } 213 | 214 | impl SimdFromBits for f64x1 { 215 | fn from_bits(bits: Vu32) -> Self { 216 | Self::new(f64::from_bits(bits.value)) 217 | } 218 | } 219 | 220 | impl_ops!(@UNARY f64x1 Scalar => Not::not, Neg::neg); 221 | impl_ops!(@BINARY f64x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 222 | impl_ops!(@SHIFTS f64x1 Scalar => Shr::shr, Shl::shl); 223 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vi32.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raygon-renderer/thermite/55cbee2b3cb127e09749e711501114bbef3ed118/crates/thermite/src/backends/scalar/vi32.rs -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vi64.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raygon-renderer/thermite/55cbee2b3cb127e09749e711501114bbef3ed118/crates/thermite/src/backends/scalar/vi64.rs -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vu32.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(u32x1: u32 => u32); 4 | impl Default for u32x1 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(0) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for u32x1 { 12 | type Element = u32; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(value) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(0.0) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(*src) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(src.read_unaligned()) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | *dst = self.value; 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | dst.write_unaligned(self.value) 42 | } 43 | 44 | decl_base_common!(#[target_feature()] u32x1: u32 => u32); 45 | } 46 | 47 | impl SimdBitwise for u32x1 { 48 | const FULL_BITMASK: u16 = 1; 49 | 50 | fn bitmask(self) -> u16 { 51 | (self.value >> 31) as u16 52 | } 53 | 54 | unsafe fn _mm_not(self) -> Self { 55 | Self::new(!self.value) 56 | } 57 | 58 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 59 | Self::new(self.value & rhs.value) 60 | } 61 | 62 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 63 | Self::new(self.value | rhs.value) 64 | } 65 | 66 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 67 | Self::new(self.value ^ rhs.value) 68 | } 69 | 70 | 
#[inline(always)] 71 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 72 | Self::new(self.value << count.value) 73 | } 74 | 75 | #[inline(always)] 76 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 77 | Self::new(self.value >> count.value) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn _mm_shli(self, count: u32) -> Self { 82 | Self::new(self.value << count) 83 | } 84 | 85 | #[inline(always)] 86 | unsafe fn _mm_shri(self, count: u32) -> Self { 87 | Self::new(self.value >> count) 88 | } 89 | } 90 | 91 | impl PartialEq for u32x1 { 92 | #[inline(always)] 93 | fn eq(&self, other: &Self) -> bool { 94 | self.value == other.value 95 | } 96 | } 97 | 98 | impl Eq for u32x1 {} 99 | 100 | impl SimdMask for u32x1 { 101 | #[inline(always)] 102 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 103 | if self.value != 0 { 104 | t 105 | } else { 106 | f 107 | } 108 | } 109 | 110 | #[inline(always)] 111 | unsafe fn _mm_all(self) -> bool { 112 | self._mm_any() // only one value 113 | } 114 | 115 | #[inline(always)] 116 | unsafe fn _mm_any(self) -> bool { 117 | self.value != 0 118 | } 119 | 120 | #[inline(always)] 121 | unsafe fn _mm_none(self) -> bool { 122 | self.value == 0 123 | } 124 | } 125 | 126 | impl SimdVector for u32x1 { 127 | fn zero() -> Self { 128 | Self::new(0) 129 | } 130 | 131 | fn one() -> Self { 132 | Self::new(1) 133 | } 134 | 135 | fn indexed() -> Self { 136 | Self::new(0) 137 | } 138 | 139 | #[inline(always)] 140 | fn min_value() -> Self { 141 | Self::splat(u32::MIN) 142 | } 143 | 144 | #[inline(always)] 145 | fn max_value() -> Self { 146 | Self::splat(u32::MAX) 147 | } 148 | 149 | #[inline(always)] 150 | fn min_element(self) -> Self::Element { 151 | self.value 152 | } 153 | 154 | #[inline(always)] 155 | fn max_element(self) -> Self::Element { 156 | self.value 157 | } 158 | 159 | #[inline(always)] 160 | fn eq(self, other: Self) -> Mask { 161 | Self::new(bool_to_u32(self.value == other.value)) 162 | } 163 | 164 | #[inline(always)] 165 | fn lt(self, other: Self) -> Mask { 166 | Self::new(bool_to_u32(self.value < other.value)) 167 | } 168 | 169 | #[inline(always)] 170 | fn le(self, other: Self) -> Mask { 171 | Self::new(bool_to_u32(self.value <= other.value)) 172 | } 173 | 174 | #[inline(always)] 175 | fn gt(self, other: Self) -> Mask { 176 | Self::new(bool_to_u32(self.value > other.value)) 177 | } 178 | 179 | #[inline(always)] 180 | fn ge(self, other: Self) -> Mask { 181 | Self::new(bool_to_u32(self.value >= other.value)) 182 | } 183 | 184 | #[inline(always)] 185 | unsafe fn _mm_add(self, rhs: Self) -> Self { 186 | Self::new(Add::add(self.value, rhs.value)) 187 | } 188 | 189 | #[inline(always)] 190 | unsafe fn _mm_sub(self, rhs: Self) -> Self { 191 | Self::new(Sub::sub(self.value, rhs.value)) 192 | } 193 | 194 | #[inline(always)] 195 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 196 | Self::new(Mul::mul(self.value, rhs.value)) 197 | } 198 | 199 | #[inline(always)] 200 | unsafe fn _mm_div(self, rhs: Self) -> Self { 201 | Self::new(Div::div(self.value, rhs.value)) 202 | } 203 | 204 | #[inline(always)] 205 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 206 | Self::new(Rem::rem(self.value, rhs.value)) 207 | } 208 | } 209 | 210 | impl SimdIntVector for u32x1 { 211 | fn saturating_add(self, rhs: Self) -> Self { 212 | Self::new(self.value.saturating_add(rhs.value)) 213 | } 214 | 215 | fn saturating_sub(self, rhs: Self) -> Self { 216 | Self::new(self.value.saturating_add(rhs.value)) 217 | } 218 | 219 | fn wrapping_sum(self) -> Self::Element { 220 | self.value 221 | } 222 | 223 | fn 
wrapping_product(self) -> Self::Element { 224 | self.value 225 | } 226 | 227 | fn rolv(self, cnt: Vu32) -> Self { 228 | Self::new(self.value.rotate_left(cnt.value)) 229 | } 230 | 231 | fn rorv(self, cnt: Vu32) -> Self { 232 | Self::new(self.value.rotate_right(cnt.value)) 233 | } 234 | 235 | fn reverse_bits(self) -> Self { 236 | Self::new(self.value.reverse_bits()) 237 | } 238 | 239 | fn count_ones(self) -> Self { 240 | Self::new(self.value.count_ones()) 241 | } 242 | 243 | fn count_zeros(self) -> Self { 244 | Self::new(self.value.count_zeros()) 245 | } 246 | 247 | fn leading_ones(self) -> Self { 248 | Self::new(self.value.leading_ones()) 249 | } 250 | 251 | fn leading_zeros(self) -> Self { 252 | Self::new(self.value.leading_zeros()) 253 | } 254 | } 255 | 256 | impl SimdUnsignedIntVector for u32x1 { 257 | #[inline(always)] 258 | fn next_power_of_two_m1(mut self) -> Self { 259 | self |= (self >> 1); 260 | self |= (self >> 2); 261 | self |= (self >> 4); 262 | self |= (self >> 8); 263 | self |= (self >> 16); 264 | self 265 | } 266 | } 267 | 268 | impl_ops!(@UNARY u32x1 Scalar => Not::not); 269 | impl_ops!(@BINARY u32x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 270 | impl_ops!(@SHIFTS u32x1 Scalar => Shr::shr, Shl::shl); 271 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vu64.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(u64x1: u64 => u64); 4 | impl Default for u64x1 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(0) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for u64x1 { 12 | type Element = u64; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(value) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(0.0) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(*src) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(src.read_unaligned()) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | *dst = self.value; 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | dst.write_unaligned(self.value) 42 | } 43 | 44 | decl_base_common!(#[target_feature()] u64x1: u64 => u64); 45 | } 46 | 47 | impl SimdBitwise for u64x1 { 48 | const FULL_BITMASK: u16 = 1; 49 | 50 | fn bitmask(self) -> u16 { 51 | (self.value >> 63) as u16 52 | } 53 | 54 | unsafe fn _mm_not(self) -> Self { 55 | Self::new(!self.value) 56 | } 57 | 58 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 59 | Self::new(self.value & rhs.value) 60 | } 61 | 62 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 63 | Self::new(self.value | rhs.value) 64 | } 65 | 66 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 67 | Self::new(self.value ^ rhs.value) 68 | } 69 | 70 | #[inline(always)] 71 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 72 | Self::new(self.value << count.value) 73 | } 74 | 75 | #[inline(always)] 76 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 77 | Self::new(self.value >> count.value) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn _mm_shli(self, count: u32) -> Self { 82 | Self::new(self.value << count) 83 | } 84 | 85 | #[inline(always)] 86 | unsafe fn _mm_shri(self, count: 
u32) -> Self { 87 | Self::new(self.value >> count) 88 | } 89 | } 90 | 91 | impl PartialEq for u64x1 { 92 | #[inline(always)] 93 | fn eq(&self, other: &Self) -> bool { 94 | self.value == other.value 95 | } 96 | } 97 | 98 | impl Eq for u64x1 {} 99 | 100 | impl SimdMask for u64x1 { 101 | #[inline(always)] 102 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 103 | if self.value != 0 { 104 | t 105 | } else { 106 | f 107 | } 108 | } 109 | 110 | #[inline(always)] 111 | unsafe fn _mm_all(self) -> bool { 112 | self._mm_any() // only one value 113 | } 114 | 115 | #[inline(always)] 116 | unsafe fn _mm_any(self) -> bool { 117 | self.value != 0 118 | } 119 | 120 | #[inline(always)] 121 | unsafe fn _mm_none(self) -> bool { 122 | self.value == 0 123 | } 124 | } 125 | 126 | impl SimdVector for u64x1 { 127 | fn zero() -> Self { 128 | Self::new(0) 129 | } 130 | 131 | fn one() -> Self { 132 | Self::new(1) 133 | } 134 | 135 | fn indexed() -> Self { 136 | Self::new(0) 137 | } 138 | 139 | #[inline(always)] 140 | fn min_value() -> Self { 141 | Self::splat(u64::MIN) 142 | } 143 | 144 | #[inline(always)] 145 | fn max_value() -> Self { 146 | Self::splat(u64::MAX) 147 | } 148 | 149 | #[inline(always)] 150 | fn min_element(self) -> Self::Element { 151 | self.value 152 | } 153 | 154 | #[inline(always)] 155 | fn max_element(self) -> Self::Element { 156 | self.value 157 | } 158 | 159 | #[inline(always)] 160 | fn eq(self, other: Self) -> Mask { 161 | Self::new(bool_to_u64(self.value == other.value)) 162 | } 163 | 164 | #[inline(always)] 165 | fn lt(self, other: Self) -> Mask { 166 | Self::new(bool_to_u64(self.value < other.value)) 167 | } 168 | 169 | #[inline(always)] 170 | fn le(self, other: Self) -> Mask { 171 | Self::new(bool_to_u64(self.value <= other.value)) 172 | } 173 | 174 | #[inline(always)] 175 | fn gt(self, other: Self) -> Mask { 176 | Self::new(bool_to_u64(self.value > other.value)) 177 | } 178 | 179 | #[inline(always)] 180 | fn ge(self, other: Self) -> Mask { 181 | Self::new(bool_to_u64(self.value >= other.value)) 182 | } 183 | 184 | #[inline(always)] 185 | unsafe fn _mm_add(self, rhs: Self) -> Self { 186 | Self::new(Add::add(self.value, rhs.value)) 187 | } 188 | 189 | #[inline(always)] 190 | unsafe fn _mm_sub(self, rhs: Self) -> Self { 191 | Self::new(Sub::sub(self.value, rhs.value)) 192 | } 193 | 194 | #[inline(always)] 195 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 196 | Self::new(Mul::mul(self.value, rhs.value)) 197 | } 198 | 199 | #[inline(always)] 200 | unsafe fn _mm_div(self, rhs: Self) -> Self { 201 | Self::new(Div::div(self.value, rhs.value)) 202 | } 203 | 204 | #[inline(always)] 205 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 206 | Self::new(Rem::rem(self.value, rhs.value)) 207 | } 208 | } 209 | 210 | impl SimdIntVector for u64x1 { 211 | fn saturating_add(self, rhs: Self) -> Self { 212 | Self::new(self.value.saturating_add(rhs.value)) 213 | } 214 | 215 | fn saturating_sub(self, rhs: Self) -> Self { 216 | Self::new(self.value.saturating_add(rhs.value)) 217 | } 218 | 219 | fn wrapping_sum(self) -> Self::Element { 220 | self.value 221 | } 222 | 223 | fn wrapping_product(self) -> Self::Element { 224 | self.value 225 | } 226 | 227 | fn rolv(self, cnt: Vu32) -> Self { 228 | Self::new(self.value.rotate_left(cnt.value)) 229 | } 230 | 231 | fn rorv(self, cnt: Vu32) -> Self { 232 | Self::new(self.value.rotate_right(cnt.value)) 233 | } 234 | 235 | fn reverse_bits(self) -> Self { 236 | Self::new(self.value.reverse_bits()) 237 | } 238 | 239 | fn count_ones(self) -> Self { 240 | 
Self::new(self.value.count_ones()) 241 | } 242 | 243 | fn count_zeros(self) -> Self { 244 | Self::new(self.value.count_zeros()) 245 | } 246 | 247 | fn leading_ones(self) -> Self { 248 | Self::new(self.value.leading_ones()) 249 | } 250 | 251 | fn leading_zeros(self) -> Self { 252 | Self::new(self.value.leading_zeros()) 253 | } 254 | } 255 | 256 | impl SimdUnsignedIntVector for u64x1 { 257 | #[inline(always)] 258 | fn next_power_of_two_m1(mut self) -> Self { 259 | self |= (self >> 1); 260 | self |= (self >> 2); 261 | self |= (self >> 4); 262 | self |= (self >> 8); 263 | self |= (self >> 16); 264 | self |= (self >> 32); 265 | self 266 | } 267 | } 268 | 269 | impl_ops!(@UNARY u64x1 Scalar => Not::not); 270 | impl_ops!(@BINARY u64x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 271 | impl_ops!(@SHIFTS u64x1 Scalar => Shr::shr, Shl::shl); 272 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse2/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | use crate::arch::sse2::*; 13 | 14 | use half::f16; 15 | 16 | pub(crate) mod polyfills; 17 | 18 | use super::polyfills::*; 19 | use polyfills::*; 20 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse2/polyfills.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[inline(always)] 4 | pub unsafe fn _mm_blendv_epi8x(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> __m128i { 5 | _mm_or_si128(_mm_and_si128(mask, xmm0), _mm_andnot_si128(mask, xmm1)) 6 | } 7 | 8 | #[inline(always)] 9 | pub unsafe fn _mm_signbits_epi32x(v: __m128i) -> __m128i { 10 | _mm_srai_epi32(v, 31) 11 | } 12 | 13 | #[inline(always)] 14 | pub unsafe fn _mm_signbits_epi64x(v: __m128i) -> __m128i { 15 | _mm_srai_epi32(_mm_shuffle_epi32(v, _mm_shuffle(3, 3, 1, 1)), 31) 16 | } 17 | 18 | #[inline(always)] 19 | pub unsafe fn _mm_cmpeq_epi64x(a: __m128i, b: __m128i) -> __m128i { 20 | let t = _mm_cmpeq_epi32(a, b); 21 | _mm_and_si128(t, _mm_shuffle_epi32(t, 177)) 22 | } 23 | 24 | #[inline(always)] 25 | pub unsafe fn _mm_mullo_epi64x(xmm0: __m128i, xmm1: __m128i) -> __m128i { 26 | let xmm2 = _mm_srli_epi64(xmm1, 32); 27 | let xmm3 = _mm_srli_epi64(xmm0, 32); 28 | 29 | let xmm2 = _mm_mul_epu32(xmm2, xmm0); 30 | let xmm3 = _mm_mul_epu32(xmm1, xmm3); 31 | 32 | let xmm2 = _mm_add_epi64(xmm3, xmm2); 33 | let xmm2 = _mm_slli_epi64(xmm2, 32); 34 | 35 | let xmm0 = _mm_mul_epu32(xmm1, xmm0); 36 | let xmm0 = _mm_add_epi64(xmm0, xmm2); 37 | 38 | xmm0 39 | } 40 | 41 | // SSE2 Version 42 | #[inline(always)] 43 | pub unsafe fn _mm_adds_epi32x(lhs: __m128i, rhs: __m128i) -> __m128i { 44 | let res = _mm_add_epi32(lhs, rhs); 45 | 46 | _mm_blendv_epi8x( 47 | res, 48 | _mm_blendv_epi8x( 49 | _mm_set1_epi32(i32::MIN), 50 | _mm_set1_epi32(i32::MAX), 51 | _mm_signbits_epi32x(res), 52 | ), 53 | _mm_xor_si128(rhs, _mm_cmpgt_epi32(lhs, res)), 54 | ) 55 | } 56 | 57 | // SSE2 Version 58 | #[inline(always)] 59 | pub unsafe fn _mm_subs_epi32x(lhs: __m128i, rhs: __m128i) -> __m128i { 60 | let res = _mm_sub_epi32(lhs, rhs); 61 | 62 | _mm_blendv_epi8x( 63 | res, 64 | _mm_blendv_epi8x( 65 | _mm_set1_epi32(i32::MIN), 66 | _mm_set1_epi32(i32::MAX), 67 | _mm_signbits_epi32x(res), 68 | ), 69 
| _mm_xor_si128(_mm_cmpgt_epi32(rhs, _mm_setzero_si128()), _mm_cmpgt_epi32(lhs, res)), 70 | ) 71 | } 72 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | use crate::arch::sse42::*; 13 | 14 | use half::f16; 15 | 16 | pub(crate) mod polyfills; 17 | 18 | use super::polyfills::*; 19 | use polyfills::*; 20 | 21 | mod vf32; 22 | 23 | /* 24 | //mod vf32; 25 | //mod vf64; 26 | //mod vi16; 27 | mod vi32; 28 | //mod vi64; 29 | mod vu32; 30 | //mod vu64; 31 | 32 | use vi32::i32x4; 33 | use vu32::u32x4; 34 | 35 | pub type Vi32 = i32x4; 36 | pub type Vu32 = u32x4; 37 | 38 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 39 | pub struct SSE42; 40 | 41 | impl Simd for SSE42 { 42 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::SSE42; 43 | 44 | type Vi32 = i32x4; 45 | type Vu32 = u32x4; 46 | } 47 | */ 48 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/polyfills.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | use crate::backends::sse2::polyfills::*; 4 | 5 | #[inline(always)] 6 | pub unsafe fn _mm_blendv_epi32x(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> __m128i { 7 | _mm_castps_si128(_mm_blendv_ps( 8 | _mm_castsi128_ps(xmm0), 9 | _mm_castsi128_ps(xmm1), 10 | _mm_castsi128_ps(mask), 11 | )) 12 | } 13 | 14 | #[inline(always)] 15 | pub unsafe fn _mm_blendv_epi64x(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> __m128i { 16 | _mm_castpd_si128(_mm_blendv_pd( 17 | _mm_castsi128_pd(xmm0), 18 | _mm_castsi128_pd(xmm1), 19 | _mm_castsi128_pd(mask), 20 | )) 21 | } 22 | 23 | #[inline(always)] 24 | pub unsafe fn _mm_cvtepu32_psx(x: __m128i) -> __m128 { 25 | let xmm0 = x; 26 | let xmm1 = _mm_set1_epi32(0x4B000000u32 as i32); 27 | let xmm1 = _mm_blend_epi16(xmm0, xmm1, 170); 28 | let xmm0 = _mm_srli_epi32(xmm0, 16); 29 | let xmm2 = _mm_set1_epi32(0x53000000u32 as i32); 30 | let xmm0 = _mm_castsi128_ps(_mm_blend_epi16(xmm0, xmm2, 170)); 31 | let xmm2 = _mm_set1_ps(f32::from_bits(0x53000080)); 32 | let xmm0 = _mm_sub_ps(xmm0, xmm2); 33 | let xmm0 = _mm_add_ps(_mm_castsi128_ps(xmm1), xmm0); 34 | 35 | xmm0 36 | } 37 | 38 | #[inline(always)] 39 | pub unsafe fn _mm_cvtpd_epi64x_limited(x: __m128d) -> __m128i { 40 | // https://stackoverflow.com/a/41148578/2083075 41 | let m = _mm_set1_pd(transmute::(0x0018000000000000) as f64); 42 | _mm_sub_epi64(_mm_castpd_si128(_mm_add_pd(x, m)), _mm_castpd_si128(m)) 43 | } 44 | 45 | #[inline(always)] 46 | pub unsafe fn _mm_cvtpd_epu64x_limited(x: __m128d) -> __m128i { 47 | // https://stackoverflow.com/a/41148578/2083075 48 | let m = _mm_set1_pd(transmute::(0x0010000000000000) as f64); 49 | _mm_xor_si128(_mm_castpd_si128(_mm_add_pd(x, m)), _mm_castpd_si128(m)) 50 | } 51 | 52 | // https://stackoverflow.com/a/41223013/2083075 53 | #[inline(always)] 54 | #[rustfmt::skip] 55 | pub unsafe fn _mm_cvtepu64_pdx(v: __m128i) -> __m128d { 56 | let magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52 encoded as floating-point 57 | let magic_i_hi32 = _mm_set1_epi64x(0x4530000000000000); // 2^84 encoded as floating-point 58 | let magic_i_all = _mm_set1_epi64x(0x4530000000100000); // 2^84 + 2^52 encoded as floating-point 59 | let 
magic_d_all = _mm_castsi128_pd(magic_i_all); 60 | 61 | let v_lo = _mm_blend_epi16(magic_i_lo, v, 0b00110011); // Blend the 32 lowest significant bits of v with magic_int_lo 62 | let mut v_hi = _mm_srli_epi64(v, 32); // Extract the 32 most significant bits of v 63 | v_hi = _mm_xor_si128(v_hi, magic_i_hi32); // Blend v_hi with 0x45300000 64 | let v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all); // Compute in double precision: 65 | _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo)) // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition !! 66 | } 67 | 68 | // https://stackoverflow.com/a/41223013/2083075 69 | #[inline(always)] 70 | #[rustfmt::skip] 71 | pub unsafe fn _mm_cvtepi64_pdx(v: __m128i) -> __m128d { 72 | let magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52 encoded as floating-point 73 | let magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63 encoded as floating-point 74 | let magic_i_all = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52 encoded as floating-point 75 | let magic_d_all = _mm_castsi128_pd(magic_i_all); 76 | 77 | let v_lo = _mm_blend_epi16(magic_i_lo, v, 0b00110011); // Blend the 32 lowest significant bits of v with magic_int_lo 78 | let mut v_hi = _mm_srli_epi64(v, 32); // Extract the 32 most significant bits of v 79 | v_hi = _mm_xor_si128(v_hi, magic_i_hi32); // Flip the msb of v_hi and blend with 0x45300000 80 | let v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all); // Compute in double precision: 81 | _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo)) // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition !! 82 | } 83 | 84 | // NOTE: Saturated add/sub use the "sign bit" as the select bit, 85 | // so when porting to SSE2 it'll need to use the `signbits` methods to properly select with `_mm_blendv_epi8x` 86 | 87 | #[inline(always)] 88 | pub unsafe fn _mm_adds_epi64x(lhs: __m128i, rhs: __m128i) -> __m128i { 89 | let res = _mm_add_epi64(lhs, rhs); 90 | 91 | _mm_blendv_epi64x( 92 | res, 93 | _mm_blendv_epi64x(_mm_set1_epi64x(i64::MIN), _mm_set1_epi64x(i64::MAX), res), 94 | _mm_xor_si128(rhs, _mm_cmpgt_epi64(lhs, res)), 95 | ) 96 | } 97 | 98 | #[inline(always)] 99 | pub unsafe fn _mm_adds_epi32x(lhs: __m128i, rhs: __m128i) -> __m128i { 100 | let res = _mm_add_epi32(lhs, rhs); 101 | 102 | _mm_blendv_epi32x( 103 | res, 104 | _mm_blendv_epi32x(_mm_set1_epi32(i32::MIN), _mm_set1_epi32(i32::MAX), res), 105 | _mm_xor_si128(rhs, _mm_cmpgt_epi32(lhs, res)), 106 | ) 107 | } 108 | 109 | #[inline(always)] 110 | pub unsafe fn _mm_subs_epi32x(lhs: __m128i, rhs: __m128i) -> __m128i { 111 | let res = _mm_sub_epi32(lhs, rhs); 112 | 113 | _mm_blendv_epi32x( 114 | res, 115 | _mm_blendv_epi32x(_mm_set1_epi32(i32::MIN), _mm_set1_epi32(i32::MAX), res), 116 | _mm_xor_si128(_mm_cmpgt_epi32(rhs, _mm_setzero_si128()), _mm_cmpgt_epi32(lhs, res)), 117 | ) 118 | } 119 | 120 | #[inline(always)] 121 | pub unsafe fn _mm_subs_epi64x(lhs: __m128i, rhs: __m128i) -> __m128i { 122 | let res = _mm_sub_epi64(lhs, rhs); 123 | 124 | _mm_blendv_epi64x( 125 | res, 126 | _mm_blendv_epi64x(_mm_set1_epi64x(i64::MIN), _mm_set1_epi64x(i64::MAX), res), 127 | _mm_xor_si128(_mm_cmpgt_epi64(rhs, _mm_setzero_si128()), _mm_cmpgt_epi64(lhs, res)), 128 | ) 129 | } 130 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/vf32.rs: -------------------------------------------------------------------------------- 1 | 2 | 
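The SSE4.2 polyfills above appear to emulate saturating 32/64-bit addition and subtraction by computing the wrapping result and then blending in the type's MIN/MAX when the sign bits indicate overflow. A scalar model of the intended per-lane semantics, for illustration only (`adds_i64_model`/`subs_i64_model` are hypothetical names, not crate APIs):

fn adds_i64_model(lhs: i64, rhs: i64) -> i64 {
    // Signed overflow is only possible when both operands share a sign,
    // so the saturation bound can be chosen from rhs's sign.
    match lhs.checked_add(rhs) {
        Some(sum) => sum,
        None => if rhs < 0 { i64::MIN } else { i64::MAX },
    }
}

fn subs_i64_model(lhs: i64, rhs: i64) -> i64 {
    // Subtraction can only overflow toward MIN when rhs is positive,
    // and toward MAX when rhs is negative.
    match lhs.checked_sub(rhs) {
        Some(diff) => diff,
        None => if rhs > 0 { i64::MIN } else { i64::MAX },
    }
}

fn main() {
    assert_eq!(adds_i64_model(i64::MAX, 1), i64::MAX);
    assert_eq!(subs_i64_model(i64::MIN, 1), i64::MIN);
    assert_eq!(adds_i64_model(40, 2), 42);
}
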
-------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/vi32.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(i32x4: i32 => __m128i); 4 | impl Default for i32x4 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(unsafe { _mm_setzero_si128() }) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for i32x4 { 12 | type Element = i32; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(unsafe { _mm_set1_epi32(value) }) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(_mm_undefined_si128()) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(_mm_load_si128(src as *const _)) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(_mm_loadu_si128(src as *const _)) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | _mm_store_si128(dst as *mut _, self.value) 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | _mm_storeu_si128(dst as *mut _, self.value) 42 | } 43 | 44 | #[inline] 45 | #[target_feature(enable = "sse4.1")] 46 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element { 47 | *transmute::<&_, *const Self::Element>(&self).add(index) 48 | } 49 | 50 | #[inline] 51 | #[target_feature(enable = "sse4.1")] 52 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self { 53 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value; 54 | self 55 | } 56 | } 57 | 58 | impl SimdBitwise for i32x4 { 59 | #[inline(always)] 60 | fn and_not(self, other: Self) -> Self { 61 | Self::new(unsafe { _mm_andnot_si128(self.value, other.value) }) 62 | } 63 | 64 | const FULL_BITMASK: u16 = 0b1111; 65 | 66 | #[inline(always)] 67 | fn bitmask(self) -> u16 { 68 | unsafe { _mm_movemask_ps(_mm_castsi128_ps(self.value)) as u16 } 69 | } 70 | 71 | #[inline(always)] 72 | unsafe fn _mm_not(self) -> Self { 73 | self ^ Self::splat(!0) 74 | } 75 | 76 | #[inline(always)] 77 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 78 | Self::new(_mm_and_si128(self.value, rhs.value)) 79 | } 80 | 81 | #[inline(always)] 82 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 83 | Self::new(_mm_or_si128(self.value, rhs.value)) 84 | } 85 | 86 | #[inline(always)] 87 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 88 | Self::new(_mm_xor_si128(self.value, rhs.value)) 89 | } 90 | 91 | #[inline(always)] 92 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 93 | Self::zip(self, count, |x, s| x >> s) 94 | } 95 | 96 | #[inline(always)] 97 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 98 | Self::zip(self, count, |x, s| x << s) 99 | } 100 | 101 | #[inline(always)] 102 | unsafe fn _mm_shli(self, count: u32) -> Self { 103 | Self::new(_mm_sll_epi32(self.value, _mm_cvtsi32_si128(count as i32))) 104 | } 105 | 106 | #[inline(always)] 107 | unsafe fn _mm_shri(self, count: u32) -> Self { 108 | Self::new(_mm_srl_epi32(self.value, _mm_cvtsi32_si128(count as i32))) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/vu32.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(u32x4: u32 => __m128i); 4 | 
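// As with `i32x4` above, `decl!` is assumed to generate the `u32x4` wrapper struct
// over `__m128i`, providing `Self::new` and the `value` field used by the impls below.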
impl Default for u32x4 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(unsafe { _mm_setzero_si128() }) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for u32x4 { 12 | type Element = u32; 13 | 14 | fn splat(value: Self::Element) -> Self { 15 | Self::new(unsafe { _mm_set1_epi32(value as i32) }) 16 | } 17 | 18 | unsafe fn undefined() -> Self { 19 | Self::new(_mm_undefined_si128()) 20 | } 21 | 22 | #[inline(always)] 23 | unsafe fn load_aligned_unchecked(ptr: *const Self::Element) -> Self { 24 | Self::new(_mm_load_si128(ptr as *const _)) 25 | } 26 | 27 | #[inline(always)] 28 | unsafe fn load_unaligned_unchecked(ptr: *const Self::Element) -> Self { 29 | Self::new(_mm_loadu_si128(ptr as *const _)) 30 | } 31 | 32 | #[inline(always)] 33 | unsafe fn store_aligned_unchecked(self, ptr: *mut Self::Element) { 34 | _mm_store_si128(ptr as *mut _, self.value) 35 | } 36 | 37 | #[inline(always)] 38 | unsafe fn store_unaligned_unchecked(self, ptr: *mut Self::Element) { 39 | _mm_storeu_si128(ptr as *mut _, self.value) 40 | } 41 | 42 | decl_base_common!(#[target_feature(enable = "sse4.1")] u32x4: u32 => __m128i); 43 | } 44 | 45 | impl SimdBitwise for u32x4 { 46 | fn and_not(self, other: Self) -> Self { 47 | Self::new(unsafe { _mm_andnot_si128(self.value, other.value) }) 48 | } 49 | 50 | const FULL_BITMASK: u16 = 0b1111; 51 | 52 | fn bitmask(self) -> u16 { 53 | unsafe { _mm_movemask_ps(_mm_castsi128_ps(self.value)) } 54 | } 55 | 56 | #[inline(always)] 57 | unsafe fn _mm_not(self) -> Self { 58 | self ^ Self::splat(!0) 59 | } 60 | 61 | #[inline(always)] 62 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 63 | Self::new(_mm_and_si128(self.value, rhs.value)) 64 | } 65 | 66 | #[inline(always)] 67 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 68 | Self::new(_mm_or_si128(self.value, rhs.value)) 69 | } 70 | 71 | #[inline(always)] 72 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 73 | Self::new(_mm_xor_si128(self.value, rhs.value)) 74 | } 75 | 76 | #[inline(always)] 77 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 78 | Self::zip(self, count, Shr::shr) 79 | } 80 | 81 | #[inline(always)] 82 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 83 | Self::zip(self, count, Shl::shl) 84 | } 85 | 86 | #[inline(always)] 87 | unsafe fn _mm_shli(self, count: u32) -> Self { 88 | Self::new(_mm_sll_epi32(self.value, _mm_cvtsi32_si128(count as i32))) 89 | } 90 | 91 | #[inline(always)] 92 | unsafe fn _mm_shri(self, count: u32) -> Self { 93 | Self::new(_mm_srl_epi32(self.value, _mm_cvtsi32_si128(count as i32))) 94 | } 95 | } 96 | 97 | impl PartialEq for u32x4 { 98 | fn eq(&self, other: &Self) -> bool { 99 | >::eq(*self, *other).all() 100 | } 101 | 102 | fn ne(&self, other: &Self) -> bool { 103 | >::ne(*self, *other).any() 104 | } 105 | } 106 | 107 | impl Eq for u32x4 {} 108 | 109 | impl SimdMask for u32x8 { 110 | #[inline(always)] 111 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 112 | Self::new(_mm256_blendv_epi8(f.value, t.value, self.value)) 113 | } 114 | 115 | #[inline(always)] 116 | unsafe fn _mm_all(self) -> bool { 117 | _mm_movemask_epi8(self.value) as u16 == u16::MAX 118 | } 119 | 120 | #[inline(always)] 121 | unsafe fn _mm_any(self) -> bool { 122 | _mm_movemask_epi8(self.value) != 0 123 | } 124 | 125 | #[inline(always)] 126 | unsafe fn _mm_none(self) -> bool { 127 | _mm_movemask_epi8(self.value) == 0 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/wasm32/mod.rs: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /crates/thermite/src/buffer.rs: -------------------------------------------------------------------------------- 1 | extern crate alloc; 2 | 3 | use crate::*; 4 | 5 | use alloc::alloc::{alloc, dealloc, Layout}; 6 | use core::{ 7 | fmt, mem, 8 | ops::{Deref, DerefMut}, 9 | ptr, 10 | }; 11 | 12 | /// Aligned SIMD vector storage 13 | #[repr(transparent)] 14 | pub struct VectorBuffer> { 15 | buffer: *mut [V::Element], 16 | } 17 | 18 | impl> Deref for VectorBuffer { 19 | type Target = [V::Element]; 20 | 21 | #[inline] 22 | fn deref(&self) -> &Self::Target { 23 | self.as_slice() 24 | } 25 | } 26 | 27 | impl> DerefMut for VectorBuffer { 28 | #[inline] 29 | fn deref_mut(&mut self) -> &mut Self::Target { 30 | self.as_mut_slice() 31 | } 32 | } 33 | 34 | impl> fmt::Debug for VectorBuffer { 35 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 36 | self.as_vector_slice().fmt(f) 37 | } 38 | } 39 | 40 | impl> VectorBuffer { 41 | /// Allocates a new SIMD-aligned element buffer and zeroes the elements. 42 | /// 43 | /// Due to the alignment, it will round up the number of elements to the nearest multiple of `V::NUM_ELEMENTS`, 44 | /// making the "wasted" space visible. 45 | pub fn alloc(count: usize) -> Self { 46 | unsafe { 47 | // round up to multiple of NUM_ELEMENTS 48 | // https://stackoverflow.com/a/9194117/2083075 49 | let count = (count + V::NUM_ELEMENTS - 1) & (-(V::NUM_ELEMENTS as isize) as usize); 50 | 51 | // allocate zeroed buffer. All SIMD types are valid when zeroed 52 | VectorBuffer { 53 | buffer: ptr::slice_from_raw_parts_mut( 54 | alloc::alloc::alloc_zeroed(Self::layout(count)) as *mut V::Element, 55 | count, 56 | ), 57 | } 58 | } 59 | } 60 | 61 | #[inline(always)] 62 | pub fn iter_vectors<'a>(&'a self) -> SimdSliceIter<'a, S, V> { 63 | SimdSliceIter::new(self.as_slice()) 64 | } 65 | 66 | #[inline(always)] 67 | pub fn iter_vectors_mut<'a>(&'a mut self) -> AlignedMutIter<'a, S, V> { 68 | unsafe { AlignedMut::new_unchecked(self.as_mut_slice()).iter_mut() } 69 | } 70 | 71 | /// Gathers values from the buffer using more efficient instructions where possible 72 | #[inline(always)] 73 | pub fn gather(&self, indices: S::Vu32) -> V 74 | where 75 | V: SimdVector, 76 | { 77 | V::gather(self.as_slice(), indices.cast()) 78 | } 79 | 80 | /// Fills the buffer with vectors using aligned stores 81 | #[inline] 82 | pub fn fill(&mut self, value: V) { 83 | unsafe { 84 | let ptr = self.as_mut_slice().as_mut_ptr(); 85 | let mut i = 0; 86 | while i < self.len() { 87 | value.store_aligned_unchecked(ptr.add(i)); 88 | i += V::NUM_ELEMENTS; 89 | } 90 | } 91 | } 92 | 93 | #[inline] 94 | pub fn len(&self) -> usize { 95 | unsafe { (*self.buffer).len() } 96 | } 97 | 98 | #[inline] 99 | pub fn len_vectors(&self) -> usize { 100 | self.len() / V::NUM_ELEMENTS 101 | } 102 | 103 | #[inline] 104 | pub fn as_slice(&self) -> &[V::Element] { 105 | unsafe { &*self.buffer } 106 | } 107 | 108 | #[inline] 109 | pub fn as_vector_slice(&self) -> &[V] { 110 | unsafe { &(*(self.buffer as *const [V]))[..self.len_vectors()] } 111 | } 112 | 113 | #[inline] 114 | pub fn as_mut_slice(&mut self) -> &mut [V::Element] { 115 | unsafe { &mut *self.buffer } 116 | } 117 | 118 | #[inline] 119 | pub fn as_mut_vector_slice(&mut self) -> &mut [V] { 120 | unsafe { &mut (*(self.buffer as *mut [V]))[..self.len() / V::NUM_ELEMENTS] } 121 | } 122 | 123 | #[inline] 124 | pub fn 
load_vector(&self, vector_index: usize) -> V { 125 | let scalar_index = vector_index * V::NUM_ELEMENTS; 126 | let s = self.as_slice(); 127 | assert!(scalar_index < s.len()); 128 | 129 | unsafe { V::load_aligned_unchecked(s.as_ptr().add(vector_index)) } 130 | } 131 | 132 | #[inline] 133 | pub fn store_vector(&mut self, vector_index: usize, value: V) { 134 | let scalar_index = vector_index * V::NUM_ELEMENTS; 135 | let s = self.as_mut_slice(); 136 | assert!(scalar_index < s.len()); 137 | 138 | unsafe { value.store_aligned_unchecked(s.as_mut_ptr().add(vector_index)) } 139 | } 140 | 141 | #[inline(always)] 142 | fn layout(count: usize) -> Layout { 143 | // ensure the buffer has the proper size and alignment for SIMD values 144 | unsafe { Layout::from_size_align_unchecked(count * mem::size_of::(), V::ALIGNMENT) } 145 | } 146 | } 147 | 148 | unsafe impl> Send for VectorBuffer {} 149 | unsafe impl> Sync for VectorBuffer {} 150 | 151 | impl> Drop for VectorBuffer { 152 | fn drop(&mut self) { 153 | unsafe { dealloc(self.buffer as *mut u8, Self::layout(self.len())) } 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /crates/thermite/src/divider.rs: -------------------------------------------------------------------------------- 1 | // Derived from: 2 | // 3 | // libdivide.h - Optimized integer division 4 | // https://libdivide.com 5 | // 6 | // Copyright (C) 2010 - 2019 ridiculous_fish, 7 | // Copyright (C) 2016 - 2019 Kim Walisch, 8 | 9 | #![allow(unused)] 10 | 11 | use core::ops::Deref; 12 | 13 | macro_rules! decl_div_half { 14 | ($($t:ty => $dt:ty),*) => { 15 | paste::paste! {$( 16 | #[inline(always)] 17 | const fn [](u1: $t, u0: $t, v: $t) -> ($t, $t) { 18 | let v = v as $dt; 19 | let n = ((u1 as $dt) << (core::mem::size_of::<$t>() * 8)) | (u0 as $dt); 20 | let res = (n / v) as $t; // truncate 21 | let rem = n.wrapping_sub((res as $dt).wrapping_mul(v)); 22 | (res, rem as $t) 23 | } 24 | )*} 25 | }; 26 | } 27 | 28 | decl_div_half!(u64 => u128, u32 => u64, u16 => u32, u8 => u16); 29 | 30 | /// Divider recommended for constant divisors. 31 | /// 32 | /// When using constant divisors, divisions using this can remove extra branches 33 | /// and generate ideal integer division code. 34 | /// 35 | /// However, when used with dynamic input, the extra branches can be expensive, 36 | /// therefore it is recommended to use the branchfree alternative for dynamic divisors. 37 | #[repr(C, packed)] 38 | pub struct Divider { 39 | multiplier: T, 40 | shift: u8, 41 | } 42 | 43 | /// Divider without branching, useful for dynamic divisors. 
44 | /// 45 | /// However, when used with constant input, this may perform extra unnecessary work that could 46 | /// be removed in the branching [`Divider`] 47 | #[repr(transparent)] 48 | #[derive(Copy, PartialEq)] 49 | pub struct BranchfreeDivider(Divider); 50 | 51 | impl Clone for BranchfreeDivider { 52 | fn clone(&self) -> Self { 53 | *self 54 | } 55 | } 56 | 57 | impl Deref for BranchfreeDivider { 58 | type Target = Divider; 59 | 60 | #[inline(always)] 61 | fn deref(&self) -> &Self::Target { 62 | &self.0 63 | } 64 | } 65 | 66 | impl Clone for Divider { 67 | fn clone(&self) -> Self { 68 | *self 69 | } 70 | } 71 | 72 | impl Copy for Divider {} 73 | 74 | impl PartialEq for Divider { 75 | #[inline(always)] 76 | fn eq(&self, other: &Self) -> bool { 77 | self.multiplier() == other.multiplier() && self.shift == other.shift 78 | } 79 | } 80 | 81 | impl Divider { 82 | #[inline(always)] 83 | pub fn multiplier(&self) -> T { 84 | // with repr(C), self points to first value 85 | unsafe { (self as *const Self as *const T).read_unaligned() } 86 | } 87 | 88 | #[inline(always)] 89 | pub fn shift(&self) -> u8 { 90 | // shift has an alignment of 1 byte anyway, so it's fine to read normally 91 | self.shift 92 | } 93 | } 94 | 95 | pub(crate) const ADD_MARKER: u8 = 0x40; 96 | pub(crate) const NEG_DIVISOR: u8 = 0x80; 97 | 98 | macro_rules! impl_shift_mask { 99 | ($($ty:ty),*) => {$( 100 | impl Divider<$ty> { 101 | const BITS: u32 = 8 * core::mem::size_of::<$ty>() as u32; 102 | /// !log2(N::BITS) 103 | pub(crate) const SHIFT_MASK: u8 = !(<$ty>::MAX << Self::BITS.trailing_zeros()) as u8; 104 | } 105 | )*}; 106 | } 107 | 108 | impl_shift_mask!(u8, u16, u32, u64); 109 | 110 | macro_rules! impl_unsigned_divider { 111 | ($($t:ty => $dt:ty),*) => { 112 | paste::paste! {$( 113 | impl BranchfreeDivider<$t> { 114 | /// See docs for [`BranchfreeDivider`] and [`Divider`] 115 | #[inline(always)] 116 | pub const fn [<$t>](d: $t) -> Self { 117 | Divider::<$t>::[<$t _branchfree>](d) 118 | } 119 | } 120 | 121 | impl Divider<$t> { 122 | /// See docs for [`Divider`] 123 | #[inline(always)] 124 | pub const fn [<$t>](d: $t) -> Self { 125 | Self::[<$t _internal>](d, false) 126 | } 127 | 128 | /// See docs for [`BranchfreeDivider`] and [`Divider`] 129 | #[inline] 130 | pub const fn [<$t _branchfree>](d: $t) -> BranchfreeDivider<$t> { 131 | let mut divider = Self::[<$t _internal>](d, true); 132 | divider.shift &= Self::SHIFT_MASK; 133 | BranchfreeDivider(divider) 134 | } 135 | 136 | #[inline] 137 | const fn [<$t _internal>](d: $t, bf: bool) -> Self { 138 | if d == 0 { 139 | return Divider { 140 | multiplier: 0, 141 | shift: Self::BITS as u8, // shift to zero 142 | } 143 | } 144 | 145 | let floor_log_2_d = Self::BITS - 1 - d.leading_zeros(); 146 | 147 | if d.is_power_of_two() { 148 | Divider { 149 | multiplier: 0, 150 | // We need to subtract 1 from the shift value in case of an unsigned 151 | // branchfree divider because there is a hardcoded right shift by 1 152 | // in its division algorithm. 
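// e.g. for d = 8 the branching divider stores shift = 3, while the branchfree
// divider stores shift = 2 and relies on that implicit extra shift.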
153 | shift: (floor_log_2_d - bf as u32) as u8, 154 | } 155 | } else { 156 | let k = 1 << floor_log_2_d; 157 | let (mut proposed_m, rem) = [](k, 0, d); 158 | 159 | let e = d.wrapping_sub(rem); 160 | 161 | let shift = if !bf && e < k { 162 | floor_log_2_d as u8 163 | } else { 164 | proposed_m = proposed_m.wrapping_add(proposed_m); 165 | let rem2 = rem.wrapping_add(rem); 166 | 167 | if rem2 >= d || rem2 < rem { 168 | proposed_m = proposed_m.wrapping_add(1); 169 | } 170 | 171 | floor_log_2_d as u8 | ADD_MARKER 172 | }; 173 | 174 | Divider { 175 | multiplier: proposed_m.wrapping_add(1), 176 | shift, 177 | } 178 | } 179 | } 180 | } 181 | )*} 182 | } 183 | } 184 | 185 | macro_rules! impl_signed_divider { 186 | ($($t:ty => $ut:ty => $udt:ty),*) => { 187 | paste::paste!{$( 188 | impl BranchfreeDivider<$t> { 189 | /// See docs for [`BranchfreeDivider`] and [`Divider`] 190 | #[inline(always)] 191 | pub const fn [<$t>](d: $t) -> Self { 192 | Divider::<$t>::[<$t _branchfree>](d) 193 | } 194 | } 195 | 196 | impl Divider<$t> { 197 | /// See docs for [`Divider`] 198 | #[inline(always)] 199 | const fn [<$t>](d: $t) -> Self { 200 | Self::[<$t _internal>](d, false) 201 | } 202 | 203 | /// See docs for [`BranchfreeDivider`] and [`Divider`] 204 | #[inline] 205 | const fn [<$t _branchfree>](d: $t) -> BranchfreeDivider<$t> { 206 | let mut divider = Self::[<$t _internal>](d, true); 207 | divider.shift &= Divider::<$ut>::SHIFT_MASK; 208 | BranchfreeDivider(divider) 209 | } 210 | 211 | #[inline] 212 | const fn [<$t _internal>](d: $t, bf: bool) -> Self { 213 | if d == 0 { 214 | return Divider { 215 | multiplier: 0, 216 | shift: Divider::<$ut>::BITS as u8, // shift to zero 217 | }; 218 | } 219 | 220 | let abs_d = d.abs() as $ut; 221 | 222 | let floor_log_2_d = Divider::<$ut>::BITS - 1 - d.leading_zeros(); 223 | 224 | if abs_d.is_power_of_two() { 225 | Divider { 226 | multiplier: 0, 227 | shift: floor_log_2_d as u8 | if d < 0 { NEG_DIVISOR } else { 0 }, 228 | } 229 | } else { 230 | let (mut proposed_m, rem) = [](1 << (floor_log_2_d - 1), 0, abs_d); 231 | 232 | let e = abs_d.wrapping_sub(rem); 233 | 234 | let mut shift = if !bf && e < (1 << floor_log_2_d) { 235 | (floor_log_2_d - 1) as u8 236 | } else { 237 | proposed_m = proposed_m.wrapping_add(proposed_m); 238 | let rem2 = rem.wrapping_add(rem); 239 | 240 | if rem2 >= abs_d || rem2 < rem { 241 | proposed_m = proposed_m.wrapping_add(1); 242 | } 243 | 244 | floor_log_2_d as u8 | ADD_MARKER 245 | }; 246 | 247 | proposed_m = proposed_m.wrapping_add(1); 248 | 249 | let mut multiplier = proposed_m as $t; 250 | 251 | if d < 0 { 252 | shift |= NEG_DIVISOR; 253 | 254 | if !bf { 255 | multiplier = -multiplier; 256 | } 257 | } 258 | 259 | Divider { multiplier, shift } 260 | } 261 | } 262 | } 263 | )*} 264 | } 265 | } 266 | 267 | impl_unsigned_divider!(u8 => u16, u16 => u32, u32 => u64, u64 => u128); 268 | 269 | impl_signed_divider! { 270 | i8 => u8 => u16, 271 | i16 => u16 => u32, 272 | i32 => u32 => u64, 273 | i64 => u64 => u128 274 | } 275 | -------------------------------------------------------------------------------- /crates/thermite/src/element.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | /// Umbrella trait for SIMD vector element bounds 4 | pub trait SimdElement: 'static + mask::Truthy + CastFromAll + Clone + Debug + Copy + Default + Send + Sync {} 5 | 6 | impl SimdElement for T where T: 'static + mask::Truthy + CastFromAll + Clone + Debug + Copy + Default + Send + Sync {} 7 | 8 | macro_rules! 
decl_cast_from_all { 9 | ($($ty:ty),*) => { 10 | pub trait CastFromAll: $(CastFrom<$ty>+)* {} 11 | impl CastFromAll for T where T: $(CastFrom<$ty>+)* {} 12 | } 13 | } 14 | 15 | pub trait CastFrom: Sized { 16 | fn cast_from(value: T) -> Self; 17 | } 18 | 19 | macro_rules! impl_cast_from { 20 | (@INNER $ty:ty as $as:ty) => { 21 | impl CastFrom<$ty> for $as { 22 | #[inline(always)] 23 | fn cast_from(value: $ty) -> $as { 24 | value as $as 25 | } 26 | } 27 | }; 28 | ($($ty:ty),*) => { 29 | $( 30 | impl_cast_from_bool!($ty); 31 | impl_cast_from!(@INNER $ty as i8); 32 | impl_cast_from!(@INNER $ty as i16); 33 | impl_cast_from!(@INNER $ty as i32); 34 | impl_cast_from!(@INNER $ty as i64); 35 | impl_cast_from!(@INNER $ty as isize); 36 | impl_cast_from!(@INNER $ty as u8); 37 | impl_cast_from!(@INNER $ty as u16); 38 | impl_cast_from!(@INNER $ty as u32); 39 | impl_cast_from!(@INNER $ty as u64); 40 | impl_cast_from!(@INNER $ty as usize); 41 | impl_cast_from!(@INNER $ty as f32); 42 | impl_cast_from!(@INNER $ty as f64); 43 | )* 44 | }; 45 | } 46 | 47 | macro_rules! impl_cast_from_bool { 48 | ($ty:ty) => { 49 | impl CastFrom for $ty { 50 | #[inline(always)] 51 | fn cast_from(value: bool) -> Self { 52 | if value { 53 | 1 as $ty 54 | } else { 55 | 0 as $ty 56 | } 57 | } 58 | } 59 | 60 | impl CastFrom<$ty> for bool { 61 | #[inline(always)] 62 | fn cast_from(value: $ty) -> bool { 63 | value != (0 as $ty) 64 | } 65 | } 66 | }; 67 | } 68 | 69 | decl_cast_from_all!(i8, i16, i32, i64, u8, u16, u32, u64, isize, usize, f32, f64, bool); 70 | impl_cast_from!(i8, i16, i32, i64, u8, u16, u32, u64, isize, usize, f32, f64); 71 | 72 | impl CastFrom for bool { 73 | #[inline(always)] 74 | fn cast_from(value: bool) -> bool { 75 | value 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /crates/thermite/src/iter/aligned.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | pub struct AlignedMut<'a, S: Simd, V: SimdVectorBase> { 4 | ptr: *mut V::Element, 5 | len: usize, 6 | _lt: PhantomData<&'a S>, 7 | } 8 | 9 | impl<'a, S: Simd, V: SimdVectorBase> AlignedMut<'a, S, V> { 10 | #[inline] 11 | pub unsafe fn new_unchecked(slice: &'a mut [V::Element]) -> Self { 12 | AlignedMut { 13 | ptr: slice.as_mut_ptr(), 14 | len: slice.len(), 15 | _lt: PhantomData, 16 | } 17 | } 18 | 19 | #[inline] 20 | pub fn new(slice: &'a mut [V::Element]) -> Option { 21 | if slice.as_ptr().align_offset(V::ALIGNMENT) != 0 { 22 | None 23 | } else { 24 | Some(unsafe { AlignedMut::new_unchecked(slice) }) 25 | } 26 | } 27 | 28 | #[inline] 29 | pub fn iter_mut(self) -> AlignedMutIter<'a, S, V> { 30 | AlignedMutIter(self) 31 | } 32 | } 33 | 34 | pub struct AlignedMutIter<'a, S: Simd, V: SimdVectorBase>(AlignedMut<'a, S, V>); 35 | 36 | impl<'a, S: Simd, V: SimdVectorBase> AlignedMutIter<'a, S, V> { 37 | /// Returns the remainder of the slice that is being iterated over. 38 | /// 39 | /// If the iterator has been exhausted (`next()` returns `None`), 40 | /// this may still return elements that would not fill an SIMD vector. 
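/// (Once `next()` has returned `None`, the remainder holds at most `V::NUM_ELEMENTS - 1` trailing elements.)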
41 | pub fn remainder(&mut self) -> &'a mut [V::Element] { 42 | unsafe { core::slice::from_raw_parts_mut(self.0.ptr, self.0.len) } 43 | } 44 | } 45 | 46 | impl<'a, S: Simd, V: SimdVectorBase> Iterator for AlignedMutIter<'a, S, V> { 47 | type Item = &'a mut V; 48 | 49 | #[inline] 50 | fn next(&mut self) -> Option<&'a mut V> { 51 | if self.0.len < V::NUM_ELEMENTS { 52 | None 53 | } else { 54 | unsafe { 55 | let ptr = self.0.ptr; 56 | self.0.ptr = self.0.ptr.add(V::NUM_ELEMENTS); 57 | self.0.len -= V::NUM_ELEMENTS; 58 | Some(&mut *(ptr as *mut V)) 59 | } 60 | } 61 | } 62 | 63 | #[inline] 64 | fn size_hint(&self) -> (usize, Option) { 65 | let remaining = self.0.len / V::NUM_ELEMENTS; 66 | (remaining, Some(remaining)) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /crates/thermite/src/iter/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | mod aligned; 4 | mod slice; 5 | 6 | pub use self::aligned::*; 7 | pub use self::slice::*; 8 | 9 | pub trait SimdIteratorExt: Iterator 10 | where 11 | V: SimdVector, 12 | { 13 | fn store(self, dst: &mut [V::Element], write_zero: bool) 14 | where 15 | Self: Sized; 16 | 17 | #[inline] 18 | fn cast(self) -> SimdCastIter 19 | where 20 | Self: Sized, 21 | U: SimdFromCast, 22 | { 23 | SimdCastIter { 24 | src: self, 25 | _tys: PhantomData, 26 | } 27 | } 28 | } 29 | 30 | pub trait IntoSimdIterator { 31 | type Item; 32 | type IntoIter: Iterator; 33 | 34 | fn into_simd_iter(self) -> Self::IntoIter; 35 | } 36 | 37 | pub struct SimdCastIter { 38 | src: I, 39 | _tys: PhantomData<(S, V, U)>, 40 | } 41 | 42 | impl Clone for SimdCastIter 43 | where 44 | I: Clone, 45 | { 46 | fn clone(&self) -> Self { 47 | SimdCastIter { 48 | src: self.src.clone(), 49 | _tys: PhantomData, 50 | } 51 | } 52 | } 53 | 54 | impl Iterator for SimdCastIter 55 | where 56 | I: Iterator, 57 | U: SimdFromCast, 58 | { 59 | type Item = U; 60 | 61 | #[inline] 62 | fn next(&mut self) -> Option { 63 | self.src.next().map(|v| U::from_cast(v)) 64 | } 65 | } 66 | 67 | impl SimdIteratorExt for T 68 | where 69 | T: Iterator, 70 | V: SimdVector, 71 | { 72 | #[inline] 73 | fn store(mut self, dst: &mut [V::Element], write_zero: bool) 74 | where 75 | Self: Sized, 76 | { 77 | let mut chunks = dst.chunks_exact_mut(V::NUM_ELEMENTS); 78 | 79 | // normal writes 80 | (&mut self).zip(&mut chunks).for_each(|(src, dst)| unsafe { 81 | src.store_unaligned_unchecked(dst.as_mut_ptr()); 82 | }); 83 | 84 | if write_zero { 85 | // fill any remaining chunks with zero 86 | (&mut chunks).for_each(|dst| unsafe { 87 | V::zero().store_unaligned_unchecked(dst.as_mut_ptr()); 88 | }); 89 | } 90 | 91 | // if there is a remainder, check to fill it 92 | let rem = chunks.into_remainder(); 93 | if thermite_unlikely!(!rem.is_empty()) { 94 | // if there are any values left, write what we can or zero it 95 | let value = match self.next() { 96 | Some(value) => value, 97 | None if write_zero => V::zero(), 98 | _ => return, // don't zero and nothing to write, so return 99 | }; 100 | 101 | let indices = Vi32::::indexed(); 102 | let mask = Vi32::::splat(rem.len() as i32).lt(indices); 103 | 104 | unsafe { value.scatter_masked_unchecked(rem.as_mut_ptr(), indices, mask.cast_to()) }; 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /crates/thermite/src/iter/slice.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | pub struct 
SimdSliceIter<'a, S: Simd, V: SimdVectorBase> { 4 | // TODO: Replace with pointer? 5 | slice: &'a [V::Element], 6 | _tys: PhantomData<&'a S>, 7 | } 8 | 9 | impl> Clone for SimdSliceIter<'_, S, V> { 10 | fn clone(&self) -> Self { 11 | SimdSliceIter { 12 | slice: self.slice.clone(), 13 | _tys: PhantomData, 14 | } 15 | } 16 | } 17 | 18 | impl<'a, S: Simd, T> IntoSimdIterator for &'a [T] 19 | where 20 | T: SimdAssociatedVector, 21 | AssociatedVector: SimdVectorBase, 22 | { 23 | type Item = AssociatedVector; 24 | type IntoIter = SimdSliceIter<'a, S, Self::Item>; 25 | 26 | fn into_simd_iter(self) -> SimdSliceIter<'a, S, Self::Item> { 27 | SimdSliceIter::new(self) 28 | } 29 | } 30 | 31 | impl<'a, S: Simd, V: SimdVectorBase> SimdSliceIter<'a, S, V> { 32 | #[inline] 33 | pub fn new(slice: &'a [V::Element]) -> Self { 34 | SimdSliceIter { 35 | slice, 36 | _tys: PhantomData, 37 | } 38 | } 39 | 40 | /// Returns the remainder of the slice that is being iterated over. 41 | /// 42 | /// If the iterator has been exhausted (`next()` returns `None`), 43 | /// this may still return elements that would not fill an SIMD vector. 44 | #[inline] 45 | pub fn remainder(&self) -> &[V::Element] { 46 | self.slice 47 | } 48 | } 49 | 50 | impl<'a, S: Simd, V> Iterator for SimdSliceIter<'a, S, V> 51 | where 52 | V: SimdVectorBase, 53 | { 54 | type Item = V; 55 | 56 | #[inline] 57 | fn next(&mut self) -> Option { 58 | if self.slice.len() < V::NUM_ELEMENTS { 59 | None 60 | } else { 61 | let vector = V::load_unaligned(self.slice); 62 | self.slice = &self.slice[V::NUM_ELEMENTS..]; 63 | Some(vector) 64 | } 65 | } 66 | 67 | #[inline] 68 | fn size_hint(&self) -> (usize, Option) { 69 | let remaining = self.slice.len() / V::NUM_ELEMENTS; 70 | (remaining, Some(remaining)) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /crates/thermite/src/macros.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | #[cfg(feature = "nightly")] 4 | pub use core::intrinsics::{likely, unlikely}; 5 | 6 | // borrows technique from https://github.com/rust-lang/hashbrown/pull/209 7 | #[cfg(not(feature = "nightly"))] 8 | #[inline] 9 | #[cold] 10 | fn cold() {} 11 | 12 | #[cfg(not(feature = "nightly"))] 13 | #[rustfmt::skip] 14 | #[inline(always)] 15 | pub unsafe fn likely(b: bool) -> bool { 16 | if !b { cold() } b 17 | } 18 | 19 | #[cfg(not(feature = "nightly"))] 20 | #[rustfmt::skip] 21 | #[inline(always)] 22 | pub unsafe fn unlikely(b: bool) -> bool { 23 | if b { cold() } b 24 | } 25 | 26 | #[doc(hidden)] 27 | #[macro_export] 28 | #[rustfmt::skip] 29 | macro_rules! thermite_likely { 30 | ($e:expr) => {{ 31 | #[allow(unused_unsafe)] 32 | unsafe { $crate::macros::likely($e) } 33 | }}; 34 | } 35 | 36 | #[doc(hidden)] 37 | #[macro_export] 38 | #[rustfmt::skip] 39 | macro_rules! 
thermite_unlikely { 40 | ($e:expr) => {{ 41 | #[allow(unused_unsafe)] 42 | unsafe { $crate::macros::unlikely($e) } 43 | }}; 44 | } 45 | -------------------------------------------------------------------------------- /crates/thermite/src/math/compensated.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use core::convert::TryFrom; 4 | 5 | #[derive(Debug, Clone, Copy)] 6 | pub struct Compensated> { 7 | pub val: V, 8 | pub err: V, 9 | _simd: PhantomData, 10 | } 11 | 12 | #[dispatch(thermite = "crate")] 13 | impl> Compensated { 14 | #[inline(always)] 15 | fn from_parts(val: V, err: V) -> Self { 16 | Compensated { 17 | val, 18 | err, 19 | _simd: PhantomData, 20 | } 21 | } 22 | 23 | #[inline(always)] 24 | pub fn new(val: V) -> Self { 25 | Self::from_parts(val, V::zero()) 26 | } 27 | 28 | #[inline(always)] 29 | pub fn value(self) -> V { 30 | self.val + self.err 31 | } 32 | 33 | #[inline(always)] 34 | pub fn product(a: V, b: V) -> Self { 35 | let val = a * b; 36 | 37 | if S::INSTRSET.has_true_fma() { 38 | Compensated::from_parts(val, a.mul_sub(b, val)) 39 | } else { 40 | // split into half-ish-precision 41 | let factor = match V::ELEMENT_SIZE { 42 | 4 => V::splat_as::(1u32 << 13 + 1), 43 | 8 => V::splat_as::(1u32 << 27 + 1), 44 | _ => unsafe { crate::unreachable_unchecked() }, 45 | }; 46 | 47 | let (a1, a2) = { 48 | let c = factor * a; 49 | let x = c - (c - a); 50 | (x, a - x) 51 | }; 52 | 53 | let (b1, b2) = { 54 | let c = factor * b; 55 | let x = c - (c - b); 56 | (x, b - x) 57 | }; 58 | 59 | let err = a2 * b2 - (((val - a1 * b1) - a2 * b1) - a1 * b2); 60 | 61 | Compensated::from_parts(val, err) 62 | } 63 | } 64 | 65 | #[inline(always)] 66 | pub fn sum(a: V, b: V) -> Self { 67 | let x = a + b; 68 | let z = x - a; 69 | let y = (a - (x - z)) + (b - z); 70 | 71 | Compensated::from_parts(x, y) 72 | } 73 | } 74 | 75 | impl> Add for Compensated { 76 | type Output = Self; 77 | 78 | fn add(mut self, rhs: V) -> Self { 79 | let pi = Self::sum(self.val, rhs); 80 | self.val = pi.val; 81 | self.err += pi.err; 82 | self 83 | } 84 | } 85 | 86 | // Accurate Floating Point Product, Stef Graillat 87 | // 88 | // https://www-pequan.lip6.fr/~graillat/papers/REC08_Paper_Graillat.pdf 89 | impl> Mul for Compensated { 90 | type Output = Self; 91 | 92 | fn mul(mut self, rhs: V) -> Self { 93 | let pi = Self::product(self.val, rhs); 94 | self.val = pi.val; 95 | self.err = self.err.mul_adde(rhs, pi.err); 96 | self 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /crates/thermite/src/math/consts.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | pub trait SimdFloatVectorConsts: SimdFloatVector { 4 | /// Euler’s number (e) 5 | fn E() -> Self; 6 | 7 | /// 1/π 8 | fn FRAC_1_PI() -> Self; 9 | 10 | /// 1/sqrt(2) 11 | fn FRAC_1_SQRT_2() -> Self; 12 | 13 | /// 2/π 14 | fn FRAC_2_PI() -> Self; 15 | 16 | /// 1/sqrt(π) 17 | fn FRAC_1_SQRT_PI() -> Self; 18 | 19 | /// 2/sqrt(π) 20 | fn FRAC_2_SQRT_PI() -> Self; 21 | 22 | /// π/2 23 | fn FRAC_PI_2() -> Self; 24 | 25 | /// π/3 26 | fn FRAC_PI_3() -> Self; 27 | 28 | /// π/4 29 | fn FRAC_PI_4() -> Self; 30 | 31 | /// π/6 32 | fn FRAC_PI_6() -> Self; 33 | 34 | /// π/8 35 | fn FRAC_PI_8() -> Self; 36 | 37 | /// ln(2) 38 | fn LN_2() -> Self; 39 | 40 | /// ln(10) 41 | fn LN_10() -> Self; 42 | 43 | /// ln(π) 44 | fn LN_PI() -> Self; 45 | 46 | /// log2(10) 47 | fn LOG2_10() -> Self; 48 | 49 | /// log2(e) 50 | fn LOG2_E() -> 
Self; 51 | 52 | /// log10(2) 53 | fn LOG10_2() -> Self; 54 | 55 | /// log10(e) 56 | fn LOG10_E() -> Self; 57 | 58 | /// Archimedes’ constant (π) 59 | fn PI() -> Self; 60 | 61 | /// sqrt(2) 62 | fn SQRT_2() -> Self; 63 | 64 | /// sqrt(e) 65 | fn SQRT_E() -> Self; 66 | 67 | /// The full circle constant (τ) 68 | fn TAU() -> Self; 69 | 70 | /// sqrt(π/2) 71 | fn SQRT_FRAC_PI_2() -> Self; 72 | } 73 | 74 | #[doc(hidden)] 75 | pub trait SimdFloatVectorConstsInternal: SimdElement { 76 | type Vf: SimdFloatVector; 77 | 78 | fn E() -> Self::Vf; 79 | fn FRAC_1_PI() -> Self::Vf; 80 | fn FRAC_1_SQRT_2() -> Self::Vf; 81 | fn FRAC_2_PI() -> Self::Vf; 82 | fn FRAC_1_SQRT_PI() -> Self::Vf; 83 | fn FRAC_2_SQRT_PI() -> Self::Vf; 84 | fn FRAC_PI_2() -> Self::Vf; 85 | fn FRAC_PI_3() -> Self::Vf; 86 | fn FRAC_PI_4() -> Self::Vf; 87 | fn FRAC_PI_6() -> Self::Vf; 88 | fn FRAC_PI_8() -> Self::Vf; 89 | fn LN_2() -> Self::Vf; 90 | fn LN_10() -> Self::Vf; 91 | fn LN_PI() -> Self::Vf; 92 | fn LOG2_10() -> Self::Vf; 93 | fn LOG2_E() -> Self::Vf; 94 | fn LOG10_2() -> Self::Vf; 95 | fn LOG10_E() -> Self::Vf; 96 | fn PI() -> Self::Vf; 97 | fn SQRT_2() -> Self::Vf; 98 | fn SQRT_E() -> Self::Vf; 99 | fn TAU() -> Self::Vf; 100 | fn SQRT_FRAC_PI_2() -> Self::Vf; 101 | } 102 | 103 | macro_rules! impl_internal_consts { 104 | ($t:ident: $vf:ident => $($name:ident),*) => { 105 | #[inline(always)] 106 | fn FRAC_1_SQRT_PI() -> Self::Vf { 107 | Self::Vf::splat(0.5641895835477562869480794515607725858440506293289988568440857217) 108 | } 109 | 110 | #[inline(always)] 111 | fn SQRT_FRAC_PI_2() -> Self::Vf { 112 | Self::Vf::splat(1.2533141373155002512078826424055226265034933703049691583149617881) 113 | } 114 | 115 | #[inline(always)] 116 | fn LN_PI() -> Self::Vf { 117 | Self::Vf::splat(1.1447298858494001741434273513530587116472948129153115715136230714) 118 | } 119 | 120 | #[inline(always)] 121 | fn SQRT_E() -> Self::Vf { 122 | Self::Vf::splat(1.6487212707001281468486507878141635716537761007101480115750793116) 123 | } 124 | 125 | $( 126 | #[inline(always)] 127 | fn $name() -> Self::Vf { 128 | Self::Vf::splat(core::$t::consts::$name) 129 | } 130 | )* 131 | } 132 | } 133 | 134 | impl SimdFloatVectorConstsInternal for f32 { 135 | type Vf = ::Vf32; 136 | 137 | impl_internal_consts!(f32: Vf32 => E, FRAC_1_PI, FRAC_1_SQRT_2, FRAC_2_PI, FRAC_2_SQRT_PI, FRAC_PI_2, FRAC_PI_3, FRAC_PI_4, FRAC_PI_6, FRAC_PI_8, LN_2, LN_10, LOG2_10, LOG2_E, LOG10_2, LOG10_E, PI, SQRT_2, TAU); 138 | } 139 | 140 | impl SimdFloatVectorConstsInternal for f64 { 141 | type Vf = ::Vf64; 142 | 143 | impl_internal_consts!(f64: Vf64 => E, FRAC_1_PI, FRAC_1_SQRT_2, FRAC_2_PI, FRAC_2_SQRT_PI, FRAC_PI_2, FRAC_PI_3, FRAC_PI_4, FRAC_PI_6, FRAC_PI_8, LN_2, LN_10, LOG2_10, LOG2_E, LOG10_2, LOG10_E, PI, SQRT_2, TAU); 144 | } 145 | 146 | macro_rules! 
impl_consts { 147 | ($($name:ident),*) => { 148 | $( 149 | #[inline(always)] fn $name() -> Self { 150 | <>::Element as SimdFloatVectorConstsInternal>::$name() 151 | } 152 | )* 153 | } 154 | } 155 | 156 | #[rustfmt::skip] 157 | impl SimdFloatVectorConsts for T 158 | where 159 | T: SimdFloatVector, 160 | >::Element: SimdFloatVectorConstsInternal, 161 | { 162 | impl_consts!(E, FRAC_1_PI, FRAC_1_SQRT_2, FRAC_2_PI, FRAC_1_SQRT_PI, FRAC_2_SQRT_PI, FRAC_PI_2, FRAC_PI_3, FRAC_PI_4, FRAC_PI_6, FRAC_PI_8, LN_2, LN_10, LN_PI, LOG2_10, LOG2_E, LOG10_2, LOG10_E, PI, SQRT_2, SQRT_E, TAU, SQRT_FRAC_PI_2); 163 | } 164 | -------------------------------------------------------------------------------- /crates/thermite/src/math/poly.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Optimized fixed-degree polynomial evaluation 3 | 4 | All of these polynomials use Estrin's scheme to reduce the dependency chain length 5 | and encourage instruction-level parallelism, which has the potential to improve 6 | performance despite the powers of X being required upfront. 7 | 8 | Powers of x are required, rather than computed internally, so they could be reused 9 | between multiple polynomials. 10 | 11 | Unless you are micro-optimizing, it's recommended to use `SimdVectorizedMath::poly` or `poly_f` 12 | */ 13 | 14 | use crate::*; 15 | 16 | #[inline(always)] 17 | pub fn poly_1>(x: V, c0: V, c1: V) -> V { 18 | x.mul_adde(c1, c0) 19 | } 20 | 21 | #[inline(always)] 22 | pub fn poly_2>(x: V, x2: V, c0: V, c1: V, c2: V) -> V { 23 | x2.mul_adde(c2, x.mul_adde(c1, c0)) 24 | } 25 | 26 | #[inline(always)] 27 | pub fn poly_3>(x: V, x2: V, c0: V, c1: V, c2: V, c3: V) -> V { 28 | // x^2 * (x * c3 + c2) + (x*c1 + c0) 29 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)) 30 | } 31 | 32 | #[inline(always)] 33 | pub fn poly_4>(x: V, x2: V, x4: V, c0: V, c1: V, c2: V, c3: V, c4: V) -> V { 34 | // x^4 * c4 + (x^2 * (x * c3 + c2) + (x*c1 + c0)) 35 | x4.mul_adde(c4, x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0))) 36 | } 37 | 38 | #[inline(always)] 39 | pub fn poly_5>(x: V, x2: V, x4: V, c0: V, c1: V, c2: V, c3: V, c4: V, c5: V) -> V { 40 | // x^4 * (x * c5 + c4) + (x^2 * (x * c3 + c2) + (x*c1 + c0)) 41 | x4.mul_adde(x.mul_adde(c5, c4), x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0))) 42 | } 43 | 44 | #[rustfmt::skip] 45 | #[inline(always)] 46 | pub fn poly_6>(x: V, x2: V, x4: V, c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V) -> V { 47 | // x^4 * (x^2 * c6 + (x * c5 + c4)) + (x^2 * (x * c3 + c2) + (x * c1 + c0)) 48 | x4.mul_adde( 49 | x2.mul_adde(c6, x.mul_adde(c5, c4)), 50 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 51 | ) 52 | } 53 | 54 | #[rustfmt::skip] 55 | #[inline(always)] 56 | pub fn poly_7>(x: V, x2: V, x4: V, c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V) -> V { 57 | x4.mul_adde( 58 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 59 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 60 | ) 61 | } 62 | 63 | #[rustfmt::skip] 64 | #[inline(always)] 65 | pub fn poly_8>( 66 | x: V, x2: V, x4: V, x8: V, 67 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V 68 | ) -> V { 69 | x8.mul_adde(c8, x4.mul_adde( 70 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 71 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 72 | )) 73 | } 74 | 75 | #[rustfmt::skip] 76 | #[inline(always)] 77 | pub fn poly_9>( 78 | x: V, x2: V, x4: V, x8: V, 79 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V 80 | ) -> V { 81 | 
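// x^8 * (x * c9 + c8) + (x^4 * (x^2 * (x * c7 + c6) + (x * c5 + c4)) + (x^2 * (x * c3 + c2) + (x * c1 + c0)))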
x8.mul_adde(x.mul_adde(c9, c8), x4.mul_adde( 82 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 83 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 84 | )) 85 | } 86 | 87 | #[rustfmt::skip] 88 | #[inline(always)] 89 | pub fn poly_10>( 90 | x: V, x2: V, x4: V, x8: V, 91 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, 92 | ) -> V { 93 | x8.mul_adde(x2.mul_adde(c10, x.mul_adde(c9, c8)), x4.mul_adde( 94 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 95 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 96 | )) 97 | } 98 | 99 | #[rustfmt::skip] 100 | #[inline(always)] 101 | pub fn poly_11>( 102 | x: V, x2: V, x4: V, x8: V, 103 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V 104 | ) -> V { 105 | x8.mul_adde( 106 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 107 | x4.mul_adde( 108 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 109 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 110 | ), 111 | ) 112 | } 113 | 114 | #[rustfmt::skip] 115 | #[inline(always)] 116 | pub fn poly_12>( 117 | x: V, x2: V, x4: V, x8: V, 118 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V, c12: V, 119 | ) -> V { 120 | x8.mul_adde( 121 | x4.mul_adde( 122 | c12, 123 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 124 | ), 125 | x4.mul_adde( 126 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 127 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 128 | ), 129 | ) 130 | } 131 | 132 | #[rustfmt::skip] 133 | #[inline(always)] 134 | pub fn poly_13>( 135 | x: V, x2: V, x4: V, x8: V, 136 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V, c12: V, c13: V, 137 | ) -> V { 138 | x8.mul_adde( 139 | x4.mul_adde( 140 | x.mul_adde(c13, c12), 141 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 142 | ), 143 | x4.mul_adde( 144 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 145 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 146 | ), 147 | ) 148 | } 149 | 150 | #[rustfmt::skip] 151 | #[inline(always)] 152 | pub fn poly_14>( 153 | x: V, x2: V, x4: V, x8: V, 154 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V, c12: V, c13: V, c14: V 155 | ) -> V { 156 | // (((C0+C1x) + (C2+C3x)x2) + ((C4+C5x) + (C6+C7x)x2)x4) + (((C8+C9x) + (C10+C11x)x2) + ((C12+C13x) + C14*x2)x4)x8 157 | x8.mul_adde( 158 | x4.mul_adde( 159 | x2.mul_adde(c14, x.mul_adde(c13, c12)), 160 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 161 | ), 162 | x4.mul_adde( 163 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 164 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 165 | ), 166 | ) 167 | } 168 | 169 | #[rustfmt::skip] 170 | #[inline(always)] 171 | pub fn poly_15>( 172 | x: V, x2: V, x4: V, x8: V, 173 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V, c12: V, c13: V, c14: V, c15: V 174 | ) -> V { 175 | // (((C0+C1x) + (C2+C3x)x2) + ((C4+C5x) + (C6+C7x)x2)x4) + (((C8+C9x) + (C10+C11x)x2) + ((C12+C13x) + (C14+C15x)x2)x4)x8 176 | x8.mul_adde( 177 | x4.mul_adde( 178 | x2.mul_adde(x.mul_adde(c15, c14), x.mul_adde(c13, c12)), 179 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 180 | ), 181 | x4.mul_adde( 182 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 183 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 184 | ), 185 | ) 186 | } 187 | 188 | #[rustfmt::skip] 189 | #[inline(always)] 190 | pub fn poly_30>( 191 | x: V, x2: V, x4: V, x8: 
V, x16: V, 192 | c00: V, c01: V, c02: V, c03: V, c04: V, c05: V, c06: V, c07: V, c08: V, c09: V, c10: V, c11: V, c12: V, c13: V, c14: V, c15: V, 193 | c16: V, c17: V, c18: V, c19: V, c20: V, c21: V, c22: V, c23: V, c24: V, c25: V, c26: V, c27: V, c28: V, c29: V, c30: V, c31: V 194 | ) -> V { 195 | x16.mul_adde( 196 | x8.mul_adde( 197 | x4.mul_adde( 198 | x2.mul_adde(x.mul_adde(c31, c30), x.mul_adde(c29, c28)), 199 | x2.mul_adde(x.mul_adde(c27, c26), x.mul_adde(c25, c24)), 200 | ), 201 | x4.mul_adde( 202 | x2.mul_adde(x.mul_adde(c23, c22), x.mul_adde(c21, c20)), 203 | x2.mul_adde(x.mul_adde(c19, c18), x.mul_adde(c17, c16)), 204 | ), 205 | ), 206 | x8.mul_adde( 207 | x4.mul_adde( 208 | x2.mul_adde(x.mul_adde(c15, c14), x.mul_adde(c13, c12)), 209 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c09, c08)), 210 | ), 211 | x4.mul_adde( 212 | x2.mul_adde(x.mul_adde(c07, c06), x.mul_adde(c05, c04)), 213 | x2.mul_adde(x.mul_adde(c03, c02), x.mul_adde(c01, c00)), 214 | ), 215 | ) 216 | ) 217 | } 218 | -------------------------------------------------------------------------------- /crates/thermite/src/pointer.rs: -------------------------------------------------------------------------------- 1 | use core::marker::PhantomData; 2 | use core::mem; 3 | 4 | use super::*; 5 | 6 | #[derive(Debug, Clone, Copy, PartialEq)] 7 | #[repr(transparent)] 8 | pub struct VPtr { 9 | ptr: S::Vusize, 10 | ty: PhantomData, 11 | } 12 | 13 | impl VPtr 14 | where 15 | T: SimdAssociatedVector, 16 | S::Vusize: SimdPtrInternal>, 17 | { 18 | #[inline(always)] 19 | pub fn splat(ptr: *mut T) -> Self { 20 | Self { 21 | ptr: S::Vusize::splat(ptr as _), 22 | ty: PhantomData, 23 | } 24 | } 25 | 26 | #[inline(always)] 27 | pub fn add(self, offset: S::Vusize) -> Self { 28 | Self { 29 | ptr: self.ptr + offset * S::Vusize::splat(mem::size_of::() as _), 30 | ty: PhantomData, 31 | } 32 | } 33 | 34 | #[inline(always)] 35 | pub fn is_null(self) -> Mask { 36 | self.ptr.eq(S::Vusize::zero()) 37 | } 38 | 39 | #[inline(always)] 40 | pub unsafe fn read(self) -> AssociatedVector { 41 | self.ptr._mm_gather() 42 | } 43 | 44 | #[inline(always)] 45 | pub unsafe fn read_masked( 46 | self, 47 | mask: Mask>, 48 | default: AssociatedVector, 49 | ) -> AssociatedVector { 50 | self.ptr._mm_gather_masked(mask, default) 51 | } 52 | 53 | #[inline(always)] 54 | pub unsafe fn write(self, value: AssociatedVector) { 55 | self.ptr._mm_scatter(value) 56 | } 57 | 58 | #[inline(always)] 59 | pub unsafe fn write_masked(self, mask: Mask>, value: AssociatedVector) { 60 | self.ptr._mm_scatter_masked(mask, value) 61 | } 62 | } 63 | 64 | #[doc(hidden)] 65 | pub trait AsUsize: Sized { 66 | fn as_usize(self) -> usize; 67 | } 68 | 69 | #[doc(hidden)] 70 | pub trait SimdPtrInternal>: SimdVector 71 | where 72 | >::Element: AsUsize, 73 | { 74 | #[inline(always)] 75 | unsafe fn _mm_gather(self) -> V { 76 | self._mm_gather_masked(Mask::truthy(), V::default()) 77 | } 78 | 79 | #[inline(always)] 80 | unsafe fn _mm_scatter(self, value: V) { 81 | self._mm_scatter_masked(Mask::truthy(), value) 82 | } 83 | 84 | #[inline(always)] 85 | unsafe fn _mm_gather_masked(self, mask: Mask, default: V) -> V { 86 | let mut res = default; 87 | for i in 0..Self::NUM_ELEMENTS { 88 | if mask.extract_unchecked(i) { 89 | res = res.replace_unchecked( 90 | i, 91 | mem::transmute::<_, *const V::Element>(self.extract_unchecked(i).as_usize()).read(), 92 | ); 93 | } 94 | } 95 | res 96 | } 97 | 98 | #[inline(always)] 99 | unsafe fn _mm_scatter_masked(self, mask: Mask, value: V) { 100 | for i in 
0..Self::NUM_ELEMENTS { 101 | if mask.extract_unchecked(i) { 102 | mem::transmute::<_, *mut V::Element>(self.extract_unchecked(i).as_usize()) 103 | .write(value.extract_unchecked(i)); 104 | } 105 | } 106 | } 107 | } 108 | 109 | impl AsUsize for u32 { 110 | #[inline(always)] 111 | fn as_usize(self) -> usize { 112 | self as usize 113 | } 114 | } 115 | 116 | impl AsUsize for u64 { 117 | #[inline(always)] 118 | fn as_usize(self) -> usize { 119 | self as usize 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /crates/thermite/src/rng/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | pub mod pcg32; 4 | pub mod xoshiro; 5 | 6 | pub trait SimdRng { 7 | fn reseed(&mut self, seed: Vu64); 8 | 9 | #[inline(always)] 10 | fn next_u32(&mut self) -> Vu32 { 11 | // use higher bits in cases where there is low linear complexity in low bits 12 | (self.next_u64() >> 32).cast() 13 | } 14 | 15 | #[inline(always)] 16 | fn next_u64(&mut self) -> Vu64 { 17 | let low: Vu64 = self.next_u32().cast(); 18 | let high: Vu64 = self.next_u32().cast(); 19 | 20 | low | (high << 32) 21 | } 22 | 23 | #[inline(always)] 24 | fn next_f32(&mut self) -> Vf32 { 25 | // NOTE: This has the added benefit of shifting out the lower bits, 26 | // as some RGNs have a low linear complexity in the lower bits 27 | Vf32::::from_bits((self.next_u32() >> 9) | Vu32::::splat(0x3f800000)) - Vf32::::one() 28 | } 29 | 30 | #[inline(always)] 31 | fn next_f64(&mut self) -> Vf64 { 32 | // NOTE: This has the added benefit of shifting out the lower bits, 33 | // as some RGNs have a low linear complexity in the lower bits 34 | Vf64::::from_bits((self.next_u64() >> 20) | Vu64::::splat(0x3ff0000000000000)) - Vf64::::one() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /crates/thermite/src/rng/pcg32.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use super::SimdRng; 6 | 7 | const PCG32_DEFAULT_STATE: u64 = 0x853c49e6748fea9b; 8 | const PCG32_DEFAULT_STREAM: u64 = 0xda3e39cb94b95bdb; 9 | const PCG32_MULT: u64 = 0x5851f42d4c957f2d; 10 | 11 | #[derive(Debug, Clone, PartialEq)] 12 | pub struct PCG32 { 13 | state: Vu64, 14 | inc: Vu64, 15 | } 16 | 17 | #[dispatch(S, thermite = "crate")] 18 | impl PCG32 { 19 | #[inline(always)] 20 | pub fn new(seed: Vu64) -> Self { 21 | let mut rng = PCG32 { 22 | state: unsafe { Vu64::::undefined() }, 23 | inc: unsafe { Vu64::::undefined() }, 24 | }; 25 | rng.reseed(seed); 26 | rng 27 | } 28 | } 29 | 30 | #[dispatch(S, thermite = "crate")] 31 | impl SimdRng for PCG32 { 32 | #[inline] 33 | fn reseed(&mut self, seed: Vu64) { 34 | self.state = Vu64::::zero(); 35 | self.inc = (seed << 1) | Vu64::::one(); 36 | 37 | let _ = self.next_u32(); 38 | self.state += Vu64::::splat(PCG32_DEFAULT_STATE); 39 | let _ = self.next_u32(); 40 | } 41 | 42 | #[inline] 43 | fn next_u32(&mut self) -> Vu32 { 44 | let old_state = self.state; 45 | self.state = old_state * Vu64::::splat(PCG32_MULT) + self.inc; 46 | let xorshifted = as SimdFromCast>>::from_cast(((old_state >> 18) ^ old_state) >> 27); 47 | let rot_offset = as SimdFromCast>>::from_cast(old_state >> 59); 48 | 49 | xorshifted.rorv(rot_offset) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /crates/thermite/src/rng/xoshiro.rs: 
-------------------------------------------------------------------------------- 1 | // TODO 2 | 3 | use crate::*; 4 | 5 | use super::SimdRng; 6 | 7 | #[derive(Debug, Clone, PartialEq)] 8 | pub struct SplitMix64 { 9 | x: Vu64, 10 | } 11 | 12 | const PHI: u64 = 0x9e3779b97f4a7c15; 13 | 14 | impl SplitMix64 { 15 | #[inline(always)] 16 | pub fn new(seed: Vu64) -> Self { 17 | SplitMix64 { x: seed } 18 | } 19 | } 20 | 21 | #[dispatch(S, thermite = "crate")] 22 | impl SimdRng for SplitMix64 { 23 | #[inline(always)] 24 | fn reseed(&mut self, seed: Vu64) { 25 | self.x = seed; 26 | } 27 | 28 | #[inline(always)] 29 | fn next_u32(&mut self) -> Vu32 { 30 | self.x = self.x + Vu64::::splat(PHI); 31 | let mut z = self.x; 32 | 33 | z = (z ^ (z >> 33)) * Vu64::::splat(0x62A9D9ED799705F5); 34 | z = (z ^ (z >> 28)) * Vu64::::splat(0xCB24D0A5C88C35B3); 35 | 36 | (z >> 32).cast() 37 | } 38 | 39 | #[inline(always)] 40 | fn next_u64(&mut self) -> Vu64 { 41 | self.x = self.x + Vu64::::splat(PHI); 42 | let mut z = self.x; 43 | 44 | z = (z ^ (z >> 30)) * Vu64::::splat(0xbf58476d1ce4e5b9); 45 | z = (z ^ (z >> 27)) * Vu64::::splat(0x94d049bb133111eb); 46 | z ^ (z >> 31) 47 | } 48 | } 49 | 50 | #[derive(Debug, Clone, PartialEq)] 51 | pub struct Xoshiro128Plus { 52 | s0: Vu64, 53 | s1: Vu64, 54 | } 55 | 56 | #[dispatch(S, thermite = "crate")] 57 | impl Xoshiro128Plus { 58 | #[inline(always)] 59 | pub fn new(seed: Vu64) -> Self { 60 | let mut rng = SplitMix64::::new(seed); 61 | Xoshiro128Plus { 62 | s0: rng.next_u64(), 63 | s1: rng.next_u64(), 64 | } 65 | } 66 | } 67 | 68 | #[dispatch(S, thermite = "crate")] 69 | impl SimdRng for Xoshiro128Plus { 70 | #[inline(always)] 71 | fn reseed(&mut self, seed: Vu64) { 72 | *self = Self::new(seed); 73 | } 74 | 75 | #[inline(always)] 76 | fn next_u64(&mut self) -> Vu64 { 77 | let result = self.s0 + self.s1; 78 | 79 | self.s1 ^= self.s0; 80 | self.s0 = self.s0.rol(24) ^ self.s1 ^ (self.s1 << 16); 81 | self.s1 = self.s1.rol(37); 82 | 83 | result 84 | } 85 | } 86 | 87 | #[derive(Debug, Clone, PartialEq)] 88 | pub struct Xoshiro256Plus { 89 | state: [Vu64; 4], 90 | } 91 | 92 | #[dispatch(S, thermite = "crate")] 93 | impl Xoshiro256Plus { 94 | #[inline(always)] 95 | pub fn new(seed: Vu64) -> Self { 96 | let mut rng = SplitMix64::::new(seed); 97 | Xoshiro256Plus { 98 | state: [rng.next_u64(), rng.next_u64(), rng.next_u64(), rng.next_u64()], 99 | } 100 | } 101 | } 102 | 103 | #[dispatch(S, thermite = "crate")] 104 | impl SimdRng for Xoshiro256Plus { 105 | #[inline(always)] 106 | fn reseed(&mut self, seed: Vu64) { 107 | *self = Self::new(seed); 108 | } 109 | 110 | #[inline(always)] 111 | fn next_u64(&mut self) -> Vu64 { 112 | let result = self.state[0] + self.state[3]; 113 | 114 | let t = self.state[1] << 17; 115 | 116 | self.state[2] ^= self.state[0]; 117 | self.state[3] ^= self.state[1]; 118 | self.state[1] ^= self.state[2]; 119 | self.state[0] ^= self.state[3]; 120 | 121 | self.state[2] ^= t; 122 | 123 | self.state[3] = self.state[3].rol(45); 124 | 125 | result 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /crates/thermite/src/runtime.rs: -------------------------------------------------------------------------------- 1 | /** 2 | Detects processor architecture at runtime and generates a type definition for the current SIMD instruction-set to be passed into the given code-block. 3 | 4 | The code block given is duplicated, manually monomorphised, to give the type definition to it. 
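Each match arm of the generated `match` defines the detected backend (currently only AVX2 on x86/x86_64) as a local `type` alias with the given name before running the block.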
5 | 6 | ```ignore 7 | fn my_algorithm(x: &mut [f32]) { 8 | assert!(x.len() >= Vf32::::NUM_ELEMENTS); 9 | 10 | Vf32::::load_unaligned(x).sin().store_unaligned(x); 11 | } 12 | 13 | let mut values = vec![0.5; 8]; 14 | 15 | dispatch_dyn!({ my_algorithm::(&mut values) }); 16 | 17 | // or with a custom generic parameter name: 18 | 19 | dispatch_dyn!(ISA, { my_algorithm::(&mut values) }); 20 | ``` 21 | */ 22 | #[macro_export] 23 | macro_rules! dispatch_dyn { 24 | ($code:block) => { 25 | dispatch_dyn!(S, $code) 26 | }; 27 | ($s:ident, $code:block) => {{ 28 | use $crate::{backends, Simd, SimdInstructionSet}; 29 | 30 | match SimdInstructionSet::runtime_detect() { 31 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 32 | SimdInstructionSet::AVX2 => { 33 | type $s = backends::avx2::AVX2; 34 | $code 35 | } 36 | _ => unsafe { $crate::unreachable_unchecked() }, 37 | } 38 | }}; 39 | } 40 | -------------------------------------------------------------------------------- /crates/thermite/tests/counts.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use thermite::*; 4 | 5 | type Vi32 = ::Vi32; 6 | type Vu32 = ::Vu32; 7 | type Vu64 = ::Vu64; 8 | type Vf64 = ::Vf64; 9 | type Vf32 = ::Vf32; 10 | type Vi64 = ::Vi64; 11 | 12 | #[test] 13 | fn test_popcnt_32bit() { 14 | for i in -1000..1000 { 15 | let x = Vi32::splat(i) * (Vi32::indexed() + Vi32::one()); 16 | 17 | let bits = x.count_ones(); 18 | 19 | for j in 0..Vi32::NUM_ELEMENTS { 20 | let x = x.extract(j); 21 | let b = bits.extract(j) as u32; 22 | 23 | assert_eq!(x.count_ones(), b, "0b{:b} {} == {}", x, x.count_ones(), b); 24 | } 25 | } 26 | } 27 | 28 | #[test] 29 | fn test_popcnt_64bit() { 30 | for i in -1000..1000 { 31 | let x = Vi64::splat(i) * (Vi64::indexed() + Vi64::one()); 32 | 33 | let bits = x.count_ones(); 34 | 35 | for j in 0..Vi64::NUM_ELEMENTS { 36 | let x = x.extract(j); 37 | let b = bits.extract(j) as u32; 38 | 39 | assert_eq!(x.count_ones(), b, "0b{:b} {} == {}", x, x.count_ones(), b); 40 | } 41 | } 42 | } 43 | 44 | #[test] 45 | fn test_tzc_64bit() { 46 | for i in -1000..1000 { 47 | let x = Vi64::splat(i) * (Vi64::indexed() + Vi64::one()); 48 | 49 | let bits = x.trailing_zeros(); 50 | 51 | for j in 0..Vi64::NUM_ELEMENTS { 52 | let x = x.extract(j); 53 | let b = bits.extract(j) as u32; 54 | 55 | assert_eq!(x.trailing_zeros(), b, "0b{:b} {} == {}", x, x.trailing_zeros(), b); 56 | } 57 | } 58 | } 59 | 60 | #[test] 61 | fn test_tzc_32bit() { 62 | for i in -1000..1000 { 63 | let x = Vi32::splat(i) * (Vi32::indexed() + Vi32::one()); 64 | 65 | let bits = x.trailing_zeros(); 66 | 67 | for j in 0..Vi32::NUM_ELEMENTS { 68 | let x = x.extract(j); 69 | let b = bits.extract(j) as u32; 70 | 71 | assert_eq!(x.trailing_zeros(), b, "0b{:b} {} == {}", x, x.trailing_zeros(), b); 72 | } 73 | } 74 | } 75 | 76 | #[test] 77 | fn test_lzc_64bit() { 78 | for i in -1000..1000 { 79 | let x = Vi64::splat(i) * (Vi64::indexed() + Vi64::one()); 80 | 81 | let bits = x.leading_zeros(); 82 | 83 | for j in 0..Vi64::NUM_ELEMENTS { 84 | let x = x.extract(j); 85 | let b = bits.extract(j) as u32; 86 | 87 | assert_eq!(x.leading_zeros(), b, "0b{:b} {} == {}", x, x.leading_zeros(), b); 88 | } 89 | } 90 | } 91 | 92 | #[test] 93 | fn test_lzc_32bit() { 94 | for i in -1000..1000 { 95 | let x = Vi32::splat(i) * (Vi32::indexed() + Vi32::one()); 96 | 97 | let bits = x.leading_zeros(); 98 | 99 | for j in 0..Vi32::NUM_ELEMENTS { 100 | let x = x.extract(j); 101 | let b = bits.extract(j) as u32; 102 | 103 | 
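// each SIMD lane must agree with the scalar `leading_zeros` of the same element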
assert_eq!(x.leading_zeros(), b, "0b{:b} {} == {}", x, x.leading_zeros(), b); 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /crates/thermite/tests/reverse.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use thermite::*; 4 | 5 | type Vi32 = ::Vi32; 6 | type Vu32 = ::Vu32; 7 | type Vu64 = ::Vu64; 8 | type Vf64 = ::Vf64; 9 | type Vf32 = ::Vf32; 10 | type Vi64 = ::Vi64; 11 | 12 | #[test] 13 | fn test_bitreversal_32bit() { 14 | for i in -1000..1000 { 15 | let x = Vi32::splat(i) * (Vi32::indexed() + Vi32::one()); 16 | 17 | let y = x.reverse_bits(); 18 | 19 | for j in 0..Vi32::NUM_ELEMENTS { 20 | let x = x.extract(j).reverse_bits(); 21 | let y = y.extract(j); 22 | 23 | assert_eq!(x, y); 24 | } 25 | } 26 | } 27 | 28 | #[test] 29 | fn test_bitreversal_64bit() { 30 | for i in -1000..1000 { 31 | let x = Vi64::splat(i) * (Vi64::indexed() + Vi64::one()); 32 | 33 | let y = x.reverse_bits(); 34 | 35 | for j in 0..Vi64::NUM_ELEMENTS { 36 | let x = x.extract(j).reverse_bits(); 37 | let y = y.extract(j); 38 | 39 | assert_eq!(x, y); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /crates/thermite/tests/sinh.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use thermite::*; 4 | 5 | type Vi32 = ::Vi32; 6 | type Vu32 = ::Vu32; 7 | type Vf32 = ::Vf32; 8 | type Vf64 = ::Vf64; 9 | 10 | #[test] 11 | fn test_powi() { 12 | let x = Vf32::splat(5.5); 13 | 14 | let y0 = x.reciprocal_p::(); 15 | let y1 = x.reciprocal_p::(); 16 | let y2 = x.reciprocal_p::(); 17 | let y3 = x.reciprocal_p::(); 18 | 19 | println!( 20 | "{} == {} == {} == {} == {}", 21 | 1.0 / x.extract(0), 22 | y0.extract(0), 23 | y1.extract(0), 24 | y2.extract(0), 25 | y3.extract(0), 26 | ); 27 | } 28 | -------------------------------------------------------------------------------- /crates/thermite2/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite2" 3 | version = "0.1.1-alpha.0" 4 | license = "MIT OR Apache-2.0" 5 | readme = "README.md" 6 | authors = ["novacrazy "] 7 | repository = "https://github.com/raygon-renderer/thermite" 8 | documentation = "https://raygon-renderer.github.io/thermite/" 9 | edition = "2018" 10 | 11 | [features] 12 | default = ["alloc", "math", "rng", "emulate_fma", "static_init"] 13 | # neon = ["thermite-dispatch/neon"] 14 | # wasm32 = ["thermite-dispatch/wasm32"] 15 | alloc = [] 16 | nightly = [] 17 | math = [] 18 | rng = [] 19 | emulate_fma = [] 20 | 21 | [dependencies] 22 | # thermite-dispatch = { path = "../dispatch" } 23 | paste = "1" 24 | half = "1.6.0" 25 | 26 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies.static_init] 27 | version = "1" 28 | optional = true 29 | default_features = false 30 | 31 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] 32 | core_detect = "1.0.0" 33 | 34 | # [dev-dependencies] 35 | # criterion = "0.3" 36 | # libm = "0.2.1" 37 | # plotly = "0.6.0" 38 | # rand = "0.8" 39 | # rand_xoshiro = "0.6.0" 40 | # no-panic = "0.1" 41 | # thermite-special = { path = "../thermite-special" } 42 | # thermite-complex = { path = "../thermite-complex" } 43 | # num-complex = "0.4" 44 | 45 | # [[bench]] 46 | # name = "main" 47 | # harness = false 48 | 49 | 
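# NOTE (assumption): `core_detect` presumably backs `SimdInstructionSet::runtime_detect()`
# with no_std-compatible CPU feature detection on x86/x86_64.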
-------------------------------------------------------------------------------- /crates/thermite2/src/backends/avx2/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | arch::avx2::*, 3 | backends::{register::*, vector::Vector}, 4 | widen::Widen, 5 | Simd, SimdInstructionSet, 6 | }; 7 | 8 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 9 | pub struct AVX2; 10 | 11 | pub mod polyfills; 12 | 13 | pub mod vf32; 14 | 15 | impl Simd for AVX2 { 16 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::AVX2; 17 | 18 | type Vf32 = Self::Vf32x8; 19 | 20 | type Vf32x1 = (); // TODO: wrapped scalar float 21 | type Vf32x2 = (); // TODO: half a 128-bit register 22 | type Vf32x4 = Vector>; 23 | type Vf32x8 = Vector>; 24 | type Vf32x16 = Widen; //2x wider 25 | } 26 | 27 | pub struct AVX2F32Register([(); N]); 28 | pub struct AVX2F64Register([(); N]); 29 | pub struct AVX2U32Register([(); N]); 30 | pub struct AVX2U64Register([(); N]); 31 | pub struct AVX2I32Register([(); N]); 32 | pub struct AVX2I64Register([(); N]); 33 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/avx2/polyfills.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raygon-renderer/thermite/55cbee2b3cb127e09749e711501114bbef3ed118/crates/thermite2/src/backends/avx2/polyfills.rs -------------------------------------------------------------------------------- /crates/thermite2/src/backends/avx2/vf32.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::polyfills::float_rem; 2 | 3 | use super::*; 4 | 5 | #[rustfmt::skip] 6 | unsafe impl Register for AVX2F32Register<4> { 7 | type Element = f32; 8 | type Storage = __m128; 9 | 10 | #[inline(always)] unsafe fn set1(x: f32) -> __m128 { _mm_set1_ps(x) } 11 | } 12 | 13 | #[rustfmt::skip] 14 | unsafe impl Register for AVX2F32Register<8> { 15 | type Element = f32; 16 | type Storage = __m256; 17 | 18 | #[inline(always)] unsafe fn set1(x: f32) -> __m256 { _mm256_set1_ps(x) } 19 | } 20 | 21 | unsafe impl FixedRegister<4> for AVX2F32Register<4> { 22 | #[inline(always)] 23 | unsafe fn setr(values: [f32; 4]) -> __m128 { 24 | core::mem::transmute(values) 25 | } 26 | } 27 | 28 | unsafe impl FixedRegister<8> for AVX2F32Register<8> { 29 | #[inline(always)] 30 | unsafe fn setr(values: [f32; 8]) -> __m256 { 31 | core::mem::transmute(values) 32 | } 33 | } 34 | 35 | unsafe impl UnaryRegisterOps for AVX2F32Register 36 | where 37 | Self: BinaryRegisterOps, 38 | { 39 | #[inline(always)] 40 | unsafe fn bit_not(r: Self::Storage) -> Self::Storage { 41 | Self::bitxor(r, Self::set1(f32::from_bits(!0))) 42 | } 43 | } 44 | 45 | #[rustfmt::skip] 46 | unsafe impl BinaryRegisterOps for AVX2F32Register<4> { 47 | #[inline(always)] unsafe fn bitand(lhs: __m128, rhs: __m128) -> __m128 { _mm_and_ps(lhs, rhs) } 48 | #[inline(always)] unsafe fn bitor(lhs: __m128, rhs: __m128) -> __m128 { _mm_or_ps(lhs, rhs) } 49 | #[inline(always)] unsafe fn bitxor(lhs: __m128, rhs: __m128) -> __m128 { _mm_xor_ps(lhs, rhs) } 50 | #[inline(always)] unsafe fn and_not(lhs: __m128, rhs: __m128) -> __m128 { _mm_andnot_ps(lhs, rhs) } 51 | #[inline(always)] unsafe fn add(lhs: __m128, rhs: __m128) -> __m128 { _mm_add_ps(lhs, rhs) } 52 | #[inline(always)] unsafe fn sub(lhs: __m128, rhs: __m128) -> __m128 { _mm_sub_ps(lhs, rhs) } 53 | #[inline(always)] unsafe fn mul(lhs: __m128, rhs: __m128) -> __m128 { 
_mm_mul_ps(lhs, rhs) } 54 | #[inline(always)] unsafe fn div(lhs: __m128, rhs: __m128) -> __m128 { _mm_div_ps(lhs, rhs) } 55 | #[inline(always)] unsafe fn rem(lhs: __m128, rhs: __m128) -> __m128 { float_rem::(lhs, rhs) } 56 | } 57 | 58 | #[rustfmt::skip] 59 | unsafe impl BinaryRegisterOps for AVX2F32Register<8> { 60 | #[inline(always)] unsafe fn bitand(lhs: __m256, rhs: __m256) -> __m256 { _mm256_and_ps(lhs, rhs) } 61 | #[inline(always)] unsafe fn bitor(lhs: __m256, rhs: __m256) -> __m256 { _mm256_or_ps(lhs, rhs) } 62 | #[inline(always)] unsafe fn bitxor(lhs: __m256, rhs: __m256) -> __m256 { _mm256_xor_ps(lhs, rhs) } 63 | #[inline(always)] unsafe fn and_not(lhs: __m256, rhs: __m256) -> __m256 { _mm256_andnot_ps(lhs, rhs) } 64 | #[inline(always)] unsafe fn add(lhs: __m256, rhs: __m256) -> __m256 { _mm256_add_ps(lhs, rhs) } 65 | #[inline(always)] unsafe fn sub(lhs: __m256, rhs: __m256) -> __m256 { _mm256_sub_ps(lhs, rhs) } 66 | #[inline(always)] unsafe fn mul(lhs: __m256, rhs: __m256) -> __m256 { _mm256_mul_ps(lhs, rhs) } 67 | #[inline(always)] unsafe fn div(lhs: __m256, rhs: __m256) -> __m256 { _mm256_div_ps(lhs, rhs) } 68 | #[inline(always)] unsafe fn rem(lhs: __m256, rhs: __m256) -> __m256 { float_rem::(lhs, rhs) } 69 | } 70 | 71 | unsafe impl SignedRegisterOps for AVX2F32Register 72 | where 73 | Self: BinaryRegisterOps, 74 | { 75 | #[inline(always)] 76 | unsafe fn neg(x: Self::Storage) -> Self::Storage { 77 | Self::bitxor(x, Self::set1(-0.0)) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn abs(x: Self::Storage) -> Self::Storage { 82 | Self::bitand(x, Self::set1(f32::from_bits(0x7fffffff))) 83 | } 84 | } 85 | 86 | #[rustfmt::skip] 87 | unsafe impl FloatRegisterOps for AVX2F32Register<4> { 88 | #[inline(always)] unsafe fn ceil(x: __m128) -> __m128 { _mm_ceil_ps(x) } 89 | #[inline(always)] unsafe fn floor(x: __m128) -> __m128 { _mm_floor_ps(x) } 90 | #[inline(always)] unsafe fn round(x: __m128) -> __m128 { _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) } 91 | #[inline(always)] unsafe fn trunc(x: __m128) -> __m128 { _mm_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) } 92 | 93 | #[inline(always)] unsafe fn sqrt(x: __m128) -> __m128 { _mm_sqrt_ps(x) } 94 | #[inline(always)] unsafe fn rsqrt(x: __m128) -> __m128 { _mm_rsqrt_ps(x) } 95 | #[inline(always)] unsafe fn rcp(x: __m128) -> __m128 { _mm_rcp_ps(x) } 96 | 97 | #[inline(always)] unsafe fn mul_add(x: __m128, m: __m128, a: __m128) -> __m128 { _mm_fmadd_ps(x, m, a) } 98 | #[inline(always)] unsafe fn mul_sub(x: __m128, m: __m128, a: __m128) -> __m128 { _mm_fmsub_ps(x, m, a) } 99 | #[inline(always)] unsafe fn nmul_add(x: __m128, m: __m128, a: __m128) -> __m128 { _mm_fnmadd_ps(x, m, a) } 100 | #[inline(always)] unsafe fn nmul_sub(x: __m128, m: __m128, a: __m128) -> __m128 { _mm_fnmsub_ps(x, m, a) } 101 | } 102 | 103 | #[rustfmt::skip] 104 | unsafe impl FloatRegisterOps for AVX2F32Register<8> { 105 | #[inline(always)] unsafe fn ceil(x: __m256) -> __m256 { _mm256_ceil_ps(x) } 106 | #[inline(always)] unsafe fn floor(x: __m256) -> __m256 { _mm256_floor_ps(x) } 107 | #[inline(always)] unsafe fn round(x: __m256) -> __m256 { _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) } 108 | #[inline(always)] unsafe fn trunc(x: __m256) -> __m256 { _mm256_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) } 109 | 110 | #[inline(always)] unsafe fn sqrt(x: __m256) -> __m256 { _mm256_sqrt_ps(x) } 111 | #[inline(always)] unsafe fn rsqrt(x: __m256) -> __m256 { _mm256_rsqrt_ps(x) } 112 | #[inline(always)] unsafe fn rcp(x: __m256) 
-> __m256 { _mm256_rcp_ps(x) } 113 | 114 | #[inline(always)] unsafe fn mul_add(x: __m256, m: __m256, a: __m256) -> __m256 { _mm256_fmadd_ps(x, m, a) } 115 | #[inline(always)] unsafe fn mul_sub(x: __m256, m: __m256, a: __m256) -> __m256 { _mm256_fmsub_ps(x, m, a) } 116 | #[inline(always)] unsafe fn nmul_add(x: __m256, m: __m256, a: __m256) -> __m256 { _mm256_fnmadd_ps(x, m, a) } 117 | #[inline(always)] unsafe fn nmul_sub(x: __m256, m: __m256, a: __m256) -> __m256 { _mm256_fnmsub_ps(x, m, a) } 118 | } 119 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod register; 2 | pub mod vector; 3 | 4 | pub mod polyfills; 5 | 6 | pub mod avx2; 7 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/polyfills.rs: -------------------------------------------------------------------------------- 1 | use super::register::{BinaryRegisterOps, FloatRegisterOps}; 2 | 3 | #[inline(always)] 4 | pub const fn _mm_shuffle(w: i32, z: i32, y: i32, x: i32) -> i32 { 5 | (w << 6) | (z << 4) | (y << 2) | x 6 | } 7 | 8 | // https://stackoverflow.com/a/26342944/2083075 + Bernard's comment 9 | #[inline(always)] 10 | pub unsafe fn float_rem(lhs: R::Storage, rhs: R::Storage) -> R::Storage 11 | where 12 | R: FloatRegisterOps + BinaryRegisterOps, 13 | { 14 | R::nmul_add(R::trunc(R::div(lhs, rhs)), rhs, lhs) 15 | } 16 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/register.rs: -------------------------------------------------------------------------------- 1 | pub unsafe trait Register { 2 | type Element: Clone + Copy; 3 | type Storage: Clone + Copy; 4 | 5 | unsafe fn set1(x: Self::Element) -> Self::Storage; 6 | } 7 | 8 | pub unsafe trait SimpleRegister: Register { 9 | unsafe fn load(ptr: *const Self::Element) -> Self::Storage; 10 | } 11 | 12 | pub unsafe trait FixedRegister: Register { 13 | unsafe fn setr(values: [Self::Element; N]) -> Self::Storage; 14 | } 15 | 16 | pub unsafe trait UnaryRegisterOps: Register { 17 | unsafe fn bit_not(r: Self::Storage) -> Self::Storage; 18 | } 19 | 20 | pub unsafe trait BinaryRegisterOps: UnaryRegisterOps { 21 | unsafe fn bitand(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 22 | unsafe fn bitor(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 23 | unsafe fn bitxor(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 24 | 25 | #[inline(always)] 26 | unsafe fn and_not(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage { 27 | Self::bitand(Self::bit_not(lhs), rhs) 28 | } 29 | 30 | unsafe fn add(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 31 | unsafe fn sub(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 32 | unsafe fn mul(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 33 | unsafe fn div(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 34 | unsafe fn rem(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 35 | } 36 | 37 | pub unsafe trait FloatRegisterOps: SignedRegisterOps + BinaryRegisterOps { 38 | unsafe fn round(x: Self::Storage) -> Self::Storage; 39 | unsafe fn ceil(x: Self::Storage) -> Self::Storage; 40 | unsafe fn floor(x: Self::Storage) -> Self::Storage; 41 | unsafe fn trunc(x: Self::Storage) -> Self::Storage; 42 | 43 | #[inline(always)] 44 | unsafe fn fract(x: Self::Storage) -> Self::Storage { 45 | Self::sub(x, 
Self::trunc(x)) 46 | } 47 | 48 | unsafe fn sqrt(x: Self::Storage) -> Self::Storage; 49 | unsafe fn rsqrt(x: Self::Storage) -> Self::Storage; 50 | unsafe fn rcp(x: Self::Storage) -> Self::Storage; 51 | 52 | unsafe fn mul_add(x: Self::Storage, m: Self::Storage, a: Self::Storage) -> Self::Storage; 53 | unsafe fn mul_sub(x: Self::Storage, m: Self::Storage, a: Self::Storage) -> Self::Storage; 54 | unsafe fn nmul_add(x: Self::Storage, m: Self::Storage, a: Self::Storage) -> Self::Storage; 55 | unsafe fn nmul_sub(x: Self::Storage, m: Self::Storage, a: Self::Storage) -> Self::Storage; 56 | } 57 | 58 | pub unsafe trait SignedRegisterOps: Register { 59 | unsafe fn neg(x: Self::Storage) -> Self::Storage; 60 | unsafe fn abs(x: Self::Storage) -> Self::Storage; 61 | } 62 | 63 | pub unsafe trait MaskRegisterOps: BinaryRegisterOps { 64 | #[inline(always)] 65 | unsafe fn blendv(mask: Self::Storage, t: Self::Storage, f: Self::Storage) -> Self::Storage { 66 | Self::bitor(Self::bitand(mask, t), Self::and_not(mask, f)) 67 | } 68 | 69 | unsafe fn all(mask: Self::Storage) -> bool; 70 | unsafe fn any(mask: Self::Storage) -> bool; 71 | 72 | #[inline(always)] 73 | unsafe fn none(mask: Self::Storage) -> bool { 74 | !Self::any(mask) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/vector.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use super::register::*; 4 | 5 | #[repr(transparent)] 6 | pub struct Vector(R::Storage); 7 | 8 | impl Clone for Vector { 9 | fn clone(&self) -> Self { 10 | *self 11 | } 12 | } 13 | 14 | impl Copy for Vector {} 15 | 16 | pub trait NumericElement { 17 | const ZERO: Self; 18 | const ONE: Self; 19 | const MIN_VALUE: Self; 20 | const MAX_VALUE: Self; 21 | } 22 | 23 | pub trait SignedElement: NumericElement { 24 | const NEG_ONE: Self; 25 | } 26 | 27 | pub trait FloatElement: SignedElement { 28 | const NEG_ZERO: Self; 29 | } 30 | 31 | macro_rules! 
impl_element { 32 | (NUMERIC $($i:ty),*) => {$( 33 | impl NumericElement for $i { 34 | const ZERO: Self = 0 as $i; 35 | const ONE: Self = 1 as $i; 36 | const MIN_VALUE: Self = <$i>::MIN; 37 | const MAX_VALUE: Self = <$i>::MAX; 38 | } 39 | )*}; 40 | 41 | (SIGNED $($i:ty),*) => {$( 42 | impl SignedElement for $i { 43 | const NEG_ONE: Self = -1 as $i; 44 | } 45 | )*}; 46 | 47 | (FLOAT $($i:ty),*) => {$( 48 | impl FloatElement for $i { 49 | const NEG_ZERO: Self = -0.0; 50 | } 51 | )*} 52 | } 53 | 54 | impl_element!(NUMERIC i8, i16, i32, i64, u8, u16, u32, u64, f32, f64); 55 | impl_element!(SIGNED i8, i16, i32, i64, f32, f64); 56 | impl_element!(FLOAT f32, f64); 57 | 58 | impl SimdVectorBase for Vector { 59 | type Element = ::Element; 60 | 61 | #[inline(always)] 62 | fn splat(value: Self::Element) -> Self { 63 | Vector(unsafe { R::set1(value) }) 64 | } 65 | } 66 | 67 | impl SimdFixedVector for Vector 68 | where 69 | R: FixedRegister, 70 | { 71 | #[inline(always)] 72 | fn set(values: [Self::Element; N]) -> Self { 73 | Vector(unsafe { R::setr(values) }) 74 | } 75 | } 76 | 77 | #[rustfmt::skip] 78 | impl SimdVector for Vector 79 | where 80 | R: BinaryRegisterOps, 81 | Self: SimdVectorBase, 82 | ::Element: NumericElement, 83 | { 84 | #[inline(always)] fn zero() -> Self { Self::splat(NumericElement::ZERO) } 85 | #[inline(always)] fn one() -> Self { Self::splat(NumericElement::ONE) } 86 | #[inline(always)] fn min_value() -> Self { Self::splat(NumericElement::MAX_VALUE) } 87 | #[inline(always)] fn max_value() -> Self { Self::splat(NumericElement::MIN_VALUE) } 88 | } 89 | 90 | #[rustfmt::skip] 91 | impl SimdSignedVector for Vector 92 | where 93 | R: SignedRegisterOps, 94 | Self: SimdVector, 95 | { 96 | #[inline(always)] fn abs(self) -> Self { Vector(unsafe { R::abs(self.0) }) } 97 | } 98 | 99 | #[rustfmt::skip] 100 | impl SimdFloatVector for Vector 101 | where 102 | R: FloatRegisterOps, 103 | Self: SimdVector, 104 | ::Element: FloatElement, 105 | { 106 | #[inline(always)] fn neg_one() -> Self { Self::splat(SignedElement::NEG_ONE) } 107 | #[inline(always)] fn neg_zero() -> Self { Self::splat(FloatElement::NEG_ZERO) } 108 | } 109 | 110 | macro_rules! 
impl_binary_op { 111 | (VECTOR $($op_trait:ident::$op:ident),*) => {$( 112 | impl $op_trait for Vector where R: BinaryRegisterOps { 113 | type Output = Self; 114 | #[inline(always)] fn $op(self, rhs: Self) -> Self { 115 | Vector(unsafe { R::$op(self.0, rhs.0) }) 116 | } 117 | } 118 | 119 | impl_binary_op!(ELEMENTS $op_trait::$op [i8, i16, i32, i64, u8, u16, u32, u64, f32, f64]); 120 | )*}; 121 | (ELEMENTS $op_trait:ident::$op:ident [$($t:ty),*]) => {$( 122 | impl $op_trait<$t> for Vector where R: Register + BinaryRegisterOps { 123 | type Output = Self; 124 | #[inline(always)] fn $op(self, rhs: $t) -> Self { 125 | Vector(unsafe { R::$op(self.0, R::set1(rhs)) }) 126 | } 127 | } 128 | 129 | //impl $op_trait> for $t where R: Register + BinaryRegisterOps { 130 | // type Output = Vector; 131 | // #[inline(always)] fn $op(self, rhs: Vector) -> Vector { 132 | // Vector(unsafe { R::$op(R::splat(self), rhs.0) }) 133 | // } 134 | //} 135 | )*} 136 | } 137 | 138 | impl_binary_op!(VECTOR Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 139 | -------------------------------------------------------------------------------- /crates/thermite2/src/iset.rs: -------------------------------------------------------------------------------- 1 | 2 | /// Enum of supported instruction sets 3 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 4 | #[repr(u8)] 5 | pub enum SimdInstructionSet { 6 | Scalar, 7 | 8 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 9 | SSE2, 10 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 11 | SSE42, 12 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 13 | AVX, 14 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 15 | AVX2, 16 | 17 | #[cfg(all(feature = "neon", any(target_arch = "arm", target_arch = "aarch64")))] 18 | NEON, 19 | 20 | #[cfg(all(feature = "wasm32", target_arch = "wasm32"))] 21 | WASM32, 22 | } 23 | 24 | impl SimdInstructionSet { 25 | #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "static_init"))] 26 | #[inline] 27 | pub fn runtime_detect() -> SimdInstructionSet { 28 | #[static_init::dynamic(0)] 29 | static SIS: SimdInstructionSet = SimdInstructionSet::runtime_detect_x86_internal(); 30 | 31 | unsafe { *SIS } 32 | } 33 | 34 | #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(feature = "static_init")))] 35 | pub fn runtime_detect() -> SimdInstructionSet { 36 | unsafe { 37 | static mut CACHED: Option = None; 38 | 39 | match CACHED { 40 | Some(value) => value, 41 | None => { 42 | // Allow this to race, they all converge to the same result 43 | let isa = Self::runtime_detect_x86_internal(); 44 | CACHED = Some(isa); 45 | isa 46 | } 47 | } 48 | } 49 | } 50 | 51 | #[cfg(all(feature = "neon", any(target_arch = "arm", target_arch = "aarch64")))] 52 | const fn runtime_detect() -> SimdInstructionSet { 53 | SimdInstructionSet::NEON 54 | } 55 | 56 | #[cfg(all(feature = "wasm32", target_arch = "wasm32"))] 57 | const fn runtime_detect() -> SimdInstructionSet { 58 | SimdInstructionSet::WASM32 59 | } 60 | 61 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 62 | fn runtime_detect_x86_internal() -> SimdInstructionSet { 63 | if core_detect::is_x86_feature_detected!("fma") { 64 | // TODO: AVX512 65 | if core_detect::is_x86_feature_detected!("avx2") { 66 | return SimdInstructionSet::AVX2; 67 | } 68 | } 69 | 70 | if core_detect::is_x86_feature_detected!("avx") { 71 | SimdInstructionSet::AVX 72 | } else if 
core_detect::is_x86_feature_detected!("sse4.2") { 73 | SimdInstructionSet::SSE42 74 | } else if core_detect::is_x86_feature_detected!("sse2") { 75 | SimdInstructionSet::SSE2 76 | } else { 77 | SimdInstructionSet::Scalar 78 | } 79 | } 80 | 81 | /// True fused multiply-add instructions are only used on AVX2 and above, so this checks for that ergonomically. 82 | pub const fn has_true_fma(self) -> bool { 83 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 84 | if self as u8 >= SimdInstructionSet::AVX2 as u8 { 85 | return true; 86 | } 87 | 88 | false 89 | } 90 | 91 | /// On older platforms, fused multiply-add instructions can be emulated (expensively), 92 | /// but only if the `"emulate_fma"` Cargo feature is enabled. 93 | pub const fn has_emulated_fma(self) -> bool { 94 | !self.has_true_fma() && cfg!(feature = "emulate_fma") 95 | } 96 | 97 | /// The number of general-purpose registers that can be expected to be allocated to algorithms 98 | pub const fn num_registers(self) -> usize { 99 | #[allow(unreachable_patterns)] 100 | match self { 101 | // #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 102 | // SimdInstructionSet::AVX512 => 32, 103 | 104 | // 105 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 106 | SimdInstructionSet::Scalar => 8, 107 | 108 | // x86 has at least 16 registers for xmms, ymms, zmms 109 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 110 | _ => 16, 111 | 112 | // 32x64-bit or 32x128-bit registers 113 | #[cfg(all(feature = "neon", any(target_arch = "arm", target_arch = "aarch64")))] 114 | SimdInstructionSet::NEON => 32, 115 | 116 | _ => 1, 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /crates/thermite2/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![no_std] 2 | // stdmind for f16c instructions, core_intrinsics for likely/unlikely 3 | #![cfg_attr(feature = "nightly", feature(stdsimd, core_intrinsics))] 4 | #![allow(unused_imports, non_camel_case_types, non_snake_case)] 5 | 6 | #[macro_use] 7 | mod macros; 8 | 9 | pub mod arch; 10 | 11 | pub mod backends; 12 | pub mod iset; 13 | pub mod widen; 14 | 15 | pub use iset::SimdInstructionSet; 16 | 17 | use core::{fmt::Debug, marker::PhantomData, mem, ops::*, ptr}; 18 | 19 | /// SIMD Instruction set, contains all types 20 | /// 21 | /// Take your time to look through this. All trait bounds contain methods and associated values which 22 | /// encapsulate all functionality for this crate. 23 | pub trait Simd: 'static + Debug + Send + Sync + Clone + Copy + PartialEq + Eq { 24 | const INSTRSET: SimdInstructionSet; 25 | 26 | /// Largest native single-precision floating point vector, occupies one register. 
27 | type Vf32; 28 | 29 | /// 32-bit single-precision floating point vector 30 | type Vf32x1; 31 | /// 64-bit single-precision floating point vector 32 | type Vf32x2; 33 | /// 128-bit single-precision floating point vector 34 | type Vf32x4: SimdFixedVector + SimdFloatVector + SimdOverloads; 35 | /// 256-bit single-precision floating point vector 36 | type Vf32x8; 37 | /// 512-bit single-precision floating point vector 38 | type Vf32x16; 39 | } 40 | 41 | pub trait SimdVectorBase: Clone + Copy { 42 | type Element; 43 | 44 | fn splat(value: Self::Element) -> Self; 45 | } 46 | 47 | pub trait SimdFixedVector: SimdVectorBase { 48 | fn set(values: [Self::Element; N]) -> Self; 49 | } 50 | 51 | pub trait SimdVector: SimdVectorBase + Add { 52 | fn zero() -> Self; 53 | fn one() -> Self; 54 | fn min_value() -> Self; 55 | fn max_value() -> Self; 56 | } 57 | 58 | pub trait SimdOverloads: 59 | SimdVectorBase 60 | + Add 61 | + Sub 62 | + Mul 63 | + Div 64 | + Rem 65 | { 66 | } 67 | 68 | impl SimdOverloads for T where 69 | T: SimdVectorBase 70 | + Add 71 | + Sub 72 | + Mul 73 | + Div 74 | + Rem 75 | { 76 | } 77 | 78 | pub trait SimdSignedVector: SimdVector { 79 | fn abs(self) -> Self; 80 | } 81 | 82 | pub trait SimdFloatVector: SimdSignedVector { 83 | fn neg_one() -> Self; 84 | fn neg_zero() -> Self; 85 | } 86 | -------------------------------------------------------------------------------- /crates/thermite2/src/macros.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | #[cfg(feature = "nightly")] 4 | pub use core::intrinsics::{likely, unlikely}; 5 | 6 | // borrows technique from https://github.com/rust-lang/hashbrown/pull/209 7 | #[cfg(not(feature = "nightly"))] 8 | #[inline] 9 | #[cold] 10 | fn cold() {} 11 | 12 | #[cfg(not(feature = "nightly"))] 13 | #[rustfmt::skip] 14 | #[inline(always)] 15 | pub unsafe fn likely(b: bool) -> bool { 16 | if !b { cold() } b 17 | } 18 | 19 | #[cfg(not(feature = "nightly"))] 20 | #[rustfmt::skip] 21 | #[inline(always)] 22 | pub unsafe fn unlikely(b: bool) -> bool { 23 | if b { cold() } b 24 | } 25 | 26 | #[doc(hidden)] 27 | #[macro_export] 28 | #[rustfmt::skip] 29 | macro_rules! thermite_likely { 30 | ($e:expr) => {{ 31 | #[allow(unused_unsafe)] 32 | unsafe { $crate::macros::likely($e) } 33 | }}; 34 | } 35 | 36 | #[doc(hidden)] 37 | #[macro_export] 38 | #[rustfmt::skip] 39 | macro_rules! thermite_unlikely { 40 | ($e:expr) => {{ 41 | #[allow(unused_unsafe)] 42 | unsafe { $crate::macros::unlikely($e) } 43 | }}; 44 | } 45 | -------------------------------------------------------------------------------- /crates/thermite2/src/widen.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | pub struct Widen { 4 | vectors: [V; N], 5 | _simd: PhantomData, 6 | } 7 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 120 --------------------------------------------------------------------------------
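A closing note on `widen.rs`: `Widen` stores `vectors: [V; N]`, and `AVX2` uses it for `Vf32x16` with the comment "2x wider", so a logical vector wider than the hardware register is represented as an array of narrower native vectors, with lane-wise operations presumably forwarded to each element of that array. The toy sketch below is not thermite2 code; the `Widened` type, its `halves` field, and the `f32` stand-ins are invented for illustration, but the forwarding pattern is the one such a wrapper needs.

```rust
use core::ops::Add;

// Illustrative only: a logical wide vector made of `N` narrower halves,
// in the same spirit as thermite2's `Widen { vectors: [V; N], .. }`.
#[derive(Clone, Copy, Debug, PartialEq)]
struct Widened<V, const N: usize> {
    halves: [V; N],
}

impl<V: Copy + Add<Output = V>, const N: usize> Add for Widened<V, N> {
    type Output = Self;

    // A lane-wise op on the wide vector is just the same op applied to each half.
    #[inline(always)]
    fn add(mut self, rhs: Self) -> Self {
        for i in 0..N {
            self.halves[i] = self.halves[i] + rhs.halves[i];
        }
        self
    }
}

fn main() {
    // `f32` stands in for a whole native register here; on AVX2 the two halves
    // of a 16-lane f32 vector would each be an 8-lane 256-bit vector.
    let a = Widened::<f32, 2> { halves: [1.0, 2.0] };
    let b = Widened::<f32, 2> { halves: [10.0, 20.0] };
    assert_eq!((a + b).halves, [11.0, 22.0]);
}
```

The real `Widen` also carries a `PhantomData` marker, presumably to tie the wrapper back to the owning `Simd` instruction set so the doubled-width type can still be named through the `Simd` trait, as `AVX2::Vf32x16` does.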