├── .github ├── FUNDING.yml └── workflows │ └── rustdoc.yml ├── .gitignore ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── README.md ├── crates ├── dispatch │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── thermite-complex │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── thermite-hyperdual │ ├── Cargo.toml │ └── src │ │ └── lib.rs ├── thermite-special │ ├── Cargo.toml │ └── src │ │ ├── bessel.rs │ │ ├── lib.rs │ │ ├── pd.rs │ │ └── ps.rs ├── thermite │ ├── Cargo.toml │ ├── benches │ │ └── main.rs │ ├── examples │ │ ├── asm.rs │ │ ├── geo │ │ │ └── mod.rs │ │ └── plot.rs │ ├── src │ │ ├── arch.rs │ │ ├── backends │ │ │ ├── aarch64 │ │ │ │ └── mod.rs │ │ │ ├── arm │ │ │ │ └── mod.rs │ │ │ ├── avx1 │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ ├── vf32.rs │ │ │ │ ├── vf64.rs │ │ │ │ ├── vi32.rs │ │ │ │ ├── vi32_2.rs │ │ │ │ ├── vi64.rs │ │ │ │ ├── vi64_2.rs │ │ │ │ ├── vu32.rs │ │ │ │ └── vu64.rs │ │ │ ├── avx2 │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ ├── vf32.rs │ │ │ │ ├── vf64.rs │ │ │ │ ├── vi16.rs │ │ │ │ ├── vi32.rs │ │ │ │ ├── vi64.rs │ │ │ │ ├── vu32.rs │ │ │ │ └── vu64.rs │ │ │ ├── macros.rs │ │ │ ├── mod.rs │ │ │ ├── polyfills.rs │ │ │ ├── scalar │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ ├── vf32.rs │ │ │ │ ├── vf64.rs │ │ │ │ ├── vi32.rs │ │ │ │ ├── vi64.rs │ │ │ │ ├── vu32.rs │ │ │ │ └── vu64.rs │ │ │ ├── sse2 │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ └── sse2.rs │ │ │ ├── sse42 │ │ │ │ ├── mod.rs │ │ │ │ ├── polyfills.rs │ │ │ │ ├── vf32.rs │ │ │ │ ├── vi32.rs │ │ │ │ └── vu32.rs │ │ │ └── wasm32 │ │ │ │ └── mod.rs │ │ ├── buffer.rs │ │ ├── divider.rs │ │ ├── element.rs │ │ ├── iter │ │ │ ├── aligned.rs │ │ │ ├── mod.rs │ │ │ └── slice.rs │ │ ├── lib.rs │ │ ├── macros.rs │ │ ├── mask.rs │ │ ├── math │ │ │ ├── compensated.rs │ │ │ ├── consts.rs │ │ │ ├── mod.rs │ │ │ ├── pd.rs │ │ │ ├── poly.rs │ │ │ └── ps.rs │ │ ├── pointer.rs │ │ ├── rng │ │ │ ├── mod.rs │ │ │ ├── pcg32.rs │ │ │ └── xoshiro.rs │ │ └── runtime.rs │ └── tests │ │ ├── counts.rs │ │ ├── reverse.rs │ │ └── sinh.rs └── thermite2 │ ├── Cargo.toml │ └── src │ ├── arch.rs │ ├── backends │ ├── avx2 │ │ ├── mod.rs │ │ ├── polyfills.rs │ │ └── vf32.rs │ ├── mod.rs │ ├── polyfills.rs │ ├── register.rs │ └── vector.rs │ ├── iset.rs │ ├── lib.rs │ ├── macros.rs │ └── widen.rs └── rustfmt.toml /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [novacrazy] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: raygon 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: raygon 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/workflows/rustdoc.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | # Controls when the action will run. 
Triggers the workflow on push or pull request 4 | # events but only for the master branch 5 | on: 6 | push: 7 | branches: [ master ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 13 | jobs: 14 | # This workflow contains a single job called "build" 15 | build: 16 | # The type of runner that the job will run on 17 | runs-on: ubuntu-latest 18 | 19 | # Steps represent a sequence of tasks that will be executed as part of the job 20 | steps: 21 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 22 | - uses: actions/checkout@v2 23 | 24 | - uses: actions-rs/toolchain@v1 25 | with: 26 | toolchain: stable 27 | 28 | - name: Build Documentation 29 | uses: actions-rs/cargo@v1 30 | with: 31 | command: doc 32 | toolchain: stable 33 | args: --no-deps 34 | 35 | - run: echo "" > target/doc/index.html 36 | 37 | - name: Deploy Documentation 38 | uses: peaceiris/actions-gh-pages@v3 39 | with: 40 | deploy_key: ${{ secrets.ACTIONS_DEPLOY_KEY }} 41 | publish_branch: gh-pages 42 | publish_dir: ./target/doc 43 | keep_files: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/target/criterion 3 | Cargo.lock 4 | .vscode -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["crates/*"] 3 | 4 | [profile.dev] 5 | opt-level = 2 6 | 7 | [profile.release] 8 | opt-level = 3 9 | lto = 'fat' 10 | codegen-units = 1 11 | 12 | [profile.bench] 13 | opt-level = 3 14 | lto = 'fat' 15 | codegen-units = 1 16 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright 2020 Developers of the Thermite project 2 | Copyright (c) 2014 The Rust Project Developers 3 | 4 | Permission is hereby granted, free of charge, to any 5 | person obtaining a copy of this software and associated 6 | documentation files (the "Software"), to deal in the 7 | Software without restriction, including without 8 | limitation the rights to use, copy, modify, merge, 9 | publish, distribute, sublicense, and/or sell copies of 10 | the Software, and to permit persons to whom the Software 11 | is furnished to do so, subject to the following 12 | conditions: 13 | 14 | The above copyright notice and this permission notice 15 | shall be included in all copies or substantial portions 16 | of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 19 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 20 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 21 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 22 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 23 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 24 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 25 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 | DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Thermite SIMD: Melt your CPU 2 | ============================ 3 | 4 | **NOTE: This crate is not yet on crates.io, but I do own the name and will publish it there when ready** 5 | 6 | Thermite is a WIP SIMD library focused on providing portable SIMD acceleration of SoA (Structure of Arrays) algorithms, using consistent-length1 SIMD vectors for lockstep iteration and computation. 7 | 8 | Thermite provides highly optimized **feature-rich backends** for SSE2, SSE4.2, AVX and AVX2, with planned support for AVX512, ARM/Aarch64 NEON, and WASM SIMD extensions. 9 | 10 | In addition to that, Thermite includes a highly optimized **vectorized math library** with many special math functions and algorithms, specialized for both single and double precision. 11 | 12 | 13 | 1 All vectors in an instruction set are the same length, regardless of size. 14 | 15 | 16 | # Current Status 17 | 18 | Refer to issue [#1](https://github.com/raygon-renderer/thermite/issues/1) 19 | 20 | # Motivation and Goals 21 | 22 | Thermite was conceived while working on Raygon renderer, when it was decided we needed a state of the art high-performance SIMD vector library focused on facilitating SoA algorithms. Using SIMD for AoS values was a nightmare, constantly shuffling vectors and performing unnecessary horizontal operations. We also weren't able to take advantage of AVX2 fully due to 3D vectors only using 3 or 4 lanes of a regular 128-bit register. 23 | 24 | Using SIMDeez, `faster`, or redesigning `packed_simd` were all considered, but each has their flaws. SIMDeez is rather limited in functionality, and their handling of `target_feature` leaves much to be desired. `faster` fits well into the SoA paradigm, but the iterator-based API is rather unwieldy, and it is lacking many features. `packed_simd` isn't bad, but it's also missing many features and relies on the Nightly-only `"platform-intrinsic"`s, which can produce suboptimal code in some cases. 
25 | 26 | Therefore, the only solution was to write my own, and thus Thermite was born. 27 | 28 | The primary goal of Thermite is to provide optimal codegen for every backend instruction set, and provide a consistent set of features on top of all of them, in such a way as to encourage using chunked SoA or AoSoA algorithms regardless of what data types you need. Furthermore, with the `#[dispatch]` macro, multiple instruction sets can be easily targeted within a single binary. 29 | 30 | # Features 31 | 32 | * SSE2, SSE4.2, AVX, AVX2 backends, with planned support for scalar, AVX512, WASM SIMD and ARM NEON backends. 33 | * Extensive built-in vectorized math library. 34 | * Compile-time policies to emphasize precision, performance or code size (useful for WASM). 35 | * Compile-time monomorphisation with runtime selection. 36 | * Aided by a `#[dispatch]` procedural macro to ensure optimal codegen. 37 | * Zero runtime overhead. 38 | * Operator overloading on vector types. 39 | * Abstracts over vector length, giving the same length to all vectors of an instruction set. 40 | * Provides fast polyfills where necessary to provide the same API across all instruction sets. 41 | * Highly optimized value cast routines between vector types where possible. 42 | * Dedicated mask wrapper type with low-cost bitwise vector conversions built-in. 43 | 44 | # Optimized Project Setup 45 | 46 | For optimal performance, ensure your `Cargo.toml` profiles look something like this: 47 | ```toml 48 | [profile.dev] 49 | opt-level = 2 # Required to inline SIMD intrinsics internally 50 | 51 | [profile.release] 52 | opt-level = 3 # Should be at least 2; level 1 will not use SIMD intrinsics 53 | lto = 'thin' # 'fat' LTO may also improve things, but will increase compile time 54 | codegen-units = 1 # Required for optimal inlining and optimizations 55 | 56 | # optional release options depending on your project and preference 57 | incremental = false # Release builds will take longer to compile, but inter-crate optimizations may work better 58 | panic = 'abort' # Very few functions in Thermite panic, but aborting will avoid the unwind mechanism overhead 59 | ``` 60 | 61 | # Misc. Usage Notes 62 | 63 | * Vectors with 64-bit elements are approximately 2-4x slower than 32-bit vectors. 64 | * Integer vectors are 2x slower on SSE2/AVX1, but nominal on SSE4.1 and AVX2. This compounds the first point. 65 | * Casting floats to signed integers is faster than to unsigned integers. 66 | * Equal-sized signed and unsigned integer vectors can be cast between each other at zero cost. 67 | * Operations mixing float and integer types can incur a 1-cycle penalty on most modern CPUs. 68 | * Integer division currently can only be done with a scalar fallback, so it's not recommended. 69 | * Dividing integer vectors by constant uniform divisors should use `SimdIntVector::div_const`. 70 | * When reusing masks for `all`/`any`/`none` queries, consider using the bitmask directly to avoid recomputing. 71 | * Avoid casting between differently-sized types in hot loops. 72 | * Avoid extracting and replacing elements. 73 | * LLVM will inline many math functions and const-eval as much as possible, but only if they are called in the same instruction-set context. 74 | 75 | # Cargo `--features` 76 | 77 | ### `alloc` (enabled by default) 78 | 79 | The `alloc` feature enables aligned allocation of buffers suitable for reading/writing with SIMD.
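As a rough illustration of how a buffer like this is consumed, the sketch below adapts `test_dynamic_dispatch` from `crates/thermite/examples/asm.rs`. It is not the canonical API surface: the `<S: Simd>` parameter and the `::<S>` turbofish are reconstructed from that example, and the chunked loop (in place of the example's fixed-length assert) is an added assumption.

```rust
use thermite::*;

// Runtime-dispatched SIMD over a slice, following `test_dynamic_dispatch`
// from `examples/asm.rs`: `#[dispatch]` monomorphizes the inner function for
// each supported instruction set, and `dispatch_dyn!` picks one at runtime.
pub fn exp2_buffer(values: &mut [f32]) {
    #[dispatch]
    fn exp2_in_place<S: Simd>(values: &mut [f32]) {
        // Assumption: `values.len()` is a multiple of the vector width
        // (the original example simply asserts a fixed length of 8).
        for chunk in values.chunks_exact_mut(Vf32::<S>::NUM_ELEMENTS) {
            Vf32::<S>::load_unaligned(chunk).exp2().store_unaligned(chunk);
        }
    }

    dispatch_dyn!({ exp2_in_place::<S>(values) })
}
```

Buffers allocated through `Vf32::alloc` (see `do_alloc` in the same example file) provide the alignment this feature exists for, so the aligned load/store paths in the backends can be used on them as well.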
80 | 81 | ### `nightly` 82 | 83 | The `nightly` feature enables nightly-only optimizations such as accelerated half-precision encoding/decoding. 84 | 85 | ### `math` (enabled by default) 86 | 87 | Enables the vectorized math modules. 88 | 89 | ### `rng` 90 | 91 | Enables the vectorized random number modules. 92 | 93 | ### `emulate_fma` 94 | 95 | Real fused multiply-add instructions are only enabled for AVX2 platforms. However, as FMA is used not only for performance but for its extended precision, falling back to a split multiply and addition will incur two rounding errors, and may be unacceptable for 96 | some applications. Therefore, the `emulate_fma` Cargo feature will enable a slower but more accurate implementation on older platforms. 97 | 98 | For single-precision floats, this is easiest done by simply casting them to double-precision, doing the multiply and addition separately, then casting back. For double-precision, it will use an infinite-precision implementation based on libm. 99 | 100 | On SSE2 platforms, double-precision may fall back to scalar ops, as the effort needed to make it branchless would cost more than the scalar fallback itself. As of writing, this has not been implemented, so benchmarks will reveal what is needed later. 101 | -------------------------------------------------------------------------------- /crates/dispatch/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite-dispatch" 3 | version = "0.1.0" 4 | authors = ["novacrazy "] 5 | edition = "2018" 6 | 7 | [lib] 8 | proc-macro = true 9 | 10 | [dependencies] 11 | quote = "1" 12 | proc-macro2 = "1" 13 | syn = { version = "1", features = ["full", "extra-traits", "visit-mut"] } 14 | 15 | [features] 16 | neon = [] 17 | wasm32 = [] -------------------------------------------------------------------------------- /crates/thermite-complex/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite-complex" 3 | version = "0.1.0" 4 | authors = ["novacrazy "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | thermite = { path = "../thermite" } -------------------------------------------------------------------------------- /crates/thermite-hyperdual/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite-hyperdual" 3 | version = "0.1.0" 4 | authors = ["novacrazy "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | thermite = { path = "../thermite" } -------------------------------------------------------------------------------- /crates/thermite-hyperdual/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![no_std] 2 | 3 | use thermite::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | ops::{Add, Div, Mul, Sub}, 9 | }; 10 | 11 | pub type Hyperdual = HyperdualP; 12 | pub type DualNumber = Hyperdual; 13 | 14 | pub struct HyperdualP, P: Policy, const N: usize> { 15 | /// Real part 16 | pub re: V, 17 | /// Dual parts 18 | pub du: [V; N], 19 | _simd: PhantomData<(S, P)>, 20 | } 21 | 22 | impl, P: Policy, const N: usize> Clone for HyperdualP { 23 | fn clone(&self) -> Self { 24 | *self 25 | } 26 | } 27 | 28 | impl, P: Policy, const N: usize> Copy for HyperdualP {} 29 | 30 |
impl, P: Policy, const N: usize> fmt::Debug for HyperdualP { 31 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 32 | f.debug_struct("HyperdualP") 33 | .field("re", &self.re) 34 | .field("du", &self.du) 35 | .finish() 36 | } 37 | } 38 | 39 | #[dispatch(S)] 40 | impl, P: Policy, const N: usize> HyperdualP { 41 | #[inline(always)] 42 | pub fn new(re: V, du: [V; N]) -> Self { 43 | Self { 44 | re, 45 | du, 46 | _simd: PhantomData, 47 | } 48 | } 49 | 50 | #[inline(always)] 51 | pub fn real(re: V) -> Self { 52 | Self::new(re, [V::zero(); N]) 53 | } 54 | 55 | #[inline(always)] 56 | pub fn one() -> Self { 57 | Self::real(V::one()) 58 | } 59 | 60 | #[inline(always)] 61 | pub fn zero() -> Self { 62 | Self::real(V::zero()) 63 | } 64 | 65 | #[inline(always)] 66 | pub fn map(mut self, f: F) -> Self 67 | where 68 | F: Fn(V) -> V, 69 | { 70 | self.map_dual(f(self.re), f) 71 | } 72 | 73 | #[inline(always)] 74 | pub fn map_dual(mut self, re: V, f: F) -> Self 75 | where 76 | F: Fn(V) -> V, 77 | { 78 | self.re = re; 79 | for dual in &mut self.du { 80 | *dual = f(*dual); 81 | } 82 | self 83 | } 84 | } 85 | 86 | #[dispatch(S)] 87 | impl, P: Policy, const N: usize> HyperdualP 88 | where 89 | V: SimdVectorizedMath, 90 | { 91 | #[inline(always)] 92 | fn div_dual(self, re: V, denom: V) -> Self { 93 | if N > 1 { 94 | let rcp = denom.reciprocal_p::
<P>
(); 95 | self.map_dual(re, |x| x * rcp) 96 | } else { 97 | self.map_dual(re, |x| x / denom) 98 | } 99 | } 100 | 101 | #[inline(always)] 102 | pub fn fract(mut self) -> Self { 103 | self.re = self.re.fract(); 104 | self 105 | } 106 | 107 | #[inline(always)] 108 | pub fn signum(self) -> Self { 109 | Self::real(self.re.signum()) 110 | } 111 | 112 | #[inline(always)] 113 | pub fn abs(self) -> Self { 114 | let signum = self.re.signum(); 115 | self.map(|x| x * signum) 116 | } 117 | 118 | #[inline(always)] 119 | pub fn select(mask: Mask, t: Self, f: Self) -> Self { 120 | let mut t = t; // Weird compiler bug 121 | for i in 0..N { 122 | t.du[i] = mask.select(t.du[i], f.du[i]); 123 | } 124 | t.re = mask.select(t.re, f.re); 125 | t 126 | } 127 | 128 | #[inline(always)] 129 | pub fn min(self, other: Self) -> Self { 130 | Self::select(self.re.lt(other.re), self, other) 131 | } 132 | 133 | #[inline(always)] 134 | pub fn max(mut self, other: Self) -> Self { 135 | Self::select(self.re.gt(other.re), self, other) 136 | } 137 | 138 | #[inline(always)] 139 | pub fn mul_add(mut self, m: Self, a: Self) -> Self { 140 | for i in 0..N { 141 | self.du[i] = self.du[i].mul_add(m.re, self.re.mul_add(m.du[i], a.du[i])); 142 | } 143 | self.re = self.re.mul_add(m.re, a.re); 144 | self 145 | } 146 | 147 | #[inline(always)] 148 | pub fn powi(self, n: i32) -> Self { 149 | let r = self.re.powi_p::
<P>
(n - 1); 150 | let nf = V::splat_as(n) * r; 151 | self.map_dual(self.re * r, |x| x * nf) 152 | } 153 | 154 | #[inline(always)] 155 | pub fn powf(mut self, n: Self) -> Self { 156 | let re_n1 = self.re.powf_p::
<P>
(n.re - V::one()); 157 | 158 | let re = re_n1 * self.re; // re^n 159 | 160 | let a = n.re * re_n1; // n * re^(n-1) 161 | let b = re * self.re.ln_p::
<P>
(); 162 | 163 | self.re = re; 164 | for i in 0..N { 165 | self.du[i] = a.mul_add(self.du[i], b * n.du[i]); 166 | } 167 | self 168 | } 169 | 170 | #[inline(always)] 171 | pub fn exp(self) -> Self { 172 | let re = self.re.exp_p::
<P>
(); 173 | self.map_dual(re, |x| re * x) 174 | } 175 | 176 | #[inline(always)] 177 | pub fn exp2(self) -> Self { 178 | let re = self.re.exp2_p::
<P>
(); 179 | let re_ln2 = V::LN_2() * re; 180 | self.map_dual(re, |x| x * re_ln2) 181 | } 182 | 183 | #[inline(always)] 184 | pub fn ln(self) -> Self { 185 | self.div_dual(self.re.ln_p::
<P>
(), self.re) 186 | } 187 | 188 | #[inline(always)] 189 | pub fn sqrt(self) -> Self { 190 | let re = self.re.sqrt(); 191 | self.div_dual(re, re + re) 192 | } 193 | 194 | #[inline(always)] 195 | pub fn cbrt(self) -> Self { 196 | let re = self.re.cbrt(); 197 | self.div_dual(re, re + re + re) 198 | } 199 | 200 | fn hypot(self, other: Self) -> Self { 201 | let c = self.re.hypot(other.re); 202 | let mut v = Self::real(c); 203 | 204 | let inv_c = c.reciprocal_p::
<P>
(); 205 | for i in 0..N { 206 | let x = self.du[i]; 207 | let y = other.du[i]; 208 | 209 | v.du[i] = self.re.mul_add(x, other.re * y); 210 | 211 | if N > 1 { 212 | v.du[i] *= inv_c; 213 | } else { 214 | v.du[i] /= c; 215 | } 216 | } 217 | 218 | v 219 | } 220 | 221 | #[inline(always)] 222 | pub fn sin_cos(self) -> (Self, Self) { 223 | let (s, c) = self.re.sin_cos_p::
<P>
(); 224 | 225 | let mut sine = self; 226 | let mut cosi = self; 227 | 228 | sine.re = s; 229 | cosi.re = c; 230 | for i in 0..N { 231 | sine.du[i] *= c; 232 | cosi.du[i] *= s; 233 | } 234 | 235 | (sine, cosi) 236 | } 237 | 238 | #[inline(always)] 239 | pub fn tan(self) -> Self { 240 | let t = self.re.tan_p::
<P>
(); 241 | let c = t.mul_add(t, V::one()); 242 | self.map_dual(t, |x| x * c) 243 | } 244 | 245 | #[inline(always)] 246 | pub fn asin(self) -> Self { 247 | let c = self.re.nmul_adde(self.re, V::one()).invsqrt_p::
<P>
(); 248 | self.map_dual(self.re.asin(), |x| x * c) 249 | } 250 | 251 | #[inline(always)] 252 | pub fn acos(self) -> Self { 253 | let c = self.re.nmul_adde(self.re, V::one()).invsqrt_p::
<P>
().neg(); 254 | self.map_dual(self.re.acos(), |x| x * c) 255 | } 256 | 257 | #[inline(always)] 258 | pub fn atan(self) -> Self { 259 | let c = self.re.mul_adde(self.re, V::one()); 260 | self.div_dual(self.re.atan(), c) 261 | } 262 | 263 | pub fn atan2(self, x: Self) -> Self { 264 | let y = self; 265 | let c = y.re.mul_add(y.re, x.re * x.re); 266 | 267 | let mut v = Self::real(y.re.atan2(x.re)); 268 | 269 | let inv_c = c.reciprocal_p::
<P>
(); 270 | for i in 0..N { 271 | v.du[i] = x.re.mul_sub(y.du[i], y.re * x.du[i]) * c; 272 | 273 | if N > 1 { 274 | v.du[i] *= inv_c; 275 | } else { 276 | v.du[i] /= c; 277 | } 278 | } 279 | 280 | v 281 | } 282 | 283 | #[inline(always)] 284 | pub fn sinh_cosh(self) -> (Self, Self) { 285 | let s = self.re.sinh_p::
<P>
(); 286 | let c = self.re.cosh_p::
<P>
(); 287 | (self.map_dual(s, |x| x * c), self.map_dual(c, |x| x * s)) 288 | } 289 | 290 | #[inline(always)] 291 | pub fn tanh(self) -> Self { 292 | let re = self.re.tanh_p::
<P>
(); 293 | let c = re.nmul_add(re, V::one()); // 1 - r^2 294 | self.map_dual(re, |x| x * c) 295 | } 296 | } 297 | 298 | #[dispatch(S)] 299 | impl, P: Policy, const N: usize> Add for HyperdualP { 300 | type Output = Self; 301 | 302 | #[inline(always)] 303 | fn add(mut self, rhs: Self) -> Self { 304 | self.re += rhs.re; 305 | for i in 0..N { 306 | self.du[i] += rhs.du[i]; 307 | } 308 | self 309 | } 310 | } 311 | 312 | #[dispatch(S)] 313 | impl, P: Policy, const N: usize> Sub for HyperdualP { 314 | type Output = Self; 315 | 316 | #[inline(always)] 317 | fn sub(mut self, rhs: Self) -> Self { 318 | self.re -= rhs.re; 319 | for i in 0..N { 320 | self.du[i] -= rhs.du[i]; 321 | } 322 | self 323 | } 324 | } 325 | 326 | #[dispatch(S)] 327 | impl, P: Policy, const N: usize> Mul for HyperdualP { 328 | type Output = Self; 329 | 330 | #[inline(always)] 331 | fn mul(mut self, rhs: Self) -> Self { 332 | for i in 0..N { 333 | self.du[i] = self.re.mul_add(rhs.du[i], rhs.re * self.du[i]); 334 | } 335 | self.re *= rhs.re; 336 | self 337 | } 338 | } 339 | 340 | #[dispatch(S)] 341 | impl, P: Policy, const N: usize> Div for HyperdualP 342 | where 343 | V: SimdVectorizedMath, 344 | { 345 | type Output = Self; 346 | 347 | #[inline(always)] 348 | fn div(mut self, rhs: Self) -> Self { 349 | let d = self.re * rhs.re; 350 | 351 | let inv_d = d.reciprocal_p::
<P>
(); 352 | for i in 0..N { 353 | self.du[i] = rhs.re.mul_sub(self.du[i], self.re * rhs.du[i]) * d; 354 | 355 | if N > 1 { 356 | self.du[i] *= inv_d; 357 | } else { 358 | self.du[i] /= d; 359 | } 360 | } 361 | self.re /= rhs.re; 362 | self 363 | } 364 | } 365 | -------------------------------------------------------------------------------- /crates/thermite-special/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite-special" 3 | version = "0.1.0" 4 | authors = ["novacrazy "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | thermite = { path = "../thermite" } 11 | thermite-complex = { path = "../thermite-complex" } -------------------------------------------------------------------------------- /crates/thermite-special/src/ps.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | const EULERS_CONSTANT: f32 = 5.772156649015328606065120900824024310e-01; 4 | 5 | impl SimdVectorizedSpecialFunctionsInternal for f32 6 | where 7 | ::Vf32: SimdFloatVector, 8 | { 9 | #[inline(always)] 10 | fn tgamma(mut z: Self::Vf) -> Self::Vf { 11 | let zero = Vf32::::zero(); 12 | let one = Vf32::::one(); 13 | let half = Vf32::::splat(0.5); 14 | let quarter = Vf32::::splat(0.25); 15 | let pi = Vf32::::PI(); 16 | 17 | let orig_z = z; 18 | 19 | let is_neg = z.is_negative(); 20 | let mut reflected = Mask::falsey(); 21 | 22 | let mut res = one; 23 | 24 | 'goto_positive: while is_neg.any() { 25 | reflected = z.le(Vf32::::splat(-20.0)); 26 | 27 | let mut refl_res = unsafe { Vf32::::undefined() }; 28 | 29 | // sine is expensive, so branch for it. 30 | if P::POLICY.avoid_precision_branches() || thermite_unlikely!(reflected.any()) { 31 | refl_res = >::sin_pix::
<P>
(z); 32 | 33 | // If not branching, all negative values are reflected 34 | if P::POLICY.avoid_precision_branches() { 35 | reflected = is_neg; 36 | 37 | res = reflected.select(refl_res, res); 38 | z = z.conditional_neg(reflected); 39 | 40 | break 'goto_positive; 41 | } 42 | 43 | // NOTE: I chose not to use a bitmask here, because some bitmasks can be 44 | // one extra instruction than the raw call to `all` again, and since z <= -20 is so rare, 45 | // that extra instruction is not worth it. 46 | if reflected.all() { 47 | res = refl_res; 48 | z = -z; 49 | 50 | break 'goto_positive; 51 | } 52 | } 53 | 54 | let mut mod_z = z; 55 | let mut is_neg = is_neg; 56 | 57 | // recursively apply Γ(z+1)/z 58 | while is_neg.any() { 59 | res = is_neg.select(res / mod_z, res); 60 | mod_z = mod_z.conditional_add(one, is_neg); 61 | is_neg = mod_z.is_negative(); 62 | } 63 | 64 | z = reflected.select(-z, mod_z); 65 | res = reflected.select(refl_res, res); 66 | 67 | break 'goto_positive; 68 | } 69 | 70 | // label 71 | //positive: 72 | 73 | // Integers 74 | 75 | let mut z_int = Mask::falsey(); 76 | let mut fact_res = one; 77 | 78 | if P::POLICY.precision > PrecisionPolicy::Worst { 79 | let zf = z.floor(); 80 | z_int = zf.eq(z); 81 | 82 | let bitmask = z_int.bitmask(); 83 | 84 | if thermite_unlikely!(bitmask.any()) { 85 | let mut j = one; 86 | let mut k = j.lt(zf); 87 | 88 | while k.any() { 89 | fact_res = k.select(fact_res * j, fact_res); 90 | j += one; 91 | k = j.lt(zf); 92 | } 93 | 94 | // Γ(-int) = NaN for poles 95 | fact_res = is_neg.select(Vf32::::nan(), fact_res); 96 | // approaching zero from either side results in +/- infinity 97 | fact_res = orig_z.eq(zero).select(Vf32::::infinity().copysign(orig_z), fact_res); 98 | 99 | if bitmask.all() { 100 | return fact_res; 101 | } 102 | } 103 | } 104 | 105 | // Full 106 | 107 | let gh = Vf32::::splat(LANCZOS_G - 0.5); 108 | 109 | let lanczos_sum = z.poly_rational_p::
<P>
(LANCZOS_P, LANCZOS_Q); 110 | 111 | let zgh = z + gh; 112 | let lzgh = zgh.ln_p::
<P>
(); 113 | 114 | // (z * lzfg) > ln(f32::MAX) 115 | let very_large = (z * lzgh).gt(Vf32::::splat( 116 | 88.722839053130621324601674778549183073943430402325230485234240247, 117 | )); 118 | 119 | // only compute powf once 120 | let h = zgh.powf_p::
<P>
(very_large.select(z.mul_sube(half, quarter), z - half)); 121 | 122 | // save a couple cycles by avoiding this division, but worst-case precision is slightly worse 123 | let denom = if P::POLICY.precision >= PrecisionPolicy::Best { 124 | lanczos_sum / zgh.exp_p::
<P>
() 125 | } else { 126 | lanczos_sum * (-zgh).exp_p::
<P>
() 127 | }; 128 | 129 | let normal_res = very_large.select(h * h, h) * denom; 130 | 131 | // Tiny 132 | if P::POLICY.precision >= PrecisionPolicy::Best { 133 | let is_tiny = z.lt(Vf32::::splat( 134 | >::__SQRT_EPSILON, 135 | )); 136 | let tiny_res = z.reciprocal_p::
<P>
() - Vf32::::splat(EULERS_CONSTANT); 137 | res *= is_tiny.select(tiny_res, normal_res); 138 | } else { 139 | res *= normal_res; 140 | } 141 | 142 | reflected.select(-pi / res, z_int.select(fact_res, res)) 143 | } 144 | 145 | #[inline(always)] 146 | fn lgamma(mut z: Self::Vf) -> Self::Vf { 147 | let one = Vf32::::one(); 148 | let zero = Vf32::::zero(); 149 | 150 | let reflect = z.lt(zero); 151 | 152 | let mut t = one; 153 | 154 | if P::POLICY.avoid_branching || reflect.any() { 155 | t = reflect.select(>::sin_pix::
<P>
(z).abs(), one); 156 | z = z.conditional_neg(reflect); 157 | } 158 | 159 | let gh = Vf32::::splat(LANCZOS_G - 0.5); 160 | 161 | let mut lanczos_sum = z.poly_rational_p::
<P>
(LANCZOS_P_EXPG_SCALED, LANCZOS_Q); 162 | 163 | // Full A 164 | let mut a = (z + gh).ln_p::
<P>
() - one; 165 | 166 | // Tiny 167 | if P::POLICY.precision >= PrecisionPolicy::Best { 168 | let is_not_tiny = z.ge(Vf32::::splat_as( 169 | >::__SQRT_EPSILON, 170 | )); 171 | let tiny_res = z.reciprocal_p::
<P>
() - Vf32::::splat(EULERS_CONSTANT); 172 | 173 | // shove the tiny result into the log down below 174 | lanczos_sum = is_not_tiny.select(lanczos_sum, tiny_res); 175 | // force multiplier to zero for tiny case, allowing the modified 176 | // lanczos sum and ln(t) to be combined for cheap 177 | a &= is_not_tiny.value(); 178 | } 179 | 180 | // Full 181 | 182 | let b = z - Vf32::::splat(0.5); 183 | let c = (lanczos_sum * t).ln_p::
<P>
(); 184 | 185 | let mut res = a.mul_adde(b, c); 186 | 187 | let ln_pi = Vf32::::LN_PI(); 188 | 189 | res = reflect.select(ln_pi - res, res); 190 | 191 | res 192 | } 193 | 194 | #[inline(always)] 195 | fn digamma(mut x: Self::Vf) -> Self::Vf { 196 | let zero = Vf32::::zero(); 197 | let one = Vf32::::one(); 198 | let half = Vf32::::splat(0.5); 199 | let pi = Vf32::::PI(); 200 | 201 | let mut result = zero; 202 | 203 | let reflect = x.le(Vf32::::neg_one()); 204 | 205 | if reflect.any() { 206 | x = reflect.select(one - x, x); 207 | 208 | let mut rem = x - x.floor(); 209 | 210 | rem = rem.conditional_sub(one, rem.gt(half)); 211 | 212 | let (s, c) = (rem * pi).sin_cos_p::
<P>
(); 213 | let refl_res = pi * c / s; 214 | 215 | result = reflect.select(refl_res, result); 216 | } 217 | 218 | let lim = Vf32::::splat( 219 | 0.5 * (10 + ((>::__DIGITS as i64 - 50) * 240) / 950) as f32, 220 | ); 221 | 222 | // Rescale to use asymptotic expansion 223 | let mut is_small = x.lt(lim); 224 | while is_small.any() { 225 | result = result.conditional_sub(x.reciprocal_p::
<P>
(), is_small); 226 | x = x.conditional_add(one, is_small); 227 | is_small = x.lt(lim); 228 | } 229 | 230 | x -= one; 231 | 232 | let inv_x = x.reciprocal_p::
<P>
(); 233 | 234 | let z = inv_x * inv_x; 235 | let a = x.ln_p::
<P>
() + (inv_x * half); 236 | 237 | let y = z.poly_p::
<P>
(&[ 238 | 0.083333333333333333333333333333333333333333333333333, 239 | -0.0083333333333333333333333333333333333333333333333333, 240 | 0.003968253968253968253968253968253968253968253968254, 241 | ]); 242 | 243 | result += z.nmul_adde(y, a); 244 | 245 | result 246 | } 247 | 248 | #[inline(always)] 249 | fn beta(a: Self::Vf, b: Self::Vf) -> Self::Vf { 250 | let zero = Vf32::::zero(); 251 | 252 | let is_valid = a.gt(zero) & b.gt(zero); 253 | 254 | if P::POLICY.check_overflow && !P::POLICY.avoid_branching { 255 | if is_valid.none() { 256 | return Vf32::::nan(); 257 | } 258 | } 259 | 260 | let c = a + b; 261 | 262 | // if a < b then swap 263 | let (a, b) = (a.max(b), a.min(b)); 264 | 265 | let mut result = a.poly_rational_p::
<P>
(LANCZOS_P_EXPG_SCALED, LANCZOS_Q) 266 | * (b.poly_rational_p::
<P>
(LANCZOS_P_EXPG_SCALED, LANCZOS_Q) 267 | / c.poly_rational_p::
<P>
(LANCZOS_P_EXPG_SCALED, LANCZOS_Q)); 268 | 269 | let gh = Vf32::::splat(LANCZOS_G - 0.5); 270 | 271 | let agh = a + gh; 272 | let bgh = b + gh; 273 | let cgh = c + gh; 274 | 275 | let agh_d_cgh = agh / cgh; 276 | let bgh_d_cgh = bgh / cgh; 277 | let agh_p_bgh = agh * bgh; 278 | let cgh_p_cgh = cgh * cgh; 279 | 280 | let base = cgh 281 | .gt(Vf32::::splat(1e10)) 282 | .select(agh_d_cgh * bgh_d_cgh, agh_p_bgh / cgh_p_cgh); 283 | 284 | let denom = if P::POLICY.precision > PrecisionPolicy::Average { 285 | Vf32::::SQRT_E() / bgh.sqrt() 286 | } else { 287 | // bump up the precision a little to improve beta function accuracy 288 | Vf32::::SQRT_E() * bgh.invsqrt_p::>() 289 | }; 290 | 291 | result *= agh_d_cgh.powf_p::
<P>
(a - Vf32::::splat(0.5) - b) * (base.powf_p::
<P>
(b) * denom); 292 | 293 | if P::POLICY.check_overflow { 294 | result = is_valid.select(result, Vf32::::nan()); 295 | } 296 | 297 | result 298 | } 299 | } 300 | 301 | const LANCZOS_G: f32 = 1.428456135094165802001953125; 302 | 303 | const LANCZOS_P: &[f32] = &[ 304 | 58.52061591769095910314047740215847630266, 305 | 182.5248962595894264831189414768236280862, 306 | 211.0971093028510041839168287718170827259, 307 | 112.2526547883668146736465390902227161763, 308 | 27.5192015197455403062503721613097825345, 309 | 2.50662858515256974113978724717473206342, 310 | ]; 311 | 312 | const LANCZOS_Q: &[f32] = &[0.0, 24.0, 50.0, 35.0, 10.0, 1.0]; 313 | 314 | const LANCZOS_P_EXPG_SCALED: &[f32] = &[ 315 | 14.0261432874996476619570577285003839357, 316 | 43.74732405540314316089531289293124360129, 317 | 50.59547402616588964511581430025589038612, 318 | 26.90456680562548195593733429204228910299, 319 | 6.595765571169314946316366571954421695196, 320 | 0.6007854010515290065101128585795542383721, 321 | ]; 322 | -------------------------------------------------------------------------------- /crates/thermite/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite" 3 | version = "0.1.1-alpha.0" 4 | license = "MIT OR Apache-2.0" 5 | readme = "README.md" 6 | authors = ["novacrazy "] 7 | repository = "https://github.com/raygon-renderer/thermite" 8 | documentation = "https://raygon-renderer.github.io/thermite/" 9 | edition = "2018" 10 | 11 | [features] 12 | default = ["alloc", "math", "rng", "emulate_fma", "static_init"] 13 | neon = ["thermite-dispatch/neon"] 14 | wasm32 = ["thermite-dispatch/wasm32"] 15 | alloc = [] 16 | nightly = [] 17 | math = [] 18 | rng = [] 19 | emulate_fma = [] 20 | 21 | [dependencies] 22 | thermite-dispatch = { path = "../dispatch" } 23 | paste = "1" 24 | half = "1.6.0" 25 | 26 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies.static_init] 27 | version = "1" 28 | optional = true 29 | default_features = false 30 | 31 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] 32 | core_detect = "0.1.0" 33 | 34 | [dev-dependencies] 35 | criterion = "0.3" 36 | libm = "0.2.1" 37 | plotly = "0.6.0" 38 | rand = "0.8" 39 | rand_xoshiro = "0.6.0" 40 | no-panic = "0.1" 41 | thermite-special = { path = "../thermite-special" } 42 | thermite-complex = { path = "../thermite-complex" } 43 | num-complex = "0.4" 44 | 45 | [[bench]] 46 | name = "main" 47 | harness = false 48 | 49 | -------------------------------------------------------------------------------- /crates/thermite/examples/asm.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | // NOTE: This example only exists to be compiled and inspected as assembly via the command: 4 | // `cargo rustc --example asm --release -- -C target-feature=+sse2 --emit asm` 5 | // It's easier to access the example output in the `target/release/examples` directory 6 | 7 | use no_panic::no_panic; 8 | 9 | use thermite::*; 10 | use thermite_special::*; 11 | 12 | pub mod geo; 13 | 14 | use thermite::backends::avx2::AVX2; 15 | use thermite::rng::SimdRng; 16 | 17 | type Vf32 = ::Vf32; 18 | type Vf64 = ::Vf64; 19 | type Vi32 = ::Vi32; 20 | type Vu64 = ::Vu64; 21 | type Vu32 = ::Vu32; 22 | type Vi64 = ::Vi64; 23 | 24 | type Vector3xN = geo::Vector3xN; 25 | 26 | type Xoshiro128Plus = thermite::rng::xoshiro::Xoshiro128Plus; 27 | 28 | #[no_mangle] 29 | #[inline(never)] 30 | pub fn 
test_dynamic_dispatch(value: &mut [f32]) { 31 | assert_eq!(value.len(), 8); 32 | 33 | #[dispatch] 34 | fn test(value: &mut [f32]) { 35 | thermite::Vf32::::load_unaligned(value).exp2().store_unaligned(value); 36 | } 37 | 38 | dispatch_dyn!({ test::(value) }) 39 | } 40 | 41 | #[no_mangle] 42 | #[inline(never)] 43 | #[target_feature(enable = "avx2,fma")] 44 | pub unsafe fn test_simdrng(rng: &mut Xoshiro128Plus) -> Vf64 { 45 | rng.next_f64() 46 | } 47 | 48 | #[no_mangle] 49 | #[inline(never)] 50 | #[target_feature(enable = "avx2,fma")] 51 | pub unsafe fn test_revbits(x: Vi32) -> Vi32 { 52 | x.reverse_bits() 53 | } 54 | 55 | #[no_mangle] 56 | #[inline(never)] 57 | #[target_feature(enable = "avx2,fma")] 58 | pub unsafe fn test_normalize(v: &mut Vector3xN) { 59 | *v = v.normalize() 60 | } 61 | 62 | #[no_mangle] 63 | #[inline(never)] 64 | #[target_feature(enable = "avx2,fma")] 65 | pub unsafe fn test_u64div(a: Vu64, b: Vu64) -> Vu64 { 66 | a / b 67 | } 68 | 69 | #[no_mangle] 70 | #[inline(never)] 71 | #[target_feature(enable = "avx2,fma")] 72 | pub unsafe fn test_bitmask(b: u16) -> Vu64 { 73 | Mask::from_bitmask(b).value() 74 | } 75 | 76 | #[no_mangle] 77 | #[inline(never)] 78 | #[target_feature(enable = "avx2,fma")] 79 | pub unsafe fn test_cross(a: Vector3xN, b: Vector3xN) -> Vector3xN { 80 | a.cross(&b) 81 | } 82 | 83 | #[no_mangle] 84 | #[inline(never)] 85 | #[target_feature(enable = "avx2,fma")] 86 | pub unsafe fn do_alloc(count: usize) -> VectorBuffer { 87 | Vf32::alloc(count) 88 | } 89 | 90 | #[no_mangle] 91 | #[inline(never)] 92 | #[target_feature(enable = "avx2,fma")] 93 | pub unsafe fn test_powf_ps(y: Vf32, x: Vf32) -> Vf32 { 94 | y.powf(x) 95 | } 96 | 97 | #[no_mangle] 98 | #[inline(never)] 99 | #[target_feature(enable = "avx2,fma")] 100 | pub unsafe fn test_powf_pd(y: Vf64, x: Vf64) -> Vf64 { 101 | y.powf(x) 102 | } 103 | 104 | #[no_mangle] 105 | #[inline(never)] 106 | #[target_feature(enable = "avx2,fma")] 107 | pub unsafe fn test_smootheststep(x: Vf32) -> Vf32 { 108 | x.smootheststep() 109 | } 110 | 111 | #[no_mangle] 112 | #[inline(never)] 113 | //#[target_feature(enable = "avx2,fma")] 114 | pub unsafe fn test_pdsin(x: Vf64) -> Vf64 { 115 | x.sin() 116 | } 117 | 118 | #[no_mangle] 119 | #[inline(never)] 120 | #[target_feature(enable = "avx2,fma")] 121 | pub unsafe fn test_pssin_cos(x: Vf32) -> (Vf32, Vf32) { 122 | x.sin_cos_p::() 123 | } 124 | 125 | #[no_mangle] 126 | #[inline(never)] 127 | #[target_feature(enable = "avx2,fma")] 128 | pub unsafe fn test_select_neg_ps(x: Vf32, a: Vf32, b: Vf32) -> Vf32 { 129 | x.is_negative().select(a, b) 130 | } 131 | 132 | #[no_mangle] 133 | #[inline(never)] 134 | #[target_feature(enable = "avx2,fma")] 135 | pub unsafe fn test_select_neg_epi32(x: Vi32, a: Vi32, b: Vi32) -> Vi32 { 136 | x.is_negative().select(a, b) 137 | } 138 | 139 | #[no_mangle] 140 | #[inline(never)] 141 | #[target_feature(enable = "avx2,fma")] 142 | #[no_panic] 143 | pub unsafe fn test_shuffle(x: Vf64, y: Vf64) -> Vf64 { 144 | match Vf64::NUM_ELEMENTS { 145 | 4 => shuffle!(x, y, [6, 2, 1, 7]), 146 | 8 => shuffle!(x, y, [5, 6, 10, 9, 2, 8, 6, 4]), 147 | _ => unimplemented!(), 148 | } 149 | } 150 | 151 | #[no_mangle] 152 | #[inline(never)] 153 | #[target_feature(enable = "avx2,fma")] 154 | pub unsafe fn test_shuffle_dyn_unchecked(a: Vf32, b: Vf32, indices: &[usize]) -> Vf32 { 155 | a.shuffle_dyn_unchecked(b, indices) 156 | } 157 | 158 | //#[no_mangle] 159 | //#[inline(never)] 160 | //#[target_feature(enable = "avx2,fma")] 161 | //pub unsafe fn test_shuffle_dyn(x: Vf32, y: Vf32, 
indices: &[usize; 8]) -> Vf32 { 162 | // x.shuffle(y, &indices[..]) 163 | //} 164 | 165 | #[no_mangle] 166 | #[inline(never)] 167 | //#[target_feature(enable = "avx2,fma")] 168 | pub unsafe fn test_pstgamma(x: Vf32) -> Vf32 { 169 | x.tgamma_p::() 170 | } 171 | 172 | #[no_mangle] 173 | #[inline(never)] 174 | //#[target_feature(enable = "avx2,fma")] 175 | pub unsafe fn test_pdtgamma(x: Vf64) -> Vf64 { 176 | x.tgamma() 177 | } 178 | 179 | #[no_mangle] 180 | #[inline(never)] 181 | #[target_feature(enable = "avx2,fma")] 182 | pub unsafe fn test_pserf(x: Vf32) -> Vf32 { 183 | x.erf() 184 | } 185 | 186 | #[no_mangle] 187 | #[inline(never)] 188 | pub unsafe fn test_psexp(x: Vf32) -> Vf32 { 189 | x.exp() 190 | } 191 | 192 | #[no_mangle] 193 | #[inline(never)] 194 | #[target_feature(enable = "avx2,fma")] 195 | pub unsafe fn test_pderfinv(x: Vf64) -> Vf64 { 196 | x.erfinv() 197 | } 198 | 199 | #[no_mangle] 200 | #[inline(never)] 201 | #[target_feature(enable = "avx2,fma")] 202 | pub unsafe fn test_pscbrt(x: Vf32) -> Vf32 { 203 | x.cbrt() 204 | } 205 | 206 | //#[no_mangle] 207 | //#[inline(never)] 208 | //#[target_feature(enable = "avx2,fma")] 209 | //pub unsafe fn test_ps_bessel_y4(x: Vf32) -> Vf32 { 210 | // x.bessel_y_p::(4) 211 | //} 212 | 213 | #[no_mangle] 214 | #[inline(never)] 215 | #[target_feature(enable = "avx2,fma")] 216 | pub unsafe fn test_poly(x: Vf32, e: &[f32]) -> Vf32 { 217 | x.poly_f(128, |i| Vf32::splat(*e.get_unchecked(i))) 218 | } 219 | 220 | #[no_mangle] 221 | #[inline(never)] 222 | #[target_feature(enable = "avx2,fma")] 223 | pub unsafe fn test_rational_poly(x: Vf32, e: &[f32], d: &[f32]) -> Vf32 { 224 | let n0 = x.poly_f(19, |i| Vf32::splat(*e.get_unchecked(i))); 225 | let n1 = x.poly_f(19, |i| Vf32::splat(*d.get_unchecked(i))); 226 | 227 | n0 / n1 228 | } 229 | 230 | #[no_mangle] 231 | #[inline(never)] 232 | #[target_feature(enable = "avx2,fma")] 233 | pub unsafe fn test_rational_poly2(x: Vf32, e: &[f32], d: &[f32]) -> Vf32 { 234 | assert!(e.len() == 19 && e.len() == d.len()); 235 | 236 | x.poly_rational_p::(e, d) 237 | } 238 | 239 | #[no_mangle] 240 | #[inline(never)] 241 | #[target_feature(enable = "avx2,fma")] 242 | pub unsafe fn test_poly2(x: Vf32) -> Vf32 { 243 | x.poly_f(128, |i| { 244 | Vf32::splat((-1.0f32).powi(i as i32) * (2f32.powi(i as i32) - i as f32)) 245 | }) 246 | } 247 | 248 | #[no_mangle] 249 | #[inline(never)] 250 | #[target_feature(enable = "avx2,fma")] 251 | pub unsafe fn test_pdcbrt(x: Vf64) -> Vf64 { 252 | x.cbrt() 253 | } 254 | 255 | #[no_mangle] 256 | #[inline(never)] 257 | #[target_feature(enable = "avx2,fma")] 258 | pub unsafe fn test_pdsinh(x: Vf64) -> Vf64 { 259 | x.sinh_p::() 260 | } 261 | 262 | #[no_mangle] 263 | #[inline(never)] 264 | #[target_feature(enable = "avx2,fma")] 265 | pub unsafe fn test_pssinh(x: Vf32) -> Vf32 { 266 | x.sinh_p::() 267 | } 268 | 269 | #[no_mangle] 270 | #[inline(never)] 271 | #[target_feature(enable = "avx2,fma")] 272 | pub unsafe fn test_jacobi(x: Vf32, alpha: Vf32, beta: Vf32, n: u32, m: u32) -> Vf32 { 273 | x.legendre(50, 0) 274 | } 275 | 276 | #[no_mangle] 277 | #[inline(never)] 278 | #[target_feature(enable = "avx2,fma")] 279 | pub unsafe fn test_cast2(x: Vf64) -> Vi64 { 280 | x.cast() 281 | } 282 | 283 | fn main() {} 284 | -------------------------------------------------------------------------------- /crates/thermite/examples/geo/mod.rs: -------------------------------------------------------------------------------- 1 | use thermite::*; 2 | 3 | #[derive(Debug, Clone, Copy)] 4 | pub struct Vector3xN { 5 | pub 
x: Vf32, 6 | pub y: Vf32, 7 | pub z: Vf32, 8 | } 9 | 10 | impl Vector3xN { 11 | pub fn dot(&self, other: &Self) -> S::Vf32 { 12 | self.x.mul_add(other.x, self.y.mul_add(other.y, self.z * other.z)) 13 | } 14 | 15 | pub fn cross(&self, other: &Self) -> Self { 16 | Self { 17 | x: self.y.mul_sub(other.z, self.z * other.y), 18 | y: self.z.mul_sub(other.x, self.x * other.z), 19 | z: self.x.mul_sub(other.y, self.y * other.x), 20 | } 21 | } 22 | 23 | pub fn norm_squared(&self) -> S::Vf32 { 24 | self.dot(self) 25 | } 26 | 27 | pub fn norm(&self) -> S::Vf32 { 28 | self.norm_squared().sqrt() 29 | } 30 | 31 | pub fn normalize(&self) -> Self { 32 | let inv_norm = self.norm_squared().invsqrt_p::(); 33 | 34 | Self { 35 | x: self.x * inv_norm, 36 | y: self.y * inv_norm, 37 | z: self.z * inv_norm, 38 | } 39 | } 40 | } 41 | 42 | #[derive(Debug, Clone, Copy)] 43 | pub struct Matrix4xN { 44 | pub m: [[S::Vf32; 4]; 4], 45 | } 46 | 47 | impl Matrix4xN { 48 | pub fn at(&self, row: usize, col: usize) -> &S::Vf32 { 49 | &self.m[col][row] 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /crates/thermite/examples/plot.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use thermite::*; 4 | 5 | pub mod geo; 6 | 7 | use thermite::backends::avx2::AVX2; 8 | 9 | type Vf32 = ::Vf32; 10 | type Vf64 = ::Vf64; 11 | type Vi32 = ::Vi32; 12 | type Vu64 = ::Vu64; 13 | type Vu32 = ::Vu32; 14 | type Vi64 = ::Vi64; 15 | 16 | use plotly::common::{ColorScale, ColorScalePalette, DashType, Fill, Font, Line, LineShape, Marker, Mode, Title}; 17 | use plotly::layout::{Axis, BarMode, Layout, Legend, TicksDirection}; 18 | use plotly::{Bar, NamedColor, Plot, Rgb, Rgba, Scatter}; 19 | 20 | fn plot_function(name: &str, x_axis: &Vec, plot: &mut Plot, mut f: F) 21 | where 22 | F: FnMut(Vf32) -> Vf32, 23 | { 24 | let mut y_axis = vec![0.0; x_axis.len()]; 25 | 26 | for (src, dst) in x_axis 27 | .chunks(Vf32::NUM_ELEMENTS) 28 | .zip(y_axis.chunks_mut(Vf32::NUM_ELEMENTS)) 29 | { 30 | f(Vf32::load_unaligned(src)) 31 | //.clamp(Vf32::splat(-400.0), Vf32::splat(400.0)) 32 | .store_unaligned(dst); 33 | } 34 | 35 | plot.add_trace(Scatter::new(x_axis.clone(), y_axis).mode(Mode::Lines).name(name)); 36 | } 37 | 38 | fn main() { 39 | let num_points = Vf32::NUM_ELEMENTS * 1000; 40 | 41 | let x_axis: Vec = (0..num_points) 42 | .into_iter() 43 | .map(|x| (x as f32 / num_points as f32) * 30.0 - 15.0) 44 | .collect(); 45 | 46 | let layout = Layout::new().title(Title::new("Gamma function")); 47 | let mut plot = Plot::new(); 48 | 49 | //for i in 0..5 { 50 | // plot_function(&format!("Y{}", i), &x_axis, &mut plot, |x| { 51 | // x.bessel_y_p::(i) 52 | // }); 53 | //} 54 | 55 | //plot_function("cos(x) [Precision]", &x_axis, &mut plot, |x| { 56 | // x.cos_p::() 57 | //}); 58 | //plot_function("cos(x) [Reference]", &x_axis, &mut plot, |x| { 59 | // x.cos_p::() 60 | //}); 61 | // 62 | //plot_function("sin(x) [Precision]", &x_axis, &mut plot, |x| { 63 | // x.sin_p::() 64 | //}); 65 | //plot_function("sin(x) [Reference]", &x_axis, &mut plot, |x| { 66 | // x.sin_p::() 67 | //}); 68 | 69 | //plot_function("tgamma(x)", &x_axis, &mut plot, |x| x.tgamma()); 70 | //plot_function("lgamma(x)", &x_axis, &mut plot, |x| x.lgamma()); 71 | //plot_function("ln(tgamma(x))", &x_axis, &mut plot, |x| x.tgamma().ln()); 72 | //plot_function("diff*1000", &x_axis, &mut plot, |x| { 73 | // (x.tgamma().ln() - x.lgamma()) * Vf32::splat(1000.0) 74 | //}); 75 | 76 | 
//plot_function("digamma(x)", &x_axis, &mut plot, |x| x.digamma()); 77 | 78 | /* 79 | plot_function("Gamma Avg", &x_axis, &mut plot, |x| x.tgamma()); 80 | plot_function("Gamma Worst", &x_axis, &mut plot, |x| { 81 | x.tgamma_p::() 82 | }); 83 | 84 | plot_function("Diffx100", &x_axis, &mut plot, |x| { 85 | (x.tgamma() - x.tgamma_p::()) * Vf32::splat(100.0) 86 | }); 87 | */ 88 | 89 | plot_function("Ln Avg", &x_axis, &mut plot, |x| x.ln()); 90 | plot_function("Ln Worst", &x_axis, &mut plot, |x| { 91 | x.ln_p::() 92 | }); 93 | 94 | plot_function("Diffx100", &x_axis, &mut plot, |x| { 95 | (x.ln() - x.ln_p::()) * Vf32::splat(100.0) 96 | }); 97 | 98 | /* 99 | for i in 0..5 { 100 | plot_function(&format!("beta(x, {}) [UP]", i), &x_axis, &mut plot, |x| { 101 | x.beta_p::(Vf32::splat_as(i + 1)) 102 | }); 103 | } 104 | 105 | for i in 0..5 { 106 | plot_function(&format!("beta(x, {}) [Precision]", i), &x_axis, &mut plot, |x| { 107 | x.beta_p::(Vf32::splat_as(i + 1)) 108 | }); 109 | } 110 | */ 111 | 112 | plot.show(); 113 | } 114 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/aarch64/mod.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/arm/mod.rs: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx1/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | use crate::arch::avx::*; 13 | 14 | use half::f16; 15 | 16 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 17 | pub struct AVX1; 18 | 19 | #[macro_use] 20 | pub(crate) mod polyfills; 21 | 22 | use polyfills::*; 23 | 24 | /* 25 | mod vf32; 26 | mod vf64; 27 | mod vi32; 28 | mod vi32_2; 29 | mod vi64; 30 | //mod vi64_2; 31 | mod vu32; 32 | mod vu64; 33 | 34 | pub use vf32::*; 35 | pub use vf64::*; 36 | pub use vi32::*; 37 | pub use vi64::*; 38 | pub use vu32::*; 39 | pub use vu64::*; 40 | 41 | type Vi32 = i32x8; 42 | type Vi64 = i64x8; 43 | type Vu32 = u32x8; 44 | type Vu64 = u64x8; 45 | type Vf32 = f32x8; 46 | type Vf64 = f64x8; 47 | 48 | impl Simd for AVX1 { 49 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::AVX; 50 | 51 | type Vi32 = Vi32; 52 | type Vi64 = Vi64; 53 | type Vu32 = Vu32; 54 | type Vu64 = Vu64; 55 | type Vf32 = Vf32; 56 | type Vf64 = Vf64; 57 | 58 | #[cfg(target_pointer_width = "32")] 59 | type Vusize = Vu32; 60 | 61 | #[cfg(target_pointer_width = "32")] 62 | type Visize = Vi32; 63 | 64 | #[cfg(target_pointer_width = "64")] 65 | type Vusize = Vu64; 66 | 67 | #[cfg(target_pointer_width = "64")] 68 | type Visize = Vi64; 69 | } 70 | */ 71 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx1/polyfills.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx1/vi32_2.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(i32x8: i32 => [__m128i; 2]); 4 | impl Default for i32x8 { 5 | 
#[inline(always)] 6 | fn default() -> Self { 7 | Self::new([unsafe { _mm_setzero_si128() }; 2]) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for i32x8 { 12 | type Element = i32; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(unsafe { [_mm_set1_epi32(value); 2] }) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new([_mm_undefined_si128(); 2]) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new([_mm_load_si128(src as *const _), _mm_load_si128(src.add(4) as *const _)]) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | let src = src as *const _; 32 | Self::new([_mm_loadu_si128(src), _mm_loadu_si128(src.add(1))]) 33 | } 34 | 35 | #[inline(always)] 36 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 37 | let dst = dst as *mut _; 38 | _mm_store_si128(dst, self.value[0]); 39 | _mm_store_si128(dst.add(1), self.value[1]); 40 | } 41 | 42 | #[inline(always)] 43 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 44 | let dst = dst as *mut _; 45 | _mm_storeu_si128(dst, self.value[0]); 46 | _mm_storeu_si128(dst.add(1), self.value[1]); 47 | } 48 | 49 | decl_base_common!(#[target_feature(enable = "avx,fma")] i32x8: i32 => __m256i); 50 | } 51 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx1/vi64_2.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(i64x8: i64 => [__m128i; 4]); 4 | impl Default for i64x8 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new([unsafe { _mm_setzero_si128() }; 4]) 8 | } 9 | } 10 | 11 | impl i64x8 { 12 | #[inline(always)] 13 | fn mapv(mut self, f: F) -> Self 14 | where 15 | F: Fn(__m128i, usize) -> __m128i, 16 | { 17 | for i in 0..4 { 18 | self.value[i] = f(self.value[i], i); 19 | } 20 | self 21 | } 22 | 23 | #[inline(always)] 24 | fn zipv(mut self, b: Self, f: F) -> Self 25 | where 26 | F: Fn(__m128i, __m128i) -> __m128i, 27 | { 28 | self.mapv(|a, i| f(a, b.value[i])) 29 | } 30 | } 31 | 32 | impl SimdVectorBase for i64x8 { 33 | type Element = i64; 34 | 35 | #[inline(always)] 36 | fn splat(value: Self::Element) -> Self { 37 | Self::new(unsafe { [_mm_set1_epi64x(value); 4] }) 38 | } 39 | 40 | #[inline(always)] 41 | unsafe fn undefined() -> Self { 42 | Self::new([_mm_undefined_si128(); 4]) 43 | } 44 | 45 | #[inline(always)] 46 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 47 | Self::undefined().mapv(|_, i| _mm_load_si128((src as *const __m128i).add(i))) 48 | } 49 | 50 | #[inline(always)] 51 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 52 | Self::undefined().mapv(|_, i| _mm_loadu_si128((src as *const __m128i).add(i))) 53 | } 54 | 55 | #[inline(always)] 56 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 57 | for i in 0..4 { 58 | _mm_store_si128((dst as *mut __m128i).add(i), self.value[i]); 59 | } 60 | } 61 | 62 | #[inline(always)] 63 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 64 | for i in 0..4 { 65 | _mm_storeu_si128((dst as *mut __m128i).add(i), self.value[i]); 66 | } 67 | } 68 | 69 | #[inline] 70 | #[target_feature(enable = "avx")] 71 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element { 72 | *transmute::<&_, *const Self::Element>(&self).add(index) 73 | } 74 | 75 | #[inline]
76 | #[target_feature(enable = "avx")] 77 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self { 78 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value; 79 | self 80 | } 81 | } 82 | 83 | impl SimdBitwise for i64x8 { 84 | fn and_not(self, other: Self) -> Self { 85 | self.zipv(other, |a, b| unsafe { _mm_andnot_si128(a, b) }) 86 | } 87 | 88 | const FULL_BITMASK: u16 = 0b1111_1111; 89 | 90 | #[inline(always)] 91 | fn bitmask(self) -> u16 { 92 | let mut bitmask = 0; 93 | for i in 0..4 { 94 | // shift mask by 2*i as each vector has 2 64-bit lanes 95 | bitmask |= unsafe { _mm_movemask_pd(_mm_castsi128_pd(self.value[i])) } << (2 * i); 96 | } 97 | bitmask as u16 98 | } 99 | 100 | #[inline(always)] 101 | unsafe fn _mm_not(self) -> Self { 102 | self ^ Self::splat(!0) 103 | } 104 | 105 | #[inline(always)] 106 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 107 | self.zipv(rhs, |a, b| _mm_and_si128(a, b)) 108 | } 109 | 110 | #[inline(always)] 111 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 112 | self.zipv(rhs, |a, b| _mm_or_si128(a, b)) 113 | } 114 | 115 | #[inline(always)] 116 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 117 | self.zipv(rhs, |a, b| _mm_xor_si128(a, b)) 118 | } 119 | 120 | #[inline(always)] 121 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 122 | Self::zip(self, count, |x, s| x >> s) 123 | } 124 | 125 | #[inline(always)] 126 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 127 | Self::zip(self, count, |x, s| x << s) 128 | } 129 | 130 | #[inline(always)] 131 | unsafe fn _mm_shli(self, count: u32) -> Self { 132 | let count = _mm_cvtsi32_si128(count as i32); 133 | self.mapv(|a, _| _mm_sll_epi64(a, count)) 134 | } 135 | 136 | #[inline(always)] 137 | unsafe fn _mm_shri(self, count: u32) -> Self { 138 | let count = _mm_cvtsi32_si128(count as i32); 139 | self.mapv(|a, _| _mm_srl_epi64(a, count)) 140 | } 141 | } 142 | 143 | impl PartialEq for i64x8 { 144 | fn eq(&self, other: &Self) -> bool { 145 | >::eq(*self, *other).all() 146 | } 147 | 148 | fn ne(&self, other: &Self) -> bool { 149 | >::ne(*self, *other).any() 150 | } 151 | } 152 | 153 | impl Eq for i64x8 {} 154 | 155 | impl SimdMask for i64x8 { 156 | #[inline(always)] 157 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 158 | self.mapv(|m, i| _mm_blendv_epi8(f.value[i], t.value[i], m)) 159 | } 160 | } 161 | 162 | impl SimdVector for i64x8 { 163 | #[inline(always)] 164 | fn zero() -> Self { 165 | Self::new(unsafe { [_mm_setzero_si128(); 4] }) 166 | } 167 | 168 | #[inline(always)] 169 | fn one() -> Self { 170 | Self::splat(1) 171 | } 172 | 173 | #[inline(always)] 174 | fn min_value() -> Self { 175 | Self::splat(i64::MIN) 176 | } 177 | 178 | #[inline(always)] 179 | fn max_value() -> Self { 180 | Self::splat(i64::MAX) 181 | } 182 | 183 | #[inline] 184 | fn min_element(self) -> Self::Element { 185 | unsafe { self.reduce2(|a, x| a.min(x)) } 186 | } 187 | 188 | #[inline] 189 | fn max_element(self) -> Self::Element { 190 | unsafe { self.reduce2(|a, x| a.max(x)) } 191 | } 192 | 193 | #[inline(always)] 194 | fn eq(self, other: Self) -> Mask { 195 | Mask::new(self.zipv(other, |a, b| unsafe { _mm_cmpeq_epi64(a, b) })) 196 | } 197 | 198 | #[inline(always)] 199 | fn gt(self, other: Self) -> Mask { 200 | Mask::new(self.zipv(other, |a, b| unsafe { _mm_cmpgt_epi64(a, b) })) 201 | } 202 | 203 | #[inline(always)] 204 | unsafe fn _mm_add(self, rhs: Self) -> Self { 205 | self.zipv(rhs, |l, r| _mm_add_epi64(l, r)) 206 | } 207 | 208 | #[inline(always)] 209 | unsafe fn _mm_sub(self, rhs: 
Self) -> Self { 210 | self.zipv(rhs, |l, r| _mm_sub_epi64(l, r)) 211 | } 212 | 213 | #[inline(always)] 214 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 215 | self.zipv(rhs, |l, r| _mm_mullo_epi64x(l, r)) 216 | } 217 | 218 | #[inline(always)] 219 | unsafe fn _mm_div(self, rhs: Self) -> Self { 220 | Self::zip(self, rhs, Div::div) 221 | } 222 | 223 | #[inline(always)] 224 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 225 | Self::zip(self, rhs, Rem::rem) 226 | } 227 | } 228 | 229 | impl SimdSignedVector for i64x8 { 230 | #[inline(always)] 231 | fn neg_one() -> Self { 232 | Self::splat(-1) 233 | } 234 | 235 | #[inline(always)] 236 | fn min_positive() -> Self { 237 | Self::splat(0) 238 | } 239 | 240 | #[inline(always)] 241 | fn abs(self) -> Self { 242 | self.mapv(|x, _| unsafe { _mm256_abs_epi64x(x) }) 243 | } 244 | 245 | #[inline(always)] 246 | unsafe fn _mm_neg(self) -> Self { 247 | (self ^ Self::neg_one()) + Self::one() 248 | } 249 | } 250 | 251 | impl_ops!(@UNARY i64x8 AVX1 => Not::not, Neg::neg); 252 | impl_ops!(@BINARY i64x8 AVX1 => BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 253 | impl_ops!(@BINARY i64x8 AVX1 => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem); 254 | impl_ops!(@SHIFTS i64x8 AVX1 => Shr::shr, Shl::shl); 255 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx2/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | use crate::arch::avx2::*; 13 | 14 | use half::f16; 15 | 16 | pub(crate) mod polyfills; 17 | 18 | use super::polyfills::*; 19 | use polyfills::*; 20 | 21 | mod vf32; 22 | mod vf64; 23 | //mod vi16; 24 | mod vi32; 25 | mod vi64; 26 | mod vu32; 27 | mod vu64; 28 | 29 | pub use vf32::*; 30 | pub use vf64::*; 31 | //pub use vi16::*; 32 | pub use vi32::*; 33 | pub use vi64::*; 34 | pub use vu32::*; 35 | pub use vu64::*; 36 | 37 | //type Vi16 = i16x8; 38 | type Vi32 = i32x8; 39 | type Vi64 = i64x8; 40 | type Vu32 = u32x8; 41 | type Vu64 = u64x8; 42 | type Vf32 = f32x8; 43 | type Vf64 = f64x8; 44 | 45 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 46 | pub struct AVX2; 47 | 48 | impl Simd for AVX2 { 49 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::AVX2; 50 | 51 | type Vi32 = Vi32; 52 | type Vi64 = Vi64; 53 | type Vu32 = Vu32; 54 | type Vu64 = Vu64; 55 | type Vf32 = Vf32; 56 | type Vf64 = Vf64; 57 | 58 | #[cfg(target_pointer_width = "32")] 59 | type Vusize = Vu32; 60 | 61 | #[cfg(target_pointer_width = "32")] 62 | type Visize = Vi32; 63 | 64 | #[cfg(target_pointer_width = "64")] 65 | type Vusize = Vu64; 66 | 67 | #[cfg(target_pointer_width = "64")] 68 | type Visize = Vi64; 69 | } 70 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/avx2/vi16.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(i16x8: i16 => __m128i); 4 | impl Default for i16x8 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(unsafe { _mm_setzero_si128() }) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for i16x8 { 12 | type Element = i16; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(unsafe { _mm_set1_epi16(value) }) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(_mm_undefined_si128()) 22 | } 23 | 24 | 
#[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(_mm_load_si128(src as *const _)) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(_mm_loadu_si128(src as *const _)) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | _mm_store_si128(dst as *mut _, self.value) 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | _mm_storeu_si128(dst as *mut _, self.value) 42 | } 43 | 44 | #[inline] 45 | #[target_feature(enable = "avx2")] 46 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element { 47 | *transmute::<&_, *const Self::Element>(&self).add(index) 48 | } 49 | 50 | #[inline] 51 | #[target_feature(enable = "avx2")] 52 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self { 53 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value; 54 | self 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/macros.rs: -------------------------------------------------------------------------------- 1 | macro_rules! impl_ops { 2 | (@UNARY $name:ident $is:ident => $($op_trait:ident::$op:ident),*) => {paste::paste! {$( 3 | impl $op_trait for $name<$is> { 4 | type Output = Self; 5 | #[inline(always)] fn $op(self) -> Self { unsafe { self. [<_mm_ $op>]() } } 6 | } 7 | )*}}; 8 | 9 | (@BINARY $name:ident $is:ident => $($op_trait:ident::$op:ident),*) => {paste::paste! {$( 10 | impl $op_trait for $name<$is> { 11 | type Output = Self; 12 | #[inline(always)] fn $op(self, rhs: Self) -> Self { unsafe { self. [<_mm_ $op>](rhs) } } 13 | } 14 | //impl $op_trait<>::Element> for $name<$is> { 15 | // type Output = Self; 16 | // #[inline(always)] fn $op(self, rhs: >::Element) -> Self { 17 | // $op_trait::$op(self, Self::splat(rhs)) 18 | // } 19 | //} 20 | //impl $op_trait<$name<$is>> for <$name<$is> as SimdVectorBase<$is>>::Element { 21 | // type Output = $name<$is>; 22 | // #[inline(always)] fn $op(self, rhs: $name<$is>) -> $name<$is> { 23 | // $op_trait::$op($name::<$is>::splat(self), rhs) 24 | // } 25 | //} 26 | 27 | impl [<$op_trait Assign>] for $name<$is> { 28 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: Self) { *self = $op_trait::$op(*self, rhs); } 29 | } 30 | impl [<$op_trait Assign>]<>::Element> for $name<$is> { 31 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: >::Element) { 32 | *self = $op_trait::$op(*self, Self::splat(rhs)); 33 | } 34 | } 35 | )*}}; 36 | 37 | (@SHIFTS $name:ident $is:ident => $($op_trait:ident::$op:ident),*) => {paste::paste! {$( 38 | impl $op_trait<<$is as Simd>::Vu32> for $name<$is> { 39 | type Output = Self; 40 | #[inline(always)] fn $op(self, rhs: <$is as Simd>::Vu32) -> Self { unsafe { self. [<_mm_ $op>](rhs) } } 41 | } 42 | impl $op_trait for $name<$is> { 43 | type Output = Self; 44 | #[inline(always)] fn $op(self, rhs: u32) -> Self { unsafe { self.[<_mm_ $op i>](rhs) } } 45 | } 46 | 47 | impl [<$op_trait Assign>]<<$is as Simd>::Vu32> for $name<$is> { 48 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: <$is as Simd>::Vu32) { *self = $op_trait::$op(*self, rhs); } 49 | } 50 | impl [<$op_trait Assign>] for $name<$is> { 51 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: u32) { *self = $op_trait::$op(*self, rhs); } 52 | } 53 | )*}}; 54 | } 55 | 56 | macro_rules! 
decl_base_common { 57 | (#[$meta:meta] $name:ident: $ety:ty => $ty:ty) => { 58 | #[inline] 59 | #[$meta] 60 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element { 61 | *transmute::<&_, *const Self::Element>(&self).add(index) 62 | } 63 | 64 | #[inline] 65 | #[$meta] 66 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self { 67 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value; 68 | self 69 | } 70 | 71 | #[inline] 72 | #[$meta] 73 | unsafe fn shuffle_unchecked(self, b: Self, indices: INDICES) -> Self { 74 | let mut dst = Self::undefined(); 75 | for i in 0..Self::NUM_ELEMENTS { 76 | let idx = *INDICES::INDICES.get_unchecked(i); 77 | dst = dst.replace_unchecked( 78 | i, 79 | if idx < Self::NUM_ELEMENTS { 80 | self.extract_unchecked(idx) 81 | } else { 82 | b.extract_unchecked(idx - Self::NUM_ELEMENTS) 83 | }, 84 | ); 85 | } 86 | dst 87 | } 88 | }; 89 | } 90 | 91 | macro_rules! decl { 92 | ($($name:ident: $ety:ty => $ty:ty),*) => {$( 93 | #[derive(Clone, Copy)] 94 | #[repr(transparent)] 95 | pub struct $name { 96 | pub(crate) value: $ty, 97 | _is: PhantomData, 98 | } 99 | 100 | impl $name { 101 | #[inline(always)] 102 | pub(crate) fn new(value: $ty) -> Self { 103 | Self { value, _is: PhantomData } 104 | } 105 | } 106 | 107 | impl $name where Self: SimdVectorBase { 108 | #[inline(always)] 109 | pub(crate) unsafe fn map(mut self, f: F) -> Self 110 | where F: Fn($ety) -> $ety { 111 | for i in 0..Self::NUM_ELEMENTS { 112 | let ptr = transmute::<&mut _, *mut $ety>(&mut self).add(i); 113 | *ptr = f(*ptr); 114 | } 115 | self 116 | } 117 | 118 | #[inline(always)] 119 | pub(crate) unsafe fn zip(a: Self, b: V, f: F) -> Self 120 | where F: Fn($ety, >::Element) -> $ety, 121 | Self: SimdVectorBase, 122 | V: SimdVectorBase { 123 | let mut out = Self::default(); 124 | for i in 0..Self::NUM_ELEMENTS { 125 | *transmute::<&mut _, *mut $ety>(&mut out).add(i) = 126 | f(a.extract_unchecked(i), b.extract_unchecked(i)); 127 | } 128 | out 129 | } 130 | 131 | #[inline(always)] 132 | pub(crate) unsafe fn reduce(self, mut init: $ety, f: F) -> $ety 133 | where F: Fn($ety, $ety) -> $ety { 134 | for i in 0..Self::NUM_ELEMENTS { 135 | init = f(init, self.extract_unchecked(i)); 136 | } 137 | init 138 | } 139 | 140 | #[inline(always)] 141 | pub(crate) unsafe fn reduce2(self, f: F) -> $ety 142 | where F: Fn($ety, $ety) -> $ety { 143 | let mut accum = self.extract_unchecked(0); 144 | for i in 1..Self::NUM_ELEMENTS { 145 | accum = f(accum, self.extract_unchecked(i)); 146 | } 147 | accum 148 | } 149 | } 150 | 151 | impl fmt::Debug for $name where Self: SimdVectorBase { 152 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 153 | let mut t = f.debug_tuple(stringify!($name)); 154 | for i in 0..Self::NUM_ELEMENTS { 155 | t.field(unsafe { &*transmute::<&_, *const $ety>(self).add(i) }); 156 | } 157 | t.finish() 158 | } 159 | } 160 | )*}; 161 | } 162 | 163 | macro_rules! decl_brute_force_convert { 164 | (#[$meta:meta] $from:ty => $to:ty) => { 165 | paste::paste! 
{ 166 | #[$meta] 167 | #[inline] 168 | unsafe fn do_convert(value: []) -> [] { 169 | let mut res = mem::MaybeUninit::uninit(); 170 | for i in 0..[]::NUM_ELEMENTS { 171 | *(res.as_mut_ptr() as *mut $to).add(i) = (*transmute::<&_, *const $from>(&value).add(i)) as $to; 172 | } 173 | res.assume_init() 174 | } 175 | } 176 | }; 177 | } 178 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/mod.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | mod macros; 3 | 4 | pub mod polyfills; 5 | 6 | //pub mod scalar; 7 | 8 | #[cfg(all(feature = "neon", target_arch = "aarch64"))] 9 | pub mod aarch64; 10 | #[cfg(all(feature = "neon", target_arch = "arm"))] 11 | pub mod arm; 12 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 13 | pub mod avx1; 14 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 15 | pub mod avx2; 16 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 17 | pub mod sse2; 18 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 19 | pub mod sse42; 20 | #[cfg(all(feature = "wasm32", target_arch = "wasm32"))] 21 | pub mod wasm32; 22 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/polyfills.rs: -------------------------------------------------------------------------------- 1 | #[inline(always)] 2 | pub const fn _mm_shuffle(w: i32, z: i32, y: i32, x: i32) -> i32 { 3 | (w << 6) | (z << 4) | (y << 2) | x 4 | } 5 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | mod polyfills; 13 | use polyfills::*; 14 | 15 | use half::f16; 16 | 17 | mod vf32; 18 | mod vf64; 19 | mod vi32; 20 | mod vi64; 21 | mod vu32; 22 | mod vu64; 23 | 24 | pub use vf32::*; 25 | pub use vf64::*; 26 | pub use vi32::*; 27 | pub use vi64::*; 28 | pub use vu32::*; 29 | pub use vu64::*; 30 | 31 | type Vu32 = u32x1; 32 | type Vf32 = f32x1; 33 | type Vf64 = f64x1; 34 | 35 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 36 | pub struct Scalar; 37 | 38 | impl Simd for Scalar { 39 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::Scalar; 40 | 41 | type Vu32 = Vu32; 42 | type Vf32 = Vf32; 43 | type Vf64 = Vf64; 44 | 45 | #[cfg(target_pointer_width = "32")] 46 | type Vusize = Vu32; 47 | 48 | //#[cfg(target_pointer_width = "32")] 49 | //type Visize = Vi32; 50 | 51 | /* 52 | type Vi32 = Vi32; 53 | type Vi64 = Vi64; 54 | 55 | type Vu64 = Vu64; 56 | 57 | #[cfg(target_pointer_width = "64")] 58 | type Vusize = Vu64; 59 | 60 | #[cfg(target_pointer_width = "64")] 61 | type Visize = Vi64; 62 | */ 63 | } 64 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/polyfills.rs: -------------------------------------------------------------------------------- 1 | #[inline(always)] 2 | pub fn bool_to_u32(value: bool) -> u32 { 3 | //if value { 0xFFFF_FFFF } else { 0 } 4 | -(value as i32) as u32 5 | } 6 | 7 | #[inline(always)] 8 | pub fn bool_to_u64(value: bool) -> u32 { 9 | //if value { 0xFFFF_FFFF_FFFF_FFFF } else { 0 } 10 | -(value as i64) as u64 11 | } 12 | -------------------------------------------------------------------------------- 
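The scalar backend's mask polyfills above turn a `bool` into an all-zeros or all-ones lane mask by negating it and sign-extending. A minimal standalone sketch of that trick, for illustration only (the `mask32`/`mask64` names are hypothetical, not part of the crate):

fn mask32(b: bool) -> u32 {
    // -(1i32) == -1, whose two's-complement bit pattern is all ones
    -(b as i32) as u32
}

fn mask64(b: bool) -> u64 {
    -(b as i64) as u64
}

fn main() {
    assert_eq!(mask32(true), 0xFFFF_FFFF);
    assert_eq!(mask32(false), 0);
    assert_eq!(mask64(true), u64::MAX);
}
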
/crates/thermite/src/backends/scalar/vf32.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(f32x1: f32 => f32); 4 | impl Default for f32x1 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(0.0) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for f32x1 { 12 | type Element = f32; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(value) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(0.0) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(*src) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(src.read_unaligned()) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | *dst = self.value; 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | dst.write_unaligned(self.value) 42 | } 43 | 44 | decl_base_common!(#[target_feature()] f32x1: f32 => f32); 45 | } 46 | 47 | impl SimdBitwise for f32x1 { 48 | const FULL_BITMASK: u16 = 1; 49 | 50 | #[inline(always)] 51 | fn bitmask(self) -> u16 { 52 | self.into_bits().bitmask() 53 | } 54 | 55 | #[inline(always)] 56 | unsafe fn _mm_not(self) -> Self { 57 | self ^ Self::splat(f32::from_bits(!0)) 58 | } 59 | 60 | #[inline(always)] 61 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 62 | Self::new(f32::from_bits(self.value.to_bits() & rhs.value.to_bits())) 63 | } 64 | 65 | #[inline(always)] 66 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 67 | Self::new(f32::from_bits(self.value.to_bits() | rhs.value.to_bits())) 68 | } 69 | 70 | #[inline(always)] 71 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 72 | Self::new(f32::from_bits(self.value.to_bits() ^ rhs.value.to_bits())) 73 | } 74 | 75 | #[inline(always)] 76 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 77 | Self::new(f32::from_bits(self.value.to_bits() << count.value)) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 82 | Self::new(f32::from_bits(self.value.to_bits() >> count.value)) 83 | } 84 | 85 | #[inline(always)] 86 | unsafe fn _mm_shli(self, count: u32) -> Self { 87 | Self::new(f32::from_bits(self.value.to_bits() << count)) 88 | } 89 | 90 | #[inline(always)] 91 | unsafe fn _mm_shri(self, count: u32) -> Self { 92 | Self::new(f32::from_bits(self.value.to_bits() >> count)) 93 | } 94 | } 95 | 96 | impl PartialEq for f32x1 { 97 | #[inline(always)] 98 | fn eq(&self, other: &Self) -> bool { 99 | self.value == other.value 100 | } 101 | } 102 | 103 | impl SimdMask for f32x1 { 104 | #[inline(always)] 105 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 106 | if self.value.to_bits() != 0 { 107 | t 108 | } else { 109 | f 110 | } 111 | } 112 | } 113 | 114 | impl SimdVector for f32x1 { 115 | fn zero() -> Self { 116 | Self::splat(0.0) 117 | } 118 | 119 | fn one() -> Self { 120 | Self::splat(1.0) 121 | } 122 | 123 | fn indexed() -> Self { 124 | Self::splat(0.0) 125 | } 126 | 127 | #[inline(always)] 128 | fn min_value() -> Self { 129 | Self::splat(f32::MIN) 130 | } 131 | 132 | #[inline(always)] 133 | fn max_value() -> Self { 134 | Self::splat(f32::MAX) 135 | } 136 | 137 | #[inline(always)] 138 | fn min(self, other: Self) -> Self { 139 | Self::new(self.value.min(other.value)) 140 | } 141 | 142 | #[inline(always)] 143 | fn max(self, other: Self) -> Self { 
144 | Self::new(self.value.max(other.value)) 145 | } 146 | 147 | #[inline(always)] 148 | fn min_element(self) -> Self::Element { 149 | self.value 150 | } 151 | 152 | #[inline(always)] 153 | fn max_element(self) -> Self::Element { 154 | self.value 155 | } 156 | 157 | #[inline(always)] 158 | fn eq(self, other: Self) -> Mask { 159 | Self::new(f32::from_bits(bool_to_u32(self.value == other.value))) 160 | } 161 | 162 | #[inline(always)] 163 | fn lt(self, other: Self) -> Mask { 164 | Self::new(f32::from_bits(bool_to_u32(self.value < other.value))) 165 | } 166 | 167 | #[inline(always)] 168 | fn le(self, other: Self) -> Mask { 169 | Self::new(f32::from_bits(bool_to_u32(self.value <= other.value))) 170 | } 171 | 172 | #[inline(always)] 173 | fn gt(self, other: Self) -> Mask { 174 | Self::new(f32::from_bits(bool_to_u32(self.value > other.value))) 175 | } 176 | 177 | #[inline(always)] 178 | fn ge(self, other: Self) -> Mask { 179 | Self::new(f32::from_bits(bool_to_u32(self.value >= other.value))) 180 | } 181 | 182 | #[inline(always)] 183 | unsafe fn _mm_add(self, rhs: Self) -> Self { 184 | Self::new(Add::add(self.value, rhs.value)) 185 | } 186 | 187 | #[inline(always)] 188 | unsafe fn _mm_sub(self, rhs: Self) -> Self { 189 | Self::new(Sub::sub(self.value, rhs.value)) 190 | } 191 | 192 | #[inline(always)] 193 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 194 | Self::new(Mul::mul(self.value, rhs.value)) 195 | } 196 | 197 | #[inline(always)] 198 | unsafe fn _mm_div(self, rhs: Self) -> Self { 199 | Self::new(Div::div(self.value, rhs.value)) 200 | } 201 | 202 | #[inline(always)] 203 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 204 | Self::new(Rem::rem(self.value, rhs.value)) 205 | } 206 | } 207 | 208 | impl SimdIntoBits for f32x1 { 209 | fn into_bits(self) -> Vu32 { 210 | u32x1::new(self.value.to_bits()) 211 | } 212 | } 213 | 214 | impl SimdFromBits for f32x1 { 215 | fn from_bits(bits: Vu32) -> Self { 216 | Self::new(f32::from_bits(bits.value)) 217 | } 218 | } 219 | 220 | impl_ops!(@UNARY f32x1 Scalar => Not::not, Neg::neg); 221 | impl_ops!(@BINARY f32x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 222 | impl_ops!(@SHIFTS f32x1 Scalar => Shr::shr, Shl::shl); 223 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vf64.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(f64x1: f64 => f64); 4 | impl Default for f64x1 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(0.0) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for f64x1 { 12 | type Element = f64; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(value) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(0.0) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(*src) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(src.read_unaligned()) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | *dst = self.value; 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | dst.write_unaligned(self.value) 42 | } 43 | 44 | decl_base_common!(#[target_feature()] f64x1: f64 => f64); 45 | } 46 | 47 | impl SimdBitwise for f64x1 { 
48 | const FULL_BITMASK: u16 = 1; 49 | 50 | #[inline(always)] 51 | fn bitmask(self) -> u16 { 52 | self.into_bits().bitmask() 53 | } 54 | 55 | #[inline(always)] 56 | unsafe fn _mm_not(self) -> Self { 57 | self ^ Self::splat(f64::from_bits(!0)) 58 | } 59 | 60 | #[inline(always)] 61 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 62 | Self::new(f64::from_bits(self.value.to_bits() & rhs.value.to_bits())) 63 | } 64 | 65 | #[inline(always)] 66 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 67 | Self::new(f64::from_bits(self.value.to_bits() | rhs.value.to_bits())) 68 | } 69 | 70 | #[inline(always)] 71 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 72 | Self::new(f64::from_bits(self.value.to_bits() ^ rhs.value.to_bits())) 73 | } 74 | 75 | #[inline(always)] 76 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 77 | Self::new(f64::from_bits(self.value.to_bits() << count.value)) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 82 | Self::new(f64::from_bits(self.value.to_bits() >> count.value)) 83 | } 84 | 85 | #[inline(always)] 86 | unsafe fn _mm_shli(self, count: u32) -> Self { 87 | Self::new(f64::from_bits(self.value.to_bits() << count)) 88 | } 89 | 90 | #[inline(always)] 91 | unsafe fn _mm_shri(self, count: u32) -> Self { 92 | Self::new(f64::from_bits(self.value.to_bits() >> count)) 93 | } 94 | } 95 | 96 | impl PartialEq for f64x1 { 97 | #[inline(always)] 98 | fn eq(&self, other: &Self) -> bool { 99 | self.value == other.value 100 | } 101 | } 102 | 103 | impl SimdMask for f64x1 { 104 | #[inline(always)] 105 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 106 | if self.value.to_bits() != 0 { 107 | t 108 | } else { 109 | f 110 | } 111 | } 112 | } 113 | 114 | impl SimdVector for f64x1 { 115 | fn zero() -> Self { 116 | Self::splat(0.0) 117 | } 118 | 119 | fn one() -> Self { 120 | Self::splat(1.0) 121 | } 122 | 123 | fn indexed() -> Self { 124 | Self::splat(0.0) 125 | } 126 | 127 | #[inline(always)] 128 | fn min_value() -> Self { 129 | Self::splat(f64::MIN) 130 | } 131 | 132 | #[inline(always)] 133 | fn max_value() -> Self { 134 | Self::splat(f64::MAX) 135 | } 136 | 137 | #[inline(always)] 138 | fn min(self, other: Self) -> Self { 139 | Self::new(self.value.min(other.value)) 140 | } 141 | 142 | #[inline(always)] 143 | fn max(self, other: Self) -> Self { 144 | Self::new(self.value.max(other.value)) 145 | } 146 | 147 | #[inline(always)] 148 | fn min_element(self) -> Self::Element { 149 | self.value 150 | } 151 | 152 | #[inline(always)] 153 | fn max_element(self) -> Self::Element { 154 | self.value 155 | } 156 | 157 | #[inline(always)] 158 | fn eq(self, other: Self) -> Mask { 159 | Self::new(f64::from_bits(bool_to_u32(self.value == other.value))) 160 | } 161 | 162 | #[inline(always)] 163 | fn lt(self, other: Self) -> Mask { 164 | Self::new(f64::from_bits(bool_to_u32(self.value < other.value))) 165 | } 166 | 167 | #[inline(always)] 168 | fn le(self, other: Self) -> Mask { 169 | Self::new(f64::from_bits(bool_to_u32(self.value <= other.value))) 170 | } 171 | 172 | #[inline(always)] 173 | fn gt(self, other: Self) -> Mask { 174 | Self::new(f64::from_bits(bool_to_u32(self.value > other.value))) 175 | } 176 | 177 | #[inline(always)] 178 | fn ge(self, other: Self) -> Mask { 179 | Self::new(f64::from_bits(bool_to_u32(self.value >= other.value))) 180 | } 181 | 182 | #[inline(always)] 183 | unsafe fn _mm_add(self, rhs: Self) -> Self { 184 | Self::new(Add::add(self.value, rhs.value)) 185 | } 186 | 187 | #[inline(always)] 188 | unsafe fn _mm_sub(self, rhs: Self) -> 
Self { 189 | Self::new(Sub::sub(self.value, rhs.value)) 190 | } 191 | 192 | #[inline(always)] 193 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 194 | Self::new(Mul::mul(self.value, rhs.value)) 195 | } 196 | 197 | #[inline(always)] 198 | unsafe fn _mm_div(self, rhs: Self) -> Self { 199 | Self::new(Div::div(self.value, rhs.value)) 200 | } 201 | 202 | #[inline(always)] 203 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 204 | Self::new(Rem::rem(self.value, rhs.value)) 205 | } 206 | } 207 | 208 | impl SimdIntoBits for f64x1 { 209 | fn into_bits(self) -> Vu32 { 210 | u32x1::new(self.value.to_bits()) 211 | } 212 | } 213 | 214 | impl SimdFromBits for f64x1 { 215 | fn from_bits(bits: Vu32) -> Self { 216 | Self::new(f64::from_bits(bits.value)) 217 | } 218 | } 219 | 220 | impl_ops!(@UNARY f64x1 Scalar => Not::not, Neg::neg); 221 | impl_ops!(@BINARY f64x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 222 | impl_ops!(@SHIFTS f64x1 Scalar => Shr::shr, Shl::shl); 223 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vi32.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raygon-renderer/thermite/55cbee2b3cb127e09749e711501114bbef3ed118/crates/thermite/src/backends/scalar/vi32.rs -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vi64.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raygon-renderer/thermite/55cbee2b3cb127e09749e711501114bbef3ed118/crates/thermite/src/backends/scalar/vi64.rs -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vu32.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(u32x1: u32 => u32); 4 | impl Default for u32x1 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(0) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for u32x1 { 12 | type Element = u32; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(value) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(0.0) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(*src) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(src.read_unaligned()) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | *dst = self.value; 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | dst.write_unaligned(self.value) 42 | } 43 | 44 | decl_base_common!(#[target_feature()] u32x1: u32 => u32); 45 | } 46 | 47 | impl SimdBitwise for u32x1 { 48 | const FULL_BITMASK: u16 = 1; 49 | 50 | fn bitmask(self) -> u16 { 51 | (self.value >> 31) as u16 52 | } 53 | 54 | unsafe fn _mm_not(self) -> Self { 55 | Self::new(!self.value) 56 | } 57 | 58 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 59 | Self::new(self.value & rhs.value) 60 | } 61 | 62 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 63 | Self::new(self.value | rhs.value) 64 | } 65 | 66 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 67 | Self::new(self.value ^ rhs.value) 68 | } 69 | 70 | 
#[inline(always)] 71 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 72 | Self::new(self.value << count.value) 73 | } 74 | 75 | #[inline(always)] 76 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 77 | Self::new(self.value >> count.value) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn _mm_shli(self, count: u32) -> Self { 82 | Self::new(self.value << count) 83 | } 84 | 85 | #[inline(always)] 86 | unsafe fn _mm_shri(self, count: u32) -> Self { 87 | Self::new(self.value >> count) 88 | } 89 | } 90 | 91 | impl PartialEq for u32x1 { 92 | #[inline(always)] 93 | fn eq(&self, other: &Self) -> bool { 94 | self.value == other.value 95 | } 96 | } 97 | 98 | impl Eq for u32x1 {} 99 | 100 | impl SimdMask for u32x1 { 101 | #[inline(always)] 102 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 103 | if self.value != 0 { 104 | t 105 | } else { 106 | f 107 | } 108 | } 109 | 110 | #[inline(always)] 111 | unsafe fn _mm_all(self) -> bool { 112 | self._mm_any() // only one value 113 | } 114 | 115 | #[inline(always)] 116 | unsafe fn _mm_any(self) -> bool { 117 | self.value != 0 118 | } 119 | 120 | #[inline(always)] 121 | unsafe fn _mm_none(self) -> bool { 122 | self.value == 0 123 | } 124 | } 125 | 126 | impl SimdVector for u32x1 { 127 | fn zero() -> Self { 128 | Self::new(0) 129 | } 130 | 131 | fn one() -> Self { 132 | Self::new(1) 133 | } 134 | 135 | fn indexed() -> Self { 136 | Self::new(0) 137 | } 138 | 139 | #[inline(always)] 140 | fn min_value() -> Self { 141 | Self::splat(u32::MIN) 142 | } 143 | 144 | #[inline(always)] 145 | fn max_value() -> Self { 146 | Self::splat(u32::MAX) 147 | } 148 | 149 | #[inline(always)] 150 | fn min_element(self) -> Self::Element { 151 | self.value 152 | } 153 | 154 | #[inline(always)] 155 | fn max_element(self) -> Self::Element { 156 | self.value 157 | } 158 | 159 | #[inline(always)] 160 | fn eq(self, other: Self) -> Mask { 161 | Self::new(bool_to_u32(self.value == other.value)) 162 | } 163 | 164 | #[inline(always)] 165 | fn lt(self, other: Self) -> Mask { 166 | Self::new(bool_to_u32(self.value < other.value)) 167 | } 168 | 169 | #[inline(always)] 170 | fn le(self, other: Self) -> Mask { 171 | Self::new(bool_to_u32(self.value <= other.value)) 172 | } 173 | 174 | #[inline(always)] 175 | fn gt(self, other: Self) -> Mask { 176 | Self::new(bool_to_u32(self.value > other.value)) 177 | } 178 | 179 | #[inline(always)] 180 | fn ge(self, other: Self) -> Mask { 181 | Self::new(bool_to_u32(self.value >= other.value)) 182 | } 183 | 184 | #[inline(always)] 185 | unsafe fn _mm_add(self, rhs: Self) -> Self { 186 | Self::new(Add::add(self.value, rhs.value)) 187 | } 188 | 189 | #[inline(always)] 190 | unsafe fn _mm_sub(self, rhs: Self) -> Self { 191 | Self::new(Sub::sub(self.value, rhs.value)) 192 | } 193 | 194 | #[inline(always)] 195 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 196 | Self::new(Mul::mul(self.value, rhs.value)) 197 | } 198 | 199 | #[inline(always)] 200 | unsafe fn _mm_div(self, rhs: Self) -> Self { 201 | Self::new(Div::div(self.value, rhs.value)) 202 | } 203 | 204 | #[inline(always)] 205 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 206 | Self::new(Rem::rem(self.value, rhs.value)) 207 | } 208 | } 209 | 210 | impl SimdIntVector for u32x1 { 211 | fn saturating_add(self, rhs: Self) -> Self { 212 | Self::new(self.value.saturating_add(rhs.value)) 213 | } 214 | 215 | fn saturating_sub(self, rhs: Self) -> Self { 216 | Self::new(self.value.saturating_add(rhs.value)) 217 | } 218 | 219 | fn wrapping_sum(self) -> Self::Element { 220 | self.value 221 | } 222 | 223 | fn 
wrapping_product(self) -> Self::Element { 224 | self.value 225 | } 226 | 227 | fn rolv(self, cnt: Vu32) -> Self { 228 | Self::new(self.value.rotate_left(cnt.value)) 229 | } 230 | 231 | fn rorv(self, cnt: Vu32) -> Self { 232 | Self::new(self.value.rotate_right(cnt.value)) 233 | } 234 | 235 | fn reverse_bits(self) -> Self { 236 | Self::new(self.value.reverse_bits()) 237 | } 238 | 239 | fn count_ones(self) -> Self { 240 | Self::new(self.value.count_ones()) 241 | } 242 | 243 | fn count_zeros(self) -> Self { 244 | Self::new(self.value.count_zeros()) 245 | } 246 | 247 | fn leading_ones(self) -> Self { 248 | Self::new(self.value.leading_ones()) 249 | } 250 | 251 | fn leading_zeros(self) -> Self { 252 | Self::new(self.value.leading_zeros()) 253 | } 254 | } 255 | 256 | impl SimdUnsignedIntVector for u32x1 { 257 | #[inline(always)] 258 | fn next_power_of_two_m1(mut self) -> Self { 259 | self |= (self >> 1); 260 | self |= (self >> 2); 261 | self |= (self >> 4); 262 | self |= (self >> 8); 263 | self |= (self >> 16); 264 | self 265 | } 266 | } 267 | 268 | impl_ops!(@UNARY u32x1 Scalar => Not::not); 269 | impl_ops!(@BINARY u32x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 270 | impl_ops!(@SHIFTS u32x1 Scalar => Shr::shr, Shl::shl); 271 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/scalar/vu64.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(u64x1: u64 => u64); 4 | impl Default for u64x1 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(0) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for u64x1 { 12 | type Element = u64; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(value) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(0.0) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(*src) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(src.read_unaligned()) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | *dst = self.value; 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | dst.write_unaligned(self.value) 42 | } 43 | 44 | decl_base_common!(#[target_feature()] u64x1: u64 => u64); 45 | } 46 | 47 | impl SimdBitwise for u64x1 { 48 | const FULL_BITMASK: u16 = 1; 49 | 50 | fn bitmask(self) -> u16 { 51 | (self.value >> 63) as u16 52 | } 53 | 54 | unsafe fn _mm_not(self) -> Self { 55 | Self::new(!self.value) 56 | } 57 | 58 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 59 | Self::new(self.value & rhs.value) 60 | } 61 | 62 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 63 | Self::new(self.value | rhs.value) 64 | } 65 | 66 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 67 | Self::new(self.value ^ rhs.value) 68 | } 69 | 70 | #[inline(always)] 71 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 72 | Self::new(self.value << count.value) 73 | } 74 | 75 | #[inline(always)] 76 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 77 | Self::new(self.value >> count.value) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn _mm_shli(self, count: u32) -> Self { 82 | Self::new(self.value << count) 83 | } 84 | 85 | #[inline(always)] 86 | unsafe fn _mm_shri(self, count: 
u32) -> Self { 87 | Self::new(self.value >> count) 88 | } 89 | } 90 | 91 | impl PartialEq for u64x1 { 92 | #[inline(always)] 93 | fn eq(&self, other: &Self) -> bool { 94 | self.value == other.value 95 | } 96 | } 97 | 98 | impl Eq for u64x1 {} 99 | 100 | impl SimdMask for u64x1 { 101 | #[inline(always)] 102 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 103 | if self.value != 0 { 104 | t 105 | } else { 106 | f 107 | } 108 | } 109 | 110 | #[inline(always)] 111 | unsafe fn _mm_all(self) -> bool { 112 | self._mm_any() // only one value 113 | } 114 | 115 | #[inline(always)] 116 | unsafe fn _mm_any(self) -> bool { 117 | self.value != 0 118 | } 119 | 120 | #[inline(always)] 121 | unsafe fn _mm_none(self) -> bool { 122 | self.value == 0 123 | } 124 | } 125 | 126 | impl SimdVector for u64x1 { 127 | fn zero() -> Self { 128 | Self::new(0) 129 | } 130 | 131 | fn one() -> Self { 132 | Self::new(1) 133 | } 134 | 135 | fn indexed() -> Self { 136 | Self::new(0) 137 | } 138 | 139 | #[inline(always)] 140 | fn min_value() -> Self { 141 | Self::splat(u64::MIN) 142 | } 143 | 144 | #[inline(always)] 145 | fn max_value() -> Self { 146 | Self::splat(u64::MAX) 147 | } 148 | 149 | #[inline(always)] 150 | fn min_element(self) -> Self::Element { 151 | self.value 152 | } 153 | 154 | #[inline(always)] 155 | fn max_element(self) -> Self::Element { 156 | self.value 157 | } 158 | 159 | #[inline(always)] 160 | fn eq(self, other: Self) -> Mask { 161 | Self::new(bool_to_u64(self.value == other.value)) 162 | } 163 | 164 | #[inline(always)] 165 | fn lt(self, other: Self) -> Mask { 166 | Self::new(bool_to_u64(self.value < other.value)) 167 | } 168 | 169 | #[inline(always)] 170 | fn le(self, other: Self) -> Mask { 171 | Self::new(bool_to_u64(self.value <= other.value)) 172 | } 173 | 174 | #[inline(always)] 175 | fn gt(self, other: Self) -> Mask { 176 | Self::new(bool_to_u64(self.value > other.value)) 177 | } 178 | 179 | #[inline(always)] 180 | fn ge(self, other: Self) -> Mask { 181 | Self::new(bool_to_u64(self.value >= other.value)) 182 | } 183 | 184 | #[inline(always)] 185 | unsafe fn _mm_add(self, rhs: Self) -> Self { 186 | Self::new(Add::add(self.value, rhs.value)) 187 | } 188 | 189 | #[inline(always)] 190 | unsafe fn _mm_sub(self, rhs: Self) -> Self { 191 | Self::new(Sub::sub(self.value, rhs.value)) 192 | } 193 | 194 | #[inline(always)] 195 | unsafe fn _mm_mul(self, rhs: Self) -> Self { 196 | Self::new(Mul::mul(self.value, rhs.value)) 197 | } 198 | 199 | #[inline(always)] 200 | unsafe fn _mm_div(self, rhs: Self) -> Self { 201 | Self::new(Div::div(self.value, rhs.value)) 202 | } 203 | 204 | #[inline(always)] 205 | unsafe fn _mm_rem(self, rhs: Self) -> Self { 206 | Self::new(Rem::rem(self.value, rhs.value)) 207 | } 208 | } 209 | 210 | impl SimdIntVector for u64x1 { 211 | fn saturating_add(self, rhs: Self) -> Self { 212 | Self::new(self.value.saturating_add(rhs.value)) 213 | } 214 | 215 | fn saturating_sub(self, rhs: Self) -> Self { 216 | Self::new(self.value.saturating_add(rhs.value)) 217 | } 218 | 219 | fn wrapping_sum(self) -> Self::Element { 220 | self.value 221 | } 222 | 223 | fn wrapping_product(self) -> Self::Element { 224 | self.value 225 | } 226 | 227 | fn rolv(self, cnt: Vu32) -> Self { 228 | Self::new(self.value.rotate_left(cnt.value)) 229 | } 230 | 231 | fn rorv(self, cnt: Vu32) -> Self { 232 | Self::new(self.value.rotate_right(cnt.value)) 233 | } 234 | 235 | fn reverse_bits(self) -> Self { 236 | Self::new(self.value.reverse_bits()) 237 | } 238 | 239 | fn count_ones(self) -> Self { 240 | 
Self::new(self.value.count_ones()) 241 | } 242 | 243 | fn count_zeros(self) -> Self { 244 | Self::new(self.value.count_zeros()) 245 | } 246 | 247 | fn leading_ones(self) -> Self { 248 | Self::new(self.value.leading_ones()) 249 | } 250 | 251 | fn leading_zeros(self) -> Self { 252 | Self::new(self.value.leading_zeros()) 253 | } 254 | } 255 | 256 | impl SimdUnsignedIntVector for u64x1 { 257 | #[inline(always)] 258 | fn next_power_of_two_m1(mut self) -> Self { 259 | self |= (self >> 1); 260 | self |= (self >> 2); 261 | self |= (self >> 4); 262 | self |= (self >> 8); 263 | self |= (self >> 16); 264 | self |= (self >> 32); 265 | self 266 | } 267 | } 268 | 269 | impl_ops!(@UNARY u64x1 Scalar => Not::not); 270 | impl_ops!(@BINARY u64x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 271 | impl_ops!(@SHIFTS u64x1 Scalar => Shr::shr, Shl::shl); 272 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse2/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | use crate::arch::sse2::*; 13 | 14 | use half::f16; 15 | 16 | pub(crate) mod polyfills; 17 | 18 | use super::polyfills::*; 19 | use polyfills::*; 20 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse2/polyfills.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | #[inline(always)] 4 | pub unsafe fn _mm_blendv_epi8x(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> __m128i { 5 | _mm_or_si128(_mm_and_si128(mask, xmm0), _mm_andnot_si128(mask, xmm1)) 6 | } 7 | 8 | #[inline(always)] 9 | pub unsafe fn _mm_signbits_epi32x(v: __m128i) -> __m128i { 10 | _mm_srai_epi32(v, 31) 11 | } 12 | 13 | #[inline(always)] 14 | pub unsafe fn _mm_signbits_epi64x(v: __m128i) -> __m128i { 15 | _mm_srai_epi32(_mm_shuffle_epi32(v, _mm_shuffle(3, 3, 1, 1)), 31) 16 | } 17 | 18 | #[inline(always)] 19 | pub unsafe fn _mm_cmpeq_epi64x(a: __m128i, b: __m128i) -> __m128i { 20 | let t = _mm_cmpeq_epi32(a, b); 21 | _mm_and_si128(t, _mm_shuffle_epi32(t, 177)) 22 | } 23 | 24 | #[inline(always)] 25 | pub unsafe fn _mm_mullo_epi64x(xmm0: __m128i, xmm1: __m128i) -> __m128i { 26 | let xmm2 = _mm_srli_epi64(xmm1, 32); 27 | let xmm3 = _mm_srli_epi64(xmm0, 32); 28 | 29 | let xmm2 = _mm_mul_epu32(xmm2, xmm0); 30 | let xmm3 = _mm_mul_epu32(xmm1, xmm3); 31 | 32 | let xmm2 = _mm_add_epi64(xmm3, xmm2); 33 | let xmm2 = _mm_slli_epi64(xmm2, 32); 34 | 35 | let xmm0 = _mm_mul_epu32(xmm1, xmm0); 36 | let xmm0 = _mm_add_epi64(xmm0, xmm2); 37 | 38 | xmm0 39 | } 40 | 41 | // SSE2 Version 42 | #[inline(always)] 43 | pub unsafe fn _mm_adds_epi32x(lhs: __m128i, rhs: __m128i) -> __m128i { 44 | let res = _mm_add_epi32(lhs, rhs); 45 | 46 | _mm_blendv_epi8x( 47 | res, 48 | _mm_blendv_epi8x( 49 | _mm_set1_epi32(i32::MIN), 50 | _mm_set1_epi32(i32::MAX), 51 | _mm_signbits_epi32x(res), 52 | ), 53 | _mm_xor_si128(rhs, _mm_cmpgt_epi32(lhs, res)), 54 | ) 55 | } 56 | 57 | // SSE2 Version 58 | #[inline(always)] 59 | pub unsafe fn _mm_subs_epi32x(lhs: __m128i, rhs: __m128i) -> __m128i { 60 | let res = _mm_sub_epi32(lhs, rhs); 61 | 62 | _mm_blendv_epi8x( 63 | res, 64 | _mm_blendv_epi8x( 65 | _mm_set1_epi32(i32::MIN), 66 | _mm_set1_epi32(i32::MAX), 67 | _mm_signbits_epi32x(res), 68 | ), 69 
| _mm_xor_si128(_mm_cmpgt_epi32(rhs, _mm_setzero_si128()), _mm_cmpgt_epi32(lhs, res)), 70 | ) 71 | } 72 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/mod.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use core::{ 6 | fmt, 7 | marker::PhantomData, 8 | mem::{transmute, transmute_copy}, 9 | ops::*, 10 | }; 11 | 12 | use crate::arch::sse42::*; 13 | 14 | use half::f16; 15 | 16 | pub(crate) mod polyfills; 17 | 18 | use super::polyfills::*; 19 | use polyfills::*; 20 | 21 | mod vf32; 22 | 23 | /* 24 | //mod vf32; 25 | //mod vf64; 26 | //mod vi16; 27 | mod vi32; 28 | //mod vi64; 29 | mod vu32; 30 | //mod vu64; 31 | 32 | use vi32::i32x4; 33 | use vu32::u32x4; 34 | 35 | pub type Vi32 = i32x4; 36 | pub type Vu32 = u32x4; 37 | 38 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 39 | pub struct SSE42; 40 | 41 | impl Simd for SSE42 { 42 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::SSE42; 43 | 44 | type Vi32 = i32x4; 45 | type Vu32 = u32x4; 46 | } 47 | */ 48 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/polyfills.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | use crate::backends::sse2::polyfills::*; 4 | 5 | #[inline(always)] 6 | pub unsafe fn _mm_blendv_epi32x(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> __m128i { 7 | _mm_castps_si128(_mm_blendv_ps( 8 | _mm_castsi128_ps(xmm0), 9 | _mm_castsi128_ps(xmm1), 10 | _mm_castsi128_ps(mask), 11 | )) 12 | } 13 | 14 | #[inline(always)] 15 | pub unsafe fn _mm_blendv_epi64x(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> __m128i { 16 | _mm_castpd_si128(_mm_blendv_pd( 17 | _mm_castsi128_pd(xmm0), 18 | _mm_castsi128_pd(xmm1), 19 | _mm_castsi128_pd(mask), 20 | )) 21 | } 22 | 23 | #[inline(always)] 24 | pub unsafe fn _mm_cvtepu32_psx(x: __m128i) -> __m128 { 25 | let xmm0 = x; 26 | let xmm1 = _mm_set1_epi32(0x4B000000u32 as i32); 27 | let xmm1 = _mm_blend_epi16(xmm0, xmm1, 170); 28 | let xmm0 = _mm_srli_epi32(xmm0, 16); 29 | let xmm2 = _mm_set1_epi32(0x53000000u32 as i32); 30 | let xmm0 = _mm_castsi128_ps(_mm_blend_epi16(xmm0, xmm2, 170)); 31 | let xmm2 = _mm_set1_ps(f32::from_bits(0x53000080)); 32 | let xmm0 = _mm_sub_ps(xmm0, xmm2); 33 | let xmm0 = _mm_add_ps(_mm_castsi128_ps(xmm1), xmm0); 34 | 35 | xmm0 36 | } 37 | 38 | #[inline(always)] 39 | pub unsafe fn _mm_cvtpd_epi64x_limited(x: __m128d) -> __m128i { 40 | // https://stackoverflow.com/a/41148578/2083075 41 | let m = _mm_set1_pd(transmute::(0x0018000000000000) as f64); 42 | _mm_sub_epi64(_mm_castpd_si128(_mm_add_pd(x, m)), _mm_castpd_si128(m)) 43 | } 44 | 45 | #[inline(always)] 46 | pub unsafe fn _mm_cvtpd_epu64x_limited(x: __m128d) -> __m128i { 47 | // https://stackoverflow.com/a/41148578/2083075 48 | let m = _mm_set1_pd(transmute::(0x0010000000000000) as f64); 49 | _mm_xor_si128(_mm_castpd_si128(_mm_add_pd(x, m)), _mm_castpd_si128(m)) 50 | } 51 | 52 | // https://stackoverflow.com/a/41223013/2083075 53 | #[inline(always)] 54 | #[rustfmt::skip] 55 | pub unsafe fn _mm_cvtepu64_pdx(v: __m128i) -> __m128d { 56 | let magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52 encoded as floating-point 57 | let magic_i_hi32 = _mm_set1_epi64x(0x4530000000000000); // 2^84 encoded as floating-point 58 | let magic_i_all = _mm_set1_epi64x(0x4530000000100000); // 2^84 + 2^52 encoded as floating-point 59 | let 
magic_d_all = _mm_castsi128_pd(magic_i_all); 60 | 61 | let v_lo = _mm_blend_epi16(magic_i_lo, v, 0b00110011); // Blend the 32 lowest significant bits of v with magic_int_lo 62 | let mut v_hi = _mm_srli_epi64(v, 32); // Extract the 32 most significant bits of v 63 | v_hi = _mm_xor_si128(v_hi, magic_i_hi32); // Blend v_hi with 0x45300000 64 | let v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all); // Compute in double precision: 65 | _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo)) // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition !! 66 | } 67 | 68 | // https://stackoverflow.com/a/41223013/2083075 69 | #[inline(always)] 70 | #[rustfmt::skip] 71 | pub unsafe fn _mm_cvtepi64_pdx(v: __m128i) -> __m128d { 72 | let magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52 encoded as floating-point 73 | let magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63 encoded as floating-point 74 | let magic_i_all = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52 encoded as floating-point 75 | let magic_d_all = _mm_castsi128_pd(magic_i_all); 76 | 77 | let v_lo = _mm_blend_epi16(magic_i_lo, v, 0b00110011); // Blend the 32 lowest significant bits of v with magic_int_lo 78 | let mut v_hi = _mm_srli_epi64(v, 32); // Extract the 32 most significant bits of v 79 | v_hi = _mm_xor_si128(v_hi, magic_i_hi32); // Flip the msb of v_hi and blend with 0x45300000 80 | let v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all); // Compute in double precision: 81 | _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo)) // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition !! 82 | } 83 | 84 | // NOTE: Saturated add/sub use the "sign bit" as the select bit, 85 | // so when porting to SSE2 it'll need to use the `signbits` methods to properly select with `_mm_blendv_epi8x` 86 | 87 | #[inline(always)] 88 | pub unsafe fn _mm_adds_epi64x(lhs: __m128i, rhs: __m128i) -> __m128i { 89 | let res = _mm_add_epi64(lhs, rhs); 90 | 91 | _mm_blendv_epi64x( 92 | res, 93 | _mm_blendv_epi64x(_mm_set1_epi64x(i64::MIN), _mm_set1_epi64x(i64::MAX), res), 94 | _mm_xor_si128(rhs, _mm_cmpgt_epi64(lhs, res)), 95 | ) 96 | } 97 | 98 | #[inline(always)] 99 | pub unsafe fn _mm_adds_epi32x(lhs: __m128i, rhs: __m128i) -> __m128i { 100 | let res = _mm_add_epi32(lhs, rhs); 101 | 102 | _mm_blendv_epi32x( 103 | res, 104 | _mm_blendv_epi32x(_mm_set1_epi32(i32::MIN), _mm_set1_epi32(i32::MAX), res), 105 | _mm_xor_si128(rhs, _mm_cmpgt_epi32(lhs, res)), 106 | ) 107 | } 108 | 109 | #[inline(always)] 110 | pub unsafe fn _mm_subs_epi32x(lhs: __m128i, rhs: __m128i) -> __m128i { 111 | let res = _mm_sub_epi32(lhs, rhs); 112 | 113 | _mm_blendv_epi32x( 114 | res, 115 | _mm_blendv_epi32x(_mm_set1_epi32(i32::MIN), _mm_set1_epi32(i32::MAX), res), 116 | _mm_xor_si128(_mm_cmpgt_epi32(rhs, _mm_setzero_si128()), _mm_cmpgt_epi32(lhs, res)), 117 | ) 118 | } 119 | 120 | #[inline(always)] 121 | pub unsafe fn _mm_subs_epi64x(lhs: __m128i, rhs: __m128i) -> __m128i { 122 | let res = _mm_sub_epi64(lhs, rhs); 123 | 124 | _mm_blendv_epi64x( 125 | res, 126 | _mm_blendv_epi64x(_mm_set1_epi64x(i64::MIN), _mm_set1_epi64x(i64::MAX), res), 127 | _mm_xor_si128(_mm_cmpgt_epi64(rhs, _mm_setzero_si128()), _mm_cmpgt_epi64(lhs, res)), 128 | ) 129 | } 130 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/vf32.rs: -------------------------------------------------------------------------------- 1 | 2 | 
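The SSE4.2 polyfills above appear to emulate saturating 32/64-bit addition and subtraction by computing the wrapping result and then blending in the type's MIN/MAX when the sign bits indicate overflow. A scalar model of the intended per-lane semantics, for illustration only (`adds_i64_model`/`subs_i64_model` are hypothetical names, not crate APIs):

fn adds_i64_model(lhs: i64, rhs: i64) -> i64 {
    // Signed overflow is only possible when both operands share a sign,
    // so the saturation bound can be chosen from rhs's sign.
    match lhs.checked_add(rhs) {
        Some(sum) => sum,
        None => if rhs < 0 { i64::MIN } else { i64::MAX },
    }
}

fn subs_i64_model(lhs: i64, rhs: i64) -> i64 {
    // Subtraction can only overflow toward MIN when rhs is positive,
    // and toward MAX when rhs is negative.
    match lhs.checked_sub(rhs) {
        Some(diff) => diff,
        None => if rhs > 0 { i64::MIN } else { i64::MAX },
    }
}

fn main() {
    assert_eq!(adds_i64_model(i64::MAX, 1), i64::MAX);
    assert_eq!(subs_i64_model(i64::MIN, 1), i64::MIN);
    assert_eq!(adds_i64_model(40, 2), 42);
}
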
-------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/vi32.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(i32x4: i32 => __m128i); 4 | impl Default for i32x4 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(unsafe { _mm_setzero_si128() }) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for i32x4 { 12 | type Element = i32; 13 | 14 | #[inline(always)] 15 | fn splat(value: Self::Element) -> Self { 16 | Self::new(unsafe { _mm_set1_epi32(value) }) 17 | } 18 | 19 | #[inline(always)] 20 | unsafe fn undefined() -> Self { 21 | Self::new(_mm_undefined_si128()) 22 | } 23 | 24 | #[inline(always)] 25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self { 26 | Self::new(_mm_load_si128(src as *const _)) 27 | } 28 | 29 | #[inline(always)] 30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self { 31 | Self::new(_mm_loadu_si128(src as *const _)) 32 | } 33 | 34 | #[inline(always)] 35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) { 36 | _mm_store_si128(dst as *mut _, self.value) 37 | } 38 | 39 | #[inline(always)] 40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) { 41 | _mm_storeu_si128(dst as *mut _, self.value) 42 | } 43 | 44 | #[inline] 45 | #[target_feature(enable = "sse4.1")] 46 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element { 47 | *transmute::<&_, *const Self::Element>(&self).add(index) 48 | } 49 | 50 | #[inline] 51 | #[target_feature(enable = "sse4.1")] 52 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self { 53 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value; 54 | self 55 | } 56 | } 57 | 58 | impl SimdBitwise for i32x4 { 59 | #[inline(always)] 60 | fn and_not(self, other: Self) -> Self { 61 | Self::new(unsafe { _mm_andnot_si128(self.value, other.value) }) 62 | } 63 | 64 | const FULL_BITMASK: u16 = 0b1111; 65 | 66 | #[inline(always)] 67 | fn bitmask(self) -> u16 { 68 | unsafe { _mm_movemask_ps(_mm_castsi128_ps(self.value)) as u16 } 69 | } 70 | 71 | #[inline(always)] 72 | unsafe fn _mm_not(self) -> Self { 73 | self ^ Self::splat(!0) 74 | } 75 | 76 | #[inline(always)] 77 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 78 | Self::new(_mm_and_si128(self.value, rhs.value)) 79 | } 80 | 81 | #[inline(always)] 82 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 83 | Self::new(_mm_or_si128(self.value, rhs.value)) 84 | } 85 | 86 | #[inline(always)] 87 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 88 | Self::new(_mm_xor_si128(self.value, rhs.value)) 89 | } 90 | 91 | #[inline(always)] 92 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 93 | Self::zip(self, count, |x, s| x >> s) 94 | } 95 | 96 | #[inline(always)] 97 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 98 | Self::zip(self, count, |x, s| x << s) 99 | } 100 | 101 | #[inline(always)] 102 | unsafe fn _mm_shli(self, count: u32) -> Self { 103 | Self::new(_mm_sll_epi32(self.value, _mm_cvtsi32_si128(count as i32))) 104 | } 105 | 106 | #[inline(always)] 107 | unsafe fn _mm_shri(self, count: u32) -> Self { 108 | Self::new(_mm_srl_epi32(self.value, _mm_cvtsi32_si128(count as i32))) 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/sse42/vu32.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | decl!(u32x4: u32 => __m128i); 4 | 
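// As with `i32x4` above, `decl!` is assumed to generate the `u32x4` wrapper struct
// over `__m128i`, providing `Self::new` and the `value` field used by the impls below.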
impl Default for u32x4 { 5 | #[inline(always)] 6 | fn default() -> Self { 7 | Self::new(unsafe { _mm_setzero_si128() }) 8 | } 9 | } 10 | 11 | impl SimdVectorBase for u32x4 { 12 | type Element = u32; 13 | 14 | fn splat(value: Self::Element) -> Self { 15 | Self::new(unsafe { _mm_set1_epi32(value as i32) }) 16 | } 17 | 18 | unsafe fn undefined() -> Self { 19 | Self::new(_mm_undefined_si128()) 20 | } 21 | 22 | #[inline(always)] 23 | unsafe fn load_aligned_unchecked(ptr: *const Self::Element) -> Self { 24 | Self::new(_mm_load_si128(ptr as *const _)) 25 | } 26 | 27 | #[inline(always)] 28 | unsafe fn load_unaligned_unchecked(ptr: *const Self::Element) -> Self { 29 | Self::new(_mm_loadu_si128(ptr as *const _)) 30 | } 31 | 32 | #[inline(always)] 33 | unsafe fn store_aligned_unchecked(self, ptr: *mut Self::Element) { 34 | _mm_store_si128(ptr as *mut _, self.value) 35 | } 36 | 37 | #[inline(always)] 38 | unsafe fn store_unaligned_unchecked(self, ptr: *mut Self::Element) { 39 | _mm_storeu_si128(ptr as *mut _, self.value) 40 | } 41 | 42 | decl_base_common!(#[target_feature(enable = "sse4.1")] u32x4: u32 => __m128i); 43 | } 44 | 45 | impl SimdBitwise for u32x4 { 46 | fn and_not(self, other: Self) -> Self { 47 | Self::new(unsafe { _mm_andnot_si128(self.value, other.value) }) 48 | } 49 | 50 | const FULL_BITMASK: u16 = 0b1111; 51 | 52 | fn bitmask(self) -> u16 { 53 | unsafe { _mm_movemask_ps(_mm_castsi128_ps(self.value)) } 54 | } 55 | 56 | #[inline(always)] 57 | unsafe fn _mm_not(self) -> Self { 58 | self ^ Self::splat(!0) 59 | } 60 | 61 | #[inline(always)] 62 | unsafe fn _mm_bitand(self, rhs: Self) -> Self { 63 | Self::new(_mm_and_si128(self.value, rhs.value)) 64 | } 65 | 66 | #[inline(always)] 67 | unsafe fn _mm_bitor(self, rhs: Self) -> Self { 68 | Self::new(_mm_or_si128(self.value, rhs.value)) 69 | } 70 | 71 | #[inline(always)] 72 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self { 73 | Self::new(_mm_xor_si128(self.value, rhs.value)) 74 | } 75 | 76 | #[inline(always)] 77 | unsafe fn _mm_shr(self, count: Vu32) -> Self { 78 | Self::zip(self, count, Shr::shr) 79 | } 80 | 81 | #[inline(always)] 82 | unsafe fn _mm_shl(self, count: Vu32) -> Self { 83 | Self::zip(self, count, Shl::shl) 84 | } 85 | 86 | #[inline(always)] 87 | unsafe fn _mm_shli(self, count: u32) -> Self { 88 | Self::new(_mm_sll_epi32(self.value, _mm_cvtsi32_si128(count as i32))) 89 | } 90 | 91 | #[inline(always)] 92 | unsafe fn _mm_shri(self, count: u32) -> Self { 93 | Self::new(_mm_srl_epi32(self.value, _mm_cvtsi32_si128(count as i32))) 94 | } 95 | } 96 | 97 | impl PartialEq for u32x4 { 98 | fn eq(&self, other: &Self) -> bool { 99 | >::eq(*self, *other).all() 100 | } 101 | 102 | fn ne(&self, other: &Self) -> bool { 103 | >::ne(*self, *other).any() 104 | } 105 | } 106 | 107 | impl Eq for u32x4 {} 108 | 109 | impl SimdMask for u32x8 { 110 | #[inline(always)] 111 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self { 112 | Self::new(_mm256_blendv_epi8(f.value, t.value, self.value)) 113 | } 114 | 115 | #[inline(always)] 116 | unsafe fn _mm_all(self) -> bool { 117 | _mm_movemask_epi8(self.value) as u16 == u16::MAX 118 | } 119 | 120 | #[inline(always)] 121 | unsafe fn _mm_any(self) -> bool { 122 | _mm_movemask_epi8(self.value) != 0 123 | } 124 | 125 | #[inline(always)] 126 | unsafe fn _mm_none(self) -> bool { 127 | _mm_movemask_epi8(self.value) == 0 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /crates/thermite/src/backends/wasm32/mod.rs: 
-------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /crates/thermite/src/buffer.rs: -------------------------------------------------------------------------------- 1 | extern crate alloc; 2 | 3 | use crate::*; 4 | 5 | use alloc::alloc::{alloc, dealloc, Layout}; 6 | use core::{ 7 | fmt, mem, 8 | ops::{Deref, DerefMut}, 9 | ptr, 10 | }; 11 | 12 | /// Aligned SIMD vector storage 13 | #[repr(transparent)] 14 | pub struct VectorBuffer> { 15 | buffer: *mut [V::Element], 16 | } 17 | 18 | impl> Deref for VectorBuffer { 19 | type Target = [V::Element]; 20 | 21 | #[inline] 22 | fn deref(&self) -> &Self::Target { 23 | self.as_slice() 24 | } 25 | } 26 | 27 | impl> DerefMut for VectorBuffer { 28 | #[inline] 29 | fn deref_mut(&mut self) -> &mut Self::Target { 30 | self.as_mut_slice() 31 | } 32 | } 33 | 34 | impl> fmt::Debug for VectorBuffer { 35 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 36 | self.as_vector_slice().fmt(f) 37 | } 38 | } 39 | 40 | impl> VectorBuffer { 41 | /// Allocates a new SIMD-aligned element buffer and zeroes the elements. 42 | /// 43 | /// Due to the alignment, it will round up the number of elements to the nearest multiple of `V::NUM_ELEMENTS`, 44 | /// making the "wasted" space visible. 45 | pub fn alloc(count: usize) -> Self { 46 | unsafe { 47 | // round up to multiple of NUM_ELEMENTS 48 | // https://stackoverflow.com/a/9194117/2083075 49 | let count = (count + V::NUM_ELEMENTS - 1) & (-(V::NUM_ELEMENTS as isize) as usize); 50 | 51 | // allocate zeroed buffer. All SIMD types are valid when zeroed 52 | VectorBuffer { 53 | buffer: ptr::slice_from_raw_parts_mut( 54 | alloc::alloc::alloc_zeroed(Self::layout(count)) as *mut V::Element, 55 | count, 56 | ), 57 | } 58 | } 59 | } 60 | 61 | #[inline(always)] 62 | pub fn iter_vectors<'a>(&'a self) -> SimdSliceIter<'a, S, V> { 63 | SimdSliceIter::new(self.as_slice()) 64 | } 65 | 66 | #[inline(always)] 67 | pub fn iter_vectors_mut<'a>(&'a mut self) -> AlignedMutIter<'a, S, V> { 68 | unsafe { AlignedMut::new_unchecked(self.as_mut_slice()).iter_mut() } 69 | } 70 | 71 | /// Gathers values from the buffer using more efficient instructions where possible 72 | #[inline(always)] 73 | pub fn gather(&self, indices: S::Vu32) -> V 74 | where 75 | V: SimdVector, 76 | { 77 | V::gather(self.as_slice(), indices.cast()) 78 | } 79 | 80 | /// Fills the buffer with vectors using aligned stores 81 | #[inline] 82 | pub fn fill(&mut self, value: V) { 83 | unsafe { 84 | let ptr = self.as_mut_slice().as_mut_ptr(); 85 | let mut i = 0; 86 | while i < self.len() { 87 | value.store_aligned_unchecked(ptr.add(i)); 88 | i += V::NUM_ELEMENTS; 89 | } 90 | } 91 | } 92 | 93 | #[inline] 94 | pub fn len(&self) -> usize { 95 | unsafe { (*self.buffer).len() } 96 | } 97 | 98 | #[inline] 99 | pub fn len_vectors(&self) -> usize { 100 | self.len() / V::NUM_ELEMENTS 101 | } 102 | 103 | #[inline] 104 | pub fn as_slice(&self) -> &[V::Element] { 105 | unsafe { &*self.buffer } 106 | } 107 | 108 | #[inline] 109 | pub fn as_vector_slice(&self) -> &[V] { 110 | unsafe { &(*(self.buffer as *const [V]))[..self.len_vectors()] } 111 | } 112 | 113 | #[inline] 114 | pub fn as_mut_slice(&mut self) -> &mut [V::Element] { 115 | unsafe { &mut *self.buffer } 116 | } 117 | 118 | #[inline] 119 | pub fn as_mut_vector_slice(&mut self) -> &mut [V] { 120 | unsafe { &mut (*(self.buffer as *mut [V]))[..self.len() / V::NUM_ELEMENTS] } 121 | } 122 | 123 | #[inline] 124 | pub fn 
load_vector(&self, vector_index: usize) -> V { 125 | let scalar_index = vector_index * V::NUM_ELEMENTS; 126 | let s = self.as_slice(); 127 | assert!(scalar_index < s.len()); 128 | 129 | unsafe { V::load_aligned_unchecked(s.as_ptr().add(vector_index)) } 130 | } 131 | 132 | #[inline] 133 | pub fn store_vector(&mut self, vector_index: usize, value: V) { 134 | let scalar_index = vector_index * V::NUM_ELEMENTS; 135 | let s = self.as_mut_slice(); 136 | assert!(scalar_index < s.len()); 137 | 138 | unsafe { value.store_aligned_unchecked(s.as_mut_ptr().add(vector_index)) } 139 | } 140 | 141 | #[inline(always)] 142 | fn layout(count: usize) -> Layout { 143 | // ensure the buffer has the proper size and alignment for SIMD values 144 | unsafe { Layout::from_size_align_unchecked(count * mem::size_of::(), V::ALIGNMENT) } 145 | } 146 | } 147 | 148 | unsafe impl> Send for VectorBuffer {} 149 | unsafe impl> Sync for VectorBuffer {} 150 | 151 | impl> Drop for VectorBuffer { 152 | fn drop(&mut self) { 153 | unsafe { dealloc(self.buffer as *mut u8, Self::layout(self.len())) } 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /crates/thermite/src/divider.rs: -------------------------------------------------------------------------------- 1 | // Derived from: 2 | // 3 | // libdivide.h - Optimized integer division 4 | // https://libdivide.com 5 | // 6 | // Copyright (C) 2010 - 2019 ridiculous_fish, 7 | // Copyright (C) 2016 - 2019 Kim Walisch, 8 | 9 | #![allow(unused)] 10 | 11 | use core::ops::Deref; 12 | 13 | macro_rules! decl_div_half { 14 | ($($t:ty => $dt:ty),*) => { 15 | paste::paste! {$( 16 | #[inline(always)] 17 | const fn [](u1: $t, u0: $t, v: $t) -> ($t, $t) { 18 | let v = v as $dt; 19 | let n = ((u1 as $dt) << (core::mem::size_of::<$t>() * 8)) | (u0 as $dt); 20 | let res = (n / v) as $t; // truncate 21 | let rem = n.wrapping_sub((res as $dt).wrapping_mul(v)); 22 | (res, rem as $t) 23 | } 24 | )*} 25 | }; 26 | } 27 | 28 | decl_div_half!(u64 => u128, u32 => u64, u16 => u32, u8 => u16); 29 | 30 | /// Divider recommended for constant divisors. 31 | /// 32 | /// When using constant divisors, divisions using this can remove extra branches 33 | /// and generate ideal integer division code. 34 | /// 35 | /// However, when used with dynamic input, the extra branches can be expensive, 36 | /// therefore it is recommended to use the branchfree alternative for dynamic divisors. 37 | #[repr(C, packed)] 38 | pub struct Divider { 39 | multiplier: T, 40 | shift: u8, 41 | } 42 | 43 | /// Divider without branching, useful for dynamic divisors. 
44 | /// 45 | /// However, when used with constant input, this may perform extra unnecessary work that could 46 | /// be removed in the branching [`Divider`] 47 | #[repr(transparent)] 48 | #[derive(Copy, PartialEq)] 49 | pub struct BranchfreeDivider(Divider); 50 | 51 | impl Clone for BranchfreeDivider { 52 | fn clone(&self) -> Self { 53 | *self 54 | } 55 | } 56 | 57 | impl Deref for BranchfreeDivider { 58 | type Target = Divider; 59 | 60 | #[inline(always)] 61 | fn deref(&self) -> &Self::Target { 62 | &self.0 63 | } 64 | } 65 | 66 | impl Clone for Divider { 67 | fn clone(&self) -> Self { 68 | *self 69 | } 70 | } 71 | 72 | impl Copy for Divider {} 73 | 74 | impl PartialEq for Divider { 75 | #[inline(always)] 76 | fn eq(&self, other: &Self) -> bool { 77 | self.multiplier() == other.multiplier() && self.shift == other.shift 78 | } 79 | } 80 | 81 | impl Divider { 82 | #[inline(always)] 83 | pub fn multiplier(&self) -> T { 84 | // with repr(C), self points to first value 85 | unsafe { (self as *const Self as *const T).read_unaligned() } 86 | } 87 | 88 | #[inline(always)] 89 | pub fn shift(&self) -> u8 { 90 | // shift has an alignment of 1 byte anyway, so it's fine to read normally 91 | self.shift 92 | } 93 | } 94 | 95 | pub(crate) const ADD_MARKER: u8 = 0x40; 96 | pub(crate) const NEG_DIVISOR: u8 = 0x80; 97 | 98 | macro_rules! impl_shift_mask { 99 | ($($ty:ty),*) => {$( 100 | impl Divider<$ty> { 101 | const BITS: u32 = 8 * core::mem::size_of::<$ty>() as u32; 102 | /// !log2(N::BITS) 103 | pub(crate) const SHIFT_MASK: u8 = !(<$ty>::MAX << Self::BITS.trailing_zeros()) as u8; 104 | } 105 | )*}; 106 | } 107 | 108 | impl_shift_mask!(u8, u16, u32, u64); 109 | 110 | macro_rules! impl_unsigned_divider { 111 | ($($t:ty => $dt:ty),*) => { 112 | paste::paste! {$( 113 | impl BranchfreeDivider<$t> { 114 | /// See docs for [`BranchfreeDivider`] and [`Divider`] 115 | #[inline(always)] 116 | pub const fn [<$t>](d: $t) -> Self { 117 | Divider::<$t>::[<$t _branchfree>](d) 118 | } 119 | } 120 | 121 | impl Divider<$t> { 122 | /// See docs for [`Divider`] 123 | #[inline(always)] 124 | pub const fn [<$t>](d: $t) -> Self { 125 | Self::[<$t _internal>](d, false) 126 | } 127 | 128 | /// See docs for [`BranchfreeDivider`] and [`Divider`] 129 | #[inline] 130 | pub const fn [<$t _branchfree>](d: $t) -> BranchfreeDivider<$t> { 131 | let mut divider = Self::[<$t _internal>](d, true); 132 | divider.shift &= Self::SHIFT_MASK; 133 | BranchfreeDivider(divider) 134 | } 135 | 136 | #[inline] 137 | const fn [<$t _internal>](d: $t, bf: bool) -> Self { 138 | if d == 0 { 139 | return Divider { 140 | multiplier: 0, 141 | shift: Self::BITS as u8, // shift to zero 142 | } 143 | } 144 | 145 | let floor_log_2_d = Self::BITS - 1 - d.leading_zeros(); 146 | 147 | if d.is_power_of_two() { 148 | Divider { 149 | multiplier: 0, 150 | // We need to subtract 1 from the shift value in case of an unsigned 151 | // branchfree divider because there is a hardcoded right shift by 1 152 | // in its division algorithm. 
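// e.g. for d = 8 the branching divider stores shift = 3, while the branchfree
// divider stores shift = 2 and relies on that implicit extra shift.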
153 | shift: (floor_log_2_d - bf as u32) as u8, 154 | } 155 | } else { 156 | let k = 1 << floor_log_2_d; 157 | let (mut proposed_m, rem) = [](k, 0, d); 158 | 159 | let e = d.wrapping_sub(rem); 160 | 161 | let shift = if !bf && e < k { 162 | floor_log_2_d as u8 163 | } else { 164 | proposed_m = proposed_m.wrapping_add(proposed_m); 165 | let rem2 = rem.wrapping_add(rem); 166 | 167 | if rem2 >= d || rem2 < rem { 168 | proposed_m = proposed_m.wrapping_add(1); 169 | } 170 | 171 | floor_log_2_d as u8 | ADD_MARKER 172 | }; 173 | 174 | Divider { 175 | multiplier: proposed_m.wrapping_add(1), 176 | shift, 177 | } 178 | } 179 | } 180 | } 181 | )*} 182 | } 183 | } 184 | 185 | macro_rules! impl_signed_divider { 186 | ($($t:ty => $ut:ty => $udt:ty),*) => { 187 | paste::paste!{$( 188 | impl BranchfreeDivider<$t> { 189 | /// See docs for [`BranchfreeDivider`] and [`Divider`] 190 | #[inline(always)] 191 | pub const fn [<$t>](d: $t) -> Self { 192 | Divider::<$t>::[<$t _branchfree>](d) 193 | } 194 | } 195 | 196 | impl Divider<$t> { 197 | /// See docs for [`Divider`] 198 | #[inline(always)] 199 | const fn [<$t>](d: $t) -> Self { 200 | Self::[<$t _internal>](d, false) 201 | } 202 | 203 | /// See docs for [`BranchfreeDivider`] and [`Divider`] 204 | #[inline] 205 | const fn [<$t _branchfree>](d: $t) -> BranchfreeDivider<$t> { 206 | let mut divider = Self::[<$t _internal>](d, true); 207 | divider.shift &= Divider::<$ut>::SHIFT_MASK; 208 | BranchfreeDivider(divider) 209 | } 210 | 211 | #[inline] 212 | const fn [<$t _internal>](d: $t, bf: bool) -> Self { 213 | if d == 0 { 214 | return Divider { 215 | multiplier: 0, 216 | shift: Divider::<$ut>::BITS as u8, // shift to zero 217 | }; 218 | } 219 | 220 | let abs_d = d.abs() as $ut; 221 | 222 | let floor_log_2_d = Divider::<$ut>::BITS - 1 - d.leading_zeros(); 223 | 224 | if abs_d.is_power_of_two() { 225 | Divider { 226 | multiplier: 0, 227 | shift: floor_log_2_d as u8 | if d < 0 { NEG_DIVISOR } else { 0 }, 228 | } 229 | } else { 230 | let (mut proposed_m, rem) = [](1 << (floor_log_2_d - 1), 0, abs_d); 231 | 232 | let e = abs_d.wrapping_sub(rem); 233 | 234 | let mut shift = if !bf && e < (1 << floor_log_2_d) { 235 | (floor_log_2_d - 1) as u8 236 | } else { 237 | proposed_m = proposed_m.wrapping_add(proposed_m); 238 | let rem2 = rem.wrapping_add(rem); 239 | 240 | if rem2 >= abs_d || rem2 < rem { 241 | proposed_m = proposed_m.wrapping_add(1); 242 | } 243 | 244 | floor_log_2_d as u8 | ADD_MARKER 245 | }; 246 | 247 | proposed_m = proposed_m.wrapping_add(1); 248 | 249 | let mut multiplier = proposed_m as $t; 250 | 251 | if d < 0 { 252 | shift |= NEG_DIVISOR; 253 | 254 | if !bf { 255 | multiplier = -multiplier; 256 | } 257 | } 258 | 259 | Divider { multiplier, shift } 260 | } 261 | } 262 | } 263 | )*} 264 | } 265 | } 266 | 267 | impl_unsigned_divider!(u8 => u16, u16 => u32, u32 => u64, u64 => u128); 268 | 269 | impl_signed_divider! { 270 | i8 => u8 => u16, 271 | i16 => u16 => u32, 272 | i32 => u32 => u64, 273 | i64 => u64 => u128 274 | } 275 | -------------------------------------------------------------------------------- /crates/thermite/src/element.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | /// Umbrella trait for SIMD vector element bounds 4 | pub trait SimdElement: 'static + mask::Truthy + CastFromAll + Clone + Debug + Copy + Default + Send + Sync {} 5 | 6 | impl SimdElement for T where T: 'static + mask::Truthy + CastFromAll + Clone + Debug + Copy + Default + Send + Sync {} 7 | 8 | macro_rules! 
decl_cast_from_all { 9 | ($($ty:ty),*) => { 10 | pub trait CastFromAll: $(CastFrom<$ty>+)* {} 11 | impl CastFromAll for T where T: $(CastFrom<$ty>+)* {} 12 | } 13 | } 14 | 15 | pub trait CastFrom: Sized { 16 | fn cast_from(value: T) -> Self; 17 | } 18 | 19 | macro_rules! impl_cast_from { 20 | (@INNER $ty:ty as $as:ty) => { 21 | impl CastFrom<$ty> for $as { 22 | #[inline(always)] 23 | fn cast_from(value: $ty) -> $as { 24 | value as $as 25 | } 26 | } 27 | }; 28 | ($($ty:ty),*) => { 29 | $( 30 | impl_cast_from_bool!($ty); 31 | impl_cast_from!(@INNER $ty as i8); 32 | impl_cast_from!(@INNER $ty as i16); 33 | impl_cast_from!(@INNER $ty as i32); 34 | impl_cast_from!(@INNER $ty as i64); 35 | impl_cast_from!(@INNER $ty as isize); 36 | impl_cast_from!(@INNER $ty as u8); 37 | impl_cast_from!(@INNER $ty as u16); 38 | impl_cast_from!(@INNER $ty as u32); 39 | impl_cast_from!(@INNER $ty as u64); 40 | impl_cast_from!(@INNER $ty as usize); 41 | impl_cast_from!(@INNER $ty as f32); 42 | impl_cast_from!(@INNER $ty as f64); 43 | )* 44 | }; 45 | } 46 | 47 | macro_rules! impl_cast_from_bool { 48 | ($ty:ty) => { 49 | impl CastFrom for $ty { 50 | #[inline(always)] 51 | fn cast_from(value: bool) -> Self { 52 | if value { 53 | 1 as $ty 54 | } else { 55 | 0 as $ty 56 | } 57 | } 58 | } 59 | 60 | impl CastFrom<$ty> for bool { 61 | #[inline(always)] 62 | fn cast_from(value: $ty) -> bool { 63 | value != (0 as $ty) 64 | } 65 | } 66 | }; 67 | } 68 | 69 | decl_cast_from_all!(i8, i16, i32, i64, u8, u16, u32, u64, isize, usize, f32, f64, bool); 70 | impl_cast_from!(i8, i16, i32, i64, u8, u16, u32, u64, isize, usize, f32, f64); 71 | 72 | impl CastFrom for bool { 73 | #[inline(always)] 74 | fn cast_from(value: bool) -> bool { 75 | value 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /crates/thermite/src/iter/aligned.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | pub struct AlignedMut<'a, S: Simd, V: SimdVectorBase> { 4 | ptr: *mut V::Element, 5 | len: usize, 6 | _lt: PhantomData<&'a S>, 7 | } 8 | 9 | impl<'a, S: Simd, V: SimdVectorBase> AlignedMut<'a, S, V> { 10 | #[inline] 11 | pub unsafe fn new_unchecked(slice: &'a mut [V::Element]) -> Self { 12 | AlignedMut { 13 | ptr: slice.as_mut_ptr(), 14 | len: slice.len(), 15 | _lt: PhantomData, 16 | } 17 | } 18 | 19 | #[inline] 20 | pub fn new(slice: &'a mut [V::Element]) -> Option { 21 | if slice.as_ptr().align_offset(V::ALIGNMENT) != 0 { 22 | None 23 | } else { 24 | Some(unsafe { AlignedMut::new_unchecked(slice) }) 25 | } 26 | } 27 | 28 | #[inline] 29 | pub fn iter_mut(self) -> AlignedMutIter<'a, S, V> { 30 | AlignedMutIter(self) 31 | } 32 | } 33 | 34 | pub struct AlignedMutIter<'a, S: Simd, V: SimdVectorBase>(AlignedMut<'a, S, V>); 35 | 36 | impl<'a, S: Simd, V: SimdVectorBase> AlignedMutIter<'a, S, V> { 37 | /// Returns the remainder of the slice that is being iterated over. 38 | /// 39 | /// If the iterator has been exhausted (`next()` returns `None`), 40 | /// this may still return elements that would not fill an SIMD vector. 
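/// (Once `next()` has returned `None`, the remainder holds at most `V::NUM_ELEMENTS - 1` trailing elements.)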
41 | pub fn remainder(&mut self) -> &'a mut [V::Element] { 42 | unsafe { core::slice::from_raw_parts_mut(self.0.ptr, self.0.len) } 43 | } 44 | } 45 | 46 | impl<'a, S: Simd, V: SimdVectorBase> Iterator for AlignedMutIter<'a, S, V> { 47 | type Item = &'a mut V; 48 | 49 | #[inline] 50 | fn next(&mut self) -> Option<&'a mut V> { 51 | if self.0.len < V::NUM_ELEMENTS { 52 | None 53 | } else { 54 | unsafe { 55 | let ptr = self.0.ptr; 56 | self.0.ptr = self.0.ptr.add(V::NUM_ELEMENTS); 57 | self.0.len -= V::NUM_ELEMENTS; 58 | Some(&mut *(ptr as *mut V)) 59 | } 60 | } 61 | } 62 | 63 | #[inline] 64 | fn size_hint(&self) -> (usize, Option) { 65 | let remaining = self.0.len / V::NUM_ELEMENTS; 66 | (remaining, Some(remaining)) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /crates/thermite/src/iter/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | mod aligned; 4 | mod slice; 5 | 6 | pub use self::aligned::*; 7 | pub use self::slice::*; 8 | 9 | pub trait SimdIteratorExt: Iterator 10 | where 11 | V: SimdVector, 12 | { 13 | fn store(self, dst: &mut [V::Element], write_zero: bool) 14 | where 15 | Self: Sized; 16 | 17 | #[inline] 18 | fn cast(self) -> SimdCastIter 19 | where 20 | Self: Sized, 21 | U: SimdFromCast, 22 | { 23 | SimdCastIter { 24 | src: self, 25 | _tys: PhantomData, 26 | } 27 | } 28 | } 29 | 30 | pub trait IntoSimdIterator { 31 | type Item; 32 | type IntoIter: Iterator; 33 | 34 | fn into_simd_iter(self) -> Self::IntoIter; 35 | } 36 | 37 | pub struct SimdCastIter { 38 | src: I, 39 | _tys: PhantomData<(S, V, U)>, 40 | } 41 | 42 | impl Clone for SimdCastIter 43 | where 44 | I: Clone, 45 | { 46 | fn clone(&self) -> Self { 47 | SimdCastIter { 48 | src: self.src.clone(), 49 | _tys: PhantomData, 50 | } 51 | } 52 | } 53 | 54 | impl Iterator for SimdCastIter 55 | where 56 | I: Iterator, 57 | U: SimdFromCast, 58 | { 59 | type Item = U; 60 | 61 | #[inline] 62 | fn next(&mut self) -> Option { 63 | self.src.next().map(|v| U::from_cast(v)) 64 | } 65 | } 66 | 67 | impl SimdIteratorExt for T 68 | where 69 | T: Iterator, 70 | V: SimdVector, 71 | { 72 | #[inline] 73 | fn store(mut self, dst: &mut [V::Element], write_zero: bool) 74 | where 75 | Self: Sized, 76 | { 77 | let mut chunks = dst.chunks_exact_mut(V::NUM_ELEMENTS); 78 | 79 | // normal writes 80 | (&mut self).zip(&mut chunks).for_each(|(src, dst)| unsafe { 81 | src.store_unaligned_unchecked(dst.as_mut_ptr()); 82 | }); 83 | 84 | if write_zero { 85 | // fill any remaining chunks with zero 86 | (&mut chunks).for_each(|dst| unsafe { 87 | V::zero().store_unaligned_unchecked(dst.as_mut_ptr()); 88 | }); 89 | } 90 | 91 | // if there is a remainder, check to fill it 92 | let rem = chunks.into_remainder(); 93 | if thermite_unlikely!(!rem.is_empty()) { 94 | // if there are any values left, write what we can or zero it 95 | let value = match self.next() { 96 | Some(value) => value, 97 | None if write_zero => V::zero(), 98 | _ => return, // don't zero and nothing to write, so return 99 | }; 100 | 101 | let indices = Vi32::::indexed(); 102 | let mask = Vi32::::splat(rem.len() as i32).lt(indices); 103 | 104 | unsafe { value.scatter_masked_unchecked(rem.as_mut_ptr(), indices, mask.cast_to()) }; 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /crates/thermite/src/iter/slice.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | pub struct 
SimdSliceIter<'a, S: Simd, V: SimdVectorBase> { 4 | // TODO: Replace with pointer? 5 | slice: &'a [V::Element], 6 | _tys: PhantomData<&'a S>, 7 | } 8 | 9 | impl> Clone for SimdSliceIter<'_, S, V> { 10 | fn clone(&self) -> Self { 11 | SimdSliceIter { 12 | slice: self.slice.clone(), 13 | _tys: PhantomData, 14 | } 15 | } 16 | } 17 | 18 | impl<'a, S: Simd, T> IntoSimdIterator for &'a [T] 19 | where 20 | T: SimdAssociatedVector, 21 | AssociatedVector: SimdVectorBase, 22 | { 23 | type Item = AssociatedVector; 24 | type IntoIter = SimdSliceIter<'a, S, Self::Item>; 25 | 26 | fn into_simd_iter(self) -> SimdSliceIter<'a, S, Self::Item> { 27 | SimdSliceIter::new(self) 28 | } 29 | } 30 | 31 | impl<'a, S: Simd, V: SimdVectorBase> SimdSliceIter<'a, S, V> { 32 | #[inline] 33 | pub fn new(slice: &'a [V::Element]) -> Self { 34 | SimdSliceIter { 35 | slice, 36 | _tys: PhantomData, 37 | } 38 | } 39 | 40 | /// Returns the remainder of the slice that is being iterated over. 41 | /// 42 | /// If the iterator has been exhausted (`next()` returns `None`), 43 | /// this may still return elements that would not fill an SIMD vector. 44 | #[inline] 45 | pub fn remainder(&self) -> &[V::Element] { 46 | self.slice 47 | } 48 | } 49 | 50 | impl<'a, S: Simd, V> Iterator for SimdSliceIter<'a, S, V> 51 | where 52 | V: SimdVectorBase, 53 | { 54 | type Item = V; 55 | 56 | #[inline] 57 | fn next(&mut self) -> Option { 58 | if self.slice.len() < V::NUM_ELEMENTS { 59 | None 60 | } else { 61 | let vector = V::load_unaligned(self.slice); 62 | self.slice = &self.slice[V::NUM_ELEMENTS..]; 63 | Some(vector) 64 | } 65 | } 66 | 67 | #[inline] 68 | fn size_hint(&self) -> (usize, Option) { 69 | let remaining = self.slice.len() / V::NUM_ELEMENTS; 70 | (remaining, Some(remaining)) 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /crates/thermite/src/macros.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | #[cfg(feature = "nightly")] 4 | pub use core::intrinsics::{likely, unlikely}; 5 | 6 | // borrows technique from https://github.com/rust-lang/hashbrown/pull/209 7 | #[cfg(not(feature = "nightly"))] 8 | #[inline] 9 | #[cold] 10 | fn cold() {} 11 | 12 | #[cfg(not(feature = "nightly"))] 13 | #[rustfmt::skip] 14 | #[inline(always)] 15 | pub unsafe fn likely(b: bool) -> bool { 16 | if !b { cold() } b 17 | } 18 | 19 | #[cfg(not(feature = "nightly"))] 20 | #[rustfmt::skip] 21 | #[inline(always)] 22 | pub unsafe fn unlikely(b: bool) -> bool { 23 | if b { cold() } b 24 | } 25 | 26 | #[doc(hidden)] 27 | #[macro_export] 28 | #[rustfmt::skip] 29 | macro_rules! thermite_likely { 30 | ($e:expr) => {{ 31 | #[allow(unused_unsafe)] 32 | unsafe { $crate::macros::likely($e) } 33 | }}; 34 | } 35 | 36 | #[doc(hidden)] 37 | #[macro_export] 38 | #[rustfmt::skip] 39 | macro_rules! 
thermite_unlikely { 40 | ($e:expr) => {{ 41 | #[allow(unused_unsafe)] 42 | unsafe { $crate::macros::unlikely($e) } 43 | }}; 44 | } 45 | -------------------------------------------------------------------------------- /crates/thermite/src/math/compensated.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use core::convert::TryFrom; 4 | 5 | #[derive(Debug, Clone, Copy)] 6 | pub struct Compensated> { 7 | pub val: V, 8 | pub err: V, 9 | _simd: PhantomData, 10 | } 11 | 12 | #[dispatch(thermite = "crate")] 13 | impl> Compensated { 14 | #[inline(always)] 15 | fn from_parts(val: V, err: V) -> Self { 16 | Compensated { 17 | val, 18 | err, 19 | _simd: PhantomData, 20 | } 21 | } 22 | 23 | #[inline(always)] 24 | pub fn new(val: V) -> Self { 25 | Self::from_parts(val, V::zero()) 26 | } 27 | 28 | #[inline(always)] 29 | pub fn value(self) -> V { 30 | self.val + self.err 31 | } 32 | 33 | #[inline(always)] 34 | pub fn product(a: V, b: V) -> Self { 35 | let val = a * b; 36 | 37 | if S::INSTRSET.has_true_fma() { 38 | Compensated::from_parts(val, a.mul_sub(b, val)) 39 | } else { 40 | // split into half-ish-precision 41 | let factor = match V::ELEMENT_SIZE { 42 | 4 => V::splat_as::(1u32 << 13 + 1), 43 | 8 => V::splat_as::(1u32 << 27 + 1), 44 | _ => unsafe { crate::unreachable_unchecked() }, 45 | }; 46 | 47 | let (a1, a2) = { 48 | let c = factor * a; 49 | let x = c - (c - a); 50 | (x, a - x) 51 | }; 52 | 53 | let (b1, b2) = { 54 | let c = factor * b; 55 | let x = c - (c - b); 56 | (x, b - x) 57 | }; 58 | 59 | let err = a2 * b2 - (((val - a1 * b1) - a2 * b1) - a1 * b2); 60 | 61 | Compensated::from_parts(val, err) 62 | } 63 | } 64 | 65 | #[inline(always)] 66 | pub fn sum(a: V, b: V) -> Self { 67 | let x = a + b; 68 | let z = x - a; 69 | let y = (a - (x - z)) + (b - z); 70 | 71 | Compensated::from_parts(x, y) 72 | } 73 | } 74 | 75 | impl> Add for Compensated { 76 | type Output = Self; 77 | 78 | fn add(mut self, rhs: V) -> Self { 79 | let pi = Self::sum(self.val, rhs); 80 | self.val = pi.val; 81 | self.err += pi.err; 82 | self 83 | } 84 | } 85 | 86 | // Accurate Floating Point Product, Stef Graillat 87 | // 88 | // https://www-pequan.lip6.fr/~graillat/papers/REC08_Paper_Graillat.pdf 89 | impl> Mul for Compensated { 90 | type Output = Self; 91 | 92 | fn mul(mut self, rhs: V) -> Self { 93 | let pi = Self::product(self.val, rhs); 94 | self.val = pi.val; 95 | self.err = self.err.mul_adde(rhs, pi.err); 96 | self 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /crates/thermite/src/math/consts.rs: -------------------------------------------------------------------------------- 1 | use super::*; 2 | 3 | pub trait SimdFloatVectorConsts: SimdFloatVector { 4 | /// Euler’s number (e) 5 | fn E() -> Self; 6 | 7 | /// 1/π 8 | fn FRAC_1_PI() -> Self; 9 | 10 | /// 1/sqrt(2) 11 | fn FRAC_1_SQRT_2() -> Self; 12 | 13 | /// 2/π 14 | fn FRAC_2_PI() -> Self; 15 | 16 | /// 1/sqrt(π) 17 | fn FRAC_1_SQRT_PI() -> Self; 18 | 19 | /// 2/sqrt(π) 20 | fn FRAC_2_SQRT_PI() -> Self; 21 | 22 | /// π/2 23 | fn FRAC_PI_2() -> Self; 24 | 25 | /// π/3 26 | fn FRAC_PI_3() -> Self; 27 | 28 | /// π/4 29 | fn FRAC_PI_4() -> Self; 30 | 31 | /// π/6 32 | fn FRAC_PI_6() -> Self; 33 | 34 | /// π/8 35 | fn FRAC_PI_8() -> Self; 36 | 37 | /// ln(2) 38 | fn LN_2() -> Self; 39 | 40 | /// ln(10) 41 | fn LN_10() -> Self; 42 | 43 | /// ln(π) 44 | fn LN_PI() -> Self; 45 | 46 | /// log2(10) 47 | fn LOG2_10() -> Self; 48 | 49 | /// log2(e) 50 | fn LOG2_E() -> 
Self; 51 | 52 | /// log10(2) 53 | fn LOG10_2() -> Self; 54 | 55 | /// log10(e) 56 | fn LOG10_E() -> Self; 57 | 58 | /// Archimedes’ constant (π) 59 | fn PI() -> Self; 60 | 61 | /// sqrt(2) 62 | fn SQRT_2() -> Self; 63 | 64 | /// sqrt(e) 65 | fn SQRT_E() -> Self; 66 | 67 | /// The full circle constant (τ) 68 | fn TAU() -> Self; 69 | 70 | /// sqrt(π/2) 71 | fn SQRT_FRAC_PI_2() -> Self; 72 | } 73 | 74 | #[doc(hidden)] 75 | pub trait SimdFloatVectorConstsInternal: SimdElement { 76 | type Vf: SimdFloatVector; 77 | 78 | fn E() -> Self::Vf; 79 | fn FRAC_1_PI() -> Self::Vf; 80 | fn FRAC_1_SQRT_2() -> Self::Vf; 81 | fn FRAC_2_PI() -> Self::Vf; 82 | fn FRAC_1_SQRT_PI() -> Self::Vf; 83 | fn FRAC_2_SQRT_PI() -> Self::Vf; 84 | fn FRAC_PI_2() -> Self::Vf; 85 | fn FRAC_PI_3() -> Self::Vf; 86 | fn FRAC_PI_4() -> Self::Vf; 87 | fn FRAC_PI_6() -> Self::Vf; 88 | fn FRAC_PI_8() -> Self::Vf; 89 | fn LN_2() -> Self::Vf; 90 | fn LN_10() -> Self::Vf; 91 | fn LN_PI() -> Self::Vf; 92 | fn LOG2_10() -> Self::Vf; 93 | fn LOG2_E() -> Self::Vf; 94 | fn LOG10_2() -> Self::Vf; 95 | fn LOG10_E() -> Self::Vf; 96 | fn PI() -> Self::Vf; 97 | fn SQRT_2() -> Self::Vf; 98 | fn SQRT_E() -> Self::Vf; 99 | fn TAU() -> Self::Vf; 100 | fn SQRT_FRAC_PI_2() -> Self::Vf; 101 | } 102 | 103 | macro_rules! impl_internal_consts { 104 | ($t:ident: $vf:ident => $($name:ident),*) => { 105 | #[inline(always)] 106 | fn FRAC_1_SQRT_PI() -> Self::Vf { 107 | Self::Vf::splat(0.5641895835477562869480794515607725858440506293289988568440857217) 108 | } 109 | 110 | #[inline(always)] 111 | fn SQRT_FRAC_PI_2() -> Self::Vf { 112 | Self::Vf::splat(1.2533141373155002512078826424055226265034933703049691583149617881) 113 | } 114 | 115 | #[inline(always)] 116 | fn LN_PI() -> Self::Vf { 117 | Self::Vf::splat(1.1447298858494001741434273513530587116472948129153115715136230714) 118 | } 119 | 120 | #[inline(always)] 121 | fn SQRT_E() -> Self::Vf { 122 | Self::Vf::splat(1.6487212707001281468486507878141635716537761007101480115750793116) 123 | } 124 | 125 | $( 126 | #[inline(always)] 127 | fn $name() -> Self::Vf { 128 | Self::Vf::splat(core::$t::consts::$name) 129 | } 130 | )* 131 | } 132 | } 133 | 134 | impl SimdFloatVectorConstsInternal for f32 { 135 | type Vf = ::Vf32; 136 | 137 | impl_internal_consts!(f32: Vf32 => E, FRAC_1_PI, FRAC_1_SQRT_2, FRAC_2_PI, FRAC_2_SQRT_PI, FRAC_PI_2, FRAC_PI_3, FRAC_PI_4, FRAC_PI_6, FRAC_PI_8, LN_2, LN_10, LOG2_10, LOG2_E, LOG10_2, LOG10_E, PI, SQRT_2, TAU); 138 | } 139 | 140 | impl SimdFloatVectorConstsInternal for f64 { 141 | type Vf = ::Vf64; 142 | 143 | impl_internal_consts!(f64: Vf64 => E, FRAC_1_PI, FRAC_1_SQRT_2, FRAC_2_PI, FRAC_2_SQRT_PI, FRAC_PI_2, FRAC_PI_3, FRAC_PI_4, FRAC_PI_6, FRAC_PI_8, LN_2, LN_10, LOG2_10, LOG2_E, LOG10_2, LOG10_E, PI, SQRT_2, TAU); 144 | } 145 | 146 | macro_rules! 
impl_consts { 147 | ($($name:ident),*) => { 148 | $( 149 | #[inline(always)] fn $name() -> Self { 150 | <>::Element as SimdFloatVectorConstsInternal>::$name() 151 | } 152 | )* 153 | } 154 | } 155 | 156 | #[rustfmt::skip] 157 | impl SimdFloatVectorConsts for T 158 | where 159 | T: SimdFloatVector, 160 | >::Element: SimdFloatVectorConstsInternal, 161 | { 162 | impl_consts!(E, FRAC_1_PI, FRAC_1_SQRT_2, FRAC_2_PI, FRAC_1_SQRT_PI, FRAC_2_SQRT_PI, FRAC_PI_2, FRAC_PI_3, FRAC_PI_4, FRAC_PI_6, FRAC_PI_8, LN_2, LN_10, LN_PI, LOG2_10, LOG2_E, LOG10_2, LOG10_E, PI, SQRT_2, SQRT_E, TAU, SQRT_FRAC_PI_2); 163 | } 164 | -------------------------------------------------------------------------------- /crates/thermite/src/math/poly.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | Optimized fixed-degree polynomial evaluation 3 | 4 | All of these polynomials use Estrin's scheme to reduce the dependency chain length 5 | and encourage instruction-level parallelism, which has the potential to improve 6 | performance despite the powers of X being required upfront. 7 | 8 | Powers of x are required, rather than computed internally, so they could be reused 9 | between multiple polynomials. 10 | 11 | Unless you are micro-optimizing, it's recommended to use `SimdVectorizedMath::poly` or `poly_f` 12 | */ 13 | 14 | use crate::*; 15 | 16 | #[inline(always)] 17 | pub fn poly_1>(x: V, c0: V, c1: V) -> V { 18 | x.mul_adde(c1, c0) 19 | } 20 | 21 | #[inline(always)] 22 | pub fn poly_2>(x: V, x2: V, c0: V, c1: V, c2: V) -> V { 23 | x2.mul_adde(c2, x.mul_adde(c1, c0)) 24 | } 25 | 26 | #[inline(always)] 27 | pub fn poly_3>(x: V, x2: V, c0: V, c1: V, c2: V, c3: V) -> V { 28 | // x^2 * (x * c3 + c2) + (x*c1 + c0) 29 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)) 30 | } 31 | 32 | #[inline(always)] 33 | pub fn poly_4>(x: V, x2: V, x4: V, c0: V, c1: V, c2: V, c3: V, c4: V) -> V { 34 | // x^4 * c4 + (x^2 * (x * c3 + c2) + (x*c1 + c0)) 35 | x4.mul_adde(c4, x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0))) 36 | } 37 | 38 | #[inline(always)] 39 | pub fn poly_5>(x: V, x2: V, x4: V, c0: V, c1: V, c2: V, c3: V, c4: V, c5: V) -> V { 40 | // x^4 * (x * c5 + c4) + (x^2 * (x * c3 + c2) + (x*c1 + c0)) 41 | x4.mul_adde(x.mul_adde(c5, c4), x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0))) 42 | } 43 | 44 | #[rustfmt::skip] 45 | #[inline(always)] 46 | pub fn poly_6>(x: V, x2: V, x4: V, c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V) -> V { 47 | // x^4 * (x^2 * c6 + (x * c5 + c4)) + (x^2 * (x * c3 + c2) + (x * c1 + c0)) 48 | x4.mul_adde( 49 | x2.mul_adde(c6, x.mul_adde(c5, c4)), 50 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 51 | ) 52 | } 53 | 54 | #[rustfmt::skip] 55 | #[inline(always)] 56 | pub fn poly_7>(x: V, x2: V, x4: V, c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V) -> V { 57 | x4.mul_adde( 58 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 59 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 60 | ) 61 | } 62 | 63 | #[rustfmt::skip] 64 | #[inline(always)] 65 | pub fn poly_8>( 66 | x: V, x2: V, x4: V, x8: V, 67 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V 68 | ) -> V { 69 | x8.mul_adde(c8, x4.mul_adde( 70 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 71 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 72 | )) 73 | } 74 | 75 | #[rustfmt::skip] 76 | #[inline(always)] 77 | pub fn poly_9>( 78 | x: V, x2: V, x4: V, x8: V, 79 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V 80 | ) -> V { 81 | 
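// x^8 * (x * c9 + c8) + (x^4 * (x^2 * (x * c7 + c6) + (x * c5 + c4)) + (x^2 * (x * c3 + c2) + (x * c1 + c0)))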
x8.mul_adde(x.mul_adde(c9, c8), x4.mul_adde( 82 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 83 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 84 | )) 85 | } 86 | 87 | #[rustfmt::skip] 88 | #[inline(always)] 89 | pub fn poly_10>( 90 | x: V, x2: V, x4: V, x8: V, 91 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, 92 | ) -> V { 93 | x8.mul_adde(x2.mul_adde(c10, x.mul_adde(c9, c8)), x4.mul_adde( 94 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 95 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 96 | )) 97 | } 98 | 99 | #[rustfmt::skip] 100 | #[inline(always)] 101 | pub fn poly_11>( 102 | x: V, x2: V, x4: V, x8: V, 103 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V 104 | ) -> V { 105 | x8.mul_adde( 106 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 107 | x4.mul_adde( 108 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 109 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 110 | ), 111 | ) 112 | } 113 | 114 | #[rustfmt::skip] 115 | #[inline(always)] 116 | pub fn poly_12>( 117 | x: V, x2: V, x4: V, x8: V, 118 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V, c12: V, 119 | ) -> V { 120 | x8.mul_adde( 121 | x4.mul_adde( 122 | c12, 123 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 124 | ), 125 | x4.mul_adde( 126 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 127 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 128 | ), 129 | ) 130 | } 131 | 132 | #[rustfmt::skip] 133 | #[inline(always)] 134 | pub fn poly_13>( 135 | x: V, x2: V, x4: V, x8: V, 136 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V, c12: V, c13: V, 137 | ) -> V { 138 | x8.mul_adde( 139 | x4.mul_adde( 140 | x.mul_adde(c13, c12), 141 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 142 | ), 143 | x4.mul_adde( 144 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 145 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 146 | ), 147 | ) 148 | } 149 | 150 | #[rustfmt::skip] 151 | #[inline(always)] 152 | pub fn poly_14>( 153 | x: V, x2: V, x4: V, x8: V, 154 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V, c12: V, c13: V, c14: V 155 | ) -> V { 156 | // (((C0+C1x) + (C2+C3x)x2) + ((C4+C5x) + (C6+C7x)x2)x4) + (((C8+C9x) + (C10+C11x)x2) + ((C12+C13x) + C14*x2)x4)x8 157 | x8.mul_adde( 158 | x4.mul_adde( 159 | x2.mul_adde(c14, x.mul_adde(c13, c12)), 160 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 161 | ), 162 | x4.mul_adde( 163 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 164 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 165 | ), 166 | ) 167 | } 168 | 169 | #[rustfmt::skip] 170 | #[inline(always)] 171 | pub fn poly_15>( 172 | x: V, x2: V, x4: V, x8: V, 173 | c0: V, c1: V, c2: V, c3: V, c4: V, c5: V, c6: V, c7: V, c8: V, c9: V, c10: V, c11: V, c12: V, c13: V, c14: V, c15: V 174 | ) -> V { 175 | // (((C0+C1x) + (C2+C3x)x2) + ((C4+C5x) + (C6+C7x)x2)x4) + (((C8+C9x) + (C10+C11x)x2) + ((C12+C13x) + (C14+C15x)x2)x4)x8 176 | x8.mul_adde( 177 | x4.mul_adde( 178 | x2.mul_adde(x.mul_adde(c15, c14), x.mul_adde(c13, c12)), 179 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c9, c8)), 180 | ), 181 | x4.mul_adde( 182 | x2.mul_adde(x.mul_adde(c7, c6), x.mul_adde(c5, c4)), 183 | x2.mul_adde(x.mul_adde(c3, c2), x.mul_adde(c1, c0)), 184 | ), 185 | ) 186 | } 187 | 188 | #[rustfmt::skip] 189 | #[inline(always)] 190 | pub fn poly_30>( 191 | x: V, x2: V, x4: V, x8: 
V, x16: V, 192 | c00: V, c01: V, c02: V, c03: V, c04: V, c05: V, c06: V, c07: V, c08: V, c09: V, c10: V, c11: V, c12: V, c13: V, c14: V, c15: V, 193 | c16: V, c17: V, c18: V, c19: V, c20: V, c21: V, c22: V, c23: V, c24: V, c25: V, c26: V, c27: V, c28: V, c29: V, c30: V, c31: V 194 | ) -> V { 195 | x16.mul_adde( 196 | x8.mul_adde( 197 | x4.mul_adde( 198 | x2.mul_adde(x.mul_adde(c31, c30), x.mul_adde(c29, c28)), 199 | x2.mul_adde(x.mul_adde(c27, c26), x.mul_adde(c25, c24)), 200 | ), 201 | x4.mul_adde( 202 | x2.mul_adde(x.mul_adde(c23, c22), x.mul_adde(c21, c20)), 203 | x2.mul_adde(x.mul_adde(c19, c18), x.mul_adde(c17, c16)), 204 | ), 205 | ), 206 | x8.mul_adde( 207 | x4.mul_adde( 208 | x2.mul_adde(x.mul_adde(c15, c14), x.mul_adde(c13, c12)), 209 | x2.mul_adde(x.mul_adde(c11, c10), x.mul_adde(c09, c08)), 210 | ), 211 | x4.mul_adde( 212 | x2.mul_adde(x.mul_adde(c07, c06), x.mul_adde(c05, c04)), 213 | x2.mul_adde(x.mul_adde(c03, c02), x.mul_adde(c01, c00)), 214 | ), 215 | ) 216 | ) 217 | } 218 | -------------------------------------------------------------------------------- /crates/thermite/src/pointer.rs: -------------------------------------------------------------------------------- 1 | use core::marker::PhantomData; 2 | use core::mem; 3 | 4 | use super::*; 5 | 6 | #[derive(Debug, Clone, Copy, PartialEq)] 7 | #[repr(transparent)] 8 | pub struct VPtr { 9 | ptr: S::Vusize, 10 | ty: PhantomData, 11 | } 12 | 13 | impl VPtr 14 | where 15 | T: SimdAssociatedVector, 16 | S::Vusize: SimdPtrInternal>, 17 | { 18 | #[inline(always)] 19 | pub fn splat(ptr: *mut T) -> Self { 20 | Self { 21 | ptr: S::Vusize::splat(ptr as _), 22 | ty: PhantomData, 23 | } 24 | } 25 | 26 | #[inline(always)] 27 | pub fn add(self, offset: S::Vusize) -> Self { 28 | Self { 29 | ptr: self.ptr + offset * S::Vusize::splat(mem::size_of::() as _), 30 | ty: PhantomData, 31 | } 32 | } 33 | 34 | #[inline(always)] 35 | pub fn is_null(self) -> Mask { 36 | self.ptr.eq(S::Vusize::zero()) 37 | } 38 | 39 | #[inline(always)] 40 | pub unsafe fn read(self) -> AssociatedVector { 41 | self.ptr._mm_gather() 42 | } 43 | 44 | #[inline(always)] 45 | pub unsafe fn read_masked( 46 | self, 47 | mask: Mask>, 48 | default: AssociatedVector, 49 | ) -> AssociatedVector { 50 | self.ptr._mm_gather_masked(mask, default) 51 | } 52 | 53 | #[inline(always)] 54 | pub unsafe fn write(self, value: AssociatedVector) { 55 | self.ptr._mm_scatter(value) 56 | } 57 | 58 | #[inline(always)] 59 | pub unsafe fn write_masked(self, mask: Mask>, value: AssociatedVector) { 60 | self.ptr._mm_scatter_masked(mask, value) 61 | } 62 | } 63 | 64 | #[doc(hidden)] 65 | pub trait AsUsize: Sized { 66 | fn as_usize(self) -> usize; 67 | } 68 | 69 | #[doc(hidden)] 70 | pub trait SimdPtrInternal>: SimdVector 71 | where 72 | >::Element: AsUsize, 73 | { 74 | #[inline(always)] 75 | unsafe fn _mm_gather(self) -> V { 76 | self._mm_gather_masked(Mask::truthy(), V::default()) 77 | } 78 | 79 | #[inline(always)] 80 | unsafe fn _mm_scatter(self, value: V) { 81 | self._mm_scatter_masked(Mask::truthy(), value) 82 | } 83 | 84 | #[inline(always)] 85 | unsafe fn _mm_gather_masked(self, mask: Mask, default: V) -> V { 86 | let mut res = default; 87 | for i in 0..Self::NUM_ELEMENTS { 88 | if mask.extract_unchecked(i) { 89 | res = res.replace_unchecked( 90 | i, 91 | mem::transmute::<_, *const V::Element>(self.extract_unchecked(i).as_usize()).read(), 92 | ); 93 | } 94 | } 95 | res 96 | } 97 | 98 | #[inline(always)] 99 | unsafe fn _mm_scatter_masked(self, mask: Mask, value: V) { 100 | for i in 
0..Self::NUM_ELEMENTS { 101 | if mask.extract_unchecked(i) { 102 | mem::transmute::<_, *mut V::Element>(self.extract_unchecked(i).as_usize()) 103 | .write(value.extract_unchecked(i)); 104 | } 105 | } 106 | } 107 | } 108 | 109 | impl AsUsize for u32 { 110 | #[inline(always)] 111 | fn as_usize(self) -> usize { 112 | self as usize 113 | } 114 | } 115 | 116 | impl AsUsize for u64 { 117 | #[inline(always)] 118 | fn as_usize(self) -> usize { 119 | self as usize 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /crates/thermite/src/rng/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | pub mod pcg32; 4 | pub mod xoshiro; 5 | 6 | pub trait SimdRng { 7 | fn reseed(&mut self, seed: Vu64); 8 | 9 | #[inline(always)] 10 | fn next_u32(&mut self) -> Vu32 { 11 | // use higher bits in cases where there is low linear complexity in low bits 12 | (self.next_u64() >> 32).cast() 13 | } 14 | 15 | #[inline(always)] 16 | fn next_u64(&mut self) -> Vu64 { 17 | let low: Vu64 = self.next_u32().cast(); 18 | let high: Vu64 = self.next_u32().cast(); 19 | 20 | low | (high << 32) 21 | } 22 | 23 | #[inline(always)] 24 | fn next_f32(&mut self) -> Vf32 { 25 | // NOTE: This has the added benefit of shifting out the lower bits, 26 | // as some RGNs have a low linear complexity in the lower bits 27 | Vf32::::from_bits((self.next_u32() >> 9) | Vu32::::splat(0x3f800000)) - Vf32::::one() 28 | } 29 | 30 | #[inline(always)] 31 | fn next_f64(&mut self) -> Vf64 { 32 | // NOTE: This has the added benefit of shifting out the lower bits, 33 | // as some RGNs have a low linear complexity in the lower bits 34 | Vf64::::from_bits((self.next_u64() >> 20) | Vu64::::splat(0x3ff0000000000000)) - Vf64::::one() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /crates/thermite/src/rng/pcg32.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use crate::*; 4 | 5 | use super::SimdRng; 6 | 7 | const PCG32_DEFAULT_STATE: u64 = 0x853c49e6748fea9b; 8 | const PCG32_DEFAULT_STREAM: u64 = 0xda3e39cb94b95bdb; 9 | const PCG32_MULT: u64 = 0x5851f42d4c957f2d; 10 | 11 | #[derive(Debug, Clone, PartialEq)] 12 | pub struct PCG32 { 13 | state: Vu64, 14 | inc: Vu64, 15 | } 16 | 17 | #[dispatch(S, thermite = "crate")] 18 | impl PCG32 { 19 | #[inline(always)] 20 | pub fn new(seed: Vu64) -> Self { 21 | let mut rng = PCG32 { 22 | state: unsafe { Vu64::::undefined() }, 23 | inc: unsafe { Vu64::::undefined() }, 24 | }; 25 | rng.reseed(seed); 26 | rng 27 | } 28 | } 29 | 30 | #[dispatch(S, thermite = "crate")] 31 | impl SimdRng for PCG32 { 32 | #[inline] 33 | fn reseed(&mut self, seed: Vu64) { 34 | self.state = Vu64::::zero(); 35 | self.inc = (seed << 1) | Vu64::::one(); 36 | 37 | let _ = self.next_u32(); 38 | self.state += Vu64::::splat(PCG32_DEFAULT_STATE); 39 | let _ = self.next_u32(); 40 | } 41 | 42 | #[inline] 43 | fn next_u32(&mut self) -> Vu32 { 44 | let old_state = self.state; 45 | self.state = old_state * Vu64::::splat(PCG32_MULT) + self.inc; 46 | let xorshifted = as SimdFromCast>>::from_cast(((old_state >> 18) ^ old_state) >> 27); 47 | let rot_offset = as SimdFromCast>>::from_cast(old_state >> 59); 48 | 49 | xorshifted.rorv(rot_offset) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /crates/thermite/src/rng/xoshiro.rs: 
-------------------------------------------------------------------------------- 1 | // TODO 2 | 3 | use crate::*; 4 | 5 | use super::SimdRng; 6 | 7 | #[derive(Debug, Clone, PartialEq)] 8 | pub struct SplitMix64 { 9 | x: Vu64, 10 | } 11 | 12 | const PHI: u64 = 0x9e3779b97f4a7c15; 13 | 14 | impl SplitMix64 { 15 | #[inline(always)] 16 | pub fn new(seed: Vu64) -> Self { 17 | SplitMix64 { x: seed } 18 | } 19 | } 20 | 21 | #[dispatch(S, thermite = "crate")] 22 | impl SimdRng for SplitMix64 { 23 | #[inline(always)] 24 | fn reseed(&mut self, seed: Vu64) { 25 | self.x = seed; 26 | } 27 | 28 | #[inline(always)] 29 | fn next_u32(&mut self) -> Vu32 { 30 | self.x = self.x + Vu64::::splat(PHI); 31 | let mut z = self.x; 32 | 33 | z = (z ^ (z >> 33)) * Vu64::::splat(0x62A9D9ED799705F5); 34 | z = (z ^ (z >> 28)) * Vu64::::splat(0xCB24D0A5C88C35B3); 35 | 36 | (z >> 32).cast() 37 | } 38 | 39 | #[inline(always)] 40 | fn next_u64(&mut self) -> Vu64 { 41 | self.x = self.x + Vu64::::splat(PHI); 42 | let mut z = self.x; 43 | 44 | z = (z ^ (z >> 30)) * Vu64::::splat(0xbf58476d1ce4e5b9); 45 | z = (z ^ (z >> 27)) * Vu64::::splat(0x94d049bb133111eb); 46 | z ^ (z >> 31) 47 | } 48 | } 49 | 50 | #[derive(Debug, Clone, PartialEq)] 51 | pub struct Xoshiro128Plus { 52 | s0: Vu64, 53 | s1: Vu64, 54 | } 55 | 56 | #[dispatch(S, thermite = "crate")] 57 | impl Xoshiro128Plus { 58 | #[inline(always)] 59 | pub fn new(seed: Vu64) -> Self { 60 | let mut rng = SplitMix64::::new(seed); 61 | Xoshiro128Plus { 62 | s0: rng.next_u64(), 63 | s1: rng.next_u64(), 64 | } 65 | } 66 | } 67 | 68 | #[dispatch(S, thermite = "crate")] 69 | impl SimdRng for Xoshiro128Plus { 70 | #[inline(always)] 71 | fn reseed(&mut self, seed: Vu64) { 72 | *self = Self::new(seed); 73 | } 74 | 75 | #[inline(always)] 76 | fn next_u64(&mut self) -> Vu64 { 77 | let result = self.s0 + self.s1; 78 | 79 | self.s1 ^= self.s0; 80 | self.s0 = self.s0.rol(24) ^ self.s1 ^ (self.s1 << 16); 81 | self.s1 = self.s1.rol(37); 82 | 83 | result 84 | } 85 | } 86 | 87 | #[derive(Debug, Clone, PartialEq)] 88 | pub struct Xoshiro256Plus { 89 | state: [Vu64; 4], 90 | } 91 | 92 | #[dispatch(S, thermite = "crate")] 93 | impl Xoshiro256Plus { 94 | #[inline(always)] 95 | pub fn new(seed: Vu64) -> Self { 96 | let mut rng = SplitMix64::::new(seed); 97 | Xoshiro256Plus { 98 | state: [rng.next_u64(), rng.next_u64(), rng.next_u64(), rng.next_u64()], 99 | } 100 | } 101 | } 102 | 103 | #[dispatch(S, thermite = "crate")] 104 | impl SimdRng for Xoshiro256Plus { 105 | #[inline(always)] 106 | fn reseed(&mut self, seed: Vu64) { 107 | *self = Self::new(seed); 108 | } 109 | 110 | #[inline(always)] 111 | fn next_u64(&mut self) -> Vu64 { 112 | let result = self.state[0] + self.state[3]; 113 | 114 | let t = self.state[1] << 17; 115 | 116 | self.state[2] ^= self.state[0]; 117 | self.state[3] ^= self.state[1]; 118 | self.state[1] ^= self.state[2]; 119 | self.state[0] ^= self.state[3]; 120 | 121 | self.state[2] ^= t; 122 | 123 | self.state[3] = self.state[3].rol(45); 124 | 125 | result 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /crates/thermite/src/runtime.rs: -------------------------------------------------------------------------------- 1 | /** 2 | Detects processor architecture at runtime and generates a type definition for the current SIMD instruction-set to be passed into the given code-block. 3 | 4 | The code block given is duplicated, manually monomorphised, to give the type definition to it. 
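Each match arm of the generated `match` defines the detected backend (currently only AVX2 on x86/x86_64) as a local `type` alias with the given name before running the block.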
5 | 6 | ```ignore 7 | fn my_algorithm(x: &mut [f32]) { 8 | assert!(x.len() >= Vf32::::NUM_ELEMENTS); 9 | 10 | Vf32::::load_unaligned(x).sin().store_unaligned(x); 11 | } 12 | 13 | let mut values = vec![0.5; 8]; 14 | 15 | dispatch_dyn!({ my_algorithm::(&mut values) }); 16 | 17 | // or with a custom generic parameter name: 18 | 19 | dispatch_dyn!(ISA, { my_algorithm::(&mut values) }); 20 | ``` 21 | */ 22 | #[macro_export] 23 | macro_rules! dispatch_dyn { 24 | ($code:block) => { 25 | dispatch_dyn!(S, $code) 26 | }; 27 | ($s:ident, $code:block) => {{ 28 | use $crate::{backends, Simd, SimdInstructionSet}; 29 | 30 | match SimdInstructionSet::runtime_detect() { 31 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 32 | SimdInstructionSet::AVX2 => { 33 | type $s = backends::avx2::AVX2; 34 | $code 35 | } 36 | _ => unsafe { $crate::unreachable_unchecked() }, 37 | } 38 | }}; 39 | } 40 | -------------------------------------------------------------------------------- /crates/thermite/tests/counts.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use thermite::*; 4 | 5 | type Vi32 = ::Vi32; 6 | type Vu32 = ::Vu32; 7 | type Vu64 = ::Vu64; 8 | type Vf64 = ::Vf64; 9 | type Vf32 = ::Vf32; 10 | type Vi64 = ::Vi64; 11 | 12 | #[test] 13 | fn test_popcnt_32bit() { 14 | for i in -1000..1000 { 15 | let x = Vi32::splat(i) * (Vi32::indexed() + Vi32::one()); 16 | 17 | let bits = x.count_ones(); 18 | 19 | for j in 0..Vi32::NUM_ELEMENTS { 20 | let x = x.extract(j); 21 | let b = bits.extract(j) as u32; 22 | 23 | assert_eq!(x.count_ones(), b, "0b{:b} {} == {}", x, x.count_ones(), b); 24 | } 25 | } 26 | } 27 | 28 | #[test] 29 | fn test_popcnt_64bit() { 30 | for i in -1000..1000 { 31 | let x = Vi64::splat(i) * (Vi64::indexed() + Vi64::one()); 32 | 33 | let bits = x.count_ones(); 34 | 35 | for j in 0..Vi64::NUM_ELEMENTS { 36 | let x = x.extract(j); 37 | let b = bits.extract(j) as u32; 38 | 39 | assert_eq!(x.count_ones(), b, "0b{:b} {} == {}", x, x.count_ones(), b); 40 | } 41 | } 42 | } 43 | 44 | #[test] 45 | fn test_tzc_64bit() { 46 | for i in -1000..1000 { 47 | let x = Vi64::splat(i) * (Vi64::indexed() + Vi64::one()); 48 | 49 | let bits = x.trailing_zeros(); 50 | 51 | for j in 0..Vi64::NUM_ELEMENTS { 52 | let x = x.extract(j); 53 | let b = bits.extract(j) as u32; 54 | 55 | assert_eq!(x.trailing_zeros(), b, "0b{:b} {} == {}", x, x.trailing_zeros(), b); 56 | } 57 | } 58 | } 59 | 60 | #[test] 61 | fn test_tzc_32bit() { 62 | for i in -1000..1000 { 63 | let x = Vi32::splat(i) * (Vi32::indexed() + Vi32::one()); 64 | 65 | let bits = x.trailing_zeros(); 66 | 67 | for j in 0..Vi32::NUM_ELEMENTS { 68 | let x = x.extract(j); 69 | let b = bits.extract(j) as u32; 70 | 71 | assert_eq!(x.trailing_zeros(), b, "0b{:b} {} == {}", x, x.trailing_zeros(), b); 72 | } 73 | } 74 | } 75 | 76 | #[test] 77 | fn test_lzc_64bit() { 78 | for i in -1000..1000 { 79 | let x = Vi64::splat(i) * (Vi64::indexed() + Vi64::one()); 80 | 81 | let bits = x.leading_zeros(); 82 | 83 | for j in 0..Vi64::NUM_ELEMENTS { 84 | let x = x.extract(j); 85 | let b = bits.extract(j) as u32; 86 | 87 | assert_eq!(x.leading_zeros(), b, "0b{:b} {} == {}", x, x.leading_zeros(), b); 88 | } 89 | } 90 | } 91 | 92 | #[test] 93 | fn test_lzc_32bit() { 94 | for i in -1000..1000 { 95 | let x = Vi32::splat(i) * (Vi32::indexed() + Vi32::one()); 96 | 97 | let bits = x.leading_zeros(); 98 | 99 | for j in 0..Vi32::NUM_ELEMENTS { 100 | let x = x.extract(j); 101 | let b = bits.extract(j) as u32; 102 | 103 | 
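// each SIMD lane must agree with the scalar `leading_zeros` of the same element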
assert_eq!(x.leading_zeros(), b, "0b{:b} {} == {}", x, x.leading_zeros(), b); 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /crates/thermite/tests/reverse.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use thermite::*; 4 | 5 | type Vi32 = ::Vi32; 6 | type Vu32 = ::Vu32; 7 | type Vu64 = ::Vu64; 8 | type Vf64 = ::Vf64; 9 | type Vf32 = ::Vf32; 10 | type Vi64 = ::Vi64; 11 | 12 | #[test] 13 | fn test_bitreversal_32bit() { 14 | for i in -1000..1000 { 15 | let x = Vi32::splat(i) * (Vi32::indexed() + Vi32::one()); 16 | 17 | let y = x.reverse_bits(); 18 | 19 | for j in 0..Vi32::NUM_ELEMENTS { 20 | let x = x.extract(j).reverse_bits(); 21 | let y = y.extract(j); 22 | 23 | assert_eq!(x, y); 24 | } 25 | } 26 | } 27 | 28 | #[test] 29 | fn test_bitreversal_64bit() { 30 | for i in -1000..1000 { 31 | let x = Vi64::splat(i) * (Vi64::indexed() + Vi64::one()); 32 | 33 | let y = x.reverse_bits(); 34 | 35 | for j in 0..Vi64::NUM_ELEMENTS { 36 | let x = x.extract(j).reverse_bits(); 37 | let y = y.extract(j); 38 | 39 | assert_eq!(x, y); 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /crates/thermite/tests/sinh.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | use thermite::*; 4 | 5 | type Vi32 = ::Vi32; 6 | type Vu32 = ::Vu32; 7 | type Vf32 = ::Vf32; 8 | type Vf64 = ::Vf64; 9 | 10 | #[test] 11 | fn test_powi() { 12 | let x = Vf32::splat(5.5); 13 | 14 | let y0 = x.reciprocal_p::(); 15 | let y1 = x.reciprocal_p::(); 16 | let y2 = x.reciprocal_p::(); 17 | let y3 = x.reciprocal_p::(); 18 | 19 | println!( 20 | "{} == {} == {} == {} == {}", 21 | 1.0 / x.extract(0), 22 | y0.extract(0), 23 | y1.extract(0), 24 | y2.extract(0), 25 | y3.extract(0), 26 | ); 27 | } 28 | -------------------------------------------------------------------------------- /crates/thermite2/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "thermite2" 3 | version = "0.1.1-alpha.0" 4 | license = "MIT OR Apache-2.0" 5 | readme = "README.md" 6 | authors = ["novacrazy "] 7 | repository = "https://github.com/raygon-renderer/thermite" 8 | documentation = "https://raygon-renderer.github.io/thermite/" 9 | edition = "2018" 10 | 11 | [features] 12 | default = ["alloc", "math", "rng", "emulate_fma", "static_init"] 13 | # neon = ["thermite-dispatch/neon"] 14 | # wasm32 = ["thermite-dispatch/wasm32"] 15 | alloc = [] 16 | nightly = [] 17 | math = [] 18 | rng = [] 19 | emulate_fma = [] 20 | 21 | [dependencies] 22 | # thermite-dispatch = { path = "../dispatch" } 23 | paste = "1" 24 | half = "1.6.0" 25 | 26 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies.static_init] 27 | version = "1" 28 | optional = true 29 | default_features = false 30 | 31 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies] 32 | core_detect = "1.0.0" 33 | 34 | # [dev-dependencies] 35 | # criterion = "0.3" 36 | # libm = "0.2.1" 37 | # plotly = "0.6.0" 38 | # rand = "0.8" 39 | # rand_xoshiro = "0.6.0" 40 | # no-panic = "0.1" 41 | # thermite-special = { path = "../thermite-special" } 42 | # thermite-complex = { path = "../thermite-complex" } 43 | # num-complex = "0.4" 44 | 45 | # [[bench]] 46 | # name = "main" 47 | # harness = false 48 | 49 | 
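# NOTE (assumption): `core_detect` presumably backs `SimdInstructionSet::runtime_detect()`
# with no_std-compatible CPU feature detection on x86/x86_64.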
-------------------------------------------------------------------------------- /crates/thermite2/src/backends/avx2/mod.rs: -------------------------------------------------------------------------------- 1 | use crate::{ 2 | arch::avx2::*, 3 | backends::{register::*, vector::Vector}, 4 | widen::Widen, 5 | Simd, SimdInstructionSet, 6 | }; 7 | 8 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] 9 | pub struct AVX2; 10 | 11 | pub mod polyfills; 12 | 13 | pub mod vf32; 14 | 15 | impl Simd for AVX2 { 16 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::AVX2; 17 | 18 | type Vf32 = Self::Vf32x8; 19 | 20 | type Vf32x1 = (); // TODO: wrapped scalar float 21 | type Vf32x2 = (); // TODO: half a 128-bit register 22 | type Vf32x4 = Vector>; 23 | type Vf32x8 = Vector>; 24 | type Vf32x16 = Widen; //2x wider 25 | } 26 | 27 | pub struct AVX2F32Register([(); N]); 28 | pub struct AVX2F64Register([(); N]); 29 | pub struct AVX2U32Register([(); N]); 30 | pub struct AVX2U64Register([(); N]); 31 | pub struct AVX2I32Register([(); N]); 32 | pub struct AVX2I64Register([(); N]); 33 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/avx2/polyfills.rs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raygon-renderer/thermite/55cbee2b3cb127e09749e711501114bbef3ed118/crates/thermite2/src/backends/avx2/polyfills.rs -------------------------------------------------------------------------------- /crates/thermite2/src/backends/avx2/vf32.rs: -------------------------------------------------------------------------------- 1 | use crate::backends::polyfills::float_rem; 2 | 3 | use super::*; 4 | 5 | #[rustfmt::skip] 6 | unsafe impl Register for AVX2F32Register<4> { 7 | type Element = f32; 8 | type Storage = __m128; 9 | 10 | #[inline(always)] unsafe fn set1(x: f32) -> __m128 { _mm_set1_ps(x) } 11 | } 12 | 13 | #[rustfmt::skip] 14 | unsafe impl Register for AVX2F32Register<8> { 15 | type Element = f32; 16 | type Storage = __m256; 17 | 18 | #[inline(always)] unsafe fn set1(x: f32) -> __m256 { _mm256_set1_ps(x) } 19 | } 20 | 21 | unsafe impl FixedRegister<4> for AVX2F32Register<4> { 22 | #[inline(always)] 23 | unsafe fn setr(values: [f32; 4]) -> __m128 { 24 | core::mem::transmute(values) 25 | } 26 | } 27 | 28 | unsafe impl FixedRegister<8> for AVX2F32Register<8> { 29 | #[inline(always)] 30 | unsafe fn setr(values: [f32; 8]) -> __m256 { 31 | core::mem::transmute(values) 32 | } 33 | } 34 | 35 | unsafe impl UnaryRegisterOps for AVX2F32Register 36 | where 37 | Self: BinaryRegisterOps, 38 | { 39 | #[inline(always)] 40 | unsafe fn bit_not(r: Self::Storage) -> Self::Storage { 41 | Self::bitxor(r, Self::set1(f32::from_bits(!0))) 42 | } 43 | } 44 | 45 | #[rustfmt::skip] 46 | unsafe impl BinaryRegisterOps for AVX2F32Register<4> { 47 | #[inline(always)] unsafe fn bitand(lhs: __m128, rhs: __m128) -> __m128 { _mm_and_ps(lhs, rhs) } 48 | #[inline(always)] unsafe fn bitor(lhs: __m128, rhs: __m128) -> __m128 { _mm_or_ps(lhs, rhs) } 49 | #[inline(always)] unsafe fn bitxor(lhs: __m128, rhs: __m128) -> __m128 { _mm_xor_ps(lhs, rhs) } 50 | #[inline(always)] unsafe fn and_not(lhs: __m128, rhs: __m128) -> __m128 { _mm_andnot_ps(lhs, rhs) } 51 | #[inline(always)] unsafe fn add(lhs: __m128, rhs: __m128) -> __m128 { _mm_add_ps(lhs, rhs) } 52 | #[inline(always)] unsafe fn sub(lhs: __m128, rhs: __m128) -> __m128 { _mm_sub_ps(lhs, rhs) } 53 | #[inline(always)] unsafe fn mul(lhs: __m128, rhs: __m128) -> __m128 { 
_mm_mul_ps(lhs, rhs) } 54 | #[inline(always)] unsafe fn div(lhs: __m128, rhs: __m128) -> __m128 { _mm_div_ps(lhs, rhs) } 55 | #[inline(always)] unsafe fn rem(lhs: __m128, rhs: __m128) -> __m128 { float_rem::(lhs, rhs) } 56 | } 57 | 58 | #[rustfmt::skip] 59 | unsafe impl BinaryRegisterOps for AVX2F32Register<8> { 60 | #[inline(always)] unsafe fn bitand(lhs: __m256, rhs: __m256) -> __m256 { _mm256_and_ps(lhs, rhs) } 61 | #[inline(always)] unsafe fn bitor(lhs: __m256, rhs: __m256) -> __m256 { _mm256_or_ps(lhs, rhs) } 62 | #[inline(always)] unsafe fn bitxor(lhs: __m256, rhs: __m256) -> __m256 { _mm256_xor_ps(lhs, rhs) } 63 | #[inline(always)] unsafe fn and_not(lhs: __m256, rhs: __m256) -> __m256 { _mm256_andnot_ps(lhs, rhs) } 64 | #[inline(always)] unsafe fn add(lhs: __m256, rhs: __m256) -> __m256 { _mm256_add_ps(lhs, rhs) } 65 | #[inline(always)] unsafe fn sub(lhs: __m256, rhs: __m256) -> __m256 { _mm256_sub_ps(lhs, rhs) } 66 | #[inline(always)] unsafe fn mul(lhs: __m256, rhs: __m256) -> __m256 { _mm256_mul_ps(lhs, rhs) } 67 | #[inline(always)] unsafe fn div(lhs: __m256, rhs: __m256) -> __m256 { _mm256_div_ps(lhs, rhs) } 68 | #[inline(always)] unsafe fn rem(lhs: __m256, rhs: __m256) -> __m256 { float_rem::(lhs, rhs) } 69 | } 70 | 71 | unsafe impl SignedRegisterOps for AVX2F32Register 72 | where 73 | Self: BinaryRegisterOps, 74 | { 75 | #[inline(always)] 76 | unsafe fn neg(x: Self::Storage) -> Self::Storage { 77 | Self::bitxor(x, Self::set1(-0.0)) 78 | } 79 | 80 | #[inline(always)] 81 | unsafe fn abs(x: Self::Storage) -> Self::Storage { 82 | Self::bitand(x, Self::set1(f32::from_bits(0x7fffffff))) 83 | } 84 | } 85 | 86 | #[rustfmt::skip] 87 | unsafe impl FloatRegisterOps for AVX2F32Register<4> { 88 | #[inline(always)] unsafe fn ceil(x: __m128) -> __m128 { _mm_ceil_ps(x) } 89 | #[inline(always)] unsafe fn floor(x: __m128) -> __m128 { _mm_floor_ps(x) } 90 | #[inline(always)] unsafe fn round(x: __m128) -> __m128 { _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) } 91 | #[inline(always)] unsafe fn trunc(x: __m128) -> __m128 { _mm_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) } 92 | 93 | #[inline(always)] unsafe fn sqrt(x: __m128) -> __m128 { _mm_sqrt_ps(x) } 94 | #[inline(always)] unsafe fn rsqrt(x: __m128) -> __m128 { _mm_rsqrt_ps(x) } 95 | #[inline(always)] unsafe fn rcp(x: __m128) -> __m128 { _mm_rcp_ps(x) } 96 | 97 | #[inline(always)] unsafe fn mul_add(x: __m128, m: __m128, a: __m128) -> __m128 { _mm_fmadd_ps(x, m, a) } 98 | #[inline(always)] unsafe fn mul_sub(x: __m128, m: __m128, a: __m128) -> __m128 { _mm_fmsub_ps(x, m, a) } 99 | #[inline(always)] unsafe fn nmul_add(x: __m128, m: __m128, a: __m128) -> __m128 { _mm_fnmadd_ps(x, m, a) } 100 | #[inline(always)] unsafe fn nmul_sub(x: __m128, m: __m128, a: __m128) -> __m128 { _mm_fnmsub_ps(x, m, a) } 101 | } 102 | 103 | #[rustfmt::skip] 104 | unsafe impl FloatRegisterOps for AVX2F32Register<8> { 105 | #[inline(always)] unsafe fn ceil(x: __m256) -> __m256 { _mm256_ceil_ps(x) } 106 | #[inline(always)] unsafe fn floor(x: __m256) -> __m256 { _mm256_floor_ps(x) } 107 | #[inline(always)] unsafe fn round(x: __m256) -> __m256 { _mm256_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) } 108 | #[inline(always)] unsafe fn trunc(x: __m256) -> __m256 { _mm256_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) } 109 | 110 | #[inline(always)] unsafe fn sqrt(x: __m256) -> __m256 { _mm256_sqrt_ps(x) } 111 | #[inline(always)] unsafe fn rsqrt(x: __m256) -> __m256 { _mm256_rsqrt_ps(x) } 112 | #[inline(always)] unsafe fn rcp(x: __m256) 
-> __m256 { _mm256_rcp_ps(x) } 113 | 114 | #[inline(always)] unsafe fn mul_add(x: __m256, m: __m256, a: __m256) -> __m256 { _mm256_fmadd_ps(x, m, a) } 115 | #[inline(always)] unsafe fn mul_sub(x: __m256, m: __m256, a: __m256) -> __m256 { _mm256_fmsub_ps(x, m, a) } 116 | #[inline(always)] unsafe fn nmul_add(x: __m256, m: __m256, a: __m256) -> __m256 { _mm256_fnmadd_ps(x, m, a) } 117 | #[inline(always)] unsafe fn nmul_sub(x: __m256, m: __m256, a: __m256) -> __m256 { _mm256_fnmsub_ps(x, m, a) } 118 | } 119 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod register; 2 | pub mod vector; 3 | 4 | pub mod polyfills; 5 | 6 | pub mod avx2; 7 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/polyfills.rs: -------------------------------------------------------------------------------- 1 | use super::register::{BinaryRegisterOps, FloatRegisterOps}; 2 | 3 | #[inline(always)] 4 | pub const fn _mm_shuffle(w: i32, z: i32, y: i32, x: i32) -> i32 { 5 | (w << 6) | (z << 4) | (y << 2) | x 6 | } 7 | 8 | // https://stackoverflow.com/a/26342944/2083075 + Bernard's comment 9 | #[inline(always)] 10 | pub unsafe fn float_rem(lhs: R::Storage, rhs: R::Storage) -> R::Storage 11 | where 12 | R: FloatRegisterOps + BinaryRegisterOps, 13 | { 14 | R::nmul_add(R::trunc(R::div(lhs, rhs)), rhs, lhs) 15 | } 16 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/register.rs: -------------------------------------------------------------------------------- 1 | pub unsafe trait Register { 2 | type Element: Clone + Copy; 3 | type Storage: Clone + Copy; 4 | 5 | unsafe fn set1(x: Self::Element) -> Self::Storage; 6 | } 7 | 8 | pub unsafe trait SimpleRegister: Register { 9 | unsafe fn load(ptr: *const Self::Element) -> Self::Storage; 10 | } 11 | 12 | pub unsafe trait FixedRegister: Register { 13 | unsafe fn setr(values: [Self::Element; N]) -> Self::Storage; 14 | } 15 | 16 | pub unsafe trait UnaryRegisterOps: Register { 17 | unsafe fn bit_not(r: Self::Storage) -> Self::Storage; 18 | } 19 | 20 | pub unsafe trait BinaryRegisterOps: UnaryRegisterOps { 21 | unsafe fn bitand(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 22 | unsafe fn bitor(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 23 | unsafe fn bitxor(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 24 | 25 | #[inline(always)] 26 | unsafe fn and_not(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage { 27 | Self::bitand(Self::bit_not(lhs), rhs) 28 | } 29 | 30 | unsafe fn add(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 31 | unsafe fn sub(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 32 | unsafe fn mul(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 33 | unsafe fn div(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 34 | unsafe fn rem(lhs: Self::Storage, rhs: Self::Storage) -> Self::Storage; 35 | } 36 | 37 | pub unsafe trait FloatRegisterOps: SignedRegisterOps + BinaryRegisterOps { 38 | unsafe fn round(x: Self::Storage) -> Self::Storage; 39 | unsafe fn ceil(x: Self::Storage) -> Self::Storage; 40 | unsafe fn floor(x: Self::Storage) -> Self::Storage; 41 | unsafe fn trunc(x: Self::Storage) -> Self::Storage; 42 | 43 | #[inline(always)] 44 | unsafe fn fract(x: Self::Storage) -> Self::Storage { 45 | Self::sub(x, 
Self::trunc(x)) 46 | } 47 | 48 | unsafe fn sqrt(x: Self::Storage) -> Self::Storage; 49 | unsafe fn rsqrt(x: Self::Storage) -> Self::Storage; 50 | unsafe fn rcp(x: Self::Storage) -> Self::Storage; 51 | 52 | unsafe fn mul_add(x: Self::Storage, m: Self::Storage, a: Self::Storage) -> Self::Storage; 53 | unsafe fn mul_sub(x: Self::Storage, m: Self::Storage, a: Self::Storage) -> Self::Storage; 54 | unsafe fn nmul_add(x: Self::Storage, m: Self::Storage, a: Self::Storage) -> Self::Storage; 55 | unsafe fn nmul_sub(x: Self::Storage, m: Self::Storage, a: Self::Storage) -> Self::Storage; 56 | } 57 | 58 | pub unsafe trait SignedRegisterOps: Register { 59 | unsafe fn neg(x: Self::Storage) -> Self::Storage; 60 | unsafe fn abs(x: Self::Storage) -> Self::Storage; 61 | } 62 | 63 | pub unsafe trait MaskRegisterOps: BinaryRegisterOps { 64 | #[inline(always)] 65 | unsafe fn blendv(mask: Self::Storage, t: Self::Storage, f: Self::Storage) -> Self::Storage { 66 | Self::bitor(Self::bitand(mask, t), Self::and_not(mask, f)) 67 | } 68 | 69 | unsafe fn all(mask: Self::Storage) -> bool; 70 | unsafe fn any(mask: Self::Storage) -> bool; 71 | 72 | #[inline(always)] 73 | unsafe fn none(mask: Self::Storage) -> bool { 74 | !Self::any(mask) 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /crates/thermite2/src/backends/vector.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | use super::register::*; 4 | 5 | #[repr(transparent)] 6 | pub struct Vector(R::Storage); 7 | 8 | impl Clone for Vector { 9 | fn clone(&self) -> Self { 10 | *self 11 | } 12 | } 13 | 14 | impl Copy for Vector {} 15 | 16 | pub trait NumericElement { 17 | const ZERO: Self; 18 | const ONE: Self; 19 | const MIN_VALUE: Self; 20 | const MAX_VALUE: Self; 21 | } 22 | 23 | pub trait SignedElement: NumericElement { 24 | const NEG_ONE: Self; 25 | } 26 | 27 | pub trait FloatElement: SignedElement { 28 | const NEG_ZERO: Self; 29 | } 30 | 31 | macro_rules! 
impl_element { 32 | (NUMERIC $($i:ty),*) => {$( 33 | impl NumericElement for $i { 34 | const ZERO: Self = 0 as $i; 35 | const ONE: Self = 1 as $i; 36 | const MIN_VALUE: Self = <$i>::MIN; 37 | const MAX_VALUE: Self = <$i>::MAX; 38 | } 39 | )*}; 40 | 41 | (SIGNED $($i:ty),*) => {$( 42 | impl SignedElement for $i { 43 | const NEG_ONE: Self = -1 as $i; 44 | } 45 | )*}; 46 | 47 | (FLOAT $($i:ty),*) => {$( 48 | impl FloatElement for $i { 49 | const NEG_ZERO: Self = -0.0; 50 | } 51 | )*} 52 | } 53 | 54 | impl_element!(NUMERIC i8, i16, i32, i64, u8, u16, u32, u64, f32, f64); 55 | impl_element!(SIGNED i8, i16, i32, i64, f32, f64); 56 | impl_element!(FLOAT f32, f64); 57 | 58 | impl SimdVectorBase for Vector { 59 | type Element = ::Element; 60 | 61 | #[inline(always)] 62 | fn splat(value: Self::Element) -> Self { 63 | Vector(unsafe { R::set1(value) }) 64 | } 65 | } 66 | 67 | impl SimdFixedVector for Vector 68 | where 69 | R: FixedRegister, 70 | { 71 | #[inline(always)] 72 | fn set(values: [Self::Element; N]) -> Self { 73 | Vector(unsafe { R::setr(values) }) 74 | } 75 | } 76 | 77 | #[rustfmt::skip] 78 | impl SimdVector for Vector 79 | where 80 | R: BinaryRegisterOps, 81 | Self: SimdVectorBase, 82 | ::Element: NumericElement, 83 | { 84 | #[inline(always)] fn zero() -> Self { Self::splat(NumericElement::ZERO) } 85 | #[inline(always)] fn one() -> Self { Self::splat(NumericElement::ONE) } 86 | #[inline(always)] fn min_value() -> Self { Self::splat(NumericElement::MAX_VALUE) } 87 | #[inline(always)] fn max_value() -> Self { Self::splat(NumericElement::MIN_VALUE) } 88 | } 89 | 90 | #[rustfmt::skip] 91 | impl SimdSignedVector for Vector 92 | where 93 | R: SignedRegisterOps, 94 | Self: SimdVector, 95 | { 96 | #[inline(always)] fn abs(self) -> Self { Vector(unsafe { R::abs(self.0) }) } 97 | } 98 | 99 | #[rustfmt::skip] 100 | impl SimdFloatVector for Vector 101 | where 102 | R: FloatRegisterOps, 103 | Self: SimdVector, 104 | ::Element: FloatElement, 105 | { 106 | #[inline(always)] fn neg_one() -> Self { Self::splat(SignedElement::NEG_ONE) } 107 | #[inline(always)] fn neg_zero() -> Self { Self::splat(FloatElement::NEG_ZERO) } 108 | } 109 | 110 | macro_rules! 
impl_binary_op { 111 | (VECTOR $($op_trait:ident::$op:ident),*) => {$( 112 | impl $op_trait for Vector where R: BinaryRegisterOps { 113 | type Output = Self; 114 | #[inline(always)] fn $op(self, rhs: Self) -> Self { 115 | Vector(unsafe { R::$op(self.0, rhs.0) }) 116 | } 117 | } 118 | 119 | impl_binary_op!(ELEMENTS $op_trait::$op [i8, i16, i32, i64, u8, u16, u32, u64, f32, f64]); 120 | )*}; 121 | (ELEMENTS $op_trait:ident::$op:ident [$($t:ty),*]) => {$( 122 | impl $op_trait<$t> for Vector where R: Register + BinaryRegisterOps { 123 | type Output = Self; 124 | #[inline(always)] fn $op(self, rhs: $t) -> Self { 125 | Vector(unsafe { R::$op(self.0, R::set1(rhs)) }) 126 | } 127 | } 128 | 129 | //impl $op_trait> for $t where R: Register + BinaryRegisterOps { 130 | // type Output = Vector; 131 | // #[inline(always)] fn $op(self, rhs: Vector) -> Vector { 132 | // Vector(unsafe { R::$op(R::splat(self), rhs.0) }) 133 | // } 134 | //} 135 | )*} 136 | } 137 | 138 | impl_binary_op!(VECTOR Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor); 139 | -------------------------------------------------------------------------------- /crates/thermite2/src/iset.rs: -------------------------------------------------------------------------------- 1 | 2 | /// Enum of supported instruction sets 3 | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] 4 | #[repr(u8)] 5 | pub enum SimdInstructionSet { 6 | Scalar, 7 | 8 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 9 | SSE2, 10 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 11 | SSE42, 12 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 13 | AVX, 14 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 15 | AVX2, 16 | 17 | #[cfg(all(feature = "neon", any(target_arch = "arm", target_arch = "aarch64")))] 18 | NEON, 19 | 20 | #[cfg(all(feature = "wasm32", target_arch = "wasm32"))] 21 | WASM32, 22 | } 23 | 24 | impl SimdInstructionSet { 25 | #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "static_init"))] 26 | #[inline] 27 | pub fn runtime_detect() -> SimdInstructionSet { 28 | #[static_init::dynamic(0)] 29 | static SIS: SimdInstructionSet = SimdInstructionSet::runtime_detect_x86_internal(); 30 | 31 | unsafe { *SIS } 32 | } 33 | 34 | #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(feature = "static_init")))] 35 | pub fn runtime_detect() -> SimdInstructionSet { 36 | unsafe { 37 | static mut CACHED: Option = None; 38 | 39 | match CACHED { 40 | Some(value) => value, 41 | None => { 42 | // Allow this to race, they all converge to the same result 43 | let isa = Self::runtime_detect_x86_internal(); 44 | CACHED = Some(isa); 45 | isa 46 | } 47 | } 48 | } 49 | } 50 | 51 | #[cfg(all(feature = "neon", any(target_arch = "arm", target_arch = "aarch64")))] 52 | const fn runtime_detect() -> SimdInstructionSet { 53 | SimdInstructionSet::NEON 54 | } 55 | 56 | #[cfg(all(feature = "wasm32", target_arch = "wasm32"))] 57 | const fn runtime_detect() -> SimdInstructionSet { 58 | SimdInstructionSet::WASM32 59 | } 60 | 61 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 62 | fn runtime_detect_x86_internal() -> SimdInstructionSet { 63 | if core_detect::is_x86_feature_detected!("fma") { 64 | // TODO: AVX512 65 | if core_detect::is_x86_feature_detected!("avx2") { 66 | return SimdInstructionSet::AVX2; 67 | } 68 | } 69 | 70 | if core_detect::is_x86_feature_detected!("avx") { 71 | SimdInstructionSet::AVX 72 | } else if 
core_detect::is_x86_feature_detected!("sse4.2") { 73 | SimdInstructionSet::SSE42 74 | } else if core_detect::is_x86_feature_detected!("sse2") { 75 | SimdInstructionSet::SSE2 76 | } else { 77 | SimdInstructionSet::Scalar 78 | } 79 | } 80 | 81 | /// True fused multiply-add instructions are only used on AVX2 and above, so this checks for that ergonomically. 82 | pub const fn has_true_fma(self) -> bool { 83 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 84 | if self as u8 >= SimdInstructionSet::AVX2 as u8 { 85 | return true; 86 | } 87 | 88 | false 89 | } 90 | 91 | /// On older platforms, fused multiply-add instructions can be emulated (expensively), 92 | /// but only if the `"emulate_fma"` Cargo feature is enabled. 93 | pub const fn has_emulated_fma(self) -> bool { 94 | !self.has_true_fma() && cfg!(feature = "emulate_fma") 95 | } 96 | 97 | /// The number of general-purpose registers that can be expected to be allocated to algorithms 98 | pub const fn num_registers(self) -> usize { 99 | #[allow(unreachable_patterns)] 100 | match self { 101 | // #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 102 | // SimdInstructionSet::AVX512 => 32, 103 | 104 | // 105 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 106 | SimdInstructionSet::Scalar => 8, 107 | 108 | // x86 has at least 16 registers for xmms, ymms, zmms 109 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] 110 | _ => 16, 111 | 112 | // 32x64-bit or 32x128-bit registers 113 | #[cfg(all(feature = "neon", any(target_arch = "arm", target_arch = "aarch64")))] 114 | SimdInstructionSet::NEON => 32, 115 | 116 | _ => 1, 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /crates/thermite2/src/lib.rs: -------------------------------------------------------------------------------- 1 | #![no_std] 2 | // stdmind for f16c instructions, core_intrinsics for likely/unlikely 3 | #![cfg_attr(feature = "nightly", feature(stdsimd, core_intrinsics))] 4 | #![allow(unused_imports, non_camel_case_types, non_snake_case)] 5 | 6 | #[macro_use] 7 | mod macros; 8 | 9 | pub mod arch; 10 | 11 | pub mod backends; 12 | pub mod iset; 13 | pub mod widen; 14 | 15 | pub use iset::SimdInstructionSet; 16 | 17 | use core::{fmt::Debug, marker::PhantomData, mem, ops::*, ptr}; 18 | 19 | /// SIMD Instruction set, contains all types 20 | /// 21 | /// Take your time to look through this. All trait bounds contain methods and associated values which 22 | /// encapsulate all functionality for this crate. 23 | pub trait Simd: 'static + Debug + Send + Sync + Clone + Copy + PartialEq + Eq { 24 | const INSTRSET: SimdInstructionSet; 25 | 26 | /// Largest native single-precision floating point vector, occupies one register. 
27 | type Vf32; 28 | 29 | /// 32-bit single-precision floating point vector 30 | type Vf32x1; 31 | /// 64-bit single-precision floating point vector 32 | type Vf32x2; 33 | /// 128-bit single-precision floating point vector 34 | type Vf32x4: SimdFixedVector + SimdFloatVector + SimdOverloads; 35 | /// 256-bit single-precision floating point vector 36 | type Vf32x8; 37 | /// 512-bit single-precision floating point vector 38 | type Vf32x16; 39 | } 40 | 41 | pub trait SimdVectorBase: Clone + Copy { 42 | type Element; 43 | 44 | fn splat(value: Self::Element) -> Self; 45 | } 46 | 47 | pub trait SimdFixedVector: SimdVectorBase { 48 | fn set(values: [Self::Element; N]) -> Self; 49 | } 50 | 51 | pub trait SimdVector: SimdVectorBase + Add { 52 | fn zero() -> Self; 53 | fn one() -> Self; 54 | fn min_value() -> Self; 55 | fn max_value() -> Self; 56 | } 57 | 58 | pub trait SimdOverloads: 59 | SimdVectorBase 60 | + Add 61 | + Sub 62 | + Mul 63 | + Div 64 | + Rem 65 | { 66 | } 67 | 68 | impl SimdOverloads for T where 69 | T: SimdVectorBase 70 | + Add 71 | + Sub 72 | + Mul 73 | + Div 74 | + Rem 75 | { 76 | } 77 | 78 | pub trait SimdSignedVector: SimdVector { 79 | fn abs(self) -> Self; 80 | } 81 | 82 | pub trait SimdFloatVector: SimdSignedVector { 83 | fn neg_one() -> Self; 84 | fn neg_zero() -> Self; 85 | } 86 | -------------------------------------------------------------------------------- /crates/thermite2/src/macros.rs: -------------------------------------------------------------------------------- 1 | #![allow(unused)] 2 | 3 | #[cfg(feature = "nightly")] 4 | pub use core::intrinsics::{likely, unlikely}; 5 | 6 | // borrows technique from https://github.com/rust-lang/hashbrown/pull/209 7 | #[cfg(not(feature = "nightly"))] 8 | #[inline] 9 | #[cold] 10 | fn cold() {} 11 | 12 | #[cfg(not(feature = "nightly"))] 13 | #[rustfmt::skip] 14 | #[inline(always)] 15 | pub unsafe fn likely(b: bool) -> bool { 16 | if !b { cold() } b 17 | } 18 | 19 | #[cfg(not(feature = "nightly"))] 20 | #[rustfmt::skip] 21 | #[inline(always)] 22 | pub unsafe fn unlikely(b: bool) -> bool { 23 | if b { cold() } b 24 | } 25 | 26 | #[doc(hidden)] 27 | #[macro_export] 28 | #[rustfmt::skip] 29 | macro_rules! thermite_likely { 30 | ($e:expr) => {{ 31 | #[allow(unused_unsafe)] 32 | unsafe { $crate::macros::likely($e) } 33 | }}; 34 | } 35 | 36 | #[doc(hidden)] 37 | #[macro_export] 38 | #[rustfmt::skip] 39 | macro_rules! thermite_unlikely { 40 | ($e:expr) => {{ 41 | #[allow(unused_unsafe)] 42 | unsafe { $crate::macros::unlikely($e) } 43 | }}; 44 | } 45 | -------------------------------------------------------------------------------- /crates/thermite2/src/widen.rs: -------------------------------------------------------------------------------- 1 | use crate::*; 2 | 3 | pub struct Widen { 4 | vectors: [V; N], 5 | _simd: PhantomData, 6 | } 7 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 120 --------------------------------------------------------------------------------
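A closing note on `widen.rs`: `Widen` stores `vectors: [V; N]`, and `AVX2` uses it for `Vf32x16` with the comment "2x wider", so a logical vector wider than the hardware register is represented as an array of narrower native vectors, with lane-wise operations presumably forwarded to each element of that array. The toy sketch below is not thermite2 code; the `Widened` type, its `halves` field, and the `f32` stand-ins are invented for illustration, but the forwarding pattern is the one such a wrapper needs.

```rust
use core::ops::Add;

// Illustrative only: a logical wide vector made of `N` narrower halves,
// in the same spirit as thermite2's `Widen { vectors: [V; N], .. }`.
#[derive(Clone, Copy, Debug, PartialEq)]
struct Widened<V, const N: usize> {
    halves: [V; N],
}

impl<V: Copy + Add<Output = V>, const N: usize> Add for Widened<V, N> {
    type Output = Self;

    // A lane-wise op on the wide vector is just the same op applied to each half.
    #[inline(always)]
    fn add(mut self, rhs: Self) -> Self {
        for i in 0..N {
            self.halves[i] = self.halves[i] + rhs.halves[i];
        }
        self
    }
}

fn main() {
    // `f32` stands in for a whole native register here; on AVX2 the two halves
    // of a 16-lane f32 vector would each be an 8-lane 256-bit vector.
    let a = Widened::<f32, 2> { halves: [1.0, 2.0] };
    let b = Widened::<f32, 2> { halves: [10.0, 20.0] };
    assert_eq!((a + b).halves, [11.0, 22.0]);
}
```

The real `Widen` also carries a `PhantomData` marker, presumably to tie the wrapper back to the owning `Simd` instruction set so the doubled-width type can still be named through the `Simd` trait, as `AVX2::Vf32x16` does.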