87 | impl<S: Simd, V: SimdFloatVector<S>, P: Policy, const N: usize> HyperdualP<S, V, P, N>
88 | where
89 | V: SimdVectorizedMath<S>,
90 | {
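// Divides the dual components by a common denominator `denom`. When N > 1 a single
// reciprocal is computed and reused, so the N divisions become N multiplications.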
91 | #[inline(always)]
92 | fn div_dual(self, re: V, denom: V) -> Self {
93 | if N > 1 {
94 | let rcp = denom.reciprocal_p::<P>();
95 | self.map_dual(re, |x| x * rcp)
96 | } else {
97 | self.map_dual(re, |x| x / denom)
98 | }
99 | }
100 |
101 | #[inline(always)]
102 | pub fn fract(mut self) -> Self {
103 | self.re = self.re.fract();
104 | self
105 | }
106 |
107 | #[inline(always)]
108 | pub fn signum(self) -> Self {
109 | Self::real(self.re.signum())
110 | }
111 |
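// d|x| = signum(x) dx, so both the real and dual parts are scaled by signum(re).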
112 | #[inline(always)]
113 | pub fn abs(self) -> Self {
114 | let signum = self.re.signum();
115 | self.map(|x| x * signum)
116 | }
117 |
118 | #[inline(always)]
119 | pub fn select(mask: Mask<S, V>, t: Self, f: Self) -> Self {
120 | let mut t = t; // Weird compiler bug
121 | for i in 0..N {
122 | t.du[i] = mask.select(t.du[i], f.du[i]);
123 | }
124 | t.re = mask.select(t.re, f.re);
125 | t
126 | }
127 |
128 | #[inline(always)]
129 | pub fn min(self, other: Self) -> Self {
130 | Self::select(self.re.lt(other.re), self, other)
131 | }
132 |
133 | #[inline(always)]
134 | pub fn max(mut self, other: Self) -> Self {
135 | Self::select(self.re.gt(other.re), self, other)
136 | }
137 |
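// Fused multiply-add of hyperdual numbers, self * m + a; by the product rule the
// dual parts are du * m.re + re * m.du + a.du, computed with two FMAs per lane.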
138 | #[inline(always)]
139 | pub fn mul_add(mut self, m: Self, a: Self) -> Self {
140 | for i in 0..N {
141 | self.du[i] = self.du[i].mul_add(m.re, self.re.mul_add(m.du[i], a.du[i]));
142 | }
143 | self.re = self.re.mul_add(m.re, a.re);
144 | self
145 | }
146 |
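// Power rule: d(x^n) = n * x^(n-1) * dx. The real part reuses r = x^(n-1) as x * r.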
147 | #[inline(always)]
148 | pub fn powi(self, n: i32) -> Self {
149 | let r = self.re.powi_p::<P>(n - 1);
150 | let nf = V::splat_as(n) * r;
151 | self.map_dual(self.re * r, |x| x * nf)
152 | }
153 |
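// General power: d(x^y) = y * x^(y-1) * dx + x^y * ln(x) * dy, which is exactly the
// `a` and `b` factors applied to the dual parts of self and n below.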
154 | #[inline(always)]
155 | pub fn powf(mut self, n: Self) -> Self {
156 | let re_n1 = self.re.powf_p::<P>(n.re - V::one());
157 |
158 | let re = re_n1 * self.re; // re^n
159 |
160 | let a = n.re * re_n1; // n * re^(n-1)
161 | let b = re * self.re.ln_p::<P>();
162 |
163 | self.re = re;
164 | for i in 0..N {
165 | self.du[i] = a.mul_add(self.du[i], b * n.du[i]);
166 | }
167 | self
168 | }
169 |
170 | #[inline(always)]
171 | pub fn exp(self) -> Self {
172 | let re = self.re.exp_p::<P>();
173 | self.map_dual(re, |x| re * x)
174 | }
175 |
176 | #[inline(always)]
177 | pub fn exp2(self) -> Self {
178 | let re = self.re.exp2_p::<P>();
179 | let re_ln2 = V::LN_2() * re;
180 | self.map_dual(re, |x| x * re_ln2)
181 | }
182 |
183 | #[inline(always)]
184 | pub fn ln(self) -> Self {
185 | self.div_dual(self.re.ln_p::<P>(), self.re)
186 | }
187 |
188 | #[inline(always)]
189 | pub fn sqrt(self) -> Self {
190 | let re = self.re.sqrt();
191 | self.div_dual(re, re + re)
192 | }
193 |
194 | #[inline(always)]
195 | pub fn cbrt(self) -> Self {
196 | let re = self.re.cbrt();
197 | self.div_dual(re, (re + re + re) * re)
198 | }
199 |
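// d hypot(x, y) = (x * dx + y * dy) / hypot(x, y), hence the division by c below.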
200 | fn hypot(self, other: Self) -> Self {
201 | let c = self.re.hypot(other.re);
202 | let mut v = Self::real(c);
203 |
204 | let inv_c = c.reciprocal_p::<P>();
205 | for i in 0..N {
206 | let x = self.du[i];
207 | let y = other.du[i];
208 |
209 | v.du[i] = self.re.mul_add(x, other.re * y);
210 |
211 | if N > 1 {
212 | v.du[i] *= inv_c;
213 | } else {
214 | v.du[i] /= c;
215 | }
216 | }
217 |
218 | v
219 | }
220 |
221 | #[inline(always)]
222 | pub fn sin_cos(self) -> (Self, Self) {
223 | let (s, c) = self.re.sin_cos_p::<P>();
224 |
225 | let mut sine = self;
226 | let mut cosi = self;
227 |
228 | sine.re = s;
229 | cosi.re = c;
230 | for i in 0..N {
231 | sine.du[i] *= c;
232 | cosi.du[i] *= -s;
233 | }
234 |
235 | (sine, cosi)
236 | }
237 |
238 | #[inline(always)]
239 | pub fn tan(self) -> Self {
240 | let t = self.re.tan_p::<P>();
241 | let c = t.mul_add(t, V::one());
242 | self.map_dual(t, |x| x * c)
243 | }
244 |
245 | #[inline(always)]
246 | pub fn asin(self) -> Self {
247 | let c = self.re.nmul_adde(self.re, V::one()).invsqrt_p::<P>();
248 | self.map_dual(self.re.asin(), |x| x * c)
249 | }
250 |
251 | #[inline(always)]
252 | pub fn acos(self) -> Self {
253 | let c = self.re.nmul_adde(self.re, V::one()).invsqrt_p::<P>().neg();
254 | self.map_dual(self.re.acos(), |x| x * c)
255 | }
256 |
257 | #[inline(always)]
258 | pub fn atan(self) -> Self {
259 | let c = self.re.mul_adde(self.re, V::one());
260 | self.div_dual(self.re.atan(), c)
261 | }
262 |
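// d atan2(y, x) = (x * dy - y * dx) / (x^2 + y^2), with c = y^2 + x^2 below.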
263 | pub fn atan2(self, x: Self) -> Self {
264 | let y = self;
265 | let c = y.re.mul_add(y.re, x.re * x.re);
266 |
267 | let mut v = Self::real(y.re.atan2(x.re));
268 |
269 | let inv_c = c.reciprocal_p::<P>();
270 | for i in 0..N {
271 | v.du[i] = x.re.mul_sub(y.du[i], y.re * x.du[i]);
272 |
273 | if N > 1 {
274 | v.du[i] *= inv_c;
275 | } else {
276 | v.du[i] /= c;
277 | }
278 | }
279 |
280 | v
281 | }
282 |
283 | #[inline(always)]
284 | pub fn sinh_cosh(self) -> (Self, Self) {
285 | let s = self.re.sinh_p::<P>();
286 | let c = self.re.cosh_p::<P>();
287 | (self.map_dual(s, |x| x * c), self.map_dual(c, |x| x * s))
288 | }
289 |
290 | #[inline(always)]
291 | pub fn tanh(self) -> Self {
292 | let re = self.re.tanh_p::<P>();
293 | let c = re.nmul_add(re, V::one()); // 1 - r^2
294 | self.map_dual(re, |x| x * c)
295 | }
296 | }
297 |
298 | #[dispatch(S)]
299 | impl<S: Simd, V: SimdFloatVector<S>, P: Policy, const N: usize> Add for HyperdualP<S, V, P, N> {
300 | type Output = Self;
301 |
302 | #[inline(always)]
303 | fn add(mut self, rhs: Self) -> Self {
304 | self.re += rhs.re;
305 | for i in 0..N {
306 | self.du[i] += rhs.du[i];
307 | }
308 | self
309 | }
310 | }
311 |
312 | #[dispatch(S)]
313 | impl<S: Simd, V: SimdFloatVector<S>, P: Policy, const N: usize> Sub for HyperdualP<S, V, P, N> {
314 | type Output = Self;
315 |
316 | #[inline(always)]
317 | fn sub(mut self, rhs: Self) -> Self {
318 | self.re -= rhs.re;
319 | for i in 0..N {
320 | self.du[i] -= rhs.du[i];
321 | }
322 | self
323 | }
324 | }
325 |
326 | #[dispatch(S)]
327 | impl<S: Simd, V: SimdFloatVector<S>, P: Policy, const N: usize> Mul for HyperdualP<S, V, P, N> {
328 | type Output = Self;
329 |
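// Product rule: (a * b)' = a * b' + b * a', done with one FMA per dual lane.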
330 | #[inline(always)]
331 | fn mul(mut self, rhs: Self) -> Self {
332 | for i in 0..N {
333 | self.du[i] = self.re.mul_add(rhs.du[i], rhs.re * self.du[i]);
334 | }
335 | self.re *= rhs.re;
336 | self
337 | }
338 | }
339 |
340 | #[dispatch(S)]
341 | impl<S: Simd, V: SimdFloatVector<S>, P: Policy, const N: usize> Div for HyperdualP<S, V, P, N>
342 | where
343 | V: SimdVectorizedMath<S>,
344 | {
345 | type Output = Self;
346 |
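// Quotient rule: (a / b)' = (b * a' - a * b') / b^2.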
347 | #[inline(always)]
348 | fn div(mut self, rhs: Self) -> Self {
349 | let d = rhs.re * rhs.re;
350 |
351 | let inv_d = d.reciprocal_p::<P>();
352 | for i in 0..N {
353 | self.du[i] = rhs.re.mul_sub(self.du[i], self.re * rhs.du[i]);
354 |
355 | if N > 1 {
356 | self.du[i] *= inv_d;
357 | } else {
358 | self.du[i] /= d;
359 | }
360 | }
361 | self.re /= rhs.re;
362 | self
363 | }
364 | }
365 |
--------------------------------------------------------------------------------
/crates/thermite-special/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "thermite-special"
3 | version = "0.1.0"
4 | authors = ["novacrazy "]
5 | edition = "2018"
6 |
7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
8 |
9 | [dependencies]
10 | thermite = { path = "../thermite" }
11 | thermite-complex = { path = "../thermite-complex" }
--------------------------------------------------------------------------------
/crates/thermite-special/src/ps.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 |
3 | const EULERS_CONSTANT: f32 = 5.772156649015328606065120900824024310e-01;
4 |
5 | impl<S: Simd> SimdVectorizedSpecialFunctionsInternal<S> for f32
6 | where
7 | <S as Simd>::Vf32: SimdFloatVector<S>,
8 | {
9 | #[inline(always)]
10 | fn tgamma<P: Policy>(mut z: Self::Vf) -> Self::Vf {
11 | let zero = Vf32::<S>::zero();
12 | let one = Vf32::<S>::one();
13 | let half = Vf32::<S>::splat(0.5);
14 | let quarter = Vf32::<S>::splat(0.25);
15 | let pi = Vf32::<S>::PI();
16 |
17 | let orig_z = z;
18 |
19 | let is_neg = z.is_negative();
20 | let mut reflected = Mask::falsey();
21 |
22 | let mut res = one;
23 |
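// Negative arguments use Euler's reflection formula Γ(z)Γ(1 - z) = π / sin(πz),
// i.e. Γ(z) = π / (sin(πz) Γ(1 - z)); moderately negative inputs are instead walked
// up to positive territory with the recurrence Γ(z) = Γ(z + 1) / z.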
24 | 'goto_positive: while is_neg.any() {
25 | reflected = z.le(Vf32::<S>::splat(-20.0));
26 |
27 | let mut refl_res = unsafe { Vf32::<S>::undefined() };
28 |
29 | // sine is expensive, so branch for it.
30 | if P::POLICY.avoid_precision_branches() || thermite_unlikely!(reflected.any()) {
31 | refl_res = <Self as SimdVectorizedSpecialFunctionsInternal<S>>::sin_pix::<P>(z);
32 |
33 | // If not branching, all negative values are reflected
34 | if P::POLICY.avoid_precision_branches() {
35 | reflected = is_neg;
36 |
37 | res = reflected.select(refl_res, res);
38 | z = z.conditional_neg(reflected);
39 |
40 | break 'goto_positive;
41 | }
42 |
43 | // NOTE: I chose not to use a bitmask here, because some bitmasks can be
44 | // one extra instruction than the raw call to `all` again, and since z <= -20 is so rare,
45 | // that extra instruction is not worth it.
46 | if reflected.all() {
47 | res = refl_res;
48 | z = -z;
49 |
50 | break 'goto_positive;
51 | }
52 | }
53 |
54 | let mut mod_z = z;
55 | let mut is_neg = is_neg;
56 |
57 | // recursively apply Γ(z+1)/z
58 | while is_neg.any() {
59 | res = is_neg.select(res / mod_z, res);
60 | mod_z = mod_z.conditional_add(one, is_neg);
61 | is_neg = mod_z.is_negative();
62 | }
63 |
64 | z = reflected.select(-z, mod_z);
65 | res = reflected.select(refl_res, res);
66 |
67 | break 'goto_positive;
68 | }
69 |
70 | // label
71 | //positive:
72 |
73 | // Integers
74 |
75 | let mut z_int = Mask::falsey();
76 | let mut fact_res = one;
77 |
78 | if P::POLICY.precision > PrecisionPolicy::Worst {
79 | let zf = z.floor();
80 | z_int = zf.eq(z);
81 |
82 | let bitmask = z_int.bitmask();
83 |
84 | if thermite_unlikely!(bitmask.any()) {
85 | let mut j = one;
86 | let mut k = j.lt(zf);
87 |
88 | while k.any() {
89 | fact_res = k.select(fact_res * j, fact_res);
90 | j += one;
91 | k = j.lt(zf);
92 | }
93 |
94 | // Γ(-int) = NaN for poles
95 | fact_res = is_neg.select(Vf32::<S>::nan(), fact_res);
96 | // approaching zero from either side results in +/- infinity
97 | fact_res = orig_z.eq(zero).select(Vf32::<S>::infinity().copysign(orig_z), fact_res);
98 |
99 | if bitmask.all() {
100 | return fact_res;
101 | }
102 | }
103 | }
104 |
105 | // Full
106 |
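// Lanczos approximation: Γ(z) ≈ (z + g - 0.5)^(z - 0.5) * e^-(z + g - 0.5) * A_g(z),
// with A_g(z) the rational Lanczos sum below. For very large z the power is computed
// as h * h with half the exponent to keep the intermediate powf from overflowing.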
107 | let gh = Vf32::<S>::splat(LANCZOS_G - 0.5);
108 |
109 | let lanczos_sum = z.poly_rational_p::<P>(LANCZOS_P, LANCZOS_Q);
110 |
111 | let zgh = z + gh;
112 | let lzgh = zgh.ln_p::<P>();
113 |
114 | // (z * lzfg) > ln(f32::MAX)
115 | let very_large = (z * lzgh).gt(Vf32::<S>::splat(
116 | 88.722839053130621324601674778549183073943430402325230485234240247,
117 | ));
118 |
119 | // only compute powf once
120 | let h = zgh.powf_p::<P>(very_large.select(z.mul_sube(half, quarter), z - half));
121 |
122 | // save a couple cycles by avoiding this division, but worst-case precision is slightly worse
123 | let denom = if P::POLICY.precision >= PrecisionPolicy::Best {
124 | lanczos_sum / zgh.exp_p::<P>()
125 | } else {
126 | lanczos_sum * (-zgh).exp_p::<P>()
127 | };
128 |
129 | let normal_res = very_large.select(h * h, h) * denom;
130 |
131 | // Tiny
132 | if P::POLICY.precision >= PrecisionPolicy::Best {
133 | let is_tiny = z.lt(Vf32::<S>::splat(
134 | <Self as SimdVectorizedSpecialFunctionsInternal<S>>::__SQRT_EPSILON,
135 | ));
136 | let tiny_res = z.reciprocal_p::<P>() - Vf32::<S>::splat(EULERS_CONSTANT);
137 | res *= is_tiny.select(tiny_res, normal_res);
138 | } else {
139 | res *= normal_res;
140 | }
141 |
142 | reflected.select(-pi / res, z_int.select(fact_res, res))
143 | }
144 |
145 | #[inline(always)]
146 | fn lgamma<P: Policy>(mut z: Self::Vf) -> Self::Vf {
147 | let one = Vf32::<S>::one();
148 | let zero = Vf32::<S>::zero();
149 |
150 | let reflect = z.lt(zero);
151 |
152 | let mut t = one;
153 |
154 | if P::POLICY.avoid_branching || reflect.any() {
155 | t = reflect.select(<Self as SimdVectorizedSpecialFunctionsInternal<S>>::sin_pix::<P>(z).abs(), one);
156 | z = z.conditional_neg(reflect);
157 | }
158 |
159 | let gh = Vf32::<S>::splat(LANCZOS_G - 0.5);
160 |
161 | let mut lanczos_sum = z.poly_rational_p::<P>(LANCZOS_P_EXPG_SCALED, LANCZOS_Q);
162 |
163 | // Full A
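// ln Γ(z) ≈ (z - 0.5) * (ln(z + g - 0.5) - 1) + ln(A_g(z) * t), using the
// exp(g)-scaled Lanczos sum; t carries the |sin(πz)| factor for reflected inputs.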
164 | let mut a = (z + gh).ln_p::<P>() - one;
165 |
166 | // Tiny
167 | if P::POLICY.precision >= PrecisionPolicy::Best {
168 | let is_not_tiny = z.ge(Vf32::<S>::splat_as(
169 | <Self as SimdVectorizedSpecialFunctionsInternal<S>>::__SQRT_EPSILON,
170 | ));
171 | let tiny_res = z.reciprocal_p::<P>() - Vf32::<S>::splat(EULERS_CONSTANT);
172 |
173 | // shove the tiny result into the log down below
174 | lanczos_sum = is_not_tiny.select(lanczos_sum, tiny_res);
175 | // force multiplier to zero for tiny case, allowing the modified
176 | // lanczos sum and ln(t) to be combined for cheap
177 | a &= is_not_tiny.value();
178 | }
179 |
180 | // Full
181 |
182 | let b = z - Vf32::<S>::splat(0.5);
183 | let c = (lanczos_sum * t).ln_p::<P>();
184 |
185 | let mut res = a.mul_adde(b, c);
186 |
187 | let ln_pi = Vf32::<S>::LN_PI();
188 |
189 | res = reflect.select(ln_pi - res, res);
190 |
191 | res
192 | }
193 |
194 | #[inline(always)]
195 | fn digamma<P: Policy>(mut x: Self::Vf) -> Self::Vf {
196 | let zero = Vf32::<S>::zero();
197 | let one = Vf32::<S>::one();
198 | let half = Vf32::<S>::splat(0.5);
199 | let pi = Vf32::<S>::PI();
200 |
201 | let mut result = zero;
202 |
203 | let reflect = x.le(Vf32::<S>::neg_one());
204 |
205 | if reflect.any() {
206 | x = reflect.select(one - x, x);
207 |
208 | let mut rem = x - x.floor();
209 |
210 | rem = rem.conditional_sub(one, rem.gt(half));
211 |
212 | let (s, c) = (rem * pi).sin_cos_p::<P>();
213 | let refl_res = pi * c / s;
214 |
215 | result = reflect.select(refl_res, result);
216 | }
217 |
218 | let lim = Vf32::<S>::splat(
219 | 0.5 * (10 + ((<Self as SimdVectorizedSpecialFunctionsInternal<S>>::__DIGITS as i64 - 50) * 240) / 950) as f32,
220 | );
221 |
222 | // Rescale to use asymptotic expansion
223 | let mut is_small = x.lt(lim);
224 | while is_small.any() {
225 | result = result.conditional_sub(x.reciprocal_p::<P>(), is_small);
226 | x = x.conditional_add(one, is_small);
227 | is_small = x.lt(lim);
228 | }
229 |
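// Asymptotic series ψ(x + 1) = ln(x) + 1/(2x) - 1/(12x^2) + 1/(120x^4) - ...,
// evaluated after the recurrence ψ(x) = ψ(x + 1) - 1/x has pushed x above `lim`.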
230 | x -= one;
231 |
232 | let inv_x = x.reciprocal_p::<P>();
233 |
234 | let z = inv_x * inv_x;
235 | let a = x.ln_p::<P>() + (inv_x * half);
236 |
237 | let y = z.poly_p::<P>(&[
238 | 0.083333333333333333333333333333333333333333333333333,
239 | -0.0083333333333333333333333333333333333333333333333333,
240 | 0.003968253968253968253968253968253968253968253968254,
241 | ]);
242 |
243 | result += z.nmul_adde(y, a);
244 |
245 | result
246 | }
247 |
248 | #[inline(always)]
249 | fn beta<P: Policy>(a: Self::Vf, b: Self::Vf) -> Self::Vf {
250 | let zero = Vf32::<S>::zero();
251 |
252 | let is_valid = a.gt(zero) & b.gt(zero);
253 |
254 | if P::POLICY.check_overflow && !P::POLICY.avoid_branching {
255 | if is_valid.none() {
256 | return Vf32::<S>::nan();
257 | }
258 | }
259 |
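// B(a, b) = Γ(a)Γ(b) / Γ(a + b), evaluated with exp(g)-scaled Lanczos sums and
// ratios of the shifted arguments so no individual gamma value can overflow.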
260 | let c = a + b;
261 |
262 | // if a < b then swap
263 | let (a, b) = (a.max(b), a.min(b));
264 |
265 | let mut result = a.poly_rational_p::<P>(LANCZOS_P_EXPG_SCALED, LANCZOS_Q)
266 | * (b.poly_rational_p::<P>(LANCZOS_P_EXPG_SCALED, LANCZOS_Q)
267 | / c.poly_rational_p::<P>(LANCZOS_P_EXPG_SCALED, LANCZOS_Q));
268 |
269 | let gh = Vf32::<S>::splat(LANCZOS_G - 0.5);
270 |
271 | let agh = a + gh;
272 | let bgh = b + gh;
273 | let cgh = c + gh;
274 |
275 | let agh_d_cgh = agh / cgh;
276 | let bgh_d_cgh = bgh / cgh;
277 | let agh_p_bgh = agh * bgh;
278 | let cgh_p_cgh = cgh * cgh;
279 |
280 | let base = cgh
281 | .gt(Vf32::<S>::splat(1e10))
282 | .select(agh_d_cgh * bgh_d_cgh, agh_p_bgh / cgh_p_cgh);
283 |
284 | let denom = if P::POLICY.precision > PrecisionPolicy::Average {
285 | Vf32::<S>::SQRT_E() / bgh.sqrt()
286 | } else {
287 | // bump up the precision a little to improve beta function accuracy
288 | Vf32::<S>::SQRT_E() * bgh.invsqrt_p::<P>()
289 | };
290 |
291 | result *= agh_d_cgh.powf_p::<P>(a - Vf32::<S>::splat(0.5) - b) * (base.powf_p::<P>(b) * denom);
292 |
293 | if P::POLICY.check_overflow {
294 | result = is_valid.select(result, Vf32::<S>::nan());
295 | }
296 |
297 | result
298 | }
299 | }
300 |
301 | const LANCZOS_G: f32 = 1.428456135094165802001953125;
302 |
303 | const LANCZOS_P: &[f32] = &[
304 | 58.52061591769095910314047740215847630266,
305 | 182.5248962595894264831189414768236280862,
306 | 211.0971093028510041839168287718170827259,
307 | 112.2526547883668146736465390902227161763,
308 | 27.5192015197455403062503721613097825345,
309 | 2.50662858515256974113978724717473206342,
310 | ];
311 |
312 | const LANCZOS_Q: &[f32] = &[0.0, 24.0, 50.0, 35.0, 10.0, 1.0];
313 |
314 | const LANCZOS_P_EXPG_SCALED: &[f32] = &[
315 | 14.0261432874996476619570577285003839357,
316 | 43.74732405540314316089531289293124360129,
317 | 50.59547402616588964511581430025589038612,
318 | 26.90456680562548195593733429204228910299,
319 | 6.595765571169314946316366571954421695196,
320 | 0.6007854010515290065101128585795542383721,
321 | ];
322 |
--------------------------------------------------------------------------------
/crates/thermite/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "thermite"
3 | version = "0.1.1-alpha.0"
4 | license = "MIT OR Apache-2.0"
5 | readme = "README.md"
6 | authors = ["novacrazy "]
7 | repository = "https://github.com/raygon-renderer/thermite"
8 | documentation = "https://raygon-renderer.github.io/thermite/"
9 | edition = "2018"
10 |
11 | [features]
12 | default = ["alloc", "math", "rng", "emulate_fma", "static_init"]
13 | neon = ["thermite-dispatch/neon"]
14 | wasm32 = ["thermite-dispatch/wasm32"]
15 | alloc = []
16 | nightly = []
17 | math = []
18 | rng = []
19 | emulate_fma = []
20 |
21 | [dependencies]
22 | thermite-dispatch = { path = "../dispatch" }
23 | paste = "1"
24 | half = "1.6.0"
25 |
26 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies.static_init]
27 | version = "1"
28 | optional = true
29 | default_features = false
30 |
31 | [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies]
32 | core_detect = "0.1.0"
33 |
34 | [dev-dependencies]
35 | criterion = "0.3"
36 | libm = "0.2.1"
37 | plotly = "0.6.0"
38 | rand = "0.8"
39 | rand_xoshiro = "0.6.0"
40 | no-panic = "0.1"
41 | thermite-special = { path = "../thermite-special" }
42 | thermite-complex = { path = "../thermite-complex" }
43 | num-complex = "0.4"
44 |
45 | [[bench]]
46 | name = "main"
47 | harness = false
48 |
49 |
--------------------------------------------------------------------------------
/crates/thermite/examples/asm.rs:
--------------------------------------------------------------------------------
1 | #![allow(unused)]
2 |
3 | // NOTE: This example only exists to be compiled and inspected as assembly via the command:
4 | // `cargo rustc --example asm --release -- -C target-feature=+sse2 --emit asm`
5 | // It's easier to access the example output in the `target/release/examples` directory
6 |
7 | use no_panic::no_panic;
8 |
9 | use thermite::*;
10 | use thermite_special::*;
11 |
12 | pub mod geo;
13 |
14 | use thermite::backends::avx2::AVX2;
15 | use thermite::rng::SimdRng;
16 |
17 | type Vf32 = <AVX2 as Simd>::Vf32;
18 | type Vf64 = <AVX2 as Simd>::Vf64;
19 | type Vi32 = <AVX2 as Simd>::Vi32;
20 | type Vu64 = <AVX2 as Simd>::Vu64;
21 | type Vu32 = <AVX2 as Simd>::Vu32;
22 | type Vi64 = <AVX2 as Simd>::Vi64;
23 |
24 | type Vector3xN = geo::Vector3xN<AVX2>;
25 |
26 | type Xoshiro128Plus = thermite::rng::xoshiro::Xoshiro128Plus<AVX2>;
27 |
28 | #[no_mangle]
29 | #[inline(never)]
30 | pub fn test_dynamic_dispatch(value: &mut [f32]) {
31 | assert_eq!(value.len(), 8);
32 |
33 | #[dispatch]
34 | fn test<S: Simd>(value: &mut [f32]) {
35 | thermite::Vf32::<S>::load_unaligned(value).exp2().store_unaligned(value);
36 | }
37 |
38 | dispatch_dyn!({ test::<S>(value) })
39 | }
40 |
41 | #[no_mangle]
42 | #[inline(never)]
43 | #[target_feature(enable = "avx2,fma")]
44 | pub unsafe fn test_simdrng(rng: &mut Xoshiro128Plus) -> Vf64 {
45 | rng.next_f64()
46 | }
47 |
48 | #[no_mangle]
49 | #[inline(never)]
50 | #[target_feature(enable = "avx2,fma")]
51 | pub unsafe fn test_revbits(x: Vi32) -> Vi32 {
52 | x.reverse_bits()
53 | }
54 |
55 | #[no_mangle]
56 | #[inline(never)]
57 | #[target_feature(enable = "avx2,fma")]
58 | pub unsafe fn test_normalize(v: &mut Vector3xN) {
59 | *v = v.normalize()
60 | }
61 |
62 | #[no_mangle]
63 | #[inline(never)]
64 | #[target_feature(enable = "avx2,fma")]
65 | pub unsafe fn test_u64div(a: Vu64, b: Vu64) -> Vu64 {
66 | a / b
67 | }
68 |
69 | #[no_mangle]
70 | #[inline(never)]
71 | #[target_feature(enable = "avx2,fma")]
72 | pub unsafe fn test_bitmask(b: u16) -> Vu64 {
73 | Mask::from_bitmask(b).value()
74 | }
75 |
76 | #[no_mangle]
77 | #[inline(never)]
78 | #[target_feature(enable = "avx2,fma")]
79 | pub unsafe fn test_cross(a: Vector3xN, b: Vector3xN) -> Vector3xN {
80 | a.cross(&b)
81 | }
82 |
83 | #[no_mangle]
84 | #[inline(never)]
85 | #[target_feature(enable = "avx2,fma")]
86 | pub unsafe fn do_alloc(count: usize) -> VectorBuffer<AVX2, Vf32> {
87 | Vf32::alloc(count)
88 | }
89 |
90 | #[no_mangle]
91 | #[inline(never)]
92 | #[target_feature(enable = "avx2,fma")]
93 | pub unsafe fn test_powf_ps(y: Vf32, x: Vf32) -> Vf32 {
94 | y.powf(x)
95 | }
96 |
97 | #[no_mangle]
98 | #[inline(never)]
99 | #[target_feature(enable = "avx2,fma")]
100 | pub unsafe fn test_powf_pd(y: Vf64, x: Vf64) -> Vf64 {
101 | y.powf(x)
102 | }
103 |
104 | #[no_mangle]
105 | #[inline(never)]
106 | #[target_feature(enable = "avx2,fma")]
107 | pub unsafe fn test_smootheststep(x: Vf32) -> Vf32 {
108 | x.smootheststep()
109 | }
110 |
111 | #[no_mangle]
112 | #[inline(never)]
113 | //#[target_feature(enable = "avx2,fma")]
114 | pub unsafe fn test_pdsin(x: Vf64) -> Vf64 {
115 | x.sin()
116 | }
117 |
118 | #[no_mangle]
119 | #[inline(never)]
120 | #[target_feature(enable = "avx2,fma")]
121 | pub unsafe fn test_pssin_cos(x: Vf32) -> (Vf32, Vf32) {
122 | x.sin_cos_p::()
123 | }
124 |
125 | #[no_mangle]
126 | #[inline(never)]
127 | #[target_feature(enable = "avx2,fma")]
128 | pub unsafe fn test_select_neg_ps(x: Vf32, a: Vf32, b: Vf32) -> Vf32 {
129 | x.is_negative().select(a, b)
130 | }
131 |
132 | #[no_mangle]
133 | #[inline(never)]
134 | #[target_feature(enable = "avx2,fma")]
135 | pub unsafe fn test_select_neg_epi32(x: Vi32, a: Vi32, b: Vi32) -> Vi32 {
136 | x.is_negative().select(a, b)
137 | }
138 |
139 | #[no_mangle]
140 | #[inline(never)]
141 | #[target_feature(enable = "avx2,fma")]
142 | #[no_panic]
143 | pub unsafe fn test_shuffle(x: Vf64, y: Vf64) -> Vf64 {
144 | match Vf64::NUM_ELEMENTS {
145 | 4 => shuffle!(x, y, [6, 2, 1, 7]),
146 | 8 => shuffle!(x, y, [5, 6, 10, 9, 2, 8, 6, 4]),
147 | _ => unimplemented!(),
148 | }
149 | }
150 |
151 | #[no_mangle]
152 | #[inline(never)]
153 | #[target_feature(enable = "avx2,fma")]
154 | pub unsafe fn test_shuffle_dyn_unchecked(a: Vf32, b: Vf32, indices: &[usize]) -> Vf32 {
155 | a.shuffle_dyn_unchecked(b, indices)
156 | }
157 |
158 | //#[no_mangle]
159 | //#[inline(never)]
160 | //#[target_feature(enable = "avx2,fma")]
161 | //pub unsafe fn test_shuffle_dyn(x: Vf32, y: Vf32, indices: &[usize; 8]) -> Vf32 {
162 | // x.shuffle(y, &indices[..])
163 | //}
164 |
165 | #[no_mangle]
166 | #[inline(never)]
167 | //#[target_feature(enable = "avx2,fma")]
168 | pub unsafe fn test_pstgamma(x: Vf32) -> Vf32 {
169 | x.tgamma_p::()
170 | }
171 |
172 | #[no_mangle]
173 | #[inline(never)]
174 | //#[target_feature(enable = "avx2,fma")]
175 | pub unsafe fn test_pdtgamma(x: Vf64) -> Vf64 {
176 | x.tgamma()
177 | }
178 |
179 | #[no_mangle]
180 | #[inline(never)]
181 | #[target_feature(enable = "avx2,fma")]
182 | pub unsafe fn test_pserf(x: Vf32) -> Vf32 {
183 | x.erf()
184 | }
185 |
186 | #[no_mangle]
187 | #[inline(never)]
188 | pub unsafe fn test_psexp(x: Vf32) -> Vf32 {
189 | x.exp()
190 | }
191 |
192 | #[no_mangle]
193 | #[inline(never)]
194 | #[target_feature(enable = "avx2,fma")]
195 | pub unsafe fn test_pderfinv(x: Vf64) -> Vf64 {
196 | x.erfinv()
197 | }
198 |
199 | #[no_mangle]
200 | #[inline(never)]
201 | #[target_feature(enable = "avx2,fma")]
202 | pub unsafe fn test_pscbrt(x: Vf32) -> Vf32 {
203 | x.cbrt()
204 | }
205 |
206 | //#[no_mangle]
207 | //#[inline(never)]
208 | //#[target_feature(enable = "avx2,fma")]
209 | //pub unsafe fn test_ps_bessel_y4(x: Vf32) -> Vf32 {
210 | // x.bessel_y_p::(4)
211 | //}
212 |
213 | #[no_mangle]
214 | #[inline(never)]
215 | #[target_feature(enable = "avx2,fma")]
216 | pub unsafe fn test_poly(x: Vf32, e: &[f32]) -> Vf32 {
217 | x.poly_f(128, |i| Vf32::splat(*e.get_unchecked(i)))
218 | }
219 |
220 | #[no_mangle]
221 | #[inline(never)]
222 | #[target_feature(enable = "avx2,fma")]
223 | pub unsafe fn test_rational_poly(x: Vf32, e: &[f32], d: &[f32]) -> Vf32 {
224 | let n0 = x.poly_f(19, |i| Vf32::splat(*e.get_unchecked(i)));
225 | let n1 = x.poly_f(19, |i| Vf32::splat(*d.get_unchecked(i)));
226 |
227 | n0 / n1
228 | }
229 |
230 | #[no_mangle]
231 | #[inline(never)]
232 | #[target_feature(enable = "avx2,fma")]
233 | pub unsafe fn test_rational_poly2(x: Vf32, e: &[f32], d: &[f32]) -> Vf32 {
234 | assert!(e.len() == 19 && e.len() == d.len());
235 |
236 | x.poly_rational_p::(e, d)
237 | }
238 |
239 | #[no_mangle]
240 | #[inline(never)]
241 | #[target_feature(enable = "avx2,fma")]
242 | pub unsafe fn test_poly2(x: Vf32) -> Vf32 {
243 | x.poly_f(128, |i| {
244 | Vf32::splat((-1.0f32).powi(i as i32) * (2f32.powi(i as i32) - i as f32))
245 | })
246 | }
247 |
248 | #[no_mangle]
249 | #[inline(never)]
250 | #[target_feature(enable = "avx2,fma")]
251 | pub unsafe fn test_pdcbrt(x: Vf64) -> Vf64 {
252 | x.cbrt()
253 | }
254 |
255 | #[no_mangle]
256 | #[inline(never)]
257 | #[target_feature(enable = "avx2,fma")]
258 | pub unsafe fn test_pdsinh(x: Vf64) -> Vf64 {
259 | x.sinh_p::()
260 | }
261 |
262 | #[no_mangle]
263 | #[inline(never)]
264 | #[target_feature(enable = "avx2,fma")]
265 | pub unsafe fn test_pssinh(x: Vf32) -> Vf32 {
266 | x.sinh_p::()
267 | }
268 |
269 | #[no_mangle]
270 | #[inline(never)]
271 | #[target_feature(enable = "avx2,fma")]
272 | pub unsafe fn test_jacobi(x: Vf32, alpha: Vf32, beta: Vf32, n: u32, m: u32) -> Vf32 {
273 | x.legendre(50, 0)
274 | }
275 |
276 | #[no_mangle]
277 | #[inline(never)]
278 | #[target_feature(enable = "avx2,fma")]
279 | pub unsafe fn test_cast2(x: Vf64) -> Vi64 {
280 | x.cast()
281 | }
282 |
283 | fn main() {}
284 |
--------------------------------------------------------------------------------
/crates/thermite/examples/geo/mod.rs:
--------------------------------------------------------------------------------
1 | use thermite::*;
2 |
3 | #[derive(Debug, Clone, Copy)]
4 | pub struct Vector3xN<S: Simd> {
5 | pub x: Vf32<S>,
6 | pub y: Vf32<S>,
7 | pub z: Vf32<S>,
8 | }
9 |
10 | impl<S: Simd> Vector3xN<S> {
11 | pub fn dot(&self, other: &Self) -> S::Vf32 {
12 | self.x.mul_add(other.x, self.y.mul_add(other.y, self.z * other.z))
13 | }
14 |
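// Standard cross product; mul_sub fuses each 2x2 determinant, e.g. x = y1*z2 - z1*y2.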
15 | pub fn cross(&self, other: &Self) -> Self {
16 | Self {
17 | x: self.y.mul_sub(other.z, self.z * other.y),
18 | y: self.z.mul_sub(other.x, self.x * other.z),
19 | z: self.x.mul_sub(other.y, self.y * other.x),
20 | }
21 | }
22 |
23 | pub fn norm_squared(&self) -> S::Vf32 {
24 | self.dot(self)
25 | }
26 |
27 | pub fn norm(&self) -> S::Vf32 {
28 | self.norm_squared().sqrt()
29 | }
30 |
31 | pub fn normalize(&self) -> Self {
32 | let inv_norm = self.norm_squared().invsqrt_p::();
33 |
34 | Self {
35 | x: self.x * inv_norm,
36 | y: self.y * inv_norm,
37 | z: self.z * inv_norm,
38 | }
39 | }
40 | }
41 |
42 | #[derive(Debug, Clone, Copy)]
43 | pub struct Matrix4xN<S: Simd> {
44 | pub m: [[S::Vf32; 4]; 4],
45 | }
46 |
47 | impl<S: Simd> Matrix4xN<S> {
48 | pub fn at(&self, row: usize, col: usize) -> &S::Vf32 {
49 | &self.m[col][row]
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/crates/thermite/examples/plot.rs:
--------------------------------------------------------------------------------
1 | #![allow(unused)]
2 |
3 | use thermite::*;
4 |
5 | pub mod geo;
6 |
7 | use thermite::backends::avx2::AVX2;
8 |
9 | type Vf32 = <AVX2 as Simd>::Vf32;
10 | type Vf64 = <AVX2 as Simd>::Vf64;
11 | type Vi32 = <AVX2 as Simd>::Vi32;
12 | type Vu64 = <AVX2 as Simd>::Vu64;
13 | type Vu32 = <AVX2 as Simd>::Vu32;
14 | type Vi64 = <AVX2 as Simd>::Vi64;
15 |
16 | use plotly::common::{ColorScale, ColorScalePalette, DashType, Fill, Font, Line, LineShape, Marker, Mode, Title};
17 | use plotly::layout::{Axis, BarMode, Layout, Legend, TicksDirection};
18 | use plotly::{Bar, NamedColor, Plot, Rgb, Rgba, Scatter};
19 |
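// Evaluates `f` one SIMD register at a time over the x axis and adds the result as a
// named line trace; x_axis.len() is assumed to be a multiple of Vf32::NUM_ELEMENTS.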
20 | fn plot_function<F>(name: &str, x_axis: &Vec<f32>, plot: &mut Plot, mut f: F)
21 | where
22 | F: FnMut(Vf32) -> Vf32,
23 | {
24 | let mut y_axis = vec![0.0; x_axis.len()];
25 |
26 | for (src, dst) in x_axis
27 | .chunks(Vf32::NUM_ELEMENTS)
28 | .zip(y_axis.chunks_mut(Vf32::NUM_ELEMENTS))
29 | {
30 | f(Vf32::load_unaligned(src))
31 | //.clamp(Vf32::splat(-400.0), Vf32::splat(400.0))
32 | .store_unaligned(dst);
33 | }
34 |
35 | plot.add_trace(Scatter::new(x_axis.clone(), y_axis).mode(Mode::Lines).name(name));
36 | }
37 |
38 | fn main() {
39 | let num_points = Vf32::NUM_ELEMENTS * 1000;
40 |
41 | let x_axis: Vec<f32> = (0..num_points)
42 | .into_iter()
43 | .map(|x| (x as f32 / num_points as f32) * 30.0 - 15.0)
44 | .collect();
45 |
46 | let layout = Layout::new().title(Title::new("Gamma function"));
47 | let mut plot = Plot::new();
48 |
49 | //for i in 0..5 {
50 | // plot_function(&format!("Y{}", i), &x_axis, &mut plot, |x| {
51 | // x.bessel_y_p::(i)
52 | // });
53 | //}
54 |
55 | //plot_function("cos(x) [Precision]", &x_axis, &mut plot, |x| {
56 | // x.cos_p::()
57 | //});
58 | //plot_function("cos(x) [Reference]", &x_axis, &mut plot, |x| {
59 | // x.cos_p::()
60 | //});
61 | //
62 | //plot_function("sin(x) [Precision]", &x_axis, &mut plot, |x| {
63 | // x.sin_p::()
64 | //});
65 | //plot_function("sin(x) [Reference]", &x_axis, &mut plot, |x| {
66 | // x.sin_p::()
67 | //});
68 |
69 | //plot_function("tgamma(x)", &x_axis, &mut plot, |x| x.tgamma());
70 | //plot_function("lgamma(x)", &x_axis, &mut plot, |x| x.lgamma());
71 | //plot_function("ln(tgamma(x))", &x_axis, &mut plot, |x| x.tgamma().ln());
72 | //plot_function("diff*1000", &x_axis, &mut plot, |x| {
73 | // (x.tgamma().ln() - x.lgamma()) * Vf32::splat(1000.0)
74 | //});
75 |
76 | //plot_function("digamma(x)", &x_axis, &mut plot, |x| x.digamma());
77 |
78 | /*
79 | plot_function("Gamma Avg", &x_axis, &mut plot, |x| x.tgamma());
80 | plot_function("Gamma Worst", &x_axis, &mut plot, |x| {
81 | x.tgamma_p::()
82 | });
83 |
84 | plot_function("Diffx100", &x_axis, &mut plot, |x| {
85 | (x.tgamma() - x.tgamma_p::()) * Vf32::splat(100.0)
86 | });
87 | */
88 |
89 | plot_function("Ln Avg", &x_axis, &mut plot, |x| x.ln());
90 | plot_function("Ln Worst", &x_axis, &mut plot, |x| {
91 | x.ln_p::()
92 | });
93 |
94 | plot_function("Diffx100", &x_axis, &mut plot, |x| {
95 | (x.ln() - x.ln_p::()) * Vf32::splat(100.0)
96 | });
97 |
98 | /*
99 | for i in 0..5 {
100 | plot_function(&format!("beta(x, {}) [UP]", i), &x_axis, &mut plot, |x| {
101 | x.beta_p::(Vf32::splat_as(i + 1))
102 | });
103 | }
104 |
105 | for i in 0..5 {
106 | plot_function(&format!("beta(x, {}) [Precision]", i), &x_axis, &mut plot, |x| {
107 | x.beta_p::(Vf32::splat_as(i + 1))
108 | });
109 | }
110 | */
111 |
112 | plot.show();
113 | }
114 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/aarch64/mod.rs:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/arm/mod.rs:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/avx1/mod.rs:
--------------------------------------------------------------------------------
1 | #![allow(unused)]
2 |
3 | use crate::*;
4 |
5 | use core::{
6 | fmt,
7 | marker::PhantomData,
8 | mem::{transmute, transmute_copy},
9 | ops::*,
10 | };
11 |
12 | use crate::arch::avx::*;
13 |
14 | use half::f16;
15 |
16 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
17 | pub struct AVX1;
18 |
19 | #[macro_use]
20 | pub(crate) mod polyfills;
21 |
22 | use polyfills::*;
23 |
24 | /*
25 | mod vf32;
26 | mod vf64;
27 | mod vi32;
28 | mod vi32_2;
29 | mod vi64;
30 | //mod vi64_2;
31 | mod vu32;
32 | mod vu64;
33 |
34 | pub use vf32::*;
35 | pub use vf64::*;
36 | pub use vi32::*;
37 | pub use vi64::*;
38 | pub use vu32::*;
39 | pub use vu64::*;
40 |
41 | type Vi32 = i32x8;
42 | type Vi64 = i64x8;
43 | type Vu32 = u32x8;
44 | type Vu64 = u64x8;
45 | type Vf32 = f32x8;
46 | type Vf64 = f64x8;
47 |
48 | impl Simd for AVX1 {
49 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::AVX;
50 |
51 | type Vi32 = Vi32;
52 | type Vi64 = Vi64;
53 | type Vu32 = Vu32;
54 | type Vu64 = Vu64;
55 | type Vf32 = Vf32;
56 | type Vf64 = Vf64;
57 |
58 | #[cfg(target_pointer_width = "32")]
59 | type Vusize = Vu32;
60 |
61 | #[cfg(target_pointer_width = "32")]
62 | type Visize = Vi32;
63 |
64 | #[cfg(target_pointer_width = "64")]
65 | type Vusize = Vu64;
66 |
67 | #[cfg(target_pointer_width = "64")]
68 | type Visize = Vi64;
69 | }
70 | */
71 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/avx1/polyfills.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/avx1/vi32_2.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 |
3 | decl!(i32x8: i32 => [__m128i; 2]);
4 | impl<S: Simd> Default for i32x8<S> {
5 | #[inline(always)]
6 | fn default() -> Self {
7 | Self::new([unsafe { _mm_setzero_si128() }; 2])
8 | }
9 | }
10 |
11 | impl<S: Simd> SimdVectorBase<S> for i32x8<S> {
12 | type Element = i32;
13 |
14 | #[inline(always)]
15 | fn splat(value: Self::Element) -> Self {
16 | Self::new(unsafe { [_mm_set1_epi32(value); 2] })
17 | }
18 |
19 | #[inline(always)]
20 | unsafe fn undefined() -> Self {
21 | Self::new([_mm_undefined_si128(); 2])
22 | }
23 |
24 | #[inline(always)]
25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self {
26 | Self::new([_mm_load_si128(src as *const _), _mm_load_si128(src.add(4) as *const _)])
27 | }
28 |
29 | #[inline(always)]
30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self {
31 | let src = src as *const _;
32 | Self::new([_mm_loadu_si128(src), _mm_loadu_si128(src.add(1))])
33 | }
34 |
35 | #[inline(always)]
36 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) {
37 | let dst = dst as *mut _;
38 | _mm_store_si128(dst, self.value[0]);
39 | _mm_store_si128(dst.add(1), self.value[1]);
40 | }
41 |
42 | #[inline(always)]
43 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) {
44 | let dst = dst as *mut _;
45 | _mm_storeu_si128(dst, self.value[0]);
46 | _mm_storeu_si128(dst.add(1), self.value[1]);
47 | }
48 |
49 | decl_base_common!(#[target_feature(enable = "avx,fma")] i32x8: i32 => __m256i);
50 | }
51 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/avx1/vi64_2.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 |
3 | decl!(i64x8: i64 => [__m128i; 4]);
4 | impl<S: Simd> Default for i64x8<S> {
5 | #[inline(always)]
6 | fn default() -> Self {
7 | Self::new([unsafe { _mm_setzero_si128() }; 4])
8 | }
9 | }
10 |
11 | impl<S: Simd> i64x8<S> {
12 | #[inline(always)]
13 | fn mapv<F>(mut self, f: F) -> Self
14 | where
15 | F: Fn(__m128i, usize) -> __m128i,
16 | {
17 | for i in 0..4 {
18 | self.value[i] = f(self.value[i], i);
19 | }
20 | self
21 | }
22 |
23 | #[inline(always)]
24 | fn zipv<F>(mut self, b: Self, f: F) -> Self
25 | where
26 | F: Fn(__m128i, __m128i) -> __m128i,
27 | {
28 | self.mapv(|a, i| f(a, b.value[i]))
29 | }
30 | }
31 |
32 | impl<S: Simd> SimdVectorBase<S> for i64x8<S> {
33 | type Element = i64;
34 |
35 | #[inline(always)]
36 | fn splat(value: Self::Element) -> Self {
37 | Self::new(unsafe { [_mm_set1_epi64x(value); 4] })
38 | }
39 |
40 | #[inline(always)]
41 | unsafe fn undefined() -> Self {
42 | Self::new([_mm_undefined_si128(); 4])
43 | }
44 |
45 | #[inline(always)]
46 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self {
47 | Self::undefined().mapv(|_, i| _mm_load_si128((src as *const __m128i).add(i)))
48 | }
49 |
50 | #[inline(always)]
51 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self {
52 | Self::undefined().mapv(|_, i| _mm_loadu_si128((src as *const __m128i).add(i)))
53 | }
54 |
55 | #[inline(always)]
56 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) {
57 | for i in 0..4 {
58 | _mm_store_si128((dst as *mut __m128i).add(i), self.value[i]);
59 | }
60 | }
61 |
62 | #[inline(always)]
63 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) {
64 | for i in 0..4 {
65 | _mm_storeu_si128((dst as *mut __m128i).add(i), self.value[i]);
66 | }
67 | }
68 |
69 | #[inline]
70 | #[target_feature(enable = "avx")]
71 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element {
72 | *transmute::<&_, *const Self::Element>(&self).add(index)
73 | }
74 |
75 | #[inline]
76 | #[target_feature(enable = "avx")]
77 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self {
78 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value;
79 | self
80 | }
81 | }
82 |
83 | impl<S: Simd> SimdBitwise<S> for i64x8<S> {
84 | fn and_not(self, other: Self) -> Self {
85 | self.zipv(other, |a, b| unsafe { _mm_andnot_si128(a, b) })
86 | }
87 |
88 | const FULL_BITMASK: u16 = 0b1111_1111;
89 |
90 | #[inline(always)]
91 | fn bitmask(self) -> u16 {
92 | let mut bitmask = 0;
93 | for i in 0..4 {
94 | // shift mask by 2*i as each vector has 2 64-bit lanes
95 | bitmask |= unsafe { _mm_movemask_pd(_mm_castsi128_pd(self.value[i])) } << (2 * i);
96 | }
97 | bitmask as u16
98 | }
99 |
100 | #[inline(always)]
101 | unsafe fn _mm_not(self) -> Self {
102 | self ^ Self::splat(!0)
103 | }
104 |
105 | #[inline(always)]
106 | unsafe fn _mm_bitand(self, rhs: Self) -> Self {
107 | self.zipv(rhs, |a, b| _mm_and_si128(a, b))
108 | }
109 |
110 | #[inline(always)]
111 | unsafe fn _mm_bitor(self, rhs: Self) -> Self {
112 | self.zipv(rhs, |a, b| _mm_or_si128(a, b))
113 | }
114 |
115 | #[inline(always)]
116 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self {
117 | self.zipv(rhs, |a, b| _mm_xor_si128(a, b))
118 | }
119 |
120 | #[inline(always)]
121 | unsafe fn _mm_shr(self, count: Vu32) -> Self {
122 | Self::zip(self, count, |x, s| x >> s)
123 | }
124 |
125 | #[inline(always)]
126 | unsafe fn _mm_shl(self, count: Vu32) -> Self {
127 | Self::zip(self, count, |x, s| x << s)
128 | }
129 |
130 | #[inline(always)]
131 | unsafe fn _mm_shli(self, count: u32) -> Self {
132 | let count = _mm_cvtsi32_si128(count as i32);
133 | self.mapv(|a, _| _mm_sll_epi64(a, count))
134 | }
135 |
136 | #[inline(always)]
137 | unsafe fn _mm_shri(self, count: u32) -> Self {
138 | let count = _mm_cvtsi32_si128(count as i32);
139 | self.mapv(|a, _| _mm_srl_epi64(a, count))
140 | }
141 | }
142 |
143 | impl<S: Simd> PartialEq for i64x8<S> {
144 | fn eq(&self, other: &Self) -> bool {
145 | <Self as SimdVector<S>>::eq(*self, *other).all()
146 | }
147 |
148 | fn ne(&self, other: &Self) -> bool {
149 | <Self as SimdVector<S>>::ne(*self, *other).any()
150 | }
151 | }
152 |
153 | impl<S: Simd> Eq for i64x8<S> {}
154 |
155 | impl<S: Simd> SimdMask<S> for i64x8<S> {
156 | #[inline(always)]
157 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self {
158 | self.mapv(|m, i| _mm_blendv_epi8(f.value[i], t.value[i], m))
159 | }
160 | }
161 |
162 | impl<S: Simd> SimdVector<S> for i64x8<S> {
163 | #[inline(always)]
164 | fn zero() -> Self {
165 | Self::new(unsafe { [_mm_setzero_si128(); 4] })
166 | }
167 |
168 | #[inline(always)]
169 | fn one() -> Self {
170 | Self::splat(1)
171 | }
172 |
173 | #[inline(always)]
174 | fn min_value() -> Self {
175 | Self::splat(i64::MIN)
176 | }
177 |
178 | #[inline(always)]
179 | fn max_value() -> Self {
180 | Self::splat(i64::MAX)
181 | }
182 |
183 | #[inline]
184 | fn min_element(self) -> Self::Element {
185 | unsafe { self.reduce2(|a, x| a.min(x)) }
186 | }
187 |
188 | #[inline]
189 | fn max_element(self) -> Self::Element {
190 | unsafe { self.reduce2(|a, x| a.max(x)) }
191 | }
192 |
193 | #[inline(always)]
194 | fn eq(self, other: Self) -> Mask<S, Self> {
195 | Mask::new(self.zipv(other, |a, b| unsafe { _mm_cmpeq_epi64(a, b) }))
196 | }
197 |
198 | #[inline(always)]
199 | fn gt(self, other: Self) -> Mask<S, Self> {
200 | Mask::new(self.zipv(other, |a, b| unsafe { _mm_cmpgt_epi64(a, b) }))
201 | }
202 |
203 | #[inline(always)]
204 | unsafe fn _mm_add(self, rhs: Self) -> Self {
205 | self.zipv(rhs, |l, r| _mm_add_epi64(l, r))
206 | }
207 |
208 | #[inline(always)]
209 | unsafe fn _mm_sub(self, rhs: Self) -> Self {
210 | self.zipv(rhs, |l, r| _mm_sub_epi64(l, r))
211 | }
212 |
213 | #[inline(always)]
214 | unsafe fn _mm_mul(self, rhs: Self) -> Self {
215 | self.zipv(rhs, |l, r| _mm_mullo_epi64x(l, r))
216 | }
217 |
218 | #[inline(always)]
219 | unsafe fn _mm_div(self, rhs: Self) -> Self {
220 | Self::zip(self, rhs, Div::div)
221 | }
222 |
223 | #[inline(always)]
224 | unsafe fn _mm_rem(self, rhs: Self) -> Self {
225 | Self::zip(self, rhs, Rem::rem)
226 | }
227 | }
228 |
229 | impl<S: Simd> SimdSignedVector<S> for i64x8<S> {
230 | #[inline(always)]
231 | fn neg_one() -> Self {
232 | Self::splat(-1)
233 | }
234 |
235 | #[inline(always)]
236 | fn min_positive() -> Self {
237 | Self::splat(0)
238 | }
239 |
240 | #[inline(always)]
241 | fn abs(self) -> Self {
242 | self.mapv(|x, _| unsafe { _mm256_abs_epi64x(x) })
243 | }
244 |
245 | #[inline(always)]
246 | unsafe fn _mm_neg(self) -> Self {
247 | (self ^ Self::neg_one()) + Self::one()
248 | }
249 | }
250 |
251 | impl_ops!(@UNARY i64x8 AVX1 => Not::not, Neg::neg);
252 | impl_ops!(@BINARY i64x8 AVX1 => BitAnd::bitand, BitOr::bitor, BitXor::bitxor);
253 | impl_ops!(@BINARY i64x8 AVX1 => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem);
254 | impl_ops!(@SHIFTS i64x8 AVX1 => Shr::shr, Shl::shl);
255 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/avx2/mod.rs:
--------------------------------------------------------------------------------
1 | #![allow(unused)]
2 |
3 | use crate::*;
4 |
5 | use core::{
6 | fmt,
7 | marker::PhantomData,
8 | mem::{transmute, transmute_copy},
9 | ops::*,
10 | };
11 |
12 | use crate::arch::avx2::*;
13 |
14 | use half::f16;
15 |
16 | pub(crate) mod polyfills;
17 |
18 | use super::polyfills::*;
19 | use polyfills::*;
20 |
21 | mod vf32;
22 | mod vf64;
23 | //mod vi16;
24 | mod vi32;
25 | mod vi64;
26 | mod vu32;
27 | mod vu64;
28 |
29 | pub use vf32::*;
30 | pub use vf64::*;
31 | //pub use vi16::*;
32 | pub use vi32::*;
33 | pub use vi64::*;
34 | pub use vu32::*;
35 | pub use vu64::*;
36 |
37 | //type Vi16 = i16x8;
38 | type Vi32 = i32x8<AVX2>;
39 | type Vi64 = i64x8<AVX2>;
40 | type Vu32 = u32x8<AVX2>;
41 | type Vu64 = u64x8<AVX2>;
42 | type Vf32 = f32x8<AVX2>;
43 | type Vf64 = f64x8<AVX2>;
44 |
45 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
46 | pub struct AVX2;
47 |
48 | impl Simd for AVX2 {
49 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::AVX2;
50 |
51 | type Vi32 = Vi32;
52 | type Vi64 = Vi64;
53 | type Vu32 = Vu32;
54 | type Vu64 = Vu64;
55 | type Vf32 = Vf32;
56 | type Vf64 = Vf64;
57 |
58 | #[cfg(target_pointer_width = "32")]
59 | type Vusize = Vu32;
60 |
61 | #[cfg(target_pointer_width = "32")]
62 | type Visize = Vi32;
63 |
64 | #[cfg(target_pointer_width = "64")]
65 | type Vusize = Vu64;
66 |
67 | #[cfg(target_pointer_width = "64")]
68 | type Visize = Vi64;
69 | }
70 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/avx2/vi16.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 |
3 | decl!(i16x8: i16 => __m128i);
4 | impl<S: Simd> Default for i16x8<S> {
5 | #[inline(always)]
6 | fn default() -> Self {
7 | Self::new(unsafe { _mm_setzero_si128() })
8 | }
9 | }
10 |
11 | impl<S: Simd> SimdVectorBase<S> for i16x8<S> {
12 | type Element = i16;
13 |
14 | #[inline(always)]
15 | fn splat(value: Self::Element) -> Self {
16 | Self::new(unsafe { _mm_set1_epi16(value) })
17 | }
18 |
19 | #[inline(always)]
20 | unsafe fn undefined() -> Self {
21 | Self::new(_mm_undefined_si128())
22 | }
23 |
24 | #[inline(always)]
25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self {
26 | Self::new(_mm_load_si128(src as *const _))
27 | }
28 |
29 | #[inline(always)]
30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self {
31 | Self::new(_mm_loadu_si128(src as *const _))
32 | }
33 |
34 | #[inline(always)]
35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) {
36 | _mm_store_si128(dst as *mut _, self.value)
37 | }
38 |
39 | #[inline(always)]
40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) {
41 | _mm_storeu_si128(dst as *mut _, self.value)
42 | }
43 |
44 | #[inline]
45 | #[target_feature(enable = "avx2")]
46 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element {
47 | *transmute::<&_, *const Self::Element>(&self).add(index)
48 | }
49 |
50 | #[inline]
51 | #[target_feature(enable = "avx2")]
52 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self {
53 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value;
54 | self
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/macros.rs:
--------------------------------------------------------------------------------
1 | macro_rules! impl_ops {
2 | (@UNARY $name:ident $is:ident => $($op_trait:ident::$op:ident),*) => {paste::paste! {$(
3 | impl $op_trait for $name<$is> {
4 | type Output = Self;
5 | #[inline(always)] fn $op(self) -> Self { unsafe { self. [<_mm_ $op>]() } }
6 | }
7 | )*}};
8 |
9 | (@BINARY $name:ident $is:ident => $($op_trait:ident::$op:ident),*) => {paste::paste! {$(
10 | impl $op_trait for $name<$is> {
11 | type Output = Self;
12 | #[inline(always)] fn $op(self, rhs: Self) -> Self { unsafe { self. [<_mm_ $op>](rhs) } }
13 | }
14 | //impl $op_trait<>::Element> for $name<$is> {
15 | // type Output = Self;
16 | // #[inline(always)] fn $op(self, rhs: >::Element) -> Self {
17 | // $op_trait::$op(self, Self::splat(rhs))
18 | // }
19 | //}
20 | //impl $op_trait<$name<$is>> for <$name<$is> as SimdVectorBase<$is>>::Element {
21 | // type Output = $name<$is>;
22 | // #[inline(always)] fn $op(self, rhs: $name<$is>) -> $name<$is> {
23 | // $op_trait::$op($name::<$is>::splat(self), rhs)
24 | // }
25 | //}
26 |
27 | impl [<$op_trait Assign>] for $name<$is> {
28 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: Self) { *self = $op_trait::$op(*self, rhs); }
29 | }
30 | impl [<$op_trait Assign>]<>::Element> for $name<$is> {
31 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: >::Element) {
32 | *self = $op_trait::$op(*self, Self::splat(rhs));
33 | }
34 | }
35 | )*}};
36 |
37 | (@SHIFTS $name:ident $is:ident => $($op_trait:ident::$op:ident),*) => {paste::paste! {$(
38 | impl $op_trait<<$is as Simd>::Vu32> for $name<$is> {
39 | type Output = Self;
40 | #[inline(always)] fn $op(self, rhs: <$is as Simd>::Vu32) -> Self { unsafe { self. [<_mm_ $op>](rhs) } }
41 | }
42 | impl $op_trait for $name<$is> {
43 | type Output = Self;
44 | #[inline(always)] fn $op(self, rhs: u32) -> Self { unsafe { self.[<_mm_ $op i>](rhs) } }
45 | }
46 |
47 | impl [<$op_trait Assign>]<<$is as Simd>::Vu32> for $name<$is> {
48 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: <$is as Simd>::Vu32) { *self = $op_trait::$op(*self, rhs); }
49 | }
50 | impl [<$op_trait Assign>] for $name<$is> {
51 | #[inline(always)] fn [<$op _assign>](&mut self, rhs: u32) { *self = $op_trait::$op(*self, rhs); }
52 | }
53 | )*}};
54 | }
55 |
56 | macro_rules! decl_base_common {
57 | (#[$meta:meta] $name:ident: $ety:ty => $ty:ty) => {
58 | #[inline]
59 | #[$meta]
60 | unsafe fn extract_unchecked(self, index: usize) -> Self::Element {
61 | *transmute::<&_, *const Self::Element>(&self).add(index)
62 | }
63 |
64 | #[inline]
65 | #[$meta]
66 | unsafe fn replace_unchecked(mut self, index: usize, value: Self::Element) -> Self {
67 | *transmute::<&mut _, *mut Self::Element>(&mut self).add(index) = value;
68 | self
69 | }
70 |
71 | #[inline]
72 | #[$meta]
73 | unsafe fn shuffle_unchecked(self, b: Self, indices: INDICES) -> Self {
74 | let mut dst = Self::undefined();
75 | for i in 0..Self::NUM_ELEMENTS {
76 | let idx = *INDICES::INDICES.get_unchecked(i);
77 | dst = dst.replace_unchecked(
78 | i,
79 | if idx < Self::NUM_ELEMENTS {
80 | self.extract_unchecked(idx)
81 | } else {
82 | b.extract_unchecked(idx - Self::NUM_ELEMENTS)
83 | },
84 | );
85 | }
86 | dst
87 | }
88 | };
89 | }
90 |
91 | macro_rules! decl {
92 | ($($name:ident: $ety:ty => $ty:ty),*) => {$(
93 | #[derive(Clone, Copy)]
94 | #[repr(transparent)]
95 | pub struct $name<S: Simd> {
96 | pub(crate) value: $ty,
97 | _is: PhantomData<S>,
98 | }
99 |
100 | impl<S: Simd> $name<S> {
101 | #[inline(always)]
102 | pub(crate) fn new(value: $ty) -> Self {
103 | Self { value, _is: PhantomData }
104 | }
105 | }
106 |
107 | impl<S: Simd> $name<S> where Self: SimdVectorBase<S> {
108 | #[inline(always)]
109 | pub(crate) unsafe fn map<F>(mut self, f: F) -> Self
110 | where F: Fn($ety) -> $ety {
111 | for i in 0..Self::NUM_ELEMENTS {
112 | let ptr = transmute::<&mut _, *mut $ety>(&mut self).add(i);
113 | *ptr = f(*ptr);
114 | }
115 | self
116 | }
117 |
118 | #[inline(always)]
119 | pub(crate) unsafe fn zip<V, F>(a: Self, b: V, f: F) -> Self
120 | where F: Fn($ety, <V as SimdVectorBase<S>>::Element) -> $ety,
121 | Self: SimdVectorBase<S>,
122 | V: SimdVectorBase<S> {
123 | let mut out = Self::default();
124 | for i in 0..Self::NUM_ELEMENTS {
125 | *transmute::<&mut _, *mut $ety>(&mut out).add(i) =
126 | f(a.extract_unchecked(i), b.extract_unchecked(i));
127 | }
128 | out
129 | }
130 |
131 | #[inline(always)]
132 | pub(crate) unsafe fn reduce<F>(self, mut init: $ety, f: F) -> $ety
133 | where F: Fn($ety, $ety) -> $ety {
134 | for i in 0..Self::NUM_ELEMENTS {
135 | init = f(init, self.extract_unchecked(i));
136 | }
137 | init
138 | }
139 |
140 | #[inline(always)]
141 | pub(crate) unsafe fn reduce2<F>(self, f: F) -> $ety
142 | where F: Fn($ety, $ety) -> $ety {
143 | let mut accum = self.extract_unchecked(0);
144 | for i in 1..Self::NUM_ELEMENTS {
145 | accum = f(accum, self.extract_unchecked(i));
146 | }
147 | accum
148 | }
149 | }
150 |
151 | impl<S: Simd> fmt::Debug for $name<S> where Self: SimdVectorBase<S> {
152 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
153 | let mut t = f.debug_tuple(stringify!($name));
154 | for i in 0..Self::NUM_ELEMENTS {
155 | t.field(unsafe { &*transmute::<&_, *const $ety>(self).add(i) });
156 | }
157 | t.finish()
158 | }
159 | }
160 | )*};
161 | }
162 |
163 | macro_rules! decl_brute_force_convert {
164 | (#[$meta:meta] $from:ty => $to:ty) => {
165 | paste::paste! {
166 | #[$meta]
167 | #[inline]
168 | unsafe fn do_convert(value: []) -> [] {
169 | let mut res = mem::MaybeUninit::uninit();
170 | for i in 0..[]::NUM_ELEMENTS {
171 | *(res.as_mut_ptr() as *mut $to).add(i) = (*transmute::<&_, *const $from>(&value).add(i)) as $to;
172 | }
173 | res.assume_init()
174 | }
175 | }
176 | };
177 | }
178 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/mod.rs:
--------------------------------------------------------------------------------
1 | #[macro_use]
2 | mod macros;
3 |
4 | pub mod polyfills;
5 |
6 | //pub mod scalar;
7 |
8 | #[cfg(all(feature = "neon", target_arch = "aarch64"))]
9 | pub mod aarch64;
10 | #[cfg(all(feature = "neon", target_arch = "arm"))]
11 | pub mod arm;
12 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
13 | pub mod avx1;
14 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
15 | pub mod avx2;
16 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
17 | pub mod sse2;
18 | #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
19 | pub mod sse42;
20 | #[cfg(all(feature = "wasm32", target_arch = "wasm32"))]
21 | pub mod wasm32;
22 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/polyfills.rs:
--------------------------------------------------------------------------------
1 | #[inline(always)]
2 | pub const fn _mm_shuffle(w: i32, z: i32, y: i32, x: i32) -> i32 {
3 | (w << 6) | (z << 4) | (y << 2) | x
4 | }
5 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/scalar/mod.rs:
--------------------------------------------------------------------------------
1 | #![allow(unused)]
2 |
3 | use crate::*;
4 |
5 | use core::{
6 | fmt,
7 | marker::PhantomData,
8 | mem::{transmute, transmute_copy},
9 | ops::*,
10 | };
11 |
12 | mod polyfills;
13 | use polyfills::*;
14 |
15 | use half::f16;
16 |
17 | mod vf32;
18 | mod vf64;
19 | mod vi32;
20 | mod vi64;
21 | mod vu32;
22 | mod vu64;
23 |
24 | pub use vf32::*;
25 | pub use vf64::*;
26 | pub use vi32::*;
27 | pub use vi64::*;
28 | pub use vu32::*;
29 | pub use vu64::*;
30 |
31 | type Vu32 = u32x1<Scalar>;
32 | type Vf32 = f32x1<Scalar>;
33 | type Vf64 = f64x1<Scalar>;
34 |
35 | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
36 | pub struct Scalar;
37 |
38 | impl Simd for Scalar {
39 | const INSTRSET: SimdInstructionSet = SimdInstructionSet::Scalar;
40 |
41 | type Vu32 = Vu32;
42 | type Vf32 = Vf32;
43 | type Vf64 = Vf64;
44 |
45 | #[cfg(target_pointer_width = "32")]
46 | type Vusize = Vu32;
47 |
48 | //#[cfg(target_pointer_width = "32")]
49 | //type Visize = Vi32;
50 |
51 | /*
52 | type Vi32 = Vi32;
53 | type Vi64 = Vi64;
54 |
55 | type Vu64 = Vu64;
56 |
57 | #[cfg(target_pointer_width = "64")]
58 | type Vusize = Vu64;
59 |
60 | #[cfg(target_pointer_width = "64")]
61 | type Visize = Vi64;
62 | */
63 | }
64 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/scalar/polyfills.rs:
--------------------------------------------------------------------------------
1 | #[inline(always)]
2 | pub fn bool_to_u32(value: bool) -> u32 {
3 | //if value { 0xFFFF_FFFF } else { 0 }
4 | -(value as i32) as u32
5 | }
6 |
7 | #[inline(always)]
8 | pub fn bool_to_u64(value: bool) -> u64 {
9 | //if value { 0xFFFF_FFFF_FFFF_FFFF } else { 0 }
10 | -(value as i64) as u64
11 | }
12 |
--------------------------------------------------------------------------------
/crates/thermite/src/backends/scalar/vf32.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 |
3 | decl!(f32x1: f32 => f32);
4 | impl<S: Simd> Default for f32x1<S> {
5 | #[inline(always)]
6 | fn default() -> Self {
7 | Self::new(0.0)
8 | }
9 | }
10 |
11 | impl<S: Simd> SimdVectorBase<S> for f32x1<S> {
12 | type Element = f32;
13 |
14 | #[inline(always)]
15 | fn splat(value: Self::Element) -> Self {
16 | Self::new(value)
17 | }
18 |
19 | #[inline(always)]
20 | unsafe fn undefined() -> Self {
21 | Self::new(0.0)
22 | }
23 |
24 | #[inline(always)]
25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self {
26 | Self::new(*src)
27 | }
28 |
29 | #[inline(always)]
30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self {
31 | Self::new(src.read_unaligned())
32 | }
33 |
34 | #[inline(always)]
35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) {
36 | *dst = self.value;
37 | }
38 |
39 | #[inline(always)]
40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) {
41 | dst.write_unaligned(self.value)
42 | }
43 |
44 | decl_base_common!(#[target_feature()] f32x1: f32 => f32);
45 | }
46 |
47 | impl<S: Simd> SimdBitwise<S> for f32x1<S> {
48 | const FULL_BITMASK: u16 = 1;
49 |
50 | #[inline(always)]
51 | fn bitmask(self) -> u16 {
52 | self.into_bits().bitmask()
53 | }
54 |
55 | #[inline(always)]
56 | unsafe fn _mm_not(self) -> Self {
57 | self ^ Self::splat(f32::from_bits(!0))
58 | }
59 |
60 | #[inline(always)]
61 | unsafe fn _mm_bitand(self, rhs: Self) -> Self {
62 | Self::new(f32::from_bits(self.value.to_bits() & rhs.value.to_bits()))
63 | }
64 |
65 | #[inline(always)]
66 | unsafe fn _mm_bitor(self, rhs: Self) -> Self {
67 | Self::new(f32::from_bits(self.value.to_bits() | rhs.value.to_bits()))
68 | }
69 |
70 | #[inline(always)]
71 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self {
72 | Self::new(f32::from_bits(self.value.to_bits() ^ rhs.value.to_bits()))
73 | }
74 |
75 | #[inline(always)]
76 | unsafe fn _mm_shr(self, count: Vu32) -> Self {
77 | Self::new(f32::from_bits(self.value.to_bits() >> count.value))
78 | }
79 |
80 | #[inline(always)]
81 | unsafe fn _mm_shl(self, count: Vu32) -> Self {
82 | Self::new(f32::from_bits(self.value.to_bits() << count.value))
83 | }
84 |
85 | #[inline(always)]
86 | unsafe fn _mm_shli(self, count: u32) -> Self {
87 | Self::new(f32::from_bits(self.value.to_bits() << count))
88 | }
89 |
90 | #[inline(always)]
91 | unsafe fn _mm_shri(self, count: u32) -> Self {
92 | Self::new(f32::from_bits(self.value.to_bits() >> count))
93 | }
94 | }
95 |
96 | impl<S: Simd> PartialEq for f32x1<S> {
97 | #[inline(always)]
98 | fn eq(&self, other: &Self) -> bool {
99 | self.value == other.value
100 | }
101 | }
102 |
103 | impl<S: Simd> SimdMask<S> for f32x1<S> {
104 | #[inline(always)]
105 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self {
106 | if self.value.to_bits() != 0 {
107 | t
108 | } else {
109 | f
110 | }
111 | }
112 | }
113 |
114 | impl<S: Simd> SimdVector<S> for f32x1<S> {
115 | fn zero() -> Self {
116 | Self::splat(0.0)
117 | }
118 |
119 | fn one() -> Self {
120 | Self::splat(1.0)
121 | }
122 |
123 | fn indexed() -> Self {
124 | Self::splat(0.0)
125 | }
126 |
127 | #[inline(always)]
128 | fn min_value() -> Self {
129 | Self::splat(f32::MIN)
130 | }
131 |
132 | #[inline(always)]
133 | fn max_value() -> Self {
134 | Self::splat(f32::MAX)
135 | }
136 |
137 | #[inline(always)]
138 | fn min(self, other: Self) -> Self {
139 | Self::new(self.value.min(other.value))
140 | }
141 |
142 | #[inline(always)]
143 | fn max(self, other: Self) -> Self {
144 | Self::new(self.value.max(other.value))
145 | }
146 |
147 | #[inline(always)]
148 | fn min_element(self) -> Self::Element {
149 | self.value
150 | }
151 |
152 | #[inline(always)]
153 | fn max_element(self) -> Self::Element {
154 | self.value
155 | }
156 |
157 | #[inline(always)]
158 | fn eq(self, other: Self) -> Mask {
159 | Self::new(f32::from_bits(bool_to_u32(self.value == other.value)))
160 | }
161 |
162 | #[inline(always)]
163 | fn lt(self, other: Self) -> Mask {
164 | Self::new(f32::from_bits(bool_to_u32(self.value < other.value)))
165 | }
166 |
167 | #[inline(always)]
168 | fn le(self, other: Self) -> Mask {
169 | Self::new(f32::from_bits(bool_to_u32(self.value <= other.value)))
170 | }
171 |
172 | #[inline(always)]
173 | fn gt(self, other: Self) -> Mask {
174 | Self::new(f32::from_bits(bool_to_u32(self.value > other.value)))
175 | }
176 |
177 | #[inline(always)]
178 | fn ge(self, other: Self) -> Mask {
179 | Self::new(f32::from_bits(bool_to_u32(self.value >= other.value)))
180 | }
181 |
182 | #[inline(always)]
183 | unsafe fn _mm_add(self, rhs: Self) -> Self {
184 | Self::new(Add::add(self.value, rhs.value))
185 | }
186 |
187 | #[inline(always)]
188 | unsafe fn _mm_sub(self, rhs: Self) -> Self {
189 | Self::new(Sub::sub(self.value, rhs.value))
190 | }
191 |
192 | #[inline(always)]
193 | unsafe fn _mm_mul(self, rhs: Self) -> Self {
194 | Self::new(Mul::mul(self.value, rhs.value))
195 | }
196 |
197 | #[inline(always)]
198 | unsafe fn _mm_div(self, rhs: Self) -> Self {
199 | Self::new(Div::div(self.value, rhs.value))
200 | }
201 |
202 | #[inline(always)]
203 | unsafe fn _mm_rem(self, rhs: Self) -> Self {
204 | Self::new(Rem::rem(self.value, rhs.value))
205 | }
206 | }
207 |
208 | impl SimdIntoBits for f32x1 {
209 | fn into_bits(self) -> Vu32 {
210 | u32x1::new(self.value.to_bits())
211 | }
212 | }
213 |
214 | impl SimdFromBits for f32x1 {
215 | fn from_bits(bits: Vu32) -> Self {
216 | Self::new(f32::from_bits(bits.value))
217 | }
218 | }
219 |
220 | impl_ops!(@UNARY f32x1 Scalar => Not::not, Neg::neg);
221 | impl_ops!(@BINARY f32x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor);
222 | impl_ops!(@SHIFTS f32x1 Scalar => Shr::shr, Shl::shl);
223 |
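The comparison methods above encode each boolean result as an f32 whose bit pattern is either all ones or all zeros, and _mm_blendv only inspects those bits when picking between its two arguments. A minimal free-standing sketch of that select pattern, using hypothetical function names rather than the crate's traits:

// All-ones bits mean "take t", all-zeros mean "take f",
// mirroring _mm_blendv on the one-lane scalar type.
fn blendv_f32(mask: f32, t: f32, f: f32) -> f32 {
    if mask.to_bits() != 0 { t } else { f }
}

fn select_lt(a: f32, b: f32, t: f32, f: f32) -> f32 {
    // bool -> all-ones/all-zeros bit pattern, same idea as bool_to_u32 above
    let mask = f32::from_bits(-((a < b) as i32) as u32);
    blendv_f32(mask, t, f)
}

For example, select_lt(1.0, 2.0, 10.0, 20.0) evaluates to 10.0, since the comparison produces the all-ones mask.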
--------------------------------------------------------------------------------
/crates/thermite/src/backends/scalar/vf64.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 |
3 | decl!(f64x1: f64 => f64);
4 | impl Default for f64x1 {
5 | #[inline(always)]
6 | fn default() -> Self {
7 | Self::new(0.0)
8 | }
9 | }
10 |
11 | impl SimdVectorBase for f64x1 {
12 | type Element = f64;
13 |
14 | #[inline(always)]
15 | fn splat(value: Self::Element) -> Self {
16 | Self::new(value)
17 | }
18 |
19 | #[inline(always)]
20 | unsafe fn undefined() -> Self {
21 | Self::new(0.0)
22 | }
23 |
24 | #[inline(always)]
25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self {
26 | Self::new(*src)
27 | }
28 |
29 | #[inline(always)]
30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self {
31 | Self::new(src.read_unaligned())
32 | }
33 |
34 | #[inline(always)]
35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) {
36 | *dst = self.value;
37 | }
38 |
39 | #[inline(always)]
40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) {
41 | dst.write_unaligned(self.value)
42 | }
43 |
44 | decl_base_common!(#[target_feature()] f64x1: f64 => f64);
45 | }
46 |
47 | impl SimdBitwise for f64x1 {
48 | const FULL_BITMASK: u16 = 1;
49 |
50 | #[inline(always)]
51 | fn bitmask(self) -> u16 {
52 | self.into_bits().bitmask()
53 | }
54 |
55 | #[inline(always)]
56 | unsafe fn _mm_not(self) -> Self {
57 | self ^ Self::splat(f64::from_bits(!0))
58 | }
59 |
60 | #[inline(always)]
61 | unsafe fn _mm_bitand(self, rhs: Self) -> Self {
62 | Self::new(f64::from_bits(self.value.to_bits() & rhs.value.to_bits()))
63 | }
64 |
65 | #[inline(always)]
66 | unsafe fn _mm_bitor(self, rhs: Self) -> Self {
67 | Self::new(f64::from_bits(self.value.to_bits() | rhs.value.to_bits()))
68 | }
69 |
70 | #[inline(always)]
71 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self {
72 | Self::new(f64::from_bits(self.value.to_bits() ^ rhs.value.to_bits()))
73 | }
74 |
75 | #[inline(always)]
76 | unsafe fn _mm_shr(self, count: Vu32) -> Self {
77 | Self::new(f64::from_bits(self.value.to_bits() >> count.value))
78 | }
79 |
80 | #[inline(always)]
81 | unsafe fn _mm_shl(self, count: Vu32) -> Self {
82 | Self::new(f64::from_bits(self.value.to_bits() << count.value))
83 | }
84 |
85 | #[inline(always)]
86 | unsafe fn _mm_shli(self, count: u32) -> Self {
87 | Self::new(f64::from_bits(self.value.to_bits() << count))
88 | }
89 |
90 | #[inline(always)]
91 | unsafe fn _mm_shri(self, count: u32) -> Self {
92 | Self::new(f64::from_bits(self.value.to_bits() >> count))
93 | }
94 | }
95 |
96 | impl PartialEq for f64x1 {
97 | #[inline(always)]
98 | fn eq(&self, other: &Self) -> bool {
99 | self.value == other.value
100 | }
101 | }
102 |
103 | impl SimdMask for f64x1 {
104 | #[inline(always)]
105 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self {
106 | if self.value.to_bits() != 0 {
107 | t
108 | } else {
109 | f
110 | }
111 | }
112 | }
113 |
114 | impl SimdVector for f64x1 {
115 | fn zero() -> Self {
116 | Self::splat(0.0)
117 | }
118 |
119 | fn one() -> Self {
120 | Self::splat(1.0)
121 | }
122 |
123 | fn indexed() -> Self {
124 | Self::splat(0.0)
125 | }
126 |
127 | #[inline(always)]
128 | fn min_value() -> Self {
129 | Self::splat(f64::MIN)
130 | }
131 |
132 | #[inline(always)]
133 | fn max_value() -> Self {
134 | Self::splat(f64::MAX)
135 | }
136 |
137 | #[inline(always)]
138 | fn min(self, other: Self) -> Self {
139 | Self::new(self.value.min(other.value))
140 | }
141 |
142 | #[inline(always)]
143 | fn max(self, other: Self) -> Self {
144 | Self::new(self.value.max(other.value))
145 | }
146 |
147 | #[inline(always)]
148 | fn min_element(self) -> Self::Element {
149 | self.value
150 | }
151 |
152 | #[inline(always)]
153 | fn max_element(self) -> Self::Element {
154 | self.value
155 | }
156 |
157 | #[inline(always)]
158 | fn eq(self, other: Self) -> Mask {
159 | Self::new(f64::from_bits(bool_to_u64(self.value == other.value)))
160 | }
161 |
162 | #[inline(always)]
163 | fn lt(self, other: Self) -> Mask {
164 | Self::new(f64::from_bits(bool_to_u64(self.value < other.value)))
165 | }
166 |
167 | #[inline(always)]
168 | fn le(self, other: Self) -> Mask {
169 | Self::new(f64::from_bits(bool_to_u64(self.value <= other.value)))
170 | }
171 |
172 | #[inline(always)]
173 | fn gt(self, other: Self) -> Mask {
174 | Self::new(f64::from_bits(bool_to_u64(self.value > other.value)))
175 | }
176 |
177 | #[inline(always)]
178 | fn ge(self, other: Self) -> Mask {
179 | Self::new(f64::from_bits(bool_to_u64(self.value >= other.value)))
180 | }
181 |
182 | #[inline(always)]
183 | unsafe fn _mm_add(self, rhs: Self) -> Self {
184 | Self::new(Add::add(self.value, rhs.value))
185 | }
186 |
187 | #[inline(always)]
188 | unsafe fn _mm_sub(self, rhs: Self) -> Self {
189 | Self::new(Sub::sub(self.value, rhs.value))
190 | }
191 |
192 | #[inline(always)]
193 | unsafe fn _mm_mul(self, rhs: Self) -> Self {
194 | Self::new(Mul::mul(self.value, rhs.value))
195 | }
196 |
197 | #[inline(always)]
198 | unsafe fn _mm_div(self, rhs: Self) -> Self {
199 | Self::new(Div::div(self.value, rhs.value))
200 | }
201 |
202 | #[inline(always)]
203 | unsafe fn _mm_rem(self, rhs: Self) -> Self {
204 | Self::new(Rem::rem(self.value, rhs.value))
205 | }
206 | }
207 |
208 | impl SimdIntoBits for f64x1 {
209 | fn into_bits(self) -> Vu64 {
210 | u64x1::new(self.value.to_bits())
211 | }
212 | }
213 |
214 | impl SimdFromBits for f64x1 {
215 | fn from_bits(bits: Vu64) -> Self {
216 | Self::new(f64::from_bits(bits.value))
217 | }
218 | }
219 |
220 | impl_ops!(@UNARY f64x1 Scalar => Not::not, Neg::neg);
221 | impl_ops!(@BINARY f64x1 Scalar => Add::add, Sub::sub, Mul::mul, Div::div, Rem::rem, BitAnd::bitand, BitOr::bitor, BitXor::bitxor);
222 | impl_ops!(@SHIFTS f64x1 Scalar => Shr::shr, Shl::shl);
223 |
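As with f32x1, everything mask- and bit-related for f64x1 has to stay in the 64-bit domain, since f64::to_bits and f64::from_bits work on u64; that is why the comparisons use bool_to_u64 and the bit conversions go through a u64-backed vector (the Vu64 and u64x1 names above are assumed by analogy with Vu32 and u32x1). A standalone sketch of that round trip, independent of the crate's types:

fn f64_mask_from_bool(cond: bool) -> f64 {
    // 64-bit all-ones or all-zeros mask, reinterpreted as f64
    let mask_bits = -(cond as i64) as u64;
    f64::from_bits(mask_bits)
}

fn f64_bits_roundtrip(x: f64) -> bool {
    // to_bits/from_bits reinterpret the same 64 bits, so this always holds
    f64::from_bits(x.to_bits()).to_bits() == x.to_bits()
}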
--------------------------------------------------------------------------------
/crates/thermite/src/backends/scalar/vi32.rs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raygon-renderer/thermite/55cbee2b3cb127e09749e711501114bbef3ed118/crates/thermite/src/backends/scalar/vi32.rs
--------------------------------------------------------------------------------
/crates/thermite/src/backends/scalar/vi64.rs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/raygon-renderer/thermite/55cbee2b3cb127e09749e711501114bbef3ed118/crates/thermite/src/backends/scalar/vi64.rs
--------------------------------------------------------------------------------
/crates/thermite/src/backends/scalar/vu32.rs:
--------------------------------------------------------------------------------
1 | use super::*;
2 |
3 | decl!(u32x1: u32 => u32);
4 | impl Default for u32x1 {
5 | #[inline(always)]
6 | fn default() -> Self {
7 | Self::new(0)
8 | }
9 | }
10 |
11 | impl SimdVectorBase for u32x1 {
12 | type Element = u32;
13 |
14 | #[inline(always)]
15 | fn splat(value: Self::Element) -> Self {
16 | Self::new(value)
17 | }
18 |
19 | #[inline(always)]
20 | unsafe fn undefined() -> Self {
21 | Self::new(0)
22 | }
23 |
24 | #[inline(always)]
25 | unsafe fn load_aligned_unchecked(src: *const Self::Element) -> Self {
26 | Self::new(*src)
27 | }
28 |
29 | #[inline(always)]
30 | unsafe fn load_unaligned_unchecked(src: *const Self::Element) -> Self {
31 | Self::new(src.read_unaligned())
32 | }
33 |
34 | #[inline(always)]
35 | unsafe fn store_aligned_unchecked(self, dst: *mut Self::Element) {
36 | *dst = self.value;
37 | }
38 |
39 | #[inline(always)]
40 | unsafe fn store_unaligned_unchecked(self, dst: *mut Self::Element) {
41 | dst.write_unaligned(self.value)
42 | }
43 |
44 | decl_base_common!(#[target_feature()] u32x1: u32 => u32);
45 | }
46 |
47 | impl SimdBitwise for u32x1 {
48 | const FULL_BITMASK: u16 = 1;
49 |
50 | fn bitmask(self) -> u16 {
51 | (self.value >> 31) as u16
52 | }
53 |
54 | unsafe fn _mm_not(self) -> Self {
55 | Self::new(!self.value)
56 | }
57 |
58 | unsafe fn _mm_bitand(self, rhs: Self) -> Self {
59 | Self::new(self.value & rhs.value)
60 | }
61 |
62 | unsafe fn _mm_bitor(self, rhs: Self) -> Self {
63 | Self::new(self.value | rhs.value)
64 | }
65 |
66 | unsafe fn _mm_bitxor(self, rhs: Self) -> Self {
67 | Self::new(self.value ^ rhs.value)
68 | }
69 |
70 | #[inline(always)]
71 | unsafe fn _mm_shr(self, count: Vu32) -> Self {
72 | Self::new(self.value >> count.value)
73 | }
74 |
75 | #[inline(always)]
76 | unsafe fn _mm_shl(self, count: Vu32) -> Self {
77 | Self::new(self.value << count.value)
78 | }
79 |
80 | #[inline(always)]
81 | unsafe fn _mm_shli(self, count: u32) -> Self {
82 | Self::new(self.value << count)
83 | }
84 |
85 | #[inline(always)]
86 | unsafe fn _mm_shri(self, count: u32) -> Self {
87 | Self::new(self.value >> count)
88 | }
89 | }
90 |
91 | impl PartialEq for u32x1 {
92 | #[inline(always)]
93 | fn eq(&self, other: &Self) -> bool {
94 | self.value == other.value
95 | }
96 | }
97 |
98 | impl Eq for u32x1 {}
99 |
100 | impl SimdMask for u32x1 {
101 | #[inline(always)]
102 | unsafe fn _mm_blendv(self, t: Self, f: Self) -> Self {
103 | if self.value != 0 {
104 | t
105 | } else {
106 | f
107 | }
108 | }
109 |
110 | #[inline(always)]
111 | unsafe fn _mm_all(self) -> bool {
112 | self._mm_any() // only one value
113 | }
114 |
115 | #[inline(always)]
116 | unsafe fn _mm_any(self) -> bool {
117 | self.value != 0
118 | }
119 |
120 | #[inline(always)]
121 | unsafe fn _mm_none(self) -> bool {
122 | self.value == 0
123 | }
124 | }
125 |
126 | impl SimdVector for u32x1 {
127 | fn zero() -> Self {
128 | Self::new(0)
129 | }
130 |
131 | fn one() -> Self {
132 | Self::new(1)
133 | }
134 |
135 | fn indexed() -> Self {
136 | Self::new(0)
137 | }
138 |
139 | #[inline(always)]
140 | fn min_value() -> Self {
141 | Self::splat(u32::MIN)
142 | }
143 |
144 | #[inline(always)]
145 | fn max_value() -> Self {
146 | Self::splat(u32::MAX)
147 | }
148 |
149 | #[inline(always)]
150 | fn min_element(self) -> Self::Element {
151 | self.value
152 | }
153 |
154 | #[inline(always)]
155 | fn max_element(self) -> Self::Element {
156 | self.value
157 | }
158 |
159 | #[inline(always)]
160 | fn eq(self, other: Self) -> Mask