├── .github └── workflows │ └── rust.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE_APACHE ├── LICENSE_MIT ├── README.md ├── benches ├── bench_f32.rs ├── bench_i16.rs ├── bench_i32.rs ├── bench_u16.rs ├── bench_u8.rs └── results └── src ├── generic.rs ├── lib.rs ├── simd ├── mod.rs ├── simd_f32.rs ├── simd_i16.rs ├── simd_i32.rs ├── simd_u16.rs └── simd_u8.rs └── task.rs /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run tests 22 | run: cargo test --verbose 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | .DS_Store 4 | .idea/ 5 | *.iml 6 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anes" 16 | version = "0.1.6" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" 19 | 20 | [[package]] 21 | name = "anstyle" 22 | version = "1.0.7" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" 25 | 26 | [[package]] 27 | name = "argmm" 28 | version = "0.1.2" 29 | dependencies = [ 30 | "criterion", 31 | "rand", 32 | "rand_distr", 33 | ] 34 | 35 | [[package]] 36 | name = "autocfg" 37 | version = "1.3.0" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" 40 | 41 | [[package]] 42 | name = "bumpalo" 43 | version = "3.16.0" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" 46 | 47 | [[package]] 48 | name = "cast" 49 | version = "0.3.0" 50 | source = "registry+https://github.com/rust-lang/crates.io-index" 51 | checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 52 | 53 | [[package]] 54 | name = "cfg-if" 55 | version = "1.0.0" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 58 | 59 | [[package]] 60 | name = "ciborium" 61 | version = "0.2.2" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" 64 | dependencies = [ 65 | "ciborium-io", 
66 | "ciborium-ll", 67 | "serde", 68 | ] 69 | 70 | [[package]] 71 | name = "ciborium-io" 72 | version = "0.2.2" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" 75 | 76 | [[package]] 77 | name = "ciborium-ll" 78 | version = "0.2.2" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" 81 | dependencies = [ 82 | "ciborium-io", 83 | "half", 84 | ] 85 | 86 | [[package]] 87 | name = "clap" 88 | version = "4.5.8" 89 | source = "registry+https://github.com/rust-lang/crates.io-index" 90 | checksum = "84b3edb18336f4df585bc9aa31dd99c036dfa5dc5e9a2939a722a188f3a8970d" 91 | dependencies = [ 92 | "clap_builder", 93 | ] 94 | 95 | [[package]] 96 | name = "clap_builder" 97 | version = "4.5.8" 98 | source = "registry+https://github.com/rust-lang/crates.io-index" 99 | checksum = "c1c09dd5ada6c6c78075d6fd0da3f90d8080651e2d6cc8eb2f1aaa4034ced708" 100 | dependencies = [ 101 | "anstyle", 102 | "clap_lex", 103 | ] 104 | 105 | [[package]] 106 | name = "clap_lex" 107 | version = "0.7.1" 108 | source = "registry+https://github.com/rust-lang/crates.io-index" 109 | checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" 110 | 111 | [[package]] 112 | name = "criterion" 113 | version = "0.5.1" 114 | source = "registry+https://github.com/rust-lang/crates.io-index" 115 | checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" 116 | dependencies = [ 117 | "anes", 118 | "cast", 119 | "ciborium", 120 | "clap", 121 | "criterion-plot", 122 | "is-terminal", 123 | "itertools", 124 | "num-traits", 125 | "once_cell", 126 | "oorandom", 127 | "plotters", 128 | "rayon", 129 | "regex", 130 | "serde", 131 | "serde_derive", 132 | "serde_json", 133 | "tinytemplate", 134 | "walkdir", 135 | ] 136 | 137 | [[package]] 138 | name = "criterion-plot" 139 | version = "0.5.0" 
140 | source = "registry+https://github.com/rust-lang/crates.io-index" 141 | checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" 142 | dependencies = [ 143 | "cast", 144 | "itertools", 145 | ] 146 | 147 | [[package]] 148 | name = "crossbeam-deque" 149 | version = "0.8.5" 150 | source = "registry+https://github.com/rust-lang/crates.io-index" 151 | checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" 152 | dependencies = [ 153 | "crossbeam-epoch", 154 | "crossbeam-utils", 155 | ] 156 | 157 | [[package]] 158 | name = "crossbeam-epoch" 159 | version = "0.9.18" 160 | source = "registry+https://github.com/rust-lang/crates.io-index" 161 | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" 162 | dependencies = [ 163 | "crossbeam-utils", 164 | ] 165 | 166 | [[package]] 167 | name = "crossbeam-utils" 168 | version = "0.8.20" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" 171 | 172 | [[package]] 173 | name = "crunchy" 174 | version = "0.2.2" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" 177 | 178 | [[package]] 179 | name = "either" 180 | version = "1.13.0" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" 183 | 184 | [[package]] 185 | name = "getrandom" 186 | version = "0.2.15" 187 | source = "registry+https://github.com/rust-lang/crates.io-index" 188 | checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" 189 | dependencies = [ 190 | "cfg-if", 191 | "libc", 192 | "wasi", 193 | ] 194 | 195 | [[package]] 196 | name = "half" 197 | version = "2.4.1" 198 | source = "registry+https://github.com/rust-lang/crates.io-index" 199 | checksum = 
"6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" 200 | dependencies = [ 201 | "cfg-if", 202 | "crunchy", 203 | ] 204 | 205 | [[package]] 206 | name = "hermit-abi" 207 | version = "0.3.9" 208 | source = "registry+https://github.com/rust-lang/crates.io-index" 209 | checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" 210 | 211 | [[package]] 212 | name = "is-terminal" 213 | version = "0.4.12" 214 | source = "registry+https://github.com/rust-lang/crates.io-index" 215 | checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" 216 | dependencies = [ 217 | "hermit-abi", 218 | "libc", 219 | "windows-sys", 220 | ] 221 | 222 | [[package]] 223 | name = "itertools" 224 | version = "0.10.5" 225 | source = "registry+https://github.com/rust-lang/crates.io-index" 226 | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" 227 | dependencies = [ 228 | "either", 229 | ] 230 | 231 | [[package]] 232 | name = "itoa" 233 | version = "1.0.11" 234 | source = "registry+https://github.com/rust-lang/crates.io-index" 235 | checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" 236 | 237 | [[package]] 238 | name = "js-sys" 239 | version = "0.3.69" 240 | source = "registry+https://github.com/rust-lang/crates.io-index" 241 | checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" 242 | dependencies = [ 243 | "wasm-bindgen", 244 | ] 245 | 246 | [[package]] 247 | name = "libc" 248 | version = "0.2.155" 249 | source = "registry+https://github.com/rust-lang/crates.io-index" 250 | checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" 251 | 252 | [[package]] 253 | name = "libm" 254 | version = "0.2.8" 255 | source = "registry+https://github.com/rust-lang/crates.io-index" 256 | checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" 257 | 258 | [[package]] 259 | name = "log" 260 | version = "0.4.22" 261 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 262 | checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" 263 | 264 | [[package]] 265 | name = "memchr" 266 | version = "2.7.4" 267 | source = "registry+https://github.com/rust-lang/crates.io-index" 268 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 269 | 270 | [[package]] 271 | name = "num-traits" 272 | version = "0.2.19" 273 | source = "registry+https://github.com/rust-lang/crates.io-index" 274 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 275 | dependencies = [ 276 | "autocfg", 277 | "libm", 278 | ] 279 | 280 | [[package]] 281 | name = "once_cell" 282 | version = "1.19.0" 283 | source = "registry+https://github.com/rust-lang/crates.io-index" 284 | checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" 285 | 286 | [[package]] 287 | name = "oorandom" 288 | version = "11.1.3" 289 | source = "registry+https://github.com/rust-lang/crates.io-index" 290 | checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" 291 | 292 | [[package]] 293 | name = "plotters" 294 | version = "0.3.6" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "a15b6eccb8484002195a3e44fe65a4ce8e93a625797a063735536fd59cb01cf3" 297 | dependencies = [ 298 | "num-traits", 299 | "plotters-backend", 300 | "plotters-svg", 301 | "wasm-bindgen", 302 | "web-sys", 303 | ] 304 | 305 | [[package]] 306 | name = "plotters-backend" 307 | version = "0.3.6" 308 | source = "registry+https://github.com/rust-lang/crates.io-index" 309 | checksum = "414cec62c6634ae900ea1c56128dfe87cf63e7caece0852ec76aba307cebadb7" 310 | 311 | [[package]] 312 | name = "plotters-svg" 313 | version = "0.3.6" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "81b30686a7d9c3e010b84284bdd26a29f2138574f52f5eb6f794fc0ad924e705" 316 | dependencies = [ 317 | "plotters-backend", 318 | 
] 319 | 320 | [[package]] 321 | name = "ppv-lite86" 322 | version = "0.2.17" 323 | source = "registry+https://github.com/rust-lang/crates.io-index" 324 | checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" 325 | 326 | [[package]] 327 | name = "proc-macro2" 328 | version = "1.0.86" 329 | source = "registry+https://github.com/rust-lang/crates.io-index" 330 | checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" 331 | dependencies = [ 332 | "unicode-ident", 333 | ] 334 | 335 | [[package]] 336 | name = "quote" 337 | version = "1.0.36" 338 | source = "registry+https://github.com/rust-lang/crates.io-index" 339 | checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" 340 | dependencies = [ 341 | "proc-macro2", 342 | ] 343 | 344 | [[package]] 345 | name = "rand" 346 | version = "0.8.5" 347 | source = "registry+https://github.com/rust-lang/crates.io-index" 348 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 349 | dependencies = [ 350 | "libc", 351 | "rand_chacha", 352 | "rand_core", 353 | ] 354 | 355 | [[package]] 356 | name = "rand_chacha" 357 | version = "0.3.1" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 360 | dependencies = [ 361 | "ppv-lite86", 362 | "rand_core", 363 | ] 364 | 365 | [[package]] 366 | name = "rand_core" 367 | version = "0.6.4" 368 | source = "registry+https://github.com/rust-lang/crates.io-index" 369 | checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" 370 | dependencies = [ 371 | "getrandom", 372 | ] 373 | 374 | [[package]] 375 | name = "rand_distr" 376 | version = "0.4.3" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" 379 | dependencies = [ 380 | "num-traits", 381 | "rand", 382 | ] 383 | 384 | [[package]] 
385 | name = "rayon" 386 | version = "1.10.0" 387 | source = "registry+https://github.com/rust-lang/crates.io-index" 388 | checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" 389 | dependencies = [ 390 | "either", 391 | "rayon-core", 392 | ] 393 | 394 | [[package]] 395 | name = "rayon-core" 396 | version = "1.12.1" 397 | source = "registry+https://github.com/rust-lang/crates.io-index" 398 | checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" 399 | dependencies = [ 400 | "crossbeam-deque", 401 | "crossbeam-utils", 402 | ] 403 | 404 | [[package]] 405 | name = "regex" 406 | version = "1.10.5" 407 | source = "registry+https://github.com/rust-lang/crates.io-index" 408 | checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" 409 | dependencies = [ 410 | "aho-corasick", 411 | "memchr", 412 | "regex-automata", 413 | "regex-syntax", 414 | ] 415 | 416 | [[package]] 417 | name = "regex-automata" 418 | version = "0.4.7" 419 | source = "registry+https://github.com/rust-lang/crates.io-index" 420 | checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" 421 | dependencies = [ 422 | "aho-corasick", 423 | "memchr", 424 | "regex-syntax", 425 | ] 426 | 427 | [[package]] 428 | name = "regex-syntax" 429 | version = "0.8.4" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" 432 | 433 | [[package]] 434 | name = "ryu" 435 | version = "1.0.18" 436 | source = "registry+https://github.com/rust-lang/crates.io-index" 437 | checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" 438 | 439 | [[package]] 440 | name = "same-file" 441 | version = "1.0.6" 442 | source = "registry+https://github.com/rust-lang/crates.io-index" 443 | checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 444 | dependencies = [ 445 | "winapi-util", 446 | ] 447 | 448 | 
[[package]] 449 | name = "serde" 450 | version = "1.0.203" 451 | source = "registry+https://github.com/rust-lang/crates.io-index" 452 | checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" 453 | dependencies = [ 454 | "serde_derive", 455 | ] 456 | 457 | [[package]] 458 | name = "serde_derive" 459 | version = "1.0.203" 460 | source = "registry+https://github.com/rust-lang/crates.io-index" 461 | checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" 462 | dependencies = [ 463 | "proc-macro2", 464 | "quote", 465 | "syn", 466 | ] 467 | 468 | [[package]] 469 | name = "serde_json" 470 | version = "1.0.120" 471 | source = "registry+https://github.com/rust-lang/crates.io-index" 472 | checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" 473 | dependencies = [ 474 | "itoa", 475 | "ryu", 476 | "serde", 477 | ] 478 | 479 | [[package]] 480 | name = "syn" 481 | version = "2.0.68" 482 | source = "registry+https://github.com/rust-lang/crates.io-index" 483 | checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" 484 | dependencies = [ 485 | "proc-macro2", 486 | "quote", 487 | "unicode-ident", 488 | ] 489 | 490 | [[package]] 491 | name = "tinytemplate" 492 | version = "1.2.1" 493 | source = "registry+https://github.com/rust-lang/crates.io-index" 494 | checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" 495 | dependencies = [ 496 | "serde", 497 | "serde_json", 498 | ] 499 | 500 | [[package]] 501 | name = "unicode-ident" 502 | version = "1.0.12" 503 | source = "registry+https://github.com/rust-lang/crates.io-index" 504 | checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" 505 | 506 | [[package]] 507 | name = "walkdir" 508 | version = "2.5.0" 509 | source = "registry+https://github.com/rust-lang/crates.io-index" 510 | checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" 511 | dependencies = [ 512 | "same-file", 513 
| "winapi-util", 514 | ] 515 | 516 | [[package]] 517 | name = "wasi" 518 | version = "0.11.0+wasi-snapshot-preview1" 519 | source = "registry+https://github.com/rust-lang/crates.io-index" 520 | checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" 521 | 522 | [[package]] 523 | name = "wasm-bindgen" 524 | version = "0.2.92" 525 | source = "registry+https://github.com/rust-lang/crates.io-index" 526 | checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" 527 | dependencies = [ 528 | "cfg-if", 529 | "wasm-bindgen-macro", 530 | ] 531 | 532 | [[package]] 533 | name = "wasm-bindgen-backend" 534 | version = "0.2.92" 535 | source = "registry+https://github.com/rust-lang/crates.io-index" 536 | checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" 537 | dependencies = [ 538 | "bumpalo", 539 | "log", 540 | "once_cell", 541 | "proc-macro2", 542 | "quote", 543 | "syn", 544 | "wasm-bindgen-shared", 545 | ] 546 | 547 | [[package]] 548 | name = "wasm-bindgen-macro" 549 | version = "0.2.92" 550 | source = "registry+https://github.com/rust-lang/crates.io-index" 551 | checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" 552 | dependencies = [ 553 | "quote", 554 | "wasm-bindgen-macro-support", 555 | ] 556 | 557 | [[package]] 558 | name = "wasm-bindgen-macro-support" 559 | version = "0.2.92" 560 | source = "registry+https://github.com/rust-lang/crates.io-index" 561 | checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" 562 | dependencies = [ 563 | "proc-macro2", 564 | "quote", 565 | "syn", 566 | "wasm-bindgen-backend", 567 | "wasm-bindgen-shared", 568 | ] 569 | 570 | [[package]] 571 | name = "wasm-bindgen-shared" 572 | version = "0.2.92" 573 | source = "registry+https://github.com/rust-lang/crates.io-index" 574 | checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" 575 | 576 | [[package]] 577 | name = "web-sys" 578 | version = "0.3.69" 579 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 580 | checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" 581 | dependencies = [ 582 | "js-sys", 583 | "wasm-bindgen", 584 | ] 585 | 586 | [[package]] 587 | name = "winapi-util" 588 | version = "0.1.8" 589 | source = "registry+https://github.com/rust-lang/crates.io-index" 590 | checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" 591 | dependencies = [ 592 | "windows-sys", 593 | ] 594 | 595 | [[package]] 596 | name = "windows-sys" 597 | version = "0.52.0" 598 | source = "registry+https://github.com/rust-lang/crates.io-index" 599 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" 600 | dependencies = [ 601 | "windows-targets", 602 | ] 603 | 604 | [[package]] 605 | name = "windows-targets" 606 | version = "0.52.5" 607 | source = "registry+https://github.com/rust-lang/crates.io-index" 608 | checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" 609 | dependencies = [ 610 | "windows_aarch64_gnullvm", 611 | "windows_aarch64_msvc", 612 | "windows_i686_gnu", 613 | "windows_i686_gnullvm", 614 | "windows_i686_msvc", 615 | "windows_x86_64_gnu", 616 | "windows_x86_64_gnullvm", 617 | "windows_x86_64_msvc", 618 | ] 619 | 620 | [[package]] 621 | name = "windows_aarch64_gnullvm" 622 | version = "0.52.5" 623 | source = "registry+https://github.com/rust-lang/crates.io-index" 624 | checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" 625 | 626 | [[package]] 627 | name = "windows_aarch64_msvc" 628 | version = "0.52.5" 629 | source = "registry+https://github.com/rust-lang/crates.io-index" 630 | checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" 631 | 632 | [[package]] 633 | name = "windows_i686_gnu" 634 | version = "0.52.5" 635 | source = "registry+https://github.com/rust-lang/crates.io-index" 636 | checksum = 
"88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" 637 | 638 | [[package]] 639 | name = "windows_i686_gnullvm" 640 | version = "0.52.5" 641 | source = "registry+https://github.com/rust-lang/crates.io-index" 642 | checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" 643 | 644 | [[package]] 645 | name = "windows_i686_msvc" 646 | version = "0.52.5" 647 | source = "registry+https://github.com/rust-lang/crates.io-index" 648 | checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" 649 | 650 | [[package]] 651 | name = "windows_x86_64_gnu" 652 | version = "0.52.5" 653 | source = "registry+https://github.com/rust-lang/crates.io-index" 654 | checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" 655 | 656 | [[package]] 657 | name = "windows_x86_64_gnullvm" 658 | version = "0.52.5" 659 | source = "registry+https://github.com/rust-lang/crates.io-index" 660 | checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" 661 | 662 | [[package]] 663 | name = "windows_x86_64_msvc" 664 | version = "0.52.5" 665 | source = "registry+https://github.com/rust-lang/crates.io-index" 666 | checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" 667 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "argmm" 3 | version = "0.1.2" 4 | authors = ["minimalrust "] 5 | edition = "2018" 6 | readme = "README.md" 7 | license = "MIT OR Apache-2.0" 8 | repository = "https://github.com/minimalrust/argmm.git" 9 | description = "Argmin and argmax with SIMD support for u8, u16, i16, i32 and f32" 10 | 11 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 12 | 13 | [dev-dependencies] 14 | criterion = "0.5.1" 15 | rand = "0.8.5" 16 | rand_distr = "0.4.3" 17 | 18 | [[bench]] 19 | name = 
"bench_f32" 20 | harness = false 21 | 22 | [[bench]] 23 | name = "bench_i32" 24 | harness = false 25 | 26 | [[bench]] 27 | name = "bench_i16" 28 | harness = false 29 | 30 | [[bench]] 31 | name = "bench_u16" 32 | harness = false 33 | 34 | [[bench]] 35 | name = "bench_u8" 36 | harness = false 37 | 38 | -------------------------------------------------------------------------------- /LICENSE_APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSE_MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Caleb Tusubira 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NOTICE 2 | 3 | The preferred package is now [argminmax](https://crates.io/crates/argminmax) which is actively maintained and far more feature-complete. This repo will be purely experimental from now on. A special thanks to [jvdd](https://github.com/jvdd) for building on the idea and taking it to the next step :clap:. 4 | 5 | # Argmm 6 | 7 | Argmin/max with SIMD support for u8, i16, u16, i32 and f32 arrays and vectors. 8 | 9 | ## Installing 10 | 11 | Add the following to your Cargo.toml: 12 | 13 | ``` 14 | argmm = "0.1.2" 15 | ``` 16 | 17 | ## Getting started 18 | 19 | You can use the extension trait, which will take advantage of SIMD if available: 20 | ```rust 21 | use argmm::ArgMinMax; 22 | 23 | fn main() { 24 | let v = vec![1., 3., -20., 50., -82., 9., -53., 60., 0.]; 25 | let min_index = v.argmin(); 26 | let max_index = v.argmax(); 27 | assert_eq!(min_index, Some(4)); 28 | assert_eq!(max_index, Some(7)); 29 | } 30 | ``` 31 | 32 | Alternatively, the generic functions can be used if you require non-SIMD support for other types: 33 | 34 | ```rust 35 | use argmm::generic::{simple_argmin, simple_argmax}; 36 | 37 | fn main() { 38 | let v = vec![1u64, 3, 20, 50, 82, 9, 53, 60, 0]; 39 | let min_index = simple_argmin(&v); 40 | let max_index = simple_argmax(&v); 41 | assert_eq!(min_index, 8); 42 | assert_eq!(max_index, 4); 43 | } 44 | ``` 45 | 46 | ## Benchmarks 47 | 48 | Using a MacBook Pro (Retina, 13-inch, Early 2015) Processor 2.7 GHz Dual-Core Intel Core i5 49 | with an array size of 512.
50 | 51 | See `/benches/results`. 52 | 53 | ## Warning 54 | 55 | NaN values are not supported. 56 | 57 | ## License 58 | 59 | Licensed under either of 60 | * Apache License, Version 2.0 ([LICENSE_APACHE](LICENSE_APACHE) or http://www.apache.org/licenses/LICENSE-2.0) 61 | * MIT license ([LICENSE_MIT](LICENSE_MIT) or http://opensource.org/licenses/MIT) 62 | at your option. 63 | -------------------------------------------------------------------------------- /benches/bench_f32.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | 4 | use rand::{thread_rng, Rng}; 5 | use rand_distr::Uniform; 6 | 7 | use argmm::ArgMinMax; 8 | use criterion::{black_box, Criterion}; 9 | 10 | fn get_array_f32() -> Vec { 11 | let rng = thread_rng(); 12 | let uni = Uniform::new(std::f32::MIN, std::f32::MAX); 13 | rng.sample_iter(uni).take(512).collect() 14 | } 15 | 16 | fn max_f32(c: &mut Criterion) { 17 | let data = get_array_f32(); 18 | c.bench_function("simple_argmax_f32", |b| { 19 | b.iter(|| argmm::generic::simple_argmax(black_box(data.as_slice()))) 20 | }); 21 | let data = get_array_f32(); 22 | c.bench_function("argmax_simd_f32", |b| { 23 | b.iter(|| black_box(data.as_slice().argmax())) 24 | }); 25 | } 26 | 27 | fn min_f32(c: &mut Criterion) { 28 | let data = get_array_f32(); 29 | c.bench_function("simple_argmin_f32", |b| { 30 | b.iter(|| argmm::generic::simple_argmin(black_box(data.as_slice()))) 31 | }); 32 | let data = get_array_f32(); 33 | c.bench_function("argmin_simd_f32", |b| { 34 | b.iter(|| black_box(data.as_slice().argmin())) 35 | }); 36 | } 37 | 38 | criterion_group!(benches, max_f32, min_f32); 39 | criterion_main!(benches); 40 | -------------------------------------------------------------------------------- /benches/bench_i16.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | 4 | use rand::{thread_rng, Rng}; 5 |
use rand_distr::Uniform; 6 | 7 | use argmm::ArgMinMax; 8 | use criterion::{black_box, Criterion}; 9 | 10 | fn get_array_i16() -> Vec { 11 | let rng = thread_rng(); 12 | let uni = Uniform::from(std::i16::MIN..std::i16::MAX); 13 | rng.sample_iter(uni).take(512).collect() 14 | } 15 | 16 | fn max_i16(c: &mut Criterion) { 17 | let data = get_array_i16(); 18 | c.bench_function("simple_argmax_i16", |b| { 19 | b.iter(|| argmm::generic::simple_argmax(black_box(data.as_slice()))) 20 | }); 21 | let data = get_array_i16(); 22 | c.bench_function("argmax_simd_i16", |b| { 23 | b.iter(|| black_box(data.as_slice().argmax())) 24 | }); 25 | } 26 | 27 | fn min_i16(c: &mut Criterion) { 28 | let data = get_array_i16(); 29 | c.bench_function("simple_argmin_i16", |b| { 30 | b.iter(|| argmm::generic::simple_argmin(black_box(data.as_slice()))) 31 | }); 32 | let data = get_array_i16(); 33 | c.bench_function("argmin_simd_i16", |b| { 34 | b.iter(|| black_box(data.as_slice().argmin())) 35 | }); 36 | } 37 | 38 | criterion_group!(benches, max_i16, min_i16); 39 | criterion_main!(benches); 40 | -------------------------------------------------------------------------------- /benches/bench_i32.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | 4 | use rand::{thread_rng, Rng}; 5 | use rand_distr::Uniform; 6 | 7 | use argmm::ArgMinMax; 8 | use criterion::{black_box, Criterion}; 9 | 10 | fn get_array_i32() -> Vec { 11 | let rng = thread_rng(); 12 | let uni = Uniform::new(std::i32::MIN, std::i32::MAX); 13 | rng.sample_iter(uni).take(512).collect() 14 | } 15 | 16 | fn max_i32(c: &mut Criterion) { 17 | let data = get_array_i32(); 18 | c.bench_function("simple_argmax_i32", |b| { 19 | b.iter(|| argmm::generic::simple_argmax(black_box(data.as_slice()))) 20 | }); 21 | let data = get_array_i32(); 22 | c.bench_function("argmax_simd_i32", |b| { 23 | b.iter(|| black_box(data.as_slice().argmax())) 24 | }); 25 | } 26 | 27 | fn 
min_i32(c: &mut Criterion) { 28 | let data = get_array_i32(); 29 | c.bench_function("simple_argmin_i32", |b| { 30 | b.iter(|| argmm::generic::simple_argmin(black_box(data.as_slice()))) 31 | }); 32 | let data = get_array_i32(); 33 | c.bench_function("argmin_simd_i32", |b| { 34 | b.iter(|| black_box(data.as_slice().argmin())) 35 | }); 36 | } 37 | 38 | criterion_group!(benches, max_i32, min_i32); 39 | criterion_main!(benches); 40 | -------------------------------------------------------------------------------- /benches/bench_u16.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate criterion; 3 | 4 | use rand::{thread_rng, Rng}; 5 | use rand_distr::Uniform; 6 | 7 | use argmm::ArgMinMax; 8 | use criterion::{black_box, Criterion}; 9 | 10 | fn get_array_u16() -> Vec { 11 | let rng = thread_rng(); 12 | let uni = Uniform::from(std::u16::MIN..std::u16::MAX); 13 | rng.sample_iter(uni).take(512).collect() 14 | } 15 | 16 | fn max_u16(c: &mut Criterion) { 17 | let data = get_array_u16(); 18 | c.bench_function("simple_argmax_u16", |b| { 19 | b.iter(|| argmm::generic::simple_argmax(black_box(data.as_slice()))) 20 | }); 21 | let data = get_array_u16(); 22 | c.bench_function("argmax_simd_u16", |b| { 23 | b.iter(|| black_box(data.as_slice().argmax())) 24 | }); 25 | } 26 | 27 | fn min_u16(c: &mut Criterion) { 28 | let data = get_array_u16(); 29 | c.bench_function("simple_argmin_u16", |b| { 30 | b.iter(|| argmm::generic::simple_argmin(black_box(data.as_slice()))) 31 | }); 32 | let data = get_array_u16(); 33 | c.bench_function("argmin_simd_u16", |b| { 34 | b.iter(|| black_box(data.as_slice().argmin())) 35 | }); 36 | } 37 | 38 | criterion_group!(benches, max_u16, min_u16); 39 | criterion_main!(benches); 40 | -------------------------------------------------------------------------------- /benches/bench_u8.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate 
criterion; 3 | 4 | use rand::{thread_rng, Rng}; 5 | use rand_distr::Uniform; 6 | 7 | use argmm::ArgMinMax; 8 | use criterion::{black_box, Criterion}; 9 | 10 | fn get_array_u8() -> Vec { 11 | let rng = thread_rng(); 12 | let uni = Uniform::from(std::u8::MIN..std::u8::MAX); 13 | rng.sample_iter(uni).take(512).collect() 14 | } 15 | 16 | fn max_u8(c: &mut Criterion) { 17 | let data = get_array_u8(); 18 | c.bench_function("simple_argmax_u8", |b| { 19 | b.iter(|| argmm::generic::simple_argmax(black_box(data.as_slice()))) 20 | }); 21 | let data = get_array_u8(); 22 | c.bench_function("argmax_simd_u8", |b| { 23 | b.iter(|| black_box(data.as_slice().argmax())) 24 | }); 25 | } 26 | 27 | fn min_u8(c: &mut Criterion) { 28 | let data = get_array_u8(); 29 | c.bench_function("simple_argmin_u8", |b| { 30 | b.iter(|| argmm::generic::simple_argmin(black_box(data.as_slice()))) 31 | }); 32 | let data = get_array_u8(); 33 | c.bench_function("argmin_simd_u8", |b| { 34 | b.iter(|| black_box(data.as_slice().argmin())) 35 | }); 36 | } 37 | 38 | criterion_group!(benches, max_u8, min_u8); 39 | criterion_main!(benches); 40 | -------------------------------------------------------------------------------- /benches/results: -------------------------------------------------------------------------------- 1 | simple_argmax_f32 time: [553.67 ns 555.29 ns 556.94 ns] 2 | argmax_simd_f32 time: [157.89 ns 158.02 ns 158.17 ns] 3 | simple_argmin_f32 time: [546.43 ns 546.72 ns 547.04 ns] 4 | argmin_simd_f32 time: [157.22 ns 157.38 ns 157.55 ns] 5 | 6 | simple_argmax_i16 time: [263.69 ns 264.42 ns 265.22 ns] 7 | argmax_simd_i16 time: [102.13 ns 102.69 ns 103.57 ns] 8 | simple_argmin_i16 time: [261.35 ns 261.56 ns 261.80 ns] 9 | argmin_simd_i16 time: [104.94 ns 105.14 ns 105.41 ns] 10 | 11 | simple_argmax_i32 time: [253.35 ns 253.64 ns 253.99 ns] 12 | argmax_simd_i32 time: [191.76 ns 191.92 ns 192.09 ns] 13 | simple_argmin_i32 time: [262.45 ns 262.62 ns 262.81 ns] 14 | argmin_simd_i32 time: [208.83 ns 
209.06 ns 209.33 ns] 15 | 16 | simple_argmax_u16 time: [298.31 ns 298.50 ns 298.71 ns] 17 | argmax_simd_u16 time: [207.60 ns 207.79 ns 208.01 ns] 18 | simple_argmin_u16 time: [308.87 ns 309.44 ns 310.00 ns] 19 | argmin_simd_u16 time: [208.35 ns 208.72 ns 209.15 ns] 20 | 21 | simple_argmax_u8 time: [245.48 ns 245.67 ns 245.87 ns] 22 | argmax_simd_u8 time: [115.12 ns 115.21 ns 115.31 ns] 23 | simple_argmin_u8 time: [241.40 ns 241.67 ns 242.01 ns] 24 | argmin_simd_u8 time: [115.07 ns 115.18 ns 115.28 ns] 25 | -------------------------------------------------------------------------------- /src/generic.rs: -------------------------------------------------------------------------------- 1 | #[inline] 2 | pub fn simple_argmin(arr: &[T]) -> usize { 3 | let mut low_index = 0usize; 4 | let mut low = arr[low_index]; 5 | for (i, item) in arr.iter().enumerate() { 6 | if *item < low { 7 | low = *item; 8 | low_index = i; 9 | } 10 | } 11 | low_index 12 | } 13 | 14 | #[inline] 15 | pub fn simple_argmax(arr: &[T]) -> usize { 16 | let mut high_index = 0usize; 17 | let mut high = arr[high_index]; 18 | for (i, item) in arr.iter().enumerate() { 19 | if *item > high { 20 | high = *item; 21 | high_index = i; 22 | } 23 | } 24 | high_index 25 | } 26 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod generic; 2 | #[cfg(target_feature = "sse")] 3 | mod simd; 4 | #[cfg(target_feature = "sse")] 5 | mod task; 6 | 7 | #[cfg(not(target_feature = "sse"))] 8 | pub use generic::{simple_argmax, simple_argmin}; 9 | #[cfg(target_feature = "sse")] 10 | pub use simd::{simd_f32, simd_i16, simd_i32, simd_u16, simd_u8}; 11 | 12 | pub trait ArgMinMax { 13 | fn argmin(&self) -> Option; 14 | fn argmax(&self) -> Option; 15 | } 16 | 17 | macro_rules! 
impl_argmm_f32 { 18 | ($($b:ty),*) => { 19 | $(impl ArgMinMax for $b { 20 | 21 | fn argmin(&self) -> Option { 22 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmin(self)); 23 | #[cfg(target_feature = "sse")] return simd_f32::argmin_f32(self); 24 | } 25 | 26 | fn argmax(&self) -> Option { 27 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmax(self)); 28 | #[cfg(target_feature = "sse")] return simd_f32::argmax_f32(self); 29 | } 30 | })* 31 | } 32 | } 33 | 34 | macro_rules! impl_argmm_i32 { 35 | ($($b:ty),*) => { 36 | $(impl ArgMinMax for $b { 37 | 38 | fn argmin(&self) -> Option { 39 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmin(self)); 40 | #[cfg(target_feature = "sse")] return simd_i32::argmin_i32(self); 41 | } 42 | 43 | fn argmax(&self) -> Option { 44 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmax(self)); 45 | #[cfg(target_feature = "sse")] return simd_i32::argmax_i32(self); 46 | } 47 | })* 48 | } 49 | } 50 | 51 | macro_rules! impl_argmm_i16 { 52 | ($($b:ty),*) => { 53 | $(impl ArgMinMax for $b { 54 | 55 | fn argmin(&self) -> Option { 56 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmin(self)); 57 | #[cfg(target_feature = "sse")] return simd_i16::argmin_i16(self); 58 | } 59 | 60 | fn argmax(&self) -> Option { 61 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmax(self)); 62 | #[cfg(target_feature = "sse")] return simd_i16::argmax_i16(self); 63 | } 64 | })* 65 | } 66 | } 67 | 68 | macro_rules! 
impl_argmm_u16 { 69 | ($($b:ty),*) => { 70 | $(impl ArgMinMax for $b { 71 | 72 | fn argmin(&self) -> Option { 73 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmin(self)); 74 | #[cfg(target_feature = "sse")] return simd_u16::argmin_u16(self); 75 | } 76 | 77 | fn argmax(&self) -> Option { 78 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmax(self)); 79 | #[cfg(target_feature = "sse")] return simd_u16::argmax_u16(self); 80 | } 81 | })* 82 | } 83 | } 84 | 85 | macro_rules! impl_argmm_u8 { 86 | ($($b:ty),*) => { 87 | $(impl ArgMinMax for $b { 88 | 89 | fn argmin(&self) -> Option { 90 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmin(self)); 91 | #[cfg(target_feature = "sse")] return simd_u8::argmin_u8(self) ; 92 | } 93 | 94 | fn argmax(&self) -> Option { 95 | #[cfg(not(target_feature = "sse"))] return Some(simple_argmax(self)); 96 | #[cfg(target_feature = "sse")] return simd_u8::argmax_u8(self); 97 | } 98 | })* 99 | } 100 | } 101 | 102 | impl_argmm_f32!(Vec, &[f32], [f32]); 103 | impl_argmm_i32!(Vec, &[i32], [i32]); 104 | impl_argmm_i16!(Vec, &[i16], [i16]); 105 | impl_argmm_u16!(Vec, &[u16], [u16]); 106 | impl_argmm_u8!(Vec, &[u8], [u8]); 107 | -------------------------------------------------------------------------------- /src/simd/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod simd_f32; 2 | pub use simd_f32::*; 3 | pub mod simd_i32; 4 | pub use simd_i32::*; 5 | pub mod simd_i16; 6 | pub use simd_i16::*; 7 | pub mod simd_u16; 8 | pub use simd_u16::*; 9 | pub mod simd_u8; 10 | pub use simd_u8::*; 11 | -------------------------------------------------------------------------------- /src/simd/simd_f32.rs: -------------------------------------------------------------------------------- 1 | use crate::generic::{simple_argmax, simple_argmin}; 2 | use crate::task::{find_final_index_max, find_final_index_min, split_array}; 3 | use std::arch::x86_64::*; 4 | 5 | pub fn 
argmin_f32(arr: &[f32]) -> Option { 6 | match split_array(arr, 4) { 7 | (Some(rem), Some(sim)) => { 8 | let rem_min_index = simple_argmin(rem); 9 | let rem_result = (rem[rem_min_index], rem_min_index); 10 | let sim_result = unsafe { core_argmin(sim, rem.len()) }; 11 | find_final_index_min(rem_result, sim_result) 12 | } 13 | (Some(rem), None) => Some(simple_argmin(rem)), 14 | (None, Some(sim)) => { 15 | let sim_result = unsafe { core_argmin(sim, 0) }; 16 | Some(sim_result.1) 17 | } 18 | (None, None) => None, 19 | } 20 | } 21 | 22 | unsafe fn core_argmin(sim_arr: &[f32], rem_offset: usize) -> (f32, usize) { 23 | let offset = _mm_set1_ps(rem_offset as f32); 24 | let mut index_low = _mm_add_ps(_mm_set_ps(3.0, 2.0, 1.0, 0.0), offset); 25 | 26 | let increment = _mm_set1_ps(4.0); 27 | let mut new_index_low = index_low; 28 | 29 | let mut values_low = _mm_loadu_ps(sim_arr.as_ptr() as *const f32); 30 | 31 | sim_arr.chunks_exact(4).skip(1).for_each(|step| { 32 | new_index_low = _mm_add_ps(new_index_low, increment); 33 | 34 | let new_values = _mm_loadu_ps(step.as_ptr() as *const f32); 35 | let lt_mask = _mm_cmplt_ps(new_values, values_low); 36 | 37 | values_low = _mm_min_ps(new_values, values_low); 38 | index_low = _mm_or_ps( 39 | _mm_and_ps(new_index_low, lt_mask), 40 | _mm_andnot_ps(lt_mask, index_low), 41 | ); 42 | }); 43 | 44 | let highpack = _mm_unpackhi_ps(values_low, values_low); 45 | let lowpack = _mm_unpacklo_ps(values_low, values_low); 46 | let lowest = _mm_min_ps(highpack, lowpack); 47 | 48 | let highpack = _mm_unpackhi_ps(lowest, lowest); 49 | let lowpack = _mm_unpacklo_ps(lowest, lowest); 50 | let lowest = _mm_min_ps(highpack, lowpack); 51 | 52 | let low_mask = _mm_cmpeq_ps(lowest, values_low); 53 | 54 | index_low = _mm_or_ps( 55 | _mm_and_ps(index_low, low_mask), 56 | _mm_andnot_ps(low_mask, _mm_set1_ps(std::f32::MAX)), 57 | ); 58 | 59 | let value_array = std::mem::transmute::<__m128, [f32; 4]>(values_low); 60 | let index_array = std::mem::transmute::<__m128, 
[f32; 4]>(index_low); 61 | 62 | let min_index = simple_argmin(&index_array); 63 | let value = *value_array.get_unchecked(min_index); 64 | let index = *index_array.get_unchecked(min_index); 65 | 66 | (value, index as usize) 67 | } 68 | 69 | pub fn argmax_f32(arr: &[f32]) -> Option { 70 | match split_array(arr, 4) { 71 | (Some(rem), Some(sim)) => { 72 | let rem_min_index = simple_argmax(rem); 73 | let rem_result = (rem[rem_min_index], rem_min_index); 74 | let sim_result = unsafe { core_argmax(sim, rem.len()) }; 75 | find_final_index_max(rem_result, sim_result) 76 | } 77 | (Some(rem), None) => Some(simple_argmax(rem)), 78 | (None, Some(sim)) => { 79 | let sim_result = unsafe { core_argmax(sim, 0) }; 80 | Some(sim_result.1) 81 | } 82 | (None, None) => None, 83 | } 84 | } 85 | 86 | unsafe fn core_argmax(sim_arr: &[f32], rem_offset: usize) -> (f32, usize) { 87 | let offset = _mm_set1_ps(rem_offset as f32); 88 | let mut index_high = _mm_add_ps(_mm_set_ps(3.0, 2.0, 1.0, 0.0), offset); 89 | let mut new_index_high = index_high; 90 | 91 | let increment = _mm_set1_ps(4.0); 92 | 93 | let mut values_high = _mm_loadu_ps(sim_arr.as_ptr() as *const f32); 94 | 95 | sim_arr.chunks_exact(4).skip(1).for_each(|step| { 96 | new_index_high = _mm_add_ps(new_index_high, increment); 97 | 98 | let new_values = _mm_loadu_ps(step.as_ptr() as *const f32); 99 | let gt_mask = _mm_cmpgt_ps(new_values, values_high); 100 | 101 | values_high = _mm_max_ps(new_values, values_high); 102 | index_high = _mm_or_ps( 103 | _mm_and_ps(new_index_high, gt_mask), 104 | _mm_andnot_ps(gt_mask, index_high), 105 | ); 106 | }); 107 | 108 | let highpack = _mm_unpackhi_ps(values_high, values_high); 109 | let lowpack = _mm_unpacklo_ps(values_high, values_high); 110 | let highest = _mm_max_ps(highpack, lowpack); 111 | 112 | let highpack = _mm_unpackhi_ps(highest, highest); 113 | let lowpack = _mm_unpacklo_ps(highest, highest); 114 | let highest = _mm_max_ps(highpack, lowpack); 115 | 116 | let high_mask = 
_mm_cmpeq_ps(highest, values_high); 117 | 118 | index_high = _mm_or_ps( 119 | _mm_and_ps(index_high, high_mask), 120 | _mm_andnot_ps(high_mask, _mm_set1_ps(std::f32::MAX)), 121 | ); 122 | 123 | let value_array = std::mem::transmute::<__m128, [f32; 4]>(values_high); 124 | let index_array = std::mem::transmute::<__m128, [f32; 4]>(index_high); 125 | 126 | let max_index = simple_argmin(&index_array); 127 | let value = *value_array.get_unchecked(max_index); 128 | let index = *index_array.get_unchecked(max_index); 129 | 130 | (value, index as usize) 131 | } 132 | 133 | #[cfg(test)] 134 | mod tests { 135 | use super::{argmax_f32, argmin_f32, simple_argmax, simple_argmin}; 136 | use rand::{thread_rng, Rng}; 137 | use rand_distr::Uniform; 138 | 139 | fn get_array_f32(n: usize) -> Vec { 140 | let rng = thread_rng(); 141 | let uni = Uniform::new_inclusive(std::f32::MIN, std::f32::MAX); 142 | rng.sample_iter(uni).take(n).collect() 143 | } 144 | 145 | #[test] 146 | fn test_both_versions_return_the_same_results() { 147 | let data = get_array_f32(1025); 148 | assert_eq!(data.len() % 4, 1); 149 | 150 | let min_index = argmin_f32(&data).unwrap(); 151 | let max_index = argmax_f32(&data).unwrap(); 152 | let argmin_index = simple_argmin(&data); 153 | let argmax_index = simple_argmax(&data); 154 | 155 | assert_eq!(argmin_index, min_index); 156 | assert_eq!(argmax_index, max_index); 157 | } 158 | 159 | #[test] 160 | fn test_first_index_is_returned_when_identical_values_found() { 161 | let data = [ 162 | 10., 163 | std::f32::MAX, 164 | 6., 165 | std::f32::NEG_INFINITY, 166 | std::f32::NEG_INFINITY, 167 | std::f32::MAX, 168 | 10_000.0, 169 | ]; 170 | let argmin_index = simple_argmin(&data); 171 | let argmin_simd_index = argmin_f32(&data).unwrap(); 172 | assert_eq!(argmin_index, argmin_simd_index); 173 | assert_eq!(argmin_index, 3); 174 | 175 | let argmax_index = simple_argmax(&data); 176 | let argmax_simd_index = argmax_f32(&data).unwrap(); 177 | assert_eq!(argmax_index, 
argmax_simd_index); 178 | assert_eq!(argmax_index, 1); 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /src/simd/simd_i16.rs: -------------------------------------------------------------------------------- 1 | use crate::generic::{simple_argmax, simple_argmin}; 2 | use crate::task::{find_final_index_max, find_final_index_min, split_array}; 3 | use std::arch::x86_64::*; 4 | 5 | pub fn argmin_i16(arr: &[i16]) -> Option { 6 | let n = arr.len(); 7 | let mut simd_func: unsafe fn(&[i16], usize) -> (i16, usize) = core_argmin; 8 | let mut mod_size = 4; 9 | 10 | if (17..std::i16::MAX).contains(&(n as i16)) { 11 | simd_func = core_argmin_ext; 12 | mod_size = 8; 13 | }; 14 | 15 | match split_array(arr, mod_size) { 16 | (Some(rem), Some(sim)) => { 17 | let rem_min_index = simple_argmin(rem); 18 | let rem_result = (rem[rem_min_index], rem_min_index); 19 | let sim_result = unsafe { simd_func(sim, rem.len()) }; 20 | find_final_index_min(rem_result, sim_result) 21 | } 22 | (Some(rem), None) => Some(simple_argmin(rem)), 23 | (None, Some(sim)) => { 24 | let sim_result = unsafe { simd_func(sim, 0) }; 25 | Some(sim_result.1) 26 | } 27 | (None, None) => None, 28 | } 29 | } 30 | 31 | unsafe fn core_argmin(sim_arr: &[i16], rem_offset: usize) -> (i16, usize) { 32 | let offset = _mm_set1_epi32(rem_offset as i32); 33 | let mut index_low = _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), offset); 34 | 35 | let increment = _mm_set1_epi32(4); 36 | let mut new_index_low = index_low; 37 | 38 | let mut values_low = _mm_set_epi32( 39 | sim_arr[3] as i32, 40 | sim_arr[2] as i32, 41 | sim_arr[1] as i32, 42 | sim_arr[0] as i32, 43 | ); 44 | 45 | sim_arr.chunks_exact(4).skip(1).for_each(|step| { 46 | new_index_low = _mm_add_epi32(new_index_low, increment); 47 | 48 | let new_values = _mm_set_epi32( 49 | step[3] as i32, 50 | step[2] as i32, 51 | step[1] as i32, 52 | step[0] as i32, 53 | ); 54 | 55 | let lt_mask = _mm_cmplt_epi32(new_values, values_low); 56 
| 57 | values_low = _mm_or_si128( 58 | _mm_and_si128(new_values, lt_mask), 59 | _mm_andnot_si128(lt_mask, values_low), 60 | ); 61 | index_low = _mm_or_si128( 62 | _mm_and_si128(new_index_low, lt_mask), 63 | _mm_andnot_si128(lt_mask, index_low), 64 | ); 65 | }); 66 | 67 | let highpack = _mm_unpackhi_epi32(values_low, values_low); 68 | let lowpack = _mm_unpacklo_epi32(values_low, values_low); 69 | let lowest = _mm_min_epi32(highpack, lowpack); 70 | 71 | let highpack = _mm_unpackhi_epi32(lowest, lowest); 72 | let lowpack = _mm_unpacklo_epi32(lowest, lowest); 73 | let lowest = _mm_min_epi32(highpack, lowpack); 74 | 75 | let low_mask = _mm_cmpeq_epi32(lowest, values_low); 76 | 77 | index_low = _mm_or_si128( 78 | _mm_and_si128(index_low, low_mask), 79 | _mm_andnot_si128(low_mask, _mm_set1_epi32(std::i32::MAX)), 80 | ); 81 | 82 | let value_array = std::mem::transmute::<__m128i, [u32; 4]>(values_low); 83 | let index_array = std::mem::transmute::<__m128i, [i32; 4]>(index_low); 84 | 85 | let min_index = simple_argmin(&index_array); 86 | let value = *value_array.get_unchecked(min_index); 87 | let index = *index_array.get_unchecked(min_index); 88 | 89 | (value as i16, index as usize) 90 | } 91 | 92 | unsafe fn core_argmin_ext(sim_arr: &[i16], rem_offset: usize) -> (i16, usize) { 93 | let offset = _mm_set1_epi16(rem_offset as i16); 94 | let mut index_low = _mm_add_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), offset); 95 | 96 | let increment = _mm_set1_epi16(8); 97 | let mut new_index_low = index_low; 98 | 99 | let mut values_low = _mm_set_epi16( 100 | sim_arr[7], sim_arr[6], sim_arr[5], sim_arr[4], sim_arr[3], sim_arr[2], sim_arr[1], 101 | sim_arr[0], 102 | ); 103 | 104 | sim_arr.chunks_exact(8).skip(1).for_each(|step| { 105 | new_index_low = _mm_add_epi16(new_index_low, increment); 106 | 107 | let new_values = _mm_set_epi16( 108 | step[7], step[6], step[5], step[4], step[3], step[2], step[1], step[0], 109 | ); 110 | 111 | let lt_mask = _mm_cmplt_epi16(new_values, values_low); 
112 | 113 | values_low = _mm_or_si128( 114 | _mm_and_si128(new_values, lt_mask), 115 | _mm_andnot_si128(lt_mask, values_low), 116 | ); 117 | index_low = _mm_or_si128( 118 | _mm_and_si128(new_index_low, lt_mask), 119 | _mm_andnot_si128(lt_mask, index_low), 120 | ); 121 | }); 122 | 123 | let highpack = _mm_unpackhi_epi16(values_low, values_low); 124 | let lowpack = _mm_unpacklo_epi16(values_low, values_low); 125 | let lowest = _mm_min_epi16(highpack, lowpack); 126 | 127 | let highpack = _mm_unpackhi_epi16(lowest, lowest); 128 | let lowpack = _mm_unpacklo_epi16(lowest, lowest); 129 | let lowest = _mm_min_epi16(highpack, lowpack); 130 | 131 | let highpack = _mm_unpackhi_epi16(lowest, lowest); 132 | let lowpack = _mm_unpacklo_epi16(lowest, lowest); 133 | let lowest = _mm_min_epi16(highpack, lowpack); 134 | 135 | let low_mask = _mm_cmpeq_epi16(lowest, values_low); 136 | 137 | index_low = _mm_or_si128( 138 | _mm_and_si128(index_low, low_mask), 139 | _mm_andnot_si128(low_mask, _mm_set1_epi16(std::i16::MAX)), 140 | ); 141 | 142 | let value_array = std::mem::transmute::<__m128i, [i16; 8]>(values_low); 143 | let index_array = std::mem::transmute::<__m128i, [i16; 8]>(index_low); 144 | 145 | let min_index = simple_argmin(&index_array); 146 | let value = *value_array.get_unchecked(min_index); 147 | let index = *index_array.get_unchecked(min_index); 148 | 149 | (value, index as usize) 150 | } 151 | 152 | pub fn argmax_i16(arr: &[i16]) -> Option { 153 | let n = arr.len(); 154 | let mut simd_func: unsafe fn(&[i16], usize) -> (i16, usize) = core_argmax; 155 | let mut mod_size = 4; 156 | 157 | if (17..=std::i16::MAX).contains(&(n as i16)) { 158 | simd_func = core_argmax_ext; 159 | mod_size = 8; 160 | }; 161 | 162 | match split_array(arr, mod_size) { 163 | (Some(rem), Some(sim)) => { 164 | let rem_min_index = simple_argmax(rem); 165 | let rem_result = (rem[rem_min_index], rem_min_index); 166 | let sim_result = unsafe { simd_func(sim, rem.len()) }; 167 | 
find_final_index_max(rem_result, sim_result) 168 | } 169 | (Some(rem), None) => Some(simple_argmax(rem)), 170 | (None, Some(sim)) => { 171 | let sim_result = unsafe { simd_func(sim, 0) }; 172 | Some(sim_result.1) 173 | } 174 | (None, None) => None, 175 | } 176 | } 177 | 178 | unsafe fn core_argmax(sim_arr: &[i16], rem_offset: usize) -> (i16, usize) { 179 | let offset = _mm_set1_epi32(rem_offset as i32); 180 | let mut index_high = _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), offset); 181 | let mut new_index_high = index_high; 182 | 183 | let increment = _mm_set1_epi32(4); 184 | 185 | let mut values_high = _mm_set_epi32( 186 | sim_arr[3] as i32, 187 | sim_arr[2] as i32, 188 | sim_arr[1] as i32, 189 | sim_arr[0] as i32, 190 | ); 191 | 192 | sim_arr.chunks_exact(4).skip(1).for_each(|step| { 193 | new_index_high = _mm_add_epi32(new_index_high, increment); 194 | 195 | let new_values = _mm_set_epi32( 196 | step[3] as i32, 197 | step[2] as i32, 198 | step[1] as i32, 199 | step[0] as i32, 200 | ); 201 | 202 | let gt_mask = _mm_cmpgt_epi32(new_values, values_high); 203 | 204 | values_high = _mm_or_si128( 205 | _mm_and_si128(new_values, gt_mask), 206 | _mm_andnot_si128(gt_mask, values_high), 207 | ); 208 | index_high = _mm_or_si128( 209 | _mm_and_si128(new_index_high, gt_mask), 210 | _mm_andnot_si128(gt_mask, index_high), 211 | ); 212 | }); 213 | 214 | let highpack = _mm_unpackhi_epi32(values_high, values_high); 215 | let lowpack = _mm_unpacklo_epi32(values_high, values_high); 216 | let highest = _mm_max_epi32(highpack, lowpack); 217 | 218 | let highpack = _mm_unpackhi_epi32(highest, highest); 219 | let lowpack = _mm_unpacklo_epi32(highest, highest); 220 | let highest = _mm_max_epi32(highpack, lowpack); 221 | 222 | let high_mask = _mm_cmpeq_epi32(highest, values_high); 223 | 224 | index_high = _mm_or_si128( 225 | _mm_and_si128(index_high, high_mask), 226 | _mm_andnot_si128(high_mask, _mm_set1_epi32(std::i32::MAX)), 227 | ); 228 | 229 | let value_array = 
/// Eight-lane SSE2 arg-max kernel for `i16` slices.
///
/// `sim_arr` is consumed in chunks of eight (trailing elements that do not
/// fill a chunk are ignored). `rem_offset` is added to every position so the
/// returned index is relative to the caller's full slice.
///
/// Returns `(max_value, index_of_first_max)`.
///
/// Positions are tracked in 16-bit lanes, so `rem_offset + sim_arr.len()` must
/// not exceed `i16::MAX`; larger inputs produce wrapped indices.
///
/// # Safety
/// Only SSE2 intrinsics are used and every load is taken from a full
/// eight-element chunk. The function stays `unsafe` so existing call sites
/// keep compiling unchanged.
///
/// # Panics
/// Panics when `sim_arr` holds fewer than eight elements.
unsafe fn core_argmax_ext(sim_arr: &[i16], rem_offset: usize) -> (i16, usize) {
    let mut chunks = sim_arr.chunks_exact(8);
    let first = chunks
        .next()
        .expect("core_argmax_ext needs at least 8 elements");

    let offset = _mm_set1_epi16(rem_offset as i16);
    let mut index_high = _mm_add_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), offset);
    let mut new_index_high = index_high;
    let increment = _mm_set1_epi16(8);

    // One unaligned 128-bit load fills all eight lanes; element 0 lands in the
    // lowest lane, exactly as `_mm_set_epi16(e7, .., e0)` would place it.
    let mut values_high = _mm_loadu_si128(first.as_ptr() as *const __m128i);

    for step in chunks {
        new_index_high = _mm_add_epi16(new_index_high, increment);

        let new_values = _mm_loadu_si128(step.as_ptr() as *const __m128i);

        // Strictly greater: on ties a lane keeps its earlier position.
        let gt_mask = _mm_cmpgt_epi16(new_values, values_high);

        values_high = _mm_or_si128(
            _mm_and_si128(new_values, gt_mask),
            _mm_andnot_si128(gt_mask, values_high),
        );
        index_high = _mm_or_si128(
            _mm_and_si128(new_index_high, gt_mask),
            _mm_andnot_si128(gt_mask, index_high),
        );
    }

    let values: [i16; 8] = std::mem::transmute(values_high);
    let indices: [i16; 8] = std::mem::transmute(index_high);

    // Reduce the eight per-lane winners: largest value first, and among equal
    // values the smallest position, so the first occurrence is reported.
    let mut best = (values[0], indices[0]);
    for (&value, &index) in values.iter().zip(indices.iter()).skip(1) {
        if value > best.0 || (value == best.0 && index < best.1) {
            best = (value, index);
        }
    }

    (best.0, best.1 as usize)
}
341 | let data = [ 342 | 10, 343 | std::i16::MIN, 344 | 6, 345 | 9, 346 | 9, 347 | 22, 348 | std::i16::MAX, 349 | 4, 350 | std::i16::MAX, 351 | ]; 352 | let argmin_index = simple_argmin(&data); 353 | let argmin_simd_index = argmin_i16(&data).unwrap(); 354 | assert_eq!(argmin_index, argmin_simd_index); 355 | assert_eq!(argmin_index, 1); 356 | 357 | let argmax_index = simple_argmax(&data); 358 | let argmax_simd_index = argmax_i16(&data).unwrap(); 359 | assert_eq!(argmax_index, argmax_simd_index); 360 | assert_eq!(argmax_index, 6); 361 | } 362 | } 363 | -------------------------------------------------------------------------------- /src/simd/simd_i32.rs: -------------------------------------------------------------------------------- 1 | use crate::generic::{simple_argmax, simple_argmin}; 2 | use crate::task::{find_final_index_max, find_final_index_min, split_array}; 3 | use std::arch::x86_64::*; 4 | 5 | pub fn argmin_i32(arr: &[i32]) -> Option { 6 | match split_array(arr, 4) { 7 | (Some(rem), Some(sim)) => { 8 | let rem_min_index = simple_argmin(rem); 9 | let rem_result = (rem[rem_min_index], rem_min_index); 10 | let sim_result = unsafe { core_argmin(sim, rem.len()) }; 11 | find_final_index_min(rem_result, sim_result) 12 | } 13 | (Some(rem), None) => Some(simple_argmin(rem)), 14 | (None, Some(sim)) => { 15 | let sim_result = unsafe { core_argmin(sim, 0) }; 16 | Some(sim_result.1) 17 | } 18 | (None, None) => None, 19 | } 20 | } 21 | 22 | unsafe fn core_argmin(sim_arr: &[i32], rem_offset: usize) -> (i32, usize) { 23 | let offset = _mm_set1_epi32(rem_offset as i32); 24 | let mut index_low = _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), offset); 25 | 26 | let increment = _mm_set1_epi32(4); 27 | let mut new_index_low = index_low; 28 | 29 | let mut values_low = _mm_loadu_si128(sim_arr.as_ptr() as *const __m128i); 30 | 31 | sim_arr.chunks_exact(4).skip(1).for_each(|step| { 32 | new_index_low = _mm_add_epi32(new_index_low, increment); 33 | 34 | let new_values = 
/// Four-lane SSE2 arg-max kernel for `i32` slices.
///
/// `sim_arr` is consumed in chunks of four (trailing elements that do not fill
/// a chunk are ignored). `rem_offset` is added to every position so the
/// returned index is relative to the caller's full slice.
///
/// Returns `(max_value, index_of_first_max)`.
///
/// Positions are tracked in 32-bit lanes, so `rem_offset + sim_arr.len()` must
/// not exceed `i32::MAX`; larger inputs produce wrapped indices.
///
/// # Safety
/// Only SSE2 intrinsics are used (the final reduction is scalar, so no SSE4.1
/// `_mm_max_epi32` is needed) and every load is taken from a full four-element
/// chunk. The function stays `unsafe` so existing call sites keep compiling
/// unchanged.
///
/// # Panics
/// Panics when `sim_arr` holds fewer than four elements.
unsafe fn core_argmax(sim_arr: &[i32], rem_offset: usize) -> (i32, usize) {
    let mut chunks = sim_arr.chunks_exact(4);
    let first = chunks
        .next()
        .expect("core_argmax needs at least 4 elements");

    let offset = _mm_set1_epi32(rem_offset as i32);
    let mut index_high = _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), offset);
    let mut new_index_high = index_high;
    let increment = _mm_set1_epi32(4);

    let mut values_high = _mm_loadu_si128(first.as_ptr() as *const __m128i);

    for step in chunks {
        new_index_high = _mm_add_epi32(new_index_high, increment);

        let new_values = _mm_loadu_si128(step.as_ptr() as *const __m128i);

        // Strictly greater: on ties a lane keeps its earlier position.
        let gt_mask = _mm_cmpgt_epi32(new_values, values_high);

        values_high = _mm_or_si128(
            _mm_and_si128(new_values, gt_mask),
            _mm_andnot_si128(gt_mask, values_high),
        );
        index_high = _mm_or_si128(
            _mm_and_si128(new_index_high, gt_mask),
            _mm_andnot_si128(gt_mask, index_high),
        );
    }

    let values: [i32; 4] = std::mem::transmute(values_high);
    let indices: [i32; 4] = std::mem::transmute(index_high);

    // Reduce the four per-lane winners: largest value first, and among equal
    // values the smallest position, so the first occurrence is reported.
    let mut best = (values[0], indices[0]);
    for (&value, &index) in values.iter().zip(indices.iter()).skip(1) {
        if value > best.0 || (value == best.0 && index < best.1) {
            best = (value, index);
        }
    }

    (best.0, best.1 as usize)
}
rng = thread_rng(); 147 | let uni = Uniform::new_inclusive(std::i32::MIN, std::i32::MAX); 148 | rng.sample_iter(uni).take(n).collect() 149 | } 150 | 151 | #[test] 152 | fn test_both_versions_return_the_same_results() { 153 | let data = get_array_i32(1025); 154 | assert_eq!(data.len() % 4, 1); 155 | 156 | let min_index = argmin_i32(&data).unwrap(); 157 | let max_index = argmax_i32(&data).unwrap(); 158 | let argmin_index = simple_argmin(&data); 159 | let argmax_index = simple_argmax(&data); 160 | 161 | assert_eq!(argmin_index, min_index); 162 | assert_eq!(argmax_index, max_index); 163 | } 164 | 165 | #[test] 166 | fn test_first_index_is_returned_when_identical_values_found() { 167 | let data = [ 168 | std::i32::MIN, 169 | std::i32::MIN, 170 | 4, 171 | 6, 172 | 9, 173 | std::i32::MAX, 174 | 22, 175 | std::i32::MAX, 176 | ]; 177 | let argmin_index = simple_argmin(&data); 178 | let argmin_simd_index = argmin_i32(&data).unwrap(); 179 | assert_eq!(argmin_index, argmin_simd_index); 180 | assert_eq!(argmin_index, 0); 181 | 182 | let argmax_index = simple_argmax(&data); 183 | let argmax_simd_index = argmax_i32(&data).unwrap(); 184 | assert_eq!(argmax_index, argmax_simd_index); 185 | assert_eq!(argmax_index, 5); 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/simd/simd_u16.rs: -------------------------------------------------------------------------------- 1 | use crate::generic::{simple_argmax, simple_argmin}; 2 | use crate::task::{find_final_index_max, find_final_index_min, split_array}; 3 | use std::arch::x86_64::*; 4 | 5 | pub fn argmin_u16(arr: &[u16]) -> Option { 6 | match split_array(arr, 4) { 7 | (Some(rem), Some(sim)) => { 8 | let rem_min_index = simple_argmin(rem); 9 | let rem_result = (rem[rem_min_index], rem_min_index); 10 | let sim_result = unsafe { core_argmin(sim, rem.len()) }; 11 | find_final_index_min(rem_result, sim_result) 12 | } 13 | (Some(rem), None) => Some(simple_argmin(rem)), 14 | (None, 
/// Four-lane SSE2 arg-min kernel for `u16` slices.
///
/// Each `u16` is zero-extended into a 32-bit lane, so signed 32-bit compares
/// order the values correctly. `sim_arr` is consumed in chunks of four
/// (trailing elements that do not fill a chunk are ignored). `rem_offset` is
/// added to every position so the returned index is relative to the caller's
/// full slice.
///
/// Returns `(min_value, index_of_first_min)`.
///
/// Positions are tracked in 32-bit lanes, so `rem_offset + sim_arr.len()` must
/// not exceed `i32::MAX`; larger inputs produce wrapped indices.
///
/// # Safety
/// Only SSE2 intrinsics are used (the final reduction is scalar, so no SSE4.1
/// `_mm_min_epi32` is needed) and no raw memory is read. The function stays
/// `unsafe` so existing call sites keep compiling unchanged.
///
/// # Panics
/// Panics when `sim_arr` holds fewer than four elements.
unsafe fn core_argmin(sim_arr: &[u16], rem_offset: usize) -> (u16, usize) {
    let mut chunks = sim_arr.chunks_exact(4);
    let first = chunks
        .next()
        .expect("core_argmin needs at least 4 elements");

    let offset = _mm_set1_epi32(rem_offset as i32);
    let mut index_low = _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), offset);
    let mut new_index_low = index_low;
    let increment = _mm_set1_epi32(4);

    let mut values_low = _mm_set_epi32(
        first[3] as i32,
        first[2] as i32,
        first[1] as i32,
        first[0] as i32,
    );

    for step in chunks {
        new_index_low = _mm_add_epi32(new_index_low, increment);

        let new_values = _mm_set_epi32(
            step[3] as i32,
            step[2] as i32,
            step[1] as i32,
            step[0] as i32,
        );

        // Strictly less: on ties a lane keeps its earlier position.
        let lt_mask = _mm_cmplt_epi32(new_values, values_low);

        values_low = _mm_or_si128(
            _mm_and_si128(new_values, lt_mask),
            _mm_andnot_si128(lt_mask, values_low),
        );
        index_low = _mm_or_si128(
            _mm_and_si128(new_index_low, lt_mask),
            _mm_andnot_si128(lt_mask, index_low),
        );
    }

    let values: [i32; 4] = std::mem::transmute(values_low);
    let indices: [i32; 4] = std::mem::transmute(index_low);

    // Reduce the four per-lane winners: smallest value first, and among equal
    // values the smallest position, so the first occurrence is reported.
    let mut best = (values[0], indices[0]);
    for (&value, &index) in values.iter().zip(indices.iter()).skip(1) {
        if value < best.0 || (value == best.0 && index < best.1) {
            best = (value, index);
        }
    }

    // Every lane holds a zero-extended `u16`, so the narrowing cast is lossless.
    (best.0 as u16, best.1 as usize)
}
highestpack = _mm_unpackhi_epi32(highest, highest); 144 | let lowestpack = _mm_unpacklo_epi32(highest, highest); 145 | 146 | highest = _mm_max_epi32(highestpack, lowestpack); 147 | 148 | let high_mask = _mm_cmpeq_epi32(highest, values_high); 149 | 150 | index_high = _mm_or_si128( 151 | _mm_and_si128(index_high, high_mask), 152 | _mm_andnot_si128(high_mask, _mm_set1_epi32(std::i32::MAX)), 153 | ); 154 | 155 | let value_array = std::mem::transmute::<__m128i, [u32; 4]>(values_high); 156 | let index_array = std::mem::transmute::<__m128i, [i32; 4]>(index_high); 157 | 158 | let min_index = simple_argmin(&index_array); 159 | let value = *value_array.get_unchecked(min_index); 160 | let index = *index_array.get_unchecked(min_index); 161 | 162 | (value as u16, index as usize) 163 | } 164 | 165 | #[cfg(test)] 166 | mod tests { 167 | use super::{argmax_u16, argmin_u16, simple_argmax, simple_argmin}; 168 | use rand::{thread_rng, Rng}; 169 | use rand_distr::Uniform; 170 | 171 | fn get_array_u16(n: usize) -> Vec { 172 | let rng = thread_rng(); 173 | let uni = Uniform::new_inclusive(std::u16::MIN, std::u16::MAX); 174 | rng.sample_iter(uni).take(n).collect() 175 | } 176 | 177 | #[test] 178 | fn test_both_versions_return_the_same_results() { 179 | let data = get_array_u16(1025); 180 | assert_eq!(data.len() % 4, 1); 181 | 182 | let min_index = argmin_u16(&data).unwrap(); 183 | let max_index = argmax_u16(&data).unwrap(); 184 | let argmin_index = simple_argmin(&data); 185 | let argmax_index = simple_argmax(&data); 186 | 187 | assert_eq!(argmin_index, min_index); 188 | assert_eq!(argmax_index, max_index); 189 | } 190 | 191 | #[test] 192 | fn test_first_index_is_returned_when_identical_values_found() { 193 | let data = [ 194 | 10, 195 | std::u16::MIN, 196 | 6, 197 | 9, 198 | 9, 199 | 22, 200 | std::u16::MAX, 201 | 4, 202 | std::u16::MAX, 203 | ]; 204 | let argmin_index = simple_argmin(&data); 205 | let argmin_simd_index = argmin_u16(&data).unwrap(); 206 | assert_eq!(argmin_index, 
argmin_simd_index); 207 | assert_eq!(argmin_index, 1); 208 | 209 | let argmax_index = simple_argmax(&data); 210 | let argmax_simd_index = argmax_u16(&data).unwrap(); 211 | assert_eq!(argmax_index, argmax_simd_index); 212 | assert_eq!(argmax_index, 6); 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /src/simd/simd_u8.rs: -------------------------------------------------------------------------------- 1 | use crate::generic::{simple_argmax, simple_argmin}; 2 | use crate::task::{find_final_index_max, find_final_index_min, split_array}; 3 | use std::arch::x86_64::*; 4 | 5 | pub fn argmin_u8(arr: &[u8]) -> Option { 6 | let n = arr.len(); 7 | let mut simd_func: unsafe fn(&[u8], usize) -> (u8, usize) = core_argmin; 8 | let mut mod_size = 4; 9 | 10 | if (17..std::i16::MAX).contains(&(n as i16)) { 11 | simd_func = core_argmin_ext; 12 | mod_size = 8; 13 | }; 14 | 15 | match split_array(arr, mod_size) { 16 | (Some(rem), Some(sim)) => { 17 | let rem_min_index = simple_argmin(rem); 18 | let rem_result = (rem[rem_min_index], rem_min_index); 19 | let sim_result = unsafe { simd_func(sim, rem.len()) }; 20 | find_final_index_min(rem_result, sim_result) 21 | } 22 | (Some(rem), None) => Some(simple_argmin(rem)), 23 | (None, Some(sim)) => { 24 | let sim_result = unsafe { simd_func(sim, 0) }; 25 | Some(sim_result.1) 26 | } 27 | (None, None) => None, 28 | } 29 | } 30 | 31 | unsafe fn core_argmin(sim_arr: &[u8], rem_offset: usize) -> (u8, usize) { 32 | let offset = _mm_set1_epi32(rem_offset as i32); 33 | let mut index_low = _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), offset); 34 | 35 | let increment = _mm_set1_epi32(4); 36 | let mut new_index_low = index_low; 37 | 38 | let mut values_low = _mm_set_epi32( 39 | sim_arr[3] as i32, 40 | sim_arr[2] as i32, 41 | sim_arr[1] as i32, 42 | sim_arr[0] as i32, 43 | ); 44 | 45 | sim_arr.chunks_exact(4).skip(1).for_each(|step| { 46 | new_index_low = _mm_add_epi32(new_index_low, increment); 47 | 48 | 
let new_values = _mm_set_epi32( 49 | step[3] as i32, 50 | step[2] as i32, 51 | step[1] as i32, 52 | step[0] as i32, 53 | ); 54 | 55 | let lt_mask = _mm_cmplt_epi32(new_values, values_low); 56 | 57 | values_low = _mm_or_si128( 58 | _mm_and_si128(new_values, lt_mask), 59 | _mm_andnot_si128(lt_mask, values_low), 60 | ); 61 | index_low = _mm_or_si128( 62 | _mm_and_si128(new_index_low, lt_mask), 63 | _mm_andnot_si128(lt_mask, index_low), 64 | ); 65 | }); 66 | 67 | let highpack = _mm_unpackhi_epi32(values_low, values_low); 68 | let lowpack = _mm_unpacklo_epi32(values_low, values_low); 69 | let lowest = _mm_min_epi32(highpack, lowpack); 70 | 71 | let highpack = _mm_unpackhi_epi32(lowest, lowest); 72 | let lowpack = _mm_unpacklo_epi32(lowest, lowest); 73 | let lowest = _mm_min_epi32(highpack, lowpack); 74 | 75 | let low_mask = _mm_cmpeq_epi32(lowest, values_low); 76 | 77 | index_low = _mm_or_si128( 78 | _mm_and_si128(index_low, low_mask), 79 | _mm_andnot_si128(low_mask, _mm_set1_epi32(std::i32::MAX)), 80 | ); 81 | 82 | let value_array = std::mem::transmute::<__m128i, [u32; 4]>(values_low); 83 | let index_array = std::mem::transmute::<__m128i, [i32; 4]>(index_low); 84 | 85 | let min_index = simple_argmin(&index_array); 86 | let value = *value_array.get_unchecked(min_index); 87 | let index = *index_array.get_unchecked(min_index); 88 | 89 | (value as u8, index as usize) 90 | } 91 | 92 | unsafe fn core_argmin_ext(sim_arr: &[u8], rem_offset: usize) -> (u8, usize) { 93 | let offset = _mm_set1_epi16(rem_offset as i16); 94 | let mut index_low = _mm_add_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), offset); 95 | 96 | let increment = _mm_set1_epi16(8); 97 | let mut new_index_low = index_low; 98 | 99 | let mut values_low = _mm_set_epi16( 100 | sim_arr[7] as i16, 101 | sim_arr[6] as i16, 102 | sim_arr[5] as i16, 103 | sim_arr[4] as i16, 104 | sim_arr[3] as i16, 105 | sim_arr[2] as i16, 106 | sim_arr[1] as i16, 107 | sim_arr[0] as i16, 108 | ); 109 | 110 | 
sim_arr.chunks_exact(8).skip(1).for_each(|step| { 111 | new_index_low = _mm_add_epi16(new_index_low, increment); 112 | 113 | let new_values = _mm_set_epi16( 114 | step[7] as i16, 115 | step[6] as i16, 116 | step[5] as i16, 117 | step[4] as i16, 118 | step[3] as i16, 119 | step[2] as i16, 120 | step[1] as i16, 121 | step[0] as i16, 122 | ); 123 | 124 | let lt_mask = _mm_cmplt_epi16(new_values, values_low); 125 | 126 | values_low = _mm_or_si128( 127 | _mm_and_si128(new_values, lt_mask), 128 | _mm_andnot_si128(lt_mask, values_low), 129 | ); 130 | index_low = _mm_or_si128( 131 | _mm_and_si128(new_index_low, lt_mask), 132 | _mm_andnot_si128(lt_mask, index_low), 133 | ); 134 | }); 135 | 136 | let highpack = _mm_unpackhi_epi16(values_low, values_low); 137 | let lowpack = _mm_unpacklo_epi16(values_low, values_low); 138 | let lowest = _mm_min_epi16(highpack, lowpack); 139 | 140 | let highpack = _mm_unpackhi_epi16(lowest, lowest); 141 | let lowpack = _mm_unpacklo_epi16(lowest, lowest); 142 | let lowest = _mm_min_epi16(highpack, lowpack); 143 | 144 | let highpack = _mm_unpackhi_epi16(lowest, lowest); 145 | let lowpack = _mm_unpacklo_epi16(lowest, lowest); 146 | let lowest = _mm_min_epi16(highpack, lowpack); 147 | 148 | let low_mask = _mm_cmpeq_epi16(lowest, values_low); 149 | 150 | index_low = _mm_or_si128( 151 | _mm_and_si128(index_low, low_mask), 152 | _mm_andnot_si128(low_mask, _mm_set1_epi16(std::i16::MAX)), 153 | ); 154 | 155 | let value_array = std::mem::transmute::<__m128i, [i16; 8]>(values_low); 156 | let index_array = std::mem::transmute::<__m128i, [i16; 8]>(index_low); 157 | 158 | let min_index = simple_argmin(&index_array); 159 | let value = *value_array.get_unchecked(min_index); 160 | let index = *index_array.get_unchecked(min_index); 161 | 162 | (value as u8, index as usize) 163 | } 164 | 165 | pub fn argmax_u8(arr: &[u8]) -> Option { 166 | let n = arr.len(); 167 | let mut simd_func: unsafe fn(&[u8], usize) -> (u8, usize) = core_argmax; 168 | let mut mod_size = 
4; 169 | 170 | if (17..=std::i16::MAX).contains(&(n as i16)) { 171 | simd_func = core_argmax_ext; 172 | mod_size = 8; 173 | }; 174 | 175 | match split_array(arr, mod_size) { 176 | (Some(rem), Some(sim)) => { 177 | let rem_min_index = simple_argmax(rem); 178 | let rem_result = (rem[rem_min_index], rem_min_index); 179 | let sim_result = unsafe { simd_func(sim, rem.len()) }; 180 | find_final_index_max(rem_result, sim_result) 181 | } 182 | (Some(rem), None) => Some(simple_argmax(rem)), 183 | (None, Some(sim)) => { 184 | let sim_result = unsafe { simd_func(sim, 0) }; 185 | Some(sim_result.1) 186 | } 187 | (None, None) => None, 188 | } 189 | } 190 | 191 | unsafe fn core_argmax(sim_arr: &[u8], rem_offset: usize) -> (u8, usize) { 192 | let offset = _mm_set1_epi32(rem_offset as i32); 193 | let mut index_high = _mm_add_epi32(_mm_set_epi32(3, 2, 1, 0), offset); 194 | 195 | let mut new_index_high = index_high; 196 | 197 | let increment = _mm_set1_epi32(4); 198 | 199 | let mut values_high = _mm_set_epi32( 200 | sim_arr[3] as i32, 201 | sim_arr[2] as i32, 202 | sim_arr[1] as i32, 203 | sim_arr[0] as i32, 204 | ); 205 | 206 | sim_arr.chunks_exact(4).skip(1).for_each(|step| { 207 | new_index_high = _mm_add_epi32(new_index_high, increment); 208 | 209 | let new_values = _mm_set_epi32( 210 | step[3] as i32, 211 | step[2] as i32, 212 | step[1] as i32, 213 | step[0] as i32, 214 | ); 215 | 216 | let gt_mask = _mm_cmpgt_epi32(new_values, values_high); 217 | 218 | values_high = _mm_or_si128( 219 | _mm_and_si128(new_values, gt_mask), 220 | _mm_andnot_si128(gt_mask, values_high), 221 | ); 222 | index_high = _mm_or_si128( 223 | _mm_and_si128(new_index_high, gt_mask), 224 | _mm_andnot_si128(gt_mask, index_high), 225 | ); 226 | }); 227 | 228 | let highpack = _mm_unpackhi_epi32(values_high, values_high); 229 | let lowpack = _mm_unpacklo_epi32(values_high, values_high); 230 | let highest = _mm_max_epi32(highpack, lowpack); 231 | 232 | let highpack = _mm_unpackhi_epi32(highest, highest); 233 | 
/// Eight-lane SSE2 arg-max kernel for `u8` slices.
///
/// Each byte is zero-extended into a 16-bit lane, so signed 16-bit compares
/// order the values correctly. `sim_arr` is consumed in chunks of eight
/// (trailing elements that do not fill a chunk are ignored). `rem_offset` is
/// added to every position so the returned index is relative to the caller's
/// full slice.
///
/// Returns `(max_value, index_of_first_max)`.
///
/// Positions are tracked in 16-bit lanes, so `rem_offset + sim_arr.len()` must
/// not exceed `i16::MAX`; larger inputs produce wrapped indices.
///
/// # Safety
/// Only SSE2 intrinsics are used and every 8-byte load is taken from a full
/// eight-element chunk. The function stays `unsafe` so existing call sites
/// keep compiling unchanged.
///
/// # Panics
/// Panics when `sim_arr` holds fewer than eight elements.
unsafe fn core_argmax_ext(sim_arr: &[u8], rem_offset: usize) -> (u8, usize) {
    let mut chunks = sim_arr.chunks_exact(8);
    let first = chunks
        .next()
        .expect("core_argmax_ext needs at least 8 elements");

    let zero = _mm_setzero_si128();
    let offset = _mm_set1_epi16(rem_offset as i16);
    let mut index_high = _mm_add_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0), offset);
    let mut new_index_high = index_high;
    let increment = _mm_set1_epi16(8);

    // Load eight bytes into the low half of the register, then interleave them
    // with zero bytes: each 16-bit lane ends up holding one zero-extended `u8`,
    // with element 0 in the lowest lane.
    let mut values_high =
        _mm_unpacklo_epi8(_mm_loadl_epi64(first.as_ptr() as *const __m128i), zero);

    for step in chunks {
        new_index_high = _mm_add_epi16(new_index_high, increment);

        let new_values =
            _mm_unpacklo_epi8(_mm_loadl_epi64(step.as_ptr() as *const __m128i), zero);

        // Strictly greater: on ties a lane keeps its earlier position.
        let gt_mask = _mm_cmpgt_epi16(new_values, values_high);

        values_high = _mm_or_si128(
            _mm_and_si128(new_values, gt_mask),
            _mm_andnot_si128(gt_mask, values_high),
        );
        index_high = _mm_or_si128(
            _mm_and_si128(new_index_high, gt_mask),
            _mm_andnot_si128(gt_mask, index_high),
        );
    }

    let values: [i16; 8] = std::mem::transmute(values_high);
    let indices: [i16; 8] = std::mem::transmute(index_high);

    // Reduce the eight per-lane winners: largest value first, and among equal
    // values the smallest position, so the first occurrence is reported.
    let mut best = (values[0], indices[0]);
    for (&value, &index) in values.iter().zip(indices.iter()).skip(1) {
        if value > best.0 || (value == best.0 && index < best.1) {
            best = (value, index);
        }
    }

    // Every lane holds a zero-extended `u8`, so the narrowing cast is lossless.
    (best.0 as u8, best.1 as usize)
}
/// Splits `arr` into a scalar head and a SIMD-sized tail.
///
/// Returns `(head, tail)`:
/// * `(None, None)` for an empty slice, so callers can report "no result";
/// * `(Some(arr), None)` when the slice has fewer than `2 * lane_size`
///   elements and is meant to be scanned entirely by the scalar routine;
/// * otherwise the head holds the first `len % lane_size` elements (`None`
///   when that is zero) and the tail holds the rest, whose length is a
///   non-zero multiple of `lane_size`.
///
/// # Panics
/// Panics when `lane_size` is 0 and `arr` is not empty.
#[inline]
pub(crate) fn split_array<T>(arr: &[T], lane_size: usize) -> (Option<&[T]>, Option<&[T]>) {
    let n = arr.len();

    // An empty slice has no arg-min/arg-max; without this guard it would be
    // handed to the scalar routine as `Some(&[])`.
    if n == 0 {
        return (None, None);
    }

    if n < lane_size * 2 {
        return (Some(arr), None);
    }

    let (left_arr, right_arr) = arr.split_at(n % lane_size);

    // `n >= 2 * lane_size` guarantees the tail is never empty, so only the
    // head needs an emptiness check.
    let head = if left_arr.is_empty() {
        None
    } else {
        Some(left_arr)
    };
    (head, Some(right_arr))
}
/// Merges the scalar-head and SIMD-tail arg-max results into one index.
///
/// Each argument is a `(value, index)` pair. The index belonging to the larger
/// value is returned; when both values are equal the smaller index wins, so
/// the first occurrence is reported. The result is always `Some`.
///
/// # Panics
/// Panics when the two values cannot be ordered (`partial_cmp` returns
/// `None`, e.g. a floating-point NaN).
#[inline]
pub fn find_final_index_max<T: PartialOrd>(
    remainder_result: (T, usize),
    simd_result: (T, usize),
) -> Option<usize> {
    let ordering = simd_result
        .0
        .partial_cmp(&remainder_result.0)
        .expect("find_final_index_max: values are not comparable");

    let result = match ordering {
        Ordering::Less => remainder_result.1,
        Ordering::Equal => std::cmp::min(remainder_result.1, simd_result.1),
        Ordering::Greater => simd_result.1,
    };
    Some(result)
}