├── .cargo └── config.toml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── README.md ├── examples ├── dist.rs └── random.rs ├── justfile ├── py ├── correlation.py ├── stats └── stats.py └── src ├── intrinsics.rs ├── lib.rs └── main.rs /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /bindash 3 | /bindash-rs 4 | /input 5 | /input_files 6 | /output/ 7 | /py/plots 8 | /input2 9 | /input2_files 10 | /output2/ 11 | /py/stats2 12 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "adler2" 7 | version = "2.0.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" 10 | 11 | [[package]] 12 | name = "aho-corasick" 13 | version = "1.1.3" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 16 | dependencies = [ 17 | "memchr", 18 | ] 19 | 20 | [[package]] 21 | name = "anstream" 22 | version = "0.6.18" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" 25 | dependencies = [ 26 | "anstyle", 27 | "anstyle-parse", 28 | "anstyle-query", 29 | "anstyle-wincon", 30 | "colorchoice", 31 | "is_terminal_polyfill", 32 | "utf8parse", 33 | ] 34 | 35 | [[package]] 36 | name = "anstyle" 37 | version = "1.0.10" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" 40 | 41 | [[package]] 42 | name = "anstyle-parse" 43 | version = "0.2.6" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" 46 | dependencies = [ 47 | "utf8parse", 48 | ] 49 | 50 | [[package]] 51 | name = "anstyle-query" 52 | version = "1.1.2" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" 55 | dependencies = [ 56 | "windows-sys", 57 | ] 58 | 59 | [[package]] 60 | name = "anstyle-wincon" 61 | version = "3.0.7" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" 64 | dependencies = [ 65 | "anstyle", 66 | "once_cell", 67 | "windows-sys", 68 | ] 69 | 70 | [[package]] 71 | name = "bitflags" 72 | version = "2.9.0" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" 75 | 76 | [[package]] 77 | name = "buffer-redux" 78 | version = "1.0.2" 79 | source = "registry+https://github.com/rust-lang/crates.io-index" 80 | checksum = "4e8acf87c5b9f5897cd3ebb9a327f420e0cae9dd4e5c1d2e36f2c84c571a58f1" 81 | dependencies = [ 82 | "memchr", 83 | ] 84 | 85 | [[package]] 86 | name = "bumpalo" 87 | version = "3.17.0" 88 | source = "registry+https://github.com/rust-lang/crates.io-index" 89 | checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" 90 | 91 | [[package]] 92 | name = "bytecount" 93 | version = "0.6.8" 94 | source = "registry+https://github.com/rust-lang/crates.io-index" 95 | checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" 96 | 97 | [[package]] 98 | name = "bytemuck" 99 | version = "1.22.0" 100 | source = "registry+https://github.com/rust-lang/crates.io-index" 101 | checksum = "b6b1fc10dbac614ebc03540c9dbd60e83887fda27794998c6528f1782047d540" 102 | 103 | [[package]] 104 | name = "byteorder" 105 | version = "1.5.0" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 108 | 109 | [[package]] 110 | name = "bzip2" 111 | version = "0.4.4" 112 | source = "registry+https://github.com/rust-lang/crates.io-index" 113 | checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" 114 | dependencies = [ 115 | "bzip2-sys", 116 | "libc", 117 | ] 118 | 119 | [[package]] 120 | name = "bzip2-sys" 121 | version = "0.1.13+1.0.8" 122 | source = "registry+https://github.com/rust-lang/crates.io-index" 123 | checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" 124 | dependencies = [ 125 | "cc", 126 | "pkg-config", 127 | ] 128 | 129 | [[package]] 130 | name = "cc" 131 | version = "1.2.16" 132 | source = "registry+https://github.com/rust-lang/crates.io-index" 133 | checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" 134 | dependencies = [ 135 | "jobserver", 136 | "libc", 137 | "shlex", 138 | ] 139 | 140 | [[package]] 141 | name = "cfg-if" 142 | version = "1.0.0" 143 | source = "registry+https://github.com/rust-lang/crates.io-index" 144 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 145 | 146 | [[package]] 147 | name = "clap" 148 | version = "4.5.31" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" 151 | dependencies = [ 152 | "clap_builder", 153 | "clap_derive", 154 | ] 155 | 156 | [[package]] 157 | name = "clap_builder" 158 | version = "4.5.31" 159 | source = "registry+https://github.com/rust-lang/crates.io-index" 160 | checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" 161 | dependencies = [ 162 | "anstream", 163 | "anstyle", 164 | "clap_lex", 165 | "strsim", 166 | ] 167 | 168 | [[package]] 169 | name = "clap_derive" 170 | version = "4.5.28" 171 | source = "registry+https://github.com/rust-lang/crates.io-index" 172 | checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" 173 | dependencies = [ 174 | "heck", 175 | "proc-macro2", 176 | "quote", 177 | "syn", 178 | ] 179 | 180 | [[package]] 181 | name = "clap_lex" 182 | version = "0.7.4" 183 | source = "registry+https://github.com/rust-lang/crates.io-index" 184 | checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" 185 | 186 | [[package]] 187 | name = "colorchoice" 188 | version = "1.0.3" 189 | source = "registry+https://github.com/rust-lang/crates.io-index" 190 | checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" 191 | 192 | [[package]] 193 | name = "console" 194 | version = "0.15.11" 195 | source = "registry+https://github.com/rust-lang/crates.io-index" 196 | checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" 197 | dependencies = [ 198 | "encode_unicode", 199 | "libc", 200 | "once_cell", 201 | "unicode-width", 202 | "windows-sys", 203 | ] 204 | 205 | [[package]] 206 | name = "crc32fast" 207 | version = "1.4.2" 208 | source = "registry+https://github.com/rust-lang/crates.io-index" 209 | checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" 210 | dependencies = [ 211 | "cfg-if", 212 | ] 213 | 214 | [[package]] 215 | name = "crossbeam-deque" 216 | version = "0.8.6" 217 | source = "registry+https://github.com/rust-lang/crates.io-index" 218 | checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" 219 | dependencies = [ 220 | "crossbeam-epoch", 221 | "crossbeam-utils", 222 | ] 223 | 224 | [[package]] 225 | name = "crossbeam-epoch" 226 | version = "0.9.18" 227 | source = "registry+https://github.com/rust-lang/crates.io-index" 228 | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" 229 | dependencies = [ 230 | "crossbeam-utils", 231 | ] 232 | 233 | [[package]] 234 | name = "crossbeam-utils" 235 | version = "0.8.21" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 238 | 239 | [[package]] 240 | name = "either" 241 | version = "1.15.0" 242 | source = "registry+https://github.com/rust-lang/crates.io-index" 243 | checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 244 | 245 | [[package]] 246 | name = "encode_unicode" 247 | version = "1.0.0" 248 | source = "registry+https://github.com/rust-lang/crates.io-index" 249 | checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" 250 | 251 | [[package]] 252 | name = "env_filter" 253 | version = "0.1.3" 254 | source = "registry+https://github.com/rust-lang/crates.io-index" 255 | checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" 256 | dependencies = [ 257 | "log", 258 | "regex", 259 | ] 260 | 261 | [[package]] 262 | name = "env_logger" 263 | version = "0.11.8" 264 | source = "registry+https://github.com/rust-lang/crates.io-index" 265 | checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" 266 | dependencies = [ 267 | "anstream", 268 | "anstyle", 269 | "env_filter", 270 | "jiff", 271 | "log", 272 | ] 273 | 274 | [[package]] 275 | name = "flate2" 276 | version = "1.1.1" 277 | source = "registry+https://github.com/rust-lang/crates.io-index" 278 | checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece" 279 | dependencies = [ 280 | "crc32fast", 281 | "libz-rs-sys", 282 | "miniz_oxide", 283 | ] 284 | 285 | [[package]] 286 | name = "getrandom" 287 | version = "0.3.1" 288 | source = "registry+https://github.com/rust-lang/crates.io-index" 289 | checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" 290 | dependencies = [ 291 | "cfg-if", 292 | "libc", 293 | "wasi", 294 | "windows-targets", 295 | ] 296 | 297 | [[package]] 298 | name = "heck" 299 | version = "0.5.0" 300 | source = "registry+https://github.com/rust-lang/crates.io-index" 301 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 302 | 303 | [[package]] 304 | name = "indicatif" 305 | version = "0.17.11" 306 | source = "registry+https://github.com/rust-lang/crates.io-index" 307 | checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" 308 | dependencies = [ 309 | "console", 310 | "number_prefix", 311 | "portable-atomic", 312 | "rayon", 313 | "unicode-width", 314 | "web-time", 315 | ] 316 | 317 | [[package]] 318 | name = "is_terminal_polyfill" 319 | version = "1.70.1" 320 | source = "registry+https://github.com/rust-lang/crates.io-index" 321 | checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 322 | 323 | [[package]] 324 | name = "itertools" 325 | version = "0.14.0" 326 | source = "registry+https://github.com/rust-lang/crates.io-index" 327 | checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" 328 | dependencies = [ 329 | "either", 330 | ] 331 | 332 | [[package]] 333 | name = "jiff" 334 | version = "0.2.10" 335 | source = "registry+https://github.com/rust-lang/crates.io-index" 336 | checksum = "5a064218214dc6a10fbae5ec5fa888d80c45d611aba169222fc272072bf7aef6" 337 | dependencies = [ 338 | "jiff-static", 339 | "log", 340 | "portable-atomic", 341 | "portable-atomic-util", 342 | "serde", 343 | ] 344 | 345 | [[package]] 346 | name = "jiff-static" 347 | version = "0.2.10" 348 | source = "registry+https://github.com/rust-lang/crates.io-index" 349 | checksum = "199b7932d97e325aff3a7030e141eafe7f2c6268e1d1b24859b753a627f45254" 350 | dependencies = [ 351 | "proc-macro2", 352 | "quote", 353 | "syn", 354 | ] 355 | 356 | [[package]] 357 | name = "jobserver" 358 | version = "0.1.32" 359 | source = "registry+https://github.com/rust-lang/crates.io-index" 360 | checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" 361 | dependencies = [ 362 | "libc", 363 | ] 364 | 365 | [[package]] 366 | name = "js-sys" 367 | version = "0.3.77" 368 | source = "registry+https://github.com/rust-lang/crates.io-index" 369 | checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" 370 | dependencies = [ 371 | "once_cell", 372 | "wasm-bindgen", 373 | ] 374 | 375 | [[package]] 376 | name = "libc" 377 | version = "0.2.170" 378 | source = "registry+https://github.com/rust-lang/crates.io-index" 379 | checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" 380 | 381 | [[package]] 382 | name = "liblzma" 383 | version = "0.3.6" 384 | source = "registry+https://github.com/rust-lang/crates.io-index" 385 | checksum = "a631d2b24be269775ba8f7789a6afa1ac228346a20c9e87dbbbe4975a79fd764" 386 | dependencies = [ 387 | "liblzma-sys", 388 | ] 389 | 390 | [[package]] 391 | name = "liblzma-sys" 392 | version = "0.3.13" 393 | source = "registry+https://github.com/rust-lang/crates.io-index" 394 | checksum = "efdadf1a99aceff34553de1461674ab6ac7e7f0843ae9875e339f4a14eb43475" 395 | dependencies = [ 396 | "cc", 397 | "libc", 398 | "pkg-config", 399 | ] 400 | 401 | [[package]] 402 | name = "libz-rs-sys" 403 | version = "0.5.0" 404 | source = "registry+https://github.com/rust-lang/crates.io-index" 405 | checksum = "6489ca9bd760fe9642d7644e827b0c9add07df89857b0416ee15c1cc1a3b8c5a" 406 | dependencies = [ 407 | "zlib-rs", 408 | ] 409 | 410 | [[package]] 411 | name = "log" 412 | version = "0.4.27" 413 | source = "registry+https://github.com/rust-lang/crates.io-index" 414 | checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" 415 | 416 | [[package]] 417 | name = "mem_dbg" 418 | version = "0.2.5" 419 | source = "registry+https://github.com/rust-lang/crates.io-index" 420 | checksum = "83623b4167d3fbfb0f60eaa0cfd0f303a44e50f35a41b8d6bce87ad9d3e81410" 421 | dependencies = [ 422 | "bitflags", 423 | "mem_dbg-derive", 424 | ] 425 | 426 | [[package]] 427 | name = "mem_dbg-derive" 428 | version = "0.1.7" 429 | source = "registry+https://github.com/rust-lang/crates.io-index" 430 | checksum = "e1ac4642a5a1615ce3cdd3479f8d03c05068aab5a285a0722a24d5080d3f3e22" 431 | dependencies = [ 432 | "proc-macro2", 433 | "quote", 434 | "syn", 435 | ] 436 | 437 | [[package]] 438 | name = "memchr" 439 | version = "2.7.4" 440 | source = "registry+https://github.com/rust-lang/crates.io-index" 441 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 442 | 443 | [[package]] 444 | name = "miniz_oxide" 445 | version = "0.8.5" 446 | source = "registry+https://github.com/rust-lang/crates.io-index" 447 | checksum = "8e3e04debbb59698c15bacbb6d93584a8c0ca9cc3213cb423d31f760d8843ce5" 448 | dependencies = [ 449 | "adler2", 450 | ] 451 | 452 | [[package]] 453 | name = "needletail" 454 | version = "0.6.3" 455 | source = "registry+https://github.com/rust-lang/crates.io-index" 456 | checksum = "6aa22e1ae8bce4ecf257e2475ef2046026caea08d66b1848d073fe7bc77e4351" 457 | dependencies = [ 458 | "buffer-redux", 459 | "bytecount", 460 | "bzip2", 461 | "flate2", 462 | "liblzma", 463 | "memchr", 464 | "zstd", 465 | ] 466 | 467 | [[package]] 468 | name = "number_prefix" 469 | version = "0.4.0" 470 | source = "registry+https://github.com/rust-lang/crates.io-index" 471 | checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" 472 | 473 | [[package]] 474 | name = "once_cell" 475 | version = "1.20.3" 476 | source = "registry+https://github.com/rust-lang/crates.io-index" 477 | checksum = "945462a4b81e43c4e3ba96bd7b49d834c6f61198356aa858733bc4acf3cbe62e" 478 | 479 | [[package]] 480 | name = "packed-seq" 481 | version = "1.0.2" 482 | source = "registry+https://github.com/rust-lang/crates.io-index" 483 | checksum = "66357293f640400f7aaa7bfc9551476f4cdb0577dc3a7b9b7371a33be8e23581" 484 | dependencies = [ 485 | "cfg-if", 486 | "mem_dbg", 487 | "rand", 488 | "wide", 489 | ] 490 | 491 | [[package]] 492 | name = "pkg-config" 493 | version = "0.3.32" 494 | source = "registry+https://github.com/rust-lang/crates.io-index" 495 | checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" 496 | 497 | [[package]] 498 | name = "portable-atomic" 499 | version = "1.11.0" 500 | source = "registry+https://github.com/rust-lang/crates.io-index" 501 | checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" 502 | 503 | [[package]] 504 | name = "portable-atomic-util" 505 | version = "0.2.4" 506 | source = "registry+https://github.com/rust-lang/crates.io-index" 507 | checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" 508 | dependencies = [ 509 | "portable-atomic", 510 | ] 511 | 512 | [[package]] 513 | name = "ppv-lite86" 514 | version = "0.2.20" 515 | source = "registry+https://github.com/rust-lang/crates.io-index" 516 | checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" 517 | dependencies = [ 518 | "zerocopy 0.7.35", 519 | ] 520 | 521 | [[package]] 522 | name = "proc-macro2" 523 | version = "1.0.94" 524 | source = "registry+https://github.com/rust-lang/crates.io-index" 525 | checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" 526 | dependencies = [ 527 | "unicode-ident", 528 | ] 529 | 530 | [[package]] 531 | name = "quote" 532 | version = "1.0.39" 533 | source = "registry+https://github.com/rust-lang/crates.io-index" 534 | checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" 535 | dependencies = [ 536 | "proc-macro2", 537 | ] 538 | 539 | [[package]] 540 | name = "rand" 541 | version = "0.9.0" 542 | source = "registry+https://github.com/rust-lang/crates.io-index" 543 | checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" 544 | dependencies = [ 545 | "rand_chacha", 546 | "rand_core", 547 | "zerocopy 0.8.23", 548 | ] 549 | 550 | [[package]] 551 | name = "rand_chacha" 552 | version = "0.9.0" 553 | source = "registry+https://github.com/rust-lang/crates.io-index" 554 | checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" 555 | dependencies = [ 556 | "ppv-lite86", 557 | "rand_core", 558 | ] 559 | 560 | [[package]] 561 | name = "rand_core" 562 | version = "0.9.3" 563 | source = "registry+https://github.com/rust-lang/crates.io-index" 564 | checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" 565 | dependencies = [ 566 | "getrandom", 567 | ] 568 | 569 | [[package]] 570 | name = "rayon" 571 | version = "1.10.0" 572 | source = "registry+https://github.com/rust-lang/crates.io-index" 573 | checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" 574 | dependencies = [ 575 | "either", 576 | "rayon-core", 577 | ] 578 | 579 | [[package]] 580 | name = "rayon-core" 581 | version = "1.12.1" 582 | source = "registry+https://github.com/rust-lang/crates.io-index" 583 | checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" 584 | dependencies = [ 585 | "crossbeam-deque", 586 | "crossbeam-utils", 587 | ] 588 | 589 | [[package]] 590 | name = "regex" 591 | version = "1.11.1" 592 | source = "registry+https://github.com/rust-lang/crates.io-index" 593 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 594 | dependencies = [ 595 | "aho-corasick", 596 | "memchr", 597 | "regex-automata", 598 | "regex-syntax", 599 | ] 600 | 601 | [[package]] 602 | name = "regex-automata" 603 | version = "0.4.9" 604 | source = "registry+https://github.com/rust-lang/crates.io-index" 605 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 606 | dependencies = [ 607 | "aho-corasick", 608 | "memchr", 609 | "regex-syntax", 610 | ] 611 | 612 | [[package]] 613 | name = "regex-syntax" 614 | version = "0.8.5" 615 | source = "registry+https://github.com/rust-lang/crates.io-index" 616 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 617 | 618 | [[package]] 619 | name = "safe_arch" 620 | version = "0.7.4" 621 | source = "registry+https://github.com/rust-lang/crates.io-index" 622 | checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323" 623 | dependencies = [ 624 | "bytemuck", 625 | ] 626 | 627 | [[package]] 628 | name = "serde" 629 | version = "1.0.219" 630 | source = "registry+https://github.com/rust-lang/crates.io-index" 631 | checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" 632 | dependencies = [ 633 | "serde_derive", 634 | ] 635 | 636 | [[package]] 637 | name = "serde_derive" 638 | version = "1.0.219" 639 | source = "registry+https://github.com/rust-lang/crates.io-index" 640 | checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" 641 | dependencies = [ 642 | "proc-macro2", 643 | "quote", 644 | "syn", 645 | ] 646 | 647 | [[package]] 648 | name = "shlex" 649 | version = "1.3.0" 650 | source = "registry+https://github.com/rust-lang/crates.io-index" 651 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 652 | 653 | [[package]] 654 | name = "simd-minimizers" 655 | version = "1.0.0" 656 | source = "registry+https://github.com/rust-lang/crates.io-index" 657 | checksum = "a21e3b5b1be74f7081349fa58f0370528543570896cb89fbf5e3a34f8434a65e" 658 | dependencies = [ 659 | "itertools", 660 | "packed-seq", 661 | "wide", 662 | ] 663 | 664 | [[package]] 665 | name = "simd-sketch" 666 | version = "0.2.0" 667 | dependencies = [ 668 | "clap", 669 | "env_logger", 670 | "flate2", 671 | "indicatif", 672 | "itertools", 673 | "log", 674 | "needletail", 675 | "packed-seq", 676 | "rand", 677 | "rayon", 678 | "simd-minimizers", 679 | "wide", 680 | ] 681 | 682 | [[package]] 683 | name = "strsim" 684 | version = "0.11.1" 685 | source = "registry+https://github.com/rust-lang/crates.io-index" 686 | checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" 687 | 688 | [[package]] 689 | name = "syn" 690 | version = "2.0.99" 691 | source = "registry+https://github.com/rust-lang/crates.io-index" 692 | checksum = "e02e925281e18ffd9d640e234264753c43edc62d64b2d4cf898f1bc5e75f3fc2" 693 | dependencies = [ 694 | "proc-macro2", 695 | "quote", 696 | "unicode-ident", 697 | ] 698 | 699 | [[package]] 700 | name = "unicode-ident" 701 | version = "1.0.18" 702 | source = "registry+https://github.com/rust-lang/crates.io-index" 703 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 704 | 705 | [[package]] 706 | name = "unicode-width" 707 | version = "0.2.0" 708 | source = "registry+https://github.com/rust-lang/crates.io-index" 709 | checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" 710 | 711 | [[package]] 712 | name = "utf8parse" 713 | version = "0.2.2" 714 | source = "registry+https://github.com/rust-lang/crates.io-index" 715 | checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" 716 | 717 | [[package]] 718 | name = "wasi" 719 | version = "0.13.3+wasi-0.2.2" 720 | source = "registry+https://github.com/rust-lang/crates.io-index" 721 | checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" 722 | dependencies = [ 723 | "wit-bindgen-rt", 724 | ] 725 | 726 | [[package]] 727 | name = "wasm-bindgen" 728 | version = "0.2.100" 729 | source = "registry+https://github.com/rust-lang/crates.io-index" 730 | checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" 731 | dependencies = [ 732 | "cfg-if", 733 | "once_cell", 734 | "wasm-bindgen-macro", 735 | ] 736 | 737 | [[package]] 738 | name = "wasm-bindgen-backend" 739 | version = "0.2.100" 740 | source = "registry+https://github.com/rust-lang/crates.io-index" 741 | checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" 742 | dependencies = [ 743 | "bumpalo", 744 | "log", 745 | "proc-macro2", 746 | "quote", 747 | "syn", 748 | "wasm-bindgen-shared", 749 | ] 750 | 751 | [[package]] 752 | name = "wasm-bindgen-macro" 753 | version = "0.2.100" 754 | source = "registry+https://github.com/rust-lang/crates.io-index" 755 | checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" 756 | dependencies = [ 757 | "quote", 758 | "wasm-bindgen-macro-support", 759 | ] 760 | 761 | [[package]] 762 | name = "wasm-bindgen-macro-support" 763 | version = "0.2.100" 764 | source = "registry+https://github.com/rust-lang/crates.io-index" 765 | checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" 766 | dependencies = [ 767 | "proc-macro2", 768 | "quote", 769 | "syn", 770 | "wasm-bindgen-backend", 771 | "wasm-bindgen-shared", 772 | ] 773 | 774 | [[package]] 775 | name = "wasm-bindgen-shared" 776 | version = "0.2.100" 777 | source = "registry+https://github.com/rust-lang/crates.io-index" 778 | checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" 779 | dependencies = [ 780 | "unicode-ident", 781 | ] 782 | 783 | [[package]] 784 | name = "web-time" 785 | version = "1.1.0" 786 | source = "registry+https://github.com/rust-lang/crates.io-index" 787 | checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" 788 | dependencies = [ 789 | "js-sys", 790 | "wasm-bindgen", 791 | ] 792 | 793 | [[package]] 794 | name = "wide" 795 | version = "0.7.32" 796 | source = "registry+https://github.com/rust-lang/crates.io-index" 797 | checksum = "41b5576b9a81633f3e8df296ce0063042a73507636cbe956c61133dd7034ab22" 798 | dependencies = [ 799 | "bytemuck", 800 | "safe_arch", 801 | ] 802 | 803 | [[package]] 804 | name = "windows-sys" 805 | version = "0.59.0" 806 | source = "registry+https://github.com/rust-lang/crates.io-index" 807 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 808 | dependencies = [ 809 | "windows-targets", 810 | ] 811 | 812 | [[package]] 813 | name = "windows-targets" 814 | version = "0.52.6" 815 | source = "registry+https://github.com/rust-lang/crates.io-index" 816 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 817 | dependencies = [ 818 | "windows_aarch64_gnullvm", 819 | "windows_aarch64_msvc", 820 | "windows_i686_gnu", 821 | "windows_i686_gnullvm", 822 | "windows_i686_msvc", 823 | "windows_x86_64_gnu", 824 | "windows_x86_64_gnullvm", 825 | "windows_x86_64_msvc", 826 | ] 827 | 828 | [[package]] 829 | name = "windows_aarch64_gnullvm" 830 | version = "0.52.6" 831 | source = "registry+https://github.com/rust-lang/crates.io-index" 832 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 833 | 834 | [[package]] 835 | name = "windows_aarch64_msvc" 836 | version = "0.52.6" 837 | source = "registry+https://github.com/rust-lang/crates.io-index" 838 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 839 | 840 | [[package]] 841 | name = "windows_i686_gnu" 842 | version = "0.52.6" 843 | source = "registry+https://github.com/rust-lang/crates.io-index" 844 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 845 | 846 | [[package]] 847 | name = "windows_i686_gnullvm" 848 | version = "0.52.6" 849 | source = "registry+https://github.com/rust-lang/crates.io-index" 850 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 851 | 852 | [[package]] 853 | name = "windows_i686_msvc" 854 | version = "0.52.6" 855 | source = "registry+https://github.com/rust-lang/crates.io-index" 856 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 857 | 858 | [[package]] 859 | name = "windows_x86_64_gnu" 860 | version = "0.52.6" 861 | source = "registry+https://github.com/rust-lang/crates.io-index" 862 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 863 | 864 | [[package]] 865 | name = "windows_x86_64_gnullvm" 866 | version = "0.52.6" 867 | source = "registry+https://github.com/rust-lang/crates.io-index" 868 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 869 | 870 | [[package]] 871 | name = "windows_x86_64_msvc" 872 | version = "0.52.6" 873 | source = "registry+https://github.com/rust-lang/crates.io-index" 874 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 875 | 876 | [[package]] 877 | name = "wit-bindgen-rt" 878 | version = "0.33.0" 879 | source = "registry+https://github.com/rust-lang/crates.io-index" 880 | checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" 881 | dependencies = [ 882 | "bitflags", 883 | ] 884 | 885 | [[package]] 886 | name = "zerocopy" 887 | version = "0.7.35" 888 | source = "registry+https://github.com/rust-lang/crates.io-index" 889 | checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" 890 | dependencies = [ 891 | "byteorder", 892 | "zerocopy-derive 0.7.35", 893 | ] 894 | 895 | [[package]] 896 | name = "zerocopy" 897 | version = "0.8.23" 898 | source = "registry+https://github.com/rust-lang/crates.io-index" 899 | checksum = "fd97444d05a4328b90e75e503a34bad781f14e28a823ad3557f0750df1ebcbc6" 900 | dependencies = [ 901 | "zerocopy-derive 0.8.23", 902 | ] 903 | 904 | [[package]] 905 | name = "zerocopy-derive" 906 | version = "0.7.35" 907 | source = "registry+https://github.com/rust-lang/crates.io-index" 908 | checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" 909 | dependencies = [ 910 | "proc-macro2", 911 | "quote", 912 | "syn", 913 | ] 914 | 915 | [[package]] 916 | name = "zerocopy-derive" 917 | version = "0.8.23" 918 | source = "registry+https://github.com/rust-lang/crates.io-index" 919 | checksum = "6352c01d0edd5db859a63e2605f4ea3183ddbd15e2c4a9e7d32184df75e4f154" 920 | dependencies = [ 921 | "proc-macro2", 922 | "quote", 923 | "syn", 924 | ] 925 | 926 | [[package]] 927 | name = "zlib-rs" 928 | version = "0.5.0" 929 | source = "registry+https://github.com/rust-lang/crates.io-index" 930 | checksum = "868b928d7949e09af2f6086dfc1e01936064cc7a819253bce650d4e2a2d63ba8" 931 | 932 | [[package]] 933 | name = "zstd" 934 | version = "0.13.3" 935 | source = "registry+https://github.com/rust-lang/crates.io-index" 936 | checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" 937 | dependencies = [ 938 | "zstd-safe", 939 | ] 940 | 941 | [[package]] 942 | name = "zstd-safe" 943 | version = "7.2.3" 944 | source = "registry+https://github.com/rust-lang/crates.io-index" 945 | checksum = "f3051792fbdc2e1e143244dc28c60f73d8470e93f3f9cbd0ead44da5ed802722" 946 | dependencies = [ 947 | "zstd-sys", 948 | ] 949 | 950 | [[package]] 951 | name = "zstd-sys" 952 | version = "2.0.14+zstd.1.5.7" 953 | source = "registry+https://github.com/rust-lang/crates.io-index" 954 | checksum = "8fb060d4926e4ac3a3ad15d864e99ceb5f343c6b34f5bd6d81ae6ed417311be5" 955 | dependencies = [ 956 | "cc", 957 | "pkg-config", 958 | ] 959 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "simd-sketch" 3 | version = "0.2.0" 4 | edition = "2024" 5 | license = "MIT" 6 | repository = "https://github.com/ragnargrootkoerkamp/simd-sketch" 7 | keywords = ["sketch", 'minhash', "simd", "dna", "bioinformatics"] 8 | categories = ["compression", "data-structures", "science::bioinformatics"] 9 | authors = ["Ragnar Groot Koerkamp"] 10 | description = "A SIMD-accelerated library to compute a b-bit bottom-h sketch" 11 | 12 | [profile.release] 13 | lto = "thin" 14 | incremental = true 15 | debug = false 16 | 17 | [dependencies] 18 | clap = { version = "4.5.31", features = ["derive"] } 19 | itertools = "0.14.0" 20 | packed-seq = "1.0.2" 21 | simd-minimizers = "1.0.0" 22 | wide = "0.7.32" 23 | needletail = "0.6.3" 24 | flate2 = { version = "1.1.1", features = ["zlib-rs"] } 25 | rayon = "1.10.0" 26 | log = "0.4.27" 27 | env_logger = "0.11.8" 28 | indicatif = { version = "0.17.11", features = ["rayon"] } 29 | 30 | [dev-dependencies] 31 | rand = "0.9.0" 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SimdSketch 2 | 3 | [![crates.io](https://img.shields.io/crates/v/simd-sketch.svg)](https://crates.io/crates/simd-sketch) 4 | [![docs.rs](https://img.shields.io/docsrs/simd-sketch.svg)](https://docs.rs/simd-sketch) 5 | 6 | A SIMD-accelerated library to compute two types of sketches: 7 | - Classic bottom $s$ sketch, containing the $s$ smallest distinct k-mer hashes. 8 | - Bucket sketch, which partitions the hashes into $s$ parts and returns the smallest 9 | hash in each part. (Introduced as *one permutation hashing* in Li, Owen, Zhang 2012.) 10 | 11 | See the corresponding [blog post](https://curiouscoding.nl/posts/simd-sketch/) 12 | for background and evaluation. 13 | 14 | Sketching takes 2 seconds for a 3Gbp human genome. This library returns 32-bit `u32` 15 | hashes. This means that currently it may not be very suitable for sequences that are 16 | too close to 1Gbp in length, since the bottom hash values will be relatively dense. 17 | 18 | **Algorithm.** 19 | For the bottom $s$ sketch, we first collect all ``sufficiently small'' hashes 20 | into a vector. Then, that vector is sorted and deduplicated, and the smallest 21 | $s$ values are returned. This ensures that the runtime is $O(n + s \log s)$ when 22 | the number of duplicate k-mers is limited. 23 | 24 | For the bucket sketch, the classic method is to partition hashes linearly, e.g., 25 | for $s=2$ into the bottom and top half. Then, a single value is kept per part, 26 | and each hash is compared against the rolling minimum of its bucket. 27 | 28 | Instead, here we make buckets by the remainder modulo $s$. This way, we can 29 | again pre-filter for ``sufficiently small'' values, and then only scan those for 30 | the minimum. 31 | 32 | In both variants, we double the ``smallness'' threshold until either $s$ 33 | distinct values are found or all $s$ buckets have a value in them. 34 | 35 | **Formulas** 36 | For the bottom sketch, the **Jaccard similarity** `j` is computed as follows: 37 | 1. Find the smallest `s` distinct k-mer hashes in the union of two sketches. 38 | 2. Return the fraction of these k-mers that occurs in both sketches. 39 | 40 | For the bucket sketch, we first identify all buckets that are not left empty by 41 | both sketches. Then, we take the fraction `j0` of the remaining buckets where they 42 | are equal. We use **b-bit sketches**, where only the bottom `b` bits of each 43 | bucket-minimum are stored. This gives a `1/2^b` probability of hash collisions. 44 | To fix this, we compute `j = (j0 - 1/2^b) / (1 - 1/2^b)` as the similarity 45 | corrected for these collisions. 46 | 47 | The **Mash distance** as reported by the CLI is computed as 48 | `-ln(2*j / (1+j))/k`. 49 | This is always `>=0`, but can be as large as `inf` for disjoint sets, that have `j=0`. 50 | 51 | **Implementation notes.** 52 | Good performance is mostly achieved by using a branch-free implementation: all 53 | hashes are computed using 8 parallel streams using SIMD, and appended to a vector when they 54 | are sufficiently small to likely be part of the sketch. 55 | 56 | The underlying streaming and hashing algorithms are described in the following [preprint](https://doi.org/10.1101/2025.01.27.634998): 57 | 58 | - SimdMinimizers: Computing random minimizers, fast. 59 | Ragnar Groot Koerkamp, Igor Martayan 60 | bioRxiv 2025.01.27 [doi.org/10.1101/2025.01.27.634998](https://doi.org/10.1101/2025.01.27.634998) 61 | 62 | 63 | ## Usage 64 | Please see [lib.rs](src/lib.rs) and [docs.rs](https://docs.rs/simd-sketch) for 65 | full docs. 66 | 67 | ```rust 68 | use packed_seq::SeqVec; 69 | 70 | // Bottom h=10000 sketch of k=31-mers. 71 | let k = 31; 72 | let h = 10_000; 73 | 74 | // Use `new_rc` for a canonical version instead. 75 | let sketch = simd_sketch::Sketcher::new(k, h); 76 | 77 | // Generate two random sequences of 2M characters. 78 | let n = 2_000_000; 79 | let seq1 = packed_seq::PackedSeqVec::random(n); 80 | let seq2 = packed_seq::PackedSeqVec::random(n); 81 | 82 | let sketch1: simd_sketch::BottomSketch = sketcher.bottom_sketch(seq1.as_slice()); 83 | let sketch2: simd_sketch::BottomSketch = sketcher.bottom_sketch(seq2.as_slice()); 84 | 85 | // Value between 0 and 1, estimating the fraction of shared k-mers. 86 | let similarity = sketch1.similarity(&sketch2); 87 | 88 | // Bucket sketch variant 89 | 90 | let sketch1: simd_sketch::BinSketch = sketcher.sketch(seq1.as_slice()); 91 | let sketch2: simd_sketch::BinSketch = sketcher.sketch(seq2.as_slice()); 92 | 93 | // Value between 0 and 1, estimating the fraction of shared k-mers. 94 | let similarity = sketch1.similarity(&sketch2); 95 | ``` 96 | 97 | ## Command line tool 98 | 99 | The crate comes with a simple command line tool for computing all-to-all 100 | Mash distances matrices: 101 | 102 | ``` 103 | > simd-sketch triangle --help 104 | Takes paths to fasta files, and outputs a Phylip distance matrix to stdout 105 | 106 | Usage: simd-sketch triangle [OPTIONS] [PATHS]... 107 | 108 | Arguments: 109 | [PATHS]... Paths to (directories of) (gzipped) fasta files 110 | 111 | Options: 112 | --alg Sketch algorithm to use. Defaults to bucket because of its much faster comparisons [default: bucket] [possible values: bottom, bucket] 113 | --fwd When set, use forward instead of canonical k-mer hashes 114 | -k k-mer size [default: 31] 115 | -s Bottom-s sketch, or number of buckets [default: 10000] 116 | -b For bucket-sketch, store only the lower b bits [default: 8] 117 | --output Write phylip distance matrix here, or default to stdout 118 | -h, --help Print help 119 | ``` 120 | 121 | Minimal example usage, printing the matrix to stdout: 122 | 123 | ```sh 124 | simd-sketch triangle inputs/*.fa 125 | ``` 126 | 127 | Typical example usage, for specific `k` and using reverse-complement-aware hashes: 128 | 129 | ```sh 130 | simd-sketch triangle --rc -k 21 inputs/*.fa --output matrix.phylip 131 | ``` 132 | 133 | Maximal usage with default parameters: 134 | 135 | ```sh 136 | simd-sketch triangle --alg bucket -k 31 -s 10000 -b 8 inputs/*.fna.gz --output matrix.phylip 137 | ``` 138 | -------------------------------------------------------------------------------- /examples/dist.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use clap::Parser; 4 | use itertools::Itertools; 5 | use log::{info, trace}; 6 | use packed_seq::{AsciiSeqVec, SeqVec}; 7 | use simd_sketch::SketchParams; 8 | use std::io::Write; 9 | 10 | #[derive(clap::Parser, Debug, Clone)] 11 | struct Args { 12 | #[command(flatten)] 13 | params: SketchParams, 14 | 15 | paths: Vec, 16 | 17 | #[arg(long)] 18 | stats: Option, 19 | } 20 | 21 | fn main() { 22 | env_logger::init(); 23 | 24 | let args = Args::parse(); 25 | let paths = collect_paths(&args.paths); 26 | let q = paths.len(); 27 | 28 | let k = args.params.k; 29 | let s = args.params.s; 30 | let b = args.params.b; 31 | 32 | let sketcher = SketchParams { 33 | alg: simd_sketch::SketchAlg::Bucket, 34 | rc: true, 35 | k, 36 | s, 37 | b, 38 | filter_empty: true, 39 | } 40 | .build(); 41 | 42 | let mut sketches = vec![]; 43 | let start = std::time::Instant::now(); 44 | 45 | for path in paths { 46 | trace!("Sketching {path:?}"); 47 | let mut seq = AsciiSeqVec::default(); 48 | let mut reader = needletail::parse_fastx_file(path).unwrap(); 49 | let start = std::time::Instant::now(); 50 | while let Some(r) = reader.next() { 51 | // let record = r 52 | // .unwrap() 53 | // .seq(); 54 | // .iter() 55 | // .filter_map(|&b| if b == b'N' { None } else { Some(b) }) 56 | // .collect::>(); 57 | // seq.push_ascii(&record); 58 | seq.push_ascii(&r.unwrap().seq()); 59 | // FIXME: Skip adjacent k-mers. 60 | } 61 | trace!("Reading & filtering took {:?}", start.elapsed()); 62 | let start = std::time::Instant::now(); 63 | sketches.push(sketcher.sketch(seq.as_slice())); 64 | trace!("sketching itself took {:?}", start.elapsed()); 65 | } 66 | let t_sketch = start.elapsed(); 67 | info!( 68 | "Sketching {q} seqs took {t_sketch:?} ({:?} avg)", 69 | t_sketch / q as u32 70 | ); 71 | 72 | let start = std::time::Instant::now(); 73 | let dists = sketches 74 | .iter() 75 | .tuple_combinations() 76 | .map(|(s1, s2)| s1.jaccard_similarity(s2)) 77 | .collect_vec(); 78 | let t_dist = start.elapsed(); 79 | let cnt = q * (q - 1) / 2; 80 | info!( 81 | "Computing {cnt} dists took {t_dist:?} ({:?} avg)", 82 | t_dist / cnt.max(1) as u32 83 | ); 84 | info!( 85 | "Params {:?}", 86 | Args { 87 | paths: vec![], 88 | ..args.clone() 89 | } 90 | ); 91 | 92 | if let Some(stats) = &args.stats { 93 | let mut writer = std::fs::File::options() 94 | .create(true) 95 | .append(true) 96 | .write(true) 97 | .open(stats) 98 | .unwrap(); 99 | writeln!( 100 | writer, 101 | "SimdSketch {:?} {q} {k} {s} {b} {} {}", 102 | args.params.alg, 103 | t_sketch.as_secs_f32(), 104 | t_dist.as_secs_f32() 105 | ) 106 | .unwrap(); 107 | } 108 | 109 | for dist in dists { 110 | println!("{dist}"); 111 | } 112 | } 113 | 114 | fn collect_paths(paths: &Vec) -> Vec { 115 | let mut res = vec![]; 116 | for path in paths { 117 | if path.is_dir() { 118 | res.extend(path.read_dir().unwrap().map(|entry| entry.unwrap().path())); 119 | } else { 120 | res.push(path.clone()); 121 | } 122 | } 123 | res.sort(); 124 | res 125 | } 126 | -------------------------------------------------------------------------------- /examples/random.rs: -------------------------------------------------------------------------------- 1 | use packed_seq::{PackedSeqVec, SeqVec}; 2 | 3 | fn main() { 4 | let k = 31; 5 | let n = 4_000_000; 6 | 7 | let s = 32768; 8 | let b = 8; 9 | 10 | let mut bottom = 0.0; 11 | let mut bucket = 0.0; 12 | for _ in 0..10 { 13 | let seq1 = PackedSeqVec::random(n); 14 | let seq2 = PackedSeqVec::random(n); 15 | 16 | let bottom_sketcher = simd_sketch::SketchParams { 17 | alg: simd_sketch::SketchAlg::Bottom, 18 | k, 19 | s, 20 | b, 21 | rc: true, 22 | filter_empty: true, 23 | } 24 | .build(); 25 | let bucket_sketcher = simd_sketch::SketchParams { 26 | alg: simd_sketch::SketchAlg::Bucket, 27 | k, 28 | s, 29 | b, 30 | rc: true, 31 | filter_empty: true, 32 | } 33 | .build(); 34 | let s1 = bottom_sketcher.sketch(seq1.as_slice()); 35 | let s2 = bottom_sketcher.sketch(seq2.as_slice()); 36 | bottom += s1.jaccard_similarity(&s2); 37 | let s1 = bucket_sketcher.sketch(seq1.as_slice()); 38 | let s2 = bucket_sketcher.sketch(seq2.as_slice()); 39 | bucket += s1.jaccard_similarity(&s2); 40 | } 41 | println!("AVG Bottom: {}", bottom / 10.0); 42 | println!("AVG Bucket: {}", bucket / 10.0); 43 | // if b == 8 { 44 | // for (i, v) in counts.iter().enumerate() { 45 | // println!("{i:>3} {v:>3}"); 46 | // } 47 | // } 48 | } 49 | -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- 1 | build: 2 | cargo build -r --example dist 3 | 4 | bench: build 5 | 6 | input := "input2" 7 | stats := "py/stats2" 8 | output := "output2" 9 | 10 | simd_bot s: build 11 | ./target/release/examples/dist {{input}} -s {{s}} --stats {{stats}} > {{output}}/simd_bottom_s{{s}}.dist 12 | simd_bucket s b: build 13 | ./target/release/examples/dist {{input}} -s {{s}} -b {{b}} --bucket --stats {{stats}} > {{output}}/simd_bucket_s{{s}}_b{{b}}.dist 14 | 15 | simd_bot_all: (simd_bot "128") (simd_bot "1024") (simd_bot "8192") (simd_bot "32768") (simd_bot "65536") 16 | simd_bucket_all_b s: (simd_bucket s "1") (simd_bucket s "8") (simd_bucket s "16") (simd_bucket s "32") 17 | simd_bucket_all: (simd_bucket_all_b "128") (simd_bucket_all_b "1024") (simd_bucket_all_b "8192") (simd_bucket_all_b "32768") 18 | 19 | simd_bucket_16: (simd_bucket "128" "16") (simd_bucket "1024" "16") (simd_bucket "8192" "16") (simd_bucket "32768" "16") 20 | 21 | bindash_bot s: 22 | time -f %U ./bindash sketch {{input}}/*fna --minhashtype=0 --kmerlen=31 --sketchsize={{s}} --outfname={{output}}/tmp 2>&1 | tee >(cat 1>&2) | tail -1 > {{output}}/tmp_sketch_time 23 | time -f %U ./bindash dist {{output}}/tmp --outfname={{output}}/bindash_bottom_s{{s}}.dist 2>&1 | tee >(cat 1>&2) | tail -1 > {{output}}/tmp_dist_time 24 | echo BinDash bottom 1000 31 {{s}} 64 `cat {{output}}/tmp_sketch_time` `cat {{output}}/tmp_dist_time` >> {{stats}} 25 | bindash_bucket s b: 26 | time -f %U ./bindash sketch {{input}}/*fna --minhashtype=2 --kmerlen=31 --sketchsize={{s}} --bbits={{b}} --outfname={{output}}/tmp 2>&1 | tee >(cat 1>&2) | tail -1 > {{output}}/tmp_sketch_time 27 | time -f %U ./bindash dist {{output}}/tmp --outfname={{output}}/bindash_bucket_s{{s}}_b{{b}}.dist 2>&1 | tee >(cat 1>&2) | tail -1 > {{output}}/tmp_dist_time 28 | echo BinDash bin 1000 31 {{s}} {{b}} `cat {{output}}/tmp_sketch_time` `cat {{output}}/tmp_dist_time` >> {{stats}} 29 | 30 | 31 | bindash_bot_all: (bindash_bot "128") (bindash_bot "1024") (bindash_bot "8192") (bindash_bot "32768") 32 | bindash_bucket_all_b s: (bindash_bucket s "1") (bindash_bucket s "8") (bindash_bucket s "16") (bindash_bucket s "32") 33 | bindash_bucket_all: (bindash_bucket_all_b "128") (bindash_bucket_all_b "1024") (bindash_bucket_all_b "8192") (bindash_bucket_all_b "32768") 34 | bindash_bucket_all_s b: (bindash_bucket "128" b) (bindash_bucket "1024" b) (bindash_bucket "8192" b) (bindash_bucket "32768" b) 35 | 36 | 37 | bindashrs s: 38 | ./bindash-rs -t 6 -k 31 --sketch_size {{s}} -q {{input}}_files -r {{input}}_files --stats {{stats}} -o {{output}}/bindashrs_s{{s}}.dist 39 | 40 | 41 | bindashrs_all: (bindashrs "128") (bindashrs "1024") (bindashrs "8192") (bindashrs "32768") 42 | -------------------------------------------------------------------------------- /py/correlation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import matplotlib.pyplot as plt 4 | from pathlib import Path 5 | import sys 6 | import re 7 | import random 8 | import numpy as np 9 | import matplotlib.patches as mpatches 10 | 11 | plt.close() 12 | paths = sys.argv[1:] 13 | n = 5000 14 | 15 | dir = Path("../output2/") 16 | 17 | 18 | def key(s, _nsre=re.compile(r"(\d+)")): 19 | return [ 20 | int(text) if text.isdigit() else text.lower() for text in _nsre.split(str(s)) 21 | ] 22 | 23 | 24 | def correlation(a, b): 25 | if len(a) != len(b): 26 | return 0 27 | return np.corrcoef(a, b)[0, 1] 28 | 29 | 30 | baseline = dir / "simd_bottom_s65536.dist" 31 | groups = [ 32 | ( 33 | sorted( 34 | list(x for x in dir.glob("simd_bottom_*.dist") if x != baseline), key=key 35 | ), 36 | "SimdSketch bottom", 37 | ), 38 | (sorted(list(dir.glob("bindash_bottom_s*.dist")), key=key), "BinDash bottom"), 39 | (sorted(list(dir.glob("bindashrs_*.dist")), key=key), "BinDash-rs bucket"), 40 | ([], ""), 41 | # ( 42 | # sorted(list(dir.glob("bindash_bottom_fixed*.dist")), key=key), 43 | # "BinDash bottom (fixed)", 44 | # ), 45 | ( 46 | sorted(list(dir.glob("simd_bucket_*b32.dist")), key=key), 47 | "SimdSketch bucket b=32", 48 | ), 49 | ( 50 | sorted(list(dir.glob("simd_bucket_*b16.dist")), key=key), 51 | "SimdSketch bucket b=16", 52 | ), 53 | (sorted(list(dir.glob("simd_bucket_*b8.dist")), key=key), "SimdSketch bucket b=8"), 54 | (sorted(list(dir.glob("simd_bucket_*b1.dist")), key=key), "SimdSketch bucket b=1"), 55 | ( 56 | sorted(list(dir.glob("bindash_bucket_*b32.dist")), key=key), 57 | "BinDash bucket b=32", 58 | ), 59 | ( 60 | sorted(list(dir.glob("bindash_bucket_*b16.dist")), key=key), 61 | "BinDash bucket b=16", 62 | ), 63 | (sorted(list(dir.glob("bindash_bucket_*b8.dist")), key=key), "BinDash bucket b=8"), 64 | (sorted(list(dir.glob("bindash_bucket_*b1.dist")), key=key), "BinDash bucket b=1"), 65 | ] 66 | 67 | 68 | def read(p): 69 | print("Reading", p) 70 | return [float(x) for x in Path(p).read_text().splitlines()] 71 | 72 | 73 | d0 = read(baseline) 74 | 75 | # Sample n random lines from each file 76 | indices = random.sample(range(len(d0)), n) 77 | 78 | 79 | # one subfigure for each group 80 | for i, (group, title) in enumerate(groups): 81 | print(*group) 82 | names = [Path(p).stem for p in group] 83 | dists = [read(p) for p in group] 84 | dists = [d for d in dists if len(d) == len(d0)] 85 | 86 | plt.subplot(3, 4, i + 1) 87 | for name, d in zip(names, dists): 88 | print(f"plotting len") 89 | # extract value of s from name, _s\d+_ 90 | c = correlation(d0, d) 91 | plt.scatter( 92 | [d0[idx] for idx in indices], 93 | [d[idx] for idx in indices], 94 | label=f"{c:.5f}", 95 | alpha=0.4, 96 | s=2, 97 | ) 98 | # plt.xlabel(baseline.stem) 99 | leg = plt.legend() 100 | for lh in leg.legend_handles: 101 | lh.set_alpha(1) 102 | lh.set_sizes([50] * 4) 103 | plt.title(title) 104 | eps = 0.0001 105 | plt.xlim(eps, 1) 106 | plt.ylim(eps, 1) 107 | plt.xticks([eps, 1]) 108 | plt.yticks([eps, 1]) 109 | plt.xscale("log") 110 | plt.yscale("log") 111 | plt.plot([0, 1], [0, 1], color="black", linestyle="-", lw=0.5) 112 | 113 | # Add legend under the plot mapping s to each colour. 114 | # s=128: blue 115 | # s=1024: orange 116 | # s=8192: green 117 | # s=65536: red 118 | 119 | # Build manual legend 120 | plt.subplot(4, 4, 16) 121 | plt.axis("off") 122 | # Red circle for legend 123 | 124 | handles = [ 125 | mpatches.Patch(color="blue", label="s = 128"), 126 | mpatches.Patch(color="orange", label="s = 1024"), 127 | mpatches.Patch(color="green", label="s = 8192"), 128 | mpatches.Patch(color="red", label="s = 32768"), 129 | mpatches.Patch(color="purple", label="s = 131072"), 130 | ] 131 | 132 | plt.figlegend( 133 | handles=handles, 134 | loc="lower center", 135 | ncol=5, 136 | labelspacing=0.0, 137 | bbox_to_anchor=(0.5, 0.05), 138 | ) 139 | 140 | # Set the figure size 141 | plt.gcf().set_size_inches(15, 10) 142 | 143 | plt.tight_layout() 144 | plt.savefig("plots/correlation2.png", dpi=300, bbox_inches="tight") 145 | # plt.show() 146 | -------------------------------------------------------------------------------- /py/stats: -------------------------------------------------------------------------------- 1 | BinDash bottom 1000 31 128 64 118.01 0.53 0.9567 2 | BinDash bottom 1000 31 1024 64 121.20 3.78 0.9911 3 | BinDash bottom 1000 31 8192 64 146.87 32.78 0.9946 4 | SimdSketch bottom 1000 31 128 32 3.7847664 0.24336351 0.9651 5 | SimdSketch bottom 1000 31 1024 32 3.8086712 1.9176451 0.9965 6 | SimdSketch bottom 1000 31 8192 32 4.179488 15.498947 0.9993 7 | SimdSketch bottom 1000 31 65536 32 7.062428 123.53417 1.0000 8 | BinDash bucket 1000 31 128 1 124.68 0.27 0.8996 9 | BinDash bucket 1000 31 128 8 124.68 0.25 0.9649 10 | BinDash bucket 1000 31 128 32 124.58 0.24 0.9650 11 | BinDash bucket 1000 31 1024 1 125.16 0.26 0.9864 12 | BinDash bucket 1000 31 1024 8 124.91 0.30 0.9959 13 | BinDash bucket 1000 31 1024 32 125.07 0.35 0.9959 14 | BinDash bucket 1000 31 8192 1 126.40 0.37 0.9978 15 | BinDash bucket 1000 31 8192 8 126.65 0.60 0.9994 16 | BinDash bucket 1000 31 8192 32 127.13 1.74 0.9994 17 | BinDash-rs bucket 1000 31 128 32 146.37083 0.24578631 0.9581 18 | BinDash-rs bucket 1000 31 1024 32 145.78084 0.35327774 0.9929 19 | BinDash-rs bucket 1000 31 8192 32 147.11873 1.9769586 0.9988 20 | SimdSketch bucket 1000 31 128 1 3.7470505 0.004349289 0.9238 21 | SimdSketch bucket 1000 31 128 8 3.8268027 0.009781611 0.9674 22 | SimdSketch bucket 1000 31 128 32 3.8080878 0.010694598 0.9675 23 | SimdSketch bucket 1000 31 1024 1 3.8242054 0.008696725 0.9860 24 | SimdSketch bucket 1000 31 1024 8 3.8207712 0.051460445 0.9944 25 | SimdSketch bucket 1000 31 1024 32 3.8409657 0.07785798 0.9943 26 | SimdSketch bucket 1000 31 8192 1 4.1340055 0.03768954 0.9976 27 | SimdSketch bucket 1000 31 8192 8 4.0700364 0.36048988 0.9994 28 | SimdSketch bucket 1000 31 8192 32 4.161414 0.92198783 0.9994 29 | SimdSketch bucket 1000 31 16384 1 4.416565 0.06706968 0.9990 30 | SimdSketch bucket 1000 31 32768 1 5.1179295 0.13072741 0.9995 31 | -------------------------------------------------------------------------------- /py/stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | from pathlib import Path 5 | 6 | file = Path("stats").read_text() 7 | 8 | data = [] 9 | for line in file.splitlines(): 10 | name, tp, n, k, s, b, sketch, dist, corr = line.split(" ") 11 | n = int(n) 12 | k = int(k) 13 | s = int(s) 14 | b = int(b) 15 | sketch = float(sketch) 16 | dist = float(dist) 17 | corr = float(corr) 18 | vals = [name, tp, n, k, s, b, sketch, dist, corr] 19 | if s == 65536: 20 | continue 21 | data.append(vals) 22 | 23 | # Make a dataframe from data 24 | df = pd.DataFrame( 25 | data, columns=["name", "type", "n", "k", "s", "b", "sketch", "dist", "corr"] 26 | ) 27 | 28 | # print(df) 29 | import tabulate 30 | 31 | # Drop 'n' column 32 | df2 = df.copy() 33 | df2 = df2.drop(columns=["n", "k"]) 34 | 35 | print( 36 | tabulate.tabulate( 37 | df2, 38 | headers=df2.columns, 39 | tablefmt="orgtbl", 40 | floatfmt=[""] * 4 + [".1f", "0.3f", "0.4f"], 41 | showindex=False, 42 | ) 43 | ) 44 | 45 | df["sketch_one"] = df["sketch"] / df["n"] 46 | df["dist_one"] = df["dist"] / (df["n"] * (df["n"] - 1) / 2) 47 | df["sketch_thp_MB"] = 2.0 / df["sketch_one"] 48 | df["dist_thp_M"] = 1 / df["dist_one"] / 1000000 49 | df["c"] = 1 - df["corr"] 50 | 51 | 52 | import matplotlib.pyplot as plt 53 | import seaborn as sns 54 | 55 | df["variant"] = df["name"] + " " + df["type"] 56 | 57 | colormap = { 58 | "SimdSketch bottom": "pink", 59 | "SimdSketch bucket": "red", 60 | # "SimdSketch bottom": "#ffc107", 61 | # "SimdSketch bucket": "#ffc107", 62 | "BinDash-rs bucket": "black", 63 | "BinDash bottom": "lightblue", 64 | "BinDash bucket": "blue", 65 | } 66 | 67 | # for k, g in df.groupby(["variant", "type", "b"]): 68 | # plt.plot(g["sketch_thp_MB"], g["c"], label=None, color=colormap[k[0]], lw=0.7) 69 | # for k, g in df.groupby(["variant", "type", "s"]): 70 | # plt.plot( 71 | # g["sketch_thp_MB"], g["c"], label=None, color=colormap[k[0]], lw=1, ls="--" 72 | # ) 73 | # sns.scatterplot( 74 | # data=df, 75 | # x="sketch_thp_MB", 76 | # y="c", 77 | # hue="variant", 78 | # size="s", 79 | # style="type", 80 | # sizes=[20, 80, 200], 81 | # palette=colormap, 82 | # ) 83 | # plt.gca().invert_yaxis() 84 | # plt.xscale("log") 85 | # plt.yscale("log") 86 | # # Logarithmic y scale towards 1 87 | # plt.xlabel("Sketch throughput (MB/s)") 88 | # plt.ylabel("1 - Correlation") 89 | # plt.title("Correlation vs sketch throughput") 90 | # plt.savefig("plots/sketching.svg", dpi=300, bbox_inches="tight") 91 | # # plt.show() 92 | 93 | for k, g in df.groupby(["variant", "type", "b"]): 94 | # Hide plot from legend. 95 | plt.plot(g["dist_thp_M"], g["c"], label="_x", color=colormap[k[0]], lw=0.7) 96 | for k, g in df.groupby(["variant", "type", "s"]): 97 | plt.plot(g["dist_thp_M"], g["c"], label="_x", color=colormap[k[0]], lw=1, ls="--") 98 | sns.scatterplot( 99 | data=df, 100 | x="dist_thp_M", 101 | y="c", 102 | hue="variant", 103 | size="s", 104 | style="type", 105 | sizes=[20, 80, 150, 200, 250], 106 | markers=["o", "*"], 107 | palette=colormap, 108 | ) 109 | plt.gca().invert_yaxis() 110 | plt.xscale("log") 111 | plt.yscale("log") 112 | plt.xlabel("Comparison throughput (M/s)") 113 | plt.ylabel("1-correlation") 114 | plt.title("Correlation vs comparison throughput") 115 | plt.gcf().set_size_inches(11, 4.5) 116 | plt.savefig("plots/comparison.svg", dpi=300, bbox_inches="tight") 117 | plt.close() 118 | -------------------------------------------------------------------------------- /src/intrinsics.rs: -------------------------------------------------------------------------------- 1 | use core::mem::transmute; 2 | use packed_seq::u32x8 as S; 3 | const L: usize = 8; 4 | 5 | /// Append subset of values indicated by `mask` to a vector. 6 | #[inline(always)] 7 | #[cfg(not(any(target_feature = "avx2", target_feature = "neon")))] 8 | pub unsafe fn append_from_mask(vals: S, mask: S, v: &mut [T], write_idx: &mut usize) { 9 | unsafe { 10 | let vals = vals.to_array(); 11 | for (val, m) in vals.iter().zip(mask.to_array().iter()) { 12 | if *m > 0 { 13 | v.as_mut_ptr().add(*write_idx).write(*val); 14 | *write_idx += 1; 15 | } 16 | } 17 | } 18 | } 19 | 20 | /// Append subset of values indicated by `mask` to a vector. 21 | #[inline(always)] 22 | #[cfg(target_feature = "avx2")] 23 | pub unsafe fn append_from_mask(vals: S, mask: S, v: &mut [T], write_idx: &mut usize) { 24 | unsafe { 25 | use core::arch::x86_64::*; 26 | 27 | let vals = transmute(vals); 28 | 29 | let m = _mm256_movemask_ps(transmute(!mask)) as usize; 30 | let numberofnewvalues = L - m.count_ones() as usize; 31 | let key = transmute(UNIQSHUF[m]); 32 | let val = _mm256_permutevar8x32_epi32(vals, key); 33 | _mm256_storeu_si256(v.as_mut_ptr().add(*write_idx) as *mut __m256i, val); 34 | *write_idx += numberofnewvalues; 35 | } 36 | } 37 | 38 | /// Append subset of values indicated by `mask` to a vector. 39 | #[inline(always)] 40 | #[cfg(target_feature = "neon")] 41 | pub unsafe fn append_from_mask(vals: S, mask: S, v: &mut [u32], write_idx: &mut usize) { 42 | unsafe { 43 | use core::arch::aarch64::{vpaddd_u64, vpaddlq_u32, vqtbl2q_u8, vst1_u32_x4}; 44 | use wide::u32x4; 45 | 46 | let (d1, d2): (u32x4, u32x4) = transmute(!mask); 47 | let pow1 = u32x4::new([1, 2, 4, 8]); 48 | let pow2 = u32x4::new([16, 32, 64, 128]); 49 | let m1 = vpaddd_u64(vpaddlq_u32(transmute(d1 & pow1))); 50 | let m2 = vpaddd_u64(vpaddlq_u32(transmute(d2 & pow2))); 51 | let m = (m1 | m2) as usize; 52 | 53 | let numberofnewvalues = L - m.count_ones() as usize; 54 | let key = UNIQSHUF[m]; 55 | let idx = key * S::splat(0x04_04_04_04) + S::splat(0x03_02_01_00); 56 | let (i1, i2) = transmute(idx); 57 | let t = transmute(vals); 58 | let r1 = vqtbl2q_u8(t, i1); 59 | let r2 = vqtbl2q_u8(t, i2); 60 | let val: S = transmute((r1, r2)); 61 | vst1_u32_x4(v.as_mut_ptr().add(*write_idx), transmute(val)); 62 | *write_idx += numberofnewvalues; 63 | } 64 | } 65 | 66 | /// For each of 256 masks of which elements are different than their predecessor, 67 | /// a shuffle that sends those new elements to the beginning. 68 | #[rustfmt::skip] 69 | const UNIQSHUF: [S; 256] = unsafe {transmute([ 70 | 0,1,2,3,4,5,6,7, 71 | 1,2,3,4,5,6,7,0, 72 | 0,2,3,4,5,6,7,0, 73 | 2,3,4,5,6,7,0,0, 74 | 0,1,3,4,5,6,7,0, 75 | 1,3,4,5,6,7,0,0, 76 | 0,3,4,5,6,7,0,0, 77 | 3,4,5,6,7,0,0,0, 78 | 0,1,2,4,5,6,7,0, 79 | 1,2,4,5,6,7,0,0, 80 | 0,2,4,5,6,7,0,0, 81 | 2,4,5,6,7,0,0,0, 82 | 0,1,4,5,6,7,0,0, 83 | 1,4,5,6,7,0,0,0, 84 | 0,4,5,6,7,0,0,0, 85 | 4,5,6,7,0,0,0,0, 86 | 0,1,2,3,5,6,7,0, 87 | 1,2,3,5,6,7,0,0, 88 | 0,2,3,5,6,7,0,0, 89 | 2,3,5,6,7,0,0,0, 90 | 0,1,3,5,6,7,0,0, 91 | 1,3,5,6,7,0,0,0, 92 | 0,3,5,6,7,0,0,0, 93 | 3,5,6,7,0,0,0,0, 94 | 0,1,2,5,6,7,0,0, 95 | 1,2,5,6,7,0,0,0, 96 | 0,2,5,6,7,0,0,0, 97 | 2,5,6,7,0,0,0,0, 98 | 0,1,5,6,7,0,0,0, 99 | 1,5,6,7,0,0,0,0, 100 | 0,5,6,7,0,0,0,0, 101 | 5,6,7,0,0,0,0,0, 102 | 0,1,2,3,4,6,7,0, 103 | 1,2,3,4,6,7,0,0, 104 | 0,2,3,4,6,7,0,0, 105 | 2,3,4,6,7,0,0,0, 106 | 0,1,3,4,6,7,0,0, 107 | 1,3,4,6,7,0,0,0, 108 | 0,3,4,6,7,0,0,0, 109 | 3,4,6,7,0,0,0,0, 110 | 0,1,2,4,6,7,0,0, 111 | 1,2,4,6,7,0,0,0, 112 | 0,2,4,6,7,0,0,0, 113 | 2,4,6,7,0,0,0,0, 114 | 0,1,4,6,7,0,0,0, 115 | 1,4,6,7,0,0,0,0, 116 | 0,4,6,7,0,0,0,0, 117 | 4,6,7,0,0,0,0,0, 118 | 0,1,2,3,6,7,0,0, 119 | 1,2,3,6,7,0,0,0, 120 | 0,2,3,6,7,0,0,0, 121 | 2,3,6,7,0,0,0,0, 122 | 0,1,3,6,7,0,0,0, 123 | 1,3,6,7,0,0,0,0, 124 | 0,3,6,7,0,0,0,0, 125 | 3,6,7,0,0,0,0,0, 126 | 0,1,2,6,7,0,0,0, 127 | 1,2,6,7,0,0,0,0, 128 | 0,2,6,7,0,0,0,0, 129 | 2,6,7,0,0,0,0,0, 130 | 0,1,6,7,0,0,0,0, 131 | 1,6,7,0,0,0,0,0, 132 | 0,6,7,0,0,0,0,0, 133 | 6,7,0,0,0,0,0,0, 134 | 0,1,2,3,4,5,7,0, 135 | 1,2,3,4,5,7,0,0, 136 | 0,2,3,4,5,7,0,0, 137 | 2,3,4,5,7,0,0,0, 138 | 0,1,3,4,5,7,0,0, 139 | 1,3,4,5,7,0,0,0, 140 | 0,3,4,5,7,0,0,0, 141 | 3,4,5,7,0,0,0,0, 142 | 0,1,2,4,5,7,0,0, 143 | 1,2,4,5,7,0,0,0, 144 | 0,2,4,5,7,0,0,0, 145 | 2,4,5,7,0,0,0,0, 146 | 0,1,4,5,7,0,0,0, 147 | 1,4,5,7,0,0,0,0, 148 | 0,4,5,7,0,0,0,0, 149 | 4,5,7,0,0,0,0,0, 150 | 0,1,2,3,5,7,0,0, 151 | 1,2,3,5,7,0,0,0, 152 | 0,2,3,5,7,0,0,0, 153 | 2,3,5,7,0,0,0,0, 154 | 0,1,3,5,7,0,0,0, 155 | 1,3,5,7,0,0,0,0, 156 | 0,3,5,7,0,0,0,0, 157 | 3,5,7,0,0,0,0,0, 158 | 0,1,2,5,7,0,0,0, 159 | 1,2,5,7,0,0,0,0, 160 | 0,2,5,7,0,0,0,0, 161 | 2,5,7,0,0,0,0,0, 162 | 0,1,5,7,0,0,0,0, 163 | 1,5,7,0,0,0,0,0, 164 | 0,5,7,0,0,0,0,0, 165 | 5,7,0,0,0,0,0,0, 166 | 0,1,2,3,4,7,0,0, 167 | 1,2,3,4,7,0,0,0, 168 | 0,2,3,4,7,0,0,0, 169 | 2,3,4,7,0,0,0,0, 170 | 0,1,3,4,7,0,0,0, 171 | 1,3,4,7,0,0,0,0, 172 | 0,3,4,7,0,0,0,0, 173 | 3,4,7,0,0,0,0,0, 174 | 0,1,2,4,7,0,0,0, 175 | 1,2,4,7,0,0,0,0, 176 | 0,2,4,7,0,0,0,0, 177 | 2,4,7,0,0,0,0,0, 178 | 0,1,4,7,0,0,0,0, 179 | 1,4,7,0,0,0,0,0, 180 | 0,4,7,0,0,0,0,0, 181 | 4,7,0,0,0,0,0,0, 182 | 0,1,2,3,7,0,0,0, 183 | 1,2,3,7,0,0,0,0, 184 | 0,2,3,7,0,0,0,0, 185 | 2,3,7,0,0,0,0,0, 186 | 0,1,3,7,0,0,0,0, 187 | 1,3,7,0,0,0,0,0, 188 | 0,3,7,0,0,0,0,0, 189 | 3,7,0,0,0,0,0,0, 190 | 0,1,2,7,0,0,0,0, 191 | 1,2,7,0,0,0,0,0, 192 | 0,2,7,0,0,0,0,0, 193 | 2,7,0,0,0,0,0,0, 194 | 0,1,7,0,0,0,0,0, 195 | 1,7,0,0,0,0,0,0, 196 | 0,7,0,0,0,0,0,0, 197 | 7,0,0,0,0,0,0,0, 198 | 0,1,2,3,4,5,6,0, 199 | 1,2,3,4,5,6,0,0, 200 | 0,2,3,4,5,6,0,0, 201 | 2,3,4,5,6,0,0,0, 202 | 0,1,3,4,5,6,0,0, 203 | 1,3,4,5,6,0,0,0, 204 | 0,3,4,5,6,0,0,0, 205 | 3,4,5,6,0,0,0,0, 206 | 0,1,2,4,5,6,0,0, 207 | 1,2,4,5,6,0,0,0, 208 | 0,2,4,5,6,0,0,0, 209 | 2,4,5,6,0,0,0,0, 210 | 0,1,4,5,6,0,0,0, 211 | 1,4,5,6,0,0,0,0, 212 | 0,4,5,6,0,0,0,0, 213 | 4,5,6,0,0,0,0,0, 214 | 0,1,2,3,5,6,0,0, 215 | 1,2,3,5,6,0,0,0, 216 | 0,2,3,5,6,0,0,0, 217 | 2,3,5,6,0,0,0,0, 218 | 0,1,3,5,6,0,0,0, 219 | 1,3,5,6,0,0,0,0, 220 | 0,3,5,6,0,0,0,0, 221 | 3,5,6,0,0,0,0,0, 222 | 0,1,2,5,6,0,0,0, 223 | 1,2,5,6,0,0,0,0, 224 | 0,2,5,6,0,0,0,0, 225 | 2,5,6,0,0,0,0,0, 226 | 0,1,5,6,0,0,0,0, 227 | 1,5,6,0,0,0,0,0, 228 | 0,5,6,0,0,0,0,0, 229 | 5,6,0,0,0,0,0,0, 230 | 0,1,2,3,4,6,0,0, 231 | 1,2,3,4,6,0,0,0, 232 | 0,2,3,4,6,0,0,0, 233 | 2,3,4,6,0,0,0,0, 234 | 0,1,3,4,6,0,0,0, 235 | 1,3,4,6,0,0,0,0, 236 | 0,3,4,6,0,0,0,0, 237 | 3,4,6,0,0,0,0,0, 238 | 0,1,2,4,6,0,0,0, 239 | 1,2,4,6,0,0,0,0, 240 | 0,2,4,6,0,0,0,0, 241 | 2,4,6,0,0,0,0,0, 242 | 0,1,4,6,0,0,0,0, 243 | 1,4,6,0,0,0,0,0, 244 | 0,4,6,0,0,0,0,0, 245 | 4,6,0,0,0,0,0,0, 246 | 0,1,2,3,6,0,0,0, 247 | 1,2,3,6,0,0,0,0, 248 | 0,2,3,6,0,0,0,0, 249 | 2,3,6,0,0,0,0,0, 250 | 0,1,3,6,0,0,0,0, 251 | 1,3,6,0,0,0,0,0, 252 | 0,3,6,0,0,0,0,0, 253 | 3,6,0,0,0,0,0,0, 254 | 0,1,2,6,0,0,0,0, 255 | 1,2,6,0,0,0,0,0, 256 | 0,2,6,0,0,0,0,0, 257 | 2,6,0,0,0,0,0,0, 258 | 0,1,6,0,0,0,0,0, 259 | 1,6,0,0,0,0,0,0, 260 | 0,6,0,0,0,0,0,0, 261 | 6,0,0,0,0,0,0,0, 262 | 0,1,2,3,4,5,0,0, 263 | 1,2,3,4,5,0,0,0, 264 | 0,2,3,4,5,0,0,0, 265 | 2,3,4,5,0,0,0,0, 266 | 0,1,3,4,5,0,0,0, 267 | 1,3,4,5,0,0,0,0, 268 | 0,3,4,5,0,0,0,0, 269 | 3,4,5,0,0,0,0,0, 270 | 0,1,2,4,5,0,0,0, 271 | 1,2,4,5,0,0,0,0, 272 | 0,2,4,5,0,0,0,0, 273 | 2,4,5,0,0,0,0,0, 274 | 0,1,4,5,0,0,0,0, 275 | 1,4,5,0,0,0,0,0, 276 | 0,4,5,0,0,0,0,0, 277 | 4,5,0,0,0,0,0,0, 278 | 0,1,2,3,5,0,0,0, 279 | 1,2,3,5,0,0,0,0, 280 | 0,2,3,5,0,0,0,0, 281 | 2,3,5,0,0,0,0,0, 282 | 0,1,3,5,0,0,0,0, 283 | 1,3,5,0,0,0,0,0, 284 | 0,3,5,0,0,0,0,0, 285 | 3,5,0,0,0,0,0,0, 286 | 0,1,2,5,0,0,0,0, 287 | 1,2,5,0,0,0,0,0, 288 | 0,2,5,0,0,0,0,0, 289 | 2,5,0,0,0,0,0,0, 290 | 0,1,5,0,0,0,0,0, 291 | 1,5,0,0,0,0,0,0, 292 | 0,5,0,0,0,0,0,0, 293 | 5,0,0,0,0,0,0,0, 294 | 0,1,2,3,4,0,0,0, 295 | 1,2,3,4,0,0,0,0, 296 | 0,2,3,4,0,0,0,0, 297 | 2,3,4,0,0,0,0,0, 298 | 0,1,3,4,0,0,0,0, 299 | 1,3,4,0,0,0,0,0, 300 | 0,3,4,0,0,0,0,0, 301 | 3,4,0,0,0,0,0,0, 302 | 0,1,2,4,0,0,0,0, 303 | 1,2,4,0,0,0,0,0, 304 | 0,2,4,0,0,0,0,0, 305 | 2,4,0,0,0,0,0,0, 306 | 0,1,4,0,0,0,0,0, 307 | 1,4,0,0,0,0,0,0, 308 | 0,4,0,0,0,0,0,0, 309 | 4,0,0,0,0,0,0,0, 310 | 0,1,2,3,0,0,0,0, 311 | 1,2,3,0,0,0,0,0, 312 | 0,2,3,0,0,0,0,0, 313 | 2,3,0,0,0,0,0,0, 314 | 0,1,3,0,0,0,0,0, 315 | 1,3,0,0,0,0,0,0, 316 | 0,3,0,0,0,0,0,0, 317 | 3,0,0,0,0,0,0,0, 318 | 0,1,2,0,0,0,0,0, 319 | 1,2,0,0,0,0,0,0, 320 | 0,2,0,0,0,0,0,0, 321 | 2,0,0,0,0,0,0,0, 322 | 0,1,0,0,0,0,0,0, 323 | 1,0,0,0,0,0,0,0, 324 | 0,0,0,0,0,0,0,0, 325 | 0,0,0,0,0,0,0,0, 326 | ])}; 327 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! # SimdSketch 2 | //! 3 | //! This library provides two types of sequence sketches: 4 | //! - the classic bottom-`s` sketch; 5 | //! - the newer bucket sketch, returning the smallest hash in each of `s` buckets. 6 | //! 7 | //! See the corresponding [blogpost](https://curiouscoding.nl/posts/simd-sketch/) for more background and an evaluation. 8 | //! 9 | //! ## Hash function 10 | //! All internal hashes are 32 bits. Either a forward-only hash or 11 | //! reverse-complement-aware (canonical) hash can be used. 12 | //! 13 | //! **TODO:** Current we use (canonical) ntHash. This causes some hash-collisions 14 | //! for `k <= 16`, [which could be avoided](https://curiouscoding.nl/posts/nthash/#is-nthash-injective-on-kmers). 15 | //! 16 | //! ## BucketSketch 17 | //! For classic bottom-sketch, evaluating the similarity is slow because a 18 | //! merge-sort must be done between the two lists. 19 | //! 20 | //! The bucket sketch solves this by partitioning the hashes into `s` partitions. 21 | //! Previous methods partition into ranges of size `u32::MAX/s`, but here we 22 | //! partition by remainder mod `s` instead. 23 | //! 24 | //! We find the smallest hash for each remainder as the sketch. 25 | //! To compute the similarity, we can simply use the hamming distance between 26 | //! two sketches, which is significantly faster. 27 | //! 28 | //! The bucket sketch similarity has a very strong one-to-one correlation with the classic bottom-sketch. 29 | //! 30 | //! **TODO:** A drawback of this method is that some buckets may remain empty 31 | //! when the input sequences are not long enough. In that case, _densification_ 32 | //! could be applied, but this is not currently implemented. If you need this, please reach out. 33 | //! Instead, we currently simply keep a bitvector indicating empty buckets. 34 | //! 35 | //! ## Jaccard similarity 36 | //! For the bottom sketch, we conceptually estimate similarity as follows: 37 | //! 1. Find the smallest `s` distinct k-mer hashes in the union of two sketches. 38 | //! 2. Return the fraction of these k-mers that occurs in both sketches. 39 | //! 40 | //! For the bucket sketch, we simply return the fraction of parts that have 41 | //! the same k-mer for both sequences (out of those that are not both empty). 42 | //! 43 | //! ## b-bit sketches 44 | //! 45 | //! Instead of storing the full 32-bit hashes, it is sufficient to only store the low bits of each hash. 46 | //! In practice, `b=8` is usually fine. 47 | //! When extra fast comparisons are needed, use `b=1` in combination with a 3 to 4x larger `s`. 48 | //! 49 | //! This causes around `1/2^b` matches because of collisions in the lower bits. 50 | //! We correct for this via `j = (j0 - 1/2^b) / (1 - 1/2^b)`. 51 | //! When the fraction of matches is less than `1/2^b`, this is negative, which we explicitly correct to `0`. 52 | //! 53 | //! ## Mash distance 54 | //! We compute the mash distance as `-log( 2*j / (1+j) ) / k`. 55 | //! This is always >=0, but can be as large as `inf` when `j=0` (as is the case for disjoint input sets). 56 | //! 57 | //! ## Usage 58 | //! 59 | //! The main entrypoint of this library is the [`Sketcher`] object. 60 | //! Construct it in either the forward or canonical variant, and give `k` and `s`. 61 | //! Then call either [`Sketcher::bottom_sketch`] or [`Sketcher::sketch`] on it, and use the 62 | //! `similarity` functions on the returned [`BottomSketch`] and [`BucketSketch`] objects. 63 | //! 64 | //! ``` 65 | //! use packed_seq::SeqVec; 66 | //! 67 | //! let sketcher = simd_sketch::SketchParams { 68 | //! alg: simd_sketch::SketchAlg::Bucket, 69 | //! rc: false, // Set to `true` for a canonical (reverse-complement-aware) hash. 70 | //! k: 31, // Hash 31-mers 71 | //! s: 8192, // Sample 8192 hashes 72 | //! b: 8, // Store the bottom 8 bits of each hash. 73 | //! filter_empty: true, // Explicitly filter out empty buckets for BucketSketch. 74 | //! }.build(); 75 | //! 76 | //! // Generate two random sequences of 2M characters. 77 | //! let n = 2_000_000; 78 | //! let seq1 = packed_seq::PackedSeqVec::random(n); 79 | //! let seq2 = packed_seq::PackedSeqVec::random(n); 80 | //! 81 | //! // Sketch using given algorithm: 82 | //! let sketch1: simd_sketch::Sketch = sketcher.sketch(seq1.as_slice()); 83 | //! let sketch2: simd_sketch::Sketch = sketcher.sketch(seq2.as_slice()); 84 | //! 85 | //! // Value between 0 and 1, estimating the fraction of shared k-mers. 86 | //! let j = sketch1.jaccard_similarity(&sketch2); 87 | //! assert!(0.0 <= j && j <= 1.0); 88 | //! 89 | //! let d = sketch1.mash_distance(&sketch2); 90 | //! assert!(0.0 <= d); 91 | //! ``` 92 | //! 93 | //! **TODO:** Currently there is no support yet for merging sketches, or for 94 | //! sketching multiple sequences into one sketch. It's not hard, I just need to find a good API. 95 | //! Please reach out if you're interested in this. 96 | //! 97 | //! **TODO:** If you would like a binary instead of a library, again, please reach out :) 98 | //! 99 | //! ## Implementation notes 100 | //! 101 | //! This library works by partitioning the input sequence into 8 chunks, 102 | //! and processing those in parallel using SIMD. 103 | //! This is based on the [`packed-seq`](../packed_seq/index.html) and [`simd-minimizers`](../simd_minimizers/index.html) crates. 104 | //! 105 | //! For bottom sketch, the largest hash should be around `target = u32::MAX * s / n` (ignoring duplicates). 106 | //! To ensure a branch-free algorithm, we first collect all hashes up to `bound = 1.5 * target`. 107 | //! Then we sort the collected hashes. If there are at least `s` left after deduplicating, we return the bottom `s`. 108 | //! Otherwise, we double the `1.5` multiplication factor and retry. This 109 | //! factor is cached to make the sketching of multiple genomes more efficient. 110 | //! 111 | //! For bucket sketch, we use the same approach, and increase the factor until we find a k-mer hash in every bucket. 112 | //! In expectation, this needs to collect a fraction around `log(n) * s / n` of hashes, rather than `s / n`. 113 | //! In practice this doesn't matter much, as the hashing of all input k-mers is the bottleneck, 114 | //! and the sorting of the small sample of k-mers is relatively fast. 115 | //! 116 | //! For bucket sketch we assign each element to its bucket via its remainder modulo `s`. 117 | //! We compute this efficiently using [fast-mod](https://github.com/lemire/fastmod/blob/master/include/fastmod.h). 118 | //! 119 | //! ## Performance 120 | //! 121 | //! The sketching throughput of this library is around 2 seconds for a 3GB human genome 122 | //! (once the scaling factor is large enough to avoid a second pass). 123 | //! That's typically a few times faster than parsing a Fasta file. 124 | //! 125 | //! [BinDash](https://github.com/zhaoxiaofei/bindash) instead takes 180s (90x 126 | //! more), when running on a single thread. 127 | //! 128 | //! Comparing sketches is relatively fast, but can become a bottleneck when there are many input sequences, 129 | //! since the number of comparisons grows quadratically. In this case, prefer bucket sketch. 130 | //! As an example, when sketching 5MB bacterial genomes using `s=10000`, each sketch takes 4ms. 131 | //! Comparing two sketches takes 1.6us. 132 | //! This starts to be the dominant factor when the number of input sequences is more than 5000. 133 | 134 | mod intrinsics; 135 | 136 | use std::sync::atomic::{AtomicUsize, Ordering::SeqCst}; 137 | 138 | use log::{debug, info}; 139 | use packed_seq::{u32x8, Seq}; 140 | use simd_minimizers::private::nthash::NtHasher; 141 | 142 | pub enum Sketch { 143 | BottomSketch(BottomSketch), 144 | BucketSketch(BucketSketch), 145 | } 146 | 147 | fn compute_mash_distance(j: f32, k: usize) -> f32 { 148 | assert!(j >= 0.0, "Jaccard similarity {j} should not be negative"); 149 | // See eq. 4 of mash paper. 150 | let mash_dist = -(2. * j / (1. + j)).ln() / k as f32; 151 | assert!( 152 | mash_dist >= 0.0, 153 | "Bad mash distance {mash_dist} for jaccard similarity {j}" 154 | ); 155 | // NOTE: Mash distance can be >1 when jaccard similarity is close to 0. 156 | // assert!( 157 | // mash_dist <= 1.0, 158 | // "Bad mash distance {mash_dist} for jaccard similarity {j}" 159 | // ); 160 | // Distance 0 is computed as -log(1) and becomes -0.0. 161 | // This maximum fixes that. 162 | mash_dist.max(0.0) 163 | } 164 | 165 | impl Sketch { 166 | pub fn jaccard_similarity(&self, other: &Self) -> f32 { 167 | match (self, other) { 168 | (Sketch::BottomSketch(a), Sketch::BottomSketch(b)) => a.jaccard_similarity(b), 169 | (Sketch::BucketSketch(a), Sketch::BucketSketch(b)) => a.jaccard_similarity(b), 170 | _ => panic!("Sketches are of different types!"), 171 | } 172 | } 173 | pub fn mash_distance(&self, other: &Self) -> f32 { 174 | let j = self.jaccard_similarity(other); 175 | let k = match self { 176 | Sketch::BottomSketch(sketch) => sketch.k, 177 | Sketch::BucketSketch(sketch) => sketch.k, 178 | }; 179 | compute_mash_distance(j, k) 180 | } 181 | } 182 | 183 | /// Store only the bottom b bits of each input value. 184 | pub enum BitSketch { 185 | B32(Vec), 186 | B16(Vec), 187 | B8(Vec), 188 | B1(Vec), 189 | } 190 | 191 | impl BitSketch { 192 | fn new(b: usize, vals: Vec) -> Self { 193 | match b { 194 | 32 => BitSketch::B32(vals), 195 | 16 => BitSketch::B16(vals.into_iter().map(|x| x as u16).collect()), 196 | 8 => BitSketch::B8(vals.into_iter().map(|x| x as u8).collect()), 197 | 1 => BitSketch::B1({ 198 | assert_eq!(vals.len() % 64, 0); 199 | vals.chunks_exact(64) 200 | .map(|xs| { 201 | xs.iter() 202 | .enumerate() 203 | .fold(0u64, |bits, (i, x)| bits | (((x & 1) as u64) << i)) 204 | }) 205 | .collect() 206 | }), 207 | _ => panic!("Unsupported bit width. Must be 1 or 8 or 16 or 32."), 208 | } 209 | } 210 | } 211 | 212 | /// A sketch containing the `s` smallest k-mer hashes. 213 | pub struct BottomSketch { 214 | rc: bool, 215 | k: usize, 216 | bottom: Vec, 217 | } 218 | 219 | impl BottomSketch { 220 | /// Compute the similarity between two `BottomSketch`es. 221 | pub fn jaccard_similarity(&self, other: &Self) -> f32 { 222 | assert_eq!(self.rc, other.rc); 223 | assert_eq!(self.k, other.k); 224 | let a = &self.bottom; 225 | let b = &other.bottom; 226 | assert_eq!(a.len(), b.len()); 227 | let mut intersection_size = 0; 228 | let mut union_size = 0; 229 | let mut i = 0; 230 | let mut j = 0; 231 | while union_size < a.len() { 232 | intersection_size += (a[i] == b[j]) as usize; 233 | let di = (a[i] <= b[j]) as usize; 234 | let dj = (a[i] >= b[j]) as usize; 235 | i += di; 236 | j += dj; 237 | union_size += 1; 238 | } 239 | 240 | return intersection_size as f32 / a.len() as f32; 241 | } 242 | 243 | pub fn mash_distance(&self, other: &Self) -> f32 { 244 | let j = self.jaccard_similarity(other); 245 | compute_mash_distance(j, self.k) 246 | } 247 | } 248 | 249 | /// A sketch containing the smallest k-mer hash for each remainder mod `s`. 250 | pub struct BucketSketch { 251 | rc: bool, 252 | k: usize, 253 | b: usize, 254 | buckets: BitSketch, 255 | /// Bit-vector indicating empty buckets, so the similarity score can be adjusted accordingly. 256 | empty: Vec, 257 | } 258 | 259 | impl BucketSketch { 260 | /// Compute the similarity between two `BucketSketch`es. 261 | pub fn jaccard_similarity(&self, other: &Self) -> f32 { 262 | assert_eq!(self.rc, other.rc); 263 | assert_eq!(self.k, other.k); 264 | assert_eq!(self.b, other.b); 265 | let both_empty = self.both_empty(other); 266 | if both_empty > 0 { 267 | info!("Both empty: {}", both_empty); 268 | } 269 | match (&self.buckets, &other.buckets) { 270 | (BitSketch::B32(a), BitSketch::B32(b)) => Self::inner_similarity(a, b, both_empty), 271 | (BitSketch::B16(a), BitSketch::B16(b)) => Self::inner_similarity(a, b, both_empty), 272 | (BitSketch::B8(a), BitSketch::B8(b)) => Self::inner_similarity(a, b, both_empty), 273 | (BitSketch::B1(a), BitSketch::B1(b)) => Self::b1_similarity(a, b, both_empty), 274 | _ => panic!("Bit width mismatch"), 275 | } 276 | } 277 | 278 | pub fn mash_distance(&self, other: &Self) -> f32 { 279 | let j = self.jaccard_similarity(other); 280 | compute_mash_distance(j, self.k) 281 | } 282 | 283 | fn inner_similarity(a: &Vec, b: &Vec, both_empty: usize) -> f32 { 284 | assert_eq!(a.len(), b.len()); 285 | let f = 1.0 286 | - std::iter::zip(a, b) 287 | .map(|(a, b)| (a != b) as u32) 288 | .sum::() as f32 289 | / (a.len() - both_empty) as f32; 290 | // Correction for accidental matches. 291 | let bb = (1usize << (size_of::() * 8)) as f32; 292 | 293 | // Correction for accidental matches. 294 | // Take a max with 0 to avoid correcting into a negative jaccard similarity 295 | // for uncorrelated sketches. 296 | (bb * f - 1.0).max(0.0) / (bb - 1.0) 297 | } 298 | 299 | fn b1_similarity(a: &Vec, b: &Vec, both_empty: usize) -> f32 { 300 | assert_eq!(a.len(), b.len()); 301 | let f = 1.0 302 | - std::iter::zip(a, b) 303 | .map(|(a, b)| (*a ^ *b).count_ones()) 304 | .sum::() as f32 305 | / (64 * a.len() - both_empty) as f32; 306 | 307 | // Correction for accidental matches. 308 | // Take a max with 0 to avoid correcting into a negative jaccard similarity 309 | // for uncorrelated sketches. 310 | (2. * f - 1.).max(0.0) 311 | } 312 | 313 | fn both_empty(&self, other: &Self) -> usize { 314 | std::iter::zip(&self.empty, &other.empty) 315 | .map(|(a, b)| (a & b).count_ones()) 316 | .sum::() as usize 317 | } 318 | } 319 | 320 | #[derive(clap::ValueEnum, Clone, Copy, Debug)] 321 | pub enum SketchAlg { 322 | Bottom, 323 | Bucket, 324 | } 325 | 326 | #[derive(clap::Args, Copy, Clone, Debug)] 327 | pub struct SketchParams { 328 | /// Sketch algorithm to use. Defaults to bucket because of its much faster comparisons. 329 | #[arg(long, default_value_t = SketchAlg::Bucket)] 330 | #[arg(value_enum)] 331 | pub alg: SketchAlg, 332 | /// When set, use forward instead of canonical k-mer hashes. 333 | #[arg( 334 | long="fwd", 335 | num_args(0), 336 | action = clap::builder::ArgAction::Set, 337 | default_value = "false", 338 | default_missing_value = "true", 339 | )] 340 | pub rc: bool, 341 | /// k-mer size. 342 | #[arg(short, default_value_t = 31)] 343 | pub k: usize, 344 | /// Bottom-s sketch, or number of buckets. 345 | #[arg(short, default_value_t = 10000)] 346 | pub s: usize, 347 | /// For bucket-sketch, store only the lower b bits. 348 | #[arg(short, default_value_t = 8)] 349 | pub b: usize, 350 | /// For bucket-sketch, store a bitmask of empty buckets, to increase accuracy on small genomes. 351 | #[arg(skip = true)] 352 | pub filter_empty: bool, 353 | } 354 | 355 | /// An object containing the sketch parameters. 356 | /// 357 | /// Contains internal state to optimize the implementation when sketching multiple similar sequences. 358 | pub struct Sketcher { 359 | params: SketchParams, 360 | factor: AtomicUsize, 361 | } 362 | 363 | impl SketchParams { 364 | pub fn build(&self) -> Sketcher { 365 | Sketcher { 366 | params: *self, 367 | factor: 2.into(), 368 | } 369 | } 370 | 371 | /// Default sketcher that very fast at comparisons, but 20% slower at sketching. 372 | /// Use for >= 50000 seqs, and safe default when input sequences are > 500'000 characters. 373 | /// 374 | /// When sequences are < 100'000 characters, inaccuracies may occur due to empty buckets. 375 | pub fn default(k: usize) -> Self { 376 | SketchParams { 377 | alg: SketchAlg::Bucket, 378 | rc: true, 379 | k, 380 | s: 32768, 381 | b: 1, 382 | filter_empty: true, 383 | } 384 | } 385 | 386 | /// Default sketcher that is fast at sketching, but somewhat slower at comparisons. 387 | /// Use for <= 5000 seqs, or when input sequences are < 100'000 characters. 388 | pub fn default_fast_sketching(k: usize) -> Self { 389 | SketchParams { 390 | alg: SketchAlg::Bucket, 391 | rc: true, 392 | k, 393 | s: 8192, 394 | b: 8, 395 | filter_empty: false, 396 | } 397 | } 398 | } 399 | 400 | impl Sketcher { 401 | /// Sketch a single sequence. 402 | pub fn sketch<'s, S: Seq<'s>>(&self, seq: S) -> Sketch { 403 | self.sketch_seqs(&[seq]) 404 | } 405 | 406 | /// Sketch multiple sequence (fasta records) into a single sketch. 407 | pub fn sketch_seqs<'s, S: Seq<'s>>(&self, seqs: &[S]) -> Sketch { 408 | match self.params.alg { 409 | SketchAlg::Bottom => Sketch::BottomSketch(self.bottom_sketch(seqs)), 410 | SketchAlg::Bucket => Sketch::BucketSketch(self.bucket_sketch(seqs)), 411 | } 412 | } 413 | 414 | fn num_kmers<'s, S: Seq<'s>>(&self, seqs: &[S]) -> usize { 415 | seqs.iter() 416 | .map(|seq| seq.len() - self.params.k + 1) 417 | .sum::() 418 | } 419 | 420 | /// Return the `s` smallest `u32` k-mer hashes. 421 | /// Prefer [`Sketcher::sketch`] instead, which is much faster and just as 422 | /// accurate when input sequences are not too short. 423 | fn bottom_sketch<'s, S: Seq<'s>>(&self, seqs: &[S]) -> BottomSketch { 424 | // Iterate all kmers and compute 32bit nthashes. 425 | let n = self.num_kmers(seqs); 426 | let mut out = vec![]; 427 | loop { 428 | let target = u32::MAX as usize / n * self.params.s; 429 | let bound = 430 | (target.saturating_mul(self.factor.load(SeqCst))).min(u32::MAX as usize) as u32; 431 | 432 | self.collect_up_to_bound(seqs, bound, &mut out); 433 | 434 | if bound == u32::MAX || out.len() >= self.params.s { 435 | out.sort_unstable(); 436 | out.dedup(); 437 | if bound == u32::MAX || out.len() >= self.params.s { 438 | out.resize(self.params.s, u32::MAX); 439 | 440 | break BottomSketch { 441 | rc: self.params.rc, 442 | k: self.params.k, 443 | bottom: out, 444 | }; 445 | } 446 | } 447 | self.factor 448 | .fetch_add((self.factor.load(SeqCst) + 1) / 2, SeqCst); 449 | debug!("Increase factor to {}", self.factor.load(SeqCst)); 450 | } 451 | } 452 | 453 | /// s-buckets sketch. Splits the hashes into `s` buckets and returns the smallest hash per bucket. 454 | /// Buckets are determined via the remainder mod `s`. 455 | fn bucket_sketch<'s, S: Seq<'s>>(&self, seqs: &[S]) -> BucketSketch { 456 | // Iterate all kmers and compute 32bit nthashes. 457 | let n = self.num_kmers(seqs); 458 | let mut out = vec![]; 459 | let mut buckets = vec![u32::MAX; self.params.s]; 460 | loop { 461 | let target = u32::MAX as usize / n * self.params.s; 462 | let bound = 463 | (target.saturating_mul(self.factor.load(SeqCst))).min(u32::MAX as usize) as u32; 464 | 465 | self.collect_up_to_bound(seqs, bound, &mut out); 466 | 467 | if bound == u32::MAX || out.len() >= self.params.s { 468 | let m = FM32::new(self.params.s as u32); 469 | for &hash in &out { 470 | let bucket = m.fastmod(hash); 471 | buckets[bucket] = buckets[bucket].min(hash); 472 | } 473 | let mut empty = 0; 474 | for &x in &buckets { 475 | if x == u32::MAX { 476 | empty += 1; 477 | } 478 | } 479 | if bound == u32::MAX || empty == 0 { 480 | if empty > 0 { 481 | info!("Found {empty} empty buckets."); 482 | } 483 | let empty = if empty > 0 && self.params.filter_empty { 484 | info!("Found {empty} empty buckets. Storing bitmask."); 485 | buckets 486 | .chunks(64) 487 | .map(|xs| { 488 | xs.iter().enumerate().fold(0u64, |bits, (i, x)| { 489 | bits | (((*x == u32::MAX) as u64) << i) 490 | }) 491 | }) 492 | .collect() 493 | } else { 494 | vec![] 495 | }; 496 | 497 | break BucketSketch { 498 | rc: self.params.rc, 499 | k: self.params.k, 500 | b: self.params.b, 501 | empty, 502 | buckets: BitSketch::new( 503 | self.params.b, 504 | buckets.into_iter().map(|x| m.fastdiv(x) as u32).collect(), 505 | ), 506 | }; 507 | } 508 | } 509 | self.factor 510 | .fetch_add((self.factor.load(SeqCst) + 1) / 2, SeqCst); 511 | debug!("Increase factor to {}", self.factor.load(SeqCst)); 512 | } 513 | } 514 | 515 | fn collect_up_to_bound<'s, S: Seq<'s>>(&self, seqs: &[S], bound: u32, out: &mut Vec) { 516 | if self.params.rc { 517 | collect_up_to_bound_generic::(seqs, self.params.k, bound, out); 518 | } else { 519 | collect_up_to_bound_generic::(seqs, self.params.k, bound, out); 520 | } 521 | } 522 | } 523 | 524 | fn collect_up_to_bound_generic<'s, const RC: bool, S: Seq<'s>>( 525 | seqs: &[S], 526 | k: usize, 527 | bound: u32, 528 | out: &mut Vec, 529 | ) { 530 | let simd_bound = u32x8::splat(bound); 531 | out.clear(); 532 | 533 | for &seq in seqs { 534 | let (hashes_head, hashes_tail) = 535 | simd_minimizers::private::nthash::nthash_seq_simd::(seq, k, 1); 536 | 537 | let mut write_idx = out.len(); 538 | for hashes in hashes_head { 539 | let mask = hashes.cmp_lt(simd_bound); 540 | if write_idx + 8 >= out.len() { 541 | out.resize(write_idx * 3 / 2 + 8, 0); 542 | } 543 | unsafe { intrinsics::append_from_mask(hashes, mask, out, &mut write_idx) }; 544 | } 545 | 546 | out.resize(write_idx, 0); 547 | 548 | for hash in hashes_tail { 549 | if hash <= bound { 550 | out.push(hash); 551 | } 552 | } 553 | } 554 | } 555 | 556 | /// FastMod32, using the low 32 bits of the hash. 557 | /// Taken from https://github.com/lemire/fastmod/blob/master/include/fastmod.h 558 | #[derive(Copy, Clone, Debug)] 559 | struct FM32 { 560 | d: u64, 561 | m: u64, 562 | } 563 | impl FM32 { 564 | fn new(d: u32) -> Self { 565 | Self { 566 | d: d as u64, 567 | m: u64::MAX / d as u64 + 1, 568 | } 569 | } 570 | fn fastmod(self, h: u32) -> usize { 571 | let lowbits = self.m.wrapping_mul(h as u64); 572 | ((lowbits as u128 * self.d as u128) >> 64) as usize 573 | } 574 | fn fastdiv(self, h: u32) -> usize { 575 | ((self.m as u128 * h as u128) >> 64) as u32 as usize 576 | } 577 | } 578 | 579 | #[cfg(test)] 580 | mod test { 581 | use super::*; 582 | use packed_seq::SeqVec; 583 | 584 | #[test] 585 | fn test() { 586 | use packed_seq::SeqVec; 587 | let b = 16; 588 | 589 | let k = 31; 590 | for n in 31..100 { 591 | let s = n - k + 1; 592 | let seq = packed_seq::PackedSeqVec::random(n); 593 | let sketcher = crate::SketchParams { 594 | alg: SketchAlg::Bottom, 595 | rc: false, 596 | k, 597 | s, 598 | b, 599 | filter_empty: false, 600 | } 601 | .build(); 602 | let bottom = sketcher.bottom_sketch(&[seq.as_slice()]).bottom; 603 | assert_eq!(bottom.len(), s); 604 | assert!(bottom.is_sorted()); 605 | 606 | let s = s.min(10); 607 | let seq = packed_seq::PackedSeqVec::random(n); 608 | let sketcher = crate::SketchParams { 609 | alg: SketchAlg::Bottom, 610 | rc: true, 611 | k, 612 | s, 613 | b, 614 | filter_empty: false, 615 | } 616 | .build(); 617 | let bottom = sketcher.bottom_sketch(&[seq.as_slice()]).bottom; 618 | assert_eq!(bottom.len(), s); 619 | assert!(bottom.is_sorted()); 620 | } 621 | } 622 | 623 | #[test] 624 | fn rc() { 625 | use packed_seq::SeqVec; 626 | 627 | let b = 32; 628 | for k in (0..10).map(|_| rand::random_range(1..100)) { 629 | for n in (0..10).map(|_| rand::random_range(k..1000)) { 630 | for s in (0..10).map(|_| rand::random_range(0..n - k + 1)) { 631 | let seq = packed_seq::AsciiSeqVec::random(n); 632 | let sketcher = crate::SketchParams { 633 | alg: SketchAlg::Bottom, 634 | rc: true, 635 | k, 636 | s, 637 | b, 638 | filter_empty: false, 639 | } 640 | .build(); 641 | let bottom = sketcher.bottom_sketch(&[seq.as_slice()]).bottom; 642 | assert_eq!(bottom.len(), s); 643 | assert!(bottom.is_sorted()); 644 | 645 | let seq_rc = packed_seq::AsciiSeqVec::from_ascii( 646 | &seq.seq 647 | .iter() 648 | .rev() 649 | .map(|c| packed_seq::complement_char(*c)) 650 | .collect::>(), 651 | ); 652 | 653 | let bottom_rc = sketcher.bottom_sketch(&[seq_rc.as_slice()]).bottom; 654 | assert_eq!(bottom, bottom_rc); 655 | } 656 | } 657 | } 658 | } 659 | 660 | #[test] 661 | fn equal_dist() { 662 | let s = 1000; 663 | let k = 10; 664 | let n = 300; 665 | let b = 8; 666 | let seq = packed_seq::AsciiSeqVec::random(n); 667 | 668 | for (alg, filter_empty) in [ 669 | (SketchAlg::Bottom, false), 670 | (SketchAlg::Bucket, false), 671 | (SketchAlg::Bucket, true), 672 | ] { 673 | let sketcher = crate::SketchParams { 674 | alg, 675 | rc: false, 676 | k, 677 | s, 678 | b, 679 | filter_empty, 680 | } 681 | .build(); 682 | let sketch = sketcher.sketch(seq.as_slice()); 683 | assert_eq!(sketch.mash_distance(&sketch), 0.0); 684 | } 685 | } 686 | 687 | #[test] 688 | fn fuzz_short() { 689 | let s = 1024; 690 | let k = 10; 691 | for b in [1, 8, 16, 32] { 692 | for n in [10, 20, 40, 80, 150, 300, 500, 1000, 2000] { 693 | let seq1 = packed_seq::AsciiSeqVec::random(n); 694 | let seq2 = packed_seq::AsciiSeqVec::random(n); 695 | 696 | for (alg, filter_empty) in [ 697 | (SketchAlg::Bottom, false), 698 | (SketchAlg::Bucket, false), 699 | (SketchAlg::Bucket, true), 700 | ] { 701 | let sketcher = crate::SketchParams { 702 | alg, 703 | rc: false, 704 | k, 705 | s, 706 | b, 707 | filter_empty, 708 | } 709 | .build(); 710 | let s1 = sketcher.sketch(seq1.as_slice()); 711 | let s2 = sketcher.sketch(seq2.as_slice()); 712 | s1.mash_distance(&s2); 713 | } 714 | } 715 | } 716 | } 717 | } 718 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use clap::Parser; 4 | use indicatif::ParallelProgressIterator; 5 | use itertools::Itertools; 6 | use log::info; 7 | use packed_seq::{PackedSeqVec, SeqVec}; 8 | use rayon::iter::{IntoParallelIterator, IntoParallelRefIterator, ParallelIterator}; 9 | use simd_sketch::SketchParams; 10 | 11 | /// Compute the sketch distance between two fasta files. 12 | #[derive(clap::Parser)] 13 | struct Args { 14 | #[command(subcommand)] 15 | command: Command, 16 | } 17 | 18 | /// TODO: Support for writing sketches to disk. 19 | #[derive(clap::Subcommand)] 20 | enum Command { 21 | /// Compute the distance between two sequences. 22 | Dist { 23 | #[command(flatten)] 24 | params: SketchParams, 25 | /// First input fasta file. 26 | path_a: PathBuf, 27 | /// Second input fasta file. 28 | path_b: PathBuf, 29 | }, 30 | /// Takes paths to fasta files, and outputs a Phylip distance matrix to stdout. 31 | Triangle { 32 | #[command(flatten)] 33 | params: SketchParams, 34 | /// Paths to (directories of) (gzipped) fasta files. 35 | paths: Vec, 36 | /// Write phylip distance matrix here, or default to stdout. 37 | #[arg(long)] 38 | output: Option, 39 | }, 40 | } 41 | 42 | fn main() { 43 | env_logger::init(); 44 | 45 | let args = Args::parse(); 46 | 47 | let (params, paths) = match &args.command { 48 | Command::Dist { 49 | params, 50 | path_a, 51 | path_b, 52 | } => (params, vec![path_a.clone(), path_b.clone()]), 53 | Command::Triangle { params, paths, .. } => (params, collect_paths(&paths)), 54 | }; 55 | 56 | let q = paths.len(); 57 | 58 | let sketcher = params.build(); 59 | 60 | let style = indicatif::ProgressStyle::with_template( 61 | "{msg:.bold} [{elapsed_precise:.cyan}] {bar} {pos}/{len} ({percent:>3}%)", 62 | ) 63 | .unwrap() 64 | .progress_chars("##-"); 65 | 66 | let start = std::time::Instant::now(); 67 | 68 | let sketches: Vec<_> = paths 69 | .par_iter() 70 | .progress_with_style(style.clone()) 71 | .with_message("Sketching") 72 | .with_finish(indicatif::ProgressFinish::AndLeave) 73 | .map(|path| { 74 | let mut seqs = vec![]; 75 | let mut reader = needletail::parse_fastx_file(&path).unwrap(); 76 | while let Some(r) = reader.next() { 77 | seqs.push(PackedSeqVec::from_ascii(&r.unwrap().seq())); 78 | } 79 | let slices = seqs.iter().map(|s| s.as_slice()).collect_vec(); 80 | sketcher.sketch_seqs(&slices) 81 | }) 82 | .collect(); 83 | let t_sketch = start.elapsed(); 84 | 85 | info!( 86 | "Sketching {q} seqs took {t_sketch:?} ({:?} avg)", 87 | t_sketch / q as u32 88 | ); 89 | 90 | let num_pairs = q * (q - 1) / 2; 91 | let mut pairs = Vec::with_capacity(num_pairs); 92 | for i in 0..q { 93 | for j in 0..i { 94 | pairs.push((i, j)); 95 | } 96 | } 97 | let start = std::time::Instant::now(); 98 | let dists: Vec<_> = pairs 99 | .into_par_iter() 100 | .progress_with_style(style.clone()) 101 | .with_message("Distances") 102 | .with_finish(indicatif::ProgressFinish::AndLeave) 103 | .map(|(i, j)| sketches[i].mash_distance(&sketches[j])) 104 | .collect(); 105 | let t_dist = start.elapsed(); 106 | 107 | let cnt = q * (q - 1) / 2; 108 | info!( 109 | "Computing {cnt} dists took {t_dist:?} ({:?} avg)", 110 | t_dist / cnt.max(1) as u32 111 | ); 112 | 113 | match &args.command { 114 | Command::Dist { .. } => { 115 | println!("Distance: {:.4}", dists[0]); 116 | return; 117 | } 118 | Command::Triangle { output, .. } => { 119 | use std::io::Write; 120 | 121 | // Output Phylip triangle format. 122 | let mut out = Vec::new(); 123 | writeln!(out, "{q}").unwrap(); 124 | let mut d = dists.iter(); 125 | for i in 0..q { 126 | write!(out, "{}", paths[i].to_str().unwrap()).unwrap(); 127 | for _ in 0..i { 128 | write!(out, "\t{:.7}", d.next().unwrap()).unwrap(); 129 | } 130 | writeln!(out).unwrap(); 131 | } 132 | 133 | match output { 134 | Some(output) => std::fs::write(output, out).unwrap(), 135 | None => println!("{}", str::from_utf8(&out).unwrap()), 136 | } 137 | } 138 | } 139 | } 140 | 141 | fn collect_paths(paths: &Vec) -> Vec { 142 | let mut res = vec![]; 143 | for path in paths { 144 | if path.is_dir() { 145 | res.extend(path.read_dir().unwrap().map(|entry| entry.unwrap().path())); 146 | } else { 147 | res.push(path.clone()); 148 | } 149 | } 150 | res.sort(); 151 | res 152 | } 153 | --------------------------------------------------------------------------------