├── .cargo └── config.toml ├── .github └── workflows │ └── rust.yml ├── .gitignore ├── CHANGES.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── analyze_indel_errors.md ├── cli-tests.sh ├── lua-api.md ├── plot.py ├── scripts ├── analyze_indel_errors.py └── count-errors.py ├── src ├── combine_counts.rs ├── combine_errors.rs ├── files.rs ├── fraguracy.rs ├── homopolymer.rs ├── lua.rs ├── main.rs └── plot.rs └── test-data ├── a.errors.bed └── b.errors.bed /.cargo/config.toml: -------------------------------------------------------------------------------- 1 | # Cargo configuration for faster compilation 2 | 3 | # Use all available CPU cores for compilation 4 | 5 | # Enable experimental parallel frontend (Rust 1.70+) 6 | linker = "clang" 7 | rustflags = [ 8 | "-C", "target-cpu=native", # Optimize for your specific CPU 9 | "-C", "link-arg=-fuse-ld=lld", # Use the LLD linker for faster linking 10 | ] 11 | 12 | # Use faster linker if available 13 | [target.x86_64-unknown-linux-gnu] 14 | linker = "clang" 15 | rustflags = [ 16 | "-C", "link-arg=-fuse-ld=lld", # Use the LLD linker for faster linking 17 | "-C", "target-cpu=native", 18 | ] 19 | 20 | # Enable pipelined compilation (experimental) 21 | [unstable] 22 | # Uncomment if using nightly Rust 23 | # pipelined-compilation = true 24 | 25 | [env] 26 | # Increase parallel rustc processes 27 | CARGO_BUILD_RUSTC_OPTS = "-C codegen-units=16" 28 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Build 20 | run: cargo build --verbose 21 | - name: Run tests 22 | run: cargo test --verbose 23 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | *.bed.gz 3 | *-counts.txt 4 | *.bam 5 | *.bam.bai 6 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | # v0.2.7 2 | + indels: report length and bq-bin and add scripts/analyze-indel-errors.py to plot indel error rate 3 | + combine_counts: better error messages and handle NA 4 | 5 | # v0.2.6 6 | + report cases where neither base matches the reference as NN:1 when --reference-as-truth is passed. 7 | + add lua expressions to filter reads 8 | 9 | # v0.2.5 10 | + respect include and exclude for indels and denominator calculation (thanks Jason for reporting) 11 | 12 | # v0.2.4 13 | + Allow chromosomes longer than u8::MAX (#11 thanks @pontushojer for reporting) 14 | + Fix: when an include region was given and a non-seen chromosome was queried, it would return all intervals in that chromosome (#10 thanks Jason) 15 | 16 | # v0.2.3 17 | 18 | + Add --chromosome option to restrict analysis to a single chromosome. 19 | + fix counts of indel errors 20 | + fix distance to homopolymer (really) 21 | 22 | 23 | # v0.2.2 24 | 25 | + Fix distance to homopolymer 26 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 4 4 | 5 | [[package]] 6 | name = "adler2" 7 | version = "2.0.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" 10 | 11 | [[package]] 12 | name = "aho-corasick" 13 | version = "1.1.3" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 16 | dependencies = [ 17 | "memchr", 18 | ] 19 | 20 | [[package]] 21 | name = "anstream" 22 | version = "0.6.17" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338" 25 | dependencies = [ 26 | "anstyle", 27 | "anstyle-parse", 28 | "anstyle-query", 29 | "anstyle-wincon", 30 | "colorchoice", 31 | "is_terminal_polyfill", 32 | "utf8parse", 33 | ] 34 | 35 | [[package]] 36 | name = "anstyle" 37 | version = "1.0.9" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56" 40 | 41 | [[package]] 42 | name = "anstyle-parse" 43 | version = "0.2.6" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" 46 | dependencies = [ 47 | "utf8parse", 48 | ] 49 | 50 | [[package]] 51 | name = "anstyle-query" 52 | version = "1.1.2" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" 55 | dependencies = [ 56 | "windows-sys 0.59.0", 57 | ] 58 | 59 | [[package]] 60 | name = "anstyle-wincon" 61 | version = "3.0.6" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" 64 | dependencies = [ 65 | "anstyle", 66 | "windows-sys 0.59.0", 67 | ] 68 | 69 | [[package]] 70 | 
name = "anyhow" 71 | version = "1.0.98" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" 74 | 75 | [[package]] 76 | name = "autocfg" 77 | version = "1.4.0" 78 | source = "registry+https://github.com/rust-lang/crates.io-index" 79 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 80 | 81 | [[package]] 82 | name = "bindgen" 83 | version = "0.69.5" 84 | source = "registry+https://github.com/rust-lang/crates.io-index" 85 | checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" 86 | dependencies = [ 87 | "bitflags", 88 | "cexpr", 89 | "clang-sys", 90 | "itertools", 91 | "lazy_static", 92 | "lazycell", 93 | "proc-macro2", 94 | "quote", 95 | "regex", 96 | "rustc-hash 1.1.0", 97 | "shlex", 98 | "syn", 99 | ] 100 | 101 | [[package]] 102 | name = "bio-types" 103 | version = "1.0.4" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "f4dcf54f8b7f51450207d54780bab09c05f30b8b0caa991545082842e466ad7e" 106 | dependencies = [ 107 | "derive-new 0.6.0", 108 | "lazy_static", 109 | "regex", 110 | "strum_macros", 111 | "thiserror 1.0.64", 112 | ] 113 | 114 | [[package]] 115 | name = "bitflags" 116 | version = "2.6.0" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" 119 | 120 | [[package]] 121 | name = "bpci" 122 | version = "0.1.0" 123 | source = "registry+https://github.com/rust-lang/crates.io-index" 124 | checksum = "552c3faebbf05f0aebf152b9c90d76732ffac8bfead18786aa9ce3c5aae706a7" 125 | dependencies = [ 126 | "num-traits", 127 | "thiserror 1.0.64", 128 | ] 129 | 130 | [[package]] 131 | name = "bstr" 132 | version = "1.12.0" 133 | source = "registry+https://github.com/rust-lang/crates.io-index" 134 | checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" 135 | 
dependencies = [ 136 | "memchr", 137 | "serde", 138 | ] 139 | 140 | [[package]] 141 | name = "byteorder" 142 | version = "1.5.0" 143 | source = "registry+https://github.com/rust-lang/crates.io-index" 144 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" 145 | 146 | [[package]] 147 | name = "bzip2-sys" 148 | version = "0.1.11+1.0.8" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" 151 | dependencies = [ 152 | "cc", 153 | "libc", 154 | "pkg-config", 155 | ] 156 | 157 | [[package]] 158 | name = "cc" 159 | version = "1.2.19" 160 | source = "registry+https://github.com/rust-lang/crates.io-index" 161 | checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362" 162 | dependencies = [ 163 | "jobserver", 164 | "libc", 165 | "shlex", 166 | ] 167 | 168 | [[package]] 169 | name = "cexpr" 170 | version = "0.6.0" 171 | source = "registry+https://github.com/rust-lang/crates.io-index" 172 | checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" 173 | dependencies = [ 174 | "nom", 175 | ] 176 | 177 | [[package]] 178 | name = "cfg-if" 179 | version = "1.0.0" 180 | source = "registry+https://github.com/rust-lang/crates.io-index" 181 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 182 | 183 | [[package]] 184 | name = "clang-sys" 185 | version = "1.8.1" 186 | source = "registry+https://github.com/rust-lang/crates.io-index" 187 | checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" 188 | dependencies = [ 189 | "glob", 190 | "libc", 191 | "libloading", 192 | ] 193 | 194 | [[package]] 195 | name = "clap" 196 | version = "4.5.20" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" 199 | dependencies = [ 200 | "clap_builder", 201 | "clap_derive", 202 | ] 203 
| 204 | [[package]] 205 | name = "clap_builder" 206 | version = "4.5.20" 207 | source = "registry+https://github.com/rust-lang/crates.io-index" 208 | checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" 209 | dependencies = [ 210 | "anstream", 211 | "anstyle", 212 | "clap_lex", 213 | "strsim", 214 | ] 215 | 216 | [[package]] 217 | name = "clap_derive" 218 | version = "4.5.18" 219 | source = "registry+https://github.com/rust-lang/crates.io-index" 220 | checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" 221 | dependencies = [ 222 | "heck", 223 | "proc-macro2", 224 | "quote", 225 | "syn", 226 | ] 227 | 228 | [[package]] 229 | name = "clap_lex" 230 | version = "0.7.2" 231 | source = "registry+https://github.com/rust-lang/crates.io-index" 232 | checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" 233 | 234 | [[package]] 235 | name = "cmake" 236 | version = "0.1.51" 237 | source = "registry+https://github.com/rust-lang/crates.io-index" 238 | checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" 239 | dependencies = [ 240 | "cc", 241 | ] 242 | 243 | [[package]] 244 | name = "colorchoice" 245 | version = "1.0.3" 246 | source = "registry+https://github.com/rust-lang/crates.io-index" 247 | checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" 248 | 249 | [[package]] 250 | name = "crc32fast" 251 | version = "1.4.2" 252 | source = "registry+https://github.com/rust-lang/crates.io-index" 253 | checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" 254 | dependencies = [ 255 | "cfg-if", 256 | ] 257 | 258 | [[package]] 259 | name = "crossbeam-deque" 260 | version = "0.8.5" 261 | source = "registry+https://github.com/rust-lang/crates.io-index" 262 | checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" 263 | dependencies = [ 264 | "crossbeam-epoch", 265 | "crossbeam-utils", 266 | ] 267 | 268 | [[package]] 269 | 
name = "crossbeam-epoch" 270 | version = "0.9.18" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" 273 | dependencies = [ 274 | "crossbeam-utils", 275 | ] 276 | 277 | [[package]] 278 | name = "crossbeam-utils" 279 | version = "0.8.20" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" 282 | 283 | [[package]] 284 | name = "curl-sys" 285 | version = "0.4.78+curl-8.11.0" 286 | source = "registry+https://github.com/rust-lang/crates.io-index" 287 | checksum = "8eec768341c5c7789611ae51cf6c459099f22e64a5d5d0ce4892434e33821eaf" 288 | dependencies = [ 289 | "cc", 290 | "libc", 291 | "libz-sys", 292 | "openssl-sys", 293 | "pkg-config", 294 | "vcpkg", 295 | "windows-sys 0.52.0", 296 | ] 297 | 298 | [[package]] 299 | name = "custom_derive" 300 | version = "0.1.7" 301 | source = "registry+https://github.com/rust-lang/crates.io-index" 302 | checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" 303 | 304 | [[package]] 305 | name = "derive-new" 306 | version = "0.6.0" 307 | source = "registry+https://github.com/rust-lang/crates.io-index" 308 | checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad" 309 | dependencies = [ 310 | "proc-macro2", 311 | "quote", 312 | "syn", 313 | ] 314 | 315 | [[package]] 316 | name = "derive-new" 317 | version = "0.7.0" 318 | source = "registry+https://github.com/rust-lang/crates.io-index" 319 | checksum = "2cdc8d50f426189eef89dac62fabfa0abb27d5cc008f25bf4156a0203325becc" 320 | dependencies = [ 321 | "proc-macro2", 322 | "quote", 323 | "syn", 324 | ] 325 | 326 | [[package]] 327 | name = "displaydoc" 328 | version = "0.2.5" 329 | source = "registry+https://github.com/rust-lang/crates.io-index" 330 | checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" 331 | dependencies = [ 
332 | "proc-macro2", 333 | "quote", 334 | "syn", 335 | ] 336 | 337 | [[package]] 338 | name = "either" 339 | version = "1.13.0" 340 | source = "registry+https://github.com/rust-lang/crates.io-index" 341 | checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" 342 | 343 | [[package]] 344 | name = "env_logger" 345 | version = "0.10.2" 346 | source = "registry+https://github.com/rust-lang/crates.io-index" 347 | checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" 348 | dependencies = [ 349 | "humantime", 350 | "is-terminal", 351 | "log", 352 | "regex", 353 | "termcolor", 354 | ] 355 | 356 | [[package]] 357 | name = "flate2" 358 | version = "1.0.34" 359 | source = "registry+https://github.com/rust-lang/crates.io-index" 360 | checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" 361 | dependencies = [ 362 | "crc32fast", 363 | "miniz_oxide", 364 | ] 365 | 366 | [[package]] 367 | name = "form_urlencoded" 368 | version = "1.2.1" 369 | source = "registry+https://github.com/rust-lang/crates.io-index" 370 | checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" 371 | dependencies = [ 372 | "percent-encoding", 373 | ] 374 | 375 | [[package]] 376 | name = "fraguracy" 377 | version = "0.2.7" 378 | dependencies = [ 379 | "anyhow", 380 | "bpci", 381 | "clap", 382 | "env_logger", 383 | "flate2", 384 | "idna 1.0.3", 385 | "itertools", 386 | "lazy_static", 387 | "libc", 388 | "linear-map", 389 | "log", 390 | "mlua", 391 | "ndarray", 392 | "rayon", 393 | "regex", 394 | "rust-htslib", 395 | "rust-lapper", 396 | "rustc-hash 1.1.0", 397 | "syn", 398 | ] 399 | 400 | [[package]] 401 | name = "fs-utils" 402 | version = "1.1.4" 403 | source = "registry+https://github.com/rust-lang/crates.io-index" 404 | checksum = "6fc7a9dc005c944c98a935e7fd626faf5bf7e5a609f94bc13e42fc4a02e52593" 405 | dependencies = [ 406 | "quick-error", 407 | ] 408 | 409 | [[package]] 410 | name = "glob" 411 | version = "0.3.1" 
412 | source = "registry+https://github.com/rust-lang/crates.io-index" 413 | checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" 414 | 415 | [[package]] 416 | name = "heck" 417 | version = "0.5.0" 418 | source = "registry+https://github.com/rust-lang/crates.io-index" 419 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 420 | 421 | [[package]] 422 | name = "hermit-abi" 423 | version = "0.4.0" 424 | source = "registry+https://github.com/rust-lang/crates.io-index" 425 | checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" 426 | 427 | [[package]] 428 | name = "hts-sys" 429 | version = "2.2.0" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "e38d7f1c121cd22aa214cb4dadd4277dc5447391eac518b899b29ba6356fbbb2" 432 | dependencies = [ 433 | "bindgen", 434 | "bzip2-sys", 435 | "cc", 436 | "curl-sys", 437 | "fs-utils", 438 | "glob", 439 | "libdeflate-sys", 440 | "libz-sys", 441 | "lzma-sys", 442 | "openssl-sys", 443 | ] 444 | 445 | [[package]] 446 | name = "humantime" 447 | version = "2.1.0" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" 450 | 451 | [[package]] 452 | name = "icu_collections" 453 | version = "1.5.0" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" 456 | dependencies = [ 457 | "displaydoc", 458 | "yoke", 459 | "zerofrom", 460 | "zerovec", 461 | ] 462 | 463 | [[package]] 464 | name = "icu_locid" 465 | version = "1.5.0" 466 | source = "registry+https://github.com/rust-lang/crates.io-index" 467 | checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" 468 | dependencies = [ 469 | "displaydoc", 470 | "litemap", 471 | "tinystr", 472 | "writeable", 473 | "zerovec", 474 | ] 475 | 476 | [[package]] 477 | name = 
"icu_locid_transform" 478 | version = "1.5.0" 479 | source = "registry+https://github.com/rust-lang/crates.io-index" 480 | checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" 481 | dependencies = [ 482 | "displaydoc", 483 | "icu_locid", 484 | "icu_locid_transform_data", 485 | "icu_provider", 486 | "tinystr", 487 | "zerovec", 488 | ] 489 | 490 | [[package]] 491 | name = "icu_locid_transform_data" 492 | version = "1.5.0" 493 | source = "registry+https://github.com/rust-lang/crates.io-index" 494 | checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" 495 | 496 | [[package]] 497 | name = "icu_normalizer" 498 | version = "1.5.0" 499 | source = "registry+https://github.com/rust-lang/crates.io-index" 500 | checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" 501 | dependencies = [ 502 | "displaydoc", 503 | "icu_collections", 504 | "icu_normalizer_data", 505 | "icu_properties", 506 | "icu_provider", 507 | "smallvec", 508 | "utf16_iter", 509 | "utf8_iter", 510 | "write16", 511 | "zerovec", 512 | ] 513 | 514 | [[package]] 515 | name = "icu_normalizer_data" 516 | version = "1.5.0" 517 | source = "registry+https://github.com/rust-lang/crates.io-index" 518 | checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" 519 | 520 | [[package]] 521 | name = "icu_properties" 522 | version = "1.5.1" 523 | source = "registry+https://github.com/rust-lang/crates.io-index" 524 | checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" 525 | dependencies = [ 526 | "displaydoc", 527 | "icu_collections", 528 | "icu_locid_transform", 529 | "icu_properties_data", 530 | "icu_provider", 531 | "tinystr", 532 | "zerovec", 533 | ] 534 | 535 | [[package]] 536 | name = "icu_properties_data" 537 | version = "1.5.0" 538 | source = "registry+https://github.com/rust-lang/crates.io-index" 539 | checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" 540 | 541 | 
[[package]] 542 | name = "icu_provider" 543 | version = "1.5.0" 544 | source = "registry+https://github.com/rust-lang/crates.io-index" 545 | checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" 546 | dependencies = [ 547 | "displaydoc", 548 | "icu_locid", 549 | "icu_provider_macros", 550 | "stable_deref_trait", 551 | "tinystr", 552 | "writeable", 553 | "yoke", 554 | "zerofrom", 555 | "zerovec", 556 | ] 557 | 558 | [[package]] 559 | name = "icu_provider_macros" 560 | version = "1.5.0" 561 | source = "registry+https://github.com/rust-lang/crates.io-index" 562 | checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" 563 | dependencies = [ 564 | "proc-macro2", 565 | "quote", 566 | "syn", 567 | ] 568 | 569 | [[package]] 570 | name = "idna" 571 | version = "0.5.0" 572 | source = "registry+https://github.com/rust-lang/crates.io-index" 573 | checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" 574 | dependencies = [ 575 | "unicode-bidi", 576 | "unicode-normalization", 577 | ] 578 | 579 | [[package]] 580 | name = "idna" 581 | version = "1.0.3" 582 | source = "registry+https://github.com/rust-lang/crates.io-index" 583 | checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" 584 | dependencies = [ 585 | "idna_adapter", 586 | "smallvec", 587 | "utf8_iter", 588 | ] 589 | 590 | [[package]] 591 | name = "idna_adapter" 592 | version = "1.2.0" 593 | source = "registry+https://github.com/rust-lang/crates.io-index" 594 | checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" 595 | dependencies = [ 596 | "icu_normalizer", 597 | "icu_properties", 598 | ] 599 | 600 | [[package]] 601 | name = "ieee754" 602 | version = "0.2.6" 603 | source = "registry+https://github.com/rust-lang/crates.io-index" 604 | checksum = "9007da9cacbd3e6343da136e98b0d2df013f553d35bdec8b518f07bea768e19c" 605 | 606 | [[package]] 607 | name = "is-terminal" 608 | version = "0.4.13" 609 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 610 | checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" 611 | dependencies = [ 612 | "hermit-abi", 613 | "libc", 614 | "windows-sys 0.52.0", 615 | ] 616 | 617 | [[package]] 618 | name = "is_terminal_polyfill" 619 | version = "1.70.1" 620 | source = "registry+https://github.com/rust-lang/crates.io-index" 621 | checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 622 | 623 | [[package]] 624 | name = "itertools" 625 | version = "0.10.5" 626 | source = "registry+https://github.com/rust-lang/crates.io-index" 627 | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" 628 | dependencies = [ 629 | "either", 630 | ] 631 | 632 | [[package]] 633 | name = "jobserver" 634 | version = "0.1.32" 635 | source = "registry+https://github.com/rust-lang/crates.io-index" 636 | checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" 637 | dependencies = [ 638 | "libc", 639 | ] 640 | 641 | [[package]] 642 | name = "lazy_static" 643 | version = "1.5.0" 644 | source = "registry+https://github.com/rust-lang/crates.io-index" 645 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 646 | 647 | [[package]] 648 | name = "lazycell" 649 | version = "1.3.0" 650 | source = "registry+https://github.com/rust-lang/crates.io-index" 651 | checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" 652 | 653 | [[package]] 654 | name = "libc" 655 | version = "0.2.159" 656 | source = "registry+https://github.com/rust-lang/crates.io-index" 657 | checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" 658 | 659 | [[package]] 660 | name = "libdeflate-sys" 661 | version = "1.22.0" 662 | source = "registry+https://github.com/rust-lang/crates.io-index" 663 | checksum = "2f4ae7b48098016dc3bc64a35605668f0af4425ec1a4a175ce2d0c1129067932" 664 | dependencies = [ 665 | "cc", 666 | ] 667 | 668 | 
[[package]] 669 | name = "libloading" 670 | version = "0.8.5" 671 | source = "registry+https://github.com/rust-lang/crates.io-index" 672 | checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" 673 | dependencies = [ 674 | "cfg-if", 675 | "windows-targets", 676 | ] 677 | 678 | [[package]] 679 | name = "libz-sys" 680 | version = "1.1.20" 681 | source = "registry+https://github.com/rust-lang/crates.io-index" 682 | checksum = "d2d16453e800a8cf6dd2fc3eb4bc99b786a9b90c663b8559a5b1a041bf89e472" 683 | dependencies = [ 684 | "cc", 685 | "cmake", 686 | "libc", 687 | "pkg-config", 688 | "vcpkg", 689 | ] 690 | 691 | [[package]] 692 | name = "linear-map" 693 | version = "1.2.0" 694 | source = "registry+https://github.com/rust-lang/crates.io-index" 695 | checksum = "bfae20f6b19ad527b550c223fddc3077a547fc70cda94b9b566575423fd303ee" 696 | 697 | [[package]] 698 | name = "litemap" 699 | version = "0.7.4" 700 | source = "registry+https://github.com/rust-lang/crates.io-index" 701 | checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" 702 | 703 | [[package]] 704 | name = "lock_api" 705 | version = "0.4.12" 706 | source = "registry+https://github.com/rust-lang/crates.io-index" 707 | checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" 708 | dependencies = [ 709 | "autocfg", 710 | "scopeguard", 711 | ] 712 | 713 | [[package]] 714 | name = "log" 715 | version = "0.4.22" 716 | source = "registry+https://github.com/rust-lang/crates.io-index" 717 | checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" 718 | 719 | [[package]] 720 | name = "luau0-src" 721 | version = "0.12.3+luau663" 722 | source = "registry+https://github.com/rust-lang/crates.io-index" 723 | checksum = "76ae337c644bbf86a8d8e9ce3ee023311833d41741baf5e51acc31b37843aba1" 724 | dependencies = [ 725 | "cc", 726 | ] 727 | 728 | [[package]] 729 | name = "lzma-sys" 730 | version = "0.1.20" 731 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 732 | checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" 733 | dependencies = [ 734 | "cc", 735 | "libc", 736 | "pkg-config", 737 | ] 738 | 739 | [[package]] 740 | name = "matrixmultiply" 741 | version = "0.3.9" 742 | source = "registry+https://github.com/rust-lang/crates.io-index" 743 | checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" 744 | dependencies = [ 745 | "autocfg", 746 | "rawpointer", 747 | ] 748 | 749 | [[package]] 750 | name = "memchr" 751 | version = "2.7.4" 752 | source = "registry+https://github.com/rust-lang/crates.io-index" 753 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 754 | 755 | [[package]] 756 | name = "minimal-lexical" 757 | version = "0.2.1" 758 | source = "registry+https://github.com/rust-lang/crates.io-index" 759 | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" 760 | 761 | [[package]] 762 | name = "miniz_oxide" 763 | version = "0.8.0" 764 | source = "registry+https://github.com/rust-lang/crates.io-index" 765 | checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" 766 | dependencies = [ 767 | "adler2", 768 | ] 769 | 770 | [[package]] 771 | name = "mlua" 772 | version = "0.10.3" 773 | source = "registry+https://github.com/rust-lang/crates.io-index" 774 | checksum = "d3f763c1041eff92ffb5d7169968a327e1ed2ebfe425dac0ee5a35f29082534b" 775 | dependencies = [ 776 | "bstr", 777 | "either", 778 | "libloading", 779 | "mlua-sys", 780 | "num-traits", 781 | "parking_lot", 782 | "rustc-hash 2.1.1", 783 | ] 784 | 785 | [[package]] 786 | name = "mlua-sys" 787 | version = "0.6.7" 788 | source = "registry+https://github.com/rust-lang/crates.io-index" 789 | checksum = "1901c1a635a22fe9250ffcc4fcc937c16b47c2e9e71adba8784af8bca1f69594" 790 | dependencies = [ 791 | "cc", 792 | "cfg-if", 793 | "luau0-src", 794 | "pkg-config", 795 | ] 796 | 797 | [[package]] 798 | 
name = "ndarray" 799 | version = "0.15.6" 800 | source = "registry+https://github.com/rust-lang/crates.io-index" 801 | checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" 802 | dependencies = [ 803 | "matrixmultiply", 804 | "num-complex", 805 | "num-integer", 806 | "num-traits", 807 | "rawpointer", 808 | ] 809 | 810 | [[package]] 811 | name = "newtype_derive" 812 | version = "0.1.6" 813 | source = "registry+https://github.com/rust-lang/crates.io-index" 814 | checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" 815 | dependencies = [ 816 | "rustc_version", 817 | ] 818 | 819 | [[package]] 820 | name = "nom" 821 | version = "7.1.3" 822 | source = "registry+https://github.com/rust-lang/crates.io-index" 823 | checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" 824 | dependencies = [ 825 | "memchr", 826 | "minimal-lexical", 827 | ] 828 | 829 | [[package]] 830 | name = "num-complex" 831 | version = "0.4.6" 832 | source = "registry+https://github.com/rust-lang/crates.io-index" 833 | checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" 834 | dependencies = [ 835 | "num-traits", 836 | ] 837 | 838 | [[package]] 839 | name = "num-integer" 840 | version = "0.1.46" 841 | source = "registry+https://github.com/rust-lang/crates.io-index" 842 | checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" 843 | dependencies = [ 844 | "num-traits", 845 | ] 846 | 847 | [[package]] 848 | name = "num-traits" 849 | version = "0.2.19" 850 | source = "registry+https://github.com/rust-lang/crates.io-index" 851 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 852 | dependencies = [ 853 | "autocfg", 854 | ] 855 | 856 | [[package]] 857 | name = "openssl-src" 858 | version = "300.3.2+3.3.2" 859 | source = "registry+https://github.com/rust-lang/crates.io-index" 860 | checksum = "a211a18d945ef7e648cc6e0058f4c548ee46aab922ea203e0d30e966ea23647b" 861 | 
dependencies = [ 862 | "cc", 863 | ] 864 | 865 | [[package]] 866 | name = "openssl-sys" 867 | version = "0.9.103" 868 | source = "registry+https://github.com/rust-lang/crates.io-index" 869 | checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" 870 | dependencies = [ 871 | "cc", 872 | "libc", 873 | "openssl-src", 874 | "pkg-config", 875 | "vcpkg", 876 | ] 877 | 878 | [[package]] 879 | name = "parking_lot" 880 | version = "0.12.3" 881 | source = "registry+https://github.com/rust-lang/crates.io-index" 882 | checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" 883 | dependencies = [ 884 | "lock_api", 885 | "parking_lot_core", 886 | ] 887 | 888 | [[package]] 889 | name = "parking_lot_core" 890 | version = "0.9.10" 891 | source = "registry+https://github.com/rust-lang/crates.io-index" 892 | checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" 893 | dependencies = [ 894 | "cfg-if", 895 | "libc", 896 | "redox_syscall", 897 | "smallvec", 898 | "windows-targets", 899 | ] 900 | 901 | [[package]] 902 | name = "percent-encoding" 903 | version = "2.3.1" 904 | source = "registry+https://github.com/rust-lang/crates.io-index" 905 | checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" 906 | 907 | [[package]] 908 | name = "pkg-config" 909 | version = "0.3.31" 910 | source = "registry+https://github.com/rust-lang/crates.io-index" 911 | checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" 912 | 913 | [[package]] 914 | name = "proc-macro2" 915 | version = "1.0.92" 916 | source = "registry+https://github.com/rust-lang/crates.io-index" 917 | checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" 918 | dependencies = [ 919 | "unicode-ident", 920 | ] 921 | 922 | [[package]] 923 | name = "quick-error" 924 | version = "1.2.3" 925 | source = "registry+https://github.com/rust-lang/crates.io-index" 926 | checksum = 
"a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" 927 | 928 | [[package]] 929 | name = "quote" 930 | version = "1.0.37" 931 | source = "registry+https://github.com/rust-lang/crates.io-index" 932 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" 933 | dependencies = [ 934 | "proc-macro2", 935 | ] 936 | 937 | [[package]] 938 | name = "rawpointer" 939 | version = "0.2.1" 940 | source = "registry+https://github.com/rust-lang/crates.io-index" 941 | checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" 942 | 943 | [[package]] 944 | name = "rayon" 945 | version = "1.10.0" 946 | source = "registry+https://github.com/rust-lang/crates.io-index" 947 | checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" 948 | dependencies = [ 949 | "either", 950 | "rayon-core", 951 | ] 952 | 953 | [[package]] 954 | name = "rayon-core" 955 | version = "1.12.1" 956 | source = "registry+https://github.com/rust-lang/crates.io-index" 957 | checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" 958 | dependencies = [ 959 | "crossbeam-deque", 960 | "crossbeam-utils", 961 | ] 962 | 963 | [[package]] 964 | name = "redox_syscall" 965 | version = "0.5.11" 966 | source = "registry+https://github.com/rust-lang/crates.io-index" 967 | checksum = "d2f103c6d277498fbceb16e84d317e2a400f160f46904d5f5410848c829511a3" 968 | dependencies = [ 969 | "bitflags", 970 | ] 971 | 972 | [[package]] 973 | name = "regex" 974 | version = "1.11.1" 975 | source = "registry+https://github.com/rust-lang/crates.io-index" 976 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 977 | dependencies = [ 978 | "aho-corasick", 979 | "memchr", 980 | "regex-automata", 981 | "regex-syntax", 982 | ] 983 | 984 | [[package]] 985 | name = "regex-automata" 986 | version = "0.4.8" 987 | source = "registry+https://github.com/rust-lang/crates.io-index" 988 | checksum = 
"368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" 989 | dependencies = [ 990 | "aho-corasick", 991 | "memchr", 992 | "regex-syntax", 993 | ] 994 | 995 | [[package]] 996 | name = "regex-syntax" 997 | version = "0.8.5" 998 | source = "registry+https://github.com/rust-lang/crates.io-index" 999 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 1000 | 1001 | [[package]] 1002 | name = "rust-htslib" 1003 | version = "0.49.0" 1004 | source = "registry+https://github.com/rust-lang/crates.io-index" 1005 | checksum = "115ae57b89deb942275566eca8c31da053d8df0d8bda12e50c4c4aa994877068" 1006 | dependencies = [ 1007 | "bio-types", 1008 | "byteorder", 1009 | "custom_derive", 1010 | "derive-new 0.7.0", 1011 | "hts-sys", 1012 | "ieee754", 1013 | "lazy_static", 1014 | "libc", 1015 | "libz-sys", 1016 | "linear-map", 1017 | "newtype_derive", 1018 | "regex", 1019 | "thiserror 2.0.3", 1020 | "url", 1021 | ] 1022 | 1023 | [[package]] 1024 | name = "rust-lapper" 1025 | version = "1.1.0" 1026 | source = "registry+https://github.com/rust-lang/crates.io-index" 1027 | checksum = "ee43d8e721ac803031dbab6a944b957b49a3b11eadbc099880c8aaaebf23ed27" 1028 | dependencies = [ 1029 | "num-traits", 1030 | ] 1031 | 1032 | [[package]] 1033 | name = "rustc-hash" 1034 | version = "1.1.0" 1035 | source = "registry+https://github.com/rust-lang/crates.io-index" 1036 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 1037 | 1038 | [[package]] 1039 | name = "rustc-hash" 1040 | version = "2.1.1" 1041 | source = "registry+https://github.com/rust-lang/crates.io-index" 1042 | checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" 1043 | 1044 | [[package]] 1045 | name = "rustc_version" 1046 | version = "0.1.7" 1047 | source = "registry+https://github.com/rust-lang/crates.io-index" 1048 | checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" 1049 | dependencies = [ 1050 | "semver", 1051 | ] 1052 | 
1053 | [[package]] 1054 | name = "rustversion" 1055 | version = "1.0.17" 1056 | source = "registry+https://github.com/rust-lang/crates.io-index" 1057 | checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" 1058 | 1059 | [[package]] 1060 | name = "scopeguard" 1061 | version = "1.2.0" 1062 | source = "registry+https://github.com/rust-lang/crates.io-index" 1063 | checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" 1064 | 1065 | [[package]] 1066 | name = "semver" 1067 | version = "0.1.20" 1068 | source = "registry+https://github.com/rust-lang/crates.io-index" 1069 | checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" 1070 | 1071 | [[package]] 1072 | name = "serde" 1073 | version = "1.0.215" 1074 | source = "registry+https://github.com/rust-lang/crates.io-index" 1075 | checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" 1076 | dependencies = [ 1077 | "serde_derive", 1078 | ] 1079 | 1080 | [[package]] 1081 | name = "serde_derive" 1082 | version = "1.0.215" 1083 | source = "registry+https://github.com/rust-lang/crates.io-index" 1084 | checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" 1085 | dependencies = [ 1086 | "proc-macro2", 1087 | "quote", 1088 | "syn", 1089 | ] 1090 | 1091 | [[package]] 1092 | name = "shlex" 1093 | version = "1.3.0" 1094 | source = "registry+https://github.com/rust-lang/crates.io-index" 1095 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 1096 | 1097 | [[package]] 1098 | name = "smallvec" 1099 | version = "1.13.2" 1100 | source = "registry+https://github.com/rust-lang/crates.io-index" 1101 | checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" 1102 | 1103 | [[package]] 1104 | name = "stable_deref_trait" 1105 | version = "1.2.0" 1106 | source = "registry+https://github.com/rust-lang/crates.io-index" 1107 | checksum = 
"a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" 1108 | 1109 | [[package]] 1110 | name = "strsim" 1111 | version = "0.11.1" 1112 | source = "registry+https://github.com/rust-lang/crates.io-index" 1113 | checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" 1114 | 1115 | [[package]] 1116 | name = "strum_macros" 1117 | version = "0.26.4" 1118 | source = "registry+https://github.com/rust-lang/crates.io-index" 1119 | checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" 1120 | dependencies = [ 1121 | "heck", 1122 | "proc-macro2", 1123 | "quote", 1124 | "rustversion", 1125 | "syn", 1126 | ] 1127 | 1128 | [[package]] 1129 | name = "syn" 1130 | version = "2.0.89" 1131 | source = "registry+https://github.com/rust-lang/crates.io-index" 1132 | checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e" 1133 | dependencies = [ 1134 | "proc-macro2", 1135 | "quote", 1136 | "unicode-ident", 1137 | ] 1138 | 1139 | [[package]] 1140 | name = "synstructure" 1141 | version = "0.13.1" 1142 | source = "registry+https://github.com/rust-lang/crates.io-index" 1143 | checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" 1144 | dependencies = [ 1145 | "proc-macro2", 1146 | "quote", 1147 | "syn", 1148 | ] 1149 | 1150 | [[package]] 1151 | name = "termcolor" 1152 | version = "1.4.1" 1153 | source = "registry+https://github.com/rust-lang/crates.io-index" 1154 | checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" 1155 | dependencies = [ 1156 | "winapi-util", 1157 | ] 1158 | 1159 | [[package]] 1160 | name = "thiserror" 1161 | version = "1.0.64" 1162 | source = "registry+https://github.com/rust-lang/crates.io-index" 1163 | checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" 1164 | dependencies = [ 1165 | "thiserror-impl 1.0.64", 1166 | ] 1167 | 1168 | [[package]] 1169 | name = "thiserror" 1170 | version = "2.0.3" 1171 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 1172 | checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" 1173 | dependencies = [ 1174 | "thiserror-impl 2.0.3", 1175 | ] 1176 | 1177 | [[package]] 1178 | name = "thiserror-impl" 1179 | version = "1.0.64" 1180 | source = "registry+https://github.com/rust-lang/crates.io-index" 1181 | checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" 1182 | dependencies = [ 1183 | "proc-macro2", 1184 | "quote", 1185 | "syn", 1186 | ] 1187 | 1188 | [[package]] 1189 | name = "thiserror-impl" 1190 | version = "2.0.3" 1191 | source = "registry+https://github.com/rust-lang/crates.io-index" 1192 | checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" 1193 | dependencies = [ 1194 | "proc-macro2", 1195 | "quote", 1196 | "syn", 1197 | ] 1198 | 1199 | [[package]] 1200 | name = "tinystr" 1201 | version = "0.7.6" 1202 | source = "registry+https://github.com/rust-lang/crates.io-index" 1203 | checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" 1204 | dependencies = [ 1205 | "displaydoc", 1206 | "zerovec", 1207 | ] 1208 | 1209 | [[package]] 1210 | name = "tinyvec" 1211 | version = "1.8.0" 1212 | source = "registry+https://github.com/rust-lang/crates.io-index" 1213 | checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" 1214 | dependencies = [ 1215 | "tinyvec_macros", 1216 | ] 1217 | 1218 | [[package]] 1219 | name = "tinyvec_macros" 1220 | version = "0.1.1" 1221 | source = "registry+https://github.com/rust-lang/crates.io-index" 1222 | checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" 1223 | 1224 | [[package]] 1225 | name = "unicode-bidi" 1226 | version = "0.3.17" 1227 | source = "registry+https://github.com/rust-lang/crates.io-index" 1228 | checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" 1229 | 1230 | [[package]] 1231 | name = "unicode-ident" 1232 | version = 
"1.0.13" 1233 | source = "registry+https://github.com/rust-lang/crates.io-index" 1234 | checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" 1235 | 1236 | [[package]] 1237 | name = "unicode-normalization" 1238 | version = "0.1.24" 1239 | source = "registry+https://github.com/rust-lang/crates.io-index" 1240 | checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" 1241 | dependencies = [ 1242 | "tinyvec", 1243 | ] 1244 | 1245 | [[package]] 1246 | name = "url" 1247 | version = "2.5.2" 1248 | source = "registry+https://github.com/rust-lang/crates.io-index" 1249 | checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" 1250 | dependencies = [ 1251 | "form_urlencoded", 1252 | "idna 0.5.0", 1253 | "percent-encoding", 1254 | ] 1255 | 1256 | [[package]] 1257 | name = "utf16_iter" 1258 | version = "1.0.5" 1259 | source = "registry+https://github.com/rust-lang/crates.io-index" 1260 | checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" 1261 | 1262 | [[package]] 1263 | name = "utf8_iter" 1264 | version = "1.0.4" 1265 | source = "registry+https://github.com/rust-lang/crates.io-index" 1266 | checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" 1267 | 1268 | [[package]] 1269 | name = "utf8parse" 1270 | version = "0.2.2" 1271 | source = "registry+https://github.com/rust-lang/crates.io-index" 1272 | checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" 1273 | 1274 | [[package]] 1275 | name = "vcpkg" 1276 | version = "0.2.15" 1277 | source = "registry+https://github.com/rust-lang/crates.io-index" 1278 | checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" 1279 | 1280 | [[package]] 1281 | name = "winapi-util" 1282 | version = "0.1.9" 1283 | source = "registry+https://github.com/rust-lang/crates.io-index" 1284 | checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" 1285 | dependencies = [ 1286 | 
"windows-sys 0.59.0", 1287 | ] 1288 | 1289 | [[package]] 1290 | name = "windows-sys" 1291 | version = "0.52.0" 1292 | source = "registry+https://github.com/rust-lang/crates.io-index" 1293 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" 1294 | dependencies = [ 1295 | "windows-targets", 1296 | ] 1297 | 1298 | [[package]] 1299 | name = "windows-sys" 1300 | version = "0.59.0" 1301 | source = "registry+https://github.com/rust-lang/crates.io-index" 1302 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 1303 | dependencies = [ 1304 | "windows-targets", 1305 | ] 1306 | 1307 | [[package]] 1308 | name = "windows-targets" 1309 | version = "0.52.6" 1310 | source = "registry+https://github.com/rust-lang/crates.io-index" 1311 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 1312 | dependencies = [ 1313 | "windows_aarch64_gnullvm", 1314 | "windows_aarch64_msvc", 1315 | "windows_i686_gnu", 1316 | "windows_i686_gnullvm", 1317 | "windows_i686_msvc", 1318 | "windows_x86_64_gnu", 1319 | "windows_x86_64_gnullvm", 1320 | "windows_x86_64_msvc", 1321 | ] 1322 | 1323 | [[package]] 1324 | name = "windows_aarch64_gnullvm" 1325 | version = "0.52.6" 1326 | source = "registry+https://github.com/rust-lang/crates.io-index" 1327 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 1328 | 1329 | [[package]] 1330 | name = "windows_aarch64_msvc" 1331 | version = "0.52.6" 1332 | source = "registry+https://github.com/rust-lang/crates.io-index" 1333 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 1334 | 1335 | [[package]] 1336 | name = "windows_i686_gnu" 1337 | version = "0.52.6" 1338 | source = "registry+https://github.com/rust-lang/crates.io-index" 1339 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 1340 | 1341 | [[package]] 1342 | name = "windows_i686_gnullvm" 1343 | version = "0.52.6" 1344 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 1345 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 1346 | 1347 | [[package]] 1348 | name = "windows_i686_msvc" 1349 | version = "0.52.6" 1350 | source = "registry+https://github.com/rust-lang/crates.io-index" 1351 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 1352 | 1353 | [[package]] 1354 | name = "windows_x86_64_gnu" 1355 | version = "0.52.6" 1356 | source = "registry+https://github.com/rust-lang/crates.io-index" 1357 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 1358 | 1359 | [[package]] 1360 | name = "windows_x86_64_gnullvm" 1361 | version = "0.52.6" 1362 | source = "registry+https://github.com/rust-lang/crates.io-index" 1363 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 1364 | 1365 | [[package]] 1366 | name = "windows_x86_64_msvc" 1367 | version = "0.52.6" 1368 | source = "registry+https://github.com/rust-lang/crates.io-index" 1369 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 1370 | 1371 | [[package]] 1372 | name = "write16" 1373 | version = "1.0.0" 1374 | source = "registry+https://github.com/rust-lang/crates.io-index" 1375 | checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" 1376 | 1377 | [[package]] 1378 | name = "writeable" 1379 | version = "0.5.5" 1380 | source = "registry+https://github.com/rust-lang/crates.io-index" 1381 | checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" 1382 | 1383 | [[package]] 1384 | name = "yoke" 1385 | version = "0.7.5" 1386 | source = "registry+https://github.com/rust-lang/crates.io-index" 1387 | checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" 1388 | dependencies = [ 1389 | "serde", 1390 | "stable_deref_trait", 1391 | "yoke-derive", 1392 | "zerofrom", 1393 | ] 1394 | 1395 | [[package]] 1396 | name = "yoke-derive" 1397 | 
version = "0.7.5" 1398 | source = "registry+https://github.com/rust-lang/crates.io-index" 1399 | checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" 1400 | dependencies = [ 1401 | "proc-macro2", 1402 | "quote", 1403 | "syn", 1404 | "synstructure", 1405 | ] 1406 | 1407 | [[package]] 1408 | name = "zerofrom" 1409 | version = "0.1.5" 1410 | source = "registry+https://github.com/rust-lang/crates.io-index" 1411 | checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" 1412 | dependencies = [ 1413 | "zerofrom-derive", 1414 | ] 1415 | 1416 | [[package]] 1417 | name = "zerofrom-derive" 1418 | version = "0.1.5" 1419 | source = "registry+https://github.com/rust-lang/crates.io-index" 1420 | checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" 1421 | dependencies = [ 1422 | "proc-macro2", 1423 | "quote", 1424 | "syn", 1425 | "synstructure", 1426 | ] 1427 | 1428 | [[package]] 1429 | name = "zerovec" 1430 | version = "0.10.4" 1431 | source = "registry+https://github.com/rust-lang/crates.io-index" 1432 | checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" 1433 | dependencies = [ 1434 | "yoke", 1435 | "zerofrom", 1436 | "zerovec-derive", 1437 | ] 1438 | 1439 | [[package]] 1440 | name = "zerovec-derive" 1441 | version = "0.10.3" 1442 | source = "registry+https://github.com/rust-lang/crates.io-index" 1443 | checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" 1444 | dependencies = [ 1445 | "proc-macro2", 1446 | "quote", 1447 | "syn", 1448 | ] 1449 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "fraguracy" 3 | version = "0.2.7" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | rust-htslib = {features = 
["libdeflate", "static", "gcs", "s3"], version = "0.49.0"} 10 | syn = "2.0.87" 11 | idna = "1.0.0" 12 | 13 | 14 | libc = "*" 15 | rustc-hash = "1.1.0" 16 | lazy_static = "1.4.0" 17 | ndarray = "0.15.6" 18 | env_logger = "0.10.0" 19 | log = "0.4.17" 20 | itertools = "0.10.5" 21 | rayon = "1.7.0" 22 | linear-map = "1.2.0" 23 | rust-lapper = "1.1.0" 24 | flate2 = "1.0.25" 25 | bpci = "0.1.0" 26 | clap = {version ="4", features=["derive"]} 27 | regex = "1.11.1" 28 | mlua = { version = "0.10.3", features = ["luau", "send"] } 29 | anyhow = "1.0.98" 30 | #plotly = "0.8.3" 31 | #polars = { version = "0.27.2", features = ["lazy", "strings"] } 32 | 33 | # Faster compilation profiles 34 | [profile.dev] 35 | # Reduce optimization for faster dev builds 36 | opt-level = 0 37 | # Keep incremental compilation enabled (default) 38 | incremental = true 39 | # Use more codegen units for parallel compilation 40 | codegen-units = 256 41 | 42 | # Still optimize dependencies for better runtime performance in dev 43 | [profile.dev.package."*"] 44 | opt-level = 2 # Reduced from 3 to 2 for faster compilation 45 | 46 | [profile.release] 47 | # Use "thin" LTO instead of "fat" for faster release builds 48 | lto = "thin" 49 | # Increase codegen units for faster compilation 50 | codegen-units = 16 51 | # Enable debug info for better profiling 52 | debug = 1 53 | 54 | # Fast release profile for quicker testing 55 | [profile.release-fast] 56 | inherits = "release" 57 | lto = false 58 | codegen-units = 256 59 | debug = false 60 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Brent Pedersen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 5 | # Fraguracy 6 | 7 | [![Rust](https://github.com/brentp/fraguracy/actions/workflows/rust.yml/badge.svg)](https://github.com/brentp/fraguracy/actions/workflows/rust.yml) 8 | 9 | `fraguracy` calculates real error rates using overlapping paired-end reads in a fragment. 10 | It reports a file of error positions and counts, along with a summary of errors by context, read-position, read-orientation (F or R) and base-quality. 11 | While the overlap requirement does limit analysis to the (potentially) small percentage of bases that overlap, this can 12 | still be useful to: 13 | 14 | 1. evaluate error rates within and among samples 15 | 2. find sites in the genome with high error rates 16 | 3. find data-driven allele fraction (`AF`) cutoffs in UMI or duplex sequencing. 17 | 18 | # Usage 19 | 20 | The `fraguracy` binary available in releases takes a bam or cram file and outputs error stats. The plotting is currently done via python.
21 | 22 | ``` 23 | $ fraguracy extract \ 24 | --bin-size 1 \ 25 | --output-prefix fraguracy-$sample- \ 26 | --fasta $reference \ 27 | $sample.bam [.. *.bam] \ 28 | 29 | $ python plot.py fraguracy-$sample-consensus-counts.txt # writes read.html 30 | 31 | $ head fraguracy-$sample-errors.bed # records base position of every error observed and count of errors at that site. 32 | chrom start stop bq_bin count contexts 33 | chr1 75822283 75822284 05-19 6 AC:4,AT:2 34 | chr1 75822287 75822288 20-36 4 TC:4 35 | chr1 75822287 75822288 37-59 3 TC:3 36 | chr1 75822287 75822288 60+ 2 CA:2 37 | chr1 75822341 75822342 05-19 2 TC:2 38 | chr1 75822352 75822353 20-36 2 GT:2 39 | chr1 75822360 75822361 20-36 2 AG:2 40 | chr1 241850751 241850752 37-59 2 TC:2 41 | chr1 241850752 241850753 20-36 2 TA:1,TC:1 42 | ``` 43 | 44 | There is also an `$sample-indel-errors.bed` file that contains the columns: 45 | 46 | ``` 47 | chrom start stop count 48 | ``` 49 | 50 | The errors files are useful to find **positions that are frequent errors** -- having count > 1 or with multiple bq_bins showing the same position. 51 | 52 | If multiple samples are given (multiple bam files) then each sample is processed in parallel and $prefix-total-counts.txt and $prefix-total-errors.bed will 53 | be created which sum all values for all samples. 54 | 55 | The plot.py will create an interactive plot that looks like this: 56 | 57 | ![frag-plot](https://user-images.githubusercontent.com/1739/225074861-7b5098d1-b5e9-4bab-8971-0a278f182aaa.png) 58 | 59 | **NOTE** that depending on the goal it can be useful to run `fraguracy extract` once, then exclude sites that are very frequent errors and re-run, 60 | this will prevent a small percentage of sites (often around homopolymers) from dominating the error profile. 61 | 62 | ## CLI 63 | 64 | ``` 65 | error profile pair overlaps in bam/cram 66 | 67 | Usage: fraguracy extract [OPTIONS] [BAMS]... 68 | 69 | Arguments: 70 | [BAMS]... 
71 | 72 | Options: 73 | -f, --fasta 74 | fasta for use with crams and/or to use as 'truth' 75 | -o, --output-prefix 76 | prefix for output files [default: fraguracy-] 77 | -C, --chromosome 78 | restrict analysis to this chromosome 79 | -r, --regions 80 | restrict analysis to the regions given in this BED file 81 | -e, --exclude-regions 82 | exclude from analysis the regions given in this BED file 83 | -l, --lua-expression 84 | optional lua expression to filter reads. returns true to skip read. e.g. 'return read.flags.secondary or read.flags.supplementary'. 85 | -m, --max-read-length 86 | indicate the maximum read length in the alignment file [default: 151] 87 | -b, --bin-size 88 | parition the read into chunks/bins of this size [default: 3] 89 | -Q, --min-mapping-quality 90 | only consider pairs where both reads have this mapping-quality or higher (good to leave this high) [default: 50] 91 | -c, --ci 92 | method for confidence interval calculation (see rust bpci crate) [default: agresti-coull] [possible values: agresti-coull, wald, wilson] 93 | -n, --no-denominator 94 | do not calculate denominator. This can shorten runtime. 95 | -H, --homopolymer-regex 96 | regex for homopolymer sequence to consider if denominator is calculated[default: A{3,}|C{3,}|G{3,}|T{3,}] [default: A{3,}|C{3,}|G{3,}|T{3,}] 97 | -t, --reference-as-truth 98 | use reference base as 'truth' 99 | -h, --help 100 | Print help 101 | ``` 102 | 103 | ### Lua Expressions 104 | 105 | The `extract` sub-command allows lua expressions with `-l` that indicate whether to skip a read. See [lua-api.md](lua-api.md) for a full description 106 | of how to use this. 107 | 108 | ### Combine 109 | 110 | `fraguracy extract` can also be run per-sample and then errors can be combined with `fraguracy combine-errors`: 111 | 112 | ``` 113 | Usage: fraguracy combine-errors [OPTIONS] --fai-path [ERRORS]... 114 | 115 | Arguments: 116 | [ERRORS]... 
path to error bed files from extract 117 | 118 | Options: 119 | -f, --fai-path path for to fai (not fasta) file 120 | -o, --output-path path for output bed file [default: fraguracy-combined-errors.bed] 121 | -h, --help Print help 122 | 123 | 124 | The output is a single file with the error counts from each sample summed. An additional column indicating the 125 | number of samples containing the error is also reported. 126 | 127 | > ⚠️ **Warning**: You must send either indel error files or snp error files, not both! 128 | 129 | ## Bins 130 | 131 | The aim is to create a model of errors. Many factors can be predictive of the likelihood of an error. 132 | The dimensionality is a consideration because if the data is too sparse, prediction is less reliable. 133 | Because we determine accuracy by the mapping, it is best to require a high mapping-quality. 134 | Therefore we limit to: **Base-Quality**, **Sequence Context**, **Read**, and **Position in Read** 135 | as described and binned below. With those binnings we have **189,720** possible combinations (5 * 6 * 2 * 2 * $read_length / $bin-size * 31) 136 | 137 | For each combination, while iterating over the bam, we store the number of errors and the number of total bases 138 | in each bin. These become, respectively, the numerator and denominator for the error-rate for that set of parameters. 139 | 140 | ### Qualities (5) 141 | 142 | Base-Qualities and Mapping Qualities will be binned to: 143 | 144 | 0. 0-5 145 | 1. 6-19 146 | 2. 20 - 36, 147 | 3. 37 - 59, 148 | 4. 60+ 149 | 150 | This means that the quantized base-qualities from nova-seq (2, 12, 23 and 37) are each in separate bins. 151 | And other base-quality schemes are also partitioned sanely. 152 | 153 | ### Sequence Context (6) 154 | 155 | 0. C->A (G->T) 156 | 1. C->G (G->C) 157 | 2. C->T (G->A) 158 | 3. T->A (A->T) 159 | 4. T->C (A->G) 160 | 5. 
T->G (A->C) 161 | 162 | ### Read (2) 163 | 164 | Read 1 or Read 2 165 | 166 | ### Read Position (50) 167 | 168 | read position is simply divided by 3. so bins of 3 bases. 169 | 170 | ### Homopolymer distance (30) 171 | 172 | The errors are also partitioned by homopolymer distance up to +- 15. all errors beyond 15 are put in the 15 base bin 173 | 174 | # vcfanno 175 | 176 | To use the errors files with vcfanno: 177 | 178 | ``` 179 | bgzip fraguracy/fraguracy-19610X19-errors.bed 180 | tabix fraguracy/fraguracy-19610X19-errors.bed.gz 181 | 182 | echo ' 183 | [[annotation]] 184 | file="fraguracy/fraguracy-19610X19-errors.bed.gz" 185 | columns=[4, 5] 186 | names=["frag_bq_bin", "frag_errors"] 187 | ops=["first", "first"] 188 | ' > conf.toml 189 | 190 | vcfanno conf.toml $vcf > annotated.vcf # annotated.vcf will have entries for `frag_bq_bin` and `frag_errors` where there was an error found that was also a variant in the VCF. 191 | ``` 192 | 193 | ## indel errors 194 | 195 | A command like: `fraguracy extract -f $fasta -o $prefix $bam` will create the files needed to evaluate the indel error rate. To plot it, then use: 196 | 197 | ``` 198 | python scripts/analyze_indel_errors.py ${prefix}-indel-errors.bed.gz ${prefix}-counts.txt 199 | ``` 200 | 201 | Which will make a plot like this one: 202 | 203 | ![Image](https://github.com/user-attachments/assets/35c10634-a54b-48a2-9c5f-2c6363def26f) 204 | -------------------------------------------------------------------------------- /analyze_indel_errors.md: -------------------------------------------------------------------------------- 1 | # Indel Error Rate Analysis Summary 2 | 3 | ## Overview 4 | 5 | Successfully analyzed indel error rates from fraguracy output files using Python with polars for data processing and plotly for interactive visualization. 6 | 7 | ## Data Processing Steps 8 | 9 | ### 1. 
Indel Errors File Processing 10 | 11 | - **Input**: `test_ovl-indel-errors.bed.gz` (compressed BED format) 12 | - **Columns**: chrom, start, end, count, length, bq_bin, hp_dist 13 | - **Processing**: Grouped by length, bq_bin, and hp_dist, then summed counts 14 | - **Result**: 1,887 unique combinations with indel counts 15 | 16 | ### 2. Counts File Processing 17 | 18 | - **Input**: `test_ovl-counts.txt` (tab-separated) 19 | - **Columns**: read12, FR, bq_bin, read_pos, context, hp_dist, total_count, error_count, err_rate_lo, err_rate_hi 20 | - **Processing**: Grouped by bq_bin and hp_dist, then summed total_count 21 | - **Result**: 155 unique combinations with total counts 22 | 23 | ### 3. Error Rate Calculation 24 | 25 | - **Method**: Joined indel counts with total counts on bq_bin and hp_dist 26 | - **Formula**: error_rate = indel_count / total_count 27 | - **Final dataset**: 1,887 records with error rates 28 | 29 | ## Results 30 | 31 | ### Key Findings 32 | 33 | - **Indel Lengths**: 66 different indel lengths observed (ranging from -47 to +28) 34 | - **Error Rate Range**: From ~10^-9 to ~10^-5 (highly variable) 35 | - **Homopolymer Distance Impact**: Clear relationship between hp_dist and error rates 36 | - **Base Quality Impact**: Interactive filtering by BQ bins reveals quality-dependent error patterns (defaults to high-quality 37-59 range) 37 | 38 | ### Summary Statistics by Indel Length 39 | 40 | - **Deletions** (negative lengths): Generally lower error rates 41 | - **Insertions** (positive lengths): More variable error rates 42 | - **Single base changes** (±1): Most common, moderate error rates 43 | 44 | ## Visualization 45 | 46 | - **Interactive Plot**: `indel_error_rates_by_hp_dist.html` (with hover data) 47 | - **Static Plot**: `indel_error_rates_by_hp_dist.png` 48 | - **X-axis**: Homopolymer distance (hp_dist) 49 | - **Y-axis**: Indel error rate (log scale) 50 | - **Colors**: Different indel length categories (aggregated) 51 | - **Indel Length 
Aggregation**: 52 | - Individual values for -3, -2, -1, 1, 2, 3 53 | - Aggregated ">3" for all insertions > 3 bases 54 | - Aggregated "<-3" for all deletions > 3 bases 55 | - **Features**: 56 | - Log scale for error rates 57 | - Interactive hover showing detailed information (indel count, total count, bq_bin) 58 | - Grid for easier reading 59 | - Legend with "Indel Length" title showing length categories 60 | - Modern plotly-based visualization 61 | - **Line connections (default)**: Points are connected by lines for each indel length category to show trends across hp_distance 62 | - **Optional scatter-only mode**: Use `--no-lines` to show scatter plot without line connections 63 | - **BQ Bin Filtering**: Interactive buttons to filter data by base quality bins (defaults to 37-59) 64 | - "All BQ Bins": Show all data 65 | - "BQ: 37-59 (default)": Show only high-quality base calls 66 | - Individual BQ bin buttons: "BQ: 05-19", "BQ: 20-36", etc. 67 | 68 | ## Files Created 69 | 70 | 1. `analyze_indel_errors.py` - Main analysis script 71 | 2. `indel_error_rates_by_hp_dist.html` - Interactive plot 72 | 3. `indel_error_rates_by_hp_dist.png` - Static plot 73 | 4. 
`analysis_summary.md` - This summary document 74 | 75 | ## Usage 76 | 77 | ### Command Line Interface 78 | 79 | The script now uses argparse for flexible command line usage: 80 | 81 | ```bash 82 | # Basic usage (with lines - default) 83 | python3 analyze_indel_errors.py indel_errors.bed.gz counts.txt 84 | 85 | # Scatter plot only (no lines) 86 | python3 analyze_indel_errors.py --no-lines indel_errors.bed.gz counts.txt 87 | 88 | # With custom output prefix 89 | python3 analyze_indel_errors.py --output-prefix my_analysis indel_errors.bed.gz counts.txt 90 | 91 | # Combined options (scatter plot with custom output) 92 | python3 analyze_indel_errors.py --no-lines --output-prefix scatter_analysis indel_errors.bed.gz counts.txt 93 | 94 | # View help for all options 95 | python3 analyze_indel_errors.py --help 96 | ``` 97 | 98 | ### Options 99 | 100 | - `--no-lines`: Show scatter plot only, without connecting lines (default: show lines) 101 | - `--output-prefix`: Specify custom output file prefix (default: `indel_error_rates_by_hp_dist`) 102 | - `--help`: Show help message with usage examples 103 | 104 | ### Arguments 105 | 106 | 1. `indel_errors_file`: Input indel errors file (BED format, can be gzipped) 107 | 2. `counts_file`: Input counts file (tab-separated format) 108 | 109 | The script processes the input files and generates both interactive (HTML) and static (PNG) plots along with summary statistics. 110 | -------------------------------------------------------------------------------- /cli-tests.sh: -------------------------------------------------------------------------------- 1 | cargo run combine-errors -f human_g1k_v38_decoy_phix.fasta.fai test-data/*.bed -o t.combined.bed 2 | -------------------------------------------------------------------------------- /lua-api.md: -------------------------------------------------------------------------------- 1 | # Lua API Reference 2 | 3 | This document describes the Lua API for filtering BAM records. 
4 | 5 | ## Flags 6 | 7 | The `flags` object represents BAM flags returned from `read.flags` and provides the following fields: 8 | 9 | - `paired`: Returns true if the read is paired 10 | - `proper_pair`: Returns true if the read is in a proper pair 11 | - `unmapped`: Returns true if the read is unmapped 12 | - `mate_unmapped`: Returns true if the mate is unmapped 13 | - `reverse`: Returns true if the read is on the reverse strand 14 | - `forward`: Returns true if the read is on the forward strand 15 | - `mate_reverse`: Returns true if the mate is on the reverse strand 16 | - `mate_forward`: Returns true if the mate is on the forward strand 17 | - `read_1`: Returns true if this is read 1 18 | - `read_2`: Returns true if this is read 2 19 | - `secondary`: Returns true if this is a secondary alignment 20 | - `primary`: Returns true if this is the primary alignment 21 | - `qcfail`: Returns true if the read fails quality checks 22 | - `duplicate`: Returns true if the read is a duplicate 23 | - `supplementary`: Returns true if this is a supplementary alignment 24 | - `flag`: Returns the raw integer flag value 25 | 26 | ## Read 27 | 28 | The `read` object provides access to BAM record data with the following fields and methods: 29 | 30 | ### Fields 31 | 32 | All of these are properties on the `read`, e.g. 
`read.mapping_quality` 33 | 34 | - `mapping_quality`: Returns the mapping quality 35 | - `flags`: Returns a `Flags` object 36 | - `tid`: Returns the reference sequence ID 37 | - `start`: Returns the 0-based start position 38 | - `stop`: Returns the end position based on CIGAR 39 | - `length`: Returns the sequence length 40 | - `insert_size`: Returns the insert size 41 | - `qname`: Returns the query name as a string 42 | - `sequence`: Returns the read sequence as a string 43 | - `soft_clips_3_prime`: Returns the number of soft-clipped bases at the 3' end 44 | - `soft_clips_5_prime`: Returns the number of soft-clipped bases at the 5' end 45 | - `base_counts`: Returns a table with counts of A, C, G, T, N in the read 46 | - `n_proportion`: Returns the proportion of N bases in the read (see methods to limit to 3' or 5') 47 | - `indel_count`: Returns the number of indels in the read 48 | - `average_base_quality`: Returns the average base-quality in a read 49 | 50 | ### Methods 51 | 52 | - `read:tag(tag_name)`: Returns the value of the specified BAM tag 53 | - `read:n_proportion_3_prime(n:number)`: Returns the proportion of N bases within `n` of the 3' end of the read 54 | - `read:n_proportion_5_prime(n:number)`: Returns the proportion of N bases within `n` of the 5' end of the read 55 | 56 | ### Not Implemented 57 | 58 | The following are not implemented as they require a per-base approach 59 | which is not used in fraguracy. 
60 | 61 | - `bq` [NO]: Returns the base quality at the current position 62 | - `distance_from_5prime` [NO]: Returns the distance from the 5' end of the read 63 | - `distance_from_3prime` [NO]: Returns the distance from the 3' end of the read 64 | - `qpos()` [NO]: Returns the query position 65 | 66 | ## Usage Example 67 | 68 | ```lua 69 | -- skip reads with mapping quality >= 20 and not supplementary and where the proportion of N's in the last 10 bases is > 0.1 70 | return read.mapping_quality >= 20 and not read.flags.supplementary and read:n_proportion_3_prime(10) > 0.1 71 | ``` 72 | -------------------------------------------------------------------------------- /plot.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import plotly.graph_objects as go 3 | from plotly.subplots import make_subplots 4 | import plotly 5 | import polars as pl 6 | import numpy as np 7 | 8 | 9 | df = pl.read_csv(sys.argv[1], sep='\t') 10 | 11 | qual_bin = "20-39" 12 | qual_bin = "60+" 13 | 14 | r1 = df.filter((pl.col("read12") == "r1") & (pl.col("FR") == "f") & ( 15 | pl.col('bq_bin') == qual_bin) & (pl.col('total_count') > 0)) 16 | r2 = df.filter((pl.col("read12") == "r2") & (pl.col("FR") == "r") & ( 17 | pl.col('bq_bin') == qual_bin) & (pl.col('total_count') > 0)) 18 | 19 | print(r1.shape, r2.shape) 20 | 21 | contexts = list(sorted(r1['context'].unique(), reverse=False)) 22 | print(contexts) 23 | 24 | r1 = r1.with_columns([ 25 | (((pl.col('error_count') + 0) / (1 + pl.col('total_count')))).alias('rate')]) 26 | r2 = r2.with_columns([ 27 | (((pl.col('error_count') + 0) / (1 + pl.col('total_count')))).alias('rate')]) 28 | 29 | r1_rate = r1['error_count'].sum() / (r1['total_count'].sum() / 3) * 1_000_000 30 | r2_rate = r2['error_count'].sum() / (r2['total_count'].sum() / 3) * 1_000_000 31 | 32 | cols = plotly.colors.DEFAULT_PLOTLY_COLORS 33 | 34 | # Create figure with secondary y-axis 35 | fig = make_subplots( 36 | rows=2, 
subplot_titles=[f"read1(F) errors per million read-bases: {r1_rate:.3f}", f"read2(R) errors per million read-bases: {r2_rate:.3f}"], 37 | vertical_spacing=0.1, 38 | ) 39 | 40 | for i, ctx in enumerate(contexts): 41 | sub1 = r1.filter(pl.col('context') == ctx) 42 | sub2 = r2.filter(pl.col('context') == ctx) 43 | 44 | rate1 = sub1['error_count'].sum() / sub1['total_count'].sum() * 1_000_000 45 | rate2 = sub2['error_count'].sum() / sub2['total_count'].sum() * 1_000_000 46 | 47 | t1 = go.Scatter(name=f'{ctx}', x=np.array(sub1["read_pos"]), y=(1_000_000 * np.array( 48 | sub1['rate'])), 49 | hovertemplate="rate/Mb: %{y:.2g} errors:%{text}", 50 | text=[f'{c} of {n:,}' for c, 51 | n in zip(sub1["error_count"], sub1['total_count'])], 52 | line=dict(color=cols[i])) 53 | t2 = go.Scatter(name=f'{ctx}', x=np.array(sub2["read_pos"]), y=(1_000_000 * np.array( 54 | sub2['rate'])), 55 | hovertemplate="rate/Mb: %{y:.2g} errors:%{text}", 56 | text=[f'{c} of {n:,}' for c, 57 | n in zip(sub2["error_count"], sub2['total_count'])], 58 | line=dict(color=cols[i]), showlegend=False) 59 | fig.add_trace(t1, row=1, col=1) 60 | fig.add_trace(t2, row=2, col=1) 61 | 62 | # fig.update_layout(barmode='stack') 63 | fig.update_layout(hovermode='x unified') 64 | fig.update_xaxes(title_text="relative read position") 65 | # fig.update_layout(legend_traceorder="reversed") 66 | 67 | # fig.update_layout(title_text="error-rate along a read") 68 | 69 | fig.update_yaxes(title_text="errors per million read bases") 70 | # fig.update_layout(yaxis_tickformat = '%g') 71 | 72 | 73 | fig.write_html("read.html") 74 | -------------------------------------------------------------------------------- /scripts/analyze_indel_errors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Analyze indel error rates from fraguracy output files. 
4 | """ 5 | 6 | import polars as pl 7 | import plotly.express as px 8 | import plotly.graph_objects as go 9 | from plotly.subplots import make_subplots 10 | import numpy as np 11 | import gzip 12 | import argparse 13 | from pathlib import Path 14 | 15 | def read_indel_errors(filepath): 16 | """Read indel errors file and sum counts by length, bq_bin, hp_dist""" 17 | 18 | # Read the file and skip comment lines but keep header 19 | if filepath.endswith('.gz'): 20 | with gzip.open(filepath, 'rt') as f: 21 | lines = f.readlines() 22 | else: 23 | with open(filepath, 'r') as f: 24 | lines = f.readlines() 25 | 26 | # Find header line (starts with #) 27 | header_line = None 28 | data_lines = [] 29 | for line in lines: 30 | # Ensure line is a string 31 | if isinstance(line, bytes): 32 | line_str = line.decode('utf-8') 33 | else: 34 | line_str = str(line) 35 | 36 | if line_str.startswith('#'): 37 | header_line = line_str[1:].strip() # Remove # and whitespace 38 | else: 39 | data_lines.append(line_str.strip()) 40 | 41 | # Create a temporary file-like object with header and data 42 | import io 43 | if header_line is None: 44 | raise ValueError("No header line found in input file") 45 | csv_content = str(header_line) + '\n' + '\n'.join(data_lines) 46 | 47 | df = pl.read_csv(io.StringIO(csv_content), separator='\t', ignore_errors=True, 48 | null_values=['NA', 'N/A', ''], infer_schema_length=10000) 49 | 50 | # Filter out any rows with null values in key columns 51 | df = df.filter( 52 | pl.col('length').is_not_null() & 53 | pl.col('bq_bin').is_not_null() & 54 | pl.col('hp_dist').is_not_null() & 55 | pl.col('count').is_not_null() 56 | ) 57 | 58 | # Group by length, bq_bin, hp_dist and sum counts 59 | indel_grouped = df.group_by(['length', 'bq_bin', 'hp_dist']).agg([ 60 | pl.col('count').sum().alias('indel_count') 61 | ]) 62 | 63 | return indel_grouped 64 | 65 | def read_counts_file(filepath): 66 | """Read counts file and sum total_count by bq_bin, hp_dist""" 67 | 68 | df = 
pl.read_csv(filepath, separator='\t', ignore_errors=True, 69 | null_values=['NA', 'N/A', ''], infer_schema_length=10000) 70 | 71 | # Filter out any rows with null values in key columns 72 | df = df.filter( 73 | pl.col('bq_bin').is_not_null() & 74 | pl.col('hp_dist').is_not_null() & 75 | pl.col('total_count').is_not_null() 76 | ) 77 | 78 | # Group by bq_bin, hp_dist and sum total_count 79 | counts_grouped = df.group_by(['bq_bin', 'hp_dist']).agg([ 80 | pl.col('total_count').sum().alias('total_count') 81 | ]) 82 | 83 | return counts_grouped 84 | 85 | def calculate_error_rates(indel_df, counts_df): 86 | """Calculate indel error rates by joining indel and total counts""" 87 | 88 | # Join the dataframes on bq_bin and hp_dist 89 | merged = indel_df.join(counts_df, on=['bq_bin', 'hp_dist'], how='inner') 90 | 91 | # Calculate error rate 92 | merged = merged.with_columns([ 93 | (pl.col('indel_count') / pl.col('total_count')).alias('error_rate') 94 | ]) 95 | 96 | return merged 97 | 98 | def create_plot(df, connect_lines=False): 99 | """Create interactive subplot with hp_dist vs error rate and indel length vs error rate""" 100 | 101 | # Aggregate indel lengths: group lengths > 3 and < -3 into single categories 102 | df_plot = df.with_columns([ 103 | pl.when(pl.col('length') > 3) 104 | .then(pl.lit('>3')) 105 | .when(pl.col('length') < -3) 106 | .then(pl.lit('<-3')) 107 | .otherwise(pl.col('length').cast(pl.Utf8)) 108 | .alias('length_category') 109 | ]) 110 | 111 | # Data for first subplot: aggregate by length_category, bq_bin, hp_dist 112 | df_hp_plot = df_plot.group_by(['length_category', 'bq_bin', 'hp_dist']).agg([ 113 | pl.col('indel_count').sum().alias('indel_count'), 114 | pl.col('total_count').first().alias('total_count') # total_count should be the same for same bq_bin/hp_dist 115 | ]).with_columns([ 116 | (pl.col('indel_count') / pl.col('total_count')).alias('error_rate') 117 | ]) 118 | 119 | # Data for second subplot: aggregate by length_category and bq_bin only 
(sum across all hp_dists) 120 | df_length_plot = df_plot.group_by(['length_category', 'bq_bin']).agg([ 121 | pl.col('indel_count').sum().alias('indel_count'), 122 | pl.col('total_count').sum().alias('total_count') # sum total_count across hp_dists 123 | ]).with_columns([ 124 | (pl.col('indel_count') / pl.col('total_count')).alias('error_rate') 125 | ]) 126 | 127 | # Get unique values for filtering 128 | unique_bq_bins = sorted(df_hp_plot.select('bq_bin').unique().to_numpy().flatten()) 129 | 130 | # Sort length categories in logical numerical order 131 | def sort_length_categories(categories): 132 | """Sort length categories in logical order: <-3, -3, -2, -1, 1, 2, 3, >3""" 133 | def category_sort_key(cat): 134 | if cat == '<-3': 135 | return -1000 # Sort first 136 | elif cat == '>3': 137 | return 1000 # Sort last 138 | else: 139 | return int(cat) # Sort numerically for individual lengths 140 | 141 | return sorted(categories, key=category_sort_key) 142 | 143 | unique_categories = sort_length_categories(df_hp_plot.select('length_category').unique().to_numpy().flatten()) 144 | 145 | print(f"Available BQ bins: {unique_bq_bins}") 146 | print(f"Length categories: {unique_categories}") 147 | 148 | # Create subplot figure 149 | fig = make_subplots( 150 | rows=2, cols=1, 151 | subplot_titles=('Error Rate by Homopolymer Distance', 'Error Rate by Indel Length'), 152 | vertical_spacing=0.15 153 | ) 154 | 155 | # Color palette 156 | colors = px.colors.qualitative.Set1 157 | 158 | # Create traces for each combination of length_category and bq_bin 159 | trace_info = [] 160 | 161 | # First subplot: HP Distance vs Error Rate 162 | for i, category in enumerate(unique_categories): 163 | for j, bq_bin in enumerate(unique_bq_bins): 164 | # Filter data for this combination 165 | df_subset = df_hp_plot.filter( 166 | (pl.col('length_category') == category) & 167 | (pl.col('bq_bin') == bq_bin) 168 | ) 169 | 170 | if df_subset.height == 0: 171 | continue # Skip empty combinations 172 | 173 
| # Convert to numpy arrays 174 | x_data = df_subset.select('hp_dist').to_numpy().flatten() 175 | y_data = df_subset.select('error_rate').to_numpy().flatten() 176 | indel_count_data = df_subset.select('indel_count').to_numpy().flatten() 177 | total_count_data = df_subset.select('total_count').to_numpy().flatten() 178 | 179 | # Sort by hp_dist for proper line connection 180 | if connect_lines and len(x_data) > 1: 181 | sort_idx = np.argsort(x_data) 182 | x_data = x_data[sort_idx] 183 | y_data = y_data[sort_idx] 184 | indel_count_data = indel_count_data[sort_idx] 185 | total_count_data = total_count_data[sort_idx] 186 | mode = 'markers+lines' 187 | else: 188 | mode = 'markers' 189 | 190 | # Determine visibility (default to 37-59 only) 191 | visible = True if bq_bin == '37-59' else False 192 | 193 | trace_name = f'{category} (BQ: {bq_bin})' 194 | 195 | fig.add_trace(go.Scatter( 196 | x=x_data, 197 | y=y_data, 198 | mode=mode, 199 | name=trace_name, 200 | visible=visible, 201 | legendgroup=category, # Group traces by category for consistent legend 202 | marker=dict( 203 | color=colors[i % len(colors)], 204 | size=6, 205 | opacity=0.7 206 | ), 207 | line=dict( 208 | color=colors[i % len(colors)], 209 | width=2 210 | ) if connect_lines else None, 211 | customdata=np.column_stack(( 212 | indel_count_data, 213 | total_count_data, 214 | [bq_bin] * len(x_data) 215 | )), 216 | hovertemplate=( 217 | 'Indel Length: ' + category + '
' + 218 | 'BQ Bin: ' + bq_bin + '
' + 219 | 'HP Distance: %{x}
' + 220 | 'Error Rate: %{y:.2e}
' + 221 | 'Indel Count: %{customdata[0]}
' + 222 | 'Total Count: %{customdata[1]}
' + 223 | '' 224 | ) 225 | ), row=1, col=1) 226 | 227 | trace_info.append({ 228 | 'bq_bin': bq_bin, 229 | 'category': category, 230 | 'trace_idx': len(list(fig.data)) - 1, 231 | 'subplot': 'hp_dist' 232 | }) 233 | 234 | # Second subplot: Indel Length vs Error Rate 235 | for i, category in enumerate(unique_categories): 236 | for j, bq_bin in enumerate(unique_bq_bins): 237 | # Filter data for this combination 238 | df_subset = df_length_plot.filter( 239 | (pl.col('length_category') == category) & 240 | (pl.col('bq_bin') == bq_bin) 241 | ) 242 | 243 | if df_subset.height == 0: 244 | continue # Skip empty combinations 245 | 246 | # Convert to numpy arrays 247 | error_rate = df_subset.select('error_rate').to_numpy().flatten()[0] 248 | indel_count = df_subset.select('indel_count').to_numpy().flatten()[0] 249 | total_count = df_subset.select('total_count').to_numpy().flatten()[0] 250 | 251 | # Determine visibility (default to 37-59 only) 252 | visible = True if bq_bin == '37-59' else False 253 | 254 | # For x-axis position, convert category to numeric value for plotting 255 | if category == '<-3': 256 | x_pos = -4 257 | elif category == '>3': 258 | x_pos = 4 259 | else: 260 | x_pos = int(category) 261 | 262 | trace_name = f'{category} (BQ: {bq_bin})' 263 | 264 | fig.add_trace(go.Scatter( 265 | x=[x_pos], 266 | y=[error_rate], 267 | mode='markers', 268 | name=trace_name, # Use same name to group in legend 269 | visible=visible, 270 | legendgroup=category, # Group with first subplot traces 271 | showlegend=False, # Don't show duplicate legend entries 272 | marker=dict( 273 | color=colors[i % len(colors)], 274 | size=8, 275 | opacity=0.7 276 | ), 277 | customdata=np.array([[indel_count, total_count, bq_bin]]), 278 | hovertemplate=( 279 | 'Indel Length: ' + category + '
' + 280 | 'BQ Bin: ' + bq_bin + '
' + 281 | 'Error Rate: %{y:.2e}
' + 282 | 'Indel Count: %{customdata[0]}
' + 283 | 'Total Count: %{customdata[1]}
' + 284 | '' 285 | ) 286 | ), row=2, col=1) 287 | 288 | trace_info.append({ 289 | 'bq_bin': bq_bin, 290 | 'category': category, 291 | 'trace_idx': len(list(fig.data)) - 1, 292 | 'subplot': 'length' 293 | }) 294 | 295 | # Create buttons for BQ bin selection 296 | buttons = [] 297 | 298 | # Add "All" button 299 | all_visible = [True] * len(list(fig.data)) 300 | buttons.append(dict( 301 | label="All BQ Bins", 302 | method="update", 303 | args=[{"visible": all_visible}] 304 | )) 305 | 306 | # Add individual BQ bin buttons 307 | for bq_bin in unique_bq_bins: 308 | visible_list = [] 309 | for trace in trace_info: 310 | visible_list.append(trace['bq_bin'] == bq_bin) 311 | 312 | # Mark 37-59 as default 313 | label = f"BQ: {bq_bin} (default)" if bq_bin == '37-59' else f"BQ: {bq_bin}" 314 | 315 | buttons.append(dict( 316 | label=label, 317 | method="update", 318 | args=[{"visible": visible_list}] 319 | )) 320 | 321 | # Update layout with BQ bin selector 322 | fig.update_layout( 323 | title='Indel Error Rate Analysis', 324 | width=1000, 325 | height=900, 326 | template='plotly_white', 327 | legend=dict( 328 | title="Indel Length", 329 | orientation="v", 330 | yanchor="top", 331 | y=1, 332 | xanchor="left", 333 | x=1.02 334 | ), 335 | updatemenus=[ 336 | dict( 337 | type="buttons", 338 | direction="left", 339 | buttons=buttons, 340 | pad={"r": 10, "t": 10}, 341 | showactive=True, 342 | x=0.01, 343 | xanchor="left", 344 | y=1.02, 345 | yanchor="top" 346 | ), 347 | ], 348 | 349 | #annotations=[ 350 | # dict(text="BQ Bin Filter:", showarrow=False, 351 | # x=0.01, y=1.02, yref="paper", align="left", 352 | # font=dict(size=11, color="black")) 353 | #] 354 | ) 355 | 356 | # Update subplot axes 357 | fig.update_xaxes(title_text="Homopolymer Distance (hp_dist)", showgrid=True, gridwidth=1, gridcolor='lightgray', row=1, col=1) 358 | fig.update_yaxes(title_text="Indel Error Rate", type='log', showgrid=True, gridwidth=1, gridcolor='lightgray', row=1, col=1) 359 | 360 | 
fig.update_xaxes(title_text="Indel Length", showgrid=True, gridwidth=1, gridcolor='lightgray', row=2, col=1) 361 | fig.update_yaxes(title_text="Indel Error Rate", type='log', showgrid=True, gridwidth=1, gridcolor='lightgray', row=2, col=1) 362 | 363 | # Set custom x-axis labels for second subplot 364 | fig.update_xaxes( 365 | tickvals=[-4, -3, -2, -1, 1, 2, 3, 4], 366 | ticktext=['<-3', '-3', '-2', '-1', '1', '2', '3', '>3'], 367 | row=2, col=1 368 | ) 369 | 370 | return fig 371 | 372 | def parse_arguments(): 373 | """Parse command line arguments""" 374 | parser = argparse.ArgumentParser( 375 | description='Analyze indel error rates from fraguracy output files', 376 | formatter_class=argparse.RawDescriptionHelpFormatter, 377 | epilog=""" 378 | Examples: 379 | python3 analyze_indel_errors.py indel_errors.bed.gz counts.txt 380 | python3 analyze_indel_errors.py --no-lines indel_errors.bed.gz counts.txt 381 | """ 382 | ) 383 | 384 | parser.add_argument( 385 | 'indel_errors_file', 386 | help='Input indel errors file (BED format, can be gzipped)' 387 | ) 388 | 389 | parser.add_argument( 390 | 'counts_file', 391 | help='Input counts file (tab-separated format)' 392 | ) 393 | 394 | parser.add_argument( 395 | '--no-lines', 396 | action='store_false', 397 | dest='lines', 398 | help='Show scatter plot only, without connecting lines (default: show lines)' 399 | ) 400 | 401 | parser.add_argument( 402 | '--output-prefix', 403 | default='indel_error_rates_by_hp_dist', 404 | help='Output file prefix (default: indel_error_rates_by_hp_dist)' 405 | ) 406 | 407 | return parser.parse_args() 408 | 409 | def main(): 410 | """Main analysis function""" 411 | 412 | args = parse_arguments() 413 | 414 | print(f"Reading indel errors file: {args.indel_errors_file}") 415 | indel_df = read_indel_errors(args.indel_errors_file) 416 | print(f"Indel errors shape: {indel_df.shape}") 417 | print("Indel errors preview:") 418 | print(indel_df.head()) 419 | 420 | print(f"\nReading counts file: 
{args.counts_file}") 421 | counts_df = read_counts_file(args.counts_file) 422 | print(f"Counts shape: {counts_df.shape}") 423 | print("Counts preview:") 424 | print(counts_df.head()) 425 | 426 | print("\nCalculating error rates...") 427 | error_rates_df = calculate_error_rates(indel_df, counts_df) 428 | print(f"Error rates shape: {error_rates_df.shape}") 429 | print("Error rates preview:") 430 | print(error_rates_df.head()) 431 | 432 | print(f"\nCreating plot{' with connected lines' if args.lines else ' (scatter plot only)'}...") 433 | fig = create_plot(error_rates_df, connect_lines=args.lines) 434 | 435 | # Save the plot as HTML for interactivity 436 | html_output = f"{args.output_prefix}.html" 437 | png_output = f"{args.output_prefix}.png" 438 | 439 | fig.write_html(html_output) 440 | print(f"Interactive plot saved as '{html_output}'") 441 | 442 | # Try to save PNG (requires kaleido) 443 | try: 444 | fig.write_image(png_output, width=1000, height=600) 445 | print(f"Static plot also saved as '{png_output}'") 446 | except Exception as e: 447 | print(f"Could not save PNG (install kaleido for PNG export): {e}") 448 | 449 | # Show summary statistics with aggregated lengths 450 | print("\nSummary statistics (with length aggregation):") 451 | summary_df = error_rates_df.with_columns([ 452 | pl.when(pl.col('length') > 3) 453 | .then(pl.lit('>3')) 454 | .when(pl.col('length') < -3) 455 | .then(pl.lit('<-3')) 456 | .otherwise(pl.col('length').cast(pl.Utf8)) 457 | .alias('length_category') 458 | ]) 459 | 460 | summary = summary_df.group_by('length_category').agg([ 461 | pl.col('error_rate').mean().alias('mean_error_rate'), 462 | pl.col('error_rate').std().alias('std_error_rate'), 463 | pl.col('indel_count').sum().alias('total_indel_count'), 464 | pl.col('total_count').sum().alias('total_count') 465 | ]).sort('length_category') 466 | print(summary) 467 | 468 | # Show the plot 469 | fig.show() 470 | 471 | if __name__ == "__main__": 472 | main() 
-------------------------------------------------------------------------------- /scripts/count-errors.py: -------------------------------------------------------------------------------- 1 | import defopt 2 | import cyvcf2 3 | from collections import defaultdict 4 | 5 | 6 | def main(vcf_file: str, *, prefix: str = "error-counts"): 7 | """ 8 | count errors (de novos) in a VCF file split by SNP/indel and by sample 9 | this expects particular annotations added for paper analyses 10 | 11 | :param vcf_file: path to VCF file 12 | :param prefix: prefix for output files 13 | """ 14 | d = {'snp': defaultdict(lambda: defaultdict(int)), 15 | 'indel': defaultdict(lambda: defaultdict(int))} 16 | 17 | lcr_d = {'snp': defaultdict(lambda: [0, 0]), 'indel': defaultdict(lambda: [0, 0])} 18 | 19 | counts_by_sample = defaultdict(int) 20 | 21 | vcf = cyvcf2.VCF(vcf_file) 22 | for variant in vcf: 23 | if variant.FILTER != 'PASS' and variant.FILTER is not None: 24 | continue 25 | 26 | error_samples = variant.INFO.get('dn').split(',') 27 | frag_count = int(variant.INFO.get('fraguracy_count', 0)) 28 | lcr = bool(variant.INFO.get('LCR', 0)) 29 | #frag_count = variant.INFO.get('fraguracy_samples', 0) 30 | snp = 'snp' if variant.is_snp else 'indel' 31 | for s in error_samples: 32 | d[snp][s][frag_count] += 1 33 | lcr_d[snp][s][lcr] += 1 34 | if snp == 'snp': 35 | counts_by_sample[s] += 1 36 | 37 | skip_samples = {s for s, cnt in counts_by_sample.items() if cnt > 1500} 38 | print({s: counts_by_sample[s] for s in skip_samples}) 39 | 40 | max_count = 50 41 | # now print out the snp and indel dictionaries to separate files 42 | for snp in ['snp', 'indel']: 43 | with open(f'{prefix}.{snp}.errors', 'w') as f: 44 | f.write("sample\tsnp\tcutoff\tcount_at_or_above\tcount\n") 45 | for sample in d[snp]: 46 | if sample in skip_samples: continue 47 | # we want to count all the errors 48 | cur_keys = [c for c in d[snp][sample].keys() if c >= max_count] 49 | cum_sum = sum([d[snp][sample][c] for c in 
cur_keys]) 50 | f.write(f'{sample}\t{snp}\t{max_count}\t{cum_sum}\t{cum_sum}\n') 51 | for i in range(max_count - 1, -1, -1): 52 | cnt = d[snp][sample].get(i, 0) 53 | cum_sum += cnt 54 | f.write(f'{sample}\t{snp}\t{i}\t{cum_sum}\t{cnt}\n') 55 | if snp == 'snp': 56 | assert(cum_sum == counts_by_sample[sample]) 57 | 58 | with open(f'{prefix}.{snp}.lcr', 'w') as f: 59 | f.write("sample\tsnp\tlcr\tcount\n") 60 | for sample in lcr_d[snp]: 61 | if sample in skip_samples: continue 62 | # we want to count all the errors 63 | f.write(f'{sample}\t{snp}\ttrue\t{lcr_d[snp][sample][True]}\n') 64 | f.write(f'{sample}\t{snp}\tfalse\t{lcr_d[snp][sample][False]}\n') 65 | 66 | if __name__ == '__main__': 67 | defopt.run(main) 68 | -------------------------------------------------------------------------------- /src/combine_counts.rs: -------------------------------------------------------------------------------- 1 | use crate::fraguracy; 2 | use std::io; 3 | use std::io::{BufRead, Write}; 4 | use std::path::PathBuf; 5 | use std::string::String; 6 | 7 | #[derive(Hash, Debug, PartialOrd, PartialEq, Ord, Eq, Clone)] 8 | pub(crate) struct Count { 9 | read12: u8, 10 | orientation: u8, 11 | read_pos: u32, 12 | bq_bin: u8, 13 | context: [char; 2], 14 | homopolymer_dist: i8, 15 | total: u32, 16 | errors: u32, 17 | } 18 | 19 | impl std::ops::AddAssign for Count { 20 | fn add_assign(&mut self, o: Count) { 21 | assert!(self.read12 == o.read12); 22 | assert!(self.orientation == o.orientation); 23 | assert!(self.read_pos == o.read_pos); 24 | assert!(self.bq_bin == o.bq_bin); 25 | assert!(self.context == o.context); 26 | assert!(self.homopolymer_dist == o.homopolymer_dist); 27 | self.errors += o.errors; 28 | self.total += o.total; 29 | } 30 | } 31 | 32 | impl Count { 33 | fn from_line(s: &str, file_name: &str) -> Count { 34 | let mut sp = s.trim().split('\t'); 35 | Count { 36 | read12: sp.next().unwrap_or_else(|| panic!("not enough columns in line: {s} from file: {file_name}"))[1..] 
37 | .parse::() 38 | .map(|val| val - 1) 39 | .unwrap_or_else(|e| panic!("error parsing read12 from line: {s} in file: {file_name}, error: {e}")), 40 | orientation: match sp.next() { 41 | Some("f") => 0, 42 | Some("r") => 1, 43 | _ => panic!("error parsing orientation, expected f or r, in line: {s} from file: {file_name}"), 44 | }, 45 | bq_bin: fraguracy::REVERSE_Q_LOOKUP[sp.next().unwrap_or_else(|| panic!("not enough columns for bq_bin in line: {s} from file: {file_name}"))], 46 | read_pos: sp 47 | .next() 48 | .unwrap_or_else(|| panic!("not enough columns for read_pos in line: {s} from file: {file_name}")) 49 | .parse::() 50 | .unwrap_or_else(|e| panic!("error parsing read_pos from line: {s} in file: {file_name}, error: {e}")), 51 | context: { 52 | let ctx_str = sp 53 | .next() 54 | .unwrap_or_else(|| panic!("error getting context string from line: {s} from file: {file_name}")); 55 | let mut ctx_chars = ctx_str.chars(); 56 | [ 57 | ctx_chars.next().unwrap_or_else(|| panic!("expecting two characters for context, got: {ctx_str} in line: {s} from file: {file_name}")), 58 | ctx_chars.next().unwrap_or_else(|| panic!("expecting two characters for context, got: {ctx_str} in line: {s} from file: {file_name}")), 59 | ] 60 | }, 61 | homopolymer_dist: { 62 | let val = sp 63 | .next() 64 | .unwrap_or_else(|| panic!("not enough columns for homopolymer_dist in line: {s} from file: {file_name}")) 65 | .trim(); 66 | if val == "NA" { 67 | i8::MAX 68 | } else { 69 | val.parse::() 70 | .unwrap_or_else(|e| panic!("error parsing homopolymer_dist from line: {s} in file: {file_name}, error: {e}")) 71 | } 72 | }, 73 | total: sp 74 | .next() 75 | .unwrap_or_else(|| panic!("not enough columns for total in line: {s} from file: {file_name}")) 76 | .parse::() 77 | .unwrap_or_else(|e| panic!("error parsing total from line: {s} in file: {file_name}, error: {e}")), 78 | errors: sp 79 | .next() 80 | .unwrap_or_else(|| panic!("not enough columns for errors in line: {s} from file: {file_name}")) 
81 | .trim() 82 | .parse::() 83 | .unwrap_or_else(|e| panic!("error parsing errors from line: {s} in file: {file_name}, error: {e}")), 84 | } 85 | } 86 | } 87 | 88 | pub(crate) fn combine_counts_main( 89 | counts_files: Vec, 90 | output_path: String, 91 | ) -> io::Result<()> { 92 | let mut counts: std::collections::HashSet = std::collections::HashSet::new(); 93 | let mut header: String = String::new(); 94 | for count_file in counts_files.iter() { 95 | // open each file and read each line. 96 | let file = std::fs::File::open(count_file)?; 97 | let reader = std::io::BufReader::new(file); 98 | for (i, line) in reader.lines().enumerate() { 99 | let line = line?; 100 | if i == 0 { 101 | assert!( 102 | line.starts_with("read12"), 103 | "expecting header line from counts file" 104 | ); 105 | assert!( 106 | line.contains("hp_dist"), 107 | "expecting hp_dist in header please run with newer version of fraguracy" 108 | ); 109 | // take the first 8 columns as the header 110 | header = line.split('\t').take(8).collect::>().join("\t"); 111 | continue; 112 | } 113 | let mut c = Count::from_line(&line, count_file.to_str().unwrap()); 114 | let entry = counts.take(&c); 115 | if let Some(entry) = entry { 116 | c.total += entry.total; 117 | c.errors += entry.errors; 118 | } 119 | counts.insert(c); 120 | } 121 | } 122 | 123 | let mut out = std::fs::File::create(output_path)?; 124 | writeln!(out, "{}", header)?; 125 | 126 | let mut counts: Vec = counts.into_iter().collect(); 127 | counts.sort(); 128 | for c in counts.iter() { 129 | writeln!( 130 | out, 131 | "r{}\t{}\t{}\t{}\t{}{}\t{}\t{}\t{}", 132 | c.read12 + 1, 133 | ['f', 'r'][c.orientation as usize], 134 | fraguracy::Q_LOOKUP[c.bq_bin as usize], 135 | c.read_pos, 136 | c.context[0], 137 | c.context[1], 138 | c.homopolymer_dist, 139 | c.total, 140 | c.errors 141 | )?; 142 | } 143 | 144 | Ok(()) 145 | } 146 | 147 | #[cfg(test)] 148 | mod tests { 149 | use super::*; 150 | 151 | #[test] 152 | fn test_from_line() { 153 | let line = "r1 
f 05-19 0 AC -1 61502 609"; 154 | 155 | let c = Count::from_line(line, "test_file.txt"); 156 | assert_eq!(c.read12, 0); 157 | assert_eq!(c.orientation, 0); 158 | assert_eq!(c.bq_bin, 1); 159 | assert_eq!(c.read_pos, 0); 160 | assert_eq!(c.context, ['A', 'C']); 161 | assert_eq!(c.homopolymer_dist, -1); 162 | assert_eq!(c.total, 61502); 163 | assert_eq!(c.errors, 609); 164 | } 165 | 166 | #[test] 167 | fn test_add_count() { 168 | let mut a = Count { 169 | read12: 0, 170 | orientation: 1, 171 | bq_bin: 2, 172 | read_pos: 3, 173 | context: ['A', 'T'], 174 | homopolymer_dist: -1, 175 | total: 32, 176 | errors: 1, 177 | }; 178 | let mut b = a.clone(); 179 | b.errors = 3; 180 | a += b; 181 | assert_eq!(a.homopolymer_dist, -1); 182 | 183 | assert_eq!(a.errors, 4); 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /src/combine_errors.rs: -------------------------------------------------------------------------------- 1 | use crate::fraguracy; 2 | use core::cmp::Reverse; 3 | use itertools::Itertools; 4 | use rust_htslib::bgzf; 5 | use std::cmp::Ordering; 6 | use std::collections::BinaryHeap; 7 | use std::collections::HashMap; 8 | use std::error::Error; 9 | use std::fs::File; 10 | use std::io; 11 | use std::io::{BufRead, BufReader, Write}; 12 | use std::ops::Add; 13 | use std::path::PathBuf; 14 | use std::string::String; 15 | 16 | #[derive(Eq, Debug, Default, Clone)] 17 | struct Interval { 18 | tid: i32, 19 | chrom: String, 20 | start: u32, 21 | end: u32, 22 | group: u8, 23 | length: i32, 24 | hp_dist: i16, 25 | count: [u32; 7], 26 | file_i: u32, 27 | } 28 | struct IntervalHeap { 29 | // min heap 30 | h: BinaryHeap>, 31 | files: Vec>, 32 | chom_to_tid: HashMap, 33 | is_indel: Vec, 34 | } 35 | 36 | fn read_fai(path: PathBuf) -> HashMap { 37 | let f = File::open(&path); 38 | let mut h = HashMap::new(); 39 | if let Ok(fu) = f { 40 | for l in BufReader::new(fu).lines() { 41 | let l = l.expect("error parsing faidx"); 42 | let 
chrom = l 43 | .split('\t') 44 | .next() 45 | .expect("expected at least one value per line in faidx"); 46 | if chrom.starts_with('>') { 47 | log::warn!( 48 | "expecting fai, NOT fasta for argument found chrom of {}", 49 | chrom 50 | ); 51 | } 52 | h.insert(String::from(chrom), h.len() as i32); 53 | } 54 | } else { 55 | panic!("couldn't open file: {:?}", path.to_string_lossy()); 56 | } 57 | h 58 | } 59 | 60 | impl Add<&Interval> for &Interval { 61 | type Output = Interval; 62 | fn add(self, other: &Interval) -> Self::Output { 63 | assert_eq!(self.chrom, other.chrom); 64 | assert_eq!(self.start, other.start); 65 | assert_eq!(self.end, other.end); 66 | assert_eq!(self.group, other.group); 67 | assert_eq!(self.length, other.length, "indel lengths must be equal"); 68 | let counts = self 69 | .count 70 | .iter() 71 | .zip(other.count.iter()) 72 | .map(|(a, b)| a + b); 73 | // convert counts to [u32, 7] 74 | let counts: [u32; 7] = counts 75 | .collect::>() 76 | .try_into() 77 | .expect("error converting counts"); 78 | 79 | Interval { 80 | chrom: self.chrom.clone(), 81 | count: counts, 82 | ..*self 83 | } 84 | } 85 | } 86 | 87 | impl IntervalHeap { 88 | fn all_indels(&self) -> bool { 89 | self.is_indel.iter().all(|&x| x) 90 | } 91 | fn new(paths: Vec, fai_path: PathBuf) -> IntervalHeap { 92 | let fhs: Vec> = paths 93 | .iter() 94 | .map(|p| crate::files::open_file(Some(p.clone())).expect("error opening file")) 95 | .collect(); 96 | 97 | let mut ih = IntervalHeap { 98 | h: BinaryHeap::new(), 99 | files: fhs, 100 | chom_to_tid: read_fai(fai_path), 101 | is_indel: paths 102 | .iter() 103 | .map(|p| p.to_string_lossy().ends_with("indel-errors.bed.gz")) 104 | .collect(), 105 | }; 106 | 107 | assert!( 108 | ih.is_indel.iter().all(|&x| x) || ih.is_indel.iter().all(|&x| !x), 109 | "all files must be either indel error files or base error files, not a mix" 110 | ); 111 | 112 | ih.files 113 | .iter_mut() 114 | .enumerate() 115 | .for_each(|(file_i, fh)| loop { 116 | // loop to skip 
'#' comment lines 117 | let is_indel = ih.is_indel[file_i]; 118 | let mut buf = String::new(); 119 | let line = fh.read_line(&mut buf); 120 | if line.is_ok() && !buf.starts_with('#') { 121 | let r = parse_bed_line(&buf, file_i as u32, &(ih.chom_to_tid), is_indel); 122 | if r.is_err() && buf == "" { 123 | break; 124 | } else { 125 | ih.h.push(Reverse(r.unwrap_or_else(|_| { 126 | if buf != "" {} 127 | panic!("Error parsing first line from file: '{buf}'") 128 | }))); 129 | break; 130 | } 131 | } 132 | }); 133 | ih 134 | } 135 | } 136 | fn parse_bed_line( 137 | line: &str, 138 | file_i: u32, 139 | chrom_to_tid: &HashMap, 140 | is_indel: bool, 141 | ) -> Result> { 142 | let toks: Vec<&str> = line.trim().split('\t').collect(); 143 | // can be 6 if combine-errors was already run once. 144 | let mut iv = if !is_indel { 145 | let mut iv = Interval { 146 | tid: 0, 147 | chrom: String::from(toks[0]), 148 | start: str::parse::(toks[1])?, 149 | end: str::parse::(toks[2])?, 150 | group: (*fraguracy::REVERSE_Q_LOOKUP 151 | .get(toks[3].trim()) 152 | .unwrap_or_else(|| panic!("unknown bq bin: {}", toks[3]))), 153 | length: 0, 154 | count: [0; 7], 155 | hp_dist: i16::MAX, 156 | file_i, 157 | }; 158 | // toks[4] is the total count, which we don't need. 
because we can sum the count from toks[5] 159 | 160 | // parse the counts which appear as, e.g., 161 | // AC:1,AG:2,AT:3,CG:4,CT:5,GT:6,NN:0 162 | // and increment the appropriate index using CONTEXT_LOOKUP from fraguracy.rs 163 | for s in toks[5].split(',') { 164 | let (context, count) = s.split(':').collect_tuple().unwrap(); 165 | if context.len() != 2 { 166 | return Err( 167 | format!("expecting two characters for context, found {}", context).into(), 168 | ); 169 | } 170 | let mut context = context.chars(); 171 | let a = context.next().unwrap(); 172 | let b = context.next().unwrap(); 173 | let idx = fraguracy::CONTEXT_LOOKUP[&(a as u8, b as u8)]; 174 | iv.count[idx] += count.parse::().unwrap(); 175 | } 176 | iv 177 | } else { 178 | // indel errors 179 | let mut iv = Interval { 180 | tid: 0, 181 | chrom: String::from(toks[0]), 182 | start: str::parse::(toks[1])?, 183 | end: str::parse::(toks[2])?, 184 | group: *fraguracy::REVERSE_Q_LOOKUP 185 | .get(toks[5].trim()) 186 | .unwrap_or_else(|| panic!("unknown bq bin: {}", toks[5])), 187 | count: [0; 7], 188 | length: str::parse::(toks[4])?, 189 | hp_dist: str::parse::(toks[6]).unwrap_or(crate::fraguracy::MAX_HP_DIST + 1), 190 | file_i, 191 | }; 192 | // store the count in the first position for indels. 
193 | iv.count[0] = str::parse::(toks[3])?; 194 | iv 195 | }; 196 | iv.tid = *chrom_to_tid 197 | .get(&iv.chrom) 198 | .unwrap_or_else(|| panic!("chromosome '{}' not found in fai file", iv.chrom)); 199 | Ok(iv) 200 | } 201 | 202 | impl Iterator for IntervalHeap { 203 | type Item = Interval; 204 | 205 | /// pop an item out and then read in another interval from that file-handle 206 | fn next(&mut self) -> Option { 207 | if let Some(pop_iv) = self.h.pop() { 208 | let pop_iv = pop_iv.0; 209 | let file_i = pop_iv.file_i; 210 | let is_indel = self.is_indel[file_i as usize]; 211 | let fh = &mut self.files[file_i as usize]; 212 | let mut buf = String::new(); 213 | let line_len = &fh.read_line(&mut buf); 214 | if line_len.is_ok() && *(line_len).as_ref().unwrap() > 0 { 215 | let r = parse_bed_line(&buf, file_i, &self.chom_to_tid, is_indel); 216 | if let Ok(iv) = r { 217 | self.h.push(Reverse(iv)); 218 | } else { 219 | panic!("{:?} line_len: {:?}", r.err().unwrap(), line_len); 220 | } 221 | } 222 | Some(pop_iv) 223 | } else { 224 | None 225 | } 226 | } 227 | } 228 | 229 | impl PartialEq for Interval { 230 | fn eq(&self, b: &Interval) -> bool { 231 | self.chrom == b.chrom 232 | && self.start == b.start 233 | && self.end == b.end 234 | && self.group == b.group 235 | && self.length == b.length 236 | } 237 | } 238 | 239 | impl PartialOrd for Interval { 240 | #[allow(clippy::non_canonical_partial_ord_impl)] 241 | fn partial_cmp(&self, b: &Interval) -> Option { 242 | if self.tid != b.tid { 243 | return if self.tid < b.tid { 244 | Some(Ordering::Less) 245 | } else { 246 | Some(Ordering::Greater) 247 | }; 248 | } 249 | if self.start != b.start { 250 | return if self.start < b.start { 251 | Some(Ordering::Less) 252 | } else { 253 | Some(Ordering::Greater) 254 | }; 255 | } 256 | 257 | if self.end != b.end { 258 | return if self.end < b.end { 259 | Some(Ordering::Less) 260 | } else { 261 | Some(Ordering::Greater) 262 | }; 263 | } 264 | 265 | if self.length != b.length { 266 | return if 
self.length < b.length { 267 | Some(Ordering::Less) 268 | } else { 269 | Some(Ordering::Greater) 270 | }; 271 | } 272 | 273 | Some(self.group.cmp(&b.group)) 274 | } 275 | } 276 | 277 | impl Ord for Interval { 278 | fn cmp(&self, b: &Interval) -> std::cmp::Ordering { 279 | self.partial_cmp(b).expect("cmp: not expecting None") 280 | } 281 | } 282 | 283 | pub(crate) fn combine_errors_main( 284 | paths: Vec, 285 | fai_path: PathBuf, 286 | output_path: String, 287 | ) -> io::Result<()> { 288 | let ih = IntervalHeap::new(paths, fai_path); 289 | if ih.all_indels() { 290 | log::info!("all indels"); 291 | } 292 | 293 | // Append .gz if not already present 294 | let mut output_path = if !output_path.ends_with(".gz") { 295 | output_path + ".gz" 296 | } else { 297 | output_path 298 | }; 299 | let all_indels = ih.all_indels(); 300 | 301 | if all_indels && !output_path.ends_with("indel-errors.bed.gz") { 302 | log::warn!("all indels, but output path does not end with 'indel-errors.bed.gz'. renaming"); 303 | output_path = output_path.replace(".bed.gz", ".indel-errors.bed.gz"); 304 | } 305 | 306 | let mut writer = 307 | bgzf::Writer::from_path(&output_path).expect("error creating bgzip output file"); 308 | 309 | if all_indels { 310 | writer.write_all(b"#chrom\tstart\tend\tcount\tlength\tbq_bin\thp_dist\tn_samples\n")?; 311 | } else { 312 | writer.write_all(b"#chrom\tstart\tend\tbq_bin\tcount\tcontexts\tn_samples\n")?; 313 | } 314 | 315 | for (_, ivs) in &ih 316 | .into_iter() 317 | .group_by(|iv| (iv.tid, iv.start, iv.end, iv.group, iv.length)) 318 | { 319 | let ivs: Vec = ivs.into_iter().collect(); 320 | let n = ivs 321 | .iter() 322 | .filter(|iv| iv.count.iter().any(|&c| c > 0)) 323 | .count(); 324 | let iv0 = ivs[0].clone(); 325 | let iv = ivs.iter().skip(1).fold(iv0, |acc, iv| &acc + iv); 326 | 327 | let line = if all_indels { 328 | format!( 329 | "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n", 330 | iv.chrom, 331 | iv.start, 332 | iv.end, 333 | iv.count[0], 334 | iv.length, 335 | 
fraguracy::Q_LOOKUP[iv.group as usize], 336 | if iv.hp_dist == crate::fraguracy::MAX_HP_DIST + 1 { 337 | "NA".to_string() 338 | } else { 339 | iv.hp_dist.to_string() 340 | }, 341 | n 342 | ) 343 | } else { 344 | let (total_count, context_str) = crate::files::format_context_counts(iv.count); 345 | format!( 346 | "{}\t{}\t{}\t{}\t{}\t{}\t{}\n", 347 | iv.chrom, 348 | iv.start, 349 | iv.end, 350 | if iv.group == u8::MAX { 351 | "NA" 352 | } else { 353 | fraguracy::Q_LOOKUP[iv.group as usize] 354 | }, 355 | total_count, 356 | context_str, 357 | n 358 | ) 359 | }; 360 | writer.write_all(line.as_bytes())?; 361 | } 362 | log::info!("wrote {}", output_path); 363 | writer.flush()?; 364 | Ok(()) 365 | } 366 | -------------------------------------------------------------------------------- /src/files.rs: -------------------------------------------------------------------------------- 1 | use crate::fraguracy::{InnerCounts, Stat, CONTEXT_TO_CONTEXT2}; 2 | use std::string::String; 3 | 4 | use flate2::bufread::GzDecoder; 5 | use itertools::Itertools; 6 | use std::fs::File; 7 | use std::io::{BufRead, BufReader}; 8 | use std::path::PathBuf; 9 | 10 | use rust_htslib::bgzf; 11 | use std::io::Write; 12 | 13 | pub(crate) type Iv = rust_lapper::Interval; 14 | 15 | pub(crate) fn write_stats(stats: Vec, output_prefix: PathBuf) { 16 | let header = Stat::header(); 17 | 18 | let mut fh = std::fs::File::create( 19 | output_prefix 20 | .to_str() 21 | .expect("error getting output prefix") 22 | .to_owned() 23 | + "counts.txt", 24 | ) 25 | .expect("error opening file!"); 26 | 27 | writeln!(fh, "{header}").expect("error writing to file"); 28 | stats 29 | .iter() 30 | .for_each(|s| writeln!(fh, "{s}").expect("error writing to file")); 31 | } 32 | 33 | pub(crate) fn format_context_counts(counts: [u32; 7]) -> (u32, String) { 34 | let mut total: u32 = 0; 35 | let contexts: String = counts 36 | .iter() 37 | .enumerate() 38 | .filter(|(_, &count)| count > 0) 39 | .map(|(idx, &count)| { 40 | let context 
= CONTEXT_TO_CONTEXT2[idx]; 41 | let a = context[0]; 42 | let b = context[1]; 43 | total += count; 44 | format!("{a}{b}:{count}") 45 | }) 46 | .collect::>() 47 | .join(","); 48 | 49 | (total, contexts) 50 | } 51 | 52 | pub(crate) fn write_errors(counts: &InnerCounts, output_prefix: PathBuf, chroms: Vec) { 53 | let path = output_prefix 54 | .to_str() 55 | .expect("error getting output prefix") 56 | .to_owned() 57 | + "errors.bed.gz"; 58 | 59 | let mut errfh = bgzf::Writer::from_path(&path).expect("error opening bgzip file!"); 60 | errfh 61 | .write_all(b"#chrom\tstart\tend\tbq_bin\tcount\tcontexts\n") 62 | .expect("error writing header"); 63 | 64 | for pos in counts.error_positions.keys().sorted() { 65 | let cnt = counts.error_positions[pos]; 66 | let (total, contexts) = format_context_counts(cnt); 67 | let chrom = &chroms[pos.tid as usize]; 68 | let position = pos.pos; 69 | let end = position + 1; 70 | let bqs = crate::fraguracy::Q_LOOKUP[pos.bq_bin as usize]; 71 | let line = format!("{chrom}\t{position}\t{end}\t{bqs}\t{total}\t{contexts}\n"); 72 | errfh 73 | .write_all(line.as_bytes()) 74 | .expect("error writing to error file"); 75 | } 76 | write_indel_errors(counts, output_prefix, chroms); 77 | } 78 | 79 | fn write_indel_errors(counts: &InnerCounts, output_prefix: PathBuf, chroms: Vec) { 80 | let path = output_prefix 81 | .to_str() 82 | .expect("error getting output prefix") 83 | .to_owned() 84 | + "indel-errors.bed.gz"; 85 | 86 | let mut errfh = bgzf::Writer::from_path(&path).expect("error opening bgzip file!"); 87 | 88 | errfh 89 | .write_all(b"#chrom\tstart\tend\tcount\tlength\tbq_bin\thp_dist\n") 90 | .expect("error writing header"); 91 | 92 | for ((pos, len, hp_dist), cnt) in counts.indel_error_positions.iter().sorted() { 93 | let chrom = &chroms[pos.tid as usize]; 94 | let position = pos.pos; 95 | let bq_bin = crate::fraguracy::Q_LOOKUP[pos.bq_bin as usize]; 96 | let end = position as i64 + (if *len > 0 { *len } else { 1 }) as i64; 97 | let hp_dist_str = 
if *hp_dist == crate::fraguracy::MAX_HP_DIST + 1 { 98 | "NA".to_string() 99 | } else { 100 | hp_dist.to_string() 101 | }; 102 | let line = format!("{chrom}\t{position}\t{end}\t{cnt}\t{len}\t{bq_bin}\t{hp_dist_str}\n"); 103 | errfh 104 | .write_all(line.as_bytes()) 105 | .expect("error writing to indel-error file"); 106 | } 107 | } 108 | 109 | pub(crate) fn open_file(path: Option) -> Option> { 110 | let file = File::open(path.as_ref().unwrap()); 111 | if file.is_err() { 112 | eprintln!("error opening file: {}", file.unwrap_err()); 113 | return None; 114 | } 115 | let file = file.unwrap(); 116 | 117 | // Check if it's a bgzip file 118 | let mut buf_file = BufReader::new(file); 119 | let b = buf_file.fill_buf().expect("error reading from file"); 120 | 121 | let reader: Box = if b.starts_with(b"\x1f\x8b") { 122 | if path.as_ref().unwrap().to_str().unwrap().ends_with(".gz") { 123 | // Try opening as bgzip first 124 | if let Ok(bgzf_reader) = bgzf::Reader::from_path(path.as_ref().unwrap()) { 125 | let buf_reader = BufReader::new(bgzf_reader); 126 | Box::new(buf_reader) 127 | } else { 128 | // Fall back to regular gzip 129 | Box::new(BufReader::new(GzDecoder::new(buf_file))) 130 | } 131 | } else { 132 | Box::new(BufReader::new(GzDecoder::new(buf_file))) 133 | } 134 | } else { 135 | Box::new(buf_file) 136 | }; 137 | Some(reader) 138 | } 139 | 140 | #[cfg(test)] 141 | mod tests { 142 | use super::*; 143 | 144 | #[test] 145 | fn test_format_context_counts() { 146 | // Test case 1: All counts are non-zero 147 | let counts1 = [1, 2, 3, 4, 5, 6, 0]; 148 | let (total1, contexts1) = format_context_counts(counts1); 149 | assert_eq!(total1, 21); 150 | assert_eq!(contexts1, "AC:1,AG:2,AT:3,CA:4,CG:5,CT:6"); 151 | 152 | // Test case 2: Some counts are zero 153 | let counts2 = [0, 2, 0, 4, 0, 6, 0]; 154 | let (total2, contexts2) = format_context_counts(counts2); 155 | assert_eq!(total2, 12); 156 | assert_eq!(contexts2, "AG:2,CA:4,CT:6"); 157 | 158 | // Test case 3: All counts are zero 
159 | let counts3 = [0, 0, 0, 0, 0, 0, 0]; 160 | let (total3, contexts3) = format_context_counts(counts3); 161 | assert_eq!(total3, 0); 162 | assert_eq!(contexts3, ""); 163 | 164 | // Test case 4: Only one non-zero count 165 | let counts4 = [0, 0, 0, 0, 5, 0, 0]; 166 | let (total4, contexts4) = format_context_counts(counts4); 167 | assert_eq!(total4, 5); 168 | assert_eq!(contexts4, "CG:5"); 169 | 170 | // Test case 5: Only N has a count 171 | let counts5 = [0, 0, 0, 0, 0, 0, 1]; 172 | let (total5, contexts5) = format_context_counts(counts5); 173 | assert_eq!(total5, 1); 174 | assert_eq!(contexts5, "NN:1"); 175 | } 176 | } 177 | -------------------------------------------------------------------------------- /src/fraguracy.rs: -------------------------------------------------------------------------------- 1 | use bpci::*; 2 | use ndarray::prelude::Array; 3 | use ndarray::Array6; 4 | use rust_htslib::{ 5 | bam::{ 6 | record::{Cigar, CigarStringView}, 7 | IndexedReader, Read, Record, 8 | }, 9 | bgzf::CompressionLevel, 10 | }; 11 | use rust_htslib::{bgzf, faidx}; 12 | use rust_lapper::Lapper; 13 | use std::collections::BTreeMap; 14 | 15 | use crate::homopolymer as hp; 16 | use std::collections::HashMap; 17 | use std::fmt; 18 | use std::hash::Hash; 19 | use std::io::Write; 20 | use std::rc::Rc; 21 | use std::str; 22 | 23 | #[derive(Eq, Hash, PartialEq, Ord, PartialOrd)] 24 | pub(crate) struct Position { 25 | pub tid: u16, 26 | pub pos: u32, 27 | pub bq_bin: u8, 28 | } 29 | 30 | /// DepthMap is for a given genome position, the depth at each (aq, bq) pair. 31 | type DepthMap = HashMap<(u8, u8), u32>; 32 | type Length = i32; 33 | 34 | pub(crate) const MAX_HP_DIST: i16 = 15; 35 | 36 | /// Returns the homopolymer distance with the minimum absolute value. 37 | /// If both distances are available, returns the one with smaller absolute value. 38 | /// If only one is available, returns that one. 39 | /// If neither is available, returns None. 
40 | fn min_abs_hp_distance(dist_a: Option, dist_b: Option) -> Option { 41 | match (dist_a, dist_b) { 42 | (Some(a), Some(b)) => { 43 | if a.abs() <= b.abs() { 44 | Some(a) 45 | } else { 46 | Some(b) 47 | } 48 | } 49 | (Some(a), None) => Some(a), 50 | (None, Some(b)) => Some(b), 51 | (None, None) => None, 52 | } 53 | } 54 | 55 | pub(crate) struct Counts { 56 | pub(crate) ibam: Option, 57 | // read, f/r pos, bq, bp, ctx{6} */ 58 | pub(crate) counts: InnerCounts, 59 | pub(crate) depth: BTreeMap, 60 | pub(crate) last_depth_entry: Option<(String, u32, u32, String, String, u32)>, 61 | pub(crate) depth_writer: Option, 62 | } 63 | 64 | pub(crate) struct InnerCounts { 65 | // genome_pos 66 | pub(crate) errs: Array6, 67 | // read, f/r, pos, bq, ctx{2}, hp_dist */ 68 | pub(crate) cnts: Array6, 69 | pub(crate) mismatches: u64, 70 | pub(crate) matches: u64, 71 | 72 | // position -> error count. nice to find sites that are error-prone. 73 | pub(crate) error_positions: HashMap, 74 | // position -> indel error counts 75 | pub(crate) indel_error_positions: HashMap<(Position, Length, i16), u32>, 76 | } 77 | 78 | fn argmax(slice: &[T]) -> Option { 79 | (0..slice.len()).max_by_key(|i| &slice[*i]) 80 | } 81 | 82 | impl std::ops::AddAssign for InnerCounts { 83 | fn add_assign(&mut self, o: InnerCounts) { 84 | self.errs.add_assign(&o.errs); 85 | self.cnts.add_assign(&o.cnts); 86 | self.mismatches += o.mismatches; 87 | self.matches += o.matches; 88 | 89 | for (pos, cnt) in o.error_positions.into_iter() { 90 | let entry = self.error_positions.entry(pos).or_insert([0; 7]); 91 | for i in 0..entry.len() { 92 | entry[i] += cnt[i]; 93 | } 94 | } 95 | for (pos, cnt) in o.indel_error_positions.into_iter() { 96 | *(self.indel_error_positions.entry(pos)).or_insert(0) += cnt; 97 | } 98 | } 99 | } 100 | 101 | pub(crate) struct Stat { 102 | pub ci: ConfidenceInterval, 103 | read12: u8, 104 | fr: u8, 105 | bq_bin: u8, 106 | read_pos: u32, 107 | context: [char; 2], 108 | homopolymer_distance: i16, 109 
| total_count: u64, 110 | error_count: u64, 111 | } 112 | 113 | unsafe impl std::marker::Sync for Counts {} 114 | 115 | impl fmt::Display for Stat { 116 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 117 | let (lo, hi) = self.confidence_interval(&self.ci); 118 | let hp_dist_str = if self.homopolymer_distance == MAX_HP_DIST + 1 { 119 | "NA".to_string() 120 | } else { 121 | self.homopolymer_distance.to_string() 122 | }; 123 | write!( 124 | f, 125 | "{}\t{}\t{}\t{}\t{}{}\t{}\t{}\t{}\t{:e}\t{:e}", 126 | ["r1", "r2"][self.read12 as usize], 127 | ["f", "r"][self.fr as usize], 128 | Q_LOOKUP[self.bq_bin as usize], 129 | self.read_pos, 130 | self.context[0], 131 | self.context[1], 132 | hp_dist_str, 133 | self.total_count, 134 | self.error_count, 135 | lo.max(0.0), 136 | hi.max(0.0), 137 | ) 138 | } 139 | } 140 | 141 | #[derive(Debug, Clone, clap::ValueEnum, Default)] 142 | pub enum ConfidenceInterval { 143 | #[default] 144 | AgrestiCoull, 145 | Wald, 146 | Wilson, 147 | //WilsonWithCC, 148 | } 149 | 150 | impl fmt::Display for ConfidenceInterval { 151 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 152 | write!(f, "{:?}", self) 153 | } 154 | } 155 | 156 | impl Stat { 157 | pub(crate) fn header() -> String { 158 | String::from( 159 | "read12\tFR\tbq_bin\tread_pos\tcontext\thp_dist\ttotal_count\terror_count\terr_rate_lo\terr_rate_hi", 160 | ) 161 | } 162 | 163 | pub(crate) fn confidence_interval(&self, ci: &ConfidenceInterval) -> (f64, f64) { 164 | let sample = bpci::NSuccessesSample::new(self.total_count as f64, self.error_count as f64) 165 | .expect("error with proportion"); 166 | 167 | let f = match ci { 168 | ConfidenceInterval::AgrestiCoull => sample.agresti_coull(1.960), 169 | ConfidenceInterval::Wald => sample.wald(1.960), 170 | ConfidenceInterval::Wilson => sample.wilson_score(1.960), 171 | }; 172 | 173 | (f.lower(), f.upper()) 174 | } 175 | 176 | pub(crate) fn from_counts( 177 | c: &InnerCounts, 178 | bin_size: usize, 179 | ci: 
ConfidenceInterval, 180 | ) -> Vec { 181 | let mut stats = vec![]; 182 | for readi in 0..c.cnts.shape()[0] { 183 | for fri in 0..c.cnts.shape()[1] { 184 | for read_posi in 0..c.cnts.shape()[2] { 185 | for bqi in 0..c.cnts.shape()[3] { 186 | for ctx6i in 0..c.errs.shape()[4] { 187 | for hp_dist in 0..c.errs.shape()[5] { 188 | let n_err = c.errs[[readi, fri, read_posi, bqi, ctx6i, hp_dist]]; 189 | 190 | // from ctx6i, we get the original context. 191 | let bases = CONTEXT_TO_CONTEXT2[ctx6i]; 192 | 193 | let ctx2i = Counts::base_to_ctx2(bases[0] as u8); 194 | let n_tot = c.cnts[[readi, fri, read_posi, bqi, ctx2i, hp_dist]]; 195 | if n_tot < n_err { 196 | eprintln!( 197 | "BAD: {ctx6i} -> {bases:?}. ctx2i:{ctx2i}", 198 | ctx6i = ctx6i, 199 | bases = bases, 200 | ctx2i = ctx2i 201 | ); 202 | } 203 | 204 | stats.push(Stat { 205 | ci: ci.clone(), 206 | read12: readi as u8, 207 | fr: fri as u8, 208 | bq_bin: bqi as u8, 209 | read_pos: (read_posi * bin_size) as u32, 210 | context: bases, 211 | total_count: n_tot, 212 | error_count: n_err, 213 | homopolymer_distance: hp_dist as i16 - MAX_HP_DIST, 214 | }) 215 | } 216 | } 217 | } 218 | } 219 | } 220 | } 221 | stats 222 | } 223 | } 224 | 225 | impl InnerCounts { 226 | pub(crate) fn new(bins: usize) -> Self { 227 | InnerCounts { 228 | cnts: Array::zeros((2, 2, bins, 5, 2, (2 * MAX_HP_DIST + 2) as usize)), 229 | errs: Array::zeros((2, 2, bins, 5, 6, (2 * MAX_HP_DIST + 2) as usize)), 230 | mismatches: 0, 231 | matches: 0, 232 | error_positions: HashMap::new(), 233 | indel_error_positions: HashMap::new(), 234 | } 235 | } 236 | } 237 | 238 | impl Counts { 239 | pub(crate) fn new(ir: Option, bins: usize) -> Self { 240 | Counts { 241 | ibam: ir, 242 | counts: InnerCounts::new(bins), 243 | depth: BTreeMap::new(), 244 | last_depth_entry: None, 245 | depth_writer: None, 246 | } 247 | } 248 | 249 | pub(crate) fn set_depth_writer(&mut self, path: &str) -> std::io::Result<()> { 250 | let mut w = bgzf::Writer::from_path_with_level(path, 
CompressionLevel::Level(1)) 251 | .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; 252 | w.write_all(b"#chrom\tstart\tend\tread1_bq_bin\tread2_bq_bin\tpair-ovl-depth\n") 253 | .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; 254 | self.depth_writer = Some(w); 255 | Ok(()) 256 | } 257 | 258 | #[inline(always)] 259 | fn qual_to_bin(q: u8) -> u8 { 260 | match q { 261 | 0..=5 => 0, 262 | 6..=19 => 1, 263 | 20..=36 => 2, 264 | 37..=59 => 3, 265 | _ => 4, 266 | } 267 | } 268 | 269 | #[inline(always)] 270 | fn base_to_ctx2(b: u8) -> usize { 271 | match b as char { 272 | 'A' | 'T' => 0, 273 | 'C' | 'G' => 1, 274 | 'N' => 2, 275 | n => unreachable!("base_to_ctx2: {n}"), 276 | } 277 | } 278 | 279 | pub(crate) fn handle_depth(&mut self, bchrom: &str, bpos: i64) { 280 | // this function clears out the BTreeMap of depth entries that are before the current position. 281 | // it is not called if --no-denominator is specified. 282 | // it writes out the depth entries as they are popped out of the BTreeMap. 
283 | loop { 284 | let pos = *(self 285 | .depth 286 | .first_key_value() 287 | .unwrap_or((&u32::MAX, &DepthMap::new())) 288 | .0); 289 | if pos == u32::MAX { 290 | break; 291 | } 292 | if (pos as i64) < bpos { 293 | let depthmap = self.depth.remove(&pos).unwrap(); 294 | 295 | for ((aq, bq), dp) in depthmap.iter() { 296 | let a_bin = Q_LOOKUP[*aq as usize]; 297 | let b_bin = Q_LOOKUP[*bq as usize]; 298 | 299 | match &mut self.last_depth_entry { 300 | Some(( 301 | last_chrom, 302 | start_pos, 303 | last_pos, 304 | last_a_bin, 305 | last_b_bin, 306 | last_dp, 307 | )) => { 308 | if bchrom == last_chrom 309 | && a_bin == last_a_bin 310 | && b_bin == last_b_bin 311 | && dp == last_dp 312 | && *last_pos + 1 == pos 313 | { 314 | *last_pos = pos; 315 | } else { 316 | if let Some(writer) = &mut self.depth_writer { 317 | writeln!( 318 | writer, 319 | "{}\t{}\t{}\t{}\t{}\t{}", 320 | last_chrom, 321 | start_pos, 322 | *last_pos + 1, 323 | last_a_bin, 324 | last_b_bin, 325 | last_dp 326 | ) 327 | .expect("error writing to bgzf file"); 328 | } 329 | 330 | self.last_depth_entry = Some(( 331 | bchrom.to_string(), 332 | pos, 333 | pos, 334 | a_bin.to_string(), 335 | b_bin.to_string(), 336 | *dp, 337 | )); 338 | } 339 | } 340 | None => { 341 | self.last_depth_entry = Some(( 342 | bchrom.to_string(), 343 | pos, 344 | pos, 345 | a_bin.to_string(), 346 | b_bin.to_string(), 347 | *dp, 348 | )); 349 | } 350 | } 351 | } 352 | } else { 353 | break; 354 | } 355 | } 356 | } 357 | 358 | #[allow(clippy::too_many_arguments)] 359 | pub(crate) fn increment + std::fmt::Debug>( 360 | &mut self, 361 | a: Rc, 362 | b: Rc, 363 | min_base_qual: u8, 364 | min_map_qual: u8, 365 | bin_size: u32, 366 | fasta: &Option, 367 | chrom: N, 368 | include_tree: &Option<&Lapper>, 369 | exclude_tree: &Option<&Lapper>, 370 | hp_tree: &Option>, 371 | ) { 372 | let pieces = overlap_pieces(&a.cigar(), &b.cigar(), a.qual(), b.qual(), true); 373 | if pieces.is_empty() { 374 | return; 375 | } 376 | let a_seq = a.seq(); 
377 | let b_seq = b.seq(); 378 | if a_seq.len() / bin_size as usize >= self.counts.cnts.dim().2 { 379 | panic!( 380 | "index out of bounds: specify a --max-read-length of at least {}", 381 | a_seq.len() 382 | ) 383 | } 384 | 385 | if b_seq.len() / bin_size as usize >= self.counts.cnts.dim().2 { 386 | panic!( 387 | "index out of bounds: specify a --max-read-length of at least {}", 388 | b_seq.len() 389 | ) 390 | } 391 | 392 | let a_qual = a.qual(); 393 | let b_qual = b.qual(); 394 | 395 | let indel_errors = 396 | indel_error_pieces(&a.cigar(), &b.cigar(), a_qual, b_qual, min_base_qual); 397 | indel_errors.iter().for_each(|c: &Coordinates| { 398 | // include the event if any of it overlaps with the include tree. 399 | if let Some(t) = include_tree { 400 | if t.count(c.start, c.stop) == 0 { 401 | return; 402 | } 403 | } 404 | // exclude the event if any of it overlaps with the exclude tree. 405 | if let Some(t) = exclude_tree { 406 | if t.count(c.start, c.stop) != 0 { 407 | return; 408 | } 409 | } 410 | 411 | let len = match c.indel_type { 412 | IndelType::Insertion(len) => len as i32, 413 | IndelType::Deletion(len) => -(len as i32), 414 | IndelType::NotIndel => 0, 415 | }; 416 | 417 | let p = Position { 418 | tid: a.tid() as u16, 419 | pos: c.start, 420 | bq_bin: Counts::qual_to_bin(c.qual), 421 | }; 422 | 423 | let hps = hp_tree.as_ref().map(|t| { 424 | t.find( 425 | c.start.max(MAX_HP_DIST as u32) - MAX_HP_DIST as u32, 426 | c.stop + MAX_HP_DIST as u32, 427 | ) 428 | .collect::>() 429 | }); 430 | 431 | let hp_dist_a = hp::hp_distance( 432 | hps.as_deref(), 433 | c.start, 434 | a.pos() as u32, 435 | a.cigar().end_pos() as u32, 436 | if a.is_reverse() { -1 } else { 1 }, 437 | ); 438 | 439 | let hp_dist_b = hp::hp_distance( 440 | hps.as_deref(), 441 | c.start, 442 | b.pos() as u32, 443 | b.cigar().end_pos() as u32, 444 | if b.is_reverse() { -1 } else { 1 }, 445 | ); 446 | 447 | let indel_hp_dist = 448 | min_abs_hp_distance(hp_dist_a, hp_dist_b).unwrap_or(MAX_HP_DIST + 
1 as i16); 449 | 450 | *self 451 | .counts 452 | .indel_error_positions 453 | .entry((p, len, indel_hp_dist)) 454 | .or_insert(0) += 1; 455 | }); 456 | 457 | let mut genome_pos = u32::MAX; 458 | for [a_chunk, b_chunk, g_chunk] in pieces { 459 | // we want to limit to the bounds of the read. since homopolymers outside of the read won't affect it. 460 | let eps = 1; 461 | let g_start = (g_chunk.start.max(MAX_HP_DIST as u32) - MAX_HP_DIST as u32) 462 | .max(a.pos() as u32 + eps) 463 | .max(b.pos() as u32 + eps); 464 | let g_stop = (g_chunk.stop + MAX_HP_DIST as u32) 465 | .min(a.cigar().end_pos() as u32 - eps) 466 | .min(b.cigar().end_pos() as u32 - eps); 467 | let hps: Option> = if g_start <= g_stop { 468 | hp_tree.as_ref().map(|t| t.find(g_start, g_stop).collect()) 469 | } else { 470 | None 471 | }; 472 | 473 | for (ai, bi) in std::iter::zip(a_chunk.start..a_chunk.stop, b_chunk.start..b_chunk.stop) 474 | { 475 | let aq = a_qual[ai as usize]; 476 | if aq < min_base_qual { 477 | continue; 478 | } 479 | let bq = b_qual[bi as usize]; 480 | if bq < min_base_qual { 481 | continue; 482 | } 483 | genome_pos = g_chunk.start + (ai - a_chunk.start); 484 | 485 | if let Some(t) = include_tree { 486 | if t.count(genome_pos, genome_pos + 1) == 0 { 487 | continue; 488 | } 489 | } 490 | if let Some(t) = exclude_tree { 491 | if t.count(genome_pos, genome_pos + 1) != 0 { 492 | continue; 493 | } 494 | } 495 | 496 | let aq = Counts::qual_to_bin(aq); 497 | let bq = Counts::qual_to_bin(bq); 498 | 499 | if self.depth_writer.is_some() { 500 | self.depth 501 | .entry(genome_pos) 502 | .or_default() 503 | .entry((aq, bq)) 504 | .and_modify(|v| *v += 1) 505 | .or_insert(1); 506 | } 507 | 508 | let a_base = unsafe { a_seq.decoded_base_unchecked(ai as usize) }; 509 | let b_base = unsafe { b_seq.decoded_base_unchecked(bi as usize) }; 510 | 511 | let a_bin = (ai / bin_size) as usize; 512 | let b_bin = (bi / bin_size) as usize; 513 | 514 | let a_hp_dist = hp::hp_distance( 515 | hps.as_deref(), 516 
| genome_pos, 517 | a.pos() as u32, 518 | a.cigar().end_pos() as u32, 519 | if a.is_reverse() { -1 } else { 1 }, 520 | ) 521 | .map(|d| (d + MAX_HP_DIST) as usize) 522 | .unwrap_or((2 * MAX_HP_DIST + 1) as usize); 523 | 524 | let b_hp_dist = hp::hp_distance( 525 | hps.as_deref(), 526 | genome_pos, 527 | b.pos() as u32, 528 | b.cigar().end_pos() as u32, 529 | if b.is_reverse() { -1 } else { 1 }, 530 | ) 531 | .map(|d| (d + MAX_HP_DIST) as usize) 532 | .unwrap_or((2 * MAX_HP_DIST + 1) as usize); 533 | 534 | /* read1/2, F/R, pos, mq, bq, ctx, hp_dist */ 535 | let mut a_index = [ 536 | 1 - a.is_first_in_template() as usize, // 0 r1 537 | (a.is_reverse() as usize), // 538 | a_bin, 539 | aq as usize, 540 | // NOTE that this could be an error so we might change this later if we learn a_base is an error 541 | Counts::base_to_ctx2(a_base), 542 | a_hp_dist, 543 | ]; 544 | 545 | let mut b_index = [ 546 | 1 - b.is_first_in_template() as usize, 547 | (b.is_reverse() as usize), 548 | b_bin, 549 | bq as usize, 550 | // NOTE that this could be an error so we might change this later if we learn b_base is an error 551 | Counts::base_to_ctx2(b_base), 552 | b_hp_dist, 553 | ]; 554 | 555 | if a_base == b_base { 556 | // fast path to increment separately here because we must do some extra stuff to error base before incrementing count 557 | // if there is an error. 558 | self.counts.cnts[a_index] += 1; 559 | self.counts.cnts[b_index] += 1; 560 | self.counts.matches += 1; 561 | continue; 562 | } 563 | 564 | self.counts.mismatches += 1; 565 | let mut err = ['X', 'X']; 566 | 567 | let real_base = if self.ibam.is_some() { 568 | let mut base_counts = pile( 569 | self.ibam.as_mut().unwrap(), 570 | a.tid(), 571 | genome_pos, 572 | min_map_qual, 573 | min_base_qual, 574 | ); 575 | let am = argmax(&base_counts).expect("error selecting maximum index"); 576 | // check that the 2nd most common base is very low frequency, otherwise might be a het. 
577 | base_counts.sort(); 578 | let cmax = base_counts[4]; 579 | // if 3nd most common base is more than 50% of first, then we don't know which is right. 580 | if base_counts[3] as f64 / cmax as f64 > 0.5 { 581 | log::debug!( 582 | "skipping due to unknown truth given base_counts {:?} at pos:{}:{}", 583 | base_counts, 584 | chrom.as_ref(), 585 | genome_pos 586 | ); 587 | continue; 588 | } 589 | ['A', 'C', 'G', 'T', 'N'][am] 590 | } else { 591 | fasta 592 | .as_ref() 593 | .unwrap() 594 | .fetch_seq(&chrom, genome_pos as usize, genome_pos as usize) 595 | .expect("error extracting base")[0] as char 596 | }; 597 | if real_base == 'N' { 598 | let chrom_string = chrom.as_ref(); 599 | log::warn!("got 'N' for {chrom_string}:{genome_pos}. skipping"); 600 | let pos = Position { 601 | tid: a.tid() as u16, 602 | pos: genome_pos, 603 | // we don't know the bq, but assume it's the min. this very rarely happens so doesn't affect results. 604 | bq_bin: aq.min(bq), 605 | }; 606 | let context_counts = self.counts.error_positions.entry(pos).or_insert([0; 7]); 607 | context_counts[6] += 1; 608 | continue; 609 | } 610 | 611 | let err_index = if a_base == real_base as u8 { 612 | // b is the error 613 | let mut index = b_index; 614 | b_index[4] = a_index[4]; // we correct this because we want to track the true base 615 | index[4] = CONTEXT_LOOKUP[&(a_base, b_base)]; 616 | err[0] = a_base as char; 617 | err[1] = b_base as char; 618 | index 619 | } else if b_base == real_base as u8 { 620 | // a is the error 621 | let mut index = a_index; 622 | a_index[4] = b_index[4]; // we correct this because we want to track the true base 623 | index[4] = CONTEXT_LOOKUP[&(b_base, a_base)]; 624 | err[0] = b_base as char; 625 | err[1] = a_base as char; 626 | index 627 | } else { 628 | // can't determine which is error base. 629 | let pos = Position { 630 | tid: a.tid() as u16, 631 | pos: genome_pos, 632 | // we don't know the bq, but assume it's the min. 
this very rarely happens so doesn't affect results. 633 | bq_bin: aq.min(bq), 634 | }; 635 | log::debug!( 636 | "bases mismatches between reads and neither matches reference at pos:{}:{}. adding N", 637 | chrom.as_ref(), 638 | genome_pos 639 | ); 640 | let context_counts = self.counts.error_positions.entry(pos).or_insert([0; 7]); 641 | context_counts[6] += 1; 642 | continue; 643 | }; 644 | 645 | let pos = Position { 646 | tid: a.tid() as u16, 647 | pos: genome_pos, 648 | bq_bin: err_index[3] as u8, 649 | }; 650 | let context_idx = err_index[4]; 651 | let context_counts = self.counts.error_positions.entry(pos).or_insert([0; 7]); 652 | context_counts[context_idx] += 1; 653 | 654 | self.counts.cnts[a_index] += 1; 655 | self.counts.cnts[b_index] += 1; 656 | 657 | self.counts.errs[err_index] += 1; 658 | if log::log_enabled!(log::Level::Debug) 659 | && unsafe { str::from_utf8_unchecked(a.qname()) } 660 | == "A00744:46:HV3C3DSXX:2:2611:30798:35258" 661 | { 662 | log::debug!( 663 | "gpos:{}, err:{}->{}, err-index:{:?}, ai:{}, bi:{}, {:?}", 664 | genome_pos, 665 | /* base_counts, */ 666 | err[0], 667 | err[1], 668 | err_index, 669 | ai, 670 | bi, 671 | unsafe { str::from_utf8_unchecked(a.qname()) }, 672 | ); 673 | } 674 | } 675 | } 676 | if self.depth_writer.is_some() { 677 | self.handle_depth(chrom.as_ref(), genome_pos as i64); 678 | } 679 | } 680 | } 681 | 682 | fn pile( 683 | ibam: &mut IndexedReader, 684 | tid: i32, 685 | genome_pos: u32, 686 | min_map_qual: u8, 687 | min_base_qual: u8, 688 | ) -> [u32; 5] { 689 | let mut base_counts: [u32; 5] = [0; 5]; 690 | 691 | ibam.fetch((tid, genome_pos, genome_pos + 1)) 692 | .expect("Error seeking to genomic position"); 693 | 694 | let mut p = ibam.pileup(); 695 | p.set_max_depth(100_000); 696 | p.filter(|col| col.as_ref().unwrap().pos() == genome_pos) 697 | .for_each(|col| { 698 | let col = col.unwrap(); 699 | 700 | col.alignments().for_each(|aln| { 701 | if let Some(qpos) = aln.qpos() { 702 | let record = aln.record(); 703 | 
// here we want a accurate count, so we skip stuff at either 704 | // end of a read (within 3 bases of end) 705 | // along with low base-quality and low mapping-quality 706 | if qpos < 3 || qpos > record.qual().len() - 4 { 707 | return; 708 | } 709 | if record.mapq() < min_map_qual { 710 | return; 711 | } 712 | if record.qual()[qpos] < min_base_qual { 713 | return; 714 | } 715 | let base_idx = match record.seq()[qpos] as char { 716 | 'A' => 0, 717 | 'C' => 1, 718 | 'G' => 2, 719 | 'T' => 3, 720 | _ => 4, 721 | }; 722 | base_counts[base_idx] += 1; 723 | } 724 | }); 725 | }); 726 | base_counts 727 | } 728 | 729 | lazy_static! { 730 | pub(crate) static ref CONTEXT_LOOKUP: HashMap<(u8, u8), usize> = HashMap::from([ 731 | ((b'T', b'G'), 0usize), 732 | ((b'A', b'C'), 0usize), 733 | ((b'T', b'C'), 1usize), 734 | ((b'A', b'G'), 1usize), 735 | ((b'T', b'A'), 2usize), 736 | ((b'A', b'T'), 2usize), 737 | ((b'C', b'A'), 3usize), 738 | ((b'G', b'T'), 3usize), 739 | ((b'C', b'G'), 4usize), 740 | ((b'G', b'C'), 4usize), 741 | ((b'C', b'T'), 5usize), 742 | ((b'G', b'A'), 5usize), 743 | ((b'N', b'N'), 6usize), 744 | ]); 745 | pub(crate) static ref CONTEXT_TO_CONTEXT2: [[char; 2]; 7] = [ 746 | ['A', 'C'], 747 | ['A', 'G'], 748 | ['A', 'T'], 749 | ['C', 'A'], 750 | ['C', 'G'], 751 | ['C', 'T'], 752 | ['N', 'N'], 753 | ]; 754 | pub(crate) static ref Q_LOOKUP: [&'static str; 5] = ["0-5", "05-19", "20-36", "37-59", "60+"]; 755 | pub(crate) static ref REVERSE_Q_LOOKUP: HashMap<&'static str, u8> = HashMap::from([ 756 | ("0-5", 0), 757 | ("05-19", 1), 758 | ("20-36", 2), 759 | ("37-59", 3), 760 | ("60+", 4), 761 | ]); 762 | } 763 | 764 | pub(crate) fn filter_read(r: &Rc) -> bool { 765 | r.tid() == r.mtid() 766 | && r.tid() >= 0 767 | && !r.is_unmapped() 768 | && !r.is_mate_unmapped() 769 | && (r.pos() - r.mpos()).abs() < 1000 770 | && !r.is_supplementary() 771 | && !r.is_secondary() 772 | && !r.is_duplicate() 773 | && !r.is_quality_check_failed() 774 | } 775 | 776 | #[derive(Debug, 
PartialEq, Eq, PartialOrd, Ord)] 777 | pub struct Coordinates { 778 | pub start: u32, 779 | pub stop: u32, 780 | pub indel_type: IndelType, 781 | pub qual: u8, 782 | } 783 | 784 | #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] 785 | pub enum IndelType { 786 | Insertion(u32), 787 | Deletion(u32), 788 | NotIndel, 789 | } 790 | 791 | #[inline(always)] 792 | fn is_insertion(a: Cigar) -> bool { 793 | matches!(a, Cigar::Ins(_)) 794 | } 795 | 796 | #[inline(always)] 797 | fn query(a: Cigar) -> i64 { 798 | match a { 799 | Cigar::Match(n) | Cigar::SoftClip(n) | Cigar::Ins(n) | Cigar::Diff(n) | Cigar::Equal(n) => { 800 | n as i64 801 | } 802 | _ => 0, 803 | } 804 | } 805 | #[inline(always)] 806 | fn reference(a: Cigar) -> i64 { 807 | match a { 808 | Cigar::Match(n) | Cigar::Del(n) | Cigar::Diff(n) | Cigar::Equal(n) | Cigar::RefSkip(n) => { 809 | n as i64 810 | } 811 | _ => 0, 812 | } 813 | } 814 | 815 | fn indel_coords( 816 | cig: &CigarStringView, 817 | genomic_min: u32, 818 | genomic_max: u32, 819 | base_quals: &[u8], 820 | ) -> Vec { 821 | let mut result: Vec = Vec::new(); 822 | let mut start: u32 = cig.pos() as u32; 823 | let mut read_i = 0; 824 | 825 | for c in cig { 826 | if start > genomic_max { 827 | break; 828 | } 829 | if start + (reference(*c) as u32) < genomic_min { 830 | start += reference(*c) as u32; 831 | read_i += query(*c) as usize; 832 | continue; 833 | } 834 | // TODO: handle partial overlap of start with the current cigar op. 
835 | match c { 836 | Cigar::Ins(l) => { 837 | result.push(Coordinates { 838 | start, 839 | stop: start + 1, 840 | indel_type: IndelType::Insertion(*l as u32), 841 | qual: base_quals[read_i], 842 | }); 843 | } 844 | Cigar::Del(d) => { 845 | result.push(Coordinates { 846 | start, 847 | stop: start + *d, 848 | indel_type: IndelType::Deletion(*d as u32), 849 | qual: base_quals[read_i], 850 | }); 851 | } 852 | _ => {} 853 | } 854 | start += reference(*c) as u32; 855 | read_i += query(*c) as usize; 856 | } 857 | result 858 | } 859 | 860 | fn find_non_exact( 861 | a_indel_coords: &[Coordinates], 862 | b_indel_coords: &[Coordinates], 863 | result: &mut Vec, 864 | min_base_qual: u8, 865 | ) { 866 | for a in a_indel_coords { 867 | if a.qual <= min_base_qual { 868 | continue; 869 | } 870 | match b_indel_coords.binary_search_by(|b| b.cmp(a)) { 871 | Ok(_) => {} 872 | Err(bi) => { 873 | // we check base-qual (first base) of b as well. 874 | // this is a bit weird, but ensures that at least both 875 | // reads were confident at this site. 876 | if bi < b_indel_coords.len() && b_indel_coords[bi].qual < min_base_qual { 877 | continue; 878 | } 879 | let bq = if bi < b_indel_coords.len() { 880 | b_indel_coords[bi].qual 881 | } else { 882 | u8::MAX 883 | }; 884 | // any non-exact matches are errors 885 | result.push(Coordinates { 886 | start: a.start, 887 | stop: a.stop, 888 | indel_type: a.indel_type.clone(), 889 | qual: a.qual.min(bq), 890 | }); 891 | } 892 | } 893 | } 894 | } 895 | 896 | /// Report genomic coordiantes of bases that do not match between the reads. 
897 | fn indel_error_pieces( 898 |     a: &CigarStringView, 899 |     b: &CigarStringView, 900 |     a_qual: &[u8], 901 |     b_qual: &[u8], 902 |     min_base_qual: u8, 903 | ) -> Vec<Coordinates> { 904 |     let aend = a.end_pos() as u32; 905 |     let bend = b.end_pos() as u32; 906 |     //let astart = a.pos() + a.leading_softclips(); 907 |     //let bstart = b.pos() + b.leading_softclips(); 908 |     if aend <= b.pos() as u32 || bend <= a.pos() as u32 { 909 |         return vec![]; 910 |     } 911 |     let a_indel_coords = indel_coords(a, b.pos() as u32, bend, a_qual); 912 |     let b_indel_coords = indel_coords(b, a.pos() as u32, aend, b_qual); 913 |     if a_indel_coords.is_empty() && b_indel_coords.is_empty() { 914 |         return vec![]; 915 |     } 916 | 917 |     let mut result: Vec<Coordinates> = Vec::new(); 918 |     find_non_exact(&a_indel_coords, &b_indel_coords, &mut result, min_base_qual); 919 |     find_non_exact(&b_indel_coords, &a_indel_coords, &mut result, min_base_qual); 920 |     result 921 | } 922 | 923 | /// Return mapped parts of each read that overlap the other. 924 | /// Returns A, B, genome coordinates.
925 | fn overlap_pieces( 926 | a: &CigarStringView, 927 | b: &CigarStringView, 928 | a_qual: &[u8], 929 | b_qual: &[u8], 930 | skip_insertions: bool, 931 | ) -> Vec<[Coordinates; 3]> { 932 | let aend = a.end_pos(); 933 | let bend = b.end_pos(); 934 | //let astart = a.pos() + a.leading_softclips(); 935 | //let bstart = b.pos() + b.leading_softclips(); 936 | if aend <= b.pos() || bend <= a.pos() { 937 | return vec![]; 938 | } 939 | 940 | let mut result: Vec<[Coordinates; 3]> = Vec::new(); 941 | let mut ai: usize = 0; 942 | let mut bi: usize = 0; 943 | let mut a_genome_pos = a.pos(); 944 | let mut b_genome_pos = b.pos(); 945 | let mut a_read_pos = 0i64; 946 | let mut b_read_pos = 0i64; 947 | while ai < a.len() && bi < b.len() { 948 | let a_genome_stop = a_genome_pos + reference(a[ai.min(a.len() - 1)]); 949 | let b_genome_stop = b_genome_pos + reference(b[bi.min(b.len() - 1)]); 950 | if a_genome_stop < b_genome_pos { 951 | if ai < a.len() { 952 | a_genome_pos += reference(a[ai]); 953 | a_read_pos += query(a[ai]); 954 | ai += 1; 955 | } 956 | } else if b_genome_stop < a_genome_pos { 957 | if bi < b.len() { 958 | b_genome_pos += reference(b[bi]); 959 | b_read_pos += query(b[bi]); 960 | bi += 1; 961 | } 962 | } else { 963 | // we have some overlap. 964 | // if they both consume query, we can append to our result. 965 | let aop = a[ai.min(a.len() - 1)]; 966 | let bop = b[bi.min(b.len() - 1)]; 967 | if query(aop) > 0 && query(bop) > 0 { 968 | let genome_start = a_genome_pos.max(b_genome_pos); 969 | let genome_stop = a_genome_stop.min(b_genome_stop); 970 | 971 | let mut glen = genome_stop - genome_start; 972 | if glen == 0 && !skip_insertions { 973 | // if they are both the same insertion, then we will evaluate. 974 | // otherwise, we can not. 975 | if aop == bop && is_insertion(aop) { 976 | glen = aop.len() as i64; 977 | } 978 | } 979 | 980 | // if glen is 0, we didn't consume any reference, but can have, e.g. both deletions. 
981 | if glen > 0 { 982 | //let a_over = aop.len() as i64 - (genome_start - a_genome_pos); 983 | //let b_over = bop.len() as i64 - (genome_start - b_genome_pos); 984 | 985 | // glen can be 0 if, e.g. both reads end with soft-clip. 986 | let a_over = genome_start - a_genome_pos; 987 | let b_over = genome_start - b_genome_pos; 988 | 989 | result.push([ 990 | Coordinates { 991 | start: (a_read_pos + a_over) as u32, 992 | stop: (a_read_pos + a_over + glen) as u32, 993 | indel_type: match aop { 994 | Cigar::Ins(l) => IndelType::Insertion(l as u32), 995 | Cigar::Del(l) => IndelType::Deletion(l as u32), 996 | _ => IndelType::NotIndel, 997 | }, 998 | qual: a_qual[a_read_pos as usize + a_over as usize], 999 | }, 1000 | Coordinates { 1001 | start: (b_read_pos + b_over) as u32, 1002 | stop: (b_read_pos + b_over + glen) as u32, 1003 | indel_type: match bop { 1004 | Cigar::Ins(l) => IndelType::Insertion(l as u32), 1005 | Cigar::Del(l) => IndelType::Deletion(l as u32), 1006 | _ => IndelType::NotIndel, 1007 | }, 1008 | qual: b_qual[b_read_pos as usize + b_over as usize], 1009 | }, 1010 | Coordinates { 1011 | start: genome_start as u32, 1012 | stop: genome_stop as u32, 1013 | indel_type: match aop { 1014 | Cigar::Ins(l) => IndelType::Insertion(l as u32), 1015 | Cigar::Del(l) => IndelType::Deletion(l as u32), 1016 | _ => IndelType::NotIndel, 1017 | }, 1018 | qual: a_qual[a_read_pos as usize + a_over as usize], 1019 | }, 1020 | ]) 1021 | } 1022 | } 1023 | // we had some overlap. now we increment the lowest genome pos by end. 
1024 | if a_genome_stop <= b_genome_stop && ai < a.len() { 1025 | a_genome_pos += reference(a[ai]); 1026 | a_read_pos += query(a[ai]); 1027 | ai += 1; 1028 | } 1029 | if b_genome_stop <= a_genome_stop && bi < b.len() { 1030 | b_genome_pos += reference(b[bi]); 1031 | b_read_pos += query(b[bi]); 1032 | bi += 1; 1033 | } 1034 | } 1035 | } 1036 | 1037 | result 1038 | } 1039 | 1040 | #[cfg(test)] 1041 | mod tests { 1042 | use super::*; 1043 | use rust_htslib::bam::record::{Cigar, CigarString}; 1044 | 1045 | #[test] 1046 | fn test_different_alignments() { 1047 | let a = CigarString(vec![Cigar::Match(5), Cigar::Ins(3), Cigar::Match(5)]).into_view(0); 1048 | let b = CigarString(vec![Cigar::Match(13)]).into_view(0); 1049 | let a_bqs = vec![30u8; 20]; 1050 | let b_bqs = vec![30u8; 20]; 1051 | let r = overlap_pieces(&a, &b, &a_bqs, &b_bqs, true); 1052 | dbg!(&r); 1053 | } 1054 | 1055 | #[test] 1056 | fn test_same_insertion() { 1057 | let a = CigarString(vec![Cigar::Match(10), Cigar::Ins(8), Cigar::Match(10)]).into_view(5); 1058 | let b = CigarString(vec![Cigar::Match(10), Cigar::Ins(8), Cigar::Match(10)]).into_view(5); 1059 | let a_bqs = vec![30u8; 20]; 1060 | let b_bqs = vec![30u8; 20]; 1061 | let r = overlap_pieces(&a, &b, &a_bqs, &b_bqs, false); 1062 | dbg!(&r); 1063 | } 1064 | 1065 | #[test] 1066 | fn test_many_contained() { 1067 | let a = CigarString(vec![Cigar::Match(100)]).into_view(10); 1068 | let b = CigarString(vec![ 1069 | Cigar::Match(10), 1070 | Cigar::Match(11), 1071 | Cigar::Match(12), 1072 | Cigar::Match(13), 1073 | ]) 1074 | .into_view(0); 1075 | let a_bqs = vec![30u8; 200]; 1076 | let b_bqs = vec![30u8; 200]; 1077 | let r = overlap_pieces(&a, &b, &a_bqs, &b_bqs, true); 1078 | let expected = [ 1079 | [ 1080 | Coordinates { 1081 | start: 0, 1082 | stop: 11, 1083 | indel_type: IndelType::NotIndel, 1084 | qual: 30, 1085 | }, 1086 | Coordinates { 1087 | start: 10, 1088 | stop: 21, 1089 | indel_type: IndelType::NotIndel, 1090 | qual: 30, 1091 | }, 1092 | 
Coordinates { 1093 | start: 10, 1094 | stop: 21, 1095 | indel_type: IndelType::NotIndel, 1096 | qual: 30, 1097 | }, 1098 | ], 1099 | [ 1100 | Coordinates { 1101 | start: 11, 1102 | stop: 23, 1103 | indel_type: IndelType::NotIndel, 1104 | qual: 30, 1105 | }, 1106 | Coordinates { 1107 | start: 21, 1108 | stop: 33, 1109 | indel_type: IndelType::NotIndel, 1110 | qual: 30, 1111 | }, 1112 | Coordinates { 1113 | start: 21, 1114 | stop: 33, 1115 | indel_type: IndelType::NotIndel, 1116 | qual: 30, 1117 | }, 1118 | ], 1119 | [ 1120 | Coordinates { 1121 | start: 23, 1122 | stop: 36, 1123 | indel_type: IndelType::NotIndel, 1124 | qual: 30, 1125 | }, 1126 | Coordinates { 1127 | start: 33, 1128 | stop: 46, 1129 | indel_type: IndelType::NotIndel, 1130 | qual: 30, 1131 | }, 1132 | Coordinates { 1133 | start: 33, 1134 | stop: 46, 1135 | indel_type: IndelType::NotIndel, 1136 | qual: 30, 1137 | }, 1138 | ], 1139 | ]; 1140 | assert_eq!(r, expected); 1141 | } 1142 | 1143 | #[test] 1144 | fn test_simple_overlap() { 1145 | let a = CigarString(vec![ 1146 | Cigar::Match(10), 1147 | Cigar::Match(80), 1148 | Cigar::SoftClip(10), 1149 | ]) 1150 | .into_view(8); 1151 | let b = CigarString(vec![ 1152 | Cigar::Match(70), 1153 | Cigar::Match(40), 1154 | Cigar::SoftClip(10), 1155 | ]) 1156 | .into_view(5); 1157 | 1158 | let a_bqs = vec![30u8; 100]; 1159 | let b_bqs = vec![30u8; 100]; 1160 | let r = overlap_pieces(&a, &b, &a_bqs, &b_bqs, true); 1161 | 1162 | let expected = [ 1163 | [ 1164 | Coordinates { 1165 | start: 0, 1166 | stop: 10, 1167 | indel_type: IndelType::NotIndel, 1168 | qual: 30, 1169 | }, 1170 | Coordinates { 1171 | start: 3, 1172 | stop: 13, 1173 | indel_type: IndelType::NotIndel, 1174 | qual: 30, 1175 | }, 1176 | Coordinates { 1177 | start: 8, 1178 | stop: 18, 1179 | indel_type: IndelType::NotIndel, 1180 | qual: 30, 1181 | }, 1182 | ], 1183 | [ 1184 | Coordinates { 1185 | start: 10, 1186 | stop: 67, 1187 | indel_type: IndelType::NotIndel, 1188 | qual: 30, 1189 | }, 1190 | 
Coordinates { 1191 | start: 13, 1192 | stop: 70, 1193 | indel_type: IndelType::NotIndel, 1194 | qual: 30, 1195 | }, 1196 | Coordinates { 1197 | start: 18, 1198 | stop: 75, 1199 | indel_type: IndelType::NotIndel, 1200 | qual: 30, 1201 | }, 1202 | ], 1203 | [ 1204 | Coordinates { 1205 | start: 67, 1206 | stop: 90, 1207 | indel_type: IndelType::NotIndel, 1208 | qual: 30, 1209 | }, 1210 | Coordinates { 1211 | start: 70, 1212 | stop: 93, 1213 | indel_type: IndelType::NotIndel, 1214 | qual: 30, 1215 | }, 1216 | Coordinates { 1217 | start: 75, 1218 | stop: 98, 1219 | indel_type: IndelType::NotIndel, 1220 | qual: 30, 1221 | }, 1222 | ], 1223 | ]; 1224 | 1225 | assert_eq!(r, expected); 1226 | } 1227 | 1228 | #[test] 1229 | fn test_size() { 1230 | assert_eq!(std::mem::size_of::(), 8); 1231 | } 1232 | 1233 | #[test] 1234 | fn test_min_abs_hp_distance() { 1235 | // Both distances available - choose minimum absolute value 1236 | assert_eq!(min_abs_hp_distance(Some(-1), Some(2)), Some(-1)); 1237 | assert_eq!(min_abs_hp_distance(Some(2), Some(-1)), Some(-1)); 1238 | assert_eq!(min_abs_hp_distance(Some(-3), Some(-2)), Some(-2)); 1239 | assert_eq!(min_abs_hp_distance(Some(3), Some(2)), Some(2)); 1240 | 1241 | // Equal absolute values - choose first one 1242 | assert_eq!(min_abs_hp_distance(Some(-2), Some(2)), Some(-2)); 1243 | assert_eq!(min_abs_hp_distance(Some(2), Some(-2)), Some(2)); 1244 | 1245 | // Only one distance available 1246 | assert_eq!(min_abs_hp_distance(Some(5), None), Some(5)); 1247 | assert_eq!(min_abs_hp_distance(None, Some(-3)), Some(-3)); 1248 | 1249 | // No distances available 1250 | assert_eq!(min_abs_hp_distance(None, None), None); 1251 | } 1252 | 1253 | #[test] 1254 | fn test_indel_error_pieces() { 1255 | let cigar_a = 1256 | CigarString(vec![Cigar::Match(10), Cigar::Del(2), Cigar::Match(5)]).into_view(5); 1257 | let cigar_b = 1258 | CigarString(vec![Cigar::Match(10), Cigar::Del(3), Cigar::Match(4)]).into_view(10); 1259 | let expected = vec![ 1260 | 
Coordinates { 1261 | start: 15, 1262 | stop: 17, 1263 | indel_type: IndelType::Deletion(2), 1264 | qual: 30, 1265 | }, 1266 | Coordinates { 1267 | start: 20, 1268 | stop: 23, 1269 | indel_type: IndelType::Deletion(3), 1270 | qual: 30, 1271 | }, 1272 | ]; 1273 | let a_bqs = vec![30u8; 20]; 1274 | let b_bqs = vec![30u8; 20]; 1275 | assert_eq!( 1276 | indel_error_pieces(&cigar_a, &cigar_b, &a_bqs, &b_bqs, 15), 1277 | expected 1278 | ); 1279 | } 1280 | 1281 | #[test] 1282 | fn test_indel_error_pieces_overlap() { 1283 | let cigar_a = 1284 | CigarString(vec![Cigar::Match(10), Cigar::Del(3), Cigar::Match(5)]).into_view(10); 1285 | let cigar_b = 1286 | CigarString(vec![Cigar::Match(10), Cigar::Ins(3), Cigar::Match(5)]).into_view(10); 1287 | let expected = vec![ 1288 | Coordinates { 1289 | start: 20, 1290 | stop: 23, 1291 | indel_type: IndelType::Deletion(3), 1292 | qual: 30, 1293 | }, 1294 | Coordinates { 1295 | start: 20, 1296 | stop: 21, 1297 | indel_type: IndelType::Insertion(3), 1298 | qual: 30, 1299 | }, 1300 | ]; 1301 | let a_bqs = vec![30u8; 20]; 1302 | let b_bqs = vec![30u8; 20]; 1303 | assert_eq!( 1304 | indel_error_pieces(&cigar_a, &cigar_b, &a_bqs, &b_bqs, 10), 1305 | expected 1306 | ); 1307 | } 1308 | 1309 | #[test] 1310 | fn test_indel_error_quals() { 1311 | let cigar_a = 1312 | CigarString(vec![Cigar::Match(10), Cigar::Del(3), Cigar::Match(5)]).into_view(10); 1313 | let cigar_b = 1314 | CigarString(vec![Cigar::Match(10), Cigar::Ins(3), Cigar::Match(5)]).into_view(10); 1315 | let expected = vec![]; 1316 | let a_bqs = vec![10u8; 20]; 1317 | let b_bqs = vec![10u8; 20]; 1318 | assert_eq!( 1319 | indel_error_pieces(&cigar_a, &cigar_b, &a_bqs, &b_bqs, 60), 1320 | expected 1321 | ); 1322 | } 1323 | } 1324 | -------------------------------------------------------------------------------- /src/homopolymer.rs: -------------------------------------------------------------------------------- 1 | pub(crate) const HP_REGEX: &str = "A{3,}|C{3,}|G{3,}|T{3,}"; 2 | use 
regex::Regex; 3 | use rust_lapper::{Interval, Lapper}; 4 | 5 | /// Find the homopolymers and return an interval tree with the positions of the homopolymers. 6 | pub(crate) fn find_homopolymers(seq: &[u8], re: &Regex) -> Lapper<u32, u32> { 7 |     let seq_str = unsafe { std::str::from_utf8_unchecked(seq) }; 8 |     let matches = re.find_iter(seq_str); 9 |     let intervals: Vec<Interval<u32, u32>> = matches 10 |         .map(|m| Interval { 11 |             start: m.range().start as u32, 12 |             stop: m.range().end as u32, 13 |             val: 0, 14 |         }) 15 |         .collect(); 16 |     log::info!("found {} homopolymers with regex: {re}", intervals.len()); 17 |     Lapper::new(intervals) 18 | } 19 | 20 | /// return a negative number if the hp is before the position, accounting for strand. 21 | /// and 0 if the hp contains the position, otherwise a positive number. 22 | /// Returns None if the distance is greater than MAX_HP_DIST 23 | /// hphphp---pos----> 24 | /// 25 | pub(crate) fn hp_distance( 26 |     hps: Option<&[&Interval<u32, u32>]>, 27 |     pos: u32, 28 |     read_start: u32, 29 |     read_stop: u32, 30 |     _strand: i8, 31 | ) -> Option<i16> { 32 |     let mut dist: Option<i16> = None; 33 |     if pos < read_start || pos > read_stop { 34 |         return dist; 35 |     } 36 | 37 |     // strand will be 1 for forward, -1 for reverse 38 |     for hp in hps.map(|hps| hps.iter()).unwrap_or_default() { 39 |         // first we check if the hp is within 3 bases of the read start or stop. 40 |         // since this could truncate the hp and not affect the read.
41 | // cases to exclude: 42 | // read: -----------> 43 | // hp: AAAAA 44 | // pos: 45 | 46 | if hp.stop >= read_start && hp.stop < read_start + 3 { 47 | continue; 48 | } 49 | if hp.start <= read_stop && hp.start > read_stop - 3 { 50 | continue; 51 | } 52 | 53 | assert!( 54 | pos >= read_start && pos <= read_stop, 55 | "pos: {}, read_start: {}, read_stop: {}", 56 | pos, 57 | read_start, 58 | read_stop 59 | ); 60 | 61 | let d = if pos < hp.start { 62 | hp.start as i64 - pos as i64 63 | } else if pos > hp.stop { 64 | -(pos as i64 - hp.stop as i64) 65 | } else { 66 | 0i64 67 | }; 68 | // now we check distance of pos to hp. 69 | if d < -crate::fraguracy::MAX_HP_DIST as i64 || d > crate::fraguracy::MAX_HP_DIST as i64 { 70 | continue; 71 | } 72 | 73 | let d = d as i16; 74 | if dist.is_none() || d.abs() < dist.unwrap().abs() { 75 | dist = Some(d); 76 | } 77 | } 78 | dist 79 | } 80 | 81 | #[cfg(test)] 82 | mod tests { 83 | use super::*; 84 | 85 | #[test] 86 | fn test_find_homopolymers() { 87 | // Test sequence with various homopolymers 88 | let seq = b"AAATCCCGAAAGGGGTTTT"; 89 | let re = Regex::new(HP_REGEX).expect("invalid regex"); 90 | let homopolymers = find_homopolymers(seq, &re); 91 | 92 | // Convert results to vec for easier testing 93 | let results: Vec<_> = homopolymers.iter().collect(); 94 | 95 | // Expected homopolymers: AAA, CCC, AAAA, GGGG, TTTT 96 | assert_eq!(results.len(), 5); 97 | 98 | // Check each homopolymer position 99 | assert_eq!(results[0].start, 0); // AAA 100 | assert_eq!(results[0].stop, 3); 101 | 102 | assert_eq!(results[1].start, 4); // CCC 103 | assert_eq!(results[1].stop, 7); 104 | 105 | assert_eq!(results[2].start, 8); // AAAA 106 | assert_eq!(results[2].stop, 11); 107 | 108 | assert_eq!(results[3].start, 11); // GGGG 109 | assert_eq!(results[3].stop, 15); 110 | 111 | assert_eq!(results[4].start, 15); // TTTT 112 | assert_eq!(results[4].stop, 19); 113 | } 114 | 115 | #[test] 116 | fn test_hp_distance() { 117 | let hp = vec![Interval { 118 | 
start: 9, 119 | stop: 12, 120 | val: 0, 121 | }]; 122 | let hp_refs: Vec<&Interval> = hp.iter().collect(); 123 | 124 | // Test homopolymer near read end 125 | assert_eq!(hp_distance(Some(&hp_refs), 11, 10, 20, 1,), None); 126 | 127 | // Test forward strand 128 | assert_eq!(hp_distance(Some(&hp_refs), 15, 5, 20, 1,), Some(-3)); 129 | 130 | // Test reverse strand 131 | assert_eq!(hp_distance(Some(&hp_refs), 15, 5, 20, -1,), Some(-3)); 132 | 133 | // Test distant homopolymer 134 | assert_eq!( 135 | hp_distance( 136 | Some(&hp_refs), 137 | 115 + crate::fraguracy::MAX_HP_DIST as u32, 138 | 105 + crate::fraguracy::MAX_HP_DIST as u32, 139 | 120 + crate::fraguracy::MAX_HP_DIST as u32, 140 | 1, 141 | ), 142 | None 143 | ); 144 | } 145 | } 146 | -------------------------------------------------------------------------------- /src/lua.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, ops::AddAssign}; 2 | 3 | use anyhow::Result; 4 | use mlua::{prelude::LuaError, Function, Lua, UserData, UserDataFields, UserDataMethods, Value}; 5 | use rust_htslib::bam::{ 6 | record::{Aux, Cigar}, 7 | Record, 8 | }; 9 | 10 | #[derive(Clone)] 11 | pub struct LuaReadFilter { 12 | lua: Lua, 13 | filter_func: Function, 14 | } 15 | 16 | pub struct Flags { 17 | flag: u16, 18 | } 19 | 20 | impl UserData for Flags { 21 | fn add_fields>(fields: &mut M) { 22 | fields.add_field_method_get("paired", |_, this| Ok((this.flag & 0x1) != 0)); 23 | fields.add_field_method_get("proper_pair", |_, this| Ok((this.flag & 0x2) != 0)); 24 | fields.add_field_method_get("unmapped", |_, this| Ok((this.flag & 0x4) != 0)); 25 | fields.add_field_method_get("mate_unmapped", |_, this| Ok((this.flag & 0x8) != 0)); 26 | fields.add_field_method_get("reverse", |_, this| Ok((this.flag & 0x10) != 0)); 27 | fields.add_field_method_get("forward", |_, this| Ok((this.flag & 0x10) == 0)); 28 | fields.add_field_method_get("mate_reverse", |_, this| Ok((this.flag & 0x20) 
!= 0)); 29 |         fields.add_field_method_get("mate_forward", |_, this| Ok((this.flag & 0x20) == 0)); 30 |         fields.add_field_method_get("read_1", |_, this| Ok((this.flag & 0x40) != 0)); 31 |         fields.add_field_method_get("read_2", |_, this| Ok((this.flag & 0x80) != 0)); 32 |         fields.add_field_method_get("secondary", |_, this| Ok((this.flag & 0x100) != 0)); 33 |         fields.add_field_method_get("primary", |_, this| Ok((this.flag & 0x100) == 0)); 34 |         fields.add_field_method_get("qcfail", |_, this| Ok((this.flag & 0x200) != 0)); 35 |         fields.add_field_method_get("duplicate", |_, this| Ok((this.flag & 0x400) != 0)); 36 |         fields.add_field_method_get("supplementary", |_, this| Ok((this.flag & 0x800) != 0)); 37 | 38 |         fields.add_field_method_get("flag", |_, this| Ok(this.flag)); 39 |     } 40 | } 41 | 42 | impl LuaReadFilter { 43 |     pub fn skip_read(&self, read: &Record) -> Result<bool> { 44 |         let r = self.lua.scope(|scope| { 45 |             let globals = self.lua.globals(); 46 |             let user_data = scope.create_any_userdata_ref(read)?; 47 |             globals.set("read", user_data).expect("failed to set read"); 48 |             self.filter_func.call::<bool>(()) 49 |         })?; 50 |         Ok(r) 51 |     } 52 | 53 |     pub fn new(expression: &str, lua: Lua) -> Result<Self> { 54 |         if !expression.contains("return") { 55 |             return Err(anyhow::anyhow!( 56 |                 "expression must contain a return statement" 57 |             )); 58 |         } 59 |         let filter_func = lua.load(expression).into_function()?; 60 | 61 |         lua.register_userdata_type::<Record>(|reg| { 62 |             reg.add_field_method_get("mapping_quality", |_, this| Ok(this.mapq())); 63 |             reg.add_field_method_get("flags", |_, this| Ok(Flags { flag: this.flags() })); 64 |             reg.add_field_method_get("tid", |_, this| Ok(this.tid())); 65 |             reg.add_field_method_get("start", |_, this| Ok(this.pos())); 66 |             reg.add_field_method_get("stop", |_, this| Ok(this.cigar().end_pos())); 67 |             reg.add_field_method_get("length", |_, this| Ok(this.seq_len())); 68 |             reg.add_field_method_get("insert_size", |_, this| Ok(this.insert_size())); 69 |
reg.add_field_method_get("qname", |_, this| { 70 | let q = this.qname(); 71 | Ok(std::str::from_utf8(q).unwrap_or("").to_string()) 72 | }); 73 | reg.add_field_method_get("sequence", |_, this| { 74 | let seq = this.seq(); 75 | Ok(std::str::from_utf8(&seq.as_bytes()) 76 | .unwrap_or("") 77 | .to_string()) 78 | }); 79 | 80 | reg.add_field_method_get("soft_clips_3_prime", |_, this| { 81 | let cigar = this.cigar(); 82 | if this.is_reverse() { 83 | Ok(cigar.leading_softclips()) 84 | } else { 85 | Ok(cigar.trailing_softclips()) 86 | } 87 | }); 88 | reg.add_field_method_get("soft_clips_5_prime", |_, this| { 89 | let cigar = this.cigar(); 90 | if this.is_reverse() { 91 | Ok(cigar.trailing_softclips()) 92 | } else { 93 | Ok(cigar.leading_softclips()) 94 | } 95 | }); 96 | reg.add_field_method_get("average_base_quality", |_, this| { 97 | let qual = this.qual(); 98 | let sum = qual.iter().map(|q| *q as u64).sum::(); 99 | let count = qual.len(); 100 | Ok(sum as f64 / count as f64) 101 | }); 102 | 103 | reg.add_method("tag", |lua, this: &Record, tag: String| { 104 | let tag = tag.as_bytes(); 105 | let aux = this.aux(tag).map_err(LuaError::external)?; 106 | let lua_val: Value = match aux { 107 | Aux::Char(v) => Value::String(lua.create_string(&[v])?), 108 | Aux::I8(v) => Value::Number(v as f64), 109 | Aux::U8(v) => Value::Number(v as f64), 110 | Aux::I16(v) => Value::Number(v as f64), 111 | Aux::U16(v) => Value::Number(v as f64), 112 | Aux::I32(v) => Value::Number(v as f64), 113 | Aux::U32(v) => Value::Number(v as f64), 114 | Aux::Float(v) => Value::Number(v as f64), 115 | Aux::Double(v) => Value::Number(v as f64), 116 | Aux::String(v) => Value::String(lua.create_string(&v)?), 117 | Aux::ArrayFloat(v) => { 118 | let mut arr = Vec::new(); 119 | for i in 0..v.len() { 120 | arr.push(v.get(i).unwrap_or(f32::NAN) as f32); 121 | } 122 | Value::Table(lua.create_sequence_from(arr)?) 
123 | } 124 | Aux::ArrayI32(v) => { 125 | let mut arr = Vec::new(); 126 | for i in 0..v.len() { 127 | arr.push(v.get(i).unwrap_or(i32::MIN) as i32); 128 | } 129 | Value::Table(lua.create_sequence_from(arr)?) 130 | } 131 | Aux::ArrayI8(v) => { 132 | let mut arr = Vec::new(); 133 | for i in 0..v.len() { 134 | arr.push(v.get(i).unwrap_or(i8::MIN) as i8); 135 | } 136 | Value::Table(lua.create_sequence_from(arr)?) 137 | } 138 | Aux::ArrayU8(v) => { 139 | let mut arr = Vec::new(); 140 | for i in 0..v.len() { 141 | arr.push(v.get(i).unwrap_or(u8::MIN) as u8); 142 | } 143 | Value::Table(lua.create_sequence_from(arr)?) 144 | } 145 | Aux::ArrayU16(v) => { 146 | let mut arr = Vec::new(); 147 | for i in 0..v.len() { 148 | arr.push(v.get(i).unwrap_or(u16::MIN) as u16); 149 | } 150 | Value::Table(lua.create_sequence_from(arr)?) 151 | } 152 | Aux::ArrayU32(v) => { 153 | let mut arr = Vec::new(); 154 | for i in 0..v.len() { 155 | arr.push(v.get(i).unwrap_or(u32::MIN) as u32); 156 | } 157 | Value::Table(lua.create_sequence_from(arr)?) 158 | } 159 | Aux::ArrayI16(v) => { 160 | let mut arr = Vec::new(); 161 | for i in 0..v.len() { 162 | arr.push(v.get(i).unwrap_or(i16::MIN) as i16); 163 | } 164 | Value::Table(lua.create_sequence_from(arr)?) 165 | } 166 | Aux::HexByteArray(v) => { 167 | let lstr = String::from_utf8_lossy(v.as_bytes()).to_string(); 168 | Value::String(lua.create_string(&lstr)?) 
169 | } 170 | }; 171 | Ok(Some(lua_val)) 172 | }); 173 | /* 174 | reg.add_field_function_get("bq", |_, this| { 175 | let qpos: usize = match this.named_user_value("qpos") { 176 | Ok(qpos) => qpos, 177 | Err(_) => { 178 | return Ok(-1); 179 | } 180 | }; 181 | let this = this.borrow::()?; 182 | Ok(this.qual()[qpos] as i32) 183 | }); 184 | reg.add_field_function_get("distance_from_5prime", |_, this| { 185 | let qpos: usize = match this.named_user_value("qpos") { 186 | Ok(qpos) => qpos, 187 | Err(_) => { 188 | return Ok(-1); 189 | } 190 | }; 191 | let this = this.borrow::()?; 192 | if this.is_reverse() { 193 | Ok(this.seq_len() as i32 - qpos as i32) 194 | } else { 195 | Ok(qpos as i32) 196 | } 197 | }); 198 | reg.add_field_function_get("distance_from_3prime", |_, this| { 199 | let qpos: usize = match this.named_user_value("qpos") { 200 | Ok(qpos) => qpos, 201 | Err(_) => { 202 | return Ok(usize::MAX); 203 | } 204 | }; 205 | let this = this.borrow::()?; 206 | if this.is_reverse() { 207 | Ok(qpos) 208 | } else { 209 | Ok(this.seq_len() - qpos) 210 | } 211 | }); 212 | */ 213 | // count the number of A, C, G, T, N in the read. 
Always capitalize and return a table 214 | reg.add_field_method_get("base_counts", |_, this| { 215 | let seq = this.seq(); 216 | let mut counts = HashMap::new(); 217 | for i in 0..seq.len() { 218 | let base = seq[i].to_ascii_uppercase(); 219 | counts.entry(base).or_insert(0).add_assign(1); 220 | } 221 | Ok(counts) 222 | }); 223 | reg.add_field_method_get("n_proportion", |_, this| { 224 | let seq = this.seq(); 225 | let mut count = 0; 226 | for i in 0..seq.len() { 227 | let base = seq[i].to_ascii_uppercase(); 228 | if base == b'N' { 229 | count += 1; 230 | } 231 | } 232 | Ok(count as f64 / seq.len() as f64) 233 | }); 234 | 235 | reg.add_method("n_proportion_3_prime", |_, this, n_bases: usize| { 236 | let seq = this.seq(); 237 | let mut count = 0; 238 | let reverse = this.is_reverse(); 239 | for i in 0..n_bases { 240 | let base = 241 | seq[if reverse { i } else { seq.len() - 1 - i }].to_ascii_uppercase(); 242 | if base == b'N' { 243 | count += 1; 244 | } 245 | } 246 | Ok(count as f64 / n_bases as f64) 247 | }); 248 | 249 | reg.add_method("n_proportion_5_prime", |_, this, n_bases: usize| { 250 | let seq = this.seq(); 251 | let mut count = 0; 252 | let reverse = this.is_reverse(); 253 | for i in 0..n_bases { 254 | let base = 255 | seq[if reverse { seq.len() - 1 - i } else { i }].to_ascii_uppercase(); 256 | if base == b'N' { 257 | count += 1; 258 | } 259 | } 260 | Ok(count as f64 / n_bases as f64) 261 | }); 262 | 263 | reg.add_field_method_get("indel_count", |_, this| { 264 | let cigar = this.cigar(); 265 | let mut count = 0; 266 | for op in cigar.iter() { 267 | count += match op { 268 | Cigar::Ins(_len) => 1, 269 | Cigar::Del(_len) => 1, 270 | _ => 0, 271 | } 272 | } 273 | Ok(count) 274 | }); 275 | })?; 276 | 277 | Ok(Self { lua, filter_func }) 278 | } 279 | } 280 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | mod combine_counts; 2 | 
mod combine_errors; 3 | mod homopolymer; 4 | 5 | mod files; 6 | mod fraguracy; 7 | mod lua; 8 | 9 | //mod plot; 10 | #[macro_use] 11 | extern crate lazy_static; 12 | use clap::{Parser, Subcommand}; 13 | use fraguracy::ConfidenceInterval; 14 | use homopolymer::find_homopolymers; 15 | use linear_map::LinearMap; 16 | use regex::Regex; 17 | 18 | use rust_lapper::Lapper; 19 | 20 | use std::io::BufRead; 21 | 22 | use crate::files::Iv; 23 | use std::collections::HashMap; 24 | use std::path::PathBuf; 25 | 26 | use rust_htslib::bam; 27 | use rust_htslib::bam::{IndexedReader, Read, Reader}; 28 | use rust_htslib::faidx; 29 | use rustc_hash::FxHashMap; 30 | 31 | use rayon::prelude::*; 32 | 33 | use crate::fraguracy::Stat; 34 | 35 | use std::env; 36 | use std::str; 37 | 38 | lazy_static! { 39 | static ref EMPTY_LAPPER: Lapper = Lapper::new(Vec::new()); 40 | } 41 | 42 | #[derive(Debug, Parser)] 43 | #[command(name = "fraguracy")] 44 | #[command( 45 | version, 46 | about = "fraguracy: unbiased error profile analysis for short read sequencing", 47 | author = "Brent S Pedersen", 48 | help_template = "{about}\nversion:{version}\n\n{usage-heading} {usage} \n\nOPTIONS:\n{options}\n\n\x1b[1m\x1b[4mCOMMANDS:\x1b[0m\n{subcommands}" 49 | )] 50 | #[command(arg_required_else_help = true)] 51 | struct Cli { 52 | #[command(subcommand)] 53 | command: Commands, 54 | } 55 | 56 | #[derive(Debug, Subcommand)] 57 | enum Commands { 58 | #[command( 59 | arg_required_else_help = true, 60 | about = "combine error bed files from extract. file names are used to determine if they are indel errors or base errors." 
61 | )] 62 | CombineErrors { 63 | #[arg( 64 | short, 65 | long, 66 | required = true, 67 | help = "path for to fai (not fasta) file" 68 | )] 69 | fai_path: PathBuf, 70 | 71 | #[arg(help = "path to error bed files from extract")] 72 | errors: Vec, 73 | 74 | #[arg( 75 | short, 76 | long, 77 | default_value_t = String::from("fraguracy-combined-errors.bed"), 78 | help = "path for output bed file" 79 | )] 80 | output_path: String, 81 | }, 82 | 83 | #[command( 84 | arg_required_else_help = true, 85 | about = "combine counts.txt files from extract" 86 | )] 87 | CombineCounts { 88 | #[arg(help = "path to counts.txt files from extract")] 89 | counts: Vec, 90 | 91 | #[arg( 92 | short, 93 | long, 94 | default_value_t = String::from("fraguracy-combined-counts.txt"), 95 | help = "path for output counts file" 96 | )] 97 | output_path: String, 98 | }, 99 | 100 | #[command( 101 | arg_required_else_help = true, 102 | about = "error profile pair overlaps in bam/cram" 103 | )] 104 | Extract { 105 | #[arg( 106 | short, 107 | long, 108 | help = "fasta for use with crams and/or to use as 'truth'" 109 | )] 110 | fasta: Option, 111 | #[arg(required = true, help = "bam/cram files to analyze")] 112 | bams: Vec, 113 | #[arg( 114 | short, 115 | long, 116 | default_value_t = String::from("fraguracy-"), 117 | help = "prefix for output files" 118 | )] 119 | output_prefix: String, 120 | 121 | #[arg(short = 'C', long, help = "restrict analysis to this chromosome")] 122 | chromosome: Option, 123 | 124 | #[arg( 125 | short, 126 | long, 127 | help = "restrict analysis to the regions given in this BED file" 128 | )] 129 | regions: Option, 130 | 131 | #[arg( 132 | short, 133 | long, 134 | help = "exclude from analysis the regions given in this BED file" 135 | )] 136 | exclude_regions: Option, 137 | 138 | #[arg( 139 | short = 'l', 140 | long, 141 | help = "optional lua expression to filter reads. returns true to skip read. e.g. 'return read.flags.secondary or read.flags.supplementary'." 
142 | )] 143 | lua_expression: Option, 144 | 145 | #[arg( 146 | short, 147 | long, 148 | default_value_t = 151, 149 | help = "indicate the maximum read length in the alignment file" 150 | )] 151 | max_read_length: u32, 152 | #[arg( 153 | short, 154 | long, 155 | default_value_t = 3, 156 | help = "parition the read into chunks/bins of this size" 157 | )] 158 | bin_size: u8, 159 | #[arg( 160 | short = 'Q', 161 | long, 162 | default_value_t = 50, 163 | help = "only consider pairs where both reads have this mapping-quality or higher (good to leave this high)" 164 | )] 165 | min_mapping_quality: u8, 166 | 167 | #[arg( 168 | short, 169 | long = "ci", 170 | help = "method for confidence interval calculation (see rust bpci crate)", 171 | default_value = "agresti-coull" 172 | )] 173 | ci: ConfidenceInterval, 174 | 175 | #[arg( 176 | short, 177 | long, 178 | help = "do not calculate denominator. This can shorten runtime but will also skip the homopolymer distance calculation.", 179 | default_value_t = false 180 | )] 181 | no_denominator: bool, 182 | 183 | #[arg( 184 | short = 'H', 185 | long, 186 | help = format!( 187 | "regex for homopolymer sequence to consider if denominator is calculated[default: {}]", 188 | homopolymer::HP_REGEX 189 | ), 190 | default_value = homopolymer::HP_REGEX 191 | )] 192 | homopolymer_regex: String, 193 | 194 | #[arg( 195 | short = 't', 196 | long, 197 | help = "use reference base as 'truth'", 198 | default_value_t = false 199 | )] 200 | reference_as_truth: bool, 201 | }, 202 | //Plot { tsv: PathBuf, }, 203 | } 204 | 205 | fn get_sample_name(hmap: HashMap>>) -> String { 206 | if let Some(lm) = hmap.get("RG") { 207 | let sm = String::from("SM"); 208 | if let Some(v) = lm[0].get(&sm) { 209 | (*v).clone() 210 | } else { 211 | String::from("") 212 | } 213 | } else { 214 | String::from("") 215 | } 216 | } 217 | 218 | fn main() -> std::io::Result<()> { 219 | let args = Cli::parse(); 220 | if env::var("RUST_LOG").is_err() { 221 | env::set_var("RUST_LOG", 
"info") 222 | } 223 | env_logger::init(); 224 | 225 | match args.command { 226 | Commands::Extract { 227 | bams, 228 | fasta, 229 | chromosome, 230 | output_prefix, 231 | regions, 232 | exclude_regions, 233 | lua_expression, 234 | bin_size, 235 | max_read_length, 236 | min_mapping_quality, 237 | ci, 238 | reference_as_truth, 239 | no_denominator, 240 | homopolymer_regex, 241 | } => extract_main( 242 | bams, 243 | fasta, 244 | chromosome, 245 | PathBuf::from(output_prefix), 246 | regions, 247 | exclude_regions, 248 | lua_expression, 249 | bin_size as u32, 250 | max_read_length, 251 | min_mapping_quality, 252 | ci, 253 | reference_as_truth, 254 | no_denominator, 255 | homopolymer_regex, 256 | ), //Commands::Plot { tsv } => plot::plot(tsv), 257 | Commands::CombineErrors { 258 | fai_path, 259 | errors, 260 | output_path, 261 | } => combine_errors::combine_errors_main(errors, fai_path, output_path), 262 | 263 | Commands::CombineCounts { 264 | counts, 265 | output_path, 266 | } => combine_counts::combine_counts_main(counts, output_path), 267 | } 268 | } 269 | 270 | fn read_bed(path: Option) -> Option>> { 271 | path.as_ref()?; 272 | 273 | let reader = files::open_file(path); 274 | reader.as_ref()?; 275 | let mut bed = HashMap::new(); 276 | 277 | reader 278 | .expect("checked that reader is available") 279 | .lines() 280 | .for_each(|l| { 281 | let line = l.expect("error reading line"); 282 | let fields: Vec<_> = line.split('\t').collect(); 283 | if let (Ok(start), Ok(stop)) = (fields[1].parse::(), fields[2].parse::()) { 284 | let iv = Iv { 285 | start, 286 | stop, 287 | val: 0, 288 | }; 289 | let chrom = String::from(fields[0]); 290 | bed.entry(chrom).or_insert(Vec::new()).push(iv); 291 | } 292 | }); 293 | 294 | let mut tree: HashMap> = HashMap::new(); 295 | 296 | for (chrom, ivs) in bed.iter() { 297 | let ivs = ivs.clone(); 298 | let chrom = (*chrom).clone(); 299 | tree.insert(chrom, Lapper::new(ivs)); 300 | } 301 | Some(tree) 302 | } 303 | 304 | fn get_tree<'a>( 305 | 
regions: &'a Option>>, 306 | chrom: &String, 307 | ) -> Option<&'a Lapper> { 308 | if let Some(map) = regions { 309 | Some(map.get(chrom).unwrap_or(&EMPTY_LAPPER)) 310 | } else { 311 | None 312 | } 313 | } 314 | 315 | #[allow(clippy::too_many_arguments)] 316 | fn process_bam( 317 | path: PathBuf, 318 | fasta_path: Option, 319 | regions: Option, 320 | chromosome: Option, 321 | exclude_regions: Option, 322 | lua_expression: Option, 323 | bin_size: u32, 324 | max_read_length: u32, 325 | min_mapping_quality: u8, 326 | min_base_qual: u8, 327 | reference_as_truth: bool, 328 | output_prefix: PathBuf, 329 | no_denominator: bool, 330 | homopolymer_regex: Option, 331 | ) -> (fraguracy::InnerCounts, Vec, String) { 332 | let mut bam = IndexedReader::from_path(&path) 333 | .unwrap_or_else(|_| panic!("error reading bam file {path:?}")); 334 | bam.set_threads(1).expect("error setting threads"); 335 | let mut map = FxHashMap::default(); 336 | 337 | let include_regions = read_bed(regions); 338 | let exclude_regions = read_bed(exclude_regions); 339 | 340 | let mut ibam = IndexedReader::from_path(&path) 341 | .unwrap_or_else(|_| panic!("bam file {path:?} must be sorted and indexed")); 342 | ibam.set_threads(3) 343 | .expect("error setting threads on indexed reader"); 344 | 345 | let fasta: Option = if let Some(fa_path) = fasta_path { 346 | bam.set_reference(&fa_path) 347 | .expect("Error setting reference for file"); 348 | ibam.set_reference(&fa_path) 349 | .expect("Error setting reference for file"); 350 | 351 | let fa = faidx::Reader::from_path(fa_path).expect("error opening faidx"); 352 | Some(fa) 353 | } else { 354 | None 355 | }; 356 | 357 | let bins = (max_read_length as f64 / bin_size as f64).ceil() as u32; 358 | let mut counts = fraguracy::Counts::new( 359 | if reference_as_truth { None } else { Some(ibam) }, 360 | bins as usize, 361 | ); 362 | let hmap = bam::Header::from_template(bam.header()).to_hashmap(); 363 | let sample_name = get_sample_name(hmap); 364 | 
log::info!("found sample {sample_name}"); 365 | 366 | if !no_denominator { 367 | counts 368 | .set_depth_writer( 369 | &(output_prefix.to_string_lossy().to_string() 370 | + &sample_name 371 | + "-fraguracy-denominator-depth.bed.gz") 372 | .to_string(), 373 | ) 374 | .expect("error setting depth writer. check permissions/existence of output directory."); 375 | } 376 | 377 | if let Some(chromosome) = chromosome { 378 | if let Err(e) = bam.fetch(bam::FetchDefinition::String(chromosome.as_bytes())) { 379 | log::error!("error fetching chromosome {chromosome}: {e}. iterating over all reads."); 380 | } else { 381 | log::info!("limiting analysis to chromosome: \"{chromosome}"); 382 | } 383 | } else if let Err(e) = bam.fetch(bam::FetchDefinition::All) { 384 | log::error!("error fetching all reads: {e}"); 385 | } 386 | 387 | let mut n_total = 0; 388 | let mut n_pairs = 0; 389 | let chroms: Vec = bam 390 | .header() 391 | .target_names() 392 | .iter() 393 | .map(|n| unsafe { str::from_utf8_unchecked(n) }.to_string()) 394 | .collect(); 395 | 396 | let mut include_tree: Option<&Lapper> = get_tree(&include_regions, &chroms[0]); 397 | let mut exclude_tree: Option<&Lapper> = get_tree(&exclude_regions, &chroms[0]); 398 | let mut hp_tree: Option> = None; 399 | 400 | let mut last_tid: i32 = -1; 401 | bam.rc_records() 402 | .map(|r| { 403 | n_total += 1; 404 | r.expect("error parsing read") 405 | }) 406 | .filter(fraguracy::filter_read) 407 | .for_each(|b| { 408 | let name = unsafe { str::from_utf8_unchecked(b.qname()) }.to_string(); 409 | if b.is_first_in_template() { 410 | n_pairs += 1; 411 | } 412 | if b.tid() != last_tid { 413 | if last_tid != -1 { 414 | log::info!( 415 | "processed chromosome: {} unprocessed orphan pairs: {}", 416 | chroms[last_tid as usize], 417 | map.len() 418 | ); 419 | } 420 | last_tid = b.tid(); 421 | 422 | // process the remaining entries in the hashmap in last_depth. 
423 | counts.handle_depth(&chroms[last_tid as usize], i64::MAX); 424 | 425 | if include_regions.is_some() { 426 | include_tree = get_tree(&include_regions, &chroms[last_tid as usize]); 427 | } 428 | if exclude_regions.is_some() { 429 | exclude_tree = get_tree(&exclude_regions, &chroms[last_tid as usize]); 430 | } 431 | 432 | if let Some(ref re) = homopolymer_regex { 433 | let chrom_seq = fasta 434 | .as_ref() 435 | .unwrap() 436 | .fetch_seq(&chroms[last_tid as usize], 0, i64::MAX as usize) 437 | .expect("error fetching sequence from fasta."); 438 | hp_tree = Some(find_homopolymers(&chrom_seq, re)); 439 | } 440 | } 441 | 442 | // by not checking the order here, we allow bams sorted by read name (with position flipped) 443 | // this gives about 5% performance penalty over checking b.pos() < b.mpos(), but allows us 444 | // to support more files. 445 | match map.entry(name) { 446 | std::collections::hash_map::Entry::Vacant(e) => { 447 | e.insert(b); 448 | } 449 | std::collections::hash_map::Entry::Occupied(e) => { 450 | let a = e.remove(); 451 | 452 | if a.mapq() < min_mapping_quality { 453 | return; 454 | } 455 | if b.mapq() < min_mapping_quality { 456 | return; 457 | } 458 | if let Some(ref lua_expression) = lua_expression { 459 | match lua_expression.skip_read(&a) { 460 | Ok(true) => return, 461 | Ok(false) => (), 462 | Err(e) => log::error!("error evaluating user expression for read: {e}"), 463 | } 464 | match lua_expression.skip_read(&b) { 465 | Ok(true) => return, 466 | Ok(false) => (), 467 | Err(e) => log::error!("error evaluating user expression for read: {e}"), 468 | } 469 | } 470 | // we know a is before b, but we don't know if they overlap. 
471 | if a.cigar().end_pos() < b.pos() { 472 | return; 473 | } 474 | let tid = a.tid() as usize; 475 | counts.increment( 476 | a, 477 | b, 478 | min_base_qual, 479 | min_mapping_quality, 480 | bin_size, 481 | &fasta, 482 | &chroms[tid], 483 | &include_tree, 484 | &exclude_tree, 485 | &hp_tree, 486 | ); 487 | } 488 | } 489 | }); 490 | log::info!( 491 | "[FINAL] map len:{:?} total reads: {:?}, pairs: {} \ 492 | \n mismatches: {} matches: {}", 493 | map.len(), 494 | n_total, 495 | n_pairs, 496 | counts.counts.mismatches, 497 | counts.counts.matches, 498 | ); 499 | 500 | (counts.counts, chroms, sample_name) 501 | } 502 | 503 | #[allow(clippy::too_many_arguments)] 504 | fn extract_main( 505 | paths: Vec, 506 | fasta_path: Option, 507 | chromosome: Option, 508 | output_prefix: PathBuf, 509 | regions: Option, 510 | exclude_regions: Option, 511 | lua_expression: Option, 512 | bin_size: u32, 513 | max_read_length: u32, 514 | min_mapping_quality: u8, 515 | ci: ConfidenceInterval, 516 | reference_as_truth: bool, 517 | no_denominator: bool, 518 | homopolymer_regex: String, 519 | ) -> std::io::Result<()> { 520 | //let args: Vec = env::args().collect(); 521 | let min_base_qual = 5u8; 522 | 523 | let mut homopolymer_regex = 524 | Some(Regex::new(&homopolymer_regex).expect("error compiling homopolymer regex")); 525 | 526 | if no_denominator { 527 | homopolymer_regex = None; 528 | } else if fasta_path.is_none() { 529 | return Err(std::io::Error::new( 530 | std::io::ErrorKind::InvalidInput, 531 | "fasta path must be provided if denominator is calculated", 532 | )); 533 | } 534 | 535 | let lua_expression = lua_expression.map(|e| { 536 | lua::LuaReadFilter::new(&e, mlua::Lua::new()).expect("error creating lua interpreter") 537 | }); 538 | 539 | let total_counts = paths 540 | .par_iter() 541 | .map(|path| { 542 | let (c, chroms, sample_name) = process_bam( 543 | path.clone(), 544 | fasta_path.clone(), 545 | regions.clone(), 546 | chromosome.clone(), 547 | exclude_regions.clone(), 548 | 
lua_expression.clone(), 549 | bin_size, 550 | max_read_length, 551 | min_mapping_quality, 552 | min_base_qual, 553 | reference_as_truth, 554 | output_prefix.clone(), 555 | no_denominator, 556 | homopolymer_regex.clone(), 557 | ); 558 | let output_prefix: PathBuf = 559 | (output_prefix.to_string_lossy().to_string() + &sample_name + "-").into(); 560 | 561 | let stats = Stat::from_counts(&c, bin_size as usize, ci.clone()); 562 | files::write_stats(stats, output_prefix.clone()); 563 | files::write_errors(&c, output_prefix, chroms); 564 | 565 | c 566 | }) 567 | .reduce_with(|mut a, b| { 568 | a += b; 569 | a 570 | }); 571 | 572 | let total_counts = total_counts.expect("error accumulating total counts"); 573 | if paths.len() > 1 { 574 | let bam = Reader::from_path(&paths[0]).expect("error reading bam file {path}"); 575 | let chroms: Vec = bam 576 | .header() 577 | .target_names() 578 | .iter() 579 | .map(|n| unsafe { str::from_utf8_unchecked(n) }.to_string()) 580 | .collect(); 581 | let output_prefix: PathBuf = 582 | (output_prefix.to_string_lossy().to_string() + "total-").into(); 583 | 584 | let stats = Stat::from_counts(&total_counts, bin_size as usize, ci); 585 | files::write_stats(stats, output_prefix.clone()); 586 | files::write_errors(&total_counts, output_prefix, chroms); 587 | } 588 | Ok(()) 589 | } 590 | 591 | #[cfg(test)] 592 | mod tests { 593 | use super::*; 594 | use std::collections::HashMap; 595 | // Assuming Iv is defined in crate::files and is accessible. 
596 | use crate::files::Iv; 597 | 598 | #[test] 599 | fn test_get_tree_found() { 600 | // Create a non-empty lapper for "chr1" 601 | let iv = Iv { 602 | start: 10, 603 | stop: 20, 604 | val: 0, 605 | }; 606 | let lapper_non_empty = Lapper::new(vec![iv]); 607 | let mut regions_map: HashMap> = HashMap::new(); 608 | regions_map.insert("chr1".to_string(), lapper_non_empty); 609 | let regions = Some(regions_map); 610 | 611 | let tree = get_tree(®ions, &"chr1".to_string()).unwrap(); 612 | // Query a point that should overlap the interval [10,20] 613 | let mut iter = tree.find(15, 16); 614 | assert!( 615 | iter.next().is_some(), 616 | "Expected to find an interval for chr1" 617 | ); 618 | } 619 | 620 | #[test] 621 | fn test_get_tree_not_found_in_map() { 622 | // Create a regions map with a lapper only for "chr1" 623 | let iv = Iv { 624 | start: 10, 625 | stop: 20, 626 | val: 0, 627 | }; 628 | let lapper_non_empty = Lapper::new(vec![iv]); 629 | let mut regions_map: HashMap> = HashMap::new(); 630 | regions_map.insert("chr1".to_string(), lapper_non_empty); 631 | let regions = Some(regions_map); 632 | 633 | // Looking up "chr2" should yield the empty lapper. 634 | let tree = get_tree(®ions, &"chr2".to_string()).unwrap(); 635 | assert!( 636 | tree.find(0, 100).next().is_none(), 637 | "Expected empty lapper for non-existent chromosome" 638 | ); 639 | } 640 | 641 | #[test] 642 | fn test_get_tree_no_regions() { 643 | // When regions is None, we expect the function to return None. 
644 | let regions: Option>> = None; 645 | let tree = get_tree(®ions, &"any".to_string()); 646 | assert!(tree.is_none(), "Expected None when regions is None"); 647 | } 648 | } 649 | -------------------------------------------------------------------------------- /src/plot.rs: -------------------------------------------------------------------------------- 1 | use plotly::layout::{Axis, GridPattern, Layout, LayoutGrid, Legend, RowOrder}; 2 | use plotly::{Plot, Scatter}; 3 | use polars::prelude::*; 4 | 5 | use std::path::PathBuf; 6 | 7 | pub fn plot(f: PathBuf) { 8 | let df = CsvReader::from_path(f) 9 | .expect("error reading csv") 10 | .has_header(true) 11 | .with_delimiter('\t' as u8) 12 | .finish() 13 | .unwrap(); //.finish().unwrap(); 14 | 15 | let layout = Layout::new().grid(LayoutGrid::new().rows(1).columns(2)); 16 | let mut plot = Plot::new(); 17 | plot.set_layout(layout); 18 | 19 | let contexts: Vec = df["context"] 20 | .unique() 21 | .expect("error getting unique contexts") 22 | .iter() 23 | .map(|s| std::string::String::from(s.get_str().unwrap())) 24 | .collect(); 25 | 26 | dbg!(contexts); 27 | eprintln!("dataframe shape: {:?}", df.shape()); 28 | // In [19]: df.with_columns(pl.all([pl.col("a").str.contains("sick"), ~pl.col("a").str.contains("sick of ")]).alias("match")) 29 | /* 30 | 31 | 32 | contexts.iter().map(|ctx| { 33 | //let mask = df.column("context")?.equal(ctx); 34 | df.select([col("context") == lit(*ctx)]); 35 | 36 | 37 | let sub = df.filter(col("context") == ctx); 38 | 39 | let mask = df 40 | .column("context") 41 | .unwrap() 42 | .contains(ctx) 43 | . 
44 | df.filter(&mask).map(|subset| { 45 | eprintln!("{:?}", subset.shape()); 46 | }) 47 | }); 48 | 49 | //let t1 = Scatter::new(); 50 | 51 | dbg!(df.head(Some(10))); 52 | */ 53 | } 54 | -------------------------------------------------------------------------------- /test-data/a.errors.bed: -------------------------------------------------------------------------------- 1 | #chrom start end bq_bin count 2 | chr1 109555 109556 20-36 1 3 | chr1 181470 181471 20-36 1 4 | chr1 202236 202237 20-36 1 5 | chr1 271518 271519 20-36 1 6 | chr1 591695 591696 20-36 1 7 | chr1 596597 596598 37-59 1 8 | chr1 632101 632102 20-36 1 9 | chr1 739054 739055 20-36 1 10 | chr1 775172 775173 20-36 1 11 | chr1 778756 778757 20-36 1 12 | chr1 791072 791073 20-36 1 13 | chr1 806318 806319 20-36 1 14 | chr1 812874 812875 20-36 1 15 | chr1 812928 812929 37-59 1 16 | chr1 820676 820677 20-36 1 17 | chr1 821214 821215 20-36 1 18 | chr1 821223 821224 20-36 1 19 | chr1 821359 821360 20-36 1 20 | chr1 821373 821374 20-36 1 21 | -------------------------------------------------------------------------------- /test-data/b.errors.bed: -------------------------------------------------------------------------------- 1 | #chrom start end bq_bin count 2 | chr1 121033 121034 20-36 1 3 | chr1 136837 136838 20-36 1 4 | chr1 271490 271491 20-36 1 5 | chr1 596617 596618 20-36 1 6 | chr1 598836 598837 20-36 1 7 | chr1 598904 598905 20-36 1 8 | chr1 598929 598930 20-36 1 9 | chr1 605472 605473 20-36 1 10 | chr1 631613 631614 20-36 1 11 | chr1 764663 764664 20-36 1 12 | chr1 775216 775217 37-59 1 13 | chr1 785823 785824 20-36 1 14 | chr1 785896 785897 20-36 1 15 | chr1 788777 788778 20-36 1 16 | chr1 788865 788866 20-36 1 17 | chr1 789070 789071 20-36 1 18 | chr1 790253 790254 20-36 1 19 | chr1 790389 790390 20-36 1 20 | chr1 790420 790421 20-36 1 21 | --------------------------------------------------------------------------------