├── .cargo
    └── config.toml
├── .github
    └── workflows
    │   └── rust.yml
├── .gitignore
├── CHANGES.md
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── analyze_indel_errors.md
├── cli-tests.sh
├── lua-api.md
├── plot.py
├── scripts
    ├── analyze_indel_errors.py
    └── count-errors.py
├── src
    ├── combine_counts.rs
    ├── combine_errors.rs
    ├── files.rs
    ├── fraguracy.rs
    ├── homopolymer.rs
    ├── lua.rs
    ├── main.rs
    └── plot.rs
└── test-data
    ├── a.errors.bed
    └── b.errors.bed


/.cargo/config.toml:
--------------------------------------------------------------------------------
 1 | # Cargo configuration for faster compilation
 2 | 
 3 | # Use all available CPU cores for compilation
 4 | 
 5 | # Enable experimental parallel frontend (Rust 1.70+)
 6 | linker = "clang"
 7 | rustflags = [
 8 |     "-C", "target-cpu=native",  # Optimize for your specific CPU
 9 |     "-C", "link-arg=-fuse-ld=lld",  # Use gold linker for faster linking
10 | ]
11 | 
12 | # Use faster linker if available
13 | [target.x86_64-unknown-linux-gnu]
14 | linker = "clang"
15 | rustflags = [
16 |     "-C", "link-arg=-fuse-ld=lld",  # Use gold linker for faster linking
17 |     "-C", "target-cpu=native",
18 | ]
19 | 
20 | # Enable pipelined compilation (experimental)
21 | [unstable]
22 | # Uncomment if using nightly Rust
23 | # pipelined-compilation = true
24 | 
25 | [env]
26 | # Increase parallel rustc processes
27 | CARGO_BUILD_RUSTC_OPTS = "-C codegen-units=16"
28 | 


--------------------------------------------------------------------------------
/.github/workflows/rust.yml:
--------------------------------------------------------------------------------
 1 | name: Rust
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 |   pull_request:
 7 |     branches: [ "main" ]
 8 | 
 9 | env:
10 |   CARGO_TERM_COLOR: always
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v3
19 |     - name: Build
20 |       run: cargo build --verbose
21 |     - name: Run tests
22 |       run: cargo test --verbose
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | *.bed.gz
3 | *-counts.txt
4 | *.bam
5 | *.bam.bai
6 | 


--------------------------------------------------------------------------------
/CHANGES.md:
--------------------------------------------------------------------------------
 1 | # v0.2.7
 2 | + indels: report length and bq-bin and add scripts/analyze-indel-errors.py to plot indel error rate
 3 | + combine_counts: better error messages and handle NA
 4 | 
 5 | # v0.2.6
 6 | + report cases where neither bases matches the reference as NN:1 when --reference-as-truth is passed.
 7 | + add lua expressions to filter reads
 8 | 
 9 | # v0.2.5
10 | + respect include and exclude for indels and denominator calculation (thanks Jason for reporting)
11 | 
12 | # v0.2.4
13 | + Allow chromosomes longer than u8::MAX (#11 thanks @pontushojer for reporting)
14 | + Fix: when an include region was given and a non-seen chromosome was queried, it would return all intervals in that chromosome (#10 thanks Jason)
15 | 
16 | # v0.2.3
17 | 
18 | + Add --chromosome option to restrict analysis to a single chromosome.
19 | + fix counts of indel errors
20 | + fix distance to homopolymer (really)
21 | 
22 | 
23 | # v0.2.2
24 | 
25 | + Fix distance to homopolymer
26 | 


--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
   1 | # This file is automatically @generated by Cargo.
   2 | # It is not intended for manual editing.
   3 | version = 4
   4 | 
   5 | [[package]]
   6 | name = "adler2"
   7 | version = "2.0.0"
   8 | source = "registry+https://github.com/rust-lang/crates.io-index"
   9 | checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
  10 | 
  11 | [[package]]
  12 | name = "aho-corasick"
  13 | version = "1.1.3"
  14 | source = "registry+https://github.com/rust-lang/crates.io-index"
  15 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
  16 | dependencies = [
  17 |  "memchr",
  18 | ]
  19 | 
  20 | [[package]]
  21 | name = "anstream"
  22 | version = "0.6.17"
  23 | source = "registry+https://github.com/rust-lang/crates.io-index"
  24 | checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338"
  25 | dependencies = [
  26 |  "anstyle",
  27 |  "anstyle-parse",
  28 |  "anstyle-query",
  29 |  "anstyle-wincon",
  30 |  "colorchoice",
  31 |  "is_terminal_polyfill",
  32 |  "utf8parse",
  33 | ]
  34 | 
  35 | [[package]]
  36 | name = "anstyle"
  37 | version = "1.0.9"
  38 | source = "registry+https://github.com/rust-lang/crates.io-index"
  39 | checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56"
  40 | 
  41 | [[package]]
  42 | name = "anstyle-parse"
  43 | version = "0.2.6"
  44 | source = "registry+https://github.com/rust-lang/crates.io-index"
  45 | checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
  46 | dependencies = [
  47 |  "utf8parse",
  48 | ]
  49 | 
  50 | [[package]]
  51 | name = "anstyle-query"
  52 | version = "1.1.2"
  53 | source = "registry+https://github.com/rust-lang/crates.io-index"
  54 | checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
  55 | dependencies = [
  56 |  "windows-sys 0.59.0",
  57 | ]
  58 | 
  59 | [[package]]
  60 | name = "anstyle-wincon"
  61 | version = "3.0.6"
  62 | source = "registry+https://github.com/rust-lang/crates.io-index"
  63 | checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
  64 | dependencies = [
  65 |  "anstyle",
  66 |  "windows-sys 0.59.0",
  67 | ]
  68 | 
  69 | [[package]]
  70 | name = "anyhow"
  71 | version = "1.0.98"
  72 | source = "registry+https://github.com/rust-lang/crates.io-index"
  73 | checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487"
  74 | 
  75 | [[package]]
  76 | name = "autocfg"
  77 | version = "1.4.0"
  78 | source = "registry+https://github.com/rust-lang/crates.io-index"
  79 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
  80 | 
  81 | [[package]]
  82 | name = "bindgen"
  83 | version = "0.69.5"
  84 | source = "registry+https://github.com/rust-lang/crates.io-index"
  85 | checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
  86 | dependencies = [
  87 |  "bitflags",
  88 |  "cexpr",
  89 |  "clang-sys",
  90 |  "itertools",
  91 |  "lazy_static",
  92 |  "lazycell",
  93 |  "proc-macro2",
  94 |  "quote",
  95 |  "regex",
  96 |  "rustc-hash 1.1.0",
  97 |  "shlex",
  98 |  "syn",
  99 | ]
 100 | 
 101 | [[package]]
 102 | name = "bio-types"
 103 | version = "1.0.4"
 104 | source = "registry+https://github.com/rust-lang/crates.io-index"
 105 | checksum = "f4dcf54f8b7f51450207d54780bab09c05f30b8b0caa991545082842e466ad7e"
 106 | dependencies = [
 107 |  "derive-new 0.6.0",
 108 |  "lazy_static",
 109 |  "regex",
 110 |  "strum_macros",
 111 |  "thiserror 1.0.64",
 112 | ]
 113 | 
 114 | [[package]]
 115 | name = "bitflags"
 116 | version = "2.6.0"
 117 | source = "registry+https://github.com/rust-lang/crates.io-index"
 118 | checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
 119 | 
 120 | [[package]]
 121 | name = "bpci"
 122 | version = "0.1.0"
 123 | source = "registry+https://github.com/rust-lang/crates.io-index"
 124 | checksum = "552c3faebbf05f0aebf152b9c90d76732ffac8bfead18786aa9ce3c5aae706a7"
 125 | dependencies = [
 126 |  "num-traits",
 127 |  "thiserror 1.0.64",
 128 | ]
 129 | 
 130 | [[package]]
 131 | name = "bstr"
 132 | version = "1.12.0"
 133 | source = "registry+https://github.com/rust-lang/crates.io-index"
 134 | checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4"
 135 | dependencies = [
 136 |  "memchr",
 137 |  "serde",
 138 | ]
 139 | 
 140 | [[package]]
 141 | name = "byteorder"
 142 | version = "1.5.0"
 143 | source = "registry+https://github.com/rust-lang/crates.io-index"
 144 | checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 145 | 
 146 | [[package]]
 147 | name = "bzip2-sys"
 148 | version = "0.1.11+1.0.8"
 149 | source = "registry+https://github.com/rust-lang/crates.io-index"
 150 | checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc"
 151 | dependencies = [
 152 |  "cc",
 153 |  "libc",
 154 |  "pkg-config",
 155 | ]
 156 | 
 157 | [[package]]
 158 | name = "cc"
 159 | version = "1.2.19"
 160 | source = "registry+https://github.com/rust-lang/crates.io-index"
 161 | checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362"
 162 | dependencies = [
 163 |  "jobserver",
 164 |  "libc",
 165 |  "shlex",
 166 | ]
 167 | 
 168 | [[package]]
 169 | name = "cexpr"
 170 | version = "0.6.0"
 171 | source = "registry+https://github.com/rust-lang/crates.io-index"
 172 | checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766"
 173 | dependencies = [
 174 |  "nom",
 175 | ]
 176 | 
 177 | [[package]]
 178 | name = "cfg-if"
 179 | version = "1.0.0"
 180 | source = "registry+https://github.com/rust-lang/crates.io-index"
 181 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 182 | 
 183 | [[package]]
 184 | name = "clang-sys"
 185 | version = "1.8.1"
 186 | source = "registry+https://github.com/rust-lang/crates.io-index"
 187 | checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4"
 188 | dependencies = [
 189 |  "glob",
 190 |  "libc",
 191 |  "libloading",
 192 | ]
 193 | 
 194 | [[package]]
 195 | name = "clap"
 196 | version = "4.5.20"
 197 | source = "registry+https://github.com/rust-lang/crates.io-index"
 198 | checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
 199 | dependencies = [
 200 |  "clap_builder",
 201 |  "clap_derive",
 202 | ]
 203 | 
 204 | [[package]]
 205 | name = "clap_builder"
 206 | version = "4.5.20"
 207 | source = "registry+https://github.com/rust-lang/crates.io-index"
 208 | checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
 209 | dependencies = [
 210 |  "anstream",
 211 |  "anstyle",
 212 |  "clap_lex",
 213 |  "strsim",
 214 | ]
 215 | 
 216 | [[package]]
 217 | name = "clap_derive"
 218 | version = "4.5.18"
 219 | source = "registry+https://github.com/rust-lang/crates.io-index"
 220 | checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
 221 | dependencies = [
 222 |  "heck",
 223 |  "proc-macro2",
 224 |  "quote",
 225 |  "syn",
 226 | ]
 227 | 
 228 | [[package]]
 229 | name = "clap_lex"
 230 | version = "0.7.2"
 231 | source = "registry+https://github.com/rust-lang/crates.io-index"
 232 | checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
 233 | 
 234 | [[package]]
 235 | name = "cmake"
 236 | version = "0.1.51"
 237 | source = "registry+https://github.com/rust-lang/crates.io-index"
 238 | checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
 239 | dependencies = [
 240 |  "cc",
 241 | ]
 242 | 
 243 | [[package]]
 244 | name = "colorchoice"
 245 | version = "1.0.3"
 246 | source = "registry+https://github.com/rust-lang/crates.io-index"
 247 | checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
 248 | 
 249 | [[package]]
 250 | name = "crc32fast"
 251 | version = "1.4.2"
 252 | source = "registry+https://github.com/rust-lang/crates.io-index"
 253 | checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
 254 | dependencies = [
 255 |  "cfg-if",
 256 | ]
 257 | 
 258 | [[package]]
 259 | name = "crossbeam-deque"
 260 | version = "0.8.5"
 261 | source = "registry+https://github.com/rust-lang/crates.io-index"
 262 | checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
 263 | dependencies = [
 264 |  "crossbeam-epoch",
 265 |  "crossbeam-utils",
 266 | ]
 267 | 
 268 | [[package]]
 269 | name = "crossbeam-epoch"
 270 | version = "0.9.18"
 271 | source = "registry+https://github.com/rust-lang/crates.io-index"
 272 | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
 273 | dependencies = [
 274 |  "crossbeam-utils",
 275 | ]
 276 | 
 277 | [[package]]
 278 | name = "crossbeam-utils"
 279 | version = "0.8.20"
 280 | source = "registry+https://github.com/rust-lang/crates.io-index"
 281 | checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
 282 | 
 283 | [[package]]
 284 | name = "curl-sys"
 285 | version = "0.4.78+curl-8.11.0"
 286 | source = "registry+https://github.com/rust-lang/crates.io-index"
 287 | checksum = "8eec768341c5c7789611ae51cf6c459099f22e64a5d5d0ce4892434e33821eaf"
 288 | dependencies = [
 289 |  "cc",
 290 |  "libc",
 291 |  "libz-sys",
 292 |  "openssl-sys",
 293 |  "pkg-config",
 294 |  "vcpkg",
 295 |  "windows-sys 0.52.0",
 296 | ]
 297 | 
 298 | [[package]]
 299 | name = "custom_derive"
 300 | version = "0.1.7"
 301 | source = "registry+https://github.com/rust-lang/crates.io-index"
 302 | checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9"
 303 | 
 304 | [[package]]
 305 | name = "derive-new"
 306 | version = "0.6.0"
 307 | source = "registry+https://github.com/rust-lang/crates.io-index"
 308 | checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad"
 309 | dependencies = [
 310 |  "proc-macro2",
 311 |  "quote",
 312 |  "syn",
 313 | ]
 314 | 
 315 | [[package]]
 316 | name = "derive-new"
 317 | version = "0.7.0"
 318 | source = "registry+https://github.com/rust-lang/crates.io-index"
 319 | checksum = "2cdc8d50f426189eef89dac62fabfa0abb27d5cc008f25bf4156a0203325becc"
 320 | dependencies = [
 321 |  "proc-macro2",
 322 |  "quote",
 323 |  "syn",
 324 | ]
 325 | 
 326 | [[package]]
 327 | name = "displaydoc"
 328 | version = "0.2.5"
 329 | source = "registry+https://github.com/rust-lang/crates.io-index"
 330 | checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
 331 | dependencies = [
 332 |  "proc-macro2",
 333 |  "quote",
 334 |  "syn",
 335 | ]
 336 | 
 337 | [[package]]
 338 | name = "either"
 339 | version = "1.13.0"
 340 | source = "registry+https://github.com/rust-lang/crates.io-index"
 341 | checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 342 | 
 343 | [[package]]
 344 | name = "env_logger"
 345 | version = "0.10.2"
 346 | source = "registry+https://github.com/rust-lang/crates.io-index"
 347 | checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580"
 348 | dependencies = [
 349 |  "humantime",
 350 |  "is-terminal",
 351 |  "log",
 352 |  "regex",
 353 |  "termcolor",
 354 | ]
 355 | 
 356 | [[package]]
 357 | name = "flate2"
 358 | version = "1.0.34"
 359 | source = "registry+https://github.com/rust-lang/crates.io-index"
 360 | checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0"
 361 | dependencies = [
 362 |  "crc32fast",
 363 |  "miniz_oxide",
 364 | ]
 365 | 
 366 | [[package]]
 367 | name = "form_urlencoded"
 368 | version = "1.2.1"
 369 | source = "registry+https://github.com/rust-lang/crates.io-index"
 370 | checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
 371 | dependencies = [
 372 |  "percent-encoding",
 373 | ]
 374 | 
 375 | [[package]]
 376 | name = "fraguracy"
 377 | version = "0.2.7"
 378 | dependencies = [
 379 |  "anyhow",
 380 |  "bpci",
 381 |  "clap",
 382 |  "env_logger",
 383 |  "flate2",
 384 |  "idna 1.0.3",
 385 |  "itertools",
 386 |  "lazy_static",
 387 |  "libc",
 388 |  "linear-map",
 389 |  "log",
 390 |  "mlua",
 391 |  "ndarray",
 392 |  "rayon",
 393 |  "regex",
 394 |  "rust-htslib",
 395 |  "rust-lapper",
 396 |  "rustc-hash 1.1.0",
 397 |  "syn",
 398 | ]
 399 | 
 400 | [[package]]
 401 | name = "fs-utils"
 402 | version = "1.1.4"
 403 | source = "registry+https://github.com/rust-lang/crates.io-index"
 404 | checksum = "6fc7a9dc005c944c98a935e7fd626faf5bf7e5a609f94bc13e42fc4a02e52593"
 405 | dependencies = [
 406 |  "quick-error",
 407 | ]
 408 | 
 409 | [[package]]
 410 | name = "glob"
 411 | version = "0.3.1"
 412 | source = "registry+https://github.com/rust-lang/crates.io-index"
 413 | checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 414 | 
 415 | [[package]]
 416 | name = "heck"
 417 | version = "0.5.0"
 418 | source = "registry+https://github.com/rust-lang/crates.io-index"
 419 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 420 | 
 421 | [[package]]
 422 | name = "hermit-abi"
 423 | version = "0.4.0"
 424 | source = "registry+https://github.com/rust-lang/crates.io-index"
 425 | checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
 426 | 
 427 | [[package]]
 428 | name = "hts-sys"
 429 | version = "2.2.0"
 430 | source = "registry+https://github.com/rust-lang/crates.io-index"
 431 | checksum = "e38d7f1c121cd22aa214cb4dadd4277dc5447391eac518b899b29ba6356fbbb2"
 432 | dependencies = [
 433 |  "bindgen",
 434 |  "bzip2-sys",
 435 |  "cc",
 436 |  "curl-sys",
 437 |  "fs-utils",
 438 |  "glob",
 439 |  "libdeflate-sys",
 440 |  "libz-sys",
 441 |  "lzma-sys",
 442 |  "openssl-sys",
 443 | ]
 444 | 
 445 | [[package]]
 446 | name = "humantime"
 447 | version = "2.1.0"
 448 | source = "registry+https://github.com/rust-lang/crates.io-index"
 449 | checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
 450 | 
 451 | [[package]]
 452 | name = "icu_collections"
 453 | version = "1.5.0"
 454 | source = "registry+https://github.com/rust-lang/crates.io-index"
 455 | checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
 456 | dependencies = [
 457 |  "displaydoc",
 458 |  "yoke",
 459 |  "zerofrom",
 460 |  "zerovec",
 461 | ]
 462 | 
 463 | [[package]]
 464 | name = "icu_locid"
 465 | version = "1.5.0"
 466 | source = "registry+https://github.com/rust-lang/crates.io-index"
 467 | checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
 468 | dependencies = [
 469 |  "displaydoc",
 470 |  "litemap",
 471 |  "tinystr",
 472 |  "writeable",
 473 |  "zerovec",
 474 | ]
 475 | 
 476 | [[package]]
 477 | name = "icu_locid_transform"
 478 | version = "1.5.0"
 479 | source = "registry+https://github.com/rust-lang/crates.io-index"
 480 | checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
 481 | dependencies = [
 482 |  "displaydoc",
 483 |  "icu_locid",
 484 |  "icu_locid_transform_data",
 485 |  "icu_provider",
 486 |  "tinystr",
 487 |  "zerovec",
 488 | ]
 489 | 
 490 | [[package]]
 491 | name = "icu_locid_transform_data"
 492 | version = "1.5.0"
 493 | source = "registry+https://github.com/rust-lang/crates.io-index"
 494 | checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
 495 | 
 496 | [[package]]
 497 | name = "icu_normalizer"
 498 | version = "1.5.0"
 499 | source = "registry+https://github.com/rust-lang/crates.io-index"
 500 | checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
 501 | dependencies = [
 502 |  "displaydoc",
 503 |  "icu_collections",
 504 |  "icu_normalizer_data",
 505 |  "icu_properties",
 506 |  "icu_provider",
 507 |  "smallvec",
 508 |  "utf16_iter",
 509 |  "utf8_iter",
 510 |  "write16",
 511 |  "zerovec",
 512 | ]
 513 | 
 514 | [[package]]
 515 | name = "icu_normalizer_data"
 516 | version = "1.5.0"
 517 | source = "registry+https://github.com/rust-lang/crates.io-index"
 518 | checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
 519 | 
 520 | [[package]]
 521 | name = "icu_properties"
 522 | version = "1.5.1"
 523 | source = "registry+https://github.com/rust-lang/crates.io-index"
 524 | checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
 525 | dependencies = [
 526 |  "displaydoc",
 527 |  "icu_collections",
 528 |  "icu_locid_transform",
 529 |  "icu_properties_data",
 530 |  "icu_provider",
 531 |  "tinystr",
 532 |  "zerovec",
 533 | ]
 534 | 
 535 | [[package]]
 536 | name = "icu_properties_data"
 537 | version = "1.5.0"
 538 | source = "registry+https://github.com/rust-lang/crates.io-index"
 539 | checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
 540 | 
 541 | [[package]]
 542 | name = "icu_provider"
 543 | version = "1.5.0"
 544 | source = "registry+https://github.com/rust-lang/crates.io-index"
 545 | checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
 546 | dependencies = [
 547 |  "displaydoc",
 548 |  "icu_locid",
 549 |  "icu_provider_macros",
 550 |  "stable_deref_trait",
 551 |  "tinystr",
 552 |  "writeable",
 553 |  "yoke",
 554 |  "zerofrom",
 555 |  "zerovec",
 556 | ]
 557 | 
 558 | [[package]]
 559 | name = "icu_provider_macros"
 560 | version = "1.5.0"
 561 | source = "registry+https://github.com/rust-lang/crates.io-index"
 562 | checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
 563 | dependencies = [
 564 |  "proc-macro2",
 565 |  "quote",
 566 |  "syn",
 567 | ]
 568 | 
 569 | [[package]]
 570 | name = "idna"
 571 | version = "0.5.0"
 572 | source = "registry+https://github.com/rust-lang/crates.io-index"
 573 | checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6"
 574 | dependencies = [
 575 |  "unicode-bidi",
 576 |  "unicode-normalization",
 577 | ]
 578 | 
 579 | [[package]]
 580 | name = "idna"
 581 | version = "1.0.3"
 582 | source = "registry+https://github.com/rust-lang/crates.io-index"
 583 | checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
 584 | dependencies = [
 585 |  "idna_adapter",
 586 |  "smallvec",
 587 |  "utf8_iter",
 588 | ]
 589 | 
 590 | [[package]]
 591 | name = "idna_adapter"
 592 | version = "1.2.0"
 593 | source = "registry+https://github.com/rust-lang/crates.io-index"
 594 | checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
 595 | dependencies = [
 596 |  "icu_normalizer",
 597 |  "icu_properties",
 598 | ]
 599 | 
 600 | [[package]]
 601 | name = "ieee754"
 602 | version = "0.2.6"
 603 | source = "registry+https://github.com/rust-lang/crates.io-index"
 604 | checksum = "9007da9cacbd3e6343da136e98b0d2df013f553d35bdec8b518f07bea768e19c"
 605 | 
 606 | [[package]]
 607 | name = "is-terminal"
 608 | version = "0.4.13"
 609 | source = "registry+https://github.com/rust-lang/crates.io-index"
 610 | checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b"
 611 | dependencies = [
 612 |  "hermit-abi",
 613 |  "libc",
 614 |  "windows-sys 0.52.0",
 615 | ]
 616 | 
 617 | [[package]]
 618 | name = "is_terminal_polyfill"
 619 | version = "1.70.1"
 620 | source = "registry+https://github.com/rust-lang/crates.io-index"
 621 | checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
 622 | 
 623 | [[package]]
 624 | name = "itertools"
 625 | version = "0.10.5"
 626 | source = "registry+https://github.com/rust-lang/crates.io-index"
 627 | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
 628 | dependencies = [
 629 |  "either",
 630 | ]
 631 | 
 632 | [[package]]
 633 | name = "jobserver"
 634 | version = "0.1.32"
 635 | source = "registry+https://github.com/rust-lang/crates.io-index"
 636 | checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0"
 637 | dependencies = [
 638 |  "libc",
 639 | ]
 640 | 
 641 | [[package]]
 642 | name = "lazy_static"
 643 | version = "1.5.0"
 644 | source = "registry+https://github.com/rust-lang/crates.io-index"
 645 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 646 | 
 647 | [[package]]
 648 | name = "lazycell"
 649 | version = "1.3.0"
 650 | source = "registry+https://github.com/rust-lang/crates.io-index"
 651 | checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 652 | 
 653 | [[package]]
 654 | name = "libc"
 655 | version = "0.2.159"
 656 | source = "registry+https://github.com/rust-lang/crates.io-index"
 657 | checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5"
 658 | 
 659 | [[package]]
 660 | name = "libdeflate-sys"
 661 | version = "1.22.0"
 662 | source = "registry+https://github.com/rust-lang/crates.io-index"
 663 | checksum = "2f4ae7b48098016dc3bc64a35605668f0af4425ec1a4a175ce2d0c1129067932"
 664 | dependencies = [
 665 |  "cc",
 666 | ]
 667 | 
 668 | [[package]]
 669 | name = "libloading"
 670 | version = "0.8.5"
 671 | source = "registry+https://github.com/rust-lang/crates.io-index"
 672 | checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4"
 673 | dependencies = [
 674 |  "cfg-if",
 675 |  "windows-targets",
 676 | ]
 677 | 
 678 | [[package]]
 679 | name = "libz-sys"
 680 | version = "1.1.20"
 681 | source = "registry+https://github.com/rust-lang/crates.io-index"
 682 | checksum = "d2d16453e800a8cf6dd2fc3eb4bc99b786a9b90c663b8559a5b1a041bf89e472"
 683 | dependencies = [
 684 |  "cc",
 685 |  "cmake",
 686 |  "libc",
 687 |  "pkg-config",
 688 |  "vcpkg",
 689 | ]
 690 | 
 691 | [[package]]
 692 | name = "linear-map"
 693 | version = "1.2.0"
 694 | source = "registry+https://github.com/rust-lang/crates.io-index"
 695 | checksum = "bfae20f6b19ad527b550c223fddc3077a547fc70cda94b9b566575423fd303ee"
 696 | 
 697 | [[package]]
 698 | name = "litemap"
 699 | version = "0.7.4"
 700 | source = "registry+https://github.com/rust-lang/crates.io-index"
 701 | checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
 702 | 
 703 | [[package]]
 704 | name = "lock_api"
 705 | version = "0.4.12"
 706 | source = "registry+https://github.com/rust-lang/crates.io-index"
 707 | checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
 708 | dependencies = [
 709 |  "autocfg",
 710 |  "scopeguard",
 711 | ]
 712 | 
 713 | [[package]]
 714 | name = "log"
 715 | version = "0.4.22"
 716 | source = "registry+https://github.com/rust-lang/crates.io-index"
 717 | checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
 718 | 
 719 | [[package]]
 720 | name = "luau0-src"
 721 | version = "0.12.3+luau663"
 722 | source = "registry+https://github.com/rust-lang/crates.io-index"
 723 | checksum = "76ae337c644bbf86a8d8e9ce3ee023311833d41741baf5e51acc31b37843aba1"
 724 | dependencies = [
 725 |  "cc",
 726 | ]
 727 | 
 728 | [[package]]
 729 | name = "lzma-sys"
 730 | version = "0.1.20"
 731 | source = "registry+https://github.com/rust-lang/crates.io-index"
 732 | checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27"
 733 | dependencies = [
 734 |  "cc",
 735 |  "libc",
 736 |  "pkg-config",
 737 | ]
 738 | 
 739 | [[package]]
 740 | name = "matrixmultiply"
 741 | version = "0.3.9"
 742 | source = "registry+https://github.com/rust-lang/crates.io-index"
 743 | checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a"
 744 | dependencies = [
 745 |  "autocfg",
 746 |  "rawpointer",
 747 | ]
 748 | 
 749 | [[package]]
 750 | name = "memchr"
 751 | version = "2.7.4"
 752 | source = "registry+https://github.com/rust-lang/crates.io-index"
 753 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
 754 | 
 755 | [[package]]
 756 | name = "minimal-lexical"
 757 | version = "0.2.1"
 758 | source = "registry+https://github.com/rust-lang/crates.io-index"
 759 | checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
 760 | 
 761 | [[package]]
 762 | name = "miniz_oxide"
 763 | version = "0.8.0"
 764 | source = "registry+https://github.com/rust-lang/crates.io-index"
 765 | checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1"
 766 | dependencies = [
 767 |  "adler2",
 768 | ]
 769 | 
 770 | [[package]]
 771 | name = "mlua"
 772 | version = "0.10.3"
 773 | source = "registry+https://github.com/rust-lang/crates.io-index"
 774 | checksum = "d3f763c1041eff92ffb5d7169968a327e1ed2ebfe425dac0ee5a35f29082534b"
 775 | dependencies = [
 776 |  "bstr",
 777 |  "either",
 778 |  "libloading",
 779 |  "mlua-sys",
 780 |  "num-traits",
 781 |  "parking_lot",
 782 |  "rustc-hash 2.1.1",
 783 | ]
 784 | 
 785 | [[package]]
 786 | name = "mlua-sys"
 787 | version = "0.6.7"
 788 | source = "registry+https://github.com/rust-lang/crates.io-index"
 789 | checksum = "1901c1a635a22fe9250ffcc4fcc937c16b47c2e9e71adba8784af8bca1f69594"
 790 | dependencies = [
 791 |  "cc",
 792 |  "cfg-if",
 793 |  "luau0-src",
 794 |  "pkg-config",
 795 | ]
 796 | 
 797 | [[package]]
 798 | name = "ndarray"
 799 | version = "0.15.6"
 800 | source = "registry+https://github.com/rust-lang/crates.io-index"
 801 | checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32"
 802 | dependencies = [
 803 |  "matrixmultiply",
 804 |  "num-complex",
 805 |  "num-integer",
 806 |  "num-traits",
 807 |  "rawpointer",
 808 | ]
 809 | 
 810 | [[package]]
 811 | name = "newtype_derive"
 812 | version = "0.1.6"
 813 | source = "registry+https://github.com/rust-lang/crates.io-index"
 814 | checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec"
 815 | dependencies = [
 816 |  "rustc_version",
 817 | ]
 818 | 
 819 | [[package]]
 820 | name = "nom"
 821 | version = "7.1.3"
 822 | source = "registry+https://github.com/rust-lang/crates.io-index"
 823 | checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
 824 | dependencies = [
 825 |  "memchr",
 826 |  "minimal-lexical",
 827 | ]
 828 | 
 829 | [[package]]
 830 | name = "num-complex"
 831 | version = "0.4.6"
 832 | source = "registry+https://github.com/rust-lang/crates.io-index"
 833 | checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
 834 | dependencies = [
 835 |  "num-traits",
 836 | ]
 837 | 
 838 | [[package]]
 839 | name = "num-integer"
 840 | version = "0.1.46"
 841 | source = "registry+https://github.com/rust-lang/crates.io-index"
 842 | checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
 843 | dependencies = [
 844 |  "num-traits",
 845 | ]
 846 | 
 847 | [[package]]
 848 | name = "num-traits"
 849 | version = "0.2.19"
 850 | source = "registry+https://github.com/rust-lang/crates.io-index"
 851 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
 852 | dependencies = [
 853 |  "autocfg",
 854 | ]
 855 | 
 856 | [[package]]
 857 | name = "openssl-src"
 858 | version = "300.3.2+3.3.2"
 859 | source = "registry+https://github.com/rust-lang/crates.io-index"
 860 | checksum = "a211a18d945ef7e648cc6e0058f4c548ee46aab922ea203e0d30e966ea23647b"
 861 | dependencies = [
 862 |  "cc",
 863 | ]
 864 | 
 865 | [[package]]
 866 | name = "openssl-sys"
 867 | version = "0.9.103"
 868 | source = "registry+https://github.com/rust-lang/crates.io-index"
 869 | checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6"
 870 | dependencies = [
 871 |  "cc",
 872 |  "libc",
 873 |  "openssl-src",
 874 |  "pkg-config",
 875 |  "vcpkg",
 876 | ]
 877 | 
 878 | [[package]]
 879 | name = "parking_lot"
 880 | version = "0.12.3"
 881 | source = "registry+https://github.com/rust-lang/crates.io-index"
 882 | checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
 883 | dependencies = [
 884 |  "lock_api",
 885 |  "parking_lot_core",
 886 | ]
 887 | 
 888 | [[package]]
 889 | name = "parking_lot_core"
 890 | version = "0.9.10"
 891 | source = "registry+https://github.com/rust-lang/crates.io-index"
 892 | checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
 893 | dependencies = [
 894 |  "cfg-if",
 895 |  "libc",
 896 |  "redox_syscall",
 897 |  "smallvec",
 898 |  "windows-targets",
 899 | ]
 900 | 
 901 | [[package]]
 902 | name = "percent-encoding"
 903 | version = "2.3.1"
 904 | source = "registry+https://github.com/rust-lang/crates.io-index"
 905 | checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
 906 | 
 907 | [[package]]
 908 | name = "pkg-config"
 909 | version = "0.3.31"
 910 | source = "registry+https://github.com/rust-lang/crates.io-index"
 911 | checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
 912 | 
 913 | [[package]]
 914 | name = "proc-macro2"
 915 | version = "1.0.92"
 916 | source = "registry+https://github.com/rust-lang/crates.io-index"
 917 | checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
 918 | dependencies = [
 919 |  "unicode-ident",
 920 | ]
 921 | 
 922 | [[package]]
 923 | name = "quick-error"
 924 | version = "1.2.3"
 925 | source = "registry+https://github.com/rust-lang/crates.io-index"
 926 | checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
 927 | 
 928 | [[package]]
 929 | name = "quote"
 930 | version = "1.0.37"
 931 | source = "registry+https://github.com/rust-lang/crates.io-index"
 932 | checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
 933 | dependencies = [
 934 |  "proc-macro2",
 935 | ]
 936 | 
 937 | [[package]]
 938 | name = "rawpointer"
 939 | version = "0.2.1"
 940 | source = "registry+https://github.com/rust-lang/crates.io-index"
 941 | checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
 942 | 
 943 | [[package]]
 944 | name = "rayon"
 945 | version = "1.10.0"
 946 | source = "registry+https://github.com/rust-lang/crates.io-index"
 947 | checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
 948 | dependencies = [
 949 |  "either",
 950 |  "rayon-core",
 951 | ]
 952 | 
 953 | [[package]]
 954 | name = "rayon-core"
 955 | version = "1.12.1"
 956 | source = "registry+https://github.com/rust-lang/crates.io-index"
 957 | checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
 958 | dependencies = [
 959 |  "crossbeam-deque",
 960 |  "crossbeam-utils",
 961 | ]
 962 | 
 963 | [[package]]
 964 | name = "redox_syscall"
 965 | version = "0.5.11"
 966 | source = "registry+https://github.com/rust-lang/crates.io-index"
 967 | checksum = "d2f103c6d277498fbceb16e84d317e2a400f160f46904d5f5410848c829511a3"
 968 | dependencies = [
 969 |  "bitflags",
 970 | ]
 971 | 
 972 | [[package]]
 973 | name = "regex"
 974 | version = "1.11.1"
 975 | source = "registry+https://github.com/rust-lang/crates.io-index"
 976 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
 977 | dependencies = [
 978 |  "aho-corasick",
 979 |  "memchr",
 980 |  "regex-automata",
 981 |  "regex-syntax",
 982 | ]
 983 | 
 984 | [[package]]
 985 | name = "regex-automata"
 986 | version = "0.4.8"
 987 | source = "registry+https://github.com/rust-lang/crates.io-index"
 988 | checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3"
 989 | dependencies = [
 990 |  "aho-corasick",
 991 |  "memchr",
 992 |  "regex-syntax",
 993 | ]
 994 | 
 995 | [[package]]
 996 | name = "regex-syntax"
 997 | version = "0.8.5"
 998 | source = "registry+https://github.com/rust-lang/crates.io-index"
 999 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
1000 | 
1001 | [[package]]
1002 | name = "rust-htslib"
1003 | version = "0.49.0"
1004 | source = "registry+https://github.com/rust-lang/crates.io-index"
1005 | checksum = "115ae57b89deb942275566eca8c31da053d8df0d8bda12e50c4c4aa994877068"
1006 | dependencies = [
1007 |  "bio-types",
1008 |  "byteorder",
1009 |  "custom_derive",
1010 |  "derive-new 0.7.0",
1011 |  "hts-sys",
1012 |  "ieee754",
1013 |  "lazy_static",
1014 |  "libc",
1015 |  "libz-sys",
1016 |  "linear-map",
1017 |  "newtype_derive",
1018 |  "regex",
1019 |  "thiserror 2.0.3",
1020 |  "url",
1021 | ]
1022 | 
1023 | [[package]]
1024 | name = "rust-lapper"
1025 | version = "1.1.0"
1026 | source = "registry+https://github.com/rust-lang/crates.io-index"
1027 | checksum = "ee43d8e721ac803031dbab6a944b957b49a3b11eadbc099880c8aaaebf23ed27"
1028 | dependencies = [
1029 |  "num-traits",
1030 | ]
1031 | 
1032 | [[package]]
1033 | name = "rustc-hash"
1034 | version = "1.1.0"
1035 | source = "registry+https://github.com/rust-lang/crates.io-index"
1036 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
1037 | 
1038 | [[package]]
1039 | name = "rustc-hash"
1040 | version = "2.1.1"
1041 | source = "registry+https://github.com/rust-lang/crates.io-index"
1042 | checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
1043 | 
1044 | [[package]]
1045 | name = "rustc_version"
1046 | version = "0.1.7"
1047 | source = "registry+https://github.com/rust-lang/crates.io-index"
1048 | checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084"
1049 | dependencies = [
1050 |  "semver",
1051 | ]
1052 | 
1053 | [[package]]
1054 | name = "rustversion"
1055 | version = "1.0.17"
1056 | source = "registry+https://github.com/rust-lang/crates.io-index"
1057 | checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
1058 | 
1059 | [[package]]
1060 | name = "scopeguard"
1061 | version = "1.2.0"
1062 | source = "registry+https://github.com/rust-lang/crates.io-index"
1063 | checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
1064 | 
1065 | [[package]]
1066 | name = "semver"
1067 | version = "0.1.20"
1068 | source = "registry+https://github.com/rust-lang/crates.io-index"
1069 | checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac"
1070 | 
1071 | [[package]]
1072 | name = "serde"
1073 | version = "1.0.215"
1074 | source = "registry+https://github.com/rust-lang/crates.io-index"
1075 | checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f"
1076 | dependencies = [
1077 |  "serde_derive",
1078 | ]
1079 | 
1080 | [[package]]
1081 | name = "serde_derive"
1082 | version = "1.0.215"
1083 | source = "registry+https://github.com/rust-lang/crates.io-index"
1084 | checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0"
1085 | dependencies = [
1086 |  "proc-macro2",
1087 |  "quote",
1088 |  "syn",
1089 | ]
1090 | 
1091 | [[package]]
1092 | name = "shlex"
1093 | version = "1.3.0"
1094 | source = "registry+https://github.com/rust-lang/crates.io-index"
1095 | checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
1096 | 
1097 | [[package]]
1098 | name = "smallvec"
1099 | version = "1.13.2"
1100 | source = "registry+https://github.com/rust-lang/crates.io-index"
1101 | checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
1102 | 
1103 | [[package]]
1104 | name = "stable_deref_trait"
1105 | version = "1.2.0"
1106 | source = "registry+https://github.com/rust-lang/crates.io-index"
1107 | checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
1108 | 
1109 | [[package]]
1110 | name = "strsim"
1111 | version = "0.11.1"
1112 | source = "registry+https://github.com/rust-lang/crates.io-index"
1113 | checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
1114 | 
1115 | [[package]]
1116 | name = "strum_macros"
1117 | version = "0.26.4"
1118 | source = "registry+https://github.com/rust-lang/crates.io-index"
1119 | checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be"
1120 | dependencies = [
1121 |  "heck",
1122 |  "proc-macro2",
1123 |  "quote",
1124 |  "rustversion",
1125 |  "syn",
1126 | ]
1127 | 
1128 | [[package]]
1129 | name = "syn"
1130 | version = "2.0.89"
1131 | source = "registry+https://github.com/rust-lang/crates.io-index"
1132 | checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e"
1133 | dependencies = [
1134 |  "proc-macro2",
1135 |  "quote",
1136 |  "unicode-ident",
1137 | ]
1138 | 
1139 | [[package]]
1140 | name = "synstructure"
1141 | version = "0.13.1"
1142 | source = "registry+https://github.com/rust-lang/crates.io-index"
1143 | checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
1144 | dependencies = [
1145 |  "proc-macro2",
1146 |  "quote",
1147 |  "syn",
1148 | ]
1149 | 
1150 | [[package]]
1151 | name = "termcolor"
1152 | version = "1.4.1"
1153 | source = "registry+https://github.com/rust-lang/crates.io-index"
1154 | checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755"
1155 | dependencies = [
1156 |  "winapi-util",
1157 | ]
1158 | 
1159 | [[package]]
1160 | name = "thiserror"
1161 | version = "1.0.64"
1162 | source = "registry+https://github.com/rust-lang/crates.io-index"
1163 | checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84"
1164 | dependencies = [
1165 |  "thiserror-impl 1.0.64",
1166 | ]
1167 | 
1168 | [[package]]
1169 | name = "thiserror"
1170 | version = "2.0.3"
1171 | source = "registry+https://github.com/rust-lang/crates.io-index"
1172 | checksum = "c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa"
1173 | dependencies = [
1174 |  "thiserror-impl 2.0.3",
1175 | ]
1176 | 
1177 | [[package]]
1178 | name = "thiserror-impl"
1179 | version = "1.0.64"
1180 | source = "registry+https://github.com/rust-lang/crates.io-index"
1181 | checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3"
1182 | dependencies = [
1183 |  "proc-macro2",
1184 |  "quote",
1185 |  "syn",
1186 | ]
1187 | 
1188 | [[package]]
1189 | name = "thiserror-impl"
1190 | version = "2.0.3"
1191 | source = "registry+https://github.com/rust-lang/crates.io-index"
1192 | checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568"
1193 | dependencies = [
1194 |  "proc-macro2",
1195 |  "quote",
1196 |  "syn",
1197 | ]
1198 | 
1199 | [[package]]
1200 | name = "tinystr"
1201 | version = "0.7.6"
1202 | source = "registry+https://github.com/rust-lang/crates.io-index"
1203 | checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
1204 | dependencies = [
1205 |  "displaydoc",
1206 |  "zerovec",
1207 | ]
1208 | 
1209 | [[package]]
1210 | name = "tinyvec"
1211 | version = "1.8.0"
1212 | source = "registry+https://github.com/rust-lang/crates.io-index"
1213 | checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938"
1214 | dependencies = [
1215 |  "tinyvec_macros",
1216 | ]
1217 | 
1218 | [[package]]
1219 | name = "tinyvec_macros"
1220 | version = "0.1.1"
1221 | source = "registry+https://github.com/rust-lang/crates.io-index"
1222 | checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
1223 | 
1224 | [[package]]
1225 | name = "unicode-bidi"
1226 | version = "0.3.17"
1227 | source = "registry+https://github.com/rust-lang/crates.io-index"
1228 | checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893"
1229 | 
1230 | [[package]]
1231 | name = "unicode-ident"
1232 | version = "1.0.13"
1233 | source = "registry+https://github.com/rust-lang/crates.io-index"
1234 | checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
1235 | 
1236 | [[package]]
1237 | name = "unicode-normalization"
1238 | version = "0.1.24"
1239 | source = "registry+https://github.com/rust-lang/crates.io-index"
1240 | checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
1241 | dependencies = [
1242 |  "tinyvec",
1243 | ]
1244 | 
1245 | [[package]]
1246 | name = "url"
1247 | version = "2.5.2"
1248 | source = "registry+https://github.com/rust-lang/crates.io-index"
1249 | checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c"
1250 | dependencies = [
1251 |  "form_urlencoded",
1252 |  "idna 0.5.0",
1253 |  "percent-encoding",
1254 | ]
1255 | 
1256 | [[package]]
1257 | name = "utf16_iter"
1258 | version = "1.0.5"
1259 | source = "registry+https://github.com/rust-lang/crates.io-index"
1260 | checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
1261 | 
1262 | [[package]]
1263 | name = "utf8_iter"
1264 | version = "1.0.4"
1265 | source = "registry+https://github.com/rust-lang/crates.io-index"
1266 | checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
1267 | 
1268 | [[package]]
1269 | name = "utf8parse"
1270 | version = "0.2.2"
1271 | source = "registry+https://github.com/rust-lang/crates.io-index"
1272 | checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
1273 | 
1274 | [[package]]
1275 | name = "vcpkg"
1276 | version = "0.2.15"
1277 | source = "registry+https://github.com/rust-lang/crates.io-index"
1278 | checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
1279 | 
1280 | [[package]]
1281 | name = "winapi-util"
1282 | version = "0.1.9"
1283 | source = "registry+https://github.com/rust-lang/crates.io-index"
1284 | checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
1285 | dependencies = [
1286 |  "windows-sys 0.59.0",
1287 | ]
1288 | 
1289 | [[package]]
1290 | name = "windows-sys"
1291 | version = "0.52.0"
1292 | source = "registry+https://github.com/rust-lang/crates.io-index"
1293 | checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
1294 | dependencies = [
1295 |  "windows-targets",
1296 | ]
1297 | 
1298 | [[package]]
1299 | name = "windows-sys"
1300 | version = "0.59.0"
1301 | source = "registry+https://github.com/rust-lang/crates.io-index"
1302 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
1303 | dependencies = [
1304 |  "windows-targets",
1305 | ]
1306 | 
1307 | [[package]]
1308 | name = "windows-targets"
1309 | version = "0.52.6"
1310 | source = "registry+https://github.com/rust-lang/crates.io-index"
1311 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
1312 | dependencies = [
1313 |  "windows_aarch64_gnullvm",
1314 |  "windows_aarch64_msvc",
1315 |  "windows_i686_gnu",
1316 |  "windows_i686_gnullvm",
1317 |  "windows_i686_msvc",
1318 |  "windows_x86_64_gnu",
1319 |  "windows_x86_64_gnullvm",
1320 |  "windows_x86_64_msvc",
1321 | ]
1322 | 
1323 | [[package]]
1324 | name = "windows_aarch64_gnullvm"
1325 | version = "0.52.6"
1326 | source = "registry+https://github.com/rust-lang/crates.io-index"
1327 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
1328 | 
1329 | [[package]]
1330 | name = "windows_aarch64_msvc"
1331 | version = "0.52.6"
1332 | source = "registry+https://github.com/rust-lang/crates.io-index"
1333 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
1334 | 
1335 | [[package]]
1336 | name = "windows_i686_gnu"
1337 | version = "0.52.6"
1338 | source = "registry+https://github.com/rust-lang/crates.io-index"
1339 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
1340 | 
1341 | [[package]]
1342 | name = "windows_i686_gnullvm"
1343 | version = "0.52.6"
1344 | source = "registry+https://github.com/rust-lang/crates.io-index"
1345 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
1346 | 
1347 | [[package]]
1348 | name = "windows_i686_msvc"
1349 | version = "0.52.6"
1350 | source = "registry+https://github.com/rust-lang/crates.io-index"
1351 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
1352 | 
1353 | [[package]]
1354 | name = "windows_x86_64_gnu"
1355 | version = "0.52.6"
1356 | source = "registry+https://github.com/rust-lang/crates.io-index"
1357 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
1358 | 
1359 | [[package]]
1360 | name = "windows_x86_64_gnullvm"
1361 | version = "0.52.6"
1362 | source = "registry+https://github.com/rust-lang/crates.io-index"
1363 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
1364 | 
1365 | [[package]]
1366 | name = "windows_x86_64_msvc"
1367 | version = "0.52.6"
1368 | source = "registry+https://github.com/rust-lang/crates.io-index"
1369 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
1370 | 
1371 | [[package]]
1372 | name = "write16"
1373 | version = "1.0.0"
1374 | source = "registry+https://github.com/rust-lang/crates.io-index"
1375 | checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
1376 | 
1377 | [[package]]
1378 | name = "writeable"
1379 | version = "0.5.5"
1380 | source = "registry+https://github.com/rust-lang/crates.io-index"
1381 | checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
1382 | 
1383 | [[package]]
1384 | name = "yoke"
1385 | version = "0.7.5"
1386 | source = "registry+https://github.com/rust-lang/crates.io-index"
1387 | checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
1388 | dependencies = [
1389 |  "serde",
1390 |  "stable_deref_trait",
1391 |  "yoke-derive",
1392 |  "zerofrom",
1393 | ]
1394 | 
1395 | [[package]]
1396 | name = "yoke-derive"
1397 | version = "0.7.5"
1398 | source = "registry+https://github.com/rust-lang/crates.io-index"
1399 | checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
1400 | dependencies = [
1401 |  "proc-macro2",
1402 |  "quote",
1403 |  "syn",
1404 |  "synstructure",
1405 | ]
1406 | 
1407 | [[package]]
1408 | name = "zerofrom"
1409 | version = "0.1.5"
1410 | source = "registry+https://github.com/rust-lang/crates.io-index"
1411 | checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
1412 | dependencies = [
1413 |  "zerofrom-derive",
1414 | ]
1415 | 
1416 | [[package]]
1417 | name = "zerofrom-derive"
1418 | version = "0.1.5"
1419 | source = "registry+https://github.com/rust-lang/crates.io-index"
1420 | checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
1421 | dependencies = [
1422 |  "proc-macro2",
1423 |  "quote",
1424 |  "syn",
1425 |  "synstructure",
1426 | ]
1427 | 
1428 | [[package]]
1429 | name = "zerovec"
1430 | version = "0.10.4"
1431 | source = "registry+https://github.com/rust-lang/crates.io-index"
1432 | checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
1433 | dependencies = [
1434 |  "yoke",
1435 |  "zerofrom",
1436 |  "zerovec-derive",
1437 | ]
1438 | 
1439 | [[package]]
1440 | name = "zerovec-derive"
1441 | version = "0.10.3"
1442 | source = "registry+https://github.com/rust-lang/crates.io-index"
1443 | checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
1444 | dependencies = [
1445 |  "proc-macro2",
1446 |  "quote",
1447 |  "syn",
1448 | ]
1449 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "fraguracy"
 3 | version = "0.2.7"
 4 | edition = "2021"
 5 | 
 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 7 | 
 8 | [dependencies]
 9 | rust-htslib = {features = ["libdeflate", "static", "gcs", "s3"], version = "0.49.0"}
10 | syn = "2.0.87"
11 | idna = "1.0.0"
12 | 
13 | 
14 | libc = "*"
15 | rustc-hash = "1.1.0"
16 | lazy_static = "1.4.0"
17 | ndarray = "0.15.6"
18 | env_logger = "0.10.0"
19 | log = "0.4.17"
20 | itertools = "0.10.5"
21 | rayon = "1.7.0"
22 | linear-map = "1.2.0"
23 | rust-lapper = "1.1.0"
24 | flate2 = "1.0.25"
25 | bpci = "0.1.0"
26 | clap = {version ="4", features=["derive"]}
27 | regex = "1.11.1"
28 | mlua = { version = "0.10.3", features = ["luau", "send"] }
29 | anyhow = "1.0.98"
30 | #plotly = "0.8.3"
31 | #polars = { version = "0.27.2", features = ["lazy", "strings"] }
32 | 
33 | # Faster compilation profiles
34 | [profile.dev]
35 | # Reduce optimization for faster dev builds
36 | opt-level = 0
37 | # Keep incremental compilation enabled (default)
38 | incremental = true
39 | # Use more codegen units for parallel compilation
40 | codegen-units = 256
41 | 
42 | # Still optimize dependencies for better runtime performance in dev
43 | [profile.dev.package."*"]
44 | opt-level = 2  # Reduced from 3 to 2 for faster compilation
45 | 
46 | [profile.release]
47 | # Use "thin" LTO instead of "fat" for faster release builds
48 | lto = "thin"
49 | # Increase codegen units for faster compilation
50 | codegen-units = 16
51 | # Enable debug info for better profiling
52 | debug = 1
53 | 
54 | # Fast release profile for quicker testing
55 | [profile.release-fast]
56 | inherits = "release"
57 | lto = false
58 | codegen-units = 256
59 | debug = false
60 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Brent Pedersen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <!--- 
  2 | # static binary build
  3 | RUSTFLAGS="-C target-feature=+crt-static" cargo build --release  --target x86_64-unknown-linux-gnu
  4 | --->
  5 | # Fraguracy
  6 | 
  7 | [![Rust](https://github.com/brentp/fraguracy/actions/workflows/rust.yml/badge.svg)](https://github.com/brentp/fraguracy/actions/workflows/rust.yml)
  8 | 
  9 | `fraguracy` calculates real error rates using overlapping paired-end reads in a fragment.
 10 | It reports a file of error positions and counts, along with a summary of errors by context, read-position, read-orientation (F or R) and base-quality.
 11 | While the overlap requirment does limit to the (potentially) small percentage of bases that overlap this can
 12 | still be useful to:
 13 | 
 14 | 1. evaluate error rates within and among samples
 15 | 2. find sites in the genome with high error rates
 16 | 3. find data-driven cutoffs for allele fraction (`AF`) cutoffs in UMI or duplex sequencing.
 17 | 
 18 | # Usage
 19 | 
 20 | The `fraguracy` binary available in releases takes a bam or cram file and outputs error stats. The plotting is currently done via python.
 21 | 
 22 | ```
 23 | $ fraguracy extract \
 24 |     --bin-size 1 \
 25 |     --output-prefix fraguracy-$sample- \
 26 |     --fasta $reference \
 27 |     $sample.bam [.. *.bam] \
 28 | 
 29 | $ python plot.py fraguracy-$sample-consensus-counts.txt # writes read.html
 30 | 
 31 | $ head fraguracy-$sample-errors.bed # records base position of every error observed and count of errors at that site.
 32 | chrom start stop bq_bin count contexts
 33 | chr1 75822283 75822284 05-19 6 AC:4,AT:2
 34 | chr1 75822287 75822288 20-36 4 TC:4
 35 | chr1 75822287 75822288 37-59 3 TC:3
 36 | chr1 75822287 75822288 60+ 2 CA:2
 37 | chr1 75822341 75822342 05-19 2 TC:2
 38 | chr1 75822352 75822353 20-36 2 GT:2
 39 | chr1 75822360 75822361 20-36 2 AG:2
 40 | chr1 241850751 241850752 37-59 2 TC:2
 41 | chr1 241850752 241850753 20-36 2 TA:1,TC:1
 42 | ```
 43 | 
 44 | There is also an `$sample-indel-errors.bed` file that contains the columns:
 45 | 
 46 | ```
 47 | chrom   start   stop    count
 48 | ```
 49 | 
 50 | The errors files are useful to find **positions that are frequent errors** -- having count > 1 or with multiple bq_bins showing the same position.
 51 | 
 52 | If multiple samples are given (multiple bam files) then each sample is processed in parallel and $prefix-total-counts.txt and $prefix-total-errors.bed will
 53 | be created which sum all values for all samples.
 54 | 
 55 | The plot.py will create an interactive plot that looks like this:
 56 | 
 57 | ![frag-plot](https://user-images.githubusercontent.com/1739/225074861-7b5098d1-b5e9-4bab-8971-0a278f182aaa.png)
 58 | 
 59 | **NOTE** that depending on the goal it can be useful to run `fraguracy extract` once, then exclude sites that are very frequent errors and re-run,
 60 | this will prevent a small percentage of sites (often around homopolymers) from dominating the error profile.
 61 | 
 62 | ## CLI
 63 | 
 64 | ```
 65 | error profile pair overlaps in bam/cram
 66 | 
 67 | Usage: fraguracy extract [OPTIONS] [BAMS]...
 68 | 
 69 | Arguments:
 70 |   [BAMS]...  
 71 | 
 72 | Options:
 73 |   -f, --fasta <FASTA>
 74 |           fasta for use with crams and/or to use as 'truth'
 75 |   -o, --output-prefix <OUTPUT_PREFIX>
 76 |           prefix for output files [default: fraguracy-]
 77 |   -C, --chromosome <CHROMOSOME>
 78 |           restrict analysis to this chromosome
 79 |   -r, --regions <REGIONS>
 80 |           restrict analysis to the regions given in this BED file
 81 |   -e, --exclude-regions <EXCLUDE_REGIONS>
 82 |           exclude from analysis the regions given in this BED file
 83 |   -l, --lua-expression <LUA_EXPRESSION>
 84 |           optional lua expression to filter reads. returns true to skip read. e.g. 'return read.flags.secondary or read.flags.supplementary'.
 85 |   -m, --max-read-length <MAX_READ_LENGTH>
 86 |           indicate the maximum read length in the alignment file [default: 151]
 87 |   -b, --bin-size <BIN_SIZE>
 88 |           parition the read into chunks/bins of this size [default: 3]
 89 |   -Q, --min-mapping-quality <MIN_MAPPING_QUALITY>
 90 |           only consider pairs where both reads have this mapping-quality or higher (good to leave this high) [default: 50]
 91 |   -c, --ci <CI>
 92 |           method for confidence interval calculation (see rust bpci crate) [default: agresti-coull] [possible values: agresti-coull, wald, wilson]
 93 |   -n, --no-denominator
 94 |           do not calculate denominator. This can shorten runtime.
 95 |   -H, --homopolymer-regex <HOMOPOLYMER_REGEX>
 96 |           regex for homopolymer sequence to consider if denominator is calculated[default: A{3,}|C{3,}|G{3,}|T{3,}] [default: A{3,}|C{3,}|G{3,}|T{3,}]
 97 |   -t, --reference-as-truth
 98 |           use reference base as 'truth'
 99 |   -h, --help
100 |           Print help
101 | ```
102 | 
103 | ### Lua Expressions
104 | 
105 | The `extract` sub-command allows lua expressions with `-l` that indicate whether to skip a read. See [lua-api.md](lua-api.md) for a full description
106 | of how to use this.
107 | 
108 | ### Combine
109 | 
110 | `fraguracy extract` can also be run per-sample and then errors can be combined with `fraguracy combine-errors`:
111 | 
112 | ```
113 | Usage: fraguracy combine-errors [OPTIONS] --fai-path <FAI_PATH> [ERRORS]...
114 | 
115 | Arguments:
116 |   [ERRORS]...  path to error bed files from extract
117 | 
118 | Options:
119 |   -f, --fai-path <FAI_PATH>        path for to fai (not fasta) file
120 |   -o, --output-path <OUTPUT_PATH>  path for output bed file [default: fraguracy-combined-errors.bed]
121 |   -h, --help                       Print help
122 | ```
123 | 
124 | The output is a single file with the error counts from each sample summed. And an additional column indicating the
125 | number of samples that containing the error is reported.
126 | 
127 | > ⚠️ **Warning**: You must send either indel error files or snp error files, not both!
128 | 
129 | ## Bins
130 | 
131 | The aim is to create a model of errors. Many factors can be predictive of the likelihood of an error.
132 | The dimensionality is a consideration because if the data is too sparse, prediction is less reliable.
133 | Because we determine accuracy by the mapping, it is best to require a high mapping-quality.
134 | Therefore we limit to: **Base-Quality**, **Sequence Context**, **Read**, and **Position in Read**
135 | as described and binned below. With those binnings we have **189,720** possible combinations (5 *6* 2 *2* $read_length / $bin-size * 31 )
136 | 
137 | For each combination, while iterating over the bam, we store the number of errors and the number of total bases
138 | in each bin. These become, respectively, the numerator and denominator for the error-rate for that set of parameters.
139 | 
140 | ### Qualities (5)
141 | 
142 | Base-Qualities and Mapping Qualities will be binned to:
143 | 
144 | 0. 0-5
145 | 1. 6-19
146 | 2. 20 - 36,
147 | 3. 37 - 59,
148 | 4. 60+
149 | 
150 | This means that the quantized base-qualities from nova-seq (2, 12, 23 and 37) are each in separate bins.
151 | And other base-quality schemes are also paritioned sanely.
152 | 
153 | ### Sequence Context (6)
154 | 
155 | 0. C->A (G->T)
156 | 1. C->G (G->C)
157 | 2. C->T (G->A)
158 | 3. T->A (A->T)
159 | 4. T->C (A->G)
160 | 5. T->G (A->C)
161 | 
162 | ### Read (2)
163 | 
164 | Read 1 or Read 2
165 | 
166 | ### Read Position (50)
167 | 
168 | read position is simply divided by 3. so bins of 3 bases.
169 | 
170 | ### Homopolymer distance (30)
171 | 
172 | The errors are also partitioned by homopolymer distance up to +- 15. all errors beyond 15 are put in the 15 base bin
173 | 
174 | # vcfanno
175 | 
176 | To use the errors files with vcfanno:
177 | 
178 | ```
179 | bgzip fraguracy/fraguracy-19610X19-errors.bed
180 | tabix fraguracy/fraguracy-19610X19-errors.bed.gz
181 | 
182 | echo '
183 | [[annotation]]
184 | file="fraguracy/fraguracy-19610X19-errors.bed.gz"
185 | columns=[4, 5]
186 | names=["frag_bq_bin", "frag_errors"]
187 | ops=["first", "first"]
188 | ' > conf.toml
189 | 
190 | vcfanno conf.toml $vcf > annotated.vcf # annotated.vcf will have entries for `frag_bq_bin` and `frag_errors` where there was an error found that was also a variant in the VCF.
191 | ```
192 | 
193 | ## indel errors
194 | 
195 | A command like: `fraguracy extract -f $fasta -o $prefix $bam` will create the files needed to evaluate the indel error rate. To plot it, then use:
196 | 
197 | ```
198 | python scripts/analyze_indel_errors.py ${prefix}-indel-errors.bed.gz ${prefix}-counts.txt
199 | ```
200 | 
201 | Which will make a plot like this one:
202 | 
203 | ![Image](https://github.com/user-attachments/assets/35c10634-a54b-48a2-9c5f-2c6363def26f)
204 | 


--------------------------------------------------------------------------------
/analyze_indel_errors.md:
--------------------------------------------------------------------------------
  1 | # Indel Error Rate Analysis Summary
  2 | 
  3 | ## Overview
  4 | 
  5 | Successfully analyzed indel error rates from fraguracy output files using Python with polars for data processing and plotly for interactive visualization.
  6 | 
  7 | ## Data Processing Steps
  8 | 
  9 | ### 1. Indel Errors File Processing
 10 | 
 11 | - **Input**: `test_ovl-indel-errors.bed.gz` (compressed BED format)
 12 | - **Columns**: chrom, start, end, count, length, bq_bin, hp_dist
 13 | - **Processing**: Grouped by length, bq_bin, and hp_dist, then summed counts
 14 | - **Result**: 1,887 unique combinations with indel counts
 15 | 
 16 | ### 2. Counts File Processing  
 17 | 
 18 | - **Input**: `test_ovl-counts.txt` (tab-separated)
 19 | - **Columns**: read12, FR, bq_bin, read_pos, context, hp_dist, total_count, error_count, err_rate_lo, err_rate_hi
 20 | - **Processing**: Grouped by bq_bin and hp_dist, then summed total_count
 21 | - **Result**: 155 unique combinations with total counts
 22 | 
 23 | ### 3. Error Rate Calculation
 24 | 
 25 | - **Method**: Joined indel counts with total counts on bq_bin and hp_dist
 26 | - **Formula**: error_rate = indel_count / total_count
 27 | - **Final dataset**: 1,887 records with error rates
 28 | 
 29 | ## Results
 30 | 
 31 | ### Key Findings
 32 | 
 33 | - **Indel Lengths**: 66 different indel lengths observed (ranging from -47 to +28)
 34 | - **Error Rate Range**: From ~10^-9 to ~10^-5 (highly variable)
 35 | - **Homopolymer Distance Impact**: Clear relationship between hp_dist and error rates
 36 | - **Base Quality Impact**: Interactive filtering by BQ bins reveals quality-dependent error patterns (defaults to high-quality 37-59 range)
 37 | 
 38 | ### Summary Statistics by Indel Length
 39 | 
 40 | - **Deletions** (negative lengths): Generally lower error rates
 41 | - **Insertions** (positive lengths): More variable error rates  
 42 | - **Single base changes** (±1): Most common, moderate error rates
 43 | 
 44 | ## Visualization
 45 | 
 46 | - **Interactive Plot**: `indel_error_rates_by_hp_dist.html` (with hover data)
 47 | - **Static Plot**: `indel_error_rates_by_hp_dist.png`
 48 | - **X-axis**: Homopolymer distance (hp_dist)
 49 | - **Y-axis**: Indel error rate (log scale)
 50 | - **Colors**: Different indel length categories (aggregated)
 51 | - **Indel Length Aggregation**:
 52 |   - Individual values for -3, -2, -1, 1, 2, 3
 53 |   - Aggregated ">3" for all insertions > 3 bases
 54 |   - Aggregated "<-3" for all deletions > 3 bases
 55 | - **Features**:
 56 |   - Log scale for error rates
 57 |   - Interactive hover showing detailed information (indel count, total count, bq_bin)
 58 |   - Grid for easier reading
 59 |   - Legend with "Indel Length" title showing length categories
 60 |   - Modern plotly-based visualization
 61 |   - **Line connections (default)**: Points are connected by lines for each indel length category to show trends across hp_distance
 62 |   - **Optional scatter-only mode**: Use `--no-lines` to show scatter plot without line connections
 63 |   - **BQ Bin Filtering**: Interactive buttons to filter data by base quality bins (defaults to 37-59)
 64 |     - "All BQ Bins": Show all data
 65 |     - "BQ: 37-59 (default)": Show only high-quality base calls
 66 |     - Individual BQ bin buttons: "BQ: 05-19", "BQ: 20-36", etc.
 67 | 
 68 | ## Files Created
 69 | 
 70 | 1. `analyze_indel_errors.py` - Main analysis script
 71 | 2. `indel_error_rates_by_hp_dist.html` - Interactive plot
 72 | 3. `indel_error_rates_by_hp_dist.png` - Static plot  
 73 | 4. `analysis_summary.md` - This summary document
 74 | 
 75 | ## Usage
 76 | 
 77 | ### Command Line Interface
 78 | 
 79 | The script now uses argparse for flexible command line usage:
 80 | 
 81 | ```bash
 82 | # Basic usage (with lines - default)
 83 | python3 analyze_indel_errors.py indel_errors.bed.gz counts.txt
 84 | 
 85 | # Scatter plot only (no lines)
 86 | python3 analyze_indel_errors.py --no-lines indel_errors.bed.gz counts.txt
 87 | 
 88 | # With custom output prefix
 89 | python3 analyze_indel_errors.py --output-prefix my_analysis indel_errors.bed.gz counts.txt
 90 | 
 91 | # Combined options (scatter plot with custom output)
 92 | python3 analyze_indel_errors.py --no-lines --output-prefix scatter_analysis indel_errors.bed.gz counts.txt
 93 | 
 94 | # View help for all options
 95 | python3 analyze_indel_errors.py --help
 96 | ```
 97 | 
 98 | ### Options
 99 | 
100 | - `--no-lines`: Show scatter plot only, without connecting lines (default: show lines)
101 | - `--output-prefix`: Specify custom output file prefix (default: `indel_error_rates_by_hp_dist`)
102 | - `--help`: Show help message with usage examples
103 | 
104 | ### Arguments
105 | 
106 | 1. `indel_errors_file`: Input indel errors file (BED format, can be gzipped)
107 | 2. `counts_file`: Input counts file (tab-separated format)
108 | 
109 | The script processes the input files and generates both interactive (HTML) and static (PNG) plots along with summary statistics.
110 | 


--------------------------------------------------------------------------------
/cli-tests.sh:
--------------------------------------------------------------------------------
1 | cargo run combine-errors -f human_g1k_v38_decoy_phix.fasta.fai test-data/*.bed -o t.combined.bed
2 | 


--------------------------------------------------------------------------------
/lua-api.md:
--------------------------------------------------------------------------------
 1 | # Lua API Reference
 2 | 
 3 | This document describes the Lua API for filtering BAM records.
 4 | 
 5 | ## Flags
 6 | 
 7 | The `flags` object represents BAM flags returned from `read.flags` and provides the following fields:
 8 | 
 9 | - `paired`: Returns true if the read is paired
10 | - `proper_pair`: Returns true if the read is in a proper pair
11 | - `unmapped`: Returns true if the read is unmapped
12 | - `mate_unmapped`: Returns true if the mate is unmapped
13 | - `reverse`: Returns true if the read is on the reverse strand
14 | - `forward`: Returns true if the read is on the forward strand
15 | - `mate_reverse`: Returns true if the mate is on the reverse strand
16 | - `mate_forward`: Returns true if the mate is on the forward strand
17 | - `read_1`: Returns true if this is read 1
18 | - `read_2`: Returns true if this is read 2
19 | - `secondary`: Returns true if this is a secondary alignment
20 | - `primary`: Returns true if this is the primary alignment
21 | - `qcfail`: Returns true if the read fails quality checks
22 | - `duplicate`: Returns true if the read is a duplicate
23 | - `supplementary`: Returns true if this is a supplementary alignment
24 | - `flag`: Returns the raw integer flag value
25 | 
26 | ## Read
27 | 
28 | The `read` object provides access to BAM record data with the following fields and methods:
29 | 
30 | ### Fields
31 | 
32 | All of these are properties on the `read`, e.g. `read.mapping_quality`
33 | 
34 | - `mapping_quality`: Returns the mapping quality
35 | - `flags`: Returns a `Flags` object
36 | - `tid`: Returns the reference sequence ID
37 | - `start`: Returns the 0-based start position
38 | - `stop`: Returns the end position based on CIGAR
39 | - `length`: Returns the sequence length
40 | - `insert_size`: Returns the insert size
41 | - `qname`: Returns the query name as a string
42 | - `sequence`: Returns the read sequence as a string
43 | - `soft_clips_3_prime`: Returns the number of soft-clipped bases at the 3' end
44 | - `soft_clips_5_prime`: Returns the number of soft-clipped bases at the 5' end
45 | - `base_counts`: Returns a table with counts of A, C, G, T, N in the read
46 | - `n_proportion`: Returns the proportion of N bases in the read (see methods to limit to 3' or 5')
47 | - `indel_count`: Returns the number of indels in the read
48 | - `average_base_quality`: Returns the average base-quality in a read
49 | 
50 | ### Methods
51 | 
52 | - `read:tag(tag_name)`: Returns the value of the specified BAM tag
53 | - `read:n_proportion_3_prime(n:number)`: Returns the proportion of N bases within `n` of the 3' end of the read
54 | - `read:n_proportion_5_prime(n:number)`: Returns the proportion of N bases within `n` of the 5' end of the read
55 | 
56 | ### Not Implemented
57 | 
58 | The following are not implemented as they require a per-base approach
59 | which is not used in fraguracy.
60 | 
61 | - `bq` [NO]: Returns the base quality at the current position
62 | - `distance_from_5prime` [NO]: Returns the distance from the 5' end of the read
63 | - `distance_from_3prime` [NO]: Returns the distance from the 3' end of the read
64 | - `qpos()` [NO]: Returns the query position
65 | 
66 | ## Usage Example
67 | 
68 | ```lua
69 | -- skip reads with mapping quality >= 20 and not supplementary and where the proportion of N's in the last 10 bases is > 0.1
70 | return read.mapping_quality >= 20 and not read.flags.supplementary and read:n_proportion_3_prime(10) > 0.1
71 | ```
72 | 


--------------------------------------------------------------------------------
/plot.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import plotly.graph_objects as go
 3 | from plotly.subplots import make_subplots
 4 | import plotly
 5 | import polars as pl
 6 | import numpy as np
 7 | 
 8 | 
 9 | df = pl.read_csv(sys.argv[1], sep='\t')
10 | 
11 | qual_bin = "20-39"
12 | qual_bin = "60+"
13 | 
14 | r1 = df.filter((pl.col("read12") == "r1") & (pl.col("FR") == "f") & (
15 |     pl.col('bq_bin') == qual_bin) & (pl.col('total_count') > 0))
16 | r2 = df.filter((pl.col("read12") == "r2") & (pl.col("FR") == "r") & (
17 |     pl.col('bq_bin') == qual_bin) & (pl.col('total_count') > 0))
18 | 
19 | print(r1.shape, r2.shape)
20 | 
21 | contexts = list(sorted(r1['context'].unique(), reverse=False))
22 | print(contexts)
23 | 
24 | r1 = r1.with_columns([
25 |     (((pl.col('error_count') + 0) / (1 + pl.col('total_count')))).alias('rate')])
26 | r2 = r2.with_columns([
27 |     (((pl.col('error_count') + 0) / (1 + pl.col('total_count')))).alias('rate')])
28 | 
29 | r1_rate = r1['error_count'].sum() / (r1['total_count'].sum() / 3) * 1_000_000
30 | r2_rate = r2['error_count'].sum() / (r2['total_count'].sum() / 3) * 1_000_000
31 | 
32 | cols = plotly.colors.DEFAULT_PLOTLY_COLORS
33 | 
34 | # Create figure with secondary y-axis
35 | fig = make_subplots(
36 |     rows=2, subplot_titles=[f"read1(F) errors  per million read-bases: {r1_rate:.3f}", f"read2(R) errors per million read-bases: {r2_rate: 3f}"],
37 |     vertical_spacing=0.1,
38 | )
39 | 
40 | for i, ctx in enumerate(contexts):
41 |     sub1 = r1.filter(pl.col('context') == ctx)
42 |     sub2 = r2.filter(pl.col('context') == ctx)
43 | 
44 |     rate1 = sub1['error_count'].sum() / sub1['total_count'].sum() * 1_000_000
45 |     rate2 = sub2['error_count'].sum() / sub2['total_count'].sum() * 1_000_000
46 | 
47 |     t1 = go.Scatter(name=f'{ctx}', x=np.array(sub1["read_pos"]), y=(1_000_000 * np.array(
48 |         sub1['rate'])),
49 |         hovertemplate="rate/Mb: %{y:.2g}  <i>errors</i>:%{text}",
50 |         text=[f'<b>{c}</b> of {n:,}' for c,
51 |               n in zip(sub1["error_count"], sub1['total_count'])],
52 |         line=dict(color=cols[i]))
53 |     t2 = go.Scatter(name=f'{ctx}', x=np.array(sub2["read_pos"]), y=(1_000_000 * np.array(
54 |         sub2['rate'])),
55 |         hovertemplate="rate/Mb: %{y:.2g}  <i>errors</i>:%{text}",
56 |         text=[f'<b>{c}</b> of {n:,}' for c,
57 |               n in zip(sub2["error_count"], sub2['total_count'])],
58 |         line=dict(color=cols[i]), showlegend=False)
59 |     fig.add_trace(t1, row=1, col=1)
60 |     fig.add_trace(t2, row=2, col=1)
61 | 
62 | # fig.update_layout(barmode='stack')
63 | fig.update_layout(hovermode='x unified')
64 | fig.update_xaxes(title_text="relative read position")
65 | # fig.update_layout(legend_traceorder="reversed")
66 | 
67 | # fig.update_layout(title_text="error-rate along a read")
68 | 
69 | fig.update_yaxes(title_text="errors per million read bases")
70 | # fig.update_layout(yaxis_tickformat = '%g')
71 | 
72 | 
73 | fig.write_html("read.html")
74 | 


--------------------------------------------------------------------------------
/scripts/analyze_indel_errors.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Analyze indel error rates from fraguracy output files.
  4 | """
  5 | 
  6 | import polars as pl
  7 | import plotly.express as px
  8 | import plotly.graph_objects as go
  9 | from plotly.subplots import make_subplots
 10 | import numpy as np
 11 | import gzip
 12 | import argparse
 13 | from pathlib import Path
 14 | 
 15 | def read_indel_errors(filepath):
 16 |     """Read indel errors file and sum counts by length, bq_bin, hp_dist"""
 17 |     
 18 |     # Read the file and skip comment lines but keep header
 19 |     if filepath.endswith('.gz'):
 20 |         with gzip.open(filepath, 'rt') as f:
 21 |             lines = f.readlines()
 22 |     else:
 23 |         with open(filepath, 'r') as f:
 24 |             lines = f.readlines()
 25 |     
 26 |     # Find header line (starts with #)
 27 |     header_line = None
 28 |     data_lines = []
 29 |     for line in lines:
 30 |         # Ensure line is a string
 31 |         if isinstance(line, bytes):
 32 |             line_str = line.decode('utf-8')
 33 |         else:
 34 |             line_str = str(line)
 35 |         
 36 |         if line_str.startswith('#'):
 37 |             header_line = line_str[1:].strip()  # Remove # and whitespace
 38 |         else:
 39 |             data_lines.append(line_str.strip())
 40 |     
 41 |     # Create a temporary file-like object with header and data
 42 |     import io
 43 |     if header_line is None:
 44 |         raise ValueError("No header line found in input file")
 45 |     csv_content = str(header_line) + '\n' + '\n'.join(data_lines)
 46 |     
 47 |     df = pl.read_csv(io.StringIO(csv_content), separator='\t', ignore_errors=True, 
 48 |                    null_values=['NA', 'N/A', ''], infer_schema_length=10000)
 49 |     
 50 |     # Filter out any rows with null values in key columns
 51 |     df = df.filter(
 52 |         pl.col('length').is_not_null() & 
 53 |         pl.col('bq_bin').is_not_null() & 
 54 |         pl.col('hp_dist').is_not_null() & 
 55 |         pl.col('count').is_not_null()
 56 |     )
 57 |     
 58 |     # Group by length, bq_bin, hp_dist and sum counts
 59 |     indel_grouped = df.group_by(['length', 'bq_bin', 'hp_dist']).agg([
 60 |         pl.col('count').sum().alias('indel_count')
 61 |     ])
 62 |     
 63 |     return indel_grouped
 64 | 
 65 | def read_counts_file(filepath):
 66 |     """Read counts file and sum total_count by bq_bin, hp_dist"""
 67 |     
 68 |     df = pl.read_csv(filepath, separator='\t', ignore_errors=True, 
 69 |                    null_values=['NA', 'N/A', ''], infer_schema_length=10000)
 70 |     
 71 |     # Filter out any rows with null values in key columns
 72 |     df = df.filter(
 73 |         pl.col('bq_bin').is_not_null() & 
 74 |         pl.col('hp_dist').is_not_null() & 
 75 |         pl.col('total_count').is_not_null()
 76 |     )
 77 |     
 78 |     # Group by bq_bin, hp_dist and sum total_count
 79 |     counts_grouped = df.group_by(['bq_bin', 'hp_dist']).agg([
 80 |         pl.col('total_count').sum().alias('total_count')
 81 |     ])
 82 |     
 83 |     return counts_grouped
 84 | 
 85 | def calculate_error_rates(indel_df, counts_df):
 86 |     """Calculate indel error rates by joining indel and total counts"""
 87 |     
 88 |     # Join the dataframes on bq_bin and hp_dist
 89 |     merged = indel_df.join(counts_df, on=['bq_bin', 'hp_dist'], how='inner')
 90 |     
 91 |     # Calculate error rate
 92 |     merged = merged.with_columns([
 93 |         (pl.col('indel_count') / pl.col('total_count')).alias('error_rate')
 94 |     ])
 95 |     
 96 |     return merged
 97 | 
 98 | def create_plot(df, connect_lines=False):
 99 |     """Create interactive subplot with hp_dist vs error rate and indel length vs error rate"""
100 |     
101 |     # Aggregate indel lengths: group lengths > 3 and < -3 into single categories
102 |     df_plot = df.with_columns([
103 |         pl.when(pl.col('length') > 3)
104 |         .then(pl.lit('>3'))
105 |         .when(pl.col('length') < -3)
106 |         .then(pl.lit('<-3'))
107 |         .otherwise(pl.col('length').cast(pl.Utf8))
108 |         .alias('length_category')
109 |     ])
110 |     
111 |     # Data for first subplot: aggregate by length_category, bq_bin, hp_dist
112 |     df_hp_plot = df_plot.group_by(['length_category', 'bq_bin', 'hp_dist']).agg([
113 |         pl.col('indel_count').sum().alias('indel_count'),
114 |         pl.col('total_count').first().alias('total_count')  # total_count should be the same for same bq_bin/hp_dist
115 |     ]).with_columns([
116 |         (pl.col('indel_count') / pl.col('total_count')).alias('error_rate')
117 |     ])
118 |     
119 |     # Data for second subplot: aggregate by length_category and bq_bin only (sum across all hp_dists)
120 |     df_length_plot = df_plot.group_by(['length_category', 'bq_bin']).agg([
121 |         pl.col('indel_count').sum().alias('indel_count'),
122 |         pl.col('total_count').sum().alias('total_count')  # sum total_count across hp_dists
123 |     ]).with_columns([
124 |         (pl.col('indel_count') / pl.col('total_count')).alias('error_rate')
125 |     ])
126 |     
127 |     # Get unique values for filtering
128 |     unique_bq_bins = sorted(df_hp_plot.select('bq_bin').unique().to_numpy().flatten())
129 |     
130 |     # Sort length categories in logical numerical order
131 |     def sort_length_categories(categories):
132 |         """Sort length categories in logical order: <-3, -3, -2, -1, 1, 2, 3, >3"""
133 |         def category_sort_key(cat):
134 |             if cat == '<-3':
135 |                 return -1000  # Sort first
136 |             elif cat == '>3':
137 |                 return 1000   # Sort last
138 |             else:
139 |                 return int(cat)  # Sort numerically for individual lengths
140 |         
141 |         return sorted(categories, key=category_sort_key)
142 |     
143 |     unique_categories = sort_length_categories(df_hp_plot.select('length_category').unique().to_numpy().flatten())
144 |     
145 |     print(f"Available BQ bins: {unique_bq_bins}")
146 |     print(f"Length categories: {unique_categories}")
147 |     
148 |     # Create subplot figure
149 |     fig = make_subplots(
150 |         rows=2, cols=1,
151 |         subplot_titles=('Error Rate by Homopolymer Distance', 'Error Rate by Indel Length'),
152 |         vertical_spacing=0.15
153 |     )
154 |     
155 |     # Color palette
156 |     colors = px.colors.qualitative.Set1
157 |     
158 |     # Create traces for each combination of length_category and bq_bin
159 |     trace_info = []
160 |     
161 |     # First subplot: HP Distance vs Error Rate
162 |     for i, category in enumerate(unique_categories):
163 |         for j, bq_bin in enumerate(unique_bq_bins):
164 |             # Filter data for this combination
165 |             df_subset = df_hp_plot.filter(
166 |                 (pl.col('length_category') == category) & 
167 |                 (pl.col('bq_bin') == bq_bin)
168 |             )
169 |             
170 |             if df_subset.height == 0:
171 |                 continue  # Skip empty combinations
172 |             
173 |             # Convert to numpy arrays
174 |             x_data = df_subset.select('hp_dist').to_numpy().flatten()
175 |             y_data = df_subset.select('error_rate').to_numpy().flatten()
176 |             indel_count_data = df_subset.select('indel_count').to_numpy().flatten()
177 |             total_count_data = df_subset.select('total_count').to_numpy().flatten()
178 |             
179 |             # Sort by hp_dist for proper line connection
180 |             if connect_lines and len(x_data) > 1:
181 |                 sort_idx = np.argsort(x_data)
182 |                 x_data = x_data[sort_idx]
183 |                 y_data = y_data[sort_idx]
184 |                 indel_count_data = indel_count_data[sort_idx]
185 |                 total_count_data = total_count_data[sort_idx]
186 |                 mode = 'markers+lines'
187 |             else:
188 |                 mode = 'markers'
189 |             
190 |             # Determine visibility (default to 37-59 only)
191 |             visible = True if bq_bin == '37-59' else False
192 |             
193 |             trace_name = f'{category} (BQ: {bq_bin})'
194 |             
195 |             fig.add_trace(go.Scatter(
196 |                 x=x_data,
197 |                 y=y_data,
198 |                 mode=mode,
199 |                 name=trace_name,
200 |                 visible=visible,
201 |                 legendgroup=category,  # Group traces by category for consistent legend
202 |                 marker=dict(
203 |                     color=colors[i % len(colors)],
204 |                     size=6,
205 |                     opacity=0.7
206 |                 ),
207 |                 line=dict(
208 |                     color=colors[i % len(colors)],
209 |                     width=2
210 |                 ) if connect_lines else None,
211 |                 customdata=np.column_stack((
212 |                     indel_count_data,
213 |                     total_count_data,
214 |                     [bq_bin] * len(x_data)
215 |                 )),
216 |                 hovertemplate=(
217 |                     '<b>Indel Length:</b> ' + category + '<br>' +
218 |                     '<b>BQ Bin:</b> ' + bq_bin + '<br>' +
219 |                     '<b>HP Distance:</b> %{x}<br>' +
220 |                     '<b>Error Rate:</b> %{y:.2e}<br>' +
221 |                     '<b>Indel Count:</b> %{customdata[0]}<br>' +
222 |                     '<b>Total Count:</b> %{customdata[1]}<br>' +
223 |                     '<extra></extra>'
224 |                 )
225 |             ), row=1, col=1)
226 |             
227 |             trace_info.append({
228 |                 'bq_bin': bq_bin,
229 |                 'category': category,
230 |                 'trace_idx': len(list(fig.data)) - 1,
231 |                 'subplot': 'hp_dist'
232 |             })
233 |     
234 |     # Second subplot: Indel Length vs Error Rate  
235 |     for i, category in enumerate(unique_categories):
236 |         for j, bq_bin in enumerate(unique_bq_bins):
237 |             # Filter data for this combination
238 |             df_subset = df_length_plot.filter(
239 |                 (pl.col('length_category') == category) & 
240 |                 (pl.col('bq_bin') == bq_bin)
241 |             )
242 |             
243 |             if df_subset.height == 0:
244 |                 continue  # Skip empty combinations
245 |             
246 |             # Convert to numpy arrays
247 |             error_rate = df_subset.select('error_rate').to_numpy().flatten()[0]
248 |             indel_count = df_subset.select('indel_count').to_numpy().flatten()[0]
249 |             total_count = df_subset.select('total_count').to_numpy().flatten()[0]
250 |             
251 |             # Determine visibility (default to 37-59 only)
252 |             visible = True if bq_bin == '37-59' else False
253 |             
254 |             # For x-axis position, convert category to numeric value for plotting
255 |             if category == '<-3':
256 |                 x_pos = -4
257 |             elif category == '>3':
258 |                 x_pos = 4
259 |             else:
260 |                 x_pos = int(category)
261 |             
262 |             trace_name = f'{category} (BQ: {bq_bin})'
263 |             
264 |             fig.add_trace(go.Scatter(
265 |                 x=[x_pos],
266 |                 y=[error_rate],
267 |                 mode='markers',
268 |                 name=trace_name,  # Use same name to group in legend
269 |                 visible=visible,
270 |                 legendgroup=category,  # Group with first subplot traces
271 |                 showlegend=False,  # Don't show duplicate legend entries
272 |                 marker=dict(
273 |                     color=colors[i % len(colors)],
274 |                     size=8,
275 |                     opacity=0.7
276 |                 ),
277 |                 customdata=np.array([[indel_count, total_count, bq_bin]]),
278 |                 hovertemplate=(
279 |                     '<b>Indel Length:</b> ' + category + '<br>' +
280 |                     '<b>BQ Bin:</b> ' + bq_bin + '<br>' +
281 |                     '<b>Error Rate:</b> %{y:.2e}<br>' +
282 |                     '<b>Indel Count:</b> %{customdata[0]}<br>' +
283 |                     '<b>Total Count:</b> %{customdata[1]}<br>' +
284 |                     '<extra></extra>'
285 |                 )
286 |             ), row=2, col=1)
287 |             
288 |             trace_info.append({
289 |                 'bq_bin': bq_bin,
290 |                 'category': category,
291 |                 'trace_idx': len(list(fig.data)) - 1,
292 |                 'subplot': 'length'
293 |             })
294 |     
295 |     # Create buttons for BQ bin selection
296 |     buttons = []
297 |     
298 |     # Add "All" button
299 |     all_visible = [True] * len(list(fig.data))
300 |     buttons.append(dict(
301 |         label="All BQ Bins",
302 |         method="update",
303 |         args=[{"visible": all_visible}]
304 |     ))
305 |     
306 |     # Add individual BQ bin buttons
307 |     for bq_bin in unique_bq_bins:
308 |         visible_list = []
309 |         for trace in trace_info:
310 |             visible_list.append(trace['bq_bin'] == bq_bin)
311 |         
312 |         # Mark 37-59 as default
313 |         label = f"BQ: {bq_bin} (default)" if bq_bin == '37-59' else f"BQ: {bq_bin}"
314 |         
315 |         buttons.append(dict(
316 |             label=label,
317 |             method="update", 
318 |             args=[{"visible": visible_list}]
319 |         ))
320 |     
321 |     # Update layout with BQ bin selector
322 |     fig.update_layout(
323 |         title='Indel Error Rate Analysis',
324 |         width=1000,
325 |         height=900,
326 |         template='plotly_white',
327 |         legend=dict(
328 |             title="Indel Length",
329 |             orientation="v",
330 |             yanchor="top",
331 |             y=1,
332 |             xanchor="left",
333 |             x=1.02
334 |         ),
335 |         updatemenus=[
336 |             dict(
337 |                 type="buttons",
338 |                 direction="left",
339 |                 buttons=buttons,
340 |                 pad={"r": 10, "t": 10},
341 |                 showactive=True,
342 |                 x=0.01,
343 |                 xanchor="left",
344 |                 y=1.02,
345 |                 yanchor="top"
346 |             ),
347 |         ],
348 |         
349 |         #annotations=[
350 |         #    dict(text="BQ Bin Filter:", showarrow=False,
351 |         #         x=0.01, y=1.02, yref="paper", align="left", 
352 |         #         font=dict(size=11, color="black"))
353 |         #]
354 |     )
355 |     
356 |     # Update subplot axes
357 |     fig.update_xaxes(title_text="Homopolymer Distance (hp_dist)", showgrid=True, gridwidth=1, gridcolor='lightgray', row=1, col=1)
358 |     fig.update_yaxes(title_text="Indel Error Rate", type='log', showgrid=True, gridwidth=1, gridcolor='lightgray', row=1, col=1)
359 |     
360 |     fig.update_xaxes(title_text="Indel Length", showgrid=True, gridwidth=1, gridcolor='lightgray', row=2, col=1)
361 |     fig.update_yaxes(title_text="Indel Error Rate", type='log', showgrid=True, gridwidth=1, gridcolor='lightgray', row=2, col=1)
362 |     
363 |     # Set custom x-axis labels for second subplot
364 |     fig.update_xaxes(
365 |         tickvals=[-4, -3, -2, -1, 1, 2, 3, 4],
366 |         ticktext=['<-3', '-3', '-2', '-1', '1', '2', '3', '>3'],
367 |         row=2, col=1
368 |     )
369 |     
370 |     return fig
371 | 
372 | def parse_arguments():
373 |     """Parse command line arguments"""
374 |     parser = argparse.ArgumentParser(
375 |         description='Analyze indel error rates from fraguracy output files',
376 |         formatter_class=argparse.RawDescriptionHelpFormatter,
377 |         epilog="""
378 | Examples:
379 |   python3 analyze_indel_errors.py indel_errors.bed.gz counts.txt
380 |   python3 analyze_indel_errors.py --no-lines indel_errors.bed.gz counts.txt
381 |         """
382 |     )
383 |     
384 |     parser.add_argument(
385 |         'indel_errors_file',
386 |         help='Input indel errors file (BED format, can be gzipped)'
387 |     )
388 |     
389 |     parser.add_argument(
390 |         'counts_file', 
391 |         help='Input counts file (tab-separated format)'
392 |     )
393 |     
394 |     parser.add_argument(
395 |         '--no-lines',
396 |         action='store_false',
397 |         dest='lines',
398 |         help='Show scatter plot only, without connecting lines (default: show lines)'
399 |     )
400 |     
401 |     parser.add_argument(
402 |         '--output-prefix',
403 |         default='indel_error_rates_by_hp_dist',
404 |         help='Output file prefix (default: indel_error_rates_by_hp_dist)'
405 |     )
406 |     
407 |     return parser.parse_args()
408 | 
409 | def main():
410 |     """Main analysis function"""
411 |     
412 |     args = parse_arguments()
413 |     
414 |     print(f"Reading indel errors file: {args.indel_errors_file}")
415 |     indel_df = read_indel_errors(args.indel_errors_file)
416 |     print(f"Indel errors shape: {indel_df.shape}")
417 |     print("Indel errors preview:")
418 |     print(indel_df.head())
419 |     
420 |     print(f"\nReading counts file: {args.counts_file}")
421 |     counts_df = read_counts_file(args.counts_file)
422 |     print(f"Counts shape: {counts_df.shape}")
423 |     print("Counts preview:")
424 |     print(counts_df.head())
425 |     
426 |     print("\nCalculating error rates...")
427 |     error_rates_df = calculate_error_rates(indel_df, counts_df)
428 |     print(f"Error rates shape: {error_rates_df.shape}")
429 |     print("Error rates preview:")
430 |     print(error_rates_df.head())
431 |     
432 |     print(f"\nCreating plot{' with connected lines' if args.lines else ' (scatter plot only)'}...")
433 |     fig = create_plot(error_rates_df, connect_lines=args.lines)
434 |     
435 |     # Save the plot as HTML for interactivity
436 |     html_output = f"{args.output_prefix}.html"
437 |     png_output = f"{args.output_prefix}.png"
438 |     
439 |     fig.write_html(html_output)
440 |     print(f"Interactive plot saved as '{html_output}'")
441 |     
442 |     # Try to save PNG (requires kaleido)
443 |     try:
444 |         fig.write_image(png_output, width=1000, height=600)
445 |         print(f"Static plot also saved as '{png_output}'")
446 |     except Exception as e:
447 |         print(f"Could not save PNG (install kaleido for PNG export): {e}")
448 |     
449 |     # Show summary statistics with aggregated lengths
450 |     print("\nSummary statistics (with length aggregation):")
451 |     summary_df = error_rates_df.with_columns([
452 |         pl.when(pl.col('length') > 3)
453 |         .then(pl.lit('>3'))
454 |         .when(pl.col('length') < -3)
455 |         .then(pl.lit('<-3'))
456 |         .otherwise(pl.col('length').cast(pl.Utf8))
457 |         .alias('length_category')
458 |     ])
459 |     
460 |     summary = summary_df.group_by('length_category').agg([
461 |         pl.col('error_rate').mean().alias('mean_error_rate'),
462 |         pl.col('error_rate').std().alias('std_error_rate'),
463 |         pl.col('indel_count').sum().alias('total_indel_count'),
464 |         pl.col('total_count').sum().alias('total_count')
465 |     ]).sort('length_category')
466 |     print(summary)
467 |     
468 |     # Show the plot
469 |     fig.show()
470 | 
471 | if __name__ == "__main__":
472 |     main() 


--------------------------------------------------------------------------------
/scripts/count-errors.py:
--------------------------------------------------------------------------------
 1 | import defopt
 2 | import cyvcf2
 3 | from collections import defaultdict
 4 | 
 5 | 
 6 | def main(vcf_file: str, *, prefix: str = "error-counts"):
 7 |     """
 8 |     count errors (de novos) in a VCF file split by SNP/indel and by sample
 9 |     this expects particular annotations added for paper analyses
10 | 
11 |     :param vcf_file: path to VCF file
12 |     :param prefix: prefix for output files
13 |     """
14 |     d = {'snp': defaultdict(lambda: defaultdict(int)),
15 |          'indel': defaultdict(lambda: defaultdict(int))}
16 | 
17 |     lcr_d = {'snp': defaultdict(lambda: [0, 0]), 'indel': defaultdict(lambda: [0, 0])}
18 | 
19 |     counts_by_sample = defaultdict(int)
20 | 
21 |     vcf = cyvcf2.VCF(vcf_file)
22 |     for variant in vcf:
23 |         if variant.FILTER != 'PASS' and variant.FILTER is not None:
24 |             continue
25 | 
26 |         error_samples = variant.INFO.get('dn').split(',')
27 |         frag_count = int(variant.INFO.get('fraguracy_count', 0))
28 |         lcr = bool(variant.INFO.get('LCR', 0))
29 |         #frag_count = variant.INFO.get('fraguracy_samples', 0)
30 |         snp = 'snp' if variant.is_snp else 'indel'
31 |         for s in error_samples:
32 |             d[snp][s][frag_count] += 1
33 |             lcr_d[snp][s][lcr] += 1
34 |             if snp == 'snp':
35 |                 counts_by_sample[s] += 1
36 | 
37 |     skip_samples = {s for s, cnt in counts_by_sample.items() if cnt > 1500}
38 |     print({s: counts_by_sample[s] for s in skip_samples})
39 | 
40 |     max_count = 50
41 |     # now print out the snp and indel dictionaries to separate files
42 |     for snp in ['snp', 'indel']:
43 |         with open(f'{prefix}.{snp}.errors', 'w') as f:
44 |             f.write("sample\tsnp\tcutoff\tcount_at_or_above\tcount\n")
45 |             for sample in d[snp]:
46 |                 if sample in skip_samples: continue
47 |                 # we want to count all the errors
48 |                 cur_keys = [c for c in d[snp][sample].keys() if c >= max_count]
49 |                 cum_sum = sum([d[snp][sample][c] for c in cur_keys])
50 |                 f.write(f'{sample}\t{snp}\t{max_count}\t{cum_sum}\t{cum_sum}\n')
51 |                 for i in range(max_count - 1, -1, -1):
52 |                     cnt = d[snp][sample].get(i, 0)
53 |                     cum_sum += cnt
54 |                     f.write(f'{sample}\t{snp}\t{i}\t{cum_sum}\t{cnt}\n')
55 |                 if snp == 'snp':
56 |                     assert(cum_sum == counts_by_sample[sample])
57 | 
58 |         with open(f'{prefix}.{snp}.lcr', 'w') as f:
59 |             f.write("sample\tsnp\tlcr\tcount\n")
60 |             for sample in lcr_d[snp]:
61 |                 if sample in skip_samples: continue
62 |                 # we want to count all the errors
63 |                 f.write(f'{sample}\t{snp}\ttrue\t{lcr_d[snp][sample][True]}\n')
64 |                 f.write(f'{sample}\t{snp}\tfalse\t{lcr_d[snp][sample][False]}\n')
65 | 
66 | if __name__ == '__main__':
67 |     defopt.run(main)
68 | 


--------------------------------------------------------------------------------
/src/combine_counts.rs:
--------------------------------------------------------------------------------
  1 | use crate::fraguracy;
  2 | use std::io;
  3 | use std::io::{BufRead, Write};
  4 | use std::path::PathBuf;
  5 | use std::string::String;
  6 | 
  7 | #[derive(Hash, Debug, PartialOrd, PartialEq, Ord, Eq, Clone)]
  8 | pub(crate) struct Count {
  9 |     read12: u8,
 10 |     orientation: u8,
 11 |     read_pos: u32,
 12 |     bq_bin: u8,
 13 |     context: [char; 2],
 14 |     homopolymer_dist: i8,
 15 |     total: u32,
 16 |     errors: u32,
 17 | }
 18 | 
 19 | impl std::ops::AddAssign<Count> for Count {
 20 |     fn add_assign(&mut self, o: Count) {
 21 |         assert!(self.read12 == o.read12);
 22 |         assert!(self.orientation == o.orientation);
 23 |         assert!(self.read_pos == o.read_pos);
 24 |         assert!(self.bq_bin == o.bq_bin);
 25 |         assert!(self.context == o.context);
 26 |         assert!(self.homopolymer_dist == o.homopolymer_dist);
 27 |         self.errors += o.errors;
 28 |         self.total += o.total;
 29 |     }
 30 | }
 31 | 
 32 | impl Count {
 33 |     fn from_line(s: &str, file_name: &str) -> Count {
 34 |         let mut sp = s.trim().split('\t');
 35 |         Count {
 36 |             read12: sp.next().unwrap_or_else(|| panic!("not enough columns in line: {s} from file: {file_name}"))[1..]
 37 |                 .parse::<u8>()
 38 |                 .map(|val| val - 1)
 39 |                 .unwrap_or_else(|e| panic!("error parsing read12 from line: {s} in file: {file_name}, error: {e}")),
 40 |             orientation: match sp.next() {
 41 |                 Some("f") => 0,
 42 |                 Some("r") => 1,
 43 |                 _ => panic!("error parsing orientation, expected f or r, in line: {s} from file: {file_name}"),
 44 |             },
 45 |             bq_bin: fraguracy::REVERSE_Q_LOOKUP[sp.next().unwrap_or_else(|| panic!("not enough columns for bq_bin in line: {s} from file: {file_name}"))],
 46 |             read_pos: sp
 47 |                 .next()
 48 |                 .unwrap_or_else(|| panic!("not enough columns for read_pos in line: {s} from file: {file_name}"))
 49 |                 .parse::<u32>()
 50 |                 .unwrap_or_else(|e| panic!("error parsing read_pos from line: {s} in file: {file_name}, error: {e}")),
 51 |             context: {
 52 |                 let ctx_str = sp
 53 |                     .next()
 54 |                     .unwrap_or_else(|| panic!("error getting context string from line: {s} from file: {file_name}"));
 55 |                 let mut ctx_chars = ctx_str.chars();
 56 |                 [
 57 |                     ctx_chars.next().unwrap_or_else(|| panic!("expecting two characters for context, got: {ctx_str} in line: {s} from file: {file_name}")),
 58 |                     ctx_chars.next().unwrap_or_else(|| panic!("expecting two characters for context, got: {ctx_str} in line: {s} from file: {file_name}")),
 59 |                 ]
 60 |             },
 61 |             homopolymer_dist: {
 62 |                 let val = sp
 63 |                     .next()
 64 |                     .unwrap_or_else(|| panic!("not enough columns for homopolymer_dist in line: {s} from file: {file_name}"))
 65 |                     .trim();
 66 |                 if val == "NA" {
 67 |                     i8::MAX
 68 |                 } else {
 69 |                     val.parse::<i8>()
 70 |                         .unwrap_or_else(|e| panic!("error parsing homopolymer_dist from line: {s} in file: {file_name}, error: {e}"))
 71 |                 }
 72 |             },
 73 |             total: sp
 74 |                 .next()
 75 |                 .unwrap_or_else(|| panic!("not enough columns for total in line: {s} from file: {file_name}"))
 76 |                 .parse::<u32>()
 77 |                 .unwrap_or_else(|e| panic!("error parsing total from line: {s} in file: {file_name}, error: {e}")),
 78 |             errors: sp
 79 |                 .next()
 80 |                 .unwrap_or_else(|| panic!("not enough columns for errors in line: {s} from file: {file_name}"))
 81 |                 .trim()
 82 |                 .parse::<u32>()
 83 |                 .unwrap_or_else(|e| panic!("error parsing errors from line: {s} in file: {file_name}, error: {e}")),
 84 |         }
 85 |     }
 86 | }
 87 | 
 88 | pub(crate) fn combine_counts_main(
 89 |     counts_files: Vec<PathBuf>,
 90 |     output_path: String,
 91 | ) -> io::Result<()> {
 92 |     let mut counts: std::collections::HashSet<Count> = std::collections::HashSet::new();
 93 |     let mut header: String = String::new();
 94 |     for count_file in counts_files.iter() {
 95 |         // open each file and read each line.
 96 |         let file = std::fs::File::open(count_file)?;
 97 |         let reader = std::io::BufReader::new(file);
 98 |         for (i, line) in reader.lines().enumerate() {
 99 |             let line = line?;
100 |             if i == 0 {
101 |                 assert!(
102 |                     line.starts_with("read12"),
103 |                     "expecting header line from counts file"
104 |                 );
105 |                 assert!(
106 |                     line.contains("hp_dist"),
107 |                     "expecting hp_dist in header please run with newer version of fraguracy"
108 |                 );
109 |                 // take the first 8 columns as the header
110 |                 header = line.split('\t').take(8).collect::<Vec<_>>().join("\t");
111 |                 continue;
112 |             }
113 |             let mut c = Count::from_line(&line, count_file.to_str().unwrap());
114 |             let entry = counts.take(&c);
115 |             if let Some(entry) = entry {
116 |                 c.total += entry.total;
117 |                 c.errors += entry.errors;
118 |             }
119 |             counts.insert(c);
120 |         }
121 |     }
122 | 
123 |     let mut out = std::fs::File::create(output_path)?;
124 |     writeln!(out, "{}", header)?;
125 | 
126 |     let mut counts: Vec<Count> = counts.into_iter().collect();
127 |     counts.sort();
128 |     for c in counts.iter() {
129 |         writeln!(
130 |             out,
131 |             "r{}\t{}\t{}\t{}\t{}{}\t{}\t{}\t{}",
132 |             c.read12 + 1,
133 |             ['f', 'r'][c.orientation as usize],
134 |             fraguracy::Q_LOOKUP[c.bq_bin as usize],
135 |             c.read_pos,
136 |             c.context[0],
137 |             c.context[1],
138 |             c.homopolymer_dist,
139 |             c.total,
140 |             c.errors
141 |         )?;
142 |     }
143 | 
144 |     Ok(())
145 | }
146 | 
147 | #[cfg(test)]
148 | mod tests {
149 |     use super::*;
150 | 
151 |     #[test]
152 |     fn test_from_line() {
153 |         let line = "r1	f	05-19	0	AC	-1	61502	609";
154 | 
155 |         let c = Count::from_line(line, "test_file.txt");
156 |         assert_eq!(c.read12, 0);
157 |         assert_eq!(c.orientation, 0);
158 |         assert_eq!(c.bq_bin, 1);
159 |         assert_eq!(c.read_pos, 0);
160 |         assert_eq!(c.context, ['A', 'C']);
161 |         assert_eq!(c.homopolymer_dist, -1);
162 |         assert_eq!(c.total, 61502);
163 |         assert_eq!(c.errors, 609);
164 |     }
165 | 
166 |     #[test]
167 |     fn test_add_count() {
168 |         let mut a = Count {
169 |             read12: 0,
170 |             orientation: 1,
171 |             bq_bin: 2,
172 |             read_pos: 3,
173 |             context: ['A', 'T'],
174 |             homopolymer_dist: -1,
175 |             total: 32,
176 |             errors: 1,
177 |         };
178 |         let mut b = a.clone();
179 |         b.errors = 3;
180 |         a += b;
181 |         assert_eq!(a.homopolymer_dist, -1);
182 | 
183 |         assert_eq!(a.errors, 4);
184 |     }
185 | }
186 | 


--------------------------------------------------------------------------------
/src/combine_errors.rs:
--------------------------------------------------------------------------------
  1 | use crate::fraguracy;
  2 | use core::cmp::Reverse;
  3 | use itertools::Itertools;
  4 | use rust_htslib::bgzf;
  5 | use std::cmp::Ordering;
  6 | use std::collections::BinaryHeap;
  7 | use std::collections::HashMap;
  8 | use std::error::Error;
  9 | use std::fs::File;
 10 | use std::io;
 11 | use std::io::{BufRead, BufReader, Write};
 12 | use std::ops::Add;
 13 | use std::path::PathBuf;
 14 | use std::string::String;
 15 | 
 16 | #[derive(Eq, Debug, Default, Clone)]
 17 | struct Interval {
 18 |     tid: i32,
 19 |     chrom: String,
 20 |     start: u32,
 21 |     end: u32,
 22 |     group: u8,
 23 |     length: i32,
 24 |     hp_dist: i16,
 25 |     count: [u32; 7],
 26 |     file_i: u32,
 27 | }
 28 | struct IntervalHeap {
 29 |     // min heap
 30 |     h: BinaryHeap<Reverse<Interval>>,
 31 |     files: Vec<Box<dyn BufRead>>,
 32 |     chom_to_tid: HashMap<String, i32>,
 33 |     is_indel: Vec<bool>,
 34 | }
 35 | 
 36 | fn read_fai(path: PathBuf) -> HashMap<String, i32> {
 37 |     let f = File::open(&path);
 38 |     let mut h = HashMap::new();
 39 |     if let Ok(fu) = f {
 40 |         for l in BufReader::new(fu).lines() {
 41 |             let l = l.expect("error parsing faidx");
 42 |             let chrom = l
 43 |                 .split('\t')
 44 |                 .next()
 45 |                 .expect("expected at least one value per line in faidx");
 46 |             if chrom.starts_with('>') {
 47 |                 log::warn!(
 48 |                     "expecting fai, NOT fasta for argument found chrom of {}",
 49 |                     chrom
 50 |                 );
 51 |             }
 52 |             h.insert(String::from(chrom), h.len() as i32);
 53 |         }
 54 |     } else {
 55 |         panic!("couldn't open file: {:?}", path.to_string_lossy());
 56 |     }
 57 |     h
 58 | }
 59 | 
 60 | impl Add<&Interval> for &Interval {
 61 |     type Output = Interval;
 62 |     fn add(self, other: &Interval) -> Self::Output {
 63 |         assert_eq!(self.chrom, other.chrom);
 64 |         assert_eq!(self.start, other.start);
 65 |         assert_eq!(self.end, other.end);
 66 |         assert_eq!(self.group, other.group);
 67 |         assert_eq!(self.length, other.length, "indel lengths must be equal");
 68 |         let counts = self
 69 |             .count
 70 |             .iter()
 71 |             .zip(other.count.iter())
 72 |             .map(|(a, b)| a + b);
 73 |         // convert counts to [u32, 7]
 74 |         let counts: [u32; 7] = counts
 75 |             .collect::<Vec<_>>()
 76 |             .try_into()
 77 |             .expect("error converting counts");
 78 | 
 79 |         Interval {
 80 |             chrom: self.chrom.clone(),
 81 |             count: counts,
 82 |             ..*self
 83 |         }
 84 |     }
 85 | }
 86 | 
 87 | impl IntervalHeap {
 88 |     fn all_indels(&self) -> bool {
 89 |         self.is_indel.iter().all(|&x| x)
 90 |     }
 91 |     fn new(paths: Vec<PathBuf>, fai_path: PathBuf) -> IntervalHeap {
 92 |         let fhs: Vec<Box<dyn BufRead>> = paths
 93 |             .iter()
 94 |             .map(|p| crate::files::open_file(Some(p.clone())).expect("error opening file"))
 95 |             .collect();
 96 | 
 97 |         let mut ih = IntervalHeap {
 98 |             h: BinaryHeap::new(),
 99 |             files: fhs,
100 |             chom_to_tid: read_fai(fai_path),
101 |             is_indel: paths
102 |                 .iter()
103 |                 .map(|p| p.to_string_lossy().ends_with("indel-errors.bed.gz"))
104 |                 .collect(),
105 |         };
106 | 
107 |         assert!(
108 |             ih.is_indel.iter().all(|&x| x) || ih.is_indel.iter().all(|&x| !x),
109 |             "all files must be either indel error files or base error files, not a mix"
110 |         );
111 | 
112 |         ih.files
113 |             .iter_mut()
114 |             .enumerate()
115 |             .for_each(|(file_i, fh)| loop {
116 |                 // loop to skip '#' comment lines
117 |                 let is_indel = ih.is_indel[file_i];
118 |                 let mut buf = String::new();
119 |                 let line = fh.read_line(&mut buf);
120 |                 if line.is_ok() && !buf.starts_with('#') {
121 |                     let r = parse_bed_line(&buf, file_i as u32, &(ih.chom_to_tid), is_indel);
122 |                     if r.is_err() && buf == "" {
123 |                         break;
124 |                     } else {
125 |                         ih.h.push(Reverse(r.unwrap_or_else(|_| {
126 |                             if buf != "" {}
127 |                             panic!("Error parsing first line from file: '{buf}'")
128 |                         })));
129 |                         break;
130 |                     }
131 |                 }
132 |             });
133 |         ih
134 |     }
135 | }
136 | fn parse_bed_line(
137 |     line: &str,
138 |     file_i: u32,
139 |     chrom_to_tid: &HashMap<String, i32>,
140 |     is_indel: bool,
141 | ) -> Result<Interval, Box<dyn Error>> {
142 |     let toks: Vec<&str> = line.trim().split('\t').collect();
143 |     // can be 6 if combine-errors was already run once.
144 |     let mut iv = if !is_indel {
145 |         let mut iv = Interval {
146 |             tid: 0,
147 |             chrom: String::from(toks[0]),
148 |             start: str::parse::<u32>(toks[1])?,
149 |             end: str::parse::<u32>(toks[2])?,
150 |             group: (*fraguracy::REVERSE_Q_LOOKUP
151 |                 .get(toks[3].trim())
152 |                 .unwrap_or_else(|| panic!("unknown bq bin: {}", toks[3]))),
153 |             length: 0,
154 |             count: [0; 7],
155 |             hp_dist: i16::MAX,
156 |             file_i,
157 |         };
158 |         // toks[4] is the total count, which we don't need. because we can sum the count from toks[5]
159 | 
160 |         // parse the counts which appear as, e.g.,
161 |         // AC:1,AG:2,AT:3,CG:4,CT:5,GT:6,NN:0
162 |         // and increment the appropriate index using CONTEXT_LOOKUP from fraguracy.rs
163 |         for s in toks[5].split(',') {
164 |             let (context, count) = s.split(':').collect_tuple().unwrap();
165 |             if context.len() != 2 {
166 |                 return Err(
167 |                     format!("expecting two characters for context, found {}", context).into(),
168 |                 );
169 |             }
170 |             let mut context = context.chars();
171 |             let a = context.next().unwrap();
172 |             let b = context.next().unwrap();
173 |             let idx = fraguracy::CONTEXT_LOOKUP[&(a as u8, b as u8)];
174 |             iv.count[idx] += count.parse::<u32>().unwrap();
175 |         }
176 |         iv
177 |     } else {
178 |         // indel errors
179 |         let mut iv = Interval {
180 |             tid: 0,
181 |             chrom: String::from(toks[0]),
182 |             start: str::parse::<u32>(toks[1])?,
183 |             end: str::parse::<u32>(toks[2])?,
184 |             group: *fraguracy::REVERSE_Q_LOOKUP
185 |                 .get(toks[5].trim())
186 |                 .unwrap_or_else(|| panic!("unknown bq bin: {}", toks[5])),
187 |             count: [0; 7],
188 |             length: str::parse::<i32>(toks[4])?,
189 |             hp_dist: str::parse::<i16>(toks[6]).unwrap_or(crate::fraguracy::MAX_HP_DIST + 1),
190 |             file_i,
191 |         };
192 |         // store the count in the first position for indels.
193 |         iv.count[0] = str::parse::<u32>(toks[3])?;
194 |         iv
195 |     };
196 |     iv.tid = *chrom_to_tid
197 |         .get(&iv.chrom)
198 |         .unwrap_or_else(|| panic!("chromosome '{}' not found in fai file", iv.chrom));
199 |     Ok(iv)
200 | }
201 | 
202 | impl Iterator for IntervalHeap {
203 |     type Item = Interval;
204 | 
205 |     /// pop an item out and then read in another interval from that file-handle
206 |     fn next(&mut self) -> Option<Self::Item> {
207 |         if let Some(pop_iv) = self.h.pop() {
208 |             let pop_iv = pop_iv.0;
209 |             let file_i = pop_iv.file_i;
210 |             let is_indel = self.is_indel[file_i as usize];
211 |             let fh = &mut self.files[file_i as usize];
212 |             let mut buf = String::new();
213 |             let line_len = &fh.read_line(&mut buf);
214 |             if line_len.is_ok() && *(line_len).as_ref().unwrap() > 0 {
215 |                 let r = parse_bed_line(&buf, file_i, &self.chom_to_tid, is_indel);
216 |                 if let Ok(iv) = r {
217 |                     self.h.push(Reverse(iv));
218 |                 } else {
219 |                     panic!("{:?} line_len: {:?}", r.err().unwrap(), line_len);
220 |                 }
221 |             }
222 |             Some(pop_iv)
223 |         } else {
224 |             None
225 |         }
226 |     }
227 | }
228 | 
229 | impl PartialEq for Interval {
230 |     fn eq(&self, b: &Interval) -> bool {
231 |         self.chrom == b.chrom
232 |             && self.start == b.start
233 |             && self.end == b.end
234 |             && self.group == b.group
235 |             && self.length == b.length
236 |     }
237 | }
238 | 
239 | impl PartialOrd for Interval {
240 |     #[allow(clippy::non_canonical_partial_ord_impl)]
241 |     fn partial_cmp(&self, b: &Interval) -> Option<Ordering> {
242 |         if self.tid != b.tid {
243 |             return if self.tid < b.tid {
244 |                 Some(Ordering::Less)
245 |             } else {
246 |                 Some(Ordering::Greater)
247 |             };
248 |         }
249 |         if self.start != b.start {
250 |             return if self.start < b.start {
251 |                 Some(Ordering::Less)
252 |             } else {
253 |                 Some(Ordering::Greater)
254 |             };
255 |         }
256 | 
257 |         if self.end != b.end {
258 |             return if self.end < b.end {
259 |                 Some(Ordering::Less)
260 |             } else {
261 |                 Some(Ordering::Greater)
262 |             };
263 |         }
264 | 
265 |         if self.length != b.length {
266 |             return if self.length < b.length {
267 |                 Some(Ordering::Less)
268 |             } else {
269 |                 Some(Ordering::Greater)
270 |             };
271 |         }
272 | 
273 |         Some(self.group.cmp(&b.group))
274 |     }
275 | }
276 | 
277 | impl Ord for Interval {
278 |     fn cmp(&self, b: &Interval) -> std::cmp::Ordering {
279 |         self.partial_cmp(b).expect("cmp: not expecting None")
280 |     }
281 | }
282 | 
283 | pub(crate) fn combine_errors_main(
284 |     paths: Vec<PathBuf>,
285 |     fai_path: PathBuf,
286 |     output_path: String,
287 | ) -> io::Result<()> {
288 |     let ih = IntervalHeap::new(paths, fai_path);
289 |     if ih.all_indels() {
290 |         log::info!("all indels");
291 |     }
292 | 
293 |     // Append .gz if not already present
294 |     let mut output_path = if !output_path.ends_with(".gz") {
295 |         output_path + ".gz"
296 |     } else {
297 |         output_path
298 |     };
299 |     let all_indels = ih.all_indels();
300 | 
301 |     if all_indels && !output_path.ends_with("indel-errors.bed.gz") {
302 |         log::warn!("all indels, but output path does not end with 'indel-errors.bed.gz'. renaming");
303 |         output_path = output_path.replace(".bed.gz", ".indel-errors.bed.gz");
304 |     }
305 | 
306 |     let mut writer =
307 |         bgzf::Writer::from_path(&output_path).expect("error creating bgzip output file");
308 | 
309 |     if all_indels {
310 |         writer.write_all(b"#chrom\tstart\tend\tcount\tlength\tbq_bin\thp_dist\tn_samples\n")?;
311 |     } else {
312 |         writer.write_all(b"#chrom\tstart\tend\tbq_bin\tcount\tcontexts\tn_samples\n")?;
313 |     }
314 | 
315 |     for (_, ivs) in &ih
316 |         .into_iter()
317 |         .group_by(|iv| (iv.tid, iv.start, iv.end, iv.group, iv.length))
318 |     {
319 |         let ivs: Vec<Interval> = ivs.into_iter().collect();
320 |         let n = ivs
321 |             .iter()
322 |             .filter(|iv| iv.count.iter().any(|&c| c > 0))
323 |             .count();
324 |         let iv0 = ivs[0].clone();
325 |         let iv = ivs.iter().skip(1).fold(iv0, |acc, iv| &acc + iv);
326 | 
327 |         let line = if all_indels {
328 |             format!(
329 |                 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n",
330 |                 iv.chrom,
331 |                 iv.start,
332 |                 iv.end,
333 |                 iv.count[0],
334 |                 iv.length,
335 |                 fraguracy::Q_LOOKUP[iv.group as usize],
336 |                 if iv.hp_dist == crate::fraguracy::MAX_HP_DIST + 1 {
337 |                     "NA".to_string()
338 |                 } else {
339 |                     iv.hp_dist.to_string()
340 |                 },
341 |                 n
342 |             )
343 |         } else {
344 |             let (total_count, context_str) = crate::files::format_context_counts(iv.count);
345 |             format!(
346 |                 "{}\t{}\t{}\t{}\t{}\t{}\t{}\n",
347 |                 iv.chrom,
348 |                 iv.start,
349 |                 iv.end,
350 |                 if iv.group == u8::MAX {
351 |                     "NA"
352 |                 } else {
353 |                     fraguracy::Q_LOOKUP[iv.group as usize]
354 |                 },
355 |                 total_count,
356 |                 context_str,
357 |                 n
358 |             )
359 |         };
360 |         writer.write_all(line.as_bytes())?;
361 |     }
362 |     log::info!("wrote {}", output_path);
363 |     writer.flush()?;
364 |     Ok(())
365 | }
366 | 


--------------------------------------------------------------------------------
/src/files.rs:
--------------------------------------------------------------------------------
  1 | use crate::fraguracy::{InnerCounts, Stat, CONTEXT_TO_CONTEXT2};
  2 | use std::string::String;
  3 | 
  4 | use flate2::bufread::GzDecoder;
  5 | use itertools::Itertools;
  6 | use std::fs::File;
  7 | use std::io::{BufRead, BufReader};
  8 | use std::path::PathBuf;
  9 | 
 10 | use rust_htslib::bgzf;
 11 | use std::io::Write;
 12 | 
 13 | pub(crate) type Iv = rust_lapper::Interval<u32, u8>;
 14 | 
 15 | pub(crate) fn write_stats(stats: Vec<Stat>, output_prefix: PathBuf) {
 16 |     let header = Stat::header();
 17 | 
 18 |     let mut fh = std::fs::File::create(
 19 |         output_prefix
 20 |             .to_str()
 21 |             .expect("error getting output prefix")
 22 |             .to_owned()
 23 |             + "counts.txt",
 24 |     )
 25 |     .expect("error opening file!");
 26 | 
 27 |     writeln!(fh, "{header}").expect("error writing to file");
 28 |     stats
 29 |         .iter()
 30 |         .for_each(|s| writeln!(fh, "{s}").expect("error writing to file"));
 31 | }
 32 | 
 33 | pub(crate) fn format_context_counts(counts: [u32; 7]) -> (u32, String) {
 34 |     let mut total: u32 = 0;
 35 |     let contexts: String = counts
 36 |         .iter()
 37 |         .enumerate()
 38 |         .filter(|(_, &count)| count > 0)
 39 |         .map(|(idx, &count)| {
 40 |             let context = CONTEXT_TO_CONTEXT2[idx];
 41 |             let a = context[0];
 42 |             let b = context[1];
 43 |             total += count;
 44 |             format!("{a}{b}:{count}")
 45 |         })
 46 |         .collect::<Vec<_>>()
 47 |         .join(",");
 48 | 
 49 |     (total, contexts)
 50 | }
 51 | 
 52 | pub(crate) fn write_errors(counts: &InnerCounts, output_prefix: PathBuf, chroms: Vec<String>) {
 53 |     let path = output_prefix
 54 |         .to_str()
 55 |         .expect("error getting output prefix")
 56 |         .to_owned()
 57 |         + "errors.bed.gz";
 58 | 
 59 |     let mut errfh = bgzf::Writer::from_path(&path).expect("error opening bgzip file!");
 60 |     errfh
 61 |         .write_all(b"#chrom\tstart\tend\tbq_bin\tcount\tcontexts\n")
 62 |         .expect("error writing header");
 63 | 
 64 |     for pos in counts.error_positions.keys().sorted() {
 65 |         let cnt = counts.error_positions[pos];
 66 |         let (total, contexts) = format_context_counts(cnt);
 67 |         let chrom = &chroms[pos.tid as usize];
 68 |         let position = pos.pos;
 69 |         let end = position + 1;
 70 |         let bqs = crate::fraguracy::Q_LOOKUP[pos.bq_bin as usize];
 71 |         let line = format!("{chrom}\t{position}\t{end}\t{bqs}\t{total}\t{contexts}\n");
 72 |         errfh
 73 |             .write_all(line.as_bytes())
 74 |             .expect("error writing to error file");
 75 |     }
 76 |     write_indel_errors(counts, output_prefix, chroms);
 77 | }
 78 | 
 79 | fn write_indel_errors(counts: &InnerCounts, output_prefix: PathBuf, chroms: Vec<String>) {
 80 |     let path = output_prefix
 81 |         .to_str()
 82 |         .expect("error getting output prefix")
 83 |         .to_owned()
 84 |         + "indel-errors.bed.gz";
 85 | 
 86 |     let mut errfh = bgzf::Writer::from_path(&path).expect("error opening bgzip file!");
 87 | 
 88 |     errfh
 89 |         .write_all(b"#chrom\tstart\tend\tcount\tlength\tbq_bin\thp_dist\n")
 90 |         .expect("error writing header");
 91 | 
 92 |     for ((pos, len, hp_dist), cnt) in counts.indel_error_positions.iter().sorted() {
 93 |         let chrom = &chroms[pos.tid as usize];
 94 |         let position = pos.pos;
 95 |         let bq_bin = crate::fraguracy::Q_LOOKUP[pos.bq_bin as usize];
 96 |         let end = position as i64 + (if *len > 0 { *len } else { 1 }) as i64;
 97 |         let hp_dist_str = if *hp_dist == crate::fraguracy::MAX_HP_DIST + 1 {
 98 |             "NA".to_string()
 99 |         } else {
100 |             hp_dist.to_string()
101 |         };
102 |         let line = format!("{chrom}\t{position}\t{end}\t{cnt}\t{len}\t{bq_bin}\t{hp_dist_str}\n");
103 |         errfh
104 |             .write_all(line.as_bytes())
105 |             .expect("error writing to indel-error file");
106 |     }
107 | }
108 | 
109 | pub(crate) fn open_file(path: Option<PathBuf>) -> Option<Box<dyn BufRead>> {
110 |     let file = File::open(path.as_ref().unwrap());
111 |     if file.is_err() {
112 |         eprintln!("error opening file: {}", file.unwrap_err());
113 |         return None;
114 |     }
115 |     let file = file.unwrap();
116 | 
117 |     // Check if it's a bgzip file
118 |     let mut buf_file = BufReader::new(file);
119 |     let b = buf_file.fill_buf().expect("error reading from file");
120 | 
121 |     let reader: Box<dyn BufRead> = if b.starts_with(b"\x1f\x8b") {
122 |         if path.as_ref().unwrap().to_str().unwrap().ends_with(".gz") {
123 |             // Try opening as bgzip first
124 |             if let Ok(bgzf_reader) = bgzf::Reader::from_path(path.as_ref().unwrap()) {
125 |                 let buf_reader = BufReader::new(bgzf_reader);
126 |                 Box::new(buf_reader)
127 |             } else {
128 |                 // Fall back to regular gzip
129 |                 Box::new(BufReader::new(GzDecoder::new(buf_file)))
130 |             }
131 |         } else {
132 |             Box::new(BufReader::new(GzDecoder::new(buf_file)))
133 |         }
134 |     } else {
135 |         Box::new(buf_file)
136 |     };
137 |     Some(reader)
138 | }
139 | 
140 | #[cfg(test)]
141 | mod tests {
142 |     use super::*;
143 | 
144 |     #[test]
145 |     fn test_format_context_counts() {
146 |         // Test case 1: All counts are non-zero
147 |         let counts1 = [1, 2, 3, 4, 5, 6, 0];
148 |         let (total1, contexts1) = format_context_counts(counts1);
149 |         assert_eq!(total1, 21);
150 |         assert_eq!(contexts1, "AC:1,AG:2,AT:3,CA:4,CG:5,CT:6");
151 | 
152 |         // Test case 2: Some counts are zero
153 |         let counts2 = [0, 2, 0, 4, 0, 6, 0];
154 |         let (total2, contexts2) = format_context_counts(counts2);
155 |         assert_eq!(total2, 12);
156 |         assert_eq!(contexts2, "AG:2,CA:4,CT:6");
157 | 
158 |         // Test case 3: All counts are zero
159 |         let counts3 = [0, 0, 0, 0, 0, 0, 0];
160 |         let (total3, contexts3) = format_context_counts(counts3);
161 |         assert_eq!(total3, 0);
162 |         assert_eq!(contexts3, "");
163 | 
164 |         // Test case 4: Only one non-zero count
165 |         let counts4 = [0, 0, 0, 0, 5, 0, 0];
166 |         let (total4, contexts4) = format_context_counts(counts4);
167 |         assert_eq!(total4, 5);
168 |         assert_eq!(contexts4, "CG:5");
169 | 
170 |         // Test case 5: Only N has a count
171 |         let counts5 = [0, 0, 0, 0, 0, 0, 1];
172 |         let (total5, contexts5) = format_context_counts(counts5);
173 |         assert_eq!(total5, 1);
174 |         assert_eq!(contexts5, "NN:1");
175 |     }
176 | }
177 | 


--------------------------------------------------------------------------------
/src/fraguracy.rs:
--------------------------------------------------------------------------------
   1 | use bpci::*;
   2 | use ndarray::prelude::Array;
   3 | use ndarray::Array6;
   4 | use rust_htslib::{
   5 |     bam::{
   6 |         record::{Cigar, CigarStringView},
   7 |         IndexedReader, Read, Record,
   8 |     },
   9 |     bgzf::CompressionLevel,
  10 | };
  11 | use rust_htslib::{bgzf, faidx};
  12 | use rust_lapper::Lapper;
  13 | use std::collections::BTreeMap;
  14 | 
  15 | use crate::homopolymer as hp;
  16 | use std::collections::HashMap;
  17 | use std::fmt;
  18 | use std::hash::Hash;
  19 | use std::io::Write;
  20 | use std::rc::Rc;
  21 | use std::str;
  22 | 
  23 | #[derive(Eq, Hash, PartialEq, Ord, PartialOrd)]
  24 | pub(crate) struct Position {
  25 |     pub tid: u16,
  26 |     pub pos: u32,
  27 |     pub bq_bin: u8,
  28 | }
  29 | 
  30 | /// DepthMap is for a given genome position, the depth at each (aq, bq) pair.
  31 | type DepthMap = HashMap<(u8, u8), u32>;
  32 | type Length = i32;
  33 | 
  34 | pub(crate) const MAX_HP_DIST: i16 = 15;
  35 | 
  36 | /// Returns the homopolymer distance with the minimum absolute value.
  37 | /// If both distances are available, returns the one with smaller absolute value.
  38 | /// If only one is available, returns that one.
  39 | /// If neither is available, returns None.
  40 | fn min_abs_hp_distance(dist_a: Option<i16>, dist_b: Option<i16>) -> Option<i16> {
  41 |     match (dist_a, dist_b) {
  42 |         (Some(a), Some(b)) => {
  43 |             if a.abs() <= b.abs() {
  44 |                 Some(a)
  45 |             } else {
  46 |                 Some(b)
  47 |             }
  48 |         }
  49 |         (Some(a), None) => Some(a),
  50 |         (None, Some(b)) => Some(b),
  51 |         (None, None) => None,
  52 |     }
  53 | }
  54 | 
  55 | pub(crate) struct Counts {
  56 |     pub(crate) ibam: Option<IndexedReader>,
  57 |     //  read, f/r pos, bq, bp, ctx{6} */
  58 |     pub(crate) counts: InnerCounts,
  59 |     pub(crate) depth: BTreeMap<u32, DepthMap>,
  60 |     pub(crate) last_depth_entry: Option<(String, u32, u32, String, String, u32)>,
  61 |     pub(crate) depth_writer: Option<bgzf::Writer>,
  62 | }
  63 | 
  64 | pub(crate) struct InnerCounts {
  65 |     // genome_pos
  66 |     pub(crate) errs: Array6<u64>,
  67 |     //  read, f/r, pos, bq, ctx{2}, hp_dist */
  68 |     pub(crate) cnts: Array6<u64>,
  69 |     pub(crate) mismatches: u64,
  70 |     pub(crate) matches: u64,
  71 | 
  72 |     // position -> error count. nice to find sites that are error-prone.
  73 |     pub(crate) error_positions: HashMap<Position, [u32; 7]>,
  74 |     // position -> indel error counts
  75 |     pub(crate) indel_error_positions: HashMap<(Position, Length, i16), u32>,
  76 | }
  77 | 
  78 | fn argmax<T: Ord>(slice: &[T]) -> Option<usize> {
  79 |     (0..slice.len()).max_by_key(|i| &slice[*i])
  80 | }
  81 | 
  82 | impl std::ops::AddAssign<InnerCounts> for InnerCounts {
  83 |     fn add_assign(&mut self, o: InnerCounts) {
  84 |         self.errs.add_assign(&o.errs);
  85 |         self.cnts.add_assign(&o.cnts);
  86 |         self.mismatches += o.mismatches;
  87 |         self.matches += o.matches;
  88 | 
  89 |         for (pos, cnt) in o.error_positions.into_iter() {
  90 |             let entry = self.error_positions.entry(pos).or_insert([0; 7]);
  91 |             for i in 0..entry.len() {
  92 |                 entry[i] += cnt[i];
  93 |             }
  94 |         }
  95 |         for (pos, cnt) in o.indel_error_positions.into_iter() {
  96 |             *(self.indel_error_positions.entry(pos)).or_insert(0) += cnt;
  97 |         }
  98 |     }
  99 | }
 100 | 
 101 | pub(crate) struct Stat {
 102 |     pub ci: ConfidenceInterval,
 103 |     read12: u8,
 104 |     fr: u8,
 105 |     bq_bin: u8,
 106 |     read_pos: u32,
 107 |     context: [char; 2],
 108 |     homopolymer_distance: i16,
 109 |     total_count: u64,
 110 |     error_count: u64,
 111 | }
 112 | 
 113 | unsafe impl std::marker::Sync for Counts {}
 114 | 
 115 | impl fmt::Display for Stat {
 116 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 117 |         let (lo, hi) = self.confidence_interval(&self.ci);
 118 |         let hp_dist_str = if self.homopolymer_distance == MAX_HP_DIST + 1 {
 119 |             "NA".to_string()
 120 |         } else {
 121 |             self.homopolymer_distance.to_string()
 122 |         };
 123 |         write!(
 124 |             f,
 125 |             "{}\t{}\t{}\t{}\t{}{}\t{}\t{}\t{}\t{:e}\t{:e}",
 126 |             ["r1", "r2"][self.read12 as usize],
 127 |             ["f", "r"][self.fr as usize],
 128 |             Q_LOOKUP[self.bq_bin as usize],
 129 |             self.read_pos,
 130 |             self.context[0],
 131 |             self.context[1],
 132 |             hp_dist_str,
 133 |             self.total_count,
 134 |             self.error_count,
 135 |             lo.max(0.0),
 136 |             hi.max(0.0),
 137 |         )
 138 |     }
 139 | }
 140 | 
 141 | #[derive(Debug, Clone, clap::ValueEnum, Default)]
 142 | pub enum ConfidenceInterval {
 143 |     #[default]
 144 |     AgrestiCoull,
 145 |     Wald,
 146 |     Wilson,
 147 |     //WilsonWithCC,
 148 | }
 149 | 
 150 | impl fmt::Display for ConfidenceInterval {
 151 |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 152 |         write!(f, "{:?}", self)
 153 |     }
 154 | }
 155 | 
 156 | impl Stat {
 157 |     pub(crate) fn header() -> String {
 158 |         String::from(
 159 |             "read12\tFR\tbq_bin\tread_pos\tcontext\thp_dist\ttotal_count\terror_count\terr_rate_lo\terr_rate_hi",
 160 |         )
 161 |     }
 162 | 
 163 |     pub(crate) fn confidence_interval(&self, ci: &ConfidenceInterval) -> (f64, f64) {
 164 |         let sample = bpci::NSuccessesSample::new(self.total_count as f64, self.error_count as f64)
 165 |             .expect("error with proportion");
 166 | 
 167 |         let f = match ci {
 168 |             ConfidenceInterval::AgrestiCoull => sample.agresti_coull(1.960),
 169 |             ConfidenceInterval::Wald => sample.wald(1.960),
 170 |             ConfidenceInterval::Wilson => sample.wilson_score(1.960),
 171 |         };
 172 | 
 173 |         (f.lower(), f.upper())
 174 |     }
 175 | 
 176 |     pub(crate) fn from_counts(
 177 |         c: &InnerCounts,
 178 |         bin_size: usize,
 179 |         ci: ConfidenceInterval,
 180 |     ) -> Vec<Stat> {
 181 |         let mut stats = vec![];
 182 |         for readi in 0..c.cnts.shape()[0] {
 183 |             for fri in 0..c.cnts.shape()[1] {
 184 |                 for read_posi in 0..c.cnts.shape()[2] {
 185 |                     for bqi in 0..c.cnts.shape()[3] {
 186 |                         for ctx6i in 0..c.errs.shape()[4] {
 187 |                             for hp_dist in 0..c.errs.shape()[5] {
 188 |                                 let n_err = c.errs[[readi, fri, read_posi, bqi, ctx6i, hp_dist]];
 189 | 
 190 |                                 // from ctx6i, we get the original context.
 191 |                                 let bases = CONTEXT_TO_CONTEXT2[ctx6i];
 192 | 
 193 |                                 let ctx2i = Counts::base_to_ctx2(bases[0] as u8);
 194 |                                 let n_tot = c.cnts[[readi, fri, read_posi, bqi, ctx2i, hp_dist]];
 195 |                                 if n_tot < n_err {
 196 |                                     eprintln!(
 197 |                                         "BAD: {ctx6i} -> {bases:?}. ctx2i:{ctx2i}",
 198 |                                         ctx6i = ctx6i,
 199 |                                         bases = bases,
 200 |                                         ctx2i = ctx2i
 201 |                                     );
 202 |                                 }
 203 | 
 204 |                                 stats.push(Stat {
 205 |                                     ci: ci.clone(),
 206 |                                     read12: readi as u8,
 207 |                                     fr: fri as u8,
 208 |                                     bq_bin: bqi as u8,
 209 |                                     read_pos: (read_posi * bin_size) as u32,
 210 |                                     context: bases,
 211 |                                     total_count: n_tot,
 212 |                                     error_count: n_err,
 213 |                                     homopolymer_distance: hp_dist as i16 - MAX_HP_DIST,
 214 |                                 })
 215 |                             }
 216 |                         }
 217 |                     }
 218 |                 }
 219 |             }
 220 |         }
 221 |         stats
 222 |     }
 223 | }
 224 | 
 225 | impl InnerCounts {
 226 |     pub(crate) fn new(bins: usize) -> Self {
 227 |         InnerCounts {
 228 |             cnts: Array::zeros((2, 2, bins, 5, 2, (2 * MAX_HP_DIST + 2) as usize)),
 229 |             errs: Array::zeros((2, 2, bins, 5, 6, (2 * MAX_HP_DIST + 2) as usize)),
 230 |             mismatches: 0,
 231 |             matches: 0,
 232 |             error_positions: HashMap::new(),
 233 |             indel_error_positions: HashMap::new(),
 234 |         }
 235 |     }
 236 | }
 237 | 
 238 | impl Counts {
 239 |     pub(crate) fn new(ir: Option<IndexedReader>, bins: usize) -> Self {
 240 |         Counts {
 241 |             ibam: ir,
 242 |             counts: InnerCounts::new(bins),
 243 |             depth: BTreeMap::new(),
 244 |             last_depth_entry: None,
 245 |             depth_writer: None,
 246 |         }
 247 |     }
 248 | 
 249 |     pub(crate) fn set_depth_writer(&mut self, path: &str) -> std::io::Result<()> {
 250 |         let mut w = bgzf::Writer::from_path_with_level(path, CompressionLevel::Level(1))
 251 |             .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
 252 |         w.write_all(b"#chrom\tstart\tend\tread1_bq_bin\tread2_bq_bin\tpair-ovl-depth\n")
 253 |             .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
 254 |         self.depth_writer = Some(w);
 255 |         Ok(())
 256 |     }
 257 | 
 258 |     #[inline(always)]
 259 |     fn qual_to_bin(q: u8) -> u8 {
 260 |         match q {
 261 |             0..=5 => 0,
 262 |             6..=19 => 1,
 263 |             20..=36 => 2,
 264 |             37..=59 => 3,
 265 |             _ => 4,
 266 |         }
 267 |     }
 268 | 
 269 |     #[inline(always)]
 270 |     fn base_to_ctx2(b: u8) -> usize {
 271 |         match b as char {
 272 |             'A' | 'T' => 0,
 273 |             'C' | 'G' => 1,
 274 |             'N' => 2,
 275 |             n => unreachable!("base_to_ctx2: {n}"),
 276 |         }
 277 |     }
 278 | 
 279 |     pub(crate) fn handle_depth(&mut self, bchrom: &str, bpos: i64) {
 280 |         // this function clears out the BTreeMap of depth entries that are before the current position.
 281 |         // it is not called if --no-denominator is specified.
 282 |         // it writes out the depth entries as they are popped out of the BTreeMap.
 283 |         loop {
 284 |             let pos = *(self
 285 |                 .depth
 286 |                 .first_key_value()
 287 |                 .unwrap_or((&u32::MAX, &DepthMap::new()))
 288 |                 .0);
 289 |             if pos == u32::MAX {
 290 |                 break;
 291 |             }
 292 |             if (pos as i64) < bpos {
 293 |                 let depthmap = self.depth.remove(&pos).unwrap();
 294 | 
 295 |                 for ((aq, bq), dp) in depthmap.iter() {
 296 |                     let a_bin = Q_LOOKUP[*aq as usize];
 297 |                     let b_bin = Q_LOOKUP[*bq as usize];
 298 | 
 299 |                     match &mut self.last_depth_entry {
 300 |                         Some((
 301 |                             last_chrom,
 302 |                             start_pos,
 303 |                             last_pos,
 304 |                             last_a_bin,
 305 |                             last_b_bin,
 306 |                             last_dp,
 307 |                         )) => {
 308 |                             if bchrom == last_chrom
 309 |                                 && a_bin == last_a_bin
 310 |                                 && b_bin == last_b_bin
 311 |                                 && dp == last_dp
 312 |                                 && *last_pos + 1 == pos
 313 |                             {
 314 |                                 *last_pos = pos;
 315 |                             } else {
 316 |                                 if let Some(writer) = &mut self.depth_writer {
 317 |                                     writeln!(
 318 |                                         writer,
 319 |                                         "{}\t{}\t{}\t{}\t{}\t{}",
 320 |                                         last_chrom,
 321 |                                         start_pos,
 322 |                                         *last_pos + 1,
 323 |                                         last_a_bin,
 324 |                                         last_b_bin,
 325 |                                         last_dp
 326 |                                     )
 327 |                                     .expect("error writing to bgzf file");
 328 |                                 }
 329 | 
 330 |                                 self.last_depth_entry = Some((
 331 |                                     bchrom.to_string(),
 332 |                                     pos,
 333 |                                     pos,
 334 |                                     a_bin.to_string(),
 335 |                                     b_bin.to_string(),
 336 |                                     *dp,
 337 |                                 ));
 338 |                             }
 339 |                         }
 340 |                         None => {
 341 |                             self.last_depth_entry = Some((
 342 |                                 bchrom.to_string(),
 343 |                                 pos,
 344 |                                 pos,
 345 |                                 a_bin.to_string(),
 346 |                                 b_bin.to_string(),
 347 |                                 *dp,
 348 |                             ));
 349 |                         }
 350 |                     }
 351 |                 }
 352 |             } else {
 353 |                 break;
 354 |             }
 355 |         }
 356 |     }
 357 | 
 358 |     #[allow(clippy::too_many_arguments)]
 359 |     pub(crate) fn increment<N: AsRef<str> + std::fmt::Debug>(
 360 |         &mut self,
 361 |         a: Rc<Record>,
 362 |         b: Rc<Record>,
 363 |         min_base_qual: u8,
 364 |         min_map_qual: u8,
 365 |         bin_size: u32,
 366 |         fasta: &Option<faidx::Reader>,
 367 |         chrom: N,
 368 |         include_tree: &Option<&Lapper<u32, u8>>,
 369 |         exclude_tree: &Option<&Lapper<u32, u8>>,
 370 |         hp_tree: &Option<Lapper<u32, u8>>,
 371 |     ) {
 372 |         let pieces = overlap_pieces(&a.cigar(), &b.cigar(), a.qual(), b.qual(), true);
 373 |         if pieces.is_empty() {
 374 |             return;
 375 |         }
 376 |         let a_seq = a.seq();
 377 |         let b_seq = b.seq();
 378 |         if a_seq.len() / bin_size as usize >= self.counts.cnts.dim().2 {
 379 |             panic!(
 380 |                 "index out of bounds: specify a --max-read-length of at least {}",
 381 |                 a_seq.len()
 382 |             )
 383 |         }
 384 | 
 385 |         if b_seq.len() / bin_size as usize >= self.counts.cnts.dim().2 {
 386 |             panic!(
 387 |                 "index out of bounds: specify a --max-read-length of at least {}",
 388 |                 b_seq.len()
 389 |             )
 390 |         }
 391 | 
 392 |         let a_qual = a.qual();
 393 |         let b_qual = b.qual();
 394 | 
 395 |         let indel_errors =
 396 |             indel_error_pieces(&a.cigar(), &b.cigar(), a_qual, b_qual, min_base_qual);
 397 |         indel_errors.iter().for_each(|c: &Coordinates| {
 398 |             // include the event if any of it overlaps with the include tree.
 399 |             if let Some(t) = include_tree {
 400 |                 if t.count(c.start, c.stop) == 0 {
 401 |                     return;
 402 |                 }
 403 |             }
 404 |             // exclude the event if any of it overlaps with the exclude tree.
 405 |             if let Some(t) = exclude_tree {
 406 |                 if t.count(c.start, c.stop) != 0 {
 407 |                     return;
 408 |                 }
 409 |             }
 410 | 
 411 |             let len = match c.indel_type {
 412 |                 IndelType::Insertion(len) => len as i32,
 413 |                 IndelType::Deletion(len) => -(len as i32),
 414 |                 IndelType::NotIndel => 0,
 415 |             };
 416 | 
 417 |             let p = Position {
 418 |                 tid: a.tid() as u16,
 419 |                 pos: c.start,
 420 |                 bq_bin: Counts::qual_to_bin(c.qual),
 421 |             };
 422 | 
 423 |             let hps = hp_tree.as_ref().map(|t| {
 424 |                 t.find(
 425 |                     c.start.max(MAX_HP_DIST as u32) - MAX_HP_DIST as u32,
 426 |                     c.stop + MAX_HP_DIST as u32,
 427 |                 )
 428 |                 .collect::<Vec<_>>()
 429 |             });
 430 | 
 431 |             let hp_dist_a = hp::hp_distance(
 432 |                 hps.as_deref(),
 433 |                 c.start,
 434 |                 a.pos() as u32,
 435 |                 a.cigar().end_pos() as u32,
 436 |                 if a.is_reverse() { -1 } else { 1 },
 437 |             );
 438 | 
 439 |             let hp_dist_b = hp::hp_distance(
 440 |                 hps.as_deref(),
 441 |                 c.start,
 442 |                 b.pos() as u32,
 443 |                 b.cigar().end_pos() as u32,
 444 |                 if b.is_reverse() { -1 } else { 1 },
 445 |             );
 446 | 
 447 |             let indel_hp_dist =
 448 |                 min_abs_hp_distance(hp_dist_a, hp_dist_b).unwrap_or(MAX_HP_DIST + 1 as i16);
 449 | 
 450 |             *self
 451 |                 .counts
 452 |                 .indel_error_positions
 453 |                 .entry((p, len, indel_hp_dist))
 454 |                 .or_insert(0) += 1;
 455 |         });
 456 | 
 457 |         let mut genome_pos = u32::MAX;
 458 |         for [a_chunk, b_chunk, g_chunk] in pieces {
 459 |             // we want to limit to the bounds of the read. since homopolymers outside of the read won't affect it.
 460 |             let eps = 1;
 461 |             let g_start = (g_chunk.start.max(MAX_HP_DIST as u32) - MAX_HP_DIST as u32)
 462 |                 .max(a.pos() as u32 + eps)
 463 |                 .max(b.pos() as u32 + eps);
 464 |             let g_stop = (g_chunk.stop + MAX_HP_DIST as u32)
 465 |                 .min(a.cigar().end_pos() as u32 - eps)
 466 |                 .min(b.cigar().end_pos() as u32 - eps);
 467 |             let hps: Option<Vec<_>> = if g_start <= g_stop {
 468 |                 hp_tree.as_ref().map(|t| t.find(g_start, g_stop).collect())
 469 |             } else {
 470 |                 None
 471 |             };
 472 | 
 473 |             for (ai, bi) in std::iter::zip(a_chunk.start..a_chunk.stop, b_chunk.start..b_chunk.stop)
 474 |             {
 475 |                 let aq = a_qual[ai as usize];
 476 |                 if aq < min_base_qual {
 477 |                     continue;
 478 |                 }
 479 |                 let bq = b_qual[bi as usize];
 480 |                 if bq < min_base_qual {
 481 |                     continue;
 482 |                 }
 483 |                 genome_pos = g_chunk.start + (ai - a_chunk.start);
 484 | 
 485 |                 if let Some(t) = include_tree {
 486 |                     if t.count(genome_pos, genome_pos + 1) == 0 {
 487 |                         continue;
 488 |                     }
 489 |                 }
 490 |                 if let Some(t) = exclude_tree {
 491 |                     if t.count(genome_pos, genome_pos + 1) != 0 {
 492 |                         continue;
 493 |                     }
 494 |                 }
 495 | 
 496 |                 let aq = Counts::qual_to_bin(aq);
 497 |                 let bq = Counts::qual_to_bin(bq);
 498 | 
 499 |                 if self.depth_writer.is_some() {
 500 |                     self.depth
 501 |                         .entry(genome_pos)
 502 |                         .or_default()
 503 |                         .entry((aq, bq))
 504 |                         .and_modify(|v| *v += 1)
 505 |                         .or_insert(1);
 506 |                 }
 507 | 
 508 |                 let a_base = unsafe { a_seq.decoded_base_unchecked(ai as usize) };
 509 |                 let b_base = unsafe { b_seq.decoded_base_unchecked(bi as usize) };
 510 | 
 511 |                 let a_bin = (ai / bin_size) as usize;
 512 |                 let b_bin = (bi / bin_size) as usize;
 513 | 
 514 |                 let a_hp_dist = hp::hp_distance(
 515 |                     hps.as_deref(),
 516 |                     genome_pos,
 517 |                     a.pos() as u32,
 518 |                     a.cigar().end_pos() as u32,
 519 |                     if a.is_reverse() { -1 } else { 1 },
 520 |                 )
 521 |                 .map(|d| (d + MAX_HP_DIST) as usize)
 522 |                 .unwrap_or((2 * MAX_HP_DIST + 1) as usize);
 523 | 
 524 |                 let b_hp_dist = hp::hp_distance(
 525 |                     hps.as_deref(),
 526 |                     genome_pos,
 527 |                     b.pos() as u32,
 528 |                     b.cigar().end_pos() as u32,
 529 |                     if b.is_reverse() { -1 } else { 1 },
 530 |                 )
 531 |                 .map(|d| (d + MAX_HP_DIST) as usize)
 532 |                 .unwrap_or((2 * MAX_HP_DIST + 1) as usize);
 533 | 
 534 |                 /* read1/2, F/R, pos, mq, bq, ctx, hp_dist */
 535 |                 let mut a_index = [
 536 |                     1 - a.is_first_in_template() as usize, // 0 r1
 537 |                     (a.is_reverse() as usize),             //
 538 |                     a_bin,
 539 |                     aq as usize,
 540 |                     // NOTE that this could be an error so we might change this later if we learn a_base is an error
 541 |                     Counts::base_to_ctx2(a_base),
 542 |                     a_hp_dist,
 543 |                 ];
 544 | 
 545 |                 let mut b_index = [
 546 |                     1 - b.is_first_in_template() as usize,
 547 |                     (b.is_reverse() as usize),
 548 |                     b_bin,
 549 |                     bq as usize,
 550 |                     // NOTE that this could be an error so we might change this later if we learn b_base is an error
 551 |                     Counts::base_to_ctx2(b_base),
 552 |                     b_hp_dist,
 553 |                 ];
 554 | 
 555 |                 if a_base == b_base {
 556 |                     // fast path to increment separately here because we must do some extra stuff to error base before incrementing count
 557 |                     // if there is an error.
 558 |                     self.counts.cnts[a_index] += 1;
 559 |                     self.counts.cnts[b_index] += 1;
 560 |                     self.counts.matches += 1;
 561 |                     continue;
 562 |                 }
 563 | 
 564 |                 self.counts.mismatches += 1;
 565 |                 let mut err = ['X', 'X'];
 566 | 
 567 |                 let real_base = if self.ibam.is_some() {
 568 |                     let mut base_counts = pile(
 569 |                         self.ibam.as_mut().unwrap(),
 570 |                         a.tid(),
 571 |                         genome_pos,
 572 |                         min_map_qual,
 573 |                         min_base_qual,
 574 |                     );
 575 |                     let am = argmax(&base_counts).expect("error selecting maximum index");
 576 |                     // check that the 2nd most common base is very low frequency, otherwise might be a het.
 577 |                     base_counts.sort();
 578 |                     let cmax = base_counts[4];
 579 |                     // if 3nd most common base is more than 50% of first, then we don't know which is right.
 580 |                     if base_counts[3] as f64 / cmax as f64 > 0.5 {
 581 |                         log::debug!(
 582 |                             "skipping due to unknown truth given base_counts {:?} at pos:{}:{}",
 583 |                             base_counts,
 584 |                             chrom.as_ref(),
 585 |                             genome_pos
 586 |                         );
 587 |                         continue;
 588 |                     }
 589 |                     ['A', 'C', 'G', 'T', 'N'][am]
 590 |                 } else {
 591 |                     fasta
 592 |                         .as_ref()
 593 |                         .unwrap()
 594 |                         .fetch_seq(&chrom, genome_pos as usize, genome_pos as usize)
 595 |                         .expect("error extracting base")[0] as char
 596 |                 };
 597 |                 if real_base == 'N' {
 598 |                     let chrom_string = chrom.as_ref();
 599 |                     log::warn!("got 'N' for {chrom_string}:{genome_pos}. skipping");
 600 |                     let pos = Position {
 601 |                         tid: a.tid() as u16,
 602 |                         pos: genome_pos,
 603 |                         // we don't know the bq, but assume it's the min. this very rarely happens so doesn't affect results.
 604 |                         bq_bin: aq.min(bq),
 605 |                     };
 606 |                     let context_counts = self.counts.error_positions.entry(pos).or_insert([0; 7]);
 607 |                     context_counts[6] += 1;
 608 |                     continue;
 609 |                 }
 610 | 
 611 |                 let err_index = if a_base == real_base as u8 {
 612 |                     // b is the error
 613 |                     let mut index = b_index;
 614 |                     b_index[4] = a_index[4]; // we correct this because we want to track the true base
 615 |                     index[4] = CONTEXT_LOOKUP[&(a_base, b_base)];
 616 |                     err[0] = a_base as char;
 617 |                     err[1] = b_base as char;
 618 |                     index
 619 |                 } else if b_base == real_base as u8 {
 620 |                     // a is the error
 621 |                     let mut index = a_index;
 622 |                     a_index[4] = b_index[4]; // we correct this because we want to track the true base
 623 |                     index[4] = CONTEXT_LOOKUP[&(b_base, a_base)];
 624 |                     err[0] = b_base as char;
 625 |                     err[1] = a_base as char;
 626 |                     index
 627 |                 } else {
 628 |                     // can't determine which is error base.
 629 |                     let pos = Position {
 630 |                         tid: a.tid() as u16,
 631 |                         pos: genome_pos,
 632 |                         // we don't know the bq, but assume it's the min. this very rarely happens so doesn't affect results.
 633 |                         bq_bin: aq.min(bq),
 634 |                     };
 635 |                     log::debug!(
 636 |                         "bases mismatches between reads and neither matches reference at pos:{}:{}. adding N",
 637 |                         chrom.as_ref(),
 638 |                         genome_pos
 639 |                     );
 640 |                     let context_counts = self.counts.error_positions.entry(pos).or_insert([0; 7]);
 641 |                     context_counts[6] += 1;
 642 |                     continue;
 643 |                 };
 644 | 
 645 |                 let pos = Position {
 646 |                     tid: a.tid() as u16,
 647 |                     pos: genome_pos,
 648 |                     bq_bin: err_index[3] as u8,
 649 |                 };
 650 |                 let context_idx = err_index[4];
 651 |                 let context_counts = self.counts.error_positions.entry(pos).or_insert([0; 7]);
 652 |                 context_counts[context_idx] += 1;
 653 | 
 654 |                 self.counts.cnts[a_index] += 1;
 655 |                 self.counts.cnts[b_index] += 1;
 656 | 
 657 |                 self.counts.errs[err_index] += 1;
 658 |                 if log::log_enabled!(log::Level::Debug)
 659 |                     && unsafe { str::from_utf8_unchecked(a.qname()) }
 660 |                         == "A00744:46:HV3C3DSXX:2:2611:30798:35258"
 661 |                 {
 662 |                     log::debug!(
 663 |                         "gpos:{}, err:{}->{}, err-index:{:?}, ai:{}, bi:{}, {:?}",
 664 |                         genome_pos,
 665 |                         /* base_counts, */
 666 |                         err[0],
 667 |                         err[1],
 668 |                         err_index,
 669 |                         ai,
 670 |                         bi,
 671 |                         unsafe { str::from_utf8_unchecked(a.qname()) },
 672 |                     );
 673 |                 }
 674 |             }
 675 |         }
 676 |         if self.depth_writer.is_some() {
 677 |             self.handle_depth(chrom.as_ref(), genome_pos as i64);
 678 |         }
 679 |     }
 680 | }
 681 | 
 682 | fn pile(
 683 |     ibam: &mut IndexedReader,
 684 |     tid: i32,
 685 |     genome_pos: u32,
 686 |     min_map_qual: u8,
 687 |     min_base_qual: u8,
 688 | ) -> [u32; 5] {
 689 |     let mut base_counts: [u32; 5] = [0; 5];
 690 | 
 691 |     ibam.fetch((tid, genome_pos, genome_pos + 1))
 692 |         .expect("Error seeking to genomic position");
 693 | 
 694 |     let mut p = ibam.pileup();
 695 |     p.set_max_depth(100_000);
 696 |     p.filter(|col| col.as_ref().unwrap().pos() == genome_pos)
 697 |         .for_each(|col| {
 698 |             let col = col.unwrap();
 699 | 
 700 |             col.alignments().for_each(|aln| {
 701 |                 if let Some(qpos) = aln.qpos() {
 702 |                     let record = aln.record();
 703 |                     // here we want a accurate count, so we skip stuff at either
 704 |                     // end of a read (within 3 bases of end)
 705 |                     // along with low base-quality and low mapping-quality
 706 |                     if qpos < 3 || qpos > record.qual().len() - 4 {
 707 |                         return;
 708 |                     }
 709 |                     if record.mapq() < min_map_qual {
 710 |                         return;
 711 |                     }
 712 |                     if record.qual()[qpos] < min_base_qual {
 713 |                         return;
 714 |                     }
 715 |                     let base_idx = match record.seq()[qpos] as char {
 716 |                         'A' => 0,
 717 |                         'C' => 1,
 718 |                         'G' => 2,
 719 |                         'T' => 3,
 720 |                         _ => 4,
 721 |                     };
 722 |                     base_counts[base_idx] += 1;
 723 |                 }
 724 |             });
 725 |         });
 726 |     base_counts
 727 | }
 728 | 
 729 | lazy_static! {
 730 |     pub(crate) static ref CONTEXT_LOOKUP: HashMap<(u8, u8), usize> = HashMap::from([
 731 |         ((b'T', b'G'), 0usize),
 732 |         ((b'A', b'C'), 0usize),
 733 |         ((b'T', b'C'), 1usize),
 734 |         ((b'A', b'G'), 1usize),
 735 |         ((b'T', b'A'), 2usize),
 736 |         ((b'A', b'T'), 2usize),
 737 |         ((b'C', b'A'), 3usize),
 738 |         ((b'G', b'T'), 3usize),
 739 |         ((b'C', b'G'), 4usize),
 740 |         ((b'G', b'C'), 4usize),
 741 |         ((b'C', b'T'), 5usize),
 742 |         ((b'G', b'A'), 5usize),
 743 |         ((b'N', b'N'), 6usize),
 744 |     ]);
 745 |     pub(crate) static ref CONTEXT_TO_CONTEXT2: [[char; 2]; 7] = [
 746 |         ['A', 'C'],
 747 |         ['A', 'G'],
 748 |         ['A', 'T'],
 749 |         ['C', 'A'],
 750 |         ['C', 'G'],
 751 |         ['C', 'T'],
 752 |         ['N', 'N'],
 753 |     ];
 754 |     pub(crate) static ref Q_LOOKUP: [&'static str; 5] = ["0-5", "05-19", "20-36", "37-59", "60+"];
 755 |     pub(crate) static ref REVERSE_Q_LOOKUP: HashMap<&'static str, u8> = HashMap::from([
 756 |         ("0-5", 0),
 757 |         ("05-19", 1),
 758 |         ("20-36", 2),
 759 |         ("37-59", 3),
 760 |         ("60+", 4),
 761 |     ]);
 762 | }
 763 | 
 764 | pub(crate) fn filter_read(r: &Rc<Record>) -> bool {
 765 |     r.tid() == r.mtid()
 766 |         && r.tid() >= 0
 767 |         && !r.is_unmapped()
 768 |         && !r.is_mate_unmapped()
 769 |         && (r.pos() - r.mpos()).abs() < 1000
 770 |         && !r.is_supplementary()
 771 |         && !r.is_secondary()
 772 |         && !r.is_duplicate()
 773 |         && !r.is_quality_check_failed()
 774 | }
 775 | 
 776 | #[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
 777 | pub struct Coordinates {
 778 |     pub start: u32,
 779 |     pub stop: u32,
 780 |     pub indel_type: IndelType,
 781 |     pub qual: u8,
 782 | }
 783 | 
 784 | #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
 785 | pub enum IndelType {
 786 |     Insertion(u32),
 787 |     Deletion(u32),
 788 |     NotIndel,
 789 | }
 790 | 
 791 | #[inline(always)]
 792 | fn is_insertion(a: Cigar) -> bool {
 793 |     matches!(a, Cigar::Ins(_))
 794 | }
 795 | 
 796 | #[inline(always)]
 797 | fn query(a: Cigar) -> i64 {
 798 |     match a {
 799 |         Cigar::Match(n) | Cigar::SoftClip(n) | Cigar::Ins(n) | Cigar::Diff(n) | Cigar::Equal(n) => {
 800 |             n as i64
 801 |         }
 802 |         _ => 0,
 803 |     }
 804 | }
 805 | #[inline(always)]
 806 | fn reference(a: Cigar) -> i64 {
 807 |     match a {
 808 |         Cigar::Match(n) | Cigar::Del(n) | Cigar::Diff(n) | Cigar::Equal(n) | Cigar::RefSkip(n) => {
 809 |             n as i64
 810 |         }
 811 |         _ => 0,
 812 |     }
 813 | }
 814 | 
 815 | fn indel_coords(
 816 |     cig: &CigarStringView,
 817 |     genomic_min: u32,
 818 |     genomic_max: u32,
 819 |     base_quals: &[u8],
 820 | ) -> Vec<Coordinates> {
 821 |     let mut result: Vec<Coordinates> = Vec::new();
 822 |     let mut start: u32 = cig.pos() as u32;
 823 |     let mut read_i = 0;
 824 | 
 825 |     for c in cig {
 826 |         if start > genomic_max {
 827 |             break;
 828 |         }
 829 |         if start + (reference(*c) as u32) < genomic_min {
 830 |             start += reference(*c) as u32;
 831 |             read_i += query(*c) as usize;
 832 |             continue;
 833 |         }
 834 |         // TODO: handle partial overlap of start with the current cigar op.
 835 |         match c {
 836 |             Cigar::Ins(l) => {
 837 |                 result.push(Coordinates {
 838 |                     start,
 839 |                     stop: start + 1,
 840 |                     indel_type: IndelType::Insertion(*l as u32),
 841 |                     qual: base_quals[read_i],
 842 |                 });
 843 |             }
 844 |             Cigar::Del(d) => {
 845 |                 result.push(Coordinates {
 846 |                     start,
 847 |                     stop: start + *d,
 848 |                     indel_type: IndelType::Deletion(*d as u32),
 849 |                     qual: base_quals[read_i],
 850 |                 });
 851 |             }
 852 |             _ => {}
 853 |         }
 854 |         start += reference(*c) as u32;
 855 |         read_i += query(*c) as usize;
 856 |     }
 857 |     result
 858 | }
 859 | 
 860 | fn find_non_exact(
 861 |     a_indel_coords: &[Coordinates],
 862 |     b_indel_coords: &[Coordinates],
 863 |     result: &mut Vec<Coordinates>,
 864 |     min_base_qual: u8,
 865 | ) {
 866 |     for a in a_indel_coords {
 867 |         if a.qual <= min_base_qual {
 868 |             continue;
 869 |         }
 870 |         match b_indel_coords.binary_search_by(|b| b.cmp(a)) {
 871 |             Ok(_) => {}
 872 |             Err(bi) => {
 873 |                 // we check base-qual (first base) of b as well.
 874 |                 // this is a bit weird, but ensures that at least both
 875 |                 // reads were confident at this site.
 876 |                 if bi < b_indel_coords.len() && b_indel_coords[bi].qual < min_base_qual {
 877 |                     continue;
 878 |                 }
 879 |                 let bq = if bi < b_indel_coords.len() {
 880 |                     b_indel_coords[bi].qual
 881 |                 } else {
 882 |                     u8::MAX
 883 |                 };
 884 |                 // any non-exact matches are errors
 885 |                 result.push(Coordinates {
 886 |                     start: a.start,
 887 |                     stop: a.stop,
 888 |                     indel_type: a.indel_type.clone(),
 889 |                     qual: a.qual.min(bq),
 890 |                 });
 891 |             }
 892 |         }
 893 |     }
 894 | }
 895 | 
 896 | /// Report genomic coordiantes of bases that do not match between the reads.
 897 | fn indel_error_pieces(
 898 |     a: &CigarStringView,
 899 |     b: &CigarStringView,
 900 |     a_qual: &[u8],
 901 |     b_qual: &[u8],
 902 |     min_base_qual: u8,
 903 | ) -> Vec<Coordinates> {
 904 |     let aend = a.end_pos() as u32;
 905 |     let bend = b.end_pos() as u32;
 906 |     //let astart = a.pos() + a.leading_softclips();
 907 |     //let bstart = b.pos() + b.leading_softclips();
 908 |     if aend <= b.pos() as u32 || bend <= a.pos() as u32 {
 909 |         return vec![];
 910 |     }
 911 |     let a_indel_coords = indel_coords(a, b.pos() as u32, bend, a_qual);
 912 |     let b_indel_coords = indel_coords(b, a.pos() as u32, aend, b_qual);
 913 |     if a_indel_coords.is_empty() && b_indel_coords.is_empty() {
 914 |         return vec![];
 915 |     }
 916 | 
 917 |     let mut result: Vec<Coordinates> = Vec::new();
 918 |     find_non_exact(&a_indel_coords, &b_indel_coords, &mut result, min_base_qual);
 919 |     find_non_exact(&b_indel_coords, &a_indel_coords, &mut result, min_base_qual);
 920 |     result
 921 | }
 922 | 
 923 | /// Return mapped parts of each read that overlap the other.
 924 | /// Returns A, B, genome coordiantes.
 925 | fn overlap_pieces(
 926 |     a: &CigarStringView,
 927 |     b: &CigarStringView,
 928 |     a_qual: &[u8],
 929 |     b_qual: &[u8],
 930 |     skip_insertions: bool,
 931 | ) -> Vec<[Coordinates; 3]> {
 932 |     let aend = a.end_pos();
 933 |     let bend = b.end_pos();
 934 |     //let astart = a.pos() + a.leading_softclips();
 935 |     //let bstart = b.pos() + b.leading_softclips();
 936 |     if aend <= b.pos() || bend <= a.pos() {
 937 |         return vec![];
 938 |     }
 939 | 
 940 |     let mut result: Vec<[Coordinates; 3]> = Vec::new();
 941 |     let mut ai: usize = 0;
 942 |     let mut bi: usize = 0;
 943 |     let mut a_genome_pos = a.pos();
 944 |     let mut b_genome_pos = b.pos();
 945 |     let mut a_read_pos = 0i64;
 946 |     let mut b_read_pos = 0i64;
 947 |     while ai < a.len() && bi < b.len() {
 948 |         let a_genome_stop = a_genome_pos + reference(a[ai.min(a.len() - 1)]);
 949 |         let b_genome_stop = b_genome_pos + reference(b[bi.min(b.len() - 1)]);
 950 |         if a_genome_stop < b_genome_pos {
 951 |             if ai < a.len() {
 952 |                 a_genome_pos += reference(a[ai]);
 953 |                 a_read_pos += query(a[ai]);
 954 |                 ai += 1;
 955 |             }
 956 |         } else if b_genome_stop < a_genome_pos {
 957 |             if bi < b.len() {
 958 |                 b_genome_pos += reference(b[bi]);
 959 |                 b_read_pos += query(b[bi]);
 960 |                 bi += 1;
 961 |             }
 962 |         } else {
 963 |             // we have some overlap.
 964 |             // if they both consume query, we can append to our result.
 965 |             let aop = a[ai.min(a.len() - 1)];
 966 |             let bop = b[bi.min(b.len() - 1)];
 967 |             if query(aop) > 0 && query(bop) > 0 {
 968 |                 let genome_start = a_genome_pos.max(b_genome_pos);
 969 |                 let genome_stop = a_genome_stop.min(b_genome_stop);
 970 | 
 971 |                 let mut glen = genome_stop - genome_start;
 972 |                 if glen == 0 && !skip_insertions {
 973 |                     // if they are both the same insertion, then we will evaluate.
 974 |                     // otherwise, we can not.
 975 |                     if aop == bop && is_insertion(aop) {
 976 |                         glen = aop.len() as i64;
 977 |                     }
 978 |                 }
 979 | 
 980 |                 // if glen is 0, we didn't consume any reference, but can have, e.g. both deletions.
 981 |                 if glen > 0 {
 982 |                     //let a_over = aop.len() as i64 - (genome_start - a_genome_pos);
 983 |                     //let b_over = bop.len() as i64 - (genome_start - b_genome_pos);
 984 | 
 985 |                     // glen can be 0 if, e.g. both reads end with soft-clip.
 986 |                     let a_over = genome_start - a_genome_pos;
 987 |                     let b_over = genome_start - b_genome_pos;
 988 | 
 989 |                     result.push([
 990 |                         Coordinates {
 991 |                             start: (a_read_pos + a_over) as u32,
 992 |                             stop: (a_read_pos + a_over + glen) as u32,
 993 |                             indel_type: match aop {
 994 |                                 Cigar::Ins(l) => IndelType::Insertion(l as u32),
 995 |                                 Cigar::Del(l) => IndelType::Deletion(l as u32),
 996 |                                 _ => IndelType::NotIndel,
 997 |                             },
 998 |                             qual: a_qual[a_read_pos as usize + a_over as usize],
 999 |                         },
1000 |                         Coordinates {
1001 |                             start: (b_read_pos + b_over) as u32,
1002 |                             stop: (b_read_pos + b_over + glen) as u32,
1003 |                             indel_type: match bop {
1004 |                                 Cigar::Ins(l) => IndelType::Insertion(l as u32),
1005 |                                 Cigar::Del(l) => IndelType::Deletion(l as u32),
1006 |                                 _ => IndelType::NotIndel,
1007 |                             },
1008 |                             qual: b_qual[b_read_pos as usize + b_over as usize],
1009 |                         },
1010 |                         Coordinates {
1011 |                             start: genome_start as u32,
1012 |                             stop: genome_stop as u32,
1013 |                             indel_type: match aop {
1014 |                                 Cigar::Ins(l) => IndelType::Insertion(l as u32),
1015 |                                 Cigar::Del(l) => IndelType::Deletion(l as u32),
1016 |                                 _ => IndelType::NotIndel,
1017 |                             },
1018 |                             qual: a_qual[a_read_pos as usize + a_over as usize],
1019 |                         },
1020 |                     ])
1021 |                 }
1022 |             }
1023 |             // we had some overlap. now we increment the lowest genome pos by end.
1024 |             if a_genome_stop <= b_genome_stop && ai < a.len() {
1025 |                 a_genome_pos += reference(a[ai]);
1026 |                 a_read_pos += query(a[ai]);
1027 |                 ai += 1;
1028 |             }
1029 |             if b_genome_stop <= a_genome_stop && bi < b.len() {
1030 |                 b_genome_pos += reference(b[bi]);
1031 |                 b_read_pos += query(b[bi]);
1032 |                 bi += 1;
1033 |             }
1034 |         }
1035 |     }
1036 | 
1037 |     result
1038 | }
1039 | 
1040 | #[cfg(test)]
1041 | mod tests {
1042 |     use super::*;
1043 |     use rust_htslib::bam::record::{Cigar, CigarString};
1044 | 
1045 |     #[test]
1046 |     fn test_different_alignments() {
1047 |         let a = CigarString(vec![Cigar::Match(5), Cigar::Ins(3), Cigar::Match(5)]).into_view(0);
1048 |         let b = CigarString(vec![Cigar::Match(13)]).into_view(0);
1049 |         let a_bqs = vec![30u8; 20];
1050 |         let b_bqs = vec![30u8; 20];
1051 |         let r = overlap_pieces(&a, &b, &a_bqs, &b_bqs, true);
1052 |         dbg!(&r);
1053 |     }
1054 | 
1055 |     #[test]
1056 |     fn test_same_insertion() {
1057 |         let a = CigarString(vec![Cigar::Match(10), Cigar::Ins(8), Cigar::Match(10)]).into_view(5);
1058 |         let b = CigarString(vec![Cigar::Match(10), Cigar::Ins(8), Cigar::Match(10)]).into_view(5);
1059 |         let a_bqs = vec![30u8; 20];
1060 |         let b_bqs = vec![30u8; 20];
1061 |         let r = overlap_pieces(&a, &b, &a_bqs, &b_bqs, false);
1062 |         dbg!(&r);
1063 |     }
1064 | 
1065 |     #[test]
1066 |     fn test_many_contained() {
1067 |         let a = CigarString(vec![Cigar::Match(100)]).into_view(10);
1068 |         let b = CigarString(vec![
1069 |             Cigar::Match(10),
1070 |             Cigar::Match(11),
1071 |             Cigar::Match(12),
1072 |             Cigar::Match(13),
1073 |         ])
1074 |         .into_view(0);
1075 |         let a_bqs = vec![30u8; 200];
1076 |         let b_bqs = vec![30u8; 200];
1077 |         let r = overlap_pieces(&a, &b, &a_bqs, &b_bqs, true);
1078 |         let expected = [
1079 |             [
1080 |                 Coordinates {
1081 |                     start: 0,
1082 |                     stop: 11,
1083 |                     indel_type: IndelType::NotIndel,
1084 |                     qual: 30,
1085 |                 },
1086 |                 Coordinates {
1087 |                     start: 10,
1088 |                     stop: 21,
1089 |                     indel_type: IndelType::NotIndel,
1090 |                     qual: 30,
1091 |                 },
1092 |                 Coordinates {
1093 |                     start: 10,
1094 |                     stop: 21,
1095 |                     indel_type: IndelType::NotIndel,
1096 |                     qual: 30,
1097 |                 },
1098 |             ],
1099 |             [
1100 |                 Coordinates {
1101 |                     start: 11,
1102 |                     stop: 23,
1103 |                     indel_type: IndelType::NotIndel,
1104 |                     qual: 30,
1105 |                 },
1106 |                 Coordinates {
1107 |                     start: 21,
1108 |                     stop: 33,
1109 |                     indel_type: IndelType::NotIndel,
1110 |                     qual: 30,
1111 |                 },
1112 |                 Coordinates {
1113 |                     start: 21,
1114 |                     stop: 33,
1115 |                     indel_type: IndelType::NotIndel,
1116 |                     qual: 30,
1117 |                 },
1118 |             ],
1119 |             [
1120 |                 Coordinates {
1121 |                     start: 23,
1122 |                     stop: 36,
1123 |                     indel_type: IndelType::NotIndel,
1124 |                     qual: 30,
1125 |                 },
1126 |                 Coordinates {
1127 |                     start: 33,
1128 |                     stop: 46,
1129 |                     indel_type: IndelType::NotIndel,
1130 |                     qual: 30,
1131 |                 },
1132 |                 Coordinates {
1133 |                     start: 33,
1134 |                     stop: 46,
1135 |                     indel_type: IndelType::NotIndel,
1136 |                     qual: 30,
1137 |                 },
1138 |             ],
1139 |         ];
1140 |         assert_eq!(r, expected);
1141 |     }
1142 | 
1143 |     #[test]
1144 |     fn test_simple_overlap() {
1145 |         let a = CigarString(vec![
1146 |             Cigar::Match(10),
1147 |             Cigar::Match(80),
1148 |             Cigar::SoftClip(10),
1149 |         ])
1150 |         .into_view(8);
1151 |         let b = CigarString(vec![
1152 |             Cigar::Match(70),
1153 |             Cigar::Match(40),
1154 |             Cigar::SoftClip(10),
1155 |         ])
1156 |         .into_view(5);
1157 | 
1158 |         let a_bqs = vec![30u8; 100];
1159 |         let b_bqs = vec![30u8; 100];
1160 |         let r = overlap_pieces(&a, &b, &a_bqs, &b_bqs, true);
1161 | 
1162 |         let expected = [
1163 |             [
1164 |                 Coordinates {
1165 |                     start: 0,
1166 |                     stop: 10,
1167 |                     indel_type: IndelType::NotIndel,
1168 |                     qual: 30,
1169 |                 },
1170 |                 Coordinates {
1171 |                     start: 3,
1172 |                     stop: 13,
1173 |                     indel_type: IndelType::NotIndel,
1174 |                     qual: 30,
1175 |                 },
1176 |                 Coordinates {
1177 |                     start: 8,
1178 |                     stop: 18,
1179 |                     indel_type: IndelType::NotIndel,
1180 |                     qual: 30,
1181 |                 },
1182 |             ],
1183 |             [
1184 |                 Coordinates {
1185 |                     start: 10,
1186 |                     stop: 67,
1187 |                     indel_type: IndelType::NotIndel,
1188 |                     qual: 30,
1189 |                 },
1190 |                 Coordinates {
1191 |                     start: 13,
1192 |                     stop: 70,
1193 |                     indel_type: IndelType::NotIndel,
1194 |                     qual: 30,
1195 |                 },
1196 |                 Coordinates {
1197 |                     start: 18,
1198 |                     stop: 75,
1199 |                     indel_type: IndelType::NotIndel,
1200 |                     qual: 30,
1201 |                 },
1202 |             ],
1203 |             [
1204 |                 Coordinates {
1205 |                     start: 67,
1206 |                     stop: 90,
1207 |                     indel_type: IndelType::NotIndel,
1208 |                     qual: 30,
1209 |                 },
1210 |                 Coordinates {
1211 |                     start: 70,
1212 |                     stop: 93,
1213 |                     indel_type: IndelType::NotIndel,
1214 |                     qual: 30,
1215 |                 },
1216 |                 Coordinates {
1217 |                     start: 75,
1218 |                     stop: 98,
1219 |                     indel_type: IndelType::NotIndel,
1220 |                     qual: 30,
1221 |                 },
1222 |             ],
1223 |         ];
1224 | 
1225 |         assert_eq!(r, expected);
1226 |     }
1227 | 
1228 |     #[test]
1229 |     fn test_size() {
1230 |         assert_eq!(std::mem::size_of::<Position>(), 8);
1231 |     }
1232 | 
1233 |     #[test]
1234 |     fn test_min_abs_hp_distance() {
1235 |         // Both distances available - choose minimum absolute value
1236 |         assert_eq!(min_abs_hp_distance(Some(-1), Some(2)), Some(-1));
1237 |         assert_eq!(min_abs_hp_distance(Some(2), Some(-1)), Some(-1));
1238 |         assert_eq!(min_abs_hp_distance(Some(-3), Some(-2)), Some(-2));
1239 |         assert_eq!(min_abs_hp_distance(Some(3), Some(2)), Some(2));
1240 | 
1241 |         // Equal absolute values - choose first one
1242 |         assert_eq!(min_abs_hp_distance(Some(-2), Some(2)), Some(-2));
1243 |         assert_eq!(min_abs_hp_distance(Some(2), Some(-2)), Some(2));
1244 | 
1245 |         // Only one distance available
1246 |         assert_eq!(min_abs_hp_distance(Some(5), None), Some(5));
1247 |         assert_eq!(min_abs_hp_distance(None, Some(-3)), Some(-3));
1248 | 
1249 |         // No distances available
1250 |         assert_eq!(min_abs_hp_distance(None, None), None);
1251 |     }
1252 | 
1253 |     #[test]
1254 |     fn test_indel_error_pieces() {
1255 |         let cigar_a =
1256 |             CigarString(vec![Cigar::Match(10), Cigar::Del(2), Cigar::Match(5)]).into_view(5);
1257 |         let cigar_b =
1258 |             CigarString(vec![Cigar::Match(10), Cigar::Del(3), Cigar::Match(4)]).into_view(10);
1259 |         let expected = vec![
1260 |             Coordinates {
1261 |                 start: 15,
1262 |                 stop: 17,
1263 |                 indel_type: IndelType::Deletion(2),
1264 |                 qual: 30,
1265 |             },
1266 |             Coordinates {
1267 |                 start: 20,
1268 |                 stop: 23,
1269 |                 indel_type: IndelType::Deletion(3),
1270 |                 qual: 30,
1271 |             },
1272 |         ];
1273 |         let a_bqs = vec![30u8; 20];
1274 |         let b_bqs = vec![30u8; 20];
1275 |         assert_eq!(
1276 |             indel_error_pieces(&cigar_a, &cigar_b, &a_bqs, &b_bqs, 15),
1277 |             expected
1278 |         );
1279 |     }
1280 | 
1281 |     #[test]
1282 |     fn test_indel_error_pieces_overlap() {
1283 |         let cigar_a =
1284 |             CigarString(vec![Cigar::Match(10), Cigar::Del(3), Cigar::Match(5)]).into_view(10);
1285 |         let cigar_b =
1286 |             CigarString(vec![Cigar::Match(10), Cigar::Ins(3), Cigar::Match(5)]).into_view(10);
1287 |         let expected = vec![
1288 |             Coordinates {
1289 |                 start: 20,
1290 |                 stop: 23,
1291 |                 indel_type: IndelType::Deletion(3),
1292 |                 qual: 30,
1293 |             },
1294 |             Coordinates {
1295 |                 start: 20,
1296 |                 stop: 21,
1297 |                 indel_type: IndelType::Insertion(3),
1298 |                 qual: 30,
1299 |             },
1300 |         ];
1301 |         let a_bqs = vec![30u8; 20];
1302 |         let b_bqs = vec![30u8; 20];
1303 |         assert_eq!(
1304 |             indel_error_pieces(&cigar_a, &cigar_b, &a_bqs, &b_bqs, 10),
1305 |             expected
1306 |         );
1307 |     }
1308 | 
1309 |     #[test]
1310 |     fn test_indel_error_quals() {
1311 |         let cigar_a =
1312 |             CigarString(vec![Cigar::Match(10), Cigar::Del(3), Cigar::Match(5)]).into_view(10);
1313 |         let cigar_b =
1314 |             CigarString(vec![Cigar::Match(10), Cigar::Ins(3), Cigar::Match(5)]).into_view(10);
1315 |         let expected = vec![];
1316 |         let a_bqs = vec![10u8; 20];
1317 |         let b_bqs = vec![10u8; 20];
1318 |         assert_eq!(
1319 |             indel_error_pieces(&cigar_a, &cigar_b, &a_bqs, &b_bqs, 60),
1320 |             expected
1321 |         );
1322 |     }
1323 | }
1324 | 


--------------------------------------------------------------------------------
/src/homopolymer.rs:
--------------------------------------------------------------------------------
  1 | pub(crate) const HP_REGEX: &str = "A{3,}|C{3,}|G{3,}|T{3,}";
  2 | use regex::Regex;
  3 | use rust_lapper::{Interval, Lapper};
  4 | 
  5 | /// Find the homopolymers and return an interval tree with the positions of the homopolymers.
  6 | pub(crate) fn find_homopolymers(seq: &[u8], re: &Regex) -> Lapper<u32, u8> {
  7 |     let seq_str = unsafe { std::str::from_utf8_unchecked(seq) };
  8 |     let matches = re.find_iter(seq_str);
  9 |     let intervals: Vec<Interval<u32, u8>> = matches
 10 |         .map(|m| Interval {
 11 |             start: m.range().start as u32,
 12 |             stop: m.range().end as u32,
 13 |             val: 0,
 14 |         })
 15 |         .collect();
 16 |     log::info!("found {} homopolymers with regex: {re}", intervals.len());
 17 |     Lapper::new(intervals)
 18 | }
 19 | 
 20 | /// return a negative number if the hp is before the position, accounting for strand.
 21 | /// and 0 if the hp contains the position, otherwise a positive number.
 22 | /// Returns None if the distance is greater than MAX_HP_DIST
 23 | ///  hphphp---pos---->
 24 | ///
 25 | pub(crate) fn hp_distance(
 26 |     hps: Option<&[&Interval<u32, u8>]>,
 27 |     pos: u32,
 28 |     read_start: u32,
 29 |     read_stop: u32,
 30 |     _strand: i8,
 31 | ) -> Option<i16> {
 32 |     let mut dist: Option<i16> = None;
 33 |     if pos < read_start || pos > read_stop {
 34 |         return dist;
 35 |     }
 36 | 
 37 |     // strand will be 1 for forward, -1 for reverse
 38 |     for hp in hps.map(|hps| hps.iter()).unwrap_or_default() {
 39 |         // first we check if the hp is within 3 bases of the read start or stop.
 40 |         // since this could truncate the hp and not affect the read.
 41 |         // cases to exclude:
 42 |         // read: ----------->
 43 |         // hp:   AAAAA
 44 |         // pos:
 45 | 
 46 |         if hp.stop >= read_start && hp.stop < read_start + 3 {
 47 |             continue;
 48 |         }
 49 |         if hp.start <= read_stop && hp.start > read_stop - 3 {
 50 |             continue;
 51 |         }
 52 | 
 53 |         assert!(
 54 |             pos >= read_start && pos <= read_stop,
 55 |             "pos: {}, read_start: {}, read_stop: {}",
 56 |             pos,
 57 |             read_start,
 58 |             read_stop
 59 |         );
 60 | 
 61 |         let d = if pos < hp.start {
 62 |             hp.start as i64 - pos as i64
 63 |         } else if pos > hp.stop {
 64 |             -(pos as i64 - hp.stop as i64)
 65 |         } else {
 66 |             0i64
 67 |         };
 68 |         // now we check distance of pos to hp.
 69 |         if d < -crate::fraguracy::MAX_HP_DIST as i64 || d > crate::fraguracy::MAX_HP_DIST as i64 {
 70 |             continue;
 71 |         }
 72 | 
 73 |         let d = d as i16;
 74 |         if dist.is_none() || d.abs() < dist.unwrap().abs() {
 75 |             dist = Some(d);
 76 |         }
 77 |     }
 78 |     dist
 79 | }
 80 | 
 81 | #[cfg(test)]
 82 | mod tests {
 83 |     use super::*;
 84 | 
 85 |     #[test]
 86 |     fn test_find_homopolymers() {
 87 |         // Test sequence with various homopolymers
 88 |         let seq = b"AAATCCCGAAAGGGGTTTT";
 89 |         let re = Regex::new(HP_REGEX).expect("invalid regex");
 90 |         let homopolymers = find_homopolymers(seq, &re);
 91 | 
 92 |         // Convert results to vec for easier testing
 93 |         let results: Vec<_> = homopolymers.iter().collect();
 94 | 
 95 |         // Expected homopolymers: AAA, CCC, AAAA, GGGG, TTTT
 96 |         assert_eq!(results.len(), 5);
 97 | 
 98 |         // Check each homopolymer position
 99 |         assert_eq!(results[0].start, 0); // AAA
100 |         assert_eq!(results[0].stop, 3);
101 | 
102 |         assert_eq!(results[1].start, 4); // CCC
103 |         assert_eq!(results[1].stop, 7);
104 | 
105 |         assert_eq!(results[2].start, 8); // AAAA
106 |         assert_eq!(results[2].stop, 11);
107 | 
108 |         assert_eq!(results[3].start, 11); // GGGG
109 |         assert_eq!(results[3].stop, 15);
110 | 
111 |         assert_eq!(results[4].start, 15); // TTTT
112 |         assert_eq!(results[4].stop, 19);
113 |     }
114 | 
115 |     #[test]
116 |     fn test_hp_distance() {
117 |         let hp = vec![Interval {
118 |             start: 9,
119 |             stop: 12,
120 |             val: 0,
121 |         }];
122 |         let hp_refs: Vec<&Interval<u32, u8>> = hp.iter().collect();
123 | 
124 |         // Test homopolymer near read end
125 |         assert_eq!(hp_distance(Some(&hp_refs), 11, 10, 20, 1,), None);
126 | 
127 |         // Test forward strand
128 |         assert_eq!(hp_distance(Some(&hp_refs), 15, 5, 20, 1,), Some(-3));
129 | 
130 |         // Test reverse strand
131 |         assert_eq!(hp_distance(Some(&hp_refs), 15, 5, 20, -1,), Some(-3));
132 | 
133 |         // Test distant homopolymer
134 |         assert_eq!(
135 |             hp_distance(
136 |                 Some(&hp_refs),
137 |                 115 + crate::fraguracy::MAX_HP_DIST as u32,
138 |                 105 + crate::fraguracy::MAX_HP_DIST as u32,
139 |                 120 + crate::fraguracy::MAX_HP_DIST as u32,
140 |                 1,
141 |             ),
142 |             None
143 |         );
144 |     }
145 | }
146 | 


--------------------------------------------------------------------------------
/src/lua.rs:
--------------------------------------------------------------------------------
  1 | use std::{collections::HashMap, ops::AddAssign};
  2 | 
  3 | use anyhow::Result;
  4 | use mlua::{prelude::LuaError, Function, Lua, UserData, UserDataFields, UserDataMethods, Value};
  5 | use rust_htslib::bam::{
  6 |     record::{Aux, Cigar},
  7 |     Record,
  8 | };
  9 | 
 10 | #[derive(Clone)]
 11 | pub struct LuaReadFilter {
 12 |     lua: Lua,
 13 |     filter_func: Function,
 14 | }
 15 | 
 16 | pub struct Flags {
 17 |     flag: u16,
 18 | }
 19 | 
 20 | impl UserData for Flags {
 21 |     fn add_fields<M: UserDataFields<Self>>(fields: &mut M) {
 22 |         fields.add_field_method_get("paired", |_, this| Ok((this.flag & 0x1) != 0));
 23 |         fields.add_field_method_get("proper_pair", |_, this| Ok((this.flag & 0x2) != 0));
 24 |         fields.add_field_method_get("unmapped", |_, this| Ok((this.flag & 0x4) != 0));
 25 |         fields.add_field_method_get("mate_unmapped", |_, this| Ok((this.flag & 0x8) != 0));
 26 |         fields.add_field_method_get("reverse", |_, this| Ok((this.flag & 0x10) != 0));
 27 |         fields.add_field_method_get("forward", |_, this| Ok((this.flag & 0x10) == 0));
 28 |         fields.add_field_method_get("mate_reverse", |_, this| Ok((this.flag & 0x20) != 0));
 29 |         fields.add_field_method_get("mate_forward", |_, this| Ok((this.flag & 0x20) == 0));
 30 |         fields.add_field_method_get("read_1", |_, this| Ok((this.flag & 0x40) != 0));
 31 |         fields.add_field_method_get("read_2", |_, this| Ok((this.flag & 0x80) != 0));
 32 |         fields.add_field_method_get("secondary", |_, this| Ok((this.flag & 0x100) != 0));
 33 |         fields.add_field_method_get("primary", |_, this| Ok((this.flag & 0x100) == 0));
 34 |         fields.add_field_method_get("qcfail", |_, this| Ok((this.flag & 0x200) != 0));
 35 |         fields.add_field_method_get("duplicate", |_, this| Ok((this.flag & 0x400) != 0));
 36 |         fields.add_field_method_get("supplementary", |_, this| Ok((this.flag & 0x800) != 0));
 37 | 
 38 |         fields.add_field_method_get("flag", |_, this| Ok(this.flag));
 39 |     }
 40 | }
 41 | 
 42 | impl LuaReadFilter {
 43 |     pub fn skip_read(&self, read: &Record) -> Result<bool> {
 44 |         let r = self.lua.scope(|scope| {
 45 |             let globals = self.lua.globals();
 46 |             let user_data = scope.create_any_userdata_ref(read)?;
 47 |             globals.set("read", user_data).expect("failed to set read");
 48 |             self.filter_func.call::<bool>(())
 49 |         })?;
 50 |         Ok(r)
 51 |     }
 52 | 
 53 |     pub fn new(expression: &str, lua: Lua) -> Result<Self> {
 54 |         if !expression.contains("return") {
 55 |             return Err(anyhow::anyhow!(
 56 |                 "expression must contain a return statement"
 57 |             ));
 58 |         }
 59 |         let filter_func = lua.load(expression).into_function()?;
 60 | 
 61 |         lua.register_userdata_type::<Record>(|reg| {
 62 |             reg.add_field_method_get("mapping_quality", |_, this| Ok(this.mapq()));
 63 |             reg.add_field_method_get("flags", |_, this| Ok(Flags { flag: this.flags() }));
 64 |             reg.add_field_method_get("tid", |_, this| Ok(this.tid()));
 65 |             reg.add_field_method_get("start", |_, this| Ok(this.pos()));
 66 |             reg.add_field_method_get("stop", |_, this| Ok(this.cigar().end_pos()));
 67 |             reg.add_field_method_get("length", |_, this| Ok(this.seq_len()));
 68 |             reg.add_field_method_get("insert_size", |_, this| Ok(this.insert_size()));
 69 |             reg.add_field_method_get("qname", |_, this| {
 70 |                 let q = this.qname();
 71 |                 Ok(std::str::from_utf8(q).unwrap_or("").to_string())
 72 |             });
 73 |             reg.add_field_method_get("sequence", |_, this| {
 74 |                 let seq = this.seq();
 75 |                 Ok(std::str::from_utf8(&seq.as_bytes())
 76 |                     .unwrap_or("")
 77 |                     .to_string())
 78 |             });
 79 | 
 80 |             reg.add_field_method_get("soft_clips_3_prime", |_, this| {
 81 |                 let cigar = this.cigar();
 82 |                 if this.is_reverse() {
 83 |                     Ok(cigar.leading_softclips())
 84 |                 } else {
 85 |                     Ok(cigar.trailing_softclips())
 86 |                 }
 87 |             });
 88 |             reg.add_field_method_get("soft_clips_5_prime", |_, this| {
 89 |                 let cigar = this.cigar();
 90 |                 if this.is_reverse() {
 91 |                     Ok(cigar.trailing_softclips())
 92 |                 } else {
 93 |                     Ok(cigar.leading_softclips())
 94 |                 }
 95 |             });
 96 |             reg.add_field_method_get("average_base_quality", |_, this| {
 97 |                 let qual = this.qual();
 98 |                 let sum = qual.iter().map(|q| *q as u64).sum::<u64>();
 99 |                 let count = qual.len();
100 |                 Ok(sum as f64 / count as f64)
101 |             });
102 | 
103 |             reg.add_method("tag", |lua, this: &Record, tag: String| {
104 |                 let tag = tag.as_bytes();
105 |                 let aux = this.aux(tag).map_err(LuaError::external)?;
106 |                 let lua_val: Value = match aux {
107 |                     Aux::Char(v) => Value::String(lua.create_string(&[v])?),
108 |                     Aux::I8(v) => Value::Number(v as f64),
109 |                     Aux::U8(v) => Value::Number(v as f64),
110 |                     Aux::I16(v) => Value::Number(v as f64),
111 |                     Aux::U16(v) => Value::Number(v as f64),
112 |                     Aux::I32(v) => Value::Number(v as f64),
113 |                     Aux::U32(v) => Value::Number(v as f64),
114 |                     Aux::Float(v) => Value::Number(v as f64),
115 |                     Aux::Double(v) => Value::Number(v as f64),
116 |                     Aux::String(v) => Value::String(lua.create_string(&v)?),
117 |                     Aux::ArrayFloat(v) => {
118 |                         let mut arr = Vec::new();
119 |                         for i in 0..v.len() {
120 |                             arr.push(v.get(i).unwrap_or(f32::NAN) as f32);
121 |                         }
122 |                         Value::Table(lua.create_sequence_from(arr)?)
123 |                     }
124 |                     Aux::ArrayI32(v) => {
125 |                         let mut arr = Vec::new();
126 |                         for i in 0..v.len() {
127 |                             arr.push(v.get(i).unwrap_or(i32::MIN) as i32);
128 |                         }
129 |                         Value::Table(lua.create_sequence_from(arr)?)
130 |                     }
131 |                     Aux::ArrayI8(v) => {
132 |                         let mut arr = Vec::new();
133 |                         for i in 0..v.len() {
134 |                             arr.push(v.get(i).unwrap_or(i8::MIN) as i8);
135 |                         }
136 |                         Value::Table(lua.create_sequence_from(arr)?)
137 |                     }
138 |                     Aux::ArrayU8(v) => {
139 |                         let mut arr = Vec::new();
140 |                         for i in 0..v.len() {
141 |                             arr.push(v.get(i).unwrap_or(u8::MIN) as u8);
142 |                         }
143 |                         Value::Table(lua.create_sequence_from(arr)?)
144 |                     }
145 |                     Aux::ArrayU16(v) => {
146 |                         let mut arr = Vec::new();
147 |                         for i in 0..v.len() {
148 |                             arr.push(v.get(i).unwrap_or(u16::MIN) as u16);
149 |                         }
150 |                         Value::Table(lua.create_sequence_from(arr)?)
151 |                     }
152 |                     Aux::ArrayU32(v) => {
153 |                         let mut arr = Vec::new();
154 |                         for i in 0..v.len() {
155 |                             arr.push(v.get(i).unwrap_or(u32::MIN) as u32);
156 |                         }
157 |                         Value::Table(lua.create_sequence_from(arr)?)
158 |                     }
159 |                     Aux::ArrayI16(v) => {
160 |                         let mut arr = Vec::new();
161 |                         for i in 0..v.len() {
162 |                             arr.push(v.get(i).unwrap_or(i16::MIN) as i16);
163 |                         }
164 |                         Value::Table(lua.create_sequence_from(arr)?)
165 |                     }
166 |                     Aux::HexByteArray(v) => {
167 |                         let lstr = String::from_utf8_lossy(v.as_bytes()).to_string();
168 |                         Value::String(lua.create_string(&lstr)?)
169 |                     }
170 |                 };
171 |                 Ok(Some(lua_val))
172 |             });
173 |             /*
174 |             reg.add_field_function_get("bq", |_, this| {
175 |                 let qpos: usize = match this.named_user_value("qpos") {
176 |                     Ok(qpos) => qpos,
177 |                     Err(_) => {
178 |                         return Ok(-1);
179 |                     }
180 |                 };
181 |                 let this = this.borrow::<Record>()?;
182 |                 Ok(this.qual()[qpos] as i32)
183 |             });
184 |             reg.add_field_function_get("distance_from_5prime", |_, this| {
185 |                 let qpos: usize = match this.named_user_value("qpos") {
186 |                     Ok(qpos) => qpos,
187 |                     Err(_) => {
188 |                         return Ok(-1);
189 |                     }
190 |                 };
191 |                 let this = this.borrow::<Record>()?;
192 |                 if this.is_reverse() {
193 |                     Ok(this.seq_len() as i32 - qpos as i32)
194 |                 } else {
195 |                     Ok(qpos as i32)
196 |                 }
197 |             });
198 |             reg.add_field_function_get("distance_from_3prime", |_, this| {
199 |                 let qpos: usize = match this.named_user_value("qpos") {
200 |                     Ok(qpos) => qpos,
201 |                     Err(_) => {
202 |                         return Ok(usize::MAX);
203 |                     }
204 |                 };
205 |                 let this = this.borrow::<Record>()?;
206 |                 if this.is_reverse() {
207 |                     Ok(qpos)
208 |                 } else {
209 |                     Ok(this.seq_len() - qpos)
210 |                 }
211 |             });
212 |             */
213 |             // count the number of A, C, G, T, N in the read. Always capitalize and return a table
214 |             reg.add_field_method_get("base_counts", |_, this| {
215 |                 let seq = this.seq();
216 |                 let mut counts = HashMap::new();
217 |                 for i in 0..seq.len() {
218 |                     let base = seq[i].to_ascii_uppercase();
219 |                     counts.entry(base).or_insert(0).add_assign(1);
220 |                 }
221 |                 Ok(counts)
222 |             });
223 |             reg.add_field_method_get("n_proportion", |_, this| {
224 |                 let seq = this.seq();
225 |                 let mut count = 0;
226 |                 for i in 0..seq.len() {
227 |                     let base = seq[i].to_ascii_uppercase();
228 |                     if base == b'N' {
229 |                         count += 1;
230 |                     }
231 |                 }
232 |                 Ok(count as f64 / seq.len() as f64)
233 |             });
234 | 
235 |             reg.add_method("n_proportion_3_prime", |_, this, n_bases: usize| {
236 |                 let seq = this.seq();
237 |                 let mut count = 0;
238 |                 let reverse = this.is_reverse();
239 |                 for i in 0..n_bases {
240 |                     let base =
241 |                         seq[if reverse { i } else { seq.len() - 1 - i }].to_ascii_uppercase();
242 |                     if base == b'N' {
243 |                         count += 1;
244 |                     }
245 |                 }
246 |                 Ok(count as f64 / n_bases as f64)
247 |             });
248 | 
249 |             reg.add_method("n_proportion_5_prime", |_, this, n_bases: usize| {
250 |                 let seq = this.seq();
251 |                 let mut count = 0;
252 |                 let reverse = this.is_reverse();
253 |                 for i in 0..n_bases {
254 |                     let base =
255 |                         seq[if reverse { seq.len() - 1 - i } else { i }].to_ascii_uppercase();
256 |                     if base == b'N' {
257 |                         count += 1;
258 |                     }
259 |                 }
260 |                 Ok(count as f64 / n_bases as f64)
261 |             });
262 | 
263 |             reg.add_field_method_get("indel_count", |_, this| {
264 |                 let cigar = this.cigar();
265 |                 let mut count = 0;
266 |                 for op in cigar.iter() {
267 |                     count += match op {
268 |                         Cigar::Ins(_len) => 1,
269 |                         Cigar::Del(_len) => 1,
270 |                         _ => 0,
271 |                     }
272 |                 }
273 |                 Ok(count)
274 |             });
275 |         })?;
276 | 
277 |         Ok(Self { lua, filter_func })
278 |     }
279 | }
280 | 


--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
  1 | mod combine_counts;
  2 | mod combine_errors;
  3 | mod homopolymer;
  4 | 
  5 | mod files;
  6 | mod fraguracy;
  7 | mod lua;
  8 | 
  9 | //mod plot;
 10 | #[macro_use]
 11 | extern crate lazy_static;
 12 | use clap::{Parser, Subcommand};
 13 | use fraguracy::ConfidenceInterval;
 14 | use homopolymer::find_homopolymers;
 15 | use linear_map::LinearMap;
 16 | use regex::Regex;
 17 | 
 18 | use rust_lapper::Lapper;
 19 | 
 20 | use std::io::BufRead;
 21 | 
 22 | use crate::files::Iv;
 23 | use std::collections::HashMap;
 24 | use std::path::PathBuf;
 25 | 
 26 | use rust_htslib::bam;
 27 | use rust_htslib::bam::{IndexedReader, Read, Reader};
 28 | use rust_htslib::faidx;
 29 | use rustc_hash::FxHashMap;
 30 | 
 31 | use rayon::prelude::*;
 32 | 
 33 | use crate::fraguracy::Stat;
 34 | 
 35 | use std::env;
 36 | use std::str;
 37 | 
 38 | lazy_static! {
 39 |     static ref EMPTY_LAPPER: Lapper<u32, u8> = Lapper::new(Vec::new());
 40 | }
 41 | 
 42 | #[derive(Debug, Parser)]
 43 | #[command(name = "fraguracy")]
 44 | #[command(
 45 |     version,
 46 |     about = "fraguracy: unbiased error profile analysis for short read sequencing",
 47 |     author = "Brent S Pedersen",
 48 |     help_template = "{about}\nversion:{version}\n\n{usage-heading} {usage} \n\nOPTIONS:\n{options}\n\n\x1b[1m\x1b[4mCOMMANDS:\x1b[0m\n{subcommands}"
 49 | )]
 50 | #[command(arg_required_else_help = true)]
 51 | struct Cli {
 52 |     #[command(subcommand)]
 53 |     command: Commands,
 54 | }
 55 | 
 56 | #[derive(Debug, Subcommand)]
 57 | enum Commands {
 58 |     #[command(
 59 |         arg_required_else_help = true,
 60 |         about = "combine error bed files from extract. file names are used to determine if they are indel errors or base errors."
 61 |     )]
 62 |     CombineErrors {
 63 |         #[arg(
 64 |             short,
 65 |             long,
 66 |             required = true,
 67 |             help = "path for to fai (not fasta) file"
 68 |         )]
 69 |         fai_path: PathBuf,
 70 | 
 71 |         #[arg(help = "path to error bed files from extract")]
 72 |         errors: Vec<PathBuf>,
 73 | 
 74 |         #[arg(
 75 |             short,
 76 |             long,
 77 |             default_value_t = String::from("fraguracy-combined-errors.bed"),
 78 |             help = "path for output bed file"
 79 |         )]
 80 |         output_path: String,
 81 |     },
 82 | 
 83 |     #[command(
 84 |         arg_required_else_help = true,
 85 |         about = "combine counts.txt files from extract"
 86 |     )]
 87 |     CombineCounts {
 88 |         #[arg(help = "path to counts.txt files from extract")]
 89 |         counts: Vec<PathBuf>,
 90 | 
 91 |         #[arg(
 92 |             short,
 93 |             long,
 94 |             default_value_t = String::from("fraguracy-combined-counts.txt"),
 95 |             help = "path for output counts file"
 96 |         )]
 97 |         output_path: String,
 98 |     },
 99 | 
100 |     #[command(
101 |         arg_required_else_help = true,
102 |         about = "error profile pair overlaps in bam/cram"
103 |     )]
104 |     Extract {
105 |         #[arg(
106 |             short,
107 |             long,
108 |             help = "fasta for use with crams and/or to use as 'truth'"
109 |         )]
110 |         fasta: Option<PathBuf>,
111 |         #[arg(required = true, help = "bam/cram files to analyze")]
112 |         bams: Vec<PathBuf>,
113 |         #[arg(
114 |             short,
115 |             long,
116 |             default_value_t = String::from("fraguracy-"),
117 |             help = "prefix for output files"
118 |         )]
119 |         output_prefix: String,
120 | 
121 |         #[arg(short = 'C', long, help = "restrict analysis to this chromosome")]
122 |         chromosome: Option<String>,
123 | 
124 |         #[arg(
125 |             short,
126 |             long,
127 |             help = "restrict analysis to the regions given in this BED file"
128 |         )]
129 |         regions: Option<PathBuf>,
130 | 
131 |         #[arg(
132 |             short,
133 |             long,
134 |             help = "exclude from analysis the regions given in this BED file"
135 |         )]
136 |         exclude_regions: Option<PathBuf>,
137 | 
138 |         #[arg(
139 |             short = 'l',
140 |             long,
141 |             help = "optional lua expression to filter reads. returns true to skip read. e.g. 'return read.flags.secondary or read.flags.supplementary'."
142 |         )]
143 |         lua_expression: Option<String>,
144 | 
145 |         #[arg(
146 |             short,
147 |             long,
148 |             default_value_t = 151,
149 |             help = "indicate the maximum read length in the alignment file"
150 |         )]
151 |         max_read_length: u32,
152 |         #[arg(
153 |             short,
154 |             long,
155 |             default_value_t = 3,
156 |             help = "parition the read into chunks/bins of this size"
157 |         )]
158 |         bin_size: u8,
159 |         #[arg(
160 |             short = 'Q',
161 |             long,
162 |             default_value_t = 50,
163 |             help = "only consider pairs where both reads have this mapping-quality or higher (good to leave this high)"
164 |         )]
165 |         min_mapping_quality: u8,
166 | 
167 |         #[arg(
168 |             short,
169 |             long = "ci",
170 |             help = "method for confidence interval calculation (see rust bpci crate)",
171 |             default_value = "agresti-coull"
172 |         )]
173 |         ci: ConfidenceInterval,
174 | 
175 |         #[arg(
176 |             short,
177 |             long,
178 |             help = "do not calculate denominator. This can shorten runtime but will also skip the homopolymer distance calculation.",
179 |             default_value_t = false
180 |         )]
181 |         no_denominator: bool,
182 | 
183 |         #[arg(
184 |             short = 'H',
185 |             long,
186 |             help = format!(
187 |                 "regex for homopolymer sequence to consider if denominator is calculated[default: {}]",
188 |                 homopolymer::HP_REGEX
189 |             ),
190 |             default_value = homopolymer::HP_REGEX
191 |         )]
192 |         homopolymer_regex: String,
193 | 
194 |         #[arg(
195 |             short = 't',
196 |             long,
197 |             help = "use reference base as 'truth'",
198 |             default_value_t = false
199 |         )]
200 |         reference_as_truth: bool,
201 |     },
202 |     //Plot { tsv: PathBuf, },
203 | }
204 | 
205 | fn get_sample_name(hmap: HashMap<String, Vec<LinearMap<String, String>>>) -> String {
206 |     if let Some(lm) = hmap.get("RG") {
207 |         let sm = String::from("SM");
208 |         if let Some(v) = lm[0].get(&sm) {
209 |             (*v).clone()
210 |         } else {
211 |             String::from("")
212 |         }
213 |     } else {
214 |         String::from("")
215 |     }
216 | }
217 | 
218 | fn main() -> std::io::Result<()> {
219 |     let args = Cli::parse();
220 |     if env::var("RUST_LOG").is_err() {
221 |         env::set_var("RUST_LOG", "info")
222 |     }
223 |     env_logger::init();
224 | 
225 |     match args.command {
226 |         Commands::Extract {
227 |             bams,
228 |             fasta,
229 |             chromosome,
230 |             output_prefix,
231 |             regions,
232 |             exclude_regions,
233 |             lua_expression,
234 |             bin_size,
235 |             max_read_length,
236 |             min_mapping_quality,
237 |             ci,
238 |             reference_as_truth,
239 |             no_denominator,
240 |             homopolymer_regex,
241 |         } => extract_main(
242 |             bams,
243 |             fasta,
244 |             chromosome,
245 |             PathBuf::from(output_prefix),
246 |             regions,
247 |             exclude_regions,
248 |             lua_expression,
249 |             bin_size as u32,
250 |             max_read_length,
251 |             min_mapping_quality,
252 |             ci,
253 |             reference_as_truth,
254 |             no_denominator,
255 |             homopolymer_regex,
256 |         ), //Commands::Plot { tsv } => plot::plot(tsv),
257 |         Commands::CombineErrors {
258 |             fai_path,
259 |             errors,
260 |             output_path,
261 |         } => combine_errors::combine_errors_main(errors, fai_path, output_path),
262 | 
263 |         Commands::CombineCounts {
264 |             counts,
265 |             output_path,
266 |         } => combine_counts::combine_counts_main(counts, output_path),
267 |     }
268 | }
269 | 
270 | fn read_bed(path: Option<PathBuf>) -> Option<HashMap<String, Lapper<u32, u8>>> {
271 |     path.as_ref()?;
272 | 
273 |     let reader = files::open_file(path);
274 |     reader.as_ref()?;
275 |     let mut bed = HashMap::new();
276 | 
277 |     reader
278 |         .expect("checked that reader is available")
279 |         .lines()
280 |         .for_each(|l| {
281 |             let line = l.expect("error reading line");
282 |             let fields: Vec<_> = line.split('\t').collect();
283 |             if let (Ok(start), Ok(stop)) = (fields[1].parse::<u32>(), fields[2].parse::<u32>()) {
284 |                 let iv = Iv {
285 |                     start,
286 |                     stop,
287 |                     val: 0,
288 |                 };
289 |                 let chrom = String::from(fields[0]);
290 |                 bed.entry(chrom).or_insert(Vec::new()).push(iv);
291 |             }
292 |         });
293 | 
294 |     let mut tree: HashMap<String, Lapper<u32, u8>> = HashMap::new();
295 | 
296 |     for (chrom, ivs) in bed.iter() {
297 |         let ivs = ivs.clone();
298 |         let chrom = (*chrom).clone();
299 |         tree.insert(chrom, Lapper::new(ivs));
300 |     }
301 |     Some(tree)
302 | }
303 | 
304 | fn get_tree<'a>(
305 |     regions: &'a Option<HashMap<String, Lapper<u32, u8>>>,
306 |     chrom: &String,
307 | ) -> Option<&'a Lapper<u32, u8>> {
308 |     if let Some(map) = regions {
309 |         Some(map.get(chrom).unwrap_or(&EMPTY_LAPPER))
310 |     } else {
311 |         None
312 |     }
313 | }
314 | 
315 | #[allow(clippy::too_many_arguments)]
316 | fn process_bam(
317 |     path: PathBuf,
318 |     fasta_path: Option<PathBuf>,
319 |     regions: Option<PathBuf>,
320 |     chromosome: Option<String>,
321 |     exclude_regions: Option<PathBuf>,
322 |     lua_expression: Option<lua::LuaReadFilter>,
323 |     bin_size: u32,
324 |     max_read_length: u32,
325 |     min_mapping_quality: u8,
326 |     min_base_qual: u8,
327 |     reference_as_truth: bool,
328 |     output_prefix: PathBuf,
329 |     no_denominator: bool,
330 |     homopolymer_regex: Option<Regex>,
331 | ) -> (fraguracy::InnerCounts, Vec<String>, String) {
332 |     let mut bam = IndexedReader::from_path(&path)
333 |         .unwrap_or_else(|_| panic!("error reading bam file {path:?}"));
334 |     bam.set_threads(1).expect("error setting threads");
335 |     let mut map = FxHashMap::default();
336 | 
337 |     let include_regions = read_bed(regions);
338 |     let exclude_regions = read_bed(exclude_regions);
339 | 
340 |     let mut ibam = IndexedReader::from_path(&path)
341 |         .unwrap_or_else(|_| panic!("bam file {path:?} must be sorted and indexed"));
342 |     ibam.set_threads(3)
343 |         .expect("error setting threads on indexed reader");
344 | 
345 |     let fasta: Option<faidx::Reader> = if let Some(fa_path) = fasta_path {
346 |         bam.set_reference(&fa_path)
347 |             .expect("Error setting reference for file");
348 |         ibam.set_reference(&fa_path)
349 |             .expect("Error setting reference for file");
350 | 
351 |         let fa = faidx::Reader::from_path(fa_path).expect("error opening faidx");
352 |         Some(fa)
353 |     } else {
354 |         None
355 |     };
356 | 
357 |     let bins = (max_read_length as f64 / bin_size as f64).ceil() as u32;
358 |     let mut counts = fraguracy::Counts::new(
359 |         if reference_as_truth { None } else { Some(ibam) },
360 |         bins as usize,
361 |     );
362 |     let hmap = bam::Header::from_template(bam.header()).to_hashmap();
363 |     let sample_name = get_sample_name(hmap);
364 |     log::info!("found sample {sample_name}");
365 | 
366 |     if !no_denominator {
367 |         counts
368 |             .set_depth_writer(
369 |                 &(output_prefix.to_string_lossy().to_string()
370 |                     + &sample_name
371 |                     + "-fraguracy-denominator-depth.bed.gz")
372 |                     .to_string(),
373 |             )
374 |             .expect("error setting depth writer. check permissions/existence of output directory.");
375 |     }
376 | 
377 |     if let Some(chromosome) = chromosome {
378 |         if let Err(e) = bam.fetch(bam::FetchDefinition::String(chromosome.as_bytes())) {
379 |             log::error!("error fetching chromosome {chromosome}: {e}. iterating over all reads.");
380 |         } else {
381 |             log::info!("limiting analysis to chromosome: \"{chromosome}");
382 |         }
383 |     } else if let Err(e) = bam.fetch(bam::FetchDefinition::All) {
384 |         log::error!("error fetching all reads: {e}");
385 |     }
386 | 
387 |     let mut n_total = 0;
388 |     let mut n_pairs = 0;
389 |     let chroms: Vec<String> = bam
390 |         .header()
391 |         .target_names()
392 |         .iter()
393 |         .map(|n| unsafe { str::from_utf8_unchecked(n) }.to_string())
394 |         .collect();
395 | 
396 |     let mut include_tree: Option<&Lapper<u32, u8>> = get_tree(&include_regions, &chroms[0]);
397 |     let mut exclude_tree: Option<&Lapper<u32, u8>> = get_tree(&exclude_regions, &chroms[0]);
398 |     let mut hp_tree: Option<Lapper<u32, u8>> = None;
399 | 
400 |     let mut last_tid: i32 = -1;
401 |     bam.rc_records()
402 |         .map(|r| {
403 |             n_total += 1;
404 |             r.expect("error parsing read")
405 |         })
406 |         .filter(fraguracy::filter_read)
407 |         .for_each(|b| {
408 |             let name = unsafe { str::from_utf8_unchecked(b.qname()) }.to_string();
409 |             if b.is_first_in_template() {
410 |                 n_pairs += 1;
411 |             }
412 |             if b.tid() != last_tid {
413 |                 if last_tid != -1 {
414 |                     log::info!(
415 |                         "processed chromosome: {} unprocessed orphan pairs: {}",
416 |                         chroms[last_tid as usize],
417 |                         map.len()
418 |                     );
419 |                 }
420 |                 last_tid = b.tid();
421 | 
422 |                 // process the remaining entries in the hashmap in last_depth.
423 |                 counts.handle_depth(&chroms[last_tid as usize], i64::MAX);
424 | 
425 |                 if include_regions.is_some() {
426 |                     include_tree = get_tree(&include_regions, &chroms[last_tid as usize]);
427 |                 }
428 |                 if exclude_regions.is_some() {
429 |                     exclude_tree = get_tree(&exclude_regions, &chroms[last_tid as usize]);
430 |                 }
431 | 
432 |                 if let Some(ref re) = homopolymer_regex {
433 |                     let chrom_seq = fasta
434 |                         .as_ref()
435 |                         .unwrap()
436 |                         .fetch_seq(&chroms[last_tid as usize], 0, i64::MAX as usize)
437 |                         .expect("error fetching sequence from fasta.");
438 |                     hp_tree = Some(find_homopolymers(&chrom_seq, re));
439 |                 }
440 |             }
441 | 
442 |             // by not checking the order here, we allow bams sorted by read name (with position flipped)
443 |             // this gives about 5% performance penalty over checking b.pos() < b.mpos(), but allows us
444 |             // to support more files.
445 |             match map.entry(name) {
446 |                 std::collections::hash_map::Entry::Vacant(e) => {
447 |                     e.insert(b);
448 |                 }
449 |                 std::collections::hash_map::Entry::Occupied(e) => {
450 |                     let a = e.remove();
451 | 
452 |                     if a.mapq() < min_mapping_quality {
453 |                         return;
454 |                     }
455 |                     if b.mapq() < min_mapping_quality {
456 |                         return;
457 |                     }
458 |                     if let Some(ref lua_expression) = lua_expression {
459 |                         match lua_expression.skip_read(&a) {
460 |                             Ok(true) => return,
461 |                             Ok(false) => (),
462 |                             Err(e) => log::error!("error evaluating user expression for read: {e}"),
463 |                         }
464 |                         match lua_expression.skip_read(&b) {
465 |                             Ok(true) => return,
466 |                             Ok(false) => (),
467 |                             Err(e) => log::error!("error evaluating user expression for read: {e}"),
468 |                         }
469 |                     }
470 |                     // we know a is before b, but we don't know if they overlap.
471 |                     if a.cigar().end_pos() < b.pos() {
472 |                         return;
473 |                     }
474 |                     let tid = a.tid() as usize;
475 |                     counts.increment(
476 |                         a,
477 |                         b,
478 |                         min_base_qual,
479 |                         min_mapping_quality,
480 |                         bin_size,
481 |                         &fasta,
482 |                         &chroms[tid],
483 |                         &include_tree,
484 |                         &exclude_tree,
485 |                         &hp_tree,
486 |                     );
487 |                 }
488 |             }
489 |         });
490 |     log::info!(
491 |         "[FINAL] map len:{:?} total reads: {:?}, pairs: {} \
492 |         \n mismatches: {} matches: {}",
493 |         map.len(),
494 |         n_total,
495 |         n_pairs,
496 |         counts.counts.mismatches,
497 |         counts.counts.matches,
498 |     );
499 | 
500 |     (counts.counts, chroms, sample_name)
501 | }
502 | 
503 | #[allow(clippy::too_many_arguments)]
504 | fn extract_main(
505 |     paths: Vec<PathBuf>,
506 |     fasta_path: Option<PathBuf>,
507 |     chromosome: Option<String>,
508 |     output_prefix: PathBuf,
509 |     regions: Option<PathBuf>,
510 |     exclude_regions: Option<PathBuf>,
511 |     lua_expression: Option<String>,
512 |     bin_size: u32,
513 |     max_read_length: u32,
514 |     min_mapping_quality: u8,
515 |     ci: ConfidenceInterval,
516 |     reference_as_truth: bool,
517 |     no_denominator: bool,
518 |     homopolymer_regex: String,
519 | ) -> std::io::Result<()> {
520 |     //let args: Vec<String> = env::args().collect();
521 |     let min_base_qual = 5u8;
522 | 
523 |     let mut homopolymer_regex =
524 |         Some(Regex::new(&homopolymer_regex).expect("error compiling homopolymer regex"));
525 | 
526 |     if no_denominator {
527 |         homopolymer_regex = None;
528 |     } else if fasta_path.is_none() {
529 |         return Err(std::io::Error::new(
530 |             std::io::ErrorKind::InvalidInput,
531 |             "fasta path must be provided if denominator is calculated",
532 |         ));
533 |     }
534 | 
535 |     let lua_expression = lua_expression.map(|e| {
536 |         lua::LuaReadFilter::new(&e, mlua::Lua::new()).expect("error creating lua interpreter")
537 |     });
538 | 
539 |     let total_counts = paths
540 |         .par_iter()
541 |         .map(|path| {
542 |             let (c, chroms, sample_name) = process_bam(
543 |                 path.clone(),
544 |                 fasta_path.clone(),
545 |                 regions.clone(),
546 |                 chromosome.clone(),
547 |                 exclude_regions.clone(),
548 |                 lua_expression.clone(),
549 |                 bin_size,
550 |                 max_read_length,
551 |                 min_mapping_quality,
552 |                 min_base_qual,
553 |                 reference_as_truth,
554 |                 output_prefix.clone(),
555 |                 no_denominator,
556 |                 homopolymer_regex.clone(),
557 |             );
558 |             let output_prefix: PathBuf =
559 |                 (output_prefix.to_string_lossy().to_string() + &sample_name + "-").into();
560 | 
561 |             let stats = Stat::from_counts(&c, bin_size as usize, ci.clone());
562 |             files::write_stats(stats, output_prefix.clone());
563 |             files::write_errors(&c, output_prefix, chroms);
564 | 
565 |             c
566 |         })
567 |         .reduce_with(|mut a, b| {
568 |             a += b;
569 |             a
570 |         });
571 | 
572 |     let total_counts = total_counts.expect("error accumulating total counts");
573 |     if paths.len() > 1 {
574 |         let bam = Reader::from_path(&paths[0]).expect("error reading bam file {path}");
575 |         let chroms: Vec<String> = bam
576 |             .header()
577 |             .target_names()
578 |             .iter()
579 |             .map(|n| unsafe { str::from_utf8_unchecked(n) }.to_string())
580 |             .collect();
581 |         let output_prefix: PathBuf =
582 |             (output_prefix.to_string_lossy().to_string() + "total-").into();
583 | 
584 |         let stats = Stat::from_counts(&total_counts, bin_size as usize, ci);
585 |         files::write_stats(stats, output_prefix.clone());
586 |         files::write_errors(&total_counts, output_prefix, chroms);
587 |     }
588 |     Ok(())
589 | }
590 | 
591 | #[cfg(test)]
592 | mod tests {
593 |     use super::*;
594 |     use std::collections::HashMap;
595 |     // Assuming Iv is defined in crate::files and is accessible.
596 |     use crate::files::Iv;
597 | 
598 |     #[test]
599 |     fn test_get_tree_found() {
600 |         // Create a non-empty lapper for "chr1"
601 |         let iv = Iv {
602 |             start: 10,
603 |             stop: 20,
604 |             val: 0,
605 |         };
606 |         let lapper_non_empty = Lapper::new(vec![iv]);
607 |         let mut regions_map: HashMap<String, Lapper<u32, u8>> = HashMap::new();
608 |         regions_map.insert("chr1".to_string(), lapper_non_empty);
609 |         let regions = Some(regions_map);
610 | 
611 |         let tree = get_tree(&regions, &"chr1".to_string()).unwrap();
612 |         // Query a point that should overlap the interval [10,20]
613 |         let mut iter = tree.find(15, 16);
614 |         assert!(
615 |             iter.next().is_some(),
616 |             "Expected to find an interval for chr1"
617 |         );
618 |     }
619 | 
620 |     #[test]
621 |     fn test_get_tree_not_found_in_map() {
622 |         // Create a regions map with a lapper only for "chr1"
623 |         let iv = Iv {
624 |             start: 10,
625 |             stop: 20,
626 |             val: 0,
627 |         };
628 |         let lapper_non_empty = Lapper::new(vec![iv]);
629 |         let mut regions_map: HashMap<String, Lapper<u32, u8>> = HashMap::new();
630 |         regions_map.insert("chr1".to_string(), lapper_non_empty);
631 |         let regions = Some(regions_map);
632 | 
633 |         // Looking up "chr2" should yield the empty lapper.
634 |         let tree = get_tree(&regions, &"chr2".to_string()).unwrap();
635 |         assert!(
636 |             tree.find(0, 100).next().is_none(),
637 |             "Expected empty lapper for non-existent chromosome"
638 |         );
639 |     }
640 | 
641 |     #[test]
642 |     fn test_get_tree_no_regions() {
643 |         // When regions is None, we expect the function to return None.
644 |         let regions: Option<HashMap<String, Lapper<u32, u8>>> = None;
645 |         let tree = get_tree(&regions, &"any".to_string());
646 |         assert!(tree.is_none(), "Expected None when regions is None");
647 |     }
648 | }
649 | 


--------------------------------------------------------------------------------
/src/plot.rs:
--------------------------------------------------------------------------------
 1 | use plotly::layout::{Axis, GridPattern, Layout, LayoutGrid, Legend, RowOrder};
 2 | use plotly::{Plot, Scatter};
 3 | use polars::prelude::*;
 4 | 
 5 | use std::path::PathBuf;
 6 | 
 7 | pub fn plot(f: PathBuf) {
 8 |     let df = CsvReader::from_path(f)
 9 |         .expect("error reading csv")
10 |         .has_header(true)
11 |         .with_delimiter('\t' as u8)
12 |         .finish()
13 |         .unwrap(); //.finish().unwrap();
14 | 
15 |     let layout = Layout::new().grid(LayoutGrid::new().rows(1).columns(2));
16 |     let mut plot = Plot::new();
17 |     plot.set_layout(layout);
18 | 
19 |     let contexts: Vec<std::string::String> = df["context"]
20 |         .unique()
21 |         .expect("error getting unique contexts")
22 |         .iter()
23 |         .map(|s| std::string::String::from(s.get_str().unwrap()))
24 |         .collect();
25 | 
26 |     dbg!(contexts);
27 |     eprintln!("dataframe shape: {:?}", df.shape());
28 |     // In [19]: df.with_columns(pl.all([pl.col("a").str.contains("sick"), ~pl.col("a").str.contains("sick of ")]).alias("match"))
29 |     /*
30 | 
31 | 
32 |     contexts.iter().map(|ctx| {
33 |         //let mask = df.column("context")?.equal(ctx);
34 |         df.select([col("context") == lit(*ctx)]);
35 | 
36 | 
37 |         let sub = df.filter(col("context") == ctx);
38 | 
39 |         let mask = df
40 |             .column("context")
41 |             .unwrap()
42 |             .contains(ctx)
43 |             .
44 |         df.filter(&mask).map(|subset| {
45 |             eprintln!("{:?}", subset.shape());
46 |         })
47 |     });
48 | 
49 |     //let t1 = Scatter::new();
50 | 
51 |     dbg!(df.head(Some(10)));
52 |     */
53 | }
54 | 


--------------------------------------------------------------------------------
/test-data/a.errors.bed:
--------------------------------------------------------------------------------
 1 | #chrom	start	end	bq_bin	count
 2 | chr1	109555	109556	20-36	1
 3 | chr1	181470	181471	20-36	1
 4 | chr1	202236	202237	20-36	1
 5 | chr1	271518	271519	20-36	1
 6 | chr1	591695	591696	20-36	1
 7 | chr1	596597	596598	37-59	1
 8 | chr1	632101	632102	20-36	1
 9 | chr1	739054	739055	20-36	1
10 | chr1	775172	775173	20-36	1
11 | chr1	778756	778757	20-36	1
12 | chr1	791072	791073	20-36	1
13 | chr1	806318	806319	20-36	1
14 | chr1	812874	812875	20-36	1
15 | chr1	812928	812929	37-59	1
16 | chr1	820676	820677	20-36	1
17 | chr1	821214	821215	20-36	1
18 | chr1	821223	821224	20-36	1
19 | chr1	821359	821360	20-36	1
20 | chr1	821373	821374	20-36	1
21 | 


--------------------------------------------------------------------------------
/test-data/b.errors.bed:
--------------------------------------------------------------------------------
 1 | #chrom	start	end	bq_bin	count
 2 | chr1	121033	121034	20-36	1
 3 | chr1	136837	136838	20-36	1
 4 | chr1	271490	271491	20-36	1
 5 | chr1	596617	596618	20-36	1
 6 | chr1	598836	598837	20-36	1
 7 | chr1	598904	598905	20-36	1
 8 | chr1	598929	598930	20-36	1
 9 | chr1	605472	605473	20-36	1
10 | chr1	631613	631614	20-36	1
11 | chr1	764663	764664	20-36	1
12 | chr1	775216	775217	37-59	1
13 | chr1	785823	785824	20-36	1
14 | chr1	785896	785897	20-36	1
15 | chr1	788777	788778	20-36	1
16 | chr1	788865	788866	20-36	1
17 | chr1	789070	789071	20-36	1
18 | chr1	790253	790254	20-36	1
19 | chr1	790389	790390	20-36	1
20 | chr1	790420	790421	20-36	1
21 | 


--------------------------------------------------------------------------------