├── .gitignore ├── CITATION.cff ├── CONTRIBUTING.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── Usage.md ├── pub ├── get_data.sh ├── run.sh └── run_benchmark.sh └── src ├── bed.rs ├── intervals.rs ├── main.rs ├── stats.rs └── summary.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | data 3 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Liu" 5 | given-names: "Daniel" 6 | orcid: "https://orcid.org/0000-0002-2385-2957" 7 | - family-names: "Belyaeva" 8 | given-names: "Anastasiya" 9 | - family-names: "Shafin" 10 | given-names: "Kishwar" 11 | orcid: "https://orcid.org/0000-0001-5252-3434" 12 | - family-names: "Chang" 13 | given-names: "Pi-Chuan" 14 | orcid: "https://orcid.org/0000-0003-3021-6446" 15 | - family-names: "Carroll" 16 | given-names: "Andrew" 17 | orcid: "https://orcid.org/0000-0002-4824-6689" 18 | - family-names: "Cook" 19 | given-names: "Daniel" 20 | orcid: "https://orcid.org/0000-0003-3347-562X" 21 | title: "Best: A Tool for Characterizing Sequencing Errors" 22 | version: 0.1.0 23 | doi: 10.1101/2022.12.22.521488 24 | date-released: 2020-12-09 25 | url: "https://github.com/google/best" 26 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement (CLA). 
You (or your employer) retain the copyright to your 10 | contribution; this simply gives us permission to use and redistribute your 11 | contributions as part of the project. Head over to 12 | <https://cla.developers.google.com/> to see your current agreements on file or 13 | to sign a new one. 14 | 15 | You generally only need to submit a CLA once, so if you've already submitted one 16 | (even if it was for a different project), you probably don't need to do it 17 | again. 18 | 19 | ## Code Reviews 20 | 21 | All submissions, including submissions by project members, require review. We 22 | use GitHub pull requests for this purpose. Consult 23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 24 | information on using pull requests. 25 | 26 | ## Community Guidelines 27 | 28 | This project follows 29 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 30 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "adler" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 10 | 11 | [[package]] 12 | name = "atty" 13 | version = "0.2.14" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 16 | dependencies = [ 17 | "hermit-abi", 18 | "libc", 19 | "winapi", 20 | ] 21 | 22 | [[package]] 23 | name = "autocfg" 24 | version = "1.1.0" 25 | source = "registry+https://github.com/rust-lang/crates.io-index" 26 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 27 | 28 | [[package]] 29 | name = "best" 30 | version = "0.1.0" 31 | dependencies = [ 32 | "clap", 33 | "flate2", 34 | "fxhash", 35 | "noodles", 36 | "ordered-float", 37 | "rayon", 38 | "rust-lapper", 39 | ] 40 | 41 | [[package]] 42 | name = "bit-vec" 43 | version = "0.6.3" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" 46 | 47 | [[package]] 48 | name = "bitflags" 49 | version = "1.3.2" 50 | source = "registry+https://github.com/rust-lang/crates.io-index" 51 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 52 | 53 | [[package]] 54 | name = "byteorder" 55 | version = "1.4.3" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" 58 | 59 | [[package]] 60 | name = "bytes" 61 | version = "1.1.0" 62 | source = "registry+https://github.com/rust-lang/crates.io-index" 63 | checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" 64 | 65 | [[package]] 66 | name = "cfg-if" 67 | version = "1.0.0" 68 | source = "registry+https://github.com/rust-lang/crates.io-index" 69 | checksum = 
"baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 70 | 71 | [[package]] 72 | name = "clap" 73 | version = "3.2.7" 74 | source = "registry+https://github.com/rust-lang/crates.io-index" 75 | checksum = "5b7b16274bb247b45177db843202209b12191b631a14a9d06e41b3777d6ecf14" 76 | dependencies = [ 77 | "atty", 78 | "bitflags", 79 | "clap_derive", 80 | "clap_lex", 81 | "indexmap", 82 | "once_cell", 83 | "strsim", 84 | "termcolor", 85 | "textwrap", 86 | ] 87 | 88 | [[package]] 89 | name = "clap_derive" 90 | version = "3.2.7" 91 | source = "registry+https://github.com/rust-lang/crates.io-index" 92 | checksum = "759bf187376e1afa7b85b959e6a664a3e7a95203415dba952ad19139e798f902" 93 | dependencies = [ 94 | "heck", 95 | "proc-macro-error", 96 | "proc-macro2", 97 | "quote", 98 | "syn", 99 | ] 100 | 101 | [[package]] 102 | name = "clap_lex" 103 | version = "0.2.4" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" 106 | dependencies = [ 107 | "os_str_bytes", 108 | ] 109 | 110 | [[package]] 111 | name = "crc32fast" 112 | version = "1.3.2" 113 | source = "registry+https://github.com/rust-lang/crates.io-index" 114 | checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" 115 | dependencies = [ 116 | "cfg-if", 117 | ] 118 | 119 | [[package]] 120 | name = "crossbeam-channel" 121 | version = "0.5.6" 122 | source = "registry+https://github.com/rust-lang/crates.io-index" 123 | checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" 124 | dependencies = [ 125 | "cfg-if", 126 | "crossbeam-utils", 127 | ] 128 | 129 | [[package]] 130 | name = "crossbeam-deque" 131 | version = "0.8.1" 132 | source = "registry+https://github.com/rust-lang/crates.io-index" 133 | checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" 134 | dependencies = [ 135 | "cfg-if", 136 | "crossbeam-epoch", 137 | "crossbeam-utils", 138 | ] 139 
| 140 | [[package]] 141 | name = "crossbeam-epoch" 142 | version = "0.9.9" 143 | source = "registry+https://github.com/rust-lang/crates.io-index" 144 | checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d" 145 | dependencies = [ 146 | "autocfg", 147 | "cfg-if", 148 | "crossbeam-utils", 149 | "memoffset", 150 | "once_cell", 151 | "scopeguard", 152 | ] 153 | 154 | [[package]] 155 | name = "crossbeam-utils" 156 | version = "0.8.10" 157 | source = "registry+https://github.com/rust-lang/crates.io-index" 158 | checksum = "7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83" 159 | dependencies = [ 160 | "cfg-if", 161 | "once_cell", 162 | ] 163 | 164 | [[package]] 165 | name = "either" 166 | version = "1.6.1" 167 | source = "registry+https://github.com/rust-lang/crates.io-index" 168 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" 169 | 170 | [[package]] 171 | name = "flate2" 172 | version = "1.0.24" 173 | source = "registry+https://github.com/rust-lang/crates.io-index" 174 | checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6" 175 | dependencies = [ 176 | "crc32fast", 177 | "miniz_oxide", 178 | ] 179 | 180 | [[package]] 181 | name = "fxhash" 182 | version = "0.2.1" 183 | source = "registry+https://github.com/rust-lang/crates.io-index" 184 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" 185 | dependencies = [ 186 | "byteorder", 187 | ] 188 | 189 | [[package]] 190 | name = "hashbrown" 191 | version = "0.12.1" 192 | source = "registry+https://github.com/rust-lang/crates.io-index" 193 | checksum = "db0d4cf898abf0081f964436dc980e96670a0f36863e4b83aaacdb65c9d7ccc3" 194 | 195 | [[package]] 196 | name = "heck" 197 | version = "0.4.0" 198 | source = "registry+https://github.com/rust-lang/crates.io-index" 199 | checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9" 200 | 201 | [[package]] 202 | name = "hermit-abi" 203 | version = 
"0.1.19" 204 | source = "registry+https://github.com/rust-lang/crates.io-index" 205 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 206 | dependencies = [ 207 | "libc", 208 | ] 209 | 210 | [[package]] 211 | name = "indexmap" 212 | version = "1.9.1" 213 | source = "registry+https://github.com/rust-lang/crates.io-index" 214 | checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e" 215 | dependencies = [ 216 | "autocfg", 217 | "hashbrown", 218 | ] 219 | 220 | [[package]] 221 | name = "lexical-core" 222 | version = "0.8.5" 223 | source = "registry+https://github.com/rust-lang/crates.io-index" 224 | checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" 225 | dependencies = [ 226 | "lexical-parse-float", 227 | "lexical-parse-integer", 228 | "lexical-util", 229 | "lexical-write-float", 230 | "lexical-write-integer", 231 | ] 232 | 233 | [[package]] 234 | name = "lexical-parse-float" 235 | version = "0.8.5" 236 | source = "registry+https://github.com/rust-lang/crates.io-index" 237 | checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" 238 | dependencies = [ 239 | "lexical-parse-integer", 240 | "lexical-util", 241 | "static_assertions", 242 | ] 243 | 244 | [[package]] 245 | name = "lexical-parse-integer" 246 | version = "0.8.6" 247 | source = "registry+https://github.com/rust-lang/crates.io-index" 248 | checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" 249 | dependencies = [ 250 | "lexical-util", 251 | "static_assertions", 252 | ] 253 | 254 | [[package]] 255 | name = "lexical-util" 256 | version = "0.8.5" 257 | source = "registry+https://github.com/rust-lang/crates.io-index" 258 | checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" 259 | dependencies = [ 260 | "static_assertions", 261 | ] 262 | 263 | [[package]] 264 | name = "lexical-write-float" 265 | version = "0.8.5" 266 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 267 | checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" 268 | dependencies = [ 269 | "lexical-util", 270 | "lexical-write-integer", 271 | "static_assertions", 272 | ] 273 | 274 | [[package]] 275 | name = "lexical-write-integer" 276 | version = "0.8.5" 277 | source = "registry+https://github.com/rust-lang/crates.io-index" 278 | checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" 279 | dependencies = [ 280 | "lexical-util", 281 | "static_assertions", 282 | ] 283 | 284 | [[package]] 285 | name = "libc" 286 | version = "0.2.126" 287 | source = "registry+https://github.com/rust-lang/crates.io-index" 288 | checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" 289 | 290 | [[package]] 291 | name = "memchr" 292 | version = "2.5.0" 293 | source = "registry+https://github.com/rust-lang/crates.io-index" 294 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 295 | 296 | [[package]] 297 | name = "memoffset" 298 | version = "0.6.5" 299 | source = "registry+https://github.com/rust-lang/crates.io-index" 300 | checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce" 301 | dependencies = [ 302 | "autocfg", 303 | ] 304 | 305 | [[package]] 306 | name = "miniz_oxide" 307 | version = "0.5.3" 308 | source = "registry+https://github.com/rust-lang/crates.io-index" 309 | checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc" 310 | dependencies = [ 311 | "adler", 312 | ] 313 | 314 | [[package]] 315 | name = "noodles" 316 | version = "0.26.0" 317 | source = "registry+https://github.com/rust-lang/crates.io-index" 318 | checksum = "dbb261d074f9ca401f21cc62b1a1c7446d63d832c2b80774a3c529f057cfc51a" 319 | dependencies = [ 320 | "noodles-bam", 321 | "noodles-bed", 322 | "noodles-core", 323 | "noodles-fasta", 324 | "noodles-sam", 325 | ] 326 | 327 | [[package]] 328 | name = "noodles-bam" 329 | 
version = "0.21.0" 330 | source = "registry+https://github.com/rust-lang/crates.io-index" 331 | checksum = "52fe19f088bbeb51025c7911802ff2eb00c3994fee9e0ace892cb26bd19c42d5" 332 | dependencies = [ 333 | "bit-vec", 334 | "byteorder", 335 | "bytes", 336 | "noodles-bgzf", 337 | "noodles-core", 338 | "noodles-csi", 339 | "noodles-fasta", 340 | "noodles-sam", 341 | ] 342 | 343 | [[package]] 344 | name = "noodles-bed" 345 | version = "0.4.0" 346 | source = "registry+https://github.com/rust-lang/crates.io-index" 347 | checksum = "7468b7ffe8194806d9364d2dc4e04d431aa1d60dbab6ce35ea3af470831762b5" 348 | dependencies = [ 349 | "noodles-core", 350 | ] 351 | 352 | [[package]] 353 | name = "noodles-bgzf" 354 | version = "0.14.0" 355 | source = "registry+https://github.com/rust-lang/crates.io-index" 356 | checksum = "36ebbec47ff8c1e931f5da61e077a7774bbb2fc43158d72f334a51dae20dcc4b" 357 | dependencies = [ 358 | "byteorder", 359 | "bytes", 360 | "crossbeam-channel", 361 | "flate2", 362 | ] 363 | 364 | [[package]] 365 | name = "noodles-core" 366 | version = "0.8.0" 367 | source = "registry+https://github.com/rust-lang/crates.io-index" 368 | checksum = "ba58b998fe20d6e7f0c67386c1901daaf8fd2d2c4e7cfb5cf5f47ed22d96a8c8" 369 | 370 | [[package]] 371 | name = "noodles-csi" 372 | version = "0.9.0" 373 | source = "registry+https://github.com/rust-lang/crates.io-index" 374 | checksum = "ccb4bac55bfc031dd493f974ea1ee12e9b85e33cc17a63e6273b04551ca8d99e" 375 | dependencies = [ 376 | "bit-vec", 377 | "byteorder", 378 | "noodles-bgzf", 379 | "noodles-core", 380 | ] 381 | 382 | [[package]] 383 | name = "noodles-fasta" 384 | version = "0.13.0" 385 | source = "registry+https://github.com/rust-lang/crates.io-index" 386 | checksum = "12d5cfdf22a0869b7fcfc3cf6a9c2ad734f3284f090233ffadd59f610d4878b0" 387 | dependencies = [ 388 | "bytes", 389 | "memchr", 390 | "noodles-bgzf", 391 | "noodles-core", 392 | ] 393 | 394 | [[package]] 395 | name = "noodles-sam" 396 | version = "0.18.0" 397 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 398 | checksum = "aab026833fde10c98fb822bff2bfd3950829e78d446ddb7fb1ad0e11710a4568" 399 | dependencies = [ 400 | "bitflags", 401 | "indexmap", 402 | "lexical-core", 403 | "memchr", 404 | "noodles-bgzf", 405 | "noodles-core", 406 | "noodles-csi", 407 | "noodles-fasta", 408 | "rustc-hash", 409 | ] 410 | 411 | [[package]] 412 | name = "num-traits" 413 | version = "0.2.15" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" 416 | dependencies = [ 417 | "autocfg", 418 | ] 419 | 420 | [[package]] 421 | name = "num_cpus" 422 | version = "1.13.1" 423 | source = "registry+https://github.com/rust-lang/crates.io-index" 424 | checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1" 425 | dependencies = [ 426 | "hermit-abi", 427 | "libc", 428 | ] 429 | 430 | [[package]] 431 | name = "once_cell" 432 | version = "1.12.0" 433 | source = "registry+https://github.com/rust-lang/crates.io-index" 434 | checksum = "7709cef83f0c1f58f666e746a08b21e0085f7440fa6a29cc194d68aac97a4225" 435 | 436 | [[package]] 437 | name = "ordered-float" 438 | version = "3.1.0" 439 | source = "registry+https://github.com/rust-lang/crates.io-index" 440 | checksum = "98ffdb14730ed2ef599c65810c15b000896e21e8776b512de0db0c3d7335cc2a" 441 | dependencies = [ 442 | "num-traits", 443 | ] 444 | 445 | [[package]] 446 | name = "os_str_bytes" 447 | version = "6.1.0" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "21326818e99cfe6ce1e524c2a805c189a99b5ae555a35d19f9a284b427d86afa" 450 | 451 | [[package]] 452 | name = "proc-macro-error" 453 | version = "1.0.4" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 456 | dependencies = [ 457 | "proc-macro-error-attr", 458 | "proc-macro2", 459 | "quote", 460 | 
"syn", 461 | "version_check", 462 | ] 463 | 464 | [[package]] 465 | name = "proc-macro-error-attr" 466 | version = "1.0.4" 467 | source = "registry+https://github.com/rust-lang/crates.io-index" 468 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 469 | dependencies = [ 470 | "proc-macro2", 471 | "quote", 472 | "version_check", 473 | ] 474 | 475 | [[package]] 476 | name = "proc-macro2" 477 | version = "1.0.40" 478 | source = "registry+https://github.com/rust-lang/crates.io-index" 479 | checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7" 480 | dependencies = [ 481 | "unicode-ident", 482 | ] 483 | 484 | [[package]] 485 | name = "quote" 486 | version = "1.0.20" 487 | source = "registry+https://github.com/rust-lang/crates.io-index" 488 | checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804" 489 | dependencies = [ 490 | "proc-macro2", 491 | ] 492 | 493 | [[package]] 494 | name = "rayon" 495 | version = "1.5.3" 496 | source = "registry+https://github.com/rust-lang/crates.io-index" 497 | checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d" 498 | dependencies = [ 499 | "autocfg", 500 | "crossbeam-deque", 501 | "either", 502 | "rayon-core", 503 | ] 504 | 505 | [[package]] 506 | name = "rayon-core" 507 | version = "1.9.3" 508 | source = "registry+https://github.com/rust-lang/crates.io-index" 509 | checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f" 510 | dependencies = [ 511 | "crossbeam-channel", 512 | "crossbeam-deque", 513 | "crossbeam-utils", 514 | "num_cpus", 515 | ] 516 | 517 | [[package]] 518 | name = "rust-lapper" 519 | version = "1.0.1" 520 | source = "registry+https://github.com/rust-lang/crates.io-index" 521 | checksum = "c0da7f82898b906bf29d705adb2c36a461c9cc9e33c1417041c86229d569c144" 522 | dependencies = [ 523 | "num-traits", 524 | ] 525 | 526 | [[package]] 527 | name = "rustc-hash" 528 | version = "1.1.0" 529 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 530 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" 531 | 532 | [[package]] 533 | name = "scopeguard" 534 | version = "1.1.0" 535 | source = "registry+https://github.com/rust-lang/crates.io-index" 536 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" 537 | 538 | [[package]] 539 | name = "static_assertions" 540 | version = "1.1.0" 541 | source = "registry+https://github.com/rust-lang/crates.io-index" 542 | checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" 543 | 544 | [[package]] 545 | name = "strsim" 546 | version = "0.10.0" 547 | source = "registry+https://github.com/rust-lang/crates.io-index" 548 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 549 | 550 | [[package]] 551 | name = "syn" 552 | version = "1.0.98" 553 | source = "registry+https://github.com/rust-lang/crates.io-index" 554 | checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd" 555 | dependencies = [ 556 | "proc-macro2", 557 | "quote", 558 | "unicode-ident", 559 | ] 560 | 561 | [[package]] 562 | name = "termcolor" 563 | version = "1.1.3" 564 | source = "registry+https://github.com/rust-lang/crates.io-index" 565 | checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" 566 | dependencies = [ 567 | "winapi-util", 568 | ] 569 | 570 | [[package]] 571 | name = "textwrap" 572 | version = "0.15.0" 573 | source = "registry+https://github.com/rust-lang/crates.io-index" 574 | checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" 575 | 576 | [[package]] 577 | name = "unicode-ident" 578 | version = "1.0.1" 579 | source = "registry+https://github.com/rust-lang/crates.io-index" 580 | checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c" 581 | 582 | [[package]] 583 | name = "version_check" 584 | version = "0.9.4" 585 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 586 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" 587 | 588 | [[package]] 589 | name = "winapi" 590 | version = "0.3.9" 591 | source = "registry+https://github.com/rust-lang/crates.io-index" 592 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 593 | dependencies = [ 594 | "winapi-i686-pc-windows-gnu", 595 | "winapi-x86_64-pc-windows-gnu", 596 | ] 597 | 598 | [[package]] 599 | name = "winapi-i686-pc-windows-gnu" 600 | version = "0.4.0" 601 | source = "registry+https://github.com/rust-lang/crates.io-index" 602 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 603 | 604 | [[package]] 605 | name = "winapi-util" 606 | version = "0.1.5" 607 | source = "registry+https://github.com/rust-lang/crates.io-index" 608 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 609 | dependencies = [ 610 | "winapi", 611 | ] 612 | 613 | [[package]] 614 | name = "winapi-x86_64-pc-windows-gnu" 615 | version = "0.4.0" 616 | source = "registry+https://github.com/rust-lang/crates.io-index" 617 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 618 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "best" 3 | authors = ["Daniel Liu", "Daniel E. Cook"] 4 | version = "0.1.0" 5 | edition = "2021" 6 | description = "Bam Error Stats Tool (best): analysis of error types in aligned reads." 
7 | license = "MIT" 8 | 9 | [dependencies] 10 | clap = { version = "^3.2", features = ["derive"] } 11 | rayon = "^1.5" 12 | noodles = { version = "^0.26", features = ["sam", "bam", "fasta", "bed", "core"] } 13 | fxhash = "^0.2" 14 | rust-lapper = "^1.0" 15 | ordered-float = "^3.1" 16 | flate2 = "^1.0" 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Google LLC. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # best 2 | Bam Error Stats Tool (best): analysis of error types in aligned reads. 3 | 4 | `best` is used to assess the quality of reads after aligning them to a 5 | reference assembly. 
6 | 7 | ## Features 8 | * Collect overall and per alignment stats 9 | * Distribution of indel lengths 10 | * Yield at different empirical Q-value thresholds 11 | * Bin per read stats to easily examine the distribution of errors for certain 12 | types of reads 13 | * Stats for regions specified by intervals (BED file, homopolymer regions, 14 | windows etc.) 15 | * Stats for quality scores vs empirical Q-values 16 | * Multithreading for speed 17 | 18 | ## Usage 19 | The [`best` Usage Guide](Usage.md) gives an overview of how to use `best`. 20 | 21 | ## Installing 22 | 1. Install [Rust](https://www.rust-lang.org/tools/install). 23 | 2. Clone this repository and navigate into the directory of this repository. 24 | 3. Run `cargo install --locked --path .` 25 | 4. Run `best input.bam reference.fasta prefix/path` 26 | 27 | This will generate stats files with the `prefix/path` prefix. 28 | 29 | ## Development 30 | ### Running 31 | 1. Install [Rust](https://www.rust-lang.org/tools/install). 32 | 2. Clone this repository and navigate into the directory of this repository. 33 | 3. Run `cargo build --release` 34 | 4. Run `cargo run --release -- input.bam reference.fasta prefix/path` or 35 | `target/release/best input.bam reference.fasta prefix/path` 36 | 37 | This will generate stats files with the `prefix/path` prefix. 38 | 39 | The built binary is located at `target/release/best`. 40 | 41 | ### Formatting 42 | ``` 43 | cargo fmt 44 | ``` 45 | 46 | ### Comparing 47 | Remember to pass the `-t 1` option to ensure that only one thread is used for 48 | testing. Best generally tries to ensure the order of outputs is deterministic 49 | with multiple threads, but the order of per-alignment stats is arbitrary unless 50 | only one thread is used. 51 | 52 | ### Disclaimer 53 | 54 | This is not an official Google product. 55 | 56 | The code is not intended for use in any clinical settings. 
It is not intended to be a medical device and is not intended for clinical use of any kind, including but not limited to diagnosis or prognosis. 57 | 58 | No representations or warranties are made with regards to the accuracy of results generated. User or licensee is responsible for verifying and validating accuracy when using this tool. 59 | -------------------------------------------------------------------------------- /Usage.md: -------------------------------------------------------------------------------- 1 | # `best` Usage Guide 2 | 3 | This guide will give a general overview of how to use `best`. 4 | 5 | ## Alignment Settings 6 | The alignment `bam` file must contain the read sequences and quality scores. 7 | The alignment CIGAR string can either use `M` or `=`/`X` for matches/mismatches. 8 | 9 | ## Example Analysis 10 | Let's say we have aligned reads in `aln.bam` and the reference assembly 11 | `ref.fasta.gz`. We want to collect statistics on the types of 12 | errors that occur in the alignments. Then, we can run `best` like 13 | ``` 14 | best -t 4 aln.bam ref.fasta.gz output 15 | ``` 16 | This will use 4 threads to collect stats and generate the following files: 17 | ``` 18 | output.per_aln_stats.csv.gz 19 | output.summary_cigar_stats.csv 20 | output.summary_identity_stats.csv 21 | output.summary_qual_score_stats.csv 22 | output.summary_yield_stats.csv 23 | ``` 24 | The per-alignment stats file is gzipped to save space. The 25 | `output.summary_cigar_stats.csv` file contains the distribution of lengths of 26 | consecutive insertions and deletions. The `output.summary_identity_stats.csv` 27 | file contains some general stats on the error rates across all alignments. 28 | The `output.summary_qual_score_stats.csv` file contains the empirical Q-value 29 | calculated from matches and mismatches for each corresponding quality score. 30 | The `all_alignments` feature indicates the quality score stats across all 31 | alignments. 
If intervals are specified, then this is computed per interval feature. 32 | The `output.summary_yield_stats.csv` contains the yield (number of reads/bases) 33 | above certain quality thresholds. 34 | 35 | We can collect even more data in one run of `best`. If we run 36 | ``` 37 | best -t 4 --intervals-hp --bin-types q_len:1000 gc_content:0.05 -- aln.bam ref.fasta.gz output 38 | ``` 39 | then we will get two more files: 40 | ``` 41 | output.summary_feature_stats.csv 42 | output.summary_bin_stats.csv 43 | ``` 44 | The `output.summary_feature_stats.csv` file contains stats stratified by 45 | intervals. In this case, we use `--intervals-hp` to indicate that the intervals 46 | are homopolymer regions, so this will produce the error types at homopolymer 47 | regions of different lengths. The `output.summary_bin_stats.csv` file will 48 | contain the error types for reads binned by both the read (query) length (`q_len`, 49 | bin by increments of 1000bp) and the GC content (`gc_content`, bin by increments 50 | of 0.05). 51 | 52 | It is also possible to use bed files as custom intervals. These files can have 53 | three columns (all intervals will have the same feature) or four columns, where 54 | the last column indicates the feature. The feature stats are aggregated across 55 | all bed intervals with the same feature. 56 | 57 | ## Help Message: 58 | ``` 59 | best 0.1.0 60 | Daniel Liu, Daniel E. Cook 61 | Bam Error Stats Tool (best): analysis of error types in aligned reads. 62 | 63 | USAGE: 64 | best [OPTIONS] 65 | 66 | ARGS: 67 | 68 | Input BAM file 69 | 70 | 71 | Input reference FASTA file. Can be gzipped 72 | 73 | 74 | Prefix for output files that contain statistics 75 | 76 | OPTIONS: 77 | -b, --bin-types ... 78 | Types of bins to use for per alignment stats. 79 | 80 | Each bin should be of the format <bin_type>:<step_size>. 
81 | 82 | Supported bin types: q_len (read sequence length), subread_passes, mapq, mean_qual, 83 | gc_content, concordance_qv (phred scale Q-value) 84 | 85 | -h, --help 86 | Print help information 87 | 88 | --intervals-bed ... 89 | Use intervals from a BED file. 90 | 91 | The BED file should have the columns chrom, start, stop, and feature. The feature column 92 | is optional. 93 | 94 | This allows stats to be gathered separately for different types of intervals. Note that 95 | all intervals are on the reference, not the reads. 96 | 97 | --intervals-border ... 98 | Use fixed-width nonoverlapping window border regions as intervals. 99 | 100 | This is used to specify the window widths. 101 | 102 | --intervals-hp 103 | Use homopolymer regions in the reference as intervals 104 | 105 | --intervals-match ... 106 | Use regions in the reference that match any of the specified subsequences as intervals 107 | 108 | --intervals-window ... 109 | Use fixed-width nonoverlapping windows as intervals. 110 | 111 | This is used to specify the window widths. 112 | 113 | --intervals-window-pos ... 114 | Use fixed-width nonoverlapping windows with positions as intervals. 115 | 116 | This is used to specify the window widths. 117 | 118 | -n, --name-column 119 | Add column with a specific name in CSV outputs 120 | 121 | --no-per-aln-stats 122 | Turn off outputting per alignment stats 123 | 124 | -t, --threads 125 | Number of threads. 
Will be automatically determined if this is set to 0 126 | 127 | [default: 0] 128 | 129 | -V, --version 130 | Print version information 131 | ``` 132 | -------------------------------------------------------------------------------- /pub/get_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2022 Google LLC 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | # this software and associated documentation files (the "Software"), to deal in 6 | # the Software without restriction, including without limitation the rights to 7 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | # the Software, and to permit persons to whom the Software is furnished to do so, 9 | # subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software. 13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 


#===================#
# Download Datasets #
#===================#

# Stop at the first failing command, including a failure in any stage of a
# pipeline, so that a broken download is reported instead of silently producing
# an empty or truncated BAM file.
set -euo pipefail

# -p: succeed when the directories already exist, so the script can be re-run.
mkdir -p reference illumina ont pacbio

# Download the CHM13 v1.0 draft
wget -O reference/chm13.draft_v1.0.fasta.gz https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chm13.draft_v1.0.fasta.gz

# Stream a BAM file from a URL and keep only a subsample of its reads.
#   $1: subsampling fraction, passed to `samtools view -s`
#   $2: output directory (must already exist)
#   $3: URL of the BAM file; the output file keeps the URL's base name
function download_bam {
    local subsample="${1}"
    local directory="${2}"
    local url="${3}"
    # --fail: turn HTTP errors into a non-zero exit status instead of piping an
    # error page into samtools. The trailing `-` makes samtools read stdin
    # explicitly.
    curl --fail --location "${url}" | \
        samtools view -s "${subsample}" -bh - > "${directory}/$(basename "${url}")"
}

# Illumina
download_bam 0.1 illumina https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/alignments/chm13.draft_v1.0.pcrfree.bam

# ONT
download_bam 0.1 ont https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/alignments/chm13.draft_v1.0.ont_guppy_3.6.0.wm_2.01.pri.bam

# PacBio HiFi
download_bam 0.1 pacbio https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/alignments/chm13.draft_v1.0.hifi_20k.wm_2.01.pri.bam
-------------------------------------------------------------------------------- /pub/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2022 Google LLC 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | # this software and associated documentation files (the "Software"), to deal in 6 | # the Software without restriction, including without limitation the rights to 7 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | # the Software, and to permit persons to whom the Software is furnished to do so, 9 | # subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


#==========#
# Run Best #
#==========#

REFERENCE=reference/chm13.draft_v1.0.fasta.gz
# Options shared by every run, kept as an array so each word stays one argument.
ARGS=(-t 4 --intervals-hp --bin-types gc_content:0.05)

# Run best on one dataset.
#   $1: step size of the q_len bins
#   $2: input BAM file
#   $3: prefix for the output statistics files
run_best() {
    local q_len_step="${1}"
    local bam="${2}"
    local prefix="${3}"
    best "${ARGS[@]}" --bin-types "q_len:${q_len_step}" -- "${bam}" "${REFERENCE}" "${prefix}"
}

run_best 10 illumina/chm13.draft_v1.0.pcrfree.bam illumina/illumina
run_best 10000 ont/chm13.draft_v1.0.ont_guppy_3.6.0.wm_2.01.pri.bam ont/ont
run_best 1000 pacbio/chm13.draft_v1.0.hifi_20k.wm_2.01.pri.bam pacbio/pacbio
-------------------------------------------------------------------------------- /pub/run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright (c) 2022 Google LLC 3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | # this software and associated documentation files (the "Software"), to deal in 6 | # the Software without restriction, including without limitation the rights to 7 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 8 | # the Software, and to permit persons to whom the Software is furnished to do so, 9 | # subject to the following conditions: 10 | # 11 | # The above copyright notice and this permission notice shall be included in all 12 | # copies or substantial portions of the Software.
13 | # 14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | 21 | 22 | REFERENCE=reference/chm13.draft_v1.0.fasta 23 | BAM_FILE=pacbio/chm13.draft_v1.0.hifi_20k.wm_2.01.pri.bam 24 | 25 | BAM_CONCORDANCE=~/hg002-ccs/concordance/bamConcordance 26 | echo "bamConcordance" 27 | time ${BAM_CONCORDANCE} ${REFERENCE} ${BAM_FILE} pacbio/bamConcordance.csv 28 | 29 | #echo "pomoxis" 30 | #time assess_homopolymers count ${BAM_FILE} -o pacbio/pomoxis -t 4 31 | 32 | echo "best 1 thread" 33 | time best -t 1 ${BAM_FILE} ${REFERENCE} pacbio/best_timing 34 | 35 | echo "best 4 thread" 36 | time best -t 4 ${BAM_FILE} ${REFERENCE} pacbio/best_timing 37 | -------------------------------------------------------------------------------- /src/bed.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Google LLC 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | // this software and associated documentation files (the "Software"), to deal in 5 | // the Software without restriction, including without limitation the rights to 6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | // the Software, and to permit persons to whom the Software is furnished to do so, 8 | // subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in all 11 | // copies or substantial portions of the Software. 
12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | use std::fs::File; 21 | use std::io::prelude::*; 22 | use std::io::BufReader; 23 | 24 | use fxhash::{FxHashMap, FxHashSet}; 25 | 26 | use rust_lapper::{Interval, Lapper}; 27 | 28 | pub type FeatureInterval = Interval; 29 | 30 | pub struct Intervals { 31 | intervals: FxHashMap>, 32 | pub features: FxHashSet, 33 | } 34 | 35 | impl Intervals { 36 | /// Create a new collection of intervals from a BED file. 37 | pub fn new(bed_path: &str) -> Self { 38 | let mut intervals = FxHashMap::default(); 39 | let mut features = FxHashSet::default(); 40 | let reader = BufReader::new(File::open(bed_path).expect("BED file not found.")); 41 | 42 | for line in reader.lines() { 43 | let line = line.unwrap(); 44 | let mut fields = line.split('\t'); 45 | let chrom = fields.next().unwrap().to_owned(); 46 | // convert to 1-indexed [start, stop) 47 | let start = fields.next().unwrap().parse::().unwrap() + 1; 48 | let stop = fields.next().unwrap().parse::().unwrap() + 1; 49 | let feature = fields.next().unwrap_or("none").to_owned(); 50 | 51 | intervals 52 | .entry(chrom) 53 | .or_insert_with(|| Vec::::new()) 54 | .push(Interval { 55 | start, 56 | stop, 57 | val: feature.clone(), 58 | }); 59 | features.insert(feature); 60 | } 61 | 62 | Self { 63 | intervals: intervals 64 | .into_iter() 65 | .map(|(k, v)| (k, Lapper::new(v))) 66 | .collect(), 67 | features, 68 | } 69 | } 70 | 71 | /// Find intervals that intersect a given interval. 
72 | pub fn find(&self, chrom: &str, start: usize, end: usize) -> Vec<&FeatureInterval> { 73 | self.intervals 74 | .get(chrom) 75 | .map(|x| x.find(start, end).collect()) 76 | .unwrap_or_else(|| Vec::new()) 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/intervals.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Google LLC 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | // this software and associated documentation files (the "Software"), to deal in 5 | // the Software without restriction, including without limitation the rights to 6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | // the Software, and to permit persons to whom the Software is furnished to do so, 8 | // subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in all 11 | // copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
19 | 20 | use noodles::core::Position; 21 | use noodles::fasta; 22 | 23 | use crate::bed::FeatureInterval; 24 | 25 | static COMPLEMENT: [u8; 128] = { 26 | let mut c = [0u8; 128]; 27 | c[b'A' as usize] = b'T'; 28 | c[b'T' as usize] = b'A'; 29 | c[b'C' as usize] = b'G'; 30 | c[b'G' as usize] = b'C'; 31 | c[b'N' as usize] = b'N'; 32 | c 33 | }; 34 | 35 | /// Find homopolymers in a sequence to use as intervals. 36 | pub fn find_homopolymers( 37 | seq: &fasta::record::Sequence, 38 | start: usize, 39 | end: usize, 40 | strand_rev: bool, 41 | ) -> Vec { 42 | let mut res = Vec::new(); 43 | let mut hp_len = 0; 44 | let mut prev = b'?'; 45 | 46 | for i in start..=end { 47 | let curr = seq 48 | .get(Position::new(i).unwrap()) 49 | .unwrap() 50 | .to_ascii_uppercase(); 51 | 52 | if curr == prev && i != end { 53 | hp_len += 1; 54 | continue; 55 | } 56 | 57 | if hp_len > 1 { 58 | let c = if strand_rev { 59 | COMPLEMENT[prev as usize] 60 | } else { 61 | prev 62 | }; 63 | res.push(FeatureInterval { 64 | start: i - hp_len, 65 | stop: i, 66 | val: format!("{: >5}{}", hp_len, c as char), 67 | }); 68 | } 69 | hp_len = 1; 70 | prev = curr; 71 | } 72 | 73 | res 74 | } 75 | 76 | /// Get fixed-length windows as intervals. 
77 | pub fn get_windows( 78 | start: usize, 79 | end: usize, 80 | win_len: usize, 81 | pos: bool, 82 | strand_rev: bool, 83 | ) -> Vec { 84 | let mut res = Vec::new(); 85 | 86 | for i in (start..end).step_by(win_len) { 87 | let lo; 88 | let hi; 89 | if strand_rev { 90 | hi = end - (i - start); 91 | lo = hi.saturating_sub(win_len).max(start); 92 | } else { 93 | lo = i; 94 | hi = (i + win_len).min(end); 95 | }; 96 | 97 | if pos { 98 | res.push(FeatureInterval { 99 | start: lo, 100 | stop: hi, 101 | val: format!("window_{}_pos_{}", win_len, i - start), 102 | }); 103 | } else { 104 | res.push(FeatureInterval { 105 | start: lo, 106 | stop: hi, 107 | val: format!("window_{}", win_len), 108 | }); 109 | } 110 | } 111 | 112 | res 113 | } 114 | 115 | const BORDER_CONTEXT: usize = 1; 116 | 117 | /// Get small intervals that represent the region near fixed-width window borders. 118 | pub fn get_borders( 119 | start: usize, 120 | end: usize, 121 | win_len: usize, 122 | strand_rev: bool, 123 | ) -> Vec { 124 | let mut res = Vec::new(); 125 | 126 | for i in (start..end).step_by(win_len).skip(1) { 127 | let idx; 128 | if strand_rev { 129 | idx = end - (i - start); 130 | } else { 131 | idx = i; 132 | }; 133 | 134 | res.push(FeatureInterval { 135 | start: (idx - 1 - BORDER_CONTEXT).max(start), 136 | stop: (idx + BORDER_CONTEXT + 1).min(end), 137 | val: format!("border_{}", win_len), 138 | }); 139 | } 140 | 141 | res 142 | } 143 | 144 | /// Get regions that match a sequence as intervals. 
145 | pub fn get_matches( 146 | seq: &fasta::record::Sequence, 147 | start: usize, 148 | end: usize, 149 | s: &str, 150 | strand_rev: bool, 151 | ) -> Vec { 152 | let mut res = Vec::new(); 153 | 154 | for i in start..end { 155 | // convert to zero-indexed 156 | let seq_iter = seq.as_ref()[i - 1..(i - 1 + s.len()).min(end - 1)] 157 | .iter() 158 | .map(|c| c.to_ascii_uppercase()); 159 | let is_match = if strand_rev { 160 | seq_iter.eq(s.bytes().rev().map(|c| COMPLEMENT[c as usize])) 161 | } else { 162 | seq_iter.eq(s.bytes()) 163 | }; 164 | if is_match { 165 | res.push(FeatureInterval { 166 | start: i, 167 | stop: i + s.len(), 168 | val: s.to_owned(), 169 | }); 170 | } 171 | } 172 | 173 | res 174 | } 175 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Google LLC 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | // this software and associated documentation files (the "Software"), to deal in 5 | // the Software without restriction, including without limitation the rights to 6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | // the Software, and to permit persons to whom the Software is furnished to do so, 8 | // subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in all 11 | // copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | use clap::Parser; 21 | 22 | use rayon::prelude::*; 23 | 24 | use noodles::{bam, fasta, sam}; 25 | 26 | use fxhash::FxHashMap; 27 | 28 | use flate2::read::MultiGzDecoder; 29 | use flate2::write::GzEncoder; 30 | 31 | use std::fs::File; 32 | use std::io::{BufReader, BufWriter, Read, Write}; 33 | use std::str::FromStr; 34 | use std::sync::atomic::{AtomicUsize, Ordering}; 35 | use std::sync::Mutex; 36 | use std::time::Instant; 37 | 38 | mod stats; 39 | use stats::*; 40 | mod summary; 41 | use summary::*; 42 | mod bed; 43 | use bed::*; 44 | mod intervals; 45 | use intervals::*; 46 | 47 | const PER_ALN_STATS_NAME: &str = "per_aln_stats.csv.gz"; 48 | const YIELD_STATS_NAME: &str = "summary_yield_stats.csv"; 49 | const IDENTITY_STATS_NAME: &str = "summary_identity_stats.csv"; 50 | const FEATURE_STATS_NAME: &str = "summary_feature_stats.csv"; 51 | const CIGAR_STATS_NAME: &str = "summary_cigar_stats.csv"; 52 | const BIN_STATS_NAME: &str = "summary_bin_stats.csv"; 53 | const QUAL_SCORE_STATS_NAME: &str = "summary_qual_score_stats.csv"; 54 | 55 | fn run( 56 | input_path: String, 57 | reference_path: String, 58 | stats_prefix: String, 59 | bin_types: Option>, 60 | intervals_types: Vec, 61 | name_column: Option, 62 | output_per_aln_stats: bool, 63 | ) { 64 | // read reference sequences from fasta file 65 | let mut ref_reader = { 66 | let f = File::open(&reference_path).unwrap(); 67 | let r: Box = if reference_path.ends_with(".gz") { 68 | Box::new(MultiGzDecoder::new(f)) 69 | } else { 70 | Box::new(f) 71 | }; 72 | fasta::Reader::new(BufReader::new(r)) 73 | }; 74 | let reference_seqs: FxHashMap = ref_reader 75 | .records() 76 | .map(|r| r.unwrap()) 77 | .map(|r| (r.name().to_string(), r)) 78 | 
.collect(); 79 | 80 | // read bam file 81 | let mut reader = bam::Reader::new(File::open(input_path).unwrap()); 82 | reader.read_header().unwrap(); 83 | let references = reader.read_reference_sequences().unwrap(); 84 | 85 | // create per alignment stats writer that is shared between threads 86 | let aln_stats_path = format!("{}.{}", stats_prefix, PER_ALN_STATS_NAME); 87 | let aln_stats_writer = if output_per_aln_stats { 88 | let mut w = GzEncoder::new( 89 | BufWriter::new(File::create(&aln_stats_path).unwrap()), 90 | flate2::Compression::default(), 91 | ); 92 | write!( 93 | w, 94 | "{}{}\n", 95 | if name_column.is_some() { "name," } else { "" }, 96 | AlnStats::header() 97 | ) 98 | .unwrap(); 99 | Some(Mutex::new(w)) 100 | } else { 101 | None 102 | }; 103 | 104 | let summary_yield = Mutex::new(YieldSummary::new(name_column.clone())); 105 | let summary_identity = Mutex::new(IdentitySummary::new(name_column.clone())); 106 | let summary_features = if intervals_types.is_empty() { 107 | None 108 | } else { 109 | Some(Mutex::new(FeatureSummary::new(name_column.clone()))) 110 | }; 111 | let summary_cigars = Mutex::new(CigarLenSummary::new(name_column.clone())); 112 | let summary_bins = bin_types.map(|b| Mutex::new(BinSummary::new(name_column.clone(), b))); 113 | let summary_qual_score = Mutex::new(QualScoreSummary::new(name_column.clone())); 114 | let total_alns = AtomicUsize::new(0); 115 | 116 | // lazily read records to shift parsing work to individual threads 117 | reader 118 | .lazy_records() 119 | .par_bridge() 120 | .map(|r| r.unwrap()) 121 | .for_each(|record| { 122 | total_alns.fetch_add(1, Ordering::Relaxed); 123 | 124 | let flags = record.flags().unwrap(); 125 | if flags.is_unmapped() || flags.is_secondary() { 126 | // skip 127 | return; 128 | } 129 | 130 | let strand_rev = flags.is_reverse_complemented(); 131 | let aln_ref = references[record.reference_sequence_id().unwrap().unwrap()] 132 | .name() 133 | .as_str(); 134 | if !reference_seqs.contains_key(aln_ref) 
{ 135 | panic!( 136 | "{} is not found in the input reference sequence names!", 137 | aln_ref 138 | ); 139 | } 140 | // convert to one-indexed [aln_start, aln_end) 141 | let aln_start = usize::from(record.alignment_start().unwrap().unwrap()); 142 | let aln_end = aln_start 143 | + sam::record::Cigar::try_from(record.cigar()) 144 | .unwrap() 145 | .alignment_span(); 146 | // get all the intervals relevant for the current alignment record 147 | let mut intervals_vec = Vec::new(); 148 | let mut overlap_intervals = Vec::new(); 149 | intervals_types 150 | .iter() 151 | .for_each(|intervals_type| match intervals_type { 152 | IntervalsType::Homopolymer => intervals_vec.extend(find_homopolymers( 153 | reference_seqs[aln_ref].sequence(), 154 | aln_start, 155 | aln_end, 156 | strand_rev, 157 | )), 158 | IntervalsType::Window(win_len) => intervals_vec 159 | .extend(get_windows(aln_start, aln_end, *win_len, false, strand_rev)), 160 | IntervalsType::WindowPos(win_len) => intervals_vec 161 | .extend(get_windows(aln_start, aln_end, *win_len, true, strand_rev)), 162 | IntervalsType::Border(win_len) => { 163 | intervals_vec.extend(get_borders(aln_start, aln_end, *win_len, strand_rev)) 164 | } 165 | IntervalsType::Match(seq) => intervals_vec.extend(get_matches( 166 | reference_seqs[aln_ref].sequence(), 167 | aln_start, 168 | aln_end, 169 | seq, 170 | strand_rev, 171 | )), 172 | IntervalsType::Bed(intervals) => { 173 | overlap_intervals.extend(intervals.find(aln_ref, aln_start, aln_end)) 174 | } 175 | }); 176 | overlap_intervals.extend(&intervals_vec); 177 | overlap_intervals.sort(); 178 | 179 | let stats = 180 | AlnStats::from_record(&references, &reference_seqs, &record, &overlap_intervals); 181 | 182 | summary_yield.lock().unwrap().update(&stats); 183 | summary_identity.lock().unwrap().update(&stats); 184 | summary_features 185 | .as_ref() 186 | .map(|f| f.lock().unwrap().update(&stats)); 187 | summary_cigars.lock().unwrap().update(&stats); 188 | summary_bins 189 | .as_ref() 190 | 
.map(|b| b.lock().unwrap().update(&stats)); 191 | summary_qual_score.lock().unwrap().update(&stats); 192 | 193 | if let Some(ref w) = aln_stats_writer { 194 | let mut w = w.lock().unwrap(); 195 | if let Some(ref name) = name_column { 196 | write!(w, "{},{}\n", name, stats.to_csv()).unwrap(); 197 | } else { 198 | write!(w, "{}\n", stats.to_csv()).unwrap(); 199 | } 200 | } 201 | }); 202 | 203 | write_summary( 204 | summary_yield.into_inner().unwrap(), 205 | &stats_prefix, 206 | YIELD_STATS_NAME, 207 | ); 208 | 209 | summary_identity.lock().unwrap().total_alns = total_alns.into_inner(); 210 | write_summary( 211 | summary_identity.into_inner().unwrap(), 212 | &stats_prefix, 213 | IDENTITY_STATS_NAME, 214 | ); 215 | 216 | if let Some(f) = summary_features { 217 | write_summary(f.into_inner().unwrap(), &stats_prefix, FEATURE_STATS_NAME); 218 | } 219 | 220 | write_summary( 221 | summary_cigars.into_inner().unwrap(), 222 | &stats_prefix, 223 | CIGAR_STATS_NAME, 224 | ); 225 | 226 | if let Some(b) = summary_bins { 227 | write_summary(b.into_inner().unwrap(), &stats_prefix, BIN_STATS_NAME); 228 | } 229 | 230 | write_summary( 231 | summary_qual_score.into_inner().unwrap(), 232 | &stats_prefix, 233 | QUAL_SCORE_STATS_NAME, 234 | ); 235 | } 236 | 237 | fn write_summary(s: D, prefix: &str, name: &str) { 238 | let summary_path = format!("{}.{}", prefix, name); 239 | let mut summary_writer = File::create(&summary_path).unwrap(); 240 | write!(summary_writer, "{}", s).unwrap(); 241 | } 242 | 243 | fn main() { 244 | let start_time = Instant::now(); 245 | let args = Args::parse(); 246 | 247 | let bin_types = args 248 | .bin_types 249 | .map(|b| b.iter().map(|s| BinType::from_str(s).unwrap()).collect()); 250 | 251 | let mut intervals_types = Vec::new(); 252 | if args.intervals_hp { 253 | intervals_types.push(IntervalsType::Homopolymer); 254 | } 255 | if let Some(paths) = args.intervals_bed { 256 | intervals_types.extend(paths.iter().map(|p| IntervalsType::Bed(Intervals::new(p)))); 257 
| } 258 | if let Some(win_lens) = args.intervals_window { 259 | intervals_types.extend(win_lens.into_iter().map(|l| IntervalsType::Window(l))); 260 | } 261 | if let Some(win_lens) = args.intervals_window_pos { 262 | intervals_types.extend(win_lens.into_iter().map(|l| IntervalsType::WindowPos(l))); 263 | } 264 | if let Some(win_lens) = args.intervals_border { 265 | intervals_types.extend(win_lens.into_iter().map(|l| IntervalsType::Border(l))); 266 | } 267 | if let Some(seqs) = args.intervals_match { 268 | intervals_types.extend(seqs.into_iter().map(|mut s| { 269 | s.make_ascii_uppercase(); 270 | IntervalsType::Match(s) 271 | })); 272 | } 273 | 274 | rayon::ThreadPoolBuilder::new() 275 | .num_threads(args.threads) 276 | .build_global() 277 | .unwrap(); 278 | 279 | run( 280 | args.input, 281 | args.reference, 282 | args.stats_prefix, 283 | bin_types, 284 | intervals_types, 285 | args.name_column, 286 | !args.no_per_aln_stats, 287 | ); 288 | 289 | let duration = start_time.elapsed(); 290 | println!("Run time (s): {}", duration.as_secs()); 291 | } 292 | 293 | enum IntervalsType { 294 | Bed(Intervals), 295 | Homopolymer, 296 | Window(usize), 297 | WindowPos(usize), 298 | Border(usize), 299 | Match(String), 300 | } 301 | 302 | #[derive(Parser)] 303 | #[clap(author, version, about)] 304 | struct Args { 305 | /// Input BAM file. 306 | input: String, 307 | 308 | /// Input reference FASTA file. Can be gzipped. 309 | reference: String, 310 | 311 | /// Prefix for output files that contain statistics. 312 | stats_prefix: String, 313 | 314 | /// Add column with a specific name in CSV outputs. 315 | #[clap(short, long)] 316 | name_column: Option, 317 | 318 | /// Turn off outputting per alignment stats. 319 | #[clap(long)] 320 | no_per_aln_stats: bool, 321 | 322 | /// Types of bins to use for per alignment stats. 323 | /// 324 | /// Each bin should be of the format :. 
325 | /// 326 | /// Supported bin types: 327 | /// q_len (read sequence length), 328 | /// subread_passes, 329 | /// mapq, 330 | /// mean_qual, 331 | /// gc_content, 332 | /// concordance_qv (phred scale Q-value) 333 | #[clap(short, long, min_values = 1)] 334 | bin_types: Option>, 335 | 336 | /// Use intervals from a BED file. 337 | /// 338 | /// The BED file should have the columns chrom, start, stop, and feature. 339 | /// The feature column is optional. 340 | /// 341 | /// This allows stats to be gathered separately for different types of intervals. 342 | /// Note that all intervals are on the reference, not the reads. 343 | #[clap(long, min_values = 1)] 344 | intervals_bed: Option>, 345 | 346 | /// Use homopolymer regions in the reference as intervals. 347 | #[clap(long)] 348 | intervals_hp: bool, 349 | 350 | /// Use fixed-width nonoverlapping windows as intervals. 351 | /// 352 | /// This is used to specify the window widths. 353 | #[clap(long, min_values = 1)] 354 | intervals_window: Option>, 355 | 356 | /// Use fixed-width nonoverlapping windows with positions as intervals. 357 | /// 358 | /// This is used to specify the window widths. 359 | #[clap(long, min_values = 1)] 360 | intervals_window_pos: Option>, 361 | 362 | /// Use fixed-width nonoverlapping window border regions as intervals. 363 | /// 364 | /// This is used to specify the window widths. 365 | #[clap(long, min_values = 1)] 366 | intervals_border: Option>, 367 | 368 | /// Use regions in the reference that match any of the specified subsequences as intervals. 369 | #[clap(long, min_values = 1)] 370 | intervals_match: Option>, 371 | 372 | /// Number of threads. Will be automatically determined if this is set to 0. 
373 | #[clap(short, long, default_value_t = 0usize)] 374 | threads: usize, 375 | } 376 | -------------------------------------------------------------------------------- /src/stats.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Google LLC 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | // this software and associated documentation files (the "Software"), to deal in 5 | // the Software without restriction, including without limitation the rights to 6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | // the Software, and to permit persons to whom the Software is furnished to do so, 8 | // subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in all 11 | // copies or substantial portions of the Software. 12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | use noodles::bam; 21 | use noodles::core::Position; 22 | use noodles::fasta; 23 | use noodles::sam; 24 | 25 | use sam::record::cigar::op::Kind; 26 | use sam::record::data::field::Tag; 27 | 28 | use fxhash::FxHashMap; 29 | 30 | use std::fmt; 31 | use std::str::FromStr; 32 | 33 | use crate::bed::*; 34 | 35 | /// Statistics for each alignment. 
36 | #[derive(Debug)] 37 | pub struct AlnStats<'a> { 38 | pub read_name: String, 39 | pub chr: String, 40 | pub ref_pos: usize, 41 | pub q_len: usize, 42 | pub effective_cov: Option, 43 | pub subread_passes: Option, 44 | pub pred_concordance: Option, 45 | pub supplementary: bool, 46 | pub strand_rev: bool, 47 | pub mapq: u8, 48 | pub mean_qual: u8, 49 | pub read_len: usize, 50 | pub ref_cov: f64, 51 | pub gc_content: f64, 52 | pub concordance: f64, 53 | pub concordance_gc: f64, 54 | pub concordance_qv: f64, 55 | pub matches: usize, 56 | pub mismatches: usize, 57 | pub non_hp_ins: usize, 58 | pub non_hp_del: usize, 59 | pub hp_ins: usize, 60 | pub hp_del: usize, 61 | pub gc_ins: usize, 62 | pub gc_del: usize, 63 | pub feature_stats: FxHashMap<&'a str, FeatureStats>, 64 | pub cigar_len_stats: FxHashMap<(usize, u8), usize>, 65 | pub q_score_stats: QualScoreStats, 66 | } 67 | 68 | /// Stats on the number of matches and mismatches for each quality score. 69 | #[derive(Debug, Clone)] 70 | pub struct QualScoreStats { 71 | stats: Vec<(usize, usize)>, // (match, mismatch) 72 | } 73 | 74 | impl QualScoreStats { 75 | pub fn assign_add(&mut self, o: &Self) { 76 | self.stats.iter_mut().zip(&o.stats).for_each(|(q, o)| { 77 | q.0 += o.0; 78 | q.1 += o.1; 79 | }); 80 | } 81 | 82 | pub fn increment(&mut self, q_score: usize, is_match: bool) { 83 | if is_match { 84 | self.stats[q_score].0 += 1; 85 | } else { 86 | self.stats[q_score].1 += 1; 87 | } 88 | } 89 | 90 | pub fn empirical_qv(&self) -> Vec<(usize, f64)> { 91 | self.stats 92 | .iter() 93 | .enumerate() 94 | .filter_map(|(i, &(matches, mismatches))| { 95 | if matches == 0 && mismatches == 0 { 96 | None 97 | } else { 98 | Some(( 99 | i, 100 | concordance_qv( 101 | (matches as f64) / ((matches + mismatches) as f64), 102 | mismatches != 0, 103 | ), 104 | )) 105 | } 106 | }) 107 | .collect() 108 | } 109 | } 110 | 111 | impl Default for QualScoreStats { 112 | fn default() -> Self { 113 | Self { 114 | stats: vec![(0usize, 0usize); 
256], 115 | } 116 | } 117 | } 118 | 119 | /// Per-read attributes that can be binned. 120 | #[derive(Copy, Clone)] 121 | pub enum BinType { 122 | QLen(usize), 123 | SubreadPasses(usize), 124 | MapQ(u8), 125 | MeanQual(u8), 126 | GcContent(f64), 127 | ConcordanceQv(f64), 128 | } 129 | 130 | impl BinType { 131 | pub fn get_bin(&self, a: &AlnStats) -> String { 132 | match self { 133 | Self::QLen(step) => format!("{}", a.q_len / step * step), 134 | Self::SubreadPasses(step) => format!( 135 | "{}", 136 | a.subread_passes.expect("Subread passes not found!") / step * step 137 | ), 138 | Self::MapQ(step) => format!("{}", a.mapq / step * step), 139 | Self::MeanQual(step) => format!("{}", a.mean_qual / step * step), 140 | Self::GcContent(step) => format!("{:.6}", (a.gc_content / step).floor() * step), 141 | Self::ConcordanceQv(step) => format!("{:.2}", (a.concordance_qv / step).floor() * step), 142 | } 143 | } 144 | } 145 | 146 | impl fmt::Display for BinType { 147 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 148 | match self { 149 | Self::QLen(step) => write!(f, "q_len:{}", step), 150 | Self::SubreadPasses(step) => write!(f, "subread_passes:{}", step), 151 | Self::MapQ(step) => write!(f, "mapq:{}", step), 152 | Self::MeanQual(step) => write!(f, "mean_qual:{}", step), 153 | Self::GcContent(step) => write!(f, "gc_content:{}", step), 154 | Self::ConcordanceQv(step) => write!(f, "concordance_qv:{}", step), 155 | } 156 | } 157 | } 158 | 159 | impl FromStr for BinType { 160 | type Err = Box; 161 | 162 | fn from_str(s: &str) -> Result { 163 | let mut split = s.split(':'); 164 | let a = split 165 | .next() 166 | .expect("Bin type not found! Expected :"); 167 | let b = split 168 | .next() 169 | .expect("Step size not found! 
Expected :"); 170 | 171 | use BinType::*; 172 | match a { 173 | "q_len" => Ok(QLen(b.parse::<_>().unwrap())), 174 | "subread_passes" => Ok(SubreadPasses(b.parse::<_>().unwrap())), 175 | "mapq" => Ok(MapQ(b.parse::<_>().unwrap())), 176 | "mean_qual" => Ok(MeanQual(b.parse::<_>().unwrap())), 177 | "gc_content" => Ok(GcContent(b.parse::<_>().unwrap())), 178 | "concordance_qv" => Ok(ConcordanceQv(b.parse::<_>().unwrap())), 179 | _ => Err("Invalid stat to bin across!".into()), 180 | } 181 | } 182 | } 183 | 184 | /// Statistics for each bin. 185 | #[derive(Debug, Default)] 186 | pub struct BinStats { 187 | pub num_reads: usize, 188 | pub matches: usize, 189 | pub mismatches: usize, 190 | pub non_hp_ins: usize, 191 | pub non_hp_del: usize, 192 | pub hp_ins: usize, 193 | pub hp_del: usize, 194 | } 195 | 196 | impl BinStats { 197 | pub fn new(stats: &AlnStats) -> Self { 198 | Self { 199 | num_reads: 1, 200 | matches: stats.matches, 201 | mismatches: stats.mismatches, 202 | non_hp_ins: stats.non_hp_ins, 203 | non_hp_del: stats.non_hp_del, 204 | hp_ins: stats.hp_ins, 205 | hp_del: stats.hp_del, 206 | } 207 | } 208 | 209 | pub fn assign_add(&mut self, o: &Self) { 210 | self.num_reads += o.num_reads; 211 | self.matches += o.matches; 212 | self.mismatches += o.mismatches; 213 | self.non_hp_ins += o.non_hp_ins; 214 | self.non_hp_del += o.non_hp_del; 215 | self.hp_ins += o.hp_ins; 216 | self.hp_del += o.hp_del; 217 | } 218 | 219 | pub fn num_bases(&self) -> usize { 220 | self.matches + self.mismatches + self.non_hp_del + self.hp_del 221 | } 222 | 223 | pub fn num_errors(&self) -> usize { 224 | self.mismatches + self.non_hp_ins + self.hp_ins + self.non_hp_del + self.hp_del 225 | } 226 | 227 | pub fn identity(&self) -> f64 { 228 | (self.matches as f64) / ((self.matches + self.num_errors()) as f64) 229 | } 230 | } 231 | 232 | /// Statistics for each interval feature. 
233 | #[derive(Debug, Default)] 234 | pub struct FeatureStats { 235 | pub overlaps: usize, 236 | pub identical_overlaps: usize, 237 | pub matches: usize, 238 | pub mismatches: usize, 239 | pub non_hp_ins: usize, 240 | pub non_hp_del: usize, 241 | pub hp_ins: usize, 242 | pub hp_del: usize, 243 | pub total_qual_error: f64, 244 | pub q_score_stats: QualScoreStats, 245 | } 246 | 247 | impl FeatureStats { 248 | pub fn assign_add(&mut self, o: &Self) { 249 | self.overlaps += o.overlaps; 250 | self.identical_overlaps += o.identical_overlaps; 251 | self.matches += o.matches; 252 | self.mismatches += o.mismatches; 253 | self.non_hp_ins += o.non_hp_ins; 254 | self.non_hp_del += o.non_hp_del; 255 | self.hp_ins += o.hp_ins; 256 | self.hp_del += o.hp_del; 257 | self.total_qual_error += o.total_qual_error; 258 | self.q_score_stats.assign_add(&o.q_score_stats); 259 | } 260 | 261 | pub fn num_bases(&self) -> usize { 262 | self.matches + self.mismatches + self.non_hp_del + self.hp_del 263 | } 264 | 265 | pub fn num_errors(&self) -> usize { 266 | self.mismatches + self.non_hp_ins + self.hp_ins + self.non_hp_del + self.hp_del 267 | } 268 | 269 | pub fn identity(&self) -> f64 { 270 | (self.matches as f64) / ((self.matches + self.num_errors()) as f64) 271 | } 272 | 273 | pub fn mean_qual(&self) -> f64 { 274 | // only include quality scores from matches and mismatches 275 | error_to_qual(self.total_qual_error / ((self.matches + self.mismatches) as f64)) 276 | } 277 | } 278 | 279 | impl<'a> AlnStats<'a> { 280 | pub fn from_record( 281 | references: &sam::header::ReferenceSequences, 282 | reference_seqs: &FxHashMap, 283 | r: &bam::lazy::Record, 284 | intervals: &[&'a FeatureInterval], 285 | ) -> Self { 286 | let chr = references[r.reference_sequence_id().unwrap().unwrap()] 287 | .name() 288 | .to_string(); 289 | let mut ref_pos = usize::from(r.alignment_start().unwrap().unwrap()); 290 | let sequence = sam::record::Sequence::try_from(r.sequence()).unwrap(); 291 | let q_scores = 
sam::record::QualityScores::try_from(r.quality_scores()).unwrap(); 292 | if sequence.is_empty() || q_scores.is_empty() { 293 | panic!("Read sequence or quality scores do not exist!"); 294 | } 295 | let flags = r.flags().unwrap(); 296 | let data = sam::record::Data::try_from(r.data()).unwrap(); 297 | let ec_tag = Tag::try_from(*b"ec").unwrap(); 298 | let ec = data 299 | .get(ec_tag) 300 | .map(|f| f.value().as_float().unwrap() as f64); 301 | let np_tag: Tag = Tag::try_from(*b"np").unwrap(); 302 | let np = data 303 | .get(np_tag) 304 | .map(|f| f.value().as_int().unwrap() as usize); 305 | let rq_tag: Tag = Tag::try_from(*b"rq").unwrap(); 306 | let rq = data 307 | .get(rq_tag) 308 | .map(|f| f.value().as_float().unwrap() as f64); 309 | 310 | let mut res = AlnStats { 311 | read_name: r 312 | .read_name() 313 | .expect("Error parsing read name! Perhaps it contains an '@'?") 314 | .unwrap() 315 | .to_string(), 316 | chr, 317 | ref_pos, 318 | q_len: sequence.len(), 319 | effective_cov: ec, 320 | subread_passes: np, 321 | pred_concordance: rq, 322 | supplementary: flags.is_supplementary(), 323 | strand_rev: flags.is_reverse_complemented(), 324 | mapq: r 325 | .mapping_quality() 326 | .unwrap() 327 | .map(|q| u8::from(q)) 328 | .unwrap_or(255u8), 329 | mean_qual: mean_qual(q_scores.as_ref()), 330 | // fill in the rest afterwards 331 | read_len: 0, 332 | ref_cov: 0.0, 333 | gc_content: 0.0, 334 | concordance: 0.0, 335 | concordance_gc: 0.0, 336 | concordance_qv: 0.0, 337 | matches: 0, 338 | mismatches: 0, 339 | non_hp_ins: 0, 340 | non_hp_del: 0, 341 | hp_ins: 0, 342 | hp_del: 0, 343 | gc_ins: 0, 344 | gc_del: 0, 345 | feature_stats: FxHashMap::default(), 346 | cigar_len_stats: FxHashMap::default(), 347 | q_score_stats: QualScoreStats::default(), 348 | }; 349 | 350 | let mut interval_has_error = vec![false; intervals.len()]; 351 | for i in intervals { 352 | res.feature_stats 353 | .entry(&i.val) 354 | .or_insert_with(|| FeatureStats::default()) 355 | .overlaps += 1; 356 | } 
357 | 358 | let mut query_pos = 1; 359 | let mut interval_start_idx = 0; 360 | let curr_ref_seq = reference_seqs[&res.chr].sequence(); 361 | let mut curr_features = Vec::new(); 362 | let mut curr_interval_idxs = Vec::new(); 363 | 364 | let mut intervals_have_error = |v: &[usize]| { 365 | v.iter() 366 | .for_each(|&interval_idx| interval_has_error[interval_idx] = true); 367 | }; 368 | 369 | // count mismatches, indels, and homopolymers 370 | let cigar = sam::record::Cigar::try_from(r.cigar()).unwrap(); 371 | for op in cigar.iter() { 372 | for _i in 0..op.len() { 373 | // skip intervals that cannot overlap the current reference position 374 | while interval_start_idx < intervals.len() 375 | && ref_pos >= intervals[interval_start_idx].stop 376 | { 377 | interval_start_idx += 1; 378 | } 379 | // find the intervals that overlap the current reference position 380 | let mut interval_idx = interval_start_idx; 381 | curr_features.clear(); 382 | curr_interval_idxs.clear(); 383 | while interval_idx < intervals.len() && ref_pos >= intervals[interval_idx].start { 384 | if ref_pos < intervals[interval_idx].stop { 385 | // get feature names of the overlapping intervals 386 | curr_features.push(intervals[interval_idx].val.as_str()); 387 | curr_interval_idxs.push(interval_idx); 388 | } 389 | interval_idx += 1; 390 | } 391 | 392 | match op.kind() { 393 | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Match => { 394 | let c = curr_ref_seq[Position::new(ref_pos).unwrap()].to_ascii_uppercase(); 395 | let is_match = op.kind() == Kind::SequenceMatch 396 | || (op.kind() == Kind::Match 397 | && c == u8::from(sequence[Position::new(query_pos).unwrap()]) 398 | .to_ascii_uppercase()); 399 | let q_score = u8::from(q_scores[Position::new(query_pos).unwrap()]); 400 | let qual_error = qual_to_error(q_score); 401 | if is_match { 402 | res.matches += 1; 403 | res.q_score_stats.increment(q_score as usize, true); 404 | curr_features.iter().for_each(|f| { 405 | let stats = 
res.feature_stats.get_mut(f).unwrap(); 406 | stats.matches += 1; 407 | stats.total_qual_error += qual_error; 408 | stats.q_score_stats.increment(q_score as usize, true); 409 | }); 410 | } else { 411 | res.mismatches += 1; 412 | res.q_score_stats.increment(q_score as usize, false); 413 | curr_features.iter().for_each(|f| { 414 | let stats = res.feature_stats.get_mut(f).unwrap(); 415 | stats.mismatches += 1; 416 | stats.total_qual_error += qual_error; 417 | stats.q_score_stats.increment(q_score as usize, false); 418 | }); 419 | intervals_have_error(&curr_interval_idxs); 420 | } 421 | if c == b'C' || c == b'G' { 422 | res.gc_content += 1.0; 423 | } 424 | query_pos += 1; 425 | ref_pos += 1; 426 | } 427 | Kind::Insertion => { 428 | // can be computed without looping through the number of insertions 429 | // this does not modify ref_pos 430 | let before_ins = 431 | curr_ref_seq[Position::new(ref_pos).unwrap()].to_ascii_uppercase(); 432 | let after_ins = curr_ref_seq 433 | .get(Position::new(ref_pos + 1).unwrap()) 434 | .unwrap_or(&b'?') 435 | .to_ascii_uppercase(); 436 | let query_ins = &sequence[Position::new(query_pos).unwrap() 437 | ..Position::new(query_pos + op.len()).unwrap()]; 438 | let hp_before = query_ins 439 | .iter() 440 | .map(|&c| u8::from(c).to_ascii_uppercase()) 441 | .all(|c| c == before_ins); 442 | let hp_after = query_ins 443 | .iter() 444 | .map(|&c| u8::from(c).to_ascii_uppercase()) 445 | .all(|c| c == after_ins); 446 | if hp_before || hp_after { 447 | res.hp_ins += op.len(); 448 | curr_features.iter().for_each(|f| { 449 | res.feature_stats.get_mut(f).unwrap().hp_ins += op.len() 450 | }); 451 | } else { 452 | res.non_hp_ins += op.len(); 453 | curr_features.iter().for_each(|f| { 454 | res.feature_stats.get_mut(f).unwrap().non_hp_ins += op.len() 455 | }); 456 | } 457 | intervals_have_error(&curr_interval_idxs); 458 | query_pos += op.len(); 459 | break; 460 | } 461 | Kind::Deletion => { 462 | let before_curr = curr_ref_seq 463 | 
.get(Position::new(ref_pos - 1).unwrap()) 464 | .unwrap_or(&b'?') 465 | .to_ascii_uppercase(); 466 | let after_curr = curr_ref_seq 467 | .get(Position::new(ref_pos + 1).unwrap()) 468 | .unwrap_or(&b'?') 469 | .to_ascii_uppercase(); 470 | let curr = 471 | curr_ref_seq[Position::new(ref_pos).unwrap()].to_ascii_uppercase(); 472 | if curr == b'C' || curr == b'G' { 473 | res.gc_content += 1.0; 474 | } 475 | let hp = curr == before_curr || curr == after_curr; 476 | if hp { 477 | res.hp_del += 1; 478 | curr_features 479 | .iter() 480 | .for_each(|f| res.feature_stats.get_mut(f).unwrap().hp_del += 1); 481 | } else { 482 | res.non_hp_del += 1; 483 | curr_features.iter().for_each(|f| { 484 | res.feature_stats.get_mut(f).unwrap().non_hp_del += 1 485 | }); 486 | } 487 | intervals_have_error(&curr_interval_idxs); 488 | ref_pos += 1; 489 | } 490 | Kind::SoftClip => { 491 | // does not require looping through the number of soft clips 492 | query_pos += op.len(); 493 | break; 494 | } 495 | Kind::HardClip => { 496 | // does not require looping through the number of hard clips 497 | break; 498 | } 499 | Kind::Skip => { 500 | // does not require looping through the number of skip operations 501 | ref_pos += op.len(); 502 | break; 503 | } 504 | _ => panic!("Unexpected CIGAR operation: {}", op), 505 | } 506 | } 507 | 508 | // gap compressed 509 | match op.kind() { 510 | Kind::SequenceMatch => { 511 | *res.cigar_len_stats.entry((op.len(), b'=')).or_insert(0) += 1; 512 | } 513 | Kind::SequenceMismatch => { 514 | *res.cigar_len_stats.entry((op.len(), b'X')).or_insert(0) += 1; 515 | } 516 | Kind::Match => { 517 | *res.cigar_len_stats.entry((op.len(), b'M')).or_insert(0) += 1; 518 | } 519 | Kind::Insertion => { 520 | *res.cigar_len_stats.entry((op.len(), b'I')).or_insert(0) += 1; 521 | res.gc_ins += 1; 522 | } 523 | Kind::Deletion => { 524 | *res.cigar_len_stats.entry((op.len(), b'D')).or_insert(0) += 1; 525 | res.gc_del += 1; 526 | } 527 | _ => (), 528 | } 529 | } 530 | 531 | let errors = 
res.mismatches + res.non_hp_ins + res.non_hp_del + res.hp_ins + res.hp_del; 532 | res.read_len = res.matches + res.mismatches + res.non_hp_del + res.hp_del; 533 | res.ref_cov = (res.read_len as f64) / (curr_ref_seq.len() as f64); 534 | res.gc_content /= res.read_len as f64; 535 | res.concordance = (res.matches as f64) / ((res.matches + errors) as f64); 536 | res.concordance_gc = (res.matches as f64) 537 | / ((res.matches + res.mismatches + res.gc_ins + res.gc_del) as f64); 538 | res.concordance_qv = concordance_qv(res.concordance, errors > 0); 539 | 540 | for (i, &has_error) in intervals.iter().zip(&interval_has_error) { 541 | if !has_error { 542 | res.feature_stats 543 | .get_mut(i.val.as_str()) 544 | .unwrap() 545 | .identical_overlaps += 1; 546 | } 547 | } 548 | 549 | res 550 | } 551 | 552 | pub fn header() -> &'static str { 553 | "read,chr,pos,read_length,effective_coverage,subread_passes,predicted_concordance,alignment_type,strand,alignment_mapq,mean_quality,aligned_read_length,reference_coverage,gc_content,concordance,gap_compressed_concordance,concordance_qv,mismatches,non_hp_ins,non_hp_del,hp_ins,hp_del" 554 | } 555 | 556 | pub fn to_csv(&self) -> String { 557 | let supp_str = if self.supplementary { 558 | "supplementary" 559 | } else { 560 | "primary" 561 | }; 562 | let strand_str = if self.strand_rev { "-" } else { "+" }; 563 | let ec = self 564 | .effective_cov 565 | .map(|x| format!("{:.2}", x)) 566 | .unwrap_or_else(|| String::new()); 567 | let np = self 568 | .subread_passes 569 | .map(|x| format!("{}", x)) 570 | .unwrap_or_else(|| String::new()); 571 | let rq = self 572 | .pred_concordance 573 | .map(|x| format!("{:.6}", x)) 574 | .unwrap_or_else(|| String::new()); 575 | format!( 576 | "{},{},{},{},{:.2},{},{:.6},{},{},{},{},{},{:.6},{:.6},{:.6},{:.6},{:.2},{},{},{},{},{}", 577 | self.read_name, 578 | self.chr, 579 | self.ref_pos, 580 | self.q_len, 581 | ec, 582 | np, 583 | rq, 584 | supp_str, 585 | strand_str, 586 | self.mapq, 587 | self.mean_qual, 
588 | self.read_len, 589 | self.ref_cov, 590 | self.gc_content, 591 | self.concordance, 592 | self.concordance_gc, 593 | self.concordance_qv, 594 | self.mismatches, 595 | self.non_hp_ins, 596 | self.non_hp_del, 597 | self.hp_ins, 598 | self.hp_del 599 | ) 600 | } 601 | } 602 | 603 | /// Compute the Phred scale Q-value for a certain concordance/identity. 604 | /// 605 | /// Perfect match will have a Q-value of 75. 606 | pub fn concordance_qv(concordance: f64, has_errors: bool) -> f64 { 607 | if has_errors { 608 | -10.0f64 * (1.0f64 - concordance).log10() 609 | } else { 610 | 75.0f64 611 | } 612 | } 613 | 614 | fn qual_to_error(q: u8) -> f64 { 615 | 10.0f64.powf(-(q as f64) / 10.0f64) 616 | } 617 | 618 | fn error_to_qual(e: f64) -> f64 { 619 | -10.0f64 * e.log10() 620 | } 621 | 622 | fn mean_qual(q_scores: &[sam::record::quality_scores::Score]) -> u8 { 623 | let sum_q = q_scores 624 | .iter() 625 | .map(|&q| qual_to_error(u8::from(q))) 626 | .sum::(); 627 | error_to_qual(sum_q / (q_scores.len() as f64)).round() as u8 628 | } 629 | -------------------------------------------------------------------------------- /src/summary.rs: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Google LLC 2 | // 3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | // this software and associated documentation files (the "Software"), to deal in 5 | // the Software without restriction, including without limitation the rights to 6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | // the Software, and to permit persons to whom the Software is furnished to do so, 8 | // subject to the following conditions: 9 | // 10 | // The above copyright notice and this permission notice shall be included in all 11 | // copies or substantial portions of the Software. 
12 | // 13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | 20 | use std::fmt; 21 | 22 | use fxhash::FxHashMap; 23 | 24 | use ordered_float::OrderedFloat; 25 | 26 | use crate::stats::*; 27 | 28 | // important to ensure that summary stats are sorted so output order is deterministic 29 | 30 | pub struct YieldSummary { 31 | name_column: Option, 32 | /// (reads, bases) 33 | q_yield: [(usize, usize); 15], 34 | } 35 | 36 | impl YieldSummary { 37 | pub fn new(mut name_column: Option) -> Self { 38 | if let Some(ref mut name) = name_column { 39 | name.push(','); 40 | } 41 | Self { 42 | name_column, 43 | q_yield: [(0usize, 0usize); 15], 44 | } 45 | } 46 | 47 | pub fn update(&mut self, aln_stats: &AlnStats) { 48 | if aln_stats.supplementary { 49 | return; 50 | } 51 | 52 | for i in 0..self.q_yield.len() { 53 | let min_q = i * 5; 54 | if aln_stats.concordance_qv >= (min_q as f64) { 55 | self.q_yield[i].0 += 1; 56 | self.q_yield[i].1 += aln_stats.q_len; 57 | } 58 | } 59 | } 60 | } 61 | 62 | impl fmt::Display for YieldSummary { 63 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 64 | writeln!( 65 | f, 66 | "{}min_empirical_q,yield_reads,yield_bases", 67 | if self.name_column.is_some() { 68 | "name," 69 | } else { 70 | "" 71 | } 72 | )?; 73 | for i in 0..self.q_yield.len() { 74 | writeln!( 75 | f, 76 | "{}{},{},{}", 77 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""), 78 | i * 5, 79 | self.q_yield[i].0, 80 | self.q_yield[i].1 81 | )?; 82 | } 83 | Ok(()) 84 | } 85 | } 86 | 87 | #[derive(Default)] 88 | pub 
struct IdentitySummary { 89 | name_column: Option, 90 | pub total_alns: usize, 91 | matches: usize, 92 | mismatches: usize, 93 | non_hp_ins: usize, 94 | non_hp_del: usize, 95 | hp_ins: usize, 96 | hp_del: usize, 97 | gc_ins: usize, 98 | gc_del: usize, 99 | num_reads: usize, 100 | } 101 | 102 | impl IdentitySummary { 103 | pub fn new(mut name_column: Option) -> Self { 104 | if let Some(ref mut name) = name_column { 105 | name.push(','); 106 | } 107 | Self { 108 | name_column, 109 | ..Default::default() 110 | } 111 | } 112 | 113 | pub fn update(&mut self, aln_stats: &AlnStats) { 114 | if aln_stats.supplementary { 115 | return; 116 | } 117 | 118 | self.matches += aln_stats.matches; 119 | self.mismatches += aln_stats.mismatches; 120 | self.non_hp_ins += aln_stats.non_hp_ins; 121 | self.non_hp_del += aln_stats.non_hp_del; 122 | self.hp_ins += aln_stats.hp_ins; 123 | self.hp_del += aln_stats.hp_del; 124 | self.gc_ins += aln_stats.gc_ins; 125 | self.gc_del += aln_stats.gc_del; 126 | self.num_reads += 1; 127 | } 128 | } 129 | 130 | impl fmt::Display for IdentitySummary { 131 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 132 | writeln!(f, "{}total_alns,primary_alns,identity,identity_qv,gap_compressed_identity,matches_per_kbp,mismatches_per_kbp,non_hp_ins_per_kbp,non_hp_del_per_kbp,hp_ins_per_kbp,hp_del_per_kbp", if self.name_column.is_some() { "name," } else { "" })?; 133 | let num_errors = 134 | self.mismatches + self.non_hp_ins + self.hp_ins + self.non_hp_del + self.hp_del; 135 | let num_bases = self.matches + self.mismatches + self.non_hp_del + self.hp_del; 136 | let id = (self.matches as f64) / ((self.matches + num_errors) as f64); 137 | let gc_id = (self.matches as f64) 138 | / ((self.matches + self.mismatches + self.gc_ins + self.gc_del) as f64); 139 | let per_kbp = |x| (x as f64) / (num_bases as f64) * 1000.0f64; 140 | writeln!( 141 | f, 142 | "{}{},{},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6}", 143 | self.name_column.as_ref().map(|n| 
n.as_str()).unwrap_or(""), 144 | self.total_alns, 145 | self.num_reads, 146 | id, 147 | concordance_qv(id, num_errors > 0), 148 | gc_id, 149 | per_kbp(self.matches), 150 | per_kbp(self.mismatches), 151 | per_kbp(self.non_hp_ins), 152 | per_kbp(self.non_hp_del), 153 | per_kbp(self.hp_ins), 154 | per_kbp(self.hp_del) 155 | ) 156 | } 157 | } 158 | 159 | pub struct FeatureSummary { 160 | name_column: Option, 161 | feature_stats: FxHashMap, 162 | } 163 | 164 | impl FeatureSummary { 165 | pub fn new(mut name_column: Option) -> Self { 166 | if let Some(ref mut name) = name_column { 167 | name.push(','); 168 | } 169 | Self { 170 | name_column, 171 | feature_stats: FxHashMap::default(), 172 | } 173 | } 174 | 175 | pub fn update(&mut self, aln_stats: &AlnStats) { 176 | if aln_stats.supplementary { 177 | return; 178 | } 179 | 180 | for (&k, v) in &aln_stats.feature_stats { 181 | self.feature_stats 182 | .entry(k.to_owned()) 183 | .or_insert_with(|| FeatureStats::default()) 184 | .assign_add(v); 185 | } 186 | } 187 | } 188 | 189 | impl fmt::Display for FeatureSummary { 190 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 191 | writeln!(f, "{}feature,intervals,identical_intervals,identity,identity_qv,mean_qual,bases_per_interval,matches_per_interval,mismatches_per_interval,non_hp_ins_per_interval,non_hp_del_per_interval,hp_ins_per_interval,hp_del_per_interval", if self.name_column.is_some() { "name," } else { "" })?; 192 | let mut v = self.feature_stats.iter().collect::>(); 193 | v.sort_by_key(|x| x.0); 194 | for (feature, stats) in v.into_iter() { 195 | let per_interval = |x| (x as f64) / (stats.overlaps as f64); 196 | let id = stats.identity(); 197 | writeln!( 198 | f, 199 | "{}{},{},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6}", 200 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""), 201 | feature.trim(), 202 | stats.overlaps, 203 | per_interval(stats.identical_overlaps), 204 | id, 205 | concordance_qv(id, id != 1.0), 206 | 
stats.mean_qual(), 207 | per_interval(stats.num_bases()), 208 | per_interval(stats.matches), 209 | per_interval(stats.mismatches), 210 | per_interval(stats.non_hp_ins), 211 | per_interval(stats.non_hp_del), 212 | per_interval(stats.hp_ins), 213 | per_interval(stats.hp_del) 214 | )?; 215 | } 216 | Ok(()) 217 | } 218 | } 219 | 220 | pub struct CigarLenSummary { 221 | name_column: Option, 222 | cigar_len_stats: FxHashMap<(usize, u8), usize>, 223 | } 224 | 225 | impl CigarLenSummary { 226 | pub fn new(mut name_column: Option) -> Self { 227 | if let Some(ref mut name) = name_column { 228 | name.push(','); 229 | } 230 | Self { 231 | name_column, 232 | cigar_len_stats: FxHashMap::default(), 233 | } 234 | } 235 | 236 | pub fn update(&mut self, aln_stats: &AlnStats) { 237 | if aln_stats.supplementary { 238 | return; 239 | } 240 | 241 | for (&k, v) in &aln_stats.cigar_len_stats { 242 | *self.cigar_len_stats.entry(k).or_insert(0) += v; 243 | } 244 | } 245 | } 246 | 247 | impl fmt::Display for CigarLenSummary { 248 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 249 | writeln!( 250 | f, 251 | "{}cigar,length,count,length_count_per_cigar", 252 | if self.name_column.is_some() { 253 | "name," 254 | } else { 255 | "" 256 | } 257 | )?; 258 | let mut v = self.cigar_len_stats.iter().collect::>(); 259 | v.sort_by_key(|(x, _)| (x.1, x.0)); 260 | let mut total_cigars = [0usize; 128]; 261 | for (cigar, &count) in &v { 262 | total_cigars[cigar.1 as usize] += count; 263 | } 264 | 265 | for (cigar, &count) in v.into_iter() { 266 | writeln!( 267 | f, 268 | "{}{},{},{},{:.6}", 269 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""), 270 | cigar.1 as char, 271 | cigar.0, 272 | count, 273 | (count as f64) / (total_cigars[cigar.1 as usize] as f64), 274 | )?; 275 | } 276 | Ok(()) 277 | } 278 | } 279 | 280 | pub struct BinSummary { 281 | name_column: Option, 282 | bin_maps: Vec<(BinType, FxHashMap)>, 283 | } 284 | 285 | impl BinSummary { 286 | pub fn new(mut name_column: Option, 
bin_types: Vec) -> Self { 287 | if let Some(ref mut name) = name_column { 288 | name.push(','); 289 | } 290 | let bin_maps = bin_types 291 | .into_iter() 292 | .map(|b| (b, FxHashMap::default())) 293 | .collect(); 294 | Self { 295 | name_column, 296 | bin_maps, 297 | } 298 | } 299 | 300 | pub fn update(&mut self, aln_stats: &AlnStats) { 301 | if aln_stats.supplementary { 302 | return; 303 | } 304 | 305 | self.bin_maps.iter_mut().for_each(|(bin_type, bin_map)| { 306 | let bin = bin_type.get_bin(aln_stats); 307 | let bin_stats = BinStats::new(aln_stats); 308 | bin_map 309 | .entry(bin) 310 | .or_insert_with(|| BinStats::default()) 311 | .assign_add(&bin_stats); 312 | }); 313 | } 314 | } 315 | 316 | impl fmt::Display for BinSummary { 317 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 318 | writeln!( 319 | f, 320 | "{}bin_type,bin,num_reads,num_bases,identity,identity_qv,matches_per_kbp,mismatches_per_kbp,non_hp_ins_per_kbp,non_hp_del_per_kbp,hp_ins_per_kbp,hp_del_per_kbp", 321 | if self.name_column.is_some() { 322 | "name," 323 | } else { 324 | "" 325 | } 326 | )?; 327 | for (bin_type, bin_map) in &self.bin_maps { 328 | let mut bins = bin_map.iter().collect::>(); 329 | bins.sort_by_key(|(b, _)| OrderedFloat(b.parse::().unwrap())); 330 | 331 | for (bin, stats) in bins { 332 | let per_kbp = |x| (x as f64) / (stats.num_bases() as f64) * 1000.0f64; 333 | let id = stats.identity(); 334 | writeln!( 335 | f, 336 | "{}{},{},{},{},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6}", 337 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""), 338 | bin_type, 339 | bin, 340 | stats.num_reads, 341 | stats.num_bases(), 342 | id, 343 | concordance_qv(id, id != 1.0), 344 | per_kbp(stats.matches), 345 | per_kbp(stats.mismatches), 346 | per_kbp(stats.non_hp_ins), 347 | per_kbp(stats.non_hp_del), 348 | per_kbp(stats.hp_ins), 349 | per_kbp(stats.hp_del), 350 | )?; 351 | } 352 | } 353 | Ok(()) 354 | } 355 | } 356 | 357 | pub struct QualScoreSummary { 358 | name_column: 
Option, 359 | feature_qual: FxHashMap, 360 | } 361 | 362 | impl QualScoreSummary { 363 | pub fn new(mut name_column: Option) -> Self { 364 | if let Some(ref mut name) = name_column { 365 | name.push(','); 366 | } 367 | let mut feature_qual = FxHashMap::default(); 368 | feature_qual.insert("all_alignments".to_owned(), QualScoreStats::default()); 369 | Self { 370 | name_column, 371 | feature_qual, 372 | } 373 | } 374 | 375 | pub fn update(&mut self, aln_stats: &AlnStats) { 376 | if aln_stats.supplementary { 377 | return; 378 | } 379 | 380 | self.feature_qual 381 | .get_mut("all_alignments") 382 | .unwrap() 383 | .assign_add(&aln_stats.q_score_stats); 384 | 385 | for (&k, v) in &aln_stats.feature_stats { 386 | self.feature_qual 387 | .entry(k.to_owned()) 388 | .or_insert_with(|| QualScoreStats::default()) 389 | .assign_add(&v.q_score_stats); 390 | } 391 | } 392 | } 393 | 394 | impl fmt::Display for QualScoreSummary { 395 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 396 | writeln!( 397 | f, 398 | "{}feature,qual_score,empirical_qv", 399 | if self.name_column.is_some() { 400 | "name," 401 | } else { 402 | "" 403 | } 404 | )?; 405 | let mut v = self.feature_qual.iter().collect::>(); 406 | v.sort_by_key(|x| x.0); 407 | for (feature, stats) in v.into_iter() { 408 | for (i, qv) in stats.empirical_qv().into_iter() { 409 | writeln!( 410 | f, 411 | "{}{},{},{:.2}", 412 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""), 413 | feature.trim(), 414 | i, 415 | qv 416 | )?; 417 | } 418 | } 419 | Ok(()) 420 | } 421 | } 422 | --------------------------------------------------------------------------------