├── .gitignore
├── CITATION.cff
├── CONTRIBUTING.md
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── Usage.md
├── pub
├── get_data.sh
├── run.sh
└── run_benchmark.sh
└── src
├── bed.rs
├── intervals.rs
├── main.rs
├── stats.rs
└── summary.rs
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | data
3 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | cff-version: 1.2.0
2 | message: "If you use this software, please cite it as below."
3 | authors:
4 | - family-names: "Liu"
5 | given-names: "Daniel"
6 | orcid: "https://orcid.org/0000-0002-2385-2957"
7 | - family-names: "Belyaeva"
8 | given-names: "Anastasiya"
9 | - family-names: "Shafin"
10 | given-names: "Kishwar"
11 | orcid: "https://orcid.org/0000-0001-5252-3434"
12 | - family-names: "Chang"
13 | given-names: "Pi-Chuan"
14 | orcid: "https://orcid.org/0000-0003-3021-6446"
15 | - family-names: "Carroll"
16 | given-names: "Andrew"
17 | orcid: "https://orcid.org/0000-0002-4824-6689"
18 | - family-names: "Cook"
19 | given-names: "Daniel"
20 | orcid: "https://orcid.org/0000-0003-3347-562X"
21 | title: "Best: A Tool for Characterizing Sequencing Errors"
22 | version: 0.1.0
23 | doi: 10.1101/2022.12.22.521488
24 | date-released: 2022-12-09
25 | url: "https://github.com/google/best"
26 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | We'd love to accept your patches and contributions to this project. There are
4 | just a few small guidelines you need to follow.
5 |
6 | ## Contributor License Agreement
7 |
8 | Contributions to this project must be accompanied by a Contributor License
9 | Agreement (CLA). You (or your employer) retain the copyright to your
10 | contribution; this simply gives us permission to use and redistribute your
11 | contributions as part of the project. Head over to
12 | <https://cla.developers.google.com/> to see your current agreements on file or
13 | to sign a new one.
14 |
15 | You generally only need to submit a CLA once, so if you've already submitted one
16 | (even if it was for a different project), you probably don't need to do it
17 | again.
18 |
19 | ## Code Reviews
20 |
21 | All submissions, including submissions by project members, require review. We
22 | use GitHub pull requests for this purpose. Consult
23 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
24 | information on using pull requests.
25 |
26 | ## Community Guidelines
27 |
28 | This project follows
29 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/).
30 |
--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
1 | # This file is automatically @generated by Cargo.
2 | # It is not intended for manual editing.
3 | version = 3
4 |
5 | [[package]]
6 | name = "adler"
7 | version = "1.0.2"
8 | source = "registry+https://github.com/rust-lang/crates.io-index"
9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
10 |
11 | [[package]]
12 | name = "atty"
13 | version = "0.2.14"
14 | source = "registry+https://github.com/rust-lang/crates.io-index"
15 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
16 | dependencies = [
17 | "hermit-abi",
18 | "libc",
19 | "winapi",
20 | ]
21 |
22 | [[package]]
23 | name = "autocfg"
24 | version = "1.1.0"
25 | source = "registry+https://github.com/rust-lang/crates.io-index"
26 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
27 |
28 | [[package]]
29 | name = "best"
30 | version = "0.1.0"
31 | dependencies = [
32 | "clap",
33 | "flate2",
34 | "fxhash",
35 | "noodles",
36 | "ordered-float",
37 | "rayon",
38 | "rust-lapper",
39 | ]
40 |
41 | [[package]]
42 | name = "bit-vec"
43 | version = "0.6.3"
44 | source = "registry+https://github.com/rust-lang/crates.io-index"
45 | checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb"
46 |
47 | [[package]]
48 | name = "bitflags"
49 | version = "1.3.2"
50 | source = "registry+https://github.com/rust-lang/crates.io-index"
51 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
52 |
53 | [[package]]
54 | name = "byteorder"
55 | version = "1.4.3"
56 | source = "registry+https://github.com/rust-lang/crates.io-index"
57 | checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
58 |
59 | [[package]]
60 | name = "bytes"
61 | version = "1.1.0"
62 | source = "registry+https://github.com/rust-lang/crates.io-index"
63 | checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8"
64 |
65 | [[package]]
66 | name = "cfg-if"
67 | version = "1.0.0"
68 | source = "registry+https://github.com/rust-lang/crates.io-index"
69 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
70 |
71 | [[package]]
72 | name = "clap"
73 | version = "3.2.7"
74 | source = "registry+https://github.com/rust-lang/crates.io-index"
75 | checksum = "5b7b16274bb247b45177db843202209b12191b631a14a9d06e41b3777d6ecf14"
76 | dependencies = [
77 | "atty",
78 | "bitflags",
79 | "clap_derive",
80 | "clap_lex",
81 | "indexmap",
82 | "once_cell",
83 | "strsim",
84 | "termcolor",
85 | "textwrap",
86 | ]
87 |
88 | [[package]]
89 | name = "clap_derive"
90 | version = "3.2.7"
91 | source = "registry+https://github.com/rust-lang/crates.io-index"
92 | checksum = "759bf187376e1afa7b85b959e6a664a3e7a95203415dba952ad19139e798f902"
93 | dependencies = [
94 | "heck",
95 | "proc-macro-error",
96 | "proc-macro2",
97 | "quote",
98 | "syn",
99 | ]
100 |
101 | [[package]]
102 | name = "clap_lex"
103 | version = "0.2.4"
104 | source = "registry+https://github.com/rust-lang/crates.io-index"
105 | checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5"
106 | dependencies = [
107 | "os_str_bytes",
108 | ]
109 |
110 | [[package]]
111 | name = "crc32fast"
112 | version = "1.3.2"
113 | source = "registry+https://github.com/rust-lang/crates.io-index"
114 | checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
115 | dependencies = [
116 | "cfg-if",
117 | ]
118 |
119 | [[package]]
120 | name = "crossbeam-channel"
121 | version = "0.5.6"
122 | source = "registry+https://github.com/rust-lang/crates.io-index"
123 | checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
124 | dependencies = [
125 | "cfg-if",
126 | "crossbeam-utils",
127 | ]
128 |
129 | [[package]]
130 | name = "crossbeam-deque"
131 | version = "0.8.1"
132 | source = "registry+https://github.com/rust-lang/crates.io-index"
133 | checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e"
134 | dependencies = [
135 | "cfg-if",
136 | "crossbeam-epoch",
137 | "crossbeam-utils",
138 | ]
139 |
140 | [[package]]
141 | name = "crossbeam-epoch"
142 | version = "0.9.9"
143 | source = "registry+https://github.com/rust-lang/crates.io-index"
144 | checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d"
145 | dependencies = [
146 | "autocfg",
147 | "cfg-if",
148 | "crossbeam-utils",
149 | "memoffset",
150 | "once_cell",
151 | "scopeguard",
152 | ]
153 |
154 | [[package]]
155 | name = "crossbeam-utils"
156 | version = "0.8.10"
157 | source = "registry+https://github.com/rust-lang/crates.io-index"
158 | checksum = "7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83"
159 | dependencies = [
160 | "cfg-if",
161 | "once_cell",
162 | ]
163 |
164 | [[package]]
165 | name = "either"
166 | version = "1.6.1"
167 | source = "registry+https://github.com/rust-lang/crates.io-index"
168 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
169 |
170 | [[package]]
171 | name = "flate2"
172 | version = "1.0.24"
173 | source = "registry+https://github.com/rust-lang/crates.io-index"
174 | checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6"
175 | dependencies = [
176 | "crc32fast",
177 | "miniz_oxide",
178 | ]
179 |
180 | [[package]]
181 | name = "fxhash"
182 | version = "0.2.1"
183 | source = "registry+https://github.com/rust-lang/crates.io-index"
184 | checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
185 | dependencies = [
186 | "byteorder",
187 | ]
188 |
189 | [[package]]
190 | name = "hashbrown"
191 | version = "0.12.1"
192 | source = "registry+https://github.com/rust-lang/crates.io-index"
193 | checksum = "db0d4cf898abf0081f964436dc980e96670a0f36863e4b83aaacdb65c9d7ccc3"
194 |
195 | [[package]]
196 | name = "heck"
197 | version = "0.4.0"
198 | source = "registry+https://github.com/rust-lang/crates.io-index"
199 | checksum = "2540771e65fc8cb83cd6e8a237f70c319bd5c29f78ed1084ba5d50eeac86f7f9"
200 |
201 | [[package]]
202 | name = "hermit-abi"
203 | version = "0.1.19"
204 | source = "registry+https://github.com/rust-lang/crates.io-index"
205 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
206 | dependencies = [
207 | "libc",
208 | ]
209 |
210 | [[package]]
211 | name = "indexmap"
212 | version = "1.9.1"
213 | source = "registry+https://github.com/rust-lang/crates.io-index"
214 | checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
215 | dependencies = [
216 | "autocfg",
217 | "hashbrown",
218 | ]
219 |
220 | [[package]]
221 | name = "lexical-core"
222 | version = "0.8.5"
223 | source = "registry+https://github.com/rust-lang/crates.io-index"
224 | checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46"
225 | dependencies = [
226 | "lexical-parse-float",
227 | "lexical-parse-integer",
228 | "lexical-util",
229 | "lexical-write-float",
230 | "lexical-write-integer",
231 | ]
232 |
233 | [[package]]
234 | name = "lexical-parse-float"
235 | version = "0.8.5"
236 | source = "registry+https://github.com/rust-lang/crates.io-index"
237 | checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f"
238 | dependencies = [
239 | "lexical-parse-integer",
240 | "lexical-util",
241 | "static_assertions",
242 | ]
243 |
244 | [[package]]
245 | name = "lexical-parse-integer"
246 | version = "0.8.6"
247 | source = "registry+https://github.com/rust-lang/crates.io-index"
248 | checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9"
249 | dependencies = [
250 | "lexical-util",
251 | "static_assertions",
252 | ]
253 |
254 | [[package]]
255 | name = "lexical-util"
256 | version = "0.8.5"
257 | source = "registry+https://github.com/rust-lang/crates.io-index"
258 | checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc"
259 | dependencies = [
260 | "static_assertions",
261 | ]
262 |
263 | [[package]]
264 | name = "lexical-write-float"
265 | version = "0.8.5"
266 | source = "registry+https://github.com/rust-lang/crates.io-index"
267 | checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862"
268 | dependencies = [
269 | "lexical-util",
270 | "lexical-write-integer",
271 | "static_assertions",
272 | ]
273 |
274 | [[package]]
275 | name = "lexical-write-integer"
276 | version = "0.8.5"
277 | source = "registry+https://github.com/rust-lang/crates.io-index"
278 | checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446"
279 | dependencies = [
280 | "lexical-util",
281 | "static_assertions",
282 | ]
283 |
284 | [[package]]
285 | name = "libc"
286 | version = "0.2.126"
287 | source = "registry+https://github.com/rust-lang/crates.io-index"
288 | checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836"
289 |
290 | [[package]]
291 | name = "memchr"
292 | version = "2.5.0"
293 | source = "registry+https://github.com/rust-lang/crates.io-index"
294 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
295 |
296 | [[package]]
297 | name = "memoffset"
298 | version = "0.6.5"
299 | source = "registry+https://github.com/rust-lang/crates.io-index"
300 | checksum = "5aa361d4faea93603064a027415f07bd8e1d5c88c9fbf68bf56a285428fd79ce"
301 | dependencies = [
302 | "autocfg",
303 | ]
304 |
305 | [[package]]
306 | name = "miniz_oxide"
307 | version = "0.5.3"
308 | source = "registry+https://github.com/rust-lang/crates.io-index"
309 | checksum = "6f5c75688da582b8ffc1f1799e9db273f32133c49e048f614d22ec3256773ccc"
310 | dependencies = [
311 | "adler",
312 | ]
313 |
314 | [[package]]
315 | name = "noodles"
316 | version = "0.26.0"
317 | source = "registry+https://github.com/rust-lang/crates.io-index"
318 | checksum = "dbb261d074f9ca401f21cc62b1a1c7446d63d832c2b80774a3c529f057cfc51a"
319 | dependencies = [
320 | "noodles-bam",
321 | "noodles-bed",
322 | "noodles-core",
323 | "noodles-fasta",
324 | "noodles-sam",
325 | ]
326 |
327 | [[package]]
328 | name = "noodles-bam"
329 | version = "0.21.0"
330 | source = "registry+https://github.com/rust-lang/crates.io-index"
331 | checksum = "52fe19f088bbeb51025c7911802ff2eb00c3994fee9e0ace892cb26bd19c42d5"
332 | dependencies = [
333 | "bit-vec",
334 | "byteorder",
335 | "bytes",
336 | "noodles-bgzf",
337 | "noodles-core",
338 | "noodles-csi",
339 | "noodles-fasta",
340 | "noodles-sam",
341 | ]
342 |
343 | [[package]]
344 | name = "noodles-bed"
345 | version = "0.4.0"
346 | source = "registry+https://github.com/rust-lang/crates.io-index"
347 | checksum = "7468b7ffe8194806d9364d2dc4e04d431aa1d60dbab6ce35ea3af470831762b5"
348 | dependencies = [
349 | "noodles-core",
350 | ]
351 |
352 | [[package]]
353 | name = "noodles-bgzf"
354 | version = "0.14.0"
355 | source = "registry+https://github.com/rust-lang/crates.io-index"
356 | checksum = "36ebbec47ff8c1e931f5da61e077a7774bbb2fc43158d72f334a51dae20dcc4b"
357 | dependencies = [
358 | "byteorder",
359 | "bytes",
360 | "crossbeam-channel",
361 | "flate2",
362 | ]
363 |
364 | [[package]]
365 | name = "noodles-core"
366 | version = "0.8.0"
367 | source = "registry+https://github.com/rust-lang/crates.io-index"
368 | checksum = "ba58b998fe20d6e7f0c67386c1901daaf8fd2d2c4e7cfb5cf5f47ed22d96a8c8"
369 |
370 | [[package]]
371 | name = "noodles-csi"
372 | version = "0.9.0"
373 | source = "registry+https://github.com/rust-lang/crates.io-index"
374 | checksum = "ccb4bac55bfc031dd493f974ea1ee12e9b85e33cc17a63e6273b04551ca8d99e"
375 | dependencies = [
376 | "bit-vec",
377 | "byteorder",
378 | "noodles-bgzf",
379 | "noodles-core",
380 | ]
381 |
382 | [[package]]
383 | name = "noodles-fasta"
384 | version = "0.13.0"
385 | source = "registry+https://github.com/rust-lang/crates.io-index"
386 | checksum = "12d5cfdf22a0869b7fcfc3cf6a9c2ad734f3284f090233ffadd59f610d4878b0"
387 | dependencies = [
388 | "bytes",
389 | "memchr",
390 | "noodles-bgzf",
391 | "noodles-core",
392 | ]
393 |
394 | [[package]]
395 | name = "noodles-sam"
396 | version = "0.18.0"
397 | source = "registry+https://github.com/rust-lang/crates.io-index"
398 | checksum = "aab026833fde10c98fb822bff2bfd3950829e78d446ddb7fb1ad0e11710a4568"
399 | dependencies = [
400 | "bitflags",
401 | "indexmap",
402 | "lexical-core",
403 | "memchr",
404 | "noodles-bgzf",
405 | "noodles-core",
406 | "noodles-csi",
407 | "noodles-fasta",
408 | "rustc-hash",
409 | ]
410 |
411 | [[package]]
412 | name = "num-traits"
413 | version = "0.2.15"
414 | source = "registry+https://github.com/rust-lang/crates.io-index"
415 | checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
416 | dependencies = [
417 | "autocfg",
418 | ]
419 |
420 | [[package]]
421 | name = "num_cpus"
422 | version = "1.13.1"
423 | source = "registry+https://github.com/rust-lang/crates.io-index"
424 | checksum = "19e64526ebdee182341572e50e9ad03965aa510cd94427a4549448f285e957a1"
425 | dependencies = [
426 | "hermit-abi",
427 | "libc",
428 | ]
429 |
430 | [[package]]
431 | name = "once_cell"
432 | version = "1.12.0"
433 | source = "registry+https://github.com/rust-lang/crates.io-index"
434 | checksum = "7709cef83f0c1f58f666e746a08b21e0085f7440fa6a29cc194d68aac97a4225"
435 |
436 | [[package]]
437 | name = "ordered-float"
438 | version = "3.1.0"
439 | source = "registry+https://github.com/rust-lang/crates.io-index"
440 | checksum = "98ffdb14730ed2ef599c65810c15b000896e21e8776b512de0db0c3d7335cc2a"
441 | dependencies = [
442 | "num-traits",
443 | ]
444 |
445 | [[package]]
446 | name = "os_str_bytes"
447 | version = "6.1.0"
448 | source = "registry+https://github.com/rust-lang/crates.io-index"
449 | checksum = "21326818e99cfe6ce1e524c2a805c189a99b5ae555a35d19f9a284b427d86afa"
450 |
451 | [[package]]
452 | name = "proc-macro-error"
453 | version = "1.0.4"
454 | source = "registry+https://github.com/rust-lang/crates.io-index"
455 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
456 | dependencies = [
457 | "proc-macro-error-attr",
458 | "proc-macro2",
459 | "quote",
460 | "syn",
461 | "version_check",
462 | ]
463 |
464 | [[package]]
465 | name = "proc-macro-error-attr"
466 | version = "1.0.4"
467 | source = "registry+https://github.com/rust-lang/crates.io-index"
468 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
469 | dependencies = [
470 | "proc-macro2",
471 | "quote",
472 | "version_check",
473 | ]
474 |
475 | [[package]]
476 | name = "proc-macro2"
477 | version = "1.0.40"
478 | source = "registry+https://github.com/rust-lang/crates.io-index"
479 | checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7"
480 | dependencies = [
481 | "unicode-ident",
482 | ]
483 |
484 | [[package]]
485 | name = "quote"
486 | version = "1.0.20"
487 | source = "registry+https://github.com/rust-lang/crates.io-index"
488 | checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804"
489 | dependencies = [
490 | "proc-macro2",
491 | ]
492 |
493 | [[package]]
494 | name = "rayon"
495 | version = "1.5.3"
496 | source = "registry+https://github.com/rust-lang/crates.io-index"
497 | checksum = "bd99e5772ead8baa5215278c9b15bf92087709e9c1b2d1f97cdb5a183c933a7d"
498 | dependencies = [
499 | "autocfg",
500 | "crossbeam-deque",
501 | "either",
502 | "rayon-core",
503 | ]
504 |
505 | [[package]]
506 | name = "rayon-core"
507 | version = "1.9.3"
508 | source = "registry+https://github.com/rust-lang/crates.io-index"
509 | checksum = "258bcdb5ac6dad48491bb2992db6b7cf74878b0384908af124823d118c99683f"
510 | dependencies = [
511 | "crossbeam-channel",
512 | "crossbeam-deque",
513 | "crossbeam-utils",
514 | "num_cpus",
515 | ]
516 |
517 | [[package]]
518 | name = "rust-lapper"
519 | version = "1.0.1"
520 | source = "registry+https://github.com/rust-lang/crates.io-index"
521 | checksum = "c0da7f82898b906bf29d705adb2c36a461c9cc9e33c1417041c86229d569c144"
522 | dependencies = [
523 | "num-traits",
524 | ]
525 |
526 | [[package]]
527 | name = "rustc-hash"
528 | version = "1.1.0"
529 | source = "registry+https://github.com/rust-lang/crates.io-index"
530 | checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
531 |
532 | [[package]]
533 | name = "scopeguard"
534 | version = "1.1.0"
535 | source = "registry+https://github.com/rust-lang/crates.io-index"
536 | checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
537 |
538 | [[package]]
539 | name = "static_assertions"
540 | version = "1.1.0"
541 | source = "registry+https://github.com/rust-lang/crates.io-index"
542 | checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
543 |
544 | [[package]]
545 | name = "strsim"
546 | version = "0.10.0"
547 | source = "registry+https://github.com/rust-lang/crates.io-index"
548 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
549 |
550 | [[package]]
551 | name = "syn"
552 | version = "1.0.98"
553 | source = "registry+https://github.com/rust-lang/crates.io-index"
554 | checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd"
555 | dependencies = [
556 | "proc-macro2",
557 | "quote",
558 | "unicode-ident",
559 | ]
560 |
561 | [[package]]
562 | name = "termcolor"
563 | version = "1.1.3"
564 | source = "registry+https://github.com/rust-lang/crates.io-index"
565 | checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755"
566 | dependencies = [
567 | "winapi-util",
568 | ]
569 |
570 | [[package]]
571 | name = "textwrap"
572 | version = "0.15.0"
573 | source = "registry+https://github.com/rust-lang/crates.io-index"
574 | checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"
575 |
576 | [[package]]
577 | name = "unicode-ident"
578 | version = "1.0.1"
579 | source = "registry+https://github.com/rust-lang/crates.io-index"
580 | checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c"
581 |
582 | [[package]]
583 | name = "version_check"
584 | version = "0.9.4"
585 | source = "registry+https://github.com/rust-lang/crates.io-index"
586 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
587 |
588 | [[package]]
589 | name = "winapi"
590 | version = "0.3.9"
591 | source = "registry+https://github.com/rust-lang/crates.io-index"
592 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
593 | dependencies = [
594 | "winapi-i686-pc-windows-gnu",
595 | "winapi-x86_64-pc-windows-gnu",
596 | ]
597 |
598 | [[package]]
599 | name = "winapi-i686-pc-windows-gnu"
600 | version = "0.4.0"
601 | source = "registry+https://github.com/rust-lang/crates.io-index"
602 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
603 |
604 | [[package]]
605 | name = "winapi-util"
606 | version = "0.1.5"
607 | source = "registry+https://github.com/rust-lang/crates.io-index"
608 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
609 | dependencies = [
610 | "winapi",
611 | ]
612 |
613 | [[package]]
614 | name = "winapi-x86_64-pc-windows-gnu"
615 | version = "0.4.0"
616 | source = "registry+https://github.com/rust-lang/crates.io-index"
617 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
618 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "best"
3 | authors = ["Daniel Liu", "Daniel E. Cook"]
4 | version = "0.1.0"
5 | edition = "2021"
6 | description = "Bam Error Stats Tool (best): analysis of error types in aligned reads."
7 | license = "MIT"
8 |
9 | [dependencies]
10 | clap = { version = "^3.2", features = ["derive"] }
11 | rayon = "^1.5"
12 | noodles = { version = "^0.26", features = ["sam", "bam", "fasta", "bed", "core"] }
13 | fxhash = "^0.2"
14 | rust-lapper = "^1.0"
15 | ordered-float = "^3.1"
16 | flate2 = "^1.0"
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2022 Google LLC.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 |
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # best
2 | Bam Error Stats Tool (best): analysis of error types in aligned reads.
3 |
4 | `best` is used to assess the quality of reads after aligning them to a
5 | reference assembly.
6 |
7 | ## Features
8 | * Collect overall and per alignment stats
9 | * Distribution of indel lengths
10 | * Yield at different empirical Q-value thresholds
11 | * Bin per read stats to easily examine the distribution of errors for certain
12 | types of reads
13 | * Stats for regions specified by intervals (BED file, homopolymer regions,
14 | windows etc.)
15 | * Stats for quality scores vs empirical Q-values
16 | * Multithreading for speed
17 |
18 | ## Usage
19 | The [`best` Usage Guide](Usage.md) gives an overview of how to use `best`.
20 |
21 | ## Installing
22 | 1. Install [Rust](https://www.rust-lang.org/tools/install).
23 | 2. Clone this repository and navigate into the directory of this repository.
24 | 3. Run `cargo install --locked --path .`
25 | 4. Run `best input.bam reference.fasta prefix/path`
26 |
27 | This will generate stats files with the `prefix/path` prefix.
28 |
29 | ## Development
30 | ### Running
31 | 1. Install [Rust](https://www.rust-lang.org/tools/install).
32 | 2. Clone this repository and navigate into the directory of this repository.
33 | 3. Run `cargo build --release`
34 | 4. Run `cargo run --release -- input.bam reference.fasta prefix/path` or
35 | `target/release/best input.bam reference.fasta prefix/path`
36 |
37 | This will generate stats files with the `prefix/path` prefix.
38 |
39 | The built binary is located at `target/release/best`.
40 |
41 | ### Formatting
42 | ```
43 | cargo fmt
44 | ```
45 |
46 | ### Comparing
47 | Remember to pass the `-t 1` option to ensure that only one thread is used for
48 | testing. Best generally tries to ensure the order of outputs is deterministic
49 | with multiple threads, but the order of per-alignment stats is arbitrary unless
50 | only one thread is used.
51 |
52 | ### Disclaimer
53 |
54 | This is not an official Google product.
55 |
56 | The code is not intended for use in any clinical settings. It is not intended to be a medical device and is not intended for clinical use of any kind, including but not limited to diagnosis or prognosis.
57 |
58 | No representations or warranties are made with regards to the accuracy of results generated. User or licensee is responsible for verifying and validating accuracy when using this tool.
59 |
--------------------------------------------------------------------------------
/Usage.md:
--------------------------------------------------------------------------------
1 | # `best` Usage Guide
2 |
3 | This guide will give a general overview of how to use `best`.
4 |
5 | ## Alignment Settings
6 | The alignment `bam` file must contain the read sequences and quality scores.
7 | The alignment CIGAR string can either use `M` or `=`/`X` for matches/mismatches.
8 |
9 | ## Example Analysis
10 | Let's say we have aligned reads in `aln.bam` and the reference assembly
11 | `ref.fasta.gz`. We want to collect statistics on the types of
12 | errors that occur in the alignments. Then, we can run `best` like
13 | ```
14 | best -t 4 aln.bam ref.fasta.gz output
15 | ```
16 | This will use 4 threads to collect stats and generate the following files:
17 | ```
18 | output.per_aln_stats.csv.gz
19 | output.summary_cigar_stats.csv
20 | output.summary_identity_stats.csv
21 | output.summary_qual_score_stats.csv
22 | output.summary_yield_stats.csv
23 | ```
24 | The per-alignment stats file is gzipped to save space. The
25 | `output.summary_cigar_stats.csv` file contains the distribution of lengths of
26 | consecutive insertions and deletions. The `output.summary_identity_stats.csv`
27 | file contains some general stats on the error rates across all alignments.
28 | The `output.summary_qual_score_stats.csv` file contains the empirical Q-value
29 | calculated from matches and mismatches for each corresponding quality score.
30 | The `all_alignments` feature indicates the quality score stats across all
31 | alignments. If intervals are specified, then this is computed per interval feature.
32 | The `output.summary_yield_stats.csv` contains the yield (number of reads/bases)
33 | above certain quality thresholds.
34 |
35 | We can collect even more data in one run of `best`. If we run
36 | ```
37 | best -t 4 --intervals-hp --bin-types q_len:1000 gc_content:0.05 -- aln.bam ref.fasta.gz output
38 | ```
39 | then we will get two more files:
40 | ```
41 | output.summary_feature_stats.csv
42 | output.summary_bin_stats.csv
43 | ```
44 | The `output.summary_feature_stats.csv` file contains stats stratified by
45 | intervals. In this case, we use `--intervals-hp` to indicate that the intervals
46 | are homopolymer regions, so this will produce the error types at homopolymer
47 | regions of different lengths. The `output.summary_bin_stats.csv` file will
48 | contain the error types for reads binned by both the read (query) length (`q_len`,
49 | bin by increments of 1000bp) and the GC content (`gc_content`, bin by increments
50 | of 0.05).
51 |
52 | It is also possible to use bed files as custom intervals. These files can have
53 | three columns (all intervals will have the same feature) or four columns, where
54 | the last column indicates the feature. The feature stats are aggregated across
55 | all bed intervals with the same feature.
56 |
57 | ## Help Message:
58 | ```
59 | best 0.1.0
60 | Daniel Liu, Daniel E. Cook
61 | Bam Error Stats Tool (best): analysis of error types in aligned reads.
62 |
63 | USAGE:
64 | best [OPTIONS] <INPUT> <REFERENCE> <STATS_PREFIX>
65 |
66 | ARGS:
67 | <INPUT>
68 | Input BAM file
69 |
70 | <REFERENCE>
71 | Input reference FASTA file. Can be gzipped
72 |
73 | <STATS_PREFIX>
74 | Prefix for output files that contain statistics
75 |
76 | OPTIONS:
77 | -b, --bin-types <BIN_TYPES>...
78 | Types of bins to use for per alignment stats.
79 |
80 | Each bin should be of the format <bin_type>:<step_size>.
81 |
82 | Supported bin types: q_len (read sequence length), subread_passes, mapq, mean_qual,
83 | gc_content, concordance_qv (phred scale Q-value)
84 |
85 | -h, --help
86 | Print help information
87 |
88 | --intervals-bed <INTERVALS_BED>...
89 | Use intervals from a BED file.
90 |
91 | The BED file should have the columns chrom, start, stop, and feature. The feature column
92 | is optional.
93 |
94 | This allows stats to be gathered separately for different types of intervals. Note that
95 | all intervals are on the reference, not the reads.
96 |
97 | --intervals-border <INTERVALS_BORDER>...
98 | Use fixed-width nonoverlapping window border regions as intervals.
99 |
100 | This is used to specify the window widths.
101 |
102 | --intervals-hp
103 | Use homopolymer regions in the reference as intervals
104 |
105 | --intervals-match <INTERVALS_MATCH>...
106 | Use regions in the reference that match any of the specified subsequences as intervals
107 |
108 | --intervals-window <INTERVALS_WINDOW>...
109 | Use fixed-width nonoverlapping windows as intervals.
110 |
111 | This is used to specify the window widths.
112 |
113 | --intervals-window-pos <INTERVALS_WINDOW_POS>...
114 | Use fixed-width nonoverlapping windows with positions as intervals.
115 |
116 | This is used to specify the window widths.
117 |
118 | -n, --name-column <NAME_COLUMN>
119 | Add column with a specific name in CSV outputs
120 |
121 | --no-per-aln-stats
122 | Turn off outputting per alignment stats
123 |
124 | -t, --threads <THREADS>
125 | Number of threads. Will be automatically determined if this is set to 0
126 |
127 | [default: 0]
128 |
129 | -V, --version
130 | Print version information
131 | ```
132 |
--------------------------------------------------------------------------------
/pub/get_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright (c) 2022 Google LLC
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
5 | # this software and associated documentation files (the "Software"), to deal in
6 | # the Software without restriction, including without limitation the rights to
7 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8 | # the Software, and to permit persons to whom the Software is furnished to do so,
9 | # subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in all
12 | # copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 |
21 |
#===================#
# Download Datasets #
#===================#

# Abort on the first failed command, unset variable, or failed pipeline stage,
# so an interrupted download never leaves a silently truncated BAM behind.
set -euo pipefail

# -p keeps the script re-runnable when the directories already exist.
mkdir -p reference illumina ont pacbio

# Download the CHM13 v1.0 draft
wget -O reference/chm13.draft_v1.0.fasta.gz https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chm13.draft_v1.0.fasta.gz

# Stream a remote BAM through samtools and keep only a subsample of it.
#   $1 - value passed to `samtools view -s`
#   $2 - output directory
#   $3 - URL of the BAM file; its basename is reused as the output file name
function download_bam {
  local subsample=${1}
  local directory=${2}
  local url=${3}
  # --fail makes curl exit non-zero on an HTTP error instead of piping the
  # error page into samtools.
  curl --fail "${url}" | \
    samtools view -s "${subsample}" -bh > "${directory}/$(basename "${url}")"
}

# Illumina
download_bam 0.1 illumina https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/alignments/chm13.draft_v1.0.pcrfree.bam

# ONT
download_bam 0.1 ont https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/alignments/chm13.draft_v1.0.ont_guppy_3.6.0.wm_2.01.pri.bam

# PacBio HiFi
download_bam 0.1 pacbio https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/alignments/chm13.draft_v1.0.hifi_20k.wm_2.01.pri.bam
47 |
--------------------------------------------------------------------------------
/pub/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright (c) 2022 Google LLC
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
5 | # this software and associated documentation files (the "Software"), to deal in
6 | # the Software without restriction, including without limitation the rights to
7 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8 | # the Software, and to permit persons to whom the Software is furnished to do so,
9 | # subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in all
12 | # copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 |
21 |
#==========#
# Run Best #
#==========#

REFERENCE=reference/chm13.draft_v1.0.fasta.gz
# Options shared by every run. An array keeps each option a separate word
# without relying on unquoted word splitting.
ARGS=(-t 4 --intervals-hp --bin-types gc_content:0.05)

# Each data set uses its own q_len (read sequence length) bin width. The `--`
# separates the multi-value --bin-types option from the positional arguments.
best "${ARGS[@]}" --bin-types q_len:10 -- illumina/chm13.draft_v1.0.pcrfree.bam "${REFERENCE}" illumina/illumina
best "${ARGS[@]}" --bin-types q_len:10000 -- ont/chm13.draft_v1.0.ont_guppy_3.6.0.wm_2.01.pri.bam "${REFERENCE}" ont/ont
best "${ARGS[@]}" --bin-types q_len:1000 -- pacbio/chm13.draft_v1.0.hifi_20k.wm_2.01.pri.bam "${REFERENCE}" pacbio/pacbio
32 |
--------------------------------------------------------------------------------
/pub/run_benchmark.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright (c) 2022 Google LLC
3 | #
4 | # Permission is hereby granted, free of charge, to any person obtaining a copy of
5 | # this software and associated documentation files (the "Software"), to deal in
6 | # the Software without restriction, including without limitation the rights to
7 | # use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8 | # the Software, and to permit persons to whom the Software is furnished to do so,
9 | # subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in all
12 | # copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16 | # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17 | # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18 | # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 |
21 |
# NOTE(review): this is the uncompressed FASTA, whereas get_data.sh downloads
# chm13.draft_v1.0.fasta.gz — presumably it has to be decompressed first.
REFERENCE=reference/chm13.draft_v1.0.fasta
BAM_FILE=pacbio/chm13.draft_v1.0.hifi_20k.wm_2.01.pri.bam

# The tilde is expanded at assignment time, so the quoted uses below are safe.
BAM_CONCORDANCE=~/hg002-ccs/concordance/bamConcordance
echo "bamConcordance"
time "${BAM_CONCORDANCE}" "${REFERENCE}" "${BAM_FILE}" pacbio/bamConcordance.csv

#echo "pomoxis"
#time assess_homopolymers count ${BAM_FILE} -o pacbio/pomoxis -t 4

echo "best 1 thread"
time best -t 1 "${BAM_FILE}" "${REFERENCE}" pacbio/best_timing

echo "best 4 thread"
time best -t 4 "${BAM_FILE}" "${REFERENCE}" pacbio/best_timing
37 |
--------------------------------------------------------------------------------
/src/bed.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Google LLC
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | // this software and associated documentation files (the "Software"), to deal in
5 | // the Software without restriction, including without limitation the rights to
6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | // the Software, and to permit persons to whom the Software is furnished to do so,
8 | // subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in all
11 | // copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 |
20 | use std::fs::File;
21 | use std::io::prelude::*;
22 | use std::io::BufReader;
23 |
24 | use fxhash::{FxHashMap, FxHashSet};
25 |
26 | use rust_lapper::{Interval, Lapper};
27 |
28 | pub type FeatureInterval = Interval;
29 |
30 | pub struct Intervals {
31 | intervals: FxHashMap>,
32 | pub features: FxHashSet,
33 | }
34 |
35 | impl Intervals {
36 | /// Create a new collection of intervals from a BED file.
37 | pub fn new(bed_path: &str) -> Self {
38 | let mut intervals = FxHashMap::default();
39 | let mut features = FxHashSet::default();
40 | let reader = BufReader::new(File::open(bed_path).expect("BED file not found."));
41 |
42 | for line in reader.lines() {
43 | let line = line.unwrap();
44 | let mut fields = line.split('\t');
45 | let chrom = fields.next().unwrap().to_owned();
46 | // convert to 1-indexed [start, stop)
47 | let start = fields.next().unwrap().parse::().unwrap() + 1;
48 | let stop = fields.next().unwrap().parse::().unwrap() + 1;
49 | let feature = fields.next().unwrap_or("none").to_owned();
50 |
51 | intervals
52 | .entry(chrom)
53 | .or_insert_with(|| Vec::::new())
54 | .push(Interval {
55 | start,
56 | stop,
57 | val: feature.clone(),
58 | });
59 | features.insert(feature);
60 | }
61 |
62 | Self {
63 | intervals: intervals
64 | .into_iter()
65 | .map(|(k, v)| (k, Lapper::new(v)))
66 | .collect(),
67 | features,
68 | }
69 | }
70 |
71 | /// Find intervals that intersect a given interval.
72 | pub fn find(&self, chrom: &str, start: usize, end: usize) -> Vec<&FeatureInterval> {
73 | self.intervals
74 | .get(chrom)
75 | .map(|x| x.find(start, end).collect())
76 | .unwrap_or_else(|| Vec::new())
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/intervals.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Google LLC
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | // this software and associated documentation files (the "Software"), to deal in
5 | // the Software without restriction, including without limitation the rights to
6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | // the Software, and to permit persons to whom the Software is furnished to do so,
8 | // subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in all
11 | // copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 |
20 | use noodles::core::Position;
21 | use noodles::fasta;
22 |
23 | use crate::bed::FeatureInterval;
24 |
/// Lookup table from an uppercase ASCII base to its complement.
///
/// `A`/`T` and `C`/`G` are swapped and `N` maps to itself; every other entry
/// is 0.
static COMPLEMENT: [u8; 128] = {
    const PAIRS: [(u8, u8); 5] = [
        (b'A', b'T'),
        (b'T', b'A'),
        (b'C', b'G'),
        (b'G', b'C'),
        (b'N', b'N'),
    ];

    let mut table = [0u8; 128];
    let mut k = 0;
    while k < PAIRS.len() {
        table[PAIRS[k].0 as usize] = PAIRS[k].1;
        k += 1;
    }
    table
};
34 |
35 | /// Find homopolymers in a sequence to use as intervals.
36 | pub fn find_homopolymers(
37 | seq: &fasta::record::Sequence,
38 | start: usize,
39 | end: usize,
40 | strand_rev: bool,
41 | ) -> Vec {
42 | let mut res = Vec::new();
43 | let mut hp_len = 0;
44 | let mut prev = b'?';
45 |
46 | for i in start..=end {
47 | let curr = seq
48 | .get(Position::new(i).unwrap())
49 | .unwrap()
50 | .to_ascii_uppercase();
51 |
52 | if curr == prev && i != end {
53 | hp_len += 1;
54 | continue;
55 | }
56 |
57 | if hp_len > 1 {
58 | let c = if strand_rev {
59 | COMPLEMENT[prev as usize]
60 | } else {
61 | prev
62 | };
63 | res.push(FeatureInterval {
64 | start: i - hp_len,
65 | stop: i,
66 | val: format!("{: >5}{}", hp_len, c as char),
67 | });
68 | }
69 | hp_len = 1;
70 | prev = curr;
71 | }
72 |
73 | res
74 | }
75 |
76 | /// Get fixed-length windows as intervals.
77 | pub fn get_windows(
78 | start: usize,
79 | end: usize,
80 | win_len: usize,
81 | pos: bool,
82 | strand_rev: bool,
83 | ) -> Vec {
84 | let mut res = Vec::new();
85 |
86 | for i in (start..end).step_by(win_len) {
87 | let lo;
88 | let hi;
89 | if strand_rev {
90 | hi = end - (i - start);
91 | lo = hi.saturating_sub(win_len).max(start);
92 | } else {
93 | lo = i;
94 | hi = (i + win_len).min(end);
95 | };
96 |
97 | if pos {
98 | res.push(FeatureInterval {
99 | start: lo,
100 | stop: hi,
101 | val: format!("window_{}_pos_{}", win_len, i - start),
102 | });
103 | } else {
104 | res.push(FeatureInterval {
105 | start: lo,
106 | stop: hi,
107 | val: format!("window_{}", win_len),
108 | });
109 | }
110 | }
111 |
112 | res
113 | }
114 |
115 | const BORDER_CONTEXT: usize = 1;
116 |
117 | /// Get small intervals that represent the region near fixed-width window borders.
118 | pub fn get_borders(
119 | start: usize,
120 | end: usize,
121 | win_len: usize,
122 | strand_rev: bool,
123 | ) -> Vec {
124 | let mut res = Vec::new();
125 |
126 | for i in (start..end).step_by(win_len).skip(1) {
127 | let idx;
128 | if strand_rev {
129 | idx = end - (i - start);
130 | } else {
131 | idx = i;
132 | };
133 |
134 | res.push(FeatureInterval {
135 | start: (idx - 1 - BORDER_CONTEXT).max(start),
136 | stop: (idx + BORDER_CONTEXT + 1).min(end),
137 | val: format!("border_{}", win_len),
138 | });
139 | }
140 |
141 | res
142 | }
143 |
144 | /// Get regions that match a sequence as intervals.
145 | pub fn get_matches(
146 | seq: &fasta::record::Sequence,
147 | start: usize,
148 | end: usize,
149 | s: &str,
150 | strand_rev: bool,
151 | ) -> Vec {
152 | let mut res = Vec::new();
153 |
154 | for i in start..end {
155 | // convert to zero-indexed
156 | let seq_iter = seq.as_ref()[i - 1..(i - 1 + s.len()).min(end - 1)]
157 | .iter()
158 | .map(|c| c.to_ascii_uppercase());
159 | let is_match = if strand_rev {
160 | seq_iter.eq(s.bytes().rev().map(|c| COMPLEMENT[c as usize]))
161 | } else {
162 | seq_iter.eq(s.bytes())
163 | };
164 | if is_match {
165 | res.push(FeatureInterval {
166 | start: i,
167 | stop: i + s.len(),
168 | val: s.to_owned(),
169 | });
170 | }
171 | }
172 |
173 | res
174 | }
175 |
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Google LLC
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | // this software and associated documentation files (the "Software"), to deal in
5 | // the Software without restriction, including without limitation the rights to
6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | // the Software, and to permit persons to whom the Software is furnished to do so,
8 | // subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in all
11 | // copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 |
20 | use clap::Parser;
21 |
22 | use rayon::prelude::*;
23 |
24 | use noodles::{bam, fasta, sam};
25 |
26 | use fxhash::FxHashMap;
27 |
28 | use flate2::read::MultiGzDecoder;
29 | use flate2::write::GzEncoder;
30 |
31 | use std::fs::File;
32 | use std::io::{BufReader, BufWriter, Read, Write};
33 | use std::str::FromStr;
34 | use std::sync::atomic::{AtomicUsize, Ordering};
35 | use std::sync::Mutex;
36 | use std::time::Instant;
37 |
38 | mod stats;
39 | use stats::*;
40 | mod summary;
41 | use summary::*;
42 | mod bed;
43 | use bed::*;
44 | mod intervals;
45 | use intervals::*;
46 |
47 | const PER_ALN_STATS_NAME: &str = "per_aln_stats.csv.gz";
48 | const YIELD_STATS_NAME: &str = "summary_yield_stats.csv";
49 | const IDENTITY_STATS_NAME: &str = "summary_identity_stats.csv";
50 | const FEATURE_STATS_NAME: &str = "summary_feature_stats.csv";
51 | const CIGAR_STATS_NAME: &str = "summary_cigar_stats.csv";
52 | const BIN_STATS_NAME: &str = "summary_bin_stats.csv";
53 | const QUAL_SCORE_STATS_NAME: &str = "summary_qual_score_stats.csv";
54 |
55 | fn run(
56 | input_path: String,
57 | reference_path: String,
58 | stats_prefix: String,
59 | bin_types: Option>,
60 | intervals_types: Vec,
61 | name_column: Option,
62 | output_per_aln_stats: bool,
63 | ) {
64 | // read reference sequences from fasta file
65 | let mut ref_reader = {
66 | let f = File::open(&reference_path).unwrap();
67 | let r: Box = if reference_path.ends_with(".gz") {
68 | Box::new(MultiGzDecoder::new(f))
69 | } else {
70 | Box::new(f)
71 | };
72 | fasta::Reader::new(BufReader::new(r))
73 | };
74 | let reference_seqs: FxHashMap = ref_reader
75 | .records()
76 | .map(|r| r.unwrap())
77 | .map(|r| (r.name().to_string(), r))
78 | .collect();
79 |
80 | // read bam file
81 | let mut reader = bam::Reader::new(File::open(input_path).unwrap());
82 | reader.read_header().unwrap();
83 | let references = reader.read_reference_sequences().unwrap();
84 |
85 | // create per alignment stats writer that is shared between threads
86 | let aln_stats_path = format!("{}.{}", stats_prefix, PER_ALN_STATS_NAME);
87 | let aln_stats_writer = if output_per_aln_stats {
88 | let mut w = GzEncoder::new(
89 | BufWriter::new(File::create(&aln_stats_path).unwrap()),
90 | flate2::Compression::default(),
91 | );
92 | write!(
93 | w,
94 | "{}{}\n",
95 | if name_column.is_some() { "name," } else { "" },
96 | AlnStats::header()
97 | )
98 | .unwrap();
99 | Some(Mutex::new(w))
100 | } else {
101 | None
102 | };
103 |
104 | let summary_yield = Mutex::new(YieldSummary::new(name_column.clone()));
105 | let summary_identity = Mutex::new(IdentitySummary::new(name_column.clone()));
106 | let summary_features = if intervals_types.is_empty() {
107 | None
108 | } else {
109 | Some(Mutex::new(FeatureSummary::new(name_column.clone())))
110 | };
111 | let summary_cigars = Mutex::new(CigarLenSummary::new(name_column.clone()));
112 | let summary_bins = bin_types.map(|b| Mutex::new(BinSummary::new(name_column.clone(), b)));
113 | let summary_qual_score = Mutex::new(QualScoreSummary::new(name_column.clone()));
114 | let total_alns = AtomicUsize::new(0);
115 |
116 | // lazily read records to shift parsing work to individual threads
117 | reader
118 | .lazy_records()
119 | .par_bridge()
120 | .map(|r| r.unwrap())
121 | .for_each(|record| {
122 | total_alns.fetch_add(1, Ordering::Relaxed);
123 |
124 | let flags = record.flags().unwrap();
125 | if flags.is_unmapped() || flags.is_secondary() {
126 | // skip
127 | return;
128 | }
129 |
130 | let strand_rev = flags.is_reverse_complemented();
131 | let aln_ref = references[record.reference_sequence_id().unwrap().unwrap()]
132 | .name()
133 | .as_str();
134 | if !reference_seqs.contains_key(aln_ref) {
135 | panic!(
136 | "{} is not found in the input reference sequence names!",
137 | aln_ref
138 | );
139 | }
140 | // convert to one-indexed [aln_start, aln_end)
141 | let aln_start = usize::from(record.alignment_start().unwrap().unwrap());
142 | let aln_end = aln_start
143 | + sam::record::Cigar::try_from(record.cigar())
144 | .unwrap()
145 | .alignment_span();
146 | // get all the intervals relevant for the current alignment record
147 | let mut intervals_vec = Vec::new();
148 | let mut overlap_intervals = Vec::new();
149 | intervals_types
150 | .iter()
151 | .for_each(|intervals_type| match intervals_type {
152 | IntervalsType::Homopolymer => intervals_vec.extend(find_homopolymers(
153 | reference_seqs[aln_ref].sequence(),
154 | aln_start,
155 | aln_end,
156 | strand_rev,
157 | )),
158 | IntervalsType::Window(win_len) => intervals_vec
159 | .extend(get_windows(aln_start, aln_end, *win_len, false, strand_rev)),
160 | IntervalsType::WindowPos(win_len) => intervals_vec
161 | .extend(get_windows(aln_start, aln_end, *win_len, true, strand_rev)),
162 | IntervalsType::Border(win_len) => {
163 | intervals_vec.extend(get_borders(aln_start, aln_end, *win_len, strand_rev))
164 | }
165 | IntervalsType::Match(seq) => intervals_vec.extend(get_matches(
166 | reference_seqs[aln_ref].sequence(),
167 | aln_start,
168 | aln_end,
169 | seq,
170 | strand_rev,
171 | )),
172 | IntervalsType::Bed(intervals) => {
173 | overlap_intervals.extend(intervals.find(aln_ref, aln_start, aln_end))
174 | }
175 | });
176 | overlap_intervals.extend(&intervals_vec);
177 | overlap_intervals.sort();
178 |
179 | let stats =
180 | AlnStats::from_record(&references, &reference_seqs, &record, &overlap_intervals);
181 |
182 | summary_yield.lock().unwrap().update(&stats);
183 | summary_identity.lock().unwrap().update(&stats);
184 | summary_features
185 | .as_ref()
186 | .map(|f| f.lock().unwrap().update(&stats));
187 | summary_cigars.lock().unwrap().update(&stats);
188 | summary_bins
189 | .as_ref()
190 | .map(|b| b.lock().unwrap().update(&stats));
191 | summary_qual_score.lock().unwrap().update(&stats);
192 |
193 | if let Some(ref w) = aln_stats_writer {
194 | let mut w = w.lock().unwrap();
195 | if let Some(ref name) = name_column {
196 | write!(w, "{},{}\n", name, stats.to_csv()).unwrap();
197 | } else {
198 | write!(w, "{}\n", stats.to_csv()).unwrap();
199 | }
200 | }
201 | });
202 |
203 | write_summary(
204 | summary_yield.into_inner().unwrap(),
205 | &stats_prefix,
206 | YIELD_STATS_NAME,
207 | );
208 |
209 | summary_identity.lock().unwrap().total_alns = total_alns.into_inner();
210 | write_summary(
211 | summary_identity.into_inner().unwrap(),
212 | &stats_prefix,
213 | IDENTITY_STATS_NAME,
214 | );
215 |
216 | if let Some(f) = summary_features {
217 | write_summary(f.into_inner().unwrap(), &stats_prefix, FEATURE_STATS_NAME);
218 | }
219 |
220 | write_summary(
221 | summary_cigars.into_inner().unwrap(),
222 | &stats_prefix,
223 | CIGAR_STATS_NAME,
224 | );
225 |
226 | if let Some(b) = summary_bins {
227 | write_summary(b.into_inner().unwrap(), &stats_prefix, BIN_STATS_NAME);
228 | }
229 |
230 | write_summary(
231 | summary_qual_score.into_inner().unwrap(),
232 | &stats_prefix,
233 | QUAL_SCORE_STATS_NAME,
234 | );
235 | }
236 |
/// Write the `Display` output of a summary to the file `<prefix>.<name>`.
///
/// # Panics
///
/// Panics if the file cannot be created or written.
fn write_summary<D: std::fmt::Display>(s: D, prefix: &str, name: &str) {
    let summary_path = format!("{}.{}", prefix, name);
    let mut summary_writer = File::create(&summary_path).unwrap();
    write!(summary_writer, "{}", s).unwrap();
}
242 |
243 | fn main() {
244 | let start_time = Instant::now();
245 | let args = Args::parse();
246 |
247 | let bin_types = args
248 | .bin_types
249 | .map(|b| b.iter().map(|s| BinType::from_str(s).unwrap()).collect());
250 |
251 | let mut intervals_types = Vec::new();
252 | if args.intervals_hp {
253 | intervals_types.push(IntervalsType::Homopolymer);
254 | }
255 | if let Some(paths) = args.intervals_bed {
256 | intervals_types.extend(paths.iter().map(|p| IntervalsType::Bed(Intervals::new(p))));
257 | }
258 | if let Some(win_lens) = args.intervals_window {
259 | intervals_types.extend(win_lens.into_iter().map(|l| IntervalsType::Window(l)));
260 | }
261 | if let Some(win_lens) = args.intervals_window_pos {
262 | intervals_types.extend(win_lens.into_iter().map(|l| IntervalsType::WindowPos(l)));
263 | }
264 | if let Some(win_lens) = args.intervals_border {
265 | intervals_types.extend(win_lens.into_iter().map(|l| IntervalsType::Border(l)));
266 | }
267 | if let Some(seqs) = args.intervals_match {
268 | intervals_types.extend(seqs.into_iter().map(|mut s| {
269 | s.make_ascii_uppercase();
270 | IntervalsType::Match(s)
271 | }));
272 | }
273 |
274 | rayon::ThreadPoolBuilder::new()
275 | .num_threads(args.threads)
276 | .build_global()
277 | .unwrap();
278 |
279 | run(
280 | args.input,
281 | args.reference,
282 | args.stats_prefix,
283 | bin_types,
284 | intervals_types,
285 | args.name_column,
286 | !args.no_per_aln_stats,
287 | );
288 |
289 | let duration = start_time.elapsed();
290 | println!("Run time (s): {}", duration.as_secs());
291 | }
292 |
293 | enum IntervalsType {
294 | Bed(Intervals),
295 | Homopolymer,
296 | Window(usize),
297 | WindowPos(usize),
298 | Border(usize),
299 | Match(String),
300 | }
301 |
302 | #[derive(Parser)]
303 | #[clap(author, version, about)]
304 | struct Args {
305 | /// Input BAM file.
306 | input: String,
307 |
308 | /// Input reference FASTA file. Can be gzipped.
309 | reference: String,
310 |
311 | /// Prefix for output files that contain statistics.
312 | stats_prefix: String,
313 |
314 | /// Add column with a specific name in CSV outputs.
315 | #[clap(short, long)]
316 | name_column: Option,
317 |
318 | /// Turn off outputting per alignment stats.
319 | #[clap(long)]
320 | no_per_aln_stats: bool,
321 |
322 | /// Types of bins to use for per alignment stats.
323 | ///
324 | /// Each bin should be of the format :.
325 | ///
326 | /// Supported bin types:
327 | /// q_len (read sequence length),
328 | /// subread_passes,
329 | /// mapq,
330 | /// mean_qual,
331 | /// gc_content,
332 | /// concordance_qv (phred scale Q-value)
333 | #[clap(short, long, min_values = 1)]
334 | bin_types: Option>,
335 |
336 | /// Use intervals from a BED file.
337 | ///
338 | /// The BED file should have the columns chrom, start, stop, and feature.
339 | /// The feature column is optional.
340 | ///
341 | /// This allows stats to be gathered separately for different types of intervals.
342 | /// Note that all intervals are on the reference, not the reads.
343 | #[clap(long, min_values = 1)]
344 | intervals_bed: Option>,
345 |
346 | /// Use homopolymer regions in the reference as intervals.
347 | #[clap(long)]
348 | intervals_hp: bool,
349 |
350 | /// Use fixed-width nonoverlapping windows as intervals.
351 | ///
352 | /// This is used to specify the window widths.
353 | #[clap(long, min_values = 1)]
354 | intervals_window: Option>,
355 |
356 | /// Use fixed-width nonoverlapping windows with positions as intervals.
357 | ///
358 | /// This is used to specify the window widths.
359 | #[clap(long, min_values = 1)]
360 | intervals_window_pos: Option>,
361 |
362 | /// Use fixed-width nonoverlapping window border regions as intervals.
363 | ///
364 | /// This is used to specify the window widths.
365 | #[clap(long, min_values = 1)]
366 | intervals_border: Option>,
367 |
368 | /// Use regions in the reference that match any of the specified subsequences as intervals.
369 | #[clap(long, min_values = 1)]
370 | intervals_match: Option>,
371 |
372 | /// Number of threads. Will be automatically determined if this is set to 0.
373 | #[clap(short, long, default_value_t = 0usize)]
374 | threads: usize,
375 | }
376 |
--------------------------------------------------------------------------------
/src/stats.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Google LLC
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | // this software and associated documentation files (the "Software"), to deal in
5 | // the Software without restriction, including without limitation the rights to
6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | // the Software, and to permit persons to whom the Software is furnished to do so,
8 | // subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in all
11 | // copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 |
20 | use noodles::bam;
21 | use noodles::core::Position;
22 | use noodles::fasta;
23 | use noodles::sam;
24 |
25 | use sam::record::cigar::op::Kind;
26 | use sam::record::data::field::Tag;
27 |
28 | use fxhash::FxHashMap;
29 |
30 | use std::fmt;
31 | use std::str::FromStr;
32 |
33 | use crate::bed::*;
34 |
/// Statistics for each alignment.
///
/// The lifetime `'a` is that of the feature names used as keys of
/// `feature_stats`.
// NOTE(review): the type parameters of the `Option` fields below were lost
// when this listing was extracted. `subread_passes` is divided by a `usize`
// step in `BinType::get_bin`, so it is presumably `Option<usize>`; the other
// two cannot be determined from here — restore them from the repository.
#[derive(Debug)]
pub struct AlnStats<'a> {
    pub read_name: String,
    pub chr: String,
    pub ref_pos: usize,
    // read sequence length, per the --bin-types help text
    pub q_len: usize,
    pub effective_cov: Option,
    pub subread_passes: Option,
    pub pred_concordance: Option,
    pub supplementary: bool,
    pub strand_rev: bool,
    pub mapq: u8,
    pub mean_qual: u8,
    pub read_len: usize,
    pub ref_cov: f64,
    pub gc_content: f64,
    pub concordance: f64,
    pub concordance_gc: f64,
    // phred scale Q-value, per the --bin-types help text
    pub concordance_qv: f64,
    pub matches: usize,
    pub mismatches: usize,
    pub non_hp_ins: usize,
    pub non_hp_del: usize,
    pub hp_ins: usize,
    pub hp_del: usize,
    pub gc_ins: usize,
    pub gc_del: usize,
    // presumably keyed by feature (interval label) name — verify against from_record
    pub feature_stats: FxHashMap<&'a str, FeatureStats>,
    // NOTE(review): meaning of the (usize, u8) key is not visible here — confirm
    pub cigar_len_stats: FxHashMap<(usize, u8), usize>,
    pub q_score_stats: QualScoreStats,
}
67 |
68 | /// Stats on the number of matches and mismatches for each quality score.
69 | #[derive(Debug, Clone)]
70 | pub struct QualScoreStats {
71 | stats: Vec<(usize, usize)>, // (match, mismatch)
72 | }
73 |
74 | impl QualScoreStats {
75 | pub fn assign_add(&mut self, o: &Self) {
76 | self.stats.iter_mut().zip(&o.stats).for_each(|(q, o)| {
77 | q.0 += o.0;
78 | q.1 += o.1;
79 | });
80 | }
81 |
82 | pub fn increment(&mut self, q_score: usize, is_match: bool) {
83 | if is_match {
84 | self.stats[q_score].0 += 1;
85 | } else {
86 | self.stats[q_score].1 += 1;
87 | }
88 | }
89 |
90 | pub fn empirical_qv(&self) -> Vec<(usize, f64)> {
91 | self.stats
92 | .iter()
93 | .enumerate()
94 | .filter_map(|(i, &(matches, mismatches))| {
95 | if matches == 0 && mismatches == 0 {
96 | None
97 | } else {
98 | Some((
99 | i,
100 | concordance_qv(
101 | (matches as f64) / ((matches + mismatches) as f64),
102 | mismatches != 0,
103 | ),
104 | ))
105 | }
106 | })
107 | .collect()
108 | }
109 | }
110 |
111 | impl Default for QualScoreStats {
112 | fn default() -> Self {
113 | Self {
114 | stats: vec![(0usize, 0usize); 256],
115 | }
116 | }
117 | }
118 |
119 | /// Per-read attributes that can be binned.
120 | #[derive(Copy, Clone)]
121 | pub enum BinType {
122 | QLen(usize),
123 | SubreadPasses(usize),
124 | MapQ(u8),
125 | MeanQual(u8),
126 | GcContent(f64),
127 | ConcordanceQv(f64),
128 | }
129 |
130 | impl BinType {
131 | pub fn get_bin(&self, a: &AlnStats) -> String {
132 | match self {
133 | Self::QLen(step) => format!("{}", a.q_len / step * step),
134 | Self::SubreadPasses(step) => format!(
135 | "{}",
136 | a.subread_passes.expect("Subread passes not found!") / step * step
137 | ),
138 | Self::MapQ(step) => format!("{}", a.mapq / step * step),
139 | Self::MeanQual(step) => format!("{}", a.mean_qual / step * step),
140 | Self::GcContent(step) => format!("{:.6}", (a.gc_content / step).floor() * step),
141 | Self::ConcordanceQv(step) => format!("{:.2}", (a.concordance_qv / step).floor() * step),
142 | }
143 | }
144 | }
145 |
146 | impl fmt::Display for BinType {
147 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
148 | match self {
149 | Self::QLen(step) => write!(f, "q_len:{}", step),
150 | Self::SubreadPasses(step) => write!(f, "subread_passes:{}", step),
151 | Self::MapQ(step) => write!(f, "mapq:{}", step),
152 | Self::MeanQual(step) => write!(f, "mean_qual:{}", step),
153 | Self::GcContent(step) => write!(f, "gc_content:{}", step),
154 | Self::ConcordanceQv(step) => write!(f, "concordance_qv:{}", step),
155 | }
156 | }
157 | }
158 |
159 | impl FromStr for BinType {
160 | type Err = Box;
161 |
162 | fn from_str(s: &str) -> Result {
163 | let mut split = s.split(':');
164 | let a = split
165 | .next()
166 | .expect("Bin type not found! Expected :");
167 | let b = split
168 | .next()
169 | .expect("Step size not found! Expected :");
170 |
171 | use BinType::*;
172 | match a {
173 | "q_len" => Ok(QLen(b.parse::<_>().unwrap())),
174 | "subread_passes" => Ok(SubreadPasses(b.parse::<_>().unwrap())),
175 | "mapq" => Ok(MapQ(b.parse::<_>().unwrap())),
176 | "mean_qual" => Ok(MeanQual(b.parse::<_>().unwrap())),
177 | "gc_content" => Ok(GcContent(b.parse::<_>().unwrap())),
178 | "concordance_qv" => Ok(ConcordanceQv(b.parse::<_>().unwrap())),
179 | _ => Err("Invalid stat to bin across!".into()),
180 | }
181 | }
182 | }
183 |
/// Statistics for each bin.
///
/// All fields are running totals over the reads assigned to the bin.
#[derive(Default, Debug)]
pub struct BinStats {
    /// Number of reads accumulated into this bin.
    pub num_reads: usize,
    /// Aligned bases that match the reference.
    pub matches: usize,
    /// Aligned bases that differ from the reference.
    pub mismatches: usize,
    /// Inserted bases that were not classified as homopolymer insertions.
    pub non_hp_ins: usize,
    /// Deleted bases that were not classified as homopolymer deletions.
    pub non_hp_del: usize,
    /// Inserted bases classified as homopolymer insertions.
    pub hp_ins: usize,
    /// Deleted bases classified as homopolymer deletions.
    pub hp_del: usize,
}
195 |
196 | impl BinStats {
197 | pub fn new(stats: &AlnStats) -> Self {
198 | Self {
199 | num_reads: 1,
200 | matches: stats.matches,
201 | mismatches: stats.mismatches,
202 | non_hp_ins: stats.non_hp_ins,
203 | non_hp_del: stats.non_hp_del,
204 | hp_ins: stats.hp_ins,
205 | hp_del: stats.hp_del,
206 | }
207 | }
208 |
209 | pub fn assign_add(&mut self, o: &Self) {
210 | self.num_reads += o.num_reads;
211 | self.matches += o.matches;
212 | self.mismatches += o.mismatches;
213 | self.non_hp_ins += o.non_hp_ins;
214 | self.non_hp_del += o.non_hp_del;
215 | self.hp_ins += o.hp_ins;
216 | self.hp_del += o.hp_del;
217 | }
218 |
219 | pub fn num_bases(&self) -> usize {
220 | self.matches + self.mismatches + self.non_hp_del + self.hp_del
221 | }
222 |
223 | pub fn num_errors(&self) -> usize {
224 | self.mismatches + self.non_hp_ins + self.hp_ins + self.non_hp_del + self.hp_del
225 | }
226 |
227 | pub fn identity(&self) -> f64 {
228 | (self.matches as f64) / ((self.matches + self.num_errors()) as f64)
229 | }
230 | }
231 |
232 | /// Statistics for each interval feature.
233 | #[derive(Debug, Default)]
234 | pub struct FeatureStats {
235 | pub overlaps: usize,
236 | pub identical_overlaps: usize,
237 | pub matches: usize,
238 | pub mismatches: usize,
239 | pub non_hp_ins: usize,
240 | pub non_hp_del: usize,
241 | pub hp_ins: usize,
242 | pub hp_del: usize,
243 | pub total_qual_error: f64,
244 | pub q_score_stats: QualScoreStats,
245 | }
246 |
247 | impl FeatureStats {
248 | pub fn assign_add(&mut self, o: &Self) {
249 | self.overlaps += o.overlaps;
250 | self.identical_overlaps += o.identical_overlaps;
251 | self.matches += o.matches;
252 | self.mismatches += o.mismatches;
253 | self.non_hp_ins += o.non_hp_ins;
254 | self.non_hp_del += o.non_hp_del;
255 | self.hp_ins += o.hp_ins;
256 | self.hp_del += o.hp_del;
257 | self.total_qual_error += o.total_qual_error;
258 | self.q_score_stats.assign_add(&o.q_score_stats);
259 | }
260 |
261 | pub fn num_bases(&self) -> usize {
262 | self.matches + self.mismatches + self.non_hp_del + self.hp_del
263 | }
264 |
265 | pub fn num_errors(&self) -> usize {
266 | self.mismatches + self.non_hp_ins + self.hp_ins + self.non_hp_del + self.hp_del
267 | }
268 |
269 | pub fn identity(&self) -> f64 {
270 | (self.matches as f64) / ((self.matches + self.num_errors()) as f64)
271 | }
272 |
273 | pub fn mean_qual(&self) -> f64 {
274 | // only include quality scores from matches and mismatches
275 | error_to_qual(self.total_qual_error / ((self.matches + self.mismatches) as f64))
276 | }
277 | }
278 |
279 | impl<'a> AlnStats<'a> {
280 | pub fn from_record(
281 | references: &sam::header::ReferenceSequences,
282 | reference_seqs: &FxHashMap,
283 | r: &bam::lazy::Record,
284 | intervals: &[&'a FeatureInterval],
285 | ) -> Self {
286 | let chr = references[r.reference_sequence_id().unwrap().unwrap()]
287 | .name()
288 | .to_string();
289 | let mut ref_pos = usize::from(r.alignment_start().unwrap().unwrap());
290 | let sequence = sam::record::Sequence::try_from(r.sequence()).unwrap();
291 | let q_scores = sam::record::QualityScores::try_from(r.quality_scores()).unwrap();
292 | if sequence.is_empty() || q_scores.is_empty() {
293 | panic!("Read sequence or quality scores do not exist!");
294 | }
295 | let flags = r.flags().unwrap();
296 | let data = sam::record::Data::try_from(r.data()).unwrap();
297 | let ec_tag = Tag::try_from(*b"ec").unwrap();
298 | let ec = data
299 | .get(ec_tag)
300 | .map(|f| f.value().as_float().unwrap() as f64);
301 | let np_tag: Tag = Tag::try_from(*b"np").unwrap();
302 | let np = data
303 | .get(np_tag)
304 | .map(|f| f.value().as_int().unwrap() as usize);
305 | let rq_tag: Tag = Tag::try_from(*b"rq").unwrap();
306 | let rq = data
307 | .get(rq_tag)
308 | .map(|f| f.value().as_float().unwrap() as f64);
309 |
310 | let mut res = AlnStats {
311 | read_name: r
312 | .read_name()
313 | .expect("Error parsing read name! Perhaps it contains an '@'?")
314 | .unwrap()
315 | .to_string(),
316 | chr,
317 | ref_pos,
318 | q_len: sequence.len(),
319 | effective_cov: ec,
320 | subread_passes: np,
321 | pred_concordance: rq,
322 | supplementary: flags.is_supplementary(),
323 | strand_rev: flags.is_reverse_complemented(),
324 | mapq: r
325 | .mapping_quality()
326 | .unwrap()
327 | .map(|q| u8::from(q))
328 | .unwrap_or(255u8),
329 | mean_qual: mean_qual(q_scores.as_ref()),
330 | // fill in the rest afterwards
331 | read_len: 0,
332 | ref_cov: 0.0,
333 | gc_content: 0.0,
334 | concordance: 0.0,
335 | concordance_gc: 0.0,
336 | concordance_qv: 0.0,
337 | matches: 0,
338 | mismatches: 0,
339 | non_hp_ins: 0,
340 | non_hp_del: 0,
341 | hp_ins: 0,
342 | hp_del: 0,
343 | gc_ins: 0,
344 | gc_del: 0,
345 | feature_stats: FxHashMap::default(),
346 | cigar_len_stats: FxHashMap::default(),
347 | q_score_stats: QualScoreStats::default(),
348 | };
349 |
350 | let mut interval_has_error = vec![false; intervals.len()];
351 | for i in intervals {
352 | res.feature_stats
353 | .entry(&i.val)
354 | .or_insert_with(|| FeatureStats::default())
355 | .overlaps += 1;
356 | }
357 |
358 | let mut query_pos = 1;
359 | let mut interval_start_idx = 0;
360 | let curr_ref_seq = reference_seqs[&res.chr].sequence();
361 | let mut curr_features = Vec::new();
362 | let mut curr_interval_idxs = Vec::new();
363 |
364 | let mut intervals_have_error = |v: &[usize]| {
365 | v.iter()
366 | .for_each(|&interval_idx| interval_has_error[interval_idx] = true);
367 | };
368 |
369 | // count mismatches, indels, and homopolymers
370 | let cigar = sam::record::Cigar::try_from(r.cigar()).unwrap();
371 | for op in cigar.iter() {
372 | for _i in 0..op.len() {
373 | // skip intervals that cannot overlap the current reference position
374 | while interval_start_idx < intervals.len()
375 | && ref_pos >= intervals[interval_start_idx].stop
376 | {
377 | interval_start_idx += 1;
378 | }
379 | // find the intervals that overlap the current reference position
380 | let mut interval_idx = interval_start_idx;
381 | curr_features.clear();
382 | curr_interval_idxs.clear();
383 | while interval_idx < intervals.len() && ref_pos >= intervals[interval_idx].start {
384 | if ref_pos < intervals[interval_idx].stop {
385 | // get feature names of the overlapping intervals
386 | curr_features.push(intervals[interval_idx].val.as_str());
387 | curr_interval_idxs.push(interval_idx);
388 | }
389 | interval_idx += 1;
390 | }
391 |
392 | match op.kind() {
393 | Kind::SequenceMatch | Kind::SequenceMismatch | Kind::Match => {
394 | let c = curr_ref_seq[Position::new(ref_pos).unwrap()].to_ascii_uppercase();
395 | let is_match = op.kind() == Kind::SequenceMatch
396 | || (op.kind() == Kind::Match
397 | && c == u8::from(sequence[Position::new(query_pos).unwrap()])
398 | .to_ascii_uppercase());
399 | let q_score = u8::from(q_scores[Position::new(query_pos).unwrap()]);
400 | let qual_error = qual_to_error(q_score);
401 | if is_match {
402 | res.matches += 1;
403 | res.q_score_stats.increment(q_score as usize, true);
404 | curr_features.iter().for_each(|f| {
405 | let stats = res.feature_stats.get_mut(f).unwrap();
406 | stats.matches += 1;
407 | stats.total_qual_error += qual_error;
408 | stats.q_score_stats.increment(q_score as usize, true);
409 | });
410 | } else {
411 | res.mismatches += 1;
412 | res.q_score_stats.increment(q_score as usize, false);
413 | curr_features.iter().for_each(|f| {
414 | let stats = res.feature_stats.get_mut(f).unwrap();
415 | stats.mismatches += 1;
416 | stats.total_qual_error += qual_error;
417 | stats.q_score_stats.increment(q_score as usize, false);
418 | });
419 | intervals_have_error(&curr_interval_idxs);
420 | }
421 | if c == b'C' || c == b'G' {
422 | res.gc_content += 1.0;
423 | }
424 | query_pos += 1;
425 | ref_pos += 1;
426 | }
427 | Kind::Insertion => {
428 | // can be computed without looping through the number of insertions
429 | // this does not modify ref_pos
430 | let before_ins =
431 | curr_ref_seq[Position::new(ref_pos).unwrap()].to_ascii_uppercase();
432 | let after_ins = curr_ref_seq
433 | .get(Position::new(ref_pos + 1).unwrap())
434 | .unwrap_or(&b'?')
435 | .to_ascii_uppercase();
436 | let query_ins = &sequence[Position::new(query_pos).unwrap()
437 | ..Position::new(query_pos + op.len()).unwrap()];
438 | let hp_before = query_ins
439 | .iter()
440 | .map(|&c| u8::from(c).to_ascii_uppercase())
441 | .all(|c| c == before_ins);
442 | let hp_after = query_ins
443 | .iter()
444 | .map(|&c| u8::from(c).to_ascii_uppercase())
445 | .all(|c| c == after_ins);
446 | if hp_before || hp_after {
447 | res.hp_ins += op.len();
448 | curr_features.iter().for_each(|f| {
449 | res.feature_stats.get_mut(f).unwrap().hp_ins += op.len()
450 | });
451 | } else {
452 | res.non_hp_ins += op.len();
453 | curr_features.iter().for_each(|f| {
454 | res.feature_stats.get_mut(f).unwrap().non_hp_ins += op.len()
455 | });
456 | }
457 | intervals_have_error(&curr_interval_idxs);
458 | query_pos += op.len();
459 | break;
460 | }
461 | Kind::Deletion => {
462 | let before_curr = curr_ref_seq
463 | .get(Position::new(ref_pos - 1).unwrap())
464 | .unwrap_or(&b'?')
465 | .to_ascii_uppercase();
466 | let after_curr = curr_ref_seq
467 | .get(Position::new(ref_pos + 1).unwrap())
468 | .unwrap_or(&b'?')
469 | .to_ascii_uppercase();
470 | let curr =
471 | curr_ref_seq[Position::new(ref_pos).unwrap()].to_ascii_uppercase();
472 | if curr == b'C' || curr == b'G' {
473 | res.gc_content += 1.0;
474 | }
475 | let hp = curr == before_curr || curr == after_curr;
476 | if hp {
477 | res.hp_del += 1;
478 | curr_features
479 | .iter()
480 | .for_each(|f| res.feature_stats.get_mut(f).unwrap().hp_del += 1);
481 | } else {
482 | res.non_hp_del += 1;
483 | curr_features.iter().for_each(|f| {
484 | res.feature_stats.get_mut(f).unwrap().non_hp_del += 1
485 | });
486 | }
487 | intervals_have_error(&curr_interval_idxs);
488 | ref_pos += 1;
489 | }
490 | Kind::SoftClip => {
491 | // does not require looping through the number of soft clips
492 | query_pos += op.len();
493 | break;
494 | }
495 | Kind::HardClip => {
496 | // does not require looping through the number of hard clips
497 | break;
498 | }
499 | Kind::Skip => {
500 | // does not require looping through the number of skip operations
501 | ref_pos += op.len();
502 | break;
503 | }
504 | _ => panic!("Unexpected CIGAR operation: {}", op),
505 | }
506 | }
507 |
508 | // gap compressed
509 | match op.kind() {
510 | Kind::SequenceMatch => {
511 | *res.cigar_len_stats.entry((op.len(), b'=')).or_insert(0) += 1;
512 | }
513 | Kind::SequenceMismatch => {
514 | *res.cigar_len_stats.entry((op.len(), b'X')).or_insert(0) += 1;
515 | }
516 | Kind::Match => {
517 | *res.cigar_len_stats.entry((op.len(), b'M')).or_insert(0) += 1;
518 | }
519 | Kind::Insertion => {
520 | *res.cigar_len_stats.entry((op.len(), b'I')).or_insert(0) += 1;
521 | res.gc_ins += 1;
522 | }
523 | Kind::Deletion => {
524 | *res.cigar_len_stats.entry((op.len(), b'D')).or_insert(0) += 1;
525 | res.gc_del += 1;
526 | }
527 | _ => (),
528 | }
529 | }
530 |
531 | let errors = res.mismatches + res.non_hp_ins + res.non_hp_del + res.hp_ins + res.hp_del;
532 | res.read_len = res.matches + res.mismatches + res.non_hp_del + res.hp_del;
533 | res.ref_cov = (res.read_len as f64) / (curr_ref_seq.len() as f64);
534 | res.gc_content /= res.read_len as f64;
535 | res.concordance = (res.matches as f64) / ((res.matches + errors) as f64);
536 | res.concordance_gc = (res.matches as f64)
537 | / ((res.matches + res.mismatches + res.gc_ins + res.gc_del) as f64);
538 | res.concordance_qv = concordance_qv(res.concordance, errors > 0);
539 |
540 | for (i, &has_error) in intervals.iter().zip(&interval_has_error) {
541 | if !has_error {
542 | res.feature_stats
543 | .get_mut(i.val.as_str())
544 | .unwrap()
545 | .identical_overlaps += 1;
546 | }
547 | }
548 |
549 | res
550 | }
551 |
/// CSV header line whose columns line up with the rows produced by `to_csv`.
pub fn header() -> &'static str {
    // Split only for readability; `concat!` joins the pieces at compile time.
    concat!(
        "read,chr,pos,read_length,",
        "effective_coverage,subread_passes,predicted_concordance,",
        "alignment_type,strand,alignment_mapq,mean_quality,",
        "aligned_read_length,reference_coverage,gc_content,",
        "concordance,gap_compressed_concordance,concordance_qv,",
        "mismatches,non_hp_ins,non_hp_del,hp_ins,hp_del"
    )
}
555 |
556 | pub fn to_csv(&self) -> String {
557 | let supp_str = if self.supplementary {
558 | "supplementary"
559 | } else {
560 | "primary"
561 | };
562 | let strand_str = if self.strand_rev { "-" } else { "+" };
563 | let ec = self
564 | .effective_cov
565 | .map(|x| format!("{:.2}", x))
566 | .unwrap_or_else(|| String::new());
567 | let np = self
568 | .subread_passes
569 | .map(|x| format!("{}", x))
570 | .unwrap_or_else(|| String::new());
571 | let rq = self
572 | .pred_concordance
573 | .map(|x| format!("{:.6}", x))
574 | .unwrap_or_else(|| String::new());
575 | format!(
576 | "{},{},{},{},{:.2},{},{:.6},{},{},{},{},{},{:.6},{:.6},{:.6},{:.6},{:.2},{},{},{},{},{}",
577 | self.read_name,
578 | self.chr,
579 | self.ref_pos,
580 | self.q_len,
581 | ec,
582 | np,
583 | rq,
584 | supp_str,
585 | strand_str,
586 | self.mapq,
587 | self.mean_qual,
588 | self.read_len,
589 | self.ref_cov,
590 | self.gc_content,
591 | self.concordance,
592 | self.concordance_gc,
593 | self.concordance_qv,
594 | self.mismatches,
595 | self.non_hp_ins,
596 | self.non_hp_del,
597 | self.hp_ins,
598 | self.hp_del
599 | )
600 | }
601 | }
602 |
/// Convert a concordance/identity fraction into a Phred scale Q-value.
///
/// When `has_errors` is false the alignment is a perfect match and the
/// Q-value is capped at 75, regardless of `concordance`. Otherwise the result
/// is `-10 * log10(1 - concordance)`.
pub fn concordance_qv(concordance: f64, has_errors: bool) -> f64 {
    if !has_errors {
        return 75.0f64;
    }
    let error_rate = 1.0f64 - concordance;
    -10.0f64 * error_rate.log10()
}
613 |
/// Convert a Phred quality score into an error probability, `10^(-q / 10)`.
fn qual_to_error(q: u8) -> f64 {
    let exponent = -f64::from(q) / 10.0f64;
    10.0f64.powf(exponent)
}
617 |
/// Convert an error probability into a Phred scale quality, `-10 * log10(e)`.
fn error_to_qual(e: f64) -> f64 {
    let log_error = e.log10();
    -10.0f64 * log_error
}
621 |
622 | fn mean_qual(q_scores: &[sam::record::quality_scores::Score]) -> u8 {
623 | let sum_q = q_scores
624 | .iter()
625 | .map(|&q| qual_to_error(u8::from(q)))
626 | .sum::();
627 | error_to_qual(sum_q / (q_scores.len() as f64)).round() as u8
628 | }
629 |
--------------------------------------------------------------------------------
/src/summary.rs:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Google LLC
2 | //
3 | // Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | // this software and associated documentation files (the "Software"), to deal in
5 | // the Software without restriction, including without limitation the rights to
6 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | // the Software, and to permit persons to whom the Software is furnished to do so,
8 | // subject to the following conditions:
9 | //
10 | // The above copyright notice and this permission notice shall be included in all
11 | // copies or substantial portions of the Software.
12 | //
13 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 |
20 | use std::fmt;
21 |
22 | use fxhash::FxHashMap;
23 |
24 | use ordered_float::OrderedFloat;
25 |
26 | use crate::stats::*;
27 |
28 | // important to ensure that summary stats are sorted so output order is deterministic
29 |
/// Read and base yield at increasing empirical Q-value thresholds.
pub struct YieldSummary {
    /// Optional name printed as the first CSV column; stored with its
    /// trailing comma already appended.
    name_column: Option<String>,
    /// (reads, bases); entry `i` covers a minimum empirical Q of `i * 5`.
    q_yield: [(usize, usize); 15],
}
35 |
36 | impl YieldSummary {
37 | pub fn new(mut name_column: Option) -> Self {
38 | if let Some(ref mut name) = name_column {
39 | name.push(',');
40 | }
41 | Self {
42 | name_column,
43 | q_yield: [(0usize, 0usize); 15],
44 | }
45 | }
46 |
47 | pub fn update(&mut self, aln_stats: &AlnStats) {
48 | if aln_stats.supplementary {
49 | return;
50 | }
51 |
52 | for i in 0..self.q_yield.len() {
53 | let min_q = i * 5;
54 | if aln_stats.concordance_qv >= (min_q as f64) {
55 | self.q_yield[i].0 += 1;
56 | self.q_yield[i].1 += aln_stats.q_len;
57 | }
58 | }
59 | }
60 | }
61 |
62 | impl fmt::Display for YieldSummary {
63 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
64 | writeln!(
65 | f,
66 | "{}min_empirical_q,yield_reads,yield_bases",
67 | if self.name_column.is_some() {
68 | "name,"
69 | } else {
70 | ""
71 | }
72 | )?;
73 | for i in 0..self.q_yield.len() {
74 | writeln!(
75 | f,
76 | "{}{},{},{}",
77 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""),
78 | i * 5,
79 | self.q_yield[i].0,
80 | self.q_yield[i].1
81 | )?;
82 | }
83 | Ok(())
84 | }
85 | }
86 |
/// Identity statistics summed over all primary alignments.
#[derive(Default)]
pub struct IdentitySummary {
    /// Optional name printed as the first CSV column; stored with its
    /// trailing comma already appended.
    name_column: Option<String>,
    /// Total number of alignments; this module only prints it, it is set by
    /// the owner of the summary.
    pub total_alns: usize,
    matches: usize,
    mismatches: usize,
    non_hp_ins: usize,
    non_hp_del: usize,
    hp_ins: usize,
    hp_del: usize,
    /// Gap-compressed insertion count (one per insertion operation).
    gc_ins: usize,
    /// Gap-compressed deletion count (one per deletion operation).
    gc_del: usize,
    /// Number of non-supplementary alignments accumulated.
    num_reads: usize,
}
101 |
102 | impl IdentitySummary {
103 | pub fn new(mut name_column: Option) -> Self {
104 | if let Some(ref mut name) = name_column {
105 | name.push(',');
106 | }
107 | Self {
108 | name_column,
109 | ..Default::default()
110 | }
111 | }
112 |
113 | pub fn update(&mut self, aln_stats: &AlnStats) {
114 | if aln_stats.supplementary {
115 | return;
116 | }
117 |
118 | self.matches += aln_stats.matches;
119 | self.mismatches += aln_stats.mismatches;
120 | self.non_hp_ins += aln_stats.non_hp_ins;
121 | self.non_hp_del += aln_stats.non_hp_del;
122 | self.hp_ins += aln_stats.hp_ins;
123 | self.hp_del += aln_stats.hp_del;
124 | self.gc_ins += aln_stats.gc_ins;
125 | self.gc_del += aln_stats.gc_del;
126 | self.num_reads += 1;
127 | }
128 | }
129 |
130 | impl fmt::Display for IdentitySummary {
131 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
132 | writeln!(f, "{}total_alns,primary_alns,identity,identity_qv,gap_compressed_identity,matches_per_kbp,mismatches_per_kbp,non_hp_ins_per_kbp,non_hp_del_per_kbp,hp_ins_per_kbp,hp_del_per_kbp", if self.name_column.is_some() { "name," } else { "" })?;
133 | let num_errors =
134 | self.mismatches + self.non_hp_ins + self.hp_ins + self.non_hp_del + self.hp_del;
135 | let num_bases = self.matches + self.mismatches + self.non_hp_del + self.hp_del;
136 | let id = (self.matches as f64) / ((self.matches + num_errors) as f64);
137 | let gc_id = (self.matches as f64)
138 | / ((self.matches + self.mismatches + self.gc_ins + self.gc_del) as f64);
139 | let per_kbp = |x| (x as f64) / (num_bases as f64) * 1000.0f64;
140 | writeln!(
141 | f,
142 | "{}{},{},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6}",
143 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""),
144 | self.total_alns,
145 | self.num_reads,
146 | id,
147 | concordance_qv(id, num_errors > 0),
148 | gc_id,
149 | per_kbp(self.matches),
150 | per_kbp(self.mismatches),
151 | per_kbp(self.non_hp_ins),
152 | per_kbp(self.non_hp_del),
153 | per_kbp(self.hp_ins),
154 | per_kbp(self.hp_del)
155 | )
156 | }
157 | }
158 |
159 | pub struct FeatureSummary {
160 | name_column: Option,
161 | feature_stats: FxHashMap,
162 | }
163 |
164 | impl FeatureSummary {
165 | pub fn new(mut name_column: Option) -> Self {
166 | if let Some(ref mut name) = name_column {
167 | name.push(',');
168 | }
169 | Self {
170 | name_column,
171 | feature_stats: FxHashMap::default(),
172 | }
173 | }
174 |
175 | pub fn update(&mut self, aln_stats: &AlnStats) {
176 | if aln_stats.supplementary {
177 | return;
178 | }
179 |
180 | for (&k, v) in &aln_stats.feature_stats {
181 | self.feature_stats
182 | .entry(k.to_owned())
183 | .or_insert_with(|| FeatureStats::default())
184 | .assign_add(v);
185 | }
186 | }
187 | }
188 |
189 | impl fmt::Display for FeatureSummary {
190 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
191 | writeln!(f, "{}feature,intervals,identical_intervals,identity,identity_qv,mean_qual,bases_per_interval,matches_per_interval,mismatches_per_interval,non_hp_ins_per_interval,non_hp_del_per_interval,hp_ins_per_interval,hp_del_per_interval", if self.name_column.is_some() { "name," } else { "" })?;
192 | let mut v = self.feature_stats.iter().collect::>();
193 | v.sort_by_key(|x| x.0);
194 | for (feature, stats) in v.into_iter() {
195 | let per_interval = |x| (x as f64) / (stats.overlaps as f64);
196 | let id = stats.identity();
197 | writeln!(
198 | f,
199 | "{}{},{},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6}",
200 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""),
201 | feature.trim(),
202 | stats.overlaps,
203 | per_interval(stats.identical_overlaps),
204 | id,
205 | concordance_qv(id, id != 1.0),
206 | stats.mean_qual(),
207 | per_interval(stats.num_bases()),
208 | per_interval(stats.matches),
209 | per_interval(stats.mismatches),
210 | per_interval(stats.non_hp_ins),
211 | per_interval(stats.non_hp_del),
212 | per_interval(stats.hp_ins),
213 | per_interval(stats.hp_del)
214 | )?;
215 | }
216 | Ok(())
217 | }
218 | }
219 |
220 | pub struct CigarLenSummary {
221 | name_column: Option,
222 | cigar_len_stats: FxHashMap<(usize, u8), usize>,
223 | }
224 |
225 | impl CigarLenSummary {
226 | pub fn new(mut name_column: Option) -> Self {
227 | if let Some(ref mut name) = name_column {
228 | name.push(',');
229 | }
230 | Self {
231 | name_column,
232 | cigar_len_stats: FxHashMap::default(),
233 | }
234 | }
235 |
236 | pub fn update(&mut self, aln_stats: &AlnStats) {
237 | if aln_stats.supplementary {
238 | return;
239 | }
240 |
241 | for (&k, v) in &aln_stats.cigar_len_stats {
242 | *self.cigar_len_stats.entry(k).or_insert(0) += v;
243 | }
244 | }
245 | }
246 |
247 | impl fmt::Display for CigarLenSummary {
248 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
249 | writeln!(
250 | f,
251 | "{}cigar,length,count,length_count_per_cigar",
252 | if self.name_column.is_some() {
253 | "name,"
254 | } else {
255 | ""
256 | }
257 | )?;
258 | let mut v = self.cigar_len_stats.iter().collect::>();
259 | v.sort_by_key(|(x, _)| (x.1, x.0));
260 | let mut total_cigars = [0usize; 128];
261 | for (cigar, &count) in &v {
262 | total_cigars[cigar.1 as usize] += count;
263 | }
264 |
265 | for (cigar, &count) in v.into_iter() {
266 | writeln!(
267 | f,
268 | "{}{},{},{},{:.6}",
269 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""),
270 | cigar.1 as char,
271 | cigar.0,
272 | count,
273 | (count as f64) / (total_cigars[cigar.1 as usize] as f64),
274 | )?;
275 | }
276 | Ok(())
277 | }
278 | }
279 |
280 | pub struct BinSummary {
281 | name_column: Option,
282 | bin_maps: Vec<(BinType, FxHashMap)>,
283 | }
284 |
285 | impl BinSummary {
286 | pub fn new(mut name_column: Option, bin_types: Vec) -> Self {
287 | if let Some(ref mut name) = name_column {
288 | name.push(',');
289 | }
290 | let bin_maps = bin_types
291 | .into_iter()
292 | .map(|b| (b, FxHashMap::default()))
293 | .collect();
294 | Self {
295 | name_column,
296 | bin_maps,
297 | }
298 | }
299 |
300 | pub fn update(&mut self, aln_stats: &AlnStats) {
301 | if aln_stats.supplementary {
302 | return;
303 | }
304 |
305 | self.bin_maps.iter_mut().for_each(|(bin_type, bin_map)| {
306 | let bin = bin_type.get_bin(aln_stats);
307 | let bin_stats = BinStats::new(aln_stats);
308 | bin_map
309 | .entry(bin)
310 | .or_insert_with(|| BinStats::default())
311 | .assign_add(&bin_stats);
312 | });
313 | }
314 | }
315 |
316 | impl fmt::Display for BinSummary {
317 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
318 | writeln!(
319 | f,
320 | "{}bin_type,bin,num_reads,num_bases,identity,identity_qv,matches_per_kbp,mismatches_per_kbp,non_hp_ins_per_kbp,non_hp_del_per_kbp,hp_ins_per_kbp,hp_del_per_kbp",
321 | if self.name_column.is_some() {
322 | "name,"
323 | } else {
324 | ""
325 | }
326 | )?;
327 | for (bin_type, bin_map) in &self.bin_maps {
328 | let mut bins = bin_map.iter().collect::>();
329 | bins.sort_by_key(|(b, _)| OrderedFloat(b.parse::().unwrap()));
330 |
331 | for (bin, stats) in bins {
332 | let per_kbp = |x| (x as f64) / (stats.num_bases() as f64) * 1000.0f64;
333 | let id = stats.identity();
334 | writeln!(
335 | f,
336 | "{}{},{},{},{},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6},{:.6}",
337 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""),
338 | bin_type,
339 | bin,
340 | stats.num_reads,
341 | stats.num_bases(),
342 | id,
343 | concordance_qv(id, id != 1.0),
344 | per_kbp(stats.matches),
345 | per_kbp(stats.mismatches),
346 | per_kbp(stats.non_hp_ins),
347 | per_kbp(stats.non_hp_del),
348 | per_kbp(stats.hp_ins),
349 | per_kbp(stats.hp_del),
350 | )?;
351 | }
352 | }
353 | Ok(())
354 | }
355 | }
356 |
357 | pub struct QualScoreSummary {
358 | name_column: Option,
359 | feature_qual: FxHashMap,
360 | }
361 |
362 | impl QualScoreSummary {
363 | pub fn new(mut name_column: Option) -> Self {
364 | if let Some(ref mut name) = name_column {
365 | name.push(',');
366 | }
367 | let mut feature_qual = FxHashMap::default();
368 | feature_qual.insert("all_alignments".to_owned(), QualScoreStats::default());
369 | Self {
370 | name_column,
371 | feature_qual,
372 | }
373 | }
374 |
375 | pub fn update(&mut self, aln_stats: &AlnStats) {
376 | if aln_stats.supplementary {
377 | return;
378 | }
379 |
380 | self.feature_qual
381 | .get_mut("all_alignments")
382 | .unwrap()
383 | .assign_add(&aln_stats.q_score_stats);
384 |
385 | for (&k, v) in &aln_stats.feature_stats {
386 | self.feature_qual
387 | .entry(k.to_owned())
388 | .or_insert_with(|| QualScoreStats::default())
389 | .assign_add(&v.q_score_stats);
390 | }
391 | }
392 | }
393 |
394 | impl fmt::Display for QualScoreSummary {
395 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
396 | writeln!(
397 | f,
398 | "{}feature,qual_score,empirical_qv",
399 | if self.name_column.is_some() {
400 | "name,"
401 | } else {
402 | ""
403 | }
404 | )?;
405 | let mut v = self.feature_qual.iter().collect::>();
406 | v.sort_by_key(|x| x.0);
407 | for (feature, stats) in v.into_iter() {
408 | for (i, qv) in stats.empirical_qv().into_iter() {
409 | writeln!(
410 | f,
411 | "{}{},{},{:.2}",
412 | self.name_column.as_ref().map(|n| n.as_str()).unwrap_or(""),
413 | feature.trim(),
414 | i,
415 | qv
416 | )?;
417 | }
418 | }
419 | Ok(())
420 | }
421 | }
422 |
--------------------------------------------------------------------------------