├── .github
├── CODEOWNERS
├── logos
│ └── fulcrumgenomics.svg
└── workflows
│ └── build_and_test.yml
├── .gitignore
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── ci
└── check.sh
├── comparison-data
├── plot-comparison.R
├── read-counts.png
├── read-counts.txt
├── runtimes.png
└── runtimes.txt
├── metadata
├── tko_essential_genes.txt
└── tko_nonessential_genes.txt
├── rustfmt.toml
└── src
├── commands
├── command.rs
├── count.rs
└── mod.rs
├── guide.rs
└── main.rs
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @tfenne
2 |
--------------------------------------------------------------------------------
/.github/logos/fulcrumgenomics.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.github/workflows/build_and_test.yml:
--------------------------------------------------------------------------------
1 | name: Check
2 |
3 | on: [push, pull_request]
4 |
5 | env:
6 | CARGO_TERM_COLOR: always
7 |
8 | jobs:
9 | check:
10 | name: Check
11 | runs-on: ubuntu-24.04
12 | steps:
13 | - name: Checkout sources
14 | uses: actions/checkout@v2
15 |
16 | - name: Install stable toolchain
17 | uses: actions-rs/toolchain@v1
18 | with:
19 | profile: minimal
20 | toolchain: stable
21 | override: true
22 |
23 | - name: Cache dependencies
24 | uses: Swatinem/rust-cache@v1
25 |
26 | - name: Run cargo check
27 | uses: actions-rs/cargo@v1
28 | with:
29 | command: check
30 |
31 | lints:
32 | name: Lints
33 | runs-on: ubuntu-24.04
34 | steps:
35 | - name: Checkout sources
36 | uses: actions/checkout@v2
37 |
38 | - name: Install stable toolchain
39 | uses: actions-rs/toolchain@v1
40 | with:
41 | profile: minimal
42 | toolchain: stable
43 | override: true
44 | components: rustfmt, clippy
45 |
46 | - name: Cache dependencies
47 | uses: Swatinem/rust-cache@v1
48 |
49 | - name: Run cargo fmt
50 | uses: actions-rs/cargo@v1
51 | with:
52 | command: fmt
53 | args: --all -- --check
54 |
55 | - name: Run cargo clippy
56 | uses: actions-rs/cargo@v1
57 | with:
58 | command: clippy
59 | args: -- -D warnings
60 |
61 | test:
62 | name: Test Suite
63 | runs-on: ${{ matrix.os }}
64 | strategy:
65 | matrix:
66 | os: [ubuntu-latest, macOS-latest]
67 | steps:
68 | - name: Checkout sources
69 | uses: actions/checkout@v2
70 |
71 | - name: Install stable toolchain
72 | uses: actions-rs/toolchain@v1
73 | with:
74 | profile: minimal
75 | toolchain: stable
76 | override: true
77 |
78 | - name: Cache dependencies
79 | uses: Swatinem/rust-cache@v1
80 |
81 | - name: Run tests
82 | run: cargo test --verbose
83 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | .idea
3 | .DS_Store
4 |
--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
1 | # This file is automatically @generated by Cargo.
2 | # It is not intended for manual editing.
3 | version = 3
4 |
5 | [[package]]
6 | name = "adler"
7 | version = "1.0.2"
8 | source = "registry+https://github.com/rust-lang/crates.io-index"
9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
10 |
11 | [[package]]
12 | name = "ahash"
13 | version = "0.7.6"
14 | source = "registry+https://github.com/rust-lang/crates.io-index"
15 | checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
16 | dependencies = [
17 | "getrandom",
18 | "once_cell",
19 | "version_check",
20 | ]
21 |
22 | [[package]]
23 | name = "aho-corasick"
24 | version = "0.7.18"
25 | source = "registry+https://github.com/rust-lang/crates.io-index"
26 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
27 | dependencies = [
28 | "memchr",
29 | ]
30 |
31 | [[package]]
32 | name = "anyhow"
33 | version = "1.0.52"
34 | source = "registry+https://github.com/rust-lang/crates.io-index"
35 | checksum = "84450d0b4a8bd1ba4144ce8ce718fbc5d071358b1e5384bace6536b3d1f2d5b3"
36 |
37 | [[package]]
38 | name = "atty"
39 | version = "0.2.14"
40 | source = "registry+https://github.com/rust-lang/crates.io-index"
41 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
42 | dependencies = [
43 | "hermit-abi",
44 | "libc",
45 | "winapi",
46 | ]
47 |
48 | [[package]]
49 | name = "autocfg"
50 | version = "1.0.1"
51 | source = "registry+https://github.com/rust-lang/crates.io-index"
52 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
53 |
54 | [[package]]
55 | name = "bitflags"
56 | version = "1.3.2"
57 | source = "registry+https://github.com/rust-lang/crates.io-index"
58 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
59 |
60 | [[package]]
61 | name = "bstr"
62 | version = "0.2.17"
63 | source = "registry+https://github.com/rust-lang/crates.io-index"
64 | checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
65 | dependencies = [
66 | "lazy_static",
67 | "memchr",
68 | "regex-automata",
69 | "serde",
70 | ]
71 |
72 | [[package]]
73 | name = "cc"
74 | version = "1.0.72"
75 | source = "registry+https://github.com/rust-lang/crates.io-index"
76 | checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee"
77 |
78 | [[package]]
79 | name = "cfg-if"
80 | version = "1.0.0"
81 | source = "registry+https://github.com/rust-lang/crates.io-index"
82 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
83 |
84 | [[package]]
85 | name = "clap"
86 | version = "3.0.0-rc.9"
87 | source = "registry+https://github.com/rust-lang/crates.io-index"
88 | checksum = "c7843ae7a539bef687e018bf9edf7e87728024b29d02b0f8409726be8880ae1a"
89 | dependencies = [
90 | "atty",
91 | "bitflags",
92 | "clap_derive",
93 | "indexmap",
94 | "lazy_static",
95 | "os_str_bytes",
96 | "strsim",
97 | "termcolor",
98 | "textwrap",
99 | ]
100 |
101 | [[package]]
102 | name = "clap_derive"
103 | version = "3.0.0-rc.9"
104 | source = "registry+https://github.com/rust-lang/crates.io-index"
105 | checksum = "cae3cc2f259ea636871f5da15b0ac033f1821d7a5506c3d1bfbdde201f14c803"
106 | dependencies = [
107 | "heck",
108 | "proc-macro-error",
109 | "proc-macro2",
110 | "quote",
111 | "syn",
112 | ]
113 |
114 | [[package]]
115 | name = "crc32fast"
116 | version = "1.3.0"
117 | source = "registry+https://github.com/rust-lang/crates.io-index"
118 | checksum = "738c290dfaea84fc1ca15ad9c168d083b05a714e1efddd8edaab678dc28d2836"
119 | dependencies = [
120 | "cfg-if",
121 | ]
122 |
123 | [[package]]
124 | name = "csv"
125 | version = "1.1.6"
126 | source = "registry+https://github.com/rust-lang/crates.io-index"
127 | checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
128 | dependencies = [
129 | "bstr",
130 | "csv-core",
131 | "itoa",
132 | "ryu",
133 | "serde",
134 | ]
135 |
136 | [[package]]
137 | name = "csv-core"
138 | version = "0.1.10"
139 | source = "registry+https://github.com/rust-lang/crates.io-index"
140 | checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
141 | dependencies = [
142 | "memchr",
143 | ]
144 |
145 | [[package]]
146 | name = "either"
147 | version = "1.6.1"
148 | source = "registry+https://github.com/rust-lang/crates.io-index"
149 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457"
150 |
151 | [[package]]
152 | name = "enum_dispatch"
153 | version = "0.3.7"
154 | source = "registry+https://github.com/rust-lang/crates.io-index"
155 | checksum = "bd53b3fde38a39a06b2e66dc282f3e86191e53bd04cc499929c15742beae3df8"
156 | dependencies = [
157 | "once_cell",
158 | "proc-macro2",
159 | "quote",
160 | "syn",
161 | ]
162 |
163 | [[package]]
164 | name = "env_logger"
165 | version = "0.8.4"
166 | source = "registry+https://github.com/rust-lang/crates.io-index"
167 | checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3"
168 | dependencies = [
169 | "atty",
170 | "humantime",
171 | "log",
172 | "regex",
173 | "termcolor",
174 | ]
175 |
176 | [[package]]
177 | name = "fastq"
178 | version = "0.6.0"
179 | source = "registry+https://github.com/rust-lang/crates.io-index"
180 | checksum = "c0dc54743d8fa10c176c4be22ccc6da3cc2b7f8b1b1b5a7fa17f4cbb94d3f29c"
181 | dependencies = [
182 | "flate2",
183 | "lz4",
184 | "memchr",
185 | ]
186 |
187 | [[package]]
188 | name = "fgoxide"
189 | version = "0.1.3"
190 | source = "registry+https://github.com/rust-lang/crates.io-index"
191 | checksum = "d571c2c4fb6b56ada5b136196eb40aa4b4e22ae7ca2efb7eb2f8d45e6c13c297"
192 | dependencies = [
193 | "csv",
194 | "flate2",
195 | "serde",
196 | "thiserror",
197 | ]
198 |
199 | [[package]]
200 | name = "flate2"
201 | version = "1.0.22"
202 | source = "registry+https://github.com/rust-lang/crates.io-index"
203 | checksum = "1e6988e897c1c9c485f43b47a529cef42fde0547f9d8d41a7062518f1d8fc53f"
204 | dependencies = [
205 | "cfg-if",
206 | "crc32fast",
207 | "libc",
208 | "libz-sys",
209 | "miniz_oxide",
210 | ]
211 |
212 | [[package]]
213 | name = "getrandom"
214 | version = "0.2.3"
215 | source = "registry+https://github.com/rust-lang/crates.io-index"
216 | checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753"
217 | dependencies = [
218 | "cfg-if",
219 | "libc",
220 | "wasi",
221 | ]
222 |
223 | [[package]]
224 | name = "guide-counter"
225 | version = "0.1.3"
226 | dependencies = [
227 | "ahash",
228 | "anyhow",
229 | "clap",
230 | "csv",
231 | "enum_dispatch",
232 | "env_logger",
233 | "fastq",
234 | "fgoxide",
235 | "flate2",
236 | "itertools",
237 | "log",
238 | "mimalloc",
239 | "regex",
240 | "serde",
241 | "tempfile",
242 | ]
243 |
244 | [[package]]
245 | name = "hashbrown"
246 | version = "0.11.2"
247 | source = "registry+https://github.com/rust-lang/crates.io-index"
248 | checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
249 |
250 | [[package]]
251 | name = "heck"
252 | version = "0.3.3"
253 | source = "registry+https://github.com/rust-lang/crates.io-index"
254 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
255 | dependencies = [
256 | "unicode-segmentation",
257 | ]
258 |
259 | [[package]]
260 | name = "hermit-abi"
261 | version = "0.1.19"
262 | source = "registry+https://github.com/rust-lang/crates.io-index"
263 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
264 | dependencies = [
265 | "libc",
266 | ]
267 |
268 | [[package]]
269 | name = "humantime"
270 | version = "2.1.0"
271 | source = "registry+https://github.com/rust-lang/crates.io-index"
272 | checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
273 |
274 | [[package]]
275 | name = "indexmap"
276 | version = "1.7.0"
277 | source = "registry+https://github.com/rust-lang/crates.io-index"
278 | checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5"
279 | dependencies = [
280 | "autocfg",
281 | "hashbrown",
282 | ]
283 |
284 | [[package]]
285 | name = "itertools"
286 | version = "0.10.3"
287 | source = "registry+https://github.com/rust-lang/crates.io-index"
288 | checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3"
289 | dependencies = [
290 | "either",
291 | ]
292 |
293 | [[package]]
294 | name = "itoa"
295 | version = "0.4.8"
296 | source = "registry+https://github.com/rust-lang/crates.io-index"
297 | checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
298 |
299 | [[package]]
300 | name = "lazy_static"
301 | version = "1.4.0"
302 | source = "registry+https://github.com/rust-lang/crates.io-index"
303 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
304 |
305 | [[package]]
306 | name = "libc"
307 | version = "0.2.112"
308 | source = "registry+https://github.com/rust-lang/crates.io-index"
309 | checksum = "1b03d17f364a3a042d5e5d46b053bbbf82c92c9430c592dd4c064dc6ee997125"
310 |
311 | [[package]]
312 | name = "libmimalloc-sys"
313 | version = "0.1.23"
314 | source = "registry+https://github.com/rust-lang/crates.io-index"
315 | checksum = "9636c194f9db483f4d0adf2f99a65011a99f904bd222bbd67fb4df4f37863c30"
316 | dependencies = [
317 | "cc",
318 | ]
319 |
320 | [[package]]
321 | name = "libz-sys"
322 | version = "1.1.3"
323 | source = "registry+https://github.com/rust-lang/crates.io-index"
324 | checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66"
325 | dependencies = [
326 | "cc",
327 | "pkg-config",
328 | "vcpkg",
329 | ]
330 |
331 | [[package]]
332 | name = "log"
333 | version = "0.4.14"
334 | source = "registry+https://github.com/rust-lang/crates.io-index"
335 | checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710"
336 | dependencies = [
337 | "cfg-if",
338 | ]
339 |
340 | [[package]]
341 | name = "lz4"
342 | version = "1.23.2"
343 | source = "registry+https://github.com/rust-lang/crates.io-index"
344 | checksum = "aac20ed6991e01bf6a2e68cc73df2b389707403662a8ba89f68511fb340f724c"
345 | dependencies = [
346 | "libc",
347 | "lz4-sys",
348 | ]
349 |
350 | [[package]]
351 | name = "lz4-sys"
352 | version = "1.9.2"
353 | source = "registry+https://github.com/rust-lang/crates.io-index"
354 | checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae"
355 | dependencies = [
356 | "cc",
357 | "libc",
358 | ]
359 |
360 | [[package]]
361 | name = "memchr"
362 | version = "2.4.1"
363 | source = "registry+https://github.com/rust-lang/crates.io-index"
364 | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a"
365 |
366 | [[package]]
367 | name = "mimalloc"
368 | version = "0.1.27"
369 | source = "registry+https://github.com/rust-lang/crates.io-index"
370 | checksum = "cf5f78c1d9892fb5677a8b2f543f967ab891ac0f71feecd961435b74f877283a"
371 | dependencies = [
372 | "libmimalloc-sys",
373 | ]
374 |
375 | [[package]]
376 | name = "miniz_oxide"
377 | version = "0.4.4"
378 | source = "registry+https://github.com/rust-lang/crates.io-index"
379 | checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b"
380 | dependencies = [
381 | "adler",
382 | "autocfg",
383 | ]
384 |
385 | [[package]]
386 | name = "once_cell"
387 | version = "1.9.0"
388 | source = "registry+https://github.com/rust-lang/crates.io-index"
389 | checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5"
390 |
391 | [[package]]
392 | name = "os_str_bytes"
393 | version = "6.0.0"
394 | source = "registry+https://github.com/rust-lang/crates.io-index"
395 | checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64"
396 | dependencies = [
397 | "memchr",
398 | ]
399 |
400 | [[package]]
401 | name = "pkg-config"
402 | version = "0.3.24"
403 | source = "registry+https://github.com/rust-lang/crates.io-index"
404 | checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe"
405 |
406 | [[package]]
407 | name = "ppv-lite86"
408 | version = "0.2.15"
409 | source = "registry+https://github.com/rust-lang/crates.io-index"
410 | checksum = "ed0cfbc8191465bed66e1718596ee0b0b35d5ee1f41c5df2189d0fe8bde535ba"
411 |
412 | [[package]]
413 | name = "proc-macro-error"
414 | version = "1.0.4"
415 | source = "registry+https://github.com/rust-lang/crates.io-index"
416 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
417 | dependencies = [
418 | "proc-macro-error-attr",
419 | "proc-macro2",
420 | "quote",
421 | "syn",
422 | "version_check",
423 | ]
424 |
425 | [[package]]
426 | name = "proc-macro-error-attr"
427 | version = "1.0.4"
428 | source = "registry+https://github.com/rust-lang/crates.io-index"
429 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
430 | dependencies = [
431 | "proc-macro2",
432 | "quote",
433 | "version_check",
434 | ]
435 |
436 | [[package]]
437 | name = "proc-macro2"
438 | version = "1.0.36"
439 | source = "registry+https://github.com/rust-lang/crates.io-index"
440 | checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029"
441 | dependencies = [
442 | "unicode-xid",
443 | ]
444 |
445 | [[package]]
446 | name = "quote"
447 | version = "1.0.14"
448 | source = "registry+https://github.com/rust-lang/crates.io-index"
449 | checksum = "47aa80447ce4daf1717500037052af176af5d38cc3e571d9ec1c7353fc10c87d"
450 | dependencies = [
451 | "proc-macro2",
452 | ]
453 |
454 | [[package]]
455 | name = "rand"
456 | version = "0.8.4"
457 | source = "registry+https://github.com/rust-lang/crates.io-index"
458 | checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8"
459 | dependencies = [
460 | "libc",
461 | "rand_chacha",
462 | "rand_core",
463 | "rand_hc",
464 | ]
465 |
466 | [[package]]
467 | name = "rand_chacha"
468 | version = "0.3.1"
469 | source = "registry+https://github.com/rust-lang/crates.io-index"
470 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
471 | dependencies = [
472 | "ppv-lite86",
473 | "rand_core",
474 | ]
475 |
476 | [[package]]
477 | name = "rand_core"
478 | version = "0.6.3"
479 | source = "registry+https://github.com/rust-lang/crates.io-index"
480 | checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
481 | dependencies = [
482 | "getrandom",
483 | ]
484 |
485 | [[package]]
486 | name = "rand_hc"
487 | version = "0.3.1"
488 | source = "registry+https://github.com/rust-lang/crates.io-index"
489 | checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7"
490 | dependencies = [
491 | "rand_core",
492 | ]
493 |
494 | [[package]]
495 | name = "redox_syscall"
496 | version = "0.2.10"
497 | source = "registry+https://github.com/rust-lang/crates.io-index"
498 | checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff"
499 | dependencies = [
500 | "bitflags",
501 | ]
502 |
503 | [[package]]
504 | name = "regex"
505 | version = "1.5.4"
506 | source = "registry+https://github.com/rust-lang/crates.io-index"
507 | checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
508 | dependencies = [
509 | "aho-corasick",
510 | "memchr",
511 | "regex-syntax",
512 | ]
513 |
514 | [[package]]
515 | name = "regex-automata"
516 | version = "0.1.10"
517 | source = "registry+https://github.com/rust-lang/crates.io-index"
518 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
519 |
520 | [[package]]
521 | name = "regex-syntax"
522 | version = "0.6.25"
523 | source = "registry+https://github.com/rust-lang/crates.io-index"
524 | checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
525 |
526 | [[package]]
527 | name = "remove_dir_all"
528 | version = "0.5.3"
529 | source = "registry+https://github.com/rust-lang/crates.io-index"
530 | checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
531 | dependencies = [
532 | "winapi",
533 | ]
534 |
535 | [[package]]
536 | name = "ryu"
537 | version = "1.0.9"
538 | source = "registry+https://github.com/rust-lang/crates.io-index"
539 | checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f"
540 |
541 | [[package]]
542 | name = "serde"
543 | version = "1.0.132"
544 | source = "registry+https://github.com/rust-lang/crates.io-index"
545 | checksum = "8b9875c23cf305cd1fd7eb77234cbb705f21ea6a72c637a5c6db5fe4b8e7f008"
546 | dependencies = [
547 | "serde_derive",
548 | ]
549 |
550 | [[package]]
551 | name = "serde_derive"
552 | version = "1.0.132"
553 | source = "registry+https://github.com/rust-lang/crates.io-index"
554 | checksum = "ecc0db5cb2556c0e558887d9bbdcf6ac4471e83ff66cf696e5419024d1606276"
555 | dependencies = [
556 | "proc-macro2",
557 | "quote",
558 | "syn",
559 | ]
560 |
561 | [[package]]
562 | name = "strsim"
563 | version = "0.10.0"
564 | source = "registry+https://github.com/rust-lang/crates.io-index"
565 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
566 |
567 | [[package]]
568 | name = "syn"
569 | version = "1.0.84"
570 | source = "registry+https://github.com/rust-lang/crates.io-index"
571 | checksum = "ecb2e6da8ee5eb9a61068762a32fa9619cc591ceb055b3687f4cd4051ec2e06b"
572 | dependencies = [
573 | "proc-macro2",
574 | "quote",
575 | "unicode-xid",
576 | ]
577 |
578 | [[package]]
579 | name = "tempfile"
580 | version = "3.2.0"
581 | source = "registry+https://github.com/rust-lang/crates.io-index"
582 | checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22"
583 | dependencies = [
584 | "cfg-if",
585 | "libc",
586 | "rand",
587 | "redox_syscall",
588 | "remove_dir_all",
589 | "winapi",
590 | ]
591 |
592 | [[package]]
593 | name = "termcolor"
594 | version = "1.1.2"
595 | source = "registry+https://github.com/rust-lang/crates.io-index"
596 | checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4"
597 | dependencies = [
598 | "winapi-util",
599 | ]
600 |
601 | [[package]]
602 | name = "textwrap"
603 | version = "0.14.2"
604 | source = "registry+https://github.com/rust-lang/crates.io-index"
605 | checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80"
606 |
607 | [[package]]
608 | name = "thiserror"
609 | version = "1.0.30"
610 | source = "registry+https://github.com/rust-lang/crates.io-index"
611 | checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417"
612 | dependencies = [
613 | "thiserror-impl",
614 | ]
615 |
616 | [[package]]
617 | name = "thiserror-impl"
618 | version = "1.0.30"
619 | source = "registry+https://github.com/rust-lang/crates.io-index"
620 | checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b"
621 | dependencies = [
622 | "proc-macro2",
623 | "quote",
624 | "syn",
625 | ]
626 |
627 | [[package]]
628 | name = "unicode-segmentation"
629 | version = "1.8.0"
630 | source = "registry+https://github.com/rust-lang/crates.io-index"
631 | checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b"
632 |
633 | [[package]]
634 | name = "unicode-xid"
635 | version = "0.2.2"
636 | source = "registry+https://github.com/rust-lang/crates.io-index"
637 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
638 |
639 | [[package]]
640 | name = "vcpkg"
641 | version = "0.2.15"
642 | source = "registry+https://github.com/rust-lang/crates.io-index"
643 | checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
644 |
645 | [[package]]
646 | name = "version_check"
647 | version = "0.9.3"
648 | source = "registry+https://github.com/rust-lang/crates.io-index"
649 | checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
650 |
651 | [[package]]
652 | name = "wasi"
653 | version = "0.10.2+wasi-snapshot-preview1"
654 | source = "registry+https://github.com/rust-lang/crates.io-index"
655 | checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
656 |
657 | [[package]]
658 | name = "winapi"
659 | version = "0.3.9"
660 | source = "registry+https://github.com/rust-lang/crates.io-index"
661 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
662 | dependencies = [
663 | "winapi-i686-pc-windows-gnu",
664 | "winapi-x86_64-pc-windows-gnu",
665 | ]
666 |
667 | [[package]]
668 | name = "winapi-i686-pc-windows-gnu"
669 | version = "0.4.0"
670 | source = "registry+https://github.com/rust-lang/crates.io-index"
671 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
672 |
673 | [[package]]
674 | name = "winapi-util"
675 | version = "0.1.5"
676 | source = "registry+https://github.com/rust-lang/crates.io-index"
677 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
678 | dependencies = [
679 | "winapi",
680 | ]
681 |
682 | [[package]]
683 | name = "winapi-x86_64-pc-windows-gnu"
684 | version = "0.4.0"
685 | source = "registry+https://github.com/rust-lang/crates.io-index"
686 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
687 |
--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "guide-counter"
3 | version = "0.1.3"
4 | edition = "2021"
5 | authors = ["Tim Fennell"]
6 | license = "MIT"
7 | repository = "https://github.com/fulcrumgenomics/guide-counter"
8 | homepage = "https://github.com/fulcrumgenomics/guide-counter"
9 | description = "Fast and accurate guide counting for CRISPR screens."
10 | readme = "README.md"
11 | categories = ["science"]
12 | keywords = ["bioinformatics", "genomic", "crispr"]
13 |
14 | [profile.release]
15 | lto = "fat"
16 | codegen-units = 1
17 |
18 | [dependencies]
19 | ahash = "0.7.6"
20 | anyhow = "1.0.48"
21 | clap = { version = "3.0.0-rc.9", features = ["derive"] }
22 | csv = "1.1.5"
23 | enum_dispatch = "0.3.7"
24 | env_logger = "0.8.2"
25 | fastq = "0.6.0"
26 | fgoxide = "0.1.3"
27 | flate2 = "1.0.22"
28 | itertools = "0.10.1"
29 | log = "0.4.14"
30 | mimalloc = { version = "0.1.17", default-features = false }
31 | regex = "1.5.4"
32 | serde = { version = "1.0.123", features = ["derive"] }
33 |
34 | [dev-dependencies]
35 | tempfile = "3.2.0"
36 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2021 Fulcrum Genomics LLC
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # guide-counter
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | A better, faster way to count guides in CRISPR screens.
13 |
14 |
15 |
16 |
17 |
18 | [Visit us at Fulcrum Genomics](https://www.fulcrumgenomics.com) to learn more about how we can power your Bioinformatics with guide-counter and beyond.
19 |
20 |
21 |
22 |
23 | ## Overview
24 |
25 | `guide-counter` is a tool for processing FASTQ files from CRISPR screen experiments to generate a matrix of per-sample guide counts. It can be used as a faster, more accurate, drop-in replacement for `mageck count`. By default, `guide-counter` will look for guide sequences in the reads with 0 or 1 mismatches vs. the expected guides, but it can also be run in exact matching mode.
26 |
27 | ## Why `guide-counter`?
28 |
29 | If you have any experience analyzing CRISPR screens you've almost certainly tried [`mageck`][mageck-link]. It's widely used, highly cited and generally works well. Surprisingly though, `mageck count` is both rather slow _and_ misses counting a non-trivial amount of the data.
30 |
31 | As an example, we ran data from the [Sanson et al][sanson-link] paper through both tools. The dataset consists of:
32 |
33 | |Sample|Reads|Gzipped FASTQ Size|
34 | |------|-----|------------------|
35 | |Plasmid|9,821,128|377M|
36 | |RepA|76,471,324|2.3G|
37 | |RepB|85,301,059|2.5G|
38 | |RepC|75,356,900|2.2G|
39 |
40 | The following plot shows the amount of data recovered per sample by each of three different analyses:
41 |
42 | 
43 |
44 | And the following plot shows the runtime for each of the three analyses performed using a single CPU core/thread on an Intel Core i9 powered MacBook Pro laptop:
45 |
46 | 
47 |
48 | ## Installation
49 |
50 | Installation can be done using conda:
51 |
52 | ```
53 | conda install -c bioconda guide-counter
54 | ```
55 |
56 | or with `cargo` if installed:
57 |
58 | ```
59 | cargo install guide-counter
60 | ```
61 | ## Example Workflow
62 |
63 | The following shows an example of running `guide-counter` followed by `mageck test` on data from the [Sanson et al. 2018 paper][sanson-link]:
64 |
65 | ```
66 | guide-counter count \
67 | --input plasmid.fq.gz RepA.fq.gz RepB.fq.gz RepC.fq.gz \
68 | --control-pattern control \
69 | --essential-genes metadata/training_essentials.txt \
70 | --nonessential-genes metadata/training_nonessential.txt \
71 | --library metadata/broadgpp-brunello-library-corrected.txt.gz \
72 | --output sanson
73 |
74 | mageck test \
75 | --count-table sanson.counts.txt \
76 | --control-id plasmid \
77 | --treatment-id RepA,RepB,RepC \
78 | --norm-method median \
79 | --output-prefix sanson.test
80 |
81 | ```
82 |
83 | ## Inputs
84 |
85 | The full usage for `guide-counter count` is reproduced below; this section describes a few of the key inputs in more detail:
86 |
87 | |Input Option|Required|Description|
88 | |------------|--------|-----------|
89 | |`--input`|Yes|FASTQ files, one per sample. Files may be gzipped or uncompressed.|
90 | |`--samples`|No|Names for the samples, matched positionally to the FASTQs. If not provided then the input file names minus any `.[fq|fastq][.gz]` suffixes are used instead.|
91 | |`--essential-genes`|No|An optional file of known essential genes. May be gzipped or uncompressed. May be either just gene names, one per line, or tab-delimited with the gene in the first column. If given, guides will be labeled as essential for matching genes, and mean coverage of guides for essential genes computed.|
92 | |`--nonessential-genes`|No|An optional file of known nonessential genes. May be gzipped or uncompressed. May be either just gene names, one per line, or tab-delimited with the gene in the first column. If given, guides will be labeled as nonessential for matching genes, and mean coverage of guides for nonessential genes computed.|
93 | |`--control-guides`|No|An optional file of guide IDs for control guides. May be gzipped or uncompressed. May be either just guide IDs, one per line, or tab-delimited data with the guide ID in the first column. If given, matching guides will be labeled as controls, and mean coverage of control guides computed. May be used alone _or_ in conjunction with `--control-pattern`.|
94 | |`--control-pattern`|No|An optional regular expression which is applied (case insensitive) to _both_ guide IDs and gene names, and when a match is found, guides are labeled as controls. For example `--control-pattern control` works well for many human libraries.|
95 |
96 | ## Outputs
97 |
98 | The following output files are generated:
99 |
100 | 1. `{output}.counts.txt` - a standard count matrix with columns for the guide ID and gene, then one column per sample with raw/unnormalized guide counts.
101 | 2. `{output}.extended-counts.txt` - an extended version of the counts matrix which includes a `guide_type` column which will have one of `[Essential, Nonessential, Control, Other]` per guide as determined based on the gene lists and control information provided.
102 | 3. `{output}.stats.txt` - a file of computed statistics, one row per input sample/FASTQ.
103 |
104 | The columns in the stats file are:
105 |
106 | |Column|Description|
107 | |------|-----------|
108 | |file|The path to the input FASTQ file used to generate the stats.|
109 | |label|The label or sample name given to the sample.|
110 | |total_guides|The total number of guides in the guide library (not sample dependent).|
111 | |total_reads|The total number of reads in the input FASTQ file.|
112 | |mapped_reads|The number of reads that could be mapped to a guide.|
113 | |frac_mapped|The fraction of reads (0-1) that could be mapped to a guide.|
114 | |mean_reads_per_guide|The mean number of reads mapped to each guide in the library.|
115 | |mean_reads_essential|The mean number of reads mapped to guides for essential genes.|
116 | |mean_reads_nonessential|The mean number of reads mapped to guides for nonessential genes.|
117 | |mean_reads_control|The mean number of reads mapped to control guides.|
118 | |mean_reads_other|The mean number of reads mapped to other guides (guides not flagged as essential, nonessential or control).|
119 | |zero_read_guides|The number of guides in the library that had zero reads mapped to them.|
120 |
121 |
122 | ## Usage
123 |
124 | Usage for `guide-counter count`:
125 |
126 | ```
127 | guide-counter-count
128 |
129 | Counts the guides observed in a CRISPR screen, starting from one or more FASTQs. FASTQs are one per
130 | sample and currently only single-end FASTQ inputs are supported.
131 |
132 | A set of sample IDs may be provided using `--samples id1 id2 ..`. If provided it must have the same
133 | number of values as input FASTQs. If not provided the FASTQ names are used minus any fastq/fq/gz
134 | suffixes.
135 |
136 | Automatically determines the range of valid offsets within the sequencing reads where the guide
137 | sequences are located, independently for each FASTQ input. The first `offset-sample-size` reads
138 | from each FASTQ are examined to determine the offsets at which guides are found. When processing the
139 | full FASTQ, checks only those offsets that accounted for at least `offset-min-fraction` of the first
140 | `offset-sample-size` reads.
141 |
142 | Matching by default allows for one mismatch (and no indels) between the read sub-sequence and the
143 | expected guide sequences. Exact matching may be enabled by specifying the `--exact-match` option.
144 |
145 | Two output files are generated. The first is named `{output}.counts.txt` and contains columns for
146 | the guide id, the gene targeted by the guide and one count column per input FASTQ with raw/un-
147 | normalized counts. The second is named `{output}.stats.txt` and contains basic QC statistics per
148 | input FASTQ on the matching process.
149 |
150 | USAGE:
151 | guide-counter count [OPTIONS] --input ... --library --output
152 |
153 | OPTIONS:
154 | -c, --control-guides
155 | Optional path to file with list control guide IDs. IDs should appear one per line and
156 | are case sensitive
157 |
158 | -C, --control-pattern
159 | Optional regular expression pattern used to ID control guides. Pattern is matched, case
160 | insensitive, to guide IDs and Gene names
161 |
162 | -e, --essential-genes
163 | Optional path to file with list of essential genes. Gene names should appear one per
164 | line and are case sensitive
165 |
166 | -f, --offset-min-fraction
167 | After sampling the first `offset_sample_size` reads, use offsets that account for at least this fraction of the sampled reads
168 |
169 | [default: 0.005]
170 |
171 | -h, --help
172 | Print help information
173 |
174 | -i, --input ...
175 | Input fastq file(s)
176 |
177 | -l, --library
178 | Path to the guide library metadata. May be a tab- or comma-separated file. Must have a
179 | header line, and the first three fields must be (in order): i) the ID of the guide, ii)
180 | the base sequence of the guide, iii) the gene the guide targets
181 |
182 | -n, --nonessential-genes
183 | Optional path to file with list of nonessential genes. Gene names should appear one per
184 | line and are case sensitive
185 |
186 | -N, --offset-sample-size
187 | The number of reads to be examined when determining the offsets at which guides may be
188 | found in the input reads
189 |
190 | [default: 100000]
191 |
192 | -o, --output
193 | Path prefix to use for all output files
194 |
195 | -s, --samples ...
196 | Sample names corresponding to the input fastqs. If provided must be the same length as
197 | input. Otherwise will be inferred from input file names
198 |
199 | -x, --exact-match
200 | Perform exact matching only, don't allow mismatches between reads and guides
201 | ```
202 |
203 | [sanson-link]: https://pubmed.ncbi.nlm.nih.gov/30575746/
204 | [mageck-link]: https://pubmed.ncbi.nlm.nih.gov/25476604/
205 |
--------------------------------------------------------------------------------
/ci/check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | function banner() {
4 |     # Prints all arguments as one line framed by two rules, with a blank line before and after.
5 |     local rule="================================================================================"
6 |     printf '\n%s\n' "$rule"
7 |     echo $*  # unquoted: arguments are re-split, so runs of whitespace collapse to single spaces
8 |     printf '%s\n\n' "$rule"
9 | }
10 |
11 | #####################################################################
12 | # Takes two parameters, a "name" and a "command".
13 | # Runs the command (word-split into program + arguments), prints whether it
14 | # succeeded or failed, and appends the names of failed steps to $failures.
15 | #####################################################################
16 | function run() {
17 |     local name=$1
18 |     local cmd=$2
19 |     local exit_code  # kept local so the status does not leak into the caller's scope
20 |     banner "Running $name [$cmd]"
21 |     set +e  # a failing step must be recorded, not abort the whole script
22 |     $cmd    # left unquoted so the command string splits into program and arguments
23 |     exit_code=$?
24 |     set -e  # note: errexit stays enabled for the rest of the script after this point
25 |
26 |     if [[ $exit_code -eq 0 ]]; then
27 |         echo "Passed $name: [$cmd]"
28 |     else
29 |         echo "Failed $name: [$cmd]"
30 |         if [ -z "$failures" ]; then
31 |             failures="$name"  # first failure: no separator and no leading space
32 |         else
33 |             failures="$failures, $name"
34 |         fi
35 |     fi
36 | }
37 |
38 | parent=$(cd "$(dirname "$0")" && pwd -P)         # directory containing this script (quoted: paths may contain spaces)
39 | repo_root=$(cd "$(dirname "$0")/.." && pwd -P)   # one level above the script directory
40 | cd "$repo_root" || exit 1  # run every cargo step from the repository root, whatever the caller's cwd
41 | run "Formatting" "cargo fmt"
42 | run "Clippy" "cargo clippy -- -D warnings"
43 | run "Unit Tests" "cargo test"
44 | # Summarize: exit non-zero when run() recorded any failed step in $failures.
45 | if [ -z "$failures" ]; then
46 |     banner "Checks Passed"
47 | else
48 |     banner "Checks Failed with failures in: $failures"
49 |     exit 1
50 | fi
51 |
--------------------------------------------------------------------------------
/comparison-data/plot-comparison.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)  # library() stops immediately if ggplot2 is missing; require() would only warn
2 | # Input tables are read relative to the working directory, so run this from comparison-data/.
3 | counts <- read.table("read-counts.txt", sep="\t", header=TRUE)
4 | runtimes <- read.table("runtimes.txt", sep="\t", header=TRUE)
5 | # Runtime per tool as a bar chart; print() is explicit so the plot is also drawn under source().
6 | png(filename="runtimes.png", width=800, height=550, res=100)
7 | print(ggplot(runtimes) +
8 |     aes(x=tool, y=runtime_seconds, fill=tool) +
9 |     geom_col() +
10 |     scale_y_continuous(minor_breaks=seq(0, 1700, 100)) +
11 |     scale_fill_brewer(palette = "Paired") +
12 |     theme(plot.title = element_text(hjust = 0.5)) +
13 |     theme(plot.subtitle = element_text(hjust = 0.5)) +
14 |     labs(x="Tool", y="Runtime (seconds)",
15 |          title="Runtime of Counting Guides in 4 FASTQs from Sanson et al.",
16 |          subtitle="(Smaller bars are better)"))
17 | dev.off()
18 | # Matched reads per sample, one dodged bar per analysis; written to read-counts.png.
19 | png(filename="read-counts.png", width=800, height=550, res=100)
20 | print(ggplot(counts) +
21 |     aes(fill=analysis, x=sample, y=matched_reads) +
22 |     scale_fill_brewer(palette = "Paired") +
23 |     theme(plot.title = element_text(hjust = 0.5)) +
24 |     theme(plot.subtitle = element_text(hjust = 0.5)) +
25 |     geom_bar(position="dodge", stat="identity") +
26 |     labs(x="Sample", y="Reads Matched to Guides",
27 |          title="Matched Reads in 4 FASTQs from Sanson et al.",
28 |          subtitle="(Bigger bars are better)"))
29 | dev.off()
30 |
--------------------------------------------------------------------------------
/comparison-data/read-counts.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fulcrumgenomics/guide-counter/23f2e20e8f295a9ab9388c7d41ee5659c3d4b6df/comparison-data/read-counts.png
--------------------------------------------------------------------------------
/comparison-data/read-counts.txt:
--------------------------------------------------------------------------------
1 | analysis sample matched_reads
2 | guide-counter --exact plasmid 8758764
3 | guide-counter --exact RepA 64107160
4 | guide-counter --exact RepB 70347268
5 | guide-counter --exact RepC 63477791
6 | guide-counter plasmid 9318181
7 | guide-counter RepA 69491160
8 | guide-counter RepB 76248653
9 | guide-counter RepC 68900168
10 | mageck count plasmid 6716580
11 | mageck count RepA 49074849
12 | mageck count RepB 53814919
13 | mageck count RepC 48579881
14 |
--------------------------------------------------------------------------------
/comparison-data/runtimes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fulcrumgenomics/guide-counter/23f2e20e8f295a9ab9388c7d41ee5659c3d4b6df/comparison-data/runtimes.png
--------------------------------------------------------------------------------
/comparison-data/runtimes.txt:
--------------------------------------------------------------------------------
1 | tool runtime_seconds
2 | guide-counter 207
3 | guide-counter --exact 115
4 | mageck count 1658
5 |
--------------------------------------------------------------------------------
/metadata/tko_essential_genes.txt:
--------------------------------------------------------------------------------
1 | gene HGNC_ID ENTREZ_ID
2 | ACTL6A HGNC:24124 86
3 | ACTR6 HGNC:24025 64431
4 | ALYREF HGNC:19071 10189
5 | ANAPC4 HGNC:19990 29945
6 | ANAPC5 HGNC:15713 51433
7 | AP2S1 HGNC:565 1175
8 | AQR HGNC:29513 9716
9 | ARCN1 HGNC:649 372
10 | ARL5B HGNC:23052 221079
11 | ATP6V0D1 HGNC:13724 9114
12 | ATXN7 HGNC:10560 6314
13 | BIRC6 HGNC:13516 57448
14 | BOP1 HGNC:15519 23246
15 | BPTF HGNC:3581 2186
16 | BRIX1 HGNC:24170 55299
17 | C11orf24 HGNC:1174 53838
18 | C12orf66 HGNC:26517 144577
19 | C14orf166 HGNC:23169 51637
20 | C19orf43 HGNC:28424 79002
21 | CAP1 HGNC:20040 10487
22 | CCNG1 HGNC:1592 900
23 | CCNK HGNC:1596 8812
24 | CCT3 HGNC:1616 7203
25 | CCT4 HGNC:1617 10575
26 | CCT5 HGNC:1618 22948
27 | CCT6A HGNC:1620 908
28 | CCT7 HGNC:1622 10574
29 | CCT8 HGNC:1623 10694
30 | CDC40 HGNC:17350 51362
31 | CDC5L HGNC:1743 988
32 | CDK17 HGNC:8750 5128
33 | CHD4 HGNC:1919 1108
34 | CHMP2A HGNC:30216 27243
35 | CLTC HGNC:2092 1213
36 | COPA HGNC:2230 1314
37 | COPB1 HGNC:2231 1315
38 | COPB2 HGNC:2232 9276
39 | COPE HGNC:2234 11316
40 | COPS2 HGNC:30747 9318
41 | COPS4 HGNC:16702 51138
42 | COPS6 HGNC:21749 10980
43 | COPS8 HGNC:24335 10920
44 | COPZ1 HGNC:2243 22818
45 | COX10 HGNC:2260 1352
46 | CPSF2 HGNC:2325 53981
47 | CPSF3 HGNC:2326 51692
48 | CSE1L HGNC:2431 1434
49 | CTDNEP1 HGNC:19085 23399
50 | DARS HGNC:2678 1615
51 | DDB1 HGNC:2717 1642
52 | DDX18 HGNC:2741 8886
53 | DDX21 HGNC:2744 9188
54 | DDX46 HGNC:18681 9879
55 | DDX49 HGNC:18684 54555
56 | DDX51 HGNC:20082 317781
57 | DDX54 HGNC:20084 79039
58 | DKC1 HGNC:2890 1736
59 | DLG5 HGNC:2904 9231
60 | DNAJC1 HGNC:20090 64215
61 | DNM2 HGNC:2974 1785
62 | DNTTIP2 HGNC:24013 30836
63 | DYNC1H1 HGNC:2961 1778
64 | DYNC1I2 HGNC:2964 1781
65 | EEF2 HGNC:3214 1938
66 | EFTUD2 HGNC:30858 9343
67 | EIF1AX HGNC:3250 1964
68 | EIF2B2 HGNC:3258 8892
69 | EIF2B3 HGNC:3259 8891
70 | EIF2B4 HGNC:3260 8891
71 | EIF2S2 HGNC:3266 8894
72 | EIF3A HGNC:6159 3692
73 | EIF3B HGNC:3280 8662
74 | EIF3C HGNC:3279 8663
75 | EIF3D HGNC:3278 8664
76 | EIF3E HGNC:3277 3646
77 | EIF3F HGNC:3275 8665
78 | EIF3G HGNC:3274 8666
79 | EIF3H HGNC:3273 8667
80 | EIF3I HGNC:3272 8668
81 | EIF4A3 HGNC:18683 9775
82 | EIF4E HGNC:3287 1977
83 | EIF5B HGNC:30793 9669
84 | EIF6 HGNC:6159 3692
85 | ERAP1 HGNC:18173 51752
86 | ERH HGNC:3447 2079
87 | ETF1 HGNC:3477 2107
88 | EXOSC10 HGNC:9138 5394
89 | FDPS HGNC:3631 2224
90 | FLAD1 HGNC:24671 80308
91 | FTSJ3 HGNC:17136 117246
92 | GABPA HGNC:4071 2551
93 | GAPDH HGNC:4141 2597
94 | GAR1 HGNC:14264 54433
95 | GEMIN2 HGNC:10884 8487
96 | GLYR1 HGNC:24434 84656
97 | GMPS HGNC:4378 8833
98 | GNL3 HGNC:29931 26354
99 | GTF3C4 HGNC:4667 9329
100 | HAUS1 HGNC:25174 115106
101 | HAUS7 HGNC:32979 55559
102 | HEATR1 HGNC:25517 55127
103 | HNRNPC HGNC:5035 3183
104 | HNRNPK HGNC:5044 3190
105 | HNRNPL HGNC:5045 3191
106 | HNRNPM HGNC:5046 4670
107 | HNRNPU HGNC:5048 3192
108 | HPS5 HGNC:17022 11234
109 | HSPA9 HGNC:5244 3313
110 | HSPB11 HGNC:25019 51668
111 | HSPE1 HGNC:5269 3336
112 | IARS2 HGNC:29685 55699
113 | ICK HGNC:21219 22858
114 | IMP4 HGNC:30856 92856
115 | IMPAD1 HGNC:26019 54928
116 | INCENP HGNC:6058 3619
117 | INTS9 HGNC:25592 55756
118 | IQGAP1 HGNC:6110 8826
119 | ISCU HGNC:29882 23479
120 | JMJD1C HGNC:12313 221037
121 | KARS HGNC:6215 3735
122 | KPNB1 HGNC:6400 3837
123 | KRAS HGNC:6407 3845
124 | LARS2 HGNC:17095 23395
125 | LIAS HGNC:16429 11019
126 | LSM4 HGNC:17259 25804
127 | LSM6 HGNC:17017 25804
128 | LUC7L3 HGNC:24309 51747
129 | LZIC HGNC:17497 84328
130 | MAPK6 HGNC:6879 5597
131 | MAPKAP1 HGNC:18752 79109
132 | MED12 HGNC:11957 9968
133 | MED14 HGNC:2370 9282
134 | MED30 HGNC:23032 90390
135 | MMAB HGNC:19331 326625
136 | MRPS31 HGNC:16632 10240
137 | MSANTD3 HGNC:23370 91283
138 | MTFMT HGNC:29666 123263
139 | MTX1 HGNC:7504 4580
140 | NAA10 HGNC:18704 8260
141 | NAA38 HGNC:28212 84316
142 | NACA HGNC:7629 4666
143 | NAPA HGNC:7641 8775
144 | NAPG HGNC:7642 8774
145 | NCBP1 HGNC:7658 4686
146 | NEDD8 HGNC:7732 4738
147 | NHP2L1 HGNC:7819 4809
148 | NPTN HGNC:17867 27020
149 | NT5C2 HGNC:8022 22978
150 | NUDT21 HGNC:13870 11051
151 | NUP133 HGNC:18016 55746
152 | NUP205 HGNC:18658 23165
153 | NUP54 HGNC:17359 53371
154 | NUP93 HGNC:28958 9688
155 | NUP98 HGNC:8068 4928
156 | NUPL1 HGNC:20261 9818
157 | NUTF2 HGNC:13722 10204
158 | NXF1 HGNC:8071 10482
159 | OPA1 HGNC:8140 4976
160 | PABPN1 HGNC:8565 8106
161 | PAFAH1B1 HGNC:8574 5048
162 | PAPOLA HGNC:14981 10914
163 | PARN HGNC:8609 5073
164 | PCBP1 HGNC:8647 5093
165 | PCBP2 HGNC:8648 5094
166 | PDE8A HGNC:8793 5151
167 | PDHA1 HGNC:8806 5160
168 | PFDN2 HGNC:8867 5202
169 | PFN1 HGNC:8881 5216
170 | PHB HGNC:8912 5245
171 | PHB2 HGNC:30306 11331
172 | PHF16 HGNC:22982 9767
173 | PHF5A HGNC:18000 84844
174 | PHYH HGNC:8940 5264
175 | PLAGL2 HGNC:9047 5326
176 | POLR1B HGNC:20454 84172
177 | POLR2D HGNC:9191 5433
178 | POLR2F HGNC:9193 5435
179 | POLR2I HGNC:9196 5438
180 | POP5 HGNC:17689 51367
181 | PPIE HGNC:9258 10450
182 | PPP2R1A HGNC:9302 5518
183 | PPP2R2D HGNC:23732 55844
184 | PPP5C HGNC:9322 5536
185 | PREB HGNC:9356 10113
186 | PRKAA1 HGNC:9376 5562
187 | PRKAB2 HGNC:9379 5565
188 | PRKDC HGNC:9413 5591
189 | PRPF18 HGNC:17351 8559
190 | PRPF19 HGNC:17896 27339
191 | PRPF3 HGNC:17348 9129
192 | PRPF31 HGNC:15446 26121
193 | PRPF38A HGNC:25930 84950
194 | PRPF8 HGNC:17340 10594
195 | PRUNE HGNC:13420 58497
196 | PSMA1 HGNC:9530 5682
197 | PSMA2 HGNC:9531 5683
198 | PSMA3 HGNC:9532 5684
199 | PSMA4 HGNC:9533 5685
200 | PSMA6 HGNC:9535 5687
201 | PSMB2 HGNC:9539 5690
202 | PSMB3 HGNC:9540 5691
203 | PSMB5 HGNC:9542 5693
204 | PSMB6 HGNC:9543 5694
205 | PSMC1 HGNC:9547 5700
206 | PSMC2 HGNC:9548 5701
207 | PSMC3 HGNC:9549 5702
208 | PSMC4 HGNC:9551 5704
209 | PSMD1 HGNC:9554 5707
210 | PSMD11 HGNC:9556 5717
211 | PSMD12 HGNC:9557 5718
212 | PSMD13 HGNC:9558 5719
213 | PSMD2 HGNC:9559 5708
214 | PSMD4 HGNC:9561 5710
215 | PSMD7 HGNC:9565 5713
216 | QARS HGNC:9751 5859
217 | RAN HGNC:9846 5901
218 | RANBP2 HGNC:9848 5903
219 | RBM14 HGNC:14219 10432
220 | RBM17 HGNC:16944 84991
221 | RBM22 HGNC:25503 55696
222 | RBM25 HGNC:23244 58517
223 | RBMX HGNC:9910 27316
224 | RILPL2 HGNC:28787 196383
225 | RNF139 HGNC:17023 11236
226 | RPA1 HGNC:10289 6117
227 | RPA2 HGNC:10290 6118
228 | RPL10 HGNC:10298 6134
229 | RPL10A HGNC:10299 4736
230 | RPL11 HGNC:10301 6135
231 | RPL12 HGNC:10302 6136
232 | RPL13 HGNC:10303 6137
233 | RPL13A HGNC:10304 23521
234 | RPL14 HGNC:10305 9045
235 | RPL18 HGNC:10310 6141
236 | RPL18A HGNC:10311 6142
237 | RPL19 HGNC:10312 6143
238 | RPL23A HGNC:10317 6147
239 | RPL24 HGNC:10325 6152
240 | RPL26 HGNC:10327 6154
241 | RPL27 HGNC:10328 6155
242 | RPL3 HGNC:10332 6122
243 | RPL30 HGNC:10333 6156
244 | RPL31 HGNC:10334 6160
245 | RPL32 HGNC:10336 6161
246 | RPL34 HGNC:10340 6164
247 | RPL35 HGNC:10344 11224
248 | RPL35A HGNC:10345 6165
249 | RPL36 HGNC:13631 25873
250 | RPL37 HGNC:10347 6167
251 | RPL37A HGNC:10348 6168
252 | RPL38 HGNC:10349 6169
253 | RPL4 HGNC:10353 6124
254 | RPL5 HGNC:10360 6125
255 | RPL6 HGNC:10362 6128
256 | RPL7 HGNC:10363 6129
257 | RPL7A HGNC:10364 6130
258 | RPLP0 HGNC:10371 6175
259 | RPLP2 HGNC:10377 6181
260 | RPN2 HGNC:10382 6185
261 | RPP38 HGNC:30329 10557
262 | RPP40 HGNC:20992 10799
263 | RPS11 HGNC:10384 6205
264 | RPS12 HGNC:10385 6206
265 | RPS13 HGNC:10386 6207
266 | RPS14 HGNC:10387 6208
267 | RPS15 HGNC:10388 6209
268 | RPS15A HGNC:10389 6210
269 | RPS17 HGNC:10397 6218
270 | RPS18 HGNC:10401 6222
271 | RPS19 HGNC:10402 6223
272 | RPS20 HGNC:10405 6224
273 | RPS24 HGNC:10411 6229
274 | RPS25 HGNC:10413 6230
275 | RPS26 HGNC:10414 6231
276 | RPS27A HGNC:10417 6233
277 | RPS3 HGNC:10420 6188
278 | RPS3A HGNC:10421 6189
279 | RPS4X HGNC:10424 6191
280 | RPS5 HGNC:10426 6193
281 | RPS6 HGNC:10429 6194
282 | RPS7 HGNC:10440 6201
283 | RPS8 HGNC:10441 6202
284 | RPS9 HGNC:10442 6203
285 | RPSA HGNC:6502 3921
286 | RRM1 HGNC:10451 6240
287 | RRM2B HGNC:17296 50484
288 | RTF1 HGNC:28996 23168
289 | RUVBL1 HGNC:10474 8607
290 | RUVBL2 HGNC:10475 10856
291 | SARS HGNC:10537 6301
292 | SDAD1 HGNC:25537 55153
293 | SF3A1 HGNC:10765 10291
294 | SF3A2 HGNC:10766 8175
295 | SF3B1 HGNC:10768 23451
296 | SF3B2 HGNC:10769 10992
297 | SF3B3 HGNC:10770 23450
298 | SF3B4 HGNC:10771 10262
299 | SF3B5 HGNC:21083 83443
300 | SFPQ HGNC:10774 6421
301 | SHFM1 HGNC:10845 7979
302 | SKIV2L2 HGNC:18734 23517
303 | SLC25A43 HGNC:30557 203427
304 | SMC1A HGNC:11111 8243
305 | SMC3 HGNC:2468 9126
306 | SNRNP200 HGNC:30859 23020
307 | SNRNP27 HGNC:30240 11017
308 | SNRPB HGNC:11153 6628
309 | SNRPC HGNC:11157 6631
310 | SNRPD1 HGNC:11159 6633
311 | SNRPD2 HGNC:11159 6633
312 | SNW1 HGNC:16696 22938
313 | SON HGNC:11183 6651
314 | SRBD1 HGNC:25521 55133
315 | SRCAP HGNC:16974 10847
316 | SRFBP1 HGNC:26333 153443
317 | SRRT HGNC:24101 51593
318 | SRSF1 HGNC:10780 6426
319 | SRSF3 HGNC:10785 6428
320 | SSR1 HGNC:11323 6745
321 | STRN HGNC:11424 6801
322 | SUPT5H HGNC:11469 6829
323 | SUPT6H HGNC:11470 6830
324 | SUPV3L1 HGNC:11471 6832
325 | TBC1D25 HGNC:8092 4943
326 | TBCE HGNC:11582 6905
327 | TCERG1 HGNC:15630 10915
328 | TDRD3 HGNC:20612 81550
329 | TFIP11 HGNC:17165 24144
330 | THOP1 HGNC:11793 7064
331 | TIMM10 HGNC:11814 26519
332 | TOMM40 HGNC:18001 10452
333 | TPR HGNC:12017 7175
334 | TRRAP HGNC:12347 8295
335 | TSEN2 HGNC:28422 80746
336 | TSTA3 HGNC:12390 7264
337 | TUBA1B HGNC:18809 10376
338 | TUBB HGNC:20778 203068
339 | TUBGCP2 HGNC:18599 10844
340 | U2AF1 HGNC:12453 7307
341 | U2AF2 HGNC:23156 11338
342 | UBA1 HGNC:12469 7317
343 | UBB HGNC:12463 7314
344 | UBL4A HGNC:12505 8266
345 | USP39 HGNC:20071 10713
346 | VCP HGNC:12666 7415
347 | VTA1 HGNC:20954 51534
348 | WDR12 HGNC:14098 55759
349 | WDR33 HGNC:25651 55339
350 | WDR60 HGNC:21862 55112
351 | WDR61 HGNC:30300 80349
352 | XAB2 HGNC:14089 56949
353 | XIAP HGNC:592 331
354 | XPO1 HGNC:12825 7514
355 | YY1 HGNC:12856 7528
356 | ZBTB48 HGNC:4930 3104
357 | ZC3H13 HGNC:20368 23091
358 | ZC3H18 HGNC:25091 124245
359 | ZFR HGNC:17277 51663
360 | ZNF160 HGNC:12948 90338
361 | ZNF207 HGNC:12998 7756
362 |
--------------------------------------------------------------------------------
/metadata/tko_nonessential_genes.txt:
--------------------------------------------------------------------------------
1 | gene HGNC_ID ENTREZ_ID
2 | ABCG8 HGNC:13887 64241
3 | ACCSL HGNC:34391 390110
4 | ACTL7A HGNC:161 10881
5 | ACTL7B HGNC:162 10880
6 | ACTL9 HGNC:28494 284382
7 | ACTRT1 HGNC:24027 139741
8 | ADAD1 HGNC:30713 132612
9 | ADAM18 HGNC:196 8749
10 | ADAM2 HGNC:198 2515
11 | ADAM20 HGNC:199 8748
12 | ADAM30 HGNC:208 11085
13 | ADH7 HGNC:256 131
14 | AFM HGNC:316 173
15 | AICDA HGNC:13203 57379
16 | AIPL1 HGNC:359 23746
17 | ALPI HGNC:437 248
18 | ALPPL2 HGNC:441 251
19 | ALX3 HGNC:449 257
20 | AMELX HGNC:461 265
21 | ANKRD30A HGNC:17234 91074
22 | ANKRD60 HGNC:16217 140731
23 | ANTXRL HGNC:27277 195977
24 | APOA4 HGNC:602 337
25 | APOBEC1 HGNC:604 339
26 | APOF HGNC:615 319
27 | AQP12A HGNC:19941 375318
28 | AQP8 HGNC:642 343
29 | ARGFX HGNC:30146 503582
30 | ART1 HGNC:723 417
31 | ASB17 HGNC:19769 127247
32 | ASIC5 HGNC:17537 51802
33 | ASZ1 HGNC:1350 136991
34 | ATOH1 HGNC:797 474
35 | ATP4B HGNC:820 496
36 | ATP6V1G3 HGNC:18265 127124
37 | AWAT1 HGNC:23252 158833
38 | AWAT2 HGNC:23251 158835
39 | B3GNT6 HGNC:24141 192134
40 | BANF2 HGNC:16172 140836
41 | BARHL1 HGNC:953 56751
42 | BEND2 HGNC:28509 139105
43 | BHLHE23 HGNC:16093 128408
44 | BIRC8 HGNC:14878 112401
45 | BMP10 HGNC:20869 27302
46 | BMP15 HGNC:1068 9210
47 | BPIFA1 HGNC:15749 51297
48 | BPIFA3 HGNC:16204 128861
49 | BPIFB3 HGNC:16178 359710
50 | BPIFB6 HGNC:16504 128859
51 | BPIFC HGNC:16503 254240
52 | BPY2 HGNC:13508 9083
53 | BRDT HGNC:1105 676
54 | BSND HGNC:16512 7809
55 | C10orf113 HGNC:31447 387638
56 | C10orf120 HGNC:25707 399814
57 | C10orf53 HGNC:27421 282966
58 | C11orf40 HGNC:23986 143501
59 | C12orf40 HGNC:26846 283461
60 | C14orf183 HGNC:27285 196913
61 | C15orf55 HGNC:29919 256646
62 | C16orf78 HGNC:28479 123970
63 | C17orf102 HGNC:34412 400591
64 | C17orf78 HGNC:26831 284099
65 | C18orf26 HGNC:26808 284254
66 | C19orf45 HGNC:24745 374877
67 | C1orf146 HGNC:24032 388649
68 | C20orf173 HGNC:16166 140873
69 | C20orf203 HGNC:26592 284805
70 | C20orf79 HGNC:16211 140856
71 | C2orf57 HGNC:28563 165100
72 | C2orf61 HGNC:26850 285051
73 | C2orf71 HGNC:34383 388939
74 | C2orf83 HGNC:25344 56918
75 | C3orf30 HGNC:26553 152405
76 | C4orf40 HGNC:33193 401137
77 | C5orf20 HGNC:24459 140947
78 | C6orf10 HGNC:13922 10665
79 | C7orf66 HGNC:33712 154907
80 | C7orf71 HGNC:22364 285941
81 | C8A HGNC:1352 731
82 | C8B HGNC:1353 732
83 | C8orf17 HGNC:17737 100507249
84 | C8orf86 HGNC:33774 389649
85 | C9orf53 HGNC:23831 51198
86 | CABP2 HGNC:1385 51475
87 | CABP5 HGNC:13714 56344
88 | CABS1 HGNC:30710 85438
89 | CACNG2 HGNC:1406 10369
90 | CACNG3 HGNC:1407 10368
91 | CACNG5 HGNC:1409 27091
92 | CATSPER4 HGNC:23220 378807
93 | CCDC155 HGNC:26520 147872
94 | CCDC172 HGNC:30524 374355
95 | CCDC83 HGNC:28535 220047
96 | CCKAR HGNC:1570 886
97 | CCL1 HGNC:10609 6346
98 | CCT8L2 HGNC:15553 150160
99 | CD200R1L HGNC:24665 344807
100 | CDCP2 HGNC:27297 200008
101 | CDX2 HGNC:1806 1045
102 | CDX4 HGNC:1808 1046
103 | CDY1 HGNC:1809 9085
104 | CDY1B HGNC:23920 253175
105 | CDY2A HGNC:1810 9426
106 | CDY2B HGNC:23921 203611
107 | CEACAM7 HGNC:1819 1087
108 | CELA2A HGNC:24609 63036
109 | CELA3A HGNC:15944 10136
110 | CELA3B HGNC:15945 23436
111 | CER1 HGNC:1862 9350
112 | CETN1 HGNC:1866 1068
113 | CFHR2 HGNC:4890 3080
114 | CFHR5 HGNC:24668 81494
115 | CHAT HGNC:1912 1103
116 | CHRNA6 HGNC:15963 8973
117 | CHRNB3 HGNC:1963 1142
118 | CLCA1 HGNC:2015 1179
119 | CLDN17 HGNC:2038 26285
120 | CLEC2A HGNC:24191 387836
121 | CLEC3A HGNC:2052 10143
122 | CLEC6A HGNC:14556 93978
123 | CLRN1 HGNC:12605 7401
124 | CNBD1 HGNC:26663 168975
125 | CNGA2 HGNC:2149 1260
126 | CNGB3 HGNC:2153 54714
127 | CNPY1 HGNC:27786 285888
128 | CNTNAP5 HGNC:18748 129684
129 | COL20A1 HGNC:14670 57642
130 | COX7B2 HGNC:24381 170712
131 | CPXCR1 HGNC:2332 53336
132 | CRNN HGNC:1230 49860
133 | CRX HGNC:2383 1406
134 | CRYGB HGNC:2409 1419
135 | CSH1 HGNC:2440 1442
136 | CSHL1 HGNC:2442 1444
137 | CSN2 HGNC:2447 1447
138 | CSN3 HGNC:2446 1448
139 | CST11 HGNC:15959 140880
140 | CST4 HGNC:2476 1472
141 | CST5 HGNC:2477 1473
142 | CST8 HGNC:2480 10047
143 | CST9 HGNC:13261 128822
144 | CST9L HGNC:16233 128821
145 | CSTL1 HGNC:15958 128817
146 | CT45A2 HGNC:28400 728911
147 | CT45A4 HGNC:33269 441520
148 | CT45A5 HGNC:33270 441521
149 | CT47A11 HGNC:27397 255313
150 | CTCFL HGNC:16234 140690
151 | CTRB1 HGNC:2521 1504
152 | CXorf1 HGNC:2562 9142
153 | CXorf66 HGNC:33743 347487
154 | CYLC2 HGNC:2583 1539
155 | CYP11B1 HGNC:2591 1539
156 | CYP11B2 HGNC:2592 1585
157 | CYP26C1 HGNC:20577 340665
158 | CYP2A13 HGNC:2608 1553
159 | CYP2C19 HGNC:2621 1557
160 | CYP4A22 HGNC:20575 284541
161 | CYP4F8 HGNC:2648 11283
162 | CYP7A1 HGNC:2651 1581
163 | DAZ1 HGNC:2682 1617
164 | DAZ2 HGNC:15964 57055
165 | DAZ3 HGNC:15965 57054
166 | DAZ4 HGNC:15966 57135
167 | DAZL HGNC:2685 1618
168 | DCAF4L2 HGNC:26657 138009
169 | DCAF8L1 HGNC:31810 139425
170 | DDI1 HGNC:18961 414301
171 | DDX4 HGNC:18700 54514
172 | DEFA5 HGNC:2764 1670
173 | DEFA6 HGNC:2765 1671
174 | DEFB103B HGNC:31702 55894
175 | DEFB104A HGNC:18115 140596
176 | DEFB106A HGNC:18088 245909
177 | DEFB107A HGNC:18086 245910
178 | DEFB118 HGNC:16196 117285
179 | DEFB123 HGNC:18103 245936
180 | DEFB126 HGNC:15900 81623
181 | DEFB127 HGNC:16206 140850
182 | DEFB129 HGNC:16218 140881
183 | DGAT2L6 HGNC:23250 347516
184 | DGKK HGNC:32395 139189
185 | DIRC1 HGNC:15760 116093
186 | DMP1 HGNC:2932 1758
187 | DMRT1 HGNC:2934 1761
188 | DMRTB1 HGNC:13913 63948
189 | DMRTC2 HGNC:13911 63946
190 | DPCR1 HGNC:21666 135656
191 | DPRX HGNC:32166 503834
192 | DRD3 HGNC:3024 1814
193 | DRGX HGNC:21536 644168
194 | DSCR4 HGNC:3045 10281
195 | DSG4 HGNC:21307 147409
196 | DSPP HGNC:3054 1834
197 | DTX2 HGNC:15973 113878
198 | DUSP21 HGNC:20476 63904
199 | DUX4 HGNC:50800 100288687
200 | DUX4L7 HGNC:37266 653543
201 | DUXA HGNC:32179 503835
202 | EFCAB3 HGNC:26379 146779
203 | EGR4 HGNC:3241 1961
204 | ENTHD1 HGNC:26352 150350
205 | ESX1 HGNC:14865 80712
206 | EVX1 HGNC:3506 2128
207 | F13B HGNC:3534 2165
208 | F9 HGNC:3551 2158
209 | FABP2 HGNC:3556 2169
210 | FAM106A HGNC:25682 80039
211 | FAM47A HGNC:29962 158724
212 | FAM47B HGNC:26659 170062
213 | FAM47C HGNC:25301 442444
214 | FAM71A HGNC:26541 149647
215 | FAM71B HGNC:28397 153745
216 | FAM71C HGNC:28594 196472
217 | FAM75A7 HGNC:32007 26165
218 | FAM75D1 HGNC:37283 389763
219 | FCRL4 HGNC:18507 83417
220 | FEZF1 HGNC:22788 389549
221 | FEZF2 HGNC:13506 55079
222 | FFAR1 HGNC:4498 2864
223 | FGF3 HGNC:3681 2248
224 | FGF4 HGNC:3682 2249
225 | FGF6 HGNC:3684 2251
226 | FIGLA HGNC:24669 344018
227 | FLG2 HGNC:33276 388698
228 | FMR1NB HGNC:26372 158521
229 | FNDC7 HGNC:26668 163479
230 | FNDC9 HGNC:33547 408263
231 | FOXB1 HGNC:3799 27023
232 | FOXB2 HGNC:23315 442425
233 | FOXD4L3 HGNC:18523 286380
234 | FOXD4L4 HGNC:23762 349334
235 | FOXE3 HGNC:3808 2301
236 | FOXN1 HGNC:12765 8456
237 | FOXR1 HGNC:29980 283150
238 | FRG2 HGNC:19136 448831
239 | FRMD7 HGNC:8079 90167
240 | FSCB HGNC:20494 84075
241 | FUT5 HGNC:4016 2527
242 | FUT9 HGNC:4020 10690
243 | G6PC HGNC:4056 2538
244 | GABRA1 HGNC:4075 2554
245 | GABRA6 HGNC:4080 2559
246 | GAGE1 HGNC:4098 2543
247 | GAGE2C HGNC:31958 2574
248 | GALNTL5 HGNC:21725 168391
249 | GALR1 HGNC:4132 2587
250 | GALR3 HGNC:4134 8484
251 | GBP7 HGNC:29606 388646
252 | GCG HGNC:4191 2641
253 | GCM2 HGNC:4198 9247
254 | GDF2 HGNC:4217 2658
255 | GFRA4 HGNC:13821 64096
256 | GFRAL HGNC:32789 389400
257 | GH2 HGNC:4262 2689
258 | GHRH HGNC:4265 2691
259 | GHSR HGNC:4267 2693
260 | GIF HGNC:4268 2694
261 | GJA10 HGNC:19155 81025
262 | GJA8 HGNC:4281 2703
263 | GK2 HGNC:4291 2712
264 | GKN2 HGNC:24588 200504
265 | GLRA1 HGNC:4326 2741
266 | GLRA2 HGNC:4327 2742
267 | GLT6D1 HGNC:23671 360203
268 | GML HGNC:4375 2765
269 | GOLGA6L2 HGNC:26695 283685
270 | GOT1L1 HGNC:28487 137362
271 | GPR101 HGNC:14963 83550
272 | GPR111 HGNC:18991 222611
273 | GPR119 HGNC:19060 139760
274 | GPR128 HGNC:19241 84873
275 | GPR139 HGNC:19995 124274
276 | GPR144 HGNC:18651 347088
277 | GPR148 HGNC:23623 344561
278 | GPR151 HGNC:23624 134391
279 | GPR152 HGNC:23622 390212
280 | GPR26 HGNC:4481 2849
281 | GPR31 HGNC:4486 2853
282 | GPR32 HGNC:4487 2854
283 | GPR45 HGNC:4503 11250
284 | GPR50 HGNC:4506 9248
285 | GPR52 HGNC:4508 9293
286 | GPR78 HGNC:4528 27201
287 | GPRC6A HGNC:18510 27201
288 | GPX5 HGNC:4557 2880
289 | GPX6 HGNC:4558 257202
290 | GRK1 HGNC:10013 6011
291 | GRM4 HGNC:4596 2914
292 | GRM5 HGNC:4597 2915
293 | GRM6 HGNC:4598 2916
294 | GSC2 HGNC:4613 2928
295 | GSTA5 HGNC:19662 221357
296 | GSX1 HGNC:20374 219409
297 | GSX2 HGNC:24959 170825
298 | GUCA2A HGNC:4682 2980
299 | GUCY2F HGNC:4691 2986
300 | H1FOO HGNC:18463 132243
301 | H2BFM HGNC:27867 286436
302 | H2BFWT HGNC:27252 158983
303 | HAO1 HGNC:4809 54363
304 | HCRTR2 HGNC:4849 3062
305 | HDGFL1 HGNC:21095 154150
306 | HHLA1 HGNC:4904 10086
307 | HIST1H2AA HGNC:18729 221613
308 | HIST1H2BA HGNC:18730 255626
309 | HIST1H4G HGNC:4792 8369
310 | HMX1 HGNC:5017 3166
311 | HOXB1 HGNC:5111 3211
312 | HOXD12 HGNC:5135 3238
313 | HRG HGNC:5181 3273
314 | HRH3 HGNC:5184 11255
315 | HSFY1 HGNC:18568 86614
316 | HSFY2 HGNC:23950 159119
317 | HTN3 HGNC:5284 3347
318 | HTR1A HGNC:5286 3350
319 | HTR2C HGNC:5295 3358
320 | HTR3C HGNC:24003 170572
321 | HTR3D HGNC:24004 200909
322 | HTR3E HGNC:24005 285242
323 | HTR5A HGNC:5300 3361
324 | HTR6 HGNC:5301 3362
325 | IAPP HGNC:5329 3375
326 | IFIT1B HGNC:23442 439996
327 | IFNA10 HGNC:5418 3446
328 | IFNA14 HGNC:5420 3448
329 | IFNA16 HGNC:5421 3449
330 | IFNA17 HGNC:5422 3451
331 | IFNA2 HGNC:5423 3440
332 | IFNA21 HGNC:5424 3452
333 | IFNA4 HGNC:5425 3441
334 | IFNA5 HGNC:5426 3442
335 | IFNA6 HGNC:5427 3443
336 | IFNA7 HGNC:5428 3444
337 | IFNA8 HGNC:5429 3445
338 | IFNB1 HGNC:5434 3456
339 | IFNK HGNC:21714 56832
340 | IFNW1 HGNC:5448 3467
341 | IL12B HGNC:5970 3593
342 | IL13 HGNC:5973 3596
343 | IL17A HGNC:5981 3605
344 | IL17F HGNC:16404 112744
345 | IL1F10 HGNC:15552 84639
346 | IL21 HGNC:6005 59067
347 | IL22 HGNC:14900 50616
348 | IL25 HGNC:13765 50616
349 | IL26 HGNC:17119 55801
350 | IL28A HGNC:18364 282616
351 | IL28B HGNC:18365 282617
352 | IL29 HGNC:18363 282618
353 | IL3 HGNC:6011 3562
354 | IL31 HGNC:19372 386653
355 | IL36A HGNC:15562 27179
356 | IL36B HGNC:15564 27177
357 | IL36RN HGNC:15561 26525
358 | IL9 HGNC:6029 3578
359 | INS HGNC:6081 3630
360 | INSL5 HGNC:6088 10022
361 | INSL6 HGNC:6089 11172
362 | INSM2 HGNC:17539 11172
363 | INSRR HGNC:6093 3645
364 | IQCF1 HGNC:28607 132141
365 | IRGC HGNC:28835 56269
366 | ISX HGNC:28084 91464
367 | ITIH6 HGNC:28907 347365
368 | IZUMO2 HGNC:28518 126123
369 | KCNA10 HGNC:6219 3744
370 | KCNB2 HGNC:6232 9312
371 | KCNG4 HGNC:19697 93107
372 | KCNK10 HGNC:6273 54207
373 | KCNK16 HGNC:14464 83795
374 | KCNK18 HGNC:19439 338567
375 | KCNV1 HGNC:18861 27012
376 | KHDC3L HGNC:33699 154288
377 | KIF2B HGNC:29443 84643
378 | KIR2DL1 HGNC:6329 3802
379 | KIR3DL3 HGNC:16312 115653
380 | KLK12 HGNC:6360 43849
381 | KLK9 HGNC:6370 284366
382 | KRT2 HGNC:6439 3849
383 | KRT25 HGNC:30839 147183
384 | KRT26 HGNC:30840 353288
385 | KRT28 HGNC:30842 162605
386 | KRT33A HGNC:6450 3883
387 | KRT35 HGNC:6453 3886
388 | KRT36 HGNC:6454 8689
389 | KRT37 HGNC:6455 8688
390 | KRT38 HGNC:6456 8687
391 | KRT40 HGNC:26707 125115
392 | KRT71 HGNC:28927 112802
393 | KRT73 HGNC:28928 319101
394 | KRT74 HGNC:28929 121391
395 | KRT75 HGNC:24431 9119
396 | KRT76 HGNC:24430 51350
397 | KRT77 HGNC:20411 374454
398 | KRT78 HGNC:28926 196374
399 | KRT82 HGNC:6459 3888
400 | KRT84 HGNC:6461 3890
401 | KRT85 HGNC:6462 3891
402 | KRT86 HGNC:6463 3892
403 | KRT9 HGNC:6447 3857
404 | KRTAP1-1 HGNC:16772 81851
405 | KRTAP10-1 HGNC:22966 386677
406 | KRTAP10-10 HGNC:22972 353333
407 | KRTAP10-11 HGNC:20528 386678
408 | KRTAP10-12 HGNC:20533 386685
409 | KRTAP10-2 HGNC:22967 386679
410 | KRTAP10-4 HGNC:20521 386672
411 | KRTAP10-5 HGNC:22969 386680
412 | KRTAP10-6 HGNC:20523 386674
413 | KRTAP10-7 HGNC:22970 386675
414 | KRTAP10-8 HGNC:20525 386681
415 | KRTAP10-9 HGNC:22971 386676
416 | KRTAP11-1 HGNC:18922 386676
417 | KRTAP13-1 HGNC:18924 140258
418 | KRTAP13-2 HGNC:18923 337959
419 | KRTAP13-3 HGNC:18925 337960
420 | KRTAP13-4 HGNC:18926 284827
421 | KRTAP15-1 HGNC:18927 254950
422 | KRTAP17-1 HGNC:18917 83902
423 | KRTAP19-3 HGNC:18938 337970
424 | KRTAP23-1 HGNC:18928 337963
425 | KRTAP26-1 HGNC:33760 388818
426 | KRTAP3-2 HGNC:16779 83897
427 | KRTAP4-11 HGNC:18911 653240
428 | KRTAP4-12 HGNC:16776 83755
429 | KRTAP4-2 HGNC:18900 85291
430 | KRTAP4-4 HGNC:16928 84616
431 | KRTAP4-7 HGNC:18898 100132476
432 | KRTAP5-2 HGNC:23597 440021
433 | KRTAP9-2 HGNC:16926 83899
434 | KRTAP9-3 HGNC:16927 83900
435 | KRTAP9-4 HGNC:18902 85280
436 | LALBA HGNC:6480 3906
437 | LBX1 HGNC:16960 10660
438 | LCN9 HGNC:17442 392399
439 | LCT HGNC:6530 3938
440 | LGALS13 HGNC:15449 29124
441 | LGALS14 HGNC:30054 56891
442 | LHFPL5 HGNC:21253 222662
443 | LHX3 HGNC:6595 8022
444 | LHX5 HGNC:14216 64211
445 | LIM2 HGNC:6610 3982
446 | LIN28A HGNC:15986 79727
447 | LIPM HGNC:23455 340654
448 | LOR HGNC:6663 4014
449 | LRIT1 HGNC:23404 26103
450 | LRIT2 HGNC:23443 340745
451 | LRRC10 HGNC:20264 376132
452 | LUZP4 HGNC:24971 51213
453 | LYZL1 HGNC:30502 84569
454 | LYZL2 HGNC:29613 119180
455 | LYZL6 HGNC:29614 57151
456 | MAGEA10 HGNC:6797 4109
457 | MAGEA11 HGNC:6798 4110
458 | MAGEB1 HGNC:6808 4112
459 | MAGEB10 HGNC:25377 139422
460 | MAGEB18 HGNC:28515 286514
461 | MAGEB3 HGNC:6810 4114
462 | MAGEB4 HGNC:6811 4115
463 | MAGEC3 HGNC:23798 139081
464 | MAS1 HGNC:6899 4142
465 | MAS1L HGNC:13961 116511
466 | MBD3L1 HGNC:15774 85509
467 | MBD3L2 HGNC:18532 125997
468 | MBL2 HGNC:6922 4153
469 | MC2R HGNC:6930 4158
470 | MC3R HGNC:6931 4159
471 | MC5R HGNC:6933 4161
472 | MEP1A HGNC:7015 4224
473 | MEP1B HGNC:7020 4225
474 | MEPE HGNC:13361 56955
475 | MFRP HGNC:18121 83552
476 | MMD2 HGNC:30133 221938
477 | MMP20 HGNC:7167 9313
478 | MMP21 HGNC:7170 8511
479 | MMP26 HGNC:14249 56547
480 | MMP27 HGNC:14250 64066
481 | MOGAT3 HGNC:23249 346606
482 | MORC1 HGNC:7198 27136
483 | MRGPRD HGNC:29626 116512
484 | MRGPRX1 HGNC:17962 259249
485 | MRGPRX2 HGNC:17983 117194
486 | MRGPRX4 HGNC:17617 117196
487 | MS4A10 HGNC:13368 341116
488 | MS4A13 HGNC:16674 503497
489 | MS4A5 HGNC:13374 64232
490 | MSGN1 HGNC:14907 343930
491 | MT1B HGNC:7394 4490
492 | MTNR1B HGNC:7464 4544
493 | MUC17 HGNC:16800 140453
494 | MUC7 HGNC:7518 4589
495 | MYBPC3 HGNC:7551 4607
496 | MYF5 HGNC:7565 4617
497 | NANOGNB HGNC:24958 360030
498 | NANOS2 HGNC:23292 339345
499 | NCR2 HGNC:6732 9436
500 | NDST4 HGNC:20779 64579
501 | NEUROD2 HGNC:7763 4761
502 | NEUROD4 HGNC:13802 58158
503 | NEUROD6 HGNC:13804 63974
504 | NEUROG1 HGNC:7764 4762
505 | NKX2-1 HGNC:11825 7080
506 | NKX2-2 HGNC:7835 4821
507 | NLRP4 HGNC:22943 147945
508 | NLRP5 HGNC:21269 126206
509 | NLRP8 HGNC:22940 126205
510 | NLRP9 HGNC:22941 338321
511 | NMS HGNC:32203 129521
512 | NOBOX HGNC:22448 135935
513 | NOTO HGNC:31839 344022
514 | NOX3 HGNC:7890 50508
515 | NPFFR1 HGNC:17425 64106
516 | NPHS2 HGNC:13394 7827
517 | NPSR1 HGNC:23631 387129
518 | NPVF HGNC:13782 64111
519 | NR2E1 HGNC:7973 7101
520 | NYX HGNC:8082 60506
521 | OC90 HGNC:8100 729330
522 | OLIG2 HGNC:9398 10215
523 | OLIG3 HGNC:18003 167826
524 | OPALIN HGNC:20707 93377
525 | OPN1LW HGNC:9936 5956
526 | OPN5 HGNC:19992 221391
527 | OR10A2 HGNC:8161 341276
528 | OR10A4 HGNC:15130 283297
529 | OR10A5 HGNC:15131 144124
530 | OR10H1 HGNC:8172 26539
531 | OR10H2 HGNC:8173 26538
532 | OR10H3 HGNC:8174 26532
533 | OR10J1 HGNC:8175 26476
534 | OR10R2 HGNC:14820 343406
535 | OR10S1 HGNC:14807 219873
536 | OR10X1 HGNC:14995 128367
537 | OR10Z1 HGNC:14996 128368
538 | OR11A1 HGNC:8176 26531
539 | OR12D2 HGNC:8178 26529
540 | OR12D3 HGNC:13963 81797
541 | OR13C3 HGNC:14704 138803
542 | OR13D1 HGNC:14695 286365
543 | OR14A16 HGNC:15022 284532
544 | OR1A1 HGNC:8179 8383
545 | OR1A2 HGNC:8180 26189
546 | OR1B1 HGNC:8181 347169
547 | OR1D2 HGNC:8183 4991
548 | OR1E1 HGNC:8189 8387
549 | OR1E2 HGNC:8190 8388
550 | OR1G1 HGNC:8204 8390
551 | OR1L6 HGNC:8218 392390
552 | OR1N2 HGNC:15111 138882
553 | OR1S1 HGNC:8227 219959
554 | OR1S2 HGNC:15141 219958
555 | OR2AK2 HGNC:19569 391191
556 | OR2AT4 HGNC:19620 341152
557 | OR2C1 HGNC:8242 4993
558 | OR2C3 HGNC:15005 81472
559 | OR2D2 HGNC:8244 120776
560 | OR2D3 HGNC:15146 120775
561 | OR2F1 HGNC:8246 26211
562 | OR2G2 HGNC:15007 81470
563 | OR2G3 HGNC:15008 81469
564 | OR2H1 HGNC:8252 26716
565 | OR2J2 HGNC:8260 26707
566 | OR2L3 HGNC:15009 391192
567 | OR2T1 HGNC:8277 26696
568 | OR2T10 HGNC:19573 127069
569 | OR2T12 HGNC:19592 127064
570 | OR2T2 HGNC:14725 401992
571 | OR2T27 HGNC:31252 403239
572 | OR2T33 HGNC:31255 391195
573 | OR2T4 HGNC:15016 127074
574 | OR2T5 HGNC:15017 401993
575 | OR2W1 HGNC:8281 26692
576 | OR3A1 HGNC:8282 4994
577 | OR3A2 HGNC:8283 4995
578 | OR3A3 HGNC:8284 8392
579 | OR4C11 HGNC:15167 219429
580 | OR4C3 HGNC:14697 256144
581 | OR4D1 HGNC:8293 26689
582 | OR4D10 HGNC:15173 390197
583 | OR4D11 HGNC:15174 219986
584 | OR4D9 HGNC:15178 390199
585 | OR4K17 HGNC:15355 390436
586 | OR51B6 HGNC:19600 390058
587 | OR51D1 HGNC:15193 390038
588 | OR51F2 HGNC:15197 119694
589 | OR51T1 HGNC:15205 401665
590 | OR51V1 HGNC:19597 283111
591 | OR52A1 HGNC:8318 23538
592 | OR52A5 HGNC:19580 390054
593 | OR52B2 HGNC:15207 255725
594 | OR52B6 HGNC:15211 340980
595 | OR52E8 HGNC:15217 390079
596 | OR52I2 HGNC:15221 143502
597 | OR52K2 HGNC:15223 119774
598 | OR52L1 HGNC:14785 338751
599 | OR52M1 HGNC:15225 119772
600 | OR52R1 HGNC:15235 119695
601 | OR52W1 HGNC:15239 120787
602 | OR56A1 HGNC:14781 120796
603 | OR56A4 HGNC:14791 120793
604 | OR56B1 HGNC:15245 387748
605 | OR5AU1 HGNC:15362 390445
606 | OR5C1 HGNC:8331 392391
607 | OR5I1 HGNC:8347 10798
608 | OR5M1 HGNC:8352 390168
609 | OR5M10 HGNC:15290 390167
610 | OR5P2 HGNC:14783 120065
611 | OR5P3 HGNC:14784 120066
612 | OR5R1 HGNC:14841 219479
613 | OR5T1 HGNC:14821 390155
614 | OR5T2 HGNC:15296 219464
615 | OR5T3 HGNC:15297 390154
616 | OR5V1 HGNC:13972 81696
617 | OR5W2 HGNC:15299 390148
618 | OR6A2 HGNC:15301 8590
619 | OR6K6 HGNC:15033 128371
620 | OR6S1 HGNC:15363 341799
621 | OR6V1 HGNC:15090 346517
622 | OR7A17 HGNC:8363 26333
623 | OR7C2 HGNC:8374 26658
624 | OR7D4 HGNC:8380 125958
625 | OR7G2 HGNC:8466 390882
626 | OR8A1 HGNC:8469 390275
627 | OR8B8 HGNC:8477 26493
628 | OR8G5 HGNC:19622 219865
629 | OR8U1 HGNC:19611 219417
630 | OR9Q2 HGNC:15328 219957
631 | OTOP1 HGNC:19656 133060
632 | OTOP3 HGNC:19658 347741
633 | OTOR HGNC:8517 56914
634 | OTP HGNC:8518 23440
635 | OTUD6A HGNC:32312 139562
636 | OTX2 HGNC:8522 5015
637 | PAGE3 HGNC:4110 139793
638 | PANX3 HGNC:20573 116337
639 | PASD1 HGNC:20686 139135
640 | PAX1 HGNC:8615 5075
641 | PAX4 HGNC:8618 5078
642 | PBOV1 HGNC:21079 59351
643 | PDCL2 HGNC:29524 132954
644 | PDE6H HGNC:8790 5149
645 | PDILT HGNC:27338 204474
646 | PDX1 HGNC:6107 3651
647 | PDYN HGNC:8820 5173
648 | PGK2 HGNC:8898 5232
649 | PGLYRP2 HGNC:30013 114770
650 | PGLYRP3 HGNC:30014 114771
651 | PIWIL1 HGNC:9007 9271
652 | PIWIL3 HGNC:18443 440822
653 | PKD1L3 HGNC:21716 342372
654 | PLA2G2E HGNC:13414 30814
655 | PLA2G2F HGNC:30040 64600
656 | PLA2G4E HGNC:24791 123745
657 | PLAC1L HGNC:26699 219990
658 | PNLIP HGNC:9155 5406
659 | PNLIPRP1 HGNC:9156 5407
660 | PNLIPRP2 HGNC:9157 5408
661 | PNPLA5 HGNC:24888 150379
662 | POM121L12 HGNC:25369 285877
663 | POTEA HGNC:33893 340441
664 | POTED HGNC:23822 317754
665 | POTEG HGNC:33896 404785
666 | POTEH HGNC:133 23784
667 | POU3F4 HGNC:9217 5456
668 | POU4F2 HGNC:9219 5458
669 | POU4F3 HGNC:9220 5459
670 | POU5F2 HGNC:26367 134187
671 | PPP3R2 HGNC:9318 5535
672 | PRAMEF1 HGNC:28840 65121
673 | PRAMEF19 HGNC:24908 645414
674 | PRAMEF2 HGNC:28841 65122
675 | PRAMEF3 HGNC:14087 401940
676 | PRAMEF4 HGNC:31971 400735
677 | PRAMEF7 HGNC:28415 441871
678 | PRB1 HGNC:9337 5542
679 | PRB4 HGNC:9340 5545
680 | PRDM13 HGNC:13998 59336
681 | PRDM14 HGNC:14001 63978
682 | PRDM7 HGNC:9351 11105
683 | PRDM9 HGNC:13994 56979
684 | PRG3 HGNC:9363 10394
685 | PRLH HGNC:17945 51052
686 | PRLHR HGNC:4464 2834
687 | PROP1 HGNC:9455 5626
688 | PRSS33 HGNC:30405 260429
689 | PRSS37 HGNC:29211 136242
690 | PRSS38 HGNC:29625 339501
691 | PRSS41 HGNC:30715 360226
692 | PRSS55 HGNC:30824 203074
693 | PRSS58 HGNC:39125 136541
694 | PRY2 HGNC:21504 442862
695 | PSKH2 HGNC:18997 85481
696 | PTF1A HGNC:23734 256297
697 | RAX HGNC:18662 30062
698 | RAX2 HGNC:18286 84839
699 | RBM46 HGNC:28401 166863
700 | RBMXL2 HGNC:17886 27288
701 | RBMY1A1 HGNC:9912 5940
702 | RBMY1B HGNC:23914 378948
703 | RBMY1D HGNC:23915 378949
704 | RBMY1E HGNC:23916 378950
705 | RBMY1F HGNC:23974 159163
706 | RBMY1J HGNC:23917 378951
707 | RBP3 HGNC:9921 5949
708 | RBPJL HGNC:13761 11317
709 | RD3 HGNC:19689 343035
710 | RDH8 HGNC:14423 50700
711 | REG3A HGNC:8601 5068
712 | RESP18 HGNC:33762 389075
713 | RETNLB HGNC:20388 84666
714 | REXO1L1 HGNC:24660 254958
715 | RFPL3 HGNC:9980 10738
716 | RFPL4B HGNC:33264 442247
717 | RFX6 HGNC:21478 222546
718 | RHO HGNC:10012 6010
719 | RHOXF2 HGNC:30011 84528
720 | RNASE10 HGNC:19275 338879
721 | RNASE11 HGNC:19269 122651
722 | RNASE12 HGNC:24211 493901
723 | RNASE13 HGNC:25285 440163
724 | RNASE8 HGNC:19277 122665
725 | RNASE9 HGNC:20673 390443
726 | RND2 HGNC:18315 8153
727 | RNF113B HGNC:17267 140432
728 | RNF17 HGNC:10060 56163
729 | RP1 HGNC:10263 6101
730 | RP1L1 HGNC:15946 94137
731 | RPE65 HGNC:10294 6121
732 | RPTN HGNC:26809 126638
733 | RS1 HGNC:10457 6247
734 | RTP1 HGNC:28580 132112
735 | RTP2 HGNC:32486 344892
736 | RXFP2 HGNC:17318 122042
737 | RXFP3 HGNC:24883 51289
738 | S100A7A HGNC:21657 338324
739 | S100G HGNC:1436 795
740 | SAGE1 HGNC:30369 55511
741 | SAMD7 HGNC:25394 344658
742 | SCGB1D1 HGNC:18395 10648
743 | SCN10A HGNC:10582 6336
744 | SCRT2 HGNC:15952 85508
745 | SDR9C7 HGNC:29958 121214
746 | SEC14L3 HGNC:18655 266629
747 | SEMG2 HGNC:10743 6407
748 | SEPT14 HGNC:33280 346288
749 | SERPINA12 HGNC:18359 145264
750 | SERPINA7 HGNC:11583 6906
751 | SERPINA9 HGNC:15995 327657
752 | SERPINB12 HGNC:14220 89777
753 | SHCBP1L HGNC:16788 81626
754 | SHOX HGNC:10853 6473
755 | SI HGNC:10856 6476
756 | SIGLECL1 HGNC:26856 284369
757 | SIX6 HGNC:10892 4990
758 | SLC10A2 HGNC:10906 6555
759 | SLC13A1 HGNC:10916 6561
760 | SLC17A2 HGNC:11019 6569
761 | SLC17A6 HGNC:16703 57084
762 | SLC18A3 HGNC:10936 6572
763 | SLC22A12 HGNC:17989 116085
764 | SLC22A13 HGNC:8494 9390
765 | SLC22A24 HGNC:28542 283238
766 | SLC22A25 HGNC:32935 387601
767 | SLC22A6 HGNC:10970 9356
768 | SLC22A8 HGNC:10972 9376
769 | SLC22A9 HGNC:16261 114571
770 | SLC25A2 HGNC:22921 83884
771 | SLC25A31 HGNC:25319 83447
772 | SLC2A2 HGNC:11006 6514
773 | SLC2A7 HGNC:13445 155184
774 | SLC32A1 HGNC:11018 140679
775 | SLC34A1 HGNC:11019 6569
776 | SLC36A3 HGNC:19659 285641
777 | SLC39A12 HGNC:20860 221074
778 | SLC6A18 HGNC:26441 348932
779 | SLC6A5 HGNC:11051 9152
780 | SLC6A7 HGNC:11054 6534
781 | SLC7A13 HGNC:23092 157724
782 | SLCO1B1 HGNC:10959 10599
783 | SLCO6A1 HGNC:23613 133482
784 | SLITRK1 HGNC:20297 114798
785 | SOHLH1 HGNC:27845 402381
786 | SOX1 HGNC:11189 6656
787 | SOX14 HGNC:11193 8403
788 | SP8 HGNC:19196 221833
789 | SPACA1 HGNC:14967 81833
790 | SPACA5 HGNC:31353 389852
791 | SPACA7 HGNC:29575 122258
792 | SPATA16 HGNC:29935 83893
793 | SPATA21 HGNC:28026 374955
794 | SPEM1 HGNC:32429 374768
795 | SPHAR HGNC:16957 10638
796 | SPINK14 HGNC:33825 408187
797 | SPO11 HGNC:11250 23626
798 | SPPL2C HGNC:28902 162540
799 | SPRR4 HGNC:23173 163778
800 | SSTR4 HGNC:11333 6754
801 | SSX3 HGNC:11337 10214
802 | SSX5 HGNC:11339 6758
803 | SSX7 HGNC:19653 280658
804 | SSX8 HGNC:19654 280659
805 | SSX9 HGNC:19655 280660
806 | STATH HGNC:11369 6779
807 | SULT6B1 HGNC:33433 391365
808 | SUN5 HGNC:16252 140732
809 | T HGNC:11515 6862
810 | TAAR1 HGNC:17734 134864
811 | TAAR2 HGNC:4514 9287
812 | TAAR5 HGNC:30236 9038
813 | TAAR6 HGNC:20978 319100
814 | TAAR8 HGNC:14964 83551
815 | TAAR9 HGNC:20977 134860
816 | TAS1R2 HGNC:14905 80834
817 | TAS2R1 HGNC:14909 50834
818 | TAS2R13 HGNC:14919 50838
819 | TAS2R16 HGNC:14921 50833
820 | TAS2R39 HGNC:18886 259285
821 | TAS2R40 HGNC:18885 259286
822 | TAS2R41 HGNC:18883 259287
823 | TAS2R42 HGNC:18888 353164
824 | TAS2R43 HGNC:18875 259289
825 | TAS2R46 HGNC:18877 259292
826 | TAS2R50 HGNC:18882 259296
827 | TAS2R60 HGNC:20639 338398
828 | TAS2R7 HGNC:14913 50837
829 | TAS2R8 HGNC:14915 50836
830 | TAS2R9 HGNC:14917 50835
831 | TBC1D21 HGNC:28536 161514
832 | TBC1D29 HGNC:24509 26083
833 | TBL1Y HGNC:18502 90665
834 | TBPL2 HGNC:19841 387332
835 | TBR1 HGNC:11590 10716
836 | TBX10 HGNC:11593 347853
837 | TCEB3B HGNC:30771 51224
838 | TCEB3C HGNC:24617 162699
839 | TCHHL1 HGNC:31796 126637
840 | TCP10L2 HGNC:21254 401285
841 | TEDDM1 HGNC:30233 127670
842 | TEX101 HGNC:30722 83639
843 | TEX13A HGNC:11735 56157
844 | TEX28 HGNC:2563 1527
845 | TEX34 HGNC:26349 124783
846 | TFAP2D HGNC:15581 83741
847 | TFDP3 HGNC:24603 51270
848 | TGIF2LX HGNC:18570 90316
849 | TGIF2LY HGNC:18569 90655
850 | TGM6 HGNC:16255 343641
851 | TKTL2 HGNC:25313 84076
852 | TLX1 HGNC:5056 3195
853 | TMEM132D HGNC:29411 121256
854 | TMEM174 HGNC:28187 134288
855 | TMEM207 HGNC:33705 131920
856 | TMEM225 HGNC:32390 338661
857 | TMIGD1 HGNC:32431 388364
858 | TMPRSS11A HGNC:27954 339967
859 | TMPRSS11B HGNC:25398 132724
860 | TMPRSS11F HGNC:29994 389208
861 | TMPRSS12 HGNC:28779 283471
862 | TMPRSS15 HGNC:9490 5651
863 | TNR HGNC:11953 7143
864 | TPD52L3 HGNC:23382 89882
865 | TPH2 HGNC:20692 121278
866 | TPRX1 HGNC:32174 284355
867 | TPTE HGNC:12023 7179
868 | TREML4 HGNC:30807 285852
869 | TRHR HGNC:12299 7201
870 | TRIM40 HGNC:18736 135644
871 | TRIM42 HGNC:19014 287015
872 | TRIM43 HGNC:19015 129868
873 | TRIM48 HGNC:19021 79097
874 | TRIM49 HGNC:13431 57093
875 | TRIM51 HGNC:19023 84767
876 | TRIM60 HGNC:21162 166655
877 | TRIM67 HGNC:31859 440730
878 | TRIML1 HGNC:26698 339976
879 | TRPC5 HGNC:12337 7224
880 | TRPC7 HGNC:20754 57113
881 | TRPM1 HGNC:7146 4308
882 | TRPV5 HGNC:3145 56302
883 | TSGA13 HGNC:12369 114960
884 | TSHB HGNC:12372 7252
885 | TSPAN16 HGNC:30725 26526
886 | TSPO2 HGNC:21256 222642
887 | TSPY1 HGNC:12381 7258
888 | TSPYL6 HGNC:14521 388951
889 | TSSK1B HGNC:14968 83942
890 | TSSK2 HGNC:11401 23617
891 | TXNDC8 HGNC:31454 255220
892 | TYR HGNC:12442 7299
893 | UBQLN3 HGNC:12510 50613
894 | UMOD HGNC:12559 7369
895 | UROC1 HGNC:26444 131669
896 | USP17L2 HGNC:34434 377630
897 | USP26 HGNC:13485 83844
898 | USP29 HGNC:18563 57663
899 | UTS2R HGNC:4468 2837
900 | VAX1 HGNC:12660 11023
901 | VCX3A HGNC:18159 51481
902 | VHLL HGNC:30666 391104
903 | VN1R2 HGNC:19872 317701
904 | VN1R4 HGNC:19871 317703
905 | VN1R5 HGNC:19870 317705
906 | VPREB1 HGNC:12709 7441
907 | VRTN HGNC:20223 55237
908 | VSX2 HGNC:1975 338917
909 | WFDC10A HGNC:16139 140832
910 | WFDC11 HGNC:20478 259239
911 | WFDC9 HGNC:20380 259240
912 | XAGE2 HGNC:4112 9502
913 | XAGE5 HGNC:30930 170627
914 | XKR7 HGNC:23062 343702
915 | ZAN HGNC:12857 7455
916 | ZCCHC13 HGNC:31749 389874
917 | ZCCHC16 HGNC:25214 340595
918 | ZG16 HGNC:30961 653808
919 | ZIC3 HGNC:12874 7547
920 | ZIM3 HGNC:16366 114026
921 | ZNF645 HGNC:26371 158506
922 | ZNF648 HGNC:18190 127665
923 | ZNF679 HGNC:28650 168417
924 | ZNF804B HGNC:21958 219578
925 | ZNRF4 HGNC:17726 148066
926 | ZP2 HGNC:13188 7783
927 | ZP4 HGNC:15770 57829
928 | ZSWIM2 HGNC:30990 151112
929 |
--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | max_width = 100
2 | use_small_heuristics = "max"
--------------------------------------------------------------------------------
/src/commands/command.rs:
--------------------------------------------------------------------------------
1 | use anyhow::Result;
2 | use enum_dispatch::enum_dispatch;
3 | /// Interface implemented by the tool's commands (see `impl Command for Count`).
4 | #[enum_dispatch]
5 | pub trait Command {
6 | fn execute(&self) -> Result<()>; // runs the command; failure is reported through the `Result`
7 | }
8 |
--------------------------------------------------------------------------------
/src/commands/count.rs:
--------------------------------------------------------------------------------
1 | use crate::command::Command;
2 | use crate::guide::*;
3 | use ahash::AHashMap;
4 | use anyhow::{Context, Result};
5 | use clap::Parser;
6 | use fastq::{parse_path, Record};
7 | use fgoxide::io::{DelimFile, Io};
8 | use itertools::Itertools;
9 | use log::*;
10 | use serde::{Deserialize, Serialize};
11 | use std::collections::{HashMap, HashSet};
12 | use std::io::Write;
13 | use std::path::{Path, PathBuf};
14 | /// Lookup from a guide-length run of bases to the guide that sequence identifies.
15 | type GuideMap<'a> = AHashMap<Vec<u8>, &'a Guide>;
16 |
17 | /// Counts the guides observed in a CRISPR screen, starting from one or more FASTQs. FASTQs are
18 | /// one per sample and currently only single-end FASTQ inputs are supported.
19 | ///
20 | /// A set of sample IDs may be provided using `--samples id1 id2 ..`. If provided it must have the
21 | /// same number of values as input FASTQs. If not provided the FASTQ names are used minus any
22 | /// fastq/fq/gz suffixes.
23 | ///
24 | /// Automatically determines the range of valid offsets within the sequencing reads where the
25 | /// guide sequences are located, independently for each FASTQ input. The first `offset-sample-size`
26 | /// reads from each FASTQ are examined to determine the offsets at which guides are found. When
27 | /// processing the full FASTQ, checks only those offsets that accounted for at least
28 | /// `offset-min-fraction` of the first `offset-sample-size` reads.
29 | ///
30 | /// Matching by default allows for one mismatch (and no indels) between the read sub-sequence
31 | /// and the expected guide sequences. Exact matching may be enabled by specifying the
32 | /// `--exact-match` option.
33 | ///
34 | /// Optionally lists may be provided of essential genes, nonessential genes and control guide ids,
35 | /// as well as a regular expression to be used to identify control guides. Using this information
36 | /// guides are classified as either Essential, Nonessential, Control, or Other.
37 | ///
38 | /// Three output files are generated. The first is named `{output}.counts.txt` and contains columns
39 | /// for the guide id, the gene targeted by the guide and one count column per input FASTQ with
40 | /// raw/un-normalized counts. The second, `{output}.extended-counts.txt` is identical to the first
41 | /// except for having a `guide_type` column inserted as the third column. Finally
42 | /// `{output}.stats.txt` contains basic QC statistics per input FASTQ on the matching process.
43 | #[derive(Parser, Debug)]
44 | pub(crate) struct Count {
45 |     /// Input fastq file(s)
46 |     #[clap(long, short = 'i', required = true, multiple_values = true)]
47 |     input: Vec<PathBuf>,
48 |
49 |     /// Sample names corresponding to the input fastqs. If provided must be the same length as
50 |     /// input. Otherwise will be inferred from input file names.
51 |     #[clap(long, short = 's', multiple_values = true)]
52 |     samples: Vec<String>,
53 |
54 |     /// Path to the guide library metadata. May be a tab- or comma-separated file. Must have
55 |     /// a header line, and the first three fields must be (in order): i) the ID of the guide,
56 |     /// ii) the base sequence of the guide, iii) the gene the guide targets.
57 |     #[clap(long, short = 'l')]
58 |     library: PathBuf,
59 |
60 |     /// Optional path to file with list of essential genes. Gene names should appear one
61 |     /// per line and are case sensitive. If the file has multiple tab-separated columns, the first
62 |     /// column is used.
63 |     #[clap(long, short = 'e')]
64 |     essential_genes: Option<PathBuf>,
65 |
66 |     /// Optional path to file with list of nonessential genes. Gene names should appear one
67 |     /// per line and are case sensitive. If the file has multiple tab-separated columns, the first
68 |     /// column is used.
69 |     #[clap(long, short = 'n')]
70 |     nonessential_genes: Option<PathBuf>,
71 |
72 |     /// Optional path to file with list of control guide IDs. IDs should appear one
73 |     /// per line and are case sensitive. If the file has multiple tab-separated columns, the first
74 |     /// column is used.
75 |     #[clap(long, short = 'c')]
76 |     control_guides: Option<PathBuf>,
77 |
78 |     /// Optional regular expression used to ID control guides. Pattern is matched, case
79 |     /// insensitive, to guide IDs and Gene names.
80 |     #[clap(long, short = 'C')]
81 |     control_pattern: Option<String>,
82 |
83 |     /// Perform exact matching only, don't allow mismatches between reads and guides.
84 |     #[clap(long, short = 'x')]
85 |     exact_match: bool,
86 |
87 |     /// The number of reads to be examined when determining the offsets at which guides may
88 |     /// be found in the input reads.
89 |     #[clap(long, short = 'N', default_value = "100000")]
90 |     offset_sample_size: u64,
91 |
92 |     /// Minimum fraction of guide-matching sampled reads an offset must account for to be used.
93 |     #[clap(long, short = 'f', default_value = "0.0025")]
94 |     offset_min_fraction: f64,
95 |
96 |     /// Path prefix to use for all output files
97 |     #[clap(long, short = 'o')]
98 |     output: String,
99 | }
100 |
101 | /// Simple Command impl that just receives params and delegates off to other functions
102 | impl Command for Count {
103 |     /// Entry point called from the command line parser. Resolves sample ids, loads the guide
104 |     /// library, counts each input FASTQ and writes the outputs; any failure is returned as `Err`.
105 |     fn execute(&self) -> Result<()> {
106 |         // Auto-fill the sample names if not given
107 |         let sample_ids = if self.samples.is_empty() {
108 |             self.input
109 |                 .iter()
110 |                 .enumerate()
111 |                 .map(|(idx, fq)| Count::sample_name(fq, idx + 1))
112 |                 .collect_vec()
113 |         } else {
114 |             anyhow::ensure!(
115 |                 self.samples.len() == self.input.len(),
116 |                 "Different numbers of --samples and --input."
117 |             );
118 |             self.samples.clone()
119 |         };
120 |
121 |         // Load up the library and guide lookup
122 |         let library = GuideLibrary::from_files(
123 |             &self.library,
124 |             &self.essential_genes,
125 |             &self.nonessential_genes,
126 |             &self.control_guides,
127 |             &self.control_pattern,
128 |         )?;
129 |         let lookup = Count::build_lookup(&library, !self.exact_match);
130 |
131 |         // Generate the counts per sample; the first failing FASTQ aborts the run with context
132 |         let results = self
133 |             .input
134 |             .iter()
135 |             .zip(sample_ids)
136 |             .map(|(fq, sample)| {
137 |                 let prefix_info = Count::determine_prefixes(
138 |                     fq,
139 |                     sample.as_str(),
140 |                     &library,
141 |                     &lookup,
142 |                     self.offset_sample_size,
143 |                     self.offset_min_fraction,
144 |                 )
145 |                 .context("Failed to determine offsets.")?;
146 |
147 |                 Count::count_reads(fq, sample.as_str(), &library, &lookup, &prefix_info)
148 |                     .context("Failed to count guide.")
149 |             })
150 |             .collect::<Result<Vec<_>>>()?;
151 |
152 |         // Write the outputs
153 |         let counts_file = PathBuf::from(format!("{}.counts.txt", self.output));
154 |         let ext_counts_file = PathBuf::from(format!("{}.extended-counts.txt", self.output));
155 |         let stats_file = PathBuf::from(format!("{}.stats.txt", self.output));
156 |
157 |         Count::write_counts(&counts_file, &library, &results, false)?;
158 |         Count::write_counts(&ext_counts_file, &library, &results, true)?;
159 |         Count::write_stats(&stats_file, &library, &results)?;
160 |         Ok(())
161 |     }
162 | }
163 |
164 | /// Implementation of the Count command and related functions.
165 | impl Count {
166 | /// Derives a sample name from the final path component of a fastq path by stripping
167 | /// trailing ".gz", then ".fastq", then ".fq" (each removed repeatedly if it recurs).
168 | /// If the path has no file name, or it is not valid UTF-8, returns "s" followed by `idx`.
169 | fn sample_name(p: &Path, idx: usize) -> String {
170 | if let Some(os_name) = p.file_name() {
171 | if let Some(name) = os_name.to_str() {
172 | return name
173 | .trim_end_matches(".gz")
174 | .trim_end_matches(".fastq")
175 | .trim_end_matches(".fq")
176 | .to_string();
177 | }
178 | }
179 | // Fallback: no usable file name, so build a name from the supplied index
180 | format!("s{}", idx)
181 | }
182 |
183 |     /// Builds a lookup from a Vec of bases to Guides. The resulting HashMap will contain
184 |     /// keys for every exact guide sequence (in upper case). If `allow_mismatch` is true,
185 |     /// the map will also contain keys for every one-mismatch version of every guide with the
186 |     /// exception of any sequences that match equally well to multiple guides.
187 |     fn build_lookup(library: &GuideLibrary, allow_mismatch: bool) -> GuideMap {
188 |         info!("Building lookup.");
189 |         let mut lookup = GuideMap::default();
190 |         let mut dupes = HashSet::new();
191 |
192 |         // One key per guide, plus three substitutions per position when mismatches are allowed
193 |         let keys_per_guide = if allow_mismatch { 1 + library.guide_length * 3 } else { 1 };
194 |         lookup.reserve(library.len() * keys_per_guide);
195 |
196 |         if allow_mismatch {
197 |             for guide in library.guides.iter() {
198 |                 let bases = &guide.bases;
199 |
200 |                 for i in 0..bases.len() {
201 |                     for b in [b'A', b'C', b'G', b'T'] {
202 |                         if bases[i] != b {
203 |                             let mut modded = bases.clone();
204 |                             modded[i] = b;
205 |
206 |                             let prev = lookup.insert(modded, guide);
207 |                             if prev.is_some() {
208 |                                 let mut dupe = bases.clone();
209 |                                 dupe[i] = b;
210 |                                 dupes.insert(dupe);
211 |                             }
212 |                         }
213 |                     }
214 |                 }
215 |             }
216 |
217 |             // Make sure no duplicated sequences remain in the lookup
218 |             for dupe in dupes.into_iter() {
219 |                 lookup.remove(&dupe);
220 |             }
221 |         }
222 |
223 |         // Insert all the exact matches last so they're always present
224 |         for guide in library.guides.iter() {
225 |             lookup.insert(guide.bases.clone(), guide);
226 |         }
227 |
228 |         info!("Lookup built with {} entries.", lookup.len());
229 |         lookup
230 |     }
231 |
232 |     /// Goes through an input fastq file and determines the set of prefix-lengths that
233 |     /// occur before the guide sequence is observed. Samples the first `sample_size` reads
234 |     /// from the FASTQ and checks all possible prefixes, however long the reads are. Returns
235 |     /// the set of prefixes where each prefix individually accounts for >= `min_fraction` of
236 |     /// the reads that matched to a guide.
237 |     fn determine_prefixes(
238 |         fastq: &Path,
239 |         sample: &str,
240 |         library: &GuideLibrary,
241 |         lookup: &GuideMap,
242 |         sample_size: u64,
243 |         min_fraction: f64,
244 |     ) -> Result<PrefixInfo> {
245 |         let guide_length = library.guide_length;
246 |         let mut prefix_lengths = vec![0u64; 500]; // grown on demand for offsets >= 500
247 |         let mut count = 0u64;
248 |
249 |         // Parse the first `sample_size` records and tally every offset at which a
250 |         // guide-length window of the read is present in the lookup
251 |         parse_path(Some(fastq), |parser| {
252 |             parser
253 |                 .each(|rec| {
254 |                     let read_bases = rec.seq();
255 |                     let read_length = read_bases.len();
256 |                     if read_length >= guide_length {
257 |                         for trim in 0..=(read_length - guide_length) {
258 |                             let bases = &read_bases[trim..trim + guide_length];
259 |                             if lookup.contains_key(bases) {
260 |                                 let needed = prefix_lengths.len().max(trim + 1);
261 |                                 prefix_lengths.resize(needed, 0);
262 |                                 prefix_lengths[trim] += 1;
263 |                             }
264 |                         }
265 |                     }
266 |
267 |                     count += 1;
268 |                     count < sample_size
269 |                 })
270 |                 .expect("Failed to parse.");
271 |         })
272 |         .context(format!("Failed to read {:?}", fastq))?;
273 |
274 |         let total_matched: u64 = prefix_lengths.iter().sum();
275 |         let fraction_matched = total_matched as f64 / count.max(1) as f64;
276 |         info!(
277 |             "In {:?} examined {} reads for guide start position and matched {} ({:.4}).",
278 |             fastq, count, total_matched, fraction_matched
279 |         );
280 |
281 |         // Tuple of offset -> count where count is > 0
282 |         let non_zeros =
283 |             prefix_lengths.iter().copied().enumerate().filter(|(_idx, n)| *n > 0).collect_vec();
284 |
285 |         info!(
286 |             "{} read offsets: {}",
287 |             sample,
288 |             non_zeros.iter().map(|(o, n)| format!("{}->{}", o, n)).join(", ")
289 |         );
290 |
291 |         // Filter to just those trim lengths that have at least min_fraction of the data each
292 |         let trims_to_return: Vec<usize> = non_zeros
293 |             .into_iter()
294 |             .filter(|(_idx, n)| *n as f64 / total_matched as f64 >= min_fraction)
295 |             .map(|(idx, _n)| idx)
296 |             .collect();
297 |
298 |         let info = PrefixInfo { lengths: trims_to_return };
299 |         Ok(info)
300 |     }
301 |
302 |     /// Generates a set of guide counts for a single input FASTQ given a guide lookup and a set of
303 |     /// read offsets/prefixes to check. Offsets are tried in order and a read is attributed to
304 |     /// the first guide found, so no read is counted more than once.
305 |     /// Returns a CountResult which contains a count of the total number of reads in the FASTQ
306 |     /// and a Map of Guide to count of that guide. The map will contain an entry for every guide
307 |     /// including those with zero counts.
308 |     fn count_reads<'a, P>(
309 |         fastq: &P,
310 |         sample: &str,
311 |         library: &'a GuideLibrary,
312 |         lookup: &GuideMap,
313 |         prefix_info: &PrefixInfo,
314 |     ) -> Result<CountResult<'a>>
315 |     where
316 |         P: AsRef<Path>,
317 |     {
318 |         let mut count: u64 = 0;
319 |         let mut counts: Vec<u64> = vec![0; library.len()];
320 |         let fastq_path = fastq.as_ref();
321 |
322 |         // TODO: remove empty file checking once this is moved over to seq_io instead of fastq
323 |         let fastq_size = std::fs::metadata(fastq).map(|m| m.len()).unwrap_or(0);
324 |         let empty_fastq = fastq_path.is_file() && fastq_path.exists() && fastq_size == 0;
325 |
326 |         if !empty_fastq {
327 |             parse_path(Some(fastq), |parser| {
328 |                 parser
329 |                     .each(|rec| {
330 |                         let read_bases = rec.seq();
331 |                         let read_length = read_bases.len();
332 |                         let guide_length = library.guide_length;
333 |                         for trim in prefix_info.lengths.iter() {
334 |                             if trim + guide_length <= read_length {
335 |                                 let bases = &read_bases[*trim..(*trim + guide_length)];
336 |                                 if let Some(guide) = lookup.get(bases) {
337 |                                     counts[guide.index] += 1;
338 |                                     break;
339 |                                 }
340 |                             }
341 |                         }
342 |
343 |                         count += 1;
344 |                         if count % 10_000_000 == 0 {
345 |                             info!("Processed {}m reads from {:?}.", count / 1_000_000, fastq_path);
346 |                         }
347 |
348 |                         true
349 |                     })
350 |                     .expect("Failed to parse.");
351 |             })?;
352 |         }
353 |
354 |         let count_map: HashMap<&Guide, u64> =
355 |             library.guides.iter().map(|g| (g, counts[g.index])).collect();
356 |
357 |         let count_result = CountResult {
358 |             source: fastq_path.to_str().unwrap_or("").to_string(),
359 |             sample: sample.to_string(),
360 |             counts: count_map,
361 |             total_reads: count,
362 |         };
363 |
364 |         info!(
365 |             "Processed {} reads and matched {} ({:.4}) from {:?}.",
366 |             count,
367 |             count_result.mapped_reads(),
368 |             count_result.mapped_frac(),
369 |             fastq_path
370 |         );
371 |
372 |         Ok(count_result)
373 |     }
374 |
375 | /// Writes the counts matrix for one or more CountResults to `path` as tab-separated text.
376 | /// The header is "guide", "gene", then one column per sample (named by its sample id); when
377 | /// `extended` is true a "guide_type" column is inserted after "gene". One row per library guide.
378 | fn write_counts(
379 | path: &Path,
380 | library: &GuideLibrary,
381 | counts: &[CountResult],
382 | extended: bool,
383 | ) -> Result<()> {
384 | let mut writer = Io::default().new_writer(&path)?;
385 | let sep = "\t".as_bytes();
386 | let newline = "\n".as_bytes();
387 |
388 | // Output the header: fixed columns first, then the sample ids in input order
389 | let mut header_fields = vec!["guide", "gene"];
390 | if extended {
391 | header_fields.extend_from_slice(&["guide_type"]);
392 | }
393 |
394 | header_fields.extend(counts.iter().map(|c| c.sample.as_str()));
395 | writer.write_all(header_fields.join("\t").as_bytes())?;
396 | writer.write_all(newline)?;
397 |
398 | // Output the body: rows follow the order of `library.guides`
399 | for guide in library.guides.iter() {
400 | writer.write_all(guide.id.as_bytes())?;
401 | writer.write_all(sep)?;
402 | writer.write_all(guide.gene.as_bytes())?;
403 |
404 | if extended {
405 | writer.write_all(sep)?;
406 | writer.write_all(guide.kind.to_string().as_bytes())?;
407 | }
408 | // One count per sample; a guide missing from a sample's map is written as 0
409 | for sample in counts {
410 | let n = sample.counts.get(&guide).copied().unwrap_or(0);
411 | writer.write_all(sep)?;
412 | writer.write_all(n.to_string().as_bytes())?;
413 | }
414 |
415 | writer.write_all(newline)?;
416 | }
417 | // Flush explicitly so that a write error is returned to the caller
418 | writer.flush()?;
419 | Ok(())
420 | }
421 |
422 | /// Builds one CountStats record per CountResult and writes them to `stats_file` as TSV
423 | fn write_stats(
424 | stats_file: &Path,
425 | library: &GuideLibrary,
426 | results: &[CountResult],
427 | ) -> Result<()> {
428 | let recs = results
429 | .iter()
430 | .map(|r| CountStats {
431 | file: r.source.clone(),
432 | label: r.sample.to_string(),
433 | total_guides: library.len() as u64,
434 | total_reads: r.total_reads,
435 | mapped_reads: r.mapped_reads(),
436 | frac_mapped: Count::round(r.mapped_frac(), 4),
437 | mean_reads_per_guide: Count::round(
438 | r.mapped_reads() as f64 / library.len() as f64,
439 | 2,
440 | ),
441 | zero_read_guides: r.counts.values().filter(|n| **n == 0).count() as u64,
442 | mean_reads_essential: Count::compute_mean_cov(r, GuideType::Essential),
443 | mean_reads_nonessential: Count::compute_mean_cov(r, GuideType::Nonessential),
444 | mean_reads_other: Count::compute_mean_cov(r, GuideType::Other),
445 | mean_reads_control: Count::compute_mean_cov(r, GuideType::Control),
446 | })
447 | .collect_vec();
448 | // NOTE(review): column order presumably follows CountStats field order, not the order above
449 | DelimFile::default().write_tsv(&stats_file, recs)?;
450 | Ok(())
451 | }
452 |
453 | /// Rounds `f` to `dp` decimal places: scale by 10^dp, round to nearest integer, scale back
454 | fn round(f: f64, dp: i32) -> f64 {
455 | let factor = 10f64.powi(dp);
456 | (f * factor).round() / factor
457 | }
458 |
459 | /// Computes the mean coverage of a subset of guides based on guide type. If no guides of the
460 | /// type exist, returns 0.
461 | fn compute_mean_cov(counts: &CountResult, kind: GuideType) -> f64 {
462 | let subset = counts
463 | .counts
464 | .iter()
465 | .filter_map(|(g, n)| if g.kind == kind { Some(*n as f64) } else { None })
466 | .collect_vec();
467 |
468 | if subset.is_empty() {
469 | 0.0
470 | } else {
471 | let total: f64 = subset.iter().sum::();
472 | Count::round(total / subset.len() as f64, 2)
473 | }
474 | }
475 | }
476 |
/// Struct to hold information about the sequence that precedes the guides within reads.
/// Only the prefix lengths are stored, i.e. the offsets into a read at which a guide is
/// expected to start.
struct PrefixInfo {
    /// The prefix lengths (read offsets) at which guides should be looked for
    pub lengths: Vec<usize>,
}
481 |
482 | /// Struct to hold the results of counting a single fastq
483 | struct CountResult<'a> {
484 | source: String,
485 | sample: String,
486 | total_reads: u64,
487 | counts: HashMap<&'a Guide, u64>,
488 | }
489 |
490 | /// Struct to output stats on each sample mapped
491 | #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
492 | struct CountStats {
493 | file: String,
494 | label: String,
495 | total_guides: u64,
496 | total_reads: u64,
497 | mapped_reads: u64,
498 | frac_mapped: f64,
499 | mean_reads_per_guide: f64,
500 | mean_reads_essential: f64,
501 | mean_reads_nonessential: f64,
502 | mean_reads_control: f64,
503 | mean_reads_other: f64,
504 | zero_read_guides: u64,
505 | }
506 |
507 | impl CountResult<'_> {
508 | /// Returns the total number of reads that mapped to a guide.
509 | pub fn mapped_reads(&self) -> u64 {
510 | self.counts.values().sum()
511 | }
512 |
513 | /// Returns the fraction of reads that mapped to a guide.
514 | pub fn mapped_frac(&self) -> f64 {
515 | self.mapped_reads() as f64 / self.total_reads as f64
516 | }
517 | }
518 |
519 | #[cfg(test)]
520 | mod tests {
521 | use super::*;
522 | use fgoxide::io::{DelimFile, Io};
523 | use tempfile::TempDir;
524 |
#[test]
fn test_sample_name() {
    // Each of the FASTQ extensions is stripped to leave the bare file name
    let fastq_paths =
        ["/foo/splat.fq", "/foo/splat.fq.gz", "/foo/splat.fastq", "/foo/splat.fastq.gz"];
    for fastq in fastq_paths {
        assert_eq!(Count::sample_name(PathBuf::from(fastq).as_path(), 1), "splat");
    }

    // An empty path yields a name built from the index that was passed in
    let empty = PathBuf::new();
    assert_eq!(Count::sample_name(empty.as_path(), 1), "s1");
}
533 |
#[test]
fn test_build_lookup() {
    let poly_a = Guide::new(0, "g0", "AAAAAAAAAA", "gene-A", GuideType::Other);
    let poly_g = Guide::new(1, "g1", "GGGGGGGGGG", "gene-G", GuideType::Other);
    let near_g = Guide::new(2, "g2", "AGGGGGGGGG", "gene-AG", GuideType::Other);

    // A single guide, exact matches only
    let single = GuideLibrary::new(vec![poly_a.clone()]).unwrap();
    let exact = Count::build_lookup(&single, false);
    assert_eq!(exact.len(), 1);
    assert_eq!(exact[&poly_a.bases], &poly_a);

    // A single guide, one-mismatch sequences included
    let fuzzy = Count::build_lookup(&single, true);
    assert_eq!(fuzzy.len(), 31); // original plus three mismatches x ten positions
    assert_eq!(fuzzy[&poly_a.bases], &poly_a);
    assert_eq!(fuzzy["AAAACAAAAA".as_bytes()], &poly_a);

    // A pair of guides, exact matches only
    let distant = GuideLibrary::new(vec![poly_a.clone(), poly_g.clone()]).unwrap();
    let exact = Count::build_lookup(&distant, false);
    assert_eq!(exact.len(), 2);
    assert_eq!(exact[&poly_a.bases], &poly_a);
    assert_eq!(exact[&poly_g.bases], &poly_g);

    // A pair of guides whose mismatch neighbourhoods do not overlap
    let fuzzy = Count::build_lookup(&distant, true);
    assert_eq!(fuzzy.len(), 62);

    // A pair of guides one base apart, so their mismatch neighbourhoods collide
    let adjacent = GuideLibrary::new(vec![poly_g.clone(), near_g.clone()]).unwrap();
    let fuzzy = Count::build_lookup(&adjacent, true);
    assert_eq!(fuzzy.len(), 56);
    assert_eq!(fuzzy[&poly_g.bases], &poly_g); // collision shouldn't override perfect match
    assert_eq!(fuzzy[&near_g.bases], &near_g); // collision shouldn't override perfect match
    for ambiguous in ["CGGGGGGGGG", "TGGGGGGGGG"] {
        assert!(!fuzzy.contains_key(ambiguous.as_bytes()));
    }
}
572 |
573 | /// Helper function to generate guide library determine_prefixes and counting tests
574 | fn test_library() -> GuideLibrary {
575 | GuideLibrary::new(vec![
576 | Guide::new(0, "g1", "ACGTACGT", "AAA", GuideType::Other),
577 | Guide::new(1, "g1", "AACCGGTT", "AAA", GuideType::Other),
578 | Guide::new(2, "g1", "AAACAGAT", "AAA", GuideType::Other),
579 | ])
580 | .unwrap()
581 | }
582 |
583 | /// Helper function to write a bunch of reads to a FASTQ file
584 | fn write_fastq(reads: &[String], path: PathBuf) -> PathBuf {
585 | let mut file = Io::default().new_writer(&path).unwrap();
586 |
587 | for (idx, read) in reads.iter().enumerate() {
588 | let quals = vec![b'#'; read.len()];
589 |
590 | file.write_all(format!("@q{}\n", idx).as_bytes()).unwrap();
591 | file.write_all(format!("{}\n", read).as_bytes()).unwrap();
592 | file.write_all("+\n".as_bytes()).unwrap();
593 | file.write_all(quals.as_slice()).unwrap();
594 | file.write_all("\n".as_bytes()).unwrap();
595 | }
596 |
597 | file.flush().unwrap();
598 | path
599 | }
600 |
#[test]
fn test_determine_prefixes_finds_offset_zero() {
    let library = test_library();
    let lookup = Count::build_lookup(&library, false);

    // One read per guide (the test library has three), with the guide at the very start
    let reads: Vec<String> = library
        .guides
        .iter()
        .take(3)
        .map(|guide| format!("{}tttttttttt", guide.bases_str))
        .collect();

    let tempdir = TempDir::new().unwrap();
    let fastq = write_fastq(&reads, tempdir.path().join("in.fastq"));
    let found = Count::determine_prefixes(&fastq, "s", &library, &lookup, 100, 0.0).unwrap();
    assert_eq!(found.lengths, vec![0]);
}
615 |
#[test]
fn test_determine_prefixes_finds_last_offset() {
    let library = test_library();
    let lookup = Count::build_lookup(&library, false);

    // One read per guide (the test library has three), with the guide at the very end
    let reads: Vec<String> = library
        .guides
        .iter()
        .take(3)
        .map(|guide| format!("tttttttttt{}", guide.bases_str))
        .collect();

    let tempdir = TempDir::new().unwrap();
    let fastq = write_fastq(&reads, tempdir.path().join("in.fastq"));
    let found = Count::determine_prefixes(&fastq, "s", &library, &lookup, 100, 0.0).unwrap();
    assert_eq!(found.lengths, vec![10]);
}
630 |
#[test]
fn test_determine_prefixes_actually_subsamples() {
    let library = test_library();
    let lookup = Count::build_lookup(&library, false);
    let guide = &library.guides[0].bases_str;

    // 100 reads with the guide at offset 5 followed by 100 reads with it at offset 6
    let mut reads = vec![format!("ttttt{}ttttt", guide); 100];
    reads.extend(vec![format!("tttttt{}tttt", guide); 100]);

    let tempdir = TempDir::new().unwrap();
    let fastq = write_fastq(&reads, tempdir.path().join("in.fastq"));

    // Sampling all 200 reads sees both offsets; sampling 100 sees only the first block
    let all = Count::determine_prefixes(&fastq, "s", &library, &lookup, 200, 0.01).unwrap();
    let half = Count::determine_prefixes(&fastq, "s", &library, &lookup, 100, 0.01).unwrap();
    assert_eq!(all.lengths, vec![5, 6]);
    assert_eq!(half.lengths, vec![5]);
}
651 |
#[test]
fn test_determine_prefixes_min_fraction() {
    let library = test_library();
    let lookup = Count::build_lookup(&library, false);
    let guide = &library.guides[0].bases_str;

    // 200 reads in total: 50, 30 and 20 carry the guide at offsets 5, 6 and 7
    // respectively, and the final 100 don't match any guide.
    let mut reads = vec![format!("ttttt{}ttttt", guide); 50];
    reads.extend(vec![format!("tttttt{}tttt", guide); 30]);
    reads.extend(vec![format!("ttttttt{}ttt", guide); 20]);
    reads.extend(vec!["tttttttttttttttttttt".to_string(); 100]);

    let tempdir = TempDir::new().unwrap();
    let fastq = write_fastq(&reads, tempdir.path().join("in.fastq"));

    // Raising the minimum fraction progressively drops the rarer offsets
    let lenient = Count::determine_prefixes(&fastq, "s", &library, &lookup, 200, 0.01).unwrap();
    let medium = Count::determine_prefixes(&fastq, "s", &library, &lookup, 200, 0.25).unwrap();
    let strict = Count::determine_prefixes(&fastq, "s", &library, &lookup, 200, 0.50).unwrap();

    assert_eq!(lenient.lengths, vec![5, 6, 7]);
    assert_eq!(medium.lengths, vec![5, 6]);
    assert_eq!(strict.lengths, vec![5]);
}
685 |
#[test]
fn test_count_reads_handles_empty_fastq() {
    let tempdir = TempDir::new().unwrap();
    let empty_fastq = write_fastq(&[], tempdir.path().join("in.fastq"));

    let library = test_library();
    let lookup = Count::build_lookup(&library, false);
    let prefixes = PrefixInfo { lengths: vec![0, 1, 2] };

    let result = Count::count_reads(&empty_fastq, "s1", &library, &lookup, &prefixes).unwrap();
    assert_eq!(result.sample, "s1");
    assert_eq!(result.total_reads, 0);

    // Every guide must still be present in the result, with a count of zero
    for guide in &library.guides {
        assert_eq!(result.counts[guide], 0);
    }
}
701 |
#[test]
fn test_count_reads() {
    let library = test_library();
    let lookup = Count::build_lookup(&library, false);
    let prefixes = PrefixInfo { lengths: vec![4, 5, 6] };
    let tempdir = TempDir::new().unwrap();
    let mut reads = vec![];

    // For each of the three prefixes, and 100 times over, give every guide
    // (index + 1) reads; i.e. 100 * 3 * 1_based_guide_index reads per guide.
    for prefix in ["tttt", "ttttt", "tttttt"] {
        for _ in 0..100 {
            for guide in library.guides.iter() {
                let read = format!("{}{}ggggg", prefix, guide.bases_str);
                reads.extend(vec![read; guide.index + 1]);
            }
        }
    }

    // Reads with the first guide at offsets 0-3, none of which are prefixes being
    // counted; each is padded to a total of nine t's around the guide.
    let first = &library.guides[0].bases_str;
    for offset in 0..4 {
        reads.push(format!("{}{}{}", "t".repeat(offset), first, "t".repeat(9 - offset)));
    }

    let fastq = write_fastq(&reads, tempdir.path().join("in.fastq"));
    let result = Count::count_reads(&fastq, "s1", &library, &lookup, &prefixes).unwrap();
    assert_eq!(result.sample, "s1");
    assert_eq!(result.total_reads, reads.len() as u64);
    for guide in library.guides.iter() {
        let expected = 100 * 3 * (guide.index + 1) as u64;
        assert_eq!(result.counts[guide], expected);
    }
}
737 |
#[test]
fn test_end_to_end() {
    let tempdir = TempDir::new().unwrap();

    // Write the library to disk
    let library = test_library();
    let library_path = tempdir.path().join("library.txt");
    let mut lib_lines = vec!["guide\tbases\tgene".to_string()];
    for guide in library.guides.iter() {
        lib_lines.push(format!("{}\t{}\t{}", guide.id, guide.bases_str, guide.gene));
    }
    Io::default().write_lines(&library_path, &lib_lines).unwrap();

    // Generate a fastq of reads to count
    let mut reads = vec![];

    // Create a bunch of reads, with each guide getting:
    //   100 * 3 * 1_based_guide_index
    for prefix in ["tttt", "ttttt", "tttttt"] {
        for _ in 0..100 {
            for guide in library.guides.iter() {
                for _ in 0..(guide.index + 1) {
                    reads.push(format!("{}{}ggggg", prefix, guide.bases_str));
                }
            }
        }
    }

    // Add a few reads at different offsets that won't count
    reads.push(format!("{}ttttttttt", library.guides[0].bases_str));
    reads.push(format!("t{}tttttttt", library.guides[0].bases_str));
    reads.push(format!("tt{}ttttttt", library.guides[0].bases_str));
    reads.push(format!("ttt{}tttttt", library.guides[0].bases_str));

    let fastq = write_fastq(&reads, tempdir.path().join("in.fastq"));

    // Run the count command
    let prefix = tempdir.path().join("out").to_str().unwrap().to_string();
    let counts = tempdir.path().join("out.counts.txt");
    let stats = tempdir.path().join("out.stats.txt");

    let cmd = Count {
        library: library_path,
        input: vec![fastq],
        essential_genes: None,
        nonessential_genes: None,
        control_guides: None,
        control_pattern: None,
        samples: vec!["sample1".to_string()],
        output: prefix,
        exact_match: false,
        offset_min_fraction: 0.005,
        offset_sample_size: 100000,
    };

    cmd.execute().unwrap();

    assert!(counts.exists());
    assert!(stats.exists());

    // Read the stats back in; the element type has to be spelled out because
    // `read_tsv` is generic over the record type it deserializes.
    let stat_records: Vec<CountStats> = DelimFile::default().read_tsv(&stats).unwrap();

    // 3 prefixes x 100 rounds x (1 + 2 + 3) reads map; the 4 extra reads do not
    assert_eq!(stat_records.len(), 1);
    assert_eq!(stat_records[0].label, "sample1");
    assert_eq!(stat_records[0].total_guides, 3);
    assert_eq!(stat_records[0].total_reads, 1804);
    assert_eq!(stat_records[0].mapped_reads, 1800);
    assert!((stat_records[0].mean_reads_per_guide - 600.0).abs() <= 0.01);
    assert!((stat_records[0].frac_mapped - 1800f64 / 1804f64).abs() <= 0.01);
    assert_eq!(stat_records[0].zero_read_guides, 0);
}
810 |
#[test]
fn test_reads_shorter_than_guides_ok() {
    let tempdir = TempDir::new().unwrap();

    // Serialize the test library as a TSV with a header row
    let library = test_library();
    let library_path = tempdir.path().join("library.txt");
    let lib_lines: Vec<String> = std::iter::once("guide\tbases\tgene".to_string())
        .chain(library.guides.iter().map(|g| format!("{}\t{}\t{}", g.id, g.bases_str, g.gene)))
        .collect();
    Io::default().write_lines(&library_path, &lib_lines).unwrap();

    // Start with reads of every length from 1 to 10 bases
    let mut reads: Vec<String> =
        (1..=10).map(|len| "ACGTACGTAC"[..len].to_string()).collect();

    // Then 100 rounds of full-length reads with the guide at offset=5, where each guide
    // gets (index + 1) reads per round
    for _ in 0..100 {
        for guide in library.guides.iter() {
            let read = format!("ttttt{}ggggg", guide.bases_str);
            reads.extend(vec![read; guide.index + 1]);
        }
    }

    // Lastly some reads that are longer than a guide length but shorter than
    // guide-length + max prefix
    reads.extend(["tttttACGTACGT", "tttttACGTACGTA", "tttttACGTACGTAC"].map(String::from));

    let fastq = write_fastq(&reads, tempdir.path().join("in.fastq"));

    // Run the count command and check that both outputs get written
    let out_prefix = tempdir.path().join("out");
    let counts = tempdir.path().join("out.counts.txt");
    let stats = tempdir.path().join("out.stats.txt");

    let cmd = Count {
        library: library_path,
        input: vec![fastq],
        essential_genes: None,
        nonessential_genes: None,
        control_guides: None,
        control_pattern: None,
        samples: vec!["sample1".to_string()],
        output: out_prefix.to_str().unwrap().to_string(),
        exact_match: false,
        offset_min_fraction: 0.005,
        offset_sample_size: 100000,
    };

    cmd.execute().unwrap();
    assert!(counts.exists());
    assert!(stats.exists());
}
877 | }
878 |
--------------------------------------------------------------------------------
/src/commands/mod.rs:
--------------------------------------------------------------------------------
1 | pub mod command;
2 | pub mod count;
3 |
--------------------------------------------------------------------------------
/src/guide.rs:
--------------------------------------------------------------------------------
1 | use anyhow::{anyhow, bail, Result};
2 | use fgoxide::io::Io;
3 | use itertools::Itertools;
4 | use log::*;
5 | use regex::RegexBuilder;
6 | use std::collections::HashSet;
7 | use std::fmt::{Display, Formatter};
8 | use std::path::Path;
9 |
/// The category of target a guide is aimed at. The declaration order of the variants
/// defines the ordering produced by the derived `PartialOrd`/`Ord`.
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum GuideType {
    /// The guide targets an essential gene
    Essential,
    /// The guide targets a non-essential gene
    Nonessential,
    /// The guide is a control sequence
    Control,
    /// The guide targets any other gene
    Other,
}
19 |
20 | /// Implement Disaply for GuideType
21 | impl Display for GuideType {
22 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
23 | write!(f, "{:?}", self)
24 | }
25 | }
26 |
27 | /// A struct to represent a CRISPR guide
28 | #[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
29 | pub struct Guide {
30 | pub index: usize,
31 | pub id: String,
32 | pub kind: GuideType,
33 | pub bases: Vec,
34 | pub bases_str: String,
35 | pub gene: String,
36 | }
37 |
38 | impl Guide {
39 | /// Generates a new guide that stores the bases in upper-case and ensures that the
40 | /// bases and bases_str are in sync.
41 | pub fn new>(index: usize, id: S, bases: S, gene: S, kind: GuideType) -> Guide {
42 | let bases_upper = bases.into().to_uppercase();
43 | Guide {
44 | index,
45 | id: id.into(),
46 | kind,
47 | bases: bases_upper.as_bytes().to_vec(),
48 | bases_str: bases_upper,
49 | gene: gene.into(),
50 | }
51 | }
52 |
53 | /// Returns the length of the guide sequence
54 | fn len(&self) -> usize {
55 | self.bases.len()
56 | }
57 | }
58 |
59 | /// Struct representing a guide library used in a crispr screen
60 | pub struct GuideLibrary {
61 | pub guides: Vec,
62 | pub guide_length: usize,
63 | }
64 |
65 | impl GuideLibrary {
66 | /// Constructs a new guide library from a set of guides. Will return an error if:
67 | /// - the guides have a mix of lengths
68 | /// - the guide sequences are not all A/C/G/T
69 | /// - there are non-unique guide sequences
70 | pub fn new(guides: Vec) -> Result {
71 | let lengths: HashSet = guides.iter().map(|g| g.len()).collect();
72 | let unique: HashSet<&Vec> = guides.iter().map(|g| &g.bases).collect();
73 | let genes: HashSet<&str> = guides.iter().map(|g| g.gene.as_str()).collect();
74 | let bad = guides.iter().filter(|g| !GuideLibrary::is_acgt(&g.bases)).collect_vec();
75 |
76 | if guides.is_empty() {
77 | Ok(GuideLibrary { guides, guide_length: 0 })
78 | } else if lengths.len() != 1 {
79 | Err(anyhow!("More than one guide length found: {}.", lengths.iter().join(", ")))
80 | } else if !bad.is_empty() {
81 | Err(anyhow!("{} guides had non-ACGT bases in their sequence.", bad.len()))
82 | } else if unique.len() < guides.len() {
83 | Err(anyhow!(
84 | "Guide library had {} guides but only {} unique sequences.",
85 | guides.len(),
86 | unique.len()
87 | ))
88 | } else {
89 | info!(
90 | "Loaded library with {} guides for {} genes; {}=essential, {}=nonessential, {}=control, {}=other.",
91 | guides.len(),
92 | genes.len(),
93 | guides.iter().filter(|g| g.kind == GuideType::Essential).count(),
94 | guides.iter().filter(|g| g.kind == GuideType::Nonessential).count(),
95 | guides.iter().filter(|g| g.kind == GuideType::Control).count(),
96 | guides.iter().filter(|g| g.kind == GuideType::Other).count(),
97 | );
98 |
99 | Ok(GuideLibrary {
100 | guides,
101 | guide_length: *lengths.iter().next().expect("Where'd it go?"),
102 | })
103 | }
104 | }
105 |
106 | /// Reads a guide library from a file. The file:
107 | /// - May be either tab- or comma-delimited
108 | /// - Must have a header row
109 | /// - Must have at least three columns with the first three columns in order being:
110 | /// - a unique ID for the guide
111 | /// - the sequence of the guide
112 | /// - the gene, or other target, of the guide
113 | pub fn from_file(path: &P) -> Result
114 | where
115 | P: AsRef,
116 | {
117 | let no_pattern: Option<&str> = None;
118 | GuideLibrary::from_files(path, &None, &None, &None, &no_pattern)
119 | }
120 |
121 | pub fn from_files(
122 | lib_path: &P,
123 | essential_gene_path: &Option
,
124 | non_essential_gene_path: &Option
,
125 | control_guide_list_path: &Option
,
126 | control_pattern: &Option,
127 | ) -> Result
128 | where
129 | P: AsRef,
130 | S: AsRef,
131 | {
132 | let essentials = GuideLibrary::read_to_set(essential_gene_path)?;
133 | let non_essentials = GuideLibrary::read_to_set(non_essential_gene_path)?;
134 | let control_guides = GuideLibrary::read_to_set(control_guide_list_path)?;
135 | let control_regex = if let Some(p) = control_pattern {
136 | Some(RegexBuilder::new(p.as_ref()).case_insensitive(true).build()?)
137 | } else {
138 | None
139 | };
140 |
141 | let lines = Io::default().read_lines(lib_path)?;
142 |
143 | if lines.len() < 2 {
144 | GuideLibrary::new(vec![])
145 | } else {
146 | let delim: char = if lines[0].chars().filter(|ch| *ch == '\t').count() >= 2 {
147 | '\t'
148 | } else if lines[0].chars().filter(|ch| *ch == ',').count() >= 2 {
149 | ','
150 | } else {
151 | bail!("couldn't detect delimiter from first line of {:?}", lib_path.as_ref());
152 | };
153 |
154 | // Read in the guides
155 | let mut guides = Vec::with_capacity(1024);
156 | let mut idx: usize = 0;
157 | for line in lines.iter().skip(1) {
158 | let trimmed = line.trim();
159 |
160 | if !trimmed.is_empty() {
161 | let fields = trimmed.split(delim).collect_vec();
162 | if fields.len() < 3 {
163 | bail!("Too few fields in line: '{}'", line);
164 | }
165 |
166 | let guide_id = fields[0];
167 | let bases = fields[1];
168 | let gene = fields[2];
169 |
170 | let kind = if essentials.contains(gene) {
171 | GuideType::Essential
172 | } else if non_essentials.contains(gene) {
173 | GuideType::Nonessential
174 | } else if control_guides.contains(guide_id)
175 | || control_regex
176 | .as_ref()
177 | .filter(|re| re.is_match(guide_id) || re.is_match(gene))
178 | .is_some()
179 | {
180 | GuideType::Control
181 | } else {
182 | GuideType::Other
183 | };
184 |
185 | let guide = Guide::new(idx, guide_id, bases, gene, kind);
186 | guides.push(guide);
187 | idx += 1;
188 | }
189 | }
190 |
191 | GuideLibrary::new(guides)
192 | }
193 | }
194 |
195 | /// Reads all lines from a file, trims them, extracts the first tab-separated field and then
196 | /// returns the unique set of values as a set.
197 | fn read_to_set>(path: &Option) -> Result> {
198 | let items: HashSet = match path {
199 | None => HashSet::new(),
200 | Some(p) => Io::default()
201 | .read_lines(p)?
202 | .into_iter()
203 | .map(|line| line.trim().to_string())
204 | .filter(|line| !line.is_empty())
205 | .map(|line| line.split('\t').next().unwrap().to_string())
206 | .collect(),
207 | };
208 |
209 | Ok(items)
210 | }
211 |
/// Returns true if every byte of the sequence is an upper-case `A`, `C`, `G` or `T`;
/// an empty sequence is therefore accepted.
fn is_acgt(bases: &[u8]) -> bool {
    bases.iter().all(|&base| matches!(base, b'A' | b'C' | b'G' | b'T'))
}
216 |
217 | /// Returns the number of guides in the library
218 | pub fn len(&self) -> usize {
219 | self.guides.len()
220 | }
221 |
222 | /// True if there are no guides in the library, false otherwise
223 | pub fn is_empty(&self) -> bool {
224 | self.len() == 0
225 | }
226 | }
227 |
228 | #[cfg(test)]
229 | mod tests {
230 | use super::*;
231 | use std::collections::HashMap;
232 | use tempfile::TempDir;
233 |
/// A small comma-delimited guide library: a header row followed by three guides
const CSV_LIBRARY: &str = concat!(
    "id,bases,gene\n",
    "a1,CGATCGCTTAAGCTAGCA,FOO\n",
    "a2,ATGCTAGATCGCGCTATT,FOO\n",
    "a3,GGCTTCTAGATCGCTATA,Control\n",
);

/// The same library in tab-delimited form, with ids prefixed `b` rather than `a`
const TSV_LIBRARY: &str = concat!(
    "id\tbases\tgene\n",
    "b1\tCGATCGCTTAAGCTAGCA\tFOO\n",
    "b2\tATGCTAGATCGCGCTATT\tFOO\n",
    "b3\tGGCTTCTAGATCGCTATA\tControl\n",
);
247 |
#[test]
fn test_guide_uppercases_sequence() {
    // Bases that are already upper-case are stored unchanged
    let upper = Guide::new(0, "foo-1", "AAAAACCCCCGGGGGTTTTT", "FOO", GuideType::Other);
    assert_eq!(upper.index, 0);
    assert_eq!(upper.id, "foo-1");
    assert_eq!(upper.bases_str, "AAAAACCCCCGGGGGTTTTT");
    assert_eq!(upper.bases, "AAAAACCCCCGGGGGTTTTT".as_bytes());
    assert_eq!(upper.gene, "FOO");
    assert_eq!(upper.len(), 20);

    // Mixed-case bases are upper-cased in both the string and byte representations
    let mixed = Guide::new(0, "foo-2", "aAaAcCcCgGgGtTtTacgt", "FOO", GuideType::Other);
    assert_eq!(mixed.index, 0);
    assert_eq!(mixed.id, "foo-2");
    assert_eq!(mixed.bases_str, "AAAACCCCGGGGTTTTACGT");
    assert_eq!(mixed.bases, "AAAACCCCGGGGTTTTACGT".as_bytes());
    assert_eq!(mixed.gene, "FOO");
    assert_eq!(mixed.len(), 20);
}
266 |
#[test]
fn test_is_all_acgt() {
    // Sequences that should be accepted, including the empty sequence
    for valid in ["", "AACGCTGACTGA"] {
        assert!(GuideLibrary::is_acgt(valid.as_bytes()));
    }

    // Sequences that should be rejected: ambiguity codes, whitespace, gaps, lower-case
    for invalid in ["N", "AC GT", "AC-GT", "acgt"] {
        assert!(!GuideLibrary::is_acgt(invalid.as_bytes()));
    }
}
279 |
#[test]
fn test_guide_library_positive() {
    // A library may be empty
    let empty = GuideLibrary::new(vec![]).unwrap();
    assert_eq!(empty.len(), 0);
    assert!(empty.is_empty());

    // Or hold distinct guides of a single length, kept in the order given
    let first = Guide::new(0, "foo-1", "ACGTCAGCATGCATGACGTT", "FOO", GuideType::Other);
    let second = Guide::new(1, "foo-2", "GCTAGACTGGACTCTAATGC", "FOO", GuideType::Other);
    let pair = GuideLibrary::new(vec![first.clone(), second.clone()]).unwrap();
    assert_eq!(pair.len(), 2);
    assert_eq!(pair.guides, vec![first, second]);
    assert_eq!(pair.guide_length, 20);
}
294 |
#[test]
fn test_guide_library_rejects_mixed_length() {
    // A 20bp guide alongside a 21bp guide
    let guides = vec![
        Guide::new(0, "foo-1", "ACGTCAGCATGCATGACGTT", "FOO", GuideType::Other),
        Guide::new(1, "foo-2", "GCTAGACTGGACTCTAATGCC", "FOO", GuideType::Other),
    ];

    let message = GuideLibrary::new(guides).err().unwrap().to_string();
    assert!(message.contains("More than one guide length found"));
}
303 |
#[test]
fn test_guide_library_rejects_invalid_sequences() {
    // One sequence containing Ns and one that isn't a nucleotide sequence at all
    let with_ns = Guide::new(0, "foo-1", "ACGTCAGCANNNATGACGTT", "FOO", GuideType::Other);
    let not_dna = Guide::new(1, "foo-2", "hello!", "FOO", GuideType::Other);

    for guide in [with_ns, not_dna] {
        let message = GuideLibrary::new(vec![guide]).err().unwrap().to_string();
        assert!(message.contains("non-ACGT"));
    }
}
312 |
#[test]
fn test_guide_library_rejects_duplicate_sequences() {
    // Two guides with different ids but exactly the same bases
    let bases = "ACGTCAGCATGCATGACGTT";
    let guides = vec![
        Guide::new(0, "foo-1", bases, "FOO", GuideType::Other),
        Guide::new(1, "foo-2", bases, "FOO", GuideType::Other),
    ];

    let message = GuideLibrary::new(guides).err().unwrap().to_string();
    assert!(message.contains("unique"));
}
319 |
#[test]
fn test_reading_guide_library_from_csv_file() {
    let dir = tempfile::tempdir().unwrap();
    let csv = dir.path().join("lib.csv");
    Io::default().write_lines(&csv, vec![CSV_LIBRARY]).unwrap();

    // All three guides should be loaded, in file order
    let lib = GuideLibrary::from_file(&csv).unwrap();
    let ids = lib.guides.iter().map(|g| g.id.as_str()).collect_vec();
    assert_eq!(lib.len(), 3);
    assert_eq!(ids, vec!["a1", "a2", "a3"]);
}
330 |
#[test]
fn test_reading_guide_library_from_tsv_file() {
    let dir = tempfile::tempdir().unwrap();
    let tsv = dir.path().join("lib.tsv");
    Io::default().write_lines(&tsv, vec![TSV_LIBRARY]).unwrap();

    // All three guides should be loaded, in file order
    let lib = GuideLibrary::from_file(&tsv).unwrap();
    let ids = lib.guides.iter().map(|g| g.id.as_str()).collect_vec();
    assert_eq!(lib.len(), 3);
    assert_eq!(ids, vec!["b1", "b2", "b3"]);
}
341 |
#[test]
fn test_load_guide_library_from_files() {
    let tmp = TempDir::new().unwrap();
    let lib_path = tmp.path().join("library.tsv.gz");
    let ess_path = tmp.path().join("essential.txt");
    let non_path = tmp.path().join("non-essential.txt");
    let ctl_path = tmp.path().join("control-guides.txt");

    // The library, plus the gene and guide lists used to classify its guides
    let library_lines = vec![
        "guide\tbases\tgene",
        "g1.1\tAAAAAAAAAA\tG1",
        "g1.2\tCCCCCCCCCC\tG1",
        "g2.1\tGGGGGGGGGG\tG2",
        "g2.2\tTTTTTTTTTT\tG2",
        "g3.1\tACACACACAC\tG3",
        "g3.2\tAGAGAGAGAG\tG3",
        "g4.1\tATATATATAT\tG4",
        "g4.2\tCACACACACA\tG4",
        "c1\tGGAGGAGGAG\tnon-target1",
        "c2\tGGTGGTGGTG\tnon-target2",
        "c3\tAAGTAAGTCC\tcontrol",
    ];

    let io = Io::default();
    io.write_lines(&lib_path, library_lines).unwrap();
    io.write_lines(&ess_path, vec!["G1", "G3"]).unwrap();
    io.write_lines(&non_path, vec!["G4"]).unwrap();
    io.write_lines(&ctl_path, vec!["c1", "c2"]).unwrap();

    let lib = GuideLibrary::from_files(
        &lib_path,
        &Some(ess_path),
        &Some(non_path),
        &Some(ctl_path),
        &Some("Control"),
    )
    .unwrap();

    let map: HashMap<&str, &GuideType> =
        lib.guides.iter().map(|g| (g.id.as_str(), &g.kind)).collect();

    // c1 and c2 are controls via the guide list, c3 via the pattern matching its gene
    let expected = [
        ("g1.1", GuideType::Essential),
        ("g1.2", GuideType::Essential),
        ("g2.1", GuideType::Other),
        ("g2.2", GuideType::Other),
        ("g3.1", GuideType::Essential),
        ("g3.2", GuideType::Essential),
        ("g4.1", GuideType::Nonessential),
        ("g4.2", GuideType::Nonessential),
        ("c1", GuideType::Control),
        ("c2", GuideType::Control),
        ("c3", GuideType::Control),
    ];
    for (id, kind) in expected {
        assert_eq!(map[id], &kind);
    }
}
397 | }
398 |
--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
1 | pub mod commands;
2 | pub mod guide;
3 |
4 | use anyhow::Result;
5 | use clap::Parser;
6 | use commands::command::Command;
7 | use commands::*;
8 | use enum_dispatch::enum_dispatch;
9 | use env_logger::Env;
10 |
11 | #[global_allocator]
12 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
13 |
14 | #[derive(Parser, Debug)]
15 | struct Args {
16 | #[clap(subcommand)]
17 | subcommand: Subcommand,
18 | }
19 |
20 | #[enum_dispatch(Command)]
21 | #[derive(Parser, Debug)]
22 | enum Subcommand {
23 | Count(count::Count),
24 | }
25 |
26 | fn main() -> Result<()> {
27 | env_logger::Builder::from_env(Env::default().default_filter_or("info")).init();
28 | let args: Args = Args::parse();
29 | args.subcommand.execute()
30 | }
31 |
--------------------------------------------------------------------------------