├── .github ├── CODEOWNERS ├── logos │ └── fulcrumgenomics.svg └── workflows │ └── build_and_test.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── ci └── check.sh ├── comparison-data ├── plot-comparison.R ├── read-counts.png ├── read-counts.txt ├── runtimes.png └── runtimes.txt ├── metadata ├── tko_essential_genes.txt └── tko_nonessential_genes.txt ├── rustfmt.toml └── src ├── commands ├── command.rs ├── count.rs └── mod.rs ├── guide.rs └── main.rs /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @tfenne 2 | -------------------------------------------------------------------------------- /.github/logos/fulcrumgenomics.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/workflows/build_and_test.yml: -------------------------------------------------------------------------------- 1 | name: Check 2 | 3 | on: [push, pull_request] 4 | 5 | env: 6 | CARGO_TERM_COLOR: always 7 | 8 | jobs: 9 | check: 10 | name: Check 11 | runs-on: ubuntu-24.04 12 | steps: 13 | - name: Checkout sources 14 | uses: actions/checkout@v2 15 | 16 | - name: Install stable toolchain 17 | uses: actions-rs/toolchain@v1 18 | with: 19 | profile: minimal 20 | toolchain: stable 21 | override: true 22 | 23 | - name: Cache dependencies 24 | uses: Swatinem/rust-cache@v1 25 | 26 | - name: Run cargo check 27 | uses: actions-rs/cargo@v1 28 | with: 29 | command: check 30 | 31 | lints: 32 | name: Lints 33 | runs-on: ubuntu-24.04 34 | steps: 35 | - name: Checkout sources 36 | uses: actions/checkout@v2 37 | 38 | - name: Install stable toolchain 39 | uses: actions-rs/toolchain@v1 40 | with: 41 | profile: minimal 42 | toolchain: stable 43 | override: true 44 | components: rustfmt, clippy 45 | 46 | - name: Cache dependencies 47 | uses: Swatinem/rust-cache@v1 48 | 49 | - name: Run cargo fmt 50 | uses: 
actions-rs/cargo@v1 51 | with: 52 | command: fmt 53 | args: --all -- --check 54 | 55 | - name: Run cargo clippy 56 | uses: actions-rs/cargo@v1 57 | with: 58 | command: clippy 59 | args: -- -D warnings 60 | 61 | test: 62 | name: Test Suite 63 | runs-on: ${{ matrix.os }} 64 | strategy: 65 | matrix: 66 | os: [ubuntu-latest, macOS-latest] 67 | steps: 68 | - name: Checkout sources 69 | uses: actions/checkout@v2 70 | 71 | - name: Install stable toolchain 72 | uses: actions-rs/toolchain@v1 73 | with: 74 | profile: minimal 75 | toolchain: stable 76 | override: true 77 | 78 | - name: Cache dependencies 79 | uses: Swatinem/rust-cache@v1 80 | 81 | - name: Run tests 82 | run: cargo test --verbose 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | .idea 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "adler" 7 | version = "1.0.2" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" 10 | 11 | [[package]] 12 | name = "ahash" 13 | version = "0.7.6" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" 16 | dependencies = [ 17 | "getrandom", 18 | "once_cell", 19 | "version_check", 20 | ] 21 | 22 | [[package]] 23 | name = "aho-corasick" 24 | version = "0.7.18" 25 | source = "registry+https://github.com/rust-lang/crates.io-index" 26 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" 27 | dependencies = [ 28 | "memchr", 29 | ] 30 | 31 | [[package]] 32 | name = "anyhow" 33 | version = "1.0.52" 34 | source = "registry+https://github.com/rust-lang/crates.io-index" 35 | checksum = "84450d0b4a8bd1ba4144ce8ce718fbc5d071358b1e5384bace6536b3d1f2d5b3" 36 | 37 | [[package]] 38 | name = "atty" 39 | version = "0.2.14" 40 | source = "registry+https://github.com/rust-lang/crates.io-index" 41 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 42 | dependencies = [ 43 | "hermit-abi", 44 | "libc", 45 | "winapi", 46 | ] 47 | 48 | [[package]] 49 | name = "autocfg" 50 | version = "1.0.1" 51 | source = "registry+https://github.com/rust-lang/crates.io-index" 52 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 53 | 54 | [[package]] 55 | name = "bitflags" 56 | version = "1.3.2" 57 | source = "registry+https://github.com/rust-lang/crates.io-index" 58 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 59 | 60 | [[package]] 61 | name = "bstr" 62 | version = "0.2.17" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" 65 | dependencies = [ 
66 | "lazy_static", 67 | "memchr", 68 | "regex-automata", 69 | "serde", 70 | ] 71 | 72 | [[package]] 73 | name = "cc" 74 | version = "1.0.72" 75 | source = "registry+https://github.com/rust-lang/crates.io-index" 76 | checksum = "22a9137b95ea06864e018375b72adfb7db6e6f68cfc8df5a04d00288050485ee" 77 | 78 | [[package]] 79 | name = "cfg-if" 80 | version = "1.0.0" 81 | source = "registry+https://github.com/rust-lang/crates.io-index" 82 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 83 | 84 | [[package]] 85 | name = "clap" 86 | version = "3.0.0-rc.9" 87 | source = "registry+https://github.com/rust-lang/crates.io-index" 88 | checksum = "c7843ae7a539bef687e018bf9edf7e87728024b29d02b0f8409726be8880ae1a" 89 | dependencies = [ 90 | "atty", 91 | "bitflags", 92 | "clap_derive", 93 | "indexmap", 94 | "lazy_static", 95 | "os_str_bytes", 96 | "strsim", 97 | "termcolor", 98 | "textwrap", 99 | ] 100 | 101 | [[package]] 102 | name = "clap_derive" 103 | version = "3.0.0-rc.9" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "cae3cc2f259ea636871f5da15b0ac033f1821d7a5506c3d1bfbdde201f14c803" 106 | dependencies = [ 107 | "heck", 108 | "proc-macro-error", 109 | "proc-macro2", 110 | "quote", 111 | "syn", 112 | ] 113 | 114 | [[package]] 115 | name = "crc32fast" 116 | version = "1.3.0" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | checksum = "738c290dfaea84fc1ca15ad9c168d083b05a714e1efddd8edaab678dc28d2836" 119 | dependencies = [ 120 | "cfg-if", 121 | ] 122 | 123 | [[package]] 124 | name = "csv" 125 | version = "1.1.6" 126 | source = "registry+https://github.com/rust-lang/crates.io-index" 127 | checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" 128 | dependencies = [ 129 | "bstr", 130 | "csv-core", 131 | "itoa", 132 | "ryu", 133 | "serde", 134 | ] 135 | 136 | [[package]] 137 | name = "csv-core" 138 | version = "0.1.10" 139 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" 141 | dependencies = [ 142 | "memchr", 143 | ] 144 | 145 | [[package]] 146 | name = "either" 147 | version = "1.6.1" 148 | source = "registry+https://github.com/rust-lang/crates.io-index" 149 | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" 150 | 151 | [[package]] 152 | name = "enum_dispatch" 153 | version = "0.3.7" 154 | source = "registry+https://github.com/rust-lang/crates.io-index" 155 | checksum = "bd53b3fde38a39a06b2e66dc282f3e86191e53bd04cc499929c15742beae3df8" 156 | dependencies = [ 157 | "once_cell", 158 | "proc-macro2", 159 | "quote", 160 | "syn", 161 | ] 162 | 163 | [[package]] 164 | name = "env_logger" 165 | version = "0.8.4" 166 | source = "registry+https://github.com/rust-lang/crates.io-index" 167 | checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" 168 | dependencies = [ 169 | "atty", 170 | "humantime", 171 | "log", 172 | "regex", 173 | "termcolor", 174 | ] 175 | 176 | [[package]] 177 | name = "fastq" 178 | version = "0.6.0" 179 | source = "registry+https://github.com/rust-lang/crates.io-index" 180 | checksum = "c0dc54743d8fa10c176c4be22ccc6da3cc2b7f8b1b1b5a7fa17f4cbb94d3f29c" 181 | dependencies = [ 182 | "flate2", 183 | "lz4", 184 | "memchr", 185 | ] 186 | 187 | [[package]] 188 | name = "fgoxide" 189 | version = "0.1.3" 190 | source = "registry+https://github.com/rust-lang/crates.io-index" 191 | checksum = "d571c2c4fb6b56ada5b136196eb40aa4b4e22ae7ca2efb7eb2f8d45e6c13c297" 192 | dependencies = [ 193 | "csv", 194 | "flate2", 195 | "serde", 196 | "thiserror", 197 | ] 198 | 199 | [[package]] 200 | name = "flate2" 201 | version = "1.0.22" 202 | source = "registry+https://github.com/rust-lang/crates.io-index" 203 | checksum = "1e6988e897c1c9c485f43b47a529cef42fde0547f9d8d41a7062518f1d8fc53f" 204 | dependencies = [ 205 | "cfg-if", 206 | "crc32fast", 207 | 
"libc", 208 | "libz-sys", 209 | "miniz_oxide", 210 | ] 211 | 212 | [[package]] 213 | name = "getrandom" 214 | version = "0.2.3" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" 217 | dependencies = [ 218 | "cfg-if", 219 | "libc", 220 | "wasi", 221 | ] 222 | 223 | [[package]] 224 | name = "guide-counter" 225 | version = "0.1.3" 226 | dependencies = [ 227 | "ahash", 228 | "anyhow", 229 | "clap", 230 | "csv", 231 | "enum_dispatch", 232 | "env_logger", 233 | "fastq", 234 | "fgoxide", 235 | "flate2", 236 | "itertools", 237 | "log", 238 | "mimalloc", 239 | "regex", 240 | "serde", 241 | "tempfile", 242 | ] 243 | 244 | [[package]] 245 | name = "hashbrown" 246 | version = "0.11.2" 247 | source = "registry+https://github.com/rust-lang/crates.io-index" 248 | checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" 249 | 250 | [[package]] 251 | name = "heck" 252 | version = "0.3.3" 253 | source = "registry+https://github.com/rust-lang/crates.io-index" 254 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" 255 | dependencies = [ 256 | "unicode-segmentation", 257 | ] 258 | 259 | [[package]] 260 | name = "hermit-abi" 261 | version = "0.1.19" 262 | source = "registry+https://github.com/rust-lang/crates.io-index" 263 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 264 | dependencies = [ 265 | "libc", 266 | ] 267 | 268 | [[package]] 269 | name = "humantime" 270 | version = "2.1.0" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" 273 | 274 | [[package]] 275 | name = "indexmap" 276 | version = "1.7.0" 277 | source = "registry+https://github.com/rust-lang/crates.io-index" 278 | checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" 279 | dependencies = [ 280 | "autocfg", 281 
| "hashbrown", 282 | ] 283 | 284 | [[package]] 285 | name = "itertools" 286 | version = "0.10.3" 287 | source = "registry+https://github.com/rust-lang/crates.io-index" 288 | checksum = "a9a9d19fa1e79b6215ff29b9d6880b706147f16e9b1dbb1e4e5947b5b02bc5e3" 289 | dependencies = [ 290 | "either", 291 | ] 292 | 293 | [[package]] 294 | name = "itoa" 295 | version = "0.4.8" 296 | source = "registry+https://github.com/rust-lang/crates.io-index" 297 | checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" 298 | 299 | [[package]] 300 | name = "lazy_static" 301 | version = "1.4.0" 302 | source = "registry+https://github.com/rust-lang/crates.io-index" 303 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 304 | 305 | [[package]] 306 | name = "libc" 307 | version = "0.2.112" 308 | source = "registry+https://github.com/rust-lang/crates.io-index" 309 | checksum = "1b03d17f364a3a042d5e5d46b053bbbf82c92c9430c592dd4c064dc6ee997125" 310 | 311 | [[package]] 312 | name = "libmimalloc-sys" 313 | version = "0.1.23" 314 | source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "9636c194f9db483f4d0adf2f99a65011a99f904bd222bbd67fb4df4f37863c30" 316 | dependencies = [ 317 | "cc", 318 | ] 319 | 320 | [[package]] 321 | name = "libz-sys" 322 | version = "1.1.3" 323 | source = "registry+https://github.com/rust-lang/crates.io-index" 324 | checksum = "de5435b8549c16d423ed0c03dbaafe57cf6c3344744f1242520d59c9d8ecec66" 325 | dependencies = [ 326 | "cc", 327 | "pkg-config", 328 | "vcpkg", 329 | ] 330 | 331 | [[package]] 332 | name = "log" 333 | version = "0.4.14" 334 | source = "registry+https://github.com/rust-lang/crates.io-index" 335 | checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" 336 | dependencies = [ 337 | "cfg-if", 338 | ] 339 | 340 | [[package]] 341 | name = "lz4" 342 | version = "1.23.2" 343 | source = "registry+https://github.com/rust-lang/crates.io-index" 344 | checksum = 
"aac20ed6991e01bf6a2e68cc73df2b389707403662a8ba89f68511fb340f724c" 345 | dependencies = [ 346 | "libc", 347 | "lz4-sys", 348 | ] 349 | 350 | [[package]] 351 | name = "lz4-sys" 352 | version = "1.9.2" 353 | source = "registry+https://github.com/rust-lang/crates.io-index" 354 | checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" 355 | dependencies = [ 356 | "cc", 357 | "libc", 358 | ] 359 | 360 | [[package]] 361 | name = "memchr" 362 | version = "2.4.1" 363 | source = "registry+https://github.com/rust-lang/crates.io-index" 364 | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" 365 | 366 | [[package]] 367 | name = "mimalloc" 368 | version = "0.1.27" 369 | source = "registry+https://github.com/rust-lang/crates.io-index" 370 | checksum = "cf5f78c1d9892fb5677a8b2f543f967ab891ac0f71feecd961435b74f877283a" 371 | dependencies = [ 372 | "libmimalloc-sys", 373 | ] 374 | 375 | [[package]] 376 | name = "miniz_oxide" 377 | version = "0.4.4" 378 | source = "registry+https://github.com/rust-lang/crates.io-index" 379 | checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" 380 | dependencies = [ 381 | "adler", 382 | "autocfg", 383 | ] 384 | 385 | [[package]] 386 | name = "once_cell" 387 | version = "1.9.0" 388 | source = "registry+https://github.com/rust-lang/crates.io-index" 389 | checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" 390 | 391 | [[package]] 392 | name = "os_str_bytes" 393 | version = "6.0.0" 394 | source = "registry+https://github.com/rust-lang/crates.io-index" 395 | checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" 396 | dependencies = [ 397 | "memchr", 398 | ] 399 | 400 | [[package]] 401 | name = "pkg-config" 402 | version = "0.3.24" 403 | source = "registry+https://github.com/rust-lang/crates.io-index" 404 | checksum = "58893f751c9b0412871a09abd62ecd2a00298c6c83befa223ef98c52aef40cbe" 405 | 406 | [[package]] 407 | name = "ppv-lite86" 
408 | version = "0.2.15" 409 | source = "registry+https://github.com/rust-lang/crates.io-index" 410 | checksum = "ed0cfbc8191465bed66e1718596ee0b0b35d5ee1f41c5df2189d0fe8bde535ba" 411 | 412 | [[package]] 413 | name = "proc-macro-error" 414 | version = "1.0.4" 415 | source = "registry+https://github.com/rust-lang/crates.io-index" 416 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 417 | dependencies = [ 418 | "proc-macro-error-attr", 419 | "proc-macro2", 420 | "quote", 421 | "syn", 422 | "version_check", 423 | ] 424 | 425 | [[package]] 426 | name = "proc-macro-error-attr" 427 | version = "1.0.4" 428 | source = "registry+https://github.com/rust-lang/crates.io-index" 429 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 430 | dependencies = [ 431 | "proc-macro2", 432 | "quote", 433 | "version_check", 434 | ] 435 | 436 | [[package]] 437 | name = "proc-macro2" 438 | version = "1.0.36" 439 | source = "registry+https://github.com/rust-lang/crates.io-index" 440 | checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" 441 | dependencies = [ 442 | "unicode-xid", 443 | ] 444 | 445 | [[package]] 446 | name = "quote" 447 | version = "1.0.14" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "47aa80447ce4daf1717500037052af176af5d38cc3e571d9ec1c7353fc10c87d" 450 | dependencies = [ 451 | "proc-macro2", 452 | ] 453 | 454 | [[package]] 455 | name = "rand" 456 | version = "0.8.4" 457 | source = "registry+https://github.com/rust-lang/crates.io-index" 458 | checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" 459 | dependencies = [ 460 | "libc", 461 | "rand_chacha", 462 | "rand_core", 463 | "rand_hc", 464 | ] 465 | 466 | [[package]] 467 | name = "rand_chacha" 468 | version = "0.3.1" 469 | source = "registry+https://github.com/rust-lang/crates.io-index" 470 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 471 | 
dependencies = [ 472 | "ppv-lite86", 473 | "rand_core", 474 | ] 475 | 476 | [[package]] 477 | name = "rand_core" 478 | version = "0.6.3" 479 | source = "registry+https://github.com/rust-lang/crates.io-index" 480 | checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" 481 | dependencies = [ 482 | "getrandom", 483 | ] 484 | 485 | [[package]] 486 | name = "rand_hc" 487 | version = "0.3.1" 488 | source = "registry+https://github.com/rust-lang/crates.io-index" 489 | checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" 490 | dependencies = [ 491 | "rand_core", 492 | ] 493 | 494 | [[package]] 495 | name = "redox_syscall" 496 | version = "0.2.10" 497 | source = "registry+https://github.com/rust-lang/crates.io-index" 498 | checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" 499 | dependencies = [ 500 | "bitflags", 501 | ] 502 | 503 | [[package]] 504 | name = "regex" 505 | version = "1.5.4" 506 | source = "registry+https://github.com/rust-lang/crates.io-index" 507 | checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" 508 | dependencies = [ 509 | "aho-corasick", 510 | "memchr", 511 | "regex-syntax", 512 | ] 513 | 514 | [[package]] 515 | name = "regex-automata" 516 | version = "0.1.10" 517 | source = "registry+https://github.com/rust-lang/crates.io-index" 518 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" 519 | 520 | [[package]] 521 | name = "regex-syntax" 522 | version = "0.6.25" 523 | source = "registry+https://github.com/rust-lang/crates.io-index" 524 | checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" 525 | 526 | [[package]] 527 | name = "remove_dir_all" 528 | version = "0.5.3" 529 | source = "registry+https://github.com/rust-lang/crates.io-index" 530 | checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" 531 | dependencies = [ 532 | "winapi", 533 | ] 534 | 535 | [[package]] 536 | name = 
"ryu" 537 | version = "1.0.9" 538 | source = "registry+https://github.com/rust-lang/crates.io-index" 539 | checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" 540 | 541 | [[package]] 542 | name = "serde" 543 | version = "1.0.132" 544 | source = "registry+https://github.com/rust-lang/crates.io-index" 545 | checksum = "8b9875c23cf305cd1fd7eb77234cbb705f21ea6a72c637a5c6db5fe4b8e7f008" 546 | dependencies = [ 547 | "serde_derive", 548 | ] 549 | 550 | [[package]] 551 | name = "serde_derive" 552 | version = "1.0.132" 553 | source = "registry+https://github.com/rust-lang/crates.io-index" 554 | checksum = "ecc0db5cb2556c0e558887d9bbdcf6ac4471e83ff66cf696e5419024d1606276" 555 | dependencies = [ 556 | "proc-macro2", 557 | "quote", 558 | "syn", 559 | ] 560 | 561 | [[package]] 562 | name = "strsim" 563 | version = "0.10.0" 564 | source = "registry+https://github.com/rust-lang/crates.io-index" 565 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 566 | 567 | [[package]] 568 | name = "syn" 569 | version = "1.0.84" 570 | source = "registry+https://github.com/rust-lang/crates.io-index" 571 | checksum = "ecb2e6da8ee5eb9a61068762a32fa9619cc591ceb055b3687f4cd4051ec2e06b" 572 | dependencies = [ 573 | "proc-macro2", 574 | "quote", 575 | "unicode-xid", 576 | ] 577 | 578 | [[package]] 579 | name = "tempfile" 580 | version = "3.2.0" 581 | source = "registry+https://github.com/rust-lang/crates.io-index" 582 | checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" 583 | dependencies = [ 584 | "cfg-if", 585 | "libc", 586 | "rand", 587 | "redox_syscall", 588 | "remove_dir_all", 589 | "winapi", 590 | ] 591 | 592 | [[package]] 593 | name = "termcolor" 594 | version = "1.1.2" 595 | source = "registry+https://github.com/rust-lang/crates.io-index" 596 | checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" 597 | dependencies = [ 598 | "winapi-util", 599 | ] 600 | 601 | [[package]] 602 | name = 
"textwrap" 603 | version = "0.14.2" 604 | source = "registry+https://github.com/rust-lang/crates.io-index" 605 | checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" 606 | 607 | [[package]] 608 | name = "thiserror" 609 | version = "1.0.30" 610 | source = "registry+https://github.com/rust-lang/crates.io-index" 611 | checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417" 612 | dependencies = [ 613 | "thiserror-impl", 614 | ] 615 | 616 | [[package]] 617 | name = "thiserror-impl" 618 | version = "1.0.30" 619 | source = "registry+https://github.com/rust-lang/crates.io-index" 620 | checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b" 621 | dependencies = [ 622 | "proc-macro2", 623 | "quote", 624 | "syn", 625 | ] 626 | 627 | [[package]] 628 | name = "unicode-segmentation" 629 | version = "1.8.0" 630 | source = "registry+https://github.com/rust-lang/crates.io-index" 631 | checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" 632 | 633 | [[package]] 634 | name = "unicode-xid" 635 | version = "0.2.2" 636 | source = "registry+https://github.com/rust-lang/crates.io-index" 637 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 638 | 639 | [[package]] 640 | name = "vcpkg" 641 | version = "0.2.15" 642 | source = "registry+https://github.com/rust-lang/crates.io-index" 643 | checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" 644 | 645 | [[package]] 646 | name = "version_check" 647 | version = "0.9.3" 648 | source = "registry+https://github.com/rust-lang/crates.io-index" 649 | checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" 650 | 651 | [[package]] 652 | name = "wasi" 653 | version = "0.10.2+wasi-snapshot-preview1" 654 | source = "registry+https://github.com/rust-lang/crates.io-index" 655 | checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" 656 | 657 | [[package]] 658 | name = 
"winapi" 659 | version = "0.3.9" 660 | source = "registry+https://github.com/rust-lang/crates.io-index" 661 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 662 | dependencies = [ 663 | "winapi-i686-pc-windows-gnu", 664 | "winapi-x86_64-pc-windows-gnu", 665 | ] 666 | 667 | [[package]] 668 | name = "winapi-i686-pc-windows-gnu" 669 | version = "0.4.0" 670 | source = "registry+https://github.com/rust-lang/crates.io-index" 671 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 672 | 673 | [[package]] 674 | name = "winapi-util" 675 | version = "0.1.5" 676 | source = "registry+https://github.com/rust-lang/crates.io-index" 677 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 678 | dependencies = [ 679 | "winapi", 680 | ] 681 | 682 | [[package]] 683 | name = "winapi-x86_64-pc-windows-gnu" 684 | version = "0.4.0" 685 | source = "registry+https://github.com/rust-lang/crates.io-index" 686 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 687 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "guide-counter" 3 | version = "0.1.3" 4 | edition = "2021" 5 | authors = ["Tim Fennell"] 6 | license = "MIT" 7 | repository = "https://github.com/fulcrumgenomics/guide-counter" 8 | homepage = "https://github.com/fulcrumgenomics/guide-counter" 9 | description = "Fast and accurate guide counting for CRISPR screens." 
10 | readme = "README.md" 11 | categories = ["science"] 12 | keywords = ["bioinformatics", "genomic", "crispr"] 13 | 14 | [profile.release] 15 | lto = "fat" 16 | codegen-units = 1 17 | 18 | [dependencies] 19 | ahash = "0.7.6" 20 | anyhow = "1.0.48" 21 | clap = { version = "3.0.0-rc.9", features = ["derive"] } 22 | csv = "1.1.5" 23 | enum_dispatch = "0.3.7" 24 | env_logger = "0.8.2" 25 | fastq = "0.6.0" 26 | fgoxide = "0.1.3" 27 | flate2 = "1.0.22" 28 | itertools = "0.10.1" 29 | log = "0.4.14" 30 | mimalloc = { version = "0.1.17", default-features = false } 31 | regex = "1.5.4" 32 | serde = { version = "1.0.123", features = ["derive"] } 33 | 34 | [dev-dependencies] 35 | tempfile = "3.2.0" 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021 Fulcrum Genomics LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # guide-counter 2 | 3 | 4 |

5 | Build Status 6 | Version info 7 | Bioconda 8 | DOI 9 |
10 |

11 | 12 | A better, faster way to count guides in CRISPR screens. 13 | 14 |

15 | Fulcrum Genomics 16 |

17 | 18 | [Visit us at Fulcrum Genomics](https://www.fulcrumgenomics.com) to learn more about how we can power your Bioinformatics with guide-counter and beyond. 19 | 20 | 21 | 22 | 23 | ## Overview 24 | 25 | `guide-counter` is a tool for processing FASTQ files from CRISPR screen experiments to generate a matrix of per-sample guide counts. It can be used as a faster, more accurate, drop-in replacement for `mageck count`. By default `guide-counter` will look for guide sequences in the reads with 0 or 1 mismatches vs. the expected guides, but can be run in exact matching mode. 26 | 27 | ## Why `guide-counter`? 28 | 29 | If you have any experience analyzing CRISPR screens you've almost certainly tried [`mageck`][mageck-link]. It's widely used, highly cited and generally works well. Surprisingly though, `mageck count` is both rather slow _and_ misses counting a non-trivial amount of the data. 30 | 31 | As an example, we ran data from the [Sanson et al][sanson-link] paper through both tools. The dataset consists of: 32 | 33 | |Sample|Reads|Gzipped FASTQ Size| 34 | |------|-----|------------------| 35 | |Plasmid|9,821,128|377M| 36 | |RepA|76,471,324|2.3G| 37 | |RepB|85,301,059|2.5G| 38 | |RepC|75,356,900|2.2G| 39 | 40 | The following plot shows the amount of data recovered per sample by each of three different analyses: 41 | 42 | ![Read Counts from analyzing Sanson et al. data](comparison-data/read-counts.png) 43 | 44 | And the following plot shows the runtime for each of the three analyses performed using a single CPU core/thread on an Intel Core i9 powered MacBook Pro laptop: 45 | 46 | ![Runtimes from analyzing Sanson et al. 
data](comparison-data/runtimes.png) 47 | 48 | ## Installation 49 | 50 | Installation can be done using conda: 51 | 52 | ``` 53 | conda install -c bioconda guide-counter 54 | ``` 55 | 56 | or with `cargo` if installed: 57 | 58 | ``` 59 | cargo install guide-counter 60 | ``` 61 | ## Example Workflow 62 | 63 | The following shows an example of running `guide-counter` followed by `mageck test` on data from the [Sanson et al. 2018 paper][sanson-link]: 64 | 65 | ``` 66 | guide-counter count \ 67 | --input plasmid.fq.gz RepA.fq.gz RepB.fq.gz RepC.fq.gz \ 68 | --control-pattern control \ 69 | --essential-genes metadata/training_essentials.txt \ 70 | --nonessential-genes metadata/training_nonessential.txt \ 71 | --library metadata/broadgpp-brunello-library-corrected.txt.gz \ 72 | --output sanson 73 | 74 | mageck test \ 75 | --count-table sanson.counts.txt \ 76 | --control-id plasmid \ 77 | --treatment-id RepA,RepB,RepC \ 78 | --norm-method median \ 79 | --output-prefix sanson.test 80 | 81 | ``` 82 | 83 | ## Inputs 84 | 85 | The full usage for `guide-counter count` is reproduced below; this section describes a few of the key inputs in more detail: 86 | 87 | |Input Option|Required|Description| 88 | |------------|--------|-----------| 89 | |`--input`|Yes|FASTQ files one per sample. Files may be gzipped or uncompressed.| 90 | |`--samples`|No|Names for the samples, matched positionally to the FASTQs. If not provided then the input file names minus any `.[fq|fastq][.gz]` suffixes are used instead.| 91 | |`--essential-genes`|No|An optional file of known essential genes. May be gzipped or uncompressed. May be either just gene names, one per line, or tab-delimited with the gene in the first column. If given, guides will be labeled as essential for matching genes, and mean coverage of guides for essential genes computed.| 92 | |`--nonessential-genes`|No|An optional file of known nonessential genes. May be gzipped or uncompressed. 
May be either just gene names, one per line, or tab-delimited with the gene in the first column. If given, guides will be labeled as nonessential for matching genes, and mean coverage of guides for nonessential genes computed.| 93 | |`--control-guides`|No|An optional file of guide IDs for control guides. May be gzipped or uncompressed. May be either just guide IDs, one per line, or tab-delimited data with the guide ID in the first column. If given, matching guides will be labeled as controls, and mean coverage of control guides computed. May be used alone _or_ in conjunction with `--control-pattern`.| 94 | |`--control-pattern`|No|An optional regular expression which is applied (case insensitive) to _both_ guide IDs and gene names, and when a match is found, guides are labeled as controls. For example `--control-pattern control` works well for many human libraries.| 95 | 96 | ## Outputs 97 | 98 | Three output files are generated: 99 | 100 | 1. `{output}.counts.txt` - a standard count matrix with columns for the guide ID and gene, then one column per sample with raw/unnormalized guide counts. 101 | 2. `{output}.extended-counts.txt` - an extended version of the counts matrix which includes a `guide_type` column which will have one of `[Essential, Nonessential, Control, Other]` per guide as determined based on the gene lists and control information provided. 102 | 3. `{output}.stats.txt` - a file of computed statistics, one row per input sample/FASTQ. 
103 | 104 | The columns in the stats file are: 105 | 106 | |Column|Description| 107 | |------|-----------| 108 | |file|The path to the input FASTQ file used to generate the stats.| 109 | |label|The label or sample name given to the sample.| 110 | |total_guides|The total number of guides in the guide library (not sample dependent).| 111 | |total_reads|The total number of reads in the input FASTQ file.| 112 | |mapped_reads|The number of reads that could be mapped to a guide.| 113 | |frac_mapped|The fraction of reads (0-1) that could be mapped to a guide.| 114 | |mean_reads_per_guide|The mean number of reads mapped to each guide in the library.| 115 | |mean_reads_essential|The mean number of reads mapped to guides for essential genes.| 116 | |mean_reads_nonessential|The mean number of reads mapped to guides for nonessential genes.| 117 | |mean_reads_control|The mean number of reads mapped to control guides.| 118 | |mean_reads_other|The mean number of reads mapped to other guides (guides not flagged as essential, nonessential or control).| 119 | |zero_read_guides|The number of guides in the library to which no reads were mapped.| 120 | 121 | 122 | ## Usage 123 | 124 | Usage for `guide-counter count`: 125 | 126 | ``` 127 | guide-counter-count 128 | 129 | Counts the guides observed in a CRISPR screen, starting from one or more FASTQs. FASTQs are one per 130 | sample and currently only single-end FASTQ inputs are supported. 131 | 132 | A set of sample IDs may be provided using `--samples id1 id2 ..`. If provided it must have the same 133 | number of values as input FASTQs. If not provided the FASTQ names are used minus any fastq/fq/gz 134 | suffixes. 135 | 136 | Automatically determines the range of valid offsets within the sequencing reads where the guide 137 | sequences are located, independently for each FASTQ input. The first `offset-sample-size` reads 138 | from each FASTQ are examined to determine the offsets at which guides are found. 
When processing the 139 | full FASTQ, checks only those offsets that accounted for at least `offset-min-fraction` of the first 140 | `offset-sample-size` reads. 141 | 142 | Matching by default allows for one mismatch (and no indels) between the read sub-sequence and the 143 | expected guide sequences. Exact matching may be enabled by specifying the `--exact-match` option. 144 | 145 | Two output files are generated. The first is named `{output}.counts.txt` and contains columns for 146 | the guide id, the gene targeted by the guide and one count column per input FASTQ with raw/un- 147 | normalized counts. The second is named `{output}.stats.txt` and contains basic QC statistics per 148 | input FASTQ on the matching process. 149 | 150 | USAGE: 151 | guide-counter count [OPTIONS] --input ... --library --output 152 | 153 | OPTIONS: 154 | -c, --control-guides 155 | Optional path to file with list control guide IDs. IDs should appear one per line and 156 | are case sensitive 157 | 158 | -C, --control-pattern 159 | Optional regular expression pattern used to ID control guides. Pattern is matched, case 160 | insensitive, to guide IDs and Gene names 161 | 162 | -e, --essential-genes 163 | Optional path to file with list of essential genes. Gene names should appear one per 164 | line and are case sensitive 165 | 166 | -f, --offset-min-fraction 167 | After sampling the first `offset_sample_size` reads, use offsets that 168 | 169 | [default: 0.005] 170 | 171 | -h, --help 172 | Print help information 173 | 174 | -i, --input ... 175 | Input fastq file(s) 176 | 177 | -l, --library 178 | Path to the guide library metadata. May be a tab- or comma-separated file. Must have a 179 | header line, and the first three fields must be (in order): i) the ID of the guide, ii) 180 | the base sequence of the guide, iii) the gene the guide targets 181 | 182 | -n, --nonessential-genes 183 | Optional path to file with list of nonessential genes. 
Gene names should appear one per 184 | line and are case sensitive 185 | 186 | -N, --offset-sample-size 187 | The number of reads to be examined when determining the offsets at which guides may be 188 | found in the input reads 189 | 190 | [default: 100000] 191 | 192 | -o, --output 193 | Path prefix to use for all output files 194 | 195 | -s, --samples ... 196 | Sample names corresponding to the input fastqs. If provided must be the same length as 197 | input. Otherwise will be inferred from input file names 198 | 199 | -x, --exact-match 200 | Perform exact matching only, don't allow mismatches between reads and guides 201 | ``` 202 | 203 | [sanson-link]: https://pubmed.ncbi.nlm.nih.gov/30575746/ 204 | [mageck-link]: https://pubmed.ncbi.nlm.nih.gov/25476604/ 205 | -------------------------------------------------------------------------------- /ci/check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function banner() { 4 | echo 5 | echo "================================================================================" 6 | echo $* 7 | echo "================================================================================" 8 | echo 9 | } 10 | 11 | ##################################################################### 12 | # Takes two parameters, a "name" and a "command". 13 | # Runs the command and prints out whether it succeeded or failed, and 14 | # also tracks a list of failed steps in $failures. 15 | ##################################################################### 16 | function run() { 17 | local name=$1 18 | local cmd=$2 19 | 20 | banner "Running $name [$cmd]" 21 | set +e 22 | $cmd 23 | exit_code=$? 
24 | set -e 25 | 26 | if [[ $exit_code == 0 ]]; then 27 | echo Passed $name: "[$cmd]" 28 | else 29 | echo Failed $name: "[$cmd]" 30 | if [ -z "$failures" ]; then 31 | failures="$failures $name" 32 | else 33 | failures="$failures, $name" 34 | fi 35 | fi 36 | } 37 | 38 | parent=$(cd $(dirname $0) && pwd -P) 39 | repo_root=$(cd $(dirname $0)/.. && pwd -P) 40 | 41 | run "Formatting" "cargo fmt" 42 | run "Clippy" "cargo clippy -- -D warnings" 43 | run "Unit Tests" "cargo test" 44 | 45 | if [ -z "$failures" ]; then 46 | banner "Checks Passed" 47 | else 48 | banner "Checks Failed with failures in: $failures" 49 | exit 1 50 | fi 51 | -------------------------------------------------------------------------------- /comparison-data/plot-comparison.R: -------------------------------------------------------------------------------- 1 | require(ggplot2) 2 | 3 | counts = read.table("read-counts.txt", sep="\t", header=T) 4 | runtimes = read.table("runtimes.txt", sep="\t", header=T) 5 | 6 | png(filename="runtimes.png", width=800, height=550, res=100) 7 | ggplot(runtimes) + 8 | aes(x=tool, y=runtime_seconds, fill=tool) + 9 | geom_col() + 10 | scale_y_continuous(minor_breaks=seq(0, 1700, 100)) + 11 | scale_fill_brewer(palette = "Paired") + 12 | theme(plot.title = element_text(hjust = 0.5)) + 13 | theme(plot.subtitle = element_text(hjust = 0.5)) + 14 | labs(x="Tool", y="Runtime (seconds)", 15 | title="Runtime of Counting Guides in 4 FASTQs from Sanson et al.", 16 | subtitle="(Smaller bars are better)") 17 | dev.off() 18 | 19 | png(filename="read-counts.png", width=800, height=550, res=100) 20 | ggplot(counts) + 21 | aes(fill=analysis, x=sample, y=matched_reads) + 22 | scale_fill_brewer(palette = "Paired") + 23 | theme(plot.title = element_text(hjust = 0.5)) + 24 | theme(plot.subtitle = element_text(hjust = 0.5)) + 25 | geom_bar(position="dodge", stat="identity") + 26 | labs(x="Sample", y="Reads Matched to Guides", 27 | title="Matched Reads in 4 FASTQs from Sanson et al.", 28 | 
subtitle="(Bigger bars are better)") 29 | dev.off() 30 | -------------------------------------------------------------------------------- /comparison-data/read-counts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fulcrumgenomics/guide-counter/23f2e20e8f295a9ab9388c7d41ee5659c3d4b6df/comparison-data/read-counts.png -------------------------------------------------------------------------------- /comparison-data/read-counts.txt: -------------------------------------------------------------------------------- 1 | analysis sample matched_reads 2 | guide-counter --exact plasmid 8758764 3 | guide-counter --exact RepA 64107160 4 | guide-counter --exact RepB 70347268 5 | guide-counter --exact RepC 63477791 6 | guide-counter plasmid 9318181 7 | guide-counter RepA 69491160 8 | guide-counter RepB 76248653 9 | guide-counter RepC 68900168 10 | mageck count plasmid 6716580 11 | mageck count RepA 49074849 12 | mageck count RepB 53814919 13 | mageck count RepC 48579881 14 | -------------------------------------------------------------------------------- /comparison-data/runtimes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fulcrumgenomics/guide-counter/23f2e20e8f295a9ab9388c7d41ee5659c3d4b6df/comparison-data/runtimes.png -------------------------------------------------------------------------------- /comparison-data/runtimes.txt: -------------------------------------------------------------------------------- 1 | tool runtime_seconds 2 | guide-counter 207 3 | guide-counter --exact 115 4 | mageck count 1658 5 | -------------------------------------------------------------------------------- /metadata/tko_essential_genes.txt: -------------------------------------------------------------------------------- 1 | gene HGNC_ID ENTREZ_ID 2 | ACTL6A HGNC:24124 86 3 | ACTR6 HGNC:24025 64431 4 | ALYREF HGNC:19071 10189 5 | ANAPC4 HGNC:19990 
29945 6 | ANAPC5 HGNC:15713 51433 7 | AP2S1 HGNC:565 1175 8 | AQR HGNC:29513 9716 9 | ARCN1 HGNC:649 372 10 | ARL5B HGNC:23052 221079 11 | ATP6V0D1 HGNC:13724 9114 12 | ATXN7 HGNC:10560 6314 13 | BIRC6 HGNC:13516 57448 14 | BOP1 HGNC:15519 23246 15 | BPTF HGNC:3581 2186 16 | BRIX1 HGNC:24170 55299 17 | C11orf24 HGNC:1174 53838 18 | C12orf66 HGNC:26517 144577 19 | C14orf166 HGNC:23169 51637 20 | C19orf43 HGNC:28424 79002 21 | CAP1 HGNC:20040 10487 22 | CCNG1 HGNC:1592 900 23 | CCNK HGNC:1596 8812 24 | CCT3 HGNC:1616 7203 25 | CCT4 HGNC:1617 10575 26 | CCT5 HGNC:1618 22948 27 | CCT6A HGNC:1620 908 28 | CCT7 HGNC:1622 10574 29 | CCT8 HGNC:1623 10694 30 | CDC40 HGNC:17350 51362 31 | CDC5L HGNC:1743 988 32 | CDK17 HGNC:8750 5128 33 | CHD4 HGNC:1919 1108 34 | CHMP2A HGNC:30216 27243 35 | CLTC HGNC:2092 1213 36 | COPA HGNC:2230 1314 37 | COPB1 HGNC:2231 1315 38 | COPB2 HGNC:2232 9276 39 | COPE HGNC:2234 11316 40 | COPS2 HGNC:30747 9318 41 | COPS4 HGNC:16702 51138 42 | COPS6 HGNC:21749 10980 43 | COPS8 HGNC:24335 10920 44 | COPZ1 HGNC:2243 22818 45 | COX10 HGNC:2260 1352 46 | CPSF2 HGNC:2325 53981 47 | CPSF3 HGNC:2326 51692 48 | CSE1L HGNC:2431 1434 49 | CTDNEP1 HGNC:19085 23399 50 | DARS HGNC:2678 1615 51 | DDB1 HGNC:2717 1642 52 | DDX18 HGNC:2741 8886 53 | DDX21 HGNC:2744 9188 54 | DDX46 HGNC:18681 9879 55 | DDX49 HGNC:18684 54555 56 | DDX51 HGNC:20082 317781 57 | DDX54 HGNC:20084 79039 58 | DKC1 HGNC:2890 1736 59 | DLG5 HGNC:2904 9231 60 | DNAJC1 HGNC:20090 64215 61 | DNM2 HGNC:2974 1785 62 | DNTTIP2 HGNC:24013 30836 63 | DYNC1H1 HGNC:2961 1778 64 | DYNC1I2 HGNC:2964 1781 65 | EEF2 HGNC:3214 1938 66 | EFTUD2 HGNC:30858 9343 67 | EIF1AX HGNC:3250 1964 68 | EIF2B2 HGNC:3258 8892 69 | EIF2B3 HGNC:3259 8891 70 | EIF2B4 HGNC:3260 8891 71 | EIF2S2 HGNC:3266 8894 72 | EIF3A HGNC:6159 3692 73 | EIF3B HGNC:3280 8662 74 | EIF3C HGNC:3279 8663 75 | EIF3D HGNC:3278 8664 76 | EIF3E HGNC:3277 3646 77 | EIF3F HGNC:3275 8665 78 | EIF3G HGNC:3274 8666 79 | EIF3H HGNC:3273 8667 80 | 
EIF3I HGNC:3272 8668 81 | EIF4A3 HGNC:18683 9775 82 | EIF4E HGNC:3287 1977 83 | EIF5B HGNC:30793 9669 84 | EIF6 HGNC:6159 3692 85 | ERAP1 HGNC:18173 51752 86 | ERH HGNC:3447 2079 87 | ETF1 HGNC:3477 2107 88 | EXOSC10 HGNC:9138 5394 89 | FDPS HGNC:3631 2224 90 | FLAD1 HGNC:24671 80308 91 | FTSJ3 HGNC:17136 117246 92 | GABPA HGNC:4071 2551 93 | GAPDH HGNC:4141 2597 94 | GAR1 HGNC:14264 54433 95 | GEMIN2 HGNC:10884 8487 96 | GLYR1 HGNC:24434 84656 97 | GMPS HGNC:4378 8833 98 | GNL3 HGNC:29931 26354 99 | GTF3C4 HGNC:4667 9329 100 | HAUS1 HGNC:25174 115106 101 | HAUS7 HGNC:32979 55559 102 | HEATR1 HGNC:25517 55127 103 | HNRNPC HGNC:5035 3183 104 | HNRNPK HGNC:5044 3190 105 | HNRNPL HGNC:5045 3191 106 | HNRNPM HGNC:5046 4670 107 | HNRNPU HGNC:5048 3192 108 | HPS5 HGNC:17022 11234 109 | HSPA9 HGNC:5244 3313 110 | HSPB11 HGNC:25019 51668 111 | HSPE1 HGNC:5269 3336 112 | IARS2 HGNC:29685 55699 113 | ICK HGNC:21219 22858 114 | IMP4 HGNC:30856 92856 115 | IMPAD1 HGNC:26019 54928 116 | INCENP HGNC:6058 3619 117 | INTS9 HGNC:25592 55756 118 | IQGAP1 HGNC:6110 8826 119 | ISCU HGNC:29882 23479 120 | JMJD1C HGNC:12313 221037 121 | KARS HGNC:6215 3735 122 | KPNB1 HGNC:6400 3837 123 | KRAS HGNC:6407 3845 124 | LARS2 HGNC:17095 23395 125 | LIAS HGNC:16429 11019 126 | LSM4 HGNC:17259 25804 127 | LSM6 HGNC:17017 25804 128 | LUC7L3 HGNC:24309 51747 129 | LZIC HGNC:17497 84328 130 | MAPK6 HGNC:6879 5597 131 | MAPKAP1 HGNC:18752 79109 132 | MED12 HGNC:11957 9968 133 | MED14 HGNC:2370 9282 134 | MED30 HGNC:23032 90390 135 | MMAB HGNC:19331 326625 136 | MRPS31 HGNC:16632 10240 137 | MSANTD3 HGNC:23370 91283 138 | MTFMT HGNC:29666 123263 139 | MTX1 HGNC:7504 4580 140 | NAA10 HGNC:18704 8260 141 | NAA38 HGNC:28212 84316 142 | NACA HGNC:7629 4666 143 | NAPA HGNC:7641 8775 144 | NAPG HGNC:7642 8774 145 | NCBP1 HGNC:7658 4686 146 | NEDD8 HGNC:7732 4738 147 | NHP2L1 HGNC:7819 4809 148 | NPTN HGNC:17867 27020 149 | NT5C2 HGNC:8022 22978 150 | NUDT21 HGNC:13870 11051 151 | NUP133 HGNC:18016 55746 
152 | NUP205 HGNC:18658 23165 153 | NUP54 HGNC:17359 53371 154 | NUP93 HGNC:28958 9688 155 | NUP98 HGNC:8068 4928 156 | NUPL1 HGNC:20261 9818 157 | NUTF2 HGNC:13722 10204 158 | NXF1 HGNC:8071 10482 159 | OPA1 HGNC:8140 4976 160 | PABPN1 HGNC:8565 8106 161 | PAFAH1B1 HGNC:8574 5048 162 | PAPOLA HGNC:14981 10914 163 | PARN HGNC:8609 5073 164 | PCBP1 HGNC:8647 5093 165 | PCBP2 HGNC:8648 5094 166 | PDE8A HGNC:8793 5151 167 | PDHA1 HGNC:8806 5160 168 | PFDN2 HGNC:8867 5202 169 | PFN1 HGNC:8881 5216 170 | PHB HGNC:8912 5245 171 | PHB2 HGNC:30306 11331 172 | PHF16 HGNC:22982 9767 173 | PHF5A HGNC:18000 84844 174 | PHYH HGNC:8940 5264 175 | PLAGL2 HGNC:9047 5326 176 | POLR1B HGNC:20454 84172 177 | POLR2D HGNC:9191 5433 178 | POLR2F HGNC:9193 5435 179 | POLR2I HGNC:9196 5438 180 | POP5 HGNC:17689 51367 181 | PPIE HGNC:9258 10450 182 | PPP2R1A HGNC:9302 5518 183 | PPP2R2D HGNC:23732 55844 184 | PPP5C HGNC:9322 5536 185 | PREB HGNC:9356 10113 186 | PRKAA1 HGNC:9376 5562 187 | PRKAB2 HGNC:9379 5565 188 | PRKDC HGNC:9413 5591 189 | PRPF18 HGNC:17351 8559 190 | PRPF19 HGNC:17896 27339 191 | PRPF3 HGNC:17348 9129 192 | PRPF31 HGNC:15446 26121 193 | PRPF38A HGNC:25930 84950 194 | PRPF8 HGNC:17340 10594 195 | PRUNE HGNC:13420 58497 196 | PSMA1 HGNC:9530 5682 197 | PSMA2 HGNC:9531 5683 198 | PSMA3 HGNC:9532 5684 199 | PSMA4 HGNC:9533 5685 200 | PSMA6 HGNC:9535 5687 201 | PSMB2 HGNC:9539 5690 202 | PSMB3 HGNC:9540 5691 203 | PSMB5 HGNC:9542 5693 204 | PSMB6 HGNC:9543 5694 205 | PSMC1 HGNC:9547 5700 206 | PSMC2 HGNC:9548 5701 207 | PSMC3 HGNC:9549 5702 208 | PSMC4 HGNC:9551 5704 209 | PSMD1 HGNC:9554 5707 210 | PSMD11 HGNC:9556 5717 211 | PSMD12 HGNC:9557 5718 212 | PSMD13 HGNC:9558 5719 213 | PSMD2 HGNC:9559 5708 214 | PSMD4 HGNC:9561 5710 215 | PSMD7 HGNC:9565 5713 216 | QARS HGNC:9751 5859 217 | RAN HGNC:9846 5901 218 | RANBP2 HGNC:9848 5903 219 | RBM14 HGNC:14219 10432 220 | RBM17 HGNC:16944 84991 221 | RBM22 HGNC:25503 55696 222 | RBM25 HGNC:23244 58517 223 | RBMX HGNC:9910 27316 
224 | RILPL2 HGNC:28787 196383 225 | RNF139 HGNC:17023 11236 226 | RPA1 HGNC:10289 6117 227 | RPA2 HGNC:10290 6118 228 | RPL10 HGNC:10298 6134 229 | RPL10A HGNC:10299 4736 230 | RPL11 HGNC:10301 6135 231 | RPL12 HGNC:10302 6136 232 | RPL13 HGNC:10303 6137 233 | RPL13A HGNC:10304 23521 234 | RPL14 HGNC:10305 9045 235 | RPL18 HGNC:10310 6141 236 | RPL18A HGNC:10311 6142 237 | RPL19 HGNC:10312 6143 238 | RPL23A HGNC:10317 6147 239 | RPL24 HGNC:10325 6152 240 | RPL26 HGNC:10327 6154 241 | RPL27 HGNC:10328 6155 242 | RPL3 HGNC:10332 6122 243 | RPL30 HGNC:10333 6156 244 | RPL31 HGNC:10334 6160 245 | RPL32 HGNC:10336 6161 246 | RPL34 HGNC:10340 6164 247 | RPL35 HGNC:10344 11224 248 | RPL35A HGNC:10345 6165 249 | RPL36 HGNC:13631 25873 250 | RPL37 HGNC:10347 6167 251 | RPL37A HGNC:10348 6168 252 | RPL38 HGNC:10349 6169 253 | RPL4 HGNC:10353 6124 254 | RPL5 HGNC:10360 6125 255 | RPL6 HGNC:10362 6128 256 | RPL7 HGNC:10363 6129 257 | RPL7A HGNC:10364 6130 258 | RPLP0 HGNC:10371 6175 259 | RPLP2 HGNC:10377 6181 260 | RPN2 HGNC:10382 6185 261 | RPP38 HGNC:30329 10557 262 | RPP40 HGNC:20992 10799 263 | RPS11 HGNC:10384 6205 264 | RPS12 HGNC:10385 6206 265 | RPS13 HGNC:10386 6207 266 | RPS14 HGNC:10387 6208 267 | RPS15 HGNC:10388 6209 268 | RPS15A HGNC:10389 6210 269 | RPS17 HGNC:10397 6218 270 | RPS18 HGNC:10401 6222 271 | RPS19 HGNC:10402 6223 272 | RPS20 HGNC:10405 6224 273 | RPS24 HGNC:10411 6229 274 | RPS25 HGNC:10413 6230 275 | RPS26 HGNC:10414 6231 276 | RPS27A HGNC:10417 6233 277 | RPS3 HGNC:10420 6188 278 | RPS3A HGNC:10421 6189 279 | RPS4X HGNC:10424 6191 280 | RPS5 HGNC:10426 6193 281 | RPS6 HGNC:10429 6194 282 | RPS7 HGNC:10440 6201 283 | RPS8 HGNC:10441 6202 284 | RPS9 HGNC:10442 6203 285 | RPSA HGNC:6502 3921 286 | RRM1 HGNC:10451 6240 287 | RRM2B HGNC:17296 50484 288 | RTF1 HGNC:28996 23168 289 | RUVBL1 HGNC:10474 8607 290 | RUVBL2 HGNC:10475 10856 291 | SARS HGNC:10537 6301 292 | SDAD1 HGNC:25537 55153 293 | SF3A1 HGNC:10765 10291 294 | SF3A2 HGNC:10766 8175 295 | 
SF3B1 HGNC:10768 23451 296 | SF3B2 HGNC:10769 10992 297 | SF3B3 HGNC:10770 23450 298 | SF3B4 HGNC:10771 10262 299 | SF3B5 HGNC:21083 83443 300 | SFPQ HGNC:10774 6421 301 | SHFM1 HGNC:10845 7979 302 | SKIV2L2 HGNC:18734 23517 303 | SLC25A43 HGNC:30557 203427 304 | SMC1A HGNC:11111 8243 305 | SMC3 HGNC:2468 9126 306 | SNRNP200 HGNC:30859 23020 307 | SNRNP27 HGNC:30240 11017 308 | SNRPB HGNC:11153 6628 309 | SNRPC HGNC:11157 6631 310 | SNRPD1 HGNC:11159 6633 311 | SNRPD2 HGNC:11159 6633 312 | SNW1 HGNC:16696 22938 313 | SON HGNC:11183 6651 314 | SRBD1 HGNC:25521 55133 315 | SRCAP HGNC:16974 10847 316 | SRFBP1 HGNC:26333 153443 317 | SRRT HGNC:24101 51593 318 | SRSF1 HGNC:10780 6426 319 | SRSF3 HGNC:10785 6428 320 | SSR1 HGNC:11323 6745 321 | STRN HGNC:11424 6801 322 | SUPT5H HGNC:11469 6829 323 | SUPT6H HGNC:11470 6830 324 | SUPV3L1 HGNC:11471 6832 325 | TBC1D25 HGNC:8092 4943 326 | TBCE HGNC:11582 6905 327 | TCERG1 HGNC:15630 10915 328 | TDRD3 HGNC:20612 81550 329 | TFIP11 HGNC:17165 24144 330 | THOP1 HGNC:11793 7064 331 | TIMM10 HGNC:11814 26519 332 | TOMM40 HGNC:18001 10452 333 | TPR HGNC:12017 7175 334 | TRRAP HGNC:12347 8295 335 | TSEN2 HGNC:28422 80746 336 | TSTA3 HGNC:12390 7264 337 | TUBA1B HGNC:18809 10376 338 | TUBB HGNC:20778 203068 339 | TUBGCP2 HGNC:18599 10844 340 | U2AF1 HGNC:12453 7307 341 | U2AF2 HGNC:23156 11338 342 | UBA1 HGNC:12469 7317 343 | UBB HGNC:12463 7314 344 | UBL4A HGNC:12505 8266 345 | USP39 HGNC:20071 10713 346 | VCP HGNC:12666 7415 347 | VTA1 HGNC:20954 51534 348 | WDR12 HGNC:14098 55759 349 | WDR33 HGNC:25651 55339 350 | WDR60 HGNC:21862 55112 351 | WDR61 HGNC:30300 80349 352 | XAB2 HGNC:14089 56949 353 | XIAP HGNC:592 331 354 | XPO1 HGNC:12825 7514 355 | YY1 HGNC:12856 7528 356 | ZBTB48 HGNC:4930 3104 357 | ZC3H13 HGNC:20368 23091 358 | ZC3H18 HGNC:25091 124245 359 | ZFR HGNC:17277 51663 360 | ZNF160 HGNC:12948 90338 361 | ZNF207 HGNC:12998 7756 362 | -------------------------------------------------------------------------------- 
/metadata/tko_nonessential_genes.txt: -------------------------------------------------------------------------------- 1 | gene HGNC_ID ENTREZ_ID 2 | ABCG8 HGNC:13887 64241 3 | ACCSL HGNC:34391 390110 4 | ACTL7A HGNC:161 10881 5 | ACTL7B HGNC:162 10880 6 | ACTL9 HGNC:28494 284382 7 | ACTRT1 HGNC:24027 139741 8 | ADAD1 HGNC:30713 132612 9 | ADAM18 HGNC:196 8749 10 | ADAM2 HGNC:198 2515 11 | ADAM20 HGNC:199 8748 12 | ADAM30 HGNC:208 11085 13 | ADH7 HGNC:256 131 14 | AFM HGNC:316 173 15 | AICDA HGNC:13203 57379 16 | AIPL1 HGNC:359 23746 17 | ALPI HGNC:437 248 18 | ALPPL2 HGNC:441 251 19 | ALX3 HGNC:449 257 20 | AMELX HGNC:461 265 21 | ANKRD30A HGNC:17234 91074 22 | ANKRD60 HGNC:16217 140731 23 | ANTXRL HGNC:27277 195977 24 | APOA4 HGNC:602 337 25 | APOBEC1 HGNC:604 339 26 | APOF HGNC:615 319 27 | AQP12A HGNC:19941 375318 28 | AQP8 HGNC:642 343 29 | ARGFX HGNC:30146 503582 30 | ART1 HGNC:723 417 31 | ASB17 HGNC:19769 127247 32 | ASIC5 HGNC:17537 51802 33 | ASZ1 HGNC:1350 136991 34 | ATOH1 HGNC:797 474 35 | ATP4B HGNC:820 496 36 | ATP6V1G3 HGNC:18265 127124 37 | AWAT1 HGNC:23252 158833 38 | AWAT2 HGNC:23251 158835 39 | B3GNT6 HGNC:24141 192134 40 | BANF2 HGNC:16172 140836 41 | BARHL1 HGNC:953 56751 42 | BEND2 HGNC:28509 139105 43 | BHLHE23 HGNC:16093 128408 44 | BIRC8 HGNC:14878 112401 45 | BMP10 HGNC:20869 27302 46 | BMP15 HGNC:1068 9210 47 | BPIFA1 HGNC:15749 51297 48 | BPIFA3 HGNC:16204 128861 49 | BPIFB3 HGNC:16178 359710 50 | BPIFB6 HGNC:16504 128859 51 | BPIFC HGNC:16503 254240 52 | BPY2 HGNC:13508 9083 53 | BRDT HGNC:1105 676 54 | BSND HGNC:16512 7809 55 | C10orf113 HGNC:31447 387638 56 | C10orf120 HGNC:25707 399814 57 | C10orf53 HGNC:27421 282966 58 | C11orf40 HGNC:23986 143501 59 | C12orf40 HGNC:26846 283461 60 | C14orf183 HGNC:27285 196913 61 | C15orf55 HGNC:29919 256646 62 | C16orf78 HGNC:28479 123970 63 | C17orf102 HGNC:34412 400591 64 | C17orf78 HGNC:26831 284099 65 | C18orf26 HGNC:26808 284254 66 | C19orf45 HGNC:24745 374877 67 | C1orf146 HGNC:24032 388649 
68 | C20orf173 HGNC:16166 140873 69 | C20orf203 HGNC:26592 284805 70 | C20orf79 HGNC:16211 140856 71 | C2orf57 HGNC:28563 165100 72 | C2orf61 HGNC:26850 285051 73 | C2orf71 HGNC:34383 388939 74 | C2orf83 HGNC:25344 56918 75 | C3orf30 HGNC:26553 152405 76 | C4orf40 HGNC:33193 401137 77 | C5orf20 HGNC:24459 140947 78 | C6orf10 HGNC:13922 10665 79 | C7orf66 HGNC:33712 154907 80 | C7orf71 HGNC:22364 285941 81 | C8A HGNC:1352 731 82 | C8B HGNC:1353 732 83 | C8orf17 HGNC:17737 100507249 84 | C8orf86 HGNC:33774 389649 85 | C9orf53 HGNC:23831 51198 86 | CABP2 HGNC:1385 51475 87 | CABP5 HGNC:13714 56344 88 | CABS1 HGNC:30710 85438 89 | CACNG2 HGNC:1406 10369 90 | CACNG3 HGNC:1407 10368 91 | CACNG5 HGNC:1409 27091 92 | CATSPER4 HGNC:23220 378807 93 | CCDC155 HGNC:26520 147872 94 | CCDC172 HGNC:30524 374355 95 | CCDC83 HGNC:28535 220047 96 | CCKAR HGNC:1570 886 97 | CCL1 HGNC:10609 6346 98 | CCT8L2 HGNC:15553 150160 99 | CD200R1L HGNC:24665 344807 100 | CDCP2 HGNC:27297 200008 101 | CDX2 HGNC:1806 1045 102 | CDX4 HGNC:1808 1046 103 | CDY1 HGNC:1809 9085 104 | CDY1B HGNC:23920 253175 105 | CDY2A HGNC:1810 9426 106 | CDY2B HGNC:23921 203611 107 | CEACAM7 HGNC:1819 1087 108 | CELA2A HGNC:24609 63036 109 | CELA3A HGNC:15944 10136 110 | CELA3B HGNC:15945 23436 111 | CER1 HGNC:1862 9350 112 | CETN1 HGNC:1866 1068 113 | CFHR2 HGNC:4890 3080 114 | CFHR5 HGNC:24668 81494 115 | CHAT HGNC:1912 1103 116 | CHRNA6 HGNC:15963 8973 117 | CHRNB3 HGNC:1963 1142 118 | CLCA1 HGNC:2015 1179 119 | CLDN17 HGNC:2038 26285 120 | CLEC2A HGNC:24191 387836 121 | CLEC3A HGNC:2052 10143 122 | CLEC6A HGNC:14556 93978 123 | CLRN1 HGNC:12605 7401 124 | CNBD1 HGNC:26663 168975 125 | CNGA2 HGNC:2149 1260 126 | CNGB3 HGNC:2153 54714 127 | CNPY1 HGNC:27786 285888 128 | CNTNAP5 HGNC:18748 129684 129 | COL20A1 HGNC:14670 57642 130 | COX7B2 HGNC:24381 170712 131 | CPXCR1 HGNC:2332 53336 132 | CRNN HGNC:1230 49860 133 | CRX HGNC:2383 1406 134 | CRYGB HGNC:2409 1419 135 | CSH1 HGNC:2440 1442 136 | CSHL1 HGNC:2442 
1444 137 | CSN2 HGNC:2447 1447 138 | CSN3 HGNC:2446 1448 139 | CST11 HGNC:15959 140880 140 | CST4 HGNC:2476 1472 141 | CST5 HGNC:2477 1473 142 | CST8 HGNC:2480 10047 143 | CST9 HGNC:13261 128822 144 | CST9L HGNC:16233 128821 145 | CSTL1 HGNC:15958 128817 146 | CT45A2 HGNC:28400 728911 147 | CT45A4 HGNC:33269 441520 148 | CT45A5 HGNC:33270 441521 149 | CT47A11 HGNC:27397 255313 150 | CTCFL HGNC:16234 140690 151 | CTRB1 HGNC:2521 1504 152 | CXorf1 HGNC:2562 9142 153 | CXorf66 HGNC:33743 347487 154 | CYLC2 HGNC:2583 1539 155 | CYP11B1 HGNC:2591 1539 156 | CYP11B2 HGNC:2592 1585 157 | CYP26C1 HGNC:20577 340665 158 | CYP2A13 HGNC:2608 1553 159 | CYP2C19 HGNC:2621 1557 160 | CYP4A22 HGNC:20575 284541 161 | CYP4F8 HGNC:2648 11283 162 | CYP7A1 HGNC:2651 1581 163 | DAZ1 HGNC:2682 1617 164 | DAZ2 HGNC:15964 57055 165 | DAZ3 HGNC:15965 57054 166 | DAZ4 HGNC:15966 57135 167 | DAZL HGNC:2685 1618 168 | DCAF4L2 HGNC:26657 138009 169 | DCAF8L1 HGNC:31810 139425 170 | DDI1 HGNC:18961 414301 171 | DDX4 HGNC:18700 54514 172 | DEFA5 HGNC:2764 1670 173 | DEFA6 HGNC:2765 1671 174 | DEFB103B HGNC:31702 55894 175 | DEFB104A HGNC:18115 140596 176 | DEFB106A HGNC:18088 245909 177 | DEFB107A HGNC:18086 245910 178 | DEFB118 HGNC:16196 117285 179 | DEFB123 HGNC:18103 245936 180 | DEFB126 HGNC:15900 81623 181 | DEFB127 HGNC:16206 140850 182 | DEFB129 HGNC:16218 140881 183 | DGAT2L6 HGNC:23250 347516 184 | DGKK HGNC:32395 139189 185 | DIRC1 HGNC:15760 116093 186 | DMP1 HGNC:2932 1758 187 | DMRT1 HGNC:2934 1761 188 | DMRTB1 HGNC:13913 63948 189 | DMRTC2 HGNC:13911 63946 190 | DPCR1 HGNC:21666 135656 191 | DPRX HGNC:32166 503834 192 | DRD3 HGNC:3024 1814 193 | DRGX HGNC:21536 644168 194 | DSCR4 HGNC:3045 10281 195 | DSG4 HGNC:21307 147409 196 | DSPP HGNC:3054 1834 197 | DTX2 HGNC:15973 113878 198 | DUSP21 HGNC:20476 63904 199 | DUX4 HGNC:50800 100288687 200 | DUX4L7 HGNC:37266 653543 201 | DUXA HGNC:32179 503835 202 | EFCAB3 HGNC:26379 146779 203 | EGR4 HGNC:3241 1961 204 | ENTHD1 HGNC:26352 
150350 205 | ESX1 HGNC:14865 80712 206 | EVX1 HGNC:3506 2128 207 | F13B HGNC:3534 2165 208 | F9 HGNC:3551 2158 209 | FABP2 HGNC:3556 2169 210 | FAM106A HGNC:25682 80039 211 | FAM47A HGNC:29962 158724 212 | FAM47B HGNC:26659 170062 213 | FAM47C HGNC:25301 442444 214 | FAM71A HGNC:26541 149647 215 | FAM71B HGNC:28397 153745 216 | FAM71C HGNC:28594 196472 217 | FAM75A7 HGNC:32007 26165 218 | FAM75D1 HGNC:37283 389763 219 | FCRL4 HGNC:18507 83417 220 | FEZF1 HGNC:22788 389549 221 | FEZF2 HGNC:13506 55079 222 | FFAR1 HGNC:4498 2864 223 | FGF3 HGNC:3681 2248 224 | FGF4 HGNC:3682 2249 225 | FGF6 HGNC:3684 2251 226 | FIGLA HGNC:24669 344018 227 | FLG2 HGNC:33276 388698 228 | FMR1NB HGNC:26372 158521 229 | FNDC7 HGNC:26668 163479 230 | FNDC9 HGNC:33547 408263 231 | FOXB1 HGNC:3799 27023 232 | FOXB2 HGNC:23315 442425 233 | FOXD4L3 HGNC:18523 286380 234 | FOXD4L4 HGNC:23762 349334 235 | FOXE3 HGNC:3808 2301 236 | FOXN1 HGNC:12765 8456 237 | FOXR1 HGNC:29980 283150 238 | FRG2 HGNC:19136 448831 239 | FRMD7 HGNC:8079 90167 240 | FSCB HGNC:20494 84075 241 | FUT5 HGNC:4016 2527 242 | FUT9 HGNC:4020 10690 243 | G6PC HGNC:4056 2538 244 | GABRA1 HGNC:4075 2554 245 | GABRA6 HGNC:4080 2559 246 | GAGE1 HGNC:4098 2543 247 | GAGE2C HGNC:31958 2574 248 | GALNTL5 HGNC:21725 168391 249 | GALR1 HGNC:4132 2587 250 | GALR3 HGNC:4134 8484 251 | GBP7 HGNC:29606 388646 252 | GCG HGNC:4191 2641 253 | GCM2 HGNC:4198 9247 254 | GDF2 HGNC:4217 2658 255 | GFRA4 HGNC:13821 64096 256 | GFRAL HGNC:32789 389400 257 | GH2 HGNC:4262 2689 258 | GHRH HGNC:4265 2691 259 | GHSR HGNC:4267 2693 260 | GIF HGNC:4268 2694 261 | GJA10 HGNC:19155 81025 262 | GJA8 HGNC:4281 2703 263 | GK2 HGNC:4291 2712 264 | GKN2 HGNC:24588 200504 265 | GLRA1 HGNC:4326 2741 266 | GLRA2 HGNC:4327 2742 267 | GLT6D1 HGNC:23671 360203 268 | GML HGNC:4375 2765 269 | GOLGA6L2 HGNC:26695 283685 270 | GOT1L1 HGNC:28487 137362 271 | GPR101 HGNC:14963 83550 272 | GPR111 HGNC:18991 222611 273 | GPR119 HGNC:19060 139760 274 | GPR128 HGNC:19241 
84873 275 | GPR139 HGNC:19995 124274 276 | GPR144 HGNC:18651 347088 277 | GPR148 HGNC:23623 344561 278 | GPR151 HGNC:23624 134391 279 | GPR152 HGNC:23622 390212 280 | GPR26 HGNC:4481 2849 281 | GPR31 HGNC:4486 2853 282 | GPR32 HGNC:4487 2854 283 | GPR45 HGNC:4503 11250 284 | GPR50 HGNC:4506 9248 285 | GPR52 HGNC:4508 9293 286 | GPR78 HGNC:4528 27201 287 | GPRC6A HGNC:18510 27201 288 | GPX5 HGNC:4557 2880 289 | GPX6 HGNC:4558 257202 290 | GRK1 HGNC:10013 6011 291 | GRM4 HGNC:4596 2914 292 | GRM5 HGNC:4597 2915 293 | GRM6 HGNC:4598 2916 294 | GSC2 HGNC:4613 2928 295 | GSTA5 HGNC:19662 221357 296 | GSX1 HGNC:20374 219409 297 | GSX2 HGNC:24959 170825 298 | GUCA2A HGNC:4682 2980 299 | GUCY2F HGNC:4691 2986 300 | H1FOO HGNC:18463 132243 301 | H2BFM HGNC:27867 286436 302 | H2BFWT HGNC:27252 158983 303 | HAO1 HGNC:4809 54363 304 | HCRTR2 HGNC:4849 3062 305 | HDGFL1 HGNC:21095 154150 306 | HHLA1 HGNC:4904 10086 307 | HIST1H2AA HGNC:18729 221613 308 | HIST1H2BA HGNC:18730 255626 309 | HIST1H4G HGNC:4792 8369 310 | HMX1 HGNC:5017 3166 311 | HOXB1 HGNC:5111 3211 312 | HOXD12 HGNC:5135 3238 313 | HRG HGNC:5181 3273 314 | HRH3 HGNC:5184 11255 315 | HSFY1 HGNC:18568 86614 316 | HSFY2 HGNC:23950 159119 317 | HTN3 HGNC:5284 3347 318 | HTR1A HGNC:5286 3350 319 | HTR2C HGNC:5295 3358 320 | HTR3C HGNC:24003 170572 321 | HTR3D HGNC:24004 200909 322 | HTR3E HGNC:24005 285242 323 | HTR5A HGNC:5300 3361 324 | HTR6 HGNC:5301 3362 325 | IAPP HGNC:5329 3375 326 | IFIT1B HGNC:23442 439996 327 | IFNA10 HGNC:5418 3446 328 | IFNA14 HGNC:5420 3448 329 | IFNA16 HGNC:5421 3449 330 | IFNA17 HGNC:5422 3451 331 | IFNA2 HGNC:5423 3440 332 | IFNA21 HGNC:5424 3452 333 | IFNA4 HGNC:5425 3441 334 | IFNA5 HGNC:5426 3442 335 | IFNA6 HGNC:5427 3443 336 | IFNA7 HGNC:5428 3444 337 | IFNA8 HGNC:5429 3445 338 | IFNB1 HGNC:5434 3456 339 | IFNK HGNC:21714 56832 340 | IFNW1 HGNC:5448 3467 341 | IL12B HGNC:5970 3593 342 | IL13 HGNC:5973 3596 343 | IL17A HGNC:5981 3605 344 | IL17F HGNC:16404 112744 345 | IL1F10 
HGNC:15552 84639 346 | IL21 HGNC:6005 59067 347 | IL22 HGNC:14900 50616 348 | IL25 HGNC:13765 50616 349 | IL26 HGNC:17119 55801 350 | IL28A HGNC:18364 282616 351 | IL28B HGNC:18365 282617 352 | IL29 HGNC:18363 282618 353 | IL3 HGNC:6011 3562 354 | IL31 HGNC:19372 386653 355 | IL36A HGNC:15562 27179 356 | IL36B HGNC:15564 27177 357 | IL36RN HGNC:15561 26525 358 | IL9 HGNC:6029 3578 359 | INS HGNC:6081 3630 360 | INSL5 HGNC:6088 10022 361 | INSL6 HGNC:6089 11172 362 | INSM2 HGNC:17539 11172 363 | INSRR HGNC:6093 3645 364 | IQCF1 HGNC:28607 132141 365 | IRGC HGNC:28835 56269 366 | ISX HGNC:28084 91464 367 | ITIH6 HGNC:28907 347365 368 | IZUMO2 HGNC:28518 126123 369 | KCNA10 HGNC:6219 3744 370 | KCNB2 HGNC:6232 9312 371 | KCNG4 HGNC:19697 93107 372 | KCNK10 HGNC:6273 54207 373 | KCNK16 HGNC:14464 83795 374 | KCNK18 HGNC:19439 338567 375 | KCNV1 HGNC:18861 27012 376 | KHDC3L HGNC:33699 154288 377 | KIF2B HGNC:29443 84643 378 | KIR2DL1 HGNC:6329 3802 379 | KIR3DL3 HGNC:16312 115653 380 | KLK12 HGNC:6360 43849 381 | KLK9 HGNC:6370 284366 382 | KRT2 HGNC:6439 3849 383 | KRT25 HGNC:30839 147183 384 | KRT26 HGNC:30840 353288 385 | KRT28 HGNC:30842 162605 386 | KRT33A HGNC:6450 3883 387 | KRT35 HGNC:6453 3886 388 | KRT36 HGNC:6454 8689 389 | KRT37 HGNC:6455 8688 390 | KRT38 HGNC:6456 8687 391 | KRT40 HGNC:26707 125115 392 | KRT71 HGNC:28927 112802 393 | KRT73 HGNC:28928 319101 394 | KRT74 HGNC:28929 121391 395 | KRT75 HGNC:24431 9119 396 | KRT76 HGNC:24430 51350 397 | KRT77 HGNC:20411 374454 398 | KRT78 HGNC:28926 196374 399 | KRT82 HGNC:6459 3888 400 | KRT84 HGNC:6461 3890 401 | KRT85 HGNC:6462 3891 402 | KRT86 HGNC:6463 3892 403 | KRT9 HGNC:6447 3857 404 | KRTAP1-1 HGNC:16772 81851 405 | KRTAP10-1 HGNC:22966 386677 406 | KRTAP10-10 HGNC:22972 353333 407 | KRTAP10-11 HGNC:20528 386678 408 | KRTAP10-12 HGNC:20533 386685 409 | KRTAP10-2 HGNC:22967 386679 410 | KRTAP10-4 HGNC:20521 386672 411 | KRTAP10-5 HGNC:22969 386680 412 | KRTAP10-6 HGNC:20523 386674 413 | KRTAP10-7 
HGNC:22970 386675 414 | KRTAP10-8 HGNC:20525 386681 415 | KRTAP10-9 HGNC:22971 386676 416 | KRTAP11-1 HGNC:18922 386676 417 | KRTAP13-1 HGNC:18924 140258 418 | KRTAP13-2 HGNC:18923 337959 419 | KRTAP13-3 HGNC:18925 337960 420 | KRTAP13-4 HGNC:18926 284827 421 | KRTAP15-1 HGNC:18927 254950 422 | KRTAP17-1 HGNC:18917 83902 423 | KRTAP19-3 HGNC:18938 337970 424 | KRTAP23-1 HGNC:18928 337963 425 | KRTAP26-1 HGNC:33760 388818 426 | KRTAP3-2 HGNC:16779 83897 427 | KRTAP4-11 HGNC:18911 653240 428 | KRTAP4-12 HGNC:16776 83755 429 | KRTAP4-2 HGNC:18900 85291 430 | KRTAP4-4 HGNC:16928 84616 431 | KRTAP4-7 HGNC:18898 100132476 432 | KRTAP5-2 HGNC:23597 440021 433 | KRTAP9-2 HGNC:16926 83899 434 | KRTAP9-3 HGNC:16927 83900 435 | KRTAP9-4 HGNC:18902 85280 436 | LALBA HGNC:6480 3906 437 | LBX1 HGNC:16960 10660 438 | LCN9 HGNC:17442 392399 439 | LCT HGNC:6530 3938 440 | LGALS13 HGNC:15449 29124 441 | LGALS14 HGNC:30054 56891 442 | LHFPL5 HGNC:21253 222662 443 | LHX3 HGNC:6595 8022 444 | LHX5 HGNC:14216 64211 445 | LIM2 HGNC:6610 3982 446 | LIN28A HGNC:15986 79727 447 | LIPM HGNC:23455 340654 448 | LOR HGNC:6663 4014 449 | LRIT1 HGNC:23404 26103 450 | LRIT2 HGNC:23443 340745 451 | LRRC10 HGNC:20264 376132 452 | LUZP4 HGNC:24971 51213 453 | LYZL1 HGNC:30502 84569 454 | LYZL2 HGNC:29613 119180 455 | LYZL6 HGNC:29614 57151 456 | MAGEA10 HGNC:6797 4109 457 | MAGEA11 HGNC:6798 4110 458 | MAGEB1 HGNC:6808 4112 459 | MAGEB10 HGNC:25377 139422 460 | MAGEB18 HGNC:28515 286514 461 | MAGEB3 HGNC:6810 4114 462 | MAGEB4 HGNC:6811 4115 463 | MAGEC3 HGNC:23798 139081 464 | MAS1 HGNC:6899 4142 465 | MAS1L HGNC:13961 116511 466 | MBD3L1 HGNC:15774 85509 467 | MBD3L2 HGNC:18532 125997 468 | MBL2 HGNC:6922 4153 469 | MC2R HGNC:6930 4153 470 | MC3R HGNC:6931 4159 471 | MC5R HGNC:6933 4161 472 | MEP1A HGNC:7015 4224 473 | MEP1B HGNC:7020 4225 474 | MEPE HGNC:13361 56955 475 | MFRP HGNC:18121 83552 476 | MMD2 HGNC:30133 221938 477 | MMP20 HGNC:7167 9313 478 | MMP21 HGNC:7170 8511 479 | MMP26 HGNC:14249 
56547 480 | MMP27 HGNC:14250 64066 481 | MOGAT3 HGNC:23249 346606 482 | MORC1 HGNC:7198 27136 483 | MRGPRD HGNC:29626 116512 484 | MRGPRX1 HGNC:17962 259249 485 | MRGPRX2 HGNC:17983 117194 486 | MRGPRX4 HGNC:17617 117196 487 | MS4A10 HGNC:13368 341116 488 | MS4A13 HGNC:16674 503497 489 | MS4A5 HGNC:13374 64232 490 | MSGN1 HGNC:14907 343930 491 | MT1B HGNC:7394 4490 492 | MTNR1B HGNC:7464 4544 493 | MUC17 HGNC:16800 140453 494 | MUC7 HGNC:7518 4589 495 | MYBPC3 HGNC:7551 4607 496 | MYF5 HGNC:7565 4617 497 | NANOGNB HGNC:24958 360030 498 | NANOS2 HGNC:23292 339345 499 | NCR2 HGNC:6732 9436 500 | NDST4 HGNC:20779 64579 501 | NEUROD2 HGNC:7763 4761 502 | NEUROD4 HGNC:13802 58158 503 | NEUROD6 HGNC:13804 63974 504 | NEUROG1 HGNC:7764 4762 505 | NKX2-1 HGNC:11825 7080 506 | NKX2-2 HGNC:7835 4821 507 | NLRP4 HGNC:22943 147945 508 | NLRP5 HGNC:21269 126206 509 | NLRP8 HGNC:22940 126205 510 | NLRP9 HGNC:22941 338321 511 | NMS HGNC:32203 129521 512 | NOBOX HGNC:22448 135935 513 | NOTO HGNC:31839 344022 514 | NOX3 HGNC:7890 50508 515 | NPFFR1 HGNC:17425 64106 516 | NPHS2 HGNC:13394 7827 517 | NPSR1 HGNC:23631 387129 518 | NPVF HGNC:13782 64111 519 | NR2E1 HGNC:7973 7101 520 | NYX HGNC:8082 60506 521 | OC90 HGNC:8100 729330 522 | OLIG2 HGNC:9398 10215 523 | OLIG3 HGNC:18003 167826 524 | OPALIN HGNC:20707 93377 525 | OPN1LW HGNC:9936 5956 526 | OPN5 HGNC:19992 221391 527 | OR10A2 HGNC:8161 341276 528 | OR10A4 HGNC:15130 283297 529 | OR10A5 HGNC:15131 144124 530 | OR10H1 HGNC:8172 26539 531 | OR10H2 HGNC:8173 26538 532 | OR10H3 HGNC:8174 26532 533 | OR10J1 HGNC:8175 26476 534 | OR10R2 HGNC:14820 343406 535 | OR10S1 HGNC:14807 219873 536 | OR10X1 HGNC:14995 128367 537 | OR10Z1 HGNC:14996 128368 538 | OR11A1 HGNC:8176 128368 539 | OR12D2 HGNC:8178 26529 540 | OR12D3 HGNC:13963 81797 541 | OR13C3 HGNC:14704 138803 542 | OR13D1 HGNC:14695 286365 543 | OR14A16 HGNC:15022 284532 544 | OR1A1 HGNC:8179 8383 545 | OR1A2 HGNC:8180 26189 546 | OR1B1 HGNC:8181 347169 547 | OR1D2 HGNC:8183 
4991 548 | OR1E1 HGNC:8189 8387 549 | OR1E2 HGNC:8190 8388 550 | OR1G1 HGNC:8204 8390 551 | OR1L6 HGNC:8218 392390 552 | OR1N2 HGNC:15111 138882 553 | OR1S1 HGNC:8227 219959 554 | OR1S2 HGNC:15141 219958 555 | OR2AK2 HGNC:19569 391191 556 | OR2AT4 HGNC:19620 341152 557 | OR2C1 HGNC:8242 4993 558 | OR2C3 HGNC:15005 81472 559 | OR2D2 HGNC:8244 120776 560 | OR2D3 HGNC:15146 120775 561 | OR2F1 HGNC:8246 26211 562 | OR2G2 HGNC:15007 81470 563 | OR2G3 HGNC:15008 81469 564 | OR2H1 HGNC:8252 26716 565 | OR2J2 HGNC:8260 26707 566 | OR2L3 HGNC:15009 391192 567 | OR2T1 HGNC:8277 26696 568 | OR2T10 HGNC:19573 127069 569 | OR2T12 HGNC:19592 127064 570 | OR2T2 HGNC:14725 401992 571 | OR2T27 HGNC:31252 403239 572 | OR2T33 HGNC:31255 391195 573 | OR2T4 HGNC:15016 127074 574 | OR2T5 HGNC:15017 401993 575 | OR2W1 HGNC:8281 26692 576 | OR3A1 HGNC:8282 4994 577 | OR3A2 HGNC:8283 4995 578 | OR3A3 HGNC:8284 8392 579 | OR4C11 HGNC:15167 219429 580 | OR4C3 HGNC:14697 256144 581 | OR4D1 HGNC:8293 26689 582 | OR4D10 HGNC:15173 390197 583 | OR4D11 HGNC:15174 219986 584 | OR4D9 HGNC:15178 390199 585 | OR4K17 HGNC:15355 390436 586 | OR51B6 HGNC:19600 390058 587 | OR51D1 HGNC:15193 390038 588 | OR51F2 HGNC:15197 119694 589 | OR51T1 HGNC:15205 401665 590 | OR51V1 HGNC:19597 283111 591 | OR52A1 HGNC:8318 23538 592 | OR52A5 HGNC:19580 390054 593 | OR52B2 HGNC:15207 255725 594 | OR52B6 HGNC:15211 340980 595 | OR52E8 HGNC:15217 390079 596 | OR52I2 HGNC:15221 143502 597 | OR52K2 HGNC:15223 119774 598 | OR52L1 HGNC:14785 338751 599 | OR52M1 HGNC:15225 119772 600 | OR52R1 HGNC:15235 119695 601 | OR52W1 HGNC:15239 120787 602 | OR56A1 HGNC:14781 120796 603 | OR56A4 HGNC:14791 120793 604 | OR56B1 HGNC:15245 387748 605 | OR5AU1 HGNC:15362 390445 606 | OR5C1 HGNC:8331 392391 607 | OR5I1 HGNC:8347 10798 608 | OR5M1 HGNC:8352 390168 609 | OR5M10 HGNC:15290 390167 610 | OR5P2 HGNC:14783 120065 611 | OR5P3 HGNC:14784 120066 612 | OR5R1 HGNC:14841 219479 613 | OR5T1 HGNC:14821 390155 614 | OR5T2 HGNC:15296 
219464 615 | OR5T3 HGNC:15297 390154 616 | OR5V1 HGNC:13972 81696 617 | OR5W2 HGNC:15299 390148 618 | OR6A2 HGNC:15301 8590 619 | OR6K6 HGNC:15033 128371 620 | OR6S1 HGNC:15363 341799 621 | OR6V1 HGNC:15090 346517 622 | OR7A17 HGNC:8363 26333 623 | OR7C2 HGNC:8374 26658 624 | OR7D4 HGNC:8380 125958 625 | OR7G2 HGNC:8466 390882 626 | OR8A1 HGNC:8469 390275 627 | OR8B8 HGNC:8477 26493 628 | OR8G5 HGNC:19622 219865 629 | OR8U1 HGNC:19611 219417 630 | OR9Q2 HGNC:15328 219957 631 | OTOP1 HGNC:19656 133060 632 | OTOP3 HGNC:19658 347741 633 | OTOR HGNC:8517 56914 634 | OTP HGNC:8518 23440 635 | OTUD6A HGNC:32312 139562 636 | OTX2 HGNC:8522 5015 637 | PAGE3 HGNC:4110 139793 638 | PANX3 HGNC:20573 116337 639 | PASD1 HGNC:20686 139135 640 | PAX1 HGNC:8615 5075 641 | PAX4 HGNC:8618 5078 642 | PBOV1 HGNC:21079 59351 643 | PDCL2 HGNC:29524 132954 644 | PDE6H HGNC:8790 5149 645 | PDILT HGNC:27338 204474 646 | PDX1 HGNC:6107 3651 647 | PDYN HGNC:8820 5173 648 | PGK2 HGNC:8898 5232 649 | PGLYRP2 HGNC:30013 114770 650 | PGLYRP3 HGNC:30014 114771 651 | PIWIL1 HGNC:9007 9271 652 | PIWIL3 HGNC:18443 440822 653 | PKD1L3 HGNC:21716 342372 654 | PLA2G2E HGNC:13414 30814 655 | PLA2G2F HGNC:30040 64600 656 | PLA2G4E HGNC:24791 123745 657 | PLAC1L HGNC:26699 219990 658 | PNLIP HGNC:9155 5406 659 | PNLIPRP1 HGNC:9156 5407 660 | PNLIPRP2 HGNC:9157 5408 661 | PNPLA5 HGNC:24888 150379 662 | POM121L12 HGNC:25369 285877 663 | POTEA HGNC:33893 340441 664 | POTED HGNC:23822 317754 665 | POTEG HGNC:33896 404785 666 | POTEH HGNC:133 23784 667 | POU3F4 HGNC:9217 5456 668 | POU4F2 HGNC:9219 5458 669 | POU4F3 HGNC:9220 5459 670 | POU5F2 HGNC:26367 134187 671 | PPP3R2 HGNC:9318 5535 672 | PRAMEF1 HGNC:28840 65121 673 | PRAMEF19 HGNC:24908 645414 674 | PRAMEF2 HGNC:28841 65122 675 | PRAMEF3 HGNC:14087 401940 676 | PRAMEF4 HGNC:31971 400735 677 | PRAMEF7 HGNC:28415 441871 678 | PRB1 HGNC:9337 5542 679 | PRB4 HGNC:9340 5545 680 | PRDM13 HGNC:13998 59336 681 | PRDM14 HGNC:14001 63978 682 | PRDM7 HGNC:9351 
11105 683 | PRDM9 HGNC:13994 56979 684 | PRG3 HGNC:9363 10394 685 | PRLH HGNC:17945 51052 686 | PRLHR HGNC:4464 2834 687 | PROP1 HGNC:9455 5626 688 | PRSS33 HGNC:30405 260429 689 | PRSS37 HGNC:29211 136242 690 | PRSS38 HGNC:29625 339501 691 | PRSS41 HGNC:30715 360226 692 | PRSS55 HGNC:30824 203074 693 | PRSS58 HGNC:39125 136541 694 | PRY2 HGNC:21504 442862 695 | PSKH2 HGNC:18997 85481 696 | PTF1A HGNC:23734 256297 697 | RAX HGNC:18662 30062 698 | RAX2 HGNC:18286 84839 699 | RBM46 HGNC:28401 166863 700 | RBMXL2 HGNC:17886 27288 701 | RBMY1A1 HGNC:9912 5940 702 | RBMY1B HGNC:23914 378948 703 | RBMY1D HGNC:23915 378949 704 | RBMY1E HGNC:23916 378950 705 | RBMY1F HGNC:23974 159163 706 | RBMY1J HGNC:23917 378951 707 | RBP3 HGNC:9921 5949 708 | RBPJL HGNC:13761 11317 709 | RD3 HGNC:19689 343035 710 | RDH8 HGNC:14423 50700 711 | REG3A HGNC:8601 5068 712 | RESP18 HGNC:33762 389075 713 | RETNLB HGNC:20388 84666 714 | REXO1L1 HGNC:24660 254958 715 | RFPL3 HGNC:9980 10738 716 | RFPL4B HGNC:33264 442247 717 | RFX6 HGNC:21478 222546 718 | RHO HGNC:10012 6010 719 | RHOXF2 HGNC:30011 84528 720 | RNASE10 HGNC:19275 338879 721 | RNASE11 HGNC:19269 122651 722 | RNASE12 HGNC:24211 493901 723 | RNASE13 HGNC:25285 440163 724 | RNASE8 HGNC:19277 122665 725 | RNASE9 HGNC:20673 390443 726 | RND2 HGNC:18315 8153 727 | RNF113B HGNC:17267 140432 728 | RNF17 HGNC:10060 56163 729 | RP1 HGNC:10263 6101 730 | RP1L1 HGNC:15946 94137 731 | RPE65 HGNC:10294 6121 732 | RPTN HGNC:26809 126638 733 | RS1 HGNC:10457 6247 734 | RTP1 HGNC:28580 132112 735 | RTP2 HGNC:32486 344892 736 | RXFP2 HGNC:17318 122042 737 | RXFP3 HGNC:24883 51289 738 | S100A7A HGNC:21657 338324 739 | S100G HGNC:1436 795 740 | SAGE1 HGNC:30369 55511 741 | SAMD7 HGNC:25394 344658 742 | SCGB1D1 HGNC:18395 10648 743 | SCN10A HGNC:10582 6336 744 | SCRT2 HGNC:15952 85508 745 | SDR9C7 HGNC:29958 121214 746 | SEC14L3 HGNC:18655 266629 747 | SEMG2 HGNC:10743 6407 748 | SEPT14 HGNC:33280 346288 749 | SERPINA12 HGNC:18359 145264 750 | 
SERPINA7 HGNC:11583 6906 751 | SERPINA9 HGNC:15995 327657 752 | SERPINB12 HGNC:14220 89777 753 | SHCBP1L HGNC:16788 81626 754 | SHOX HGNC:10853 6473 755 | SI HGNC:10856 6476 756 | SIGLECL1 HGNC:26856 284369 757 | SIX6 HGNC:10892 4990 758 | SLC10A2 HGNC:10906 6555 759 | SLC13A1 HGNC:10916 6561 760 | SLC17A2 HGNC:11019 6569 761 | SLC17A6 HGNC:16703 57084 762 | SLC18A3 HGNC:10936 6572 763 | SLC22A12 HGNC:17989 116085 764 | SLC22A13 HGNC:8494 9390 765 | SLC22A24 HGNC:28542 283238 766 | SLC22A25 HGNC:32935 387601 767 | SLC22A6 HGNC:10970 9356 768 | SLC22A8 HGNC:10972 9376 769 | SLC22A9 HGNC:16261 114571 770 | SLC25A2 HGNC:22921 83884 771 | SLC25A31 HGNC:25319 83447 772 | SLC2A2 HGNC:11006 6514 773 | SLC2A7 HGNC:13445 155184 774 | SLC32A1 HGNC:11018 140679 775 | SLC34A1 HGNC:11019 6569 776 | SLC36A3 HGNC:19659 285641 777 | SLC39A12 HGNC:20860 221074 778 | SLC6A18 HGNC:26441 348932 779 | SLC6A5 HGNC:11051 9152 780 | SLC6A7 HGNC:11054 6534 781 | SLC7A13 HGNC:23092 157724 782 | SLCO1B1 HGNC:10959 10599 783 | SLCO6A1 HGNC:23613 133482 784 | SLITRK1 HGNC:20297 114798 785 | SOHLH1 HGNC:27845 402381 786 | SOX1 HGNC:11189 6656 787 | SOX14 HGNC:11193 8403 788 | SP8 HGNC:19196 221833 789 | SPACA1 HGNC:14967 81833 790 | SPACA5 HGNC:31353 389852 791 | SPACA7 HGNC:29575 122258 792 | SPATA16 HGNC:29935 83893 793 | SPATA21 HGNC:28026 374955 794 | SPEM1 HGNC:32429 374768 795 | SPHAR HGNC:16957 10638 796 | SPINK14 HGNC:33825 408187 797 | SPO11 HGNC:11250 23626 798 | SPPL2C HGNC:28902 162540 799 | SPRR4 HGNC:23173 163778 800 | SSTR4 HGNC:11333 6754 801 | SSX3 HGNC:11337 10214 802 | SSX5 HGNC:11339 6758 803 | SSX7 HGNC:19653 280658 804 | SSX8 HGNC:19654 280659 805 | SSX9 HGNC:19655 280660 806 | STATH HGNC:11369 6779 807 | SULT6B1 HGNC:33433 391365 808 | SUN5 HGNC:16252 140732 809 | T HGNC:11515 6862 810 | TAAR1 HGNC:17734 134864 811 | TAAR2 HGNC:4514 9287 812 | TAAR5 HGNC:30236 9038 813 | TAAR6 HGNC:20978 319100 814 | TAAR8 HGNC:14964 83551 815 | TAAR9 HGNC:20977 134860 816 | TAS1R2 
HGNC:14905 80834 817 | TAS2R1 HGNC:14909 50834 818 | TAS2R13 HGNC:14919 50838 819 | TAS2R16 HGNC:14921 50833 820 | TAS2R39 HGNC:18886 259285 821 | TAS2R40 HGNC:18885 259286 822 | TAS2R41 HGNC:18883 259287 823 | TAS2R42 HGNC:18888 353164 824 | TAS2R43 HGNC:18875 259289 825 | TAS2R46 HGNC:18877 259292 826 | TAS2R50 HGNC:18882 259296 827 | TAS2R60 HGNC:20639 338398 828 | TAS2R7 HGNC:14913 50837 829 | TAS2R8 HGNC:14915 50836 830 | TAS2R9 HGNC:14917 50835 831 | TBC1D21 HGNC:28536 161514 832 | TBC1D29 HGNC:24509 26083 833 | TBL1Y HGNC:18502 90665 834 | TBPL2 HGNC:19841 387332 835 | TBR1 HGNC:11590 10716 836 | TBX10 HGNC:11593 347853 837 | TCEB3B HGNC:30771 51224 838 | TCEB3C HGNC:24617 162699 839 | TCHHL1 HGNC:31796 126637 840 | TCP10L2 HGNC:21254 401285 841 | TEDDM1 HGNC:30233 127670 842 | TEX101 HGNC:30722 83639 843 | TEX13A HGNC:11735 56157 844 | TEX28 HGNC:2563 1527 845 | TEX34 HGNC:26349 124783 846 | TFAP2D HGNC:15581 83741 847 | TFDP3 HGNC:24603 51270 848 | TGIF2LX HGNC:18570 90316 849 | TGIF2LY HGNC:18569 90655 850 | TGM6 HGNC:16255 343641 851 | TKTL2 HGNC:25313 84076 852 | TLX1 HGNC:5056 3195 853 | TMEM132D HGNC:29411 121256 854 | TMEM174 HGNC:28187 134288 855 | TMEM207 HGNC:33705 131920 856 | TMEM225 HGNC:32390 338661 857 | TMIGD1 HGNC:32431 388364 858 | TMPRSS11A HGNC:27954 339967 859 | TMPRSS11B HGNC:25398 132724 860 | TMPRSS11F HGNC:29994 389208 861 | TMPRSS12 HGNC:28779 283471 862 | TMPRSS15 HGNC:9490 5651 863 | TNR HGNC:11953 7143 864 | TPD52L3 HGNC:23382 89882 865 | TPH2 HGNC:20692 121278 866 | TPRX1 HGNC:32174 284355 867 | TPTE HGNC:12023 7179 868 | TREML4 HGNC:30807 285852 869 | TRHR HGNC:12299 7201 870 | TRIM40 HGNC:18736 135644 871 | TRIM42 HGNC:19014 287015 872 | TRIM43 HGNC:19015 129868 873 | TRIM48 HGNC:19021 79097 874 | TRIM49 HGNC:13431 57093 875 | TRIM51 HGNC:19023 84767 876 | TRIM60 HGNC:21162 166655 877 | TRIM67 HGNC:31859 440730 878 | TRIML1 HGNC:26698 339976 879 | TRPC5 HGNC:12337 7224 880 | TRPC7 HGNC:20754 57113 881 | TRPM1 HGNC:7146 4308 
882 | TRPV5 HGNC:3145 56302 883 | TSGA13 HGNC:12369 114960 884 | TSHB HGNC:12372 7252 885 | TSPAN16 HGNC:30725 26526 886 | TSPO2 HGNC:21256 222642 887 | TSPY1 HGNC:12381 7258 888 | TSPYL6 HGNC:14521 388951 889 | TSSK1B HGNC:14968 83942 890 | TSSK2 HGNC:11401 23617 891 | TXNDC8 HGNC:31454 255220 892 | TYR HGNC:12442 7299 893 | UBQLN3 HGNC:12510 50613 894 | UMOD HGNC:12559 7369 895 | UROC1 HGNC:26444 131669 896 | USP17L2 HGNC:34434 377630 897 | USP26 HGNC:13485 83844 898 | USP29 HGNC:18563 83844 899 | UTS2R HGNC:4468 2837 900 | VAX1 HGNC:12660 11023 901 | VCX3A HGNC:18159 51481 902 | VHLL HGNC:30666 391104 903 | VN1R2 HGNC:19872 317701 904 | VN1R4 HGNC:19871 317703 905 | VN1R5 HGNC:19870 317705 906 | VPREB1 HGNC:12709 7441 907 | VRTN HGNC:20223 55237 908 | VSX2 HGNC:1975 338917 909 | WFDC10A HGNC:16139 140832 910 | WFDC11 HGNC:20478 259239 911 | WFDC9 HGNC:20380 259240 912 | XAGE2 HGNC:4112 9502 913 | XAGE5 HGNC:30930 170627 914 | XKR7 HGNC:23062 343702 915 | ZAN HGNC:12857 7455 916 | ZCCHC13 HGNC:31749 389874 917 | ZCCHC16 HGNC:25214 340595 918 | ZG16 HGNC:30961 653808 919 | ZIC3 HGNC:12874 7547 920 | ZIM3 HGNC:16366 7547 921 | ZNF645 HGNC:26371 158506 922 | ZNF648 HGNC:18190 127665 923 | ZNF679 HGNC:28650 168417 924 | ZNF804B HGNC:21958 219578 925 | ZNRF4 HGNC:17726 148066 926 | ZP2 HGNC:13188 7783 927 | ZP4 HGNC:15770 57829 928 | ZSWIM2 HGNC:30990 151112 929 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 100 2 | use_small_heuristics = "max" -------------------------------------------------------------------------------- /src/commands/command.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use enum_dispatch::enum_dispatch; 3 | 4 | #[enum_dispatch] 5 | pub trait Command { 6 | fn execute(&self) -> Result<()>; 7 | } 8 | 
-------------------------------------------------------------------------------- /src/commands/count.rs: -------------------------------------------------------------------------------- 1 | use crate::command::Command; 2 | use crate::guide::*; 3 | use ahash::AHashMap; 4 | use anyhow::{Context, Result}; 5 | use clap::Parser; 6 | use fastq::{parse_path, Record}; 7 | use fgoxide::io::{DelimFile, Io}; 8 | use itertools::Itertools; 9 | use log::*; 10 | use serde::{Deserialize, Serialize}; 11 | use std::collections::{HashMap, HashSet}; 12 | use std::io::Write; 13 | use std::path::{Path, PathBuf}; 14 | 15 | type GuideMap<'a> = AHashMap, &'a Guide>; 16 | 17 | /// Counts the guides observed in a CRISPR screen, starting from one or more FASTQs. FASTQs are 18 | /// one per sample and currently only single-end FASTQ inputs are supported. 19 | /// 20 | /// A set of sample IDs may be provided using `--samples id1 id2 ..`. If provided it must have the 21 | /// same number of values as input FASTQs. If not provided the FASTQ names are used minus any 22 | /// fastq/fq/gz suffixes. 23 | /// 24 | /// Automatically determines the range of valid offsets within the sequencing reads where the 25 | /// guide sequences are located, independently for each FASTQ input. The first `offset-sample-size` 26 | /// reads from each FASTQ are examined to determine the offsets at which guides are found. When 27 | /// processing the full FASTQ, checks only those offsets that accounted for at least 28 | /// `offset-min-fraction` of the first `offset-sample-size` reads. 29 | /// 30 | /// Matching by default allows for one mismatch (and no indels) between the read sub-sequence 31 | /// and the expected guide sequences. Exact matching may be enabled by specifying the 32 | /// `--exact-match` option. 33 | /// 34 | /// Optionally lists may be provided of essential genes, nonessential genes and control guide ids, 35 | /// as well as a regular expression to be used to identify control guides. 
Using this information 36 | /// guides are classified as either Essential, Nonessential, Control, or Other. 37 | /// 38 | /// Three output files are generated. The first is named `{output}.counts.txt` and contains columns 39 | /// for the guide id, the gene targeted by the guide and one count column per input FASTQ with 40 | /// raw/un-normalized counts. The second, `{output}.extended-counts.txt` is identical to the first 41 | /// except for having a `guide_type` column inserted as the third column. Finally 42 | /// `{output}.stats.txt` contains basic QC statistics per input FASTQ on the matching process. 43 | #[derive(Parser, Debug)] 44 | pub(crate) struct Count { 45 | /// Input fastq file(s) 46 | #[clap(long, short = 'i', required = true, multiple_values = true)] 47 | input: Vec, 48 | 49 | /// Sample names corresponding to the input fastqs. If provided must be the same length as 50 | /// input. Otherwise will be inferred from input file names. 51 | #[clap(long, short = 's', multiple_values = true)] 52 | samples: Vec, 53 | 54 | /// Path to the guide library metadata. May be a tab- or comma-separated file. Must have 55 | /// a header line, and the first three fields must be (in order): i) the ID of the guide, 56 | /// ii) the base sequence of the guide, iii) the gene the guide targets. 57 | #[clap(long, short = 'l')] 58 | library: PathBuf, 59 | 60 | /// Optional path to file with list of essential genes. Gene names should appear one 61 | /// per line and are case sensitive. If the file has multiple tab-separated columns, the first 62 | /// column is used. 63 | #[clap(long, short = 'e')] 64 | essential_genes: Option, 65 | 66 | /// Optional path to file with list of nonessential genes. Gene names should appear one 67 | /// per line and are case sensitive. If the file has multiple tab-separated columns, the first 68 | /// column is used. 69 | #[clap(long, short = 'n')] 70 | nonessential_genes: Option, 71 | 72 | /// Optional path to file with list control guide IDs. 
IDs should appear one 73 | /// per line and are case sensitive. If the file has multiple tab-separated columns, the first 74 | /// column is used. 75 | #[clap(long, short = 'c')] 76 | control_guides: Option, 77 | 78 | /// Optional regular expression used to ID control guides. Pattern is matched, case 79 | /// insensitive, to guide IDs and Gene names. 80 | #[clap(long, short = 'C')] 81 | control_pattern: Option, 82 | 83 | /// Perform exact matching only, don't allow mismatches between reads and guides. 84 | #[clap(long, short = 'x')] 85 | exact_match: bool, 86 | 87 | /// The number of reads to be examined when determining the offsets at which guides may 88 | /// be found in the input reads. 89 | #[clap(long, short = 'N', default_value = "100000")] 90 | offset_sample_size: u64, 91 | 92 | /// After sampling the first `offset_sample_size` reads, use only those offsets that each account for at least this fraction of the guide matches found in the sampled reads. 93 | #[clap(long, short = 'f', default_value = "0.0025")] 94 | offset_min_fraction: f64, 95 | 96 | /// Path prefix to use for all output files 97 | #[clap(long, short = 'o')] 98 | output: String, 99 | } 100 | 101 | /// Simple Command impl that just receives params and delegates off to other functions 102 | impl Command for Count { 103 | /// execute function that is called from the command line parser 104 | fn execute(&self) -> Result<()> { 105 | // Auto-fill the sample names if not given 106 | let sample_ids = if self.samples.is_empty() { 107 | self.input 108 | .iter() 109 | .enumerate() 110 | .map(|(idx, fq)| Count::sample_name(fq, idx + 1)) 111 | .collect_vec() 112 | } else { 113 | assert_eq!( 114 | self.samples.len(), 115 | self.input.len(), 116 | "Different numbers of --samples and --input."
117 | ); 118 | self.samples.clone() 119 | }; 120 | 121 | // Load up the library and guide lookup 122 | let library = GuideLibrary::from_files( 123 | &self.library, 124 | &self.essential_genes, 125 | &self.nonessential_genes, 126 | &self.control_guides, 127 | &self.control_pattern, 128 | )?; 129 | let lookup = Count::build_lookup(&library, !self.exact_match); 130 | 131 | // Generate the counts per sample 132 | let results = self 133 | .input 134 | .iter() 135 | .zip(sample_ids) 136 | .map(|(fq, sample)| { 137 | let prefix_info = Count::determine_prefixes( 138 | fq, 139 | sample.as_str(), 140 | &library, 141 | &lookup, 142 | self.offset_sample_size, 143 | self.offset_min_fraction, 144 | ) 145 | .expect("Failed to determine offsets."); 146 | 147 | Count::count_reads(fq, sample.as_str(), &library, &lookup, &prefix_info) 148 | .expect("Failed to count guide.") 149 | }) 150 | .collect_vec(); 151 | 152 | // Write the outputs 153 | let counts_file = PathBuf::from(format!("{}.counts.txt", self.output)); 154 | let ext_counts_file = PathBuf::from(format!("{}.extended-counts.txt", self.output)); 155 | let stats_file = PathBuf::from(format!("{}.stats.txt", self.output)); 156 | 157 | Count::write_counts(&counts_file, &library, &results, false)?; 158 | Count::write_counts(&ext_counts_file, &library, &results, true)?; 159 | Count::write_stats(&stats_file, &library, &results)?; 160 | Ok(()) 161 | } 162 | } 163 | 164 | /// Implementation of the Count command and related functions. 165 | impl Count { 166 | /// Returns a sample name given a fastq file. Strips off any .gz and fastq-like 167 | /// suffixes. If the file doesn't have a valid filename, will return a name 168 | /// based on the index passed in. 
169 | fn sample_name(p: &Path, idx: usize) -> String { 170 | if let Some(os_name) = p.file_name() { 171 | if let Some(name) = os_name.to_str() { 172 | return name 173 | .trim_end_matches(".gz") 174 | .trim_end_matches(".fastq") 175 | .trim_end_matches(".fq") 176 | .to_string(); 177 | } 178 | } 179 | 180 | format!("s{}", idx) 181 | } 182 | 183 | /// Builds a lookup from a Vec of bases to Guides. The resulting HashMap will contain 184 | /// keys for every exact guide sequence (in upper case). If `allow_mismatch` is true, 185 | /// the map will also contain keys for every one-mismatch version of every guide with the 186 | /// exception of any sequences that match equally well to multiple guides. 187 | fn build_lookup(library: &GuideLibrary, allow_mismatch: bool) -> GuideMap { 188 | info!("Building lookup."); 189 | let mut lookup = GuideMap::default(); 190 | let mut dupes = HashSet::new(); 191 | 192 | lookup.reserve(library.len()); 193 | 194 | if allow_mismatch { 195 | lookup.reserve(library.len() + library.guide_length * 3); 196 | 197 | for guide in library.guides.iter() { 198 | let bases = &guide.bases; 199 | 200 | for i in 0..bases.len() { 201 | for b in [b'A', b'C', b'G', b'T'] { 202 | if bases[i] != b { 203 | let mut modded = bases.clone(); 204 | modded[i] = b; 205 | 206 | let prev = lookup.insert(modded, guide); 207 | if prev.is_some() { 208 | let mut dupe = bases.clone(); 209 | dupe[i] = b; 210 | dupes.insert(dupe); 211 | } 212 | } 213 | } 214 | } 215 | } 216 | 217 | // Make sure no duplicated sequences remain in the lookup 218 | for dupe in dupes.into_iter() { 219 | lookup.remove(&dupe); 220 | } 221 | } 222 | 223 | // Insert all the exact matches last so they're always present 224 | for guide in library.guides.iter() { 225 | lookup.insert(guide.bases.clone(), guide); 226 | } 227 | 228 | info!("Lookup built with {} entries.", lookup.len()); 229 | lookup 230 | } 231 | 232 | /// Goes through an input fastq file and determines the set of prefix-lengths that 233 | 
/// occur before the guide sequence is observed. Samples the first `sample_size` reads 234 | /// from the FASTQ and checks all possible prefixes. Returns the set of prefixes where 235 | /// each prefix individually accounts for >= `min_fraction` of the reads that matched 236 | /// to a guide. 237 | fn determine_prefixes( 238 | fastq: &Path, 239 | sample: &str, 240 | library: &GuideLibrary, 241 | lookup: &GuideMap, 242 | sample_size: u64, 243 | min_fraction: f64, 244 | ) -> Result { 245 | let guide_length = library.guide_length; 246 | let mut prefix_lengths = vec![0u64; 500]; 247 | let mut count = 0u64; 248 | 249 | // Parse the first `sample_size` records, testing every possible offset in each read against 250 | // the guide lookup and tallying the offsets at which a guide is found. NOTE(review): `prefix_lengths` has a fixed size of 500, so a match at an offset >= 500 would index out of bounds. 251 | parse_path(Some(fastq), |parser| { 252 | parser 253 | .each(|rec| { 254 | let read_bases = rec.seq(); 255 | let read_length = read_bases.len(); 256 | 257 | if read_length >= guide_length { 258 | for trim in 0..=(read_length - guide_length) { 259 | let bases = &read_bases[trim..trim + guide_length]; 260 | 261 | if lookup.contains_key(bases) { 262 | prefix_lengths[trim] += 1; 263 | } 264 | } 265 | } 266 | 267 | count += 1; 268 | count < sample_size 269 | }) 270 | .expect("Failed to parse."); 271 | }) 272 | .context(format!("Failed to read {:?}", fastq))?; 273 | 274 | let total_matched: u64 = prefix_lengths.iter().sum(); 275 | let fraction_matched = total_matched as f64 / count as f64; 276 | info!( 277 | "In {:?} examined {} reads for guide start position and matched {} ({:.4}).", 278 | fastq, count, total_matched, fraction_matched 279 | ); 280 | 281 | // Tuples of (offset, count) for every offset whose count is > 0 282 | let non_zeros = 283 | prefix_lengths.iter().copied().enumerate().filter(|(_idx, n)| *n > 0).collect_vec(); 284 | 285 | info!( 286 | "{} read offsets: {}", 287 | sample, 288 | non_zeros.iter().map(|(o, n)| format!("{}->{}", o, n)).join(", ") 289 | ); 290 | 291 | // Filter to just those trim lengths that have at least min_fraction
of the data each 292 | let trims_to_return: Vec = non_zeros 293 | .into_iter() 294 | .filter(|(_idx, n)| *n as f64 / total_matched as f64 >= min_fraction) 295 | .map(|(idx, _n)| idx) 296 | .collect(); 297 | 298 | let info = PrefixInfo { lengths: trims_to_return }; 299 | Ok(info) 300 | } 301 | 302 | /// Generates a set of guide counts for a single input FASTQ given a guide lookup and a set of 303 | /// read offsets/prefixes to check. 304 | /// 305 | /// Returns a CountResult which contains a count of the total number of reads in the FASTQ 306 | /// and a Map of Guide to count of that guide. The map will contain an entry for every guide 307 | /// including those with zero counts. A read that matches at more than one of the checked offsets is counted once per matching offset. 308 | fn count_reads<'a, P>( 309 | fastq: &P, 310 | sample: &str, 311 | library: &'a GuideLibrary, 312 | lookup: &GuideMap, 313 | prefix_info: &PrefixInfo, 314 | ) -> Result> 315 | where 316 | P: AsRef, 317 | { 318 | let mut count: u64 = 0; 319 | let mut counts: Vec = vec![0; library.len()]; 320 | let fastq_path = fastq.as_ref(); 321 | 322 | // TODO: remove empty file checking once this is moved over to seq_io instead of fastq 323 | let fastq_size = std::fs::metadata(fastq).map(|m| m.len()).unwrap_or(0); 324 | let empty_fastq = fastq_path.is_file() && fastq_path.exists() && fastq_size == 0; 325 | 326 | if !empty_fastq { 327 | parse_path(Some(fastq), |parser| { 328 | parser 329 | .each(|rec| { 330 | let read_bases = rec.seq(); 331 | let read_length = read_bases.len(); 332 | let guide_length = library.guide_length; 333 | 334 | for trim in prefix_info.lengths.iter() { 335 | if trim + guide_length <= read_length { 336 | let bases = &read_bases[*trim..(*trim + guide_length)]; 337 | if let Some(guide) = lookup.get(bases) { 338 | counts[guide.index] += 1; 339 | } 340 | } 341 | } 342 | 343 | count += 1; 344 | if count % 10_000_000 == 0 { 345 | info!("Processed {}m reads from {:?}.", count / 1_000_000, fastq_path); 346 | } 347 | 348 | true 349 | }) 350 | .expect("Failed to parse."); 351 | })?; 352 | }
353 | 354 | let count_map: HashMap<&Guide, u64> = 355 | library.guides.iter().map(|g| (g, counts[g.index])).collect(); 356 | 357 | let count_result = CountResult { 358 | source: fastq_path.to_str().unwrap_or("").to_string(), 359 | sample: sample.to_string(), 360 | counts: count_map, 361 | total_reads: count, 362 | }; 363 | 364 | info!( 365 | "Processed {} reads and matched {} ({:.4}) from {:?}.", 366 | count, 367 | count_result.mapped_reads(), 368 | count_result.mapped_frac(), 369 | fastq_path 370 | ); 371 | 372 | Ok(count_result) 373 | } 374 | 375 | /// Writes out the counts matrix given one or more CountResults. If extended is false, the 376 | /// columns produced are "guide", "gene" and then one column per sample with counts for each 377 | /// sample. If extended is true, an additional "guide_type" is inserted after "gene". 378 | fn write_counts( 379 | path: &Path, 380 | library: &GuideLibrary, 381 | counts: &[CountResult], 382 | extended: bool, 383 | ) -> Result<()> { 384 | let mut writer = Io::default().new_writer(&path)?; 385 | let sep = "\t".as_bytes(); 386 | let newline = "\n".as_bytes(); 387 | 388 | // Output the header 389 | let mut header_fields = vec!["guide", "gene"]; 390 | if extended { 391 | header_fields.extend_from_slice(&["guide_type"]); 392 | } 393 | 394 | header_fields.extend(counts.iter().map(|c| c.sample.as_str())); 395 | writer.write_all(header_fields.join("\t").as_bytes())?; 396 | writer.write_all(newline)?; 397 | 398 | // Output the body 399 | for guide in library.guides.iter() { 400 | writer.write_all(guide.id.as_bytes())?; 401 | writer.write_all(sep)?; 402 | writer.write_all(guide.gene.as_bytes())?; 403 | 404 | if extended { 405 | writer.write_all(sep)?; 406 | writer.write_all(guide.kind.to_string().as_bytes())?; 407 | } 408 | 409 | for sample in counts { 410 | let n = sample.counts.get(&guide).copied().unwrap_or(0); 411 | writer.write_all(sep)?; 412 | writer.write_all(n.to_string().as_bytes())?; 413 | } 414 | 415 | 
writer.write_all(newline)?; 416 | } 417 | 418 | writer.flush()?; 419 | Ok(()) 420 | } 421 | 422 | /// Generates and writes out some simple per-sample QC statistics 423 | fn write_stats( 424 | stats_file: &Path, 425 | library: &GuideLibrary, 426 | results: &[CountResult], 427 | ) -> Result<()> { 428 | let recs = results 429 | .iter() 430 | .map(|r| CountStats { 431 | file: r.source.clone(), 432 | label: r.sample.to_string(), 433 | total_guides: library.len() as u64, 434 | total_reads: r.total_reads, 435 | mapped_reads: r.mapped_reads(), 436 | frac_mapped: Count::round(r.mapped_frac(), 4), 437 | mean_reads_per_guide: Count::round( 438 | r.mapped_reads() as f64 / library.len() as f64, 439 | 2, 440 | ), 441 | zero_read_guides: r.counts.values().filter(|n| **n == 0).count() as u64, 442 | mean_reads_essential: Count::compute_mean_cov(r, GuideType::Essential), 443 | mean_reads_nonessential: Count::compute_mean_cov(r, GuideType::Nonessential), 444 | mean_reads_other: Count::compute_mean_cov(r, GuideType::Other), 445 | mean_reads_control: Count::compute_mean_cov(r, GuideType::Control), 446 | }) 447 | .collect_vec(); 448 | 449 | DelimFile::default().write_tsv(&stats_file, recs)?; 450 | Ok(()) 451 | } 452 | 453 | /// Simple method to round f64s to a maximum number of decimal places 454 | fn round(f: f64, dp: i32) -> f64 { 455 | let factor = 10f64.powi(dp); 456 | (f * factor).round() / factor 457 | } 458 | 459 | /// Computes the mean coverage of a subset of guides based on guide type. If no guides of the 460 | /// type exist, returns 0. 
461 | fn compute_mean_cov(counts: &CountResult, kind: GuideType) -> f64 { 462 | let subset = counts 463 | .counts 464 | .iter() 465 | .filter_map(|(g, n)| if g.kind == kind { Some(*n as f64) } else { None }) 466 | .collect_vec(); 467 | 468 | if subset.is_empty() { 469 | 0.0 470 | } else { 471 | let total: f64 = subset.iter().sum::(); 472 | Count::round(total / subset.len() as f64, 2) 473 | } 474 | } 475 | } 476 | 477 | /// Struct to hold the prefix lengths (read offsets) of the sequence that precedes the guides 478 | struct PrefixInfo { 479 | pub lengths: Vec, 480 | } 481 | 482 | /// Struct to hold the results of counting a single fastq 483 | struct CountResult<'a> { 484 | source: String, 485 | sample: String, 486 | total_reads: u64, 487 | counts: HashMap<&'a Guide, u64>, 488 | } 489 | 490 | /// Struct to output stats on each sample mapped 491 | #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] 492 | struct CountStats { 493 | file: String, 494 | label: String, 495 | total_guides: u64, 496 | total_reads: u64, 497 | mapped_reads: u64, 498 | frac_mapped: f64, 499 | mean_reads_per_guide: f64, 500 | mean_reads_essential: f64, 501 | mean_reads_nonessential: f64, 502 | mean_reads_control: f64, 503 | mean_reads_other: f64, 504 | zero_read_guides: u64, 505 | } 506 | 507 | impl CountResult<'_> { 508 | /// Returns the total number of reads that mapped to a guide. 509 | pub fn mapped_reads(&self) -> u64 { 510 | self.counts.values().sum() 511 | } 512 | 513 | /// Returns the fraction of reads that mapped to a guide. NOTE(review): the division by `total_reads` is unguarded, so the result is NaN for an input with zero reads.
514 | pub fn mapped_frac(&self) -> f64 { 515 | self.mapped_reads() as f64 / self.total_reads as f64 516 | } 517 | } 518 | 519 | #[cfg(test)] 520 | mod tests { 521 | use super::*; 522 | use fgoxide::io::{DelimFile, Io}; 523 | use tempfile::TempDir; 524 | 525 | #[test] 526 | fn test_sample_name() { 527 | assert_eq!(Count::sample_name(PathBuf::from("/foo/splat.fq").as_path(), 1), "splat"); 528 | assert_eq!(Count::sample_name(PathBuf::from("/foo/splat.fq.gz").as_path(), 1), "splat"); 529 | assert_eq!(Count::sample_name(PathBuf::from("/foo/splat.fastq").as_path(), 1), "splat"); 530 | assert_eq!(Count::sample_name(PathBuf::from("/foo/splat.fastq.gz").as_path(), 1), "splat"); 531 | assert_eq!(Count::sample_name(PathBuf::new().as_path(), 1), "s1"); 532 | } 533 | 534 | #[test] 535 | fn test_build_lookup() { 536 | let g1 = Guide::new(0, "g0", "AAAAAAAAAA", "gene-A", GuideType::Other); 537 | let g2 = Guide::new(1, "g1", "GGGGGGGGGG", "gene-G", GuideType::Other); 538 | let g3 = Guide::new(2, "g2", "AGGGGGGGGG", "gene-AG", GuideType::Other); 539 | 540 | // Build with one guide and no mismatches 541 | let library = GuideLibrary::new(vec![g1.clone()]).unwrap(); 542 | let lookup = Count::build_lookup(&library, false); 543 | assert_eq!(lookup.len(), 1); 544 | assert_eq!(lookup[&g1.bases], &g1); 545 | 546 | // One guide with mismatches 547 | let lookup = Count::build_lookup(&library, true); 548 | assert_eq!(lookup.len(), 31); // original plus three mismatches x ten positions 549 | assert_eq!(lookup[&g1.bases], &g1); 550 | assert_eq!(lookup["AAAACAAAAA".as_bytes()], &g1); 551 | 552 | // Two guides without mismatches 553 | let library = GuideLibrary::new(vec![g1.clone(), g2.clone()]).unwrap(); 554 | let lookup = Count::build_lookup(&library, false); 555 | assert_eq!(lookup.len(), 2); 556 | assert_eq!(lookup[&g1.bases], &g1); 557 | assert_eq!(lookup[&g2.bases], &g2); 558 | 559 | // Two guides with mismatches and no collisions 560 | let lookup = Count::build_lookup(&library, true); 561 
| assert_eq!(lookup.len(), 62); 562 | 563 | // Two guides with mismatches and collisions! 564 | let library = GuideLibrary::new(vec![g2.clone(), g3.clone()]).unwrap(); 565 | let lookup = Count::build_lookup(&library, true); 566 | assert_eq!(lookup.len(), 56); 567 | assert_eq!(lookup[&g2.bases], &g2); // collision shouldn't override perfect match 568 | assert_eq!(lookup[&g3.bases], &g3); // collision shouldn't override perfect match 569 | assert!(!lookup.contains_key("CGGGGGGGGG".as_bytes())); // ambiguous 570 | assert!(!lookup.contains_key("TGGGGGGGGG".as_bytes())); // ambiguous 571 | } 572 | 573 | /// Helper function to generate guide library determine_prefixes and counting tests 574 | fn test_library() -> GuideLibrary { 575 | GuideLibrary::new(vec![ 576 | Guide::new(0, "g1", "ACGTACGT", "AAA", GuideType::Other), 577 | Guide::new(1, "g1", "AACCGGTT", "AAA", GuideType::Other), 578 | Guide::new(2, "g1", "AAACAGAT", "AAA", GuideType::Other), 579 | ]) 580 | .unwrap() 581 | } 582 | 583 | /// Helper function to write a bunch of reads to a FASTQ file 584 | fn write_fastq(reads: &[String], path: PathBuf) -> PathBuf { 585 | let mut file = Io::default().new_writer(&path).unwrap(); 586 | 587 | for (idx, read) in reads.iter().enumerate() { 588 | let quals = vec![b'#'; read.len()]; 589 | 590 | file.write_all(format!("@q{}\n", idx).as_bytes()).unwrap(); 591 | file.write_all(format!("{}\n", read).as_bytes()).unwrap(); 592 | file.write_all("+\n".as_bytes()).unwrap(); 593 | file.write_all(quals.as_slice()).unwrap(); 594 | file.write_all("\n".as_bytes()).unwrap(); 595 | } 596 | 597 | file.flush().unwrap(); 598 | path 599 | } 600 | 601 | #[test] 602 | fn test_determine_prefixes_finds_offset_zero() { 603 | let library = test_library(); 604 | let lookup = Count::build_lookup(&library, false); 605 | let reads = vec![ 606 | format!("{}tttttttttt", library.guides[0].bases_str), 607 | format!("{}tttttttttt", library.guides[1].bases_str), 608 | format!("{}tttttttttt", 
library.guides[2].bases_str), 609 | ]; 610 | let tempdir = TempDir::new().unwrap(); 611 | let fastq = write_fastq(&reads, tempdir.path().join("in.fastq")); 612 | let prefixes = Count::determine_prefixes(&fastq, "s", &library, &lookup, 100, 0.0).unwrap(); 613 | assert_eq!(prefixes.lengths, vec![0]); 614 | } 615 | 616 | #[test] 617 | fn test_determine_prefixes_finds_last_offset() { 618 | let library = test_library(); 619 | let lookup = Count::build_lookup(&library, false); 620 | let reads = vec![ 621 | format!("tttttttttt{}", library.guides[0].bases_str), 622 | format!("tttttttttt{}", library.guides[1].bases_str), 623 | format!("tttttttttt{}", library.guides[2].bases_str), 624 | ]; 625 | let tempdir = TempDir::new().unwrap(); 626 | let fastq = write_fastq(&reads, tempdir.path().join("in.fastq")); 627 | let prefixes = Count::determine_prefixes(&fastq, "s", &library, &lookup, 100, 0.0).unwrap(); 628 | assert_eq!(prefixes.lengths, vec![10]); 629 | } 630 | 631 | #[test] 632 | fn test_determine_prefixes_actually_subsamples() { 633 | let library = test_library(); 634 | let lookup = Count::build_lookup(&library, false); 635 | 636 | let mut reads = vec![]; 637 | for _ in 0..100 { 638 | reads.push(format!("ttttt{}ttttt", library.guides[0].bases_str)) 639 | } 640 | for _ in 0..100 { 641 | reads.push(format!("tttttt{}tttt", library.guides[0].bases_str)) 642 | } 643 | 644 | let tempdir = TempDir::new().unwrap(); 645 | let fastq = write_fastq(&reads, tempdir.path().join("in.fastq")); 646 | let p1 = Count::determine_prefixes(&fastq, "s", &library, &lookup, 200, 0.01).unwrap(); 647 | let p2 = Count::determine_prefixes(&fastq, "s", &library, &lookup, 100, 0.01).unwrap(); 648 | assert_eq!(p1.lengths, vec![5, 6]); 649 | assert_eq!(p2.lengths, vec![5]); 650 | } 651 | 652 | #[test] 653 | fn test_determine_prefixes_min_fraction() { 654 | let library = test_library(); 655 | let lookup = Count::build_lookup(&library, false); 656 | 657 | let mut reads = vec![]; 658 | for _ in 0..50 { 659 | 
// offset = 5 660 | reads.push(format!("ttttt{}ttttt", library.guides[0].bases_str)) 661 | } 662 | for _ in 0..30 { 663 | // offset = 6 664 | reads.push(format!("tttttt{}tttt", library.guides[0].bases_str)) 665 | } 666 | for _ in 0..20 { 667 | // offset = 7 668 | reads.push(format!("ttttttt{}ttt", library.guides[0].bases_str)) 669 | } 670 | for _ in 0..100 { 671 | // doesn't match to guides 672 | reads.push("tttttttttttttttttttt".to_string()); 673 | } 674 | 675 | let tempdir = TempDir::new().unwrap(); 676 | let fastq = write_fastq(&reads, tempdir.path().join("in.fastq")); 677 | let p1 = Count::determine_prefixes(&fastq, "s", &library, &lookup, 200, 0.01).unwrap(); 678 | let p2 = Count::determine_prefixes(&fastq, "s", &library, &lookup, 200, 0.25).unwrap(); 679 | let p3 = Count::determine_prefixes(&fastq, "s", &library, &lookup, 200, 0.50).unwrap(); 680 | 681 | assert_eq!(p1.lengths, vec![5, 6, 7]); 682 | assert_eq!(p2.lengths, vec![5, 6]); 683 | assert_eq!(p3.lengths, vec![5]); 684 | } 685 | 686 | #[test] 687 | fn test_count_reads_handles_empty_fastq() { 688 | let library = test_library(); 689 | let lookup = Count::build_lookup(&library, false); 690 | let prefixes = PrefixInfo { lengths: vec![0, 1, 2] }; 691 | let tempdir = TempDir::new().unwrap(); 692 | let fastq = write_fastq(&[], tempdir.path().join("in.fastq")); 693 | 694 | let counts = Count::count_reads(&fastq, "s1", &library, &lookup, &prefixes).unwrap(); 695 | assert_eq!(counts.sample, "s1"); 696 | assert_eq!(counts.total_reads, 0); 697 | for guide in library.guides.iter() { 698 | assert_eq!(counts.counts[guide], 0); 699 | } 700 | } 701 | 702 | #[test] 703 | fn test_count_reads() { 704 | let library = test_library(); 705 | let lookup = Count::build_lookup(&library, false); 706 | let prefixes = PrefixInfo { lengths: vec![4, 5, 6] }; 707 | let tempdir = TempDir::new().unwrap(); 708 | let mut reads = vec![]; 709 | 710 | // Create a bunch of reads, with each guide getting: 711 | // 100 * 3 * 1_based_guide_index 
712 | for prefix in ["tttt", "ttttt", "tttttt"] { 713 | for _ in 0..100 { 714 | for guide in library.guides.iter() { 715 | for _ in 0..(guide.index + 1) { 716 | reads.push(format!("{}{}ggggg", prefix, guide.bases_str)); 717 | } 718 | } 719 | } 720 | } 721 | 722 | // Add a few reads at different offsets that won't count 723 | reads.push(format!("{}ttttttttt", library.guides[0].bases_str)); 724 | reads.push(format!("t{}tttttttt", library.guides[0].bases_str)); 725 | reads.push(format!("tt{}ttttttt", library.guides[0].bases_str)); 726 | reads.push(format!("ttt{}tttttt", library.guides[0].bases_str)); 727 | 728 | let fastq = write_fastq(&reads, tempdir.path().join("in.fastq")); 729 | let counts = Count::count_reads(&fastq, "s1", &library, &lookup, &prefixes).unwrap(); 730 | assert_eq!(counts.sample, "s1"); 731 | assert_eq!(counts.total_reads, reads.len() as u64); 732 | for guide in library.guides.iter() { 733 | let expected = 100 * 3 * (guide.index + 1) as u64; 734 | assert_eq!(counts.counts[guide], expected); 735 | } 736 | } 737 | 738 | #[test] 739 | fn test_end_to_end() { 740 | let tempdir = TempDir::new().unwrap(); 741 | 742 | // Write the library to disk 743 | let library = test_library(); 744 | let library_path = tempdir.path().join("library.txt"); 745 | let mut lib_lines = vec!["guide\tbases\tgene".to_string()]; 746 | for guide in library.guides.iter() { 747 | lib_lines.push(format!("{}\t{}\t{}", guide.id, guide.bases_str, guide.gene)); 748 | } 749 | Io::default().write_lines(&library_path, &lib_lines).unwrap(); 750 | 751 | // Generate a fastq of reads to count 752 | let mut reads = vec![]; 753 | 754 | // Create a bunch of reads, with each guide getting: 755 | // 100 * 3 * 1_based_guide_index 756 | for prefix in ["tttt", "ttttt", "tttttt"] { 757 | for _ in 0..100 { 758 | for guide in library.guides.iter() { 759 | for _ in 0..(guide.index + 1) { 760 | reads.push(format!("{}{}ggggg", prefix, guide.bases_str)); 761 | } 762 | } 763 | } 764 | } 765 | 766 | // Add a 
few reads at different offsets that won't count 767 | reads.push(format!("{}ttttttttt", library.guides[0].bases_str)); 768 | reads.push(format!("t{}tttttttt", library.guides[0].bases_str)); 769 | reads.push(format!("tt{}ttttttt", library.guides[0].bases_str)); 770 | reads.push(format!("ttt{}tttttt", library.guides[0].bases_str)); 771 | 772 | let fastq = write_fastq(&reads, tempdir.path().join("in.fastq")); 773 | 774 | // Run the count command 775 | let prefix = tempdir.path().join("out").to_str().unwrap().to_string(); 776 | let counts = tempdir.path().join("out.counts.txt"); 777 | let stats = tempdir.path().join("out.stats.txt"); 778 | 779 | let cmd = Count { 780 | library: library_path, 781 | input: vec![fastq], 782 | essential_genes: None, 783 | nonessential_genes: None, 784 | control_guides: None, 785 | control_pattern: None, 786 | samples: vec!["sample1".to_string()], 787 | output: prefix, 788 | exact_match: false, 789 | offset_min_fraction: 0.005, 790 | offset_sample_size: 100000, 791 | }; 792 | 793 | cmd.execute().unwrap(); 794 | 795 | assert!(counts.exists()); 796 | assert!(stats.exists()); 797 | 798 | // Read the stats back in 799 | let stat_records: Vec = DelimFile::default().read_tsv(&stats).unwrap(); 800 | 801 | assert_eq!(stat_records.len(), 1); 802 | assert_eq!(stat_records[0].label, "sample1"); 803 | assert_eq!(stat_records[0].total_guides, 3); 804 | assert_eq!(stat_records[0].total_reads, 1804); 805 | assert_eq!(stat_records[0].mapped_reads, 1800); 806 | assert!((stat_records[0].mean_reads_per_guide - 600.0).abs() <= 0.01); 807 | assert!((stat_records[0].frac_mapped - 1800f64 / 1804f64).abs() <= 0.01); 808 | assert_eq!(stat_records[0].zero_read_guides, 0); 809 | } 810 | 811 | #[test] 812 | fn test_reads_shorter_than_guides_ok() { 813 | let tempdir = TempDir::new().unwrap(); 814 | 815 | // Write the library to disk 816 | let library = test_library(); 817 | let library_path = tempdir.path().join("library.txt"); 818 | let mut lib_lines = 
vec!["guide\tbases\tgene".to_string()]; 819 | for guide in library.guides.iter() { 820 | lib_lines.push(format!("{}\t{}\t{}", guide.id, guide.bases_str, guide.gene)); 821 | } 822 | Io::default().write_lines(&library_path, &lib_lines).unwrap(); 823 | 824 | // Generate a fastq of reads to count 825 | let mut reads = vec![]; 826 | reads.push("A".to_string()); 827 | reads.push("AC".to_string()); 828 | reads.push("ACG".to_string()); 829 | reads.push("ACGT".to_string()); 830 | reads.push("ACGTA".to_string()); 831 | reads.push("ACGTAC".to_string()); 832 | reads.push("ACGTACG".to_string()); 833 | reads.push("ACGTACGT".to_string()); 834 | reads.push("ACGTACGTA".to_string()); 835 | reads.push("ACGTACGTAC".to_string()); 836 | 837 | // Create a bunch of reads all with the guide at offset=5, with each guide getting: 838 | for _ in 0..100 { 839 | for guide in library.guides.iter() { 840 | for _ in 0..=guide.index { 841 | reads.push(format!("ttttt{}ggggg", guide.bases_str)); 842 | } 843 | } 844 | } 845 | 846 | // Lastly create some reads that are longer than a guide length but shorter than 847 | // guide-length + max prefix 848 | reads.push("tttttACGTACGT".to_string()); 849 | reads.push("tttttACGTACGTA".to_string()); 850 | reads.push("tttttACGTACGTAC".to_string()); 851 | 852 | let fastq = write_fastq(&reads, tempdir.path().join("in.fastq")); 853 | 854 | // Run the count command 855 | let prefix = tempdir.path().join("out").to_str().unwrap().to_string(); 856 | let counts = tempdir.path().join("out.counts.txt"); 857 | let stats = tempdir.path().join("out.stats.txt"); 858 | 859 | let cmd = Count { 860 | library: library_path, 861 | input: vec![fastq], 862 | essential_genes: None, 863 | nonessential_genes: None, 864 | control_guides: None, 865 | control_pattern: None, 866 | samples: vec!["sample1".to_string()], 867 | output: prefix, 868 | exact_match: false, 869 | offset_min_fraction: 0.005, 870 | offset_sample_size: 100000, 871 | }; 872 | 873 | cmd.execute().unwrap(); 874 | 
assert!(counts.exists()); 875 | assert!(stats.exists()); 876 | } 877 | } 878 | -------------------------------------------------------------------------------- /src/commands/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod command; 2 | pub mod count; 3 | -------------------------------------------------------------------------------- /src/guide.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{anyhow, bail, Result}; 2 | use fgoxide::io::Io; 3 | use itertools::Itertools; 4 | use log::*; 5 | use regex::RegexBuilder; 6 | use std::collections::HashSet; 7 | use std::fmt::{Display, Formatter}; 8 | use std::path::Path; 9 | 10 | /// Guides can either target essential genes, non-essential genes, control sequences 11 | /// or other genes. 12 | #[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] 13 | pub enum GuideType { 14 | Essential, 15 | Nonessential, 16 | Control, 17 | Other, 18 | } 19 | 20 | /// Implement Disaply for GuideType 21 | impl Display for GuideType { 22 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 23 | write!(f, "{:?}", self) 24 | } 25 | } 26 | 27 | /// A struct to represent a CRISPR guide 28 | #[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] 29 | pub struct Guide { 30 | pub index: usize, 31 | pub id: String, 32 | pub kind: GuideType, 33 | pub bases: Vec, 34 | pub bases_str: String, 35 | pub gene: String, 36 | } 37 | 38 | impl Guide { 39 | /// Generates a new guide that stores the bases in upper-case and ensures that the 40 | /// bases and bases_str are in sync. 
41 | pub fn new>(index: usize, id: S, bases: S, gene: S, kind: GuideType) -> Guide { 42 | let bases_upper = bases.into().to_uppercase(); 43 | Guide { 44 | index, 45 | id: id.into(), 46 | kind, 47 | bases: bases_upper.as_bytes().to_vec(), 48 | bases_str: bases_upper, 49 | gene: gene.into(), 50 | } 51 | } 52 | 53 | /// Returns the length of the guide sequence 54 | fn len(&self) -> usize { 55 | self.bases.len() 56 | } 57 | } 58 | 59 | /// Struct representing a guide library used in a crispr screen 60 | pub struct GuideLibrary { 61 | pub guides: Vec, 62 | pub guide_length: usize, 63 | } 64 | 65 | impl GuideLibrary { 66 | /// Constructs a new guide library from a set of guides. Will return an error if: 67 | /// - the guides have a mix of lengths 68 | /// - the guide sequences are not all A/C/G/T 69 | /// - there are non-unique guide sequences 70 | pub fn new(guides: Vec) -> Result { 71 | let lengths: HashSet = guides.iter().map(|g| g.len()).collect(); 72 | let unique: HashSet<&Vec> = guides.iter().map(|g| &g.bases).collect(); 73 | let genes: HashSet<&str> = guides.iter().map(|g| g.gene.as_str()).collect(); 74 | let bad = guides.iter().filter(|g| !GuideLibrary::is_acgt(&g.bases)).collect_vec(); 75 | 76 | if guides.is_empty() { 77 | Ok(GuideLibrary { guides, guide_length: 0 }) 78 | } else if lengths.len() != 1 { 79 | Err(anyhow!("More than one guide length found: {}.", lengths.iter().join(", "))) 80 | } else if !bad.is_empty() { 81 | Err(anyhow!("{} guides had non-ACGT bases in their sequence.", bad.len())) 82 | } else if unique.len() < guides.len() { 83 | Err(anyhow!( 84 | "Guide library had {} guides but only {} unique sequences.", 85 | guides.len(), 86 | unique.len() 87 | )) 88 | } else { 89 | info!( 90 | "Loaded library with {} guides for {} genes; {}=essential, {}=nonessential, {}=control, {}=other.", 91 | guides.len(), 92 | genes.len(), 93 | guides.iter().filter(|g| g.kind == GuideType::Essential).count(), 94 | guides.iter().filter(|g| g.kind == 
GuideType::Nonessential).count(), 95 | guides.iter().filter(|g| g.kind == GuideType::Control).count(), 96 | guides.iter().filter(|g| g.kind == GuideType::Other).count(), 97 | ); 98 | 99 | Ok(GuideLibrary { 100 | guides, 101 | guide_length: *lengths.iter().next().expect("Where'd it go?"), 102 | }) 103 | } 104 | } 105 | 106 | /// Reads a guide library from a file. The file: 107 | /// - May be either tab- or comma-delimited 108 | /// - Must have a header row 109 | /// - Must have at least three columns with the first three columns in order being: 110 | /// - a unique ID for the guide 111 | /// - the sequence of the guide 112 | /// - the gene, or other target, of the guide 113 | pub fn from_file

(path: &P) -> Result 114 | where 115 | P: AsRef, 116 | { 117 | let no_pattern: Option<&str> = None; 118 | GuideLibrary::from_files(path, &None, &None, &None, &no_pattern) 119 | } 120 | 121 | pub fn from_files( 122 | lib_path: &P, 123 | essential_gene_path: &Option

, 124 | non_essential_gene_path: &Option

, 125 | control_guide_list_path: &Option

, 126 | control_pattern: &Option, 127 | ) -> Result 128 | where 129 | P: AsRef, 130 | S: AsRef, 131 | { 132 | let essentials = GuideLibrary::read_to_set(essential_gene_path)?; 133 | let non_essentials = GuideLibrary::read_to_set(non_essential_gene_path)?; 134 | let control_guides = GuideLibrary::read_to_set(control_guide_list_path)?; 135 | let control_regex = if let Some(p) = control_pattern { 136 | Some(RegexBuilder::new(p.as_ref()).case_insensitive(true).build()?) 137 | } else { 138 | None 139 | }; 140 | 141 | let lines = Io::default().read_lines(lib_path)?; 142 | 143 | if lines.len() < 2 { 144 | GuideLibrary::new(vec![]) 145 | } else { 146 | let delim: char = if lines[0].chars().filter(|ch| *ch == '\t').count() >= 2 { 147 | '\t' 148 | } else if lines[0].chars().filter(|ch| *ch == ',').count() >= 2 { 149 | ',' 150 | } else { 151 | bail!("couldn't detect delimiter from first line of {:?}", lib_path.as_ref()); 152 | }; 153 | 154 | // Read in the guides 155 | let mut guides = Vec::with_capacity(1024); 156 | let mut idx: usize = 0; 157 | for line in lines.iter().skip(1) { 158 | let trimmed = line.trim(); 159 | 160 | if !trimmed.is_empty() { 161 | let fields = trimmed.split(delim).collect_vec(); 162 | if fields.len() < 3 { 163 | bail!("Too few fields in line: '{}'", line); 164 | } 165 | 166 | let guide_id = fields[0]; 167 | let bases = fields[1]; 168 | let gene = fields[2]; 169 | 170 | let kind = if essentials.contains(gene) { 171 | GuideType::Essential 172 | } else if non_essentials.contains(gene) { 173 | GuideType::Nonessential 174 | } else if control_guides.contains(guide_id) 175 | || control_regex 176 | .as_ref() 177 | .filter(|re| re.is_match(guide_id) || re.is_match(gene)) 178 | .is_some() 179 | { 180 | GuideType::Control 181 | } else { 182 | GuideType::Other 183 | }; 184 | 185 | let guide = Guide::new(idx, guide_id, bases, gene, kind); 186 | guides.push(guide); 187 | idx += 1; 188 | } 189 | } 190 | 191 | GuideLibrary::new(guides) 192 | } 193 | } 194 | 195 | /// 
Reads all lines from a file, trims them, extracts the first tab-separated field and then 196 | /// returns the unique set of values as a set. 197 | fn read_to_set>(path: &Option

) -> Result> { 198 | let items: HashSet = match path { 199 | None => HashSet::new(), 200 | Some(p) => Io::default() 201 | .read_lines(p)? 202 | .into_iter() 203 | .map(|line| line.trim().to_string()) 204 | .filter(|line| !line.is_empty()) 205 | .map(|line| line.split('\t').next().unwrap().to_string()) 206 | .collect(), 207 | }; 208 | 209 | Ok(items) 210 | } 211 | 212 | /// Returns true if the sequence is all upper-case ACGT bases 213 | fn is_acgt(bases: &[u8]) -> bool { 214 | bases.iter().copied().all(|b| b == b'A' || b == b'C' || b == b'G' || b == b'T') 215 | } 216 | 217 | /// Returns the number of guides in the library 218 | pub fn len(&self) -> usize { 219 | self.guides.len() 220 | } 221 | 222 | /// True if there are no guides in the library, false otherwise 223 | pub fn is_empty(&self) -> bool { 224 | self.len() == 0 225 | } 226 | } 227 | 228 | #[cfg(test)] 229 | mod tests { 230 | use super::*; 231 | use std::collections::HashMap; 232 | use tempfile::TempDir; 233 | 234 | const CSV_LIBRARY: &str = "\ 235 | id,bases,gene 236 | a1,CGATCGCTTAAGCTAGCA,FOO 237 | a2,ATGCTAGATCGCGCTATT,FOO 238 | a3,GGCTTCTAGATCGCTATA,Control 239 | "; 240 | 241 | const TSV_LIBRARY: &str = "\ 242 | id\tbases\tgene 243 | b1\tCGATCGCTTAAGCTAGCA\tFOO 244 | b2\tATGCTAGATCGCGCTATT\tFOO 245 | b3\tGGCTTCTAGATCGCTATA\tControl 246 | "; 247 | 248 | #[test] 249 | fn test_guide_uppercases_sequence() { 250 | let g1 = Guide::new(0, "foo-1", "AAAAACCCCCGGGGGTTTTT", "FOO", GuideType::Other); 251 | assert_eq!(g1.index, 0); 252 | assert_eq!(g1.id, "foo-1"); 253 | assert_eq!(g1.bases_str, "AAAAACCCCCGGGGGTTTTT"); 254 | assert_eq!(g1.bases, "AAAAACCCCCGGGGGTTTTT".as_bytes()); 255 | assert_eq!(g1.gene, "FOO"); 256 | assert_eq!(g1.len(), 20); 257 | 258 | let g2 = Guide::new(0, "foo-2", "aAaAcCcCgGgGtTtTacgt", "FOO", GuideType::Other); 259 | assert_eq!(g2.index, 0); 260 | assert_eq!(g2.id, "foo-2"); 261 | assert_eq!(g2.bases_str, "AAAACCCCGGGGTTTTACGT"); 262 | assert_eq!(g2.bases, 
"AAAACCCCGGGGTTTTACGT".as_bytes()); 263 | assert_eq!(g2.gene, "FOO"); 264 | assert_eq!(g2.len(), 20); 265 | } 266 | 267 | #[test] 268 | fn test_is_all_acgt() { 269 | // True cases 270 | assert!(GuideLibrary::is_acgt("".as_bytes())); 271 | assert!(GuideLibrary::is_acgt("AACGCTGACTGA".as_bytes())); 272 | 273 | // False cases 274 | assert!(!GuideLibrary::is_acgt("N".as_bytes())); 275 | assert!(!GuideLibrary::is_acgt("AC GT".as_bytes())); 276 | assert!(!GuideLibrary::is_acgt("AC-GT".as_bytes())); 277 | assert!(!GuideLibrary::is_acgt("acgt".as_bytes())); 278 | } 279 | 280 | #[test] 281 | fn test_guide_library_positive() { 282 | let g1 = Guide::new(0, "foo-1", "ACGTCAGCATGCATGACGTT", "FOO", GuideType::Other); 283 | let g2 = Guide::new(1, "foo-2", "GCTAGACTGGACTCTAATGC", "FOO", GuideType::Other); 284 | 285 | let l1 = GuideLibrary::new(vec![]).unwrap(); 286 | assert_eq!(l1.len(), 0); 287 | assert!(l1.is_empty()); 288 | 289 | let l2 = GuideLibrary::new(vec![g1.clone(), g2.clone()]).unwrap(); 290 | assert_eq!(l2.len(), 2); 291 | assert_eq!(l2.guides, vec![g1, g2]); 292 | assert_eq!(l2.guide_length, 20); 293 | } 294 | 295 | #[test] 296 | fn test_guide_library_rejects_mixed_length() { 297 | let g1 = Guide::new(0, "foo-1", "ACGTCAGCATGCATGACGTT", "FOO", GuideType::Other); 298 | let g2 = Guide::new(1, "foo-2", "GCTAGACTGGACTCTAATGCC", "FOO", GuideType::Other); 299 | 300 | let result = GuideLibrary::new(vec![g1, g2]); 301 | assert!(result.err().unwrap().to_string().contains("More than one guide length found")); 302 | } 303 | 304 | #[test] 305 | fn test_guide_library_rejects_invalid_sequences() { 306 | let g1 = Guide::new(0, "foo-1", "ACGTCAGCANNNATGACGTT", "FOO", GuideType::Other); 307 | let g2 = Guide::new(1, "foo-2", "hello!", "FOO", GuideType::Other); 308 | 309 | assert!(GuideLibrary::new(vec![g1]).err().unwrap().to_string().contains("non-ACGT")); 310 | assert!(GuideLibrary::new(vec![g2]).err().unwrap().to_string().contains("non-ACGT")); 311 | } 312 | 313 | #[test] 314 | fn 
test_guide_library_rejects_duplicate_sequences() { 315 | let g1 = Guide::new(0, "foo-1", "ACGTCAGCATGCATGACGTT", "FOO", GuideType::Other); 316 | let g2 = Guide::new(1, "foo-2", "ACGTCAGCATGCATGACGTT", "FOO", GuideType::Other); 317 | assert!(GuideLibrary::new(vec![g1, g2]).err().unwrap().to_string().contains("unique")); 318 | } 319 | 320 | #[test] 321 | fn test_reading_guide_library_from_csv_file() { 322 | let dir = tempfile::tempdir().unwrap(); 323 | let path = dir.path().join("lib.csv"); 324 | Io::default().write_lines(&path, vec![CSV_LIBRARY]).unwrap(); 325 | let lib = GuideLibrary::from_file(&path).unwrap(); 326 | 327 | assert_eq!(lib.len(), 3); 328 | assert_eq!(lib.guides.iter().map(|g| g.id.as_str()).collect_vec(), vec!["a1", "a2", "a3"]); 329 | } 330 | 331 | #[test] 332 | fn test_reading_guide_library_from_tsv_file() { 333 | let dir = tempfile::tempdir().unwrap(); 334 | let path = dir.path().join("lib.tsv"); 335 | Io::default().write_lines(&path, vec![TSV_LIBRARY]).unwrap(); 336 | let lib = GuideLibrary::from_file(&path).unwrap(); 337 | 338 | assert_eq!(lib.len(), 3); 339 | assert_eq!(lib.guides.iter().map(|g| g.id.as_str()).collect_vec(), vec!["b1", "b2", "b3"]); 340 | } 341 | 342 | #[test] 343 | fn test_load_guide_library_from_files() { 344 | let tmp = TempDir::new().unwrap(); 345 | let lib_path = tmp.path().join("library.tsv.gz"); 346 | let ess_path = tmp.path().join("essential.txt"); 347 | let non_path = tmp.path().join("non-essential.txt"); 348 | let ctl_path = tmp.path().join("control-guides.txt"); 349 | 350 | let io = Io::default(); 351 | io.write_lines( 352 | &lib_path, 353 | vec![ 354 | "guide\tbases\tgene", 355 | "g1.1\tAAAAAAAAAA\tG1", 356 | "g1.2\tCCCCCCCCCC\tG1", 357 | "g2.1\tGGGGGGGGGG\tG2", 358 | "g2.2\tTTTTTTTTTT\tG2", 359 | "g3.1\tACACACACAC\tG3", 360 | "g3.2\tAGAGAGAGAG\tG3", 361 | "g4.1\tATATATATAT\tG4", 362 | "g4.2\tCACACACACA\tG4", 363 | "c1\tGGAGGAGGAG\tnon-target1", 364 | "c2\tGGTGGTGGTG\tnon-target2", 365 | "c3\tAAGTAAGTCC\tcontrol", 
366 | ], 367 | ) 368 | .unwrap(); 369 | 370 | io.write_lines(&ess_path, vec!["G1", "G3"]).unwrap(); 371 | io.write_lines(&non_path, vec!["G4"]).unwrap(); 372 | io.write_lines(&ctl_path, vec!["c1", "c2"]).unwrap(); 373 | 374 | let lib = GuideLibrary::from_files( 375 | &lib_path, 376 | &Some(ess_path), 377 | &Some(non_path), 378 | &Some(ctl_path), 379 | &Some("Control"), 380 | ) 381 | .unwrap(); 382 | 383 | let map: HashMap<&str, &GuideType> = 384 | lib.guides.iter().map(|g| (g.id.as_str(), &g.kind)).collect(); 385 | assert_eq!(map["g1.1"], &GuideType::Essential); 386 | assert_eq!(map["g1.2"], &GuideType::Essential); 387 | assert_eq!(map["g2.1"], &GuideType::Other); 388 | assert_eq!(map["g2.2"], &GuideType::Other); 389 | assert_eq!(map["g3.1"], &GuideType::Essential); 390 | assert_eq!(map["g3.2"], &GuideType::Essential); 391 | assert_eq!(map["g4.1"], &GuideType::Nonessential); 392 | assert_eq!(map["g4.2"], &GuideType::Nonessential); 393 | assert_eq!(map["c1"], &GuideType::Control); 394 | assert_eq!(map["c2"], &GuideType::Control); 395 | assert_eq!(map["c3"], &GuideType::Control); 396 | } 397 | } 398 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | pub mod commands; 2 | pub mod guide; 3 | 4 | use anyhow::Result; 5 | use clap::Parser; 6 | use commands::command::Command; 7 | use commands::*; 8 | use enum_dispatch::enum_dispatch; 9 | use env_logger::Env; 10 | 11 | #[global_allocator] 12 | static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; 13 | 14 | #[derive(Parser, Debug)] 15 | struct Args { 16 | #[clap(subcommand)] 17 | subcommand: Subcommand, 18 | } 19 | 20 | #[enum_dispatch(Command)] 21 | #[derive(Parser, Debug)] 22 | enum Subcommand { 23 | Count(count::Count), 24 | } 25 | 26 | fn main() -> Result<()> { 27 | env_logger::Builder::from_env(Env::default().default_filter_or("info")).init(); 28 | let args: Args = Args::parse(); 29 | 
args.subcommand.execute() 30 | } 31 | --------------------------------------------------------------------------------