├── .gitignore ├── LICENSE ├── README.md ├── crabs ├── docker_intro └── README.md ├── figures_readme ├── amplicon-length-figure.png ├── amplification-efficiency.png ├── crabs_blasttax.png ├── crabs_completeness.png ├── crabs_cutadapt_error.png ├── crabs_dereplicate.png ├── crabs_download_bold.png ├── crabs_download_mitofish.png ├── crabs_download_ncbi.png ├── crabs_download_ncbi_output.png ├── crabs_download_taxonomy.png ├── crabs_export.png ├── crabs_filter.png ├── crabs_greengenes.png ├── crabs_help.png ├── crabs_import.png ├── crabs_insilico.png ├── crabs_merge.png ├── crabs_midori.png ├── crabs_pga.png ├── crabs_silva.png ├── crabs_subset.png ├── diversity-figure.png ├── phylo_tree.png ├── unite_first.png ├── unite_second.png └── unite_third.png ├── function ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── module_1.cpython-36.pyc │ ├── module_3.cpython-36.pyc │ └── module_5.cpython-36.pyc ├── crabs_functions.py └── older_versions │ ├── crabs_v1.0.0 │ ├── reference_database_creator_v2.1.py │ └── reference_database_creator_v2.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | functions/module_1.py 2 | .DS_Store 3 | *.pyc 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 gjeunen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crabs: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | ################## 4 | # IMPORT MODULES # 5 | ################## 6 | import os, rich, rich.progress, collections 7 | import rich_click as click 8 | from function import __version__ 9 | from function.crabs_functions import (check_params, 10 | check_midori_values, 11 | embl_url, 12 | midori_url, 13 | parse_exclude, 14 | set_output_dir, 15 | download_file, 16 | remove_tar_intermediary, 17 | unzip_with_progress, 18 | download_chunked_file, 19 | gunzip_with_progress, 20 | download_ncbi_seqs, 21 | retrieve_species, 22 | build_query, 23 | ncbi_download_info, 24 | select_function, 25 | names_to_memory, 26 | nodes_to_memory, 27 | accession_to_memory, 28 | generate_lineages, 29 | fill_missing_lineages, 30 | dict_to_output, 31 | merge_uniq_databases, 32 | merge_databases, 33 | check_files, 34 | write_list_to_output, 35 | filter_function, 36 | select_subset, 37 | subset_function, 38 | classifier_format, 39 | idt_text, 40 | blast_no_tax, 41 | blast_tax, 42 | unknown_base_conversion, 43 | rev_comp, 44 | cutadapt, 45 | crabs_to_fasta, 46 | multiple_crabs_to_fasta, 47 | multiple_list_to_temp, 48 | usearch_global, 49 | extract_alignment_results, 50 | write_dict_to_output, 51 | parse_diversity, 52 | horizontal_bar_chart, 53 | parse_length, 54 | line_graph, 55 | calculate_ncbi_species_genera, 56 | calculate_database_species_genera, 57 | completeness_table_output, 58 | parse_phylo_input, 59 | subset_phylo_input, 60 | dict_to_fasta, 61 | align_sequences, 62 | generate_phylo_tree, 63 | amplicon_import, 64 | raw_import, 65 | extract_primer_regions, 66 | deconstruct_primer_regions, 67 | dict_to_array, 68 | efficiency_barplot, 69 | parse_primer, 70 | ) 71 | 72 | ##################### 73 | # CLI CONFIGURATION # 74 | ##################### 75 | # formatting 76 | click.rich_click.USE_RICH_MARKUP = True 77 | click.rich_click.SHOW_METAVARS_COLUMN = False 78 | click.rich_click.APPEND_METAVARS_HELP = True 79 | click.rich_click.HEADER_TEXT = (f"[yellow]/[/][cyan]/[/][yellow]/[/] [bold][link=https://github.com/gjeunen/reference_database_creator]CRABS[/link][/] | v{__version__}") 80 | click.rich_click.FOOTER_TEXT = "See [link=https://github.com/gjeunen/reference_database_creator]https://github.com/gjeunen/reference_database_creator[/] for more details." 
81 | click.rich_click.ERRORS_SUGGESTION = f"This is CRABS [cyan]v{__version__}[/]\nFor more help, run '[yellow]crabs --help[/]' or visit [link=https://github.com/gjeunen/reference_database_creator]https://github.com/gjeunen/reference_database_creator[/]" 82 | click.rich_click.STYLE_ERRORS_SUGGESTION = "" 83 | 84 | # grouping of options 85 | click.rich_click.OPTION_GROUPS = { 86 | "crabs": [ 87 | { 88 | "name": "Download NCBI Taxonomy", 89 | "options": [ 90 | "--download-taxonomy", 91 | "--exclude", 92 | "--output", 93 | ], 94 | #"deduplicate": False 95 | }, 96 | { 97 | "name": "Download BOLD Database", 98 | "options": [ 99 | "--download-bold", 100 | "--taxon", 101 | "--marker", 102 | "--output", 103 | ], 104 | #"deduplicate": False 105 | }, 106 | { 107 | "name": "Download EMBL Database", 108 | "options": [ 109 | "--download-embl", 110 | "--taxon", 111 | "--output", 112 | ], 113 | #"deduplicate": False 114 | }, 115 | { 116 | "name": "Download GreenGenes Database", 117 | "options": [ 118 | "--download-greengenes", 119 | "--output", 120 | ], 121 | #"deduplicate": False 122 | }, 123 | { 124 | "name": "Download MIDORI2 Database", 125 | "options": [ 126 | "--download-midori", 127 | "--gene", 128 | "--gb-number", 129 | "--gb-type", 130 | "--output", 131 | ], 132 | #"deduplicate": False 133 | }, 134 | { 135 | "name": "Download MitoFish Database", 136 | "options": [ 137 | "--download-mitofish", 138 | "--output", 139 | ], 140 | #"deduplicate": False 141 | }, 142 | { 143 | "name": "Download NCBI Database", 144 | "options": [ 145 | "--download-ncbi", 146 | "--email", 147 | "--query", 148 | "--database", 149 | "--batchsize", 150 | "--species", 151 | "--output", 152 | ], 153 | #"deduplicate": False 154 | }, 155 | { 156 | "name": "Download SILVA Database", 157 | "options": [ 158 | "--download-silva", 159 | "--gene", 160 | "--db-type", 161 | "--db-version", 162 | "--output", 163 | ], 164 | #"deduplicate": False 165 | }, 166 | { 167 | "name": "Import sequences into CRABS format", 168 | "options": [ 169 | "--import", 170 | "--import-format", 171 | "--names", 172 | "--nodes", 173 | "--acc2tax", 174 | "--input", 175 | "--output", 176 | "--ranks", 177 | ], 178 | #"deduplicate": False 179 | }, 180 | { 181 | "name": "Merge CRABS databases into one file", 182 | "options": [ 183 | "--merge", 184 | "--input", 185 | "--output", 186 | "--uniq", 187 | ], 188 | #"deduplicate": False 189 | }, 190 | { 191 | "name": "Extract amplicons through in silico PCR", 192 | "options": [ 193 | "--in-silico-pcr", 194 | "--input", 195 | "--output", 196 | "--forward", 197 | "--reverse", 198 | "--mismatch", 199 | "--threads", 200 | "--untrimmed", 201 | ], 202 | #"deduplicate": False 203 | }, 204 | { 205 | "name": "Retrieve amplicons without primer-binding regions", 206 | "options": [ 207 | "--pairwise-global-alignment", 208 | "--input", 209 | "--output", 210 | "--amplicons", 211 | "--forward", 212 | "--reverse", 213 | "--size-select", 214 | "--threads", 215 | "--percent-identity", 216 | "--coverage", 217 | "--all-start-positions", 218 | ], 219 | #"deduplicate": False 220 | }, 221 | { 222 | "name": "Dereplicate CRABS database", 223 | "options": [ 224 | "--dereplicate", 225 | "--input", 226 | "--output", 227 | "--dereplication-method", 228 | ], 229 | #"deduplicate": False 230 | }, 231 | { 232 | "name": "Filter CRABS database", 233 | "options": [ 234 | "--filter", 235 | "--input", 236 | "--output", 237 | "--minimum-length", 238 | "--maximum-length", 239 | "--maximum-n", 240 | "--environmental", 241 | "--no-species-id", 242 | "--rank-na", 
        ],
        #"deduplicate": False
    },
    {
        "name": "Subset CRABS database on taxonomic ID",
        "options": [
            "--subset",
            "--input",
            "--output",
            "--include",
            "--exclude",
        ],
        #"deduplicate": False
    },
    {
        "name": "Figure: diversity contained within database",
        "options": [
            "--diversity-figure",
            "--input",
            "--output",
            "--tax-level",
        ],
        #"deduplicate": False
    },
    {
        "name": "Figure: amplicon length distribution",
        "options": [
            "--amplicon-length-figure",
            "--input",
            "--output",
            "--tax-level",
        ],
        #"deduplicate": False
    },
    {
        "name": "Figure: phylogenetic tree",
        "options": [
            "--phylogenetic-tree",
            "--input",
            "--output",
            "--tax-level",
            "--species",
        ],
        #"deduplicate": False
    },
    {
        "name": "Figure: amplification efficiency",
        "options": [
            "--amplification-efficiency-figure",
            "--input",
            "--amplicons",
            "--forward",
            "--reverse",
            "--output",
            "--tax-group",
        ],
        #"deduplicate": False
    },
    {
        "name": "Table: database completeness for target taxonomic group",
        "options": [
            "--completeness-table",
            "--input",
            "--output",
            "--names",
            "--nodes",
            "--species",
        ],
        #"deduplicate": False
    },
    {
        "name": "Export CRABS database to taxonomic classifier format",
        "options": [
            "--export",
            "--input",
            "--output",
            "--export-format",
        ],
        #"deduplicate": False
    },
    ],
}

# link user-input to options
@click.command(context_settings=dict(help_option_names=["-h", "--help"]))

# CRABS functions
@click.option("--download-taxonomy", "download_taxonomy_", is_flag = True, help = "Function to download NCBI taxonomy")
@click.option("--download-bold", "download_bold_", is_flag = True, help = "Function to download BOLD database")
@click.option("--download-embl", "download_embl_", is_flag = True, help = "Function to download EMBL database")
@click.option("--download-greengenes", "download_greengenes_", is_flag = True, help = "Function to download GreenGenes database")
@click.option("--download-midori", "download_midori_", is_flag = True, help = "Function to download MIDORI2 database")
@click.option("--download-mitofish", "download_mitofish_", is_flag = True, help = "Function to download MitoFish database")
@click.option("--download-ncbi", "download_ncbi_", is_flag = True, help = "Function to download NCBI database")
@click.option("--download-silva", "download_silva_", is_flag = True, help = "Function to download SILVA database")
@click.option("--import", "import_", is_flag = True, help = "Function to import sequences into CRABS format")
@click.option("--merge", "merge_", is_flag = True, help = "Function to merge CRABS databases into a single file")
@click.option("--in-silico-pcr", "in_silico_pcr_", is_flag = True, help = "Function to extract amplicons through in silico PCR")
@click.option("--pairwise-global-alignment", "pairwise_global_alignment_", is_flag = True, help = "Function to retrieve amplicons without primer-binding regions")
@click.option("--dereplicate", "dereplicate_", is_flag = True, help = "Function to dereplicate a CRABS database")
@click.option("--filter", "filter_", is_flag = True, help = "Function to filter a CRABS database")
@click.option("--subset", "subset_", is_flag = True, help = "Function to subset a CRABS database")
@click.option("--diversity-figure", "diversity_figure_", is_flag = True, help = "Function to create a horizontal bar chart with included diversity")
@click.option("--amplicon-length-figure", "amplicon_length_figure_", is_flag = True, help = "Function to create a line chart depicting amplicon distributions")
@click.option("--phylogenetic-tree", "phylogenetic_tree_", is_flag = True, help = "Function to create a phylogenetic tree with barcodes for target species list")
@click.option("--amplification-efficiency-figure", "amplification_efficiency_figure_", is_flag = True, help = "Function to create a bar graph displaying mismatches in the primer-binding region")
@click.option("--completeness-table", "completeness_table_", is_flag = True, help = "Function to create a spreadsheet containing barcode availability for taxonomic groups")
@click.option("--export", "export_", is_flag = True, help = "Function to export a CRABS database")

# CRABS parameters
@click.option("--output", "output_", help = "output directory or filename")
@click.option("--exclude", "exclude_", help = "stop the download of 'acc2taxid' or 'taxdump'")
@click.option("--taxon", "taxon_", help = "taxonomic group to download")
@click.option("--gene", "gene_", help = "gene to download")
@click.option("--gb-number", "gb_number_", help = "database version to download")
@click.option("--gb-type", "gb_type_", type = str, help = "database type to download")
@click.option("--marker", "marker_", help = "genetic marker to download")
@click.option("--email", "email_", help = "email address to connect to NCBI server")
@click.option("--query", "query_", help = "query identifying what to download from NCBI")
@click.option("--database", "database_", help = "the database from which NCBI sequences are downloaded")
@click.option("--batchsize", "batchsize_", default = 5000, type = int, help = "sequences to download from NCBI per chunk (default = 5,000)")
@click.option("--species", "species_", help = "species of interest list")
@click.option("--db-type", "db_type_", help = "database type to download")
@click.option("--db-version", "db_version_", help = "database version to download")
@click.option("--import-format", "import_format_", help = "format of the sequences to import")
@click.option("--names", "names_", help = "NCBI taxonomy 'names.dmp' file")
@click.option("--nodes", "nodes_", help = "NCBI taxonomy 'nodes.dmp' file")
@click.option("--acc2tax", "acc2tax_", help = "NCBI taxonomy 'nucl_gb.accession2taxid' file")
@click.option("--input", "input_", help = "input filename")
@click.option("--ranks", "ranks_", default = 'superkingdom;phylum;class;order;family;genus;species', help = "taxonomic ranks to be included in the taxonomic lineage")
@click.option("--uniq", "uniq_", is_flag = True, help = "keep only unique accession numbers")
@click.option("--dereplication-method", "dereplication_method_", default = 'unique_species', help = 'dereplication method: "strict", "single_species", and "unique_species" (default)')
@click.option("--minimum-length", "minimum_length_", help = "minimum sequence length for amplicon to be retained in the database", type = int)
@click.option("--maximum-length", "maximum_length_", help = "maximum sequence length for amplicon to be retained in the database", type = int)
@click.option("--maximum-n", "maximum_n_", help = "discard amplicons with N or more ambiguous bases", type = int)
@click.option("--environmental", "environmental_", is_flag = True, help = "discard environmental sequences from the database")
@click.option("--no-species-id", "no_species_id_", is_flag = True, help = "discard sequences for which no species name is available")
@click.option("--rank-na", "rank_na_", help = "discard sequences with N or more unspecified taxonomic levels", type = int)
@click.option("--include", "include_", help = "string or file containing taxa to include")
@click.option("--exclude", "exclude_", help = "string or file containing taxa to exclude")
@click.option("--export-format", "export_format_", help = 'export format: "sintax", "rdp", "qiime-fasta", "qiime-text", "dada2-species", "dada2-taxonomy", "idt-fasta", "idt-text", "blast-notax", "blast-tax"')
@click.option("--forward", "forward_", help = "forward primer sequence in 5' -> 3' direction")
@click.option("--reverse", "reverse_", help = "reverse primer sequence in 5' -> 3' direction")
@click.option("--mismatch", "mismatch_", type = float, default = 4.5, help = "number of mismatches allowed in the primer-binding site (default: 4.5)")
@click.option("--threads", "threads_", type = int, default = 0, help = "number of threads used to compute the in silico PCR (default: autodetection)")
@click.option("--untrimmed", "untrimmed_", help = "file name for untrimmed sequences")
@click.option("--amplicons", "amplicons_", help = "file name for the amplicons retrieved during in silico PCR")
@click.option("--size-select", "size_select_", help = "exclude reads longer than N from the analysis")
@click.option("--percent-identity", "percent_identity_", help = "minimum percent identity threshold for the alignment to pass (0.0 - 1.0)")
@click.option("--coverage", "coverage_", help = "minimum coverage threshold for the alignment to pass (0 - 100)")
@click.option("--all-start-positions", "all_start_positions_", is_flag = True, help = "do not restrict alignment start and end to be within the primer-binding region length")
@click.option("--tax-level", "tax_level_", type = int, help = "taxonomic level to be used as groups for horizontal bar chart")
@click.option("--tax-group", "tax_group_", help = "taxonomic group of interest to be included in the analysis")

#################
# MAIN FUNCTION #
#################
def crabs(**kwargs):
    """CRABS is an open-source software program that enables scientists to build custom local reference databases for improved taxonomy assignment of metabarcoding data.

    CRABS is split up into various functions and steps to accomplish this task, including:

    (1) download data from online repositories,

    (2) import downloaded data into CRABS format,

    (3) extract amplicons from imported data,

    (4) retrieve amplicons without primer-binding regions,

    (5) curate and subset the local database,

    (6) export the local database in various taxonomic classifier formats, and

    (7) generate basic visualisations to explore the local reference database.
418 | 419 | 420 | 421 | A basic example to run CRABS (download NCBI taxonomy information): 422 | 423 | [blue bold]crabs --download-taxonomy --exclude 'acc2taxid'[/] 424 | """ 425 | 426 | # access all functions from kwargs 427 | download_taxonomy_ = kwargs.get("download_taxonomy_") 428 | download_bold_ = kwargs.get("download_bold_") 429 | download_embl_ = kwargs.get("download_embl_") 430 | download_greengenes_ = kwargs.get("download_greengenes_") 431 | download_midori_ = kwargs.get("download_midori_") 432 | download_mitofish_ = kwargs.get("download_mitofish_") 433 | download_ncbi_ = kwargs.get("download_ncbi_") 434 | download_silva_ = kwargs.get("download_silva_") 435 | import_ = kwargs.get("import_") 436 | merge_ = kwargs.get("merge_") 437 | in_silico_pcr_ = kwargs.get("in_silico_pcr_") 438 | pairwise_global_alignment_ = kwargs.get("pairwise_global_alignment_") 439 | dereplicate_ = kwargs.get("dereplicate_") 440 | filter_ = kwargs.get("filter_") 441 | subset_ = kwargs.get("subset_") 442 | diversity_figure_ = kwargs.get("diversity_figure_") 443 | amplicon_length_figure_ = kwargs.get("amplicon_length_figure_") 444 | phylogenetic_tree_ = kwargs.get("phylogenetic_tree_") 445 | amplification_efficiency_figure_ = kwargs.get("amplification_efficiency_figure_") 446 | completeness_table_ = kwargs.get("completeness_table_") 447 | export_ = kwargs.get("export_") 448 | 449 | # access all options from kwargs 450 | output_ = kwargs.get("output_") 451 | exclude_ = kwargs.get("exclude_") 452 | taxon_ = kwargs.get("taxon_") 453 | marker_ = kwargs.get("marker_") 454 | gene_ = kwargs.get("gene_") 455 | gb_number_ = kwargs.get("gb_number_") 456 | gb_type_ = kwargs.get("gb_type_") 457 | email_ = kwargs.get("email_") 458 | query_ = kwargs.get("query_") 459 | database_ = kwargs.get("database_") 460 | batchsize_ = kwargs.get("batchsize_") 461 | species_ = kwargs.get("species_") 462 | db_type_ = kwargs.get("db_type_") 463 | db_version_ = kwargs.get("db_version_") 464 | import_format_ = kwargs.get("import_format_") 465 | names_ = kwargs.get("names_") 466 | nodes_ = kwargs.get("nodes_") 467 | acc2tax_ = kwargs.get("acc2tax_") 468 | input_ = kwargs.get("input_") 469 | ranks_ = kwargs.get("ranks_") 470 | uniq_ = kwargs.get("uniq_") 471 | dereplication_method_ = kwargs.get("dereplication_method_") 472 | minimum_length_ = kwargs.get("minimum_length_") 473 | maximum_length_ = kwargs.get("maximum_length_") 474 | maximum_n_ = kwargs.get("maximum_n_") 475 | environmental_ = kwargs.get("environmental_") 476 | no_species_id_ = kwargs.get("no_species_id_") 477 | rank_na_ = kwargs.get("rank_na_") 478 | include_ = kwargs.get("include_") 479 | exclude_ = kwargs.get("exclude_") 480 | export_format_ = kwargs.get("export_format_") 481 | forward_ = kwargs.get("forward_") 482 | reverse_ = kwargs.get("reverse_") 483 | mismatch_ = kwargs.get("mismatch_") 484 | threads_ = kwargs.get("threads_") 485 | untrimmed_ = kwargs.get("untrimmed_") 486 | amplicons_ = kwargs.get("amplicons_") 487 | size_select_ = kwargs.get("size_select_") 488 | percent_identity_ = kwargs.get("percent_identity_") 489 | coverage_ = kwargs.get("coverage_") 490 | include_all_start_positions_ = kwargs.get("all_start_positions_") 491 | tax_level_ = kwargs.get("tax_level_") 492 | tax_group_ = kwargs.get("tax_group_") 493 | 494 | # print starting info to console 495 | console = rich.console.Console(stderr=True, highlight=False) 496 | console.print(f"\n[yellow]/[/][cyan]/[/][yellow]/[/] [bold][link=https://github.com/gjeunen/reference_database_creator]CRABS[/link][/] | 
v{__version__}\n") 497 | columns = [*rich.progress.Progress.get_default_columns(), rich.progress.TimeElapsedColumn()] 498 | 499 | # identify function and execute 500 | ##################### 501 | # DOWNLOAD TAXONOMY # 502 | ##################### 503 | if download_taxonomy_: 504 | # print function to console 505 | console.print(f"[cyan]| Function[/] | Download NCBI taxonomy files") 506 | # check what files to download based on exclude_ 507 | download_dict = parse_exclude(exclude_) 508 | # set output directory 509 | output_directory = set_output_dir(output_) 510 | # iterate over download_dict for the different files 511 | for key, value in download_dict.items(): 512 | # set filename 513 | filename = value.split('/')[-1] 514 | # download file 515 | download_file(console, columns, value, output_directory, filename) 516 | # unzip file 517 | unzip_method = select_function(key) 518 | unzip_method(console, columns, output_directory, filename) 519 | # remove zipped and intermediary files 520 | os.remove(f'{output_directory}{filename}') 521 | remove_tar_intermediary(key, output_directory) 522 | 523 | ################# 524 | # DOWNLOAD BOLD # 525 | ################# 526 | if download_bold_: 527 | # print function to console 528 | console.print(f"[cyan]| Function[/] | Download BOLD database") 529 | # check if all parameters have been provided 530 | check_params(console, {'"--output"': output_, '"--taxon"': taxon_}) 531 | # set url, output directory, and filename 532 | url = 'http://v3.boldsystems.org/index.php/API_Public/sequence?taxon=' + taxon_ 533 | if marker_: 534 | url = url + '&marker=' + marker_ 535 | output_directory = f'{os.path.dirname(output_)}/' 536 | if output_directory == '/': 537 | output_directory = '' 538 | filename = output_.split('/')[-1] 539 | # download the file 540 | download_chunked_file(console, columns, url, output_directory, filename) 541 | 542 | ################# 543 | # DOWNLOAD EMBL # 544 | ################# 545 | if download_embl_: 546 | # print function to console 547 | console.print(f"[cyan]| Function[/] | Download EMBL database") 548 | # check if all parameters have been provided 549 | check_params(console, {'"--output"': output_, '"--taxon"': taxon_}) 550 | # find all matching files to taxon_ and store as urls 551 | urls = embl_url(console, taxon_) 552 | # print number of files to download to console 553 | if len(urls) > 1: 554 | console.print(f"[cyan]| Results[/] | Downloading {len(urls)} files from EMBL") 555 | else: 556 | console.print(f"[cyan]| Results[/] | Downloading {len(urls)} file from EMBL") 557 | # set output_directory, and filename 558 | output_directory = f'{os.path.dirname(output_)}/' 559 | if output_directory == '/': 560 | output_directory = '' 561 | outputfilename = output_.split('/')[-1] 562 | # download the files 563 | matching_files = [] 564 | for url in urls: 565 | zipfilename = url.split('/')[-1] 566 | matching_files.append(zipfilename) 567 | download_file(console, columns, url, output_directory, zipfilename) 568 | # unzip files and remove zipped intermediary files 569 | file_count = 0 570 | for file in matching_files: 571 | file_count += 1 572 | if file_count == 1: 573 | gunzip_with_progress(console, columns, output_directory, file, outputfilename, append = False) 574 | else: 575 | gunzip_with_progress(console, columns, output_directory, file, outputfilename, append = True) 576 | os.remove(f'{output_directory}{file}') 577 | 578 | ####################### 579 | # DOWNLOAD GREENGENES # 580 | ####################### 581 | if download_greengenes_: 582 
| # print function to console 583 | console.print(f"[cyan]| Function[/] | Download GreenGenes database") 584 | # check if all parameters have been provided 585 | check_params(console, {'"--output"': output_}) 586 | # set url, output directory, and filename 587 | urls = ['https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_GREENGENES_gg16S_unaligned.fasta.gz', 588 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_prokMSA_unaligned.fasta.gz', 589 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_HOMD_gg16S_unaligned.fasta.gz', 590 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_NCBI_gg16S_unaligned.fasta.gz', 591 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_RDP_gg16S_unaligned.fasta.gz', 592 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_SILVA_gg16S_unaligned.fasta.gz'] 593 | output_directory = f'{os.path.dirname(output_)}/' 594 | if output_directory == '/': 595 | output_directory = '' 596 | outputfilename = output_.split('/')[-1] 597 | # download the zip files 598 | matching_files = [] 599 | for url in urls: 600 | zipfilename = url.split('/')[-1] 601 | matching_files.append(zipfilename) 602 | download_file(console, columns, url, output_directory, zipfilename) 603 | # unzip files and remove zipped intermediary files 604 | file_count = 0 605 | for file in matching_files: 606 | file_count += 1 607 | if file_count == 1: 608 | gunzip_with_progress(console, columns, output_directory, file, outputfilename, append = False) 609 | else: 610 | gunzip_with_progress(console, columns, output_directory, file, outputfilename, append = True) 611 | os.remove(f'{output_directory}{file}') 612 | 613 | #################### 614 | # DOWNLOAD MIDORI2 # 615 | #################### 616 | if download_midori_: 617 | # print function to console 618 | console.print(f"[cyan]| Function[/] | Download MIDORI2 database") 619 | # check if all parameters have been provided and correctly formatted 620 | check_params(console, {'"--output"': output_, '"--gene"': gene_, '"--gb-number"': gb_number_, '"--gb-type': gb_type_}) 621 | check_midori_values(console, gene_, gb_type_, gb_number_) 622 | # set url, output_directory, and filename 623 | zip_type, url = midori_url(gb_number_, gb_type_, gene_) 624 | output_directory = f'{os.path.dirname(output_)}/' 625 | if output_directory == '/': 626 | output_directory = '' 627 | zipfilename = url.split('/')[-1] 628 | outputfilename = output_.split('/')[-1] 629 | # download the zip file 630 | download_file(console, columns, url, output_directory, zipfilename) 631 | # unzip the downloaded file 632 | if zip_type == 'unzip': 633 | unzip_with_progress(console, columns, output_directory, zipfilename, outputfilename) 634 | elif zip_type == 'gunzip': 635 | gunzip_with_progress(console, columns, output_directory, zipfilename, outputfilename, append = False) 636 | # remove intermediary files 637 | os.remove(f'{output_directory}{zipfilename}') 638 | 639 | ##################### 640 | # DOWNLOAD MITOFISH # 641 | ##################### 642 | if download_mitofish_: 643 | # print function to console 644 | console.print(f"[cyan]| Function[/] | Download MitoFish database") 645 | # check if all parameters have been provided 646 | check_params(console, {'"--output"': output_}) 647 | # set url, output_directory, and filename 648 | url = 
'http://mitofish.aori.u-tokyo.ac.jp/species/detail/download/?filename=download%2F/complete_partial_mitogenomes.zip' 649 | output_directory = f'{os.path.dirname(output_)}/' 650 | if output_directory == '/': 651 | output_directory = '' 652 | zipfilename = url.split('/')[-1] 653 | outputfilename = output_.split('/')[-1] 654 | # download the zip file 655 | download_file(console, columns, url, output_directory, zipfilename) 656 | # unzip the downloaded file 657 | unzip_with_progress(console, columns, output_directory, zipfilename, outputfilename) 658 | # remove intermediary files 659 | os.remove(f'{output_directory}{zipfilename}') 660 | 661 | ################# 662 | # DOWNLOAD NCBI # 663 | ################# 664 | if download_ncbi_: 665 | # print function to console 666 | console.print(f"[cyan]| Function[/] | Download NCBI database") 667 | # check if all parameters have been provided 668 | check_params(console, {'"--output"': output_, '"--database"': database_, '"--query"': query_, '"--email"': email_}) 669 | # retrieve species information 670 | species_list = retrieve_species(console, columns, species_) if species_ else [] 671 | # build query 672 | query_list = build_query(species_list, query_) 673 | # retrieve the query key and web environment to download NCBI seq data 674 | total_read_count, ncbi_info_dict = ncbi_download_info(console, columns, query_list, database_, email_) 675 | # download NCBI sequences 676 | total_downloaded_seqs = download_ncbi_seqs(console, columns, total_read_count, batchsize_, database_, email_, ncbi_info_dict, output_) 677 | # write log to Terminal window 678 | try: 679 | console.print(f"[cyan]| Results[/] | Number of sequences downloaded: {total_downloaded_seqs}/{total_read_count} ({round(total_downloaded_seqs / total_read_count * 100, 2)}%)") 680 | except ZeroDivisionError: 681 | console.print(f"[cyan]| Results[/] | Number of sequences downloaded: 0") 682 | 683 | ################## 684 | # DOWNLOAD SILVA # 685 | ################## 686 | if download_silva_: 687 | # print function to console 688 | console.print(f"[cyan]| Function[/] | Download SILVA database") 689 | # check if all parameters have been provided 690 | check_params(console, {'"--output"': output_, '"--gene"': gene_, '"--db-type"': db_type_, '"--db-version"': db_version_}) 691 | # set url, output_directory, and filename 692 | if db_type_.upper() == 'FULL': 693 | url = f'https://ftp.arb-silva.de/release_{db_version_}/Exports/SILVA_{db_version_}_{gene_.upper()}Ref_tax_silva.fasta.gz' 694 | elif db_type_.upper() == 'SUBSET': 695 | url = f'https://ftp.arb-silva.de/release_{db_version_}/Exports/SILVA_{db_version_}_{gene_.upper()}Ref_NR99_tax_silva.fasta.gz' 696 | else: 697 | console.print(f"[cyan]| ERROR[/] | [bold yellow]incorrect value provided for '--db-type', aborting analysis...[/]\n") 698 | exit() 699 | output_directory = f'{os.path.dirname(output_)}/' 700 | if output_directory == '/': 701 | output_directory = '' 702 | zipfilename = url.split('/')[-1] 703 | outputfilename = output_.split('/')[-1] 704 | # download the zip file 705 | download_file(console, columns, url, output_directory, zipfilename) 706 | # unzip the downloaded file 707 | gunzip_with_progress(console, columns, output_directory, zipfilename, outputfilename, append = False) 708 | # remove intermediary files 709 | os.remove(f'{output_directory}{zipfilename}') 710 | 711 | ########## 712 | # IMPORT # 713 | ########## 714 | if import_: 715 | # print function to console 716 | console.print(f"[cyan]| Function[/] | Import sequence data into CRABS 
format") 717 | # check if all parameters have been provided (need to make a distinction in neccesary parameters between different formats) 718 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--format"': import_format_, '"--names"': names_, '"--nodes"': nodes_, '"--acc2taxid"': acc2tax_}) 719 | # read documents into memory 720 | input_files = [input_, names_, nodes_, acc2tax_] 721 | input_file_size = sum(os.path.getsize(input_file) for input_file in input_files) 722 | with rich.progress.Progress(*columns) as progress_bar: 723 | task = progress_bar.add_task(console = console, description = "[cyan]| Read data to memory[/] |", total=input_file_size) 724 | input_to_memory = select_function(import_format_) 725 | seq_input_dict, initial_seq_number = input_to_memory(task, progress_bar, input_) 726 | names_key_tax_number_value_dict, tax_number_key_names_value_dict, synonym_key_dict = names_to_memory(task, progress_bar, names_) 727 | tax_number_key_rank_and_tax_number_up_values_dict = nodes_to_memory(task, progress_bar, nodes_) 728 | acc_key_tax_number_value_dict = accession_to_memory(task, progress_bar, acc2tax_, seq_input_dict) 729 | # generate taxonomic lineages 730 | seq_input_dict, unresolved_lineage = generate_lineages(console, columns, ranks_, seq_input_dict, acc_key_tax_number_value_dict, names_key_tax_number_value_dict, synonym_key_dict, tax_number_key_rank_and_tax_number_up_values_dict, tax_number_key_names_value_dict) 731 | # fill out missing info 732 | seq_input_dict = fill_missing_lineages(console, columns, ranks_, seq_input_dict) 733 | # write to output 734 | dict_to_output(seq_input_dict, ranks_, output_) 735 | # write log to Terminal window 736 | console.print(f"[cyan]| Results[/] | Imported {len(seq_input_dict)} out of {initial_seq_number} sequences into CRABS format ({round(len(seq_input_dict) / initial_seq_number * 100, 2)}%)") 737 | if unresolved_lineage > 0: 738 | console.print(f"[cyan]| [/] | Could not resolve a taxonomic lineage for {unresolved_lineage} imported sequences ({round(unresolved_lineage / len(seq_input_dict) * 100, 2)}%)") 739 | 740 | ######### 741 | # MERGE # 742 | ######### 743 | if merge_: 744 | # print function to console 745 | console.print(f"[cyan]| Function[/] | Merge CRABS databases into a single file") 746 | # check if all parameters have been provided 747 | check_params(console, {'"--input"': input_, '"--output"': output_}) 748 | # check for multiple files and their existence 749 | file_list = check_files(console, input_) 750 | # merge databases based on "--uniq" parameter 751 | if uniq_: 752 | merged_seq_file, initial_read_count = merge_uniq_databases(console, columns, file_list) 753 | else: 754 | merged_seq_file, initial_read_count = merge_databases(console, columns, file_list) 755 | # write merged data to output 756 | write_list_to_output(console, columns, merged_seq_file, output_) 757 | # write log to Terminal window 758 | console.print(f"[cyan]| Results[/] | Written {len(merged_seq_file)} sequences to {output_} by merging {len(file_list)} files containing {initial_read_count} sequences ({round(len(merged_seq_file) / initial_read_count * 100, 2)}%)") 759 | 760 | ################# 761 | # IN SILICO PCR # 762 | ################# 763 | if in_silico_pcr_: 764 | # print function to console 765 | console.print(f"[cyan]| Function[/] | Extract amplicons through in silico PCR") 766 | # check if all parameters have been provided 767 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--forward"': forward_, 
'"--reverse"': reverse_}) 768 | # check primers for unknown bases and reverse complement reverse primer 769 | forward_ = unknown_base_conversion(forward_) 770 | reverse_ = unknown_base_conversion(reverse_) 771 | reverse_ = rev_comp(reverse_) 772 | # set parameters for cutadapt 773 | overlap = str(min(len(forward_), len(reverse_))) 774 | adapter = forward_ + '...' + reverse_ 775 | # transform input_ to fasta format in a temp file 776 | temp_input_path, fasta_dict = crabs_to_fasta(console, columns, input_) 777 | # run cutadapt 778 | trimmed_seqs, untrimmed_seqs = cutadapt(console, columns, adapter, temp_input_path, fasta_dict, mismatch_, overlap, threads_) 779 | # write data to output 780 | write_list_to_output(console, columns, trimmed_seqs, output_) 781 | if untrimmed_: 782 | write_list_to_output(console, columns, untrimmed_seqs, untrimmed_) 783 | # remove temporary files 784 | os.remove(temp_input_path) 785 | # write log to Terminal window 786 | console.print(f"[cyan]| Results[/] | Extracted {len(trimmed_seqs)} amplicons from {len(fasta_dict)} sequences ({round(len(trimmed_seqs) / len(fasta_dict) * 100, 2)}%)") 787 | 788 | ############################# 789 | # PAIRWISE GLOBAL ALIGNMENT # 790 | ############################# 791 | if pairwise_global_alignment_: 792 | # print function to console 793 | console.print(f"[cyan]| Function[/] | Retrieve amplicons without primer-binding regions") 794 | # check if all parameters have been provided 795 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--amplicons"': amplicons_, '"--forward"': forward_, '"--reverse"': reverse_, '"--percent-identity"': percent_identity_, '"--coverage"': coverage_}) 796 | # read data into memory 797 | file_list = [amplicons_, input_] 798 | raw_fasta_dict, raw_fasta_list, amplicon_fasta_dict, amplicon_fasta_list = multiple_crabs_to_fasta(console, columns, file_list, size_select_) 799 | # write input to temp files in fasta format 800 | raw_temp_path, amplicon_temp_path = multiple_list_to_temp(console, columns, raw_fasta_list, amplicon_fasta_list) 801 | # run pairwise global alignment 802 | align_temp_path = usearch_global(console, columns, raw_temp_path, amplicon_temp_path, percent_identity_, threads_, raw_fasta_dict) 803 | # extract the sequence regions that conform to parameter settings 804 | amplicon_fasta_dict = extract_alignment_results(console, columns, align_temp_path, amplicon_fasta_dict, include_all_start_positions_, coverage_, forward_, reverse_, raw_fasta_dict) 805 | # write data to output 806 | write_dict_to_output(console, columns, amplicon_fasta_dict, output_) 807 | # remove intermediary files 808 | os.remove(raw_temp_path) 809 | os.remove(amplicon_temp_path) 810 | os.remove(align_temp_path) 811 | # write log to Terminal window 812 | console.print(f"[cyan]| Results[/] | Retrieved {len(amplicon_fasta_dict) - len(amplicon_fasta_list)} amplicons without primer-binding regions from {len(raw_fasta_dict)} sequences") 813 | 814 | ############### 815 | # DEREPLICATE # 816 | ############### 817 | if dereplicate_: 818 | # print function to console 819 | console.print(f"[cyan]| Function[/] | Dereplicate CRABS database") 820 | # check if all parameters have been provided 821 | check_params(console, {'"--input"': input_, '"--output"': output_}) 822 | # select dereplication function 823 | dereplication_function = select_function(dereplication_method_) 824 | # dereplicate data 825 | initial_read_count, seq_file = dereplication_function(console, columns, input_) 826 | # write data to output 827 | 
write_list_to_output(console, columns, seq_file, output_) 828 | # write log to Terminal window 829 | console.print(f"[cyan]| Results[/] | Written {len(seq_file)} unique sequences to {output_} out of {initial_read_count} initial sequences ({round(len(seq_file) / initial_read_count * 100, 2)}%)") 830 | 831 | ########## 832 | # FILTER # 833 | ########## 834 | if filter_: 835 | # print function to console 836 | console.print(f"[cyan]| Function[/] | Filter CRABS database") 837 | # check if all parameters have been provided 838 | check_params(console, {'"--input"': input_, '"--output"': output_}) 839 | # print which filtering parameters are included 840 | included_parameters = [key for key, value in {'"--minimum-length"': minimum_length_, '"--maximum-length"': maximum_length_, '"--maximum-n"': maximum_n_, '"--environmental"': environmental_, '"--no-species-id"': no_species_id_, '"--rank-na"': rank_na_}.items() if value not in [None, False]] 841 | console.print(f"[cyan]| Included parameters[/] | {', '.join(included_parameters)}") 842 | # read input file and parse data 843 | initial_read_count, seq_file, min_len_count, max_len_count, max_n_count, env_count, no_spec_count, rank_count = filter_function(console, columns, input_, minimum_length_, maximum_length_, maximum_n_, environmental_, no_species_id_, rank_na_) 844 | # write data to output 845 | write_list_to_output(console, columns, seq_file, output_) 846 | # write log to Terminal window 847 | console.print(f"[cyan]| Results[/] | Written {len(seq_file)} filtered sequences to {output_} out of {initial_read_count} initial sequences ({round(len(seq_file) / initial_read_count * 100, 2)}%)") 848 | for item in [min_len_count, max_len_count, max_n_count, env_count, no_spec_count, rank_count]: 849 | for key, value in item.items(): 850 | if value != 0: 851 | console.print(f"[cyan]| [/] | {key}: {value} sequences not passing filter ({round(value / initial_read_count * 100, 2)}%)") 852 | 853 | ########## 854 | # SUBSET # 855 | ########## 856 | if subset_: 857 | # print function to console 858 | console.print(f"[cyan]| Function[/] | Subset CRABS database") 859 | # check if all parameters have been provided 860 | check_params(console, {'"--input"': input_, '"--output"': output_}) 861 | # check inclusion or exclusion parameter 862 | subset_dict = select_subset(console, include_, exclude_) 863 | # read input file and parse data 864 | initial_read_count, seq_file = subset_function(console, columns, input_, subset_dict) 865 | # write data to output 866 | write_list_to_output(console, columns, seq_file, output_) 867 | # write log to Terminal window 868 | console.print(f"[cyan]| Results[/] | Written {len(seq_file)} subsetted sequences to {output_} out of {initial_read_count} initial sequences ({round(len(seq_file) / initial_read_count * 100, 2)}%)") 869 | 870 | #################### 871 | # DIVERSITY FIGURE # 872 | #################### 873 | if diversity_figure_: 874 | # print function to console 875 | console.print(f"[cyan]| Function[/] | Generate horizontal bar chart displaying diversity within database") 876 | # check if all parameters have been provided 877 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--tax-level"': tax_level_}) 878 | # read input file and parse data 879 | diversity_seq_dict, diversity_species_dict = parse_diversity(console, columns, input_, tax_level_) 880 | # generate horizontal bar chart 881 | horizontal_bar_chart(diversity_seq_dict, diversity_species_dict, output_) 882 | 883 | ########################## 884 | # 
AMPLICON LENGTH FIGURE # 885 | ########################## 886 | if amplicon_length_figure_: 887 | # print function to console 888 | console.print(f"[cyan]| Function[/] | Generate line graph displaying amplicon length distributions") 889 | # check if all parameters have been provided 890 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--tax-level"': tax_level_}) 891 | # read input file and parse data 892 | amplicon_length_dict = parse_length(console, columns, input_, tax_level_) 893 | # generate line graph 894 | line_graph(amplicon_length_dict, output_) 895 | 896 | ############################### 897 | # DATABASE COMPLETENESS TABLE # 898 | ############################### 899 | if completeness_table_: 900 | # print function to console 901 | console.print(f"[cyan]| Function[/] | Generate table containing barcode availability for taxonomic group") 902 | # check if all parameters have been provided 903 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--names"': names_, '"--nodes"': nodes_, '"--species"': species_}) 904 | # retrieve species of interest information 905 | species_dict = collections.defaultdict(dict) 906 | species_list = retrieve_species(console, columns, species_) 907 | for item in species_list: 908 | species_dict[item]['taxid'] = item 909 | # retrieve taxonomic lineages 910 | input_files = [names_, nodes_] 911 | input_file_size = sum(os.path.getsize(input_file) for input_file in input_files) 912 | with rich.progress.Progress(*columns) as progress_bar: 913 | task = progress_bar.add_task(console = console, description = "[cyan]| NCBI tax to memory[/] |", total=input_file_size) 914 | names_key_tax_number_value_dict, tax_number_key_names_value_dict, synonym_key_dict = names_to_memory(task, progress_bar, names_) 915 | tax_number_key_rank_and_tax_number_up_values_dict = nodes_to_memory(task, progress_bar, nodes_) 916 | seq_input_dict, unresolved_lineage = generate_lineages(console, columns, ranks_, species_dict, {}, names_key_tax_number_value_dict, synonym_key_dict, tax_number_key_rank_and_tax_number_up_values_dict, tax_number_key_names_value_dict) 917 | # retrieve information about potential number of taxa shared with species of interest on genus and family level 918 | table_info_dict = calculate_ncbi_species_genera(console, columns, seq_input_dict, tax_number_key_rank_and_tax_number_up_values_dict) 919 | # retrieve information about number of taxa shared with species of interest on genus and family level in reference database 920 | table_info_dict = calculate_database_species_genera(console, columns, input_, table_info_dict, seq_input_dict) 921 | # write data to output 922 | completeness_table_output(table_info_dict, output_) 923 | 924 | ##################### 925 | # PHYLOGENETIC TREE # 926 | ##################### 927 | if phylogenetic_tree_: 928 | # print function to console 929 | console.print(f"[cyan]| Function[/] | Generate a phylogenetic tree based on barcodes for target species") 930 | # check if all parameters have been provided 931 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--tax-level"': tax_level_, '"--species"': species_}) 932 | # retrieve species information 933 | species_list = retrieve_species(console, columns, species_) 934 | # parse input_ 935 | input_dict = parse_phylo_input(console, columns, input_, tax_level_) 936 | # subset input_dict to only include relevant sequences 937 | subset_dict = subset_phylo_input(console, columns, input_dict, species_list) 938 | # generate intermediary fasta files 
939 | with rich.progress.Progress(*columns) as progress_bar: 940 | task = progress_bar.add_task(console = console, description = "[cyan]| Generate trees[/] |", total=len(subset_dict)) 941 | for target_species in subset_dict: 942 | progress_bar.update(task, advance = 1) 943 | align_input = dict_to_fasta(subset_dict[target_species]) 944 | # align sequences 945 | align_output = align_sequences(align_input) 946 | # generate phylogenetic tree 947 | generate_phylo_tree(align_output, output_, target_species) 948 | # remove intermediary files 949 | os.remove(align_input) 950 | os.remove(align_output) 951 | os.remove(f'{align_input}.dnd') 952 | 953 | ################################### 954 | # AMPLIFICATION EFFICIENCY FIGURE # 955 | ################################### 956 | if amplification_efficiency_figure_: 957 | # print function to console 958 | console.print(f"[cyan]| Function[/] | Generate a bar plot displaying mismatches in the primer-binding regions") 959 | # check if all parameters have been provided 960 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--amplicons"': amplicons_, '"--forward"': forward_, '"--reverse"': reverse_}) 961 | # import data 962 | file_list = [amplicons_, input_] 963 | total_file_size = sum(os.path.getsize(file) for file in file_list) 964 | with rich.progress.Progress(*columns) as progress_bar: 965 | task = progress_bar.add_task(console = console, description = "[cyan]| Import data[/] |", total=total_file_size) 966 | amplicons_dict = amplicon_import(task, progress_bar, amplicons_, tax_group_) 967 | raw_dict = raw_import(task, progress_bar, input_, amplicons_dict) 968 | # extract the primer-binding regions 969 | primer_binding_region_dict = extract_primer_regions(console, columns, amplicons_dict, raw_dict, forward_, reverse_) 970 | # calculate base proportions at each location within the primer-binding regions 971 | forward_position_dict = deconstruct_primer_regions(primer_binding_region_dict, 'forward') 972 | reverse_position_dict = deconstruct_primer_regions(primer_binding_region_dict, 'reverse') 973 | # transform dict to np.array 974 | forward_positions, forward_ordered_counts, forward_bottoms = dict_to_array(forward_position_dict) 975 | reverse_positions, reverse_ordered_counts, reverse_bottoms = dict_to_array(reverse_position_dict) 976 | # parse primer data for plotting 977 | forward_primer_info = parse_primer(forward_) 978 | reverse_primer_info = parse_primer(reverse_) 979 | # generate figure 980 | efficiency_barplot(forward_positions, forward_ordered_counts, forward_bottoms, reverse_positions, reverse_ordered_counts, reverse_bottoms, forward_primer_info, reverse_primer_info, forward_, reverse_, output_) 981 | 982 | ########## 983 | # EXPORT # 984 | ########## 985 | if export_: 986 | # print function to console 987 | console.print(f"[cyan]| Function[/] | Export CRABS database to {export_format_.upper()} format") 988 | # check if all parameters have been provided 989 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--export-format"': export_format_}) 990 | # select format function 991 | if export_format_.upper() == 'IDT-TEXT': 992 | initial_read_count, seq_file = idt_text(console, columns, input_) 993 | write_list_to_output(console, columns, seq_file, output_) 994 | elif export_format_.upper() == 'BLAST-NOTAX': 995 | blast_no_tax(console, columns, input_, output_) 996 | elif export_format_.upper() == 'BLAST-TAX': 997 | blast_tax(console, columns, input_, output_) 998 | else: 999 | output_to_format = 
select_function(export_format_)
            # read input file and parse data
            initial_read_count, seq_file = classifier_format(console, columns, input_, output_to_format)
            # write data to output
            write_list_to_output(console, columns, seq_file, output_)
        # write log to Terminal window
        console.print(f"[cyan]| Results[/] | Written {len(seq_file)} sequences to {output_} out of {initial_read_count} initial sequences ({round(len(seq_file) / initial_read_count * 100, 2)}%)")

################
# EXECUTE CODE #
################
if __name__ == "__main__":
    crabs()

--------------------------------------------------------------------------------
/docker_intro/README.md:
--------------------------------------------------------------------------------

# Getting started using Crabs in Docker

Running your applications with Docker offers many advantages over other approaches, but it can be difficult to get used to. One of the main challenges is that Docker provides an additional layer of abstraction that sits outside your own computer's file structure. While this added layer frees you from the frequent nightmares of software installation and dependencies, it can often trip you up. It is important to remember that to process any of the files on your computer with a Docker application, you need to make those files visible inside the container's file system (by mounting a folder), and then provide a way to get the outputs back into your own computer's file system. Hopefully, the examples below will help make this easier.

Note: the examples below will work on Mac or Linux. We will add examples for Windows systems soon.

## Help command

After installing any software, most of us will try it out with a help command. Here is an example using Crabs on Docker. (All of the following commands assume that you have already pulled the docker image using `docker pull quay.io/swordfish/crabs:1.7.7.0`.)

```
docker run --rm -it \
quay.io/swordfish/crabs:1.7.7.0 \
crabs -h
```

Okay, let's break down that command. The `docker run` command will create a container out of the image that we pulled from the quay.io website. This image can be examined using Docker Desktop. The options after this command are as follows:

`--rm` will automatically remove the container when the command is finished. There are instances when you will want to keep the container running, but we are keeping everything simple for now.

`-it` is two options combined: `-i` keeps the session interactive, and `-t` allocates a *tty* (essentially acting as a pseudo-terminal) for the container. Together, these two options let you use the command as an interactive process, like a shell. The opposite is running a container in the background, as you would for web apps served from Docker.

The next parameter names the image that will be turned into a container, in this case our Crabs image. You have to specify the entire name as it appears above.

After the image name, the next line is just the crabs command. Note: we are splitting our commands into separate lines using the backslash ('\\'). This helps keep the command easy to read, but you could also put everything on one line, as shown below. If you do split your command, just make sure there is no space after the backslash.
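For reference, this is the equivalent single-line version of the same help command (the two forms behave identically):

```
docker run --rm -it quay.io/swordfish/crabs:1.7.7.0 crabs -h
```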
## Actual command

If you ran the help command above and it worked, that is great: you know that the docker image is working. However, we want to run actual commands and process some data, and for that we need to add some more parameters.

Here is a `--download-ncbi` command to download ITS sequences of the fungal genus *Amanita*. This should yield a bit over 6,000 sequences, so it is a good example that should not take too long to run.

The best practice is to first go to the directory where you will run the analyses:

```
cd /Users/fulanotal/analysis/cool_fungi
```

(The folder above will most likely not exist on your computer; you will have to substitute your own folder paths.)

Then run the docker command:

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --download-ncbi \
--database nucleotide \
--query '"Amanita"[Organism] AND Internal Transcribed Spacer[All Fields] AND ("1"[SLEN] : "1000"[SLEN])' \
--output amanita.fasta \
--email fulano.tal@gmail.com \
--batchsize 5000
```

If this worked, you should see a file called 'amanita.fasta' and the original file 'CRABS_ncbi_download.fasta' in your directory.

In addition to the `docker run` and `--rm -it` parameters, we have added more to the docker part of the command.

The `-v` (also `--volume`) parameter will mount a folder on your computer to a folder inside the docker container so it can be accessed. This is organized as host:container, with the absolute path (or volume name) on your computer's side before the colon and the directory inside the docker container after the colon. In the example above we use $(pwd), which is shell notation for 'print working directory'. This is why we suggest you `cd` to your analysis directory first, which makes it easy to just use *pwd* for the host folder.

The next line sets the working directory inside the container. If you do not specify it, the working directory will default to the root of the container: just `/`. If you use the default, you will have to specify where your output files should go. For some Crabs commands, such as `--download-ncbi`, it is important to use this option. You will notice that the `--workdir` option is the same as the destination of the `-v` option. This is because Crabs creates intermediate files for this command, and if the `--workdir` and the `-v` destination (after the colon) are not the same, Crabs will not be able to find those intermediate files. For other commands this is not so critical, and we will show other options below. Because it is needed for some Crabs commands, we use `--workdir` as general practice.

The lines following the image name are just the standard Crabs commands, and these are detailed on the main page.
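Before kicking off a long download, it can be worth sanity-checking the volume mount. The sketch below simply lists the mounted folder from inside the container; this assumes the image ships with standard shell utilities such as `ls`, which is the case for most Linux-based images. You should see the same files as in your host directory:

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
ls /data
```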
## Taxonomy files

If you are like us, you like to keep your folders tidy and your general reference files elsewhere. This is a good idea for the massive NCBI taxonomy files that you need to assign taxonomy to your database sequences. In the following steps, we will go to a different folder, download the taxonomy files there, and then use them back in our analysis folder. This illustrates some good tips for using Docker across multiple directories.

First, get the taxonomy files. We will move to a different directory to keep things simple:

```
cd /Users/fulanotal/taxonomy_files
```

Then, from this directory, we download the taxonomy files:

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --download-taxonomy \
--output ./
```

This should result in three files being downloaded to this folder: *names.dmp*, *nodes.dmp*, and *nucl_gb.accession2taxid*. (Note: the `./` in the `--output` parameter directs the output to the current folder.)

Now, the tricky bit. We want to return to our analysis folder but use these reference files sitting in another part of our computer. To do this, we add a second `-v` parameter, mounting the reference folder to a different directory inside the container than the first mount. Here is how we work this out:

First, return to the working directory:

```
cd /Users/fulanotal/analysis/cool_fungi
```

Now, to keep things clear, we will create a variable with the path to the reference folder:

```
TAX='/Users/fulanotal/taxonomy_files'
```

We can now find the taxonomy of all our sequences, importing the fasta file downloaded earlier to create a Crabs database for use downstream:

```
docker run --rm -it \
-v $(pwd):/data \
-v ${TAX}:/src \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --import \
--import-format NCBI \
--input amanita.fasta \
--output amanita_crabs.txt \
--names /src/names.dmp \
--nodes /src/nodes.dmp \
--acc2tax /src/nucl_gb.accession2taxid \
--ranks 'kingdom;phylum;class;order;family;genus;species'
```

You will notice that the additional `-v` parameter mounts the taxonomy folder onto the `/src` folder inside the container ('mounting' is Docker lingo for making a host folder visible inside the container; nothing is actually copied). For Crabs to find these files, we had to put `/src/` in front of the taxonomy file names within this command. You DO NOT put the path to the files on your computer (e.g., ${TAX}/nodes.dmp), because the process is running inside the Docker container.

## Processing more data

Continuing from our *Amanita* download and import, we can use more or less the same command structure as above.
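Before moving on, a quick sanity check can save headaches: run a plain `ls` inside the container to confirm your mounts look the way you expect. This is a generic Docker trick rather than a Crabs feature, and it assumes the image provides the standard `ls` utility (Linux-based images do):

```
docker run --rm -it \
-v $(pwd):/data \
-v ${TAX}:/src \
quay.io/swordfish/crabs:1.7.7.0 \
ls /data /src
```

You should see your analysis files listed under `/data` and the three taxonomy files under `/src`. As an aside, Docker lets you append `:ro` to a mount (e.g., `-v ${TAX}:/src:ro`) to make it read-only, a nice safeguard for reference files that no command should ever modify.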
**insilico PCR**

Here is an example command to extract just the ITS1 region from our downloaded sequences:

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --in-silico-pcr \
--input amanita_crabs.txt \
--output amanita_crabs_its1.txt \
--forward CTTGGTCATTTAGAGGAAGTAA \
--reverse GCTGCGTTCTTCATCGATGC
```

**Adding a pairwise global alignment step:**

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --pairwise-global-alignment \
--input amanita_crabs.txt \
--amplicons amanita_crabs_its1.txt \
--output amanita_its1_pga.txt \
--forward CTTGGTCATTTAGAGGAAGTAA \
--reverse GCTGCGTTCTTCATCGATGC \
--size-select 600 \
--threads 2 \
--percent-identity 0.9 \
--coverage 90 \
--all-start-positions
```

Note: if you are using a newer Mac with an M1 chip, you might see a warning when running these commands. The commands should still work, but you can eliminate the warning by adding the parameter `--platform linux/amd64` to the command above, before the image name.

From these examples, you should be able to run most of the Crabs commands needed to create your reference database. We will continue to add examples, explanations, and tips to this page over the coming weeks. Stay tuned, and stay in touch.

--------------------------------------------------------------------------------
/figures_readme/amplicon-length-figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/amplicon-length-figure.png
--------------------------------------------------------------------------------
/figures_readme/amplification-efficiency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/amplification-efficiency.png
--------------------------------------------------------------------------------
/figures_readme/crabs_blasttax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_blasttax.png
--------------------------------------------------------------------------------
/figures_readme/crabs_completeness.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_completeness.png
--------------------------------------------------------------------------------
/figures_readme/crabs_cutadapt_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_cutadapt_error.png
--------------------------------------------------------------------------------
/figures_readme/crabs_dereplicate.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_dereplicate.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_bold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_bold.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_mitofish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_mitofish.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_ncbi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_ncbi.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_ncbi_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_ncbi_output.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_taxonomy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_taxonomy.png -------------------------------------------------------------------------------- /figures_readme/crabs_export.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_export.png -------------------------------------------------------------------------------- /figures_readme/crabs_filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_filter.png -------------------------------------------------------------------------------- /figures_readme/crabs_greengenes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_greengenes.png -------------------------------------------------------------------------------- /figures_readme/crabs_help.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_help.png -------------------------------------------------------------------------------- /figures_readme/crabs_import.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_import.png -------------------------------------------------------------------------------- /figures_readme/crabs_insilico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_insilico.png -------------------------------------------------------------------------------- /figures_readme/crabs_merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_merge.png -------------------------------------------------------------------------------- /figures_readme/crabs_midori.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_midori.png -------------------------------------------------------------------------------- /figures_readme/crabs_pga.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_pga.png -------------------------------------------------------------------------------- /figures_readme/crabs_silva.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_silva.png -------------------------------------------------------------------------------- /figures_readme/crabs_subset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_subset.png -------------------------------------------------------------------------------- /figures_readme/diversity-figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/diversity-figure.png -------------------------------------------------------------------------------- /figures_readme/phylo_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/phylo_tree.png -------------------------------------------------------------------------------- /figures_readme/unite_first.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/unite_first.png -------------------------------------------------------------------------------- /figures_readme/unite_second.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/unite_second.png -------------------------------------------------------------------------------- /figures_readme/unite_third.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/unite_third.png -------------------------------------------------------------------------------- /function/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __version__ = "1.7.7" 3 | -------------------------------------------------------------------------------- /function/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/function/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /function/__pycache__/module_1.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/function/__pycache__/module_1.cpython-36.pyc -------------------------------------------------------------------------------- /function/__pycache__/module_3.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/function/__pycache__/module_3.cpython-36.pyc -------------------------------------------------------------------------------- /function/__pycache__/module_5.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/function/__pycache__/module_5.cpython-36.pyc -------------------------------------------------------------------------------- /function/older_versions/crabs_v1.0.0: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | ################################################ 4 | ################ IMPORT MODULES ################ 5 | ################################################ 6 | import argparse 7 | import subprocess as sp 8 | import pandas as pd 9 | import os 10 | import shutil 11 | import collections 12 | import matplotlib 13 | import matplotlib.pyplot as plt 14 | from Bio.Align.Applications import MuscleCommandline 15 | from pathlib import Path 16 | from collections import Counter 17 | from Bio import SeqIO 18 | from Bio import AlignIO 19 | from Bio import Phylo 20 | from Bio.Seq import Seq 21 | from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceMatrix, DistanceTreeConstructor 22 | from function.module_1 import wget_ncbi, esearch_fasta, efetch_seqs_from_webenv, ncbi_formatting, mitofish_download, mitofish_format, embl_download, embl_fasta_format, embl_crabs_format, bold_download, bold_format, check_accession, append_primer_seqs, generate_header, merge_databases 23 | from function.module_3 import tax2dict, get_accession, acc_to_dict, get_lineage, final_lineage_comb 24 | from function.module_5 import split_db_by_taxgroup, num_spec_seq_taxgroup, horizontal_barchart, get_amp_length, amplength_figure, file_dmp_to_dict, species_to_taxid, lineage_retrieval 25 | 26 | ################################################ 27 | ########### MODULE DATABASE DOWNLOAD ########### 28 | ################################################ 29 | 30 | ## function download data from online databases 31 | def db_download(args): 32 | SOURCE = args.source 33 | DATABASE = args.database 34 | QUERY = args.query 35 | OUTPUT = args.output 36 | ORIG = args.orig 37 | EMAIL = args.email 38 | BATCHSIZE = args.batchsize 39 | 40 | ## download taxonomy data from NCBI 41 | if SOURCE == 'taxonomy': 42 | print('\ndownloading taxonomy information') 43 | url_acc2taxid = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz' 44 | url_taxdump = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' 45 | results = sp.run(['wget', url_acc2taxid]) 46 | results = sp.run(['gunzip', 'nucl_gb.accession2taxid.gz']) 47 | results = sp.run(['wget', url_taxdump]) 48 | results = sp.run(['tar', '-zxvf', 'taxdump.tar.gz']) 49 | print('removing intermediary files\n') 50 | files_to_remove = ['citations.dmp', 'delnodes.dmp', 'division.dmp', 'gencode.dmp', 'merged.dmp', 'gc.prt', 'readme.txt', 'taxdump.tar.gz'] 51 | for file in files_to_remove: 52 | os.remove(file) 53 | 54 | ## download sequencing data from NCBI 55 | elif SOURCE == 'ncbi': 56 | if all(v is not None for v in [DATABASE, QUERY, OUTPUT, EMAIL]): 57 | print('\ndownloading sequences from NCBI') 58 | ncbi_download = wget_ncbi(QUERY, DATABASE, EMAIL, BATCHSIZE) 59 | print('formatting the downloaded sequencing file to CRABS format') 60 | format_seqs = ncbi_formatting(OUTPUT, ORIG) 61 | print(f'written {format_seqs} sequences to {OUTPUT}\n') 62 | else: 63 | print('\nnot all parameters have an input value\n') 64 | 65 | ## download sequencing data from EMBL 66 | elif SOURCE == 'embl': 67 | if all(v is not None for v in [DATABASE, OUTPUT]): 68 | print('\ndownloading sequences from EMBL') 69 | dl_files = embl_download(DATABASE) 70 | fasta_file = embl_fasta_format(dl_files) 71 | print(f'formatting intermediary file to CRABS format') 72 | crabs_file = embl_crabs_format(fasta_file, OUTPUT, ORIG) 73 | print(f'written {crabs_file} sequences to {OUTPUT}\n') 74 | else: 75 | print('\nnot all parameters have an input value\n') 76 | 77 | ## download 
sequencing data from MitoFish 78 | elif SOURCE == 'mitofish': 79 | if all(v is not None for v in [OUTPUT]): 80 | print('\ndownloading sequences from the MitoFish database') 81 | url = 'http://mitofish.aori.u-tokyo.ac.jp/files/complete_partial_mitogenomes.zip' 82 | dl_file = mitofish_download(url) 83 | print(f'formatting {dl_file} to CRABS format') 84 | mitoformat = mitofish_format(dl_file, OUTPUT, ORIG) 85 | print(f'written {mitoformat} sequences to {OUTPUT}\n') 86 | else: 87 | print('\nnot all parameters have an input value\n') 88 | 89 | ## download sequencing data from BOLD 90 | elif SOURCE == 'bold': 91 | if all(v is not None for v in [DATABASE, OUTPUT]): 92 | print('\ndownloading sequences from BOLD') 93 | bold_file = bold_download(DATABASE) 94 | print(f'downloaded {bold_file} sequences from BOLD') 95 | print(f'formatting {bold_file} sequences to CRABS format') 96 | boldformat = bold_format(OUTPUT, ORIG) 97 | print(f'written {boldformat} sequences to {OUTPUT}\n') 98 | else: 99 | print('\nnot all parameters have an input value\n') 100 | 101 | ## function: import existing or custom database 102 | def db_import(args): 103 | INPUT = args.input 104 | HEADER = args.header 105 | OUTPUT = args.output 106 | FWD = args.fwd 107 | REV = args.rev 108 | DELIM = args.delim 109 | 110 | ## process file with accession number in header 111 | if HEADER == 'accession': 112 | if all(v is not None for v in [INPUT, OUTPUT, DELIM]): 113 | print(f'\nchecking correct formatting of accession numbers in {INPUT}') 114 | incorrect_accession = check_accession(INPUT, OUTPUT, DELIM) 115 | if len(incorrect_accession) != 0: 116 | print('found incorrectly formatted accession numbers, please check file: "incorrect_accession_numbers.txt"') 117 | with open('incorrect_accession_numbers.txt', 'w') as fout: 118 | for item in incorrect_accession: 119 | fout.write(item + '\n') 120 | if all(v is not None for v in [FWD, REV]): 121 | print(f'appending primer sequences to {OUTPUT}') 122 | numseq = append_primer_seqs(OUTPUT, FWD, REV) 123 | print(f'added primers to {numseq} sequences in {OUTPUT}\n') 124 | else: 125 | print('') 126 | else: 127 | print('\nnot all parameters have an input value\n') 128 | 129 | ## process file with species info in header 130 | elif HEADER == 'species': 131 | if all(v is not None for v in [INPUT, OUTPUT, DELIM]): 132 | print(f'\ngenerating new sequence headers for {INPUT}') 133 | num_header = generate_header(INPUT, OUTPUT, DELIM) 134 | print(f'generated {num_header} headers for {OUTPUT}') 135 | if all(v is not None for v in [FWD, REV]): 136 | print(f'appending primer sequences to {OUTPUT}') 137 | numseq = append_primer_seqs(OUTPUT, FWD, REV) 138 | print(f'added primers to {numseq} sequences in {OUTPUT}\n') 139 | else: 140 | print('') 141 | else: 142 | print('\nnot all parameters have an input value\n') 143 | else: 144 | print('\nplease specify header information: "accession" and "species"\n') 145 | 146 | ## function: merge multiple databases 147 | def db_merge(args): 148 | INPUT = args.input 149 | UNIQ = args.uniq 150 | OUTPUT = args.output 151 | 152 | if UNIQ != '': 153 | print('\nmerging all fasta files and discarding duplicate sequence headers') 154 | num_uniq = merge_databases(INPUT, OUTPUT) 155 | print(f'written {num_uniq} sequences to {OUTPUT}\n') 156 | else: 157 | print('\nmerging all fasta files and keeping duplicate sequence headers') 158 | with open(OUTPUT, 'w') as fout: 159 | for file in INPUT: 160 | num = len(list(SeqIO.parse(file, 'fasta'))) 161 | print(f'found {num} sequences in {file}') 162 
| with open(file, 'r') as fin: 163 | for line in fin: 164 | fout.write(line) 165 | num = len(list(SeqIO.parse(OUTPUT, 'fasta'))) 166 | print(f'written {num} sequences to {OUTPUT}\n') 167 | 168 | 169 | ################################################ 170 | ############# MODULE IN SILICO PCR ############# 171 | ################################################ 172 | 173 | ## function: in silico PCR 174 | def insilico_pcr(args): 175 | FWD = args.fwd 176 | REV = args.rev 177 | INPUT = args.input 178 | ERROR = args.error 179 | OUTPUT = args.output 180 | 181 | ## reverse complement reverse primer sequence 182 | REV_CORRECT = str(Seq(REV).reverse_complement()) 183 | 184 | ## setting variable names using the info from user input 185 | TRIMMED_INIT = 'init_trimmed.fasta' 186 | UNTRIMMED_INIT = 'init_untrimmed.fasta' 187 | REVCOMP_UNTRIMMED_INIT = 'revcomp_untrimmed.fasta' 188 | TRIMMED_REVCOMP = 'revcomp_trimmed.fasta' 189 | UNTRIMMED_REVCOMP = 'untrimmed_revcomp.fasta' 190 | 191 | OVERLAP = str(min([len(FWD), len(REV_CORRECT)])) 192 | ADAPTER = FWD + '...' + REV_CORRECT 193 | 194 | ## run cutadapt on downloaded fasta file 195 | count_init = len(list(SeqIO.parse(INPUT, 'fasta'))) 196 | print('\nrunning in silico PCR on fasta file containing {} sequences'.format(count_init)) 197 | cmnd_cutadapt_1 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_INIT, INPUT, '--untrimmed-output', UNTRIMMED_INIT, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 198 | sp.call(cmnd_cutadapt_1) 199 | count_trimmed_init = len(list(SeqIO.parse(TRIMMED_INIT, 'fasta'))) 200 | print('found primers in {} sequences'.format(count_trimmed_init)) 201 | 202 | ## run vsearch to reverse complement untrimmed sequences 203 | if count_trimmed_init < count_init: 204 | count_untrimmed_init = len(list(SeqIO.parse(UNTRIMMED_INIT, 'fasta'))) 205 | print('reverse complementing {} untrimmed sequences'.format(count_untrimmed_init)) 206 | cmnd_vsearch_revcomp = ['vsearch', '--fastx_revcomp', UNTRIMMED_INIT, '--fastaout', REVCOMP_UNTRIMMED_INIT, '--quiet'] 207 | sp.call(cmnd_vsearch_revcomp) 208 | 209 | ## run cutadapt on reverse complemented untrimmed sequences 210 | print('running in silico PCR on {} reverse complemented untrimmed sequences'.format(count_untrimmed_init)) 211 | cmnd_cutadapt_2 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_REVCOMP, REVCOMP_UNTRIMMED_INIT, '--untrimmed-output', UNTRIMMED_REVCOMP, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 212 | sp.call(cmnd_cutadapt_2) 213 | count_trimmed_second = len(list(SeqIO.parse(TRIMMED_REVCOMP, 'fasta'))) 214 | print('found primers in {} sequences\n'.format(count_trimmed_second)) 215 | 216 | ## concatenate both trimmed files 217 | with open(OUTPUT, 'wb') as wfd: 218 | for f in [TRIMMED_INIT, TRIMMED_REVCOMP]: 219 | with open(f, 'rb') as fd: 220 | shutil.copyfileobj(fd, wfd) 221 | 222 | ## remove intermediary files 223 | files = [TRIMMED_INIT, UNTRIMMED_INIT, REVCOMP_UNTRIMMED_INIT, TRIMMED_REVCOMP, UNTRIMMED_REVCOMP] 224 | for file in files: 225 | os.remove(file) 226 | 227 | ## don't run reverse complement when initial in silico PCR trims all sequences 228 | else: 229 | print('all sequences trimmed, no reverse complement step\n') 230 | results = sp.run(['mv', TRIMMED_INIT, OUTPUT]) 231 | os.remove(UNTRIMMED_INIT) 232 | 233 | ################################################ 234 | ########## MODULE TAXONOMY ASSIGNMENT ########## 235 | ################################################ 236 | ## function: get taxonomic lineage for each accession number 237 | def 
assign_tax(args): 238 | INPUT = args.input 239 | OUTPUT = args.output 240 | ACC2TAX = args.acc2tax 241 | TAXID = args.taxid 242 | NAME = args.name 243 | 244 | ## process initial files 245 | print(f'\nretrieving accession numbers from {INPUT}') 246 | accession = get_accession(INPUT) 247 | print(f'found {len(accession)} accession numbers in {INPUT}') 248 | acc2tax, taxid, name, no_acc = tax2dict(ACC2TAX, TAXID, NAME, accession) 249 | print(f'processed {len(acc2tax)} entries in {ACC2TAX}') 250 | print(f'processed {len(taxid)} entries in {TAXID}') 251 | print(f'processed {len(name)} entries in {NAME}') 252 | 253 | 254 | ## get taxonomic lineage 255 | print(f'assigning a tax ID number to {len(accession)} accession numbers from {INPUT}') 256 | acc_taxid_dict, taxid_list = acc_to_dict(accession, acc2tax, no_acc) 257 | print(f'{len(acc_taxid_dict)} accession numbers resulted in {len(taxid_list)} unique tax ID numbers') 258 | print(f'generating taxonomic lineages for {len(taxid_list)} tax ID numbers') 259 | lineage = get_lineage(taxid_list, taxid, name) 260 | print(f'assigning a taxonomic lineage to {len(accession)} accession numbers') 261 | final_lineage = final_lineage_comb(acc_taxid_dict, lineage, INPUT, OUTPUT) 262 | print(f'written {len(final_lineage)} entries to {OUTPUT}\n') 263 | 264 | ################################################ 265 | ########### MODULE DATABASE CLEAN-UP ########### 266 | ################################################ 267 | 268 | ## function: dereplicating the database 269 | def dereplicate(args): 270 | INPUT = args.input 271 | OUTPUT = args.output 272 | METHOD = args.method 273 | 274 | ## dereplicate strict (only unique sequences) 275 | if METHOD == 'strict': 276 | print(f'\nstrict dereplication of {INPUT}, only keeping unique sequences') 277 | uniq_seqs = {} 278 | uniq_line = [] 279 | count = 0 280 | added = 0 281 | with open(INPUT, 'r') as file_in: 282 | for line in file_in: 283 | count = count + 1 284 | lines = line.rstrip('\n') 285 | seq = lines.split('\t')[9] 286 | if seq not in uniq_seqs: 287 | added = added + 1 288 | uniq_seqs[seq] = seq 289 | uniq_line.append(line) 290 | print(f'found {count} sequences in {INPUT}') 291 | print(f'written {added} sequences to {OUTPUT}\n') 292 | with open(OUTPUT, 'w') as file_out: 293 | for line in uniq_line: 294 | file_out.write(line) 295 | 296 | ## dereplicate single species (one sequence per species) 297 | elif METHOD == 'single_species': 298 | print(f'\ndereplicating {INPUT}, only keeping a single sequence per species') 299 | uniq_spec = {} 300 | uniq_line = [] 301 | count = 0 302 | added = 0 303 | with open(INPUT, 'r') as file_in: 304 | for line in file_in: 305 | count = count + 1 306 | lines = line.rstrip('\n') 307 | species = lines.split('\t')[8].split(',')[2] 308 | if species not in uniq_spec: 309 | added = added + 1 310 | uniq_spec[species] = species 311 | uniq_line.append(line) 312 | print(f'found {count} sequences in {INPUT}') 313 | print(f'written {added} sequences to {OUTPUT}\n') 314 | with open(OUTPUT, 'w') as file_out: 315 | for line in uniq_line: 316 | file_out.write(line) 317 | 318 | ## dereplicate unique species (all unique sequences per species) 319 | elif METHOD == 'uniq_species': 320 | print(f'\ndereplicating {INPUT}, keeping all unique sequences per species') 321 | mydict = collections.defaultdict(list) 322 | count = 0 323 | added = 0 324 | with open(INPUT, 'r') as file_in: 325 | for line in file_in: 326 | count = count + 1 327 | lines = line.rstrip('\n') 328 | spec = lines.split('\t')[8].split(',')[2] 329 
| seq = lines.split('\t')[9] 330 | line_id = lines 331 | seq_dicts = [] 332 | for item in mydict[spec]: 333 | seq_dict = item.rsplit('\t', 1)[1] 334 | seq_dicts.append(seq_dict) 335 | if seq not in seq_dicts: 336 | added = added + 1 337 | mydict[spec].append(line_id) 338 | print(f'found {count} sequences in {INPUT}') 339 | print(f'written {added} sequences to {OUTPUT}\n') 340 | with open(OUTPUT, 'w') as file_out: 341 | for k, v in mydict.items(): 342 | for i in v: 343 | file_out.write(i + '\n') 344 | 345 | ## dereplicate concensus species (generate concensus sequence for each species) 346 | elif METHOD == 'consensus': 347 | print('still to add...') 348 | 349 | ## unknown method specified 350 | else: 351 | print('\nplease specify one of the accepted dereplication methods: "strict", "single_species", "uniq_species"\n') 352 | 353 | ## function: sequence cleanup 354 | def db_filter(args): 355 | MINLEN = args.minlen 356 | MAXLEN = args.maxlen 357 | MAXNS = args.maxns 358 | INPUT = args.input 359 | OUTPUT = args.output 360 | DISCARD = args.discard 361 | ENV = args.env 362 | SPEC = args.spec 363 | NANS = args.nans 364 | 365 | ## set filtering parameters 366 | print(f'\nfiltering parameters:\nremoving sequences shorter than {MINLEN} and longer than {MAXLEN}\nremoving sequences containing more than {MAXNS} "N"') 367 | if ENV == 'no': 368 | env = 100 369 | print('keeping environmental sequences') 370 | else: 371 | env = 0 372 | print('removing environmental sequences') 373 | if SPEC == 'no': 374 | print('keeping sequences unclassified at species level') 375 | spec = 100 376 | else: 377 | spec = 0 378 | print('removing sequences without a species ID') 379 | if NANS == 'no': 380 | nans = 100 381 | print('keeping sequences with missing taxonomic information') 382 | else: 383 | nans = int(NANS) 384 | print(f'removing sequences with missing information for more than {NANS} taxonomic levels') 385 | 386 | ## read the input file and clean up given the parameters 387 | clean_db = [] 388 | discard_db = [] 389 | count = 0 390 | count_clean = 0 391 | with open(INPUT, 'r') as file_in: 392 | for line in file_in: 393 | count = count + 1 394 | lines = line.rstrip('\n') 395 | upline = lines.upper() 396 | seq = upline.rsplit('\t', 1)[1] 397 | species = upline.split('\t')[8] 398 | if len(seq) >= MINLEN and len(seq) <= MAXLEN and seq.count('N') <= MAXNS and species.count('ENVIRONMENTAL') <= env and species.count('_SP.') <= spec and upline.count(',NAN') <= nans: 399 | count_clean = count_clean + 1 400 | clean_db.append(line) 401 | else: 402 | discard_db.append(line) 403 | 404 | ## write cleaned database to file 405 | cleaned = count - count_clean 406 | print(f'found {count} number of sequences in {INPUT}') 407 | print(f'removed {cleaned} sequences during filtering') 408 | print(f'written {count_clean} sequences to {OUTPUT}\n') 409 | with open(OUTPUT, 'w') as file_out: 410 | for item in clean_db: 411 | file_out.write(item) 412 | if DISCARD != 'no': 413 | with open(DISCARD, 'w') as dis_out: 414 | for item in discard_db: 415 | dis_out.write(item) 416 | 417 | ################################################ 418 | ############# MODULE VISUALISATION ############# 419 | ################################################ 420 | 421 | ## figure output 422 | def visualization(args): 423 | INPUT = args.input 424 | OUTPUT = args.output 425 | METHOD = args.method 426 | LEVEL = args.level 427 | SPECIES = args.species 428 | TAXID = args.taxid 429 | NAME = args.name 430 | 431 | ## horizontal barchart 432 | if METHOD == 'diversity': 433 | 
tax_group_list, uniq_tax_group_list, species_dict = split_db_by_taxgroup(INPUT, LEVEL) 434 | sequence_counter = Counter(tax_group_list) 435 | list_info_dict = num_spec_seq_taxgroup(uniq_tax_group_list, species_dict, sequence_counter) 436 | sorted_info = sorted(list_info_dict, key = lambda i: (i['sequence'])) 437 | figure = horizontal_barchart(sorted_info) 438 | 439 | ## length distribution 440 | elif METHOD == 'amplicon_length': 441 | amp_length_dict = get_amp_length(INPUT, LEVEL) 442 | figure = amplength_figure(amp_length_dict) 443 | 444 | ## completeness table 445 | elif METHOD == 'db_completeness': 446 | 447 | ## read in the text file with species names 448 | species_list = [] 449 | with open(SPECIES, 'r') as species_file: 450 | for line in species_file: 451 | species = line.rstrip('\n').replace(' ', '_') 452 | species_list.append(species) 453 | print(f'\nfound {len(species_list)} species of interest in {SPECIES}: {species_list}') 454 | 455 | ## retrieve taxonomic lineage 456 | print(f'generating taxonomic lineage for {len(species_list)} species') 457 | name, node, taxid = file_dmp_to_dict(NAME, TAXID) 458 | species_taxid_dict, taxid_list = species_to_taxid(species_list, taxid) 459 | lineage = lineage_retrieval(taxid_list, node, name) 460 | final_dict = collections.defaultdict(list) 461 | for k, v in species_taxid_dict.items(): 462 | final_dict[k] = lineage[v] 463 | print(f'gathering data for {len(final_dict)} species\n') 464 | 465 | ## retrieve information about potential number of taxa shared with species of interest on genus and family level based on NCBI taxonomy files 466 | table_info_dict = collections.defaultdict(dict) 467 | for k, v in species_taxid_dict.items(): 468 | species = k 469 | genus_count = 0 470 | family_count = 0 471 | ## find genus taxids 472 | if v in node: 473 | genus = node[v][1] 474 | ## count number of species in genus 475 | for k, v in node.items(): 476 | if v[1] == genus and v[0] == 'species': 477 | genus_count = genus_count + 1 478 | ## find family taxids 479 | if genus in node: 480 | family = node[genus][1] 481 | ## count number of species in family 482 | for k, v in node.items(): 483 | if v[1] == family and v[0] == 'genus': 484 | genus = k 485 | for key, value in node.items(): 486 | if value[1] == genus and value[0] == 'species': 487 | family_count = family_count + 1 488 | table_info_dict[species] = {'species' : species, 'genus_num_ncbi' : genus_count, 'family_num_ncbi' : family_count} 489 | 490 | ## retrieve information about number of taxa shared with species of interest on genus and family level in reference database 491 | for k, v in final_dict.items(): 492 | species = k 493 | genus = v[5] 494 | family = v[4] 495 | with open(INPUT, 'r') as file_in: 496 | spec_db_count = [] 497 | genus_db_count = [] 498 | family_db_count = [] 499 | for line in file_in: 500 | spec_db = line.split('\t')[8].split(',')[2] 501 | genus_db = line.split('\t')[7].split(',')[2] 502 | family_db = line.split('\t')[6].split(',')[2] 503 | if spec_db == species: 504 | if spec_db not in spec_db_count: 505 | spec_db_count.append(spec_db) 506 | if genus_db == genus: 507 | if spec_db not in genus_db_count: 508 | genus_db_count.append(spec_db) 509 | if family_db == family: 510 | if spec_db not in family_db_count: 511 | family_db_count.append(spec_db) 512 | for k, v in table_info_dict.items(): 513 | if k == species: 514 | v['species_in_ref_DB'] = len(spec_db_count) 515 | v['genus_num_ref_DB'] = len(genus_db_count) 516 | v['family_num_ref_DB'] = len(family_db_count) 517 | 
v['genus_list_ref_DB'] = genus_db_count 518 | v['family_list_ref_DB'] = family_db_count 519 | df = pd.DataFrame.from_dict(table_info_dict, orient = 'index') 520 | df['Completeness_genus'] = df['genus_num_ref_DB'] / df['genus_num_ncbi'] * 100 521 | df['Completeness_family'] = df['family_num_ref_DB'] / df['family_num_ncbi'] * 100 522 | df = df[['species', 'species_in_ref_DB', 'genus_num_ref_DB', 'genus_num_ncbi', 'Completeness_genus', 'family_num_ref_DB', 'family_num_ncbi', 'Completeness_family', 'genus_list_ref_DB', 'family_list_ref_DB']] 523 | df.to_csv(OUTPUT, sep = '\t', index = None) 524 | 525 | ## phylogenetic tree 526 | elif METHOD == 'phylo': 527 | ## read in the text file with species names 528 | species_list = [] 529 | with open(SPECIES, 'r') as species_file: 530 | for line in species_file: 531 | species = line.rstrip('\n').replace(' ', '_') 532 | species_list.append(species) 533 | print(f'\nfound {len(species_list)} species of interest in {SPECIES}: {species_list}') 534 | 535 | ## retrieve taxonomic lineage 536 | print(f'generating taxonomic lineage for {len(species_list)} species') 537 | name, node, taxid = file_dmp_to_dict(NAME, TAXID) 538 | species_taxid_dict, taxid_list = species_to_taxid(species_list, taxid) 539 | lineage = lineage_retrieval(taxid_list, node, name) 540 | final_dict = collections.defaultdict(list) 541 | for k, v in species_taxid_dict.items(): 542 | final_dict[k] = lineage[v] 543 | print(f'gathering data for {len(final_dict)} species') 544 | 545 | ## gather sequences from database that share taxonomic rank 546 | ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] 547 | count = 0 548 | for item in ranks: 549 | if item == LEVEL: 550 | break 551 | else: 552 | count = count + 1 553 | for k, v in final_dict.items(): 554 | species = k 555 | taxrank = v[count] 556 | species_file = [] 557 | try: 558 | os.remove(f'{species}_phylo.fasta') 559 | except OSError: 560 | pass 561 | with open(INPUT, 'r') as file_in: 562 | for line in file_in: 563 | rank = line.split('\t')[count + 2].split(',')[2] 564 | #print(rank) 565 | if rank == taxrank: 566 | species_file.append(line) 567 | for item in species_file: 568 | if len(species_file) < 2: 569 | print(f'only {len(species_file)} sequence in database that shares the {LEVEL} taxonomic rank with {species}, omitted from phylogenetic analysis.') 570 | elif len(species_file) > 100: 571 | print(f'{len(species_file)} sequences in database that share the {LEVEL} taxonomic rank with {species}, omitted from phylogenetic analysis') 572 | else: 573 | header = '>' + item.split('\t')[0] + '_' + item.split('\t')[8].split(',')[2] 574 | seq = item.rsplit('\t', 1)[1] 575 | with open(f'{species}_phylo.fasta', 'a') as file_out: 576 | file_out.write(header + '\n') 577 | file_out.write(seq) 578 | 579 | for species in species_list: 580 | my_file = Path(f'{species}_phylo.fasta') 581 | if my_file.is_file(): 582 | print(f'generating phylogenetic tree for {species}') 583 | muscle_cline = MuscleCommandline(input = my_file, out = f'{species}_align.clw', diags = True, maxiters = 1, log = f'{species}_align_log.txt', clw = True) 584 | muscle_cline() 585 | with open(f'{species}_align.clw', 'r') as aln: 586 | alignment = AlignIO.read(aln, 'clustal') 587 | calculator = DistanceCalculator('identity') 588 | Distance_matrix = calculator.get_distance(alignment) 589 | constructor = DistanceTreeConstructor(calculator, 'nj') 590 | tree = constructor.build_tree(alignment) 591 | fig = plt.figure(figsize = (25,15), dpi = 100) 592 | 
matplotlib.rc('font', size=12) 593 | matplotlib.rc('xtick', labelsize=10) 594 | matplotlib.rc('ytick', labelsize=10) 595 | axes = fig.add_subplot(1, 1, 1) 596 | Phylo.draw(tree, axes=axes, do_show = False) 597 | fig.savefig(f'{species}_tree_figure.pdf') 598 | print() 599 | 600 | ## incorrect parameter 601 | else: 602 | print('\nplease specify method of visualization: "diversity", "amplicon_length", "db_completeness", "phylo"\n') 603 | 604 | ## format the taxonomic lineage 605 | def tax_format(args): 606 | INPUT = args.input 607 | OUTPUT = args.output 608 | FORMAT = args.format 609 | 610 | ## format database to sintax 611 | if FORMAT == 'sintax': 612 | print(f'\nformatting {INPUT} to sintax format\n') 613 | with open(OUTPUT, 'w') as f_out: 614 | with open(INPUT, 'r') as f_in: 615 | for line in f_in: 616 | line = line.rstrip('\n') 617 | sintax = '>' + line.split('\t')[0] + ';tax=d:' + line.split('\t')[2].split(',')[2] + ',p:' + line.split('\t')[3].split(',')[2] + ',c:' + line.split('\t')[4].split(',')[2] + ',o:' + line.split('\t')[5].split(',')[2] + ',f:' + line.split('\t')[6].split(',')[2] + ',g:' + line.split('\t')[7].split(',')[2] + ',s:' + line.split('\t')[8].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 618 | f_out.write(sintax) 619 | 620 | ## format database to RDP 621 | elif FORMAT == 'rdp': 622 | print(f'\nformatting {INPUT} to RDP format\n') 623 | with open(OUTPUT, 'w') as f_out: 624 | with open(INPUT, 'r') as f_in: 625 | for line in f_in: 626 | line = line.rstrip('\n') 627 | rdp = '>' + line.split('\t')[0] + '\t' + 'root;' + line.split('\t')[2].split(',')[2] + ';' + line.split('\t')[3].split(',')[2] + ';' + line.split('\t')[4].split(',')[2] + ';' + line.split('\t')[5].split(',')[2] + ';' + line.split('\t')[6].split(',')[2] + ';' + line.split('\t')[7].split(',')[2] + ';' + line.split('\t')[8].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 628 | f_out.write(rdp) 629 | 630 | ## format database to QIIf 631 | elif FORMAT == 'qiif': 632 | print(f'\nformatting {INPUT} to QIIf format\n') 633 | fasta_f = OUTPUT + '.fasta' 634 | txt_f = OUTPUT + '.txt' 635 | with open(fasta_f, 'w') as f_out: 636 | with open(INPUT, 'r') as f_in: 637 | for line in f_in: 638 | line = line.rstrip('\n') 639 | fasta = '>' + line.split('\t')[0] + '\n' + line.split('\t')[9] + '\n' 640 | f_out.write(fasta) 641 | with open(txt_f, 'w') as f_out: 642 | with open(INPUT, 'r') as f_in: 643 | for line in f_in: 644 | tax = line.split('\t')[0] + '\t' + 'k__' + line.split('\t')[2].split(',')[2] + ';p__' + line.split('\t')[3].split(',')[2] + ';c__' + line.split('\t')[4].split(',')[2] + ';o__' + line.split('\t')[5].split(',')[2] + ';f__' + line.split('\t')[6].split(',')[2] + ';g__' + line.split('\t')[7].split(',')[2] + ';s__' + line.split('\t')[8].split(',')[2] + '\n' 645 | f_out.write(tax) 646 | 647 | ## format database to QIIz 648 | elif FORMAT == 'qiiz': 649 | print(f'\nformatting {INPUT} to QIIz format') 650 | print('still to add, not sure how this looks like') 651 | 652 | ## format database to DAD 653 | elif FORMAT == 'dad': 654 | print(f'\nformatting {INPUT} to DAD format\n') 655 | with open(OUTPUT, 'w') as f_out: 656 | with open(INPUT, 'r') as f_in: 657 | for line in f_in: 658 | line = line.rstrip('\n') 659 | dad = '>' + line.split('\t')[2].split(',')[2] + ';' + line.split('\t')[3].split(',')[2] + ';' + line.split('\t')[4].split(',')[2] + ';' + line.split('\t')[5].split(',')[2] + ';' + line.split('\t')[6].split(',')[2] + ';' + line.split('\t')[7].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 660 | 
f_out.write(dad) 661 | 662 | ## format database to DADs 663 | elif FORMAT == 'dads': 664 | print(f'\nformatting {INPUT} to DADs format\n') 665 | with open(OUTPUT, 'w') as f_out: 666 | with open(INPUT, 'r') as f_in: 667 | for line in f_in: 668 | line = line.rstrip('\n') 669 | dads = '>' + line.split('\t')[0] + ' ' + line.split('\t')[7].split(',')[2] + ' ' + line.split('\t')[8].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 670 | f_out.write(dads) 671 | 672 | ## format database to IDT 673 | elif FORMAT == 'idt': 674 | print(f'\nformatting {INPUT} to IDT format\n') 675 | with open(OUTPUT, 'w') as f_out: 676 | with open(INPUT, 'r') as f_in: 677 | for line in f_in: 678 | line = line.rstrip('\n') 679 | idt = '>' + line.split('\t')[2].split(',')[2] + ';' + line.split('\t')[3].split(',')[2] + ';' + line.split('\t')[4].split(',')[2] + ';' + line.split('\t')[5].split(',')[2] + ';' + line.split('\t')[6].split(',')[2] + ';' + line.split('\t')[7].split(',')[2] + ';' + line.split('\t')[8].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 680 | f_out.write(idt) 681 | 682 | ## unknown format specified 683 | else: 684 | print('\nplease specify one of the accepted formats: "sintax", "rdp", "qiif", "qiiz", "dad", "dads", "idt"\n') 685 | 686 | ################################################ 687 | ################### ARGPARSE ################### 688 | ################################################ 689 | def main(): 690 | parser = argparse.ArgumentParser(description = 'creating a curated reference database') 691 | subparser = parser.add_subparsers() 692 | 693 | db_download_parser = subparser.add_parser('db_download', description = 'downloading sequence data from online databases') 694 | db_download_parser.set_defaults(func = db_download) 695 | db_download_parser.add_argument('-s', '--source', help = 'specify online database used to download sequences. Currently supported options are: (1) ncbi, (2) embl, (3) mitofish, (4) bold, (5) taxonomy', dest = 'source', type = str, required = True) 696 | db_download_parser.add_argument('-db', '--database', help = 'specific database used to download sequences. Example NCBI: nucleotide. Example EMBL: mam*. Example BOLD: Actinopterygii', dest = 'database', type = str) 697 | db_download_parser.add_argument('-q', '--query', help = 'NCBI query search to limit portion of database to be downloaded. Example: "16S[All Fields] AND ("1"[SLEN] : "50000"[SLEN])"', dest = 'query', type = str) 698 | db_download_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str) 699 | db_download_parser.add_argument('-k', '--keep_original', help = 'keep original downloaded file, default = "no"', dest = 'orig', type = str, default = 'no') 700 | db_download_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str) 701 | db_download_parser.add_argument('-b', '--batchsize', help = 'number of sequences downloaded from NCBI per iteration. 
Default = 5000', dest = 'batchsize', type = int, default = 5000) 702 | 703 | db_import_parser = subparser.add_parser('db_import', description = 'import existing or curated database') 704 | db_import_parser.set_defaults(func = db_import) 705 | db_import_parser.add_argument('-i', '--input', help = 'input database filename', dest = 'input', type = str, required = True) 706 | db_import_parser.add_argument('-s', '--seq_header', help = 'information provided in sequence header: "accession" or "species"', dest = 'header', type = str, required = True) 707 | db_import_parser.add_argument('-o', '--output', help = 'output file name option', dest = 'output', type = str, required = True) 708 | db_import_parser.add_argument('-f', '--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str) 709 | db_import_parser.add_argument('-r', '--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str) 710 | db_import_parser.add_argument('-d', '--delim', help = 'delimiter specifying species or accession', dest = 'delim', type = str, required = True) 711 | 712 | db_merge_parser = subparser.add_parser('db_merge', description = 'merge multiple databases') 713 | db_merge_parser.set_defaults(func = db_merge) 714 | db_merge_parser.add_argument('-i', '--input', nargs = '+', help = 'list of files to be merged', dest = 'input', required = True) 715 | db_merge_parser.add_argument('-u', '--uniq', help = 'keep only unique accession numbers', dest = 'uniq', type = str, default = '') 716 | db_merge_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 717 | 718 | in_silico_pcr_parser = subparser.add_parser('insilico_pcr', description = 'curating the downloaded reference sequences with an in silico PCR') 719 | in_silico_pcr_parser.set_defaults(func = insilico_pcr) 720 | in_silico_pcr_parser.add_argument('-f', '--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str, required = True) 721 | in_silico_pcr_parser.add_argument('-r', '--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str, required = True) 722 | in_silico_pcr_parser.add_argument('-i', '--input', help = 'input filename', dest = 'input', type = str, required = True) 723 | in_silico_pcr_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 724 | in_silico_pcr_parser.add_argument('-e', '--error', help = 'number of errors allowed in primer-binding site. 
Default = 4.5', dest = 'error', type = str, default = '4.5') 725 | 726 | ref_database_parser = subparser.add_parser('assign_tax', description = 'creating the reference database with taxonomic information') 727 | ref_database_parser.set_defaults(func = assign_tax) 728 | ref_database_parser.add_argument('-i', '--input', help = 'input file containing the curated fasta sequences after in silico PCR', dest = 'input', type = str, required = True) 729 | ref_database_parser.add_argument('-o', '--output', help = 'curated reference database output file', dest = 'output', type = str, required = True) 730 | ref_database_parser.add_argument('-a', '--acc2tax', help = 'accession to taxid file name', dest = 'acc2tax', type = str, required = True) 731 | ref_database_parser.add_argument('-t', '--taxid', help = 'taxid file name', dest = 'taxid', type = str, required = True) 732 | ref_database_parser.add_argument('-n', '--name', help = 'phylogeny file name', dest = 'name', type = str, required = True) 733 | 734 | dereplication_parser = subparser.add_parser('dereplicate', description = 'dereplicating the database') 735 | dereplication_parser.set_defaults(func = dereplicate) 736 | dereplication_parser.add_argument('-i', '--input', help = 'filename of the curated reference database', dest = 'input', type = str, required = True) 737 | dereplication_parser.add_argument('-o', '--output', help = 'filename of the dereplicated curated reference database', dest = 'output', type = str, required = True) 738 | dereplication_parser.add_argument('-m', '--method', help = 'method of dereplication: "strict", "single_species", "uniq_species"', dest = 'method', type = str, required = True) 739 | 740 | seq_cleanup_parser = subparser.add_parser('seq_cleanup', description = 'filtering the database on sequence and header parameters') 741 | seq_cleanup_parser.set_defaults(func = db_filter) 742 | seq_cleanup_parser.add_argument('-min', '--minlen', help = 'minimum sequence length to be retained in the database. Default = 100', dest = 'minlen', type = int, default = '100') 743 | seq_cleanup_parser.add_argument('-max', '--maxlen', help = 'maximum sequence length to be retained in the database. Default = 500', dest = 'maxlen', type = int, default = '500') 744 | seq_cleanup_parser.add_argument('-n', '--maxns', help = 'maximum number of ambiguous bases allowed in the sequence. Default = 0', dest = 'maxns', type = int, default = '0') 745 | seq_cleanup_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 746 | seq_cleanup_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 747 | seq_cleanup_parser.add_argument('-d', '--discard', help = 'file name of discarded sequences', dest = 'discard', type = str, default = 'no') 748 | seq_cleanup_parser.add_argument('-e', '--enviro', help = 'discard environmental sequences from the dataset. yes/no', dest = 'env', type = str, default = 'no') 749 | seq_cleanup_parser.add_argument('-s', '--species', help = 'discard sequences for which the species name is unspecified. 
yes/no', dest = 'spec', type = str, default = 'no') 750 | seq_cleanup_parser.add_argument('-na', '--nans', help = 'discard sequences with N number of unspecified taxonomic levels', dest = 'nans', type = str, default = 'no') 751 | 752 | visualization_parser = subparser.add_parser('visualization', description = 'figure displaying various aspects of the reference database') 753 | visualization_parser.set_defaults(func = visualization) 754 | visualization_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 755 | visualization_parser.add_argument('-o', '--output', help = 'output file name for db_completeness method', dest = 'output', type = str) 756 | visualization_parser.add_argument('-m', '--method', help = 'method of visualization: "diversity", "amplicon_length", "db_completeness", "phylo"', dest = 'method', type = str, required = True) 757 | visualization_parser.add_argument('-l', '--level', help = 'taxonomic level to split the database for diversity, amplicon_length, and phylo methods: "superkingdom", "phylum", "class", "order", "family", "genus", "species"', dest = 'level', type = str) 758 | visualization_parser.add_argument('-s', '--species', help = 'list of species of interest for phylo and db_completeness methods', dest = 'species', type = str) 759 | visualization_parser.add_argument('-t', '--taxid', help = 'taxid file name for phylo and db_completeness methods', dest = 'taxid', type = str) 760 | visualization_parser.add_argument('-n', '--name', help = 'phylogeny file name for phylo and db_completeness methods', dest = 'name', type = str) 761 | 762 | format_database_parser = subparser.add_parser('tax_format', description = 'formatting the database to various formats') 763 | format_database_parser.set_defaults(func = tax_format) 764 | format_database_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 765 | format_database_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 766 | format_database_parser.add_argument('-f', '--format', help = 'process database to format: "sintax", "rdp", "qiif", "qiiz", "dad", "dads", "idt"', dest = 'format', type = str, required = True) 767 | 768 | args = parser.parse_args() 769 | args.func(args) 770 | 771 | if __name__ == '__main__': 772 | main() 773 | -------------------------------------------------------------------------------- /function/older_versions/reference_database_creator_v2.1.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | ## import modules 4 | import argparse 5 | from Bio import Entrez 6 | import time 7 | from urllib.error import HTTPError 8 | import http.client 9 | http.client.HTTPConnection._http_vsn = 10 10 | http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0' 11 | import subprocess as sp 12 | import shutil 13 | from string import digits 14 | import pandas as pd 15 | from tqdm import tqdm 16 | from Bio.Seq import Seq 17 | from Bio.SeqRecord import SeqRecord 18 | from Bio import SeqIO 19 | from Bio.SeqIO import FastaIO 20 | import os 21 | import zipfile 22 | from os import listdir 23 | import matplotlib 24 | import matplotlib.pyplot as plt 25 | from Bio import AlignIO 26 | from Bio import Phylo 27 | from Bio.Align.Applications import MuscleCommandline 28 | from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceMatrix 29 | from Bio.Phylo.TreeConstruction import DistanceTreeConstructor 30 | from functions.module_1 import esearch_fasta 31 | from functions.module_1 import efetch_seqs_from_webenv 32 | from functions.module_1 import seq_dict_from_seq_xml 33 | from functions.module_1 import fasta_file_from_seq_dict 34 | from functions.module_1 import get_taxid_from_seq_xml 35 | from functions.module_1 import create_taxid_table 36 | from functions.module_1 import embl_download 37 | from functions.module_1 import embl_format 38 | from functions.module_1 import accession_list_from_fasta 39 | from functions.module_1 import taxid_table_from_accession 40 | from functions.module_1 import mitofish_download 41 | from functions.module_1 import mitofish_format 42 | from functions.module_1 import check_accession 43 | 44 | ##################################################### 45 | ## helper functions ################################# 46 | #### later can be put in separate file ############## 47 | ##################################################### 48 | 49 | def fasta_to_dict_wDesc(fasta_file): 50 | seq_dict = {} 51 | for record in SeqIO.parse(fasta_file, 'fasta'): 52 | record.description = record.description.replace(' ', '_') 53 | record.id = record.description 54 | rec_id = record.id 55 | rec_desc = record.description 56 | rec_seq = str(record.seq) 57 | seq_dict.setdefault(rec_id, {})['sequence'] = rec_seq 58 | seq_dict.setdefault(rec_id, {})['description'] = rec_desc 59 | return seq_dict 60 | 61 | def fasta_to_dict(fasta_file): 62 | """turn fasta file into seq dict with format {seq_id : sequence, seq_id2: sequence2}""" 63 | #seq_input = open(fasta_file, 'r') 64 | seq_dict = {} 65 | for record in SeqIO.parse(fasta_file, 'fasta'): 66 | rec_id = record.id 67 | rec_desc = record.description 68 | rec_seq = str(record.seq) 69 | seq_dict.setdefault(rec_id, {})['sequence']=rec_seq 70 | seq_dict.setdefault(rec_id, {})['description']=rec_desc 71 | return seq_dict 72 | 73 | def derep(seqdict): 74 | rep_dict = {} 75 | derep_dict = {} 76 | for k,v in seqdict.items(): 77 | rep_dict.setdefault(v, []).append(k) 78 | for key, value in rep_dict.items(): 79 | numreads = len(value) 80 | newname = value[0] 81 | derep_dict[newname] = {'seq': key, 'size': numreads, 'readlist': value} 82 | return derep_dict 83 | 84 | def derep_to_seq(derep_dict, size = 'no'): 85 | new_dict = {} 86 | read_dict = {} 87 | for k,v in derep_dict.items(): 88 | data = v 89 | if size == 'no': 90 | base_id = k 91 | else: 92 | base_id = k + ';size='+str(data['size']) 93 | read_dict[base_id] = data['readlist'] 94 | new_dict[base_id] = data['seq'] 95 | return (new_dict, read_dict) 96 | 97 | def read_taxid_table(taxid_table_name): 
98 | table_file = open(taxid_table_name, 'r') 99 | taxid_dict = {} 100 | for line in table_file: 101 | line = line.strip('\n') 102 | line_parts = line.split('\t') 103 | taxid_dict[line_parts[0]]=line_parts[1] 104 | table_file.close() 105 | return taxid_dict 106 | 107 | def efetch_taxonomy_xml(taxid_set, email, lineage_batch=5000): 108 | lineage_list = [] 109 | Entrez.email = email 110 | 111 | for start in tqdm(range(0, len(taxid_set), lineage_batch)): 112 | lineage_group = taxid_set[start : start + lineage_batch] 113 | lineage_attempt = 1 114 | lineage_success = False 115 | while lineage_attempt <= 3 and not lineage_success: 116 | lineage_attempt += 1 117 | try: 118 | lineage_search = Entrez.efetch(db = 'taxonomy', retmode = 'xml', id = ','.join(lineage_group)) 119 | lineage_record = Entrez.read(lineage_search) 120 | lineage_list.append(lineage_record) 121 | lineage_success = True 122 | except HTTPError as err: 123 | if 500 <= err.code <= 599: 124 | print(f'Received error from server {err}') 125 | print(f'Attempt {lineage_attempt} of 3') 126 | time.sleep(15) 127 | else: 128 | raise 129 | return lineage_list 130 | 131 | def dataframe_from_taxonomy(taxonomy_list, ranks_used='default'): 132 | if ranks_used == 'default': 133 | ranks = ['superkingdom','phylum', 'class', 'order', 'family', 'genus', 'species'] 134 | else: 135 | ranks = ranks_used 136 | 137 | lineage_info = [] 138 | for key in taxonomy_list: 139 | for i in range(len(key)): 140 | lineage = {d['Rank']:d['ScientificName'] for d in key[i]['LineageEx'] if d['Rank'] in ranks} 141 | lineage['species'] = key[i]['ScientificName'] 142 | lineage['taxid'] = key[i]['TaxId'] 143 | lineage_info.append(lineage) 144 | tax_df = pd.DataFrame(lineage_info) 145 | return tax_df 146 | 147 | def sintax_from_df(df, output_file_name): 148 | df['species'] = df['species'].str.replace(' ', '_') 149 | df['sintax'] = '>' + df['accession'] + ';tax=d:' + df['superkingdom'] + ',p:' + df['phylum'] + ',c:' + df['class'] + ',o:' + df['order'] + ',f:' + df['family'] + ',g:' + df['genus'] + ',s:' + df['species'] 150 | datafr = df[['sintax', 'sequence']] 151 | datafr.to_csv(output_file_name, index = None, header = None, sep = '\n') 152 | 153 | ############################################### 154 | ###### MAIN COMMANDS ########################## 155 | ############################################### 156 | ###### MODULE DATABASE DOWNLOAD ############### 157 | ############################################### 158 | 159 | ## function: download sequencing data from online databases 160 | def db_download(args): 161 | SOURCE = args.source 162 | DATABASE = args.database 163 | QUERY = args.query 164 | OUTPUT = args.output 165 | EMAIL = args.email 166 | 167 | ## download sequencing data from NCBI 168 | if SOURCE == 'ncbi': 169 | print('\ndownloading sequences from NCBI') 170 | if all(v is not None for v in [DATABASE, QUERY, OUTPUT, EMAIL]): 171 | print('\nlooking up the number of sequences that match the query\n') 172 | search_record = esearch_fasta(QUERY, DATABASE, EMAIL) 173 | print('found {} matching sequences'.format(search_record['Count'])) 174 | print('\nstarting the download\n') 175 | batch_size = 5000 176 | fetch_seqs = efetch_seqs_from_webenv(search_record, DATABASE, EMAIL, batch_size) 177 | sequences = seq_dict_from_seq_xml(fetch_seqs) 178 | num_sequences = fasta_file_from_seq_dict(sequences, OUTPUT) 179 | print(num_sequences, ' sequences written to file:', OUTPUT) 180 | acc_taxid = get_taxid_from_seq_xml(fetch_seqs) 181 | taxid_tab_name = OUTPUT+'.taxid_table.tsv' 182 | 
num_accs = create_taxid_table(acc_taxid, taxid_tab_name) 183 | print(num_accs, ' accessions written to file:', taxid_tab_name) 184 | else: 185 | print('parameter missing: database, query, output, and email are required for NCBI downloads') 186 | 187 | ## download sequencing data from EMBL 188 | elif SOURCE == 'embl': 189 | if all(v is not None for v in [DATABASE, EMAIL]): 190 | print('\ndownloading sequences from EMBL') 191 | dl_files = embl_download(DATABASE) 192 | print('formatting downloaded files to fasta format') 193 | fasta_files = embl_format(dl_files) 194 | for fasta in fasta_files: 195 | print(f'retrieving tax ID information for each accession in {fasta}') 196 | acc_list = accession_list_from_fasta(fasta) 197 | taxid_tab_name = fasta + '.taxid_table.tsv' 198 | num_taxid = taxid_table_from_accession(acc_list, EMAIL, taxid_tab_name) 199 | print(num_taxid, ' accessions and tax IDs written to file: ', taxid_tab_name) 200 | else: 201 | print('parameter missing: database and email are required for EMBL downloads') 202 | 203 | ## download sequencing data from MitoFish 204 | elif SOURCE == 'mitofish': 205 | if all(v is not None for v in [OUTPUT, EMAIL]): 206 | print('\ndownloading sequences from MITOFISH') 207 | url = 'http://mitofish.aori.u-tokyo.ac.jp/files/complete_partial_mitogenomes.zip' 208 | dl_file = mitofish_download(url) 209 | print(f'formatting {dl_file} to fasta format') 210 | mitoformat = mitofish_format(dl_file, OUTPUT) 211 | print(f'retrieving tax ID information for each accession in {OUTPUT}') 212 | acc_list = accession_list_from_fasta(OUTPUT) 213 | taxid_tab_name = OUTPUT + '.taxid_table.tsv' 214 | num_taxid = taxid_table_from_accession(acc_list, EMAIL, taxid_tab_name) 215 | print(num_taxid, ' accessions and tax IDs written to file: ', taxid_tab_name) 216 | else: 217 | print('parameter missing: output and email are required for MitoFish downloads') 218 | 219 | ## download sequencing data from BOLD (not yet implemented) 220 | elif SOURCE == 'bold': 221 | print('\ndownloading sequences from BOLD is not yet implemented') 222 | 223 | ## download taxonomy information 224 | elif SOURCE == 'taxonomy': 225 | print('\ndownloading taxonomy information') 226 | url_acc2taxid = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz' 227 | url_taxdump = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' 228 | results = sp.run(['wget', url_acc2taxid]) 229 | results = sp.run(['gunzip', 'nucl_gb.accession2taxid.gz']) 230 | results = sp.run(['wget', url_taxdump]) 231 | results = sp.run(['tar', '-zxvf', 'taxdump.tar.gz']) 232 | files_to_remove = ['citations.dmp', 'delnodes.dmp', 'division.dmp', 'gencode.dmp', 'merged.dmp', 'gc.prt', 'readme.txt', 'taxdump.tar.gz'] 233 | for file in files_to_remove: 234 | os.remove(file) 235 | 236 | ## print statement if source information is missing 237 | else: 238 | print('Please specify a database to download sequences from using the "source" argument. Currently supported options are "ncbi", "embl", "mitofish", and "taxonomy".') 239 | 240 | ## function: import existing or custom database 241 | def db_import(args): 242 | INPUT = args.input 243 | HEADER = args.header 244 | OUTPUT = args.output 245 | EMAIL = args.email 246 | FWD = args.fwd 247 | REV = args.rev 248 | 249 | if HEADER == 'accession': 250 | # check for correct formatting of file 251 | if all(v is not None for v in [INPUT, OUTPUT, EMAIL]): 252 | print(f'\nchecking correct formatting of accession numbers in {INPUT}') 253 | incorrect_accession = check_accession(INPUT, OUTPUT) 254 | if len(incorrect_accession) != 0: 255 | print('found incorrectly formatted accession numbers. Please check file: "incorrect_accession_numbers.txt"')
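# hypothetical example of the check above: a header such as '>MH358810.1 Cyprinus carpio ...' parses to the accession 'MH358810' and passes,
# whereas a custom header such as '>carp_sample_01' fails the accession format check and is written to the report file below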
256 | with open('incorrect_accession_numbers.txt', 'w') as fout: 257 | for item in incorrect_accession: 258 | fout.write(item + '\n') 259 | # generate taxid table 260 | else: 261 | print(f'found no formatting issues in {INPUT}') 262 | print(f'retrieving tax ID information for each accession in {INPUT}') 263 | acc_list = accession_list_from_fasta(OUTPUT) 264 | taxid_tab_name = OUTPUT + '.taxid_table.tsv' 265 | num_taxid = taxid_table_from_accession(acc_list, EMAIL, taxid_tab_name) 266 | print(num_taxid, ' accessions and tax IDs written to file: ', taxid_tab_name) 267 | else: 268 | print('parameter missing: input, output, and email are required when importing with accession headers') 269 | # add primer sequences if option is chosen 270 | if all(v is not None for v in [FWD, REV]): 271 | print(f'appending primer sequences to each sequence in {OUTPUT}') 272 | REV_DNA = Seq(REV) 273 | REV_CORRECT = str(REV_DNA.reverse_complement()) 274 | appended = [f'>{rec.description}\n{FWD}{str(rec.seq)}{REV_CORRECT}' for rec in SeqIO.parse(OUTPUT, 'fasta')] # assumed behaviour: this block was left unimplemented in the original 275 | with open(OUTPUT, 'w') as fout: 276 | fout.write('\n'.join(appended) + '\n') 277 | elif HEADER == 'species': 278 | print('\ngenerating new accession numbers for species') 279 | 280 | else: 281 | print('\nPlease specify header information. Currently supported options are: "accession" and "species"') 282 | 283 | ## function: merge multiple databases 284 | def db_merge(args): 285 | INPUT = args.input 286 | UNIQ = args.uniq 287 | OUTPUT = args.output 288 | FORMAT = args.format 289 | DISCARD = args.discard 290 | 291 | # merge database files 292 | if FORMAT == 'db': 293 | # merge based on unique accession numbers 294 | if UNIQ != '': 295 | print('\nmerging all fasta files and discarding duplicate accession numbers') 296 | seqdict = {} 297 | discard = [] 298 | for file in INPUT: 299 | count = 0 300 | added = 0 301 | for record in SeqIO.parse(file, 'fasta'): 302 | count = count + 1 303 | id = '>' + record.id.split('.')[0] + '\n' 304 | seq = str(record.seq) + '\n' 305 | if id not in seqdict: 306 | added = added +1 307 | seqdict[id] = seq 308 | else: 309 | discard.append(id) 310 | print(f'found {count} sequences in {file}') 311 | print(f'added {added} sequences to {OUTPUT}') 312 | with open(OUTPUT, 'w') as file: 313 | for k,v in seqdict.items(): 314 | file.write(k) 315 | file.write(v) 316 | if DISCARD: # DISCARD is None when the option is not given; the original compared against '' and crashed on open(None) 317 | with open(DISCARD, 'w') as disc: 318 | for item in discard: 319 | disc.write(item) 320 | # merge all sequences without filtering 321 | else: 322 | print('\nmerging all fasta files and keeping duplicate accession numbers') 323 | with open(OUTPUT, 'w') as fout: 324 | for file in INPUT: 325 | with open(file, 'r') as fin: 326 | for line in fin: 327 | fout.write(line) 328 | 329 | # merge taxonomic ID tables 330 | elif FORMAT == 'taxid': 331 | print('merging taxid tables is not yet implemented') 332 | 333 | else: 334 | print('Please specify which format to merge. 
Accepted options are "db" and "taxid"') 335 | 336 | 337 | ############################################### 338 | ###### MODULE IN SILICO PCR ################### 339 | ############################################### 340 | 341 | ## function: in silico PCR 342 | def ispcr(args): 343 | FWD = args.fwd 344 | REV = args.rev 345 | ASSAY = args.assay 346 | INPUT = args.input 347 | ERROR = args.error 348 | 349 | ## reverse complement reverse primer sequence 350 | REV_DNA = Seq(REV) 351 | REV_CORRECT = str(REV_DNA.reverse_complement()) 352 | 353 | ## setting variable names using the info from user input 354 | TRIMMED_INIT = 'init_trimmed_' + ASSAY + '_' + INPUT 355 | UNTRIMMED_INIT = 'init_untrimmed_' + ASSAY + '_' + INPUT 356 | REVCOMP_UNTRIMMED_INIT = 'revcomp_' + UNTRIMMED_INIT 357 | TRIMMED_REVCOMP = 'revcomp_' + TRIMMED_INIT 358 | UNTRIMMED_REVCOMP = 'untrimmed_' + REVCOMP_UNTRIMMED_INIT 359 | FINAL_TRIMMED = 'final_trimmed_' + ASSAY + '_' + INPUT 360 | 361 | OVERLAP = str(min([len(FWD), len(REV_CORRECT)])) 362 | ADAPTER = FWD + '...' + REV_CORRECT 363 | 364 | ## run cutadapt on downloaded fasta file 365 | count_init = len(list(SeqIO.parse(INPUT, 'fasta'))) 366 | print('\nrunning in silico PCR on fasta file containing {} sequences'.format(count_init)) 367 | cmnd_cutadapt_1 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_INIT, INPUT, '--untrimmed-output', UNTRIMMED_INIT, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 368 | sp.call(cmnd_cutadapt_1) 369 | count_trimmed_init = len(list(SeqIO.parse(TRIMMED_INIT, 'fasta'))) 370 | print('\nfound primers in {} sequences'.format(count_trimmed_init)) 371 | 372 | ## run vsearch to reverse complement untrimmed sequences 373 | count_untrimmed_init = len(list(SeqIO.parse(UNTRIMMED_INIT, 'fasta'))) 374 | print('\nreverse complementing {} untrimmed sequences'.format(count_untrimmed_init)) 375 | cmnd_vsearch_revcomp = ['vsearch', '--fastx_revcomp', UNTRIMMED_INIT, '--fastaout', REVCOMP_UNTRIMMED_INIT, '--quiet'] 376 | sp.call(cmnd_vsearch_revcomp) 377 | 378 | ## run cutadapt on reverse complemented untrimmed sequences 379 | print('\nrunning in silico PCR on {} reverse complemented untrimmed sequences'.format(count_untrimmed_init)) 380 | cmnd_cutadapt_2 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_REVCOMP, REVCOMP_UNTRIMMED_INIT, '--untrimmed-output', UNTRIMMED_REVCOMP, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 381 | sp.call(cmnd_cutadapt_2) 382 | count_trimmed_second = len(list(SeqIO.parse(TRIMMED_REVCOMP, 'fasta'))) 383 | print('\nfound primers in {} sequences'.format(count_trimmed_second)) 384 | 385 | ## concatenate both trimmed files 386 | with open(FINAL_TRIMMED, 'wb') as wfd: 387 | for f in [TRIMMED_INIT, TRIMMED_REVCOMP]: 388 | with open(f, 'rb') as fd: 389 | shutil.copyfileobj(fd, wfd) 390 | 391 | ## remove intermediary files 392 | files = [TRIMMED_INIT, UNTRIMMED_INIT, REVCOMP_UNTRIMMED_INIT, TRIMMED_REVCOMP, UNTRIMMED_REVCOMP] 393 | for file in files: 394 | os.remove(file) 395 | 396 | 397 | ############################################### 398 | ###### MODULE TAXONOMY ASSIGNMENT ############# 399 | ############################################### 400 | 401 | ## function: creating reference database with taxonomy 402 | def tax_assign(args): 403 | INPUT = args.input 404 | TABLE = args.taxid_table 405 | OUTPUT = args.output 406 | EMAIL = args.email 407 | 408 | # Get final sequence accessions from sequence file 409 | input_seq_dict = fasta_to_dict(INPUT) 410 | final_acc_list = list(input_seq_dict.keys()) 411 | final_accessions = 
set(final_acc_list) 412 | 413 | ## retrieve accession numbers from table file and store in list 414 | taxid_dict = read_taxid_table(TABLE) 415 | final_taxid_dict = {} 416 | for k,v in taxid_dict.items(): 417 | if k in final_accessions: 418 | final_taxid_dict[k]=v 419 | taxids = list(final_taxid_dict.values()) 420 | uniq_taxid = list(set(taxids)) 421 | print('\nfound {} accessions in input file'.format(len(final_accessions))) 422 | print("\ndownloading {} taxonomic ID's from NCBI".format(len(uniq_taxid))) 423 | taxonomy_list = efetch_taxonomy_xml(uniq_taxid, EMAIL) 424 | lineage_df = dataframe_from_taxonomy(taxonomy_list) 425 | 426 | #lineage_df = pd.DataFrame(lineage_info) 427 | taxid_colNames = ['taxid'] 428 | taxid_df = (pd.DataFrame.from_dict(final_taxid_dict, orient='index', columns=taxid_colNames).rename_axis('accession').reset_index()) 429 | seq_df = (pd.DataFrame.from_dict(input_seq_dict, orient='index').rename_axis('accession').reset_index()) 430 | taxid_lineage = taxid_df.merge(lineage_df, how = 'left', on = 'taxid') 431 | all_df = taxid_lineage.merge(seq_df, on = 'accession') 432 | 433 | # output a table with all info 434 | out_parts = OUTPUT.split('.') 435 | TABOUT = '.'.join(out_parts[:-1]) 436 | TABOUT = TABOUT+'_table.tsv' 437 | all_df.to_csv(TABOUT, index = None, sep = '\t') 438 | 439 | # create a sintax output (add other options later) 440 | sintax_from_df(all_df, OUTPUT) 441 | 442 | 443 | ############################################### 444 | ###### MODULE DATABASE CLEANUP ################ 445 | ############################################### 446 | 447 | ## function: dereplicating the database 448 | def dereplicate(args): 449 | INPUT = args.input 450 | OUTPUT = args.output 451 | 452 | # split sequence file into two dictionaries and define which species need dereplication 453 | seq_file = INPUT 454 | seqs = fasta_to_dict_wDesc(seq_file) 455 | print('\nfound {} sequences in input file'.format(len(seqs))) 456 | seq_just_id = {} 457 | taxonly = {} 458 | for k,v in seqs.items(): 459 | parts = v['description'].split(';tax=') 460 | seq_id = parts[0] 461 | tax = parts[1] 462 | seq_just_id[seq_id] = v['sequence'] 463 | taxonly.setdefault(tax, []).append(seq_id) 464 | print('\ndatabase is comprised of {} unique taxa'.format(len(taxonly))) 465 | need_derep = [] 466 | singletons = {} 467 | for k,v in taxonly.items(): 468 | if len(v) > 1: 469 | need_derep.append(k) 470 | else: 471 | singletons[v[0]] = k 472 | print('\n{} taxa only occur once in the database'.format(len(singletons))) 473 | print('\n{} taxa occur multiple times in the database'.format(len(need_derep))) 474 | tax_index = {} 475 | for k,v in taxonly.items(): 476 | if k in need_derep: 477 | for seqid in v: 478 | tax_index[seqid] = k 479 | 480 | # dereplicate sequences for species represented more than once in the database 481 | all_dereps = {} 482 | for d in need_derep: 483 | temp_seq_dict = {} 484 | for seqid in taxonly[d]: 485 | temp_seq_dict[seqid] = seq_just_id[seqid] 486 | dr_temp = derep(temp_seq_dict) 487 | derep_seq = derep_to_seq(dr_temp, size = 'no') 488 | derep_seq = derep_seq[0] 489 | for k,v in derep_seq.items(): 490 | new_id = k+';tax='+tax_index[k] 491 | all_dereps[new_id] = v 492 | 493 | # combine species present only once in the database with the dereplicated dataset 494 | all_new_seqs = {} 495 | for k,v in singletons.items(): 496 | new_id = k + ';tax=' + v 497 | seq = seq_just_id[k] 498 | all_new_seqs[new_id] = seq 499 | for key, value in all_dereps.items(): 500 | all_new_seqs[key] = value 501 | print('\n{} sequences left after dereplication\n'.format(len(all_new_seqs)))
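# e.g. three records that share both an identical sequence and the tax string ';tax=...,s:Sprattus_sprattus' collapse
# into a single record named after the first accession in the group; distinct sequences for a species are all retained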
502 | 503 | # save the dereplicated database 504 | output = OUTPUT 505 | seqout = open(output, 'w') 506 | for k,v in all_new_seqs.items(): 507 | seqout.write('>' + k + '\n' + v + '\n') 508 | seqout.close() 509 | 510 | 511 | ## function: sequence cleanup 512 | def seq_cleanup(args): 513 | MINLEN = int(args.minlen) # argparse passes these as strings; cast for the numeric comparisons below 514 | MAXLEN = int(args.maxlen) 515 | MAXNS = int(args.maxns) 516 | INPUT = args.input 517 | OUTPUT = args.output 518 | DISCARD = args.discard 519 | 520 | # read in input file and clean up given the parameters 521 | clean_db = [] 522 | discard_db = [] 523 | count = 0 524 | count_clean = 0 525 | for seq_record in SeqIO.parse(INPUT, 'fasta'): 526 | count = count + 1 527 | sequence = str(seq_record.seq).upper() 528 | if len(sequence) >= MINLEN and len(sequence) <= MAXLEN and sequence.count('N') <= MAXNS: 529 | clean_db.append(seq_record) 530 | count_clean = count_clean + 1 531 | else: 532 | discard_db.append(seq_record) 533 | 534 | # write cleaned database to file 535 | cleaned = count - count_clean 536 | print(f'\nfound {count} sequences in database prior to cleanup') 537 | print(f'\nremoved {cleaned} sequences during cleanup') 538 | print(f'\n{count_clean} sequences left after cleanup\n') 539 | clean_db_fa = [FastaIO.as_fasta_2line(record) for record in clean_db] 540 | with open(OUTPUT, 'w') as file: 541 | for item in clean_db_fa: 542 | file.write(item) 543 | 544 | # write discarded sequences to file 545 | if DISCARD != 'no': 546 | discard_db_fa = [FastaIO.as_fasta_2line(record) for record in discard_db] 547 | with open(DISCARD, 'w') as file: 548 | for item in discard_db_fa: 549 | file.write(item) 550 | 551 | 552 | ## function: header cleanup 553 | # (3) specific taxonomic groups - still to add 554 | # (4) specific missing taxonomic level - still to add 555 | def header_cleanup(args): 556 | ENV = args.env 557 | SPEC = args.spec 558 | NANS = args.nans 559 | INPUT = args.input 560 | OUTPUT = args.output 561 | 562 | clean_db = [] 563 | # filter data on keyword 'environmental' 564 | if ENV == 'yes': 565 | env_count = 0 566 | env_total = 0 567 | for seq_record in SeqIO.parse(INPUT, 'fasta'): 568 | env_total = env_total + 1 569 | id = str(seq_record.id).upper() 570 | if id.count('ENVIRONMENTAL') == 0: 571 | env_count = env_count + 1 572 | clean_db.append(seq_record) 573 | env_removed = env_total - env_count 574 | print(f'\nremoved {env_removed} environmental sequences from a total of {env_total} sequences in the database') 575 | 576 | # filter data if species name is not specified 577 | if SPEC == 'yes': 578 | if len(clean_db) == 0: 579 | spec_count = 0 580 | spec_total = 0 581 | for seq_record in SeqIO.parse(INPUT, 'fasta'): 582 | spec_total = spec_total + 1 583 | id = str(seq_record.id).upper() 584 | if id.count('_SP.') == 0: 585 | spec_count = spec_count + 1 586 | clean_db.append(seq_record) 587 | spec_removed = spec_total - spec_count 588 | print(f'\nremoved {spec_removed} entries from database not containing a species name from a total of {spec_total} sequences in the database') 589 | else: 590 | spec_db = [] 591 | spec_count = 0 592 | spec_total = 0 593 | for seq_record in clean_db: 594 | spec_total = spec_total + 1 595 | id = str(seq_record.id).upper() 596 | if id.count('_SP.') == 0: 597 | spec_count = spec_count + 1 598 | spec_db.append(seq_record) 599 | spec_removed = spec_total - spec_count 600 | print(f'\nremoved {spec_removed} entries from database not containing a species name from a total of {spec_total} sequences in the database')
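# e.g. a header whose tax string ends in 's:Gobius_sp.' carries no species epithet and is removed by the '_SP.' check above,
# while 's:Gobius_niger' is retained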
601 | clean_db = [] 602 | clean_db = spec_db 603 | 604 | # filter data on missing taxonomic levels 605 | if NANS != 'nan': 606 | if len(clean_db) == 0: 607 | nans_count = 0 608 | nans_total = 0 609 | for seq_record in SeqIO.parse(INPUT, 'fasta'): 610 | nans_total = nans_total + 1 611 | id = str(seq_record.id).upper() 612 | if id.count(':NAN') <= int(NANS): # NANS arrives as a string from argparse 613 | nans_count = nans_count + 1 614 | clean_db.append(seq_record) 615 | nans_removed = nans_total - nans_count 616 | print(f'\nremoved {nans_removed} entries with more than {NANS} missing taxonomic levels from a total of {nans_total} sequences in the database') 617 | else: 618 | nans_db = [] 619 | nans_count = 0 620 | nans_total = 0 621 | for seq_record in clean_db: 622 | nans_total = nans_total + 1 623 | id = str(seq_record.id).upper() 624 | if id.count(':NAN') <= int(NANS): 625 | nans_count = nans_count + 1 626 | nans_db.append(seq_record) 627 | nans_removed = nans_total - nans_count 628 | print(f'\nremoved {nans_removed} entries with more than {NANS} missing taxonomic levels from a total of {nans_total} sequences in the database') 629 | clean_db = [] 630 | clean_db = nans_db 631 | 632 | # write cleaned up database to output file 633 | clean_db_fa = [FastaIO.as_fasta_2line(record) for record in clean_db] 634 | with open(OUTPUT, 'w') as file: 635 | for item in clean_db_fa: 636 | file.write(item) 637 | 638 | 639 | ############################################### 640 | ###### MODULE VISUALISATIONS ################## 641 | ############################################### 642 | 643 | ## function: phylogenetic tree builder 644 | def phylo(args): 645 | SPECIES = args.species 646 | DATABASE = args.database 647 | EMAIL = args.email 648 | OUTPUT = args.output 649 | 650 | Entrez.email = EMAIL 651 | directory = 'temp' 652 | try: 653 | os.makedirs(directory, exist_ok = True) 654 | except OSError as error: 655 | print("Directory '%s' cannot be created" % directory) 656 | 657 | # read in the text file with species names 658 | species = [] 659 | with open(SPECIES) as species_list: 660 | for spec in species_list: 661 | spec = spec.rstrip('\n') 662 | species.append(spec) 663 | print('\nfound ' + str(len(species)) + ' species of interest: ' + str(species) + '\n') 664 | 665 | # retrieve the lineage information for each species 666 | # first: uniq ID from species name 667 | # second: tax ID from uniq ID 668 | # third: taxonomic information from tax ID 669 | # fourth: format similar to database 670 | print('retrieving the taxonomic information from NCBI for ' + str(len(species)) + ' species of interest\n') 671 | uid = [] 672 | for item in species: 673 | handle = Entrez.esearch(db = 'nucleotide', term = item, retmode = 'xml', rettype = 'fasta') 674 | record = Entrez.read(handle) 675 | uid.append(record['IdList'][0]) 676 | 677 | accession_taxid = [] 678 | taxids = [] 679 | for id in uid: 680 | handle = Entrez.efetch(db = 'nuccore', id = id, retmode = 'xml', rettype = 'fasta') 681 | record = Entrez.read(handle) 682 | acc = record[0]['TSeq_accver'] 683 | taxid = record[0]['TSeq_taxid'] 684 | accession_taxid.append(str(acc) + ' ' + str(taxid)) 685 | taxids.append(str(taxid)) 686 | 687 | lineage_list = [] 688 | for taxid in taxids: 689 | lineage_search = Entrez.efetch(db = 'taxonomy', retmode = 'xml', id = taxid) 690 | lineage_record = Entrez.read(lineage_search) 691 | lineage_list.append(lineage_record) 692 | 693 | lineage_info = [] 694 | for key in lineage_list: 695 | lineage = {d['Rank']:d['ScientificName'] for d in key[0]['LineageEx'] if d['Rank'] in ['superkingdom', 'phylum', 'class', 696 | 'order', 'family', 'genus', 'species']}
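# e.g. for Salmo salar (NCBI TaxId 8030) the comprehension above yields roughly
# {'superkingdom': 'Eukaryota', 'phylum': 'Chordata', 'class': 'Actinopteri', 'order': 'Salmoniformes', 'family': 'Salmonidae', 'genus': 'Salmo'}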
697 | lineage['species'] = key[0]['ScientificName'] 698 | lineage['taxid'] = key[0]['TaxId'] 699 | lineage_info.append(lineage) 700 | df = pd.DataFrame(lineage_info) 701 | df['species'] = df['species'].str.replace(' ', '_') 702 | df['sintax'] = 'd:' + df['superkingdom'] + ',p:' + df['phylum'] + ',c:' + df['class'] + ',o:' + df['order'] + ',f:' + df['family'] + ',g:' + df['genus'] + ',s:' + df['species'] 703 | datafr = df['sintax'] 704 | species_interest = datafr.values.tolist() 705 | 706 | # extract all entries from the database that share a family status with the species of interest 707 | for record in SeqIO.parse(DATABASE, 'fasta'): 708 | family_rec = record.id.split(',')[4] 709 | genus_rec = record.id.split(',')[5] 710 | species_rec = record.id.split(',')[6] 711 | for species in species_interest: 712 | family_int = species.split(',')[4] 713 | genus_int = species.split(',')[5] 714 | species_int = species.split(',')[6] 715 | spec_int = species.split(',')[6].split(':')[1] 716 | if family_int == family_rec: 717 | with open(f'{directory}/{spec_int}_family.fasta', 'a') as f: 718 | SeqIO.write(record, f, 'fasta') 719 | if genus_int == genus_rec: 720 | with open(f'{directory}/{spec_int}_genus.fasta', 'a') as f: 721 | SeqIO.write(record, f, 'fasta') 722 | if species_int == species_rec: 723 | with open(f'{directory}/{spec_int}_species.fasta', 'a') as f: 724 | SeqIO.write(record, f, 'fasta') 725 | 726 | # extract information for data table from newly generated files 727 | newdict = {} 728 | for species in species_interest: 729 | spec_int = species.split(',')[6].split(':')[1] 730 | try: 731 | spec_number = list(SeqIO.parse(f'{directory}/{spec_int}_species.fasta', 'fasta')) 732 | spec_num = len(spec_number) 733 | except FileNotFoundError: # no database entry matched this species, so the file was never written 734 | spec_num = 0 735 | try: 736 | gen_number = list(SeqIO.parse(f'{directory}/{spec_int}_genus.fasta', 'fasta')) 737 | gen_num = len(gen_number) 738 | gen_list = [] 739 | for record in gen_number: 740 | gen = record.id.split(',')[6].split(':')[1] 741 | if gen not in gen_list: 742 | gen_list.append(gen) 743 | except FileNotFoundError: 744 | gen_num = 0 745 | gen_list = ['NA'] 746 | try: 747 | fam_number = list(SeqIO.parse(f'{directory}/{spec_int}_family.fasta', 'fasta')) 748 | fam_num = len(fam_number) 749 | fam_list = [] 750 | for record in fam_number: 751 | fam = record.id.split(',')[6].split(':')[1] 752 | if fam not in fam_list: 753 | fam_list.append(fam) 754 | except FileNotFoundError: 755 | fam_num = 0 756 | fam_list = ['NA'] 757 | newdict[spec_int] = {'species': spec_int, 'species_occur': spec_num, 'species_gen': gen_list, 'gen_entries': gen_num, 'species_fam': fam_list, 'fam_entries': fam_num} 758 | 759 | # print information on which species are present in the database 760 | for species in species_interest: 761 | spec_int = species.split(',')[6].split(':')[1] 762 | if newdict[spec_int]['species_occur'] == 0: 763 | print(str(newdict[spec_int]['species']) + ': not present in the reference database\n') 764 | else: 765 | print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['species_occur']) + ' entries in the database\n') 766 | 767 | # output data table on species of interest 768 | df = pd.DataFrame.from_dict(newdict, orient = 'index') 769 | df = df[['species', 'species_occur', 'gen_entries', 'fam_entries', 'species_gen', 'species_fam']] 770 | df.to_csv(OUTPUT, sep = '\t', index = None) 771 | 772 | # generate phylogenetic trees for every species of interest based on number of entries in genus and 
family 773 | # first: check number of entries in if statement 774 | # second: shorten the headers of the sequences in the file, so that it can be printed on the figure 775 | # third: run muscle to generate alignment 776 | # fourth: calculate distance from alignment 777 | # fifth: generate tree figure 778 | for species in species_interest: 779 | spec_int = species.split(',')[6].split(':')[1] 780 | if newdict[spec_int]['fam_entries'] > 50: 781 | print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['fam_entries']) + ' family entries too large. Generating phylogenetic tree on genus level with ' + str(newdict[spec_int]['gen_entries']) + ' entries\n') 782 | 783 | select = [] 784 | for record in SeqIO.parse(f'{directory}/{spec_int}_genus.fasta', 'fasta'): 785 | record.description = record.description.replace(';', ',') 786 | record.id = record.description 787 | record.id = record.id.split(',')[0] + ';' + record.id.split(',')[7].split(':')[1] 788 | record.description = record.id 789 | select.append(record) 790 | handle = open(f'{directory}/{spec_int}_genus_align.fasta', 'w') 791 | SeqIO.write(select, handle, 'fasta') 792 | handle.close() 793 | 794 | muscle_cline = MuscleCommandline(input = f'{directory}/{spec_int}_genus_align.fasta', 795 | out = f'{directory}/{spec_int}_genus_align.clw', 796 | diags = True, 797 | maxiters = 1, 798 | log = f'{directory}/{spec_int}_genus_align_log.txt', 799 | clw = True) 800 | muscle_cline() 801 | 802 | with open(f'{directory}/{spec_int}_genus_align.clw' , 'r') as aln: 803 | alignment = AlignIO.read(aln, 'clustal') 804 | calculator = DistanceCalculator('identity') 805 | Distance_matrix = calculator.get_distance(alignment) 806 | constructor = DistanceTreeConstructor(calculator, 'nj') 807 | 808 | tree = constructor.build_tree(alignment) 809 | fig = plt.figure(figsize = (25,15), dpi = 100) 810 | matplotlib.rc('font', size=12) 811 | matplotlib.rc('xtick', labelsize=10) 812 | matplotlib.rc('ytick', labelsize=10) 813 | axes = fig.add_subplot(1, 1, 1) 814 | Phylo.draw(tree, axes=axes, do_show = False) 815 | fig.savefig(f'{spec_int}_genus_align_tree.pdf') 816 | 817 | else: 818 | print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['fam_entries']) + ' family entries. 
Generating phylogenetic tree on family level\n') 819 | 820 | select = [] 821 | for record in SeqIO.parse(f'{directory}/{spec_int}_family.fasta', 'fasta'): 822 | record.description = record.description.replace(';', ',') 823 | record.id = record.description 824 | record.id = record.id.split(',')[0] + ';' + record.id.split(',')[7].split(':')[1] 825 | record.description = record.id 826 | select.append(record) 827 | handle = open(f'{directory}/{spec_int}_family_align.fasta', 'w') 828 | SeqIO.write(select, handle, 'fasta') 829 | handle.close() 830 | 831 | muscle_cline = MuscleCommandline(input = f'{directory}/{spec_int}_family_align.fasta', 832 | out = f'{directory}/{spec_int}_family_align.clw', 833 | diags = True, 834 | maxiters = 1, 835 | log = f'{directory}/{spec_int}_family_align_log.txt', 836 | clw = True) 837 | muscle_cline() 838 | 839 | with open(f'{directory}/{spec_int}_family_align.clw' , 'r') as aln: 840 | alignment = AlignIO.read(aln, 'clustal') 841 | calculator = DistanceCalculator('identity') 842 | Distance_matrix = calculator.get_distance(alignment) 843 | constructor = DistanceTreeConstructor(calculator, 'nj') 844 | 845 | tree = constructor.build_tree(alignment) 846 | fig = plt.figure(figsize = (25,15), dpi = 100) 847 | matplotlib.rc('font', size=12) 848 | matplotlib.rc('xtick', labelsize=10) 849 | matplotlib.rc('ytick', labelsize=10) 850 | axes = fig.add_subplot(1, 1, 1) 851 | Phylo.draw(tree, axes=axes, do_show = False) 852 | fig.savefig(f'{spec_int}_family_align_tree.pdf') 853 | 854 | 855 | ## function: argparse parser 856 | def main(): 857 | parser = argparse.ArgumentParser(description = 'creating a curated reference database') 858 | subparser = parser.add_subparsers() 859 | 860 | db_download_parser = subparser.add_parser('db_download', description = 'downloading sequence data from online databases') 861 | db_download_parser.set_defaults(func = db_download) 862 | db_download_parser.add_argument('-s', '--source', help = 'specify online database used to download sequences. Currently supported options are: (1) ncbi, (2) embl, (3) mitofish, (4) taxonomy', dest = 'source', type = str, required = True) 863 | db_download_parser.add_argument('-db', '--database', help = 'Specific NCBI or EMBL database used to download sequences. Example NCBI: nucleotide. Example EMBL: mam*', dest = 'database', type = str) 864 | db_download_parser.add_argument('-q', '--query', help = 'NCBI query search to limit portion of database to be downloaded. 
Example: "16S[All Fields] AND ("1"[SLEN] : "50000"[SLEN])"', dest = 'query', type = str) 865 | db_download_parser.add_argument('-o', '--output', help = 'output file name option for NCBI and MITOFISH databases', dest = 'output', type = str) 866 | db_download_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str) 867 | 868 | db_import_parser = subparser.add_parser('db_import', description = 'import existing or curated database') 869 | db_import_parser.set_defaults(func = db_import) 870 | db_import_parser.add_argument('-i', '--input', help = 'input database filename', dest = 'input', type = str, required = True) 871 | db_import_parser.add_argument('-s', '--seq_header', help = 'information provided in sequence header: "accession" or "species"', dest = 'header', type = str, required = True) 872 | db_import_parser.add_argument('-o', '--output', help = 'output file name option', dest = 'output', type = str, required = True) 873 | db_import_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 874 | db_import_parser.add_argument('-f', '--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str) 875 | db_import_parser.add_argument('-r', '--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str) 876 | 877 | 878 | db_merge_parser = subparser.add_parser('db_merge', description = 'merge multiple databases') 879 | db_merge_parser.set_defaults(func = db_merge) 880 | db_merge_parser.add_argument('-i', '--input', nargs = '+', help = 'list of files to be merged', dest = 'input', required = True) 881 | db_merge_parser.add_argument('-u', '--uniq', help = 'keep only unique accession numbers', dest = 'uniq', type = str, default = '') 882 | db_merge_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 883 | db_merge_parser.add_argument('-f', '--format', help = 'data format to be merged, database (db) or tax ID table (taxid)', dest = 'format', type = str, required = True) 884 | db_merge_parser.add_argument('-d', '--discard', help = 'file name for discarded duplicate accession numbers', dest = 'discard', type = str) 885 | 886 | in_silico_pcr_parser = subparser.add_parser('ispcr', description = 'curating the downloaded reference sequences with an in silico PCR') 887 | in_silico_pcr_parser.set_defaults(func = ispcr) 888 | in_silico_pcr_parser.add_argument('-f', '--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str, required = True) 889 | in_silico_pcr_parser.add_argument('-r', '--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str, required = True) 890 | in_silico_pcr_parser.add_argument('-a', '--assay', help = 'name of primer assay', dest = 'assay', type = str, required = True) 891 | in_silico_pcr_parser.add_argument('-i', '--input', help = 'input filename', dest = 'input', type = str, required = True) 892 | in_silico_pcr_parser.add_argument('-e', '--error', help = 'number of errors allowed in primer-binding site. 
Default = 4.5', dest = 'error', type = str, default = '4.5') 893 | 894 | ref_database_parser = subparser.add_parser('tax_assign', description = 'creating the reference database with taxonomic information') 895 | ref_database_parser.set_defaults(func = tax_assign) 896 | ref_database_parser.add_argument('-i', '--input', help = 'input file containing the curated fasta sequences after in silico PCR', dest = 'input', type = str, required = True) 897 | ref_database_parser.add_argument('-t', '--taxid_table', help = 'input taxid table containing the taxid for each accession', dest = 'taxid_table', type = str, required = True) 898 | ref_database_parser.add_argument('-o', '--output', help = 'curated reference database output file', dest = 'output', type = str, required = True) 899 | ref_database_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 900 | 901 | dereplication_parser = subparser.add_parser('dereplicate', description = 'dereplicating the database') 902 | dereplication_parser.set_defaults(func = dereplicate) 903 | dereplication_parser.add_argument('-i', '--input', help = 'filename of the curated reference database', dest = 'input', type = str, required = True) 904 | dereplication_parser.add_argument('-o', '--output', help = 'filename of the dereplicated curated reference database', dest = 'output', type = str, required = True) 905 | 906 | seq_cleanup_parser = subparser.add_parser('seq_cleanup', description = 'cleaning database on sequence parameters') 907 | seq_cleanup_parser.set_defaults(func = seq_cleanup) 908 | seq_cleanup_parser.add_argument('-min', '--minlen', help = 'minimum sequence length to be retained in the database. Default = 100', dest = 'minlen', type = str, default = '100') 909 | seq_cleanup_parser.add_argument('-max', '--maxlen', help = 'maximum sequence length to be retained in the database. Default = 500', dest = 'maxlen', type = str, default = '500') 910 | seq_cleanup_parser.add_argument('-n', '--maxns', help = 'maximum number of ambiguous bases allowed in the sequence. Default = 0', dest = 'maxns', type = str, default = '0') 911 | seq_cleanup_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 912 | seq_cleanup_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 913 | seq_cleanup_parser.add_argument('-d', '--discard', help = 'file name of discarded sequences', dest = 'discard', type = str, default = 'no') 914 | 915 | header_cleanup_parser = subparser.add_parser('header_cleanup', description = 'cleaning database on header info') 916 | header_cleanup_parser.set_defaults(func = header_cleanup) 917 | header_cleanup_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 918 | header_cleanup_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 919 | header_cleanup_parser.add_argument('-e', '--enviro', help = 'discard environmental sequences from the dataset. yes/no', dest = 'env', type = str, default = 'no') 920 | header_cleanup_parser.add_argument('-s', '--species', help = 'discard sequences for which the species name is unspecified. 
yes/no', dest = 'spec', type = str, default = 'no') 921 | header_cleanup_parser.add_argument('-n', '--nans', help = 'discard sequences with more than N unspecified taxonomic levels. Default = "nan" (filter disabled)', dest = 'nans', type = str, default = 'nan') 922 | 923 | phylo_parser = subparser.add_parser('phylo_build', description = 'generating phylogenetic trees for species of interest') 924 | phylo_parser.set_defaults(func = phylo) 925 | phylo_parser.add_argument('-s', '--species', help = 'text file containing list of species separated by newlines', dest = 'species', type = str, required = True) 926 | phylo_parser.add_argument('-db', '--database', help = 'curated reference database', dest = 'database', type = str, required = True) 927 | phylo_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 928 | phylo_parser.add_argument('-o', '--output', help = 'filename for output table', dest = 'output', type = str, required = True) 929 | 930 | args = parser.parse_args() 931 | args.func(args) 932 | 933 | if __name__ == '__main__': 934 | main() -------------------------------------------------------------------------------- /function/older_versions/reference_database_creator_v2.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | ## import modules 4 | import argparse 5 | from Bio import Entrez 6 | import time 7 | from urllib.error import HTTPError 8 | import http.client 9 | http.client.HTTPConnection._http_vsn = 10 10 | http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0' 11 | import subprocess as sp 12 | import shutil 13 | import re 14 | import pandas as pd 15 | from tqdm import tqdm 16 | from Bio.Seq import Seq 17 | from Bio import SeqIO 18 | import os 19 | import matplotlib 20 | import matplotlib.pyplot as plt 21 | from Bio import AlignIO 22 | from Bio import Phylo 23 | from Bio.Align.Applications import MuscleCommandline 24 | from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceMatrix 25 | from Bio.Phylo.TreeConstruction import DistanceTreeConstructor 26 | 27 | 28 | ## function: download sequencing data from NCBI 29 | def ncbi_download(args): 30 | DB = args.database 31 | QUERY = args.query 32 | OUTPUT = args.output_filename 33 | EMAIL = args.email 34 | 35 | Entrez.email = EMAIL 36 | print('\nlooking up the number of sequences that match the query\n') 37 | first_handle = Entrez.esearch(db=DB, term=QUERY, rettype='fasta') 38 | first_record = Entrez.read(first_handle) 39 | first_handle.close() 40 | count = int(first_record['Count']) 41 | 42 | second_handle = Entrez.esearch(db=DB, term=QUERY, retmax=count, rettype='fasta', usehistory = 'y') 43 | second_record = Entrez.read(second_handle) 44 | second_handle.close() 45 | 46 | id_list = second_record['IdList'] 47 | count = int(second_record['Count']) 48 | assert(count == len(id_list)) 49 | webenv = second_record['WebEnv'] 50 | query_key = second_record['QueryKey'] 51 | 52 | print('found {} matching sequences'.format(second_record['Count'])) 53 | print('\nstarting the download\n') 54 | 55 | batch_size = 5000 56 | out_handle = open(OUTPUT, 'w') 57 | for start in tqdm(range(0, count, batch_size)): 58 | attempt = 1 59 | success = False 60 | while attempt <= 3 and not success: 61 | attempt += 1 62 | try: 63 | fetch_handle = Entrez.efetch(db=DB, rettype='fasta', 64 | retstart=start, retmax=batch_size, 65 | webenv=webenv, query_key=query_key) 66 | success = True 67 | except HTTPError as err: 68 | if 500 <= err.code <= 599: 69 | print(f"Received error from server {err}")
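# transient NCBI server errors (HTTP 5xx) are retried up to three times, pausing 15 s between attempts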
| print(f"Received error from server {err}") 70 | print("Attempt {attempt} of 3") 71 | time.sleep(15) 72 | else: 73 | raise 74 | data = fetch_handle.read() 75 | fetch_handle.close() 76 | out_handle.write(data) 77 | out_handle.close() 78 | 79 | 80 | ## function: in silico PCR 81 | def in_silico_pcr(args): 82 | ## user input 83 | FWD = args.fwd 84 | REV = args.rev 85 | ASSAY = args.assay 86 | INPUT = args.input 87 | 88 | ## reverse complement reverse primer sequence 89 | REV_DNA = Seq(REV) 90 | REV_CORRECT = str(REV_DNA.reverse_complement()) 91 | 92 | ## setting variable names using the info from user input 93 | TRIMMED_INIT = 'init_trimmed_' + ASSAY + '_' + INPUT 94 | UNTRIMMED_INIT = 'init_untrimmed_' + ASSAY + '_' + INPUT 95 | REVCOMP_UNTRIMMED_INIT = 'revcomp_' + UNTRIMMED_INIT 96 | TRIMMED_REVCOMP = 'revcomp_' + TRIMMED_INIT 97 | UNTRIMMED_REVCOMP = 'untrimmed_' + REVCOMP_UNTRIMMED_INIT 98 | FINAL_TRIMMED = 'final_trimmed_' + ASSAY + '_' + INPUT 99 | 100 | OVERLAP = str(min([len(FWD), len(REV_CORRECT)])) 101 | #ERROR = str(round(min([3/len(FWD), 3/len(REV_CORRECT)]), 2)) 102 | #print(ERROR) 103 | ERROR = str(4.5) 104 | ADAPTER = FWD + '...' + REV_CORRECT 105 | 106 | ## run cutadapt on downloaded fasta file 107 | count_init = len(list(SeqIO.parse(INPUT, 'fasta'))) 108 | print('\nrunning in silico PCR on fasta file containing {} sequences'.format(count_init)) 109 | #cmnd_cutadapt_1 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_INIT, INPUT, '--untrimmed-output', UNTRIMMED_INIT, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 110 | cmnd_cutadapt_1 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_INIT, INPUT, '--untrimmed-output', UNTRIMMED_INIT, '--no-indels', '-e', ERROR, '--overlap', OVERLAP] 111 | sp.call(cmnd_cutadapt_1) 112 | count_trimmed_init = len(list(SeqIO.parse(TRIMMED_INIT, 'fasta'))) 113 | print('\nfound primers in {} sequences'.format(count_trimmed_init)) 114 | 115 | ## run vsearch to reverse complement untrimmed sequences 116 | count_untrimmed_init = len(list(SeqIO.parse(UNTRIMMED_INIT, 'fasta'))) 117 | print('\nreverse complementing {} untrimmed sequences'.format(count_untrimmed_init)) 118 | cmnd_vsearch_revcomp = ['vsearch', '--fastx_revcomp', UNTRIMMED_INIT, '--fastaout', REVCOMP_UNTRIMMED_INIT, '--quiet'] 119 | sp.call(cmnd_vsearch_revcomp) 120 | 121 | ## run cutadapt on reverse complemented untrimmed sequences 122 | print('\nrunning in silico PCR on {} reverse complemented untrimmed sequences'.format(count_untrimmed_init)) 123 | cmnd_cutadapt_2 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_REVCOMP, REVCOMP_UNTRIMMED_INIT, '--untrimmed-output', UNTRIMMED_REVCOMP, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 124 | sp.call(cmnd_cutadapt_2) 125 | count_trimmed_second = len(list(SeqIO.parse(TRIMMED_REVCOMP, 'fasta'))) 126 | print('\nfound primers in {} sequences'.format(count_trimmed_second)) 127 | 128 | ## concatenate both trimmed files 129 | with open(FINAL_TRIMMED, 'wb') as wfd: 130 | for f in [TRIMMED_INIT, TRIMMED_REVCOMP]: 131 | with open(f, 'rb') as fd: 132 | shutil.copyfileobj(fd, wfd) 133 | 134 | ## remove intermediary files 135 | files = [TRIMMED_INIT, UNTRIMMED_INIT, REVCOMP_UNTRIMMED_INIT, TRIMMED_REVCOMP, UNTRIMMED_REVCOMP] 136 | 137 | for file in files: 138 | os.remove(file) 139 | 140 | 141 | ## function: creating reference database with taxonomy 142 | def ref_database(args): 143 | INPUT = args.input 144 | OUTPUT = args.output 145 | EMAIL = args.email 146 | 147 | ## retrieve accession numbers from fasta file and store in list 148 | 
Entrez.email = EMAIL 149 | accessions = [] 150 | sequence_number = [] 151 | correct_accessions = [] 152 | with open(INPUT) as myfile: 153 | for line in myfile: 154 | #pattern = re.search(r"^\>(.+?)\.", line) 155 | #print(pattern) 156 | #if pattern: 157 | # found = pattern.group(1) 158 | # accessions.append(found) 159 | if line.startswith('>'): 160 | pattern = line.lstrip('>').split('.')[0] 161 | sequence_number.append(pattern) 162 | if pattern not in accessions: 163 | accessions.append(pattern) 164 | #print(pattern) 165 | 166 | #print(len(accessions)) 167 | 168 | ## remove wrongly formatted lines (not accession numbers) 169 | mistakes = ['@', '#', '$', '%', '&', '(', ')', '!', '<', '?', '|', ',', '.', '+', '=', '`', '~'] 170 | 171 | for item in accessions: 172 | if not any(mistake in item for mistake in mistakes): 173 | correct_accessions.append(item) 174 | 175 | print('\nfound {} accessions in input file'.format(len(sequence_number))) 176 | print('\nfound {} unique accessions in input file'.format(len(accessions))) 177 | if len(accessions) - len(correct_accessions) == 0: 178 | print('\nfound no incorrect formatting in accession numbers') 179 | else: 180 | print('\nremoved {} accessions due to incorrect formatting'.format(len(accessions) - len(correct_accessions))) 181 | 182 | ## find taxids for all correct accession numbers 183 | NCBI_list = [] 184 | batch_size = 5000 185 | accession_taxid = [] 186 | taxids = [] 187 | 188 | print("\ndownloading {} taxonomic ID's from NCBI".format(len(correct_accessions))) 189 | 190 | for start in tqdm(range(0, len(correct_accessions), batch_size)): 191 | group = correct_accessions[start : start + batch_size] 192 | attempt = 1 193 | success = False 194 | while attempt <= 3 and not success: 195 | attempt += 1 196 | try: 197 | handle = Entrez.efetch(db = 'nuccore', id = ",".join(group), retmode = 'xml', rettype = 'fasta') 198 | record = Entrez.read(handle) 199 | NCBI_list.append(record) 200 | success = True 201 | except HTTPError as err: 202 | if 500 <= err.code <= 599: 203 | print(f"Received error from server {err}") 204 | print(f"Attempt {attempt} of 3") 205 | time.sleep(15) 206 | else: 207 | raise 208 | 209 | ## format data into two lists 210 | for record in NCBI_list: 211 | for i in range(len(record)): 212 | acc = record[i]['TSeq_accver'] 213 | taxid = record[i]['TSeq_taxid'] 214 | accession_taxid.append(str(acc) + ' ' + str(taxid)) 215 | taxids.append(str(taxid)) 216 | 217 | uniq_taxid = list(set(taxids)) 218 | print("\nfound {} unique taxonomic ID's".format(len(uniq_taxid))) 219 | 220 | ## retrieve taxonomic lineage for 1000 taxids at a time 221 | lineage_list = [] 222 | lineage_batch = 5000 223 | 224 | print("\ndownloading taxonomic lineage for {} taxonomic ID's".format(len(uniq_taxid))) 225 | 226 | for start in tqdm(range(0, len(uniq_taxid), lineage_batch)): 227 | lineage_group = uniq_taxid[start : start + lineage_batch] 228 | lineage_attempt = 1 229 | lineage_success = False 230 | while lineage_attempt <= 3 and not lineage_success: 231 | lineage_attempt += 1 232 | try: 233 | lineage_search = Entrez.efetch(db = 'taxonomy', retmode = 'xml', id = ','.join(lineage_group)) 234 | lineage_record = Entrez.read(lineage_search) 235 | lineage_list.append(lineage_record) 236 | lineage_success = True 237 | except HTTPError as err: 238 | if 500 <= err.code <= 599: 239 | print(f'Received error from server {err}') 240 | print(f'Attempt {lineage_attempt} of 3') 241 | time.sleep(15) 242 | else: 243 | raise 244 | 245 | ## format downloaded info to pandas dataframe 
containing needed info for taxonomic lineage 246 | lineage_info = [] 247 | 248 | for key in lineage_list: 249 | for i in range(len(key)): 250 | lineage = {d['Rank']:d['ScientificName'] for d in key[i]['LineageEx'] if d['Rank'] in ['superkingdom', 251 | 'phylum', 'class', 'order', 'family', 'genus', 'species']} 252 | lineage['species'] = key[i]['ScientificName'] 253 | lineage['taxid'] = key[i]['TaxId'] 254 | lineage_info.append(lineage) 255 | 256 | tax_list = pd.DataFrame(lineage_info) 257 | 258 | ## combine dataframe with accession list and fasta sequence file 259 | accession_and_taxid = pd.DataFrame(accession_taxid) 260 | accession_and_taxid = accession_and_taxid[0].str.split(' ', expand = True) 261 | accession_and_taxid['accession'] = accession_and_taxid[0].str.split('.').str[0] 262 | accession_and_taxid.columns = ['acc_name', 'taxid', 'accession'] 263 | 264 | sequence = pd.DataFrame(pd.read_csv(INPUT, sep = '\t', header = None).values.reshape(-1,2)) 265 | sequence['accession'] = sequence[0].str[1:].str.split('.').str[0] 266 | sequence.columns = ['name', 'sequence', 'accession'] 267 | 268 | accession_and_taxid = accession_and_taxid.astype('str') 269 | tax_list = tax_list.astype('str') 270 | sequence = sequence.astype('str') 271 | 272 | df = accession_and_taxid.merge(tax_list, how = 'left', on = 'taxid') 273 | df = df.merge(sequence, on = 'accession') 274 | 275 | ## clean up dataframe 276 | 277 | ## format the dataframe to final output 278 | df['species'] = df['species'].str.replace(' ', '_') 279 | df['sintax'] = '>' + df['accession'] + ';tax=d:' + df['superkingdom'] + ',p:' + df['phylum'] + ',c:' + df['class'] + ',o:' + df['order'] + ',f:' + df['family'] + ',g:' + df['genus'] + ',s:' + df['species'] 280 | datafr = df[['sintax', 'sequence']] 281 | datafr.to_csv(OUTPUT, index = None, header = None, sep = '\n') 282 | 283 | 284 | ## function: dereplicating the database 285 | def dereplicate(args): 286 | INPUT = args.input 287 | OUTPUT = args.output 288 | 289 | ## subfunctions to be called 290 | def fasta_to_dict_wDesc(fasta_file): 291 | seq_dict = {} 292 | for record in SeqIO.parse(fasta_file, 'fasta'): 293 | record.description = record.description.replace(' ', '_') 294 | record.id = record.description 295 | rec_id = record.id 296 | rec_desc = record.description 297 | rec_seq = str(record.seq) 298 | seq_dict.setdefault(rec_id, {})['sequence'] = rec_seq 299 | seq_dict.setdefault(rec_id, {})['description'] = rec_desc 300 | return seq_dict 301 | 302 | def derep(seqdict): 303 | rep_dict = {} 304 | derep_dict = {} 305 | for k,v in seqdict.items(): 306 | rep_dict.setdefault(v, []).append(k) 307 | for key, value in rep_dict.items(): 308 | numreads = len(value) 309 | newname = value[0] 310 | derep_dict[newname] = {'seq': key, 'size': numreads, 'readlist': value} 311 | return derep_dict 312 | 313 | def derep_to_seq(derep_dict, size = 'no'): 314 | new_dict = {} 315 | read_dict = {} 316 | for k,v in derep_dict.items(): 317 | data = v 318 | if size == 'no': 319 | base_id = k 320 | else: 321 | base_id = k + ';size='+str(data['size']) 322 | read_dict[base_id] = data['readlist'] 323 | new_dict[base_id] = data['seq'] 324 | return (new_dict, read_dict) 325 | 326 | ## split sequence file into two dictionaries and define which species need dereplication 327 | seq_file = INPUT 328 | seqs = fasta_to_dict_wDesc(seq_file) 329 | 330 | print('\nfound {} sequences in input file'.format(len(seqs))) 331 | 332 | seq_just_id = {} 333 | taxonly = {} 334 | for k,v in seqs.items(): 335 | parts = 
v['description'].split(';tax=') 336 | seq_id = parts[0] 337 | tax = parts[1] 338 | seq_just_id[seq_id] = v['sequence'] 339 | taxonly.setdefault(tax, []).append(seq_id) 340 | 341 | print('\ndatabase is comprised of {} unique taxa'.format(len(taxonly))) 342 | 343 | need_derep = [] 344 | singletons = {} 345 | for k,v in taxonly.items(): 346 | if len(v) > 1: 347 | need_derep.append(k) 348 | else: 349 | singletons[v[0]] = k 350 | 351 | print('\n{} taxa only occur once in the database'.format(len(singletons))) 352 | print('\n{} taxa occur multiple times in the database'.format(len(need_derep))) 353 | 354 | tax_index = {} 355 | for k,v in taxonly.items(): 356 | if k in need_derep: 357 | for seqid in v: 358 | tax_index[seqid] = k 359 | 360 | ## dereplicate sequences for species represented more than once in the database 361 | all_dereps = {} 362 | for d in need_derep: 363 | temp_seq_dict = {} 364 | for seqid in taxonly[d]: 365 | temp_seq_dict[seqid] = seq_just_id[seqid] 366 | dr_temp = derep(temp_seq_dict) 367 | derep_seq = derep_to_seq(dr_temp, size = 'no') 368 | derep_seq = derep_seq[0] 369 | for k,v in derep_seq.items(): 370 | new_id = k+';tax='+tax_index[k] 371 | all_dereps[new_id] = v 372 | 373 | ## combine species present only once in the database with the dereplicated dataset 374 | all_new_seqs = {} 375 | for k,v in singletons.items(): 376 | new_id = k + ';tax=' + v 377 | seq = seq_just_id[k] 378 | all_new_seqs[new_id] = seq 379 | for key, value in all_dereps.items(): 380 | all_new_seqs[key] = value 381 | 382 | print('\n{} sequences left after dereplication\n'.format(len(all_new_seqs))) 383 | 384 | ## save the dereplicated database 385 | output = OUTPUT 386 | seqout = open(output, 'w') 387 | for k,v in all_new_seqs.items(): 388 | seqout.write('>' + k + '\n' + v + '\n') 389 | seqout.close() 390 | 391 | 392 | ## function: phylogenetic tree builder 393 | def phylo(args): 394 | SPECIES = args.species 395 | DATABASE = args.database 396 | EMAIL = args.email 397 | OUTPUT = args.output 398 | 399 | Entrez.email = EMAIL 400 | directory = 'temp' 401 | try: 402 | os.makedirs(directory, exist_ok = True) 403 | except OSError as error: 404 | print("Directory '%s' cannot be created" % directory) 405 | 406 | ## read in the text file with species names 407 | species = [] 408 | with open(SPECIES) as species_list: 409 | for spec in species_list: 410 | spec = spec.rstrip('\n') 411 | species.append(spec) 412 | print('\nfound ' + str(len(species)) + ' species of interest: ' + str(species) + '\n') 413 | 414 | ## retrieve the lineage information for each species 415 | ## first: uniq ID from species name 416 | ## second: tax ID from uniq ID 417 | ## third: taxonomic information from tax ID 418 | ## fourth: format similar to database 419 | print('retrieving the taxonomic information from NCBI for ' + str(len(species)) + ' species of interest\n') 420 | uid = [] 421 | for item in species: 422 | handle = Entrez.esearch(db = 'nucleotide', term = item, retmode = 'xml', rettype = 'fasta') 423 | record = Entrez.read(handle) 424 | uid.append(record['IdList'][0]) 425 | 426 | accession_taxid = [] 427 | taxids = [] 428 | for id in uid: 429 | handle = Entrez.efetch(db = 'nuccore', id = id, retmode = 'xml', rettype = 'fasta') 430 | record = Entrez.read(handle) 431 | acc = record[0]['TSeq_accver'] 432 | taxid = record[0]['TSeq_taxid'] 433 | accession_taxid.append(str(acc) + ' ' + str(taxid)) 434 | taxids.append(str(taxid)) 435 | 436 | lineage_list = [] 437 | for taxid in taxids: 438 | lineage_search = Entrez.efetch(db = 
'taxonomy', retmode = 'xml', id = taxid) 439 | lineage_record = Entrez.read(lineage_search) 440 | lineage_list.append(lineage_record) 441 | 442 | lineage_info = [] 443 | for key in lineage_list: 444 | lineage = {d['Rank']:d['ScientificName'] for d in key[0]['LineageEx'] if d['Rank'] in ['superkingdom', 'phylum', 'class', 445 | 'order', 'family', 'genus', 'species']} 446 | lineage['species'] = key[0]['ScientificName'] 447 | lineage['taxid'] = key[0]['TaxId'] 448 | lineage_info.append(lineage) 449 | df = pd.DataFrame(lineage_info) 450 | df['species'] = df['species'].str.replace(' ', '_') 451 | df['sintax'] = 'd:' + df['superkingdom'] + ',p:' + df['phylum'] + ',c:' + df['class'] + ',o:' + df['order'] + ',f:' + df['family'] + ',g:' + df['genus'] + ',s:' + df['species'] 452 | datafr = df['sintax'] 453 | species_interest = datafr.values.tolist() 454 | 455 | ## extract all entries from the database that share a family status with the species of interest 456 | for record in SeqIO.parse(DATABASE, 'fasta'): 457 | family_rec = record.id.split(',')[4] 458 | genus_rec = record.id.split(',')[5] 459 | species_rec = record.id.split(',')[6] 460 | for species in species_interest: 461 | family_int = species.split(',')[4] 462 | genus_int = species.split(',')[5] 463 | species_int = species.split(',')[6] 464 | spec_int = species.split(',')[6].split(':')[1] 465 | if family_int == family_rec: 466 | with open(f'{directory}/{spec_int}_family.fasta', 'a') as f: 467 | SeqIO.write(record, f, 'fasta') 468 | if genus_int == genus_rec: 469 | with open(f'{directory}/{spec_int}_genus.fasta', 'a') as f: 470 | SeqIO.write(record, f, 'fasta') 471 | if species_int == species_rec: 472 | with open(f'{directory}/{spec_int}_species.fasta', 'a') as f: 473 | SeqIO.write(record, f, 'fasta') 474 | 475 | ## extract information for data table from newly generated files 476 | newdict = {} 477 | for species in species_interest: 478 | spec_int = species.split(',')[6].split(':')[1] 479 | try: 480 | spec_number = list(SeqIO.parse(f'{directory}/{spec_int}_species.fasta', 'fasta')) 481 | spec_num = len(spec_number) 482 | except: 483 | spec_num = 0 484 | try: 485 | gen_number = list(SeqIO.parse(f'{directory}/{spec_int}_genus.fasta', 'fasta')) 486 | gen_num = len(gen_number) 487 | gen_list = [] 488 | for record in gen_number: 489 | gen = record.id.split(',')[6].split(':')[1] 490 | if gen not in gen_list: 491 | gen_list.append(gen) 492 | except: 493 | gen_num = 0 494 | gen_list = ['NA'] 495 | try: 496 | fam_number = list(SeqIO.parse(f'{directory}/{spec_int}_family.fasta', 'fasta')) 497 | fam_num = len(fam_number) 498 | fam_list = [] 499 | for record in fam_number: 500 | fam = record.id.split(',')[6].split(':')[1] 501 | if fam not in fam_list: 502 | fam_list.append(fam) 503 | except: 504 | fam_num = 0 505 | fam_list = ['NA'] 506 | newdict[spec_int] = {'species': spec_int, 'species_occur': spec_num, 'species_gen': gen_list, 'gen_entries': gen_num, 'species_fam': fam_list, 'fam_entries': fam_num} 507 | 508 | ## print information on which species are present in the database 509 | for species in species_interest: 510 | spec_int = species.split(',')[6].split(':')[1] 511 | if newdict[spec_int]['species_occur'] == 0: 512 | print(str(newdict[spec_int]['species']) + ': not present in the reference database\n') 513 | else: 514 | print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['species_occur']) + ' entries in the database\n') 515 | 516 | ## output data table on species of interest 517 | df = pd.DataFrame.from_dict(newdict, orient = 
520 | 
521 |     ## generate phylogenetic trees for every species of interest based on the number of entries in genus and family
522 |     ## first: check the number of entries in the if statement
523 |     ## second: shorten the headers of the sequences in the file, so that they can be printed on the figure
524 |     ## third: run muscle to generate the alignment
525 |     ## fourth: calculate distances from the alignment
526 |     ## fifth: generate the tree figure
527 |     for species in species_interest:
528 |         spec_int = species.split(',')[6].split(':')[1]
529 |         if newdict[spec_int]['fam_entries'] > 50:
530 |             print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['fam_entries']) + ' family entries, too many to plot clearly. Generating phylogenetic tree at genus level with ' + str(newdict[spec_int]['gen_entries']) + ' entries\n')
531 | 
532 |             select = []
533 |             for record in SeqIO.parse(f'{directory}/{spec_int}_genus.fasta', 'fasta'):
534 |                 record.description = record.description.replace(';', ',')
535 |                 record.id = record.description
536 |                 record.id = record.id.split(',')[0] + ';' + record.id.split(',')[7].split(':')[1]
537 |                 record.description = record.id
538 |                 select.append(record)
539 |             handle = open(f'{directory}/{spec_int}_genus_align.fasta', 'w')
540 |             SeqIO.write(select, handle, 'fasta')
541 |             handle.close()
542 | 
543 |             muscle_cline = MuscleCommandline(input = f'{directory}/{spec_int}_genus_align.fasta',
544 |                                              out = f'{directory}/{spec_int}_genus_align.clw',
545 |                                              diags = True,
546 |                                              maxiters = 1,
547 |                                              log = f'{directory}/{spec_int}_genus_align_log.txt',
548 |                                              clw = True)
549 |             muscle_cline()
550 | 
551 |             with open(f'{directory}/{spec_int}_genus_align.clw', 'r') as aln:
552 |                 alignment = AlignIO.read(aln, 'clustal')
553 |             calculator = DistanceCalculator('identity')
554 |             distance_matrix = calculator.get_distance(alignment)  # not used further; build_tree recomputes distances via the calculator
555 |             constructor = DistanceTreeConstructor(calculator, 'nj')
556 | 
557 |             tree = constructor.build_tree(alignment)
558 |             fig = plt.figure(figsize = (25, 15), dpi = 100)
559 |             matplotlib.rc('font', size=12)
560 |             matplotlib.rc('xtick', labelsize=10)
561 |             matplotlib.rc('ytick', labelsize=10)
562 |             axes = fig.add_subplot(1, 1, 1)
563 |             Phylo.draw(tree, axes=axes, do_show = False)
564 |             fig.savefig(f'{spec_int}_genus_align_tree.pdf')  # saved in the working directory, not in temp/
565 | 
566 |         else:
567 |             print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['fam_entries']) + ' family entries. Generating phylogenetic tree at family level\n')
568 | 
569 |             select = []
570 |             for record in SeqIO.parse(f'{directory}/{spec_int}_family.fasta', 'fasta'):
571 |                 record.description = record.description.replace(';', ',')
572 |                 record.id = record.description
573 |                 record.id = record.id.split(',')[0] + ';' + record.id.split(',')[7].split(':')[1]
574 |                 record.description = record.id
575 |                 select.append(record)
576 |             handle = open(f'{directory}/{spec_int}_family_align.fasta', 'w')
577 |             SeqIO.write(select, handle, 'fasta')
578 |             handle.close()
579 | 
580 |             muscle_cline = MuscleCommandline(input = f'{directory}/{spec_int}_family_align.fasta',
581 |                                              out = f'{directory}/{spec_int}_family_align.clw',
582 |                                              diags = True,
583 |                                              maxiters = 1,
584 |                                              log = f'{directory}/{spec_int}_family_align_log.txt',
585 |                                              clw = True)
586 |             muscle_cline()
587 | 
588 |             with open(f'{directory}/{spec_int}_family_align.clw', 'r') as aln:
589 |                 alignment = AlignIO.read(aln, 'clustal')
590 |             calculator = DistanceCalculator('identity')
591 |             distance_matrix = calculator.get_distance(alignment)  # not used further; build_tree recomputes distances via the calculator
592 |             constructor = DistanceTreeConstructor(calculator, 'nj')
593 | 
594 |             tree = constructor.build_tree(alignment)
595 |             fig = plt.figure(figsize = (25, 15), dpi = 100)
596 |             matplotlib.rc('font', size=12)
597 |             matplotlib.rc('xtick', labelsize=10)
598 |             matplotlib.rc('ytick', labelsize=10)
599 |             axes = fig.add_subplot(1, 1, 1)
600 |             Phylo.draw(tree, axes=axes, do_show = False)
601 |             fig.savefig(f'{spec_int}_family_align_tree.pdf')  # saved in the working directory, not in temp/
602 | 
603 | 
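## NOTE: MuscleCommandline is Biopython's wrapper around a locally installed MUSCLE
## executable, and the diags/maxiters/clw options used above follow MUSCLE v3 syntax;
## a MUSCLE v3 binary on the PATH is therefore assumed (MUSCLE v5 introduced a
## different command line and would not accept these flags).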
604 | ## function: argparse parser
605 | def main():
606 |     parser = argparse.ArgumentParser(description = 'creating a curated reference database')
607 |     subparser = parser.add_subparsers()
608 | 
609 |     ncbi_download_parser = subparser.add_parser('ncbi_download', description = 'downloading a fasta sequence file from NCBI based on a text query')
610 |     ncbi_download_parser.set_defaults(func = ncbi_download)
611 |     ncbi_download_parser.add_argument('--database', help = 'database used to download sequences. Example: "nucleotide"', dest = 'database', type = str, required = True)
612 |     ncbi_download_parser.add_argument('--query', help = 'query search to limit the portion of the database to be downloaded. Example: "18S[All Fields] NOT "uncultured"[All Fields] AND is_nuccore[filter] AND ("1"[SLEN] : "50000"[SLEN])"', dest = 'query', type = str, required = True)
613 |     ncbi_download_parser.add_argument('--output', help = 'output filename. Example: "18S_fasta_NCBI_trial.fasta"', dest = 'output_filename', type = str, required = True)
614 |     ncbi_download_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True)
615 | 
616 |     in_silico_pcr_parser = subparser.add_parser('in_silico_pcr', description = 'curating the downloaded reference sequences with an in silico PCR')
617 |     in_silico_pcr_parser.set_defaults(func = in_silico_pcr)
618 |     in_silico_pcr_parser.add_argument('--fwd', help = "forward primer sequence in 5'-3' direction", dest = 'fwd', type = str, required = True)
619 |     in_silico_pcr_parser.add_argument('--rev', help = "reverse primer sequence in 5'-3' direction", dest = 'rev', type = str, required = True)
620 |     in_silico_pcr_parser.add_argument('--assay', help = 'name of the primer assay', dest = 'assay', type = str, required = True)
621 |     in_silico_pcr_parser.add_argument('--input', help = 'input filename', dest = 'input', type = str, required = True)
622 | 
623 |     ref_database_parser = subparser.add_parser('ref_database', description = 'creating the reference database with taxonomic information')
624 |     ref_database_parser.set_defaults(func = ref_database)
625 |     ref_database_parser.add_argument('--input', help = 'input file containing the curated fasta sequences after in silico PCR', dest = 'input', type = str, required = True)
626 |     ref_database_parser.add_argument('--output', help = 'curated reference database output file', dest = 'output', type = str, required = True)
627 |     ref_database_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True)
628 | 
629 |     dereplication_parser = subparser.add_parser('dereplicate', description = 'dereplicating the database')
630 |     dereplication_parser.set_defaults(func = dereplicate)
631 |     dereplication_parser.add_argument('--input', help = 'filename of the curated reference database', dest = 'input', type = str, required = True)
632 |     dereplication_parser.add_argument('--output', help = 'filename of the dereplicated curated reference database', dest = 'output', type = str, required = True)
633 | 
634 |     phylo_parser = subparser.add_parser('phylo_build', description = 'generating phylogenetic trees for species of interest')
635 |     phylo_parser.set_defaults(func = phylo)
636 |     phylo_parser.add_argument('--species', help = 'text file containing a list of species, one per line', dest = 'species', type = str, required = True)
637 |     phylo_parser.add_argument('--database', help = 'curated reference database', dest = 'database', type = str, required = True)
638 |     phylo_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True)
639 |     phylo_parser.add_argument('--output', help = 'filename for the output table', dest = 'output', type = str, required = True)
640 | 
641 |     args = parser.parse_args()
642 |     args.func(args)
643 | 
644 | if __name__ == '__main__':
645 |     main()
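## Hypothetical example invocations of the subcommands defined above (the script
## filename is assumed for illustration; ncbi_download, in_silico_pcr, ref_database,
## dereplicate, and phylo_build are all called the same way):
##   python reference_database_creator.py dereplicate --input curated_db.fasta --output curated_db_derep.fasta
##   python reference_database_creator.py phylo_build --species species.txt --database curated_db_derep.fasta --email user@example.org --output species_table.tsv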
Example: "18S_fasta_NCBI_trial.fasta"', dest = 'output_filename', type = str, required = True) 614 | ncbi_download_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 615 | 616 | in_silico_pcr_parser = subparser.add_parser('in_silico_pcr', description = 'curating the downloaded reference sequences with an in silico PCR') 617 | in_silico_pcr_parser.set_defaults(func = in_silico_pcr) 618 | in_silico_pcr_parser.add_argument('--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str, required = True) 619 | in_silico_pcr_parser.add_argument('--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str, required = True) 620 | in_silico_pcr_parser.add_argument('--assay', help = 'name of primer assay', dest = 'assay', type = str, required = True) 621 | in_silico_pcr_parser.add_argument('--input', help = 'input filename', dest = 'input', type = str, required = True) 622 | 623 | ref_database_parser = subparser.add_parser('ref_database', description = 'creating the reference database with taxonomic information') 624 | ref_database_parser.set_defaults(func = ref_database) 625 | ref_database_parser.add_argument('--input', help = 'input file containing the curated fasta sequences after in silico PCR', dest = 'input', type = str, required = True) 626 | ref_database_parser.add_argument('--output', help = 'curated reference database output file', dest = 'output', type = str, required = True) 627 | ref_database_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 628 | 629 | dereplication_parser = subparser.add_parser('dereplicate', description = 'dereplicating the database') 630 | dereplication_parser.set_defaults(func = dereplicate) 631 | dereplication_parser.add_argument('--input', help = 'filename of the curated reference database', dest = 'input', type = str, required = True) 632 | dereplication_parser.add_argument('--output', help = 'filename of the dereplicated curated reference database', dest = 'output', type = str, required = True) 633 | 634 | phylo_parser = subparser.add_parser('phylo_build', description = 'generating phylogenetic trees for species of interest') 635 | phylo_parser.set_defaults(func = phylo) 636 | phylo_parser.add_argument('--species', help = 'text file containing list of species separated by newlines', dest = 'species', type = str, required = True) 637 | phylo_parser.add_argument('--database', help = 'curated reference database', dest = 'database', type = str, required = True) 638 | phylo_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 639 | phylo_parser.add_argument('--output', help = 'filename for output table', dest = 'output', type = str, required = True) 640 | 641 | args = parser.parse_args() 642 | args.func(args) 643 | 644 | if __name__ == '__main__': 645 | main() 646 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | setup(name='crabs', 3 | description='CRABS: Creating Reference databases for Amplicon-Based Sequencing', 4 | author='Gert-Jan Jeunen', 5 | author_email='gjeunen@gmail.com', 6 | url='https://github.com/gjeunen/reference_database_creator', 7 | version='1.7.7', 8 | packages=['function'], 9 | scripts=['crabs'] 10 | ) 11 | 
--------------------------------------------------------------------------------