├── .gitignore ├── LICENSE ├── README.md ├── crabs ├── docker_intro └── README.md ├── figures_readme ├── amplicon-length-figure.png ├── amplification-efficiency.png ├── crabs_blasttax.png ├── crabs_completeness.png ├── crabs_cutadapt_error.png ├── crabs_dereplicate.png ├── crabs_download_bold.png ├── crabs_download_mitofish.png ├── crabs_download_ncbi.png ├── crabs_download_ncbi_output.png ├── crabs_download_taxonomy.png ├── crabs_export.png ├── crabs_filter.png ├── crabs_greengenes.png ├── crabs_help.png ├── crabs_import.png ├── crabs_insilico.png ├── crabs_merge.png ├── crabs_midori.png ├── crabs_pga.png ├── crabs_silva.png ├── crabs_subset.png ├── diversity-figure.png ├── phylo_tree.png ├── unite_first.png ├── unite_second.png └── unite_third.png ├── function ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── module_1.cpython-36.pyc │ ├── module_3.cpython-36.pyc │ └── module_5.cpython-36.pyc ├── crabs_functions.py └── older_versions │ ├── crabs_v1.0.0 │ ├── reference_database_creator_v2.1.py │ └── reference_database_creator_v2.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | functions/module_1.py 2 | .DS_Store 3 | *.pyc 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 gjeunen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crabs: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | ################## 4 | # IMPORT MODULES # 5 | ################## 6 | import os, rich, rich.progress, collections 7 | import rich_click as click 8 | from function import __version__ 9 | from function.crabs_functions import (check_params, 10 | check_midori_values, 11 | embl_url, 12 | midori_url, 13 | parse_exclude, 14 | set_output_dir, 15 | download_file, 16 | remove_tar_intermediary, 17 | unzip_with_progress, 18 | download_chunked_file, 19 | gunzip_with_progress, 20 | download_ncbi_seqs, 21 | retrieve_species, 22 | build_query, 23 | ncbi_download_info, 24 | select_function, 25 | names_to_memory, 26 | nodes_to_memory, 27 | accession_to_memory, 28 | generate_lineages, 29 | fill_missing_lineages, 30 | dict_to_output, 31 | merge_uniq_databases, 32 | merge_databases, 33 | check_files, 34 | write_list_to_output, 35 | filter_function, 36 | select_subset, 37 | subset_function, 38 | classifier_format, 39 | idt_text, 40 | blast_no_tax, 41 | blast_tax, 42 | unknown_base_conversion, 43 | rev_comp, 44 | cutadapt, 45 | crabs_to_fasta, 46 | multiple_crabs_to_fasta, 47 | multiple_list_to_temp, 48 | usearch_global, 49 | extract_alignment_results, 50 | write_dict_to_output, 51 | parse_diversity, 52 | horizontal_bar_chart, 53 | parse_length, 54 | line_graph, 55 | calculate_ncbi_species_genera, 56 | calculate_database_species_genera, 57 | completeness_table_output, 58 | parse_phylo_input, 59 | subset_phylo_input, 60 | dict_to_fasta, 61 | align_sequences, 62 | generate_phylo_tree, 63 | amplicon_import, 64 | raw_import, 65 | extract_primer_regions, 66 | deconstruct_primer_regions, 67 | dict_to_array, 68 | efficiency_barplot, 69 | parse_primer, 70 | ) 71 | 72 | ##################### 73 | # CLI CONFIGURATION # 74 | ##################### 75 | # formatting 76 | click.rich_click.USE_RICH_MARKUP = True 77 | click.rich_click.SHOW_METAVARS_COLUMN = False 78 | click.rich_click.APPEND_METAVARS_HELP = True 79 | click.rich_click.HEADER_TEXT = (f"[yellow]/[/][cyan]/[/][yellow]/[/] [bold][link=https://github.com/gjeunen/reference_database_creator]CRABS[/link][/] | v{__version__}") 80 | click.rich_click.FOOTER_TEXT = "See [link=https://github.com/gjeunen/reference_database_creator]https://github.com/gjeunen/reference_database_creator[/] for more details." 
81 | click.rich_click.ERRORS_SUGGESTION = f"This is CRABS [cyan]v{__version__}[/]\nFor more help, run '[yellow]crabs --help[/]' or visit [link=https://github.com/gjeunen/reference_database_creator]https://github.com/gjeunen/reference_database_creator[/]" 82 | click.rich_click.STYLE_ERRORS_SUGGESTION = "" 83 | 84 | # grouping of options 85 | click.rich_click.OPTION_GROUPS = { 86 | "crabs": [ 87 | { 88 | "name": "Download NCBI Taxonomy", 89 | "options": [ 90 | "--download-taxonomy", 91 | "--exclude", 92 | "--output", 93 | ], 94 | #"deduplicate": False 95 | }, 96 | { 97 | "name": "Download BOLD Database", 98 | "options": [ 99 | "--download-bold", 100 | "--taxon", 101 | "--marker", 102 | "--output", 103 | ], 104 | #"deduplicate": False 105 | }, 106 | { 107 | "name": "Download EMBL Database", 108 | "options": [ 109 | "--download-embl", 110 | "--taxon", 111 | "--output", 112 | ], 113 | #"deduplicate": False 114 | }, 115 | { 116 | "name": "Download GreenGenes Database", 117 | "options": [ 118 | "--download-greengenes", 119 | "--output", 120 | ], 121 | #"deduplicate": False 122 | }, 123 | { 124 | "name": "Download MIDORI2 Database", 125 | "options": [ 126 | "--download-midori", 127 | "--gene", 128 | "--gb-number", 129 | "--gb-type", 130 | "--output", 131 | ], 132 | #"deduplicate": False 133 | }, 134 | { 135 | "name": "Download MitoFish Database", 136 | "options": [ 137 | "--download-mitofish", 138 | "--output", 139 | ], 140 | #"deduplicate": False 141 | }, 142 | { 143 | "name": "Download NCBI Database", 144 | "options": [ 145 | "--download-ncbi", 146 | "--email", 147 | "--query", 148 | "--database", 149 | "--batchsize", 150 | "--species", 151 | "--output", 152 | ], 153 | #"deduplicate": False 154 | }, 155 | { 156 | "name": "Download SILVA Database", 157 | "options": [ 158 | "--download-silva", 159 | "--gene", 160 | "--db-type", 161 | "--db-version", 162 | "--output", 163 | ], 164 | #"deduplicate": False 165 | }, 166 | { 167 | "name": "Import sequences into CRABS format", 168 | "options": [ 169 | "--import", 170 | "--import-format", 171 | "--names", 172 | "--nodes", 173 | "--acc2tax", 174 | "--input", 175 | "--output", 176 | "--ranks", 177 | ], 178 | #"deduplicate": False 179 | }, 180 | { 181 | "name": "Merge CRABS databases into one file", 182 | "options": [ 183 | "--merge", 184 | "--input", 185 | "--output", 186 | "--uniq", 187 | ], 188 | #"deduplicate": False 189 | }, 190 | { 191 | "name": "Extract amplicons through in silico PCR", 192 | "options": [ 193 | "--in-silico-pcr", 194 | "--input", 195 | "--output", 196 | "--forward", 197 | "--reverse", 198 | "--mismatch", 199 | "--threads", 200 | "--untrimmed", 201 | ], 202 | #"deduplicate": False 203 | }, 204 | { 205 | "name": "Retrieve amplicons without primer-binding regions", 206 | "options": [ 207 | "--pairwise-global-alignment", 208 | "--input", 209 | "--output", 210 | "--amplicons", 211 | "--forward", 212 | "--reverse", 213 | "--size-select", 214 | "--threads", 215 | "--percent-identity", 216 | "--coverage", 217 | "--all-start-positions", 218 | ], 219 | #"deduplicate": False 220 | }, 221 | { 222 | "name": "Dereplicate CRABS database", 223 | "options": [ 224 | "--dereplicate", 225 | "--input", 226 | "--output", 227 | "--dereplication-method", 228 | ], 229 | #"deduplicate": False 230 | }, 231 | { 232 | "name": "Filter CRABS database", 233 | "options": [ 234 | "--filter", 235 | "--input", 236 | "--output", 237 | "--minimum-length", 238 | "--maximum-length", 239 | "--maximum-n", 240 | "--environmental", 241 | "--no-species-id", 242 | "--rank-na", 
        ],
        #"deduplicate": False
    },
    {
        "name": "Subset CRABS database on taxonomic ID",
        "options": [
            "--subset",
            "--input",
            "--output",
            "--include",
            "--exclude",
        ],
        #"deduplicate": False
    },
    {
        "name": "Figure: diversity contained within database",
        "options": [
            "--diversity-figure",
            "--input",
            "--output",
            "--tax-level",
        ],
        #"deduplicate": False
    },
    {
        "name": "Figure: amplicon length distribution",
        "options": [
            "--amplicon-length-figure",
            "--input",
            "--output",
            "--tax-level",
        ],
        #"deduplicate": False
    },
    {
        "name": "Figure: phylogenetic tree",
        "options": [
            "--phylogenetic-tree",
            "--input",
            "--output",
            "--tax-level",
            "--species",
        ],
        #"deduplicate": False
    },
    {
        "name": "Figure: amplification efficiency",
        "options": [
            "--amplification-efficiency-figure",
            "--input",
            "--amplicons",
            "--forward",
            "--reverse",
            "--output",
            "--tax-group",
        ],
        #"deduplicate": False
    },
    {
        "name": "Table: database completeness for target taxonomic group",
        "options": [
            "--completeness-table",
            "--input",
            "--output",
            "--names",
            "--nodes",
            "--species",
        ],
        #"deduplicate": False
    },
    {
        "name": "Export CRABS database to taxonomic classifier format",
        "options": [
            "--export",
            "--input",
            "--output",
            "--export-format",
        ],
        #"deduplicate": False
    },
    ],
}

# link user-input to options
@click.command(context_settings=dict(help_option_names=["-h", "--help"]))

# CRABS functions
@click.option("--download-taxonomy", "download_taxonomy_", is_flag = True, help = "Function to download NCBI taxonomy")
@click.option("--download-bold", "download_bold_", is_flag = True, help = "Function to download BOLD database")
@click.option("--download-embl", "download_embl_", is_flag = True, help = "Function to download EMBL database")
@click.option("--download-greengenes", "download_greengenes_", is_flag = True, help = "Function to download GreenGenes database")
@click.option("--download-midori", "download_midori_", is_flag = True, help = "Function to download MIDORI2 database")
@click.option("--download-mitofish", "download_mitofish_", is_flag = True, help = "Function to download MitoFish database")
@click.option("--download-ncbi", "download_ncbi_", is_flag = True, help = "Function to download NCBI database")
@click.option("--download-silva", "download_silva_", is_flag = True, help = "Function to download SILVA database")
@click.option("--import", "import_", is_flag = True, help = "Function to import sequences into CRABS format")
@click.option("--merge", "merge_", is_flag = True, help = "Function to merge CRABS databases into a single file")
@click.option("--in-silico-pcr", "in_silico_pcr_", is_flag = True, help = "Function to extract amplicons through in silico PCR")
@click.option("--pairwise-global-alignment", "pairwise_global_alignment_", is_flag = True, help = "Function to retrieve amplicons without primer-binding regions")
@click.option("--dereplicate", "dereplicate_", is_flag = True, help = "Function to dereplicate a CRABS database")
@click.option("--filter", "filter_", is_flag = True, help = "Function to filter a CRABS database")
@click.option("--subset", "subset_", is_flag = True, help = "Function to subset a CRABS database")
@click.option("--diversity-figure", "diversity_figure_", is_flag = True, help = "Function to create a horizontal bar chart with included diversity")
@click.option("--amplicon-length-figure", "amplicon_length_figure_", is_flag = True, help = "Function to create a line chart depicting amplicon distributions")
@click.option("--phylogenetic-tree", "phylogenetic_tree_", is_flag = True, help = "Function to create a phylogenetic tree with barcodes for target species list")
@click.option("--amplification-efficiency-figure", "amplification_efficiency_figure_", is_flag = True, help = "Function to create a bar graph displaying mismatches in the primer-binding region")
@click.option("--completeness-table", "completeness_table_", is_flag = True, help = "Function to create a spreadsheet containing barcode availability for taxonomic groups")
@click.option("--export", "export_", is_flag = True, help = "Function to export a CRABS database")

# CRABS parameters
@click.option("--output", "output_", help = "output directory or filename")
@click.option("--exclude", "exclude_", help = "stop the download of 'acc2taxid' or 'taxdump'")
@click.option("--taxon", "taxon_", help = "taxonomic group to download")
@click.option("--gene", "gene_", help = "gene to download")
@click.option("--gb-number", "gb_number_", help = "database version to download")
@click.option("--gb-type", "gb_type_", type = str, help = "database type to download")
@click.option("--marker", "marker_", help = "genetic marker to download")
@click.option("--email", "email_", help = "email address to connect to NCBI server")
@click.option("--query", "query_", help = "query identifying what to download from NCBI")
@click.option("--database", "database_", help = "the database from which NCBI sequences are downloaded")
@click.option("--batchsize", "batchsize_", default = 5000, type = int, help = "sequences to download from NCBI per chunk (default = 5,000)")
@click.option("--species", "species_", help = "species of interest list")
@click.option("--db-type", "db_type_", help = "database type to download")
@click.option("--db-version", "db_version_", help = "database version to download")
@click.option("--import-format", "import_format_", help = "format of the sequences to import")
@click.option("--names", "names_", help = "NCBI taxonomy 'names.dmp' file")
@click.option("--nodes", "nodes_", help = "NCBI taxonomy 'nodes.dmp' file")
@click.option("--acc2tax", "acc2tax_", help = "NCBI taxonomy 'nucl_gb.accession2taxid' file")
@click.option("--input", "input_", help = "input filename")
@click.option("--ranks", "ranks_", default = 'superkingdom;phylum;class;order;family;genus;species', help = "taxonomic ranks to be included in the taxonomic lineage")
@click.option("--uniq", "uniq_", is_flag = True, help = "keep only unique accession numbers")
@click.option("--dereplication-method", "dereplication_method_", default = 'unique_species', help = 'dereplication method: "strict", "single_species", and "unique_species" (default)')
@click.option("--minimum-length", "minimum_length_", help = "minimum sequence length for amplicon to be retained in the database", type = int)
@click.option("--maximum-length", "maximum_length_", help = "maximum sequence length for amplicon to be retained in the database", type = int)
@click.option("--maximum-n", "maximum_n_", help = "discard amplicons with N or more ambiguous bases", type = int)
@click.option("--environmental", "environmental_", is_flag = True, help = "discard environmental sequences from the database")
@click.option("--no-species-id", "no_species_id_", is_flag = True, help = "discard sequences for which no species name is available")
@click.option("--rank-na", "rank_na_", help = "discard sequences with N or more unspecified taxonomic levels", type = int)
@click.option("--include", "include_", help = "string or file containing taxa to include")
@click.option("--exclude", "exclude_", help = "string or file containing taxa to exclude")
@click.option("--export-format", "export_format_", help = 'export format: "sintax", "rdp", "qiime-fasta", "qiime-text", "dada2-species", "dada2-taxonomy", "idt-fasta", "idt-text", "blast-notax", "blast-tax"')
@click.option("--forward", "forward_", help = "forward primer sequence in 5' -> 3' direction")
@click.option("--reverse", "reverse_", help = "reverse primer sequence in 5' -> 3' direction")
@click.option("--mismatch", "mismatch_", type = float, default = 4.5, help = "number of mismatches allowed in the primer-binding site (default: 4.5)")
@click.option("--threads", "threads_", type = int, default = 0, help = "number of threads used to compute the in silico PCR (default: autodetection)")
@click.option("--untrimmed", "untrimmed_", help = "file name for untrimmed sequences")
@click.option("--amplicons", "amplicons_", help = "file name for the amplicons retrieved during in silico PCR")
@click.option("--size-select", "size_select_", help = "exclude reads longer than N from the analysis")
@click.option("--percent-identity", "percent_identity_", help = "minimum percent identity threshold for the alignment to pass (0.0 - 1.0)")
@click.option("--coverage", "coverage_", help = "minimum coverage threshold for the alignment to pass (0 - 100)")
@click.option("--all-start-positions", "all_start_positions_", is_flag = True, help = "do not restrict alignment start and end to be within the primer-binding region length")
@click.option("--tax-level", "tax_level_", type = int, help = "taxonomic level to be used as groups for horizontal bar chart")
@click.option("--tax-group", "tax_group_", help = "taxonomic group of interest to be included in the analysis")

#################
# MAIN FUNCTION #
#################
def crabs(**kwargs):
    """CRABS is an open-source software program that enables scientists to build custom local reference databases for improved taxonomy assignment of metabarcoding data.

    CRABS is split up into various functions and steps to accomplish this task, including:

    (1) download data from online repositories,

    (2) import downloaded data into CRABS format,

    (3) extract amplicons from imported data,

    (4) retrieve amplicons without primer-binding regions,

    (5) curate and subset the local database,

    (6) export the local database in various taxonomic classifier formats, and

    (7) generate basic visualisations to explore the local reference database.
418 | 419 | 420 | 421 | A basic example to run CRABS (download NCBI taxonomy information): 422 | 423 | [blue bold]crabs --download-taxonomy --exclude 'acc2taxid'[/] 424 | """ 425 | 426 | # access all functions from kwargs 427 | download_taxonomy_ = kwargs.get("download_taxonomy_") 428 | download_bold_ = kwargs.get("download_bold_") 429 | download_embl_ = kwargs.get("download_embl_") 430 | download_greengenes_ = kwargs.get("download_greengenes_") 431 | download_midori_ = kwargs.get("download_midori_") 432 | download_mitofish_ = kwargs.get("download_mitofish_") 433 | download_ncbi_ = kwargs.get("download_ncbi_") 434 | download_silva_ = kwargs.get("download_silva_") 435 | import_ = kwargs.get("import_") 436 | merge_ = kwargs.get("merge_") 437 | in_silico_pcr_ = kwargs.get("in_silico_pcr_") 438 | pairwise_global_alignment_ = kwargs.get("pairwise_global_alignment_") 439 | dereplicate_ = kwargs.get("dereplicate_") 440 | filter_ = kwargs.get("filter_") 441 | subset_ = kwargs.get("subset_") 442 | diversity_figure_ = kwargs.get("diversity_figure_") 443 | amplicon_length_figure_ = kwargs.get("amplicon_length_figure_") 444 | phylogenetic_tree_ = kwargs.get("phylogenetic_tree_") 445 | amplification_efficiency_figure_ = kwargs.get("amplification_efficiency_figure_") 446 | completeness_table_ = kwargs.get("completeness_table_") 447 | export_ = kwargs.get("export_") 448 | 449 | # access all options from kwargs 450 | output_ = kwargs.get("output_") 451 | exclude_ = kwargs.get("exclude_") 452 | taxon_ = kwargs.get("taxon_") 453 | marker_ = kwargs.get("marker_") 454 | gene_ = kwargs.get("gene_") 455 | gb_number_ = kwargs.get("gb_number_") 456 | gb_type_ = kwargs.get("gb_type_") 457 | email_ = kwargs.get("email_") 458 | query_ = kwargs.get("query_") 459 | database_ = kwargs.get("database_") 460 | batchsize_ = kwargs.get("batchsize_") 461 | species_ = kwargs.get("species_") 462 | db_type_ = kwargs.get("db_type_") 463 | db_version_ = kwargs.get("db_version_") 464 | import_format_ = kwargs.get("import_format_") 465 | names_ = kwargs.get("names_") 466 | nodes_ = kwargs.get("nodes_") 467 | acc2tax_ = kwargs.get("acc2tax_") 468 | input_ = kwargs.get("input_") 469 | ranks_ = kwargs.get("ranks_") 470 | uniq_ = kwargs.get("uniq_") 471 | dereplication_method_ = kwargs.get("dereplication_method_") 472 | minimum_length_ = kwargs.get("minimum_length_") 473 | maximum_length_ = kwargs.get("maximum_length_") 474 | maximum_n_ = kwargs.get("maximum_n_") 475 | environmental_ = kwargs.get("environmental_") 476 | no_species_id_ = kwargs.get("no_species_id_") 477 | rank_na_ = kwargs.get("rank_na_") 478 | include_ = kwargs.get("include_") 479 | exclude_ = kwargs.get("exclude_") 480 | export_format_ = kwargs.get("export_format_") 481 | forward_ = kwargs.get("forward_") 482 | reverse_ = kwargs.get("reverse_") 483 | mismatch_ = kwargs.get("mismatch_") 484 | threads_ = kwargs.get("threads_") 485 | untrimmed_ = kwargs.get("untrimmed_") 486 | amplicons_ = kwargs.get("amplicons_") 487 | size_select_ = kwargs.get("size_select_") 488 | percent_identity_ = kwargs.get("percent_identity_") 489 | coverage_ = kwargs.get("coverage_") 490 | include_all_start_positions_ = kwargs.get("all_start_positions_") 491 | tax_level_ = kwargs.get("tax_level_") 492 | tax_group_ = kwargs.get("tax_group_") 493 | 494 | # print starting info to console 495 | console = rich.console.Console(stderr=True, highlight=False) 496 | console.print(f"\n[yellow]/[/][cyan]/[/][yellow]/[/] [bold][link=https://github.com/gjeunen/reference_database_creator]CRABS[/link][/] | 
v{__version__}\n") 497 | columns = [*rich.progress.Progress.get_default_columns(), rich.progress.TimeElapsedColumn()] 498 | 499 | # identify function and execute 500 | ##################### 501 | # DOWNLOAD TAXONOMY # 502 | ##################### 503 | if download_taxonomy_: 504 | # print function to console 505 | console.print(f"[cyan]| Function[/] | Download NCBI taxonomy files") 506 | # check what files to download based on exclude_ 507 | download_dict = parse_exclude(exclude_) 508 | # set output directory 509 | output_directory = set_output_dir(output_) 510 | # iterate over download_dict for the different files 511 | for key, value in download_dict.items(): 512 | # set filename 513 | filename = value.split('/')[-1] 514 | # download file 515 | download_file(console, columns, value, output_directory, filename) 516 | # unzip file 517 | unzip_method = select_function(key) 518 | unzip_method(console, columns, output_directory, filename) 519 | # remove zipped and intermediary files 520 | os.remove(f'{output_directory}{filename}') 521 | remove_tar_intermediary(key, output_directory) 522 | 523 | ################# 524 | # DOWNLOAD BOLD # 525 | ################# 526 | if download_bold_: 527 | # print function to console 528 | console.print(f"[cyan]| Function[/] | Download BOLD database") 529 | # check if all parameters have been provided 530 | check_params(console, {'"--output"': output_, '"--taxon"': taxon_}) 531 | # set url, output directory, and filename 532 | url = 'http://v3.boldsystems.org/index.php/API_Public/sequence?taxon=' + taxon_ 533 | if marker_: 534 | url = url + '&marker=' + marker_ 535 | output_directory = f'{os.path.dirname(output_)}/' 536 | if output_directory == '/': 537 | output_directory = '' 538 | filename = output_.split('/')[-1] 539 | # download the file 540 | download_chunked_file(console, columns, url, output_directory, filename) 541 | 542 | ################# 543 | # DOWNLOAD EMBL # 544 | ################# 545 | if download_embl_: 546 | # print function to console 547 | console.print(f"[cyan]| Function[/] | Download EMBL database") 548 | # check if all parameters have been provided 549 | check_params(console, {'"--output"': output_, '"--taxon"': taxon_}) 550 | # find all matching files to taxon_ and store as urls 551 | urls = embl_url(console, taxon_) 552 | # print number of files to download to console 553 | if len(urls) > 1: 554 | console.print(f"[cyan]| Results[/] | Downloading {len(urls)} files from EMBL") 555 | else: 556 | console.print(f"[cyan]| Results[/] | Downloading {len(urls)} file from EMBL") 557 | # set output_directory, and filename 558 | output_directory = f'{os.path.dirname(output_)}/' 559 | if output_directory == '/': 560 | output_directory = '' 561 | outputfilename = output_.split('/')[-1] 562 | # download the files 563 | matching_files = [] 564 | for url in urls: 565 | zipfilename = url.split('/')[-1] 566 | matching_files.append(zipfilename) 567 | download_file(console, columns, url, output_directory, zipfilename) 568 | # unzip files and remove zipped intermediary files 569 | file_count = 0 570 | for file in matching_files: 571 | file_count += 1 572 | if file_count == 1: 573 | gunzip_with_progress(console, columns, output_directory, file, outputfilename, append = False) 574 | else: 575 | gunzip_with_progress(console, columns, output_directory, file, outputfilename, append = True) 576 | os.remove(f'{output_directory}{file}') 577 | 578 | ####################### 579 | # DOWNLOAD GREENGENES # 580 | ####################### 581 | if download_greengenes_: 582 
| # print function to console 583 | console.print(f"[cyan]| Function[/] | Download GreenGenes database") 584 | # check if all parameters have been provided 585 | check_params(console, {'"--output"': output_}) 586 | # set url, output directory, and filename 587 | urls = ['https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_GREENGENES_gg16S_unaligned.fasta.gz', 588 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_prokMSA_unaligned.fasta.gz', 589 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_HOMD_gg16S_unaligned.fasta.gz', 590 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_NCBI_gg16S_unaligned.fasta.gz', 591 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_RDP_gg16S_unaligned.fasta.gz', 592 | 'https://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/current_SILVA_gg16S_unaligned.fasta.gz'] 593 | output_directory = f'{os.path.dirname(output_)}/' 594 | if output_directory == '/': 595 | output_directory = '' 596 | outputfilename = output_.split('/')[-1] 597 | # download the zip files 598 | matching_files = [] 599 | for url in urls: 600 | zipfilename = url.split('/')[-1] 601 | matching_files.append(zipfilename) 602 | download_file(console, columns, url, output_directory, zipfilename) 603 | # unzip files and remove zipped intermediary files 604 | file_count = 0 605 | for file in matching_files: 606 | file_count += 1 607 | if file_count == 1: 608 | gunzip_with_progress(console, columns, output_directory, file, outputfilename, append = False) 609 | else: 610 | gunzip_with_progress(console, columns, output_directory, file, outputfilename, append = True) 611 | os.remove(f'{output_directory}{file}') 612 | 613 | #################### 614 | # DOWNLOAD MIDORI2 # 615 | #################### 616 | if download_midori_: 617 | # print function to console 618 | console.print(f"[cyan]| Function[/] | Download MIDORI2 database") 619 | # check if all parameters have been provided and correctly formatted 620 | check_params(console, {'"--output"': output_, '"--gene"': gene_, '"--gb-number"': gb_number_, '"--gb-type': gb_type_}) 621 | check_midori_values(console, gene_, gb_type_, gb_number_) 622 | # set url, output_directory, and filename 623 | zip_type, url = midori_url(gb_number_, gb_type_, gene_) 624 | output_directory = f'{os.path.dirname(output_)}/' 625 | if output_directory == '/': 626 | output_directory = '' 627 | zipfilename = url.split('/')[-1] 628 | outputfilename = output_.split('/')[-1] 629 | # download the zip file 630 | download_file(console, columns, url, output_directory, zipfilename) 631 | # unzip the downloaded file 632 | if zip_type == 'unzip': 633 | unzip_with_progress(console, columns, output_directory, zipfilename, outputfilename) 634 | elif zip_type == 'gunzip': 635 | gunzip_with_progress(console, columns, output_directory, zipfilename, outputfilename, append = False) 636 | # remove intermediary files 637 | os.remove(f'{output_directory}{zipfilename}') 638 | 639 | ##################### 640 | # DOWNLOAD MITOFISH # 641 | ##################### 642 | if download_mitofish_: 643 | # print function to console 644 | console.print(f"[cyan]| Function[/] | Download MitoFish database") 645 | # check if all parameters have been provided 646 | check_params(console, {'"--output"': output_}) 647 | # set url, output_directory, and filename 648 | url = 
'http://mitofish.aori.u-tokyo.ac.jp/species/detail/download/?filename=download%2F/complete_partial_mitogenomes.zip' 649 | output_directory = f'{os.path.dirname(output_)}/' 650 | if output_directory == '/': 651 | output_directory = '' 652 | zipfilename = url.split('/')[-1] 653 | outputfilename = output_.split('/')[-1] 654 | # download the zip file 655 | download_file(console, columns, url, output_directory, zipfilename) 656 | # unzip the downloaded file 657 | unzip_with_progress(console, columns, output_directory, zipfilename, outputfilename) 658 | # remove intermediary files 659 | os.remove(f'{output_directory}{zipfilename}') 660 | 661 | ################# 662 | # DOWNLOAD NCBI # 663 | ################# 664 | if download_ncbi_: 665 | # print function to console 666 | console.print(f"[cyan]| Function[/] | Download NCBI database") 667 | # check if all parameters have been provided 668 | check_params(console, {'"--output"': output_, '"--database"': database_, '"--query"': query_, '"--email"': email_}) 669 | # retrieve species information 670 | species_list = retrieve_species(console, columns, species_) if species_ else [] 671 | # build query 672 | query_list = build_query(species_list, query_) 673 | # retrieve the query key and web environment to download NCBI seq data 674 | total_read_count, ncbi_info_dict = ncbi_download_info(console, columns, query_list, database_, email_) 675 | # download NCBI sequences 676 | total_downloaded_seqs = download_ncbi_seqs(console, columns, total_read_count, batchsize_, database_, email_, ncbi_info_dict, output_) 677 | # write log to Terminal window 678 | try: 679 | console.print(f"[cyan]| Results[/] | Number of sequences downloaded: {total_downloaded_seqs}/{total_read_count} ({round(total_downloaded_seqs / total_read_count * 100, 2)}%)") 680 | except ZeroDivisionError: 681 | console.print(f"[cyan]| Results[/] | Number of sequences downloaded: 0") 682 | 683 | ################## 684 | # DOWNLOAD SILVA # 685 | ################## 686 | if download_silva_: 687 | # print function to console 688 | console.print(f"[cyan]| Function[/] | Download SILVA database") 689 | # check if all parameters have been provided 690 | check_params(console, {'"--output"': output_, '"--gene"': gene_, '"--db-type"': db_type_, '"--db-version"': db_version_}) 691 | # set url, output_directory, and filename 692 | if db_type_.upper() == 'FULL': 693 | url = f'https://ftp.arb-silva.de/release_{db_version_}/Exports/SILVA_{db_version_}_{gene_.upper()}Ref_tax_silva.fasta.gz' 694 | elif db_type_.upper() == 'SUBSET': 695 | url = f'https://ftp.arb-silva.de/release_{db_version_}/Exports/SILVA_{db_version_}_{gene_.upper()}Ref_NR99_tax_silva.fasta.gz' 696 | else: 697 | console.print(f"[cyan]| ERROR[/] | [bold yellow]incorrect value provided for '--db-type', aborting analysis...[/]\n") 698 | exit() 699 | output_directory = f'{os.path.dirname(output_)}/' 700 | if output_directory == '/': 701 | output_directory = '' 702 | zipfilename = url.split('/')[-1] 703 | outputfilename = output_.split('/')[-1] 704 | # download the zip file 705 | download_file(console, columns, url, output_directory, zipfilename) 706 | # unzip the downloaded file 707 | gunzip_with_progress(console, columns, output_directory, zipfilename, outputfilename, append = False) 708 | # remove intermediary files 709 | os.remove(f'{output_directory}{zipfilename}') 710 | 711 | ########## 712 | # IMPORT # 713 | ########## 714 | if import_: 715 | # print function to console 716 | console.print(f"[cyan]| Function[/] | Import sequence data into CRABS 
format") 717 | # check if all parameters have been provided (need to make a distinction in neccesary parameters between different formats) 718 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--format"': import_format_, '"--names"': names_, '"--nodes"': nodes_, '"--acc2taxid"': acc2tax_}) 719 | # read documents into memory 720 | input_files = [input_, names_, nodes_, acc2tax_] 721 | input_file_size = sum(os.path.getsize(input_file) for input_file in input_files) 722 | with rich.progress.Progress(*columns) as progress_bar: 723 | task = progress_bar.add_task(console = console, description = "[cyan]| Read data to memory[/] |", total=input_file_size) 724 | input_to_memory = select_function(import_format_) 725 | seq_input_dict, initial_seq_number = input_to_memory(task, progress_bar, input_) 726 | names_key_tax_number_value_dict, tax_number_key_names_value_dict, synonym_key_dict = names_to_memory(task, progress_bar, names_) 727 | tax_number_key_rank_and_tax_number_up_values_dict = nodes_to_memory(task, progress_bar, nodes_) 728 | acc_key_tax_number_value_dict = accession_to_memory(task, progress_bar, acc2tax_, seq_input_dict) 729 | # generate taxonomic lineages 730 | seq_input_dict, unresolved_lineage = generate_lineages(console, columns, ranks_, seq_input_dict, acc_key_tax_number_value_dict, names_key_tax_number_value_dict, synonym_key_dict, tax_number_key_rank_and_tax_number_up_values_dict, tax_number_key_names_value_dict) 731 | # fill out missing info 732 | seq_input_dict = fill_missing_lineages(console, columns, ranks_, seq_input_dict) 733 | # write to output 734 | dict_to_output(seq_input_dict, ranks_, output_) 735 | # write log to Terminal window 736 | console.print(f"[cyan]| Results[/] | Imported {len(seq_input_dict)} out of {initial_seq_number} sequences into CRABS format ({round(len(seq_input_dict) / initial_seq_number * 100, 2)}%)") 737 | if unresolved_lineage > 0: 738 | console.print(f"[cyan]| [/] | Could not resolve a taxonomic lineage for {unresolved_lineage} imported sequences ({round(unresolved_lineage / len(seq_input_dict) * 100, 2)}%)") 739 | 740 | ######### 741 | # MERGE # 742 | ######### 743 | if merge_: 744 | # print function to console 745 | console.print(f"[cyan]| Function[/] | Merge CRABS databases into a single file") 746 | # check if all parameters have been provided 747 | check_params(console, {'"--input"': input_, '"--output"': output_}) 748 | # check for multiple files and their existence 749 | file_list = check_files(console, input_) 750 | # merge databases based on "--uniq" parameter 751 | if uniq_: 752 | merged_seq_file, initial_read_count = merge_uniq_databases(console, columns, file_list) 753 | else: 754 | merged_seq_file, initial_read_count = merge_databases(console, columns, file_list) 755 | # write merged data to output 756 | write_list_to_output(console, columns, merged_seq_file, output_) 757 | # write log to Terminal window 758 | console.print(f"[cyan]| Results[/] | Written {len(merged_seq_file)} sequences to {output_} by merging {len(file_list)} files containing {initial_read_count} sequences ({round(len(merged_seq_file) / initial_read_count * 100, 2)}%)") 759 | 760 | ################# 761 | # IN SILICO PCR # 762 | ################# 763 | if in_silico_pcr_: 764 | # print function to console 765 | console.print(f"[cyan]| Function[/] | Extract amplicons through in silico PCR") 766 | # check if all parameters have been provided 767 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--forward"': forward_, 
'"--reverse"': reverse_}) 768 | # check primers for unknown bases and reverse complement reverse primer 769 | forward_ = unknown_base_conversion(forward_) 770 | reverse_ = unknown_base_conversion(reverse_) 771 | reverse_ = rev_comp(reverse_) 772 | # set parameters for cutadapt 773 | overlap = str(min(len(forward_), len(reverse_))) 774 | adapter = forward_ + '...' + reverse_ 775 | # transform input_ to fasta format in a temp file 776 | temp_input_path, fasta_dict = crabs_to_fasta(console, columns, input_) 777 | # run cutadapt 778 | trimmed_seqs, untrimmed_seqs = cutadapt(console, columns, adapter, temp_input_path, fasta_dict, mismatch_, overlap, threads_) 779 | # write data to output 780 | write_list_to_output(console, columns, trimmed_seqs, output_) 781 | if untrimmed_: 782 | write_list_to_output(console, columns, untrimmed_seqs, untrimmed_) 783 | # remove temporary files 784 | os.remove(temp_input_path) 785 | # write log to Terminal window 786 | console.print(f"[cyan]| Results[/] | Extracted {len(trimmed_seqs)} amplicons from {len(fasta_dict)} sequences ({round(len(trimmed_seqs) / len(fasta_dict) * 100, 2)}%)") 787 | 788 | ############################# 789 | # PAIRWISE GLOBAL ALIGNMENT # 790 | ############################# 791 | if pairwise_global_alignment_: 792 | # print function to console 793 | console.print(f"[cyan]| Function[/] | Retrieve amplicons without primer-binding regions") 794 | # check if all parameters have been provided 795 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--amplicons"': amplicons_, '"--forward"': forward_, '"--reverse"': reverse_, '"--percent-identity"': percent_identity_, '"--coverage"': coverage_}) 796 | # read data into memory 797 | file_list = [amplicons_, input_] 798 | raw_fasta_dict, raw_fasta_list, amplicon_fasta_dict, amplicon_fasta_list = multiple_crabs_to_fasta(console, columns, file_list, size_select_) 799 | # write input to temp files in fasta format 800 | raw_temp_path, amplicon_temp_path = multiple_list_to_temp(console, columns, raw_fasta_list, amplicon_fasta_list) 801 | # run pairwise global alignment 802 | align_temp_path = usearch_global(console, columns, raw_temp_path, amplicon_temp_path, percent_identity_, threads_, raw_fasta_dict) 803 | # extract the sequence regions that conform to parameter settings 804 | amplicon_fasta_dict = extract_alignment_results(console, columns, align_temp_path, amplicon_fasta_dict, include_all_start_positions_, coverage_, forward_, reverse_, raw_fasta_dict) 805 | # write data to output 806 | write_dict_to_output(console, columns, amplicon_fasta_dict, output_) 807 | # remove intermediary files 808 | os.remove(raw_temp_path) 809 | os.remove(amplicon_temp_path) 810 | os.remove(align_temp_path) 811 | # write log to Terminal window 812 | console.print(f"[cyan]| Results[/] | Retrieved {len(amplicon_fasta_dict) - len(amplicon_fasta_list)} amplicons without primer-binding regions from {len(raw_fasta_dict)} sequences") 813 | 814 | ############### 815 | # DEREPLICATE # 816 | ############### 817 | if dereplicate_: 818 | # print function to console 819 | console.print(f"[cyan]| Function[/] | Dereplicate CRABS database") 820 | # check if all parameters have been provided 821 | check_params(console, {'"--input"': input_, '"--output"': output_}) 822 | # select dereplication function 823 | dereplication_function = select_function(dereplication_method_) 824 | # dereplicate data 825 | initial_read_count, seq_file = dereplication_function(console, columns, input_) 826 | # write data to output 827 | 
write_list_to_output(console, columns, seq_file, output_) 828 | # write log to Terminal window 829 | console.print(f"[cyan]| Results[/] | Written {len(seq_file)} unique sequences to {output_} out of {initial_read_count} initial sequences ({round(len(seq_file) / initial_read_count * 100, 2)}%)") 830 | 831 | ########## 832 | # FILTER # 833 | ########## 834 | if filter_: 835 | # print function to console 836 | console.print(f"[cyan]| Function[/] | Filter CRABS database") 837 | # check if all parameters have been provided 838 | check_params(console, {'"--input"': input_, '"--output"': output_}) 839 | # print which filtering parameters are included 840 | included_parameters = [key for key, value in {'"--minimum-length"': minimum_length_, '"--maximum-length"': maximum_length_, '"--maximum-n"': maximum_n_, '"--environmental"': environmental_, '"--no-species-id"': no_species_id_, '"--rank-na"': rank_na_}.items() if value not in [None, False]] 841 | console.print(f"[cyan]| Included parameters[/] | {', '.join(included_parameters)}") 842 | # read input file and parse data 843 | initial_read_count, seq_file, min_len_count, max_len_count, max_n_count, env_count, no_spec_count, rank_count = filter_function(console, columns, input_, minimum_length_, maximum_length_, maximum_n_, environmental_, no_species_id_, rank_na_) 844 | # write data to output 845 | write_list_to_output(console, columns, seq_file, output_) 846 | # write log to Terminal window 847 | console.print(f"[cyan]| Results[/] | Written {len(seq_file)} filtered sequences to {output_} out of {initial_read_count} initial sequences ({round(len(seq_file) / initial_read_count * 100, 2)}%)") 848 | for item in [min_len_count, max_len_count, max_n_count, env_count, no_spec_count, rank_count]: 849 | for key, value in item.items(): 850 | if value != 0: 851 | console.print(f"[cyan]| [/] | {key}: {value} sequences not passing filter ({round(value / initial_read_count * 100, 2)}%)") 852 | 853 | ########## 854 | # SUBSET # 855 | ########## 856 | if subset_: 857 | # print function to console 858 | console.print(f"[cyan]| Function[/] | Subset CRABS database") 859 | # check if all parameters have been provided 860 | check_params(console, {'"--input"': input_, '"--output"': output_}) 861 | # check inclusion or exclusion parameter 862 | subset_dict = select_subset(console, include_, exclude_) 863 | # read input file and parse data 864 | initial_read_count, seq_file = subset_function(console, columns, input_, subset_dict) 865 | # write data to output 866 | write_list_to_output(console, columns, seq_file, output_) 867 | # write log to Terminal window 868 | console.print(f"[cyan]| Results[/] | Written {len(seq_file)} subsetted sequences to {output_} out of {initial_read_count} initial sequences ({round(len(seq_file) / initial_read_count * 100, 2)}%)") 869 | 870 | #################### 871 | # DIVERSITY FIGURE # 872 | #################### 873 | if diversity_figure_: 874 | # print function to console 875 | console.print(f"[cyan]| Function[/] | Generate horizontal bar chart displaying diversity within database") 876 | # check if all parameters have been provided 877 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--tax-level"': tax_level_}) 878 | # read input file and parse data 879 | diversity_seq_dict, diversity_species_dict = parse_diversity(console, columns, input_, tax_level_) 880 | # generate horizontal bar chart 881 | horizontal_bar_chart(diversity_seq_dict, diversity_species_dict, output_) 882 | 883 | ########################## 884 | # 
AMPLICON LENGTH FIGURE # 885 | ########################## 886 | if amplicon_length_figure_: 887 | # print function to console 888 | console.print(f"[cyan]| Function[/] | Generate line graph displaying amplicon length distributions") 889 | # check if all parameters have been provided 890 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--tax-level"': tax_level_}) 891 | # read input file and parse data 892 | amplicon_length_dict = parse_length(console, columns, input_, tax_level_) 893 | # generate line graph 894 | line_graph(amplicon_length_dict, output_) 895 | 896 | ############################### 897 | # DATABASE COMPLETENESS TABLE # 898 | ############################### 899 | if completeness_table_: 900 | # print function to console 901 | console.print(f"[cyan]| Function[/] | Generate table containing barcode availability for taxonomic group") 902 | # check if all parameters have been provided 903 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--names"': names_, '"--nodes"': nodes_, '"--species"': species_}) 904 | # retrieve species of interest information 905 | species_dict = collections.defaultdict(dict) 906 | species_list = retrieve_species(console, columns, species_) 907 | for item in species_list: 908 | species_dict[item]['taxid'] = item 909 | # retrieve taxonomic lineages 910 | input_files = [names_, nodes_] 911 | input_file_size = sum(os.path.getsize(input_file) for input_file in input_files) 912 | with rich.progress.Progress(*columns) as progress_bar: 913 | task = progress_bar.add_task(console = console, description = "[cyan]| NCBI tax to memory[/] |", total=input_file_size) 914 | names_key_tax_number_value_dict, tax_number_key_names_value_dict, synonym_key_dict = names_to_memory(task, progress_bar, names_) 915 | tax_number_key_rank_and_tax_number_up_values_dict = nodes_to_memory(task, progress_bar, nodes_) 916 | seq_input_dict, unresolved_lineage = generate_lineages(console, columns, ranks_, species_dict, {}, names_key_tax_number_value_dict, synonym_key_dict, tax_number_key_rank_and_tax_number_up_values_dict, tax_number_key_names_value_dict) 917 | # retrieve information about potential number of taxa shared with species of interest on genus and family level 918 | table_info_dict = calculate_ncbi_species_genera(console, columns, seq_input_dict, tax_number_key_rank_and_tax_number_up_values_dict) 919 | # retrieve information about number of taxa shared with species of interest on genus and family level in reference database 920 | table_info_dict = calculate_database_species_genera(console, columns, input_, table_info_dict, seq_input_dict) 921 | # write data to output 922 | completeness_table_output(table_info_dict, output_) 923 | 924 | ##################### 925 | # PHYLOGENETIC TREE # 926 | ##################### 927 | if phylogenetic_tree_: 928 | # print function to console 929 | console.print(f"[cyan]| Function[/] | Generate a phylogenetic tree based on barcodes for target species") 930 | # check if all parameters have been provided 931 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--tax-level"': tax_level_, '"--species"': species_}) 932 | # retrieve species information 933 | species_list = retrieve_species(console, columns, species_) 934 | # parse input_ 935 | input_dict = parse_phylo_input(console, columns, input_, tax_level_) 936 | # subset input_dict to only include relevant sequences 937 | subset_dict = subset_phylo_input(console, columns, input_dict, species_list) 938 | # generate intermediary fasta files 
939 | with rich.progress.Progress(*columns) as progress_bar: 940 | task = progress_bar.add_task(console = console, description = "[cyan]| Generate trees[/] |", total=len(subset_dict)) 941 | for target_species in subset_dict: 942 | progress_bar.update(task, advance = 1) 943 | align_input = dict_to_fasta(subset_dict[target_species]) 944 | # align sequences 945 | align_output = align_sequences(align_input) 946 | # generate phylogenetic tree 947 | generate_phylo_tree(align_output, output_, target_species) 948 | # remove intermediary files 949 | os.remove(align_input) 950 | os.remove(align_output) 951 | os.remove(f'{align_input}.dnd') 952 | 953 | ################################### 954 | # AMPLIFICATION EFFICIENCY FIGURE # 955 | ################################### 956 | if amplification_efficiency_figure_: 957 | # print function to console 958 | console.print(f"[cyan]| Function[/] | Generate a bar plot displaying mismatches in the primer-binding regions") 959 | # check if all parameters have been provided 960 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--amplicons"': amplicons_, '"--forward"': forward_, '"--reverse"': reverse_}) 961 | # import data 962 | file_list = [amplicons_, input_] 963 | total_file_size = sum(os.path.getsize(file) for file in file_list) 964 | with rich.progress.Progress(*columns) as progress_bar: 965 | task = progress_bar.add_task(console = console, description = "[cyan]| Import data[/] |", total=total_file_size) 966 | amplicons_dict = amplicon_import(task, progress_bar, amplicons_, tax_group_) 967 | raw_dict = raw_import(task, progress_bar, input_, amplicons_dict) 968 | # extract the primer-binding regions 969 | primer_binding_region_dict = extract_primer_regions(console, columns, amplicons_dict, raw_dict, forward_, reverse_) 970 | # calculate base proportions at each location within the primer-binding regions 971 | forward_position_dict = deconstruct_primer_regions(primer_binding_region_dict, 'forward') 972 | reverse_position_dict = deconstruct_primer_regions(primer_binding_region_dict, 'reverse') 973 | # transform dict to np.array 974 | forward_positions, forward_ordered_counts, forward_bottoms = dict_to_array(forward_position_dict) 975 | reverse_positions, reverse_ordered_counts, reverse_bottoms = dict_to_array(reverse_position_dict) 976 | # parse primer data for plotting 977 | forward_primer_info = parse_primer(forward_) 978 | reverse_primer_info = parse_primer(reverse_) 979 | # generate figure 980 | efficiency_barplot(forward_positions, forward_ordered_counts, forward_bottoms, reverse_positions, reverse_ordered_counts, reverse_bottoms, forward_primer_info, reverse_primer_info, forward_, reverse_, output_) 981 | 982 | ########## 983 | # EXPORT # 984 | ########## 985 | if export_: 986 | # print function to console 987 | console.print(f"[cyan]| Function[/] | Export CRABS database to {export_format_.upper()} format") 988 | # check if all parameters have been provided 989 | check_params(console, {'"--input"': input_, '"--output"': output_, '"--export-format"': export_format_}) 990 | # select format function 991 | if export_format_.upper() == 'IDT-TEXT': 992 | initial_read_count, seq_file = idt_text(console, columns, input_) 993 | write_list_to_output(console, columns, seq_file, output_) 994 | elif export_format_.upper() == 'BLAST-NOTAX': 995 | blast_no_tax(console, columns, input_, output_) 996 | elif export_format_.upper() == 'BLAST-TAX': 997 | blast_tax(console, columns, input_, output_) 998 | else: 999 | output_to_format = 
select_function(export_format_)
            # read input file and parse data
            initial_read_count, seq_file = classifier_format(console, columns, input_, output_to_format)
            # write data to output
            write_list_to_output(console, columns, seq_file, output_)
        # write log to Terminal window
        console.print(f"[cyan]| Results[/] | Written {len(seq_file)} sequences to {output_} out of {initial_read_count} initial sequences ({round(len(seq_file) / initial_read_count * 100, 2)}%)")

################
# EXECUTE CODE #
################
if __name__ == "__main__":
    crabs()

--------------------------------------------------------------------------------
/docker_intro/README.md:
--------------------------------------------------------------------------------

# Getting started using Crabs in Docker

Running your applications with Docker offers many advantages over other approaches, but it can be difficult to get used to. One of the main challenges is that Docker provides an additional layer of abstraction that sits outside your own computer's file structure. While this added layer frees you from the frequent nightmares of software installation and dependencies, it can often trip you up. It is important to remember that to process any of the files on your computer with a Docker application, you need to make those files visible inside the container's file system (by mounting a folder), and then provide a way to get the outputs back into your own computer's file system. Hopefully, the examples below will help make this easier.

Note: the examples below will work on Mac or Linux. We will add examples for Windows systems soon.

## Help command

After installing any software, most of us will try it out with a help command. Here is an example using Crabs on Docker. (All of the following commands assume that you have already pulled the docker image using `docker pull quay.io/swordfish/crabs:1.7.7.0`.)

```
docker run --rm -it \
quay.io/swordfish/crabs:1.7.7.0 \
crabs -h
```

Okay, let's break down that command. The `docker run` command will create a container out of the image that we pulled from the quay.io website. This image can be examined using Docker Desktop. The options after this command are as follows:

`--rm` will automatically remove the container when the command is finished. There are instances when you will want to keep the container running, but we are keeping everything simple for now.

`-it` is two options combined: `-i` keeps the session interactive, and `-t` allocates a *tty* (essentially acting as a pseudo-terminal) for the container. Together, these two options let you use the command as an interactive process, like a shell. The opposite is running a container in the background, as you would for web apps served from Docker.

The next parameter names the image that will be turned into a container, in this case our Crabs image. You have to specify the entire name as it appears above.

After the image name, the next line is just the crabs command. Note: we are splitting our commands into separate lines using the backslash ('\\'). This helps keep the command easy to read, but you could also put everything on one line, as shown below. If you do split your command, just make sure there is no space after the backslash.
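For reference, this is the equivalent single-line version of the same help command (the two forms behave identically):

```
docker run --rm -it quay.io/swordfish/crabs:1.7.7.0 crabs -h
```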
## Actual command

If you ran the help command above and it worked, that is great: you know that the docker image is working. However, we want to run actual commands and process some data, and for that we need to add some more parameters.

Here is a `--download-ncbi` command to download ITS sequences of the fungal genus *Amanita*. This should yield a bit over 6,000 sequences, so it is a good example that should not take too long to run.

The best practice is to first go to the directory where you will run the analyses:

```
cd /Users/fulanotal/analysis/cool_fungi
```

(The folder above will most likely not exist on your computer; you will have to substitute your own folder paths.)

Then run the docker command:

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --download-ncbi \
--database nucleotide \
--query '"Amanita"[Organism] AND Internal Transcribed Spacer[All Fields] AND ("1"[SLEN] : "1000"[SLEN])' \
--output amanita.fasta \
--email fulano.tal@gmail.com \
--batchsize 5000
```

If this worked, you should see a file called 'amanita.fasta' and the original file 'CRABS_ncbi_download.fasta' in your directory.

In addition to the `docker run` and `--rm -it` parameters, we have added more to the docker part of the command.

The `-v` (also `--volume`) parameter will mount a folder on your computer to a folder inside the docker container so it can be accessed. This is organized as host:container, with the absolute path (or volume name) on your computer's side before the colon and the directory inside the docker container after the colon. In the example above we use $(pwd), which is shell notation for 'print working directory'. This is why we suggest you `cd` to your analysis directory first, which makes it easy to just use *pwd* for the host folder.

The next line sets the working directory inside the container. If you do not specify it, the working directory will default to the root of the container: just `/`. If you use the default, you will have to specify where your output files should go. For some Crabs commands, such as `--download-ncbi`, it is important to use this option. You will notice that the `--workdir` option is the same as the destination of the `-v` option. This is because Crabs creates intermediate files for this command, and if the `--workdir` and the `-v` destination (after the colon) are not the same, Crabs will not be able to find those intermediate files. For other commands this is not so critical, and we will show other options below. Because it is needed for some Crabs commands, we use `--workdir` as general practice.

The lines following the image name are just the standard Crabs commands, and these are detailed on the main page.
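Before kicking off a long download, it can be worth sanity-checking the volume mount. The sketch below simply lists the mounted folder from inside the container; this assumes the image ships with standard shell utilities such as `ls`, which is the case for most Linux-based images. You should see the same files as in your host directory:

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
ls /data
```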
## Taxonomy files

If you are like us, you like to keep your folders tidy and your general reference files elsewhere. This is a good idea for the massive NCBI taxonomy files that you need to assign taxonomy to your database sequences. In the following steps, we will go to a different folder, download the taxonomy files there, and then use them back in our analysis folder. This illustrates some good tips for using Docker across multiple directories.

First, get the taxonomy files. We will move to a different directory to keep things simple:

```
cd /Users/fulanotal/taxonomy_files
```

Then, from this directory, we download the taxonomy files:

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --download-taxonomy \
--output ./
```

This should result in three files being downloaded to this folder: *names.dmp*, *nodes.dmp*, and *nucl_gb.accession2taxid*. (Note: the `./` in the `--output` parameter directs the output to the current folder.)

Now, the tricky bit. We want to return to our analysis folder but use these reference files sitting in another part of our computer. To do this, we add a second `-v` parameter, mounting the reference folder to a different directory inside the container than the first mount. Here is how we work this out:

First, return to the working directory:

```
cd /Users/fulanotal/analysis/cool_fungi
```

Now, to keep things clear, we will create a variable with the path to the reference folder:

```
TAX='/Users/fulanotal/taxonomy_files'
```

We can now find the taxonomy of all our sequences, importing the fasta file downloaded earlier to create a Crabs database for use downstream:

```
docker run --rm -it \
-v $(pwd):/data \
-v ${TAX}:/src \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --import \
--import-format NCBI \
--input amanita.fasta \
--output amanita_crabs.txt \
--names /src/names.dmp \
--nodes /src/nodes.dmp \
--acc2tax /src/nucl_gb.accession2taxid \
--ranks 'kingdom;phylum;class;order;family;genus;species'
```

You will notice that the additional `-v` parameter mounts the taxonomy folder onto the `/src` folder inside the container ('mounting' is Docker lingo for making a host folder visible inside the container; nothing is actually copied). For Crabs to find these files, we had to put `/src/` in front of the taxonomy file names within this command. You DO NOT put the path to the files on your computer (e.g., ${TAX}/nodes.dmp), because the process is running inside the Docker container.

## Processing more data

Continuing from our *Amanita* download and import, we can use more or less the same command structure as above.
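Before moving on, a quick sanity check can save headaches: run a plain `ls` inside the container to confirm your mounts look the way you expect. This is a generic Docker trick rather than a Crabs feature, and it assumes the image provides the standard `ls` utility (Linux-based images do):

```
docker run --rm -it \
-v $(pwd):/data \
-v ${TAX}:/src \
quay.io/swordfish/crabs:1.7.7.0 \
ls /data /src
```

You should see your analysis files listed under `/data` and the three taxonomy files under `/src`. As an aside, Docker lets you append `:ro` to a mount (e.g., `-v ${TAX}:/src:ro`) to make it read-only, a nice safeguard for reference files that no command should ever modify.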
**insilico PCR**

Here is an example command to extract just the ITS1 region from our downloaded sequences:

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --in-silico-pcr \
--input amanita_crabs.txt \
--output amanita_crabs_its1.txt \
--forward CTTGGTCATTTAGAGGAAGTAA \
--reverse GCTGCGTTCTTCATCGATGC
```

**Adding a pairwise global alignment step:**

```
docker run --rm -it \
-v $(pwd):/data \
--workdir="/data" \
quay.io/swordfish/crabs:1.7.7.0 \
crabs --pairwise-global-alignment \
--input amanita_crabs.txt \
--amplicons amanita_crabs_its1.txt \
--output amanita_its1_pga.txt \
--forward CTTGGTCATTTAGAGGAAGTAA \
--reverse GCTGCGTTCTTCATCGATGC \
--size-select 600 \
--threads 2 \
--percent-identity 0.9 \
--coverage 90 \
--all-start-positions
```

Note: if you are using a newer Mac with an M1 chip, you might see a warning when running these commands. The commands should still work, but you can eliminate the warning by adding the parameter `--platform linux/amd64` to the command above, before the image name.

From these examples, you should be able to run most of the Crabs commands needed to create your reference database. We will continue to add examples, explanations, and tips to this page over the coming weeks. Stay tuned, and stay in touch.

--------------------------------------------------------------------------------
/figures_readme/amplicon-length-figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/amplicon-length-figure.png
--------------------------------------------------------------------------------
/figures_readme/amplification-efficiency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/amplification-efficiency.png
--------------------------------------------------------------------------------
/figures_readme/crabs_blasttax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_blasttax.png
--------------------------------------------------------------------------------
/figures_readme/crabs_completeness.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_completeness.png
--------------------------------------------------------------------------------
/figures_readme/crabs_cutadapt_error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_cutadapt_error.png
--------------------------------------------------------------------------------
/figures_readme/crabs_dereplicate.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_dereplicate.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_bold.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_bold.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_mitofish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_mitofish.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_ncbi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_ncbi.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_ncbi_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_ncbi_output.png -------------------------------------------------------------------------------- /figures_readme/crabs_download_taxonomy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_download_taxonomy.png -------------------------------------------------------------------------------- /figures_readme/crabs_export.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_export.png -------------------------------------------------------------------------------- /figures_readme/crabs_filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_filter.png -------------------------------------------------------------------------------- /figures_readme/crabs_greengenes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_greengenes.png -------------------------------------------------------------------------------- /figures_readme/crabs_help.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_help.png -------------------------------------------------------------------------------- /figures_readme/crabs_import.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_import.png -------------------------------------------------------------------------------- /figures_readme/crabs_insilico.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_insilico.png -------------------------------------------------------------------------------- /figures_readme/crabs_merge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_merge.png -------------------------------------------------------------------------------- /figures_readme/crabs_midori.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_midori.png -------------------------------------------------------------------------------- /figures_readme/crabs_pga.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_pga.png -------------------------------------------------------------------------------- /figures_readme/crabs_silva.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_silva.png -------------------------------------------------------------------------------- /figures_readme/crabs_subset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/crabs_subset.png -------------------------------------------------------------------------------- /figures_readme/diversity-figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/diversity-figure.png -------------------------------------------------------------------------------- /figures_readme/phylo_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/phylo_tree.png -------------------------------------------------------------------------------- /figures_readme/unite_first.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/unite_first.png -------------------------------------------------------------------------------- /figures_readme/unite_second.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/unite_second.png -------------------------------------------------------------------------------- /figures_readme/unite_third.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/figures_readme/unite_third.png -------------------------------------------------------------------------------- /function/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __version__ = "1.7.7" 3 | -------------------------------------------------------------------------------- /function/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/function/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /function/__pycache__/module_1.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/function/__pycache__/module_1.cpython-36.pyc -------------------------------------------------------------------------------- /function/__pycache__/module_3.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/function/__pycache__/module_3.cpython-36.pyc -------------------------------------------------------------------------------- /function/__pycache__/module_5.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gjeunen/reference_database_creator/51db08ca9b57ace3e40a88b7816cd1c9097969f8/function/__pycache__/module_5.cpython-36.pyc -------------------------------------------------------------------------------- /function/older_versions/crabs_v1.0.0: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | ################################################ 4 | ################ IMPORT MODULES ################ 5 | ################################################ 6 | import argparse 7 | import subprocess as sp 8 | import pandas as pd 9 | import os 10 | import shutil 11 | import collections 12 | import matplotlib 13 | import matplotlib.pyplot as plt 14 | from Bio.Align.Applications import MuscleCommandline 15 | from pathlib import Path 16 | from collections import Counter 17 | from Bio import SeqIO 18 | from Bio import AlignIO 19 | from Bio import Phylo 20 | from Bio.Seq import Seq 21 | from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceMatrix, DistanceTreeConstructor 22 | from function.module_1 import wget_ncbi, esearch_fasta, efetch_seqs_from_webenv, ncbi_formatting, mitofish_download, mitofish_format, embl_download, embl_fasta_format, embl_crabs_format, bold_download, bold_format, check_accession, append_primer_seqs, generate_header, merge_databases 23 | from function.module_3 import tax2dict, get_accession, acc_to_dict, get_lineage, final_lineage_comb 24 | from function.module_5 import split_db_by_taxgroup, num_spec_seq_taxgroup, horizontal_barchart, get_amp_length, amplength_figure, file_dmp_to_dict, species_to_taxid, lineage_retrieval 25 | 26 | ################################################ 27 | ########### MODULE DATABASE DOWNLOAD ########### 28 | ################################################ 29 | 30 | ## function download data from online databases 31 | def db_download(args): 32 | SOURCE = args.source 33 | DATABASE = args.database 34 | QUERY = args.query 35 | OUTPUT = args.output 36 | ORIG = args.orig 37 | EMAIL = args.email 38 | BATCHSIZE = args.batchsize 39 | 40 | ## download taxonomy data from NCBI 41 | if SOURCE == 'taxonomy': 42 | print('\ndownloading taxonomy information') 43 | url_acc2taxid = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz' 44 | url_taxdump = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' 45 | results = sp.run(['wget', url_acc2taxid]) 46 | results = sp.run(['gunzip', 'nucl_gb.accession2taxid.gz']) 47 | results = sp.run(['wget', url_taxdump]) 48 | results = sp.run(['tar', '-zxvf', 'taxdump.tar.gz']) 49 | print('removing intermediary files\n') 50 | files_to_remove = ['citations.dmp', 'delnodes.dmp', 'division.dmp', 'gencode.dmp', 'merged.dmp', 'gc.prt', 'readme.txt', 'taxdump.tar.gz'] 51 | for file in files_to_remove: 52 | os.remove(file) 53 | 54 | ## download sequencing data from NCBI 55 | elif SOURCE == 'ncbi': 56 | if all(v is not None for v in [DATABASE, QUERY, OUTPUT, EMAIL]): 57 | print('\ndownloading sequences from NCBI') 58 | ncbi_download = wget_ncbi(QUERY, DATABASE, EMAIL, BATCHSIZE) 59 | print('formatting the downloaded sequencing file to CRABS format') 60 | format_seqs = ncbi_formatting(OUTPUT, ORIG) 61 | print(f'written {format_seqs} sequences to {OUTPUT}\n') 62 | else: 63 | print('\nnot all parameters have an input value\n') 64 | 65 | ## download sequencing data from EMBL 66 | elif SOURCE == 'embl': 67 | if all(v is not None for v in [DATABASE, OUTPUT]): 68 | print('\ndownloading sequences from EMBL') 69 | dl_files = embl_download(DATABASE) 70 | fasta_file = embl_fasta_format(dl_files) 71 | print(f'formatting intermediary file to CRABS format') 72 | crabs_file = embl_crabs_format(fasta_file, OUTPUT, ORIG) 73 | print(f'written {crabs_file} sequences to {OUTPUT}\n') 74 | else: 75 | print('\nnot all parameters have an input value\n') 76 | 77 | ## download 
sequencing data from MitoFish 78 | elif SOURCE == 'mitofish': 79 | if all(v is not None for v in [OUTPUT]): 80 | print('\ndownloading sequences from the MitoFish database') 81 | url = 'http://mitofish.aori.u-tokyo.ac.jp/files/complete_partial_mitogenomes.zip' 82 | dl_file = mitofish_download(url) 83 | print(f'formatting {dl_file} to CRABS format') 84 | mitoformat = mitofish_format(dl_file, OUTPUT, ORIG) 85 | print(f'written {mitoformat} sequences to {OUTPUT}\n') 86 | else: 87 | print('\nnot all parameters have an input value\n') 88 | 89 | ## download sequencing data from BOLD 90 | elif SOURCE == 'bold': 91 | if all(v is not None for v in [DATABASE, OUTPUT]): 92 | print('\ndownloading sequences from BOLD') 93 | bold_file = bold_download(DATABASE) 94 | print(f'downloaded {bold_file} sequences from BOLD') 95 | print(f'formatting {bold_file} sequences to CRABS format') 96 | boldformat = bold_format(OUTPUT, ORIG) 97 | print(f'written {boldformat} sequences to {OUTPUT}\n') 98 | else: 99 | print('\nnot all parameters have an input value\n') 100 | 101 | ## function: import existing or custom database 102 | def db_import(args): 103 | INPUT = args.input 104 | HEADER = args.header 105 | OUTPUT = args.output 106 | FWD = args.fwd 107 | REV = args.rev 108 | DELIM = args.delim 109 | 110 | ## process file with accession number in header 111 | if HEADER == 'accession': 112 | if all(v is not None for v in [INPUT, OUTPUT, DELIM]): 113 | print(f'\nchecking correct formatting of accession numbers in {INPUT}') 114 | incorrect_accession = check_accession(INPUT, OUTPUT, DELIM) 115 | if len(incorrect_accession) != 0: 116 | print('found incorrectly formatted accession numbers, please check file: "incorrect_accession_numbers.txt"') 117 | with open('incorrect_accession_numbers.txt', 'w') as fout: 118 | for item in incorrect_accession: 119 | fout.write(item + '\n') 120 | if all(v is not None for v in [FWD, REV]): 121 | print(f'appending primer sequences to {OUTPUT}') 122 | numseq = append_primer_seqs(OUTPUT, FWD, REV) 123 | print(f'added primers to {numseq} sequences in {OUTPUT}\n') 124 | else: 125 | print('') 126 | else: 127 | print('\nnot all parameters have an input value\n') 128 | 129 | ## process file with species info in header 130 | elif HEADER == 'species': 131 | if all(v is not None for v in [INPUT, OUTPUT, DELIM]): 132 | print(f'\ngenerating new sequence headers for {INPUT}') 133 | num_header = generate_header(INPUT, OUTPUT, DELIM) 134 | print(f'generated {num_header} headers for {OUTPUT}') 135 | if all(v is not None for v in [FWD, REV]): 136 | print(f'appending primer sequences to {OUTPUT}') 137 | numseq = append_primer_seqs(OUTPUT, FWD, REV) 138 | print(f'added primers to {numseq} sequences in {OUTPUT}\n') 139 | else: 140 | print('') 141 | else: 142 | print('\nnot all parameters have an input value\n') 143 | else: 144 | print('\nplease specify header information: "accession" and "species"\n') 145 | 146 | ## function: merge multiple databases 147 | def db_merge(args): 148 | INPUT = args.input 149 | UNIQ = args.uniq 150 | OUTPUT = args.output 151 | 152 | if UNIQ != '': 153 | print('\nmerging all fasta files and discarding duplicate sequence headers') 154 | num_uniq = merge_databases(INPUT, OUTPUT) 155 | print(f'written {num_uniq} sequences to {OUTPUT}\n') 156 | else: 157 | print('\nmerging all fasta files and keeping duplicate sequence headers') 158 | with open(OUTPUT, 'w') as fout: 159 | for file in INPUT: 160 | num = len(list(SeqIO.parse(file, 'fasta'))) 161 | print(f'found {num} sequences in {file}') 162 
| with open(file, 'r') as fin: 163 | for line in fin: 164 | fout.write(line) 165 | num = len(list(SeqIO.parse(OUTPUT, 'fasta'))) 166 | print(f'written {num} sequences to {OUTPUT}\n') 167 | 168 | 169 | ################################################ 170 | ############# MODULE IN SILICO PCR ############# 171 | ################################################ 172 | 173 | ## function: in silico PCR 174 | def insilico_pcr(args): 175 | FWD = args.fwd 176 | REV = args.rev 177 | INPUT = args.input 178 | ERROR = args.error 179 | OUTPUT = args.output 180 | 181 | ## reverse complement reverse primer sequence 182 | REV_CORRECT = str(Seq(REV).reverse_complement()) 183 | 184 | ## setting variable names using the info from user input 185 | TRIMMED_INIT = 'init_trimmed.fasta' 186 | UNTRIMMED_INIT = 'init_untrimmed.fasta' 187 | REVCOMP_UNTRIMMED_INIT = 'revcomp_untrimmed.fasta' 188 | TRIMMED_REVCOMP = 'revcomp_trimmed.fasta' 189 | UNTRIMMED_REVCOMP = 'untrimmed_revcomp.fasta' 190 | 191 | OVERLAP = str(min([len(FWD), len(REV_CORRECT)])) 192 | ADAPTER = FWD + '...' + REV_CORRECT 193 | 194 | ## run cutadapt on downloaded fasta file 195 | count_init = len(list(SeqIO.parse(INPUT, 'fasta'))) 196 | print('\nrunning in silico PCR on fasta file containing {} sequences'.format(count_init)) 197 | cmnd_cutadapt_1 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_INIT, INPUT, '--untrimmed-output', UNTRIMMED_INIT, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 198 | sp.call(cmnd_cutadapt_1) 199 | count_trimmed_init = len(list(SeqIO.parse(TRIMMED_INIT, 'fasta'))) 200 | print('found primers in {} sequences'.format(count_trimmed_init)) 201 | 202 | ## run vsearch to reverse complement untrimmed sequences 203 | if count_trimmed_init < count_init: 204 | count_untrimmed_init = len(list(SeqIO.parse(UNTRIMMED_INIT, 'fasta'))) 205 | print('reverse complementing {} untrimmed sequences'.format(count_untrimmed_init)) 206 | cmnd_vsearch_revcomp = ['vsearch', '--fastx_revcomp', UNTRIMMED_INIT, '--fastaout', REVCOMP_UNTRIMMED_INIT, '--quiet'] 207 | sp.call(cmnd_vsearch_revcomp) 208 | 209 | ## run cutadapt on reverse complemented untrimmed sequences 210 | print('running in silico PCR on {} reverse complemented untrimmed sequences'.format(count_untrimmed_init)) 211 | cmnd_cutadapt_2 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_REVCOMP, REVCOMP_UNTRIMMED_INIT, '--untrimmed-output', UNTRIMMED_REVCOMP, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 212 | sp.call(cmnd_cutadapt_2) 213 | count_trimmed_second = len(list(SeqIO.parse(TRIMMED_REVCOMP, 'fasta'))) 214 | print('found primers in {} sequences\n'.format(count_trimmed_second)) 215 | 216 | ## concatenate both trimmed files 217 | with open(OUTPUT, 'wb') as wfd: 218 | for f in [TRIMMED_INIT, TRIMMED_REVCOMP]: 219 | with open(f, 'rb') as fd: 220 | shutil.copyfileobj(fd, wfd) 221 | 222 | ## remove intermediary files 223 | files = [TRIMMED_INIT, UNTRIMMED_INIT, REVCOMP_UNTRIMMED_INIT, TRIMMED_REVCOMP, UNTRIMMED_REVCOMP] 224 | for file in files: 225 | os.remove(file) 226 | 227 | ## don't run reverse complement when initial in silico PCR trims all sequences 228 | else: 229 | print('all sequences trimmed, no reverse complement step\n') 230 | results = sp.run(['mv', TRIMMED_INIT, OUTPUT]) 231 | os.remove(UNTRIMMED_INIT) 232 | 233 | ################################################ 234 | ########## MODULE TAXONOMY ASSIGNMENT ########## 235 | ################################################ 236 | ## function: get taxonomic lineage for each accession number 237 | def 
assign_tax(args): 238 | INPUT = args.input 239 | OUTPUT = args.output 240 | ACC2TAX = args.acc2tax 241 | TAXID = args.taxid 242 | NAME = args.name 243 | 244 | ## process initial files 245 | print(f'\nretrieving accession numbers from {INPUT}') 246 | accession = get_accession(INPUT) 247 | print(f'found {len(accession)} accession numbers in {INPUT}') 248 | acc2tax, taxid, name, no_acc = tax2dict(ACC2TAX, TAXID, NAME, accession) 249 | print(f'processed {len(acc2tax)} entries in {ACC2TAX}') 250 | print(f'processed {len(taxid)} entries in {TAXID}') 251 | print(f'processed {len(name)} entries in {NAME}') 252 | 253 | 254 | ## get taxonomic lineage 255 | print(f'assigning a tax ID number to {len(accession)} accession numbers from {INPUT}') 256 | acc_taxid_dict, taxid_list = acc_to_dict(accession, acc2tax, no_acc) 257 | print(f'{len(acc_taxid_dict)} accession numbers resulted in {len(taxid_list)} unique tax ID numbers') 258 | print(f'generating taxonomic lineages for {len(taxid_list)} tax ID numbers') 259 | lineage = get_lineage(taxid_list, taxid, name) 260 | print(f'assigning a taxonomic lineage to {len(accession)} accession numbers') 261 | final_lineage = final_lineage_comb(acc_taxid_dict, lineage, INPUT, OUTPUT) 262 | print(f'written {len(final_lineage)} entries to {OUTPUT}\n') 263 | 264 | ################################################ 265 | ########### MODULE DATABASE CLEAN-UP ########### 266 | ################################################ 267 | 268 | ## function: dereplicating the database 269 | def dereplicate(args): 270 | INPUT = args.input 271 | OUTPUT = args.output 272 | METHOD = args.method 273 | 274 | ## dereplicate strict (only unique sequences) 275 | if METHOD == 'strict': 276 | print(f'\nstrict dereplication of {INPUT}, only keeping unique sequences') 277 | uniq_seqs = {} 278 | uniq_line = [] 279 | count = 0 280 | added = 0 281 | with open(INPUT, 'r') as file_in: 282 | for line in file_in: 283 | count = count + 1 284 | lines = line.rstrip('\n') 285 | seq = lines.split('\t')[9] 286 | if seq not in uniq_seqs: 287 | added = added + 1 288 | uniq_seqs[seq] = seq 289 | uniq_line.append(line) 290 | print(f'found {count} sequences in {INPUT}') 291 | print(f'written {added} sequences to {OUTPUT}\n') 292 | with open(OUTPUT, 'w') as file_out: 293 | for line in uniq_line: 294 | file_out.write(line) 295 | 296 | ## dereplicate single species (one sequence per species) 297 | elif METHOD == 'single_species': 298 | print(f'\ndereplicating {INPUT}, only keeping a single sequence per species') 299 | uniq_spec = {} 300 | uniq_line = [] 301 | count = 0 302 | added = 0 303 | with open(INPUT, 'r') as file_in: 304 | for line in file_in: 305 | count = count + 1 306 | lines = line.rstrip('\n') 307 | species = lines.split('\t')[8].split(',')[2] 308 | if species not in uniq_spec: 309 | added = added + 1 310 | uniq_spec[species] = species 311 | uniq_line.append(line) 312 | print(f'found {count} sequences in {INPUT}') 313 | print(f'written {added} sequences to {OUTPUT}\n') 314 | with open(OUTPUT, 'w') as file_out: 315 | for line in uniq_line: 316 | file_out.write(line) 317 | 318 | ## dereplicate unique species (all unique sequences per species) 319 | elif METHOD == 'uniq_species': 320 | print(f'\ndereplicating {INPUT}, keeping all unique sequences per species') 321 | mydict = collections.defaultdict(list) 322 | count = 0 323 | added = 0 324 | with open(INPUT, 'r') as file_in: 325 | for line in file_in: 326 | count = count + 1 327 | lines = line.rstrip('\n') 328 | spec = lines.split('\t')[8].split(',')[2] 329 
| seq = lines.split('\t')[9] 330 | line_id = lines 331 | seq_dicts = [] 332 | for item in mydict[spec]: 333 | seq_dict = item.rsplit('\t', 1)[1] 334 | seq_dicts.append(seq_dict) 335 | if seq not in seq_dicts: 336 | added = added + 1 337 | mydict[spec].append(line_id) 338 | print(f'found {count} sequences in {INPUT}') 339 | print(f'written {added} sequences to {OUTPUT}\n') 340 | with open(OUTPUT, 'w') as file_out: 341 | for k, v in mydict.items(): 342 | for i in v: 343 | file_out.write(i + '\n') 344 | 345 | ## dereplicate concensus species (generate concensus sequence for each species) 346 | elif METHOD == 'consensus': 347 | print('still to add...') 348 | 349 | ## unknown method specified 350 | else: 351 | print('\nplease specify one of the accepted dereplication methods: "strict", "single_species", "uniq_species"\n') 352 | 353 | ## function: sequence cleanup 354 | def db_filter(args): 355 | MINLEN = args.minlen 356 | MAXLEN = args.maxlen 357 | MAXNS = args.maxns 358 | INPUT = args.input 359 | OUTPUT = args.output 360 | DISCARD = args.discard 361 | ENV = args.env 362 | SPEC = args.spec 363 | NANS = args.nans 364 | 365 | ## set filtering parameters 366 | print(f'\nfiltering parameters:\nremoving sequences shorter than {MINLEN} and longer than {MAXLEN}\nremoving sequences containing more than {MAXNS} "N"') 367 | if ENV == 'no': 368 | env = 100 369 | print('keeping environmental sequences') 370 | else: 371 | env = 0 372 | print('removing environmental sequences') 373 | if SPEC == 'no': 374 | print('keeping sequences unclassified at species level') 375 | spec = 100 376 | else: 377 | spec = 0 378 | print('removing sequences without a species ID') 379 | if NANS == 'no': 380 | nans = 100 381 | print('keeping sequences with missing taxonomic information') 382 | else: 383 | nans = int(NANS) 384 | print(f'removing sequences with missing information for more than {NANS} taxonomic levels') 385 | 386 | ## read the input file and clean up given the parameters 387 | clean_db = [] 388 | discard_db = [] 389 | count = 0 390 | count_clean = 0 391 | with open(INPUT, 'r') as file_in: 392 | for line in file_in: 393 | count = count + 1 394 | lines = line.rstrip('\n') 395 | upline = lines.upper() 396 | seq = upline.rsplit('\t', 1)[1] 397 | species = upline.split('\t')[8] 398 | if len(seq) >= MINLEN and len(seq) <= MAXLEN and seq.count('N') <= MAXNS and species.count('ENVIRONMENTAL') <= env and species.count('_SP.') <= spec and upline.count(',NAN') <= nans: 399 | count_clean = count_clean + 1 400 | clean_db.append(line) 401 | else: 402 | discard_db.append(line) 403 | 404 | ## write cleaned database to file 405 | cleaned = count - count_clean 406 | print(f'found {count} number of sequences in {INPUT}') 407 | print(f'removed {cleaned} sequences during filtering') 408 | print(f'written {count_clean} sequences to {OUTPUT}\n') 409 | with open(OUTPUT, 'w') as file_out: 410 | for item in clean_db: 411 | file_out.write(item) 412 | if DISCARD != 'no': 413 | with open(DISCARD, 'w') as dis_out: 414 | for item in discard_db: 415 | dis_out.write(item) 416 | 417 | ################################################ 418 | ############# MODULE VISUALISATION ############# 419 | ################################################ 420 | 421 | ## figure output 422 | def visualization(args): 423 | INPUT = args.input 424 | OUTPUT = args.output 425 | METHOD = args.method 426 | LEVEL = args.level 427 | SPECIES = args.species 428 | TAXID = args.taxid 429 | NAME = args.name 430 | 431 | ## horizontal barchart 432 | if METHOD == 'diversity': 433 | 
tax_group_list, uniq_tax_group_list, species_dict = split_db_by_taxgroup(INPUT, LEVEL) 434 | sequence_counter = Counter(tax_group_list) 435 | list_info_dict = num_spec_seq_taxgroup(uniq_tax_group_list, species_dict, sequence_counter) 436 | sorted_info = sorted(list_info_dict, key = lambda i: (i['sequence'])) 437 | figure = horizontal_barchart(sorted_info) 438 | 439 | ## length distribution 440 | elif METHOD == 'amplicon_length': 441 | amp_length_dict = get_amp_length(INPUT, LEVEL) 442 | figure = amplength_figure(amp_length_dict) 443 | 444 | ## completeness table 445 | elif METHOD == 'db_completeness': 446 | 447 | ## read in the text file with species names 448 | species_list = [] 449 | with open(SPECIES, 'r') as species_file: 450 | for line in species_file: 451 | species = line.rstrip('\n').replace(' ', '_') 452 | species_list.append(species) 453 | print(f'\nfound {len(species_list)} species of interest in {SPECIES}: {species_list}') 454 | 455 | ## retrieve taxonomic lineage 456 | print(f'generating taxonomic lineage for {len(species_list)} species') 457 | name, node, taxid = file_dmp_to_dict(NAME, TAXID) 458 | species_taxid_dict, taxid_list = species_to_taxid(species_list, taxid) 459 | lineage = lineage_retrieval(taxid_list, node, name) 460 | final_dict = collections.defaultdict(list) 461 | for k, v in species_taxid_dict.items(): 462 | final_dict[k] = lineage[v] 463 | print(f'gathering data for {len(final_dict)} species\n') 464 | 465 | ## retrieve information about potential number of taxa shared with species of interest on genus and family level based on NCBI taxonomy files 466 | table_info_dict = collections.defaultdict(dict) 467 | for k, v in species_taxid_dict.items(): 468 | species = k 469 | genus_count = 0 470 | family_count = 0 471 | ## find genus taxids 472 | if v in node: 473 | genus = node[v][1] 474 | ## count number of species in genus 475 | for k, v in node.items(): 476 | if v[1] == genus and v[0] == 'species': 477 | genus_count = genus_count + 1 478 | ## find family taxids 479 | if genus in node: 480 | family = node[genus][1] 481 | ## count number of species in family 482 | for k, v in node.items(): 483 | if v[1] == family and v[0] == 'genus': 484 | genus = k 485 | for key, value in node.items(): 486 | if value[1] == genus and value[0] == 'species': 487 | family_count = family_count + 1 488 | table_info_dict[species] = {'species' : species, 'genus_num_ncbi' : genus_count, 'family_num_ncbi' : family_count} 489 | 490 | ## retrieve information about number of taxa shared with species of interest on genus and family level in reference database 491 | for k, v in final_dict.items(): 492 | species = k 493 | genus = v[5] 494 | family = v[4] 495 | with open(INPUT, 'r') as file_in: 496 | spec_db_count = [] 497 | genus_db_count = [] 498 | family_db_count = [] 499 | for line in file_in: 500 | spec_db = line.split('\t')[8].split(',')[2] 501 | genus_db = line.split('\t')[7].split(',')[2] 502 | family_db = line.split('\t')[6].split(',')[2] 503 | if spec_db == species: 504 | if spec_db not in spec_db_count: 505 | spec_db_count.append(spec_db) 506 | if genus_db == genus: 507 | if spec_db not in genus_db_count: 508 | genus_db_count.append(spec_db) 509 | if family_db == family: 510 | if spec_db not in family_db_count: 511 | family_db_count.append(spec_db) 512 | for k, v in table_info_dict.items(): 513 | if k == species: 514 | v['species_in_ref_DB'] = len(spec_db_count) 515 | v['genus_num_ref_DB'] = len(genus_db_count) 516 | v['family_num_ref_DB'] = len(family_db_count) 517 | 
v['genus_list_ref_DB'] = genus_db_count 518 | v['family_list_ref_DB'] = family_db_count 519 | df = pd.DataFrame.from_dict(table_info_dict, orient = 'index') 520 | df['Completeness_genus'] = df['genus_num_ref_DB'] / df['genus_num_ncbi'] * 100 521 | df['Completeness_family'] = df['family_num_ref_DB'] / df['family_num_ncbi'] * 100 522 | df = df[['species', 'species_in_ref_DB', 'genus_num_ref_DB', 'genus_num_ncbi', 'Completeness_genus', 'family_num_ref_DB', 'family_num_ncbi', 'Completeness_family', 'genus_list_ref_DB', 'family_list_ref_DB']] 523 | df.to_csv(OUTPUT, sep = '\t', index = None) 524 | 525 | ## phylogenetic tree 526 | elif METHOD == 'phylo': 527 | ## read in the text file with species names 528 | species_list = [] 529 | with open(SPECIES, 'r') as species_file: 530 | for line in species_file: 531 | species = line.rstrip('\n').replace(' ', '_') 532 | species_list.append(species) 533 | print(f'\nfound {len(species_list)} species of interest in {SPECIES}: {species_list}') 534 | 535 | ## retrieve taxonomic lineage 536 | print(f'generating taxonomic lineage for {len(species_list)} species') 537 | name, node, taxid = file_dmp_to_dict(NAME, TAXID) 538 | species_taxid_dict, taxid_list = species_to_taxid(species_list, taxid) 539 | lineage = lineage_retrieval(taxid_list, node, name) 540 | final_dict = collections.defaultdict(list) 541 | for k, v in species_taxid_dict.items(): 542 | final_dict[k] = lineage[v] 543 | print(f'gathering data for {len(final_dict)} species') 544 | 545 | ## gather sequences from database that share taxonomic rank 546 | ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] 547 | count = 0 548 | for item in ranks: 549 | if item == LEVEL: 550 | break 551 | else: 552 | count = count + 1 553 | for k, v in final_dict.items(): 554 | species = k 555 | taxrank = v[count] 556 | species_file = [] 557 | try: 558 | os.remove(f'{species}_phylo.fasta') 559 | except OSError: 560 | pass 561 | with open(INPUT, 'r') as file_in: 562 | for line in file_in: 563 | rank = line.split('\t')[count + 2].split(',')[2] 564 | #print(rank) 565 | if rank == taxrank: 566 | species_file.append(line) 567 | for item in species_file: 568 | if len(species_file) < 2: 569 | print(f'only {len(species_file)} sequence in database that shares the {LEVEL} taxonomic rank with {species}, omitted from phylogenetic analysis.') 570 | elif len(species_file) > 100: 571 | print(f'{len(species_file)} sequences in database that share the {LEVEL} taxonomic rank with {species}, omitted from phylogenetic analysis') 572 | else: 573 | header = '>' + item.split('\t')[0] + '_' + item.split('\t')[8].split(',')[2] 574 | seq = item.rsplit('\t', 1)[1] 575 | with open(f'{species}_phylo.fasta', 'a') as file_out: 576 | file_out.write(header + '\n') 577 | file_out.write(seq) 578 | 579 | for species in species_list: 580 | my_file = Path(f'{species}_phylo.fasta') 581 | if my_file.is_file(): 582 | print(f'generating phylogenetic tree for {species}') 583 | muscle_cline = MuscleCommandline(input = my_file, out = f'{species}_align.clw', diags = True, maxiters = 1, log = f'{species}_align_log.txt', clw = True) 584 | muscle_cline() 585 | with open(f'{species}_align.clw', 'r') as aln: 586 | alignment = AlignIO.read(aln, 'clustal') 587 | calculator = DistanceCalculator('identity') 588 | Distance_matrix = calculator.get_distance(alignment) 589 | constructor = DistanceTreeConstructor(calculator, 'nj') 590 | tree = constructor.build_tree(alignment) 591 | fig = plt.figure(figsize = (25,15), dpi = 100) 592 | 
matplotlib.rc('font', size=12) 593 | matplotlib.rc('xtick', labelsize=10) 594 | matplotlib.rc('ytick', labelsize=10) 595 | axes = fig.add_subplot(1, 1, 1) 596 | Phylo.draw(tree, axes=axes, do_show = False) 597 | fig.savefig(f'{species}_tree_figure.pdf') 598 | print() 599 | 600 | ## incorrect parameter 601 | else: 602 | print('\nplease specify method of visualization: "diversity", "amplicon_length", "db_completeness", "phylo"\n') 603 | 604 | ## format the taxonomic lineage 605 | def tax_format(args): 606 | INPUT = args.input 607 | OUTPUT = args.output 608 | FORMAT = args.format 609 | 610 | ## format database to sintax 611 | if FORMAT == 'sintax': 612 | print(f'\nformatting {INPUT} to sintax format\n') 613 | with open(OUTPUT, 'w') as f_out: 614 | with open(INPUT, 'r') as f_in: 615 | for line in f_in: 616 | line = line.rstrip('\n') 617 | sintax = '>' + line.split('\t')[0] + ';tax=d:' + line.split('\t')[2].split(',')[2] + ',p:' + line.split('\t')[3].split(',')[2] + ',c:' + line.split('\t')[4].split(',')[2] + ',o:' + line.split('\t')[5].split(',')[2] + ',f:' + line.split('\t')[6].split(',')[2] + ',g:' + line.split('\t')[7].split(',')[2] + ',s:' + line.split('\t')[8].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 618 | f_out.write(sintax) 619 | 620 | ## format database to RDP 621 | elif FORMAT == 'rdp': 622 | print(f'\nformatting {INPUT} to RDP format\n') 623 | with open(OUTPUT, 'w') as f_out: 624 | with open(INPUT, 'r') as f_in: 625 | for line in f_in: 626 | line = line.rstrip('\n') 627 | rdp = '>' + line.split('\t')[0] + '\t' + 'root;' + line.split('\t')[2].split(',')[2] + ';' + line.split('\t')[3].split(',')[2] + ';' + line.split('\t')[4].split(',')[2] + ';' + line.split('\t')[5].split(',')[2] + ';' + line.split('\t')[6].split(',')[2] + ';' + line.split('\t')[7].split(',')[2] + ';' + line.split('\t')[8].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 628 | f_out.write(rdp) 629 | 630 | ## format database to QIIf 631 | elif FORMAT == 'qiif': 632 | print(f'\nformatting {INPUT} to QIIf format\n') 633 | fasta_f = OUTPUT + '.fasta' 634 | txt_f = OUTPUT + '.txt' 635 | with open(fasta_f, 'w') as f_out: 636 | with open(INPUT, 'r') as f_in: 637 | for line in f_in: 638 | line = line.rstrip('\n') 639 | fasta = '>' + line.split('\t')[0] + '\n' + line.split('\t')[9] + '\n' 640 | f_out.write(fasta) 641 | with open(txt_f, 'w') as f_out: 642 | with open(INPUT, 'r') as f_in: 643 | for line in f_in: 644 | tax = line.split('\t')[0] + '\t' + 'k__' + line.split('\t')[2].split(',')[2] + ';p__' + line.split('\t')[3].split(',')[2] + ';c__' + line.split('\t')[4].split(',')[2] + ';o__' + line.split('\t')[5].split(',')[2] + ';f__' + line.split('\t')[6].split(',')[2] + ';g__' + line.split('\t')[7].split(',')[2] + ';s__' + line.split('\t')[8].split(',')[2] + '\n' 645 | f_out.write(tax) 646 | 647 | ## format database to QIIz 648 | elif FORMAT == 'qiiz': 649 | print(f'\nformatting {INPUT} to QIIz format') 650 | print('still to add, not sure how this looks like') 651 | 652 | ## format database to DAD 653 | elif FORMAT == 'dad': 654 | print(f'\nformatting {INPUT} to DAD format\n') 655 | with open(OUTPUT, 'w') as f_out: 656 | with open(INPUT, 'r') as f_in: 657 | for line in f_in: 658 | line = line.rstrip('\n') 659 | dad = '>' + line.split('\t')[2].split(',')[2] + ';' + line.split('\t')[3].split(',')[2] + ';' + line.split('\t')[4].split(',')[2] + ';' + line.split('\t')[5].split(',')[2] + ';' + line.split('\t')[6].split(',')[2] + ';' + line.split('\t')[7].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 660 | 
f_out.write(dad) 661 | 662 | ## format database to DADs 663 | elif FORMAT == 'dads': 664 | print(f'\nformatting {INPUT} to DADs format\n') 665 | with open(OUTPUT, 'w') as f_out: 666 | with open(INPUT, 'r') as f_in: 667 | for line in f_in: 668 | line = line.rstrip('\n') 669 | dads = '>' + line.split('\t')[0] + ' ' + line.split('\t')[7].split(',')[2] + ' ' + line.split('\t')[8].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 670 | f_out.write(dads) 671 | 672 | ## format database to IDT 673 | elif FORMAT == 'idt': 674 | print(f'\nformatting {INPUT} to IDT format\n') 675 | with open(OUTPUT, 'w') as f_out: 676 | with open(INPUT, 'r') as f_in: 677 | for line in f_in: 678 | line = line.rstrip('\n') 679 | idt = '>' + line.split('\t')[2].split(',')[2] + ';' + line.split('\t')[3].split(',')[2] + ';' + line.split('\t')[4].split(',')[2] + ';' + line.split('\t')[5].split(',')[2] + ';' + line.split('\t')[6].split(',')[2] + ';' + line.split('\t')[7].split(',')[2] + ';' + line.split('\t')[8].split(',')[2] + '\n' + line.split('\t')[9] + '\n' 680 | f_out.write(idt) 681 | 682 | ## unknown format specified 683 | else: 684 | print('\nplease specify one of the accepted formats: "sintax", "rdp", "qiif", "qiiz", "dad", "dads", "idt"\n') 685 | 686 | ################################################ 687 | ################### ARGPARSE ################### 688 | ################################################ 689 | def main(): 690 | parser = argparse.ArgumentParser(description = 'creating a curated reference database') 691 | subparser = parser.add_subparsers() 692 | 693 | db_download_parser = subparser.add_parser('db_download', description = 'downloading sequence data from online databases') 694 | db_download_parser.set_defaults(func = db_download) 695 | db_download_parser.add_argument('-s', '--source', help = 'specify online database used to download sequences. Currently supported options are: (1) ncbi, (2) embl, (3) mitofish, (4) bold, (5) taxonomy', dest = 'source', type = str, required = True) 696 | db_download_parser.add_argument('-db', '--database', help = 'specific database used to download sequences. Example NCBI: nucleotide. Example EMBL: mam*. Example BOLD: Actinopterygii', dest = 'database', type = str) 697 | db_download_parser.add_argument('-q', '--query', help = 'NCBI query search to limit portion of database to be downloaded. Example: "16S[All Fields] AND ("1"[SLEN] : "50000"[SLEN])"', dest = 'query', type = str) 698 | db_download_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str) 699 | db_download_parser.add_argument('-k', '--keep_original', help = 'keep original downloaded file, default = "no"', dest = 'orig', type = str, default = 'no') 700 | db_download_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str) 701 | db_download_parser.add_argument('-b', '--batchsize', help = 'number of sequences downloaded from NCBI per iteration. 
Default = 5000', dest = 'batchsize', type = int, default = 5000) 702 | 703 | db_import_parser = subparser.add_parser('db_import', description = 'import existing or curated database') 704 | db_import_parser.set_defaults(func = db_import) 705 | db_import_parser.add_argument('-i', '--input', help = 'input database filename', dest = 'input', type = str, required = True) 706 | db_import_parser.add_argument('-s', '--seq_header', help = 'information provided in sequence header: "accession" or "species"', dest = 'header', type = str, required = True) 707 | db_import_parser.add_argument('-o', '--output', help = 'output file name option', dest = 'output', type = str, required = True) 708 | db_import_parser.add_argument('-f', '--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str) 709 | db_import_parser.add_argument('-r', '--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str) 710 | db_import_parser.add_argument('-d', '--delim', help = 'delimiter specifying species or accession', dest = 'delim', type = str, required = True) 711 | 712 | db_merge_parser = subparser.add_parser('db_merge', description = 'merge multiple databases') 713 | db_merge_parser.set_defaults(func = db_merge) 714 | db_merge_parser.add_argument('-i', '--input', nargs = '+', help = 'list of files to be merged', dest = 'input', required = True) 715 | db_merge_parser.add_argument('-u', '--uniq', help = 'keep only unique accession numbers', dest = 'uniq', type = str, default = '') 716 | db_merge_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 717 | 718 | in_silico_pcr_parser = subparser.add_parser('insilico_pcr', description = 'curating the downloaded reference sequences with an in silico PCR') 719 | in_silico_pcr_parser.set_defaults(func = insilico_pcr) 720 | in_silico_pcr_parser.add_argument('-f', '--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str, required = True) 721 | in_silico_pcr_parser.add_argument('-r', '--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str, required = True) 722 | in_silico_pcr_parser.add_argument('-i', '--input', help = 'input filename', dest = 'input', type = str, required = True) 723 | in_silico_pcr_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 724 | in_silico_pcr_parser.add_argument('-e', '--error', help = 'number of errors allowed in primer-binding site. 
Default = 4.5', dest = 'error', type = str, default = '4.5') 725 | 726 | ref_database_parser = subparser.add_parser('assign_tax', description = 'creating the reference database with taxonomic information') 727 | ref_database_parser.set_defaults(func = assign_tax) 728 | ref_database_parser.add_argument('-i', '--input', help = 'input file containing the curated fasta sequences after in silico PCR', dest = 'input', type = str, required = True) 729 | ref_database_parser.add_argument('-o', '--output', help = 'curated reference database output file', dest = 'output', type = str, required = True) 730 | ref_database_parser.add_argument('-a', '--acc2tax', help = 'accession to taxid file name', dest = 'acc2tax', type = str, required = True) 731 | ref_database_parser.add_argument('-t', '--taxid', help = 'taxid file name', dest = 'taxid', type = str, required = True) 732 | ref_database_parser.add_argument('-n', '--name', help = 'phylogeny file name', dest = 'name', type = str, required = True) 733 | 734 | dereplication_parser = subparser.add_parser('dereplicate', description = 'dereplicating the database') 735 | dereplication_parser.set_defaults(func = dereplicate) 736 | dereplication_parser.add_argument('-i', '--input', help = 'filename of the curated reference database', dest = 'input', type = str, required = True) 737 | dereplication_parser.add_argument('-o', '--output', help = 'filename of the dereplicated curated reference database', dest = 'output', type = str, required = True) 738 | dereplication_parser.add_argument('-m', '--method', help = 'method of dereplication: "strict", "single_species", "uniq_species"', dest = 'method', type = str, required = True) 739 | 740 | seq_cleanup_parser = subparser.add_parser('seq_cleanup', description = 'filtering the database on sequence and header parameters') 741 | seq_cleanup_parser.set_defaults(func = db_filter) 742 | seq_cleanup_parser.add_argument('-min', '--minlen', help = 'minimum sequence length to be retained in the database. Default = 100', dest = 'minlen', type = int, default = '100') 743 | seq_cleanup_parser.add_argument('-max', '--maxlen', help = 'maximum sequence length to be retained in the database. Default = 500', dest = 'maxlen', type = int, default = '500') 744 | seq_cleanup_parser.add_argument('-n', '--maxns', help = 'maximum number of ambiguous bases allowed in the sequence. Default = 0', dest = 'maxns', type = int, default = '0') 745 | seq_cleanup_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 746 | seq_cleanup_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 747 | seq_cleanup_parser.add_argument('-d', '--discard', help = 'file name of discarded sequences', dest = 'discard', type = str, default = 'no') 748 | seq_cleanup_parser.add_argument('-e', '--enviro', help = 'discard environmental sequences from the dataset. yes/no', dest = 'env', type = str, default = 'no') 749 | seq_cleanup_parser.add_argument('-s', '--species', help = 'discard sequences for which the species name is unspecified. 
yes/no', dest = 'spec', type = str, default = 'no') 750 | seq_cleanup_parser.add_argument('-na', '--nans', help = 'discard sequences with N number of unspecified taxonomic levels', dest = 'nans', type = str, default = 'no') 751 | 752 | visualization_parser = subparser.add_parser('visualization', description = 'figure displaying various aspects of the reference database') 753 | visualization_parser.set_defaults(func = visualization) 754 | visualization_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 755 | visualization_parser.add_argument('-o', '--output', help = 'output file name for db_completeness method', dest = 'output', type = str) 756 | visualization_parser.add_argument('-m', '--method', help = 'method of visualization: "diversity", "amplicon_length", "db_completeness", "phylo"', dest = 'method', type = str, required = True) 757 | visualization_parser.add_argument('-l', '--level', help = 'taxonomic level to split the database for diversity, amplicon_length, and phylo methods: "superkingdom", "phylum", "class", "order", "family", "genus", "species"', dest = 'level', type = str) 758 | visualization_parser.add_argument('-s', '--species', help = 'list of species of interest for phylo and db_completeness methods', dest = 'species', type = str) 759 | visualization_parser.add_argument('-t', '--taxid', help = 'taxid file name for phylo and db_completeness methods', dest = 'taxid', type = str) 760 | visualization_parser.add_argument('-n', '--name', help = 'phylogeny file name for phylo and db_completeness methods', dest = 'name', type = str) 761 | 762 | format_database_parser = subparser.add_parser('tax_format', description = 'formatting the database to various formats') 763 | format_database_parser.set_defaults(func = tax_format) 764 | format_database_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 765 | format_database_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 766 | format_database_parser.add_argument('-f', '--format', help = 'process database to format: "sintax", "rdp", "qiif", "qiiz", "dad", "dads", "idt"', dest = 'format', type = str, required = True) 767 | 768 | args = parser.parse_args() 769 | args.func(args) 770 | 771 | if __name__ == '__main__': 772 | main() 773 | -------------------------------------------------------------------------------- /function/older_versions/reference_database_creator_v2.1.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | ## import modules 4 | import argparse 5 | from Bio import Entrez 6 | import time 7 | from urllib.error import HTTPError 8 | import http.client 9 | http.client.HTTPConnection._http_vsn = 10 10 | http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0' 11 | import subprocess as sp 12 | import shutil 13 | from string import digits 14 | import pandas as pd 15 | from tqdm import tqdm 16 | from Bio.Seq import Seq 17 | from Bio.SeqRecord import SeqRecord 18 | from Bio import SeqIO 19 | from Bio.SeqIO import FastaIO 20 | import os 21 | import zipfile 22 | from os import listdir 23 | import matplotlib 24 | import matplotlib.pyplot as plt 25 | from Bio import AlignIO 26 | from Bio import Phylo 27 | from Bio.Align.Applications import MuscleCommandline 28 | from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceMatrix 29 | from Bio.Phylo.TreeConstruction import DistanceTreeConstructor 30 | from functions.module_1 import esearch_fasta 31 | from functions.module_1 import efetch_seqs_from_webenv 32 | from functions.module_1 import seq_dict_from_seq_xml 33 | from functions.module_1 import fasta_file_from_seq_dict 34 | from functions.module_1 import get_taxid_from_seq_xml 35 | from functions.module_1 import create_taxid_table 36 | from functions.module_1 import embl_download 37 | from functions.module_1 import embl_format 38 | from functions.module_1 import accession_list_from_fasta 39 | from functions.module_1 import taxid_table_from_accession 40 | from functions.module_1 import mitofish_download 41 | from functions.module_1 import mitofish_format 42 | from functions.module_1 import check_accession 43 | 44 | ##################################################### 45 | ## helper functions ################################# 46 | #### later can be put in separate file ############## 47 | ##################################################### 48 | 49 | def fasta_to_dict_wDesc(fasta_file): 50 | seq_dict = {} 51 | for record in SeqIO.parse(fasta_file, 'fasta'): 52 | record.description = record.description.replace(' ', '_') 53 | record.id = record.description 54 | rec_id = record.id 55 | rec_desc = record.description 56 | rec_seq = str(record.seq) 57 | seq_dict.setdefault(rec_id, {})['sequence'] = rec_seq 58 | seq_dict.setdefault(rec_id, {})['description'] = rec_desc 59 | return seq_dict 60 | 61 | def fasta_to_dict(fasta_file): 62 | """turn fasta file into seq dict with format {seq_id : sequence, seq_id2: sequence2}""" 63 | #seq_input = open(fasta_file, 'r') 64 | seq_dict = {} 65 | for record in SeqIO.parse(fasta_file, 'fasta'): 66 | rec_id = record.id 67 | rec_desc = record.description 68 | rec_seq = str(record.seq) 69 | seq_dict.setdefault(rec_id, {})['sequence']=rec_seq 70 | seq_dict.setdefault(rec_id, {})['description']=rec_desc 71 | return seq_dict 72 | 73 | def derep(seqdict): 74 | rep_dict = {} 75 | derep_dict = {} 76 | for k,v in seqdict.items(): 77 | rep_dict.setdefault(v, []).append(k) 78 | for key, value in rep_dict.items(): 79 | numreads = len(value) 80 | newname = value[0] 81 | derep_dict[newname] = {'seq': key, 'size': numreads, 'readlist': value} 82 | return derep_dict 83 | 84 | def derep_to_seq(derep_dict, size = 'no'): 85 | new_dict = {} 86 | read_dict = {} 87 | for k,v in derep_dict.items(): 88 | data = v 89 | if size == 'no': 90 | base_id = k 91 | else: 92 | base_id = k + ';size='+str(data['size']) 93 | read_dict[base_id] = data['readlist'] 94 | new_dict[base_id] = data['seq'] 95 | return (new_dict, read_dict) 96 | 97 | def read_taxid_table(taxid_table_name): 
98 | table_file = open(taxid_table_name, 'r') 99 | taxid_dict = {} 100 | for line in table_file: 101 | line = line.strip('\n') 102 | line_parts = line.split('\t') 103 | taxid_dict[line_parts[0]]=line_parts[1] 104 | table_file.close() 105 | return taxid_dict 106 | 107 | def efetch_taxonomy_xml(taxid_set, email, lineage_batch=5000): 108 | lineage_list = [] 109 | Entrez.email = email 110 | 111 | for start in tqdm(range(0, len(taxid_set), lineage_batch)): 112 | lineage_group = taxid_set[start : start + lineage_batch] 113 | lineage_attempt = 1 114 | lineage_success = False 115 | while lineage_attempt <= 3 and not lineage_success: 116 | lineage_attempt += 1 117 | try: 118 | lineage_search = Entrez.efetch(db = 'taxonomy', retmode = 'xml', id = ','.join(lineage_group)) 119 | lineage_record = Entrez.read(lineage_search) 120 | lineage_list.append(lineage_record) 121 | lineage_success = True 122 | except HTTPError as err: 123 | if 500 <= err.code <= 599: 124 | print(f'Received error from server {err}') 125 | print(f'Attempt {lineage_attempt} of 3') 126 | time.sleep(15) 127 | else: 128 | raise 129 | return lineage_list 130 | 131 | def dataframe_from_taxonomy(taxonomy_list, ranks_used='default'): 132 | if ranks_used == 'default': 133 | ranks = ['superkingdom','phylum', 'class', 'order', 'family', 'genus', 'species'] 134 | else: 135 | ranks = ranks_used 136 | 137 | lineage_info = [] 138 | for key in taxonomy_list: 139 | for i in range(len(key)): 140 | lineage = {d['Rank']:d['ScientificName'] for d in key[i]['LineageEx'] if d['Rank'] in ranks} 141 | lineage['species'] = key[i]['ScientificName'] 142 | lineage['taxid'] = key[i]['TaxId'] 143 | lineage_info.append(lineage) 144 | tax_df = pd.DataFrame(lineage_info) 145 | return tax_df 146 | 147 | def sintax_from_df(df, output_file_name): 148 | df['species'] = df['species'].str.replace(' ', '_') 149 | df['sintax'] = '>' + df['accession'] + ';tax=d:' + df['superkingdom'] + ',p:' + df['phylum'] + ',c:' + df['class'] + ',o:' + df['order'] + ',f:' + df['family'] + ',g:' + df['genus'] + ',s:' + df['species'] 150 | datafr = df[['sintax', 'sequence']] 151 | datafr.to_csv(output_file_name, index = None, header = None, sep = '\n') 152 | 153 | ############################################### 154 | ###### MAIN COMMANDS ########################## 155 | ############################################### 156 | ###### MODULE DATABASE DOWNLOAD ############### 157 | ############################################### 158 | 159 | ## function: download sequencing data from online databases 160 | def db_download(args): 161 | SOURCE = args.source 162 | DATABASE = args.database 163 | QUERY = args.query 164 | OUTPUT = args.output 165 | EMAIL = args.email 166 | 167 | ## download sequencing data from NCBI 168 | if SOURCE == 'ncbi': 169 | print('\ndownloading sequences from NCBI') 170 | if all(v is not None for v in [DATABASE, QUERY, OUTPUT, EMAIL]): 171 | print('\nlooking up the number of sequences that match the query\n') 172 | search_record = esearch_fasta(QUERY, DATABASE, EMAIL) 173 | print('found {} matching sequences'.format(search_record['Count'])) 174 | print('\nstarting the download\n') 175 | batch_size = 5000 176 | fetch_seqs = efetch_seqs_from_webenv(search_record, DATABASE, EMAIL, batch_size) 177 | sequences = seq_dict_from_seq_xml(fetch_seqs) 178 | num_sequences = fasta_file_from_seq_dict(sequences, OUTPUT) 179 | print(num_sequences, ' sequences written to file:', OUTPUT) 180 | acc_taxid = get_taxid_from_seq_xml(fetch_seqs) 181 | taxid_tab_name = OUTPUT+'.taxid_table.tsv' 182 | 
num_accs = create_taxid_table(acc_taxid, taxid_tab_name) 183 | print(num_accs, ' accessions written to file:', taxid_tab_name) 184 | else: 185 | print('parameter missing: database, query, output, and email are required for NCBI downloads') 186 | 187 | ## download sequencing data from EMBL 188 | elif SOURCE == 'embl': 189 | if all(v is not None for v in [DATABASE, EMAIL]): 190 | print('\ndownloading sequences from EMBL') 191 | dl_files = embl_download(DATABASE) 192 | print('formatting downloaded files to fasta format') 193 | fasta_files = embl_format(dl_files) 194 | for fasta in fasta_files: 195 | print(f'retrieving tax ID information for each accession in {fasta}') 196 | acc_list = accession_list_from_fasta(fasta) 197 | taxid_tab_name = fasta + '.taxid_table.tsv' 198 | num_taxid = taxid_table_from_accession(acc_list, EMAIL, taxid_tab_name) 199 | print(num_taxid, ' accessions and tax IDs written to file: ', taxid_tab_name) 200 | else: 201 | print('parameter missing: database and email are required for EMBL downloads') 202 | 203 | ## download sequencing data from MitoFish 204 | elif SOURCE == 'mitofish': 205 | if all(v is not None for v in [OUTPUT, EMAIL]): 206 | print('\ndownloading sequences from MITOFISH') 207 | url = 'http://mitofish.aori.u-tokyo.ac.jp/files/complete_partial_mitogenomes.zip' 208 | dl_file = mitofish_download(url) 209 | print(f'formatting {dl_file} to fasta format') 210 | mitoformat = mitofish_format(dl_file, OUTPUT) 211 | print(f'retrieving tax ID information for each accession in {OUTPUT}') 212 | acc_list = accession_list_from_fasta(OUTPUT) 213 | taxid_tab_name = OUTPUT + '.taxid_table.tsv' 214 | num_taxid = taxid_table_from_accession(acc_list, EMAIL, taxid_tab_name) 215 | print(num_taxid, ' accessions and tax IDs written to file: ', taxid_tab_name) 216 | else: 217 | print('parameter missing: output and email are required for MitoFish downloads') 218 | 219 | ## download sequencing data from BOLD (not yet implemented) 220 | elif SOURCE == 'bold': 221 | print('\ndownloading sequences from BOLD is not yet implemented') 222 | 223 | ## download taxonomy information 224 | elif SOURCE == 'taxonomy': 225 | print('\ndownloading taxonomy information') 226 | url_acc2taxid = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz' 227 | url_taxdump = 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' 228 | results = sp.run(['wget', url_acc2taxid]) 229 | results = sp.run(['gunzip', 'nucl_gb.accession2taxid.gz']) 230 | results = sp.run(['wget', url_taxdump]) 231 | results = sp.run(['tar', '-zxvf', 'taxdump.tar.gz']) 232 | files_to_remove = ['citations.dmp', 'delnodes.dmp', 'division.dmp', 'gencode.dmp', 'merged.dmp', 'gc.prt', 'readme.txt', 'taxdump.tar.gz'] 233 | for file in files_to_remove: 234 | os.remove(file) 235 | 236 | ## print statement if source information is missing 237 | else: 238 | print('Please specify a database to download sequences from using the "source" argument. Currently supported options are "ncbi", "embl", "mitofish", and "taxonomy".') 239 | 240 | ## function: import existing or custom database 241 | def db_import(args): 242 | INPUT = args.input 243 | HEADER = args.header 244 | OUTPUT = args.output 245 | EMAIL = args.email 246 | FWD = args.fwd 247 | REV = args.rev 248 | 249 | if HEADER == 'accession': 250 | # check for correct formatting of file 251 | if all(v is not None for v in [INPUT, OUTPUT, EMAIL]): 252 | print(f'\nchecking correct formatting of accession numbers in {INPUT}') 253 | incorrect_accession = check_accession(INPUT, OUTPUT) 254 | if len(incorrect_accession) != 0: 255 | print('found incorrectly formatted accession numbers. Please check file: "incorrect_accession_numbers.txt"')
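# hypothetical example of the check above: a header such as '>MH358810.1 Cyprinus carpio ...' parses to the accession 'MH358810' and passes,
# whereas a custom header such as '>carp_sample_01' fails the accession format check and is written to the report file below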
256 | with open('incorrect_accession_numbers.txt', 'w') as fout: 257 | for item in incorrect_accession: 258 | fout.write(item + '\n') 259 | # generate taxid table 260 | else: 261 | print(f'found no formatting issues in {INPUT}') 262 | print(f'retrieving tax ID information for each accession in {INPUT}') 263 | acc_list = accession_list_from_fasta(OUTPUT) 264 | taxid_tab_name = OUTPUT + '.taxid_table.tsv' 265 | num_taxid = taxid_table_from_accession(acc_list, EMAIL, taxid_tab_name) 266 | print(num_taxid, ' accessions and tax IDs written to file: ', taxid_tab_name) 267 | else: 268 | print('parameter missing: input, output, and email are required when importing with accession headers') 269 | # add primer sequences if option is chosen 270 | if all(v is not None for v in [FWD, REV]): 271 | print(f'appending primer sequences to each sequence in {OUTPUT}') 272 | REV_DNA = Seq(REV) 273 | REV_CORRECT = str(REV_DNA.reverse_complement()) 274 | appended = [f'>{rec.description}\n{FWD}{str(rec.seq)}{REV_CORRECT}' for rec in SeqIO.parse(OUTPUT, 'fasta')] # assumed behaviour: this block was left unimplemented in the original 275 | with open(OUTPUT, 'w') as fout: 276 | fout.write('\n'.join(appended) + '\n') 277 | elif HEADER == 'species': 278 | print('\ngenerating new accession numbers for species') 279 | 280 | else: 281 | print('\nPlease specify header information. Currently supported options are: "accession" and "species"') 282 | 283 | ## function: merge multiple databases 284 | def db_merge(args): 285 | INPUT = args.input 286 | UNIQ = args.uniq 287 | OUTPUT = args.output 288 | FORMAT = args.format 289 | DISCARD = args.discard 290 | 291 | # merge database files 292 | if FORMAT == 'db': 293 | # merge based on unique accession numbers 294 | if UNIQ != '': 295 | print('\nmerging all fasta files and discarding duplicate accession numbers') 296 | seqdict = {} 297 | discard = [] 298 | for file in INPUT: 299 | count = 0 300 | added = 0 301 | for record in SeqIO.parse(file, 'fasta'): 302 | count = count + 1 303 | id = '>' + record.id.split('.')[0] + '\n' 304 | seq = str(record.seq) + '\n' 305 | if id not in seqdict: 306 | added = added +1 307 | seqdict[id] = seq 308 | else: 309 | discard.append(id) 310 | print(f'found {count} sequences in {file}') 311 | print(f'added {added} sequences to {OUTPUT}') 312 | with open(OUTPUT, 'w') as file: 313 | for k,v in seqdict.items(): 314 | file.write(k) 315 | file.write(v) 316 | if DISCARD: # DISCARD is None when the option is not given; the original compared against '' and crashed on open(None) 317 | with open(DISCARD, 'w') as disc: 318 | for item in discard: 319 | disc.write(item) 320 | # merge all sequences without filtering 321 | else: 322 | print('\nmerging all fasta files and keeping duplicate accession numbers') 323 | with open(OUTPUT, 'w') as fout: 324 | for file in INPUT: 325 | with open(file, 'r') as fin: 326 | for line in fin: 327 | fout.write(line) 328 | 329 | # merge taxonomic ID tables 330 | elif FORMAT == 'taxid': 331 | print('merging taxid tables is not yet implemented') 332 | 333 | else: 334 | print('Please specify which format to merge. 
Accepted options are "db" and "taxid"') 335 | 336 | 337 | ############################################### 338 | ###### MODULE IN SILICO PCR ################### 339 | ############################################### 340 | 341 | ## function: in silico PCR 342 | def ispcr(args): 343 | FWD = args.fwd 344 | REV = args.rev 345 | ASSAY = args.assay 346 | INPUT = args.input 347 | ERROR = args.error 348 | 349 | ## reverse complement reverse primer sequence 350 | REV_DNA = Seq(REV) 351 | REV_CORRECT = str(REV_DNA.reverse_complement()) 352 | 353 | ## setting variable names using the info from user input 354 | TRIMMED_INIT = 'init_trimmed_' + ASSAY + '_' + INPUT 355 | UNTRIMMED_INIT = 'init_untrimmed_' + ASSAY + '_' + INPUT 356 | REVCOMP_UNTRIMMED_INIT = 'revcomp_' + UNTRIMMED_INIT 357 | TRIMMED_REVCOMP = 'revcomp_' + TRIMMED_INIT 358 | UNTRIMMED_REVCOMP = 'untrimmed_' + REVCOMP_UNTRIMMED_INIT 359 | FINAL_TRIMMED = 'final_trimmed_' + ASSAY + '_' + INPUT 360 | 361 | OVERLAP = str(min([len(FWD), len(REV_CORRECT)])) 362 | ADAPTER = FWD + '...' + REV_CORRECT 363 | 364 | ## run cutadapt on downloaded fasta file 365 | count_init = len(list(SeqIO.parse(INPUT, 'fasta'))) 366 | print('\nrunning in silico PCR on fasta file containing {} sequences'.format(count_init)) 367 | cmnd_cutadapt_1 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_INIT, INPUT, '--untrimmed-output', UNTRIMMED_INIT, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 368 | sp.call(cmnd_cutadapt_1) 369 | count_trimmed_init = len(list(SeqIO.parse(TRIMMED_INIT, 'fasta'))) 370 | print('\nfound primers in {} sequences'.format(count_trimmed_init)) 371 | 372 | ## run vsearch to reverse complement untrimmed sequences 373 | count_untrimmed_init = len(list(SeqIO.parse(UNTRIMMED_INIT, 'fasta'))) 374 | print('\nreverse complementing {} untrimmed sequences'.format(count_untrimmed_init)) 375 | cmnd_vsearch_revcomp = ['vsearch', '--fastx_revcomp', UNTRIMMED_INIT, '--fastaout', REVCOMP_UNTRIMMED_INIT, '--quiet'] 376 | sp.call(cmnd_vsearch_revcomp) 377 | 378 | ## run cutadapt on reverse complemented untrimmed sequences 379 | print('\nrunning in silico PCR on {} reverse complemented untrimmed sequences'.format(count_untrimmed_init)) 380 | cmnd_cutadapt_2 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_REVCOMP, REVCOMP_UNTRIMMED_INIT, '--untrimmed-output', UNTRIMMED_REVCOMP, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 381 | sp.call(cmnd_cutadapt_2) 382 | count_trimmed_second = len(list(SeqIO.parse(TRIMMED_REVCOMP, 'fasta'))) 383 | print('\nfound primers in {} sequences'.format(count_trimmed_second)) 384 | 385 | ## concatenate both trimmed files 386 | with open(FINAL_TRIMMED, 'wb') as wfd: 387 | for f in [TRIMMED_INIT, TRIMMED_REVCOMP]: 388 | with open(f, 'rb') as fd: 389 | shutil.copyfileobj(fd, wfd) 390 | 391 | ## remove intermediary files 392 | files = [TRIMMED_INIT, UNTRIMMED_INIT, REVCOMP_UNTRIMMED_INIT, TRIMMED_REVCOMP, UNTRIMMED_REVCOMP] 393 | for file in files: 394 | os.remove(file) 395 | 396 | 397 | ############################################### 398 | ###### MODULE TAXONOMY ASSIGNMENT ############# 399 | ############################################### 400 | 401 | ## function: creating reference database with taxonomy 402 | def tax_assign(args): 403 | INPUT = args.input 404 | TABLE = args.taxid_table 405 | OUTPUT = args.output 406 | EMAIL = args.email 407 | 408 | # Get final sequence accessions from sequence file 409 | input_seq_dict = fasta_to_dict(INPUT) 410 | final_acc_list = list(input_seq_dict.keys()) 411 | final_accessions = 
set(final_acc_list) 412 | 413 | ## retrieve accession numbers from table file and store in list 414 | taxid_dict = read_taxid_table(TABLE) 415 | final_taxid_dict = {} 416 | for k,v in taxid_dict.items(): 417 | if k in final_accessions: 418 | final_taxid_dict[k]=v 419 | taxids = list(final_taxid_dict.values()) 420 | uniq_taxid = list(set(taxids)) 421 | print('\nfound {} accessions in input file'.format(len(final_accessions))) 422 | print("\ndownloading {} taxonomic ID's from NCBI".format(len(uniq_taxid))) 423 | taxonomy_list = efetch_taxonomy_xml(uniq_taxid, EMAIL) 424 | lineage_df = dataframe_from_taxonomy(taxonomy_list) 425 | 426 | #lineage_df = pd.DataFrame(lineage_info) 427 | taxid_colNames = ['taxid'] 428 | taxid_df = (pd.DataFrame.from_dict(final_taxid_dict, orient='index', columns=taxid_colNames).rename_axis('accession').reset_index()) 429 | seq_df = (pd.DataFrame.from_dict(input_seq_dict, orient='index').rename_axis('accession').reset_index()) 430 | taxid_lineage = taxid_df.merge(lineage_df, how = 'left', on = 'taxid') 431 | all_df = taxid_lineage.merge(seq_df, on = 'accession') 432 | 433 | # output a table with all info 434 | out_parts = OUTPUT.split('.') 435 | TABOUT = '.'.join(out_parts[:-1]) 436 | TABOUT = TABOUT+'_table.tsv' 437 | all_df.to_csv(TABOUT, index = None, sep = '\t') 438 | 439 | # create a sintax output (add other options later) 440 | sintax_from_df(all_df, OUTPUT) 441 | 442 | 443 | ############################################### 444 | ###### MODULE DATABASE CLEANUP ################ 445 | ############################################### 446 | 447 | ## function: dereplicating the database 448 | def dereplicate(args): 449 | INPUT = args.input 450 | OUTPUT = args.output 451 | 452 | # split sequence file into two dictionaries and define which species need dereplication 453 | seq_file = INPUT 454 | seqs = fasta_to_dict_wDesc(seq_file) 455 | print('\nfound {} sequences in input file'.format(len(seqs))) 456 | seq_just_id = {} 457 | taxonly = {} 458 | for k,v in seqs.items(): 459 | parts = v['description'].split(';tax=') 460 | seq_id = parts[0] 461 | tax = parts[1] 462 | seq_just_id[seq_id] = v['sequence'] 463 | taxonly.setdefault(tax, []).append(seq_id) 464 | print('\ndatabase is comprised of {} unique taxa'.format(len(taxonly))) 465 | need_derep = [] 466 | singletons = {} 467 | for k,v in taxonly.items(): 468 | if len(v) > 1: 469 | need_derep.append(k) 470 | else: 471 | singletons[v[0]] = k 472 | print('\n{} taxa only occur once in the database'.format(len(singletons))) 473 | print('\n{} taxa occur multiple times in the database'.format(len(need_derep))) 474 | tax_index = {} 475 | for k,v in taxonly.items(): 476 | if k in need_derep: 477 | for seqid in v: 478 | tax_index[seqid] = k 479 | 480 | # dereplicate sequences for species represented more than once in the database 481 | all_dereps = {} 482 | for d in need_derep: 483 | temp_seq_dict = {} 484 | for seqid in taxonly[d]: 485 | temp_seq_dict[seqid] = seq_just_id[seqid] 486 | dr_temp = derep(temp_seq_dict) 487 | derep_seq = derep_to_seq(dr_temp, size = 'no') 488 | derep_seq = derep_seq[0] 489 | for k,v in derep_seq.items(): 490 | new_id = k+';tax='+tax_index[k] 491 | all_dereps[new_id] = v 492 | 493 | # combine species present only once in the database with the dereplicated dataset 494 | all_new_seqs = {} 495 | for k,v in singletons.items(): 496 | new_id = k + ';tax=' + v 497 | seq = seq_just_id[k] 498 | all_new_seqs[new_id] = seq 499 | for key, value in all_dereps.items(): 500 | all_new_seqs[key] = value 501 | print('\n{} sequences left after dereplication\n'.format(len(all_new_seqs)))
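# e.g. three records that share both an identical sequence and the tax string ';tax=...,s:Sprattus_sprattus' collapse
# into a single record named after the first accession in the group; distinct sequences for a species are all retained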
502 | 503 | # save the dereplicated database 504 | output = OUTPUT 505 | seqout = open(output, 'w') 506 | for k,v in all_new_seqs.items(): 507 | seqout.write('>' + k + '\n' + v + '\n') 508 | seqout.close() 509 | 510 | 511 | ## function: sequence cleanup 512 | def seq_cleanup(args): 513 | MINLEN = int(args.minlen) # argparse passes these as strings; cast for the numeric comparisons below 514 | MAXLEN = int(args.maxlen) 515 | MAXNS = int(args.maxns) 516 | INPUT = args.input 517 | OUTPUT = args.output 518 | DISCARD = args.discard 519 | 520 | # read in input file and clean up given the parameters 521 | clean_db = [] 522 | discard_db = [] 523 | count = 0 524 | count_clean = 0 525 | for seq_record in SeqIO.parse(INPUT, 'fasta'): 526 | count = count + 1 527 | sequence = str(seq_record.seq).upper() 528 | if len(sequence) >= MINLEN and len(sequence) <= MAXLEN and sequence.count('N') <= MAXNS: 529 | clean_db.append(seq_record) 530 | count_clean = count_clean + 1 531 | else: 532 | discard_db.append(seq_record) 533 | 534 | # write cleaned database to file 535 | cleaned = count - count_clean 536 | print(f'\nfound {count} sequences in database prior to cleanup') 537 | print(f'\nremoved {cleaned} sequences during cleanup') 538 | print(f'\n{count_clean} sequences left after cleanup\n') 539 | clean_db_fa = [FastaIO.as_fasta_2line(record) for record in clean_db] 540 | with open(OUTPUT, 'w') as file: 541 | for item in clean_db_fa: 542 | file.write(item) 543 | 544 | # write discarded sequences to file 545 | if DISCARD != 'no': 546 | discard_db_fa = [FastaIO.as_fasta_2line(record) for record in discard_db] 547 | with open(DISCARD, 'w') as file: 548 | for item in discard_db_fa: 549 | file.write(item) 550 | 551 | 552 | ## function: header cleanup 553 | # (3) specific taxonomic groups - still to add 554 | # (4) specific missing taxonomic level - still to add 555 | def header_cleanup(args): 556 | ENV = args.env 557 | SPEC = args.spec 558 | NANS = args.nans 559 | INPUT = args.input 560 | OUTPUT = args.output 561 | 562 | clean_db = [] 563 | # filter data on keyword 'environmental' 564 | if ENV == 'yes': 565 | env_count = 0 566 | env_total = 0 567 | for seq_record in SeqIO.parse(INPUT, 'fasta'): 568 | env_total = env_total + 1 569 | id = str(seq_record.id).upper() 570 | if id.count('ENVIRONMENTAL') == 0: 571 | env_count = env_count + 1 572 | clean_db.append(seq_record) 573 | env_removed = env_total - env_count 574 | print(f'\nremoved {env_removed} environmental sequences from a total of {env_total} sequences in the database') 575 | 576 | # filter data if species name is not specified 577 | if SPEC == 'yes': 578 | if len(clean_db) == 0: 579 | spec_count = 0 580 | spec_total = 0 581 | for seq_record in SeqIO.parse(INPUT, 'fasta'): 582 | spec_total = spec_total + 1 583 | id = str(seq_record.id).upper() 584 | if id.count('_SP.') == 0: 585 | spec_count = spec_count + 1 586 | clean_db.append(seq_record) 587 | spec_removed = spec_total - spec_count 588 | print(f'\nremoved {spec_removed} entries from database not containing a species name from a total of {spec_total} sequences in the database') 589 | else: 590 | spec_db = [] 591 | spec_count = 0 592 | spec_total = 0 593 | for seq_record in clean_db: 594 | spec_total = spec_total + 1 595 | id = str(seq_record.id).upper() 596 | if id.count('_SP.') == 0: 597 | spec_count = spec_count + 1 598 | spec_db.append(seq_record) 599 | spec_removed = spec_total - spec_count 600 | print(f'\nremoved {spec_removed} entries from database not containing a species name from a total of {spec_total} sequences in the database')
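# e.g. a header whose tax string ends in 's:Gobius_sp.' carries no species epithet and is removed by the '_SP.' check above,
# while 's:Gobius_niger' is retained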
601 | clean_db = [] 602 | clean_db = spec_db 603 | 604 | # filter data on missing taxonomic levels 605 | if NANS != 'nan': 606 | if len(clean_db) == 0: 607 | nans_count = 0 608 | nans_total = 0 609 | for seq_record in SeqIO.parse(INPUT, 'fasta'): 610 | nans_total = nans_total + 1 611 | id = str(seq_record.id).upper() 612 | if id.count(':NAN') <= int(NANS): # NANS arrives as a string from argparse 613 | nans_count = nans_count + 1 614 | clean_db.append(seq_record) 615 | nans_removed = nans_total - nans_count 616 | print(f'\nremoved {nans_removed} entries with more than {NANS} missing taxonomic levels from a total of {nans_total} sequences in the database') 617 | else: 618 | nans_db = [] 619 | nans_count = 0 620 | nans_total = 0 621 | for seq_record in clean_db: 622 | nans_total = nans_total + 1 623 | id = str(seq_record.id).upper() 624 | if id.count(':NAN') <= int(NANS): 625 | nans_count = nans_count + 1 626 | nans_db.append(seq_record) 627 | nans_removed = nans_total - nans_count 628 | print(f'\nremoved {nans_removed} entries with more than {NANS} missing taxonomic levels from a total of {nans_total} sequences in the database') 629 | clean_db = [] 630 | clean_db = nans_db 631 | 632 | # write cleaned up database to output file 633 | clean_db_fa = [FastaIO.as_fasta_2line(record) for record in clean_db] 634 | with open(OUTPUT, 'w') as file: 635 | for item in clean_db_fa: 636 | file.write(item) 637 | 638 | 639 | ############################################### 640 | ###### MODULE VISUALISATIONS ################## 641 | ############################################### 642 | 643 | ## function: phylogenetic tree builder 644 | def phylo(args): 645 | SPECIES = args.species 646 | DATABASE = args.database 647 | EMAIL = args.email 648 | OUTPUT = args.output 649 | 650 | Entrez.email = EMAIL 651 | directory = 'temp' 652 | try: 653 | os.makedirs(directory, exist_ok = True) 654 | except OSError as error: 655 | print("Directory '%s' cannot be created" % directory) 656 | 657 | # read in the text file with species names 658 | species = [] 659 | with open(SPECIES) as species_list: 660 | for spec in species_list: 661 | spec = spec.rstrip('\n') 662 | species.append(spec) 663 | print('\nfound ' + str(len(species)) + ' species of interest: ' + str(species) + '\n') 664 | 665 | # retrieve the lineage information for each species 666 | # first: uniq ID from species name 667 | # second: tax ID from uniq ID 668 | # third: taxonomic information from tax ID 669 | # fourth: format similar to database 670 | print('retrieving the taxonomic information from NCBI for ' + str(len(species)) + ' species of interest\n') 671 | uid = [] 672 | for item in species: 673 | handle = Entrez.esearch(db = 'nucleotide', term = item, retmode = 'xml', rettype = 'fasta') 674 | record = Entrez.read(handle) 675 | uid.append(record['IdList'][0]) 676 | 677 | accession_taxid = [] 678 | taxids = [] 679 | for id in uid: 680 | handle = Entrez.efetch(db = 'nuccore', id = id, retmode = 'xml', rettype = 'fasta') 681 | record = Entrez.read(handle) 682 | acc = record[0]['TSeq_accver'] 683 | taxid = record[0]['TSeq_taxid'] 684 | accession_taxid.append(str(acc) + ' ' + str(taxid)) 685 | taxids.append(str(taxid)) 686 | 687 | lineage_list = [] 688 | for taxid in taxids: 689 | lineage_search = Entrez.efetch(db = 'taxonomy', retmode = 'xml', id = taxid) 690 | lineage_record = Entrez.read(lineage_search) 691 | lineage_list.append(lineage_record) 692 | 693 | lineage_info = [] 694 | for key in lineage_list: 695 | lineage = {d['Rank']:d['ScientificName'] for d in key[0]['LineageEx'] if d['Rank'] in ['superkingdom', 'phylum', 'class', 696 | 'order', 'family', 'genus', 'species']}
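# e.g. for Salmo salar (NCBI TaxId 8030) the comprehension above yields roughly
# {'superkingdom': 'Eukaryota', 'phylum': 'Chordata', 'class': 'Actinopteri', 'order': 'Salmoniformes', 'family': 'Salmonidae', 'genus': 'Salmo'}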
697 | lineage['species'] = key[0]['ScientificName'] 698 | lineage['taxid'] = key[0]['TaxId'] 699 | lineage_info.append(lineage) 700 | df = pd.DataFrame(lineage_info) 701 | df['species'] = df['species'].str.replace(' ', '_') 702 | df['sintax'] = 'd:' + df['superkingdom'] + ',p:' + df['phylum'] + ',c:' + df['class'] + ',o:' + df['order'] + ',f:' + df['family'] + ',g:' + df['genus'] + ',s:' + df['species'] 703 | datafr = df['sintax'] 704 | species_interest = datafr.values.tolist() 705 | 706 | # extract all entries from the database that share a family status with the species of interest 707 | for record in SeqIO.parse(DATABASE, 'fasta'): 708 | family_rec = record.id.split(',')[4] 709 | genus_rec = record.id.split(',')[5] 710 | species_rec = record.id.split(',')[6] 711 | for species in species_interest: 712 | family_int = species.split(',')[4] 713 | genus_int = species.split(',')[5] 714 | species_int = species.split(',')[6] 715 | spec_int = species.split(',')[6].split(':')[1] 716 | if family_int == family_rec: 717 | with open(f'{directory}/{spec_int}_family.fasta', 'a') as f: 718 | SeqIO.write(record, f, 'fasta') 719 | if genus_int == genus_rec: 720 | with open(f'{directory}/{spec_int}_genus.fasta', 'a') as f: 721 | SeqIO.write(record, f, 'fasta') 722 | if species_int == species_rec: 723 | with open(f'{directory}/{spec_int}_species.fasta', 'a') as f: 724 | SeqIO.write(record, f, 'fasta') 725 | 726 | # extract information for data table from newly generated files 727 | newdict = {} 728 | for species in species_interest: 729 | spec_int = species.split(',')[6].split(':')[1] 730 | try: 731 | spec_number = list(SeqIO.parse(f'{directory}/{spec_int}_species.fasta', 'fasta')) 732 | spec_num = len(spec_number) 733 | except FileNotFoundError: # no database entry matched this species, so the file was never written 734 | spec_num = 0 735 | try: 736 | gen_number = list(SeqIO.parse(f'{directory}/{spec_int}_genus.fasta', 'fasta')) 737 | gen_num = len(gen_number) 738 | gen_list = [] 739 | for record in gen_number: 740 | gen = record.id.split(',')[6].split(':')[1] 741 | if gen not in gen_list: 742 | gen_list.append(gen) 743 | except FileNotFoundError: 744 | gen_num = 0 745 | gen_list = ['NA'] 746 | try: 747 | fam_number = list(SeqIO.parse(f'{directory}/{spec_int}_family.fasta', 'fasta')) 748 | fam_num = len(fam_number) 749 | fam_list = [] 750 | for record in fam_number: 751 | fam = record.id.split(',')[6].split(':')[1] 752 | if fam not in fam_list: 753 | fam_list.append(fam) 754 | except FileNotFoundError: 755 | fam_num = 0 756 | fam_list = ['NA'] 757 | newdict[spec_int] = {'species': spec_int, 'species_occur': spec_num, 'species_gen': gen_list, 'gen_entries': gen_num, 'species_fam': fam_list, 'fam_entries': fam_num} 758 | 759 | # print information on which species are present in the database 760 | for species in species_interest: 761 | spec_int = species.split(',')[6].split(':')[1] 762 | if newdict[spec_int]['species_occur'] == 0: 763 | print(str(newdict[spec_int]['species']) + ': not present in the reference database\n') 764 | else: 765 | print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['species_occur']) + ' entries in the database\n') 766 | 767 | # output data table on species of interest 768 | df = pd.DataFrame.from_dict(newdict, orient = 'index') 769 | df = df[['species', 'species_occur', 'gen_entries', 'fam_entries', 'species_gen', 'species_fam']] 770 | df.to_csv(OUTPUT, sep = '\t', index = None) 771 | 772 | # generate phylogenetic trees for every species of interest based on number of entries in genus and 
family 773 | # first: check number of entries in if statement 774 | # second: shorten the headers of the sequences in the file, so that it can be printed on the figure 775 | # third: run muscle to generate alignment 776 | # fourth: calculate distance from alignment 777 | # fifth: generate tree figure 778 | for species in species_interest: 779 | spec_int = species.split(',')[6].split(':')[1] 780 | if newdict[spec_int]['fam_entries'] > 50: 781 | print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['fam_entries']) + ' family entries too large. Generating phylogenetic tree on genus level with ' + str(newdict[spec_int]['gen_entries']) + ' entries\n') 782 | 783 | select = [] 784 | for record in SeqIO.parse(f'{directory}/{spec_int}_genus.fasta', 'fasta'): 785 | record.description = record.description.replace(';', ',') 786 | record.id = record.description 787 | record.id = record.id.split(',')[0] + ';' + record.id.split(',')[7].split(':')[1] 788 | record.description = record.id 789 | select.append(record) 790 | handle = open(f'{directory}/{spec_int}_genus_align.fasta', 'w') 791 | SeqIO.write(select, handle, 'fasta') 792 | handle.close() 793 | 794 | muscle_cline = MuscleCommandline(input = f'{directory}/{spec_int}_genus_align.fasta', 795 | out = f'{directory}/{spec_int}_genus_align.clw', 796 | diags = True, 797 | maxiters = 1, 798 | log = f'{directory}/{spec_int}_genus_align_log.txt', 799 | clw = True) 800 | muscle_cline() 801 | 802 | with open(f'{directory}/{spec_int}_genus_align.clw' , 'r') as aln: 803 | alignment = AlignIO.read(aln, 'clustal') 804 | calculator = DistanceCalculator('identity') 805 | Distance_matrix = calculator.get_distance(alignment) 806 | constructor = DistanceTreeConstructor(calculator, 'nj') 807 | 808 | tree = constructor.build_tree(alignment) 809 | fig = plt.figure(figsize = (25,15), dpi = 100) 810 | matplotlib.rc('font', size=12) 811 | matplotlib.rc('xtick', labelsize=10) 812 | matplotlib.rc('ytick', labelsize=10) 813 | axes = fig.add_subplot(1, 1, 1) 814 | Phylo.draw(tree, axes=axes, do_show = False) 815 | fig.savefig(f'{spec_int}_genus_align_tree.pdf') 816 | 817 | else: 818 | print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['fam_entries']) + ' family entries. 
Generating phylogenetic tree on family level\n') 819 | 820 | select = [] 821 | for record in SeqIO.parse(f'{directory}/{spec_int}_family.fasta', 'fasta'): 822 | record.description = record.description.replace(';', ',') 823 | record.id = record.description 824 | record.id = record.id.split(',')[0] + ';' + record.id.split(',')[7].split(':')[1] 825 | record.description = record.id 826 | select.append(record) 827 | handle = open(f'{directory}/{spec_int}_family_align.fasta', 'w') 828 | SeqIO.write(select, handle, 'fasta') 829 | handle.close() 830 | 831 | muscle_cline = MuscleCommandline(input = f'{directory}/{spec_int}_family_align.fasta', 832 | out = f'{directory}/{spec_int}_family_align.clw', 833 | diags = True, 834 | maxiters = 1, 835 | log = f'{directory}/{spec_int}_family_align_log.txt', 836 | clw = True) 837 | muscle_cline() 838 | 839 | with open(f'{directory}/{spec_int}_family_align.clw' , 'r') as aln: 840 | alignment = AlignIO.read(aln, 'clustal') 841 | calculator = DistanceCalculator('identity') 842 | Distance_matrix = calculator.get_distance(alignment) 843 | constructor = DistanceTreeConstructor(calculator, 'nj') 844 | 845 | tree = constructor.build_tree(alignment) 846 | fig = plt.figure(figsize = (25,15), dpi = 100) 847 | matplotlib.rc('font', size=12) 848 | matplotlib.rc('xtick', labelsize=10) 849 | matplotlib.rc('ytick', labelsize=10) 850 | axes = fig.add_subplot(1, 1, 1) 851 | Phylo.draw(tree, axes=axes, do_show = False) 852 | fig.savefig(f'{spec_int}_family_align_tree.pdf') 853 | 854 | 855 | ## function: argparse parser 856 | def main(): 857 | parser = argparse.ArgumentParser(description = 'creating a curated reference database') 858 | subparser = parser.add_subparsers() 859 | 860 | db_download_parser = subparser.add_parser('db_download', description = 'downloading sequence data from online databases') 861 | db_download_parser.set_defaults(func = db_download) 862 | db_download_parser.add_argument('-s', '--source', help = 'specify online database used to download sequences. Currently supported options are: (1) ncbi, (2) embl, (3) mitofish, (4) taxonomy', dest = 'source', type = str, required = True) 863 | db_download_parser.add_argument('-db', '--database', help = 'Specific NCBI or EMBL database used to download sequences. Example NCBI: nucleotide. Example EMBL: mam*', dest = 'database', type = str) 864 | db_download_parser.add_argument('-q', '--query', help = 'NCBI query search to limit portion of database to be downloaded. 
Example: "16S[All Fields] AND ("1"[SLEN] : "50000"[SLEN])"', dest = 'query', type = str) 865 | db_download_parser.add_argument('-o', '--output', help = 'output file name option for NCBI and MITOFISH databases', dest = 'output', type = str) 866 | db_download_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str) 867 | 868 | db_import_parser = subparser.add_parser('db_import', description = 'import existing or curated database') 869 | db_import_parser.set_defaults(func = db_import) 870 | db_import_parser.add_argument('-i', '--input', help = 'input database filename', dest = 'input', type = str, required = True) 871 | db_import_parser.add_argument('-s', '--seq_header', help = 'information provided in sequence header: "accession" or "species"', dest = 'header', type = str, required = True) 872 | db_import_parser.add_argument('-o', '--output', help = 'output file name option', dest = 'output', type = str, required = True) 873 | db_import_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 874 | db_import_parser.add_argument('-f', '--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str) 875 | db_import_parser.add_argument('-r', '--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str) 876 | 877 | 878 | db_merge_parser = subparser.add_parser('db_merge', description = 'merge multiple databases') 879 | db_merge_parser.set_defaults(func = db_merge) 880 | db_merge_parser.add_argument('-i', '--input', nargs = '+', help = 'list of files to be merged', dest = 'input', required = True) 881 | db_merge_parser.add_argument('-u', '--uniq', help = 'keep only unique accession numbers', dest = 'uniq', type = str, default = '') 882 | db_merge_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 883 | db_merge_parser.add_argument('-f', '--format', help = 'data format to be merged, database (db) or tax ID table (taxid)', dest = 'format', type = str, required = True) 884 | db_merge_parser.add_argument('-d', '--discard', help = 'file name for discarded duplicate accession numbers', dest = 'discard', type = str) 885 | 886 | in_silico_pcr_parser = subparser.add_parser('ispcr', description = 'curating the downloaded reference sequences with an in silico PCR') 887 | in_silico_pcr_parser.set_defaults(func = ispcr) 888 | in_silico_pcr_parser.add_argument('-f', '--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str, required = True) 889 | in_silico_pcr_parser.add_argument('-r', '--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str, required = True) 890 | in_silico_pcr_parser.add_argument('-a', '--assay', help = 'name of primer assay', dest = 'assay', type = str, required = True) 891 | in_silico_pcr_parser.add_argument('-i', '--input', help = 'input filename', dest = 'input', type = str, required = True) 892 | in_silico_pcr_parser.add_argument('-e', '--error', help = 'number of errors allowed in primer-binding site. 
Default = 4.5', dest = 'error', type = str, default = '4.5') 893 | 894 | ref_database_parser = subparser.add_parser('tax_assign', description = 'creating the reference database with taxonomic information') 895 | ref_database_parser.set_defaults(func = tax_assign) 896 | ref_database_parser.add_argument('-i', '--input', help = 'input file containing the curated fasta sequences after in silico PCR', dest = 'input', type = str, required = True) 897 | ref_database_parser.add_argument('-t', '--taxid_table', help = 'input taxid table containing the taxid for each accession', dest = 'taxid_table', type = str, required = True) 898 | ref_database_parser.add_argument('-o', '--output', help = 'curated reference database output file', dest = 'output', type = str, required = True) 899 | ref_database_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 900 | 901 | dereplication_parser = subparser.add_parser('dereplicate', description = 'dereplicating the database') 902 | dereplication_parser.set_defaults(func = dereplicate) 903 | dereplication_parser.add_argument('-i', '--input', help = 'filename of the curated reference database', dest = 'input', type = str, required = True) 904 | dereplication_parser.add_argument('-o', '--output', help = 'filename of the dereplicated curated reference database', dest = 'output', type = str, required = True) 905 | 906 | seq_cleanup_parser = subparser.add_parser('seq_cleanup', description = 'cleaning database on sequence parameters') 907 | seq_cleanup_parser.set_defaults(func = seq_cleanup) 908 | seq_cleanup_parser.add_argument('-min', '--minlen', help = 'minimum sequence length to be retained in the database. Default = 100', dest = 'minlen', type = str, default = '100') 909 | seq_cleanup_parser.add_argument('-max', '--maxlen', help = 'maximum sequence length to be retained in the database. Default = 500', dest = 'maxlen', type = str, default = '500') 910 | seq_cleanup_parser.add_argument('-n', '--maxns', help = 'maximum number of ambiguous bases allowed in the sequence. Default = 0', dest = 'maxns', type = str, default = '0') 911 | seq_cleanup_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 912 | seq_cleanup_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 913 | seq_cleanup_parser.add_argument('-d', '--discard', help = 'file name of discarded sequences', dest = 'discard', type = str, default = 'no') 914 | 915 | header_cleanup_parser = subparser.add_parser('header_cleanup', description = 'cleaning database on header info') 916 | header_cleanup_parser.set_defaults(func = header_cleanup) 917 | header_cleanup_parser.add_argument('-i', '--input', help = 'input file name', dest = 'input', type = str, required = True) 918 | header_cleanup_parser.add_argument('-o', '--output', help = 'output file name', dest = 'output', type = str, required = True) 919 | header_cleanup_parser.add_argument('-e', '--enviro', help = 'discard environmental sequences from the dataset. yes/no', dest = 'env', type = str, default = 'no') 920 | header_cleanup_parser.add_argument('-s', '--species', help = 'discard sequences for which the species name is unspecified. 
yes/no', dest = 'spec', type = str, default = 'no') 921 | header_cleanup_parser.add_argument('-n', '--nans', help = 'discard sequences with more than N unspecified taxonomic levels. Default = "nan" (filter disabled)', dest = 'nans', type = str, default = 'nan') 922 | 923 | phylo_parser = subparser.add_parser('phylo_build', description = 'generating phylogenetic trees for species of interest') 924 | phylo_parser.set_defaults(func = phylo) 925 | phylo_parser.add_argument('-s', '--species', help = 'text file containing list of species separated by newlines', dest = 'species', type = str, required = True) 926 | phylo_parser.add_argument('-db', '--database', help = 'curated reference database', dest = 'database', type = str, required = True) 927 | phylo_parser.add_argument('-e', '--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 928 | phylo_parser.add_argument('-o', '--output', help = 'filename for output table', dest = 'output', type = str, required = True) 929 | 930 | args = parser.parse_args() 931 | args.func(args) 932 | 933 | if __name__ == '__main__': 934 | main() -------------------------------------------------------------------------------- /function/older_versions/reference_database_creator_v2.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | ## import modules 4 | import argparse 5 | from Bio import Entrez 6 | import time 7 | from urllib.error import HTTPError 8 | import http.client 9 | http.client.HTTPConnection._http_vsn = 10 10 | http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0' 11 | import subprocess as sp 12 | import shutil 13 | import re 14 | import pandas as pd 15 | from tqdm import tqdm 16 | from Bio.Seq import Seq 17 | from Bio import SeqIO 18 | import os 19 | import matplotlib 20 | import matplotlib.pyplot as plt 21 | from Bio import AlignIO 22 | from Bio import Phylo 23 | from Bio.Align.Applications import MuscleCommandline 24 | from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceMatrix 25 | from Bio.Phylo.TreeConstruction import DistanceTreeConstructor 26 | 27 | 28 | ## function: download sequencing data from NCBI 29 | def ncbi_download(args): 30 | DB = args.database 31 | QUERY = args.query 32 | OUTPUT = args.output_filename 33 | EMAIL = args.email 34 | 35 | Entrez.email = EMAIL 36 | print('\nlooking up the number of sequences that match the query\n') 37 | first_handle = Entrez.esearch(db=DB, term=QUERY, rettype='fasta') 38 | first_record = Entrez.read(first_handle) 39 | first_handle.close() 40 | count = int(first_record['Count']) 41 | 42 | second_handle = Entrez.esearch(db=DB, term=QUERY, retmax=count, rettype='fasta', usehistory = 'y') 43 | second_record = Entrez.read(second_handle) 44 | second_handle.close() 45 | 46 | id_list = second_record['IdList'] 47 | count = int(second_record['Count']) 48 | assert(count == len(id_list)) 49 | webenv = second_record['WebEnv'] 50 | query_key = second_record['QueryKey'] 51 | 52 | print('found {} matching sequences'.format(second_record['Count'])) 53 | print('\nstarting the download\n') 54 | 55 | batch_size = 5000 56 | out_handle = open(OUTPUT, 'w') 57 | for start in tqdm(range(0, count, batch_size)): 58 | attempt = 1 59 | success = False 60 | while attempt <= 3 and not success: 61 | attempt += 1 62 | try: 63 | fetch_handle = Entrez.efetch(db=DB, rettype='fasta', 64 | retstart=start, retmax=batch_size, 65 | webenv=webenv, query_key=query_key) 66 | success = True 67 | except HTTPError as err: 68 | if 500 <= err.code <= 599: 69 | print(f"Received error from server {err}")
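# transient NCBI server errors (HTTP 5xx) are retried up to three times, pausing 15 s between attempts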
| print(f"Received error from server {err}") 70 | print("Attempt {attempt} of 3") 71 | time.sleep(15) 72 | else: 73 | raise 74 | data = fetch_handle.read() 75 | fetch_handle.close() 76 | out_handle.write(data) 77 | out_handle.close() 78 | 79 | 80 | ## function: in silico PCR 81 | def in_silico_pcr(args): 82 | ## user input 83 | FWD = args.fwd 84 | REV = args.rev 85 | ASSAY = args.assay 86 | INPUT = args.input 87 | 88 | ## reverse complement reverse primer sequence 89 | REV_DNA = Seq(REV) 90 | REV_CORRECT = str(REV_DNA.reverse_complement()) 91 | 92 | ## setting variable names using the info from user input 93 | TRIMMED_INIT = 'init_trimmed_' + ASSAY + '_' + INPUT 94 | UNTRIMMED_INIT = 'init_untrimmed_' + ASSAY + '_' + INPUT 95 | REVCOMP_UNTRIMMED_INIT = 'revcomp_' + UNTRIMMED_INIT 96 | TRIMMED_REVCOMP = 'revcomp_' + TRIMMED_INIT 97 | UNTRIMMED_REVCOMP = 'untrimmed_' + REVCOMP_UNTRIMMED_INIT 98 | FINAL_TRIMMED = 'final_trimmed_' + ASSAY + '_' + INPUT 99 | 100 | OVERLAP = str(min([len(FWD), len(REV_CORRECT)])) 101 | #ERROR = str(round(min([3/len(FWD), 3/len(REV_CORRECT)]), 2)) 102 | #print(ERROR) 103 | ERROR = str(4.5) 104 | ADAPTER = FWD + '...' + REV_CORRECT 105 | 106 | ## run cutadapt on downloaded fasta file 107 | count_init = len(list(SeqIO.parse(INPUT, 'fasta'))) 108 | print('\nrunning in silico PCR on fasta file containing {} sequences'.format(count_init)) 109 | #cmnd_cutadapt_1 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_INIT, INPUT, '--untrimmed-output', UNTRIMMED_INIT, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 110 | cmnd_cutadapt_1 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_INIT, INPUT, '--untrimmed-output', UNTRIMMED_INIT, '--no-indels', '-e', ERROR, '--overlap', OVERLAP] 111 | sp.call(cmnd_cutadapt_1) 112 | count_trimmed_init = len(list(SeqIO.parse(TRIMMED_INIT, 'fasta'))) 113 | print('\nfound primers in {} sequences'.format(count_trimmed_init)) 114 | 115 | ## run vsearch to reverse complement untrimmed sequences 116 | count_untrimmed_init = len(list(SeqIO.parse(UNTRIMMED_INIT, 'fasta'))) 117 | print('\nreverse complementing {} untrimmed sequences'.format(count_untrimmed_init)) 118 | cmnd_vsearch_revcomp = ['vsearch', '--fastx_revcomp', UNTRIMMED_INIT, '--fastaout', REVCOMP_UNTRIMMED_INIT, '--quiet'] 119 | sp.call(cmnd_vsearch_revcomp) 120 | 121 | ## run cutadapt on reverse complemented untrimmed sequences 122 | print('\nrunning in silico PCR on {} reverse complemented untrimmed sequences'.format(count_untrimmed_init)) 123 | cmnd_cutadapt_2 = ['cutadapt', '-g', ADAPTER, '-o', TRIMMED_REVCOMP, REVCOMP_UNTRIMMED_INIT, '--untrimmed-output', UNTRIMMED_REVCOMP, '--no-indels', '-e', ERROR, '--overlap', OVERLAP, '--quiet'] 124 | sp.call(cmnd_cutadapt_2) 125 | count_trimmed_second = len(list(SeqIO.parse(TRIMMED_REVCOMP, 'fasta'))) 126 | print('\nfound primers in {} sequences'.format(count_trimmed_second)) 127 | 128 | ## concatenate both trimmed files 129 | with open(FINAL_TRIMMED, 'wb') as wfd: 130 | for f in [TRIMMED_INIT, TRIMMED_REVCOMP]: 131 | with open(f, 'rb') as fd: 132 | shutil.copyfileobj(fd, wfd) 133 | 134 | ## remove intermediary files 135 | files = [TRIMMED_INIT, UNTRIMMED_INIT, REVCOMP_UNTRIMMED_INIT, TRIMMED_REVCOMP, UNTRIMMED_REVCOMP] 136 | 137 | for file in files: 138 | os.remove(file) 139 | 140 | 141 | ## function: creating reference database with taxonomy 142 | def ref_database(args): 143 | INPUT = args.input 144 | OUTPUT = args.output 145 | EMAIL = args.email 146 | 147 | ## retrieve accession numbers from fasta file and store in list 148 | 
Entrez.email = EMAIL 149 | accessions = [] 150 | sequence_number = [] 151 | correct_accessions = [] 152 | with open(INPUT) as myfile: 153 | for line in myfile: 154 | #pattern = re.search(r"^\>(.+?)\.", line) 155 | #print(pattern) 156 | #if pattern: 157 | # found = pattern.group(1) 158 | # accessions.append(found) 159 | if line.startswith('>'): 160 | pattern = line.lstrip('>').split('.')[0] 161 | sequence_number.append(pattern) 162 | if pattern not in accessions: 163 | accessions.append(pattern) 164 | #print(pattern) 165 | 166 | #print(len(accessions)) 167 | 168 | ## remove wrongly formatted lines (not accession numbers) 169 | mistakes = ['@', '#', '$', '%', '&', '(', ')', '!', '<', '?', '|', ',', '.', '+', '=', '`', '~'] 170 | 171 | for item in accessions: 172 | if not any(mistake in item for mistake in mistakes): 173 | correct_accessions.append(item) 174 | 175 | print('\nfound {} accessions in input file'.format(len(sequence_number))) 176 | print('\nfound {} unique accessions in input file'.format(len(accessions))) 177 | if len(accessions) - len(correct_accessions) == 0: 178 | print('\nfound no incorrect formatting in accession numbers') 179 | else: 180 | print('\nremoved {} accessions due to incorrect formatting'.format(len(accessions) - len(correct_accessions))) 181 | 182 | ## find taxids for all correct accession numbers 183 | NCBI_list = [] 184 | batch_size = 5000 185 | accession_taxid = [] 186 | taxids = [] 187 | 188 | print("\ndownloading {} taxonomic ID's from NCBI".format(len(correct_accessions))) 189 | 190 | for start in tqdm(range(0, len(correct_accessions), batch_size)): 191 | group = correct_accessions[start : start + batch_size] 192 | attempt = 1 193 | success = False 194 | while attempt <= 3 and not success: 195 | attempt += 1 196 | try: 197 | handle = Entrez.efetch(db = 'nuccore', id = ",".join(group), retmode = 'xml', rettype = 'fasta') 198 | record = Entrez.read(handle) 199 | NCBI_list.append(record) 200 | success = True 201 | except HTTPError as err: 202 | if 500 <= err.code <= 599: 203 | print(f"Received error from server {err}") 204 | print(f"Attempt {attempt} of 3") 205 | time.sleep(15) 206 | else: 207 | raise 208 | 209 | ## format data into two lists 210 | for record in NCBI_list: 211 | for i in range(len(record)): 212 | acc = record[i]['TSeq_accver'] 213 | taxid = record[i]['TSeq_taxid'] 214 | accession_taxid.append(str(acc) + ' ' + str(taxid)) 215 | taxids.append(str(taxid)) 216 | 217 | uniq_taxid = list(set(taxids)) 218 | print("\nfound {} unique taxonomic ID's".format(len(uniq_taxid))) 219 | 220 | ## retrieve taxonomic lineage for 1000 taxids at a time 221 | lineage_list = [] 222 | lineage_batch = 5000 223 | 224 | print("\ndownloading taxonomic lineage for {} taxonomic ID's".format(len(uniq_taxid))) 225 | 226 | for start in tqdm(range(0, len(uniq_taxid), lineage_batch)): 227 | lineage_group = uniq_taxid[start : start + lineage_batch] 228 | lineage_attempt = 1 229 | lineage_success = False 230 | while lineage_attempt <= 3 and not lineage_success: 231 | lineage_attempt += 1 232 | try: 233 | lineage_search = Entrez.efetch(db = 'taxonomy', retmode = 'xml', id = ','.join(lineage_group)) 234 | lineage_record = Entrez.read(lineage_search) 235 | lineage_list.append(lineage_record) 236 | lineage_success = True 237 | except HTTPError as err: 238 | if 500 <= err.code <= 599: 239 | print(f'Received error from server {err}') 240 | print(f'Attempt {lineage_attempt} of 3') 241 | time.sleep(15) 242 | else: 243 | raise 244 | 245 | ## format downloaded info to pandas dataframe 
containing needed info for taxonomic lineage 246 | lineage_info = [] 247 | 248 | for key in lineage_list: 249 | for i in range(len(key)): 250 | lineage = {d['Rank']:d['ScientificName'] for d in key[i]['LineageEx'] if d['Rank'] in ['superkingdom', 251 | 'phylum', 'class', 'order', 'family', 'genus', 'species']} 252 | lineage['species'] = key[i]['ScientificName'] 253 | lineage['taxid'] = key[i]['TaxId'] 254 | lineage_info.append(lineage) 255 | 256 | tax_list = pd.DataFrame(lineage_info) 257 | 258 | ## combine dataframe with accession list and fasta sequence file 259 | accession_and_taxid = pd.DataFrame(accession_taxid) 260 | accession_and_taxid = accession_and_taxid[0].str.split(' ', expand = True) 261 | accession_and_taxid['accession'] = accession_and_taxid[0].str.split('.').str[0] 262 | accession_and_taxid.columns = ['acc_name', 'taxid', 'accession'] 263 | 264 | sequence = pd.DataFrame(pd.read_csv(INPUT, sep = '\t', header = None).values.reshape(-1,2)) 265 | sequence['accession'] = sequence[0].str[1:].str.split('.').str[0] 266 | sequence.columns = ['name', 'sequence', 'accession'] 267 | 268 | accession_and_taxid = accession_and_taxid.astype('str') 269 | tax_list = tax_list.astype('str') 270 | sequence = sequence.astype('str') 271 | 272 | df = accession_and_taxid.merge(tax_list, how = 'left', on = 'taxid') 273 | df = df.merge(sequence, on = 'accession') 274 | 275 | ## clean up dataframe 276 | 277 | ## format the dataframe to final output 278 | df['species'] = df['species'].str.replace(' ', '_') 279 | df['sintax'] = '>' + df['accession'] + ';tax=d:' + df['superkingdom'] + ',p:' + df['phylum'] + ',c:' + df['class'] + ',o:' + df['order'] + ',f:' + df['family'] + ',g:' + df['genus'] + ',s:' + df['species'] 280 | datafr = df[['sintax', 'sequence']] 281 | datafr.to_csv(OUTPUT, index = None, header = None, sep = '\n') 282 | 283 | 284 | ## function: dereplicating the database 285 | def dereplicate(args): 286 | INPUT = args.input 287 | OUTPUT = args.output 288 | 289 | ## subfunctions to be called 290 | def fasta_to_dict_wDesc(fasta_file): 291 | seq_dict = {} 292 | for record in SeqIO.parse(fasta_file, 'fasta'): 293 | record.description = record.description.replace(' ', '_') 294 | record.id = record.description 295 | rec_id = record.id 296 | rec_desc = record.description 297 | rec_seq = str(record.seq) 298 | seq_dict.setdefault(rec_id, {})['sequence'] = rec_seq 299 | seq_dict.setdefault(rec_id, {})['description'] = rec_desc 300 | return seq_dict 301 | 302 | def derep(seqdict): 303 | rep_dict = {} 304 | derep_dict = {} 305 | for k,v in seqdict.items(): 306 | rep_dict.setdefault(v, []).append(k) 307 | for key, value in rep_dict.items(): 308 | numreads = len(value) 309 | newname = value[0] 310 | derep_dict[newname] = {'seq': key, 'size': numreads, 'readlist': value} 311 | return derep_dict 312 | 313 | def derep_to_seq(derep_dict, size = 'no'): 314 | new_dict = {} 315 | read_dict = {} 316 | for k,v in derep_dict.items(): 317 | data = v 318 | if size == 'no': 319 | base_id = k 320 | else: 321 | base_id = k + ';size='+str(data['size']) 322 | read_dict[base_id] = data['readlist'] 323 | new_dict[base_id] = data['seq'] 324 | return (new_dict, read_dict) 325 | 326 | ## split sequence file into two dictionaries and define which species need dereplication 327 | seq_file = INPUT 328 | seqs = fasta_to_dict_wDesc(seq_file) 329 | 330 | print('\nfound {} sequences in input file'.format(len(seqs))) 331 | 332 | seq_just_id = {} 333 | taxonly = {} 334 | for k,v in seqs.items(): 335 | parts = 
v['description'].split(';tax=') 336 | seq_id = parts[0] 337 | tax = parts[1] 338 | seq_just_id[seq_id] = v['sequence'] 339 | taxonly.setdefault(tax, []).append(seq_id) 340 | 341 | print('\ndatabase is comprised of {} unique taxa'.format(len(taxonly))) 342 | 343 | need_derep = [] 344 | singletons = {} 345 | for k,v in taxonly.items(): 346 | if len(v) > 1: 347 | need_derep.append(k) 348 | else: 349 | singletons[v[0]] = k 350 | 351 | print('\n{} taxa only occur once in the database'.format(len(singletons))) 352 | print('\n{} taxa occur multiple times in the database'.format(len(need_derep))) 353 | 354 | tax_index = {} 355 | for k,v in taxonly.items(): 356 | if k in need_derep: 357 | for seqid in v: 358 | tax_index[seqid] = k 359 | 360 | ## dereplicate sequences for species represented more than once in the database 361 | all_dereps = {} 362 | for d in need_derep: 363 | temp_seq_dict = {} 364 | for seqid in taxonly[d]: 365 | temp_seq_dict[seqid] = seq_just_id[seqid] 366 | dr_temp = derep(temp_seq_dict) 367 | derep_seq = derep_to_seq(dr_temp, size = 'no') 368 | derep_seq = derep_seq[0] 369 | for k,v in derep_seq.items(): 370 | new_id = k+';tax='+tax_index[k] 371 | all_dereps[new_id] = v 372 | 373 | ## combine species present only once in the database with the dereplicated dataset 374 | all_new_seqs = {} 375 | for k,v in singletons.items(): 376 | new_id = k + ';tax=' + v 377 | seq = seq_just_id[k] 378 | all_new_seqs[new_id] = seq 379 | for key, value in all_dereps.items(): 380 | all_new_seqs[key] = value 381 | 382 | print('\n{} sequences left after dereplication\n'.format(len(all_new_seqs))) 383 | 384 | ## save the dereplicated database 385 | output = OUTPUT 386 | seqout = open(output, 'w') 387 | for k,v in all_new_seqs.items(): 388 | seqout.write('>' + k + '\n' + v + '\n') 389 | seqout.close() 390 | 391 | 392 | ## function: phylogenetic tree builder 393 | def phylo(args): 394 | SPECIES = args.species 395 | DATABASE = args.database 396 | EMAIL = args.email 397 | OUTPUT = args.output 398 | 399 | Entrez.email = EMAIL 400 | directory = 'temp' 401 | try: 402 | os.makedirs(directory, exist_ok = True) 403 | except OSError as error: 404 | print("Directory '%s' cannot be created" % directory) 405 | 406 | ## read in the text file with species names 407 | species = [] 408 | with open(SPECIES) as species_list: 409 | for spec in species_list: 410 | spec = spec.rstrip('\n') 411 | species.append(spec) 412 | print('\nfound ' + str(len(species)) + ' species of interest: ' + str(species) + '\n') 413 | 414 | ## retrieve the lineage information for each species 415 | ## first: uniq ID from species name 416 | ## second: tax ID from uniq ID 417 | ## third: taxonomic information from tax ID 418 | ## fourth: format similar to database 419 | print('retrieving the taxonomic information from NCBI for ' + str(len(species)) + ' species of interest\n') 420 | uid = [] 421 | for item in species: 422 | handle = Entrez.esearch(db = 'nucleotide', term = item, retmode = 'xml', rettype = 'fasta') 423 | record = Entrez.read(handle) 424 | uid.append(record['IdList'][0]) 425 | 426 | accession_taxid = [] 427 | taxids = [] 428 | for id in uid: 429 | handle = Entrez.efetch(db = 'nuccore', id = id, retmode = 'xml', rettype = 'fasta') 430 | record = Entrez.read(handle) 431 | acc = record[0]['TSeq_accver'] 432 | taxid = record[0]['TSeq_taxid'] 433 | accession_taxid.append(str(acc) + ' ' + str(taxid)) 434 | taxids.append(str(taxid)) 435 | 436 | lineage_list = [] 437 | for taxid in taxids: 438 | lineage_search = Entrez.efetch(db = 
'taxonomy', retmode = 'xml', id = taxid) 439 | lineage_record = Entrez.read(lineage_search) 440 | lineage_list.append(lineage_record) 441 | 442 | lineage_info = [] 443 | for key in lineage_list: 444 | lineage = {d['Rank']:d['ScientificName'] for d in key[0]['LineageEx'] if d['Rank'] in ['superkingdom', 'phylum', 'class', 445 | 'order', 'family', 'genus', 'species']} 446 | lineage['species'] = key[0]['ScientificName'] 447 | lineage['taxid'] = key[0]['TaxId'] 448 | lineage_info.append(lineage) 449 | df = pd.DataFrame(lineage_info) 450 | df['species'] = df['species'].str.replace(' ', '_') 451 | df['sintax'] = 'd:' + df['superkingdom'] + ',p:' + df['phylum'] + ',c:' + df['class'] + ',o:' + df['order'] + ',f:' + df['family'] + ',g:' + df['genus'] + ',s:' + df['species'] 452 | datafr = df['sintax'] 453 | species_interest = datafr.values.tolist() 454 | 455 | ## extract all entries from the database that share a family status with the species of interest 456 | for record in SeqIO.parse(DATABASE, 'fasta'): 457 | family_rec = record.id.split(',')[4] 458 | genus_rec = record.id.split(',')[5] 459 | species_rec = record.id.split(',')[6] 460 | for species in species_interest: 461 | family_int = species.split(',')[4] 462 | genus_int = species.split(',')[5] 463 | species_int = species.split(',')[6] 464 | spec_int = species.split(',')[6].split(':')[1] 465 | if family_int == family_rec: 466 | with open(f'{directory}/{spec_int}_family.fasta', 'a') as f: 467 | SeqIO.write(record, f, 'fasta') 468 | if genus_int == genus_rec: 469 | with open(f'{directory}/{spec_int}_genus.fasta', 'a') as f: 470 | SeqIO.write(record, f, 'fasta') 471 | if species_int == species_rec: 472 | with open(f'{directory}/{spec_int}_species.fasta', 'a') as f: 473 | SeqIO.write(record, f, 'fasta') 474 | 475 | ## extract information for data table from newly generated files 476 | newdict = {} 477 | for species in species_interest: 478 | spec_int = species.split(',')[6].split(':')[1] 479 | try: 480 | spec_number = list(SeqIO.parse(f'{directory}/{spec_int}_species.fasta', 'fasta')) 481 | spec_num = len(spec_number) 482 | except: 483 | spec_num = 0 484 | try: 485 | gen_number = list(SeqIO.parse(f'{directory}/{spec_int}_genus.fasta', 'fasta')) 486 | gen_num = len(gen_number) 487 | gen_list = [] 488 | for record in gen_number: 489 | gen = record.id.split(',')[6].split(':')[1] 490 | if gen not in gen_list: 491 | gen_list.append(gen) 492 | except: 493 | gen_num = 0 494 | gen_list = ['NA'] 495 | try: 496 | fam_number = list(SeqIO.parse(f'{directory}/{spec_int}_family.fasta', 'fasta')) 497 | fam_num = len(fam_number) 498 | fam_list = [] 499 | for record in fam_number: 500 | fam = record.id.split(',')[6].split(':')[1] 501 | if fam not in fam_list: 502 | fam_list.append(fam) 503 | except: 504 | fam_num = 0 505 | fam_list = ['NA'] 506 | newdict[spec_int] = {'species': spec_int, 'species_occur': spec_num, 'species_gen': gen_list, 'gen_entries': gen_num, 'species_fam': fam_list, 'fam_entries': fam_num} 507 | 508 | ## print information on which species are present in the database 509 | for species in species_interest: 510 | spec_int = species.split(',')[6].split(':')[1] 511 | if newdict[spec_int]['species_occur'] == 0: 512 | print(str(newdict[spec_int]['species']) + ': not present in the reference database\n') 513 | else: 514 | print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['species_occur']) + ' entries in the database\n') 515 | 516 | ## output data table on species of interest 517 | df = pd.DataFrame.from_dict(newdict, orient = 
520 | 
521 |     ## generate phylogenetic trees for every species of interest based on the number of entries in genus and family
522 |     ## first: check the number of entries in the if statement
523 |     ## second: shorten the headers of the sequences in the file, so that they can be printed on the figure
524 |     ## third: run muscle to generate the alignment
525 |     ## fourth: calculate distances from the alignment
526 |     ## fifth: generate the tree figure
527 |     for species in species_interest:
528 |         spec_int = species.split(',')[6].split(':')[1]
529 |         if newdict[spec_int]['fam_entries'] > 50:
530 |             print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['fam_entries']) + ' family entries, too many to plot clearly. Generating phylogenetic tree at genus level with ' + str(newdict[spec_int]['gen_entries']) + ' entries\n')
531 | 
532 |             select = []
533 |             for record in SeqIO.parse(f'{directory}/{spec_int}_genus.fasta', 'fasta'):
534 |                 record.description = record.description.replace(';', ',')
535 |                 record.id = record.description
536 |                 record.id = record.id.split(',')[0] + ';' + record.id.split(',')[7].split(':')[1]
537 |                 record.description = record.id
538 |                 select.append(record)
539 |             handle = open(f'{directory}/{spec_int}_genus_align.fasta', 'w')
540 |             SeqIO.write(select, handle, 'fasta')
541 |             handle.close()
542 | 
543 |             muscle_cline = MuscleCommandline(input = f'{directory}/{spec_int}_genus_align.fasta',
544 |                                              out = f'{directory}/{spec_int}_genus_align.clw',
545 |                                              diags = True,
546 |                                              maxiters = 1,
547 |                                              log = f'{directory}/{spec_int}_genus_align_log.txt',
548 |                                              clw = True)
549 |             muscle_cline()
550 | 
551 |             with open(f'{directory}/{spec_int}_genus_align.clw', 'r') as aln:
552 |                 alignment = AlignIO.read(aln, 'clustal')
553 |             calculator = DistanceCalculator('identity')
554 |             distance_matrix = calculator.get_distance(alignment)  # not used further; build_tree recomputes distances via the calculator
555 |             constructor = DistanceTreeConstructor(calculator, 'nj')
556 | 
557 |             tree = constructor.build_tree(alignment)
558 |             fig = plt.figure(figsize = (25, 15), dpi = 100)
559 |             matplotlib.rc('font', size=12)
560 |             matplotlib.rc('xtick', labelsize=10)
561 |             matplotlib.rc('ytick', labelsize=10)
562 |             axes = fig.add_subplot(1, 1, 1)
563 |             Phylo.draw(tree, axes=axes, do_show = False)
564 |             fig.savefig(f'{spec_int}_genus_align_tree.pdf')  # saved in the working directory, not in temp/
565 | 
566 |         else:
567 |             print(str(newdict[spec_int]['species']) + ': ' + str(newdict[spec_int]['fam_entries']) + ' family entries. Generating phylogenetic tree at family level\n')
568 | 
569 |             select = []
570 |             for record in SeqIO.parse(f'{directory}/{spec_int}_family.fasta', 'fasta'):
571 |                 record.description = record.description.replace(';', ',')
572 |                 record.id = record.description
573 |                 record.id = record.id.split(',')[0] + ';' + record.id.split(',')[7].split(':')[1]
574 |                 record.description = record.id
575 |                 select.append(record)
576 |             handle = open(f'{directory}/{spec_int}_family_align.fasta', 'w')
577 |             SeqIO.write(select, handle, 'fasta')
578 |             handle.close()
579 | 
580 |             muscle_cline = MuscleCommandline(input = f'{directory}/{spec_int}_family_align.fasta',
581 |                                              out = f'{directory}/{spec_int}_family_align.clw',
582 |                                              diags = True,
583 |                                              maxiters = 1,
584 |                                              log = f'{directory}/{spec_int}_family_align_log.txt',
585 |                                              clw = True)
586 |             muscle_cline()
587 | 
588 |             with open(f'{directory}/{spec_int}_family_align.clw', 'r') as aln:
589 |                 alignment = AlignIO.read(aln, 'clustal')
590 |             calculator = DistanceCalculator('identity')
591 |             distance_matrix = calculator.get_distance(alignment)  # not used further; build_tree recomputes distances via the calculator
592 |             constructor = DistanceTreeConstructor(calculator, 'nj')
593 | 
594 |             tree = constructor.build_tree(alignment)
595 |             fig = plt.figure(figsize = (25, 15), dpi = 100)
596 |             matplotlib.rc('font', size=12)
597 |             matplotlib.rc('xtick', labelsize=10)
598 |             matplotlib.rc('ytick', labelsize=10)
599 |             axes = fig.add_subplot(1, 1, 1)
600 |             Phylo.draw(tree, axes=axes, do_show = False)
601 |             fig.savefig(f'{spec_int}_family_align_tree.pdf')  # saved in the working directory, not in temp/
602 | 
603 | 
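## NOTE: MuscleCommandline is Biopython's wrapper around a locally installed MUSCLE
## executable, and the diags/maxiters/clw options used above follow MUSCLE v3 syntax;
## a MUSCLE v3 binary on the PATH is therefore assumed (MUSCLE v5 introduced a
## different command line and would not accept these flags).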
604 | ## function: argparse parser
605 | def main():
606 |     parser = argparse.ArgumentParser(description = 'creating a curated reference database')
607 |     subparser = parser.add_subparsers()
608 | 
609 |     ncbi_download_parser = subparser.add_parser('ncbi_download', description = 'downloading a fasta sequence file from NCBI based on a text query')
610 |     ncbi_download_parser.set_defaults(func = ncbi_download)
611 |     ncbi_download_parser.add_argument('--database', help = 'database used to download sequences. Example: "nucleotide"', dest = 'database', type = str, required = True)
612 |     ncbi_download_parser.add_argument('--query', help = 'query search to limit the portion of the database to be downloaded. Example: "18S[All Fields] NOT "uncultured"[All Fields] AND is_nuccore[filter] AND ("1"[SLEN] : "50000"[SLEN])"', dest = 'query', type = str, required = True)
613 |     ncbi_download_parser.add_argument('--output', help = 'output filename. Example: "18S_fasta_NCBI_trial.fasta"', dest = 'output_filename', type = str, required = True)
614 |     ncbi_download_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True)
615 | 
616 |     in_silico_pcr_parser = subparser.add_parser('in_silico_pcr', description = 'curating the downloaded reference sequences with an in silico PCR')
617 |     in_silico_pcr_parser.set_defaults(func = in_silico_pcr)
618 |     in_silico_pcr_parser.add_argument('--fwd', help = "forward primer sequence in 5'-3' direction", dest = 'fwd', type = str, required = True)
619 |     in_silico_pcr_parser.add_argument('--rev', help = "reverse primer sequence in 5'-3' direction", dest = 'rev', type = str, required = True)
620 |     in_silico_pcr_parser.add_argument('--assay', help = 'name of the primer assay', dest = 'assay', type = str, required = True)
621 |     in_silico_pcr_parser.add_argument('--input', help = 'input filename', dest = 'input', type = str, required = True)
622 | 
623 |     ref_database_parser = subparser.add_parser('ref_database', description = 'creating the reference database with taxonomic information')
624 |     ref_database_parser.set_defaults(func = ref_database)
625 |     ref_database_parser.add_argument('--input', help = 'input file containing the curated fasta sequences after in silico PCR', dest = 'input', type = str, required = True)
626 |     ref_database_parser.add_argument('--output', help = 'curated reference database output file', dest = 'output', type = str, required = True)
627 |     ref_database_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True)
628 | 
629 |     dereplication_parser = subparser.add_parser('dereplicate', description = 'dereplicating the database')
630 |     dereplication_parser.set_defaults(func = dereplicate)
631 |     dereplication_parser.add_argument('--input', help = 'filename of the curated reference database', dest = 'input', type = str, required = True)
632 |     dereplication_parser.add_argument('--output', help = 'filename of the dereplicated curated reference database', dest = 'output', type = str, required = True)
633 | 
634 |     phylo_parser = subparser.add_parser('phylo_build', description = 'generating phylogenetic trees for species of interest')
635 |     phylo_parser.set_defaults(func = phylo)
636 |     phylo_parser.add_argument('--species', help = 'text file containing a list of species, one per line', dest = 'species', type = str, required = True)
637 |     phylo_parser.add_argument('--database', help = 'curated reference database', dest = 'database', type = str, required = True)
638 |     phylo_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True)
639 |     phylo_parser.add_argument('--output', help = 'filename for the output table', dest = 'output', type = str, required = True)
640 | 
641 |     args = parser.parse_args()
642 |     args.func(args)
643 | 
644 | if __name__ == '__main__':
645 |     main()
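## Hypothetical example invocations of the subcommands defined above (the script
## filename is assumed for illustration; ncbi_download, in_silico_pcr, ref_database,
## dereplicate, and phylo_build are all called the same way):
##   python reference_database_creator.py dereplicate --input curated_db.fasta --output curated_db_derep.fasta
##   python reference_database_creator.py phylo_build --species species.txt --database curated_db_derep.fasta --email user@example.org --output species_table.tsv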
Example: "18S_fasta_NCBI_trial.fasta"', dest = 'output_filename', type = str, required = True) 614 | ncbi_download_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 615 | 616 | in_silico_pcr_parser = subparser.add_parser('in_silico_pcr', description = 'curating the downloaded reference sequences with an in silico PCR') 617 | in_silico_pcr_parser.set_defaults(func = in_silico_pcr) 618 | in_silico_pcr_parser.add_argument('--fwd', help = 'forward primer sequence in 5-3 direction', dest = 'fwd', type = str, required = True) 619 | in_silico_pcr_parser.add_argument('--rev', help = 'reverse primer sequence in 5-3 direction', dest = 'rev', type = str, required = True) 620 | in_silico_pcr_parser.add_argument('--assay', help = 'name of primer assay', dest = 'assay', type = str, required = True) 621 | in_silico_pcr_parser.add_argument('--input', help = 'input filename', dest = 'input', type = str, required = True) 622 | 623 | ref_database_parser = subparser.add_parser('ref_database', description = 'creating the reference database with taxonomic information') 624 | ref_database_parser.set_defaults(func = ref_database) 625 | ref_database_parser.add_argument('--input', help = 'input file containing the curated fasta sequences after in silico PCR', dest = 'input', type = str, required = True) 626 | ref_database_parser.add_argument('--output', help = 'curated reference database output file', dest = 'output', type = str, required = True) 627 | ref_database_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 628 | 629 | dereplication_parser = subparser.add_parser('dereplicate', description = 'dereplicating the database') 630 | dereplication_parser.set_defaults(func = dereplicate) 631 | dereplication_parser.add_argument('--input', help = 'filename of the curated reference database', dest = 'input', type = str, required = True) 632 | dereplication_parser.add_argument('--output', help = 'filename of the dereplicated curated reference database', dest = 'output', type = str, required = True) 633 | 634 | phylo_parser = subparser.add_parser('phylo_build', description = 'generating phylogenetic trees for species of interest') 635 | phylo_parser.set_defaults(func = phylo) 636 | phylo_parser.add_argument('--species', help = 'text file containing list of species separated by newlines', dest = 'species', type = str, required = True) 637 | phylo_parser.add_argument('--database', help = 'curated reference database', dest = 'database', type = str, required = True) 638 | phylo_parser.add_argument('--email', help = 'email address to connect to NCBI servers', dest = 'email', type = str, required = True) 639 | phylo_parser.add_argument('--output', help = 'filename for output table', dest = 'output', type = str, required = True) 640 | 641 | args = parser.parse_args() 642 | args.func(args) 643 | 644 | if __name__ == '__main__': 645 | main() 646 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | setup(name='crabs', 3 | description='CRABS: Creating Reference databases for Amplicon-Based Sequencing', 4 | author='Gert-Jan Jeunen', 5 | author_email='gjeunen@gmail.com', 6 | url='https://github.com/gjeunen/reference_database_creator', 7 | version='1.7.7', 8 | packages=['function'], 9 | scripts=['crabs'] 10 | ) 11 | 
--------------------------------------------------------------------------------