├── .github └── workflows │ ├── test-conda-installs.yml │ └── test-full-run.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── GToTree ├── gtt-align-and-trim-parallel.sh ├── gtt-amino-acid-parallel.sh ├── gtt-amino-acid-serial.sh ├── gtt-append-fasta-headers ├── gtt-cat-alignments ├── gtt-check-or-setup-GTDB-files ├── gtt-check-wanted-lineage-info ├── gtt-clean-after-test.sh ├── gtt-combine-kofamscan-results.sh ├── gtt-count-bases-per-seq ├── gtt-data-locations ├── gtt-fasta-parallel-nt.sh ├── gtt-fasta-parallel.sh ├── gtt-fasta-serial-nt.sh ├── gtt-fasta-serial.sh ├── gtt-filter-parallel.sh ├── gtt-filter-seqs-by-length ├── gtt-gen-KO-iToL-files.sh ├── gtt-gen-SCG-HMMs ├── gtt-gen-itol-map ├── gtt-gen-pfam-iToL-files.sh ├── gtt-genbank-parallel.sh ├── gtt-genbank-serial.sh ├── gtt-genbank-to-AA-seqs ├── gtt-genbank-to-fasta ├── gtt-get-accessions-from-GTDB ├── gtt-get-additional-pfam-targets.sh ├── gtt-get-kofamscan-data ├── gtt-get-median.sh ├── gtt-get-ncbi-assembly-tables ├── gtt-get-ncbi-tax-data ├── gtt-hmms ├── gtt-ncbi-parallel-nt.sh ├── gtt-ncbi-parallel.sh ├── gtt-ncbi-serial-nt.sh ├── gtt-ncbi-serial.sh ├── gtt-parse-assembly-summary-file ├── gtt-parse-fasta-by-headers ├── gtt-parse-gtdb-assembly-summary-file ├── gtt-parse-kofamscan-targets.sh ├── gtt-pfam-search ├── gtt-remove-all-gap-seqs-from-alignment ├── gtt-rename-fasta-headers ├── gtt-reorder-fasta ├── gtt-run-additional-pfam-search.sh ├── gtt-run-kofamscan.sh ├── gtt-store-SCG-HMMs ├── gtt-subset-GTDB-accessions ├── gtt-swap-ids ├── gtt-test.sh └── gtt-update-ncbi-taxonomy └── hmm_sets └── hmm-sources-and-info.tsv /.github/workflows/test-conda-installs.yml: -------------------------------------------------------------------------------- 1 | name: testing conda installs 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | install-tests: 7 | name: ${{ matrix.os }} conda install test 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: ["ubuntu-latest", "macos-latest"] 12 | 13 | steps: 14 | 15 | - uses: conda-incubator/setup-miniconda@v3 16 | with: 17 | python-version: "3.9" 18 | channels: astrobiomike,conda-forge,bioconda 19 | channel-priority: true 20 | 21 | - name: Install GToTree 22 | shell: bash -el {0} 23 | run: | 24 | if [[ "${RUNNER_OS}" == "macOS" ]]; then 25 | conda create --platform osx-64 -n gtotree -y gtotree 26 | else 27 | conda create -n gtotree -y gtotree 28 | fi 29 | 30 | - name: Check GToTree 31 | shell: bash -l {0} 32 | run: | 33 | conda activate gtotree 34 | GToTree -h 35 | -------------------------------------------------------------------------------- /.github/workflows/test-full-run.yml: -------------------------------------------------------------------------------- 1 | name: testing full run 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | install-tests: 7 | name: ${{ matrix.os }} conda install test 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: ["ubuntu-latest", "macos-latest"] 12 | 13 | steps: 14 | 15 | - uses: conda-incubator/setup-miniconda@v3 16 | with: 17 | python-version: "3.9" 18 | channels: astrobiomike,conda-forge,bioconda 19 | channel-priority: true 20 | 21 | - name: Install GToTree 22 | shell: bash -el {0} 23 | run: | 24 | if [[ "${RUNNER_OS}" == "macOS" ]]; then 25 | conda create --platform osx-64 -n gtotree -y gtotree 26 | else 27 | conda create -n gtotree -y gtotree 28 | fi 29 | 30 | - name: Run GToTree 31 | shell: bash -l {0} 32 | run: | 33 | conda activate gtotree 34 | gtt-test.sh 35 | 
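Both workflows above are `workflow_dispatch`-only, so they never run automatically on pushes or pull requests. As a rough sketch of triggering them manually (assuming the GitHub CLI, `gh`, is installed and authenticated; only the workflow file names come from the repo, the rest is illustrative):

    gh workflow run test-conda-installs.yml          # start the conda-install matrix (ubuntu + macos)
    gh workflow run test-full-run.yml --ref main     # start the full-run test on a chosen ref
    gh run list --workflow test-full-run.yml         # check on queued/running/finished runs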
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
 1 | .DS_Store
 2 | notes.txt
 3 | GToTree.egg-info/
 4 | build/
 5 | gtotree/
 6 | 
-------------------------------------------------------------------------------- /CHANGELOG.md: --------------------------------------------------------------------------------
 1 | # Change Log
 2 | 
 3 | ## v1.8.14 (21-Apr-2025)
 4 | 
 5 | ### Changed
 6 | - change to taxonkit call when adding NCBI tax info (now using `reformat2` and a pattern) in order to deal with NCBI tax-structure update
 7 | 
 8 | 
 9 | ## v1.8.13 (18-Mar-2025)
 10 | 
 11 | ### Changed
 12 | - changed `gtt-gen-SCG-HMMs` to only use Pfam 37.0 for now (as later versions don't have one of the required files currently; see https://github.com/AstrobioMike/GToTree/issues/104)
 13 | 
 14 | ---
 15 | 
 16 | ## v1.8.12 (11-Mar-2025)
 17 | 
 18 | ### Changed
 19 | - changed GTDB download links from https://data.gtdb.ecogenomic.org/releases/ to https://data.ace.uq.edu.au/public/gtdb/data/releases/ due to the former becoming prohibitively slow recently
 20 | 
 21 | ---
 22 | 
 23 | ## v1.8.11 (10-Mar-2025)
 24 | 
 25 | ### Added
 26 | - VeryFastTree is now an available treeing program (`-T`)
 27 | 
 28 | ### Changed
 29 | - when using `gtt-get-accessions-from-GTDB`, if the requested taxon has spaces in it (e.g., `gtt-get-accessions-from-GTDB -t "Bacillus_A anthracis"`), the output files will have spaces replaced with dashes now
 30 |     - e.g., one of the outputs will now be "GTDB-Bacillus_A-anthracis-species-accs.txt" instead of "GTDB-Bacillus_A anthracis-species-accs.txt"
 31 | 
 32 | ---
 33 | 
 34 | ## v1.8.10 (3-Feb-2025)
 35 | 
 36 | ### Added
 37 | - saving ncbi downloaded files is possible when debug flag (`-d`) is set as requested in https://github.com/AstrobioMike/GToTree/issues/95, implemented in https://github.com/AstrobioMike/GToTree/pull/102
 38 |     - with the debug flag set while running, it will keep specific files in `//ncbi-downloads/`:
 39 |         - if amino-acid seqs are used, it will keep the downloaded amino-acid seqs
 40 |         - if there were no amino-acid seqs, and the genome had to be downloaded, it will keep the downloaded genome and the prodigal-called amino-acid seqs
 41 |         - if using nucleotide mode (`-z`), it will keep the downloaded genome and the prodigal-called nt cds and amino-acid seqs
 42 | 
 43 | ---
 44 | 
 45 | ## v1.8.9 (31-Jan-2025)
 46 | 
 47 | ### Fixed
 48 | - added logic to catch, exit, and report when muscle doesn't successfully produce an alignment for a single-copy gene-set (thanks to https://github.com/AstrobioMike/GToTree/issues/101)
 49 | 
 50 | ---
 51 | 
 52 | ## v1.8.8 (7-Oct-2024)
 53 | 
 54 | ### Changed
 55 | - updated the call to FastTree and FastTreeMP to include -nt and -gtr when GToTree is run in nucleotide mode (-z)
 56 | 
 57 | ### Fixed
 58 | - properly saving additional pfam target HMMs when that functionality is used
 59 | 
 60 | ---
 61 | 
 62 | ## v1.8.7 (29-Sep-2024)
 63 | 
 64 | ### Added
 65 | - `gtt-gen-SCG-HMMs` now reports which version of PFAM was used (prints it out to the terminal and writes it to a file)
 66 | 
 67 | ### Changed
 68 | - improvements to the "Universal" Hug et al. gene set, thanks so much to @molly-kholodova for digging in and reaching out!
69 | - PF00181 ("Ribosomal_L2") was changed to PF03947 ("Ribosomal_L2_C") 70 | - the C-terminal (which PF03947 covers) is better conserved 71 | - PF00827 ("Ribosomal_L15") was changed to PF00828 ("Ribosomal_L27A") 72 | - PF00827 was archaea/euk only, PF00828 holds the bac/arc L15 also 73 | - PF17135 ("Ribosomal_L18") was changed to PF00861 ("Ribosomal_L18p") 74 | - the PF00861 model is better distributed 75 | 76 | --- 77 | 78 | ## v1.8.6 (8-May-2024) 79 | 80 | ### Fixed 81 | - fixed when taxonomy information wasn't being added to labels when running in nucleotide mode (`-z`; https://github.com/AstrobioMike/GToTree/issues/91) 82 | 83 | --- 84 | 85 | ## v1.8.5 (1-May-2024) 86 | 87 | ### Changed 88 | - update to `gtt-gen-SCG-HMMs` to deal with ncbi assembly summary files having a column name of "#assembly_accession" instead of what was once "# assembly_accession" 89 | 90 | --- 91 | 92 | ## v1.8.4 (28-Nov-2023) 93 | 94 | ### Fixed 95 | - fixed an issue that prevented moving forward when there were more than 12,500 input genomes (https://github.com/AstrobioMike/GToTree/issues/83) 96 | 97 | --- 98 | 99 | ## v1.8.3 (14-Oct-2023) 100 | 101 | ### Changed 102 | - updated links to GTDB files as they switched from .tar.gz extensions to .tsv.gz extensions in latest release, thanks to note from @jmtsuji (https://github.com/AstrobioMike/GToTree/issues/81) 103 | 104 | --- 105 | 106 | ## v1.8.2 (26-Jul-2023) 107 | 108 | ### Added 109 | - added http option to gtt-test.sh (`gtt-test.sh http`) thanks to https://github.com/AstrobioMike/GToTree/issues/78 (https://github.com/AstrobioMike/GToTree/commit/9eb248ad5a54563370978d3575727eb63ad93483) 110 | 111 | ### Fixed 112 | - updated `gtt-get-ncbi-tax-data` to appropriately pull from http instead of ftp also thanks to https://github.com/AstrobioMike/GToTree/issues/78 113 | - fix to check for ncbi assemblies "date-retrieved.txt" file, as also caught and fixed by @hyphaltip (https://github.com/AstrobioMike/GToTree/pull/80) 🙏 114 | 115 | --- 116 | 117 | Earlier version changes are tracked on the [releases page](https://github.com/AstrobioMike/GToTree/releases). 118 | 119 | --- 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
 5-16 | [badge links from the original README (Citations, Conda installs, DOI, Twitter Follow); the anchor/image markup was not preserved in this dump]
 17 | 
 18 | ---
 19 | 
 20 | # GToTree: a user-friendly workflow for phylogenomics
 21 | [GToTree](https://github.com/AstrobioMike/GToTree/wiki) is a user-friendly workflow for phylogenomics intended to give more researchers the capability to easily create phylogenomic trees. Documentation and examples can be found [at the wiki here](https://github.com/AstrobioMike/GToTree/wiki), and the open-access Bioinformatics Journal publication is available [here](https://doi.org/10.1093/bioinformatics/btz188). GToTree can be installed and run on a Mac or Linux machine, as well as on Windows within a Windows Subsystem for Linux environment 👍
 22 | 
 23 | ---
 24 | **A quick [conda installation](https://github.com/AstrobioMike/GToTree/wiki/installation#conda-quickstart) can be run like so:**
 25 | 
 26 | ```
 27 | conda create -y -n gtotree -c astrobiomike -c conda-forge -c bioconda gtotree
 28 | ```
 29 | 
 30 | 
 31 | ---
 32 | 
 33 | GToTree is a more structured implementation of a workflow I would put together every time I wanted to make a large-scale phylogenomic tree. What do I mean by large-scale? Anything from a full-blown Tree of Life with all 3 domains, down to, for example, all available genomes of *Staphylococcus* alongside new isolate genomes. At its heart it just takes in genomes and outputs an alignment and phylogenomic tree based on the specified HMM profiles. But I think its value comes from three main things: 1) its flexibility with regard to input format - taking fasta files, GenBank files, and/or NCBI accessions (So if you just recovered a bunch of new genomes and you want to see where they fit in with references, you can provide references by accession and your new genomes as fasta files.); 2) its automation of required between-tool tasks such as filtering hits by gene-length, filtering out genomes with too few hits to the target genes, and swapping genome labels for something more useful; and 3) its scalability – GToTree can turn ~1,700 input genomes into a tree in ~60 minutes on a standard laptop.
 34 | 
 35 | Also included are several newly generated single-copy gene-sets for 13 different taxonomic groupings. These are presented in the [wiki](https://github.com/AstrobioMike/GToTree/wiki/SCG-sets), along with an explanation and example code/steps used in the generation of them.
 36 | 
 37 | GToTree utilizes helper scripts written in python, but is primarily implemented in bash. Every attempt is being made to make it portable across all variations of GNU/Unix, including on Macs, so if you run into any issues, it'd be appreciated if you could [report them](https://github.com/AstrobioMike/GToTree/issues) so the problems can be found and fixed!
 38 | 
 39 | 
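As a concrete sketch of point 1) above, a run mixing NCBI reference accessions with local fasta files can look roughly like the following, with `-a` pointing to a single-column file of NCBI assembly accessions, `-f` to a single-column file of paths to the new genome fastas, `-H` selecting the prepackaged bacterial single-copy gene-set, `-j` setting the number of parallel jobs, and `-o` naming the output directory (the file names here are made up, and the flags are patterned after the project's example usage, so confirm them against `GToTree -h` for the installed version):

```
GToTree -a ref-genome-accessions.txt \
        -f our-genome-fasta-paths.txt \
        -H Bacteria \
        -j 4 \
        -o our-phylogenomics-output
```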

 40-41 | [GToTree workflow-overview figures from the original README; image markup not preserved in this dump]

42 | 43 | See the ["What is GToTree?" wiki page](https://github.com/AstrobioMike/GToTree/wiki/what-is-gtotree%3F) for some more detail on the processing steps pictured above. For practical ways GToTree can be helpful, check out the [Example usage page](https://github.com/AstrobioMike/GToTree/wiki/example-usage). And for detailed information on using GToTree, see the [User guide](https://github.com/AstrobioMike/GToTree/wiki/user-guide). 44 | 45 | --- 46 | 47 | **A quick [conda installation](https://github.com/AstrobioMike/GToTree/wiki/installation#conda-quickstart) can be run like so:** 48 | 49 | ``` 50 | conda create -y -n gtotree -c astrobiomike -c conda-forge -c bioconda gtotree 51 | ``` 52 | 53 | 54 | --- 55 | 56 | ## Citation information 57 | 58 | GToTree will print out a `citations.txt` file with citation information specific for every run that accounts for all programs it relies upon. Please be sure to cite the developers appropriately :) 59 | 60 | Here is an example output `citations.txt` file from a run, and how I'd cite it in the methods: 61 | 62 | ``` 63 | GToTree v1.6.31 64 | Lee MD. GToTree: a user-friendly workflow for phylogenomics. Bioinformatics. 2019; (March):1-3. doi:10.1093/bioinformatics/btz188 65 | 66 | Prodigal v2.6.3 67 | Hyatt, D. et al. Gene and translation initiation site prediction in metagenomic sequences. Bioinformatics. 2010; 28, 2223–2230. doi.org/10.1186/1471-2105-11-119 68 | 69 | HMMER3 v3.3.2 70 | Eddy SR. Accelerated profile HMM searches. PLoS Comput. Biol. 2011; (7)10. doi:10.1371/journal.pcbi.1002195 71 | 72 | Muscle v5.1 73 | Edgar RC. MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping. bioRxiv. 2021. doi.org/10.1101/2021.06.20.449169 74 | 75 | TrimAl v1.4.rev15 76 | Gutierrez SC. et al. TrimAl: a Tool for automatic alignment trimming. Bioinformatics. 2009; 25, 1972–1973. doi:10.1093/bioinformatics/btp348 77 | 78 | TaxonKit v0.9.0 79 | Shen W and Ren H. TaxonKit: a practical and efficient NCBI Taxonomy toolkit. Journal of Genetics and Genomics. 2021. doi.org/10.1016/j.jgg.2021.03.006 80 | 81 | FastTree 2 v2.1.11 82 | Price MN et al. FastTree 2 - approximately maximum-likelihood trees for large alignments. PLoS One. 2010; 5. doi:10.1371/journal.pone.0009490 83 | ``` 84 | 85 | **Example methods text based on above citation output (be sure to modify as appropriate for your run)** 86 | > *The archaeal phylogenomic tree was produced with GToTree v1.6.31 (Lee 2019), using the prepackaged single-copy gene-set for archaea (76 target genes). Briefly, prodigal v2.6.3 (Hyatt et al. 2010) was used to predict genes on input genomes provided as fasta files. Target genes were identified with HMMER3 v3.2.2 (Eddy 2011), individually aligned with muscle v5.1 (Edgar 2021), trimmed with trimal v1.4.rev15 (Capella-Gutiérrez et al. 2009), and concatenated prior to phylogenetic estimation with FastTree2 v2.1.11 (Price et al. 2010). 
TaxonKit (Shen and Ren 2021) was used to connect full lineages to taxonomic IDs.* 87 | -------------------------------------------------------------------------------- /bin/gtt-align-and-trim-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | faster_alignment=$3 11 | num_muscle_threads=$4 12 | target_gene_suffix=$5 13 | 14 | # removing those genomes that need to be removed based on not having enough hits to the target genes 15 | gtt-parse-fasta-by-headers -i ${tmp_dir}/${1}_hits_filtered.tmp -w ${tmp_dir}/sorted_genomes_to_remove.tmp -o ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} --inverse 16 | 17 | # aligning 18 | if [ $faster_alignment == 'true' ]; then 19 | muscle -super5 ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} > ${tmp_dir}/${1}-muscle.log 2>&1 20 | else 21 | muscle -align ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} > ${tmp_dir}/${1}-muscle.log 2>&1 22 | fi 23 | 24 | # checking if alignment was successful (really this is a sloppy way of checking, but it's better than nothing and the muscle log file will be available) 25 | if [ ! -s ${tmp_dir}/${1}_aligned.tmp ]; then 26 | printf "${1}\n" >> ${tmp_dir}/kill_align_and_trim_parallel.problem 27 | exit 28 | fi 29 | 30 | # trimming 31 | trimal -in ${tmp_dir}/${1}_aligned.tmp -out ${tmp_dir}/${1}_trimmed${target_gene_suffix}.tmp -automated1 32 | 33 | # removing linewraps: 34 | sed 's/ .*$//' ${tmp_dir}/${1}_trimmed${target_gene_suffix}.tmp | awk '!/^>/ { printf "%s", $0; n="\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' > ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp 35 | 36 | ## adding gap-sequences for genomes missing the current gene ## 37 | # finding here which ones have it 38 | grep ">" ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp | tr -d ">" | sort > ${tmp_dir}/${1}_genomes_with_gene.tmp 39 | 40 | # now getting which ones don't have it 41 | comm -23 ${tmp_dir}/final_genomes_from_all_sources.tmp ${tmp_dir}/${1}_genomes_with_gene.tmp | sort > ${tmp_dir}/${1}_needed_gappers.tmp 42 | 43 | # creating gap-sequences if needed 44 | if [ -s ${tmp_dir}/${1}_needed_gappers.tmp ]; then 45 | 46 | # making a headers file for when making fasta in a few steps: 47 | sed 's/^/>/' ${tmp_dir}/${1}_needed_gappers.tmp > ${tmp_dir}/${1}_needed_headers.tmp 48 | 49 | # getting length of the alignment for the current gene: 50 | aln_length_tmp=$(sed -n '2p' ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp | wc -c | tr -s " " | cut -f2 -d " ") 51 | # subtracting 1 for newline characters 52 | aln_length_tmp=$(echo "$aln_length_tmp"-1 | bc) 53 | # making a string of gaps the length of the alignment for those missing it: 54 | gap_seq=$(printf "%0.s-" $(seq 1 1 $aln_length_tmp)) 55 | # making as many gap sequences as there are genomes missing the current gene: 56 | num_genomes_to_add=$(wc -l ${tmp_dir}/${1}_needed_gappers.tmp | tr -s " " "\t" | cut -f2) 57 | for i in $(cat ${tmp_dir}/${1}_needed_gappers.tmp) 58 | do 59 | echo "$gap_seq" 60 | done > ${tmp_dir}/${1}_gaps.tmp 61 | 62 | # making fasta of those genomes missing the current gene: 63 | paste -d "\n" ${tmp_dir}/${1}_needed_headers.tmp ${tmp_dir}/${1}_gaps.tmp > ${tmp_dir}/${1}_missing_genomes${target_gene_suffix}.tmp 64 | # 
catting the genomes missing the current gene together with those that have it 65 | cat ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp ${tmp_dir}/${1}_missing_genomes${target_gene_suffix}.tmp > ${tmp_dir}/${1}${target_gene_suffix}.tmp 66 | else 67 | mv ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp ${tmp_dir}/${1}${target_gene_suffix}.tmp 68 | fi 69 | 70 | ## reordering the final fasta of this gene so that all gene sets can be pasted together at end ## 71 | gtt-reorder-fasta -i ${tmp_dir}/${1}${target_gene_suffix}.tmp -w ${tmp_dir}/final_genomes_from_all_sources.tmp -o ${tmp_dir}/${1}_all_aligned${target_gene_suffix} 72 | 73 | printf "\n\n\n -------------------------------------------------------------------------- \n" 74 | printf "\t Finished aligning and formatting gene-set ${GREEN}$1${NC}.\n" 75 | printf " -------------------------------------------------------------------------- \n" 76 | -------------------------------------------------------------------------------- /bin/gtt-amino-acid-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | num_cpus=$4 12 | hmm_target_genes_total=$5 13 | output_dir=$6 14 | best_hit_mode=$7 15 | additional_pfam_targets=$8 16 | ko_targets=$9 17 | target_KOs=${10} 18 | 19 | ### kill backstop 20 | # if there is a problem, all child processes launched (by this script) will exit immediately, 21 | # upon returning to main script, will check and terminate parent process 22 | if [ -s ${tmp_dir}/kill_amino_acid_parallel.problem ]; then 23 | exit 24 | fi 25 | 26 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 27 | if $(file $1 | grep -q "gzip"); then 28 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 29 | file_location=${1%.*} 30 | gunzip -f -c $1 > $file_location 31 | assembly="$(basename ${file_location%.*})" 32 | else 33 | file_location=$1 34 | assembly="$(basename ${1%.*})" 35 | was_gzipped=FALSE 36 | fi 37 | 38 | printf " -------------------------------------------------------------------------- \n\n" 39 | printf " Genome: ${GREEN}$assembly${NC}\n" 40 | 41 | # adding assembly to ongoing genomes list 42 | echo $assembly >> ${tmp_dir}/amino_acid_genomes_list.tmp 43 | 44 | num=$((num+1)) # to track progress 45 | 46 | 47 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 48 | gtt-filter-seqs-by-length -q -i ${file_location} -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes1.tmp 49 | 50 | ## renaming seqs to have assembly name (also to ensure simple headers) 51 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 52 | 53 | 54 | ## removing gunzipped genome file if it was gunzipped 55 | if [ $was_gzipped == "TRUE" ]; then 56 | rm -rf $file_location 57 | fi 58 | 59 | 60 | ## exiting here and reporting current input file if something is wrong with it and didn't get coding sequences 61 | if [ ! 
-s ${tmp_dir}/${assembly}_genes.tmp ]; then 62 | printf "$assembly" >> ${tmp_dir}/kill_amino_acid_parallel.problem 63 | exit 64 | fi 65 | 66 | 67 | ### running hmm search ### 68 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 69 | 70 | ### calculating % completion and redundancy ### 71 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 72 | do 73 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 74 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 75 | 76 | ## making list here of only those present in exactly 1 copy 77 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 78 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 79 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 80 | 81 | ## adding SCG-hit counts to table 82 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 83 | 84 | # total number of unique SCG hits 85 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 86 | 87 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 88 | 89 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 90 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 91 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 92 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 93 | 94 | ### want to put an explicit notice out if estimated redundancy is greater than 10% 95 | # needs to be an integer for bash comparison, so multiplying by 100 first 96 | 97 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 98 | 99 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 100 | 101 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 102 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 103 | 104 | 105 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 106 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 107 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 108 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 109 | printf " You may want to consider taking a closer look and/or removing it from the\n" 110 | printf " from the input genomes.\n\n" 111 | 112 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 113 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 114 | 115 | # writing to table of genomes with questionable redundancy estimates 116 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 117 | 118 | else 119 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 120 | fi 121 | 122 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 123 | taxid="NA" 124 | 125 | ## writing summary info to table ## 126 | printf "$assembly\t$1\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Amino_acid_genomes_summary_info.tsv 127 | 128 | ### Pulling out hits for this genome ### 129 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 130 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 131 | 132 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 133 | if [ $best_hit_mode == "false" ]; then 134 | 135 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 136 | do 137 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 138 | done 139 | 140 | # if best-hit mode is on, taking best hit 141 | else 142 | 143 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 144 | do 145 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 146 | done 147 | 148 | fi 149 | 150 | ## searching for additional targets if provided 151 | # getting count of genes if there are additional targets 152 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 153 | 154 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 155 | 156 | fi 157 | 158 | ## KOs 159 | if [ $ko_targets == "true" ]; then 160 | 161 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 162 | 163 | fi 164 | 165 | ## Pfams 166 | if [ $additional_pfam_targets == "true" ]; then 167 | 168 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 169 | 170 | fi 171 | 172 | rm -rf ${tmp_dir}/${assembly}_genes1.tmp ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 173 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 174 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 175 | -------------------------------------------------------------------------------- /bin/gtt-amino-acid-serial.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | amino_acid_genomes_total=$4 12 | num_cpus=$5 13 | hmm_target_genes_total=$6 14 | output_dir=$7 15 | best_hit_mode=$8 16 | additional_pfam_targets=${9} 17 | ko_targets=${10} 18 | target_KOs=${11} 19 | 20 | # looping through the lines of the provided [-f] file (this loop operates on one genome at a time) 21 | while IFS=$'\t' read -r -a file 22 | do 23 | 24 | ### kill backstop 25 | # if there is a problem on any iteration, exiting this 
subprocess and then exiting main script with report of problem assembly 26 | if [ -s ${tmp_dir}/kill_amino_acid_serial.problem ]; then 27 | exit 28 | fi 29 | 30 | 31 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 32 | if $(file $file | grep -q "gzip"); then 33 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 34 | file_location=${file%.*} 35 | gunzip -f -c $file > $file_location 36 | assembly="$(basename ${file_location%.*})" 37 | else 38 | file_location=$file 39 | assembly="$(basename ${file%.*})" 40 | was_gzipped=FALSE 41 | fi 42 | 43 | # adding assembly to ongoing genomes list 44 | echo $assembly >> ${tmp_dir}/amino_acid_genomes_list.tmp 45 | 46 | num=$((num+1)) # to track progress 47 | 48 | printf " -------------------------------------------------------------------------- \n" 49 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $amino_acid_genomes_total total.\n" 50 | printf " -------------------------------------------------------------------------- \n\n" 51 | 52 | 53 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 54 | gtt-filter-seqs-by-length -q -i ${file_location} -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes1.tmp 55 | 56 | ## renaming seqs to have assembly name (also to ensure simple headers) 57 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 58 | 59 | ## removing gunzipped genome file if it was gunzipped 60 | if [ $was_gzipped == "TRUE" ]; then 61 | rm -rf $file_location 62 | fi 63 | 64 | ## exiting here and reporting current input file if something is wrong with it and didn't get coding sequences 65 | if [ ! 
-s ${tmp_dir}/${assembly}_genes.tmp ]; then 66 | printf "$assembly" >> ${tmp_dir}/kill_amino_acid_serial.problem 67 | exit 68 | fi 69 | 70 | printf " Performing HMM search...\n" 71 | 72 | ### running hmm search ### 73 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 74 | 75 | ### calculating % completion and redundancy ### 76 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 77 | do 78 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 79 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 80 | 81 | ## making list here of only those present in exactly 1 copy 82 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 83 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 84 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 85 | 86 | ## adding SCG-hit counts to table 87 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 88 | 89 | # total number of unique SCG hits 90 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 91 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 92 | 93 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 94 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 95 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 96 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 97 | 98 | # want to put a notice out if estimated redundancy is greater than 10 99 | # needs to be an integer for bash comparison, so multiplying by 100 first 100 | 101 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 102 | 103 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 104 | 105 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 106 | 107 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 108 | 109 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 110 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 111 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 112 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 113 | printf " You may want to consider taking a closer look and/or removing it from the\n" 114 | printf " from the input genomes.\n\n" 115 | 116 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 117 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 118 | 119 | # writing to table of genomes with questionable redundancy estimates 120 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 121 | 122 | else 123 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 124 | 125 | fi 126 | 127 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 128 | taxid="NA" 129 | 130 | ## writing summary info to table ## 131 | printf "$assembly\t$file\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Amino_acid_genomes_summary_info.tsv 132 | 133 | ### Pulling out hits for this genome ### 134 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 135 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 136 | 137 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 138 | if [ $best_hit_mode == "false" ]; then 139 | 140 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 141 | do 142 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 143 | done 144 | 145 | # if best-hit mode is on, taking best hit 146 | else 147 | 148 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 149 | do 150 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 151 | done 152 | 153 | fi 154 | 155 | ## searching for additional targets if provided 156 | # getting count of genes if there are additional targets 157 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 158 | 159 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 160 | 161 | fi 162 | 163 | ## KOs 164 | if [ $ko_targets == "true" ]; then 165 | 166 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 167 | 168 | fi 169 | 170 | ## Pfams 171 | if [ $additional_pfam_targets == "true" ]; then 172 | 173 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 174 | 175 | fi 176 | 177 | rm -rf ${tmp_dir}/${assembly}_genes1.tmp ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 178 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 179 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 180 | 181 | done < $1 182 | -------------------------------------------------------------------------------- /bin/gtt-append-fasta-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script will modify headers of sequences of a multifasta, specific for use in GToTree.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Starting fasta file", action="store", dest="input_fasta", required=True) 12 | parser.add_argument("-w", "--desired_append", help='Name to append to seqs (default: "Seq"', action="store", dest="wanted_name", default="Seq") 13 
| parser.add_argument("-o", "--output_fasta_name", help='Output fasta file (default: "Renamed.fasta").', dest="output_fasta_name", default="Renamed.fasta") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | in_fasta = open(args.input_fasta, "r") 22 | new_header = args.wanted_name 23 | out_fasta = open(args.output_fasta_name, "w") 24 | 25 | n = 0 26 | 27 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 28 | n = n + 1 29 | out_fasta.write(">" + new_header + "_" + seq_record.id + "_" + str(n) + "\n") 30 | out_fasta.write(str(seq_record.seq) + "\n") 31 | 32 | in_fasta.close() 33 | out_fasta.close() 34 | -------------------------------------------------------------------------------- /bin/gtt-cat-alignments: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from glob import glob 5 | import argparse 6 | import os.path 7 | 8 | parser = argparse.ArgumentParser(description='This script is a helper script to concatenate fasta-formatted multiple sequence alignment files, and generate partitions file.') 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-t", "--tmp-dir", help="The working tmp_dir for the current GToTree run", action="store", dest="tmp_dir", required=True) 13 | required.add_argument("-o", "--output-dir", help="The output_dir for the current GToTree run", action="store", dest="output_dir", required=True) 14 | parser.add_argument("--nucleotides", help="Provide this flag if user specified nucleotide mode", action="store_true") 15 | 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | tmp_dir = args.tmp_dir + "/" 24 | output_dir = args.output_dir + "/" 25 | 26 | # getting list of all alignment files 27 | if not args.nucleotides: 28 | 29 | list_of_alignment_files = glob(tmp_dir + "*_all_aligned.faa") 30 | 31 | else: 32 | 33 | list_of_alignment_files = glob(tmp_dir + "*_all_aligned.fa") 34 | 35 | # initializing dictionary that will hold headers as keys and a list of all seqs to be cat'd as values 36 | dict_of_genomes = {} 37 | 38 | # getting headers (they are the same in all files and all are found in all files at this point, so only need to pull from one) 39 | with open (list_of_alignment_files[0]) as file: 40 | for line in file: 41 | if line.strip().startswith(">"): 42 | dict_of_genomes[(line.strip().lstrip(">"))] = [] 43 | 44 | 45 | # iterating through all files adding seqs 46 | for file in list_of_alignment_files: 47 | with open(file) as fasta: 48 | curr_header="" 49 | for line in fasta: 50 | line = line.strip() 51 | if line.startswith(">"): 52 | curr_header=line.lstrip(">") 53 | else: 54 | dict_of_genomes[curr_header].append(line) 55 | 56 | 57 | # writing out concatenated (horizontally) sequence file 58 | 59 | if not args.nucleotides: 60 | with open(output_dir + "Aligned_SCGs.faa", "w") as out: 61 | for header, seqs in dict_of_genomes.items(): 62 | out.write(">" + header + "\n") 63 | out.write("XXXXX".join(seqs) + "\n") 64 | 65 | else: 66 | with open(output_dir + "Aligned_SCGs.fa", "w") as out: 67 | for header, seqs in dict_of_genomes.items(): 68 | out.write(">" + header + "\n") 69 | out.write("NNNNNN".join(seqs) + "\n") 70 | 71 | # making partitions file 72 | # getting list of gene names in order they were cat'd together 73 | if not args.nucleotides: 74 | gene_list = [os.path.basename(x)[:-16] for x in 
list_of_alignment_files] 75 | else: 76 | gene_list = [os.path.basename(x)[:-15] for x in list_of_alignment_files] 77 | 78 | # all are same length, so just need one genome entry, then to count the bases per element in dict values list, and add 5 for the XXXXX spacers 79 | # getting all alignment lengths 80 | 81 | alignment_lengths_list = [len(x) for x in list(dict_of_genomes.values())[0]] 82 | 83 | curr_start = 1 84 | curr_stop = 0 85 | 86 | with open(output_dir + "Partitions.txt", "w") as out: 87 | for i in range(0,len(gene_list)): 88 | curr_stop = curr_start + alignment_lengths_list[i] - 1 89 | 90 | if not args.nucleotides: 91 | out.write("AA, " + str(gene_list[i]) + " = " + str(curr_start) + "-" + str(curr_stop) + "\n") 92 | curr_start = curr_stop + 6 93 | else: 94 | out.write("DNA, " + str(gene_list[i]) + " = " + str(curr_start) + "-" + str(curr_stop) + "\n") 95 | curr_start = curr_stop + 7 96 | -------------------------------------------------------------------------------- /bin/gtt-check-or-setup-GTDB-files: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | for setting up reference files for the glorious Genome Taxonomy Database (gtdb.ecogenomic.org/). 6 | 7 | For examples, please visit the GToTree wiki here: https://github.com/AstrobioMike/GToTree/wiki/example-usage 8 | """ 9 | 10 | import sys 11 | import os 12 | import urllib.request 13 | import pandas as pd 14 | import textwrap 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser(description="This is a helper program to facilitate setting up the reference files for the \ 18 | glorious Genome Taxonomy Database (gtdb.ecogenomic.org). 
It's really meant for internal \ 19 | use only by the main GToTree program.") 20 | 21 | args = parser.parse_args() 22 | 23 | ################################################################################ 24 | 25 | def main(): 26 | 27 | ## checking env variable is set and writable 28 | check_location_var_is_set_and_writable("GTDB_dir") 29 | 30 | ## setting up ref GTDB files if needed 31 | check_and_or_get_gtdb_files(os.environ["GTDB_dir"]) 32 | 33 | ################################################################################ 34 | 35 | 36 | # setting some colors 37 | tty_colors = { 38 | 'green' : '\033[0;32m%s\033[0m', 39 | 'yellow' : '\033[0;33m%s\033[0m', 40 | 'red' : '\033[0;31m%s\033[0m' 41 | } 42 | 43 | 44 | ### functions ### 45 | def color_text(text, color='green'): 46 | if sys.stdout.isatty(): 47 | return tty_colors[color] % text 48 | else: 49 | return text 50 | 51 | 52 | def wprint(text): 53 | print(textwrap.fill(text, width=80, initial_indent=" ", 54 | subsequent_indent=" ", break_on_hyphens=False)) 55 | 56 | 57 | def check_location_var_is_set_and_writable(variable): 58 | 59 | # making sure there is an env variable 60 | try: 61 | path = os.environ[variable] 62 | 63 | if path == "": 64 | raise 65 | 66 | except: 67 | print() 68 | wprint(color_text("The environment variable '" + str(variable) + "' does not seem to be set, and we need it if wanting to add GTDB taxonomic lineages :(", "red")) 69 | print() 70 | wprint("Try to set it with `gtt-data-locations set`, then run GToTree again.") 71 | print("\nExiting for now.\n") 72 | sys.exit(1) 73 | 74 | # making sure path is writable for the user 75 | path_writable = os.access(path, os.W_OK) 76 | 77 | if not path_writable: 78 | print() 79 | wprint(color_text("The environment variable '" + str(variable) + "' does not seem to be writable, and we need it to be if wanting to add GTDB taxonomic lineages :(", "red")) 80 | print() 81 | wprint("Try to set it somewhere else with `gtt-data-locations set`, then run GToTree again.") 82 | print("\nExiting for now.\n") 83 | sys.exit(1) 84 | 85 | return() 86 | 87 | 88 | def gen_gtdb_tab(location): 89 | """ downloads and parses the GTDB info tables """ 90 | 91 | # getting archaea 92 | # arc_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz") 93 | arc_tsv_gz = urllib.request.urlopen("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz") 94 | arc_tab = pd.read_csv(arc_tsv_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False) 95 | arc_tab.rename(columns={arc_tab.columns[0]:"accession"}, inplace=True) 96 | arc_tab.dropna(inplace=True, how="all") 97 | 98 | # getting bacteria 99 | # bac_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz") 100 | bac_tsv_gz = urllib.request.urlopen("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz") 101 | bac_tab = pd.read_csv(bac_tsv_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False) 102 | bac_tab.rename(columns={bac_tab.columns[0]:"accession"}, inplace=True) 103 | bac_tab.dropna(inplace=True, how="all") 104 | 105 | # combining 106 | gtdb_tab = pd.concat([arc_tab, bac_tab]) 107 | 108 | # splitting gtdb taxonomy column into 7 and dropping the single column 109 | domain, phylum, rclass, order, family, genus, species = [], [], [], [], [], [], [] 110 | 111 | for index, row in gtdb_tab.iterrows(): 112 | curr_acc = row["accession"] 113 | tax_list = 
row["gtdb_taxonomy"].split(";") 114 | 115 | if len(tax_list) != 7: 116 | wprint(color_text("GTDB entry " + curr_acc + " doesn't seem to have 7-column lineage info. Something is likely wrong :(", "yellow")) 117 | print("") 118 | wprint("If this continues to happen, please file an issue at github.com/AstrobioMike/GToTree/issues") 119 | print("") 120 | wprint("Aborting for now.") 121 | print("") 122 | sys.exit(0) 123 | 124 | else: 125 | domain.append(tax_list[0][3:]) 126 | phylum.append(tax_list[1][3:]) 127 | rclass.append(tax_list[2][3:]) 128 | order.append(tax_list[3][3:]) 129 | family.append(tax_list[4][3:]) 130 | genus.append(tax_list[5][3:]) 131 | species.append(tax_list[6][3:]) 132 | 133 | gtdb_tab.insert(1, "species", species) 134 | gtdb_tab.insert(1, "genus", genus) 135 | gtdb_tab.insert(1, "family", family) 136 | gtdb_tab.insert(1, "order", order) 137 | gtdb_tab.insert(1, "class", rclass) 138 | gtdb_tab.insert(1, "phylum", phylum) 139 | gtdb_tab.insert(1, "domain", domain) 140 | 141 | # writing out 142 | gtdb_tab.to_csv(location + "GTDB-arc-and-bac-metadata.tsv", index=False, sep="\t") 143 | 144 | gtdb_version_info = urllib.request.urlretrieve("https://data.gtdb.ecogenomic.org/releases/latest/VERSION.txt", location + "GTDB-version-info.txt") 145 | 146 | 147 | def check_and_or_get_gtdb_files(GTDB_dir): 148 | """ checks for and sets up ref GTDB files if needed """ 149 | 150 | if os.path.exists(GTDB_dir + "GTDB-arc-and-bac-metadata.tsv") and os.path.exists(GTDB_dir + "GTDB-version-info.txt"): 151 | 152 | sys.exit(0) 153 | 154 | # generating when table doesn't exist yet 155 | else: 156 | wprint(color_text("Downloading and parsing archaeal and bacterial metadata tables from GTDB (only needs to be done once)...", "yellow")) 157 | print("") 158 | 159 | gen_gtdb_tab(GTDB_dir) 160 | 161 | 162 | if __name__ == "__main__": 163 | main() 164 | -------------------------------------------------------------------------------- /bin/gtt-check-wanted-lineage-info: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='This script is for making sure the user-specified desired lineage info is interpretable.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-w", "--wanted_ranks", help="Single-column file with wanted ranks", action="store", dest="wanted_ranks", required=True) 11 | parser.add_argument("-o", "--output_file_with_uninterpretable_ranks", help='Output file default: "gtotree.uninterpretable_ranks.tmp"', action="store", dest="output_file", default="gtotree.uninterpretable_ranks.tmp") 12 | 13 | if len(sys.argv)==1: 14 | parser.print_help(sys.stderr) 15 | sys.exit(0) 16 | 17 | args = parser.parse_args() 18 | 19 | out_file = open(args.output_file, "w") 20 | 21 | acceptable_ranks = ["domain","phylum","class","order","family","genus","species","strain"] 22 | 23 | with open(args.wanted_ranks, "r") as wanted_ranks: 24 | for line in wanted_ranks: 25 | curr_line = line.strip() 26 | lower_line = curr_line.lower() 27 | 28 | if lower_line not in acceptable_ranks: 29 | out_file.write(str(curr_line) + "\n") 30 | 31 | out_file.close() 32 | -------------------------------------------------------------------------------- /bin/gtt-clean-after-test.sh: -------------------------------------------------------------------------------- 1 | rm -rf GToTree-test-data/ GToTree-test-output/ 2 | 
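For context, this one-liner pairs with `gtt-test.sh`: the two directories it removes are presumably the ones a test run leaves behind. A plausible local check-then-clean cycle (the `http` variant is the option noted in the v1.8.2 changelog entry, for situations where the default ftp-based downloads are problematic) would be:

    gtt-test.sh               # run GToTree's built-in end-to-end test
    # gtt-test.sh http        # alternative noted in the changelog for http-based downloads
    gtt-clean-after-test.sh   # remove GToTree-test-data/ and GToTree-test-output/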
-------------------------------------------------------------------------------- /bin/gtt-combine-kofamscan-results.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | all_assembly_ids=${1} 4 | tmp_dir=${2} 5 | output_dir=${3} 6 | unique_target_KOs=${4} 7 | 8 | KO_output_dir="${output_dir}/KO_search_results" 9 | KO_hits_fasta_output_dir="${KO_output_dir}/KO_hit_seqs/" 10 | 11 | mkdir -p ${KO_hits_fasta_output_dir} 12 | 13 | # combining all fasta files for each individual KO 14 | for ko in $(cat ${unique_target_KOs}); do 15 | 16 | find ${tmp_dir}/kofamscan/ -name ${ko}.faa -exec cat {} \; > ${KO_hits_fasta_output_dir}/${ko}-hits.faa 17 | 18 | # removing if there were none 19 | if [ ! -s ${KO_hits_fasta_output_dir}/${ko}-hits.faa ]; then 20 | 21 | rm ${KO_hits_fasta_output_dir}/${ko}-hits.faa 22 | 23 | fi 24 | 25 | done 26 | 27 | # combining counts into one table 28 | final_counts_tab="${KO_output_dir}/KO-hit-counts.tsv" 29 | 30 | 31 | # starting first row 32 | # cat <( printf "KO_ID\n" ) ${unique_target_KOs} > ${building_counts_tab} 33 | paste <( printf "assembly_id\ttotal_gene_count" ) <( tr "\n" "\t" < ${unique_target_KOs} | sed 's/\t$/\n/' ) > ${final_counts_tab} 34 | 35 | # looping through assemblies and adding them 36 | for assembly_id in $(cat ${all_assembly_ids}); do 37 | 38 | cat ${tmp_dir}/kofamscan/${assembly_id}/KO-counts.txt >> ${final_counts_tab} 39 | 40 | done 41 | -------------------------------------------------------------------------------- /bin/gtt-count-bases-per-seq: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import sys 6 | 7 | parser = argparse.ArgumentParser(description="This script takes a multifasta as input and returns a tab-delimited file with two columns, header and number of bases or amino acids, for each sequence." 
) 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Original fasta file", action="store", dest="input_fasta") 12 | parser.add_argument("-o", "--output_txt_file", help='Name of output txt file (default: "Num_bps.txt")', action="store", dest="output_file", default="Num_bps.txt") 13 | 14 | if len(sys.argv)==1: 15 | parser.print_help(sys.stderr) 16 | sys.exit(0) 17 | 18 | args = parser.parse_args() 19 | 20 | in_fasta = open(args.input_fasta, "r") 21 | out_file = open(args.output_file, "w") 22 | 23 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 24 | out_file.write(seq_record.id + "\t" + str(len(seq_record.seq)) + "\n") 25 | 26 | in_fasta.close() 27 | out_file.close() 28 | 29 | 30 | -------------------------------------------------------------------------------- /bin/gtt-fasta-parallel-nt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | num_cpus=$4 12 | hmm_target_genes_total=$5 13 | output_dir=$6 14 | best_hit_mode=$7 15 | additional_pfam_targets=$8 16 | ko_targets=$9 17 | target_KOs=${10} 18 | 19 | 20 | ### kill backstop 21 | # if there is a problem, all child processes launched (by this script) will exit immediately, 22 | # upon returning to main script, will check and terminate parent process 23 | if [ -s ${tmp_dir}/kill_fasta_parallel.prodigal ]; then 24 | exit 25 | fi 26 | 27 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 28 | if $(file $1 | grep -q "gzip"); then 29 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 30 | file_location=${1%.*} 31 | gunzip -f -c $1 > $file_location 32 | assembly="$(basename ${file_location%.*})" 33 | else 34 | file_location=$1 35 | assembly="$(basename ${1%.*})" 36 | was_gzipped=FALSE 37 | fi 38 | 39 | printf " -------------------------------------------------------------------------- \n\n" 40 | printf " Genome: ${GREEN}$assembly${NC}\n" 41 | 42 | # adding assembly to ongoing genomes list 43 | echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp 44 | 45 | num=$((num+1)) # to track progress 46 | 47 | prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.faa.tmp -d ${tmp_dir}/${assembly}_genes1.fa.tmp > /dev/null 2> ${file_location}_prodigal.stderr 48 | 49 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 50 | printf "$assembly\n" >> ${tmp_dir}/kill_fasta_parallel.prodigal 51 | rm -rf ${file_location}_prodigal.stderr 52 | exit 53 | else 54 | rm -rf ${file_location}_prodigal.stderr 55 | fi 56 | 57 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.faa.tmp > ${tmp_dir}/${assembly}_genes2.faa.tmp 58 | 59 | ## removing gunzipped genome file if it was gunzipped 60 | if [ $was_gzipped == "TRUE" ]; then 61 | rm -rf $file_location 62 | fi 63 | 64 | ## renaming seqs to have assembly name to avoid problems with odd characters and how hmmer parses and such 65 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.faa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.faa.tmp 66 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.fa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.fa.tmp 67 | 68 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 69 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.faa.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.faa.tmp 70 | # don't need to filter the nucleotide fasta, as we will only be looking for some found in the amino-acid fasta 71 | 72 | ### running hmm search ### 73 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.faa.tmp > /dev/null 74 | 75 | ### calculating % completion and redundancy ### 76 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 77 | do 78 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 79 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 80 | 81 | ## making list here of only those present in exactly 1 copy 82 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 83 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 84 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 85 | 86 | ## adding SCG-hit counts to table 87 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 88 | 89 | # total number of unique SCG hits 90 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 91 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 92 | 93 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 94 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 95 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 96 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 97 | 98 | # want to put a notice out if estimated redundancy is greater than 10 99 | # needs to be an integer for bash comparison, so multiplying by 100 first 100 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 101 | 102 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 103 | 104 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 105 | 106 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 107 | 108 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 109 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 110 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 111 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 112 | printf " You may want to consider taking a closer look and/or removing it from the\n" 113 | printf " from the input genomes.\n\n" 114 | 115 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 116 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 117 | 118 | # writing to table of genomes with questionable redundancy estimates 119 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 120 | 121 | else 122 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 123 | 124 | fi 125 | 126 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 127 | taxid="NA" 128 | 129 | ## writing summary info to table ## 130 | printf "$assembly\t$file\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Fasta_genomes_summary_info.tsv 131 | 132 | 133 | ### Pulling out hits for this genome (nucleotide as specified by user) ### 134 | target_genes_suffix="_genes.fa.tmp" 135 | 136 | # indexing 137 | esl-sfetch --index ${tmp_dir}/${assembly}${target_genes_suffix} > /dev/null 138 | 139 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 140 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 141 | if [ $best_hit_mode == "false" ]; then 142 | 143 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 144 | do 145 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 146 | done 147 | 148 | # if best-hit mode is on, taking best hit 149 | else 150 | 151 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 152 | do 153 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 154 | done 155 | 156 | fi 157 | 158 | 159 | ## searching for additional targets if provided 160 | # getting count of genes if there are additional targets 161 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 162 | 163 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.faa.tmp) 164 | 165 | fi 166 | 167 | ## KOs 168 | if [ $ko_targets == "true" ]; then 169 | 170 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 171 | 172 | fi 173 | 174 | ## Pfams 175 | if [ $additional_pfam_targets == "true" ]; then 176 | 177 | 
gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 178 | 179 | fi 180 | 181 | rm -rf ${tmp_dir}/${assembly}_genes*.tmp* 182 | rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 183 | rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 184 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 185 | -------------------------------------------------------------------------------- /bin/gtt-fasta-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | num_cpus=$4 12 | hmm_target_genes_total=$5 13 | output_dir=$6 14 | best_hit_mode=$7 15 | additional_pfam_targets=$8 16 | ko_targets=$9 17 | target_KOs=${10} 18 | 19 | 20 | ### kill backstop 21 | # if there is a problem, all child processes launched (by this script) will exit immediately, 22 | # upon returning to main script, will check and terminate parent process 23 | if [ -s ${tmp_dir}/kill_fasta_parallel.prodigal ]; then 24 | exit 25 | fi 26 | 27 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 28 | if $(file $1 | grep -q "gzip"); then 29 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 30 | file_location=${1%.*} 31 | gunzip -f -c $1 > $file_location 32 | assembly="$(basename ${file_location%.*})" 33 | else 34 | file_location=$1 35 | assembly="$(basename ${1%.*})" 36 | was_gzipped=FALSE 37 | fi 38 | 39 | printf " -------------------------------------------------------------------------- \n\n" 40 | printf " Genome: ${GREEN}$assembly${NC}\n" 41 | 42 | # adding assembly to ongoing genomes list 43 | echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp 44 | 45 | num=$((num+1)) # to track progress 46 | 47 | 48 | prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr 49 | 50 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 51 | printf "$assembly\n" >> ${tmp_dir}/kill_fasta_parallel.prodigal 52 | rm -rf ${file_location}_prodigal.stderr 53 | exit 54 | else 55 | rm -rf ${file_location}_prodigal.stderr 56 | fi 57 | 58 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 59 | 60 | 61 | ## removing gunzipped genome file if it was gunzipped 62 | if [ $was_gzipped == "TRUE" ]; then 63 | rm -rf $file_location 64 | fi 65 | 66 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 67 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes2.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes3.tmp 68 | 69 | ## renaming seqs to have assembly name (also to ensure simple headers) 70 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes3.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 71 | 72 | ### running hmm search ### 73 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 74 | 75 | ### calculating % completion and redundancy ### 76 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 77 | do 78 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 79 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 80 | 81 | ## making list here of only those present in exactly 1 copy 82 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 83 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 84 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 85 | 86 | ## adding SCG-hit counts to table 87 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 88 | 89 | # total number of unique SCG hits 90 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 91 | 92 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 93 | 94 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 95 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 96 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 97 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 98 | 99 | ### want to put an explicit notice out if estimated redundancy is greater than 10% 100 | # needs to be an integer for bash comparison, so multiplying by 100 first 101 | 102 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 103 | 104 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 105 | 106 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 107 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 108 | 109 | 110 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 111 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 112 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 113 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 114 | printf " You may want to consider taking a closer look and/or removing it from the\n" 115 | printf " from the input genomes.\n\n" 116 | 117 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 118 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 119 | 120 | # writing to table of genomes with questionable redundancy estimates 121 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 122 | 123 | else 124 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 125 | fi 126 | 127 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 128 | taxid="NA" 129 | 130 | ## writing summary info to table ## 131 | printf "$assembly\t$1\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Fasta_genomes_summary_info.tsv 132 | 133 | ### Pulling out hits for this genome ### 134 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 135 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 136 | 137 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 138 | if [ $best_hit_mode == "false" ]; then 139 | 140 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 141 | do 142 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 143 | done 144 | 145 | # if best-hit mode is on, taking best hit 146 | else 147 | 148 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 149 | do 150 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 151 | done 152 | 153 | fi 154 | 155 | ## searching for additional targets if provided 156 | # getting count of genes if there are additional targets 157 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 158 | 159 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 160 | 161 | fi 162 | 163 | ## KOs 164 | if [ $ko_targets == "true" ]; then 165 | 166 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 167 | 168 | fi 169 | 170 | ## Pfams 171 | if [ $additional_pfam_targets == "true" ]; then 172 | 173 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 174 | 175 | fi 176 | 177 | rm -rf 
${tmp_dir}/${assembly}_genes1.tmp ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_genes3.tmp 178 | rm -rf ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 179 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 180 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 181 | -------------------------------------------------------------------------------- /bin/gtt-fasta-serial-nt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | fasta_genomes_total=$4 12 | num_cpus=$5 13 | hmm_target_genes_total=$6 14 | output_dir=$7 15 | best_hit_mode=$8 16 | additional_pfam_targets=$9 17 | ko_targets=${10} 18 | target_KOs=${11} 19 | 20 | # looping through the lines of the provided [-f] file (this loop operates on one genome at a time) 21 | while IFS=$'\t' read -r -a file 22 | do 23 | 24 | ### kill backstop 25 | # if there is a problem on any iteration, exiting this subprocess and then exiting main script with report of problem assembly 26 | if [ -s ${tmp_dir}/kill_fasta_serial.prodigal ]; then 27 | exit 28 | fi 29 | 30 | 31 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 32 | if $(file $file | grep -q "gzip"); then 33 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 34 | file_location=${file%.*} 35 | gunzip -f -c $file > $file_location 36 | assembly="$(basename ${file_location%.*})" 37 | else 38 | file_location=$file 39 | assembly="$(basename ${file%.*})" 40 | was_gzipped=FALSE 41 | fi 42 | 43 | # adding assembly to ongoing genomes list 44 | echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp 45 | 46 | num=$((num+1)) # to track progress 47 | 48 | printf " -------------------------------------------------------------------------- \n" 49 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $fasta_genomes_total total.\n" 50 | printf " -------------------------------------------------------------------------- \n\n" 51 | 52 | printf " Getting coding seqs...\n\n" 53 | 54 | ## running prodigal to get coding sequences 55 | prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.faa.tmp -d ${tmp_dir}/${assembly}_genes1.fa.tmp > /dev/null 2> ${file_location}_prodigal.stderr 56 | 57 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 58 | printf "$assembly\n" >> ${tmp_dir}/kill_fasta_serial.prodigal 59 | rm -rf ${file_location}_prodigal.stderr 60 | 61 | exit 62 | else 63 | rm -rf ${file_location}_prodigal.stderr 64 | fi 65 | 66 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.faa.tmp > ${tmp_dir}/${assembly}_genes2.faa.tmp 67 | 68 | ## removing gunzipped genome file if it was gunzipped 69 | if [ $was_gzipped == "TRUE" ]; then 70 | rm -rf $file_location 71 | fi 72 | 73 | ## renaming seqs to have assembly name to avoid problems with odd characters and how hmmer parses and such 74 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.faa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.faa.tmp 75 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.fa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.fa.tmp 76 | 77 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 78 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.faa.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.faa.tmp 79 | # don't need to filter the nucleotide fasta, as we will only be looking for some found in the amino-acid fasta 80 | 81 | printf " Performing HMM search...\n" 82 | 83 | ### running hmm search ### 84 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.faa.tmp > /dev/null 85 | 86 | ### calculating % completion and redundancy ### 87 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 88 | do 89 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 90 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 91 | 92 | ## making list here of only those present in exactly 1 copy 93 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 94 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 95 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 96 | 97 | ## adding SCG-hit counts to table 98 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 99 | 100 | # total number of unique SCG hits 101 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 102 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 103 | 104 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 105 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 106 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 107 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 108 | 109 | # want to put a notice out if estimated redundancy is greater than 10 110 | # needs to be an integer for bash comparison, so multiplying by 100 first 111 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 112 | 113 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 114 | 115 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 116 | 117 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 118 | 119 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 120 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 121 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 122 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 123 | printf " You may want to consider taking a closer look and/or removing it from the\n" 124 | printf " from the input genomes.\n\n" 125 | 126 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 127 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 128 | 129 | # writing to table of genomes with questionable redundancy estimates 130 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 131 | 132 | else 133 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 134 | 135 | fi 136 | 137 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 138 | taxid="NA" 139 | 140 | ## writing summary info to table ## 141 | printf "$assembly\t$file\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Fasta_genomes_summary_info.tsv 142 | 143 | 144 | ### Pulling out hits for this genome (nucleotide as specified by user) ### 145 | target_genes_suffix="_genes.fa.tmp" 146 | 147 | # indexing 148 | esl-sfetch --index ${tmp_dir}/${assembly}${target_genes_suffix} > /dev/null 149 | 150 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 151 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 152 | if [ $best_hit_mode == "false" ]; then 153 | 154 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 155 | do 156 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 157 | done 158 | 159 | # if best-hit mode is on, taking best hit 160 | else 161 | 162 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 163 | do 164 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 165 | done 166 | 167 | fi 168 | 169 | 170 | ## searching for additional targets if provided 171 | # getting count of genes if there are additional targets 172 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 173 | 174 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.faa.tmp) 175 | 176 | fi 177 | 178 | ## KOs 179 | if [ $ko_targets == "true" ]; then 180 | 181 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 182 | 183 | fi 184 | 185 | ## Pfams 186 | if [ $additional_pfam_targets == "true" ]; then 187 | 188 | 
gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 189 | 190 | fi 191 | 192 | rm -rf ${tmp_dir}/${assembly}_genes*.tmp* 193 | rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 194 | rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 195 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 196 | 197 | done < $1 198 | -------------------------------------------------------------------------------- /bin/gtt-fasta-serial.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | fasta_genomes_total=$4 12 | num_cpus=$5 13 | hmm_target_genes_total=$6 14 | output_dir=$7 15 | best_hit_mode=$8 16 | additional_pfam_targets=$9 17 | ko_targets=${10} 18 | target_KOs=${11} 19 | 20 | # looping through the lines of the provided [-f] file (this loop operates on one genome at a time) 21 | while IFS=$'\t' read -r -a file 22 | do 23 | 24 | ### kill backstop 25 | # if there is a problem on any iteration, exiting this subprocess and then exiting main script with report of problem assembly 26 | if [ -s ${tmp_dir}/kill_fasta_serial.prodigal ]; then 27 | exit 28 | fi 29 | 30 | 31 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 32 | if $(file $file | grep -q "gzip"); then 33 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 34 | file_location=${file%.*} 35 | gunzip -f -c $file > $file_location 36 | assembly="$(basename ${file_location%.*})" 37 | else 38 | file_location=$file 39 | assembly="$(basename ${file%.*})" 40 | was_gzipped=FALSE 41 | fi 42 | 43 | # adding assembly to ongoing genomes list 44 | echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp 45 | 46 | num=$((num+1)) # to track progress 47 | 48 | printf " -------------------------------------------------------------------------- \n" 49 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $fasta_genomes_total total.\n" 50 | printf " -------------------------------------------------------------------------- \n\n" 51 | 52 | printf " Getting coding seqs...\n\n" 53 | 54 | ## running prodigal to get coding sequences 55 | prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr 56 | 57 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 58 | printf "$assembly\n" >> ${tmp_dir}/kill_fasta_serial.prodigal 59 | rm -rf ${file_location}_prodigal.stderr 60 | 61 | exit 62 | else 63 | rm -rf ${file_location}_prodigal.stderr 64 | fi 65 | 66 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 67 | 68 | ## removing gunzipped genome file if it was gunzipped 69 | if [ $was_gzipped == "TRUE" ]; then 70 | rm -rf $file_location 71 | fi 72 | 73 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 74 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes2.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes3.tmp 75 | 76 | ## renaming seqs to have assembly name (also to ensure simple headers) 77 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes3.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 78 | 79 | printf " Performing HMM search...\n" 80 | 81 | ### running hmm search ### 82 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 83 | 84 | ### calculating % completion and redundancy ### 85 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 86 | do 87 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 88 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 89 | 90 | ## making list here of only those present in exactly 1 copy 91 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 92 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 93 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 94 | 95 | ## adding SCG-hit counts to table 96 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 97 | 98 | # total number of unique SCG hits 99 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 100 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 101 | 102 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 103 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 104 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 105 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 106 | 107 | # want to put a notice out if estimated redundancy is greater than 10 108 | # needs to be an integer for bash comparison, so multiplying by 100 first 109 | 110 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 111 | 112 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 113 | 114 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 115 | 116 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 117 | 118 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 119 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 120 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 121 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 122 | printf " You may want to consider taking a closer look and/or removing it from the\n" 123 | printf " from the input genomes.\n\n" 124 | 125 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 126 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 127 | 128 | # writing to table of genomes with questionable redundancy estimates 129 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 130 | 131 | else 132 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 133 | 134 | fi 135 | 136 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 137 | taxid="NA" 138 | 139 | ## writing summary info to table ## 140 | printf "$assembly\t$file\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Fasta_genomes_summary_info.tsv 141 | 142 | 143 | ### Pulling out hits for this genome ### 144 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 145 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 146 | 147 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 148 | if [ $best_hit_mode == "false" ]; then 149 | 150 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 151 | do 152 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 153 | done 154 | 155 | # if best-hit mode is on, taking best hit 156 | else 157 | 158 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 159 | do 160 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 161 | done 162 | 163 | fi 164 | 165 | 166 | ## searching for additional targets if provided 167 | # getting count of genes if there are additional targets 168 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 169 | 170 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 171 | 172 | fi 173 | 174 | ## KOs 175 | if [ $ko_targets == "true" ]; then 176 | 177 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 178 | 179 | fi 180 | 181 | ## Pfams 182 | if [ $additional_pfam_targets == "true" ]; then 183 | 184 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 185 | 186 | fi 187 | 188 
| rm -rf ${tmp_dir}/${assembly}_genes1.tmp ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_genes3.tmp 189 | rm -rf ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 190 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 191 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 192 | 193 | done < $1 194 | -------------------------------------------------------------------------------- /bin/gtt-filter-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | len_cutoff=$3 11 | nucleotide=$4 12 | 13 | ### filtering out sequences that are too long or too short ### 14 | if [ $nucleotide != 'false' ]; then 15 | target_gene_suffix=".fa" 16 | else 17 | target_gene_suffix=".faa" 18 | fi 19 | 20 | 21 | gtt-count-bases-per-seq -i ${tmp_dir}/${1}_hits${target_gene_suffix} -o ${tmp_dir}/${1}_Num_bps.tmp 22 | cut -f2 ${tmp_dir}/${1}_Num_bps.tmp > ${tmp_dir}/${1}_lengths.tmp 23 | median=$(gtt-get-median.sh ${tmp_dir}/${1}_lengths.tmp) 24 | buff=$(echo "$median * $len_cutoff" | bc) 25 | min_len=$(echo "$median - $buff" | bc) 26 | min_len_rnd=$(printf "%.0f\n" $min_len) 27 | max_len=$(echo "$median + $buff" | bc) 28 | max_len_rnd=$(printf "%.0f\n" $max_len) 29 | 30 | gtt-filter-seqs-by-length -i ${tmp_dir}/${1}_hits${target_gene_suffix} -m $min_len_rnd -M $max_len_rnd -o ${tmp_dir}/${1}_hits_filtered.tmp > ${tmp_dir}/${1}_filter.out.tmp 31 | 32 | cat <(printf "\n Filtering ${GREEN}${1}${NC} sequences by length...\n") ${tmp_dir}/${1}_filter.out.tmp 33 | 34 | rm ${tmp_dir}/${1}_Num_bps.tmp ${tmp_dir}/${1}_lengths.tmp ${tmp_dir}/${1}_filter.out.tmp 35 | -------------------------------------------------------------------------------- /bin/gtt-filter-seqs-by-length: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import sys 6 | 7 | parser = argparse.ArgumentParser(description="This script takes a multifasta as input and filters out sequences based on length.") 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Original fasta file", action="store", dest="input_fasta") 12 | required.add_argument("-m", "--min_length", help="minimum length retained", action="store", dest="min_len") 13 | required.add_argument("-M", "--max_length", help="maximum length retained", action="store", dest="max_len") 14 | parser.add_argument("-o", "--output_file", help='name of output fasta file (default: "filtered.fasta")', action="store", dest="output_file", default="filtered.fasta") 15 | parser.add_argument("-q", "--quiet", help="don't report percentage of retained sequences", action = "store_true") 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | in_fasta = open(args.input_fasta, "r") 24 | out_file = open(args.output_file, "w") 25 | min_len = args.min_len 26 | max_len = args.max_len 27 | 28 | total=0 29 | kept=0 30 | 31 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 32 | 33 | total+=1 34 | 35 | if len(seq_record.seq) >= int(min_len) and len(seq_record.seq) <= int(max_len): 36 | 37 | kept+=1 38 | out_file.write(">" + 
str(seq_record.description) + "\n" + str(seq_record.seq) + "\n") 39 | 40 | 41 | if not args.quiet: 42 | 43 | perc = round(float(kept) / float(total) * 100, 2) 44 | print("\n\tRetained " + str(kept) + " sequences of the initial " + str(total) + " (" + str(perc) + "%).\n") 45 | 46 | 47 | 48 | in_fasta.close() 49 | out_file.close() 50 | -------------------------------------------------------------------------------- /bin/gtt-gen-KO-iToL-files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$1 10 | output_dir=$2 11 | 12 | ## setting variable holding whether or not any labels were swapped 13 | if grep -q label <(head -n 1 ${output_dir}/Genomes_summary_info.tsv); then 14 | labels_swapped='true' 15 | else 16 | labels_swapped='false' 17 | fi 18 | 19 | curr_target_line=0 20 | 21 | for target in $(cat ${tmp_dir}/uniq_ko_targets.tmp) 22 | do 23 | 24 | curr_target_line=$(($curr_target_line + 1)) 25 | 26 | target_col=$(($curr_target_line + 2)) 27 | 28 | awk -F $'\t' -v col="$target_col" ' $col > 0 { print $1 } ' ${output_dir}/KO_search_results/KO-hit-counts.tsv | tail -n +2 > ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 29 | 30 | if [ -s ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ]; then 31 | 32 | if [ $labels_swapped == 'true' ]; then 33 | 34 | for genome in $(cat ${tmp_dir}/Genomes_with_hits_to_${target}.tmp) 35 | do 36 | 37 | grep -m1 "^$genome" ${output_dir}/Genomes_summary_info.tsv | cut -f 2 38 | 39 | done > ${tmp_dir}/Genome_labels_with_hits_to_${target}.tmp 40 | 41 | paste ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ${tmp_dir}/Genome_labels_with_hits_to_${target}.tmp > ${tmp_dir}/Genomes_with_hits_to_${target}.tsv 42 | 43 | fi 44 | 45 | ## if any, removing those not in final tree before making iToL file 46 | awk -F $'\t' ' $8 == "No" { print $1 } ' ${output_dir}/Genomes_summary_info.tsv | sort > ${tmp_dir}/sorted_genomes_to_leave_out_of_KO_iToL_files.tmp 47 | 48 | if [ -s ${tmp_dir}/sorted_genomes_to_leave_out_of_KO_iToL_files.tmp ]; then 49 | comm -23 <( sort ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ) ${tmp_dir}/sorted_genomes_to_leave_out_of_KO_iToL_files.tmp > ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp 50 | else 51 | cp ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp 52 | fi 53 | 54 | ## making iToL file for each target KO 55 | if [ $labels_swapped == 'true' ]; then 56 | for genome in $(cat ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp) 57 | do 58 | grep -m 1 -w "$genome" ${tmp_dir}/Genomes_with_hits_to_${target}.tsv 59 | done | cut -f 2 > ${tmp_dir}/genomes_for_iToL_for_${target}.tmp 60 | 61 | else 62 | for genome in $(cat ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp) 63 | do 64 | grep -m 1 -w "$genome" ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 65 | done > ${tmp_dir}/genomes_for_iToL_for_${target}.tmp 66 | fi 67 | 68 | printf "DATASET_STYLE\nSEPARATOR SPACE\nDATASET_LABEL $target\nCOLOR #0000ff\nDATA\n" > ${output_dir}/KO_search_results/iToL_files/${target}-iToL.txt 69 | 70 | cat <(sed 's/$/ branch node #0000ff 3 normal/' ${tmp_dir}/genomes_for_iToL_for_${target}.tmp) >> ${output_dir}/KO_search_results/iToL_files/${target}-iToL.txt 71 | 72 | else 73 | rm ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 74 | 75 | fi 76 | 77 | done 78 | -------------------------------------------------------------------------------- 
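Note on the iToL output these helpers produce: gtt-gen-KO-iToL-files.sh above (and gtt-gen-pfam-iToL-files.sh a little further below) assemble iToL "DATASET_STYLE" annotation files with printf and sed, writing a short header that names the dataset and its color, followed by one space-separated styling line per tree tip that had a hit to the target. The sketch below is a minimal, hypothetical Python illustration of that output format only; the function name write_itol_style_file, the example KO ID, and the example genome labels are made up for illustration and are not part of the repo, while the header fields and the "branch node #0000ff 3 normal" styling line mirror the printf and sed calls in the script above.

#!/usr/bin/env python
# minimal sketch (not part of the repo) of the iToL DATASET_STYLE files
# that gtt-gen-KO-iToL-files.sh builds with printf and sed

def write_itol_style_file(target, tip_labels, out_path, color="#0000ff"):
    # 'target' would be something like a KO or Pfam ID; 'tip_labels' are the
    # genome labels, as they appear in the tree, that had hits to that target
    with open(out_path, "w") as out:
        # header block, matching the script's printf
        out.write("DATASET_STYLE\nSEPARATOR SPACE\nDATASET_LABEL " + target + "\nCOLOR " + color + "\nDATA\n")
        # one styling line per genome, matching the script's: sed 's/$/ branch node #0000ff 3 normal/'
        for label in tip_labels:
            out.write(label + " branch node " + color + " 3 normal\n")

# example call with made-up inputs:
# write_itol_style_file("K02588", ["GCF_000195755.1", "GCF_000005845.2"], "K02588-iToL.txt")

The gtt-gen-itol-map helper that follows writes the related iToL "TREE_COLORS" format instead (tab-separated label and branch coloring lines) rather than DATASET_STYLE.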
/bin/gtt-gen-itol-map: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='This script is for creating a standard iToL "label" and/or "branch" color file when given the IDs of the genomes you want to color.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-g", "--target_genomes", help='Single-column file with the genomes to color (need to match the IDs in the tree file, with no "">")', action="store", dest="target_genomes", required=True) 11 | parser.add_argument("-w", "--what_to_color", help='What to color, must be: "branches", "labels", or "both" (default: "both")', action="store", dest="to_color", default="both") 12 | parser.add_argument("-c", "--color", help='Color to use of either: "blue", "green", or "red" (default: "blue", of course, \'cause it\'s the best)', action="store", dest="color", default="blue") 13 | parser.add_argument("-o", "--output_file", help='Output file for iToL (default: "iToL-colors.txt")', action="store", dest="output_file", default="iToL-colors.txt") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | if args.color == "blue": 22 | col = "#0000ff" 23 | elif args.color == "green": 24 | col = "#00a33f" 25 | elif args.color == "red": 26 | col = "#a30000" 27 | else: 28 | print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n") 29 | parser.print_help(sys.stderr) 30 | sys.exit(1) 31 | 32 | if args.to_color not in ["both", "branches", "labels"]: 33 | print("\n\tSorry, we're not prepared to handle \"" + str(args.to_color) + "\" as the argument for what to color... 
:(\n") 34 | parser.print_help(sys.stderr) 35 | sys.exit(1) 36 | 37 | target_list = [] 38 | 39 | with open(args.target_genomes, "r") as target_genomes: 40 | for genome in target_genomes: 41 | target_list.append(genome.strip()) 42 | 43 | out_file = open(args.output_file, "w") 44 | 45 | out_file.write("TREE_COLORS\nSEPARATOR TAB\nDATA\n\n") 46 | 47 | # writing lines for coloring labels if needed 48 | if args.to_color in ["both", "labels"]: 49 | 50 | for target in target_list: 51 | out_file.write(str(target) + "\tlabel\t" + str(col) + "\tbold\n") 52 | 53 | # writing lines for coloring branches if needed 54 | if args.to_color in ["both", "branches"]: 55 | 56 | for target in target_list: 57 | out_file.write(str(target) + "\tbranch\t" + str(col) + "\tnormal\t1.5\n") 58 | 59 | out_file.close() 60 | -------------------------------------------------------------------------------- /bin/gtt-gen-pfam-iToL-files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$1 10 | output_dir=$2 11 | 12 | ## setting variable holding whether or not any labels were swapped 13 | if grep -q label <(head -n 1 ${output_dir}/Genomes_summary_info.tsv); then 14 | labels_swapped='true' 15 | else 16 | labels_swapped='false' 17 | fi 18 | 19 | curr_target_line=0 20 | 21 | for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp) 22 | do 23 | 24 | curr_target_line=$(($curr_target_line + 1)) 25 | 26 | target_col=$(($curr_target_line + 2)) 27 | 28 | awk -F $'\t' -v col="$target_col" ' $col > 0 { print $1 } ' ${output_dir}/Pfam_search_results/Pfam-hit-counts.tsv | tail -n +2 > ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 29 | 30 | if [ -s ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ]; then 31 | 32 | if [ $labels_swapped == 'true' ]; then 33 | 34 | for genome in $(cat ${tmp_dir}/Genomes_with_hits_to_${target}.tmp) 35 | do 36 | 37 | grep -m1 "^$genome" ${output_dir}/Genomes_summary_info.tsv | cut -f 2 38 | 39 | done > ${tmp_dir}/Genome_labels_with_hits_to_${target}.tmp 40 | 41 | paste ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ${tmp_dir}/Genome_labels_with_hits_to_${target}.tmp > ${tmp_dir}/Genomes_with_hits_to_${target}.tsv 42 | 43 | fi 44 | 45 | ## if any, removing those not in final tree before making iToL file 46 | awk -F $'\t' ' $8 == "No" { print $1 } ' ${output_dir}/Genomes_summary_info.tsv | sort > ${tmp_dir}/sorted_genomes_to_leave_out_of_Pfam_iToL_files.tmp 47 | 48 | if [ -s ${tmp_dir}/sorted_genomes_to_leave_out_of_Pfam_iToL_files.tmp ]; then 49 | comm -23 <( sort ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ) ${tmp_dir}/sorted_genomes_to_leave_out_of_Pfam_iToL_files.tmp > ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp 50 | else 51 | cp ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp 52 | fi 53 | 54 | ## making iToL file for each additional target pfam 55 | if [ $labels_swapped == 'true' ]; then 56 | for genome in $(cat ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp) 57 | do 58 | grep -m 1 -w "$genome" ${tmp_dir}/Genomes_with_hits_to_${target}.tsv 59 | done | cut -f 2 > ${tmp_dir}/genomes_for_iToL_for_${target}.tmp 60 | 61 | else 62 | for genome in $(cat ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp) 63 | do 64 | grep -m 1 -w "$genome" ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 65 | done > ${tmp_dir}/genomes_for_iToL_for_${target}.tmp 66 | fi 67 | 68 | printf 
"DATASET_STYLE\nSEPARATOR SPACE\nDATASET_LABEL $target\nCOLOR #0000ff\nDATA\n" > ${output_dir}/Pfam_search_results/iToL_files/${target}-iToL.txt 69 | 70 | cat <(sed 's/$/ branch node #0000ff 3 normal/' ${tmp_dir}/genomes_for_iToL_for_${target}.tmp) >> ${output_dir}/Pfam_search_results/iToL_files/${target}-iToL.txt 71 | 72 | else 73 | rm ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 74 | 75 | fi 76 | 77 | done 78 | -------------------------------------------------------------------------------- /bin/gtt-genbank-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | num_cpus=$4 12 | hmm_target_genes_total=$5 13 | output_dir=$6 14 | best_hit_mode=$7 15 | additional_pfam_targets=$8 16 | ko_targets=$9 17 | target_KOs=${10} 18 | 19 | ### kill backstop 20 | # if there is a problem, all child processes launched (by this script) will exit immediately, 21 | # upon returning to main script, will check and terminate parent process 22 | if [ -s ${tmp_dir}/kill_genbank_parallel.prodigal ]; then 23 | exit 24 | fi 25 | 26 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 27 | if $(file $1 | grep -q "gzip"); then 28 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 29 | file_location=${1%.*} 30 | gunzip -f -c $1 > $file_location 31 | assembly="$(basename ${file_location%.*})" 32 | else 33 | file_location=$1 34 | assembly="$(basename ${1%.*})" 35 | was_gzipped=FALSE 36 | fi 37 | 38 | 39 | printf " -------------------------------------------------------------------------- \n\n" 40 | printf " Genome: ${GREEN}$assembly${NC}\n" 41 | 42 | # adding assembly to ongoing genomes list 43 | echo $assembly >> ${tmp_dir}/genbank_genomes_list.tmp 44 | 45 | # storing more info about the assembly if it's present in the genbank file: 46 | # checking for organism: 47 | if grep -q "ORGANISM" $file_location; then 48 | org_name=$(grep -m1 "ORGANISM" $file_location | tr -s " " | cut -f3- -d " " | tr "[ ./\\]" "_" | tr -s "_") 49 | else 50 | org_name="NA" 51 | fi 52 | 53 | if grep -q "strain=" $file_location; then 54 | strain=$(grep -m1 "strain=" $file_location | tr -s " " | cut -f 2 -d '"') 55 | else 56 | strain="NA" 57 | fi 58 | 59 | if grep -q "taxon" $file_location; then 60 | taxid=$(grep -m1 "taxon" $file_location | cut -f2 -d ":" | tr -d '"') 61 | else 62 | taxid="NA" 63 | fi 64 | 65 | # extracting AA coding sequences from genbank file 66 | gtt-genbank-to-AA-seqs -i $file_location -o ${tmp_dir}/${assembly}_genes2.tmp 2> /dev/null 67 | 68 | # checking that the file had CDS annotations 69 | if [ ! 
-s ${tmp_dir}/${assembly}_genes2.tmp ]; then 70 | 71 | printf "\n ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 72 | printf " This genbank file doesn't appear to have CDS annotations, so we are\n" 73 | printf " identifying coding sequences with prodigal.\n\n" 74 | 75 | printf " Reported in \"${output_dir}/run_files/Genbank_files_with_no_CDSs.txt\".\n" 76 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 77 | 78 | echo "$1" >> ${output_dir}/run_files/Genbank_files_with_no_CDSs.txt 79 | rm -rf ${tmp_dir}/${assembly}_genes2.tmp 80 | 81 | # pulling out full nucleotide fasta from genbank file 82 | gtt-genbank-to-fasta -i $file_location -o ${tmp_dir}/${assembly}_fasta.tmp 2> /dev/null 83 | 84 | # running prodigal 85 | echo "prodigal used" > ${tmp_dir}/prodigal_used # marking so can add to citations list reported at end 86 | prodigal -c -q -i ${tmp_dir}/${assembly}_fasta.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr 87 | 88 | if grep -q "at least 100000 bases for training." ${file_location}_prodigal.stderr; then 89 | printf "$assembly\n" >> ${tmp_dir}/kill_genbank_parallel.prodigal 90 | rm -rf ${file_location}_prodigal.stderr 91 | exit 92 | else 93 | rm -rf ${file_location}_prodigal.stderr 94 | fi 95 | 96 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 97 | 98 | fi 99 | 100 | ## removing gunzipped genome file if it was gunzipped 101 | if [ $was_gzipped == "TRUE" ]; then 102 | rm -rf $file_location 103 | fi 104 | 105 | 106 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 107 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes2.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes3.tmp 108 | 109 | ## renaming seqs to have assembly name 110 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes3.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 111 | 112 | ### running hmm search ### 113 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 114 | 115 | ### calculating % completion and redundancy ### 116 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 117 | do 118 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 119 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 120 | 121 | ## making list here of only those present in exactly 1 copy 122 | 123 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 124 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 125 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 126 | 127 | ## adding SCG-hit counts to table 128 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 129 | 130 | # total number of unique SCG hits 131 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 132 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' 
${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 133 | 134 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 135 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 136 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 137 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 138 | 139 | ### want to put an explicit notice out if estimated redundancy is greater than 10% 140 | # needs to be an integer for bash comparison, so multiplying by 100 first 141 | 142 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 143 | 144 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 145 | 146 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 147 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 148 | 149 | 150 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 151 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 152 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 153 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 154 | printf " You may want to consider taking a closer look and/or removing it from the\n" 155 | printf " from the input genomes.\n\n" 156 | 157 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 158 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 159 | 160 | # writing to table of genomes with questionable redundancy estimates 161 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 162 | 163 | else 164 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 165 | fi 166 | 167 | ## writing summary info to table ## 168 | printf "$assembly\t$1\t$taxid\t$org_name\t$strain\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Genbank_genomes_summary_info.tsv 169 | 170 | ### Pulling out hits for this genome ### 171 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 172 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 173 | 174 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 175 | if [ $best_hit_mode == "false" ]; then 176 | 177 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 178 | do 179 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 180 | done 181 | 182 | # if best-hit mode is on, taking best hit 183 | else 184 | 185 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 186 | do 187 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 188 | done 189 | 190 | fi 191 | 192 | ## searching for additional targets if provided 193 | # getting count of genes if there are additional targets 194 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 195 | 196 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 197 | 198 | fi 199 | 200 | ## KOs 201 | if [ $ko_targets == "true" ]; then 202 | 203 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 204 | 205 | fi 206 | 207 | ## Pfams 208 | if [ $additional_pfam_targets == "true" ]; then 209 | 210 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 211 | 212 | fi 213 | 214 | rm -rf ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_fasta.tmp ${tmp_dir}/${assembly}_genes1.tmp 215 | rm -rf ${tmp_dir}/${assembly}_genes3.tmp ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 216 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 217 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 218 | -------------------------------------------------------------------------------- /bin/gtt-genbank-serial.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | genbank_genomes_total=$4 12 | num_cpus=$5 13 | hmm_target_genes_total=$6 14 | output_dir=$7 15 | best_hit_mode=$8 16 | additional_pfam_targets=$9 17 | ko_targets=${10} 18 | target_KOs=${11} 19 | 20 | num=0 21 | 22 | rm -rf ${output_dir}/run_files/Genbank_files_with_no_CDSs.txt # deleting if file exists 23 | 24 | # looping through the lines of the provided [-g] file (this loop operates on one genome at a time) 25 | while IFS=$'\t' read -r 
-a file 26 | 27 | do 28 | 29 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 30 | if $(file $file | grep -q "gzip"); then 31 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 32 | file_location=${file%.*} 33 | gunzip -f -c $file > $file_location 34 | assembly="$(basename ${file_location%.*})" 35 | else 36 | file_location=$file 37 | assembly="$(basename ${file%.*})" 38 | was_gzipped=FALSE 39 | fi 40 | 41 | 42 | # adding assembly to ongoing genomes list 43 | echo $assembly >> ${tmp_dir}/genbank_genomes_list.tmp 44 | 45 | num=$((num+1)) # to track progress 46 | 47 | printf " -------------------------------------------------------------------------- \n" 48 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $genbank_genomes_total total.\n" 49 | printf " -------------------------------------------------------------------------- \n\n" 50 | 51 | # storing more info about the assembly if it's present in the genbank file: 52 | if grep -q "ORGANISM" $file_location; then 53 | org_name=$(grep -m1 "ORGANISM" $file_location | tr -s " " | cut -f3- -d " " | tr "[ ./\\]" "_" | tr -s "_") 54 | else 55 | org_name="NA" 56 | fi 57 | 58 | if grep -q "strain=" $file_location; then 59 | strain=$(grep -m1 "strain=" $file_location | tr -s " " | cut -f 2 -d '"') 60 | else 61 | strain="NA" 62 | fi 63 | 64 | if grep -q "taxon" $file_location; then 65 | taxid=$(grep -m1 "taxon" $file_location | cut -f2 -d ":" | tr -d '"') 66 | else 67 | taxid="NA" 68 | fi 69 | 70 | # extracting AA coding sequences from genbank file 71 | gtt-genbank-to-AA-seqs -i $file_location -o ${tmp_dir}/${assembly}_genes2.tmp 2> /dev/null 72 | 73 | # checking that the file had CDS annotations, if not running prodigal 74 | if [ ! -s ${tmp_dir}/${assembly}_genes2.tmp ]; then 75 | 76 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 77 | printf "\t This genbank file doesn't appear to have CDS annotations, so we are\n" 78 | printf "\t identifying coding sequences with prodigal.\n\n" 79 | 80 | printf "\t Reported in \"${output_dir}/run_files/Genbank_files_with_no_CDSs.txt\".\n" 81 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 82 | 83 | echo "$file" >> ${output_dir}/run_files/Genbank_files_with_no_CDSs.txt 84 | rm -rf ${tmp_dir}/${assembly}_genes2.tmp 85 | 86 | # pulling out full nucleotide fasta from genbank file 87 | gtt-genbank-to-fasta -i $file_location -o ${tmp_dir}/${assembly}_fasta.tmp 2> /dev/null 88 | 89 | # running prodigal 90 | echo "prodigal used" > ${tmp_dir}/prodigal_used # marking so can add to citations list reported at end 91 | prodigal -c -q -i ${tmp_dir}/${assembly}_fasta.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr 92 | 93 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 94 | printf "$assembly\n" >> ${tmp_dir}/kill_genbank_serial.prodigal 95 | rm -rf ${file_location}_prodigal.stderr 96 | exit 97 | else 98 | rm -rf ${file_location}_prodigal.stderr 99 | fi 100 | 101 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 102 | 103 | fi 104 | 105 | ## removing gunzipped genome file if it was gunzipped 106 | if [ $was_gzipped == "TRUE" ]; then 107 | rm -rf $file_location 108 | fi 109 | 110 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 111 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes2.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes3.tmp 112 | 113 | ## renaming seqs to have assembly name 114 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes3.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 115 | 116 | ### counting how many genes in this genome 117 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 118 | 119 | printf " Performing HMM search...\n" 120 | 121 | ### running hmm search ### 122 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 123 | 124 | ### calculating % completion and redundancy ### 125 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 126 | do 127 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 128 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 129 | 130 | ## making list here of only those present in exactly 1 copy 131 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 132 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 133 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 134 | 135 | 136 | ## adding SCG-hit counts to table 137 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 138 | 139 | # total number of unique SCG hits 140 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 141 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 142 | 143 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 144 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 145 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 146 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 147 | 148 | # want to put a notice out if estimated redundancy is greater than 10 149 | # needs to be an integer for bash comparison, so multiplying by 100 first 150 | 151 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 152 | 153 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 154 | 155 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 156 | 157 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 158 | 159 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 160 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 161 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 162 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 163 | printf " You may want to consider taking a closer look and/or removing it from the\n" 164 | printf " input genomes.\n\n" 165 | 166 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 167 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 168 | 169 | # writing to table of genomes with questionable redundancy estimates 170 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 171 | 172 | else 173 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 174 | 175 | fi 176 | 177 | 178 | ## writing summary info to table ## 179 | printf "$assembly\t$file\t$taxid\t$org_name\t$strain\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Genbank_genomes_summary_info.tsv 180 | 181 | ### Pulling out hits for this genome ### 182 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 183 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 184 | 185 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 186 | if [ $best_hit_mode == "false" ]; then 187 | 188 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 189 | do 190 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 191 | done 192 | 193 | # if best-hit mode is on, taking best hit 194 | else 195 | 196 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 197 | do 198 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 199 | done 200 | 201 | fi 202 | 203 | 204 | ## searching for additional targets if provided 205 | # getting count of genes if there are additional targets 206 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 207 | 208 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 209 | 210 | fi 211 | 212 | ## KOs 213 | if [ $ko_targets == "true" ]; then 214 | 215 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 216 | 217 | fi 218 | 219 | ## Pfams 220 | if [ $additional_pfam_targets == "true" ]; then 221 | 222 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 223 | 224 | fi 225 | 226 | rm -rf ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_fasta.tmp ${tmp_dir}/${assembly}_genes1.tmp
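# (editor's note, added for clarity: the "${assembly}_genes.tmp.ssi" file removed a couple of lines
#  below is the sequence index that `esl-sfetch --index` created above; e.g., indexing a hypothetical
#  "GCA_000123456.1_genes.tmp" would leave behind "GCA_000123456.1_genes.tmp.ssi" alongside it)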
227 | rm -rf ${tmp_dir}/${assembly}_genes3.tmp ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 228 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 229 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 230 | 231 | done < $1 232 | -------------------------------------------------------------------------------- /bin/gtt-genbank-to-AA-seqs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import re 6 | import sys 7 | 8 | parser = argparse.ArgumentParser(description="This script takes a genbank file and returns the amino acid sequences for all coding sequences.") 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-i", "--input_gb", help='input Genbank file (e.g. "*.gbk", "*.gb", "*.gbff")', action="store", dest="input_gb", required=True) 13 | parser.add_argument("-o", "--output_fasta", help='Output fasta file (default: "clean.faa")', action="store", dest="output_fasta", default="clean.faa") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | input_gb = open(args.input_gb, "r") 22 | 23 | output_fasta = open(args.output_fasta, "w") 24 | 25 | recs = [rec for rec in SeqIO.parse(input_gb, "genbank")] 26 | 27 | note_terms_to_exclude = ["frameshifted", "internal stop", "incomplete"] # dumping gene if noted as these in the "note" section of the call to keep only complete genes 28 | location_terms_to_exclude = ["join", "<", ">"] # dumping gene if "location" section contains any of these: "join" means the gene call spans multiple contigs; "<" or ">" means the gene call runs off a contig 29 | 30 | for rec in recs: 31 | 32 | genes = [gene for gene in rec.features if gene.type =="CDS"] # focusing on features annotated as "CDS" 33 | 34 | for gene in genes: 35 | 36 | location = str(gene.location) 37 | 38 | # dumping gene if "location" section contains any of these terms set above: "join" means the gene call spans multiple contigs; "<" or ">" means the gene call runs off a contig 39 | if any(exclusion_term in location for exclusion_term in location_terms_to_exclude): 40 | continue 41 | 42 | if "note" in gene.qualifiers: 43 | note = str(gene.qualifiers["note"][0]) 44 | 45 | # dumping gene if noted as any of these in the "note" section set above 46 | if any(exclusion_term in note for exclusion_term in note_terms_to_exclude): 47 | continue 48 | 49 | # dumping if overlapping translation frame 50 | if "transl_except" in gene.qualifiers: 51 | continue 52 | 53 | # dumping if noted a pseudo gene 54 | if "pseudo" in gene.qualifiers: 55 | continue 56 | 57 | # making gene header locus_tag if present. 
If not, building by contig name and gene coordinates 58 | if "locus_tag" in gene.qualifiers: 59 | header = str(gene.qualifiers["locus_tag"][0]) 60 | else: 61 | location = location.replace("[", "") 62 | location = re.sub('](.*)', '', location) 63 | location = location.split(":") 64 | start = location[0] 65 | end = location[1] 66 | 67 | header = str(rec.name) + "_" + str(start) + "_" + str(end) 68 | 69 | output_fasta.write(">" + str(header) + "\n" + str(gene.qualifiers["translation"][0]) + "\n") 70 | 71 | input_gb.close() 72 | output_fasta.close() 73 | -------------------------------------------------------------------------------- /bin/gtt-genbank-to-fasta: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import sys 6 | 7 | parser = argparse.ArgumentParser(description="This script takes a genbank file and outputs a flat fasta file of all nucleotides.") 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_gb", help='input Genbank file (e.g. "*.gbk", "*.gb", "*.gbff")', action="store", dest="input_gb", required=True) 12 | parser.add_argument("-o", "--output_fasta", help='Output fasta file with matching, simplified headers to be ready for `anvi-gen-contigs-db` (default: "clean.fa")', action="store", dest="output_fasta", default="clean.fa") 13 | 14 | if len(sys.argv)==1: 15 | parser.print_help(sys.stderr) 16 | sys.exit(0) 17 | 18 | args = parser.parse_args() 19 | 20 | input_gb = open(args.input_gb, "r") 21 | 22 | output_fasta = open(args.output_fasta, "w") 23 | 24 | recs = [rec for rec in SeqIO.parse(input_gb, "genbank")] 25 | 26 | for rec in recs: 27 | output_fasta.write(">" + rec.name + "\n" + str(rec.seq) + "\n") # writing out new fasta with clean headers ready for anvi'o 28 | 29 | input_gb.close() 30 | output_fasta.close() 31 | -------------------------------------------------------------------------------- /bin/gtt-get-additional-pfam-targets.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | tmp_dir=${1} 4 | output_dir=${2} 5 | 6 | # base_link="https://pfam.xfam.org/family/" 7 | # base link updated Oct-2022 when pfam hosting shifted to interpro 8 | base_link="https://www.ebi.ac.uk/interpro/wwwapi/entry/pfam/" 9 | 10 | # starting table of what was requested and what was found (version-wise, the latest is always pulled from Pfam when downloading as below) 11 | printf "requested_Pfam\tpulled_Pfam\n" > ${output_dir}/Pfam_search_results/info/Requested-and-pulled.tsv 12 | 13 | for initial_target in $(cat ${tmp_dir}/uniq_pfam_targets.tmp) 14 | do 15 | 16 | 17 | # getting target without version specified if there was one (including a version doesn't work anymore since pfam-hosting shifted to interpro) 18 | target=$(echo ${initial_target} | cut -f 1 -d ".") 19 | 20 | # --insecure flag added on 29-Nov-2020, due to pfam certificate being invalid (https://github.com/AstrobioMike/GToTree/issues/28) 21 | curl --insecure --silent --retry 10 -o ${tmp_dir}/${target}.hmm.gz "${base_link}${target}?annotation=hmm" 22 | gunzip ${tmp_dir}/${target}.hmm.gz 23 | 24 | if [ -s ${tmp_dir}/${target}.hmm ]; then 25 | # getting accession pulled (to account for current version on Pfam as compared to what was searched) 26 | actual_target=$(grep -m1 "^ACC" ${tmp_dir}/${target}.hmm | tr -s " " "\t" | cut -f 2) 27 | printf "$actual_target\n" >> ${tmp_dir}/actual_pfam_targets.tmp 28 | 29 | 
if [ $initial_target != $actual_target ]; then 30 | mv ${tmp_dir}/${target}.hmm ${tmp_dir}/${actual_target}.hmm 31 | fi 32 | 33 | cat ${tmp_dir}/${actual_target}.hmm >> ${tmp_dir}/all_pfam_targets.hmm 34 | 35 | # adding searched and pulled to info table (meaning which versions of a Pfam) 36 | printf "${initial_target}\t${actual_target}\n" >> ${output_dir}/Pfam_search_results/info/Requested-and-pulled.tsv 37 | 38 | else # aborting if any of the pfam targets couldn't be pulled successfully 39 | printf "\n ${RED}One of the target Pfams could not be successfully downloaded :(${NC}\n" 40 | printf "\n The problem child was ${target}.\n\n" 41 | printf "\nExiting for now.\n\n" 42 | 43 | rm -rf ${output_dir} 44 | # removing temp directory unless debug mode on 45 | if [ $debug_flag == 'false' ]; then 46 | rm -rf $tmp_dir 47 | fi 48 | 49 | exit 50 | 51 | fi 52 | 53 | done 54 | 55 | # starting the main results table which will have the following as its header: 56 | paste <(printf "assembly_id\ttotal_gene_count") <(printf %s "$(cat ${tmp_dir}/actual_pfam_targets.tmp | tr "\n" "\t")") > ${output_dir}/Pfam_search_results/Pfam-hit-counts.tsv 57 | 58 | # copying over the additional pfam-target hmms 59 | cp ${tmp_dir}/all_pfam_targets.hmm ${output_dir}/Pfam_search_results/target_Pfam_profiles/all-additional-pfam-targets.hmm 60 | -------------------------------------------------------------------------------- /bin/gtt-get-kofamscan-data: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | to download and setup the KOFamScan (https://github.com/takaram/kofam_scan) data files for use. 6 | 7 | For examples, please visit the GToTree wiki here: https://github.com/AstrobioMike/GToTree/wiki/example-usage 8 | """ 9 | 10 | import sys 11 | import os 12 | import urllib.request 13 | import argparse 14 | import shutil 15 | import textwrap 16 | import filecmp 17 | import tarfile 18 | import gzip 19 | 20 | parser = argparse.ArgumentParser(description="This is a helper program to setup the KOFamScan (github.com/takaram/kofam_scan) \ 21 | data files for use.", \ 22 | epilog="Ex. 
usage: gtt-get-kofamscan-data\n") 23 | 24 | args = parser.parse_args() 25 | 26 | 27 | ################################################################################ 28 | 29 | def main(): 30 | 31 | KO_data_dir = check_location_var_is_set() 32 | 33 | data_present = check_if_data_present(KO_data_dir) 34 | 35 | if data_present: 36 | exit() 37 | 38 | else: 39 | 40 | print(color_text(" Downloading required KO data (only needs to be done once)...\n", "yellow")) 41 | get_kofamscan_data(KO_data_dir) 42 | 43 | 44 | ################################################################################ 45 | 46 | 47 | # setting some colors 48 | tty_colors = { 49 | 'green' : '\033[0;32m%s\033[0m', 50 | 'yellow' : '\033[0;33m%s\033[0m', 51 | 'red' : '\033[0;31m%s\033[0m' 52 | } 53 | 54 | 55 | ### functions ### 56 | def color_text(text, color='green'): 57 | if sys.stdout.isatty(): 58 | return tty_colors[color] % text 59 | else: 60 | return text 61 | 62 | 63 | def wprint(text): 64 | print(textwrap.fill(text, width=80, initial_indent=" ", 65 | subsequent_indent=" ", break_on_hyphens=False)) 66 | 67 | 68 | def check_location_var_is_set(): 69 | 70 | # making sure there is a KO_data_dir env variable 71 | try: 72 | KO_data_dir = os.environ['KO_data_dir'] 73 | except: 74 | wprint(color_text("The environment variable 'KO_data_dir' does not seem to be set :(", "yellow")) 75 | wprint("This shouldn't happen, check on things with `gtt-data-locations check`.") 76 | print("") 77 | sys.exit(0) 78 | 79 | return(KO_data_dir) 80 | 81 | 82 | def check_stored_data_up_to_date(location): 83 | """ checks if the stored kofamscan data is the latest """ 84 | 85 | # getting latest version README 86 | kofamscan_current_readme = urllib.request.urlretrieve("ftp://ftp.genome.jp/pub/db/kofam/README", location + "README-latest") 87 | 88 | # comparing vs one that's present already 89 | if filecmp.cmp(location + "README-latest", location + "README"): 90 | os.remove(location + "README-latest") 91 | 92 | return(True) 93 | 94 | else: 95 | os.remove(location + "README-latest") 96 | print("") 97 | wprint(color_text("A newer version of the KOFamScan data is available, updating...", "yellow")) 98 | 99 | return(False) 100 | 101 | 102 | def check_if_data_present(location): 103 | 104 | # seeing if present already, and if so, if those are up-to-date 105 | # if this function returns True, then data is present and up-to-date 106 | # if it returns False, then we need to download things 107 | README_path = str(location) + "/README" 108 | ko_list_path = str(location) + "/ko_list" 109 | hmms_dir_path = str(location) + "/profiles/" 110 | 111 | if not os.path.isfile(README_path) or not os.path.isfile(ko_list_path) or not os.path.isdir(hmms_dir_path): 112 | 113 | if os.path.exists(README_path): 114 | os.remove(README_path) 115 | if os.path.exists(ko_list_path): 116 | os.remove(ko_list_path) 117 | if os.path.isdir(hmms_dir_path): 118 | shutil.rmtree(hmms_dir_path) 119 | 120 | return(False) 121 | 122 | else: 123 | 124 | # if here, checking if it is up-to-date (returns True/False), if present and up to date, returning True 125 | if check_stored_data_up_to_date(location): 126 | return(True) 127 | 128 | else: 129 | 130 | # removing current files 131 | if os.path.exists(README_path): 132 | os.remove(README_path) 133 | if os.path.exists(ko_list_path): 134 | os.remove(ko_list_path) 135 | if os.path.isdir(hmms_dir_path): 136 | shutil.rmtree(hmms_dir_path) 137 | 138 | return(False) 139 | 140 | 141 | def get_kofamscan_data(location): 142 | """ downloads the needed kofamscan 
data """ 143 | 144 | README_path = str(location) + "/README" 145 | ko_list_gz_path = str(location) + "/ko_list.gz" 146 | ko_list_path = str(location) + "/ko_list" 147 | hmms_tar_path = str(location) + "/profiles.tar.gz" 148 | 149 | urllib.request.urlretrieve("ftp://ftp.genome.jp/pub/db/kofam/README", README_path) 150 | urllib.request.urlretrieve("ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz", ko_list_gz_path) 151 | urllib.request.urlretrieve("ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz", hmms_tar_path) 152 | 153 | # decompressing ko_list file 154 | with gzip.open(ko_list_gz_path, 'rb') as f_in: 155 | with open(ko_list_path, 'wb') as f_out: 156 | shutil.copyfileobj(f_in, f_out) 157 | 158 | # removing gzipped ko_list 159 | os.remove(ko_list_gz_path) 160 | 161 | # unpacking profiles 162 | with tarfile.open(hmms_tar_path) as tarball: 163 | tarball.extractall(location) 164 | 165 | # removing tarball 166 | os.remove(hmms_tar_path) 167 | 168 | 169 | ################################################################################ 170 | 171 | if __name__ == "__main__": 172 | main() 173 | -------------------------------------------------------------------------------- /bin/gtt-get-median.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | sort -n $1 | awk ' 4 | BEGIN { 5 | counts = 0 6 | sum = 0 7 | } 8 | { 9 | values[counts++] = $1 10 | sum += $1 11 | } 12 | END { 13 | if ( NR == 2 ) { 14 | median = sum/2 15 | } else if ( (NR % 2) == 1 ) { 16 | median = values[ int(counts/2) ] 17 | } else { 18 | median = ( values[counts/2] + values[counts/2-1] ) / 2 19 | } 20 | print median; 21 | } 22 | ' -------------------------------------------------------------------------------- /bin/gtt-get-ncbi-assembly-tables: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | to download the NCBI assembly summary tables if they are not present, or are more than 4 weeks old. 6 | """ 7 | 8 | import sys 9 | import os 10 | import urllib.request 11 | import argparse 12 | import shutil 13 | import textwrap 14 | from datetime import date, timedelta 15 | import filecmp 16 | import tarfile 17 | import gzip 18 | 19 | parser = argparse.ArgumentParser(description="This is a helper program to download and setup the NCBI assembly summary tables if they are \ 20 | not present, or are older than 4 weeks.", \ 21 | epilog="Ex. 
usage: gtt-get-ncbi-assembly-tables\n") 22 | 23 | parser.add_argument("-P", "--use-http", help='Use http instead of ftp', action = "store_true") 24 | parser.add_argument("-f", "--force-update", help='Force an update regardless of last date retrieved', action = "store_true") 25 | 26 | 27 | args = parser.parse_args() 28 | 29 | 30 | ################################################################################ 31 | 32 | def main(): 33 | 34 | NCBI_assembly_data_dir = check_location_var_is_set() 35 | 36 | data_present = check_if_data_present_and_less_than_4_weeks_old(NCBI_assembly_data_dir) 37 | 38 | if data_present and not args.force_update: 39 | exit() 40 | 41 | else: 42 | 43 | get_NCBI_assembly_summary_data(NCBI_assembly_data_dir) 44 | 45 | ################################################################################ 46 | 47 | 48 | # setting some colors 49 | tty_colors = { 50 | 'green' : '\033[0;32m%s\033[0m', 51 | 'yellow' : '\033[0;33m%s\033[0m', 52 | 'red' : '\033[0;31m%s\033[0m' 53 | } 54 | 55 | 56 | ### functions ### 57 | def color_text(text, color='green'): 58 | if sys.stdout.isatty(): 59 | return tty_colors[color] % text 60 | else: 61 | return text 62 | 63 | 64 | def wprint(text): 65 | print(textwrap.fill(text, width=80, initial_indent=" ", 66 | subsequent_indent=" ", break_on_hyphens=False)) 67 | 68 | 69 | def check_location_var_is_set(): 70 | 71 | # making sure there is a KO_data_dir env variable 72 | try: 73 | NCBI_data_dir = os.environ['NCBI_assembly_data_dir'] 74 | except: 75 | wprint(color_text("The environment variable 'NCBI_assembly_data_dir' does not seem to be set :(", "yellow")) 76 | wprint("This shouldn't happen, check on things with `gtt-data-locations check`.") 77 | print("") 78 | sys.exit(0) 79 | 80 | return(NCBI_data_dir) 81 | 82 | 83 | def check_if_data_present_and_less_than_4_weeks_old(location): 84 | 85 | # seeing if present already and if it was downloaded less than 4 weeks ago 86 | # if this function returns True, then we don't do anything 87 | # if it returns False, then we need to download things 88 | table_path = os.path.join(str(location), "ncbi-assembly-info.tsv") 89 | date_retrieved_path = os.path.join(str(location), "date-retrieved.txt") 90 | 91 | # if either file is missing, we are going to download, we also package the date-retrieved file empty with conda to retain directory, so checking it's not empty as well 92 | if not os.path.isfile(table_path) or not os.path.isfile(date_retrieved_path) or not os.path.getsize(date_retrieved_path) > 0: 93 | 94 | if os.path.exists(table_path): 95 | os.remove(table_path) 96 | if os.path.isdir(date_retrieved_path): 97 | shutil.rmtree(date_retrieved_path) 98 | 99 | return(False) 100 | 101 | # if both files are present (and not empty), we are checking if it was downloaded more than 4 weeks ago 102 | # and will download if it was 103 | if os.path.isfile(table_path) and os.path.isfile(date_retrieved_path): 104 | 105 | # getting current date 106 | curr_date = date.today() 107 | 108 | # reading date it was downloaded 109 | with open(date_retrieved_path, 'r') as file: 110 | stored_date = file.read().strip() 111 | 112 | # setting to date object 113 | stored_date_list = stored_date.split(",") 114 | stored_date = date(int(stored_date_list[0]), int(stored_date_list[1]), int(stored_date_list[2])) 115 | 116 | # getting difference 117 | diff = curr_date - stored_date 118 | 119 | # checking if difference is greater than 28 days 120 | if diff.days > 28: 121 | 122 | return(False) 123 | 124 | else: 125 | 126 | return(True) 127 | 128 | 
else: 129 | 130 | return(True) 131 | 132 | 133 | def get_NCBI_assembly_summary_data(location): 134 | 135 | """ downloads the needed ncbi assembly summary tables and combines them """ 136 | 137 | # setting links 138 | if args.use_http: 139 | 140 | genbank_link = "https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt" 141 | refseq_link = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt" 142 | 143 | else: 144 | 145 | genbank_link = "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt" 146 | refseq_link = "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt" 147 | 148 | table_path = os.path.join(str(location), "ncbi-assembly-info.tsv") 149 | refseq_temp_path = os.path.join(str(location), "refseq-assembly-info.tmp") 150 | 151 | print(color_text(" Downloading NCBI assembly summaries (only done once, or updated after 4 weeks)...\n", "yellow")) 152 | 153 | urllib.request.urlretrieve(genbank_link, table_path) 154 | urllib.request.urlretrieve(refseq_link, refseq_temp_path) 155 | 156 | # combining 157 | with open (table_path, "a") as final_table: 158 | with open(refseq_temp_path, "r") as refseq: 159 | final_table.write(refseq.read()) 160 | 161 | # removing temp 162 | if os.path.exists(refseq_temp_path): 163 | os.remove(refseq_temp_path) 164 | 165 | # storing date retrieved 166 | date_retrieved = str(date.today()).replace("-", ",") 167 | date_retrieved.replace("-", ",") 168 | 169 | date_retrieved_path = os.path.join(str(location), "date-retrieved.txt") 170 | 171 | with open(date_retrieved_path, "w") as outfile: 172 | outfile.write(date_retrieved + "\n") 173 | 174 | ################################################################################ 175 | 176 | if __name__ == "__main__": 177 | main() 178 | -------------------------------------------------------------------------------- /bin/gtt-get-ncbi-tax-data: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | to download NCBI tax data for using TaxonKit (https://bioinf.shenwei.me/taxonkit/) to add NCBI taxonomy. 6 | 7 | For examples, please visit the GToTree wiki here: https://github.com/AstrobioMike/GToTree/wiki/example-usage 8 | """ 9 | 10 | import sys 11 | import os 12 | import urllib.request 13 | import argparse 14 | import shutil 15 | import textwrap 16 | import filecmp 17 | import tarfile 18 | import gzip 19 | 20 | parser = argparse.ArgumentParser(description="This is a helper program to setup NCBI tax data for TaxonKit (bioinf.shenwei.me/taxonkit/) \ 21 | to add taxonomy info.", \ 22 | epilog="Ex. 
usage: gtt-get-ncbi-tax-data\n") 23 | 24 | parser.add_argument("-P", "--use-http", help='Use http instead of ftp', action = "store_true") 25 | 26 | args = parser.parse_args() 27 | 28 | 29 | ################################################################################ 30 | 31 | def main(): 32 | 33 | NCBI_data_dir = check_location_var_is_set() 34 | 35 | data_present = check_if_data_present(NCBI_data_dir) 36 | 37 | if data_present: 38 | exit() 39 | 40 | else: 41 | 42 | print(color_text(" Downloading required NCBI taxonomy data (only needs to be done once)...\n", "yellow")) 43 | get_NCBI_tax_data(NCBI_data_dir) 44 | 45 | 46 | ################################################################################ 47 | 48 | 49 | # setting some colors 50 | tty_colors = { 51 | 'green' : '\033[0;32m%s\033[0m', 52 | 'yellow' : '\033[0;33m%s\033[0m', 53 | 'red' : '\033[0;31m%s\033[0m' 54 | } 55 | 56 | 57 | ### functions ### 58 | def color_text(text, color='green'): 59 | if sys.stdout.isatty(): 60 | return tty_colors[color] % text 61 | else: 62 | return text 63 | 64 | 65 | def wprint(text): 66 | print(textwrap.fill(text, width=80, initial_indent=" ", 67 | subsequent_indent=" ", break_on_hyphens=False)) 68 | 69 | 70 | def check_location_var_is_set(): 71 | 72 | # making sure there is a KO_data_dir env variable 73 | try: 74 | NCBI_data_dir = os.environ['TAXONKIT_DB'] 75 | except: 76 | wprint(color_text("The environment variable 'TAXONKIT_DB' does not seem to be set :(", "yellow")) 77 | wprint("This shouldn't happen, check on things with `gtt-data-locations check`.") 78 | print("") 79 | sys.exit(0) 80 | 81 | return(NCBI_data_dir) 82 | 83 | 84 | def check_if_data_present(location): 85 | 86 | # seeing if present already 87 | # if this function returns True, then data is present 88 | # if it returns False, then we need to download things 89 | names_path = os.path.join(str(location) + "/names.dmp") 90 | nodes_path = os.path.join(str(location) + "/nodes.dmp") 91 | 92 | 93 | if not os.path.isfile(names_path) or not os.path.isfile(nodes_path): 94 | 95 | if os.path.exists(names_path): 96 | os.remove(names_path) 97 | if os.path.isdir(nodes_path): 98 | shutil.rmtree(nodes_path) 99 | 100 | return(False) 101 | 102 | else: 103 | 104 | return(True) 105 | 106 | 107 | def get_NCBI_tax_data(location): 108 | """ downloads the needed ncbi tax data """ 109 | 110 | taxdump_path = os.path.join(str(location) + "taxdump.tar.gz") 111 | 112 | 113 | # setting links 114 | if args.use_http: 115 | 116 | taxdump_link = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" 117 | 118 | else: 119 | 120 | taxdump_link = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" 121 | 122 | urllib.request.urlretrieve(taxdump_link, taxdump_path) 123 | 124 | # unpacking 125 | with tarfile.open(taxdump_path) as tarball: 126 | tarball.extractall(location) 127 | 128 | # removing tarball 129 | os.remove(taxdump_path) 130 | 131 | 132 | ################################################################################ 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /bin/gtt-hmms: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | YELLOW='\033[0;33m' 6 | RED='\033[0;31m' 7 | NC='\033[0m' 8 | 9 | printf "\n${YELLOW} GToTree pre-packaged HMM SCG-sets\n${NC}" 10 | printf " See github.com/AstrobioMike/GToTree/wiki/SCG-sets for more info\n\n" 11 | 12 | ## making 
sure expected variable is set 13 | if [ -z ${GToTree_HMM_dir} ]; then 14 | # reporting it is not set 15 | printf "\n${YELLOW} The 'GToTree_HMM_dir' variable is not set :( Use \`gtt-data-locations\` to check and configure.\n${NC}\n" 16 | exit 17 | 18 | fi 19 | 20 | # now making sure directory exists or that we can create it if not 21 | if [ ! -d ${GToTree_HMM_dir} ]; then 22 | 23 | # attempting to create 24 | mkdir -p ${GToTree_HMM_dir} > /dev/null 25 | if [ $? -ne 0 ]; then 26 | printf "\n${YELLOW} The 'GToTree_HMM_dir' location does not exist and can't be created :( Use \`gtt-data-locations\` to check and configure.\n${NC}\n" 27 | exit 28 | fi 29 | 30 | fi 31 | 32 | # making sure it is writable 33 | if [ ! -w ${GToTree_HMM_dir} ]; then 34 | printf "\n${YELLOW} The 'GToTree_HMM_dir' location is not writable for you :( Use \`gtt-data-locations\` to check and configure.\n${NC}\n" 35 | exit 36 | fi 37 | 38 | 39 | ## now moving on to reporting the pre-packaged HMMs 40 | # making sure info table is there, or downloading if not 41 | if [ ! -f "${GToTree_HMM_dir}/hmm-sources-and-info.tsv" ]; then 42 | 43 | # downloading to there if not already present 44 | curl --silent --retry 10 -L -o ${GToTree_HMM_dir}/hmm-sources-and-info.tsv https://figshare.com/ndownloader/files/34066016 45 | 46 | fi 47 | 48 | # printing out info for pre-packaged HMMs 49 | num_hmm_files=$( tail -n +2 ${GToTree_HMM_dir}/hmm-sources-and-info.tsv | cut -f 1 | wc -l | sed "s/^ *//" ) 50 | 51 | printf " The environment variable ${GREEN}GToTree_HMM_dir${NC} is set to:\n" 52 | printf " $GToTree_HMM_dir\n\n" 53 | 54 | printf " The ${num_hmm_files} available pre-packaged HMM SCG-sets include:\n\n" 55 | 56 | for gene_set in $(tail -n +2 ${GToTree_HMM_dir}/hmm-sources-and-info.tsv | cut -f 1); do 57 | 58 | gene_set=$(echo $gene_set | sed 's/.hmm//') 59 | curr_number_of_genes=$(grep -m 1 -w "^${gene_set}" ${GToTree_HMM_dir}/hmm-sources-and-info.tsv | cut -f 4) 60 | 61 | printf "\t %-30s %14s\n" "${gene_set}" "($curr_number_of_genes genes)" 62 | 63 | done 64 | 65 | printf "\n" 66 | printf " Details can be found in: \n ${GToTree_HMM_dir}hmm-sources-and-info.tsv\n\n" 67 | -------------------------------------------------------------------------------- /bin/gtt-ncbi-parallel-nt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir="$2" 10 | hmm_file="$3" 11 | num_cpus="$4" 12 | hmm_target_genes_total="$5" 13 | output_dir="$6" 14 | best_hit_mode="$7" 15 | additional_pfam_targets="$8" 16 | http_flag="$9" 17 | ko_targets="${10}" 18 | target_KOs="${11}" 19 | debug_flag="${12}" 20 | 21 | assembly=$(echo "$1" | cut -f 1) 22 | downloaded_accession=$(echo "$1" | cut -f 2) 23 | 24 | # storing and building links 25 | if [ "$http_flag" == 'false' ]; then 26 | base_link=$(echo "$1" | cut -f 9) 27 | else 28 | base_link=$(echo "$1" | cut -f 9 | sed 's/^ftp/https/') 29 | fi 30 | 31 | # checking link was actually present (sometimes, very rarely, it is not there) 32 | # if not there, attempting to build ourselves 33 | if [ $base_link == "na" ] || [ -z $base_link ]; then 34 | 35 | if [ "$http_flag" == 'false' ]; then 36 | p1=$(printf "ftp://ftp.ncbi.nlm.nih.gov/genomes/all") 37 | else 38 | p1=$(printf "https://ftp.ncbi.nlm.nih.gov/genomes/all") 39 | fi 40 | 41 | # checking if GCF or GCA 42 | if [[ $assembly == "GCF"* ]]; then 43 | p2="GCF" 44 | else 45 | p2="GCA" 46 | fi 
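# (editor's note, an illustrative sketch of the fallback link built just below, using a hypothetical
#  accession GCA_000123456.1 with assembly name ASM123v1: p3/p4/p5 come out as 000/123/456, so the
#  constructed base_link would end in ".../genomes/all/GCA/000/123/456/GCA_000123456.1_ASM123v1")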
47 | 48 | p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3) 49 | p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6) 50 | p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9) 51 | 52 | ass_name=$(echo "$1" | cut -f 3) 53 | end_path=$(paste -d "_" <(echo "$assembly") <(echo "$ass_name")) 54 | 55 | base_link=$(paste -d "/" <(echo "$p1") <(echo "$p2") <(echo "$p3") <(echo "$p4") <(echo "$p5") <(echo "$end_path")) 56 | 57 | else 58 | 59 | end_path=$(basename $base_link) 60 | 61 | fi 62 | 63 | printf " -------------------------------------------------------------------------- \n\n" 64 | printf " Genome: ${GREEN}$assembly${NC}\n" 65 | 66 | # attempting to download genome fasta 67 | curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genome.tmp.gz "${base_link}/${end_path}_genomic.fna.gz" 68 | 69 | # if http, then it pulls down a file still, it just isn't gzipped 70 | # if ftp, no file is pulled down 71 | # so to cover both cases, just making this need to be present and gzipped 72 | if $(file ${tmp_dir}/${assembly}_genome.tmp.gz | grep -q gzip); then 73 | 74 | gunzip -f ${tmp_dir}/${assembly}_genome.tmp.gz 75 | 76 | prodigal -c -q -i ${tmp_dir}/${assembly}_genome.tmp -a ${tmp_dir}/${assembly}_genes1.faa.tmp -d ${tmp_dir}/${assembly}_genes1.fa.tmp > /dev/null 77 | 78 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.faa.tmp > ${tmp_dir}/${assembly}_genes2.faa.tmp 79 | 80 | ## renaming seqs to have assembly name to avoid problems with odd characters and how hmmer parses and such 81 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.faa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.faa.tmp 82 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.fa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.fa.tmp 83 | 84 | # storing more info about the assembly to write out into ncbi-derived-genome summary file (for each setting to NA if not found) 85 | ass_name=$(echo "$1" | cut -f 3) 86 | if [ -z "$ass_name" ]; then ass_name="NA"; fi 87 | org_name=$(echo "$1" | cut -f 5) 88 | if [ -z "$org_name" ]; then org_name="NA"; fi 89 | infraspecific_name=$(echo "$1" | cut -f 6) 90 | if [ -z "$infraspecific_name" ]; then infraspecific_name="NA"; fi 91 | taxid=$(echo "$1" | cut -f 4) 92 | if [ -z "$taxid" ]; then taxid="NA"; fi 93 | version_status=$(echo "$1" | cut -f 7) 94 | if [ -z "$version_status" ]; then version_status="NA"; fi 95 | asm_level=$(echo "$1" | cut -f 8) 96 | if [ -z "$asm_level" ]; then asm_level="NA"; fi 97 | 98 | ### counting how many genes in this genome 99 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes3.faa.tmp) 100 | 101 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 102 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.faa.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.faa.tmp 103 | # don't need to filter the nucleotide fasta, as we will only be looking for some found in the amino-acid fasta 104 | 105 | ### running hmm search ### 106 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.faa.tmp > /dev/null 107 | 108 | ### calculating % completion and redundancy ### 109 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 110 | do 111 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 112 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 113 | 114 | ## making list here of only those present in exactly 1 copy, to get count 
of "unique" SCG-hits 115 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 116 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 117 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 118 | 119 | ## adding SCG-hit counts to table 120 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 121 | 122 | # total number of unique SCG hits 123 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 124 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 125 | 126 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 127 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 128 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 129 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 130 | 131 | # want to put a notice out if estimated redundancy is greater than 10 132 | # needs to be an integer for bash comparison, so multiplying by 100 first 133 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 134 | 135 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 136 | 137 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 138 | 139 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 140 | 141 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 142 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 143 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 144 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 145 | printf " You may want to consider taking a closer look and/or removing it from the\n" 146 | printf " from the input genomes.\n\n" 147 | 148 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 149 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 150 | 151 | # writing to table of genomes with questionable redundancy estimates 152 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 153 | 154 | else 155 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 156 | 157 | fi 158 | 159 | ## writing summary info to table ## 160 | printf "$assembly\t$downloaded_accession\t$ass_name\t$taxid\t$org_name\t$infraspecific_name\t$version_status\t$asm_level\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/NCBI_genomes_summary_info.tsv 161 | 162 | ### Pulling out hits for this genome (nucleotide as specified by user) ### 163 | target_genes_suffix="_genes.fa.tmp" 164 | 165 | # indexing 166 | esl-sfetch --index ${tmp_dir}/${assembly}${target_genes_suffix} > /dev/null 167 | 168 | # looping through and pulling out each first hit (hmm results tab is sorted by e-value): 169 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 170 | if [ $best_hit_mode == "false" ]; then 171 | 172 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 173 | do 174 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 175 | done 176 | 177 | # if best-hit mode is on, taking best hit 178 | else 179 | 180 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 181 | do 182 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 183 | done 184 | 185 | fi 186 | 187 | 188 | ## searching for additional targets if provided 189 | # getting count of genes if there are additional targets 190 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 191 | 192 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.faa.tmp) 193 | 194 | fi 195 | 196 | ## KOs 197 | if [ $ko_targets == "true" ]; then 198 | 199 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 200 | 201 | fi 202 | 203 | ## Pfams 204 | if [ $additional_pfam_targets == "true" ]; then 205 | 206 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 207 | 208 | fi 209 | 210 | if [ $debug_flag == "true" ]; then 211 | if [ -s ${tmp_dir}/${assembly}_genes2.faa.tmp ]; then 212 | mv ${tmp_dir}/${assembly}_genes2.faa.tmp ${tmp_dir}/ncbi-downloads/${assembly}_protein.faa 213 | fi 214 | if [ -s ${tmp_dir}/${assembly}_genes1.fa.tmp ]; then 215 | mv ${tmp_dir}/${assembly}_genes1.fa.tmp ${tmp_dir}/ncbi-downloads/${assembly}_cds.fa 216 | fi 217 | if [ -s ${tmp_dir}/${assembly}_genome.tmp ]; then 218 | mv ${tmp_dir}/${assembly}_genome.tmp ${tmp_dir}/ncbi-downloads/${assembly}_genomic.fna 219 | fi 220 | fi 221 | 222 | rm -rf ${tmp_dir}/${assembly}_genes*.tmp* 223 | rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 224 | rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 225 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 226 | 227 | else 228 | printf " ${ORANGE}******************************* ${NC}NOTICE ${ORANGE}*******************************${NC} \n" 229 | printf "\t $assembly's genome not successfully downloaded :(\n\n" 230 | printf "\t Reported 
in \"${output_dir}/run_files/NCBI_accessions_not_downloaded.txt\"\n" 231 | printf " ${ORANGE}************************************************************************ ${NC}\n" 232 | rm -rf ${tmp_dir}/${assembly}_genome.tmp.gz 233 | 234 | sleep 2 235 | 236 | echo $assembly >> ${output_dir}/NCBI_accessions_not_downloaded.txt 237 | 238 | fi 239 | -------------------------------------------------------------------------------- /bin/gtt-ncbi-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir="$2" 10 | hmm_file="$3" 11 | num_cpus="$4" 12 | hmm_target_genes_total="$5" 13 | output_dir="$6" 14 | best_hit_mode="$7" 15 | additional_pfam_targets="$8" 16 | http_flag="$9" 17 | ko_targets="${10}" 18 | target_KOs="${11}" 19 | debug_flag="${12}" 20 | 21 | assembly=$(echo "$1" | cut -f 1) 22 | downloaded_accession=$(echo "$1" | cut -f 2) 23 | 24 | # storing and building links 25 | if [ "$http_flag" == 'false' ]; then 26 | base_link=$(echo "$1" | cut -f 9) 27 | else 28 | base_link=$(echo "$1" | cut -f 9 | sed 's/^ftp/https/') 29 | fi 30 | 31 | # checking link was actually present (sometimes, very rarely, it is not there) 32 | # if not there, attempting to build ourselves 33 | if [ $base_link == "na" ] || [ -z $base_link ]; then 34 | 35 | if [ "$http_flag" == 'false' ]; then 36 | p1=$(printf "ftp://ftp.ncbi.nlm.nih.gov/genomes/all") 37 | else 38 | p1=$(printf "https://ftp.ncbi.nlm.nih.gov/genomes/all") 39 | fi 40 | 41 | # checking if GCF or GCA 42 | if [[ $assembly == "GCF"* ]]; then 43 | p2="GCF" 44 | else 45 | p2="GCA" 46 | fi 47 | 48 | p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3) 49 | p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6) 50 | p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9) 51 | 52 | ass_name=$(echo "$1" | cut -f 3) 53 | end_path=$(paste -d "_" <(echo "$assembly") <(echo "$ass_name")) 54 | 55 | base_link=$(paste -d "/" <(echo "$p1") <(echo "$p2") <(echo "$p3") <(echo "$p4") <(echo "$p5") <(echo "$end_path")) 56 | 57 | else 58 | 59 | end_path=$(basename $base_link) 60 | 61 | fi 62 | 63 | printf " -------------------------------------------------------------------------- \n\n" 64 | printf " Genome: ${GREEN}$assembly${NC}\n" 65 | 66 | curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genes2.tmp.gz "${base_link}/${end_path}_protein.faa.gz" 67 | 68 | if $(file ${tmp_dir}/${assembly}_genes2.tmp.gz | grep -q gzip); then 69 | gunzip -f ${tmp_dir}/${assembly}_genes2.tmp.gz 70 | # renaming headers to avoid problems with odd characters and how hmmer parses and such 71 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.tmp 72 | 73 | else # trying to get assembly if there were no gene annotations available 74 | curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genome.tmp.gz "${base_link}/${end_path}_genomic.fna.gz" 75 | 76 | if [ -s ${tmp_dir}/${assembly}_genome.tmp.gz ]; then 77 | 78 | gunzip -f ${tmp_dir}/${assembly}_genome.tmp.gz 79 | 80 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 81 | printf " $assembly doesn't appear to have gene annotations.\n\n" 82 | printf " Downloaded the genome and identifying CDSs with prodigal.\n" 83 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 84 | 85 | printf " Getting 
coding seqs...\n\n" 86 | echo "prodigal used" > ${tmp_dir}/prodigal_used # marking so can add to citations list reported at end 87 | prodigal -c -q -i ${tmp_dir}/${assembly}_genome.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 88 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 89 | 90 | ## renaming seqs to have assembly name 91 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.tmp 92 | fi 93 | fi 94 | 95 | if [ -s ${tmp_dir}/${assembly}_genes3.tmp ]; then 96 | 97 | # storing more info about the assembly to write out into ncbi-derived-genome summary file (for each setting to NA if not found) 98 | ass_name=$(echo "$1" | cut -f 3) 99 | if [ -z "$ass_name" ]; then ass_name="NA"; fi 100 | org_name=$(echo "$1" | cut -f 5) 101 | if [ -z "$org_name" ]; then org_name="NA"; fi 102 | infraspecific_name=$(echo "$1" | cut -f 6) 103 | if [ -z "$infraspecific_name" ]; then infraspecific_name="NA"; fi 104 | taxid=$(echo "$1" | cut -f 4) 105 | if [ -z "$taxid" ]; then taxid="NA"; fi 106 | version_status=$(echo "$1" | cut -f 7) 107 | if [ -z "$version_status" ]; then version_status="NA"; fi 108 | asm_level=$(echo "$1" | cut -f 8) 109 | if [ -z "$asm_level" ]; then asm_level="NA"; fi 110 | 111 | 112 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 113 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.tmp 114 | 115 | 116 | ### running hmm search ### 117 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 118 | 119 | ### calculating % completion and redundancy ### 120 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 121 | do 122 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 123 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 124 | 125 | ## making list here of only those present in exactly 1 copy 126 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 127 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 128 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 129 | 130 | ## adding SCG-hit counts to table 131 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 132 | 133 | # total number of unique SCG hits 134 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 135 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 136 | 137 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 138 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 139 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 140 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 141 | 142 | ### want to put an explicit notice out if estimated redundancy is greater than 10% 143 | # needs to be an integer for bash comparison, so multiplying by 100 first 
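# (editor's note, a worked example with made-up numbers: if 74 of 74 target genes are found and 4 of
#  them occur twice, then num_SCG_hits=74 and num_SCG_redund=4, giving perc_comp_rnd=100.00 and
#  perc_redund_rnd=5.41; 5.41 * 100 -> 541, which is below the 1000 (i.e., 10%) cutoff checked
#  below, so no redundancy notice would be printed for that genome)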
144 | 145 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 146 | 147 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 148 | 149 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 150 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 151 | 152 | 153 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 154 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 155 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 156 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 157 | printf " You may want to consider taking a closer look and/or removing it from the\n" 158 | printf " input genomes.\n\n" 159 | 160 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 161 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 162 | 163 | # writing to table of genomes with questionable redundancy estimates 164 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 165 | 166 | else 167 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 168 | fi 169 | 170 | ## writing summary info to table ## 171 | printf "$assembly\t$downloaded_accession\t$ass_name\t$taxid\t$org_name\t$infraspecific_name\t$version_status\t$asm_level\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/NCBI_genomes_summary_info.tsv 172 | 173 | ### Pulling out hits for this genome ### 174 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 175 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 176 | 177 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 178 | if [ $best_hit_mode == "false" ]; then 179 | 180 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 181 | do 182 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 183 | done 184 | 185 | # if best-hit mode is on, taking best hit 186 | else 187 | 188 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 189 | do 190 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 191 | done 192 | 193 | fi 194 | 195 | ## searching for additional targets if provided 196 | # getting count of genes if there are additional targets 197 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 198 | 199 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 200 | 201 | fi 202 | 203 | ## KOs 204 | if [ $ko_targets == "true" ]; then 205 | 206 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 207 | 208 | fi 209 | 
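# (editor's note, for reference: the positional arguments handed to gtt-run-kofamscan.sh above are,
#  in order, the assembly ID, its filtered amino-acid fasta, its gene count, the target-KOs results
#  table, the directory holding the target KO profiles, the temp directory, the output directory,
#  and the requested KO targets)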
210 | ## Pfams 211 | if [ $additional_pfam_targets == "true" ]; then 212 | 213 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 214 | 215 | fi 216 | 217 | if [ $debug_flag == "true" ]; then 218 | printf "\n\n Debug mode on, keeping intermediate files.\n\n" 219 | if [ -s ${tmp_dir}/${assembly}_genes2.tmp ]; then 220 | printf " Keeping ${tmp_dir}/${assembly}_genes2.tmp\n" 221 | mv ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/ncbi-downloads/${assembly}_protein.faa 222 | fi 223 | if [ -s ${tmp_dir}/${assembly}_genome.tmp ]; then 224 | printf " Keeping ${tmp_dir}/${assembly}_genome.tmp\n" 225 | mv ${tmp_dir}/${assembly}_genome.tmp ${tmp_dir}/ncbi-downloads/${assembly}_genomic.fna 226 | fi 227 | fi 228 | 229 | rm -rf ${tmp_dir}/${assembly}_genes3.tmp ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_genes1.tmp 230 | rm -rf ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 231 | rm -rf ${tmp_dir}/${assembly}_genes.tmp.ssi ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 232 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 233 | 234 | else 235 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 236 | printf " $assembly did not download properly :(\n\n" 237 | printf " Reported in \"${output_dir}/run_files/NCBI_accessions_not_downloaded.txt\"\n" 238 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 239 | rm -rf ${tmp_dir}/${assembly}_report1.tmp ${tmp_dir}/${assembly}_genes.tmp.gz 240 | sleep 3 241 | echo $assembly >> ${output_dir}/NCBI_accessions_not_downloaded.txt 242 | fi 243 | -------------------------------------------------------------------------------- /bin/gtt-ncbi-serial-nt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir="$2" 10 | hmm_file="$3" 11 | NCBI_remaining_genomes_total="$4" 12 | num_cpus="$5" 13 | hmm_target_genes_total="$6" 14 | output_dir="$7" 15 | best_hit_mode="$8" 16 | additional_pfam_targets="$9" 17 | http_flag="${10}" 18 | ko_targets="${11}" 19 | target_KOs="${12}" 20 | debug_flag="${13}" 21 | 22 | num=0 23 | 24 | while IFS=$'\t' read -r -a curr_line 25 | 26 | do 27 | 28 | assembly="${curr_line[0]}" 29 | downloaded_accession="${curr_line[1]}" 30 | num=$((num+1)) 31 | 32 | printf " -------------------------------------------------------------------------- \n" 33 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $NCBI_remaining_genomes_total total.\n" 34 | printf " -------------------------------------------------------------------------- \n\n" 35 | 36 | # storing and building links 37 | if [ "$http_flag" == 'false' ]; then 38 | base_link="${curr_line[8]}" 39 | else 40 | base_link=$(echo ${curr_line[8]} | sed 's/^ftp/https/') 41 | fi 42 | 43 | # checking link was actually present (sometimes, very rarely, it is not there) 44 | # if not there, attempting to build ourselves 45 | if [ $base_link == "na" ] || [ -z $base_link ]; then 46 | 47 | if [ "$http_flag" == 'false' ]; then 48 | p1=$(printf "ftp://ftp.ncbi.nlm.nih.gov/genomes/all") 49 | else 50 | p1=$(printf "https://ftp.ncbi.nlm.nih.gov/genomes/all") 51 | fi 52 | 53 | # checking if GCF or GCA 54 | if [[ $assembly 
== "GCF"* ]]; then 55 | p2="GCF" 56 | else 57 | p2="GCA" 58 | fi 59 | 60 | p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3) 61 | p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6) 62 | p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9) 63 | 64 | ass_name="${curr_line[2]}" 65 | end_path=$(paste -d "_" <(echo "$assembly") <(echo "$ass_name")) 66 | 67 | base_link=$(paste -d "/" <(echo "$p1") <(echo "$p2") <(echo "$p3") <(echo "$p4") <(echo "$p5") <(echo "$end_path")) 68 | 69 | else 70 | 71 | end_path=$(basename $base_link) 72 | 73 | fi 74 | 75 | # attempting to download genome fasta 76 | printf " Downloading genome fasta...\n\n" 77 | curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genome.tmp.gz "${base_link}/${end_path}_genomic.fna.gz" 78 | 79 | # if http, then it pulls down a file still, it just isn't gzipped 80 | # if ftp, no file is pulled down 81 | # so to cover both cases, just making this need to be present and gzipped 82 | if $(file ${tmp_dir}/${assembly}_genome.tmp.gz | grep -q gzip); then 83 | 84 | gunzip -f ${tmp_dir}/${assembly}_genome.tmp.gz 85 | 86 | printf " Getting coding seqs...\n\n" 87 | 88 | prodigal -c -q -i ${tmp_dir}/${assembly}_genome.tmp -a ${tmp_dir}/${assembly}_genes1.faa.tmp -d ${tmp_dir}/${assembly}_genes1.fa.tmp > /dev/null 89 | 90 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.faa.tmp > ${tmp_dir}/${assembly}_genes2.faa.tmp 91 | 92 | ## renaming seqs to have assembly name to avoid problems with odd characters and how hmmer parses and such 93 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.faa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.faa.tmp 94 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.fa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.fa.tmp 95 | 96 | 97 | # storing more info about the assembly to write out into ncbi-derived-genome summary file (for each setting to NA if not found) 98 | ass_name="${curr_line[2]}" 99 | if [ -z "$ass_name" ]; then ass_name="NA"; fi 100 | org_name="${curr_line[4]}" 101 | if [ -z "$org_name" ]; then org_name="NA"; fi 102 | infraspecific_name="${curr_line[5]}" 103 | if [ -z "$infraspecific_name" ]; then infraspecific_name="NA"; fi 104 | taxid="${curr_line[3]}" 105 | if [ -z "$taxid" ]; then taxid="NA"; fi 106 | version_status="${curr_line[6]}" 107 | if [ -z "$version_status" ]; then version_status="NA"; fi 108 | asm_level="${curr_line[7]}" 109 | if [ -z "$asm_level" ]; then asm_level="NA"; fi 110 | 111 | 112 | ### counting how many genes in this genome 113 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes3.faa.tmp) 114 | 115 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 116 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.faa.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.faa.tmp 117 | # don't need to filter the nucleotide fasta, as we will only be looking for some found in the amino-acid fasta 118 | 119 | printf " Performing HMM search...\n" 120 | 121 | ### running hmm search ### 122 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.faa.tmp > /dev/null 123 | 124 | ### calculating % completion and redundancy ### 125 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 126 | do 127 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 128 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 129 | 130 | ## making list here of only 
those present in exactly 1 copy, to get count of "unique" SCG-hits 131 |         paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 132 |         awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 133 |         uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 134 | 135 |         ## adding SCG-hit counts to table 136 |         paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 137 | 138 |         # total number of unique SCG hits 139 |         num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 140 |         num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 141 | 142 |         perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 143 |         perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 144 |         perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 145 |         perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 146 | 147 |         # want to put a notice out if estimated redundancy is greater than 10 148 |         # needs to be an integer for bash comparison, so multiplying by 100 first 149 |         mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 150 | 151 |         printf "    Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 152 | 153 |         if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 154 | 155 |             printf "        Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 156 | 157 |             printf "     ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 158 |             printf "      Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 159 |             printf "      While there are no \"golden\" cutoff values for these things, typically\n" 160 |             printf "      going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 161 |             printf "      You may want to consider taking a closer look and/or removing it from the\n" 162 |             printf "      input genomes.\n\n" 163 | 164 |             printf "      Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 165 |             printf "     ${ORANGE}****************************************************************************${NC} \n\n" 166 | 167 |             # writing to table of genomes with questionable redundancy estimates 168 |             printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 169 | 170 |         else 171 |             printf "        Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 172 | 173 | fi 174 | 175 | ## writing summary info to table ## 176 | printf "$assembly\t$downloaded_accession\t$ass_name\t$taxid\t$org_name\t$infraspecific_name\t$version_status\t$asm_level\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/NCBI_genomes_summary_info.tsv 177 | 178 | ### Pulling out hits for this genome (nucleotide as specified by user) ### 179 | target_genes_suffix="_genes.fa.tmp" 180 | 181 | # indexing 182 | esl-sfetch --index ${tmp_dir}/${assembly}${target_genes_suffix} > /dev/null 183 | 184 | # looping through and pulling out each first hit (hmm results tab is sorted by e-value): 185 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 186 | if [ $best_hit_mode == "false" ]; then 187 | 188 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 189 | do 190 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 191 | done 192 | 193 | # if best-hit mode is on, taking best hit 194 | else 195 | 196 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 197 | do 198 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 199 | done 200 | 201 | fi 202 | 203 | 204 | ## searching for additional targets if provided 205 | # getting count of genes if there are additional targets 206 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 207 | 208 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.faa.tmp) 209 | 210 | fi 211 | 212 | ## KOs 213 | if [ $ko_targets == "true" ]; then 214 | 215 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 216 | 217 | fi 218 | 219 | ## Pfams 220 | if [ $additional_pfam_targets == "true" ]; then 221 | 222 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 223 | 224 | fi 225 | 226 | if [ $debug_flag == "true" ]; then 227 | if [ -s ${tmp_dir}/${assembly}_genes2.faa.tmp ]; then 228 | mv ${tmp_dir}/${assembly}_genes2.faa.tmp ${tmp_dir}/ncbi-downloads/${assembly}_protein.faa 229 | fi 230 | if [ -s ${tmp_dir}/${assembly}_genes1.fa.tmp ]; then 231 | mv ${tmp_dir}/${assembly}_genes1.fa.tmp ${tmp_dir}/ncbi-downloads/${assembly}_cds.fa 232 | fi 233 | if [ -s ${tmp_dir}/${assembly}_genome.tmp ]; then 234 | mv ${tmp_dir}/${assembly}_genome.tmp ${tmp_dir}/ncbi-downloads/${assembly}_genomic.fna 235 | fi 236 | fi 237 | 238 | rm -rf ${tmp_dir}/${assembly}_genes*.tmp* 239 | rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 240 | rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 241 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 242 | 243 | else 244 | printf " ${ORANGE}******************************* ${NC}NOTICE ${ORANGE}*******************************${NC} \n" 245 | printf "\t $assembly's genome not successfully downloaded :(\n\n" 246 | printf "\t Reported 
in \"${output_dir}/run_files/NCBI_accessions_not_downloaded.txt\"\n" 247 | printf " ${ORANGE}************************************************************************ ${NC}\n" 248 | rm -rf ${tmp_dir}/${assembly}_genome.tmp.gz 249 | 250 | sleep 2 251 | 252 | echo $assembly >> ${output_dir}/NCBI_accessions_not_downloaded.txt 253 | 254 | fi 255 | 256 | done < $1 257 | -------------------------------------------------------------------------------- /bin/gtt-parse-assembly-summary-file: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='This script is for parsing NCBI\'s assembly summary file down to the provided accessions.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-a", "--assembly_summary", help="NCBI's assembly summary file", action="store", dest="all_assemblies", required=True) 11 | required.add_argument("-w", "--wanted_accessions", help="Single-column file with wanted accessions", action="store", dest="wanted_accs", required=True) 12 | parser.add_argument("-o", "--output_file", help='Wanted summary info only (default: "Wanted.tsv")', action="store", dest="output_file", default="Wanted.tsv") 13 | 14 | if len(sys.argv)==1: 15 | parser.print_help(sys.stderr) 16 | sys.exit(0) 17 | 18 | args = parser.parse_args() 19 | 20 | wanted_dict = {} 21 | 22 | with open(args.wanted_accs, "r") as wanted_accs: 23 | for line in wanted_accs: 24 | root_acc = line.strip().split(".")[0] 25 | wanted_dict[str(root_acc)] = line.strip() 26 | 27 | out_file = open(args.output_file, "w") 28 | 29 | # out_file.write("input_accession\tdownloaded_accession\tassembly_name\ttaxid\torganism_name\tinfraspecific_name\tversion_status\tassembly_level\tftp_path\n") 30 | 31 | with open(args.all_assemblies) as assemblies: 32 | for line in assemblies: 33 | line = line.split("\t") 34 | if line[0].split(".")[0] in wanted_dict: 35 | 36 | dl_acc = str(line[0]) 37 | if not dl_acc: 38 | dl_acc = "NA" 39 | 40 | ass_name = str(line[15]) 41 | if not ass_name: 42 | ass_name = "NA" 43 | 44 | taxid = str(line[5]) 45 | if not taxid: 46 | taxid = "NA" 47 | 48 | org_name = str(line[7]) 49 | if not org_name: 50 | org_name = "NA" 51 | 52 | infra_name = str(line[8]) 53 | if not infra_name: 54 | infra_name = "NA" 55 | 56 | version_status = str(line[10]) 57 | if not version_status: 58 | version_status = "NA" 59 | 60 | ass_level = str(line[11]) 61 | if not ass_level: 62 | ass_level = "NA" 63 | 64 | ftp_path = str(line[19]) 65 | if not ftp_path: 66 | ftp_path = "NA" 67 | 68 | out_file.write(str(wanted_dict[str(line[0].split(".")[0])]) + "\t" + str(dl_acc) + "\t" + str(ass_name) + "\t" + str(taxid) + "\t" + str(org_name) + "\t" + str(infra_name) + "\t" + str(version_status) + "\t" + str(ass_level) + "\t" + str(ftp_path) + "\n") 69 | -------------------------------------------------------------------------------- /bin/gtt-parse-fasta-by-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio.SeqIO.FastaIO import SimpleFastaParser 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script is for parsing a fasta file by pulling out sequences with the desired headers. 
If you want all sequences EXCEPT the ones with the headers you are providing, add the flag "--inverse".') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input-fasta", help="Original fasta file", action="store", required=True) 12 | required.add_argument("-w", "--wanted-headers", help="Single-column file with sequence headers", action="store", required=True) 13 | parser.add_argument("-o", "--output-fasta", help='Output fasta file default: "Wanted.fa"', action="store", default="Wanted.fa") 14 | parser.add_argument("--inverse", help="Add this flag to pull out all sequences with headers NOT in the provided header file.", action="store_true") 15 | 16 | if len(sys.argv)==1: 17 | parser.print_help(sys.stderr) 18 | sys.exit(0) 19 | 20 | args = parser.parse_args() 21 | 22 | headers_of_int_list = [line.strip() for line in open(args.wanted_headers)] 23 | 24 | if not args.inverse: 25 | 26 | with open(args.output_fasta, "w") as output_file: 27 | 28 | with open(args.input_fasta, "r") as input_file: 29 | 30 | for header, seq in SimpleFastaParser(input_file): 31 | 32 | if header in headers_of_int_list: 33 | 34 | output_file.write(">%s\n%s\n" % (header, seq)) 35 | 36 | else: 37 | 38 | with open(args.output_fasta, "w") as output_file: 39 | 40 | with open(args.input_fasta, "r") as input_file: 41 | 42 | for header, seq in SimpleFastaParser(input_file): 43 | 44 | if header not in headers_of_int_list: 45 | 46 | output_file.write(">%s\n%s\n" % (header, seq)) 47 | -------------------------------------------------------------------------------- /bin/gtt-parse-gtdb-assembly-summary-file: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='This script is for parsing GTDB\'s assembly metadata file down to the target accessions.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-a", "--assembly_summary", help="GTDB's assembly metadata file", action="store", dest="all_assemblies", required=True) 11 | required.add_argument("-w", "--wanted_accessions", help="Single-column file with wanted accessions", action="store", dest="wanted_accs", required=True) 12 | parser.add_argument("-o", "--output_file", help='Wanted summary info only (default: "target-gtdb.tsv")', action="store", dest="output_file", default="target-gtdb.tsv") 13 | parser.add_argument("-f", "--found_accs_output_file", help='Accessions found in GTDB (default: "gtdb-found-accs.txt")', action="store", dest="found_accs_output_file", default="gtdb-found-accs.txt") 14 | parser.add_argument("-n", "--not_found_accs_output_file", help='Accessions not found in GTDB (default: "gtdb-not-found-accs.tsv")', action="store", dest="not_found_accs_output_file", default="gtdb-not-found-accs.tsv") 15 | parser.add_argument("-t", "--gtdb_tax_output_file", help='Target GTDB taxonomy table (default: "target-gtdb-tax.tsv")', action="store", dest="gtdb_tax_output_file", default="target-gtdb-tax.tsv") 16 | 17 | 18 | 19 | if len(sys.argv)==1: 20 | parser.print_help(sys.stderr) 21 | sys.exit(0) 22 | 23 | args = parser.parse_args() 24 | 25 | wanted_dict = {} 26 | 27 | with open(args.wanted_accs, "r") as wanted_accs: 28 | for line in wanted_accs: 29 | root_acc = line.strip().split(".")[0][4:] 30 | wanted_dict[str(root_acc)] = line.strip() 31 | 32 | out_file = open(args.output_file, "w") 33 | 34 | # tracking found so can write out those 
not found at end too 35 | found_accs = [] 36 | 37 | gtdb_found_accs_out_file = open(args.found_accs_output_file, "w") 38 | # adding header 39 | gtdb_found_accs_out_file.write("input_searched\tgtdb_acc_found\tfull_gtdb_acc\n") 40 | 41 | 42 | with open(args.all_assemblies) as assemblies: 43 | # writing out header to keep 44 | out_file.write(assemblies.readline()) 45 | 46 | for line in assemblies: 47 | split_line = line.strip().split("\t") 48 | 49 | acc_with_no_version = split_line[0][7:].split(".")[0] 50 | 51 | # i believe refseq typically only has 1 version, so taking even if not the same version as specified (this info, what was searched and what was found, is reported in the output "/run_files/gtdb_to_input_accession_map.tsv" file) 52 | if acc_with_no_version in wanted_dict: 53 | 54 | out_file.write(line) 55 | gtdb_found_accs_out_file.write(wanted_dict[acc_with_no_version] + "\t" + split_line[0][3:] + "\t" + split_line[0] + "\n") 56 | 57 | # adding to found list 58 | found_accs.append(wanted_dict[acc_with_no_version]) 59 | 60 | out_file.close() 61 | gtdb_found_accs_out_file.close() 62 | 63 | ## getting and writing out entries that weren't found (and how they were searched) 64 | wanted_accs = list(wanted_dict.values()) 65 | not_found_accs = list(set(wanted_accs) - set(found_accs)) 66 | 67 | if len(not_found_accs) > 0: 68 | with open(args.not_found_accs_output_file, "w") as not_found_output_file: 69 | 70 | # writing out header 71 | not_found_output_file.write("input\tsearched_as\n") 72 | 73 | for key in wanted_dict: 74 | if wanted_dict[key] in not_found_accs: 75 | 76 | not_found_output_file.write(wanted_dict[key] + "\t" + key + "\n") 77 | 78 | # making GTDB taxonomy table only 79 | out_gtdb_tax_table = open(args.gtdb_tax_output_file, "w") 80 | 81 | # adding header 82 | out_gtdb_tax_table.write("base_gtdb_acc\tfull_gtdb_acc\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n") 83 | 84 | with open(args.output_file, "r") as assemblies: 85 | 86 | # skipping header 87 | next(assemblies) 88 | 89 | for line in assemblies: 90 | line = line.strip().split("\t") 91 | full_gtdb_acc = line[0] 92 | base_gtdb_acc = full_gtdb_acc[3:] 93 | 94 | gtdb_tax_list = [line[1], line[2], line[3], line[4], line[5], line[6], line[7]] 95 | 96 | if len(gtdb_tax_list) != 7: 97 | print("GTDB entry " + full_gtdb_acc + " doesn't seem to have full lineage info.") 98 | 99 | out_gtdb_tax_table.write(base_gtdb_acc + "\t" + full_gtdb_acc + "\t" + gtdb_tax_list[0] + "\t" + gtdb_tax_list[1] + "\t" + gtdb_tax_list[2] + "\t" + gtdb_tax_list[3] + "\t" + gtdb_tax_list[4] + "\t" + gtdb_tax_list[5] + "\t" + gtdb_tax_list[6] + "\n") 100 | 101 | out_gtdb_tax_table.close() 102 | -------------------------------------------------------------------------------- /bin/gtt-parse-kofamscan-targets.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | target_KOs_file=$1 4 | output_dir=$2 5 | 6 | full_KO_list_file="${KO_data_dir}/ko_list" 7 | full_KO_hmms_dir="${KO_data_dir}/profiles" 8 | 9 | sub_KO_list_file="${output_dir}/KO_search_results/target-KOs.tsv" 10 | sub_KO_hmms_dir="${output_dir}/KO_search_results/target_KO_profiles/" 11 | 12 | # making target ko_list file and copying target hmms to working area 13 | head -n 1 ${full_KO_list_file} > ${sub_KO_list_file} 14 | 15 | for ko in $(cat ${target_KOs_file}); do 16 | grep -m 1 -w "^${ko}" ${full_KO_list_file} >> ${sub_KO_list_file} 17 | cp ${full_KO_hmms_dir}/${ko}.hmm ${sub_KO_hmms_dir} 18 | done 19 | 
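
Note on the two Python parsers above: gtt-parse-assembly-summary-file keys its lookup on the accession with the version suffix dropped, while gtt-parse-gtdb-assembly-summary-file additionally strips the "RS_"/"GB_" and "GCA_"/"GCF_" prefixes before comparing. Below is a minimal sketch of that normalization (not part of GToTree; the helper names and example accessions are just illustrative):

```python
# Illustrative sketch of the accession normalization used for matching in
# gtt-parse-assembly-summary-file and gtt-parse-gtdb-assembly-summary-file
# (these helper names are hypothetical, not GToTree functions).

def ncbi_key(acc: str) -> str:
    # "GCF_000005845.2" -> "GCF_000005845" (version suffix dropped)
    return acc.strip().split(".")[0]

def wanted_gtdb_key(acc: str) -> str:
    # input accession "GCF_000005845.2" -> "000005845"
    # (GCA_/GCF_ prefix and version suffix dropped, as done with [4:])
    return acc.strip().split(".")[0][4:]

def gtdb_metadata_key(gtdb_acc: str) -> str:
    # GTDB accession "RS_GCF_000005845.2" -> "000005845"
    # (RS_/GB_ plus GCA_/GCF_ prefix and version dropped, as done with [7:])
    return gtdb_acc.strip()[7:].split(".")[0]

if __name__ == "__main__":
    wanted = "GCF_000005845.2"         # example input accession
    gtdb_entry = "RS_GCF_000005845.2"  # example GTDB metadata accession
    assert wanted_gtdb_key(wanted) == gtdb_metadata_key(gtdb_entry)
    print("matched on key:", wanted_gtdb_key(wanted))
```

Comparing only the version-less part is also why an accession can still be matched when the version differs from what was requested, as the GTDB parser notes in its comments.
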
-------------------------------------------------------------------------------- /bin/gtt-remove-all-gap-seqs-from-alignment: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script will remove sequences that are entirely gap characters ("-") from an alignment fasta file, specific for use in GToTree.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Starting fasta file", action="store", dest="input_fasta", required=True) 12 | parser.add_argument("-o", "--output_fasta", help='Output fasta file (default: "No-gap-seqs-aln.faa").', dest="output_fasta", default="No-gap-seqs-aln.faa") 13 | 14 | if len(sys.argv)==1: 15 | parser.print_help(sys.stderr) 16 | sys.exit(0) 17 | 18 | args = parser.parse_args() 19 | 20 | with open(args.input_fasta, "r") as in_fasta: 21 | with open(args.output_fasta, "w") as out: 22 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 23 | if seq_record.seq != len(seq_record.seq) * "-": 24 | out.write(">" + str(seq_record.id) + "\n" + str(seq_record.seq) + "\n") 25 | -------------------------------------------------------------------------------- /bin/gtt-rename-fasta-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio.SeqIO.FastaIO import SimpleFastaParser 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script will rename all sequences of a multifasta with the same name with an appended number to keep them unique.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input-fasta", help="Starting fasta file", action="store", required=True) 12 | parser.add_argument("-w", "--wanted-name", help='Name to give seqs (default: "Seq")', action="store", default="Seq") 13 | parser.add_argument("-o", "--output-fasta", help='Output fasta file (default: "Renamed.fasta").', default="Renamed.fasta") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | n = 0 22 | 23 | with open(args.output_fasta, "w") as output_file: 24 | 25 | with open(args.input_fasta, "r") as input_file: 26 | 27 | for header, seq in SimpleFastaParser(input_file): 28 | 29 | n = n + 1 30 | output_file.write(">" + str(args.wanted_name) + "_" + str(n) + "\n" + seq + "\n") 31 | -------------------------------------------------------------------------------- /bin/gtt-reorder-fasta: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser('This script takes a multifasta file and reorders the sequences according to the headers provided.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Original fasta file", action="store", dest="input_fasta", required=True) 12 | required.add_argument("-w", "--wanted_sequence_order", help="Single-column file with headers in desired order", action="store", dest="ordered_headers", required=True) 13 | parser.add_argument("-o", "--output_fasta", help='Reordered output fasta (default: "Reordered.fa").', dest="output_fasta", default="Reordered.fa") 14 | 15 
| if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | ordered_seqs = open(args.ordered_headers, "r") 22 | 23 | ordered_list = list(line.strip() for line in ordered_seqs) 24 | 25 | fasta_dict = SeqIO.index(args.input_fasta, "fasta") 26 | 27 | fasta_out = open(args.output_fasta, "w") 28 | 29 | for header in ordered_list: 30 | fasta_out.write(fasta_dict.get_raw(header).decode()) 31 | 32 | ordered_seqs.close() 33 | fasta_out.close() 34 | -------------------------------------------------------------------------------- /bin/gtt-run-additional-pfam-search.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | assembly_id=${1} 4 | genes_file=${2} 5 | gene_count=${3} 6 | num_cpus=${4} 7 | tmp_dir=${5} 8 | output_dir=${6} 9 | 10 | hmmsearch_output_file="${output_dir}/Pfam_search_results/individual_genome_results/${assembly_id}_hmmsearch.txt" 11 | 12 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${hmmsearch_output_file} ${tmp_dir}/all_pfam_targets.hmm ${genes_file} > /dev/null 13 | 14 | ### getting counts of each target in this genome 15 | for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp) 16 | do 17 | grep -w ${target} ${hmmsearch_output_file} | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly_id}_hit_counts.tmp 18 | done 19 | 20 | ### writing results to main output file 21 | paste <( printf "${assembly_id}\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly_id}_hit_counts.tmp | tr "\n" "\t" | sed 's/\t$/\n/')" ) >> ${output_dir}/Pfam_search_results/Pfam-hit-counts.tsv 22 | 23 | ### Pulling out hits to additional pfam targets for this genome ### 24 | for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp) 25 | do 26 | if grep -w -q "$target" ${hmmsearch_output_file}; then 27 | 28 | grep -w "$target" ${hmmsearch_output_file} | cut -f 1 -d " " >> ${tmp_dir}/${assembly_id}_${target}_genes_of_int.tmp 29 | 30 | for gene in $(cat ${tmp_dir}/${assembly_id}_${target}_genes_of_int.tmp) 31 | do 32 | echo $gene | esl-sfetch -f ${tmp_dir}/${assembly_id}_genes.tmp - 33 | done >> ${tmp_dir}/${assembly_id}_${target}_genes1.tmp 34 | 35 | gtt-append-fasta-headers -i ${tmp_dir}/${assembly_id}_${target}_genes1.tmp -w ${assembly_id}_${target} -o ${tmp_dir}/${assembly_id}_${target}_genes.tmp 36 | 37 | # adding to fasta of that target holding all genomes 38 | cat ${tmp_dir}/${assembly_id}_${target}_genes.tmp >> ${output_dir}/Pfam_search_results/Pfam_hit_seqs/${target}-hits.faa 39 | fi 40 | 41 | done 42 | -------------------------------------------------------------------------------- /bin/gtt-run-kofamscan.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | assembly_id=${1} 4 | genes_file=${2} 5 | gene_count=${3} 6 | target_KOs_table_file=${4} 7 | target_KO_hmms_dir=${5} 8 | tmp_dir=${6} 9 | output_dir=${7} 10 | unique_target_KOs=${8} 11 | 12 | 13 | curr_genome_output_dir="${output_dir}/KO_search_results/individual_genome_results/${assembly_id}/" 14 | mkdir -p ${curr_genome_output_dir} 15 | 16 | output_results_table_file="${curr_genome_output_dir}/kofamscan-results.tsv" 17 | tmp_ko_working_dir="${tmp_dir}/kofamscan/${assembly_id}/" 18 | tmp_unique_ko_hits="${tmp_ko_working_dir}/unique-KOs.txt" 19 | output_counts_file="${tmp_ko_working_dir}/KO-counts.txt" 20 | 21 | # running scan 22 | exec_annotation -p ${target_KO_hmms_dir} -k ${target_KOs_table_file} --cpu 1 -f mapper --no-report-unannotated --tmp-dir 
${tmp_ko_working_dir} -o ${output_results_table_file} ${genes_file} 23 | 24 | # moving forward only if there were any hits 25 | if [ -s ${output_results_table_file} ]; then 26 | 27 | # getting all unique KOs with hits in this genome 28 | cut -f 2 ${output_results_table_file} | sort -u > ${tmp_unique_ko_hits} 29 | 30 | # creating individual fasta files with hits for each 31 | for ko in $(cat ${tmp_unique_ko_hits}); do 32 | 33 | # getting gene IDs with hits to the current KO 34 | grep -w ${ko} ${output_results_table_file} | cut -f 1 > ${tmp_ko_working_dir}/${ko}-gene-IDs.txt 35 | 36 | # pulling out seqs for this genome 37 | gtt-parse-fasta-by-headers -i ${genes_file} -w ${tmp_ko_working_dir}/${ko}-gene-IDs.txt -o ${tmp_ko_working_dir}/${ko}.faa 38 | 39 | # removing gene IDs file 40 | rm ${tmp_ko_working_dir}/${ko}-gene-IDs.txt 41 | 42 | done 43 | 44 | else 45 | 46 | printf "No hits detected.\n" > ${output_results_table_file} 47 | 48 | fi 49 | 50 | # creating count file that can be stuck together at end 51 | rm -rf ${output_counts_file}.tmp 52 | 53 | for ko in $(cat ${unique_target_KOs}); do 54 | 55 | grep -w -c ${ko} ${output_results_table_file} >> ${output_counts_file}.tmp 56 | 57 | done 58 | 59 | # rearranging so we can combine them afterwards more easily 60 | paste <( printf "${assembly_id}\t${gene_count}" ) <( tr "\n" "\t" < ${output_counts_file}.tmp | sed 's/\t$/\n/' ) > ${output_counts_file} 61 | rm ${output_counts_file}.tmp 62 | -------------------------------------------------------------------------------- /bin/gtt-store-SCG-HMMs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import textwrap 7 | import shutil 8 | import time 9 | from glob import glob 10 | 11 | parser = argparse.ArgumentParser(description="This program adds a single-copy-gene-HMMs file to the stored GToTree location.") 12 | 13 | required = parser.add_argument_group('required arguments') 14 | required.add_argument("hmm_file", metavar='hmm_file', type=str, help="HMM file to be added", action="store") 15 | 16 | if len(sys.argv)==1: 17 | parser.print_help(sys.stderr) 18 | sys.exit(0) 19 | 20 | args = parser.parse_args() 21 | 22 | def main(): 23 | 24 | check_input() 25 | 26 | HMM_dir = get_HMM_dir() 27 | 28 | copy_if_safe(HMM_dir) 29 | 30 | report_available_HMMs(HMM_dir) 31 | 32 | # setting some colors 33 | tty_colors = { 34 | 'green' : '\033[0;32m%s\033[0m', 35 | 'yellow' : '\033[0;33m%s\033[0m', 36 | 'red' : '\033[0;31m%s\033[0m' 37 | } 38 | 39 | 40 | ### functions ### 41 | def color_text(text, color='green'): 42 | if sys.stdout.isatty(): 43 | return tty_colors[color] % text 44 | else: 45 | return text 46 | 47 | 48 | # print wrapper 49 | def wprint(text): 50 | print(textwrap.fill(text, width=80, initial_indent=" ", 51 | subsequent_indent=" ", break_on_hyphens=False)) 52 | 53 | def check_input(): 54 | if not os.path.exists(args.hmm_file): 55 | print("") 56 | wprint(color_text("Seems there is no file called '" + str(args.hmm_file) + "' here :(", "yellow")) 57 | print("") 58 | print("Exiting for now.\n") 59 | sys.exit(0) 60 | 61 | 62 | def get_HMM_dir(): 63 | 64 | # should be stored here if conda install of GToTree was performed 65 | try: 66 | HMM_dir = os.environ['GToTree_HMM_dir'] 67 | except: 68 | HMM_dir = False 69 | 70 | if not HMM_dir: 71 | print("") 72 | wprint(color_text("Seems there is no stored GToTree HMM directory :(", "yellow")) 73 | print(" Installing GToTree with conda would take care of it 
if interested.\n") 74 | print("Exiting for now.\n") 75 | sys.exit(0) 76 | 77 | print("") 78 | print(" GToTree stored SCG-HMMs are located in:") 79 | print(" " + HMM_dir + "\n") 80 | 81 | return(HMM_dir) 82 | 83 | def copy_if_safe(path): 84 | contents = os.listdir(path) 85 | 86 | hmm_file = os.path.basename(args.hmm_file) 87 | 88 | if hmm_file in contents: 89 | 90 | wprint(color_text("Seems there is already a file named '" + str(args.hmm_file) + "' stored in the GToTree HMM directory.", "yellow")) 91 | print("") 92 | print("Exiting for now.\n") 93 | sys.exit(0) 94 | 95 | else: 96 | shutil.copy(args.hmm_file, path + hmm_file) 97 | wprint("The file '" + color_text(str(hmm_file)) + "' now happily lives with the rest of the SCG-HMMs stored with GToTree :)") 98 | print("") 99 | 100 | def report_available_HMMs(path): 101 | 102 | time.sleep(1) 103 | 104 | 105 | files = [os.path.basename(x) for x in glob(path + "*.hmm")] 106 | 107 | files.sort() 108 | 109 | files_and_counts = {} 110 | 111 | for file in files: 112 | if file.endswith(".hmm"): 113 | count = 0 114 | with open(path + file, "r") as f: 115 | for line in f: 116 | if line.startswith("ACC"): 117 | count += 1 118 | files_and_counts[file] = count 119 | 120 | print(" The " + str(len(files)) + " currently stored SCG-HMMs include:\n") 121 | 122 | for key, value in files_and_counts.items(): 123 | print("\t {:<27} {:>15}".format(key, "(" + str(value) + " genes)")) 124 | 125 | print("") 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /bin/gtt-subset-GTDB-accessions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | to facilitate subsetting accessions pulled from the GTDB database (with 'gtt-get-accessions-from-GTDB'). 6 | 7 | It is intended to help when wanting a tree to span a breadth of diversity we know about, while also helping 8 | to reduce over-representation of certain taxa. 9 | 10 | There are two primary methods to use it: 11 | 12 | 1) If a specific class makes up > 0.05% (by default) of the total number of accessions, it will randomly subset 13 | that class down to 5% of what it was. So if there are 40,000 total target genomes, and Gammaproteobacteria have 14 | 8,000 of them (20% of the total), the program will randomly select 80 Gammaproteobacteria to include (1% of 8,000). 15 | 16 | 2) Select 1 random genome from each of the specific rank: "phylum", "class", "order", "family", "genus", or "species". 17 | """ 18 | 19 | import sys 20 | import argparse 21 | import textwrap 22 | import pandas as pd 23 | 24 | 25 | parser = argparse.ArgumentParser(description='This script is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki)\ 26 | to facilitate subsetting accessions pulled from the GTDB database (with\ 27 | \'gtt-get-accessions-from-GTDB\' – the input file is the "*metadata.tsv" from that program).\ 28 | It is intended to help when wanting a tree to span the breadth\ 29 | of diversity we know about, while also helping to reduce over-representation of certain taxa. \ 30 | There are 2 primary methods for using it: \ 31 | 1) If a specific Class makes up > 0.05% (by default) of the total number of target genomes, the script\ 32 | will randomly subset that class down to 1% of what it was. 
So if there are 40,000 total target genomes,\ 33 | and Gammaproteobacteria make up 8,000 of them (20% of the total), the program will randomly select 80 Gammaproteobacteria\ 34 | to include (1% of 8,000). \ 35 | 2) Select 1 random genome from each taxa of the specified rank. \ 36 | It ultimately outputs a new subset accessions file ready for use with the main GToTree program.', 37 | epilog = "Ex. usage: gtt-subset-GTDB-classes -i GTDB-arc-and-bac-refseq-rep-metadata.tsv --get-only-individuals-for-the-rank order") 38 | 39 | required = parser.add_argument_group('required arguments') 40 | 41 | required.add_argument("-i", "--GTDB-table", help="GTDB metadata table produced with 'gtt-get-accessions-from-GTDB'", action="store", required=True) 42 | parser.add_argument("-o", "--output-prefix", help='output prefix for output subset accessions (*.txt) and GTDB taxonomy files (*.tsv) (default: "subset-accessions")', action="store", dest="output_prefix", default="subset-accessions") 43 | 44 | parser.add_argument("-p", "--cutoff-fraction", help='Fraction of total target genomes that any given Class must contribute in order for that class to be randomly subset (default: 0.0005)', action="store", default=0.0005) 45 | parser.add_argument("-f", "--fraction-to-subset", help='Fraction those that are filtered should be randomly subset down to (default: 0.01)', action="store", dest="filter_fraction", default=0.01) 46 | 47 | # this is being left for backwards-compatibility reasons only (same ) 48 | parser.add_argument("--get-Order-representatives-only", action="store_true", help = "Provide this flag to simply get 1 random genome from each Order in GTDB (same as if specifying \ 49 | `--get-only-individuals-for-the-rank order`, but left here for backwards-compatibility purposes)", dest = "order_only") 50 | 51 | parser.add_argument("--get-only-individuals-for-the-rank", action="store", choices = {"phylum", "class", "order", "family", "genus", "species"}, 52 | help = "Use this option with a specified rank if wanting to randomly subset such that we retain 1 genome from each entry in a specific rank in GTDB", dest = "target_rank") 53 | 54 | 55 | parser.add_argument("--seed", action = "store", help = "Specify the seed for random subsampling (default = 1)", default = 1, type = int) 56 | 57 | if len(sys.argv)==1: 58 | parser.print_help(sys.stderr) 59 | sys.exit(0) 60 | 61 | args = parser.parse_args() 62 | 63 | 64 | ############################################################ 65 | 66 | # setting some colors 67 | tty_colors = { 68 | 'green' : '\033[0;32m%s\033[0m', 69 | 'yellow' : '\033[0;33m%s\033[0m', 70 | 'red' : '\033[0;31m%s\033[0m' 71 | } 72 | 73 | 74 | ### functions ### 75 | def color_text(text, color='green'): 76 | if sys.stdout.isatty(): 77 | return tty_colors[color] % text 78 | else: 79 | return text 80 | 81 | def wprint(text): 82 | print(textwrap.fill(text, width=80, initial_indent=" ", 83 | subsequent_indent=" ", break_on_hyphens=False)) 84 | 85 | 86 | # function to subset master table to one for each order 87 | def order_subset_table(order_to_subset, input_tab, seed): 88 | 89 | sub_master_df_to_keep = input_tab.loc[input_tab["order"] != order_to_subset] 90 | 91 | curr_order_df = input_tab.loc[input_tab["order"] == order_to_subset] 92 | 93 | random_sub_df = curr_order_df.sample(n = 1, random_state = int(seed)) 94 | 95 | new_df = pd.concat([sub_master_df_to_keep, random_sub_df]) 96 | 97 | return new_df 98 | 99 | 100 | # function to subset master table 101 | def subset_table(class_to_subset, input_tab, seed): 102 
| 103 | curr_class = class_to_subset 104 | 105 | sub_master_df_to_keep = input_tab.loc[input_tab["class"] != curr_class] 106 | 107 | curr_class_df = input_tab.loc[input_tab["class"] == curr_class] 108 | 109 | random_sub_df = curr_class_df.sample(n=int(len(curr_class_df.index) * float(args.filter_fraction)), random_state = int(seed)) 110 | 111 | new_df = pd.concat([sub_master_df_to_keep, random_sub_df]) 112 | 113 | return new_df 114 | 115 | 116 | # function to subset arbitrary rank 117 | def taxa_subset_table(taxa_to_subset, rank, input_tab, seed): 118 | 119 | sub_master_df_to_keep = input_tab.loc[input_tab[rank] != taxa_to_subset] 120 | 121 | curr_order_df = input_tab.loc[input_tab[rank] == taxa_to_subset] 122 | 123 | random_sub_df = curr_order_df.sample(n = 1, random_state = int(seed)) 124 | 125 | new_df = pd.concat([sub_master_df_to_keep, random_sub_df]) 126 | 127 | return new_df 128 | 129 | 130 | ################################################################################ 131 | 132 | 133 | 134 | # reading lineage table into pandas dataframe 135 | lineage_df = pd.read_csv(args.GTDB_table, delimiter="\t", usecols = range(8)) 136 | 137 | starting_size = len(lineage_df.index) 138 | 139 | # just giving 1 of each order if requested (left here like this for backwards-compatibility purposes) 140 | if args.order_only: 141 | 142 | args.target_rank = "order" 143 | 144 | 145 | if args.target_rank: 146 | 147 | # getting list of all unique entries of wanted rank 148 | unique_entries = lineage_df[args.target_rank].unique() 149 | 150 | # getting one rep genome of each of these 151 | for entry in unique_entries: 152 | 153 | lineage_df = taxa_subset_table(entry, args.target_rank, lineage_df, args.seed) 154 | 155 | filtered_size = len(lineage_df.index) 156 | 157 | # removing "RS_" and "GB_" prefixes and writing out output accs 158 | output_accessions = args.output_prefix + ".txt" 159 | with open(output_accessions, "w") as out: 160 | for acc in lineage_df.accession: 161 | out.write(acc[3:] + "\n") 162 | 163 | # writing out subset GTDB taxonomy 164 | output_tax = args.output_prefix + "-taxonomy.tsv" 165 | lineage_df.to_csv(output_tax, sep = "\t", index = False) 166 | 167 | # reporting and exiting 168 | print("") 169 | wprint(color_text(str("{:,}".format(starting_size)) + " initial entries were subset down to " + str("{:,}".format(filtered_size)) + "\n", "yellow")) 170 | print("") 171 | wprint("Subset accessions file for GToTree written to:") 172 | print(color_text(" " + str(output_accessions)) + "\n") 173 | wprint("A subset GTDB taxonomy table for these accessions written to:") 174 | print(color_text(" " + str(output_tax)) + "\n") 175 | 176 | exit() 177 | 178 | 179 | # if the above didn't run and exit, then we are going to randomly subset based on class 180 | class_dict = {} 181 | 182 | # counting how many times each class shows up 183 | for index, row in lineage_df.iterrows(): 184 | if row["class"] not in class_dict: 185 | class_dict[row["class"]] = 1 186 | else: 187 | class_dict[row["class"]] += 1 188 | 189 | # getting cutoff threshold of total number of entries 190 | cutoff = int(starting_size * float(args.cutoff_fraction)) 191 | 192 | # getting which classes are above this threshold 193 | classes_to_subset = [] 194 | 195 | for key in class_dict: 196 | if class_dict[key] >= cutoff: 197 | classes_to_subset.append(key) 198 | 199 | # subsetting each class 200 | for rank in classes_to_subset: 201 | lineage_df = subset_table(rank, lineage_df, args.seed) 202 | 203 | filtered_size = len(lineage_df.index) 
204 | 205 | # removing "RS_" and "GB_" prefixes and writing out output accs 206 | output_accessions = args.output_prefix + ".txt" 207 | with open(output_accessions, "w") as out: 208 | for acc in lineage_df.accession: 209 | out.write(acc[3:] + "\n") 210 | 211 | # writing out subset GTDB taxonomy 212 | output_tax = args.output_prefix + "-taxonomy.tsv" 213 | lineage_df.to_csv(output_tax, sep = "\t", index = False) 214 | 215 | 216 | print("") 217 | wprint(color_text(str("{:,}".format(starting_size)) + " initial entries were subset down to " + str("{:,}".format(filtered_size)) + "\n", "yellow")) 218 | # print("\n Subset classes included: \n\t\t\t\t" + "\n\t\t\t\t".join(classes_to_subset) + "\n") 219 | print("") 220 | wprint("Subset accessions written to file:") 221 | print(color_text(" " + str(output_accessions)) + "\n") 222 | wprint("A subset GTDB taxonomy table for these accessions written to:") 223 | print(color_text(" " + str(output_tax)) + "\n") 224 | -------------------------------------------------------------------------------- /bin/gtt-swap-ids: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script will swap the headers of a fasta file.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Starting fasta file", action="store", dest="input_fasta", required=True) 12 | parser.add_argument("-s", "--map_of_ids_to_swap", help="Two column tab-delimited file where column 1 holds the original headers and column 2 holds the desired headers. (doesn't need to hold all headers)", action="store", dest="id_map") 13 | parser.add_argument("-o", "--output_fasta_name", help='Output fasta file (default: "Renamed.fasta").', dest="output_fasta_name", default="Renamed.fasta") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | map_dict = {} 22 | 23 | with open(args.id_map) as map: 24 | for line in map: 25 | line = line.strip() 26 | line = line.split("\t") 27 | map_dict[line[0]] = line[1] 28 | 29 | in_fasta = open(args.input_fasta, "r") 30 | out_fasta = open(args.output_fasta_name, "w") 31 | 32 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 33 | if seq_record.id in map_dict: 34 | out_fasta.write(">" + str(map_dict[seq_record.id]) + "\n") 35 | out_fasta.write(str(seq_record.seq) + "\n") 36 | else: 37 | out_fasta.write(">" + str(seq_record.id) + "\n") 38 | out_fasta.write(str(seq_record.seq) + "\n") 39 | 40 | in_fasta.close() 41 | out_fasta.close() 42 | -------------------------------------------------------------------------------- /bin/gtt-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | YELLOW='\033[0;33m' 6 | RED='\033[0;31m' 7 | NC='\033[0m' 8 | 9 | printf "\n" 10 | printf " ${GREEN}Downloading GToTree test data into the subdirectory ${YELLOW}GToTree-test-data/\n\n${NC}" 11 | printf " ${GREEN}Test data being pulled from here:\n${NC}" 12 | printf " ${YELLOW}https://zenodo.org/record/7860720#.ZEcWkexlA_8${NC}\n\n\n" 13 | 14 | curl -L --retry 10 --fail -o GToTree-test-data.tar.gz "https://zenodo.org/record/7860720/files/GToTree-test-data.tar.gz?download=1" 15 | 16 | # if run as 'gtt-test.sh http', will use http instead of ftp 17 | if [ ! 
-z $1 ] && [ $1 == "http" ]; then 18 | p_flag="-P" 19 | else 20 | p_flag="" 21 | fi 22 | 23 | # checking download was successfull (can finish with 0 exit) 24 | if [ $? -ne 0 ] ; then 25 | 26 | printf "\n${RED} Downloading the small test data failed for some reason :(${NC}\n" 27 | printf " You can try downloading it yourself from the link printed above and running the test as follows after unpacking it:\n\n" 28 | 29 | printf " ${YELLOW}GToTree -a GToTree-test-data/ncbi_accessions.txt "'\\ \n' 30 | printf " -g GToTree-test-data/genbank_files.txt "'\\ \n' 31 | printf " -f GToTree-test-data/fasta_files.txt "'\\ \n' 32 | printf " -A GToTree-test-data/amino_acid_files.txt "'\\ \n' 33 | printf " -m GToTree-test-data/genome_to_id_map.tsv "'\\ \n' 34 | printf " -p GToTree-test-data/pfam_targets.txt "'\\ \n' 35 | printf " -H Universal -t -D -j 4 -o GToTree-test-output -F ${p_flag}\n\n${NC}" 36 | 37 | printf " Then you can compare the output to what is depicted here:\n" 38 | printf " https://github.com/AstrobioMike/GToTree/wiki/Installation#test-run${NC}\n\n" 39 | 40 | printf "Exiting for now.\n\n" 41 | exit 42 | 43 | fi 44 | 45 | # putting here instead of at top so that the above message is still sent if curl fails 46 | set -e 47 | 48 | tar -xf GToTree-test-data.tar.gz 49 | rm GToTree-test-data.tar.gz 50 | 51 | printf "\n\n" 52 | 53 | TEST_DATA_DIR="GToTree-test-data" 54 | 55 | ## modifying paths of input genomes in input files as needed (not using sed -i so compatible with darwin sed too) 56 | 57 | sed "s/^/${TEST_DATA_DIR}\//" ${TEST_DATA_DIR}/genbank_files.txt > ${TEST_DATA_DIR}/genbank_files.txt.tmp && mv ${TEST_DATA_DIR}/genbank_files.txt.tmp ${TEST_DATA_DIR}/genbank_files.txt 58 | sed "s/^/${TEST_DATA_DIR}\//" ${TEST_DATA_DIR}/fasta_files.txt > ${TEST_DATA_DIR}/fasta_files.txt.tmp && mv ${TEST_DATA_DIR}/fasta_files.txt.tmp ${TEST_DATA_DIR}/fasta_files.txt 59 | sed "s/^/${TEST_DATA_DIR}\//" ${TEST_DATA_DIR}/amino_acid_files.txt > ${TEST_DATA_DIR}/amino_acid_files.txt.tmp && mv ${TEST_DATA_DIR}/amino_acid_files.txt.tmp ${TEST_DATA_DIR}/amino_acid_files.txt 60 | 61 | printf " ${GREEN}Running test as:\n${NC}" 62 | printf " ${YELLOW}GToTree -a ${TEST_DATA_DIR}/ncbi_accessions.txt "'\\ \n' 63 | printf " -g ${TEST_DATA_DIR}/genbank_files.txt "'\\ \n' 64 | printf " -f ${TEST_DATA_DIR}/fasta_files.txt "'\\ \n' 65 | printf " -A ${TEST_DATA_DIR}/amino_acid_files.txt "'\\ \n' 66 | printf " -m ${TEST_DATA_DIR}/genome_to_id_map.tsv "'\\ \n' 67 | printf " -p ${TEST_DATA_DIR}/pfam_targets.txt "'\\ \n' 68 | printf " -H Universal -t -D -j 4 -o GToTree-test-output -F ${p_flag}\n\n${NC}" 69 | 70 | sleep 2 71 | 72 | printf " ${YELLOW}The test run includes some things that shouldn't be found, so\n" 73 | printf " don't be alarmed when seeing those messages.${NC}\n\n" 74 | 75 | sleep 2 76 | 77 | printf " ${GREEN}Starting run now:\n${NC}" 78 | 79 | GToTree -a ${TEST_DATA_DIR}/ncbi_accessions.txt -g ${TEST_DATA_DIR}/genbank_files.txt -f ${TEST_DATA_DIR}/fasta_files.txt -A ${TEST_DATA_DIR}/amino_acid_files.txt -H Universal -m ${TEST_DATA_DIR}/genome_to_id_map.tsv -p ${TEST_DATA_DIR}/pfam_targets.txt -t -D -j 4 -o GToTree-test-output -F ${p_flag} 80 | 81 | if [ -d "GToTree-test-output/" ]; then 82 | 83 | printf "\n ${YELLOW}_______________________________________________________________________________${NC}\n\n" 84 | printf "\n ${GREEN}Test completed! 
See here for how things should look:\n${NC}" 85 | printf " ${YELLOW}https://github.com/AstrobioMike/GToTree/wiki/Installation#test-run${NC}\n\n" 86 | 87 | else 88 | 89 | printf "\n ${YELLOW}_______________________________________________________________________________${NC}\n\n" 90 | printf "\n ${RED}There seems to have been a problem with the test run :(\n${NC}" 91 | printf " ${YELLOW}If this continues, please consider submitting an issue here:\n${NC}" 92 | printf " ${YELLOW}https://github.com/AstrobioMike/GToTree/issues${NC}\n\n" 93 | 94 | printf " ${GREEN}You can clear out the test data and results by running:${NC}\n" 95 | printf " ${YELLOW}gtt-clean-after-test.sh\n\n${NC}" 96 | 97 | fi 98 | 99 | printf " ${GREEN}You can clear out the test data and results by running:${NC}\n" 100 | printf " ${YELLOW}gtt-clean-after-test.sh\n\n${NC}" 101 | -------------------------------------------------------------------------------- /bin/gtt-update-ncbi-taxonomy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | NC='\033[0m' 7 | 8 | printf "\n" 9 | 10 | ## trying https if ftp fails 11 | curl --retry 10 -o ${TAXONKIT_DB}/taxdump.tar.gz ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz || curl --retry 10 -o ${TAXONKIT_DB}/taxdump.tar.gz https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz 12 | 13 | tar -xzf ${TAXONKIT_DB}/taxdump.tar.gz -C ${TAXONKIT_DB} 14 | 15 | rm ${TAXONKIT_DB}/taxdump.tar.gz 16 | 17 | printf "\n\t\t${GREEN}The NCBI taxonomy database info has been updated!${NC}\n\n" 18 | --------------------------------------------------------------------------------
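
For reference, the per-genome completeness and redundancy estimates produced in gtt-ncbi-serial.sh and gtt-ncbi-serial-nt.sh come straight from the per-target hit counts: a target gene counts toward completeness if it was hit at least once, every hit beyond the first counts toward redundancy, and both are reported as a percentage of the total number of target genes (a notice is printed when the redundancy estimate reaches 10%). A minimal Python sketch of that arithmetic, with made-up hit counts (not part of GToTree):

```python
# Sketch of the completeness/redundancy math done with awk and bc in
# gtt-ncbi-serial.sh / gtt-ncbi-serial-nt.sh (illustrative only).

def estimate_comp_and_redund(per_gene_hit_counts):
    total_targets = len(per_gene_hit_counts)
    num_hit = sum(1 for c in per_gene_hit_counts if c > 0)        # genes found at least once
    num_extra = sum(c - 1 for c in per_gene_hit_counts if c > 0)  # copies beyond the first
    perc_comp = round(num_hit / total_targets * 100, 2)
    perc_redund = round(num_extra / total_targets * 100, 2)
    return perc_comp, perc_redund

if __name__ == "__main__":
    # made-up counts for a 10-gene HMM set: 8 genes found once, 1 found twice, 1 missing
    counts = [1, 1, 1, 1, 1, 1, 1, 1, 2, 0]
    print(estimate_comp_and_redund(counts))  # (90.0, 10.0) -> would trigger the redundancy notice
```
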
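Similarly, the default class-level subsetting in gtt-subset-GTDB-accessions boils down to: any class holding at least (total entries * cutoff-fraction, default 0.0005) genomes is randomly downsampled to (its size * fraction-to-subset, default 0.01) genomes, while smaller classes are left untouched. A rough pandas sketch of that logic under those assumptions (not part of GToTree; the toy table is made up, and the real script reads the metadata file from gtt-get-accessions-from-GTDB):

```python
import pandas as pd

# Rough, illustrative sketch of the class-based downsampling in
# gtt-subset-GTDB-accessions (the function and toy data are hypothetical).

def subset_overrepresented_classes(df, cutoff_fraction=0.0005, filter_fraction=0.01, seed=1):
    cutoff = int(len(df) * cutoff_fraction)
    pieces = []
    for _, class_df in df.groupby("class"):
        if len(class_df) >= cutoff:
            # over-represented class: keep a random 1% (by default) of its genomes
            class_df = class_df.sample(n=int(len(class_df) * filter_fraction), random_state=seed)
        pieces.append(class_df)
    return pd.concat(pieces)

if __name__ == "__main__":
    # toy table: 9,000 Gammaproteobacteria, 996 Alphaproteobacteria, 4 of a rare class
    df = pd.DataFrame({
        "accession": [f"RS_GCF_{i:09d}.1" for i in range(10000)],
        "class": (["Gammaproteobacteria"] * 9000
                  + ["Alphaproteobacteria"] * 996
                  + ["Nitrososphaeria"] * 4),
    })
    print(subset_overrepresented_classes(df)["class"].value_counts())
    # -> Gammaproteobacteria 90, Alphaproteobacteria 9, Nitrososphaeria 4 (kept, below cutoff)
```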