├── .github └── workflows │ ├── test-conda-installs.yml │ └── test-full-run.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── GToTree ├── gtt-align-and-trim-parallel.sh ├── gtt-amino-acid-parallel.sh ├── gtt-amino-acid-serial.sh ├── gtt-append-fasta-headers ├── gtt-cat-alignments ├── gtt-check-or-setup-GTDB-files ├── gtt-check-wanted-lineage-info ├── gtt-clean-after-test.sh ├── gtt-combine-kofamscan-results.sh ├── gtt-count-bases-per-seq ├── gtt-data-locations ├── gtt-fasta-parallel-nt.sh ├── gtt-fasta-parallel.sh ├── gtt-fasta-serial-nt.sh ├── gtt-fasta-serial.sh ├── gtt-filter-parallel.sh ├── gtt-filter-seqs-by-length ├── gtt-gen-KO-iToL-files.sh ├── gtt-gen-SCG-HMMs ├── gtt-gen-itol-map ├── gtt-gen-pfam-iToL-files.sh ├── gtt-genbank-parallel.sh ├── gtt-genbank-serial.sh ├── gtt-genbank-to-AA-seqs ├── gtt-genbank-to-fasta ├── gtt-get-accessions-from-GTDB ├── gtt-get-additional-pfam-targets.sh ├── gtt-get-kofamscan-data ├── gtt-get-median.sh ├── gtt-get-ncbi-assembly-tables ├── gtt-get-ncbi-tax-data ├── gtt-hmms ├── gtt-ncbi-parallel-nt.sh ├── gtt-ncbi-parallel.sh ├── gtt-ncbi-serial-nt.sh ├── gtt-ncbi-serial.sh ├── gtt-parse-assembly-summary-file ├── gtt-parse-fasta-by-headers ├── gtt-parse-gtdb-assembly-summary-file ├── gtt-parse-kofamscan-targets.sh ├── gtt-pfam-search ├── gtt-remove-all-gap-seqs-from-alignment ├── gtt-rename-fasta-headers ├── gtt-reorder-fasta ├── gtt-run-additional-pfam-search.sh ├── gtt-run-kofamscan.sh ├── gtt-store-SCG-HMMs ├── gtt-subset-GTDB-accessions ├── gtt-swap-ids ├── gtt-test.sh └── gtt-update-ncbi-taxonomy └── hmm_sets └── hmm-sources-and-info.tsv /.github/workflows/test-conda-installs.yml: -------------------------------------------------------------------------------- 1 | name: testing conda installs 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | install-tests: 7 | name: ${{ matrix.os }} conda install test 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: ["ubuntu-latest", "macos-latest"] 12 | 13 | steps: 14 | 15 | - uses: conda-incubator/setup-miniconda@v3 16 | with: 17 | python-version: "3.9" 18 | channels: astrobiomike,conda-forge,bioconda 19 | channel-priority: true 20 | 21 | - name: Install GToTree 22 | shell: bash -el {0} 23 | run: | 24 | if [[ "${RUNNER_OS}" == "macOS" ]]; then 25 | conda create --platform osx-64 -n gtotree -y gtotree 26 | else 27 | conda create -n gtotree -y gtotree 28 | fi 29 | 30 | - name: Check GToTree 31 | shell: bash -l {0} 32 | run: | 33 | conda activate gtotree 34 | GToTree -h 35 | -------------------------------------------------------------------------------- /.github/workflows/test-full-run.yml: -------------------------------------------------------------------------------- 1 | name: testing full run 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | install-tests: 7 | name: ${{ matrix.os }} conda install test 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: ["ubuntu-latest", "macos-latest"] 12 | 13 | steps: 14 | 15 | - uses: conda-incubator/setup-miniconda@v3 16 | with: 17 | python-version: "3.9" 18 | channels: astrobiomike,conda-forge,bioconda 19 | channel-priority: true 20 | 21 | - name: Install GToTree 22 | shell: bash -el {0} 23 | run: | 24 | if [[ "${RUNNER_OS}" == "macOS" ]]; then 25 | conda create --platform osx-64 -n gtotree -y gtotree 26 | else 27 | conda create -n gtotree -y gtotree 28 | fi 29 | 30 | - name: Run GToTree 31 | shell: bash -l {0} 32 | run: | 33 | conda activate gtotree 34 | gtt-test.sh 35 | 
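Both workflows above are `workflow_dispatch`-only, so they never run automatically on pushes or pull requests. As a rough sketch of triggering them manually (assuming the GitHub CLI, `gh`, is installed and authenticated; only the workflow file names come from the repo, the rest is illustrative):

    gh workflow run test-conda-installs.yml          # start the conda-install matrix (ubuntu + macos)
    gh workflow run test-full-run.yml --ref main     # start the full-run test on a chosen ref
    gh run list --workflow test-full-run.yml         # check on queued/running/finished runs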
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
 1 | .DS_Store
 2 | notes.txt
 3 | GToTree.egg-info/
 4 | build/
 5 | gtotree/
 6 | 
-------------------------------------------------------------------------------- /CHANGELOG.md: --------------------------------------------------------------------------------
 1 | # Change Log
 2 | 
 3 | ## v1.8.14 (21-Apr-2025)
 4 | 
 5 | ### Changed
 6 | - change to taxonkit call when adding NCBI tax info (now using `reformat2` and a pattern) in order to deal with NCBI tax-structure update
 7 | 
 8 | 
 9 | ## v1.8.13 (18-Mar-2025)
 10 | 
 11 | ### Changed
 12 | - changed `gtt-gen-SCG-HMMs` to only use Pfam 37.0 for now (as later versions don't have one of the required files currently; see https://github.com/AstrobioMike/GToTree/issues/104)
 13 | 
 14 | ---
 15 | 
 16 | ## v1.8.12 (11-Mar-2025)
 17 | 
 18 | ### Changed
 19 | - changed GTDB download links from https://data.gtdb.ecogenomic.org/releases/ to https://data.ace.uq.edu.au/public/gtdb/data/releases/ due to the former becoming prohibitively slow recently
 20 | 
 21 | ---
 22 | 
 23 | ## v1.8.11 (10-Mar-2025)
 24 | 
 25 | ### Added
 26 | - VeryFastTree is now an available treeing program (`-T`)
 27 | 
 28 | ### Changed
 29 | - when using `gtt-get-accessions-from-GTDB`, if the requested taxon has spaces in it (e.g., `gtt-get-accessions-from-GTDB -t "Bacillus_A anthracis"`), the output files will have spaces replaced with dashes now
 30 |     - e.g., one of the outputs will now be "GTDB-Bacillus_A-anthracis-species-accs.txt" instead of "GTDB-Bacillus_A anthracis-species-accs.txt"
 31 | 
 32 | ---
 33 | 
 34 | ## v1.8.10 (3-Feb-2025)
 35 | 
 36 | ### Added
 37 | - saving ncbi downloaded files is possible when debug flag (`-d`) is set as requested in https://github.com/AstrobioMike/GToTree/issues/95, implemented in https://github.com/AstrobioMike/GToTree/pull/102
 38 |     - with the debug flag set while running, it will keep specific files in `//ncbi-downloads/`:
 39 |         - if amino-acid seqs are used, it will keep the downloaded amino-acid seqs
 40 |         - if there were no amino-acid seqs, and the genome had to be downloaded, it will keep the downloaded genome and the prodigal-called amino-acid seqs
 41 |         - if using nucleotide mode (`-z`), it will keep the downloaded genome and the prodigal-called nt cds and amino-acid seqs
 42 | 
 43 | ---
 44 | 
 45 | ## v1.8.9 (31-Jan-2025)
 46 | 
 47 | ### Fixed
 48 | - added logic to catch, exit, and report when muscle doesn't successfully produce an alignment for a single-copy gene-set (thanks to https://github.com/AstrobioMike/GToTree/issues/101)
 49 | 
 50 | ---
 51 | 
 52 | ## v1.8.8 (7-Oct-2024)
 53 | 
 54 | ### Changed
 55 | - updated the call to FastTree and FastTreeMP to include -nt and -gtr when GToTree is run in nucleotide mode (-z)
 56 | 
 57 | ### Fixed
 58 | - properly saving additional pfam target HMMs when that functionality is used
 59 | 
 60 | ---
 61 | 
 62 | ## v1.8.7 (29-Sep-2024)
 63 | 
 64 | ### Added
 65 | - `gtt-gen-SCG-HMMs` now reports which version of PFAM was used (prints it out to the terminal and writes it to a file)
 66 | 
 67 | ### Changed
 68 | - improvements to the "Universal" Hug et al. gene set, thanks so much to @molly-kholodova for digging in and reaching out!
69 | - PF00181 ("Ribosomal_L2") was changed to PF03947 ("Ribosomal_L2_C") 70 | - the C-terminal (which PF03947 covers) is better conserved 71 | - PF00827 ("Ribosomal_L15") was changed to PF00828 ("Ribosomal_L27A") 72 | - PF00827 was archaea/euk only, PF00828 holds the bac/arc L15 also 73 | - PF17135 ("Ribosomal_L18") was changed to PF00861 ("Ribosomal_L18p") 74 | - the PF00861 model is better distributed 75 | 76 | --- 77 | 78 | ## v1.8.6 (8-May-2024) 79 | 80 | ### Fixed 81 | - fixed when taxonomy information wasn't being added to labels when running in nucleotide mode (`-z`; https://github.com/AstrobioMike/GToTree/issues/91) 82 | 83 | --- 84 | 85 | ## v1.8.5 (1-May-2024) 86 | 87 | ### Changed 88 | - update to `gtt-gen-SCG-HMMs` to deal with ncbi assembly summary files having a column name of "#assembly_accession" instead of what was once "# assembly_accession" 89 | 90 | --- 91 | 92 | ## v1.8.4 (28-Nov-2023) 93 | 94 | ### Fixed 95 | - fixed an issue that prevented moving forward when there were more than 12,500 input genomes (https://github.com/AstrobioMike/GToTree/issues/83) 96 | 97 | --- 98 | 99 | ## v1.8.3 (14-Oct-2023) 100 | 101 | ### Changed 102 | - updated links to GTDB files as they switched from .tar.gz extensions to .tsv.gz extensions in latest release, thanks to note from @jmtsuji (https://github.com/AstrobioMike/GToTree/issues/81) 103 | 104 | --- 105 | 106 | ## v1.8.2 (26-Jul-2023) 107 | 108 | ### Added 109 | - added http option to gtt-test.sh (`gtt-test.sh http`) thanks to https://github.com/AstrobioMike/GToTree/issues/78 (https://github.com/AstrobioMike/GToTree/commit/9eb248ad5a54563370978d3575727eb63ad93483) 110 | 111 | ### Fixed 112 | - updated `gtt-get-ncbi-tax-data` to appropriately pull from http instead of ftp also thanks to https://github.com/AstrobioMike/GToTree/issues/78 113 | - fix to check for ncbi assemblies "date-retrieved.txt" file, as also caught and fixed by @hyphaltip (https://github.com/AstrobioMike/GToTree/pull/80) 🙏 114 | 115 | --- 116 | 117 | Earlier version changes are tracked on the [releases page](https://github.com/AstrobioMike/GToTree/releases). 118 | 119 | --- 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
 5-16 | [badge links from the original README (Citations, Conda installs, DOI, Twitter Follow); the anchor/image markup was not preserved in this dump]
 17 | 
 18 | ---
 19 | 
 20 | # GToTree: a user-friendly workflow for phylogenomics
 21 | [GToTree](https://github.com/AstrobioMike/GToTree/wiki) is a user-friendly workflow for phylogenomics intended to give more researchers the capability to easily create phylogenomic trees. Documentation and examples can be found [at the wiki here](https://github.com/AstrobioMike/GToTree/wiki), and the open-access Bioinformatics Journal publication is available [here](https://doi.org/10.1093/bioinformatics/btz188). GToTree can be installed and run on a Mac or Linux machine, as well as on Windows within a Windows Subsystem for Linux environment 👍
 22 | 
 23 | ---
 24 | **A quick [conda installation](https://github.com/AstrobioMike/GToTree/wiki/installation#conda-quickstart) can be run like so:**
 25 | 
 26 | ```
 27 | conda create -y -n gtotree -c astrobiomike -c conda-forge -c bioconda gtotree
 28 | ```
 29 | 
 30 | 
 31 | ---
 32 | 
 33 | GToTree is a more structured implementation of a workflow I would put together every time I wanted to make a large-scale phylogenomic tree. What do I mean by large-scale? Anything from a full-blown Tree of Life with all 3 domains, down to, for example, all available genomes of *Staphylococcus* alongside new isolate genomes. At its heart it just takes in genomes and outputs an alignment and phylogenomic tree based on the specified HMM profiles. But I think its value comes from three main things: 1) its flexibility with regard to input format - taking fasta files, GenBank files, and/or NCBI accessions (So if you just recovered a bunch of new genomes and you want to see where they fit in with references, you can provide references by accession and your new genomes as fasta files.); 2) its automation of required between-tool tasks such as filtering hits by gene-length, filtering out genomes with too few hits to the target genes, and swapping genome labels for something more useful; and 3) its scalability – GToTree can turn ~1,700 input genomes into a tree in ~60 minutes on a standard laptop.
 34 | 
 35 | Also included are several newly generated single-copy gene-sets for 13 different taxonomic groupings. These are presented in the [wiki](https://github.com/AstrobioMike/GToTree/wiki/SCG-sets), along with an explanation and example code/steps used in the generation of them.
 36 | 
 37 | GToTree utilizes helper scripts written in python, but is primarily implemented in bash. Every attempt is being made to make it portable across all variations of GNU/Unix, including on Macs, so if you run into any issues, it'd be appreciated if you could [report them](https://github.com/AstrobioMike/GToTree/issues) so the problems can be found and fixed!
 38 | 
 39 | 
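As a concrete sketch of point 1) above, a run mixing NCBI reference accessions with local fasta files can look roughly like the following, with `-a` pointing to a single-column file of NCBI assembly accessions, `-f` to a single-column file of paths to the new genome fastas, `-H` selecting the prepackaged bacterial single-copy gene-set, `-j` setting the number of parallel jobs, and `-o` naming the output directory (the file names here are made up, and the flags are patterned after the project's example usage, so confirm them against `GToTree -h` for the installed version):

```
GToTree -a ref-genome-accessions.txt \
        -f our-genome-fasta-paths.txt \
        -H Bacteria \
        -j 4 \
        -o our-phylogenomics-output
```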

 40-41 | [GToTree workflow-overview figures from the original README; image markup not preserved in this dump]

42 | 43 | See the ["What is GToTree?" wiki page](https://github.com/AstrobioMike/GToTree/wiki/what-is-gtotree%3F) for some more detail on the processing steps pictured above. For practical ways GToTree can be helpful, check out the [Example usage page](https://github.com/AstrobioMike/GToTree/wiki/example-usage). And for detailed information on using GToTree, see the [User guide](https://github.com/AstrobioMike/GToTree/wiki/user-guide). 44 | 45 | --- 46 | 47 | **A quick [conda installation](https://github.com/AstrobioMike/GToTree/wiki/installation#conda-quickstart) can be run like so:** 48 | 49 | ``` 50 | conda create -y -n gtotree -c astrobiomike -c conda-forge -c bioconda gtotree 51 | ``` 52 | 53 | 54 | --- 55 | 56 | ## Citation information 57 | 58 | GToTree will print out a `citations.txt` file with citation information specific for every run that accounts for all programs it relies upon. Please be sure to cite the developers appropriately :) 59 | 60 | Here is an example output `citations.txt` file from a run, and how I'd cite it in the methods: 61 | 62 | ``` 63 | GToTree v1.6.31 64 | Lee MD. GToTree: a user-friendly workflow for phylogenomics. Bioinformatics. 2019; (March):1-3. doi:10.1093/bioinformatics/btz188 65 | 66 | Prodigal v2.6.3 67 | Hyatt, D. et al. Gene and translation initiation site prediction in metagenomic sequences. Bioinformatics. 2010; 28, 2223–2230. doi.org/10.1186/1471-2105-11-119 68 | 69 | HMMER3 v3.3.2 70 | Eddy SR. Accelerated profile HMM searches. PLoS Comput. Biol. 2011; (7)10. doi:10.1371/journal.pcbi.1002195 71 | 72 | Muscle v5.1 73 | Edgar RC. MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping. bioRxiv. 2021. doi.org/10.1101/2021.06.20.449169 74 | 75 | TrimAl v1.4.rev15 76 | Gutierrez SC. et al. TrimAl: a Tool for automatic alignment trimming. Bioinformatics. 2009; 25, 1972–1973. doi:10.1093/bioinformatics/btp348 77 | 78 | TaxonKit v0.9.0 79 | Shen W and Ren H. TaxonKit: a practical and efficient NCBI Taxonomy toolkit. Journal of Genetics and Genomics. 2021. doi.org/10.1016/j.jgg.2021.03.006 80 | 81 | FastTree 2 v2.1.11 82 | Price MN et al. FastTree 2 - approximately maximum-likelihood trees for large alignments. PLoS One. 2010; 5. doi:10.1371/journal.pone.0009490 83 | ``` 84 | 85 | **Example methods text based on above citation output (be sure to modify as appropriate for your run)** 86 | > *The archaeal phylogenomic tree was produced with GToTree v1.6.31 (Lee 2019), using the prepackaged single-copy gene-set for archaea (76 target genes). Briefly, prodigal v2.6.3 (Hyatt et al. 2010) was used to predict genes on input genomes provided as fasta files. Target genes were identified with HMMER3 v3.2.2 (Eddy 2011), individually aligned with muscle v5.1 (Edgar 2021), trimmed with trimal v1.4.rev15 (Capella-Gutiérrez et al. 2009), and concatenated prior to phylogenetic estimation with FastTree2 v2.1.11 (Price et al. 2010). 
TaxonKit (Shen and Ren 2021) was used to connect full lineages to taxonomic IDs.* 87 | -------------------------------------------------------------------------------- /bin/gtt-align-and-trim-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | faster_alignment=$3 11 | num_muscle_threads=$4 12 | target_gene_suffix=$5 13 | 14 | # removing those genomes that need to be removed based on not having enough hits to the target genes 15 | gtt-parse-fasta-by-headers -i ${tmp_dir}/${1}_hits_filtered.tmp -w ${tmp_dir}/sorted_genomes_to_remove.tmp -o ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} --inverse 16 | 17 | # aligning 18 | if [ $faster_alignment == 'true' ]; then 19 | muscle -super5 ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} > ${tmp_dir}/${1}-muscle.log 2>&1 20 | else 21 | muscle -align ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} > ${tmp_dir}/${1}-muscle.log 2>&1 22 | fi 23 | 24 | # checking if alignment was successful (really this is a sloppy way of checking, but it's better than nothing and the muscle log file will be available) 25 | if [ ! -s ${tmp_dir}/${1}_aligned.tmp ]; then 26 | printf "${1}\n" >> ${tmp_dir}/kill_align_and_trim_parallel.problem 27 | exit 28 | fi 29 | 30 | # trimming 31 | trimal -in ${tmp_dir}/${1}_aligned.tmp -out ${tmp_dir}/${1}_trimmed${target_gene_suffix}.tmp -automated1 32 | 33 | # removing linewraps: 34 | sed 's/ .*$//' ${tmp_dir}/${1}_trimmed${target_gene_suffix}.tmp | awk '!/^>/ { printf "%s", $0; n="\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' > ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp 35 | 36 | ## adding gap-sequences for genomes missing the current gene ## 37 | # finding here which ones have it 38 | grep ">" ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp | tr -d ">" | sort > ${tmp_dir}/${1}_genomes_with_gene.tmp 39 | 40 | # now getting which ones don't have it 41 | comm -23 ${tmp_dir}/final_genomes_from_all_sources.tmp ${tmp_dir}/${1}_genomes_with_gene.tmp | sort > ${tmp_dir}/${1}_needed_gappers.tmp 42 | 43 | # creating gap-sequences if needed 44 | if [ -s ${tmp_dir}/${1}_needed_gappers.tmp ]; then 45 | 46 | # making a headers file for when making fasta in a few steps: 47 | sed 's/^/>/' ${tmp_dir}/${1}_needed_gappers.tmp > ${tmp_dir}/${1}_needed_headers.tmp 48 | 49 | # getting length of the alignment for the current gene: 50 | aln_length_tmp=$(sed -n '2p' ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp | wc -c | tr -s " " | cut -f2 -d " ") 51 | # subtracting 1 for newline characters 52 | aln_length_tmp=$(echo "$aln_length_tmp"-1 | bc) 53 | # making a string of gaps the length of the alignment for those missing it: 54 | gap_seq=$(printf "%0.s-" $(seq 1 1 $aln_length_tmp)) 55 | # making as many gap sequences as there are genomes missing the current gene: 56 | num_genomes_to_add=$(wc -l ${tmp_dir}/${1}_needed_gappers.tmp | tr -s " " "\t" | cut -f2) 57 | for i in $(cat ${tmp_dir}/${1}_needed_gappers.tmp) 58 | do 59 | echo "$gap_seq" 60 | done > ${tmp_dir}/${1}_gaps.tmp 61 | 62 | # making fasta of those genomes missing the current gene: 63 | paste -d "\n" ${tmp_dir}/${1}_needed_headers.tmp ${tmp_dir}/${1}_gaps.tmp > ${tmp_dir}/${1}_missing_genomes${target_gene_suffix}.tmp 64 | # 
catting the genomes missing the current gene together with those that have it 65 | cat ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp ${tmp_dir}/${1}_missing_genomes${target_gene_suffix}.tmp > ${tmp_dir}/${1}${target_gene_suffix}.tmp 66 | else 67 | mv ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp ${tmp_dir}/${1}${target_gene_suffix}.tmp 68 | fi 69 | 70 | ## reordering the final fasta of this gene so that all gene sets can be pasted together at end ## 71 | gtt-reorder-fasta -i ${tmp_dir}/${1}${target_gene_suffix}.tmp -w ${tmp_dir}/final_genomes_from_all_sources.tmp -o ${tmp_dir}/${1}_all_aligned${target_gene_suffix} 72 | 73 | printf "\n\n\n -------------------------------------------------------------------------- \n" 74 | printf "\t Finished aligning and formatting gene-set ${GREEN}$1${NC}.\n" 75 | printf " -------------------------------------------------------------------------- \n" 76 | -------------------------------------------------------------------------------- /bin/gtt-amino-acid-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | num_cpus=$4 12 | hmm_target_genes_total=$5 13 | output_dir=$6 14 | best_hit_mode=$7 15 | additional_pfam_targets=$8 16 | ko_targets=$9 17 | target_KOs=${10} 18 | 19 | ### kill backstop 20 | # if there is a problem, all child processes launched (by this script) will exit immediately, 21 | # upon returning to main script, will check and terminate parent process 22 | if [ -s ${tmp_dir}/kill_amino_acid_parallel.problem ]; then 23 | exit 24 | fi 25 | 26 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 27 | if $(file $1 | grep -q "gzip"); then 28 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 29 | file_location=${1%.*} 30 | gunzip -f -c $1 > $file_location 31 | assembly="$(basename ${file_location%.*})" 32 | else 33 | file_location=$1 34 | assembly="$(basename ${1%.*})" 35 | was_gzipped=FALSE 36 | fi 37 | 38 | printf " -------------------------------------------------------------------------- \n\n" 39 | printf " Genome: ${GREEN}$assembly${NC}\n" 40 | 41 | # adding assembly to ongoing genomes list 42 | echo $assembly >> ${tmp_dir}/amino_acid_genomes_list.tmp 43 | 44 | num=$((num+1)) # to track progress 45 | 46 | 47 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 48 | gtt-filter-seqs-by-length -q -i ${file_location} -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes1.tmp 49 | 50 | ## renaming seqs to have assembly name (also to ensure simple headers) 51 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 52 | 53 | 54 | ## removing gunzipped genome file if it was gunzipped 55 | if [ $was_gzipped == "TRUE" ]; then 56 | rm -rf $file_location 57 | fi 58 | 59 | 60 | ## exiting here and reporting current input file if something is wrong with it and didn't get coding sequences 61 | if [ ! 
-s ${tmp_dir}/${assembly}_genes.tmp ]; then 62 | printf "$assembly" >> ${tmp_dir}/kill_amino_acid_parallel.problem 63 | exit 64 | fi 65 | 66 | 67 | ### running hmm search ### 68 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 69 | 70 | ### calculating % completion and redundancy ### 71 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 72 | do 73 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 74 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 75 | 76 | ## making list here of only those present in exactly 1 copy 77 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 78 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 79 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 80 | 81 | ## adding SCG-hit counts to table 82 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 83 | 84 | # total number of unique SCG hits 85 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 86 | 87 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 88 | 89 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 90 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 91 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 92 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 93 | 94 | ### want to put an explicit notice out if estimated redundancy is greater than 10% 95 | # needs to be an integer for bash comparison, so multiplying by 100 first 96 | 97 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 98 | 99 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 100 | 101 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 102 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 103 | 104 | 105 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 106 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 107 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 108 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 109 | printf " You may want to consider taking a closer look and/or removing it from the\n" 110 | printf " from the input genomes.\n\n" 111 | 112 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 113 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 114 | 115 | # writing to table of genomes with questionable redundancy estimates 116 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 117 | 118 | else 119 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 120 | fi 121 | 122 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 123 | taxid="NA" 124 | 125 | ## writing summary info to table ## 126 | printf "$assembly\t$1\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Amino_acid_genomes_summary_info.tsv 127 | 128 | ### Pulling out hits for this genome ### 129 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 130 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 131 | 132 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 133 | if [ $best_hit_mode == "false" ]; then 134 | 135 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 136 | do 137 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 138 | done 139 | 140 | # if best-hit mode is on, taking best hit 141 | else 142 | 143 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 144 | do 145 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 146 | done 147 | 148 | fi 149 | 150 | ## searching for additional targets if provided 151 | # getting count of genes if there are additional targets 152 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 153 | 154 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 155 | 156 | fi 157 | 158 | ## KOs 159 | if [ $ko_targets == "true" ]; then 160 | 161 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 162 | 163 | fi 164 | 165 | ## Pfams 166 | if [ $additional_pfam_targets == "true" ]; then 167 | 168 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 169 | 170 | fi 171 | 172 | rm -rf ${tmp_dir}/${assembly}_genes1.tmp ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 173 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 174 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 175 | -------------------------------------------------------------------------------- /bin/gtt-amino-acid-serial.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | amino_acid_genomes_total=$4 12 | num_cpus=$5 13 | hmm_target_genes_total=$6 14 | output_dir=$7 15 | best_hit_mode=$8 16 | additional_pfam_targets=${9} 17 | ko_targets=${10} 18 | target_KOs=${11} 19 | 20 | # looping through the lines of the provided [-f] file (this loop operates on one genome at a time) 21 | while IFS=$'\t' read -r -a file 22 | do 23 | 24 | ### kill backstop 25 | # if there is a problem on any iteration, exiting this 
subprocess and then exiting main script with report of problem assembly 26 | if [ -s ${tmp_dir}/kill_amino_acid_serial.problem ]; then 27 | exit 28 | fi 29 | 30 | 31 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 32 | if $(file $file | grep -q "gzip"); then 33 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 34 | file_location=${file%.*} 35 | gunzip -f -c $file > $file_location 36 | assembly="$(basename ${file_location%.*})" 37 | else 38 | file_location=$file 39 | assembly="$(basename ${file%.*})" 40 | was_gzipped=FALSE 41 | fi 42 | 43 | # adding assembly to ongoing genomes list 44 | echo $assembly >> ${tmp_dir}/amino_acid_genomes_list.tmp 45 | 46 | num=$((num+1)) # to track progress 47 | 48 | printf " -------------------------------------------------------------------------- \n" 49 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $amino_acid_genomes_total total.\n" 50 | printf " -------------------------------------------------------------------------- \n\n" 51 | 52 | 53 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 54 | gtt-filter-seqs-by-length -q -i ${file_location} -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes1.tmp 55 | 56 | ## renaming seqs to have assembly name (also to ensure simple headers) 57 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 58 | 59 | ## removing gunzipped genome file if it was gunzipped 60 | if [ $was_gzipped == "TRUE" ]; then 61 | rm -rf $file_location 62 | fi 63 | 64 | ## exiting here and reporting current input file if something is wrong with it and didn't get coding sequences 65 | if [ ! 
-s ${tmp_dir}/${assembly}_genes.tmp ]; then 66 | printf "$assembly" >> ${tmp_dir}/kill_amino_acid_serial.problem 67 | exit 68 | fi 69 | 70 | printf " Performing HMM search...\n" 71 | 72 | ### running hmm search ### 73 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 74 | 75 | ### calculating % completion and redundancy ### 76 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 77 | do 78 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 79 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 80 | 81 | ## making list here of only those present in exactly 1 copy 82 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 83 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 84 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 85 | 86 | ## adding SCG-hit counts to table 87 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 88 | 89 | # total number of unique SCG hits 90 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 91 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 92 | 93 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 94 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 95 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 96 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 97 | 98 | # want to put a notice out if estimated redundancy is greater than 10 99 | # needs to be an integer for bash comparison, so multiplying by 100 first 100 | 101 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 102 | 103 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 104 | 105 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 106 | 107 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 108 | 109 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 110 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 111 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 112 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 113 | printf " You may want to consider taking a closer look and/or removing it from the\n" 114 | printf " from the input genomes.\n\n" 115 | 116 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 117 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 118 | 119 | # writing to table of genomes with questionable redundancy estimates 120 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 121 | 122 | else 123 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 124 | 125 | fi 126 | 127 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 128 | taxid="NA" 129 | 130 | ## writing summary info to table ## 131 | printf "$assembly\t$file\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Amino_acid_genomes_summary_info.tsv 132 | 133 | ### Pulling out hits for this genome ### 134 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 135 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 136 | 137 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 138 | if [ $best_hit_mode == "false" ]; then 139 | 140 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 141 | do 142 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 143 | done 144 | 145 | # if best-hit mode is on, taking best hit 146 | else 147 | 148 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 149 | do 150 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 151 | done 152 | 153 | fi 154 | 155 | ## searching for additional targets if provided 156 | # getting count of genes if there are additional targets 157 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 158 | 159 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 160 | 161 | fi 162 | 163 | ## KOs 164 | if [ $ko_targets == "true" ]; then 165 | 166 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 167 | 168 | fi 169 | 170 | ## Pfams 171 | if [ $additional_pfam_targets == "true" ]; then 172 | 173 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 174 | 175 | fi 176 | 177 | rm -rf ${tmp_dir}/${assembly}_genes1.tmp ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 178 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 179 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 180 | 181 | done < $1 182 | -------------------------------------------------------------------------------- /bin/gtt-append-fasta-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script will modify headers of sequences of a multifasta, specific for use in GToTree.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Starting fasta file", action="store", dest="input_fasta", required=True) 12 | parser.add_argument("-w", "--desired_append", help='Name to append to seqs (default: "Seq"', action="store", dest="wanted_name", default="Seq") 13 
| parser.add_argument("-o", "--output_fasta_name", help='Output fasta file (default: "Renamed.fasta").', dest="output_fasta_name", default="Renamed.fasta") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | in_fasta = open(args.input_fasta, "r") 22 | new_header = args.wanted_name 23 | out_fasta = open(args.output_fasta_name, "w") 24 | 25 | n = 0 26 | 27 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 28 | n = n + 1 29 | out_fasta.write(">" + new_header + "_" + seq_record.id + "_" + str(n) + "\n") 30 | out_fasta.write(str(seq_record.seq) + "\n") 31 | 32 | in_fasta.close() 33 | out_fasta.close() 34 | -------------------------------------------------------------------------------- /bin/gtt-cat-alignments: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from glob import glob 5 | import argparse 6 | import os.path 7 | 8 | parser = argparse.ArgumentParser(description='This script is a helper script to concatenate fasta-formatted multiple sequence alignment files, and generate partitions file.') 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-t", "--tmp-dir", help="The working tmp_dir for the current GToTree run", action="store", dest="tmp_dir", required=True) 13 | required.add_argument("-o", "--output-dir", help="The output_dir for the current GToTree run", action="store", dest="output_dir", required=True) 14 | parser.add_argument("--nucleotides", help="Provide this flag if user specified nucleotide mode", action="store_true") 15 | 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | tmp_dir = args.tmp_dir + "/" 24 | output_dir = args.output_dir + "/" 25 | 26 | # getting list of all alignment files 27 | if not args.nucleotides: 28 | 29 | list_of_alignment_files = glob(tmp_dir + "*_all_aligned.faa") 30 | 31 | else: 32 | 33 | list_of_alignment_files = glob(tmp_dir + "*_all_aligned.fa") 34 | 35 | # initializing dictionary that will hold headers as keys and a list of all seqs to be cat'd as values 36 | dict_of_genomes = {} 37 | 38 | # getting headers (they are the same in all files and all are found in all files at this point, so only need to pull from one) 39 | with open (list_of_alignment_files[0]) as file: 40 | for line in file: 41 | if line.strip().startswith(">"): 42 | dict_of_genomes[(line.strip().lstrip(">"))] = [] 43 | 44 | 45 | # iterating through all files adding seqs 46 | for file in list_of_alignment_files: 47 | with open(file) as fasta: 48 | curr_header="" 49 | for line in fasta: 50 | line = line.strip() 51 | if line.startswith(">"): 52 | curr_header=line.lstrip(">") 53 | else: 54 | dict_of_genomes[curr_header].append(line) 55 | 56 | 57 | # writing out concatenated (horizontally) sequence file 58 | 59 | if not args.nucleotides: 60 | with open(output_dir + "Aligned_SCGs.faa", "w") as out: 61 | for header, seqs in dict_of_genomes.items(): 62 | out.write(">" + header + "\n") 63 | out.write("XXXXX".join(seqs) + "\n") 64 | 65 | else: 66 | with open(output_dir + "Aligned_SCGs.fa", "w") as out: 67 | for header, seqs in dict_of_genomes.items(): 68 | out.write(">" + header + "\n") 69 | out.write("NNNNNN".join(seqs) + "\n") 70 | 71 | # making partitions file 72 | # getting list of gene names in order they were cat'd together 73 | if not args.nucleotides: 74 | gene_list = [os.path.basename(x)[:-16] for x in 
list_of_alignment_files] 75 | else: 76 | gene_list = [os.path.basename(x)[:-15] for x in list_of_alignment_files] 77 | 78 | # all are same length, so just need one genome entry, then to count the bases per element in dict values list, and add 5 for the XXXXX spacers 79 | # getting all alignment lengths 80 | 81 | alignment_lengths_list = [len(x) for x in list(dict_of_genomes.values())[0]] 82 | 83 | curr_start = 1 84 | curr_stop = 0 85 | 86 | with open(output_dir + "Partitions.txt", "w") as out: 87 | for i in range(0,len(gene_list)): 88 | curr_stop = curr_start + alignment_lengths_list[i] - 1 89 | 90 | if not args.nucleotides: 91 | out.write("AA, " + str(gene_list[i]) + " = " + str(curr_start) + "-" + str(curr_stop) + "\n") 92 | curr_start = curr_stop + 6 93 | else: 94 | out.write("DNA, " + str(gene_list[i]) + " = " + str(curr_start) + "-" + str(curr_stop) + "\n") 95 | curr_start = curr_stop + 7 96 | -------------------------------------------------------------------------------- /bin/gtt-check-or-setup-GTDB-files: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | for setting up reference files for the glorious Genome Taxonomy Database (gtdb.ecogenomic.org/). 6 | 7 | For examples, please visit the GToTree wiki here: https://github.com/AstrobioMike/GToTree/wiki/example-usage 8 | """ 9 | 10 | import sys 11 | import os 12 | import urllib.request 13 | import pandas as pd 14 | import textwrap 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser(description="This is a helper program to facilitate setting up the reference files for the \ 18 | glorious Genome Taxonomy Database (gtdb.ecogenomic.org). 
It's really meant for internal \ 19 | use only by the main GToTree program.") 20 | 21 | args = parser.parse_args() 22 | 23 | ################################################################################ 24 | 25 | def main(): 26 | 27 | ## checking env variable is set and writable 28 | check_location_var_is_set_and_writable("GTDB_dir") 29 | 30 | ## setting up ref GTDB files if needed 31 | check_and_or_get_gtdb_files(os.environ["GTDB_dir"]) 32 | 33 | ################################################################################ 34 | 35 | 36 | # setting some colors 37 | tty_colors = { 38 | 'green' : '\033[0;32m%s\033[0m', 39 | 'yellow' : '\033[0;33m%s\033[0m', 40 | 'red' : '\033[0;31m%s\033[0m' 41 | } 42 | 43 | 44 | ### functions ### 45 | def color_text(text, color='green'): 46 | if sys.stdout.isatty(): 47 | return tty_colors[color] % text 48 | else: 49 | return text 50 | 51 | 52 | def wprint(text): 53 | print(textwrap.fill(text, width=80, initial_indent=" ", 54 | subsequent_indent=" ", break_on_hyphens=False)) 55 | 56 | 57 | def check_location_var_is_set_and_writable(variable): 58 | 59 | # making sure there is an env variable 60 | try: 61 | path = os.environ[variable] 62 | 63 | if path == "": 64 | raise 65 | 66 | except: 67 | print() 68 | wprint(color_text("The environment variable '" + str(variable) + "' does not seem to be set, and we need it if wanting to add GTDB taxonomic lineages :(", "red")) 69 | print() 70 | wprint("Try to set it with `gtt-data-locations set`, then run GToTree again.") 71 | print("\nExiting for now.\n") 72 | sys.exit(1) 73 | 74 | # making sure path is writable for the user 75 | path_writable = os.access(path, os.W_OK) 76 | 77 | if not path_writable: 78 | print() 79 | wprint(color_text("The environment variable '" + str(variable) + "' does not seem to be writable, and we need it to be if wanting to add GTDB taxonomic lineages :(", "red")) 80 | print() 81 | wprint("Try to set it somewhere else with `gtt-data-locations set`, then run GToTree again.") 82 | print("\nExiting for now.\n") 83 | sys.exit(1) 84 | 85 | return() 86 | 87 | 88 | def gen_gtdb_tab(location): 89 | """ downloads and parses the GTDB info tables """ 90 | 91 | # getting archaea 92 | # arc_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz") 93 | arc_tsv_gz = urllib.request.urlopen("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz") 94 | arc_tab = pd.read_csv(arc_tsv_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False) 95 | arc_tab.rename(columns={arc_tab.columns[0]:"accession"}, inplace=True) 96 | arc_tab.dropna(inplace=True, how="all") 97 | 98 | # getting bacteria 99 | # bac_tsv_gz = urllib.request.urlopen("https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz") 100 | bac_tsv_gz = urllib.request.urlopen("https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz") 101 | bac_tab = pd.read_csv(bac_tsv_gz, sep="\t", compression="gzip", on_bad_lines = 'skip', header=0, low_memory=False) 102 | bac_tab.rename(columns={bac_tab.columns[0]:"accession"}, inplace=True) 103 | bac_tab.dropna(inplace=True, how="all") 104 | 105 | # combining 106 | gtdb_tab = pd.concat([arc_tab, bac_tab]) 107 | 108 | # splitting gtdb taxonomy column into 7 and dropping the single column 109 | domain, phylum, rclass, order, family, genus, species = [], [], [], [], [], [], [] 110 | 111 | for index, row in gtdb_tab.iterrows(): 112 | curr_acc = row["accession"] 113 | tax_list = 
row["gtdb_taxonomy"].split(";") 114 | 115 | if len(tax_list) != 7: 116 | wprint(color_text("GTDB entry " + curr_acc + " doesn't seem to have 7-column lineage info. Something is likely wrong :(", "yellow")) 117 | print("") 118 | wprint("If this continues to happen, please file an issue at github.com/AstrobioMike/GToTree/issues") 119 | print("") 120 | wprint("Aborting for now.") 121 | print("") 122 | sys.exit(0) 123 | 124 | else: 125 | domain.append(tax_list[0][3:]) 126 | phylum.append(tax_list[1][3:]) 127 | rclass.append(tax_list[2][3:]) 128 | order.append(tax_list[3][3:]) 129 | family.append(tax_list[4][3:]) 130 | genus.append(tax_list[5][3:]) 131 | species.append(tax_list[6][3:]) 132 | 133 | gtdb_tab.insert(1, "species", species) 134 | gtdb_tab.insert(1, "genus", genus) 135 | gtdb_tab.insert(1, "family", family) 136 | gtdb_tab.insert(1, "order", order) 137 | gtdb_tab.insert(1, "class", rclass) 138 | gtdb_tab.insert(1, "phylum", phylum) 139 | gtdb_tab.insert(1, "domain", domain) 140 | 141 | # writing out 142 | gtdb_tab.to_csv(location + "GTDB-arc-and-bac-metadata.tsv", index=False, sep="\t") 143 | 144 | gtdb_version_info = urllib.request.urlretrieve("https://data.gtdb.ecogenomic.org/releases/latest/VERSION.txt", location + "GTDB-version-info.txt") 145 | 146 | 147 | def check_and_or_get_gtdb_files(GTDB_dir): 148 | """ checks for and sets up ref GTDB files if needed """ 149 | 150 | if os.path.exists(GTDB_dir + "GTDB-arc-and-bac-metadata.tsv") and os.path.exists(GTDB_dir + "GTDB-version-info.txt"): 151 | 152 | sys.exit(0) 153 | 154 | # generating when table doesn't exist yet 155 | else: 156 | wprint(color_text("Downloading and parsing archaeal and bacterial metadata tables from GTDB (only needs to be done once)...", "yellow")) 157 | print("") 158 | 159 | gen_gtdb_tab(GTDB_dir) 160 | 161 | 162 | if __name__ == "__main__": 163 | main() 164 | -------------------------------------------------------------------------------- /bin/gtt-check-wanted-lineage-info: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='This script is for making sure the user-specified desired lineage info is interpretable.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-w", "--wanted_ranks", help="Single-column file with wanted ranks", action="store", dest="wanted_ranks", required=True) 11 | parser.add_argument("-o", "--output_file_with_uninterpretable_ranks", help='Output file default: "gtotree.uninterpretable_ranks.tmp"', action="store", dest="output_file", default="gtotree.uninterpretable_ranks.tmp") 12 | 13 | if len(sys.argv)==1: 14 | parser.print_help(sys.stderr) 15 | sys.exit(0) 16 | 17 | args = parser.parse_args() 18 | 19 | out_file = open(args.output_file, "w") 20 | 21 | acceptable_ranks = ["domain","phylum","class","order","family","genus","species","strain"] 22 | 23 | with open(args.wanted_ranks, "r") as wanted_ranks: 24 | for line in wanted_ranks: 25 | curr_line = line.strip() 26 | lower_line = curr_line.lower() 27 | 28 | if lower_line not in acceptable_ranks: 29 | out_file.write(str(curr_line) + "\n") 30 | 31 | out_file.close() 32 | -------------------------------------------------------------------------------- /bin/gtt-clean-after-test.sh: -------------------------------------------------------------------------------- 1 | rm -rf GToTree-test-data/ GToTree-test-output/ 2 | 
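For context, this one-liner pairs with `gtt-test.sh`: the two directories it removes are presumably the ones a test run leaves behind. A plausible local check-then-clean cycle (the `http` variant is the option noted in the v1.8.2 changelog entry, for situations where the default ftp-based downloads are problematic) would be:

    gtt-test.sh               # run GToTree's built-in end-to-end test
    # gtt-test.sh http        # alternative noted in the changelog for http-based downloads
    gtt-clean-after-test.sh   # remove GToTree-test-data/ and GToTree-test-output/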
-------------------------------------------------------------------------------- /bin/gtt-combine-kofamscan-results.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | all_assembly_ids=${1} 4 | tmp_dir=${2} 5 | output_dir=${3} 6 | unique_target_KOs=${4} 7 | 8 | KO_output_dir="${output_dir}/KO_search_results" 9 | KO_hits_fasta_output_dir="${KO_output_dir}/KO_hit_seqs/" 10 | 11 | mkdir -p ${KO_hits_fasta_output_dir} 12 | 13 | # combining all fasta files for each individual KO 14 | for ko in $(cat ${unique_target_KOs}); do 15 | 16 | find ${tmp_dir}/kofamscan/ -name ${ko}.faa -exec cat {} \; > ${KO_hits_fasta_output_dir}/${ko}-hits.faa 17 | 18 | # removing if there were none 19 | if [ ! -s ${KO_hits_fasta_output_dir}/${ko}-hits.faa ]; then 20 | 21 | rm ${KO_hits_fasta_output_dir}/${ko}-hits.faa 22 | 23 | fi 24 | 25 | done 26 | 27 | # combining counts into one table 28 | final_counts_tab="${KO_output_dir}/KO-hit-counts.tsv" 29 | 30 | 31 | # starting first row 32 | # cat <( printf "KO_ID\n" ) ${unique_target_KOs} > ${building_counts_tab} 33 | paste <( printf "assembly_id\ttotal_gene_count" ) <( tr "\n" "\t" < ${unique_target_KOs} | sed 's/\t$/\n/' ) > ${final_counts_tab} 34 | 35 | # looping through assemblies and adding them 36 | for assembly_id in $(cat ${all_assembly_ids}); do 37 | 38 | cat ${tmp_dir}/kofamscan/${assembly_id}/KO-counts.txt >> ${final_counts_tab} 39 | 40 | done 41 | -------------------------------------------------------------------------------- /bin/gtt-count-bases-per-seq: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import sys 6 | 7 | parser = argparse.ArgumentParser(description="This script takes a multifasta as input and returns a tab-delimited file with two columns, header and number of bases or amino acids, for each sequence." 
) 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Original fasta file", action="store", dest="input_fasta") 12 | parser.add_argument("-o", "--output_txt_file", help='Name of output txt file (default: "Num_bps.txt")', action="store", dest="output_file", default="Num_bps.txt") 13 | 14 | if len(sys.argv)==1: 15 | parser.print_help(sys.stderr) 16 | sys.exit(0) 17 | 18 | args = parser.parse_args() 19 | 20 | in_fasta = open(args.input_fasta, "r") 21 | out_file = open(args.output_file, "w") 22 | 23 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 24 | out_file.write(seq_record.id + "\t" + str(len(seq_record.seq)) + "\n") 25 | 26 | in_fasta.close() 27 | out_file.close() 28 | 29 | 30 | -------------------------------------------------------------------------------- /bin/gtt-fasta-parallel-nt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | num_cpus=$4 12 | hmm_target_genes_total=$5 13 | output_dir=$6 14 | best_hit_mode=$7 15 | additional_pfam_targets=$8 16 | ko_targets=$9 17 | target_KOs=${10} 18 | 19 | 20 | ### kill backstop 21 | # if there is a problem, all child processes launched (by this script) will exit immediately, 22 | # upon returning to main script, will check and terminate parent process 23 | if [ -s ${tmp_dir}/kill_fasta_parallel.prodigal ]; then 24 | exit 25 | fi 26 | 27 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 28 | if $(file $1 | grep -q "gzip"); then 29 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 30 | file_location=${1%.*} 31 | gunzip -f -c $1 > $file_location 32 | assembly="$(basename ${file_location%.*})" 33 | else 34 | file_location=$1 35 | assembly="$(basename ${1%.*})" 36 | was_gzipped=FALSE 37 | fi 38 | 39 | printf " -------------------------------------------------------------------------- \n\n" 40 | printf " Genome: ${GREEN}$assembly${NC}\n" 41 | 42 | # adding assembly to ongoing genomes list 43 | echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp 44 | 45 | num=$((num+1)) # to track progress 46 | 47 | prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.faa.tmp -d ${tmp_dir}/${assembly}_genes1.fa.tmp > /dev/null 2> ${file_location}_prodigal.stderr 48 | 49 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 50 | printf "$assembly\n" >> ${tmp_dir}/kill_fasta_parallel.prodigal 51 | rm -rf ${file_location}_prodigal.stderr 52 | exit 53 | else 54 | rm -rf ${file_location}_prodigal.stderr 55 | fi 56 | 57 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.faa.tmp > ${tmp_dir}/${assembly}_genes2.faa.tmp 58 | 59 | ## removing gunzipped genome file if it was gunzipped 60 | if [ $was_gzipped == "TRUE" ]; then 61 | rm -rf $file_location 62 | fi 63 | 64 | ## renaming seqs to have assembly name to avoid problems with odd characters and how hmmer parses and such 65 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.faa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.faa.tmp 66 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.fa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.fa.tmp 67 | 68 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 69 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.faa.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.faa.tmp 70 | # don't need to filter the nucleotide fasta, as we will only be looking for some found in the amino-acid fasta 71 | 72 | ### running hmm search ### 73 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.faa.tmp > /dev/null 74 | 75 | ### calculating % completion and redundancy ### 76 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 77 | do 78 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 79 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 80 | 81 | ## making list here of only those present in exactly 1 copy 82 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 83 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 84 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 85 | 86 | ## adding SCG-hit counts to table 87 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 88 | 89 | # total number of unique SCG hits 90 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 91 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 92 | 93 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 94 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 95 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 96 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 97 | 98 | # want to put a notice out if estimated redundancy is greater than 10 99 | # needs to be an integer for bash comparison, so multiplying by 100 first 100 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 101 | 102 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 103 | 104 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 105 | 106 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 107 | 108 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 109 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 110 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 111 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 112 | printf " You may want to consider taking a closer look and/or removing it from the\n" 113 | printf " from the input genomes.\n\n" 114 | 115 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 116 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 117 | 118 | # writing to table of genomes with questionable redundancy estimates 119 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 120 | 121 | else 122 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 123 | 124 | fi 125 | 126 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 127 | taxid="NA" 128 | 129 | ## writing summary info to table ## 130 | printf "$assembly\t$file\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Fasta_genomes_summary_info.tsv 131 | 132 | 133 | ### Pulling out hits for this genome (nucleotide as specified by user) ### 134 | target_genes_suffix="_genes.fa.tmp" 135 | 136 | # indexing 137 | esl-sfetch --index ${tmp_dir}/${assembly}${target_genes_suffix} > /dev/null 138 | 139 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 140 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 141 | if [ $best_hit_mode == "false" ]; then 142 | 143 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 144 | do 145 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 146 | done 147 | 148 | # if best-hit mode is on, taking best hit 149 | else 150 | 151 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 152 | do 153 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 154 | done 155 | 156 | fi 157 | 158 | 159 | ## searching for additional targets if provided 160 | # getting count of genes if there are additional targets 161 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 162 | 163 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.faa.tmp) 164 | 165 | fi 166 | 167 | ## KOs 168 | if [ $ko_targets == "true" ]; then 169 | 170 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 171 | 172 | fi 173 | 174 | ## Pfams 175 | if [ $additional_pfam_targets == "true" ]; then 176 | 177 | 
gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 178 | 179 | fi 180 | 181 | rm -rf ${tmp_dir}/${assembly}_genes*.tmp* 182 | rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 183 | rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 184 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 185 | -------------------------------------------------------------------------------- /bin/gtt-fasta-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | num_cpus=$4 12 | hmm_target_genes_total=$5 13 | output_dir=$6 14 | best_hit_mode=$7 15 | additional_pfam_targets=$8 16 | ko_targets=$9 17 | target_KOs=${10} 18 | 19 | 20 | ### kill backstop 21 | # if there is a problem, all child processes launched (by this script) will exit immediately, 22 | # upon returning to main script, will check and terminate parent process 23 | if [ -s ${tmp_dir}/kill_fasta_parallel.prodigal ]; then 24 | exit 25 | fi 26 | 27 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 28 | if $(file $1 | grep -q "gzip"); then 29 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 30 | file_location=${1%.*} 31 | gunzip -f -c $1 > $file_location 32 | assembly="$(basename ${file_location%.*})" 33 | else 34 | file_location=$1 35 | assembly="$(basename ${1%.*})" 36 | was_gzipped=FALSE 37 | fi 38 | 39 | printf " -------------------------------------------------------------------------- \n\n" 40 | printf " Genome: ${GREEN}$assembly${NC}\n" 41 | 42 | # adding assembly to ongoing genomes list 43 | echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp 44 | 45 | num=$((num+1)) # to track progress 46 | 47 | 48 | prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr 49 | 50 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 51 | printf "$assembly\n" >> ${tmp_dir}/kill_fasta_parallel.prodigal 52 | rm -rf ${file_location}_prodigal.stderr 53 | exit 54 | else 55 | rm -rf ${file_location}_prodigal.stderr 56 | fi 57 | 58 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 59 | 60 | 61 | ## removing gunzipped genome file if it was gunzipped 62 | if [ $was_gzipped == "TRUE" ]; then 63 | rm -rf $file_location 64 | fi 65 | 66 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 67 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes2.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes3.tmp 68 | 69 | ## renaming seqs to have assembly name (also to ensure simple headers) 70 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes3.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 71 | 72 | ### running hmm search ### 73 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 74 | 75 | ### calculating % completion and redundancy ### 76 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 77 | do 78 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 79 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 80 | 81 | ## making list here of only those present in exactly 1 copy 82 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 83 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 84 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 85 | 86 | ## adding SCG-hit counts to table 87 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 88 | 89 | # total number of unique SCG hits 90 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 91 | 92 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 93 | 94 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 95 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 96 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 97 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 98 | 99 | ### want to put an explicit notice out if estimated redundancy is greater than 10% 100 | # needs to be an integer for bash comparison, so multiplying by 100 first 101 | 102 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 103 | 104 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 105 | 106 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 107 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 108 | 109 | 110 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 111 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 112 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 113 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 114 | printf " You may want to consider taking a closer look and/or removing it from the\n" 115 | printf " from the input genomes.\n\n" 116 | 117 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 118 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 119 | 120 | # writing to table of genomes with questionable redundancy estimates 121 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 122 | 123 | else 124 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 125 | fi 126 | 127 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 128 | taxid="NA" 129 | 130 | ## writing summary info to table ## 131 | printf "$assembly\t$1\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Fasta_genomes_summary_info.tsv 132 | 133 | ### Pulling out hits for this genome ### 134 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 135 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 136 | 137 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 138 | if [ $best_hit_mode == "false" ]; then 139 | 140 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 141 | do 142 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 143 | done 144 | 145 | # if best-hit mode is on, taking best hit 146 | else 147 | 148 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 149 | do 150 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 151 | done 152 | 153 | fi 154 | 155 | ## searching for additional targets if provided 156 | # getting count of genes if there are additional targets 157 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 158 | 159 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 160 | 161 | fi 162 | 163 | ## KOs 164 | if [ $ko_targets == "true" ]; then 165 | 166 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 167 | 168 | fi 169 | 170 | ## Pfams 171 | if [ $additional_pfam_targets == "true" ]; then 172 | 173 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 174 | 175 | fi 176 | 177 | rm -rf 
${tmp_dir}/${assembly}_genes1.tmp ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_genes3.tmp 178 | rm -rf ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 179 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 180 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 181 | -------------------------------------------------------------------------------- /bin/gtt-fasta-serial-nt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | fasta_genomes_total=$4 12 | num_cpus=$5 13 | hmm_target_genes_total=$6 14 | output_dir=$7 15 | best_hit_mode=$8 16 | additional_pfam_targets=$9 17 | ko_targets=${10} 18 | target_KOs=${11} 19 | 20 | # looping through the lines of the provided [-f] file (this loop operates on one genome at a time) 21 | while IFS=$'\t' read -r -a file 22 | do 23 | 24 | ### kill backstop 25 | # if there is a problem on any iteration, exiting this subprocess and then exiting main script with report of problem assembly 26 | if [ -s ${tmp_dir}/kill_fasta_serial.prodigal ]; then 27 | exit 28 | fi 29 | 30 | 31 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 32 | if $(file $file | grep -q "gzip"); then 33 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 34 | file_location=${file%.*} 35 | gunzip -f -c $file > $file_location 36 | assembly="$(basename ${file_location%.*})" 37 | else 38 | file_location=$file 39 | assembly="$(basename ${file%.*})" 40 | was_gzipped=FALSE 41 | fi 42 | 43 | # adding assembly to ongoing genomes list 44 | echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp 45 | 46 | num=$((num+1)) # to track progress 47 | 48 | printf " -------------------------------------------------------------------------- \n" 49 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $fasta_genomes_total total.\n" 50 | printf " -------------------------------------------------------------------------- \n\n" 51 | 52 | printf " Getting coding seqs...\n\n" 53 | 54 | ## running prodigal to get coding sequences 55 | prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.faa.tmp -d ${tmp_dir}/${assembly}_genes1.fa.tmp > /dev/null 2> ${file_location}_prodigal.stderr 56 | 57 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 58 | printf "$assembly\n" >> ${tmp_dir}/kill_fasta_serial.prodigal 59 | rm -rf ${file_location}_prodigal.stderr 60 | 61 | exit 62 | else 63 | rm -rf ${file_location}_prodigal.stderr 64 | fi 65 | 66 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.faa.tmp > ${tmp_dir}/${assembly}_genes2.faa.tmp 67 | 68 | ## removing gunzipped genome file if it was gunzipped 69 | if [ $was_gzipped == "TRUE" ]; then 70 | rm -rf $file_location 71 | fi 72 | 73 | ## renaming seqs to have assembly name to avoid problems with odd characters and how hmmer parses and such 74 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.faa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.faa.tmp 75 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.fa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.fa.tmp 76 | 77 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 78 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.faa.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.faa.tmp 79 | # don't need to filter the nucleotide fasta, as we will only be looking for some found in the amino-acid fasta 80 | 81 | printf " Performing HMM search...\n" 82 | 83 | ### running hmm search ### 84 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.faa.tmp > /dev/null 85 | 86 | ### calculating % completion and redundancy ### 87 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 88 | do 89 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 90 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 91 | 92 | ## making list here of only those present in exactly 1 copy 93 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 94 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 95 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 96 | 97 | ## adding SCG-hit counts to table 98 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 99 | 100 | # total number of unique SCG hits 101 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 102 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 103 | 104 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 105 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 106 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 107 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 108 | 109 | # want to put a notice out if estimated redundancy is greater than 10 110 | # needs to be an integer for bash comparison, so multiplying by 100 first 111 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 112 | 113 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 114 | 115 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 116 | 117 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 118 | 119 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 120 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 121 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 122 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 123 | printf " You may want to consider taking a closer look and/or removing it from the\n" 124 | printf " from the input genomes.\n\n" 125 | 126 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 127 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 128 | 129 | # writing to table of genomes with questionable redundancy estimates 130 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 131 | 132 | else 133 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 134 | 135 | fi 136 | 137 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 138 | taxid="NA" 139 | 140 | ## writing summary info to table ## 141 | printf "$assembly\t$file\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Fasta_genomes_summary_info.tsv 142 | 143 | 144 | ### Pulling out hits for this genome (nucleotide as specified by user) ### 145 | target_genes_suffix="_genes.fa.tmp" 146 | 147 | # indexing 148 | esl-sfetch --index ${tmp_dir}/${assembly}${target_genes_suffix} > /dev/null 149 | 150 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 151 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 152 | if [ $best_hit_mode == "false" ]; then 153 | 154 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 155 | do 156 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 157 | done 158 | 159 | # if best-hit mode is on, taking best hit 160 | else 161 | 162 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 163 | do 164 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 165 | done 166 | 167 | fi 168 | 169 | 170 | ## searching for additional targets if provided 171 | # getting count of genes if there are additional targets 172 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 173 | 174 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.faa.tmp) 175 | 176 | fi 177 | 178 | ## KOs 179 | if [ $ko_targets == "true" ]; then 180 | 181 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 182 | 183 | fi 184 | 185 | ## Pfams 186 | if [ $additional_pfam_targets == "true" ]; then 187 | 188 | 
gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 189 | 190 | fi 191 | 192 | rm -rf ${tmp_dir}/${assembly}_genes*.tmp* 193 | rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 194 | rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 195 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 196 | 197 | done < $1 198 | -------------------------------------------------------------------------------- /bin/gtt-fasta-serial.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | fasta_genomes_total=$4 12 | num_cpus=$5 13 | hmm_target_genes_total=$6 14 | output_dir=$7 15 | best_hit_mode=$8 16 | additional_pfam_targets=$9 17 | ko_targets=${10} 18 | target_KOs=${11} 19 | 20 | # looping through the lines of the provided [-f] file (this loop operates on one genome at a time) 21 | while IFS=$'\t' read -r -a file 22 | do 23 | 24 | ### kill backstop 25 | # if there is a problem on any iteration, exiting this subprocess and then exiting main script with report of problem assembly 26 | if [ -s ${tmp_dir}/kill_fasta_serial.prodigal ]; then 27 | exit 28 | fi 29 | 30 | 31 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 32 | if $(file $file | grep -q "gzip"); then 33 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 34 | file_location=${file%.*} 35 | gunzip -f -c $file > $file_location 36 | assembly="$(basename ${file_location%.*})" 37 | else 38 | file_location=$file 39 | assembly="$(basename ${file%.*})" 40 | was_gzipped=FALSE 41 | fi 42 | 43 | # adding assembly to ongoing genomes list 44 | echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp 45 | 46 | num=$((num+1)) # to track progress 47 | 48 | printf " -------------------------------------------------------------------------- \n" 49 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $fasta_genomes_total total.\n" 50 | printf " -------------------------------------------------------------------------- \n\n" 51 | 52 | printf " Getting coding seqs...\n\n" 53 | 54 | ## running prodigal to get coding sequences 55 | prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr 56 | 57 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 58 | printf "$assembly\n" >> ${tmp_dir}/kill_fasta_serial.prodigal 59 | rm -rf ${file_location}_prodigal.stderr 60 | 61 | exit 62 | else 63 | rm -rf ${file_location}_prodigal.stderr 64 | fi 65 | 66 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 67 | 68 | ## removing gunzipped genome file if it was gunzipped 69 | if [ $was_gzipped == "TRUE" ]; then 70 | rm -rf $file_location 71 | fi 72 | 73 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 74 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes2.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes3.tmp 75 | 76 | ## renaming seqs to have assembly name (also to ensure simple headers) 77 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes3.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 78 | 79 | printf " Performing HMM search...\n" 80 | 81 | ### running hmm search ### 82 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 83 | 84 | ### calculating % completion and redundancy ### 85 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 86 | do 87 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 88 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 89 | 90 | ## making list here of only those present in exactly 1 copy 91 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 92 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 93 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 94 | 95 | ## adding SCG-hit counts to table 96 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 97 | 98 | # total number of unique SCG hits 99 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 100 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 101 | 102 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 103 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 104 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 105 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 106 | 107 | # want to put a notice out if estimated redundancy is greater than 10 108 | # needs to be an integer for bash comparison, so multiplying by 100 first 109 | 110 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 111 | 112 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 113 | 114 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 115 | 116 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 117 | 118 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 119 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 120 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 121 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 122 | printf " You may want to consider taking a closer look and/or removing it from the\n" 123 | printf " from the input genomes.\n\n" 124 | 125 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 126 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 127 | 128 | # writing to table of genomes with questionable redundancy estimates 129 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 130 | 131 | else 132 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 133 | 134 | fi 135 | 136 | # adding NA for taxid so final table can still have the column and lineage for those that do have them 137 | taxid="NA" 138 | 139 | ## writing summary info to table ## 140 | printf "$assembly\t$file\t$taxid\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Fasta_genomes_summary_info.tsv 141 | 142 | 143 | ### Pulling out hits for this genome ### 144 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 145 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 146 | 147 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 148 | if [ $best_hit_mode == "false" ]; then 149 | 150 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 151 | do 152 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 153 | done 154 | 155 | # if best-hit mode is on, taking best hit 156 | else 157 | 158 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 159 | do 160 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 161 | done 162 | 163 | fi 164 | 165 | 166 | ## searching for additional targets if provided 167 | # getting count of genes if there are additional targets 168 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 169 | 170 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 171 | 172 | fi 173 | 174 | ## KOs 175 | if [ $ko_targets == "true" ]; then 176 | 177 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 178 | 179 | fi 180 | 181 | ## Pfams 182 | if [ $additional_pfam_targets == "true" ]; then 183 | 184 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 185 | 186 | fi 187 | 188 
| rm -rf ${tmp_dir}/${assembly}_genes1.tmp ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_genes3.tmp 189 | rm -rf ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 190 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 191 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 192 | 193 | done < $1 194 | -------------------------------------------------------------------------------- /bin/gtt-filter-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | len_cutoff=$3 11 | nucleotide=$4 12 | 13 | ### filtering out sequences that are too long or too short ### 14 | if [ $nucleotide != 'false' ]; then 15 | target_gene_suffix=".fa" 16 | else 17 | target_gene_suffix=".faa" 18 | fi 19 | 20 | 21 | gtt-count-bases-per-seq -i ${tmp_dir}/${1}_hits${target_gene_suffix} -o ${tmp_dir}/${1}_Num_bps.tmp 22 | cut -f2 ${tmp_dir}/${1}_Num_bps.tmp > ${tmp_dir}/${1}_lengths.tmp 23 | median=$(gtt-get-median.sh ${tmp_dir}/${1}_lengths.tmp) 24 | buff=$(echo "$median * $len_cutoff" | bc) 25 | min_len=$(echo "$median - $buff" | bc) 26 | min_len_rnd=$(printf "%.0f\n" $min_len) 27 | max_len=$(echo "$median + $buff" | bc) 28 | max_len_rnd=$(printf "%.0f\n" $max_len) 29 | 30 | gtt-filter-seqs-by-length -i ${tmp_dir}/${1}_hits${target_gene_suffix} -m $min_len_rnd -M $max_len_rnd -o ${tmp_dir}/${1}_hits_filtered.tmp > ${tmp_dir}/${1}_filter.out.tmp 31 | 32 | cat <(printf "\n Filtering ${GREEN}${1}${NC} sequences by length...\n") ${tmp_dir}/${1}_filter.out.tmp 33 | 34 | rm ${tmp_dir}/${1}_Num_bps.tmp ${tmp_dir}/${1}_lengths.tmp ${tmp_dir}/${1}_filter.out.tmp 35 | -------------------------------------------------------------------------------- /bin/gtt-filter-seqs-by-length: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import sys 6 | 7 | parser = argparse.ArgumentParser(description="This script takes a multifasta as input and filters out sequences based on length.") 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Original fasta file", action="store", dest="input_fasta") 12 | required.add_argument("-m", "--min_length", help="minimum length retained", action="store", dest="min_len") 13 | required.add_argument("-M", "--max_length", help="maximum length retained", action="store", dest="max_len") 14 | parser.add_argument("-o", "--output_file", help='name of output fasta file (default: "filtered.fasta")', action="store", dest="output_file", default="filtered.fasta") 15 | parser.add_argument("-q", "--quiet", help="don't report percentage of retained sequences", action = "store_true") 16 | 17 | if len(sys.argv)==1: 18 | parser.print_help(sys.stderr) 19 | sys.exit(0) 20 | 21 | args = parser.parse_args() 22 | 23 | in_fasta = open(args.input_fasta, "r") 24 | out_file = open(args.output_file, "w") 25 | min_len = args.min_len 26 | max_len = args.max_len 27 | 28 | total=0 29 | kept=0 30 | 31 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 32 | 33 | total+=1 34 | 35 | if len(seq_record.seq) >= int(min_len) and len(seq_record.seq) <= int(max_len): 36 | 37 | kept+=1 38 | out_file.write(">" + 
str(seq_record.description) + "\n" + str(seq_record.seq) + "\n") 39 | 40 | 41 | if not args.quiet: 42 | 43 | perc = round(float(kept) / float(total) * 100, 2) 44 | print("\n\tRetained " + str(kept) + " sequences of the initial " + str(total) + " (" + str(perc) + "%).\n") 45 | 46 | 47 | 48 | in_fasta.close() 49 | out_file.close() 50 | -------------------------------------------------------------------------------- /bin/gtt-gen-KO-iToL-files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$1 10 | output_dir=$2 11 | 12 | ## setting variable holding whether or not any labels were swapped 13 | if grep -q label <(head -n 1 ${output_dir}/Genomes_summary_info.tsv); then 14 | labels_swapped='true' 15 | else 16 | labels_swapped='false' 17 | fi 18 | 19 | curr_target_line=0 20 | 21 | for target in $(cat ${tmp_dir}/uniq_ko_targets.tmp) 22 | do 23 | 24 | curr_target_line=$(($curr_target_line + 1)) 25 | 26 | target_col=$(($curr_target_line + 2)) 27 | 28 | awk -F $'\t' -v col="$target_col" ' $col > 0 { print $1 } ' ${output_dir}/KO_search_results/KO-hit-counts.tsv | tail -n +2 > ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 29 | 30 | if [ -s ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ]; then 31 | 32 | if [ $labels_swapped == 'true' ]; then 33 | 34 | for genome in $(cat ${tmp_dir}/Genomes_with_hits_to_${target}.tmp) 35 | do 36 | 37 | grep -m1 "^$genome" ${output_dir}/Genomes_summary_info.tsv | cut -f 2 38 | 39 | done > ${tmp_dir}/Genome_labels_with_hits_to_${target}.tmp 40 | 41 | paste ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ${tmp_dir}/Genome_labels_with_hits_to_${target}.tmp > ${tmp_dir}/Genomes_with_hits_to_${target}.tsv 42 | 43 | fi 44 | 45 | ## if any, removing those not in final tree before making iToL file 46 | awk -F $'\t' ' $8 == "No" { print $1 } ' ${output_dir}/Genomes_summary_info.tsv | sort > ${tmp_dir}/sorted_genomes_to_leave_out_of_KO_iToL_files.tmp 47 | 48 | if [ -s ${tmp_dir}/sorted_genomes_to_leave_out_of_KO_iToL_files.tmp ]; then 49 | comm -23 <( sort ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ) ${tmp_dir}/sorted_genomes_to_leave_out_of_KO_iToL_files.tmp > ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp 50 | else 51 | cp ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp 52 | fi 53 | 54 | ## making iToL file for each target KO 55 | if [ $labels_swapped == 'true' ]; then 56 | for genome in $(cat ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp) 57 | do 58 | grep -m 1 -w "$genome" ${tmp_dir}/Genomes_with_hits_to_${target}.tsv 59 | done | cut -f 2 > ${tmp_dir}/genomes_for_iToL_for_${target}.tmp 60 | 61 | else 62 | for genome in $(cat ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp) 63 | do 64 | grep -m 1 -w "$genome" ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 65 | done > ${tmp_dir}/genomes_for_iToL_for_${target}.tmp 66 | fi 67 | 68 | printf "DATASET_STYLE\nSEPARATOR SPACE\nDATASET_LABEL $target\nCOLOR #0000ff\nDATA\n" > ${output_dir}/KO_search_results/iToL_files/${target}-iToL.txt 69 | 70 | cat <(sed 's/$/ branch node #0000ff 3 normal/' ${tmp_dir}/genomes_for_iToL_for_${target}.tmp) >> ${output_dir}/KO_search_results/iToL_files/${target}-iToL.txt 71 | 72 | else 73 | rm ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 74 | 75 | fi 76 | 77 | done 78 | -------------------------------------------------------------------------------- 
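Note on the iToL output these helpers produce: gtt-gen-KO-iToL-files.sh above (and gtt-gen-pfam-iToL-files.sh a little further below) assemble iToL "DATASET_STYLE" annotation files with printf and sed, writing a short header that names the dataset and its color, followed by one space-separated styling line per tree tip that had a hit to the target. The sketch below is a minimal, hypothetical Python illustration of that output format only; the function name write_itol_style_file, the example KO ID, and the example genome labels are made up for illustration and are not part of the repo, while the header fields and the "branch node #0000ff 3 normal" styling line mirror the printf and sed calls in the script above.

#!/usr/bin/env python
# minimal sketch (not part of the repo) of the iToL DATASET_STYLE files
# that gtt-gen-KO-iToL-files.sh builds with printf and sed

def write_itol_style_file(target, tip_labels, out_path, color="#0000ff"):
    # 'target' would be something like a KO or Pfam ID; 'tip_labels' are the
    # genome labels, as they appear in the tree, that had hits to that target
    with open(out_path, "w") as out:
        # header block, matching the script's printf
        out.write("DATASET_STYLE\nSEPARATOR SPACE\nDATASET_LABEL " + target + "\nCOLOR " + color + "\nDATA\n")
        # one styling line per genome, matching the script's: sed 's/$/ branch node #0000ff 3 normal/'
        for label in tip_labels:
            out.write(label + " branch node " + color + " 3 normal\n")

# example call with made-up inputs:
# write_itol_style_file("K02588", ["GCF_000195755.1", "GCF_000005845.2"], "K02588-iToL.txt")

The gtt-gen-itol-map helper that follows writes the related iToL "TREE_COLORS" format instead (tab-separated label and branch coloring lines) rather than DATASET_STYLE.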
/bin/gtt-gen-itol-map: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='This script is for creating a standard iToL "label" and/or "branch" color file when given the IDs of the genomes you want to color.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-g", "--target_genomes", help='Single-column file with the genomes to color (need to match the IDs in the tree file, with no "">")', action="store", dest="target_genomes", required=True) 11 | parser.add_argument("-w", "--what_to_color", help='What to color, must be: "branches", "labels", or "both" (default: "both")', action="store", dest="to_color", default="both") 12 | parser.add_argument("-c", "--color", help='Color to use of either: "blue", "green", or "red" (default: "blue", of course, \'cause it\'s the best)', action="store", dest="color", default="blue") 13 | parser.add_argument("-o", "--output_file", help='Output file for iToL (default: "iToL-colors.txt")', action="store", dest="output_file", default="iToL-colors.txt") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | if args.color == "blue": 22 | col = "#0000ff" 23 | elif args.color == "green": 24 | col = "#00a33f" 25 | elif args.color == "red": 26 | col = "#a30000" 27 | else: 28 | print("\n\tSorry, we're not prepared to handle \"" + str(args.color) + "\" as the color... :(\n") 29 | parser.print_help(sys.stderr) 30 | sys.exit(1) 31 | 32 | if args.to_color not in ["both", "branches", "labels"]: 33 | print("\n\tSorry, we're not prepared to handle \"" + str(args.to_color) + "\" as the argument for what to color... 
:(\n") 34 | parser.print_help(sys.stderr) 35 | sys.exit(1) 36 | 37 | target_list = [] 38 | 39 | with open(args.target_genomes, "r") as target_genomes: 40 | for genome in target_genomes: 41 | target_list.append(genome.strip()) 42 | 43 | out_file = open(args.output_file, "w") 44 | 45 | out_file.write("TREE_COLORS\nSEPARATOR TAB\nDATA\n\n") 46 | 47 | # writing lines for coloring labels if needed 48 | if args.to_color in ["both", "labels"]: 49 | 50 | for target in target_list: 51 | out_file.write(str(target) + "\tlabel\t" + str(col) + "\tbold\n") 52 | 53 | # writing lines for coloring branches if needed 54 | if args.to_color in ["both", "branches"]: 55 | 56 | for target in target_list: 57 | out_file.write(str(target) + "\tbranch\t" + str(col) + "\tnormal\t1.5\n") 58 | 59 | out_file.close() 60 | -------------------------------------------------------------------------------- /bin/gtt-gen-pfam-iToL-files.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$1 10 | output_dir=$2 11 | 12 | ## setting variable holding whether or not any labels were swapped 13 | if grep -q label <(head -n 1 ${output_dir}/Genomes_summary_info.tsv); then 14 | labels_swapped='true' 15 | else 16 | labels_swapped='false' 17 | fi 18 | 19 | curr_target_line=0 20 | 21 | for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp) 22 | do 23 | 24 | curr_target_line=$(($curr_target_line + 1)) 25 | 26 | target_col=$(($curr_target_line + 2)) 27 | 28 | awk -F $'\t' -v col="$target_col" ' $col > 0 { print $1 } ' ${output_dir}/Pfam_search_results/Pfam-hit-counts.tsv | tail -n +2 > ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 29 | 30 | if [ -s ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ]; then 31 | 32 | if [ $labels_swapped == 'true' ]; then 33 | 34 | for genome in $(cat ${tmp_dir}/Genomes_with_hits_to_${target}.tmp) 35 | do 36 | 37 | grep -m1 "^$genome" ${output_dir}/Genomes_summary_info.tsv | cut -f 2 38 | 39 | done > ${tmp_dir}/Genome_labels_with_hits_to_${target}.tmp 40 | 41 | paste ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ${tmp_dir}/Genome_labels_with_hits_to_${target}.tmp > ${tmp_dir}/Genomes_with_hits_to_${target}.tsv 42 | 43 | fi 44 | 45 | ## if any, removing those not in final tree before making iToL file 46 | awk -F $'\t' ' $8 == "No" { print $1 } ' ${output_dir}/Genomes_summary_info.tsv | sort > ${tmp_dir}/sorted_genomes_to_leave_out_of_Pfam_iToL_files.tmp 47 | 48 | if [ -s ${tmp_dir}/sorted_genomes_to_leave_out_of_Pfam_iToL_files.tmp ]; then 49 | comm -23 <( sort ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ) ${tmp_dir}/sorted_genomes_to_leave_out_of_Pfam_iToL_files.tmp > ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp 50 | else 51 | cp ${tmp_dir}/Genomes_with_hits_to_${target}.tmp ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp 52 | fi 53 | 54 | ## making iToL file for each additional target pfam 55 | if [ $labels_swapped == 'true' ]; then 56 | for genome in $(cat ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp) 57 | do 58 | grep -m 1 -w "$genome" ${tmp_dir}/Genomes_with_hits_to_${target}.tsv 59 | done | cut -f 2 > ${tmp_dir}/genomes_for_iToL_for_${target}.tmp 60 | 61 | else 62 | for genome in $(cat ${tmp_dir}/genomes_retained_for_${target}_iToL.tmp) 63 | do 64 | grep -m 1 -w "$genome" ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 65 | done > ${tmp_dir}/genomes_for_iToL_for_${target}.tmp 66 | fi 67 | 68 | printf 
"DATASET_STYLE\nSEPARATOR SPACE\nDATASET_LABEL $target\nCOLOR #0000ff\nDATA\n" > ${output_dir}/Pfam_search_results/iToL_files/${target}-iToL.txt 69 | 70 | cat <(sed 's/$/ branch node #0000ff 3 normal/' ${tmp_dir}/genomes_for_iToL_for_${target}.tmp) >> ${output_dir}/Pfam_search_results/iToL_files/${target}-iToL.txt 71 | 72 | else 73 | rm ${tmp_dir}/Genomes_with_hits_to_${target}.tmp 74 | 75 | fi 76 | 77 | done 78 | -------------------------------------------------------------------------------- /bin/gtt-genbank-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | num_cpus=$4 12 | hmm_target_genes_total=$5 13 | output_dir=$6 14 | best_hit_mode=$7 15 | additional_pfam_targets=$8 16 | ko_targets=$9 17 | target_KOs=${10} 18 | 19 | ### kill backstop 20 | # if there is a problem, all child processes launched (by this script) will exit immediately, 21 | # upon returning to main script, will check and terminate parent process 22 | if [ -s ${tmp_dir}/kill_genbank_parallel.prodigal ]; then 23 | exit 24 | fi 25 | 26 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 27 | if $(file $1 | grep -q "gzip"); then 28 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 29 | file_location=${1%.*} 30 | gunzip -f -c $1 > $file_location 31 | assembly="$(basename ${file_location%.*})" 32 | else 33 | file_location=$1 34 | assembly="$(basename ${1%.*})" 35 | was_gzipped=FALSE 36 | fi 37 | 38 | 39 | printf " -------------------------------------------------------------------------- \n\n" 40 | printf " Genome: ${GREEN}$assembly${NC}\n" 41 | 42 | # adding assembly to ongoing genomes list 43 | echo $assembly >> ${tmp_dir}/genbank_genomes_list.tmp 44 | 45 | # storing more info about the assembly if it's present in the genbank file: 46 | # checking for organism: 47 | if grep -q "ORGANISM" $file_location; then 48 | org_name=$(grep -m1 "ORGANISM" $file_location | tr -s " " | cut -f3- -d " " | tr "[ ./\\]" "_" | tr -s "_") 49 | else 50 | org_name="NA" 51 | fi 52 | 53 | if grep -q "strain=" $file_location; then 54 | strain=$(grep -m1 "strain=" $file_location | tr -s " " | cut -f 2 -d '"') 55 | else 56 | strain="NA" 57 | fi 58 | 59 | if grep -q "taxon" $file_location; then 60 | taxid=$(grep -m1 "taxon" $file_location | cut -f2 -d ":" | tr -d '"') 61 | else 62 | taxid="NA" 63 | fi 64 | 65 | # extracting AA coding sequences from genbank file 66 | gtt-genbank-to-AA-seqs -i $file_location -o ${tmp_dir}/${assembly}_genes2.tmp 2> /dev/null 67 | 68 | # checking that the file had CDS annotations 69 | if [ ! 
-s ${tmp_dir}/${assembly}_genes2.tmp ]; then 70 | 71 | printf "\n ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 72 | printf " This genbank file doesn't appear to have CDS annotations, so we are\n" 73 | printf " identifying coding sequences with prodigal.\n\n" 74 | 75 | printf " Reported in \"${output_dir}/run_files/Genbank_files_with_no_CDSs.txt\".\n" 76 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 77 | 78 | echo "$1" >> ${output_dir}/run_files/Genbank_files_with_no_CDSs.txt 79 | rm -rf ${tmp_dir}/${assembly}_genes2.tmp 80 | 81 | # pulling out full nucleotide fasta from genbank file 82 | gtt-genbank-to-fasta -i $file_location -o ${tmp_dir}/${assembly}_fasta.tmp 2> /dev/null 83 | 84 | # running prodigal 85 | echo "prodigal used" > ${tmp_dir}/prodigal_used # marking so can add to citations list reported at end 86 | prodigal -c -q -i ${tmp_dir}/${assembly}_fasta.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr 87 | 88 | if grep -q "at least 100000 bases for training." ${file_location}_prodigal.stderr; then 89 | printf "$assembly\n" >> ${tmp_dir}/kill_genbank_parallel.prodigal 90 | rm -rf ${file_location}_prodigal.stderr 91 | exit 92 | else 93 | rm -rf ${file_location}_prodigal.stderr 94 | fi 95 | 96 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 97 | 98 | fi 99 | 100 | ## removing gunzipped genome file if it was gunzipped 101 | if [ $was_gzipped == "TRUE" ]; then 102 | rm -rf $file_location 103 | fi 104 | 105 | 106 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 107 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes2.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes3.tmp 108 | 109 | ## renaming seqs to have assembly name 110 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes3.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 111 | 112 | ### running hmm search ### 113 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 114 | 115 | ### calculating % completion and redundancy ### 116 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 117 | do 118 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 119 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 120 | 121 | ## making list here of only those present in exactly 1 copy 122 | 123 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 124 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 125 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 126 | 127 | ## adding SCG-hit counts to table 128 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 129 | 130 | # total number of unique SCG hits 131 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 132 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' 
${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 133 | 134 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 135 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 136 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 137 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 138 | 139 | ### want to put an explicit notice out if estimated redundancy is greater than 10% 140 | # needs to be an integer for bash comparison, so multiplying by 100 first 141 | 142 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 143 | 144 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 145 | 146 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 147 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 148 | 149 | 150 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 151 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 152 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 153 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 154 | printf " You may want to consider taking a closer look and/or removing it from the\n" 155 | printf " from the input genomes.\n\n" 156 | 157 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 158 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 159 | 160 | # writing to table of genomes with questionable redundancy estimates 161 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 162 | 163 | else 164 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 165 | fi 166 | 167 | ## writing summary info to table ## 168 | printf "$assembly\t$1\t$taxid\t$org_name\t$strain\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Genbank_genomes_summary_info.tsv 169 | 170 | ### Pulling out hits for this genome ### 171 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 172 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 173 | 174 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 175 | if [ $best_hit_mode == "false" ]; then 176 | 177 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 178 | do 179 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 180 | done 181 | 182 | # if best-hit mode is on, taking best hit 183 | else 184 | 185 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 186 | do 187 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 188 | done 189 | 190 | fi 191 | 192 | ## searching for additional targets if provided 193 | # getting count of genes if there are additional targets 194 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 195 | 196 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 197 | 198 | fi 199 | 200 | ## KOs 201 | if [ $ko_targets == "true" ]; then 202 | 203 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 204 | 205 | fi 206 | 207 | ## Pfams 208 | if [ $additional_pfam_targets == "true" ]; then 209 | 210 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 211 | 212 | fi 213 | 214 | rm -rf ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_fasta.tmp ${tmp_dir}/${assembly}_genes1.tmp 215 | rm -rf ${tmp_dir}/${assembly}_genes3.tmp ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 216 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 217 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 218 | -------------------------------------------------------------------------------- /bin/gtt-genbank-serial.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir=$2 10 | hmm_file=$3 11 | genbank_genomes_total=$4 12 | num_cpus=$5 13 | hmm_target_genes_total=$6 14 | output_dir=$7 15 | best_hit_mode=$8 16 | additional_pfam_targets=$9 17 | ko_targets=${10} 18 | target_KOs=${11} 19 | 20 | num=0 21 | 22 | rm -rf ${output_dir}/run_files/Genbank_files_with_no_CDSs.txt # deleting if file exists 23 | 24 | # looping through the lines of the provided [-g] file (this loop operates on one genome at a time) 25 | while IFS=$'\t' read -r 
-a file 26 | 27 | do 28 | 29 | ## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way 30 | if $(file $file | grep -q "gzip"); then 31 | was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards 32 | file_location=${file%.*} 33 | gunzip -f -c $file > $file_location 34 | assembly="$(basename ${file_location%.*})" 35 | else 36 | file_location=$file 37 | assembly="$(basename ${file%.*})" 38 | was_gzipped=FALSE 39 | fi 40 | 41 | 42 | # adding assembly to ongoing genomes list 43 | echo $assembly >> ${tmp_dir}/genbank_genomes_list.tmp 44 | 45 | num=$((num+1)) # to track progress 46 | 47 | printf " -------------------------------------------------------------------------- \n" 48 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $genbank_genomes_total total.\n" 49 | printf " -------------------------------------------------------------------------- \n\n" 50 | 51 | # storing more info about the assembly if it's present in the genbank file: 52 | if grep -q "ORGANISM" $file_location; then 53 | org_name=$(grep -m1 "ORGANISM" $file_location | tr -s " " | cut -f3- -d " " | tr "[ ./\\]" "_" | tr -s "_") 54 | else 55 | org_name="NA" 56 | fi 57 | 58 | if grep -q "strain=" $file_location; then 59 | strain=$(grep -m1 "strain=" $file_location | tr -s " " | cut -f 2 -d '"') 60 | else 61 | strain="NA" 62 | fi 63 | 64 | if grep -q "taxon" $file_location; then 65 | taxid=$(grep -m1 "taxon" $file_location | cut -f2 -d ":" | tr -d '"') 66 | else 67 | taxid="NA" 68 | fi 69 | 70 | # extracting AA coding sequences from genbank file 71 | gtt-genbank-to-AA-seqs -i $file_location -o ${tmp_dir}/${assembly}_genes2.tmp 2> /dev/null 72 | 73 | # checking that the file had CDS annotations, if not running prodigal 74 | if [ ! -s ${tmp_dir}/${assembly}_genes2.tmp ]; then 75 | 76 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 77 | printf "\t This genbank file doesn't appear to have CDS annotations, so we are\n" 78 | printf "\t identifying coding sequences with prodigal.\n\n" 79 | 80 | printf "\t Reported in \"${output_dir}/run_files/Genbank_files_with_no_CDSs.txt\".\n" 81 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 82 | 83 | echo "$file" >> ${output_dir}/run_files/Genbank_files_with_no_CDSs.txt 84 | rm -rf ${tmp_dir}/${assembly}_genes2.tmp 85 | 86 | # pulling out full nucleotide fasta from genbank file 87 | gtt-genbank-to-fasta -i $file_location -o ${tmp_dir}/${assembly}_fasta.tmp 2> /dev/null 88 | 89 | # running prodigal 90 | echo "prodigal used" > ${tmp_dir}/prodigal_used # marking so can add to citations list reported at end 91 | prodigal -c -q -i ${tmp_dir}/${assembly}_fasta.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr 92 | 93 | if grep -q "at least 100000 bases for training." 
${file_location}_prodigal.stderr; then 94 | printf "$assembly\n" >> ${tmp_dir}/kill_genbank_serial.prodigal 95 | rm -rf ${file_location}_prodigal.stderr 96 | exit 97 | else 98 | rm -rf ${file_location}_prodigal.stderr 99 | fi 100 | 101 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 102 | 103 | fi 104 | 105 | ## removing gunzipped genome file if it was gunzipped 106 | if [ $was_gzipped == "TRUE" ]; then 107 | rm -rf $file_location 108 | fi 109 | 110 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 111 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes2.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes3.tmp 112 | 113 | ## renaming seqs to have assembly name 114 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes3.tmp -w ${assembly} -o ${tmp_dir}/${assembly}_genes.tmp 115 | 116 | ### counting how many genes in this genome 117 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 118 | 119 | printf " Performing HMM search...\n" 120 | 121 | ### running hmm search ### 122 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 123 | 124 | ### calculating % completion and redundancy ### 125 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 126 | do 127 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 128 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 129 | 130 | ## making list here of only those present in exactly 1 copy 131 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 132 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 133 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 134 | 135 | 136 | ## adding SCG-hit counts to table 137 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 138 | 139 | # total number of unique SCG hits 140 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 141 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 142 | 143 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 144 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 145 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 146 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 147 | 148 | # want to put a notice out if estimated redundancy is greater than 10 149 | # needs to be an integer for bash comparison, so multiplying by 100 first 150 | 151 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 152 | 153 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 154 | 155 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 156 | 157 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 158 | 159 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 160 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 161 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 162 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 163 | printf " You may want to consider taking a closer look and/or removing it from the\n" 164 | printf " input genomes.\n\n" 165 | 166 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 167 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 168 | 169 | # writing to table of genomes with questionable redundancy estimates 170 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 171 | 172 | else 173 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 174 | 175 | fi 176 | 177 | 178 | ## writing summary info to table ## 179 | printf "$assembly\t$file\t$taxid\t$org_name\t$strain\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/Genbank_genomes_summary_info.tsv 180 | 181 | ### Pulling out hits for this genome ### 182 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 183 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 184 | 185 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 186 | if [ $best_hit_mode == "false" ]; then 187 | 188 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 189 | do 190 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 191 | done 192 | 193 | # if best-hit mode is on, taking best hit 194 | else 195 | 196 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 197 | do 198 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 199 | done 200 | 201 | fi 202 | 203 | 204 | ## searching for additional targets if provided 205 | # getting count of genes if there are additional targets 206 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 207 | 208 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 209 | 210 | fi 211 | 212 | ## KOs 213 | if [ $ko_targets == "true" ]; then 214 | 215 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 216 | 217 | fi 218 | 219 | ## Pfams 220 | if [ $additional_pfam_targets == "true" ]; then 221 | 222 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 223 | 224 | fi 225 | 226 | rm -rf ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_fasta.tmp ${tmp_dir}/${assembly}_genes1.tmp
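# (editor's note, added for clarity: the "${assembly}_genes.tmp.ssi" file removed a couple of lines
#  below is the sequence index that `esl-sfetch --index` created above; e.g., indexing a hypothetical
#  "GCA_000123456.1_genes.tmp" would leave behind "GCA_000123456.1_genes.tmp.ssi" alongside it)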
227 | rm -rf ${tmp_dir}/${assembly}_genes3.tmp ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 228 | rm -rf ${tmp_dir}/${assembly}_uniq_counts.tmp ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 229 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi 230 | 231 | done < $1 232 | -------------------------------------------------------------------------------- /bin/gtt-genbank-to-AA-seqs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import re 6 | import sys 7 | 8 | parser = argparse.ArgumentParser(description="This script takes a genbank file and returns the amino acid sequences for all coding sequences.") 9 | 10 | required = parser.add_argument_group('required arguments') 11 | 12 | required.add_argument("-i", "--input_gb", help='input Genbank file (e.g. "*.gbk", "*.gb", "*.gbff")', action="store", dest="input_gb", required=True) 13 | parser.add_argument("-o", "--output_fasta", help='Output fasta file (default: "clean.faa")', action="store", dest="output_fasta", default="clean.faa") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | input_gb = open(args.input_gb, "r") 22 | 23 | output_fasta = open(args.output_fasta, "w") 24 | 25 | recs = [rec for rec in SeqIO.parse(input_gb, "genbank")] 26 | 27 | note_terms_to_exclude = ["frameshifted", "internal stop", "incomplete"] # dumping gene if noted as these in the "note" section of the call to keep only complete genes 28 | location_terms_to_exclude = ["join", "<", ">"] # dumping gene if "location" section contains any of these: "join" means the gene call spans multiple contigs; "<" or ">" means the gene call runs off a contig 29 | 30 | for rec in recs: 31 | 32 | genes = [gene for gene in rec.features if gene.type =="CDS"] # focusing on features annotated as "CDS" 33 | 34 | for gene in genes: 35 | 36 | location = str(gene.location) 37 | 38 | # dumping gene if "location" section contains any of these terms set above: "join" means the gene call spans multiple contigs; "<" or ">" means the gene call runs off a contig 39 | if any(exclusion_term in location for exclusion_term in location_terms_to_exclude): 40 | continue 41 | 42 | if "note" in gene.qualifiers: 43 | note = str(gene.qualifiers["note"][0]) 44 | 45 | # dumping gene if noted as any of these in the "note" section set above 46 | if any(exclusion_term in note for exclusion_term in note_terms_to_exclude): 47 | continue 48 | 49 | # dumping if overlapping translation frame 50 | if "transl_except" in gene.qualifiers: 51 | continue 52 | 53 | # dumping if noted a pseudo gene 54 | if "pseudo" in gene.qualifiers: 55 | continue 56 | 57 | # making gene header locus_tag if present. 
If not, building by contig name and gene coordinates 58 | if "locus_tag" in gene.qualifiers: 59 | header = str(gene.qualifiers["locus_tag"][0]) 60 | else: 61 | location = location.replace("[", "") 62 | location = re.sub('](.*)', '', location) 63 | location = location.split(":") 64 | start = location[0] 65 | end = location[1] 66 | 67 | header = str(rec.name) + "_" + str(start) + "_" + str(end) 68 | 69 | output_fasta.write(">" + str(header) + "\n" + str(gene.qualifiers["translation"][0]) + "\n") 70 | 71 | input_gb.close() 72 | output_fasta.close() 73 | -------------------------------------------------------------------------------- /bin/gtt-genbank-to-fasta: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import argparse 5 | import sys 6 | 7 | parser = argparse.ArgumentParser(description="This script takes a genbank file and outputs a flat fasta file of all nucleotides.") 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_gb", help='input Genbank file (e.g. "*.gbk", "*.gb", "*.gbff")', action="store", dest="input_gb", required=True) 12 | parser.add_argument("-o", "--output_fasta", help='Output fasta file with matching, simplified headers to be ready for `anvi-gen-contigs-db` (default: "clean.fa")', action="store", dest="output_fasta", default="clean.fa") 13 | 14 | if len(sys.argv)==1: 15 | parser.print_help(sys.stderr) 16 | sys.exit(0) 17 | 18 | args = parser.parse_args() 19 | 20 | input_gb = open(args.input_gb, "r") 21 | 22 | output_fasta = open(args.output_fasta, "w") 23 | 24 | recs = [rec for rec in SeqIO.parse(input_gb, "genbank")] 25 | 26 | for rec in recs: 27 | output_fasta.write(">" + rec.name + "\n" + str(rec.seq) + "\n") # writing out new fasta with clean headers ready for anvi'o 28 | 29 | input_gb.close() 30 | output_fasta.close() 31 | -------------------------------------------------------------------------------- /bin/gtt-get-additional-pfam-targets.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | tmp_dir=${1} 4 | output_dir=${2} 5 | 6 | # base_link="https://pfam.xfam.org/family/" 7 | # base link updated Oct-2022 when pfam hosting shifted to interpro 8 | base_link="https://www.ebi.ac.uk/interpro/wwwapi/entry/pfam/" 9 | 10 | # starting table of what was requested and what was found (version-wise, the latest is always pulled from Pfam when downloading as below) 11 | printf "requested_Pfam\tpulled_Pfam\n" > ${output_dir}/Pfam_search_results/info/Requested-and-pulled.tsv 12 | 13 | for initial_target in $(cat ${tmp_dir}/uniq_pfam_targets.tmp) 14 | do 15 | 16 | 17 | # getting target without version specified if there was one (including a version doesn't work anymore since pfam-hosting shifted to interpro) 18 | target=$(echo ${initial_target} | cut -f 1 -d ".") 19 | 20 | # --insecure flag added on 29-Nov-2020, due to pfam certificate being invalid (https://github.com/AstrobioMike/GToTree/issues/28) 21 | curl --insecure --silent --retry 10 -o ${tmp_dir}/${target}.hmm.gz "${base_link}${target}?annotation=hmm" 22 | gunzip ${tmp_dir}/${target}.hmm.gz 23 | 24 | if [ -s ${tmp_dir}/${target}.hmm ]; then 25 | # getting accession pulled (to account for current version on Pfam as compared to what was searched) 26 | actual_target=$(grep -m1 "^ACC" ${tmp_dir}/${target}.hmm | tr -s " " "\t" | cut -f 2) 27 | printf "$actual_target\n" >> ${tmp_dir}/actual_pfam_targets.tmp 28 | 29 | 
if [ $initial_target != $actual_target ]; then 30 | mv ${tmp_dir}/${target}.hmm ${tmp_dir}/${actual_target}.hmm 31 | fi 32 | 33 | cat ${tmp_dir}/${actual_target}.hmm >> ${tmp_dir}/all_pfam_targets.hmm 34 | 35 | # adding searched and pulled to info table (meaning which versions of a Pfam) 36 | printf "${initial_target}\t${actual_target}\n" >> ${output_dir}/Pfam_search_results/info/Requested-and-pulled.tsv 37 | 38 | else # aborting if any of the pfam targets couldn't be pulled successfully 39 | printf "\n ${RED}One of the target Pfams could not be successfully downloaded :(${NC}\n" 40 | printf "\n The problem child was ${target}.\n\n" 41 | printf "\nExiting for now.\n\n" 42 | 43 | rm -rf ${output_dir} 44 | # removing temp directory unless debug mode on 45 | if [ $debug_flag == 'false' ]; then 46 | rm -rf $tmp_dir 47 | fi 48 | 49 | exit 50 | 51 | fi 52 | 53 | done 54 | 55 | # starting the main results table which will have the following as its header: 56 | paste <(printf "assembly_id\ttotal_gene_count") <(printf %s "$(cat ${tmp_dir}/actual_pfam_targets.tmp | tr "\n" "\t")") > ${output_dir}/Pfam_search_results/Pfam-hit-counts.tsv 57 | 58 | # copying over the additional pfam-target hmms 59 | cp ${tmp_dir}/all_pfam_targets.hmm ${output_dir}/Pfam_search_results/target_Pfam_profiles/all-additional-pfam-targets.hmm 60 | -------------------------------------------------------------------------------- /bin/gtt-get-kofamscan-data: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | to download and setup the KOFamScan (https://github.com/takaram/kofam_scan) data files for use. 6 | 7 | For examples, please visit the GToTree wiki here: https://github.com/AstrobioMike/GToTree/wiki/example-usage 8 | """ 9 | 10 | import sys 11 | import os 12 | import urllib.request 13 | import argparse 14 | import shutil 15 | import textwrap 16 | import filecmp 17 | import tarfile 18 | import gzip 19 | 20 | parser = argparse.ArgumentParser(description="This is a helper program to setup the KOFamScan (github.com/takaram/kofam_scan) \ 21 | data files for use.", \ 22 | epilog="Ex. 
usage: gtt-get-kofamscan-data\n") 23 | 24 | args = parser.parse_args() 25 | 26 | 27 | ################################################################################ 28 | 29 | def main(): 30 | 31 | KO_data_dir = check_location_var_is_set() 32 | 33 | data_present = check_if_data_present(KO_data_dir) 34 | 35 | if data_present: 36 | exit() 37 | 38 | else: 39 | 40 | print(color_text(" Downloading required KO data (only needs to be done once)...\n", "yellow")) 41 | get_kofamscan_data(KO_data_dir) 42 | 43 | 44 | ################################################################################ 45 | 46 | 47 | # setting some colors 48 | tty_colors = { 49 | 'green' : '\033[0;32m%s\033[0m', 50 | 'yellow' : '\033[0;33m%s\033[0m', 51 | 'red' : '\033[0;31m%s\033[0m' 52 | } 53 | 54 | 55 | ### functions ### 56 | def color_text(text, color='green'): 57 | if sys.stdout.isatty(): 58 | return tty_colors[color] % text 59 | else: 60 | return text 61 | 62 | 63 | def wprint(text): 64 | print(textwrap.fill(text, width=80, initial_indent=" ", 65 | subsequent_indent=" ", break_on_hyphens=False)) 66 | 67 | 68 | def check_location_var_is_set(): 69 | 70 | # making sure there is a KO_data_dir env variable 71 | try: 72 | KO_data_dir = os.environ['KO_data_dir'] 73 | except: 74 | wprint(color_text("The environment variable 'KO_data_dir' does not seem to be set :(", "yellow")) 75 | wprint("This shouldn't happen, check on things with `gtt-data-locations check`.") 76 | print("") 77 | sys.exit(0) 78 | 79 | return(KO_data_dir) 80 | 81 | 82 | def check_stored_data_up_to_date(location): 83 | """ checks if the stored kofamscan data is the latest """ 84 | 85 | # getting latest version README 86 | kofamscan_current_readme = urllib.request.urlretrieve("ftp://ftp.genome.jp/pub/db/kofam/README", location + "README-latest") 87 | 88 | # comparing vs one that's present already 89 | if filecmp.cmp(location + "README-latest", location + "README"): 90 | os.remove(location + "README-latest") 91 | 92 | return(True) 93 | 94 | else: 95 | os.remove(location + "README-latest") 96 | print("") 97 | wprint(color_text("A newer version of the KOFamScan data is available, updating...", "yellow")) 98 | 99 | return(False) 100 | 101 | 102 | def check_if_data_present(location): 103 | 104 | # seeing if present already, and if so, if those are up-to-date 105 | # if this function returns True, then data is present and up-to-date 106 | # if it returns False, then we need to download things 107 | README_path = str(location) + "/README" 108 | ko_list_path = str(location) + "/ko_list" 109 | hmms_dir_path = str(location) + "/profiles/" 110 | 111 | if not os.path.isfile(README_path) or not os.path.isfile(ko_list_path) or not os.path.isdir(hmms_dir_path): 112 | 113 | if os.path.exists(README_path): 114 | os.remove(README_path) 115 | if os.path.exists(ko_list_path): 116 | os.remove(ko_list_path) 117 | if os.path.isdir(hmms_dir_path): 118 | shutil.rmtree(hmms_dir_path) 119 | 120 | return(False) 121 | 122 | else: 123 | 124 | # if here, checking if it is up-to-date (returns True/False), if present and up to date, returning True 125 | if check_stored_data_up_to_date(location): 126 | return(True) 127 | 128 | else: 129 | 130 | # removing current files 131 | if os.path.exists(README_path): 132 | os.remove(README_path) 133 | if os.path.exists(ko_list_path): 134 | os.remove(ko_list_path) 135 | if os.path.isdir(hmms_dir_path): 136 | shutil.rmtree(hmms_dir_path) 137 | 138 | return(False) 139 | 140 | 141 | def get_kofamscan_data(location): 142 | """ downloads the needed kofamscan 
data """ 143 | 144 | README_path = str(location) + "/README" 145 | ko_list_gz_path = str(location) + "/ko_list.gz" 146 | ko_list_path = str(location) + "/ko_list" 147 | hmms_tar_path = str(location) + "/profiles.tar.gz" 148 | 149 | urllib.request.urlretrieve("ftp://ftp.genome.jp/pub/db/kofam/README", README_path) 150 | urllib.request.urlretrieve("ftp://ftp.genome.jp/pub/db/kofam/ko_list.gz", ko_list_gz_path) 151 | urllib.request.urlretrieve("ftp://ftp.genome.jp/pub/db/kofam/profiles.tar.gz", hmms_tar_path) 152 | 153 | # decompressing ko_list file 154 | with gzip.open(ko_list_gz_path, 'rb') as f_in: 155 | with open(ko_list_path, 'wb') as f_out: 156 | shutil.copyfileobj(f_in, f_out) 157 | 158 | # removing gzipped ko_list 159 | os.remove(ko_list_gz_path) 160 | 161 | # unpacking profiles 162 | with tarfile.open(hmms_tar_path) as tarball: 163 | tarball.extractall(location) 164 | 165 | # removing tarball 166 | os.remove(hmms_tar_path) 167 | 168 | 169 | ################################################################################ 170 | 171 | if __name__ == "__main__": 172 | main() 173 | -------------------------------------------------------------------------------- /bin/gtt-get-median.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | sort -n $1 | awk ' 4 | BEGIN { 5 | counts = 0 6 | sum = 0 7 | } 8 | { 9 | values[counts++] = $1 10 | sum += $1 11 | } 12 | END { 13 | if ( NR == 2 ) { 14 | median = sum/2 15 | } else if ( (NR % 2) == 1 ) { 16 | median = values[ int(counts/2) ] 17 | } else { 18 | median = ( values[counts/2] + values[counts/2-1] ) / 2 19 | } 20 | print median; 21 | } 22 | ' -------------------------------------------------------------------------------- /bin/gtt-get-ncbi-assembly-tables: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | to download the NCBI assembly summary tables if they are not present, or are more than 4 weeks old. 6 | """ 7 | 8 | import sys 9 | import os 10 | import urllib.request 11 | import argparse 12 | import shutil 13 | import textwrap 14 | from datetime import date, timedelta 15 | import filecmp 16 | import tarfile 17 | import gzip 18 | 19 | parser = argparse.ArgumentParser(description="This is a helper program to download and setup the NCBI assembly summary tables if they are \ 20 | not present, or are older than 4 weeks.", \ 21 | epilog="Ex. 
usage: gtt-get-ncbi-assembly-tables\n") 22 | 23 | parser.add_argument("-P", "--use-http", help='Use http instead of ftp', action = "store_true") 24 | parser.add_argument("-f", "--force-update", help='Force an update regardless of last date retrieved', action = "store_true") 25 | 26 | 27 | args = parser.parse_args() 28 | 29 | 30 | ################################################################################ 31 | 32 | def main(): 33 | 34 | NCBI_assembly_data_dir = check_location_var_is_set() 35 | 36 | data_present = check_if_data_present_and_less_than_4_weeks_old(NCBI_assembly_data_dir) 37 | 38 | if data_present and not args.force_update: 39 | exit() 40 | 41 | else: 42 | 43 | get_NCBI_assembly_summary_data(NCBI_assembly_data_dir) 44 | 45 | ################################################################################ 46 | 47 | 48 | # setting some colors 49 | tty_colors = { 50 | 'green' : '\033[0;32m%s\033[0m', 51 | 'yellow' : '\033[0;33m%s\033[0m', 52 | 'red' : '\033[0;31m%s\033[0m' 53 | } 54 | 55 | 56 | ### functions ### 57 | def color_text(text, color='green'): 58 | if sys.stdout.isatty(): 59 | return tty_colors[color] % text 60 | else: 61 | return text 62 | 63 | 64 | def wprint(text): 65 | print(textwrap.fill(text, width=80, initial_indent=" ", 66 | subsequent_indent=" ", break_on_hyphens=False)) 67 | 68 | 69 | def check_location_var_is_set(): 70 | 71 | # making sure there is a KO_data_dir env variable 72 | try: 73 | NCBI_data_dir = os.environ['NCBI_assembly_data_dir'] 74 | except: 75 | wprint(color_text("The environment variable 'NCBI_assembly_data_dir' does not seem to be set :(", "yellow")) 76 | wprint("This shouldn't happen, check on things with `gtt-data-locations check`.") 77 | print("") 78 | sys.exit(0) 79 | 80 | return(NCBI_data_dir) 81 | 82 | 83 | def check_if_data_present_and_less_than_4_weeks_old(location): 84 | 85 | # seeing if present already and if it was downloaded less than 4 weeks ago 86 | # if this function returns True, then we don't do anything 87 | # if it returns False, then we need to download things 88 | table_path = os.path.join(str(location), "ncbi-assembly-info.tsv") 89 | date_retrieved_path = os.path.join(str(location), "date-retrieved.txt") 90 | 91 | # if either file is missing, we are going to download, we also package the date-retrieved file empty with conda to retain directory, so checking it's not empty as well 92 | if not os.path.isfile(table_path) or not os.path.isfile(date_retrieved_path) or not os.path.getsize(date_retrieved_path) > 0: 93 | 94 | if os.path.exists(table_path): 95 | os.remove(table_path) 96 | if os.path.isdir(date_retrieved_path): 97 | shutil.rmtree(date_retrieved_path) 98 | 99 | return(False) 100 | 101 | # if both files are present (and not empty), we are checking if it was downloaded more than 4 weeks ago 102 | # and will download if it was 103 | if os.path.isfile(table_path) and os.path.isfile(date_retrieved_path): 104 | 105 | # getting current date 106 | curr_date = date.today() 107 | 108 | # reading date it was downloaded 109 | with open(date_retrieved_path, 'r') as file: 110 | stored_date = file.read().strip() 111 | 112 | # setting to date object 113 | stored_date_list = stored_date.split(",") 114 | stored_date = date(int(stored_date_list[0]), int(stored_date_list[1]), int(stored_date_list[2])) 115 | 116 | # getting difference 117 | diff = curr_date - stored_date 118 | 119 | # checking if difference is greater than 28 days 120 | if diff.days > 28: 121 | 122 | return(False) 123 | 124 | else: 125 | 126 | return(True) 127 | 128 | 
else: 129 | 130 | return(True) 131 | 132 | 133 | def get_NCBI_assembly_summary_data(location): 134 | 135 | """ downloads the needed ncbi assembly summary tables and combines them """ 136 | 137 | # setting links 138 | if args.use_http: 139 | 140 | genbank_link = "https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt" 141 | refseq_link = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt" 142 | 143 | else: 144 | 145 | genbank_link = "ftp://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt" 146 | refseq_link = "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt" 147 | 148 | table_path = os.path.join(str(location), "ncbi-assembly-info.tsv") 149 | refseq_temp_path = os.path.join(str(location), "refseq-assembly-info.tmp") 150 | 151 | print(color_text(" Downloading NCBI assembly summaries (only done once, or updated after 4 weeks)...\n", "yellow")) 152 | 153 | urllib.request.urlretrieve(genbank_link, table_path) 154 | urllib.request.urlretrieve(refseq_link, refseq_temp_path) 155 | 156 | # combining 157 | with open (table_path, "a") as final_table: 158 | with open(refseq_temp_path, "r") as refseq: 159 | final_table.write(refseq.read()) 160 | 161 | # removing temp 162 | if os.path.exists(refseq_temp_path): 163 | os.remove(refseq_temp_path) 164 | 165 | # storing date retrieved 166 | date_retrieved = str(date.today()).replace("-", ",") 167 | date_retrieved.replace("-", ",") 168 | 169 | date_retrieved_path = os.path.join(str(location), "date-retrieved.txt") 170 | 171 | with open(date_retrieved_path, "w") as outfile: 172 | outfile.write(date_retrieved + "\n") 173 | 174 | ################################################################################ 175 | 176 | if __name__ == "__main__": 177 | main() 178 | -------------------------------------------------------------------------------- /bin/gtt-get-ncbi-tax-data: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | to download NCBI tax data for using TaxonKit (https://bioinf.shenwei.me/taxonkit/) to add NCBI taxonomy. 6 | 7 | For examples, please visit the GToTree wiki here: https://github.com/AstrobioMike/GToTree/wiki/example-usage 8 | """ 9 | 10 | import sys 11 | import os 12 | import urllib.request 13 | import argparse 14 | import shutil 15 | import textwrap 16 | import filecmp 17 | import tarfile 18 | import gzip 19 | 20 | parser = argparse.ArgumentParser(description="This is a helper program to setup NCBI tax data for TaxonKit (bioinf.shenwei.me/taxonkit/) \ 21 | to add taxonomy info.", \ 22 | epilog="Ex. 
usage: gtt-get-ncbi-tax-data\n") 23 | 24 | parser.add_argument("-P", "--use-http", help='Use http instead of ftp', action = "store_true") 25 | 26 | args = parser.parse_args() 27 | 28 | 29 | ################################################################################ 30 | 31 | def main(): 32 | 33 | NCBI_data_dir = check_location_var_is_set() 34 | 35 | data_present = check_if_data_present(NCBI_data_dir) 36 | 37 | if data_present: 38 | exit() 39 | 40 | else: 41 | 42 | print(color_text(" Downloading required NCBI taxonomy data (only needs to be done once)...\n", "yellow")) 43 | get_NCBI_tax_data(NCBI_data_dir) 44 | 45 | 46 | ################################################################################ 47 | 48 | 49 | # setting some colors 50 | tty_colors = { 51 | 'green' : '\033[0;32m%s\033[0m', 52 | 'yellow' : '\033[0;33m%s\033[0m', 53 | 'red' : '\033[0;31m%s\033[0m' 54 | } 55 | 56 | 57 | ### functions ### 58 | def color_text(text, color='green'): 59 | if sys.stdout.isatty(): 60 | return tty_colors[color] % text 61 | else: 62 | return text 63 | 64 | 65 | def wprint(text): 66 | print(textwrap.fill(text, width=80, initial_indent=" ", 67 | subsequent_indent=" ", break_on_hyphens=False)) 68 | 69 | 70 | def check_location_var_is_set(): 71 | 72 | # making sure there is a KO_data_dir env variable 73 | try: 74 | NCBI_data_dir = os.environ['TAXONKIT_DB'] 75 | except: 76 | wprint(color_text("The environment variable 'TAXONKIT_DB' does not seem to be set :(", "yellow")) 77 | wprint("This shouldn't happen, check on things with `gtt-data-locations check`.") 78 | print("") 79 | sys.exit(0) 80 | 81 | return(NCBI_data_dir) 82 | 83 | 84 | def check_if_data_present(location): 85 | 86 | # seeing if present already 87 | # if this function returns True, then data is present 88 | # if it returns False, then we need to download things 89 | names_path = os.path.join(str(location) + "/names.dmp") 90 | nodes_path = os.path.join(str(location) + "/nodes.dmp") 91 | 92 | 93 | if not os.path.isfile(names_path) or not os.path.isfile(nodes_path): 94 | 95 | if os.path.exists(names_path): 96 | os.remove(names_path) 97 | if os.path.isdir(nodes_path): 98 | shutil.rmtree(nodes_path) 99 | 100 | return(False) 101 | 102 | else: 103 | 104 | return(True) 105 | 106 | 107 | def get_NCBI_tax_data(location): 108 | """ downloads the needed ncbi tax data """ 109 | 110 | taxdump_path = os.path.join(str(location) + "taxdump.tar.gz") 111 | 112 | 113 | # setting links 114 | if args.use_http: 115 | 116 | taxdump_link = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" 117 | 118 | else: 119 | 120 | taxdump_link = "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz" 121 | 122 | urllib.request.urlretrieve(taxdump_link, taxdump_path) 123 | 124 | # unpacking 125 | with tarfile.open(taxdump_path) as tarball: 126 | tarball.extractall(location) 127 | 128 | # removing tarball 129 | os.remove(taxdump_path) 130 | 131 | 132 | ################################################################################ 133 | 134 | if __name__ == "__main__": 135 | main() 136 | -------------------------------------------------------------------------------- /bin/gtt-hmms: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | YELLOW='\033[0;33m' 6 | RED='\033[0;31m' 7 | NC='\033[0m' 8 | 9 | printf "\n${YELLOW} GToTree pre-packaged HMM SCG-sets\n${NC}" 10 | printf " See github.com/AstrobioMike/GToTree/wiki/SCG-sets for more info\n\n" 11 | 12 | ## making 
sure expected variable is set 13 | if [ -z ${GToTree_HMM_dir} ]; then 14 | # reporting it is not set 15 | printf "\n${YELLOW} The 'GToTree_HMM_dir' variable is not set :( Use \`gtt-data-locations\` to check and configure.\n${NC}\n" 16 | exit 17 | 18 | fi 19 | 20 | # now making sure directory exists or that we can create it if not 21 | if [ ! -d ${GToTree_HMM_dir} ]; then 22 | 23 | # attempting to create 24 | mkdir -p ${GToTree_HMM_dir} > /dev/null 25 | if [ $? -ne 0 ]; then 26 | printf "\n${YELLOW} The 'GToTree_HMM_dir' location does not exist and can't be created :( Use \`gtt-data-locations\` to check and configure.\n${NC}\n" 27 | exit 28 | fi 29 | 30 | fi 31 | 32 | # making sure it is writable 33 | if [ ! -w ${GToTree_HMM_dir} ]; then 34 | printf "\n${YELLOW} The 'GToTree_HMM_dir' location is not writable for you :( Use \`gtt-data-locations\` to check and configure.\n${NC}\n" 35 | exit 36 | fi 37 | 38 | 39 | ## now moving on to reporting the pre-packaged HMMs 40 | # making sure info table is there, or downloading if not 41 | if [ ! -f "${GToTree_HMM_dir}/hmm-sources-and-info.tsv" ]; then 42 | 43 | # downloading to there if not already present 44 | curl --silent --retry 10 -L -o ${GToTree_HMM_dir}/hmm-sources-and-info.tsv https://figshare.com/ndownloader/files/34066016 45 | 46 | fi 47 | 48 | # printing out info for pre-packaged HMMs 49 | num_hmm_files=$( tail -n +2 ${GToTree_HMM_dir}/hmm-sources-and-info.tsv | cut -f 1 | wc -l | sed "s/^ *//" ) 50 | 51 | printf " The environment variable ${GREEN}GToTree_HMM_dir${NC} is set to:\n" 52 | printf " $GToTree_HMM_dir\n\n" 53 | 54 | printf " The ${num_hmm_files} available pre-packaged HMM SCG-sets include:\n\n" 55 | 56 | for gene_set in $(tail -n +2 ${GToTree_HMM_dir}/hmm-sources-and-info.tsv | cut -f 1); do 57 | 58 | gene_set=$(echo $gene_set | sed 's/.hmm//') 59 | curr_number_of_genes=$(grep -m 1 -w "^${gene_set}" ${GToTree_HMM_dir}/hmm-sources-and-info.tsv | cut -f 4) 60 | 61 | printf "\t %-30s %14s\n" "${gene_set}" "($curr_number_of_genes genes)" 62 | 63 | done 64 | 65 | printf "\n" 66 | printf " Details can be found in: \n ${GToTree_HMM_dir}hmm-sources-and-info.tsv\n\n" 67 | -------------------------------------------------------------------------------- /bin/gtt-ncbi-parallel-nt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir="$2" 10 | hmm_file="$3" 11 | num_cpus="$4" 12 | hmm_target_genes_total="$5" 13 | output_dir="$6" 14 | best_hit_mode="$7" 15 | additional_pfam_targets="$8" 16 | http_flag="$9" 17 | ko_targets="${10}" 18 | target_KOs="${11}" 19 | debug_flag="${12}" 20 | 21 | assembly=$(echo "$1" | cut -f 1) 22 | downloaded_accession=$(echo "$1" | cut -f 2) 23 | 24 | # storing and building links 25 | if [ "$http_flag" == 'false' ]; then 26 | base_link=$(echo "$1" | cut -f 9) 27 | else 28 | base_link=$(echo "$1" | cut -f 9 | sed 's/^ftp/https/') 29 | fi 30 | 31 | # checking link was actually present (sometimes, very rarely, it is not there) 32 | # if not there, attempting to build ourselves 33 | if [ $base_link == "na" ] || [ -z $base_link ]; then 34 | 35 | if [ "$http_flag" == 'false' ]; then 36 | p1=$(printf "ftp://ftp.ncbi.nlm.nih.gov/genomes/all") 37 | else 38 | p1=$(printf "https://ftp.ncbi.nlm.nih.gov/genomes/all") 39 | fi 40 | 41 | # checking if GCF or GCA 42 | if [[ $assembly == "GCF"* ]]; then 43 | p2="GCF" 44 | else 45 | p2="GCA" 46 | fi 
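# (editor's note, an illustrative sketch of the fallback link built just below, using a hypothetical
#  accession GCA_000123456.1 with assembly name ASM123v1: p3/p4/p5 come out as 000/123/456, so the
#  constructed base_link would end in ".../genomes/all/GCA/000/123/456/GCA_000123456.1_ASM123v1")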
47 | 48 | p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3) 49 | p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6) 50 | p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9) 51 | 52 | ass_name=$(echo "$1" | cut -f 3) 53 | end_path=$(paste -d "_" <(echo "$assembly") <(echo "$ass_name")) 54 | 55 | base_link=$(paste -d "/" <(echo "$p1") <(echo "$p2") <(echo "$p3") <(echo "$p4") <(echo "$p5") <(echo "$end_path")) 56 | 57 | else 58 | 59 | end_path=$(basename $base_link) 60 | 61 | fi 62 | 63 | printf " -------------------------------------------------------------------------- \n\n" 64 | printf " Genome: ${GREEN}$assembly${NC}\n" 65 | 66 | # attempting to download genome fasta 67 | curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genome.tmp.gz "${base_link}/${end_path}_genomic.fna.gz" 68 | 69 | # if http, then it pulls down a file still, it just isn't gzipped 70 | # if ftp, no file is pulled down 71 | # so to cover both cases, just making this need to be present and gzipped 72 | if $(file ${tmp_dir}/${assembly}_genome.tmp.gz | grep -q gzip); then 73 | 74 | gunzip -f ${tmp_dir}/${assembly}_genome.tmp.gz 75 | 76 | prodigal -c -q -i ${tmp_dir}/${assembly}_genome.tmp -a ${tmp_dir}/${assembly}_genes1.faa.tmp -d ${tmp_dir}/${assembly}_genes1.fa.tmp > /dev/null 77 | 78 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.faa.tmp > ${tmp_dir}/${assembly}_genes2.faa.tmp 79 | 80 | ## renaming seqs to have assembly name to avoid problems with odd characters and how hmmer parses and such 81 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.faa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.faa.tmp 82 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.fa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.fa.tmp 83 | 84 | # storing more info about the assembly to write out into ncbi-derived-genome summary file (for each setting to NA if not found) 85 | ass_name=$(echo "$1" | cut -f 3) 86 | if [ -z "$ass_name" ]; then ass_name="NA"; fi 87 | org_name=$(echo "$1" | cut -f 5) 88 | if [ -z "$org_name" ]; then org_name="NA"; fi 89 | infraspecific_name=$(echo "$1" | cut -f 6) 90 | if [ -z "$infraspecific_name" ]; then infraspecific_name="NA"; fi 91 | taxid=$(echo "$1" | cut -f 4) 92 | if [ -z "$taxid" ]; then taxid="NA"; fi 93 | version_status=$(echo "$1" | cut -f 7) 94 | if [ -z "$version_status" ]; then version_status="NA"; fi 95 | asm_level=$(echo "$1" | cut -f 8) 96 | if [ -z "$asm_level" ]; then asm_level="NA"; fi 97 | 98 | ### counting how many genes in this genome 99 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes3.faa.tmp) 100 | 101 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 102 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.faa.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.faa.tmp 103 | # don't need to filter the nucleotide fasta, as we will only be looking for some found in the amino-acid fasta 104 | 105 | ### running hmm search ### 106 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.faa.tmp > /dev/null 107 | 108 | ### calculating % completion and redundancy ### 109 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 110 | do 111 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 112 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 113 | 114 | ## making list here of only those present in exactly 1 copy, to get count 
of "unique" SCG-hits 115 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 116 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 117 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 118 | 119 | ## adding SCG-hit counts to table 120 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 121 | 122 | # total number of unique SCG hits 123 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 124 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 125 | 126 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 127 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 128 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 129 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 130 | 131 | # want to put a notice out if estimated redundancy is greater than 10 132 | # needs to be an integer for bash comparison, so multiplying by 100 first 133 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 134 | 135 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 136 | 137 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 138 | 139 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 140 | 141 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 142 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 143 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 144 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 145 | printf " You may want to consider taking a closer look and/or removing it from the\n" 146 | printf " from the input genomes.\n\n" 147 | 148 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 149 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 150 | 151 | # writing to table of genomes with questionable redundancy estimates 152 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 153 | 154 | else 155 | printf " Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 156 | 157 | fi 158 | 159 | ## writing summary info to table ## 160 | printf "$assembly\t$downloaded_accession\t$ass_name\t$taxid\t$org_name\t$infraspecific_name\t$version_status\t$asm_level\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/NCBI_genomes_summary_info.tsv 161 | 162 | ### Pulling out hits for this genome (nucleotide as specified by user) ### 163 | target_genes_suffix="_genes.fa.tmp" 164 | 165 | # indexing 166 | esl-sfetch --index ${tmp_dir}/${assembly}${target_genes_suffix} > /dev/null 167 | 168 | # looping through and pulling out each first hit (hmm results tab is sorted by e-value): 169 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 170 | if [ $best_hit_mode == "false" ]; then 171 | 172 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 173 | do 174 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 175 | done 176 | 177 | # if best-hit mode is on, taking best hit 178 | else 179 | 180 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 181 | do 182 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 183 | done 184 | 185 | fi 186 | 187 | 188 | ## searching for additional targets if provided 189 | # getting count of genes if there are additional targets 190 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 191 | 192 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.faa.tmp) 193 | 194 | fi 195 | 196 | ## KOs 197 | if [ $ko_targets == "true" ]; then 198 | 199 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 200 | 201 | fi 202 | 203 | ## Pfams 204 | if [ $additional_pfam_targets == "true" ]; then 205 | 206 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 207 | 208 | fi 209 | 210 | if [ $debug_flag == "true" ]; then 211 | if [ -s ${tmp_dir}/${assembly}_genes2.faa.tmp ]; then 212 | mv ${tmp_dir}/${assembly}_genes2.faa.tmp ${tmp_dir}/ncbi-downloads/${assembly}_protein.faa 213 | fi 214 | if [ -s ${tmp_dir}/${assembly}_genes1.fa.tmp ]; then 215 | mv ${tmp_dir}/${assembly}_genes1.fa.tmp ${tmp_dir}/ncbi-downloads/${assembly}_cds.fa 216 | fi 217 | if [ -s ${tmp_dir}/${assembly}_genome.tmp ]; then 218 | mv ${tmp_dir}/${assembly}_genome.tmp ${tmp_dir}/ncbi-downloads/${assembly}_genomic.fna 219 | fi 220 | fi 221 | 222 | rm -rf ${tmp_dir}/${assembly}_genes*.tmp* 223 | rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 224 | rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 225 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 226 | 227 | else 228 | printf " ${ORANGE}******************************* ${NC}NOTICE ${ORANGE}*******************************${NC} \n" 229 | printf "\t $assembly's genome not successfully downloaded :(\n\n" 230 | printf "\t Reported 
in \"${output_dir}/run_files/NCBI_accessions_not_downloaded.txt\"\n" 231 | printf " ${ORANGE}************************************************************************ ${NC}\n" 232 | rm -rf ${tmp_dir}/${assembly}_genome.tmp.gz 233 | 234 | sleep 2 235 | 236 | echo $assembly >> ${output_dir}/NCBI_accessions_not_downloaded.txt 237 | 238 | fi 239 | -------------------------------------------------------------------------------- /bin/gtt-ncbi-parallel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir="$2" 10 | hmm_file="$3" 11 | num_cpus="$4" 12 | hmm_target_genes_total="$5" 13 | output_dir="$6" 14 | best_hit_mode="$7" 15 | additional_pfam_targets="$8" 16 | http_flag="$9" 17 | ko_targets="${10}" 18 | target_KOs="${11}" 19 | debug_flag="${12}" 20 | 21 | assembly=$(echo "$1" | cut -f 1) 22 | downloaded_accession=$(echo "$1" | cut -f 2) 23 | 24 | # storing and building links 25 | if [ "$http_flag" == 'false' ]; then 26 | base_link=$(echo "$1" | cut -f 9) 27 | else 28 | base_link=$(echo "$1" | cut -f 9 | sed 's/^ftp/https/') 29 | fi 30 | 31 | # checking link was actually present (sometimes, very rarely, it is not there) 32 | # if not there, attempting to build ourselves 33 | if [ $base_link == "na" ] || [ -z $base_link ]; then 34 | 35 | if [ "$http_flag" == 'false' ]; then 36 | p1=$(printf "ftp://ftp.ncbi.nlm.nih.gov/genomes/all") 37 | else 38 | p1=$(printf "https://ftp.ncbi.nlm.nih.gov/genomes/all") 39 | fi 40 | 41 | # checking if GCF or GCA 42 | if [[ $assembly == "GCF"* ]]; then 43 | p2="GCF" 44 | else 45 | p2="GCA" 46 | fi 47 | 48 | p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3) 49 | p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6) 50 | p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9) 51 | 52 | ass_name=$(echo "$1" | cut -f 3) 53 | end_path=$(paste -d "_" <(echo "$assembly") <(echo "$ass_name")) 54 | 55 | base_link=$(paste -d "/" <(echo "$p1") <(echo "$p2") <(echo "$p3") <(echo "$p4") <(echo "$p5") <(echo "$end_path")) 56 | 57 | else 58 | 59 | end_path=$(basename $base_link) 60 | 61 | fi 62 | 63 | printf " -------------------------------------------------------------------------- \n\n" 64 | printf " Genome: ${GREEN}$assembly${NC}\n" 65 | 66 | curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genes2.tmp.gz "${base_link}/${end_path}_protein.faa.gz" 67 | 68 | if $(file ${tmp_dir}/${assembly}_genes2.tmp.gz | grep -q gzip); then 69 | gunzip -f ${tmp_dir}/${assembly}_genes2.tmp.gz 70 | # renaming headers to avoid problems with odd characters and how hmmer parses and such 71 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.tmp 72 | 73 | else # trying to get assembly if there were no gene annotations available 74 | curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genome.tmp.gz "${base_link}/${end_path}_genomic.fna.gz" 75 | 76 | if [ -s ${tmp_dir}/${assembly}_genome.tmp.gz ]; then 77 | 78 | gunzip -f ${tmp_dir}/${assembly}_genome.tmp.gz 79 | 80 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 81 | printf " $assembly doesn't appear to have gene annotations.\n\n" 82 | printf " Downloaded the genome and identifying CDSs with prodigal.\n" 83 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 84 | 85 | printf " Getting 
coding seqs...\n\n" 86 | echo "prodigal used" > ${tmp_dir}/prodigal_used # marking so can add to citations list reported at end 87 | prodigal -c -q -i ${tmp_dir}/${assembly}_genome.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 88 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp 89 | 90 | ## renaming seqs to have assembly name 91 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.tmp 92 | fi 93 | fi 94 | 95 | if [ -s ${tmp_dir}/${assembly}_genes3.tmp ]; then 96 | 97 | # storing more info about the assembly to write out into ncbi-derived-genome summary file (for each setting to NA if not found) 98 | ass_name=$(echo "$1" | cut -f 3) 99 | if [ -z "$ass_name" ]; then ass_name="NA"; fi 100 | org_name=$(echo "$1" | cut -f 5) 101 | if [ -z "$org_name" ]; then org_name="NA"; fi 102 | infraspecific_name=$(echo "$1" | cut -f 6) 103 | if [ -z "$infraspecific_name" ]; then infraspecific_name="NA"; fi 104 | taxid=$(echo "$1" | cut -f 4) 105 | if [ -z "$taxid" ]; then taxid="NA"; fi 106 | version_status=$(echo "$1" | cut -f 7) 107 | if [ -z "$version_status" ]; then version_status="NA"; fi 108 | asm_level=$(echo "$1" | cut -f 8) 109 | if [ -z "$asm_level" ]; then asm_level="NA"; fi 110 | 111 | 112 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 113 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.tmp 114 | 115 | 116 | ### running hmm search ### 117 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.tmp > /dev/null 118 | 119 | ### calculating % completion and redundancy ### 120 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 121 | do 122 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 123 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 124 | 125 | ## making list here of only those present in exactly 1 copy 126 | paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 127 | awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 128 | uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 129 | 130 | ## adding SCG-hit counts to table 131 | paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 132 | 133 | # total number of unique SCG hits 134 | num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 135 | num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 136 | 137 | perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 138 | perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 139 | perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 140 | perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 141 | 142 | ### want to put an explicit notice out if estimated redundancy is greater than 10% 143 | # needs to be an integer for bash comparison, so multiplying by 100 first 
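# (editor's note, a worked example with made-up numbers: if 74 of 74 target genes are found and 4 of
#  them occur twice, then num_SCG_hits=74 and num_SCG_redund=4, giving perc_comp_rnd=100.00 and
#  perc_redund_rnd=5.41; 5.41 * 100 -> 541, which is below the 1000 (i.e., 10%) cutoff checked
#  below, so no redundancy notice would be printed for that genome)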
144 | 145 | mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 146 | 147 | printf " Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 148 | 149 | if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 150 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 151 | 152 | 153 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 154 | printf " Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 155 | printf " While there are no \"golden\" cutoff values for these things, typically\n" 156 | printf " going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 157 | printf " You may want to consider taking a closer look and/or removing it from the\n" 158 | printf " input genomes.\n\n" 159 | 160 | printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 161 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 162 | 163 | # writing to table of genomes with questionable redundancy estimates 164 | printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 165 | 166 | else 167 | printf " Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${perc_redund_rnd}\n\n" 168 | fi 169 | 170 | ## writing summary info to table ## 171 | printf "$assembly\t$downloaded_accession\t$ass_name\t$taxid\t$org_name\t$infraspecific_name\t$version_status\t$asm_level\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/NCBI_genomes_summary_info.tsv 172 | 173 | ### Pulling out hits for this genome ### 174 | # looping through SCGs and pulling out each first hit (hmm results tab is sorted by e-value): 175 | esl-sfetch --index ${tmp_dir}/${assembly}_genes.tmp > /dev/null 176 | 177 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 178 | if [ $best_hit_mode == "false" ]; then 179 | 180 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 181 | do 182 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 183 | done 184 | 185 | # if best-hit mode is on, taking best hit 186 | else 187 | 188 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 189 | do 190 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.faa 191 | done 192 | 193 | fi 194 | 195 | ## searching for additional targets if provided 196 | # getting count of genes if there are additional targets 197 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 198 | 199 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp) 200 | 201 | fi 202 | 203 | ## KOs 204 | if [ $ko_targets == "true" ]; then 205 | 206 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 207 | 208 | fi 209 | 
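# (editor's note, for reference: the positional arguments handed to gtt-run-kofamscan.sh above are,
#  in order, the assembly ID, its filtered amino-acid fasta, its gene count, the target-KOs results
#  table, the directory holding the target KO profiles, the temp directory, the output directory,
#  and the requested KO targets)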
210 | ## Pfams 211 | if [ $additional_pfam_targets == "true" ]; then 212 | 213 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 214 | 215 | fi 216 | 217 | if [ $debug_flag == "true" ]; then 218 | printf "\n\n Debug mode on, keeping intermediate files.\n\n" 219 | if [ -s ${tmp_dir}/${assembly}_genes2.tmp ]; then 220 | printf " Keeping ${tmp_dir}/${assembly}_genes2.tmp\n" 221 | mv ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/ncbi-downloads/${assembly}_protein.faa 222 | fi 223 | if [ -s ${tmp_dir}/${assembly}_genome.tmp ]; then 224 | printf " Keeping ${tmp_dir}/${assembly}_genome.tmp\n" 225 | mv ${tmp_dir}/${assembly}_genome.tmp ${tmp_dir}/ncbi-downloads/${assembly}_genomic.fna 226 | fi 227 | fi 228 | 229 | rm -rf ${tmp_dir}/${assembly}_genes3.tmp ${tmp_dir}/${assembly}_genes2.tmp ${tmp_dir}/${assembly}_genes1.tmp 230 | rm -rf ${tmp_dir}/${assembly}_genes.tmp ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 231 | rm -rf ${tmp_dir}/${assembly}_genes.tmp.ssi ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 232 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 233 | 234 | else 235 | printf " ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 236 | printf " $assembly did not download properly :(\n\n" 237 | printf " Reported in \"${output_dir}/run_files/NCBI_accessions_not_downloaded.txt\"\n" 238 | printf " ${ORANGE}****************************************************************************${NC} \n\n" 239 | rm -rf ${tmp_dir}/${assembly}_report1.tmp ${tmp_dir}/${assembly}_genes.tmp.gz 240 | sleep 3 241 | echo $assembly >> ${output_dir}/NCBI_accessions_not_downloaded.txt 242 | fi 243 | -------------------------------------------------------------------------------- /bin/gtt-ncbi-serial-nt.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | ORANGE='\033[0;33m' 7 | NC='\033[0m' 8 | 9 | tmp_dir="$2" 10 | hmm_file="$3" 11 | NCBI_remaining_genomes_total="$4" 12 | num_cpus="$5" 13 | hmm_target_genes_total="$6" 14 | output_dir="$7" 15 | best_hit_mode="$8" 16 | additional_pfam_targets="$9" 17 | http_flag="${10}" 18 | ko_targets="${11}" 19 | target_KOs="${12}" 20 | debug_flag="${13}" 21 | 22 | num=0 23 | 24 | while IFS=$'\t' read -r -a curr_line 25 | 26 | do 27 | 28 | assembly="${curr_line[0]}" 29 | downloaded_accession="${curr_line[1]}" 30 | num=$((num+1)) 31 | 32 | printf " -------------------------------------------------------------------------- \n" 33 | printf "\tOn assembly ${GREEN}$assembly${NC}; Number $num of $NCBI_remaining_genomes_total total.\n" 34 | printf " -------------------------------------------------------------------------- \n\n" 35 | 36 | # storing and building links 37 | if [ "$http_flag" == 'false' ]; then 38 | base_link="${curr_line[8]}" 39 | else 40 | base_link=$(echo ${curr_line[8]} | sed 's/^ftp/https/') 41 | fi 42 | 43 | # checking link was actually present (sometimes, very rarely, it is not there) 44 | # if not there, attempting to build ourselves 45 | if [ $base_link == "na" ] || [ -z $base_link ]; then 46 | 47 | if [ "$http_flag" == 'false' ]; then 48 | p1=$(printf "ftp://ftp.ncbi.nlm.nih.gov/genomes/all") 49 | else 50 | p1=$(printf "https://ftp.ncbi.nlm.nih.gov/genomes/all") 51 | fi 52 | 53 | # checking if GCF or GCA 54 | if [[ $assembly 
== "GCF"* ]]; then 55 | p2="GCF" 56 | else 57 | p2="GCA" 58 | fi 59 | 60 | p3=$(echo $assembly | cut -f 2 -d "_" | cut -c 1-3) 61 | p4=$(echo $assembly | cut -f 2 -d "_" | cut -c 4-6) 62 | p5=$(echo $assembly | cut -f 2 -d "_" | cut -c 7-9) 63 | 64 | ass_name="${curr_line[2]}" 65 | end_path=$(paste -d "_" <(echo "$assembly") <(echo "$ass_name")) 66 | 67 | base_link=$(paste -d "/" <(echo "$p1") <(echo "$p2") <(echo "$p3") <(echo "$p4") <(echo "$p5") <(echo "$end_path")) 68 | 69 | else 70 | 71 | end_path=$(basename $base_link) 72 | 73 | fi 74 | 75 | # attempting to download genome fasta 76 | printf " Downloading genome fasta...\n\n" 77 | curl --silent --retry 10 -o ${tmp_dir}/${assembly}_genome.tmp.gz "${base_link}/${end_path}_genomic.fna.gz" 78 | 79 | # if http, then it pulls down a file still, it just isn't gzipped 80 | # if ftp, no file is pulled down 81 | # so to cover both cases, just making this need to be present and gzipped 82 | if $(file ${tmp_dir}/${assembly}_genome.tmp.gz | grep -q gzip); then 83 | 84 | gunzip -f ${tmp_dir}/${assembly}_genome.tmp.gz 85 | 86 | printf " Getting coding seqs...\n\n" 87 | 88 | prodigal -c -q -i ${tmp_dir}/${assembly}_genome.tmp -a ${tmp_dir}/${assembly}_genes1.faa.tmp -d ${tmp_dir}/${assembly}_genes1.fa.tmp > /dev/null 89 | 90 | tr -d '*' < ${tmp_dir}/${assembly}_genes1.faa.tmp > ${tmp_dir}/${assembly}_genes2.faa.tmp 91 | 92 | ## renaming seqs to have assembly name to avoid problems with odd characters and how hmmer parses and such 93 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.faa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes3.faa.tmp 94 | gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes1.fa.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.fa.tmp 95 | 96 | 97 | # storing more info about the assembly to write out into ncbi-derived-genome summary file (for each setting to NA if not found) 98 | ass_name="${curr_line[2]}" 99 | if [ -z "$ass_name" ]; then ass_name="NA"; fi 100 | org_name="${curr_line[4]}" 101 | if [ -z "$org_name" ]; then org_name="NA"; fi 102 | infraspecific_name="${curr_line[5]}" 103 | if [ -z "$infraspecific_name" ]; then infraspecific_name="NA"; fi 104 | taxid="${curr_line[3]}" 105 | if [ -z "$taxid" ]; then taxid="NA"; fi 106 | version_status="${curr_line[6]}" 107 | if [ -z "$version_status" ]; then version_status="NA"; fi 108 | asm_level="${curr_line[7]}" 109 | if [ -z "$asm_level" ]; then asm_level="NA"; fi 110 | 111 | 112 | ### counting how many genes in this genome 113 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes3.faa.tmp) 114 | 115 | # filtering sequences by length to be sure none with > 99,999 amino acids are there, as this breaks hmmer (https://github.com/AstrobioMike/GToTree/issues/50 ; https://github.com/EddyRivasLab/hmmer/issues/244) 116 | gtt-filter-seqs-by-length -q -i ${tmp_dir}/${assembly}_genes3.faa.tmp -m 0 -M 99999 -o ${tmp_dir}/${assembly}_genes.faa.tmp 117 | # don't need to filter the nucleotide fasta, as we will only be looking for some found in the amino-acid fasta 118 | 119 | printf " Performing HMM search...\n" 120 | 121 | ### running hmm search ### 122 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp $hmm_file ${tmp_dir}/${assembly}_genes.faa.tmp > /dev/null 123 | 124 | ### calculating % completion and redundancy ### 125 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 126 | do 127 | grep -w -c "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp 128 | done > ${tmp_dir}/${assembly}_uniq_counts.tmp 129 | 130 | ## making list here of only 
those present in exactly 1 copy, to get count of "unique" SCG-hits 131 |         paste ${tmp_dir}/uniq_hmm_names.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp > ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 132 |         awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp | cut -f 1 > ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 133 |         uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ") 134 | 135 |         ## adding SCG-hit counts to table 136 |         paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t" | sed 's/.$//')") >> ${output_dir}/SCG_hit_counts.tsv 137 | 138 |         # total number of unique SCG hits 139 |         num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ") 140 |         num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }') 141 | 142 |         perc_comp=$(echo "$num_SCG_hits / $hmm_target_genes_total * 100" | bc -l) 143 |         perc_comp_rnd=$(printf "%.2f\n" $perc_comp) 144 |         perc_redund=$(echo "$num_SCG_redund / $hmm_target_genes_total * 100" | bc -l) 145 |         perc_redund_rnd=$(printf "%.2f\n" $perc_redund) 146 | 147 |         # want to put a notice out if estimated redundancy is greater than 10 148 |         # needs to be an integer for bash comparison, so multiplying by 100 first 149 |         mult_perc_redund_rnd=$(echo "$perc_redund_rnd * 100" | bc | cut -f 1 -d ".") 150 | 151 |         printf "    Found $num_SCG_hits of the targeted $hmm_target_genes_total genes.\n" 152 | 153 |         if [ ${mult_perc_redund_rnd} -ge 1000 ]; then 154 | 155 |             printf "        Est. %% comp: ${perc_comp_rnd}; Est. %% redund: ${RED}${perc_redund_rnd}${NC}\n\n" 156 | 157 |             printf "     ${ORANGE}********************************** ${NC}NOTICE ${ORANGE}**********************************${NC} \n" 158 |             printf "      Estimated redundancy of this genome based on the specified HMMs is ${RED}${perc_redund_rnd}%%${NC}.\n" 159 |             printf "      While there are no \"golden\" cutoff values for these things, typically\n" 160 |             printf "      going over 10%% (if bacterial/archaeal) is getting into the questionable range.\n" 161 |             printf "      You may want to consider taking a closer look and/or removing it from the\n" 162 |             printf "      input genomes.\n\n" 163 | 164 |             printf "      Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n" 165 |             printf "     ${ORANGE}****************************************************************************${NC} \n\n" 166 | 167 |             # writing to table of genomes with questionable redundancy estimates 168 |             printf "$assembly\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${tmp_dir}/Genomes_with_questionable_redundancy_estimates.tmp 169 | 170 |         else 171 |             printf "        Est. %% comp: ${perc_comp_rnd}; Est. 
%% redund: ${perc_redund_rnd}\n\n" 172 | 173 | fi 174 | 175 | ## writing summary info to table ## 176 | printf "$assembly\t$downloaded_accession\t$ass_name\t$taxid\t$org_name\t$infraspecific_name\t$version_status\t$asm_level\t$num_SCG_hits\t$uniq_SCG_hits\t$perc_comp_rnd\t$perc_redund_rnd\n" >> ${output_dir}/NCBI_genomes_summary_info.tsv 177 | 178 | ### Pulling out hits for this genome (nucleotide as specified by user) ### 179 | target_genes_suffix="_genes.fa.tmp" 180 | 181 | # indexing 182 | esl-sfetch --index ${tmp_dir}/${assembly}${target_genes_suffix} > /dev/null 183 | 184 | # looping through and pulling out each first hit (hmm results tab is sorted by e-value): 185 | # if best-hit mode is off, then only pulling genes that were identified in exactly 1 copy 186 | if [ $best_hit_mode == "false" ]; then 187 | 188 | for SCG in $(cat ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp) 189 | do 190 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 191 | done 192 | 193 | # if best-hit mode is on, taking best hit 194 | else 195 | 196 | for SCG in $(cat ${tmp_dir}/uniq_hmm_names.tmp) 197 | do 198 | grep -w "$SCG" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | awk '!x[$3]++' | cut -f1 -d " " | esl-sfetch -f ${tmp_dir}/${assembly}${target_genes_suffix} - | sed "s/>.*$/>$assembly/" | sed 's/^Usage.*$//' | sed 's/^To see.*$//' | sed '/^$/d' >> ${tmp_dir}/${SCG}_hits.fa 199 | done 200 | 201 | fi 202 | 203 | 204 | ## searching for additional targets if provided 205 | # getting count of genes if there are additional targets 206 | if [ $ko_targets == "true" ] || [ $additional_pfam_targets == "true" ]; then 207 | 208 | gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.faa.tmp) 209 | 210 | fi 211 | 212 | ## KOs 213 | if [ $ko_targets == "true" ]; then 214 | 215 | gtt-run-kofamscan.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${output_dir}/KO_search_results/target-KOs.tsv ${output_dir}/KO_search_results/target_KO_profiles/ ${tmp_dir} ${output_dir} ${target_KOs} 216 | 217 | fi 218 | 219 | ## Pfams 220 | if [ $additional_pfam_targets == "true" ]; then 221 | 222 | gtt-run-additional-pfam-search.sh ${assembly} ${tmp_dir}/${assembly}_genes.faa.tmp ${gene_count} ${num_cpus} ${tmp_dir} ${output_dir} 223 | 224 | fi 225 | 226 | if [ $debug_flag == "true" ]; then 227 | if [ -s ${tmp_dir}/${assembly}_genes2.faa.tmp ]; then 228 | mv ${tmp_dir}/${assembly}_genes2.faa.tmp ${tmp_dir}/ncbi-downloads/${assembly}_protein.faa 229 | fi 230 | if [ -s ${tmp_dir}/${assembly}_genes1.fa.tmp ]; then 231 | mv ${tmp_dir}/${assembly}_genes1.fa.tmp ${tmp_dir}/ncbi-downloads/${assembly}_cds.fa 232 | fi 233 | if [ -s ${tmp_dir}/${assembly}_genome.tmp ]; then 234 | mv ${tmp_dir}/${assembly}_genome.tmp ${tmp_dir}/ncbi-downloads/${assembly}_genomic.fna 235 | fi 236 | fi 237 | 238 | rm -rf ${tmp_dir}/${assembly}_genes*.tmp* 239 | rm -rf ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/${assembly}_uniq_counts.tmp 240 | rm -rf ${tmp_dir}/${assembly}_conservative_filtering_counts_tab.tmp 241 | rm -rf ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp 242 | 243 | else 244 | printf " ${ORANGE}******************************* ${NC}NOTICE ${ORANGE}*******************************${NC} \n" 245 | printf "\t $assembly's genome not successfully downloaded :(\n\n" 246 | printf "\t Reported 
in \"${output_dir}/run_files/NCBI_accessions_not_downloaded.txt\"\n" 247 | printf " ${ORANGE}************************************************************************ ${NC}\n" 248 | rm -rf ${tmp_dir}/${assembly}_genome.tmp.gz 249 | 250 | sleep 2 251 | 252 | echo $assembly >> ${output_dir}/NCBI_accessions_not_downloaded.txt 253 | 254 | fi 255 | 256 | done < $1 257 | -------------------------------------------------------------------------------- /bin/gtt-parse-assembly-summary-file: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='This script is for parsing NCBI\'s assembly summary file down to the provided accessions.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-a", "--assembly_summary", help="NCBI's assembly summary file", action="store", dest="all_assemblies", required=True) 11 | required.add_argument("-w", "--wanted_accessions", help="Single-column file with wanted accessions", action="store", dest="wanted_accs", required=True) 12 | parser.add_argument("-o", "--output_file", help='Wanted summary info only (default: "Wanted.tsv")', action="store", dest="output_file", default="Wanted.tsv") 13 | 14 | if len(sys.argv)==1: 15 | parser.print_help(sys.stderr) 16 | sys.exit(0) 17 | 18 | args = parser.parse_args() 19 | 20 | wanted_dict = {} 21 | 22 | with open(args.wanted_accs, "r") as wanted_accs: 23 | for line in wanted_accs: 24 | root_acc = line.strip().split(".")[0] 25 | wanted_dict[str(root_acc)] = line.strip() 26 | 27 | out_file = open(args.output_file, "w") 28 | 29 | # out_file.write("input_accession\tdownloaded_accession\tassembly_name\ttaxid\torganism_name\tinfraspecific_name\tversion_status\tassembly_level\tftp_path\n") 30 | 31 | with open(args.all_assemblies) as assemblies: 32 | for line in assemblies: 33 | line = line.split("\t") 34 | if line[0].split(".")[0] in wanted_dict: 35 | 36 | dl_acc = str(line[0]) 37 | if not dl_acc: 38 | dl_acc = "NA" 39 | 40 | ass_name = str(line[15]) 41 | if not ass_name: 42 | ass_name = "NA" 43 | 44 | taxid = str(line[5]) 45 | if not taxid: 46 | taxid = "NA" 47 | 48 | org_name = str(line[7]) 49 | if not org_name: 50 | org_name = "NA" 51 | 52 | infra_name = str(line[8]) 53 | if not infra_name: 54 | infra_name = "NA" 55 | 56 | version_status = str(line[10]) 57 | if not version_status: 58 | version_status = "NA" 59 | 60 | ass_level = str(line[11]) 61 | if not ass_level: 62 | ass_level = "NA" 63 | 64 | ftp_path = str(line[19]) 65 | if not ftp_path: 66 | ftp_path = "NA" 67 | 68 | out_file.write(str(wanted_dict[str(line[0].split(".")[0])]) + "\t" + str(dl_acc) + "\t" + str(ass_name) + "\t" + str(taxid) + "\t" + str(org_name) + "\t" + str(infra_name) + "\t" + str(version_status) + "\t" + str(ass_level) + "\t" + str(ftp_path) + "\n") 69 | -------------------------------------------------------------------------------- /bin/gtt-parse-fasta-by-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio.SeqIO.FastaIO import SimpleFastaParser 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script is for parsing a fasta file by pulling out sequences with the desired headers. 
If you want all sequences EXCEPT the ones with the headers you are providing, add the flag "--inverse".') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input-fasta", help="Original fasta file", action="store", required=True) 12 | required.add_argument("-w", "--wanted-headers", help="Single-column file with sequence headers", action="store", required=True) 13 | parser.add_argument("-o", "--output-fasta", help='Output fasta file default: "Wanted.fa"', action="store", default="Wanted.fa") 14 | parser.add_argument("--inverse", help="Add this flag to pull out all sequences with headers NOT in the provided header file.", action="store_true") 15 | 16 | if len(sys.argv)==1: 17 | parser.print_help(sys.stderr) 18 | sys.exit(0) 19 | 20 | args = parser.parse_args() 21 | 22 | headers_of_int_list = [line.strip() for line in open(args.wanted_headers)] 23 | 24 | if not args.inverse: 25 | 26 | with open(args.output_fasta, "w") as output_file: 27 | 28 | with open(args.input_fasta, "r") as input_file: 29 | 30 | for header, seq in SimpleFastaParser(input_file): 31 | 32 | if header in headers_of_int_list: 33 | 34 | output_file.write(">%s\n%s\n" % (header, seq)) 35 | 36 | else: 37 | 38 | with open(args.output_fasta, "w") as output_file: 39 | 40 | with open(args.input_fasta, "r") as input_file: 41 | 42 | for header, seq in SimpleFastaParser(input_file): 43 | 44 | if header not in headers_of_int_list: 45 | 46 | output_file.write(">%s\n%s\n" % (header, seq)) 47 | -------------------------------------------------------------------------------- /bin/gtt-parse-gtdb-assembly-summary-file: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser(description='This script is for parsing GTDB\'s assembly metadata file down to the target accessions.') 7 | 8 | required = parser.add_argument_group('required arguments') 9 | 10 | required.add_argument("-a", "--assembly_summary", help="GTDB's assembly metadata file", action="store", dest="all_assemblies", required=True) 11 | required.add_argument("-w", "--wanted_accessions", help="Single-column file with wanted accessions", action="store", dest="wanted_accs", required=True) 12 | parser.add_argument("-o", "--output_file", help='Wanted summary info only (default: "target-gtdb.tsv")', action="store", dest="output_file", default="target-gtdb.tsv") 13 | parser.add_argument("-f", "--found_accs_output_file", help='Accessions found in GTDB (default: "gtdb-found-accs.txt")', action="store", dest="found_accs_output_file", default="gtdb-found-accs.txt") 14 | parser.add_argument("-n", "--not_found_accs_output_file", help='Accessions not found in GTDB (default: "gtdb-not-found-accs.tsv")', action="store", dest="not_found_accs_output_file", default="gtdb-not-found-accs.tsv") 15 | parser.add_argument("-t", "--gtdb_tax_output_file", help='Target GTDB taxonomy table (default: "target-gtdb-tax.tsv")', action="store", dest="gtdb_tax_output_file", default="target-gtdb-tax.tsv") 16 | 17 | 18 | 19 | if len(sys.argv)==1: 20 | parser.print_help(sys.stderr) 21 | sys.exit(0) 22 | 23 | args = parser.parse_args() 24 | 25 | wanted_dict = {} 26 | 27 | with open(args.wanted_accs, "r") as wanted_accs: 28 | for line in wanted_accs: 29 | root_acc = line.strip().split(".")[0][4:] 30 | wanted_dict[str(root_acc)] = line.strip() 31 | 32 | out_file = open(args.output_file, "w") 33 | 34 | # tracking found so can write out those 
not found at end too 35 | found_accs = [] 36 | 37 | gtdb_found_accs_out_file = open(args.found_accs_output_file, "w") 38 | # adding header 39 | gtdb_found_accs_out_file.write("input_searched\tgtdb_acc_found\tfull_gtdb_acc\n") 40 | 41 | 42 | with open(args.all_assemblies) as assemblies: 43 | # writing out header to keep 44 | out_file.write(assemblies.readline()) 45 | 46 | for line in assemblies: 47 | split_line = line.strip().split("\t") 48 | 49 | acc_with_no_version = split_line[0][7:].split(".")[0] 50 | 51 | # i believe refseq typically only has 1 version, so taking even if not the same version as specified (this info, what was searched and what was found, is reported in the output "/run_files/gtdb_to_input_accession_map.tsv" file) 52 | if acc_with_no_version in wanted_dict: 53 | 54 | out_file.write(line) 55 | gtdb_found_accs_out_file.write(wanted_dict[acc_with_no_version] + "\t" + split_line[0][3:] + "\t" + split_line[0] + "\n") 56 | 57 | # adding to found list 58 | found_accs.append(wanted_dict[acc_with_no_version]) 59 | 60 | out_file.close() 61 | gtdb_found_accs_out_file.close() 62 | 63 | ## getting and writing out entries that weren't found (and how they were searched) 64 | wanted_accs = list(wanted_dict.values()) 65 | not_found_accs = list(set(wanted_accs) - set(found_accs)) 66 | 67 | if len(not_found_accs) > 0: 68 | with open(args.not_found_accs_output_file, "w") as not_found_output_file: 69 | 70 | # writing out header 71 | not_found_output_file.write("input\tsearched_as\n") 72 | 73 | for key in wanted_dict: 74 | if wanted_dict[key] in not_found_accs: 75 | 76 | not_found_output_file.write(wanted_dict[key] + "\t" + key + "\n") 77 | 78 | # making GTDB taxonomy table only 79 | out_gtdb_tax_table = open(args.gtdb_tax_output_file, "w") 80 | 81 | # adding header 82 | out_gtdb_tax_table.write("base_gtdb_acc\tfull_gtdb_acc\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n") 83 | 84 | with open(args.output_file, "r") as assemblies: 85 | 86 | # skipping header 87 | next(assemblies) 88 | 89 | for line in assemblies: 90 | line = line.strip().split("\t") 91 | full_gtdb_acc = line[0] 92 | base_gtdb_acc = full_gtdb_acc[3:] 93 | 94 | gtdb_tax_list = [line[1], line[2], line[3], line[4], line[5], line[6], line[7]] 95 | 96 | if len(gtdb_tax_list) != 7: 97 | print("GTDB entry " + full_gtdb_acc + " doesn't seem to have full lineage info.") 98 | 99 | out_gtdb_tax_table.write(base_gtdb_acc + "\t" + full_gtdb_acc + "\t" + gtdb_tax_list[0] + "\t" + gtdb_tax_list[1] + "\t" + gtdb_tax_list[2] + "\t" + gtdb_tax_list[3] + "\t" + gtdb_tax_list[4] + "\t" + gtdb_tax_list[5] + "\t" + gtdb_tax_list[6] + "\n") 100 | 101 | out_gtdb_tax_table.close() 102 | -------------------------------------------------------------------------------- /bin/gtt-parse-kofamscan-targets.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | target_KOs_file=$1 4 | output_dir=$2 5 | 6 | full_KO_list_file="${KO_data_dir}/ko_list" 7 | full_KO_hmms_dir="${KO_data_dir}/profiles" 8 | 9 | sub_KO_list_file="${output_dir}/KO_search_results/target-KOs.tsv" 10 | sub_KO_hmms_dir="${output_dir}/KO_search_results/target_KO_profiles/" 11 | 12 | # making target ko_list file and copying target hmms to working area 13 | head -n 1 ${full_KO_list_file} > ${sub_KO_list_file} 14 | 15 | for ko in $(cat ${target_KOs_file}); do 16 | grep -m 1 -w "^${ko}" ${full_KO_list_file} >> ${sub_KO_list_file} 17 | cp ${full_KO_hmms_dir}/${ko}.hmm ${sub_KO_hmms_dir} 18 | done 19 | 
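
Note on the two Python parsers above: gtt-parse-assembly-summary-file keys its lookup on the accession with the version suffix dropped, while gtt-parse-gtdb-assembly-summary-file additionally strips the "RS_"/"GB_" and "GCA_"/"GCF_" prefixes before comparing. Below is a minimal sketch of that normalization (not part of GToTree; the helper names and example accessions are just illustrative):

```python
# Illustrative sketch of the accession normalization used for matching in
# gtt-parse-assembly-summary-file and gtt-parse-gtdb-assembly-summary-file
# (these helper names are hypothetical, not GToTree functions).

def ncbi_key(acc: str) -> str:
    # "GCF_000005845.2" -> "GCF_000005845" (version suffix dropped)
    return acc.strip().split(".")[0]

def wanted_gtdb_key(acc: str) -> str:
    # input accession "GCF_000005845.2" -> "000005845"
    # (GCA_/GCF_ prefix and version suffix dropped, as done with [4:])
    return acc.strip().split(".")[0][4:]

def gtdb_metadata_key(gtdb_acc: str) -> str:
    # GTDB accession "RS_GCF_000005845.2" -> "000005845"
    # (RS_/GB_ plus GCA_/GCF_ prefix and version dropped, as done with [7:])
    return gtdb_acc.strip()[7:].split(".")[0]

if __name__ == "__main__":
    wanted = "GCF_000005845.2"         # example input accession
    gtdb_entry = "RS_GCF_000005845.2"  # example GTDB metadata accession
    assert wanted_gtdb_key(wanted) == gtdb_metadata_key(gtdb_entry)
    print("matched on key:", wanted_gtdb_key(wanted))
```

Comparing only the version-less part is also why an accession can still be matched when the version differs from what was requested, as the GTDB parser notes in its comments.
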
-------------------------------------------------------------------------------- /bin/gtt-remove-all-gap-seqs-from-alignment: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script will remove sequences that are entirely gap characters ("-") from an alignment fasta file, specific for use in GToTree.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Starting fasta file", action="store", dest="input_fasta", required=True) 12 | parser.add_argument("-o", "--output_fasta", help='Output fasta file (default: "No-gap-seqs-aln.faa").', dest="output_fasta", default="No-gap-seqs-aln.faa") 13 | 14 | if len(sys.argv)==1: 15 | parser.print_help(sys.stderr) 16 | sys.exit(0) 17 | 18 | args = parser.parse_args() 19 | 20 | with open(args.input_fasta, "r") as in_fasta: 21 | with open(args.output_fasta, "w") as out: 22 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 23 | if seq_record.seq != len(seq_record.seq) * "-": 24 | out.write(">" + str(seq_record.id) + "\n" + str(seq_record.seq) + "\n") 25 | -------------------------------------------------------------------------------- /bin/gtt-rename-fasta-headers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio.SeqIO.FastaIO import SimpleFastaParser 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script will rename all sequences of a multifasta with the same name with an appended number to keep them unique.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input-fasta", help="Starting fasta file", action="store", required=True) 12 | parser.add_argument("-w", "--wanted-name", help='Name to give seqs (default: "Seq")', action="store", default="Seq") 13 | parser.add_argument("-o", "--output-fasta", help='Output fasta file (default: "Renamed.fasta").', default="Renamed.fasta") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | n = 0 22 | 23 | with open(args.output_fasta, "w") as output_file: 24 | 25 | with open(args.input_fasta, "r") as input_file: 26 | 27 | for header, seq in SimpleFastaParser(input_file): 28 | 29 | n = n + 1 30 | output_file.write(">" + str(args.wanted_name) + "_" + str(n) + "\n" + seq + "\n") 31 | -------------------------------------------------------------------------------- /bin/gtt-reorder-fasta: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser('This script takes a multifasta file and reorders the sequences according to the headers provided.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Original fasta file", action="store", dest="input_fasta", required=True) 12 | required.add_argument("-w", "--wanted_sequence_order", help="Single-column file with headers in desired order", action="store", dest="ordered_headers", required=True) 13 | parser.add_argument("-o", "--output_fasta", help='Reordered output fasta (default: "Reordered.fa").', dest="output_fasta", default="Reordered.fa") 14 | 15 
| if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | ordered_seqs = open(args.ordered_headers, "r") 22 | 23 | ordered_list = list(line.strip() for line in ordered_seqs) 24 | 25 | fasta_dict = SeqIO.index(args.input_fasta, "fasta") 26 | 27 | fasta_out = open(args.output_fasta, "w") 28 | 29 | for header in ordered_list: 30 | fasta_out.write(fasta_dict.get_raw(header).decode()) 31 | 32 | ordered_seqs.close() 33 | fasta_out.close() 34 | -------------------------------------------------------------------------------- /bin/gtt-run-additional-pfam-search.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | assembly_id=${1} 4 | genes_file=${2} 5 | gene_count=${3} 6 | num_cpus=${4} 7 | tmp_dir=${5} 8 | output_dir=${6} 9 | 10 | hmmsearch_output_file="${output_dir}/Pfam_search_results/individual_genome_results/${assembly_id}_hmmsearch.txt" 11 | 12 | hmmsearch --cut_ga --cpu $num_cpus --tblout ${hmmsearch_output_file} ${tmp_dir}/all_pfam_targets.hmm ${genes_file} > /dev/null 13 | 14 | ### getting counts of each target in this genome 15 | for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp) 16 | do 17 | grep -w ${target} ${hmmsearch_output_file} | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly_id}_hit_counts.tmp 18 | done 19 | 20 | ### writing results to main output file 21 | paste <( printf "${assembly_id}\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly_id}_hit_counts.tmp | tr "\n" "\t" | sed 's/\t$/\n/')" ) >> ${output_dir}/Pfam_search_results/Pfam-hit-counts.tsv 22 | 23 | ### Pulling out hits to additional pfam targets for this genome ### 24 | for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp) 25 | do 26 | if grep -w -q "$target" ${hmmsearch_output_file}; then 27 | 28 | grep -w "$target" ${hmmsearch_output_file} | cut -f 1 -d " " >> ${tmp_dir}/${assembly_id}_${target}_genes_of_int.tmp 29 | 30 | for gene in $(cat ${tmp_dir}/${assembly_id}_${target}_genes_of_int.tmp) 31 | do 32 | echo $gene | esl-sfetch -f ${tmp_dir}/${assembly_id}_genes.tmp - 33 | done >> ${tmp_dir}/${assembly_id}_${target}_genes1.tmp 34 | 35 | gtt-append-fasta-headers -i ${tmp_dir}/${assembly_id}_${target}_genes1.tmp -w ${assembly_id}_${target} -o ${tmp_dir}/${assembly_id}_${target}_genes.tmp 36 | 37 | # adding to fasta of that target holding all genomes 38 | cat ${tmp_dir}/${assembly_id}_${target}_genes.tmp >> ${output_dir}/Pfam_search_results/Pfam_hit_seqs/${target}-hits.faa 39 | fi 40 | 41 | done 42 | -------------------------------------------------------------------------------- /bin/gtt-run-kofamscan.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | assembly_id=${1} 4 | genes_file=${2} 5 | gene_count=${3} 6 | target_KOs_table_file=${4} 7 | target_KO_hmms_dir=${5} 8 | tmp_dir=${6} 9 | output_dir=${7} 10 | unique_target_KOs=${8} 11 | 12 | 13 | curr_genome_output_dir="${output_dir}/KO_search_results/individual_genome_results/${assembly_id}/" 14 | mkdir -p ${curr_genome_output_dir} 15 | 16 | output_results_table_file="${curr_genome_output_dir}/kofamscan-results.tsv" 17 | tmp_ko_working_dir="${tmp_dir}/kofamscan/${assembly_id}/" 18 | tmp_unique_ko_hits="${tmp_ko_working_dir}/unique-KOs.txt" 19 | output_counts_file="${tmp_ko_working_dir}/KO-counts.txt" 20 | 21 | # running scan 22 | exec_annotation -p ${target_KO_hmms_dir} -k ${target_KOs_table_file} --cpu 1 -f mapper --no-report-unannotated --tmp-dir 
${tmp_ko_working_dir} -o ${output_results_table_file} ${genes_file} 23 | 24 | # moving forward only if there were any hits 25 | if [ -s ${output_results_table_file} ]; then 26 | 27 | # getting all unique KOs with hits in this genome 28 | cut -f 2 ${output_results_table_file} | sort -u > ${tmp_unique_ko_hits} 29 | 30 | # creating individual fasta files with hits for each 31 | for ko in $(cat ${tmp_unique_ko_hits}); do 32 | 33 | # getting gene IDs with hits to the current KO 34 | grep -w ${ko} ${output_results_table_file} | cut -f 1 > ${tmp_ko_working_dir}/${ko}-gene-IDs.txt 35 | 36 | # pulling out seqs for this genome 37 | gtt-parse-fasta-by-headers -i ${genes_file} -w ${tmp_ko_working_dir}/${ko}-gene-IDs.txt -o ${tmp_ko_working_dir}/${ko}.faa 38 | 39 | # removing gene IDs file 40 | rm ${tmp_ko_working_dir}/${ko}-gene-IDs.txt 41 | 42 | done 43 | 44 | else 45 | 46 | printf "No hits detected.\n" > ${output_results_table_file} 47 | 48 | fi 49 | 50 | # creating count file that can be stuck together at end 51 | rm -rf ${output_counts_file}.tmp 52 | 53 | for ko in $(cat ${unique_target_KOs}); do 54 | 55 | grep -w -c ${ko} ${output_results_table_file} >> ${output_counts_file}.tmp 56 | 57 | done 58 | 59 | # rearranging so we can combine them afterwards more easily 60 | paste <( printf "${assembly_id}\t${gene_count}" ) <( tr "\n" "\t" < ${output_counts_file}.tmp | sed 's/\t$/\n/' ) > ${output_counts_file} 61 | rm ${output_counts_file}.tmp 62 | -------------------------------------------------------------------------------- /bin/gtt-store-SCG-HMMs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | import argparse 6 | import textwrap 7 | import shutil 8 | import time 9 | from glob import glob 10 | 11 | parser = argparse.ArgumentParser(description="This program adds a single-copy-gene-HMMs file to the stored GToTree location.") 12 | 13 | required = parser.add_argument_group('required arguments') 14 | required.add_argument("hmm_file", metavar='hmm_file', type=str, help="HMM file to be added", action="store") 15 | 16 | if len(sys.argv)==1: 17 | parser.print_help(sys.stderr) 18 | sys.exit(0) 19 | 20 | args = parser.parse_args() 21 | 22 | def main(): 23 | 24 | check_input() 25 | 26 | HMM_dir = get_HMM_dir() 27 | 28 | copy_if_safe(HMM_dir) 29 | 30 | report_available_HMMs(HMM_dir) 31 | 32 | # setting some colors 33 | tty_colors = { 34 | 'green' : '\033[0;32m%s\033[0m', 35 | 'yellow' : '\033[0;33m%s\033[0m', 36 | 'red' : '\033[0;31m%s\033[0m' 37 | } 38 | 39 | 40 | ### functions ### 41 | def color_text(text, color='green'): 42 | if sys.stdout.isatty(): 43 | return tty_colors[color] % text 44 | else: 45 | return text 46 | 47 | 48 | # print wrapper 49 | def wprint(text): 50 | print(textwrap.fill(text, width=80, initial_indent=" ", 51 | subsequent_indent=" ", break_on_hyphens=False)) 52 | 53 | def check_input(): 54 | if not os.path.exists(args.hmm_file): 55 | print("") 56 | wprint(color_text("Seems there is no file called '" + str(args.hmm_file) + "' here :(", "yellow")) 57 | print("") 58 | print("Exiting for now.\n") 59 | sys.exit(0) 60 | 61 | 62 | def get_HMM_dir(): 63 | 64 | # should be stored here if conda install of GToTree was performed 65 | try: 66 | HMM_dir = os.environ['GToTree_HMM_dir'] 67 | except: 68 | HMM_dir = False 69 | 70 | if not HMM_dir: 71 | print("") 72 | wprint(color_text("Seems there is no stored GToTree HMM directory :(", "yellow")) 73 | print(" Installing GToTree with conda would take care of it 
if interested.\n") 74 | print("Exiting for now.\n") 75 | sys.exit(0) 76 | 77 | print("") 78 | print(" GToTree stored SCG-HMMs are located in:") 79 | print(" " + HMM_dir + "\n") 80 | 81 | return(HMM_dir) 82 | 83 | def copy_if_safe(path): 84 | contents = os.listdir(path) 85 | 86 | hmm_file = os.path.basename(args.hmm_file) 87 | 88 | if hmm_file in contents: 89 | 90 | wprint(color_text("Seems there is already a file named '" + str(args.hmm_file) + "' stored in the GToTree HMM directory.", "yellow")) 91 | print("") 92 | print("Exiting for now.\n") 93 | sys.exit(0) 94 | 95 | else: 96 | shutil.copy(args.hmm_file, path + hmm_file) 97 | wprint("The file '" + color_text(str(hmm_file)) + "' now happily lives with the rest of the SCG-HMMs stored with GToTree :)") 98 | print("") 99 | 100 | def report_available_HMMs(path): 101 | 102 | time.sleep(1) 103 | 104 | 105 | files = [os.path.basename(x) for x in glob(path + "*.hmm")] 106 | 107 | files.sort() 108 | 109 | files_and_counts = {} 110 | 111 | for file in files: 112 | if file.endswith(".hmm"): 113 | count = 0 114 | with open(path + file, "r") as f: 115 | for line in f: 116 | if line.startswith("ACC"): 117 | count += 1 118 | files_and_counts[file] = count 119 | 120 | print(" The " + str(len(files)) + " currently stored SCG-HMMs include:\n") 121 | 122 | for key, value in files_and_counts.items(): 123 | print("\t {:<27} {:>15}".format(key, "(" + str(value) + " genes)")) 124 | 125 | print("") 126 | 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /bin/gtt-subset-GTDB-accessions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | This is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki) 5 | to facilitate subsetting accessions pulled from the GTDB database (with 'gtt-get-accessions-from-GTDB'). 6 | 7 | It is intended to help when wanting a tree to span a breadth of diversity we know about, while also helping 8 | to reduce over-representation of certain taxa. 9 | 10 | There are two primary methods to use it: 11 | 12 | 1) If a specific class makes up > 0.05% (by default) of the total number of accessions, it will randomly subset 13 | that class down to 5% of what it was. So if there are 40,000 total target genomes, and Gammaproteobacteria have 14 | 8,000 of them (20% of the total), the program will randomly select 80 Gammaproteobacteria to include (1% of 8,000). 15 | 16 | 2) Select 1 random genome from each of the specific rank: "phylum", "class", "order", "family", "genus", or "species". 17 | """ 18 | 19 | import sys 20 | import argparse 21 | import textwrap 22 | import pandas as pd 23 | 24 | 25 | parser = argparse.ArgumentParser(description='This script is a helper program of GToTree (https://github.com/AstrobioMike/GToTree/wiki)\ 26 | to facilitate subsetting accessions pulled from the GTDB database (with\ 27 | \'gtt-get-accessions-from-GTDB\' – the input file is the "*metadata.tsv" from that program).\ 28 | It is intended to help when wanting a tree to span the breadth\ 29 | of diversity we know about, while also helping to reduce over-representation of certain taxa. \ 30 | There are 2 primary methods for using it: \ 31 | 1) If a specific Class makes up > 0.05% (by default) of the total number of target genomes, the script\ 32 | will randomly subset that class down to 1% of what it was. 
So if there are 40,000 total target genomes,\ 33 | and Gammaproteobacteria make up 8,000 of them (20% of the total), the program will randomly select 80 Gammaproteobacteria\ 34 | to include (1% of 8,000). \ 35 | 2) Select 1 random genome from each taxa of the specified rank. \ 36 | It ultimately outputs a new subset accessions file ready for use with the main GToTree program.', 37 | epilog = "Ex. usage: gtt-subset-GTDB-classes -i GTDB-arc-and-bac-refseq-rep-metadata.tsv --get-only-individuals-for-the-rank order") 38 | 39 | required = parser.add_argument_group('required arguments') 40 | 41 | required.add_argument("-i", "--GTDB-table", help="GTDB metadata table produced with 'gtt-get-accessions-from-GTDB'", action="store", required=True) 42 | parser.add_argument("-o", "--output-prefix", help='output prefix for output subset accessions (*.txt) and GTDB taxonomy files (*.tsv) (default: "subset-accessions")', action="store", dest="output_prefix", default="subset-accessions") 43 | 44 | parser.add_argument("-p", "--cutoff-fraction", help='Fraction of total target genomes that any given Class must contribute in order for that class to be randomly subset (default: 0.0005)', action="store", default=0.0005) 45 | parser.add_argument("-f", "--fraction-to-subset", help='Fraction those that are filtered should be randomly subset down to (default: 0.01)', action="store", dest="filter_fraction", default=0.01) 46 | 47 | # this is being left for backwards-compatibility reasons only (same ) 48 | parser.add_argument("--get-Order-representatives-only", action="store_true", help = "Provide this flag to simply get 1 random genome from each Order in GTDB (same as if specifying \ 49 | `--get-only-individuals-for-the-rank order`, but left here for backwards-compatibility purposes)", dest = "order_only") 50 | 51 | parser.add_argument("--get-only-individuals-for-the-rank", action="store", choices = {"phylum", "class", "order", "family", "genus", "species"}, 52 | help = "Use this option with a specified rank if wanting to randomly subset such that we retain 1 genome from each entry in a specific rank in GTDB", dest = "target_rank") 53 | 54 | 55 | parser.add_argument("--seed", action = "store", help = "Specify the seed for random subsampling (default = 1)", default = 1, type = int) 56 | 57 | if len(sys.argv)==1: 58 | parser.print_help(sys.stderr) 59 | sys.exit(0) 60 | 61 | args = parser.parse_args() 62 | 63 | 64 | ############################################################ 65 | 66 | # setting some colors 67 | tty_colors = { 68 | 'green' : '\033[0;32m%s\033[0m', 69 | 'yellow' : '\033[0;33m%s\033[0m', 70 | 'red' : '\033[0;31m%s\033[0m' 71 | } 72 | 73 | 74 | ### functions ### 75 | def color_text(text, color='green'): 76 | if sys.stdout.isatty(): 77 | return tty_colors[color] % text 78 | else: 79 | return text 80 | 81 | def wprint(text): 82 | print(textwrap.fill(text, width=80, initial_indent=" ", 83 | subsequent_indent=" ", break_on_hyphens=False)) 84 | 85 | 86 | # function to subset master table to one for each order 87 | def order_subset_table(order_to_subset, input_tab, seed): 88 | 89 | sub_master_df_to_keep = input_tab.loc[input_tab["order"] != order_to_subset] 90 | 91 | curr_order_df = input_tab.loc[input_tab["order"] == order_to_subset] 92 | 93 | random_sub_df = curr_order_df.sample(n = 1, random_state = int(seed)) 94 | 95 | new_df = pd.concat([sub_master_df_to_keep, random_sub_df]) 96 | 97 | return new_df 98 | 99 | 100 | # function to subset master table 101 | def subset_table(class_to_subset, input_tab, seed): 102 
| 103 | curr_class = class_to_subset 104 | 105 | sub_master_df_to_keep = input_tab.loc[input_tab["class"] != curr_class] 106 | 107 | curr_class_df = input_tab.loc[input_tab["class"] == curr_class] 108 | 109 | random_sub_df = curr_class_df.sample(n=int(len(curr_class_df.index) * float(args.filter_fraction)), random_state = int(seed)) 110 | 111 | new_df = pd.concat([sub_master_df_to_keep, random_sub_df]) 112 | 113 | return new_df 114 | 115 | 116 | # function to subset arbitrary rank 117 | def taxa_subset_table(taxa_to_subset, rank, input_tab, seed): 118 | 119 | sub_master_df_to_keep = input_tab.loc[input_tab[rank] != taxa_to_subset] 120 | 121 | curr_order_df = input_tab.loc[input_tab[rank] == taxa_to_subset] 122 | 123 | random_sub_df = curr_order_df.sample(n = 1, random_state = int(seed)) 124 | 125 | new_df = pd.concat([sub_master_df_to_keep, random_sub_df]) 126 | 127 | return new_df 128 | 129 | 130 | ################################################################################ 131 | 132 | 133 | 134 | # reading lineage table into pandas dataframe 135 | lineage_df = pd.read_csv(args.GTDB_table, delimiter="\t", usecols = range(8)) 136 | 137 | starting_size = len(lineage_df.index) 138 | 139 | # just giving 1 of each order if requested (left here like this for backwards-compatibility purposes) 140 | if args.order_only: 141 | 142 | args.target_rank = "order" 143 | 144 | 145 | if args.target_rank: 146 | 147 | # getting list of all unique entries of wanted rank 148 | unique_entries = lineage_df[args.target_rank].unique() 149 | 150 | # getting one rep genome of each of these 151 | for entry in unique_entries: 152 | 153 | lineage_df = taxa_subset_table(entry, args.target_rank, lineage_df, args.seed) 154 | 155 | filtered_size = len(lineage_df.index) 156 | 157 | # removing "RS_" and "GB_" prefixes and writing out output accs 158 | output_accessions = args.output_prefix + ".txt" 159 | with open(output_accessions, "w") as out: 160 | for acc in lineage_df.accession: 161 | out.write(acc[3:] + "\n") 162 | 163 | # writing out subset GTDB taxonomy 164 | output_tax = args.output_prefix + "-taxonomy.tsv" 165 | lineage_df.to_csv(output_tax, sep = "\t", index = False) 166 | 167 | # reporting and exiting 168 | print("") 169 | wprint(color_text(str("{:,}".format(starting_size)) + " initial entries were subset down to " + str("{:,}".format(filtered_size)) + "\n", "yellow")) 170 | print("") 171 | wprint("Subset accessions file for GToTree written to:") 172 | print(color_text(" " + str(output_accessions)) + "\n") 173 | wprint("A subset GTDB taxonomy table for these accessions written to:") 174 | print(color_text(" " + str(output_tax)) + "\n") 175 | 176 | exit() 177 | 178 | 179 | # if the above didn't run and exit, then we are going to randomly subset based on class 180 | class_dict = {} 181 | 182 | # counting how many times each class shows up 183 | for index, row in lineage_df.iterrows(): 184 | if row["class"] not in class_dict: 185 | class_dict[row["class"]] = 1 186 | else: 187 | class_dict[row["class"]] += 1 188 | 189 | # getting cutoff threshold of total number of entries 190 | cutoff = int(starting_size * float(args.cutoff_fraction)) 191 | 192 | # getting which classes are above this threshold 193 | classes_to_subset = [] 194 | 195 | for key in class_dict: 196 | if class_dict[key] >= cutoff: 197 | classes_to_subset.append(key) 198 | 199 | # subsetting each class 200 | for rank in classes_to_subset: 201 | lineage_df = subset_table(rank, lineage_df, args.seed) 202 | 203 | filtered_size = len(lineage_df.index) 
204 | 205 | # removing "RS_" and "GB_" prefixes and writing out output accs 206 | output_accessions = args.output_prefix + ".txt" 207 | with open(output_accessions, "w") as out: 208 | for acc in lineage_df.accession: 209 | out.write(acc[3:] + "\n") 210 | 211 | # writing out subset GTDB taxonomy 212 | output_tax = args.output_prefix + "-taxonomy.tsv" 213 | lineage_df.to_csv(output_tax, sep = "\t", index = False) 214 | 215 | 216 | print("") 217 | wprint(color_text(str("{:,}".format(starting_size)) + " initial entries were subset down to " + str("{:,}".format(filtered_size)) + "\n", "yellow")) 218 | # print("\n Subset classes included: \n\t\t\t\t" + "\n\t\t\t\t".join(classes_to_subset) + "\n") 219 | print("") 220 | wprint("Subset accessions written to file:") 221 | print(color_text(" " + str(output_accessions)) + "\n") 222 | wprint("A subset GTDB taxonomy table for these accessions written to:") 223 | print(color_text(" " + str(output_tax)) + "\n") 224 | -------------------------------------------------------------------------------- /bin/gtt-swap-ids: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from Bio import SeqIO 4 | import sys 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='This script will swap the headers of a fasta file.') 8 | 9 | required = parser.add_argument_group('required arguments') 10 | 11 | required.add_argument("-i", "--input_fasta", help="Starting fasta file", action="store", dest="input_fasta", required=True) 12 | parser.add_argument("-s", "--map_of_ids_to_swap", help="Two column tab-delimited file where column 1 holds the original headers and column 2 holds the desired headers. (doesn't need to hold all headers)", action="store", dest="id_map") 13 | parser.add_argument("-o", "--output_fasta_name", help='Output fasta file (default: "Renamed.fasta").', dest="output_fasta_name", default="Renamed.fasta") 14 | 15 | if len(sys.argv)==1: 16 | parser.print_help(sys.stderr) 17 | sys.exit(0) 18 | 19 | args = parser.parse_args() 20 | 21 | map_dict = {} 22 | 23 | with open(args.id_map) as map: 24 | for line in map: 25 | line = line.strip() 26 | line = line.split("\t") 27 | map_dict[line[0]] = line[1] 28 | 29 | in_fasta = open(args.input_fasta, "r") 30 | out_fasta = open(args.output_fasta_name, "w") 31 | 32 | for seq_record in SeqIO.parse(in_fasta, "fasta"): 33 | if seq_record.id in map_dict: 34 | out_fasta.write(">" + str(map_dict[seq_record.id]) + "\n") 35 | out_fasta.write(str(seq_record.seq) + "\n") 36 | else: 37 | out_fasta.write(">" + str(seq_record.id) + "\n") 38 | out_fasta.write(str(seq_record.seq) + "\n") 39 | 40 | in_fasta.close() 41 | out_fasta.close() 42 | -------------------------------------------------------------------------------- /bin/gtt-test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | YELLOW='\033[0;33m' 6 | RED='\033[0;31m' 7 | NC='\033[0m' 8 | 9 | printf "\n" 10 | printf " ${GREEN}Downloading GToTree test data into the subdirectory ${YELLOW}GToTree-test-data/\n\n${NC}" 11 | printf " ${GREEN}Test data being pulled from here:\n${NC}" 12 | printf " ${YELLOW}https://zenodo.org/record/7860720#.ZEcWkexlA_8${NC}\n\n\n" 13 | 14 | curl -L --retry 10 --fail -o GToTree-test-data.tar.gz "https://zenodo.org/record/7860720/files/GToTree-test-data.tar.gz?download=1" 15 | 16 | # if run as 'gtt-test.sh http', will use http instead of ftp 17 | if [ ! 
-z $1 ] && [ $1 == "http" ]; then 18 | p_flag="-P" 19 | else 20 | p_flag="" 21 | fi 22 | 23 | # checking download was successfull (can finish with 0 exit) 24 | if [ $? -ne 0 ] ; then 25 | 26 | printf "\n${RED} Downloading the small test data failed for some reason :(${NC}\n" 27 | printf " You can try downloading it yourself from the link printed above and running the test as follows after unpacking it:\n\n" 28 | 29 | printf " ${YELLOW}GToTree -a GToTree-test-data/ncbi_accessions.txt "'\\ \n' 30 | printf " -g GToTree-test-data/genbank_files.txt "'\\ \n' 31 | printf " -f GToTree-test-data/fasta_files.txt "'\\ \n' 32 | printf " -A GToTree-test-data/amino_acid_files.txt "'\\ \n' 33 | printf " -m GToTree-test-data/genome_to_id_map.tsv "'\\ \n' 34 | printf " -p GToTree-test-data/pfam_targets.txt "'\\ \n' 35 | printf " -H Universal -t -D -j 4 -o GToTree-test-output -F ${p_flag}\n\n${NC}" 36 | 37 | printf " Then you can compare the output to what is depicted here:\n" 38 | printf " https://github.com/AstrobioMike/GToTree/wiki/Installation#test-run${NC}\n\n" 39 | 40 | printf "Exiting for now.\n\n" 41 | exit 42 | 43 | fi 44 | 45 | # putting here instead of at top so that the above message is still sent if curl fails 46 | set -e 47 | 48 | tar -xf GToTree-test-data.tar.gz 49 | rm GToTree-test-data.tar.gz 50 | 51 | printf "\n\n" 52 | 53 | TEST_DATA_DIR="GToTree-test-data" 54 | 55 | ## modifying paths of input genomes in input files as needed (not using sed -i so compatible with darwin sed too) 56 | 57 | sed "s/^/${TEST_DATA_DIR}\//" ${TEST_DATA_DIR}/genbank_files.txt > ${TEST_DATA_DIR}/genbank_files.txt.tmp && mv ${TEST_DATA_DIR}/genbank_files.txt.tmp ${TEST_DATA_DIR}/genbank_files.txt 58 | sed "s/^/${TEST_DATA_DIR}\//" ${TEST_DATA_DIR}/fasta_files.txt > ${TEST_DATA_DIR}/fasta_files.txt.tmp && mv ${TEST_DATA_DIR}/fasta_files.txt.tmp ${TEST_DATA_DIR}/fasta_files.txt 59 | sed "s/^/${TEST_DATA_DIR}\//" ${TEST_DATA_DIR}/amino_acid_files.txt > ${TEST_DATA_DIR}/amino_acid_files.txt.tmp && mv ${TEST_DATA_DIR}/amino_acid_files.txt.tmp ${TEST_DATA_DIR}/amino_acid_files.txt 60 | 61 | printf " ${GREEN}Running test as:\n${NC}" 62 | printf " ${YELLOW}GToTree -a ${TEST_DATA_DIR}/ncbi_accessions.txt "'\\ \n' 63 | printf " -g ${TEST_DATA_DIR}/genbank_files.txt "'\\ \n' 64 | printf " -f ${TEST_DATA_DIR}/fasta_files.txt "'\\ \n' 65 | printf " -A ${TEST_DATA_DIR}/amino_acid_files.txt "'\\ \n' 66 | printf " -m ${TEST_DATA_DIR}/genome_to_id_map.tsv "'\\ \n' 67 | printf " -p ${TEST_DATA_DIR}/pfam_targets.txt "'\\ \n' 68 | printf " -H Universal -t -D -j 4 -o GToTree-test-output -F ${p_flag}\n\n${NC}" 69 | 70 | sleep 2 71 | 72 | printf " ${YELLOW}The test run includes some things that shouldn't be found, so\n" 73 | printf " don't be alarmed when seeing those messages.${NC}\n\n" 74 | 75 | sleep 2 76 | 77 | printf " ${GREEN}Starting run now:\n${NC}" 78 | 79 | GToTree -a ${TEST_DATA_DIR}/ncbi_accessions.txt -g ${TEST_DATA_DIR}/genbank_files.txt -f ${TEST_DATA_DIR}/fasta_files.txt -A ${TEST_DATA_DIR}/amino_acid_files.txt -H Universal -m ${TEST_DATA_DIR}/genome_to_id_map.tsv -p ${TEST_DATA_DIR}/pfam_targets.txt -t -D -j 4 -o GToTree-test-output -F ${p_flag} 80 | 81 | if [ -d "GToTree-test-output/" ]; then 82 | 83 | printf "\n ${YELLOW}_______________________________________________________________________________${NC}\n\n" 84 | printf "\n ${GREEN}Test completed! 
See here for how things should look:\n${NC}" 85 | printf " ${YELLOW}https://github.com/AstrobioMike/GToTree/wiki/Installation#test-run${NC}\n\n" 86 | 87 | else 88 | 89 | printf "\n ${YELLOW}_______________________________________________________________________________${NC}\n\n" 90 | printf "\n ${RED}There seems to have been a problem with the test run :(\n${NC}" 91 | printf " ${YELLOW}If this continues, please consider submitting an issue here:\n${NC}" 92 | printf " ${YELLOW}https://github.com/AstrobioMike/GToTree/issues${NC}\n\n" 93 | 94 | printf " ${GREEN}You can clear out the test data and results by running:${NC}\n" 95 | printf " ${YELLOW}gtt-clean-after-test.sh\n\n${NC}" 96 | 97 | fi 98 | 99 | printf " ${GREEN}You can clear out the test data and results by running:${NC}\n" 100 | printf " ${YELLOW}gtt-clean-after-test.sh\n\n${NC}" 101 | -------------------------------------------------------------------------------- /bin/gtt-update-ncbi-taxonomy: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # setting colors to use 4 | GREEN='\033[0;32m' 5 | RED='\033[0;31m' 6 | NC='\033[0m' 7 | 8 | printf "\n" 9 | 10 | ## trying https if ftp fails 11 | curl --retry 10 -o ${TAXONKIT_DB}/taxdump.tar.gz ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz || curl --retry 10 -o ${TAXONKIT_DB}/taxdump.tar.gz https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz 12 | 13 | tar -xzf ${TAXONKIT_DB}/taxdump.tar.gz -C ${TAXONKIT_DB} 14 | 15 | rm ${TAXONKIT_DB}/taxdump.tar.gz 16 | 17 | printf "\n\t\t${GREEN}The NCBI taxonomy database info has been updated!${NC}\n\n" 18 | --------------------------------------------------------------------------------
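
For reference, the per-genome completeness and redundancy estimates produced in gtt-ncbi-serial.sh and gtt-ncbi-serial-nt.sh come straight from the per-target hit counts: a target gene counts toward completeness if it was hit at least once, every hit beyond the first counts toward redundancy, and both are reported as a percentage of the total number of target genes (a notice is printed when the redundancy estimate reaches 10%). A minimal Python sketch of that arithmetic, with made-up hit counts (not part of GToTree):

```python
# Sketch of the completeness/redundancy math done with awk and bc in
# gtt-ncbi-serial.sh / gtt-ncbi-serial-nt.sh (illustrative only).

def estimate_comp_and_redund(per_gene_hit_counts):
    total_targets = len(per_gene_hit_counts)
    num_hit = sum(1 for c in per_gene_hit_counts if c > 0)        # genes found at least once
    num_extra = sum(c - 1 for c in per_gene_hit_counts if c > 0)  # copies beyond the first
    perc_comp = round(num_hit / total_targets * 100, 2)
    perc_redund = round(num_extra / total_targets * 100, 2)
    return perc_comp, perc_redund

if __name__ == "__main__":
    # made-up counts for a 10-gene HMM set: 8 genes found once, 1 found twice, 1 missing
    counts = [1, 1, 1, 1, 1, 1, 1, 1, 2, 0]
    print(estimate_comp_and_redund(counts))  # (90.0, 10.0) -> would trigger the redundancy notice
```
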
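Similarly, the default class-level subsetting in gtt-subset-GTDB-accessions boils down to: any class holding at least (total entries * cutoff-fraction, default 0.0005) genomes is randomly downsampled to (its size * fraction-to-subset, default 0.01) genomes, while smaller classes are left untouched. A rough pandas sketch of that logic under those assumptions (not part of GToTree; the toy table is made up, and the real script reads the metadata file from gtt-get-accessions-from-GTDB):

```python
import pandas as pd

# Rough, illustrative sketch of the class-based downsampling in
# gtt-subset-GTDB-accessions (the function and toy data are hypothetical).

def subset_overrepresented_classes(df, cutoff_fraction=0.0005, filter_fraction=0.01, seed=1):
    cutoff = int(len(df) * cutoff_fraction)
    pieces = []
    for _, class_df in df.groupby("class"):
        if len(class_df) >= cutoff:
            # over-represented class: keep a random 1% (by default) of its genomes
            class_df = class_df.sample(n=int(len(class_df) * filter_fraction), random_state=seed)
        pieces.append(class_df)
    return pd.concat(pieces)

if __name__ == "__main__":
    # toy table: 9,000 Gammaproteobacteria, 996 Alphaproteobacteria, 4 of a rare class
    df = pd.DataFrame({
        "accession": [f"RS_GCF_{i:09d}.1" for i in range(10000)],
        "class": (["Gammaproteobacteria"] * 9000
                  + ["Alphaproteobacteria"] * 996
                  + ["Nitrososphaeria"] * 4),
    })
    print(subset_overrepresented_classes(df)["class"].value_counts())
    # -> Gammaproteobacteria 90, Alphaproteobacteria 9, Nitrososphaeria 4 (kept, below cutoff)
```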