├── .github └── workflows │ └── pythonpackage.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── Snakefile ├── bin ├── Snakefile ├── db_create │ ├── Snakefile │ ├── bracken │ │ └── Snakefile │ ├── genes │ │ └── Snakefile │ ├── humann3 │ │ ├── Snakefile │ │ ├── db_create │ │ │ └── Snakefile │ │ ├── prepare_query │ │ │ └── Snakefile │ │ └── query │ │ │ ├── Snakefile │ │ │ ├── dmnd │ │ │ └── Snakefile │ │ │ └── mmseqs │ │ │ └── Snakefile │ └── kraken2 │ │ └── Snakefile ├── db_update │ ├── Snakefile │ ├── bracken │ │ └── Snakefile │ ├── genes │ │ ├── Snakefile │ │ ├── db_update │ │ │ └── Snakefile │ │ └── input │ │ │ ├── Snakefile │ │ │ ├── check │ │ │ └── Snakefile │ │ │ ├── from_gene_set │ │ │ └── Snakefile │ │ │ └── from_genomes │ │ │ └── Snakefile │ ├── humann3 │ │ ├── Snakefile │ │ ├── db_create │ │ │ └── Snakefile │ │ ├── input_from_genes │ │ │ └── Snakefile │ │ ├── prepare_query │ │ │ └── Snakefile │ │ ├── query_dmnd │ │ │ └── Snakefile │ │ └── query_mmseqs │ │ │ └── Snakefile │ └── kraken2 │ │ └── Snakefile ├── dirs ├── envs │ ├── genes.yaml │ ├── humann2.yaml │ ├── humann3.yaml │ ├── kraken2.yaml │ └── krakenuniq.yaml ├── scripts │ ├── add_user_seqs.py │ ├── annotate_genes.py │ ├── bracken-build.py │ ├── cat_files.py │ ├── check_gene_info.py │ ├── download_taxonomy.sh │ ├── filter_cluster_reps.py │ ├── filter_seqs.py │ ├── kraken2-build │ ├── kraken2_rename_genome.py │ ├── log_summarize.py │ ├── metaphlan_db_from_clusts.py │ ├── metaphlan_db_from_uniref.py │ ├── propagate_annotations.py │ ├── species_specific.py │ ├── uncomp.py │ ├── uncomp_tarball.py │ └── uniref_clst_trans.py └── utils │ └── Snakefile ├── conda_env.yaml ├── config-update.yaml ├── config.yaml ├── data └── GTDBr95_n5 │ ├── GCA_000014945.1_ASM1494v1_genomic.fna.gz │ ├── GCA_000720375.1_ASM72037v1_genomic.fna.gz │ ├── GCA_002478565.1_ASM247856v1_genomic.fna.gz │ ├── GCA_006715045.1_ASM671504v1_genomic.fna.gz │ ├── GCA_007116575.1_ASM711657v1_genomic.fna.gz │ └── GTDBr95_n5.tsv ├── img └── logo.png ├── notebooks ├── GTDB_release202 │ └── 01_metadata │ │ ├── 01_GTDB_metadata_summary.ipynb │ │ ├── 02_phylogeny.ipynb │ │ └── 03_Struo2.ipynb ├── GTDB_release95 │ ├── 01_metadata │ │ └── 01_GTDB_metadata_summary.ipynb │ ├── 02_struo_version_db-create_benchmarking │ │ ├── 01_benchmarking_UniRef50-90_db-create.ipynb │ │ └── 02_benchmarking_UniRef50-90_db-update.ipynb │ └── 03_GTDBr95_db │ │ ├── 01_phylogeny.ipynb │ │ └── 02_GTDBr95_create.ipynb ├── Misc │ └── 01_GTDB_release_summary.ipynb └── struo_dev │ └── 01_test_dataset.ipynb ├── snakemake_clean.sh ├── snakemake_conda-list.sh ├── snakemake_sge.sh ├── tests └── samples │ ├── GTDBr95_n10.tsv │ └── GTDBr95_n5.tsv └── util_scripts ├── GTDB_metadata_filter.R ├── database_download.py ├── genome_download.R ├── genome_gene_content.py ├── genome_mis-asmbl_sim.py ├── genome_traitar.py └── tree_prune.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Struo2 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | name: build (${{ matrix.python-version }}, ${{ matrix.os }}) 12 | runs-on: ubuntu-latest 13 | env: 14 | DATADIR: data 15 | strategy: 16 | matrix: 17 | python-version: [3.7] 18 | steps: 19 | - uses: conda-incubator/setup-miniconda@v2 20 | with: 21 | miniconda-version: 'latest' 22 | auto-update-conda: true 23 | python-version: ${{ matrix.python-version }} 24 | channels: conda-forge,bioconda 25 | channel-priority: strict 26 
| activate-environment: struo2 27 | - name: Checkout repository 28 | uses: actions/checkout@v2 29 | - name: Checkout submodules 30 | run: git submodule update --init --recursive 31 | - name: conda env setup 32 | shell: bash -l {0} 33 | run: | 34 | conda info -a 35 | conda env update -f conda_env.yaml python=${{ matrix.python-version }} 36 | conda list 37 | - name: taxdump db setup 38 | shell: bash -l {0} 39 | run: | 40 | mkdir -p $DATADIR 41 | wget --directory-prefix $DATADIR http://ftp.tue.mpg.de/ebio/projects/struo2/GTDB_release95/taxdump/taxdump.tar.gz 42 | tar -pzxvf $DATADIR/taxdump.tar.gz --directory $DATADIR 43 | - name: UniRef db setup 44 | shell: bash -l {0} 45 | run: | 46 | mkdir -p $DATADIR/UniRef90/ 47 | touch $DATADIR/UniRef90/uniref90 $DATADIR/UniRef90/uniref90.dbtype $DATADIR/UniRef90/uniref90.index 48 | touch $DATADIR/UniRef90/uniref90.lookup $DATADIR/UniRef90/uniref90.source 49 | touch $DATADIR/UniRef90/uniref90_h $DATADIR/UniRef90/uniref90_h.dbtype $DATADIR/UniRef90/uniref90_h.index 50 | wget --directory-prefix $DATADIR http://ftp.tue.mpg.de/ebio/projects/struo2/install/uniref_2019.01/uniref50-90.pkl 51 | - name: Reference genome download 52 | shell: bash -l {0} 53 | run: | 54 | wget --directory-prefix $DATADIR http://ftp.tue.mpg.de/ebio/projects/struo2/dev_data/genomes/GTDBr95_n10.tar.gz 55 | tar -pzxvf $DATADIR/GTDBr95_n10.tar.gz --directory $DATADIR 56 | - name: DB create tests 57 | shell: bash -l {0} 58 | run: | 59 | snakemake --use-conda --configfile config.yaml -j 1 -F --dryrun 60 | - name: DB update tests 61 | shell: bash -l {0} 62 | run: | 63 | echo "todo" #snakemake --use-conda --configfile config-update.yaml -j 1 -F --dryrun 64 | - name: Util script dependency tests 65 | shell: bash -l {0} 66 | run: | 67 | ./util_scripts/genome_download.R -h 68 | ./util_scripts/GTDB_metadata_filter.R -h 69 | ./util_scripts/tree_prune.py -h 70 | ./util_scripts/genome_mis-asmbl_sim.py -h 71 | - name: Tree pruning test 72 | shell: bash -l {0} 73 | run: | 74 | wget --directory-prefix $DATADIR http://ftp.tue.mpg.de/ebio/projects/struo2/dev_data/phylogeny.tar.gz 75 | tar -pzxvf $DATADIR/phylogeny.tar.gz --directory $DATADIR 76 | ./util_scripts/tree_prune.py data/phylogeny/accs_to_keep.txt data/phylogeny/ar122_r95.tree 77 | - name: Mis-assembly simulation test 78 | shell: bash -l {0} 79 | run: | 80 | ./util_scripts/genome_mis-asmbl_sim.py -b 2 -r 2 -c 2 -T ncbi_organism_name -F fasta_file_path data/GTDBr95_n10/GTDBr95_n10.tsv 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .snakemake/ 6 | tests/output*/ 7 | tmp/ 8 | .DS_Store 9 | ._.DS_Store 10 | .envrc 11 | ._* 12 | misc/ 13 | archive/ 14 | screenlog.* 15 | bin/scripts/metaphlan2/databases/ 16 | .ipynb_checkpoints/ 17 | no-log_jobs_sge.* 18 | *4test.yaml 19 | data/ 20 | notebooks_dev/ 21 | util_scripts/data 22 | 23 | # hg 24 | **/.hg/ 25 | **/.hg* 26 | 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | env/ 34 | build/ 35 | develop-eggs/ 36 | #dist/ 37 | downloads/ 38 | eggs/ 39 | .eggs/ 40 | lib/ 41 | lib64/ 42 | parts/ 43 | sdist/ 44 | var/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *,cover 68 | .hypothesis/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # pyenv python configuration file 84 | .python-version 85 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "bin/ll_pipeline_utils"] 2 | path = bin/ll_pipeline_utils 3 | url = https://github.com/leylabmpi/ll_pipeline_utils.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Nick Youngblut 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | # import 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import glob 7 | import socket 8 | import getpass 9 | import subprocess 10 | from distutils.spawn import find_executable 11 | import pandas as pd 12 | 13 | # load 14 | configfile: 'config.yaml' 15 | 16 | # general functions 17 | def ssw(x, *args, **kwargs): 18 | sys.stderr.write(x, *args, **kwargs) 19 | 20 | # setup 21 | ## pipeline utils 22 | snake_dir = config['pipeline']['snakemake_folder'] 23 | include: snake_dir + 'bin/ll_pipeline_utils/Snakefile' 24 | config_default(config, 'pipeline', 'name') 25 | ## custom functions 26 | def make_fasta_splits(n_jobs, zero_pad=3): 27 | if str(n_jobs).lstrip().startswith('Skip'): 28 | n_jobs = 1 29 | zero_pad = '{0:0' + str(zero_pad) + 'd}' 30 | return [str(zero_pad.format(x+1)) for x in range(n_jobs)] 31 | 32 | # setting paths 33 | config['samples_file'] = os.path.abspath(config['samples_file']) 34 | config['pipeline']['snakemake_folder'] = \ 35 | os.path.abspath(config['pipeline']['snakemake_folder']) + '/' 36 | 37 | # uniref 38 | config['uniref_name'] = str(config['uniref_name']).rstrip().lower() 39 | if config['uniref_name'] == 'uniref50': 40 | config['uniref_other_name'] = 'uniref90' 41 | elif config['uniref_name'] == 'uniref90': 42 | config['uniref_other_name'] = 'uniref50' 43 | else: 44 | msg = 'Only "uniref90" and "uniref50" supported for "uniref_name:". Value provided: {}' 45 | raise ValueError(msg.format(config['uniref_name'])) 46 | 47 | ## base of the snakefile hierarchy 48 | include: snake_dir + 'bin/Snakefile' 49 | include: snake_dir + 'bin/utils/Snakefile' 50 | 51 | ## pipeline main 52 | wildcard_constraints: 53 | sample="[^/]+", 54 | uniref="[^/]+" 55 | 56 | localrules: all 57 | 58 | rule all: 59 | input: 60 | all_which_input 61 | 62 | -------------------------------------------------------------------------------- /bin/Snakefile: -------------------------------------------------------------------------------- 1 | #-- settings for all workflows --# 2 | config['pipeline']['username'] = getpass.getuser() 3 | config['pipeline']['email'] = config['email'] 4 | 5 | #-- workflow selection --# 6 | if config['pipeline']['config'] == 'create': 7 | include: snake_dir + 'db_create/Snakefile' 8 | elif config['pipeline']['config'] == 'update': 9 | include: snake_dir + 'db_update/Snakefile' 10 | else: 11 | msg ='Pipeline "config" param not recognized: {}' 12 | raise ValueError(msg.format(config['pipeline']['config'])) 13 | 14 | 15 | # final output files (both db-create & db-update workflows) 16 | def all_which_input(wildcards): 17 | """ 18 | The final output files for both db_create & db_update 19 | """ 20 | F = [] 21 | # kraken2 22 | if (config['samples'] is not None and 23 | not skipped(config['databases']['kraken2'])): 24 | F.append(os.path.join(kraken2_dir, 'hash.k2d')) 25 | F.append(os.path.join(kraken2_dir, 'opts.k2d')) 26 | F.append(os.path.join(kraken2_dir, 'taxo.k2d')) 27 | F.append(os.path.join(kraken2_dir, 'seqid2taxid.map')) 28 | # bracken 29 | if not skipped(config['databases']['bracken']): 30 | F += expand(os.path.join(kraken2_dir, 'database{read_len}mers.kraken'), 31 | read_len = config['params']['bracken']['build_read_lens']) 32 | # genes 33 | if not skipped(config['databases']['genes']): 34 | F.append(genes_dir + 'genome_reps_filtered.fna.gz') 35 | 
F.append(genes_dir + 'genome_reps_filtered.faa.gz') 36 | F.append(genes_dir + 'genome_reps_filtered.txt.gz') 37 | if str(config['keep_intermediate']) == 'True': 38 | # mmseqs gene database 39 | F.append(genes_dir + 'genes_db.tar.gz') 40 | ## mmseqs cluster database 41 | F.append(genes_dir + 'cluster/clusters_db.tar.gz') 42 | ## cluster membership info 43 | F.append(genes_dir + 'cluster/clusters_membership.tsv.gz') 44 | ## cluster rep sequences 45 | F.append(genes_dir + 'cluster/clusters_reps.faa.gz') 46 | # humann3 47 | if (not skipped(config['databases']['humann3_bowtie2']) and 48 | not skipped(config['databases']['humann3_diamond'])): 49 | # multiple UniRef databases 50 | uniref_cutoffs = [config['uniref_name']] 51 | dmnd_names = [config['dmnd_name']] 52 | if config['uniref_name'] != 'uniref50' and not skipped(config['cluster_idx']): 53 | uniref_cutoffs.append('uniref50') 54 | x = re.sub('([Uu])ni([Rr])ef90', '\\1ni\\2ef50', config['dmnd_name']) 55 | dmnd_names.append(x) 56 | # intermediate files 57 | if str(config['keep_intermediate']) == 'True': 58 | # annotation hits 59 | F.append(humann3_dir + 'annotation_hits.gz') 60 | ## annotated genes (all) 61 | F += expand(humann3_dir + '{uniref}/genome_reps_filt_annot.fna.gz', 62 | uniref = uniref_cutoffs) 63 | F += expand(humann3_dir + '{uniref}/genome_reps_filt_annot.faa.gz', 64 | uniref = uniref_cutoffs) 65 | F += expand(humann3_dir + '{uniref}/genome_reps_filt_annot.tsv.gz', 66 | uniref = uniref_cutoffs) 67 | ## databases 68 | ### bowtie2 69 | if not skipped(config['databases']['humann3_bowtie2']): 70 | F += expand(os.path.join(humann3_dir + '{uniref}', 'bowtie2_build.done'), 71 | uniref = uniref_cutoffs) 72 | ### diamond 73 | if not skipped(config['databases']['humann3_diamond']): 74 | x = os.path.join(humann3_dir, '{uniref}', 'protein_database', '{dmnd}') 75 | for u,d in zip(uniref_cutoffs, dmnd_names): 76 | F.append(x.format(uniref=u,dmnd=d)) 77 | 78 | # metaphlan 79 | #if not skipped(config['databases']['metaphlan3']): 80 | # F += expand(config['tmp_dir'] + '{uniref}/metaphlan3/species_specific.txt', 81 | # uniref = uniref_cutoffs) 82 | 83 | # ret 84 | return F 85 | -------------------------------------------------------------------------------- /bin/db_create/Snakefile: -------------------------------------------------------------------------------- 1 | #-- Parsing input for db-create workflow --# 2 | 3 | # outdir 4 | config['output_dir'] = config['output_dir'].rstrip('/') + '/' 5 | print('\33[33mUsing output directory: {} \x1b[0m'.format(config['output_dir'])) 6 | 7 | # Samples table 8 | if not os.path.isfile(config['samples_file']): 9 | raise IOError('Cannot find file: {}'.format(config['samples_file'])) 10 | config['samples'] = pd.read_csv(config['samples_file'], sep='\t') 11 | ## Required columns 12 | for f in [config['samples_col'], config['accession_col'], config['fasta_file_path_col'], 13 | config['taxID_col'], config['taxonomy_col']]: 14 | if f not in config['samples'].columns: 15 | raise ValueError('Cannot find column: {}'.format(f)) 16 | config['samples'][config['samples_col']] = config['samples'][config['samples_col']].str.replace('[^A-Za-z0-9]+', '_', regex=True) 17 | config['samples'] = config['samples'].set_index(config['samples'][config['samples_col']]) 18 | 19 | ## check that files exist (skipping if not) 20 | rowID = 0 21 | to_rm = [] 22 | for index,row in config['samples'].iterrows(): 23 | rowID += 1 24 | file_cols = [config['fasta_file_path_col']] 25 | for f in file_cols: 26 | if not os.path.isfile(str(row[f])): 27 | 
msg = 'Samples table (Row {}): Cannot find file: {}; Skipping\n' 28 | sys.stderr.write(msg.format(rowID, row[f])) 29 | to_rm.append(row[config['samples_col']]) 30 | ssw('\33[33mNumber of skipped sample table entries: {}\n\x1b[0m'.format(len(to_rm))) 31 | config['samples'].drop(to_rm, inplace=True) 32 | if config['samples'].shape[0] < 1: 33 | raise ValueError('No genomes remaining after filtering!') 34 | config['samples_unique'] = config['samples'][config['samples_col']].unique().tolist() 35 | 36 | ## temp_folder 37 | config['tmp_dir'] = os.path.join(config['tmp_dir'], config['pipeline']['username']) 38 | config['tmp_dir'] = os.path.join(config['tmp_dir'], 'Struo2_' + str(os.stat('.').st_ino) + '/') 39 | print('\33[33mUsing temporary directory: {} \x1b[0m'.format(config['tmp_dir'])) 40 | 41 | ## batches 42 | config['params']['humann3']['splits'] = \ 43 | make_fasta_splits(config['params']['humann3']['batches']) 44 | 45 | ## including modular snakefiles 46 | print('\33[36m--Running db-create pipeline--\x1b[0m') 47 | snake_dir = config['pipeline']['snakemake_folder'] 48 | include: snake_dir + 'bin/dirs' 49 | ### kraken/bracken 50 | if not skipped(config['databases']['kraken2']): 51 | print('\33[36m* Creating kraken2 database\x1b[0m') 52 | include: snake_dir + 'bin/db_create/kraken2/Snakefile' 53 | if not skipped(config['databases']['bracken']): 54 | print('\33[36m* Creating bracken database\x1b[0m') 55 | include: snake_dir + 'bin/db_create/bracken/Snakefile' 56 | ### genes 57 | if not skipped(config['databases']['genes']): 58 | print('\33[36m* Creating genes database\x1b[0m') 59 | include: snake_dir + 'bin/db_create/genes/Snakefile' 60 | else: 61 | m = '\33[33m* Skipping creation of genes database;' 62 | m += ' assuming the database already exists!\x1b[0m' 63 | print(m) 64 | ### humann3 65 | if not skipped(config['databases']['humann3_bowtie2']) and \ 66 | not skipped(config['databases']['humann3_diamond']): 67 | print('\33[36m* Creating humann3 database\x1b[0m') 68 | include: snake_dir + 'bin/db_create/humann3/Snakefile' 69 | #if not skipped(config['databases']['metaphlan3']): 70 | # print('\33[36m* Creating metaphlan database\x1b[0m') 71 | # include: snake_dir + 'bin/db_create/metaphlan3/Snakefile' 72 | -------------------------------------------------------------------------------- /bin/db_create/bracken/Snakefile: -------------------------------------------------------------------------------- 1 | rule bracken_build: 2 | """ 3 | Build bracken database(s) from kraken2 database. 4 | One database per user-selected read length. 
5 | """ 6 | input: 7 | kraken2_db = kraken2_dir + 'hash.k2d', 8 | rm_done = kraken2_dir + 'tmp_db_rm.done' 9 | output: 10 | krk = kraken2_dir + 'database{read_len}mers.kraken', 11 | krkd = kraken2_dir + 'database{read_len}mers.kmer_distrib' 12 | params: 13 | kmer = config['params']['bracken']['build_kmer'], 14 | exe = config['pipeline']['script_folder'] + 'bracken-build.py', 15 | read_len = lambda wildcards: wildcards.read_len, 16 | threads: 17 | 12 18 | resources: 19 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 24, 20 | n = lambda wildcards, attempt, threads: threads, 21 | mem_gb_pt = lambda wildcards, attempt: attempt * 16 22 | conda: 23 | '../../envs/kraken2.yaml' 24 | log: 25 | log_dir + 'bracken_build/ReadLen{read_len}.log' 26 | benchmark: 27 | benchmark_dir + 'bracken_build/ReadLen{read_len}.txt' 28 | shell: 29 | """ 30 | # location of the kraken2 db files 31 | DB=`dirname {input.kraken2_db}` 32 | # removing existing files possibly created by bracken 33 | TMP_FILE=$DB"/database.kraken" 34 | rm -f $TMP_FILE 35 | # running bracken 36 | {params.exe} -t {threads} -d $DB \ 37 | -k {params.kmer} -l {params.read_len} \ 38 | 2> {log} 1>&2 39 | """ 40 | 41 | -------------------------------------------------------------------------------- /bin/db_create/humann3/Snakefile: -------------------------------------------------------------------------------- 1 | #-- humann database creation workflow --# 2 | 3 | # preparing of input sequences 4 | include: snake_dir + 'bin/db_create/humann3/prepare_query/Snakefile' 5 | # gene annotation 6 | include: snake_dir + 'bin/db_create/humann3/query/Snakefile' 7 | # creating the humann database 8 | include: snake_dir + 'bin/db_create/humann3/db_create/Snakefile' 9 | -------------------------------------------------------------------------------- /bin/db_create/humann3/db_create/Snakefile: -------------------------------------------------------------------------------- 1 | rule humann3_annotate_genes: 2 | """ 3 | Use search hits from clustered reps & index table to annotate all 4 | genome-derep genes. The annotation for each cluster rep is propagated 5 | to each member of the cluster. 
6 | """ 7 | input: 8 | hits = config['tmp_dir'] + 'humann3_search/hits.txt', 9 | tsv = config['tmp_dir'] + 'humann3/clusters_membership.tsv.gz', 10 | fna = config['tmp_dir'] + 'humann3/genome_reps_filtered.fna.gz', 11 | faa = config['tmp_dir'] + 'humann3/genome_reps_filtered.faa.gz', 12 | txt = config['tmp_dir'] + 'humann3/genome_reps_filtered.txt.gz' 13 | output: 14 | fna = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.fna'), 15 | faa = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.faa'), 16 | tsv = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.tsv'), 17 | params: 18 | exe = config['pipeline']['script_folder'] + 'propagate_annotations.py', 19 | params = config['params']['humann3']['propagate_annotations'] 20 | resources: 21 | time = lambda wildcards, attempt: attempt ** 3 * 59, 22 | mem_gb_pt = lambda wildcards, attempt: attempt ** 4 * 3 + 11, 23 | log: 24 | log_dir + 'humann3_annotate_genes/all.log' 25 | benchmark: 26 | benchmark_dir + 'humann3_annotate_genes/all.txt' 27 | shell: 28 | """ 29 | OUTDIR=`dirname {output.fna}` 30 | mkdir -p $OUTDIR 2> {log} 31 | {params.exe} {params.params} \ 32 | --in-nuc {input.fna} \ 33 | --out-nuc {output.fna} \ 34 | --out-prot {output.faa} \ 35 | {input.hits} {input.faa} \ 36 | {input.txt} {input.tsv} \ 37 | > {output.tsv} 2>> {log} 38 | """ 39 | 40 | rule humann3_annotate_hits_copy: 41 | """ 42 | Copying/compressing query hits (diamond or mmseqs) to the final output directory 43 | """ 44 | input: 45 | hits = config['tmp_dir'] + 'humann3_search/hits.txt' 46 | output: 47 | hits = humann3_dir + 'annotation_hits.gz' 48 | params: 49 | ionice = config['params']['ionice'] 50 | resources: 51 | time = lambda wildcards, attempt: attempt ** 3 * 59 52 | log: 53 | log_dir + 'humann3_annotate_hits_copy/all.log' 54 | shell: 55 | """ 56 | ionice {params.ionice} gzip -c {input.hits} > {output.hits} 2> {log} 57 | """ 58 | 59 | rule humann3_alt_annotate: 60 | """ 61 | Re-annotating with different UniRef cluster resolution 62 | """ 63 | input: 64 | fna = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.fna', 65 | faa = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.faa', 66 | tsv = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.tsv', 67 | idx = ancient(config['cluster_idx']) 68 | output: 69 | fna = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 70 | '/genome_reps_filt_annot.fna'), 71 | faa = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 72 | '/genome_reps_filt_annot.faa'), 73 | tsv = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 74 | '/genome_reps_filt_annot.tsv') 75 | params: 76 | exe = config['pipeline']['script_folder'] + 'uniref_clst_trans.py' 77 | resources: 78 | time = lambda wildcards, attempt: attempt ** 3 * 59, 79 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 3 + 9 80 | log: 81 | log_dir + 'humann3_annotate_genes/all.log' 82 | benchmark: 83 | benchmark_dir + 'humann3_annotate_genes/all.txt' 84 | shell: 85 | """ 86 | OUTDIR=`dirname {output.fna}` 87 | mkdir -p $OUTDIR 2> {log} 88 | {params.exe} {input.idx} \ 89 | --in-nuc {input.fna} \ 90 | --in-prot {input.faa} \ 91 | --in-tsv {input.tsv} \ 92 | --out-nuc {output.fna} \ 93 | --out-prot {output.faa} \ 94 | --out-tsv {output.tsv} \ 95 | 2>> {log} 1>&2 96 | """ 97 | 98 | rule humann3_annotate_genes_copy: 99 | """ 100 | Copying/compressing annotated gene files to the final directory 101 | """ 102 | input: 103 | fna = 
config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.fna', 104 | faa = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.faa', 105 | tsv = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.tsv' 106 | output: 107 | fna = humann3_dir + '{uniref}/genome_reps_filt_annot.fna.gz', 108 | faa = humann3_dir + '{uniref}/genome_reps_filt_annot.faa.gz', 109 | tsv = humann3_dir + '{uniref}/genome_reps_filt_annot.tsv.gz' 110 | params: 111 | ionice = config['params']['ionice'] 112 | resources: 113 | time = lambda wildcards, attempt: attempt ** 3 * 59 114 | log: 115 | log_dir + 'humann3_annotate_genes_copy/{uniref}.log' 116 | benchmark: 117 | benchmark_dir + 'humann3_annotate_genes_copy/{uniref}.txt' 118 | shell: 119 | """ 120 | ionice {params.ionice} gzip -c {input.fna} > {output.fna} 2> {log} 121 | ionice {params.ionice} gzip -c {input.faa} > {output.faa} 2>> {log} 122 | ionice {params.ionice} gzip -c {input.tsv} > {output.tsv} 2>> {log} 123 | """ 124 | 125 | rule humann3_bowtie2_build: 126 | """ 127 | Running bowtie2 build on combined, annotated genes (nucleotide) 128 | """ 129 | input: 130 | fna = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.fna' 131 | output: 132 | touch(os.path.join(humann3_dir, '{uniref}/bowtie2_build.done')) 133 | conda: 134 | '../../../envs/humann3.yaml' 135 | threads: 136 | 12 137 | resources: 138 | time = lambda wildcards, attempt: attempt * 2 * 60 * 24, 139 | n = lambda wildcards, attempt, threads: threads, 140 | mem_gb_pt = lambda wildcards, attempt: attempt ** 2 * 2 + 8, 141 | lg_idx = lambda wildcards, attempt: '--large-index' if attempt > 1 else '' 142 | log: 143 | log_dir + 'humann3_bowtie2_build/{uniref}.log' 144 | benchmark: 145 | benchmark_dir + 'humann3_bowtie2_build/{uniref}.txt' 146 | shell: 147 | """ 148 | OUTDIR=`dirname {output}` 149 | PREFIX="$OUTDIR/all_genes_annot" 150 | bowtie2-build --threads {threads} {resources.lg_idx} \ 151 | {input.fna} $PREFIX 2> {log} 1>&2 152 | 153 | # check that output exists 154 | IDX_FILES=`find $OUTDIR -maxdepth 1 -name "*.bt2*"` 155 | IDX_FILES=`echo $IDX_FILES | perl -pe 's/ +/\n/g' | wc -l` 156 | if [ $IDX_FILES -lt 1 ]; then 157 | echo "ERROR: no bowtie2 index files found!" 
>> {log} 158 | exit 1 159 | fi 160 | touch {output} 2>> {log} 161 | """ 162 | 163 | rule humann3_diamond_makedb: 164 | """ 165 | Running diamond makedb on combined, annotated genes (amino acid) 166 | """ 167 | input: 168 | faa = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.faa' 169 | output: 170 | humann3_dir + '{uniref}/protein_database/{dmnd}' 171 | params: 172 | tmp_dir = config['tmp_dir'] 173 | conda: 174 | '../../../envs/humann3.yaml' 175 | resources: 176 | time = lambda wildcards, attempt: attempt * 2 * 60 * 24 + 60 * 24, 177 | mem_gb_pt = lambda wildcards, attempt: (attempt ** 3 + 3) * 12 178 | log: 179 | log_dir + 'humann3_diamond_makedb/{uniref}/{dmnd}.log' 180 | benchmark: 181 | benchmark_dir + 'humann3_diamond_makedb/{uniref}/{dmnd}.txt' 182 | shell: 183 | """ 184 | PREF=`echo {output} | perl -pe 's/\.[^.]+$//'` 185 | diamond makedb --in {input.faa} -d $PREF 2> {log} 1>&2 186 | """ 187 | 188 | 189 | -------------------------------------------------------------------------------- /bin/db_create/humann3/prepare_query/Snakefile: -------------------------------------------------------------------------------- 1 | #-- preparing input for annotation & humann database construction --# 2 | 3 | rule humann3_copy_input: 4 | """ 5 | Copying input to temp directory 6 | """ 7 | input: 8 | faa_c = genes_dir + 'cluster/clusters_reps.faa.gz', 9 | mem_c = genes_dir + 'cluster/clusters_membership.tsv.gz', 10 | fna = genes_dir + 'genome_reps_filtered.fna.gz', 11 | faa = genes_dir + 'genome_reps_filtered.faa.gz', 12 | txt = genes_dir + 'genome_reps_filtered.txt.gz' 13 | output: 14 | faa_c = temp(config['tmp_dir'] + 'humann3/clusters_reps.faa.gz'), 15 | mem_c = temp(config['tmp_dir'] + 'humann3/clusters_membership.tsv.gz'), 16 | fna = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.fna.gz'), 17 | faa = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.faa.gz'), 18 | txt = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.txt.gz') 19 | params: 20 | ionice = config['params']['ionice'] 21 | resources: 22 | time = lambda wildcards, attempt: attempt ** 3 * 59 23 | log: 24 | log_dir + 'humann3_copy_input/all.log' 25 | shell: 26 | """ 27 | ionice {params.ionice} cp -f {input.faa_c} {output.faa_c} 2> {log} 1>&2 28 | ionice {params.ionice} cp -f {input.mem_c} {output.mem_c} 2>> {log} 1>&2 29 | ionice {params.ionice} cp -f {input.faa} {output.faa} 2>> {log} 1>&2 30 | ionice {params.ionice} cp -f {input.fna} {output.fna} 2>> {log} 1>&2 31 | ionice {params.ionice} cp -f {input.txt} {output.txt} 2>> {log} 1>&2 32 | """ 33 | 34 | rule humann3_batch_seqs: 35 | """ 36 | Splitting gene fasta for distributed searching (annotation) 37 | """ 38 | input: 39 | faa = config['tmp_dir'] + 'humann3/clusters_reps.faa.gz' 40 | output: 41 | done = config['tmp_dir'] + 'humann3_search/split.done', 42 | splt = temp(expand(config['tmp_dir'] + \ 43 | 'humann3_search/stdin.part_{splitID}.fasta', 44 | splitID=config['params']['humann3']['splits'])) 45 | params: 46 | n_splits = config['params']['humann3']['batches'] 47 | threads: 48 | 4 49 | resources: 50 | time = lambda wildcards, attempt: int(round(attempt ** 4 * 59,0)), 51 | n = lambda wildcards, attempt, threads: threads, 52 | mem_gb_pt = lambda wildcards, attempt, threads: int(round(attempt ** 3 * 10.0 / threads,0)) 53 | conda: 54 | '../../../envs/genes.yaml' 55 | log: 56 | log_dir + 'humann3_batch_seqs/all.log' 57 | benchmark: 58 | benchmark_dir + 'humann3_batch_seqs/all.txt' 59 | shell: 60 | """ 61 | OUTDIR=`dirname {output.done} 2> {log}` 62 | seqkit 
shuffle -j {threads} {input.faa} 2>> {log} | \ 63 | seqkit split -j {threads} --by-part {params.n_splits} \ 64 | --out-dir $OUTDIR 2>> {log} 1>&2 65 | touch {output.done} 66 | """ 67 | 68 | -------------------------------------------------------------------------------- /bin/db_create/humann3/query/Snakefile: -------------------------------------------------------------------------------- 1 | #-- gene annotation (for humann database) workflow --# 2 | 3 | if (not skipped(config['params']['humann3']['mmseqs_search']['db']) and 4 | not skipped(config['params']['humann3']['mmseqs_search']['index']) and 5 | not skipped(config['params']['humann3']['mmseqs_search']['run'])): 6 | # checking on database 7 | if not re.search(config['uniref_name'], 8 | str(config['params']['humann3']['mmseqs_search']['db']).lower()): 9 | print('\33[35m * WARNING the uniref_name does not match the query database\x1b[0m') 10 | print('\33[35m * ({} <=> {})\x1b[0m'.format(config['uniref_name'], config['params']['humann3']['mmseqs_search']['db'])) 11 | # mmseqs search 12 | print('\33[36m * Using "mmseqs search" for annotating genes\x1b[0m') 13 | include: snake_dir + 'bin/db_create/humann3/query/mmseqs/Snakefile' 14 | elif (not skipped(config['params']['humann3']['diamond']['db']) and 15 | not skipped(config['params']['humann3']['diamond']['run'])): 16 | # checking on database 17 | if not re.search(config['uniref_name'], 18 | str(config['params']['humann3']['diamond']['db']).lower()): 19 | print('\33[35m * WARNING the uniref_name does not match the query database\x1b[0m') 20 | print('\33[35m * ({} <=> {})\x1b[0m'.format(config['uniref_name'], config['params']['humann3']['diamond']['db'])) 21 | # diamond blastp 22 | print('\33[36m * Using "diamond blastp" for annotating genes\x1b[0m') 23 | include: snake_dir + 'bin/db_create/humann3/query/dmnd/Snakefile' 24 | else: 25 | print('\33[31m ERROR: all query methods skipped!\x1b[0m') 26 | 27 | -------------------------------------------------------------------------------- /bin/db_create/humann3/query/dmnd/Snakefile: -------------------------------------------------------------------------------- 1 | rule humann3_diamond_db_copy: 2 | """ 3 | Copying the user-provided DIAMOND database to the temp directory 4 | """ 5 | input: 6 | db = ancient(config['params']['humann3']['diamond']['db']) 7 | output: 8 | db = temp(config['tmp_dir'] + 'humann3_search/humann3_dmnd_db.dmnd') 9 | params: 10 | ionice = config['params']['ionice'] 11 | resources: 12 | time = lambda wildcards, attempt: attempt ** 3 * 59, 13 | mem_gb_pt = lambda wildcards, attempt: attempt * 4 14 | log: 15 | log_dir + 'humann3_diamond_db_copy/all.log' 16 | benchmark: 17 | benchmark_dir + 'humann3_diamond_db_copy/all.txt' 18 | shell: 19 | """ 20 | ionice {params.ionice} cp -f {input} {output} 2> {log} 1>&2 21 | """ 22 | 23 | def dmnd_start_mem(wildcards, attempt, threads=12): 24 | """ 25 | Estimating the baseline memory to use for jobs, given the diamond database size 26 | """ 27 | prot_db_size = os.stat(config['tmp_dir'] + 'humann3_search/humann3_dmnd_db.dmnd').st_size / 1e9 28 | mem = round(prot_db_size * 5 / threads + 1.499,0) 29 | mem = (attempt - 1) * 2 + mem 30 | return int(mem) 31 | 32 | rule humann3_diamond_pass1: 33 | """ 34 | Annotating genes via 'diamond blastp' search of UniRef DB 35 | """ 36 | input: 37 | faa = config['tmp_dir'] + 'humann3_search/stdin.part_{splitID}.fasta', 38 | dmnd_db = config['tmp_dir'] + 'humann3_search/humann3_dmnd_db.dmnd' 39 | output: 40 | hits = temp(config['tmp_dir'] + 
'humann3_search/hits_pass1/{splitID}.txt'), 41 | unaln = temp(config['tmp_dir'] + 'humann3_search/unaln/{splitID}.faa') 42 | params: 43 | params = config['params']['humann3']['diamond']['run'], 44 | tmp_dir = config['tmp_dir'] + 'humann3_search_TMP/{splitID}/' 45 | threads: 46 | 8 47 | resources: 48 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 12, 49 | n = lambda wildcards, attempt, threads: threads, 50 | mem_gb_pt = dmnd_start_mem 51 | conda: 52 | '../../../../envs/humann3.yaml' 53 | log: 54 | log_dir + 'humann3_diamond_pass1/{splitID}.log' 55 | benchmark: 56 | benchmark_dir + 'humann3_diamond_pass1/{splitID}.txt' 57 | shell: 58 | """ 59 | TMPDIR="{params.tmp_dir}" 60 | mkdir -p $TMPDIR 2> {log} 61 | 62 | # diamond run 63 | diamond blastp {params.params} \ 64 | --tmpdir $TMPDIR --threads {threads} \ 65 | -q {input.faa} -d {input.dmnd_db} \ 66 | -o {output.hits} --un {output.unaln} \ 67 | --outfmt 6 qseqid sseqid evalue pident length slen \ 68 | 2>> {log} 1>&2 69 | """ 70 | 71 | rule humann3_diamond_pass2: 72 | """ 73 | Annotating genes via diamond search of UniRef DB (sensitive mode). 74 | Just running on genes not annotated by Pass1. 75 | """ 76 | input: 77 | faa = config['tmp_dir'] + 'humann3_search/unaln/{splitID}.faa', 78 | dmnd_db = config['tmp_dir'] + 'humann3_search/humann3_dmnd_db.dmnd' 79 | output: 80 | hits = temp(config['tmp_dir'] + 'humann3_search/hits_pass2/{splitID}.txt') 81 | params: 82 | params = config['params']['humann3']['diamond']['run'], 83 | tmp_dir = config['tmp_dir'] + 'humann3_search_TMP/{splitID}/' 84 | threads: 85 | 8 86 | resources: 87 | time = lambda wildcards, attempt: attempt * 60 * 48, 88 | n = lambda wildcards, attempt, threads: threads, 89 | mem_gb_pt = dmnd_start_mem 90 | conda: 91 | '../../../../envs/humann3.yaml' 92 | log: 93 | log_dir + 'humann3_diamond_pass2/{splitID}.log' 94 | benchmark: 95 | benchmark_dir + 'humann3_diamond_pass2/{splitID}.txt' 96 | shell: 97 | """ 98 | NSEQ=`seqkit seq -n {input.faa} | wc -l 2> {log}` 99 | if [[ "$NSEQ" -gt "0" ]]; then 100 | TMPDIR="{params.tmp_dir}" 101 | mkdir -p $TMPDIR 2>> {log} 102 | # diamond run 103 | diamond blastp --sensitive {params.params} \ 104 | --tmpdir $TMPDIR --threads {threads} \ 105 | -q {input.faa} -d {input.dmnd_db} -o {output.hits} \ 106 | --outfmt 6 qseqid sseqid evalue pident length slen \ 107 | 2>> {log} 1>&2 108 | else 109 | touch {output.hits} 2> {log} 1>&2 110 | echo "No unaligned sequences. 
Skipping DIAMOND" >> {log} 111 | fi 112 | """ 113 | 114 | localrules: humann3_diamond_merge 115 | 116 | rule humann3_diamond_merge: 117 | """ 118 | Merging the results of the 2 DIAMOND passes (all query fasta splits) 119 | """ 120 | input: 121 | hits1 = expand(config['tmp_dir'] + \ 122 | 'humann3_search/hits_pass1/{splitID}.txt', 123 | splitID=config['params']['humann3']['splits']), 124 | hits2 = expand(config['tmp_dir'] + \ 125 | 'humann3_search/hits_pass2/{splitID}.txt', 126 | splitID=config['params']['humann3']['splits']) 127 | output: 128 | hits = temp(config['tmp_dir'] + 'humann3_search/hits.txt') 129 | resources: 130 | time = lambda wildcards, attempt: attempt ** 3 * 59 131 | run: 132 | with open(output.hits, 'w') as outF: 133 | for F in input.hits1 + input.hits2: 134 | with open(F) as inF: 135 | for line in inF: 136 | outF.write(line) 137 | 138 | -------------------------------------------------------------------------------- /bin/db_create/kraken2/Snakefile: -------------------------------------------------------------------------------- 1 | if not config['names_dmp'].startswith('Skip') and \ 2 | not config['nodes_dmp'].startswith('Skip'): 3 | localrules: kraken2_cp_dump 4 | rule kraken2_cp_dump: 5 | """ 6 | Copying names/nodes taxdump files to kraken2 db directory 7 | """ 8 | input: 9 | names = ancient(config['names_dmp']), 10 | nodes = ancient(config['nodes_dmp']) 11 | output: 12 | names = config['tmp_dir'] + 'kraken2/taxonomy/names.dmp', 13 | nodes = config['tmp_dir'] + 'kraken2/taxonomy/nodes.dmp' 14 | log: 15 | log_dir + 'kraken2_cp_dump/all.log' 16 | shell: 17 | """ 18 | cp -f {input.names} {output.names} 2> {log} 19 | cp -f {input.nodes} {output.nodes} 2>> {log} 20 | chmod u+w {output.names} {output.nodes} 2>> {log} 21 | """ 22 | else: 23 | localrules: kraken2_build_download_tax 24 | rule kraken2_build_download_tax: 25 | """ 26 | Downloading NCBI taxdump files 27 | """ 28 | output: 29 | gb = config['tmp_dir'] + 'kraken2/taxonomy/nucl_gb.accession2taxid', 30 | wgs = config['tmp_dir'] + 'kraken2/taxonomy/nucl_wgs.accession2taxid', 31 | dump = config['tmp_dir'] + 'kraken2/taxonomy/taxdump.tar.gz', 32 | nodes = config['tmp_dir'] + 'kraken2/taxonomy/nodes.dmp', 33 | names = config['tmp_dir'] + 'kraken2/taxonomy/names.dmp', 34 | merged = config['tmp_dir'] + 'kraken2/taxonomy/merged.dmp' 35 | params: 36 | exe = config['pipeline']['script_folder'] + 'kraken2-build' 37 | conda: 38 | '../../envs/kraken2.yaml' 39 | log: 40 | log_dir + 'kraken_build_download_tax/all.log' 41 | benchmark: 42 | benchmark_dir + 'kraken_build_download_tax/all.txt' 43 | shell: 44 | """ 45 | OUTDIR=`dirname {output.gb}` 46 | OUTDIR=`dirname $OUTDIR` 47 | rm -rf $OUTDIR 2> {log} 48 | mkdir -p $OUTDIR 2>> {log} 49 | echo "# Downloading NCBI taxonomy to $OUTDIR" >> {log} 50 | {params.exe} --use-ftp --download-taxonomy --db $OUTDIR 2>> {log} 1>&2 51 | """ 52 | 53 | def kraken2_add_taxID_get_taxID(wildcards): 54 | """ 55 | Getting genome taxID from the user input table 56 | """ 57 | taxID = config['samples'].loc[wildcards.sample, config['taxID_col']] 58 | try: 59 | taxID = taxID.astype(str) 60 | except AttributeError: 61 | pass 62 | return taxID 63 | 64 | rule kraken2_add_taxID: 65 | """ 66 | Adding a taxonomy ID to the header of each genome. 67 | Assuming the taxID is in the samples table. 68 | Writing edited genome to temp dir. 
69 | 70 | Format: `kraken:taxid||` 71 | """ 72 | input: 73 | fasta = lambda wildcards: \ 74 | config['samples'].loc[wildcards.sample, config['fasta_file_path_col']] 75 | output: 76 | temp(config['tmp_dir'] + 'genomes/{sample}.fna') 77 | resources: 78 | time = lambda wildcards, attempt: attempt ** 2 * 59, 79 | mem_gb_pt = lambda wildcards, attempt: attempt * 6 80 | params: 81 | taxID = kraken2_add_taxID_get_taxID, 82 | exe = config['pipeline']['script_folder'] + 'kraken2_rename_genome.py' 83 | log: 84 | log_dir + 'kraken2_add_taxID/{sample}.log' 85 | benchmark: 86 | benchmark_dir + 'kraken2_add_taxID/{sample}.txt' 87 | shell: 88 | """ 89 | {params.exe} {input.fasta} {params.taxID} > {output} 2> {log} 90 | """ 91 | 92 | rule kraken2_build_add: 93 | """ 94 | Adding genome fasta files to the kraken database. 95 | Using the --add-to-library flag 96 | """ 97 | input: 98 | fasta = config['tmp_dir'] + 'genomes/{sample}.fna', 99 | nodes = config['tmp_dir'] + 'kraken2/taxonomy/nodes.dmp', 100 | names = config['tmp_dir'] + 'kraken2/taxonomy/names.dmp' 101 | output: 102 | done = temp(config['tmp_dir'] + 'kraken2/added/{sample}.done') 103 | threads: 104 | 4 105 | resources: 106 | time = lambda wildcards, attempt: attempt ** 2 * 59, 107 | n = lambda wildcards, attempt, threads: threads, 108 | mem_gb_pt = lambda wildcards, attempt: attempt * 3 109 | conda: 110 | '../../envs/kraken2.yaml' 111 | log: 112 | log_dir + 'kraken2_build_add/{sample}.log' 113 | benchmark: 114 | benchmark_dir + 'kraken2_build_add/{sample}.txt' 115 | shell: 116 | """ 117 | DB=`dirname {input.names}` 118 | DB=`dirname $DB` 119 | 120 | kraken2-build --threads {threads} \ 121 | --db $DB --add-to-library {input.fasta} \ 122 | 2> {log} 1>&2 123 | touch {output.done} 2>> {log} 124 | """ 125 | 126 | rule kraken2_build: 127 | """ 128 | Building the kraken database 129 | """ 130 | input: 131 | expand(config['tmp_dir'] + 'kraken2/added/{sample}.done', 132 | sample = config['samples_unique']) 133 | output: 134 | hash = temp(config['tmp_dir'] + 'kraken2/hash.k2d'), 135 | opts = temp(config['tmp_dir'] + 'kraken2/opts.k2d'), 136 | map = temp(config['tmp_dir'] + 'kraken2/seqid2taxid.map'), 137 | taxo = temp(config['tmp_dir'] + 'kraken2/taxo.k2d') 138 | threads: 139 | 12 140 | resources: 141 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 24, 142 | n = lambda wildcards, attempt, threads: threads, 143 | mem_gb_pt = lambda wildcards, attempt: int(round(attempt * 18 + 4,0)) 144 | conda: 145 | '../../envs/kraken2.yaml' 146 | log: 147 | log_dir + 'kraken2_build/all.log' 148 | benchmark: 149 | benchmark_dir + 'kraken2_build/all.txt' 150 | shell: 151 | """ 152 | DB=`dirname {output.hash}` 153 | kraken2-build --build --threads {threads} --db $DB 2> {log} 1>&2 154 | """ 155 | 156 | rule kraken2_db_copy: 157 | """ 158 | Copying the kraken2 database to the output directory 159 | """ 160 | input: 161 | hash = config['tmp_dir'] + 'kraken2/hash.k2d', 162 | opts = config['tmp_dir'] + 'kraken2/opts.k2d', 163 | taxo = config['tmp_dir'] + 'kraken2/taxo.k2d', 164 | map = config['tmp_dir'] + 'kraken2/seqid2taxid.map' 165 | output: 166 | hash = kraken2_dir + 'hash.k2d', 167 | opts = kraken2_dir + 'opts.k2d', 168 | taxo = kraken2_dir + 'taxo.k2d', 169 | map = kraken2_dir + 'seqid2taxid.map' 170 | params: 171 | keep = config['keep_intermediate'], 172 | ionice = config['params']['ionice'] 173 | resources: 174 | time = lambda wildcards, attempt: attempt ** 3 * 59 175 | log: 176 | log_dir + 'kraken2_db_copy/all.log' 177 | benchmark: 178 | benchmark_dir + 
'kraken2_db_copy/all.txt' 179 | shell: 180 | """ 181 | if [ "{params.keep}" == "True" ]; then 182 | echo "# copying entire kraken db" > {log} 183 | DIR1=`dirname {input.hash}` 184 | DIR2=`dirname {output.hash}` 185 | rm -rf $DIR2 2>> {log} 1>&2 186 | ionice {params.ionice} cp -rf $DIR1 $DIR2 2>> {log} 1>&2 187 | else 188 | echo "# copying just built kraken index files" > {log} 189 | ionice {params.ionice} cp -f {input.hash} {output.hash} 2>> {log} 1>&2 190 | ionice {params.ionice} cp -f {input.opts} {output.opts} 2>> {log} 1>&2 191 | ionice {params.ionice} cp -f {input.map} {output.map} 2>> {log} 1>&2 192 | ionice {params.ionice} cp -f {input.taxo} {output.taxo} 2>> {log} 1>&2 193 | fi 194 | """ 195 | 196 | rule kraken2_tmp_db_rm: 197 | """ 198 | Removing temporary kraken2 db directory 199 | """ 200 | input: 201 | hash_tmp = config['tmp_dir'] + 'kraken2/hash.k2d', 202 | hash = kraken2_dir + 'hash.k2d', 203 | opts = kraken2_dir + 'opts.k2d', 204 | taxo = kraken2_dir + 'taxo.k2d', 205 | map = kraken2_dir + 'seqid2taxid.map' 206 | output: 207 | done = kraken2_dir + 'tmp_db_rm.done' 208 | resources: 209 | time = lambda wildcards, attempt: attempt ** 3 * 59 210 | log: 211 | log_dir + 'kraken2_tmp_db_rm/all.log' 212 | shell: 213 | """ 214 | rm -rf `dirname {input.hash_tmp}` 2> {log} 1>&2 215 | touch {output.done} 2>> {log} 216 | """ 217 | -------------------------------------------------------------------------------- /bin/db_update/Snakefile: -------------------------------------------------------------------------------- 1 | #-- database update workflow --# 2 | import gzip 3 | 4 | # input processing 5 | ## outdir 6 | config['output_dir'] = config['output_dir'].rstrip('/') + '/' 7 | print('\33[33mUsing output directory: {} \x1b[0m'.format(config['output_dir'])) 8 | 9 | ## Samples table 10 | if skipped(os.path.split(config['samples_file'])[1]): 11 | config['samples'] = None 12 | else: 13 | if not os.path.isfile(config['samples_file']): 14 | raise IOError('Cannot find file: {}'.format(config['samples_file'])) 15 | config['samples'] = pd.read_csv(config['samples_file'], sep='\t') 16 | ### required columns 17 | for f in [config['samples_col'], config['accession_col'], config['fasta_file_path_col'], 18 | config['taxID_col'], config['taxonomy_col']]: 19 | if f not in config['samples'].columns: 20 | raise ValueError('Cannot find column: {}'.format(f)) 21 | config['samples'][config['samples_col']] = config['samples'][config['samples_col']].str.replace('[^A-Za-z0-9]+', '_', regex=True) 22 | config['samples'] = config['samples'].set_index(config['samples'][config['samples_col']]) 23 | 24 | ### check that files exist (skipping if not) 25 | if config['samples'] is not None: 26 | rowID = 0 27 | to_rm = [] 28 | for index,row in config['samples'].iterrows(): 29 | rowID += 1 30 | file_cols = [config['fasta_file_path_col']] 31 | for f in file_cols: 32 | if not os.path.isfile(str(row[f])): 33 | msg = 'Samples table (Row {}): Cannot find file: {}; Skipping\n' 34 | sys.stderr.write(msg.format(rowID, row[f])) 35 | to_rm.append(row[config['samples_col']]) 36 | ssw('\33[33mNumber of skipped sample table entries: {}\n\x1b[0m'.format(len(to_rm))) 37 | config['samples'].drop(to_rm, inplace=True) 38 | if config['samples'].shape[0] < 1: 39 | raise ValueError('No genomes remaining after filtering!') 40 | config['samples_unique'] = config['samples'][config['samples_col']].unique().tolist() 41 | 42 | # check that user-gene info (if provided) has all of the required columns 43 | if (not 
skipped(config['new_genes']['amino_acid']) or 44 | not skipped(config['new_genes']['nucleotide'])): 45 | if skipped(config['new_genes']['metadata']) or config['new_genes']['metadata'] == '': 46 | raise IOError('User-provided genes metadata file is required, but was not provided!') 47 | req_cols = ['seq_uuid', 'seq_orig_name', 'genus', 'species', 'taxid'] 48 | if config['new_genes']['metadata'].endswith('.gz'): 49 | _open = lambda x: gzip.open(x, 'rb') 50 | else: 51 | _open = lambda x: open(x) 52 | with _open(config['new_genes']['metadata']) as inF: 53 | for i,line in enumerate(inF): 54 | if i > 0: 55 | break 56 | if config['new_genes']['metadata'].endswith('.gz'): 57 | line = line.decode('utf-8') 58 | line = line.rstrip().split('\t') 59 | missing = [x for x in req_cols if not x in line] 60 | if len(missing) > 0: 61 | msg = 'Missing required columns in user-provided genes metadata file: {}' 62 | raise ValueError(msg.format(','.join(missing))) 63 | if line[0] != 'seq_uuid': 64 | msg = 'The first column of the gene metadata table must be "seq_uuid"' 65 | raise ValueError(msg) 66 | 67 | ## temp_folder 68 | config['pipeline']['username'] = getpass.getuser() 69 | config['pipeline']['email'] = config['email'] 70 | config['tmp_dir'] = os.path.join(config['tmp_dir'], config['pipeline']['username']) 71 | config['tmp_dir'] = os.path.join(config['tmp_dir'], 'Struo2_' + str(os.stat('.').st_ino) + '/') 72 | print('\33[33mUsing temporary directory: {} \x1b[0m'.format(config['tmp_dir'])) 73 | 74 | ## batches 75 | config['params']['humann3']['splits'] = \ 76 | make_fasta_splits(config['params']['humann3']['batches']) 77 | 78 | ## including modular snakefiles 79 | print('\33[36m--Running db-update pipeline--\x1b[0m') 80 | snake_dir = config['pipeline']['snakemake_folder'] 81 | include: snake_dir + 'bin/dirs' 82 | ### Adding genomes to kraken/bracken (user cannot provide gene list) 83 | if (config['samples'] is not None and 84 | not skipped(config['databases']['kraken2'])): 85 | print('\33[36m* Updating kraken2 database\x1b[0m') 86 | include: snake_dir + 'bin/db_update/kraken2/Snakefile' 87 | if not skipped(config['databases']['bracken']): 88 | print('\33[36m* Updating bracken database\x1b[0m') 89 | include: snake_dir + 'bin/db_update/bracken/Snakefile' 90 | ### Updating genes db 91 | if not skipped(config['databases']['genes']): 92 | print('\33[36m* Updating genes database\x1b[0m') 93 | include: snake_dir + 'bin/db_update/genes/Snakefile' 94 | ### updating humann databases 95 | if not skipped(config['databases']['humann3_bowtie2']) and \ 96 | not skipped(config['databases']['humann3_diamond']): 97 | print('\33[36m* Updating humann database\x1b[0m') 98 | include: snake_dir + 'bin/db_update/humann3/Snakefile' 99 | 100 | -------------------------------------------------------------------------------- /bin/db_update/bracken/Snakefile: -------------------------------------------------------------------------------- 1 | rule bracken_build: 2 | """ 3 | Build bracken database from kraken2 database 4 | """ 5 | input: 6 | kraken2_db = kraken2_dir + 'hash.k2d' 7 | output: 8 | krk = kraken2_dir + 'database{read_len}mers.kraken', 9 | krkd = kraken2_dir + 'database{read_len}mers.kmer_distrib' 10 | params: 11 | kmer = config['params']['bracken']['build_kmer'], 12 | exe = config['pipeline']['script_folder'] + 'bracken-build.py', 13 | read_len = lambda wildcards: wildcards.read_len, 14 | threads: 15 | 12 16 | resources: 17 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 24, 18 | n = lambda wildcards, attempt, threads: 
threads, 19 | mem_gb_pt = lambda wildcards, attempt: attempt * 16 20 | conda: 21 | '../../envs/kraken2.yaml' 22 | log: 23 | log_dir + 'bracken_build/ReadLen{read_len}.log' 24 | benchmark: 25 | benchmark_dir + 'bracken_build/ReadLen{read_len}.txt' 26 | shell: 27 | """ 28 | # location of the kraken2 db files 29 | DB=`dirname {input.kraken2_db}` 30 | # removing existing files possibly created by bracken 31 | TMP_FILE=$DB"/database.kraken" 32 | rm -f $TMP_FILE 33 | # running bracken 34 | {params.exe} -t {threads} -d $DB \ 35 | -k {params.kmer} -l {params.read_len} \ 36 | 2> {log} 1>&2 37 | """ 38 | 39 | -------------------------------------------------------------------------------- /bin/db_update/genes/Snakefile: -------------------------------------------------------------------------------- 1 | #-- gene database update workflow --# 2 | 3 | # Adding genes to genes_db (user can provide genomes or gene list) 4 | include: snake_dir + 'bin/db_update/genes/input/Snakefile' 5 | # updating the mmseqs cluster database 6 | include: snake_dir + 'bin/db_update/genes/db_update/Snakefile' 7 | -------------------------------------------------------------------------------- /bin/db_update/genes/input/Snakefile: -------------------------------------------------------------------------------- 1 | #-- dealing with all possible gene inputs for this workflow --# 2 | 3 | # checking/validating input 4 | include: snake_dir + 'bin/db_update/genes/input/check/Snakefile' 5 | # gene input 6 | if (not skipped(config['new_genes']['amino_acid']) or 7 | not skipped(config['new_genes']['nucleotide'])): 8 | # assuming user provided a set of genes 9 | print('\33[36m * Using user-provided set of gene sequences\x1b[0m') 10 | include: snake_dir + 'bin/db_update/genes/input/from_gene_set/Snakefile' 11 | else: 12 | # assuming the user provided a list of genomes to extract genes from 13 | print('\33[36m * Extracting new genes from user-provided genomes\x1b[0m') 14 | include: snake_dir + 'bin/db_update/genes/input/from_genomes/Snakefile' 15 | -------------------------------------------------------------------------------- /bin/db_update/genes/input/check/Snakefile: -------------------------------------------------------------------------------- 1 | #-- checking/validating input formats --# 2 | 3 | localrules: genes_check_input 4 | 5 | if skipped(config['genes_db']['genes']['nucleotide']): 6 | rule genes_check_input: 7 | """ 8 | Checking that input is formatted correctly. 9 | Skipping nucleotide check. 
10 | """ 11 | input: 12 | faa = config['genes_db']['genes']['amino_acid'], 13 | txt = config['genes_db']['genes']['metadata'] 14 | output: 15 | fna = temp(config['tmp_dir'] + 'db_update/orig.fna'), 16 | faa = temp(config['tmp_dir'] + 'db_update/orig.faa'), 17 | txt = temp(config['tmp_dir'] + 'db_update/orig.txt'), 18 | done = genes_dir + 'genes_input.check.done' 19 | params: 20 | ionice = config['params']['ionice'], 21 | exe1 = config['pipeline']['script_folder'] + 'cat_files.py', 22 | exe2 = config['pipeline']['script_folder'] + 'check_gene_info.py' 23 | conda: 24 | '../../../../envs/genes.yaml' 25 | resources: 26 | time = lambda wildcards, attempt: attempt ** 2 * 59 27 | log: 28 | log_dir + 'genes_check_input/all.log' 29 | shell: 30 | """ 31 | # copy 32 | touch {output.fna} 2> {log} 1>&2 33 | ionice {params.ionice} seqkit seq -v {input.faa} > {output.faa} 2>> {log} 34 | ionice {params.ionice} {params.exe1} {input.txt} > {output.txt} 2>> {log} 35 | # check on data content 36 | {params.exe2} {input.faa} {input.txt} 2>> {log} 1>&2 37 | touch {output.done} 2>> {log} 38 | """ 39 | else: 40 | rule genes_check_input: 41 | """ 42 | Checking that input is formatted correctly. 43 | """ 44 | input: 45 | fna = config['genes_db']['genes']['nucleotide'], 46 | faa = config['genes_db']['genes']['amino_acid'], 47 | txt = config['genes_db']['genes']['metadata'] 48 | output: 49 | fna = temp(config['tmp_dir'] + 'db_update/orig.fna'), 50 | faa = temp(config['tmp_dir'] + 'db_update/orig.faa'), 51 | txt = temp(config['tmp_dir'] + 'db_update/orig.txt'), 52 | done = genes_dir + 'genes_input.check.done' 53 | params: 54 | ionice = config['params']['ionice'], 55 | exe1 = config['pipeline']['script_folder'] + 'cat_files.py', 56 | exe2 = config['pipeline']['script_folder'] + 'check_gene_info.py' 57 | conda: 58 | '../../../../envs/genes.yaml' 59 | resources: 60 | time = lambda wildcards, attempt: attempt ** 2 * 59 61 | log: 62 | log_dir + 'genes_check_input/all.log' 63 | shell: 64 | """ 65 | # copy 66 | ionice {params.ionice} seqkit seq -v {input.fna} > {output.fna} 2> {log} 67 | ionice {params.ionice} seqkit seq -v {input.faa} > {output.faa} 2>> {log} 68 | ionice {params.ionice} {params.exe1} {input.txt} > {output.txt} 2>> {log} 69 | # check on data content 70 | {params.exe2} -n {input.fna} {input.faa} {input.txt} 2>> {log} 1>&2 71 | touch {output.done} 2>> {log} 72 | """ 73 | 74 | -------------------------------------------------------------------------------- /bin/db_update/genes/input/from_genomes/Snakefile: -------------------------------------------------------------------------------- 1 | #-- obtaining new genes from a list of genomes & adding them to the existing genes database --# 2 | 3 | rule prodigal: 4 | """ 5 | For each genome, running prodigal to call genes 6 | """ 7 | input: 8 | fasta = lambda wildcards: \ 9 | config['samples'].loc[wildcards.sample, config['fasta_file_path_col']] 10 | output: 11 | fna = temp(config['tmp_dir'] + 'db_update/prodigal/{sample}.fna'), 12 | faa = temp(config['tmp_dir'] + 'db_update/prodigal/{sample}.faa'), 13 | gbk = temp(config['tmp_dir'] + 'db_update/prodigal/{sample}.gbk') 14 | params: 15 | params = config['params']['genes']['prodigal'] 16 | resources: 17 | time = lambda wildcards, attempt: attempt ** 2 * 59, 18 | n = lambda wildcards, attempt, threads: threads, 19 | mem_gb_pt = lambda wildcards, attempt: attempt * 8 + 14 20 | conda: 21 | '../../../../envs/genes.yaml' 22 | log: 23 | log_dir + 'prodigal/{sample}.log' 24 | benchmark: 25 | benchmark_dir + 
'prodigal/{sample}.txt' 26 | shell: 27 | """ 28 | gunzip -c {input.fasta} | \ 29 | prodigal {params.params} \ 30 | -o {output.gbk} -d {output.fna} -a {output.faa} \ 31 | 2> {log} 1>&2 32 | """ 33 | 34 | rule vsearch_per_genome: 35 | """ 36 | For each genome, clustering genes (at nuc level) and taking the centroid. 37 | """ 38 | input: 39 | fna = config['tmp_dir'] + 'db_update/prodigal/{sample}.fna', 40 | faa = config['tmp_dir'] + 'db_update/prodigal/{sample}.faa' 41 | output: 42 | reps = temp(config['tmp_dir'] + 'db_update/vsearch/{sample}_reps.fna') 43 | params: 44 | params = config['params']['genes']['vsearch_per_genome'] 45 | threads: 46 | 4 47 | resources: 48 | time = lambda wildcards, attempt: attempt ** 3 * 59, 49 | n = lambda wildcards, attempt, threads: threads, 50 | mem_gb_pt = lambda wildcards, attempt: attempt * 3 51 | conda: 52 | '../../../../envs/genes.yaml' 53 | log: 54 | log_dir + 'vsearch_per_genome/{sample}.log' 55 | benchmark: 56 | benchmark_dir + 'vsearch_per_genome/{sample}.txt' 57 | shell: 58 | """ 59 | vsearch {params.params} \ 60 | --threads {threads} \ 61 | --cluster_fast {input.fna} \ 62 | --centroids {output.reps} \ 63 | 2> {log} 1>&2 64 | """ 65 | 66 | rule filter_gene_seqs: 67 | """ 68 | Filtering the amino acid gene sequences to just the vsearch cluster reps (nucleotide). 69 | Renaming as [seq_name]|gene_length|taxonomy 70 | """ 71 | input: 72 | fasta = lambda wildcards: \ 73 | config['samples'].loc[wildcards.sample, config['fasta_file_path_col']], 74 | reps = config['tmp_dir'] + 'db_update/vsearch/{sample}_reps.fna', 75 | faa = config['tmp_dir'] + 'db_update/prodigal/{sample}.faa' 76 | output: 77 | fna = temp(config['tmp_dir'] + 'db_update/nuc_filtered/{sample}_reps.fna'), 78 | faa = temp(config['tmp_dir'] + 'db_update/prot_filtered/{sample}_reps.faa'), 79 | txt = temp(config['tmp_dir'] + 'db_update/names_filtered/{sample}_reps.txt') 80 | params: 81 | exe = config['pipeline']['script_folder'] + 'filter_seqs.py', 82 | tax = lambda wildcards: \ 83 | config['samples'].loc[wildcards.sample, config['taxonomy_col']], 84 | taxID = lambda wildcards: \ 85 | config['samples'].loc[wildcards.sample, config['taxID_col']], 86 | acc = lambda wildcards: \ 87 | config['samples'].loc[wildcards.sample, config['accession_col']] 88 | resources: 89 | time = lambda wildcards, attempt: attempt ** 2 * 59, 90 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 8 91 | conda: 92 | '../../../../envs/genes.yaml' 93 | log: 94 | log_dir + 'filter_gene_seqs/{sample}.log' 95 | benchmark: 96 | benchmark_dir + 'filter_gene_seqs/{sample}.txt' 97 | shell: 98 | """ 99 | {params.exe} --taxonomy "{params.tax}" \ 100 | --taxID {params.taxID} \ 101 | --accession {params.acc} \ 102 | --genome-file {input.fasta} \ 103 | {input.reps} {input.faa} \ 104 | {output.fna} {output.faa} \ 105 | > {output.txt} 2> {log} 106 | """ 107 | 108 | rule genes_combine_fna: 109 | """ 110 | For all per-genome de-replicated genes of all genomes, combining into 1 collection. 111 | Including original genes. 
112 | """ 113 | input: 114 | fna1 = config['tmp_dir'] + 'db_update/orig.fna', 115 | fna2 = expand(config['tmp_dir'] + 'db_update/nuc_filtered/{sample}_reps.fna', 116 | sample = config['samples_unique']) 117 | output: 118 | fna = temp(config['tmp_dir'] + 'db_update/filtered_reps.fna') 119 | resources: 120 | time = lambda wildcards, attempt: attempt ** 2 * 59 121 | run: 122 | cat_files([input.fna1], input.fna2, outfile=output.fna) 123 | 124 | rule genes_combine_faa: 125 | """ 126 | For all per-genome de-replicated genes of all genomes, combining into 1 collection. 127 | Including original genes. 128 | """ 129 | input: 130 | faa1 = config['tmp_dir'] + 'db_update/orig.faa', 131 | faa2 = expand(config['tmp_dir'] + 'db_update/prot_filtered/{sample}_reps.faa', 132 | sample = config['samples_unique']) 133 | output: 134 | faa = temp(config['tmp_dir'] + 'db_update/filtered_reps.faa') 135 | resources: 136 | time = lambda wildcards, attempt: attempt ** 2 * 59 137 | run: 138 | cat_files([input.faa1], input.faa2, outfile=output.faa) 139 | 140 | rule genes_combine_txt: 141 | """ 142 | For all per-genome de-replicated genes of all genomes, combining into 1 collection. 143 | Including original genes. 144 | """ 145 | input: 146 | txt1 = config['tmp_dir'] + 'db_update/orig.txt', 147 | txt2 = expand(config['tmp_dir'] + \ 148 | 'db_update/names_filtered/{sample}_reps.txt', 149 | sample = config['samples_unique']) 150 | output: 151 | txt = temp(config['tmp_dir'] + 'db_update/filtered_reps.txt') 152 | resources: 153 | time = lambda wildcards, attempt: attempt ** 2 * 59 154 | run: 155 | cat_files([input.txt1], input.txt2, outfile=output.txt, header=True) 156 | 157 | rule copy_gene_info: 158 | """ 159 | Copying/compressing gene data to the output directory. 160 | """ 161 | input: 162 | fna = config['tmp_dir'] + 'db_update/filtered_reps.fna', 163 | faa = config['tmp_dir'] + 'db_update/filtered_reps.faa', 164 | txt = config['tmp_dir'] + 'db_update/filtered_reps.txt' 165 | output: 166 | fna = genes_dir + 'genome_reps_filtered.fna.gz', 167 | faa = genes_dir + 'genome_reps_filtered.faa.gz', 168 | txt = genes_dir + 'genome_reps_filtered.txt.gz' 169 | params: 170 | ionice = config['params']['ionice'] 171 | resources: 172 | time = lambda wildcards, attempt: attempt ** 3 * 59, 173 | mem_gb_pt = lambda wildcards, attempt: attempt * 8 174 | log: 175 | log_dir + 'copy_gene_info/all.log' 176 | benchmark: 177 | benchmark_dir + 'copy_gene_info/all.txt' 178 | shell: 179 | """ 180 | ionice {params.ionice} gzip -c {input.fna} > {output.fna} 2> {log} 181 | ionice {params.ionice} gzip -c {input.faa} > {output.faa} 2>> {log} 182 | ionice {params.ionice} gzip -c {input.txt} > {output.txt} 2>> {log} 183 | """ 184 | 185 | rule mmseqs_db_create: 186 | """ 187 | Creating mmseqs2 database that will be used for updating the existing cluster database. 188 | This database includes all genes (original & new). 
189 | """ 190 | input: 191 | fna = config['tmp_dir'] + 'db_update/filtered_reps.fna', 192 | faa = config['tmp_dir'] + 'db_update/filtered_reps.faa', 193 | txt = config['tmp_dir'] + 'db_update/filtered_reps.txt' 194 | output: 195 | db = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db'), 196 | db_t = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db.dbtype'), 197 | db_i = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db.index'), 198 | db_l = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db.lookup'), 199 | db_s = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db.source'), 200 | db_h = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db_h'), 201 | db_ht = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db_h.dbtype'), 202 | db_hi = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db_h.index') 203 | resources: 204 | time = lambda wildcards, attempt: attempt ** 3 * 59, 205 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 20 + 20 206 | conda: 207 | '../../../../envs/genes.yaml' 208 | log: 209 | log_dir + 'mmseqs_db_create/all.log' 210 | benchmark: 211 | benchmark_dir + 'mmseqs_db_create/all.txt' 212 | shell: 213 | """ 214 | mmseqs createdb {input.faa} {output.db} 2> {log} 1>&2 215 | """ 216 | 217 | -------------------------------------------------------------------------------- /bin/db_update/humann3/Snakefile: -------------------------------------------------------------------------------- 1 | # input 2 | if not skipped(config['databases']['genes']): 3 | print('\33[36m * Using updated genes database\x1b[0m') 4 | include: snake_dir + 'bin/db_update/humann3/input_from_genes/Snakefile' 5 | else: 6 | msg = '\33[35m X For user-provided gene sequences' 7 | msg += ' you must update the genes database!\x1b[0m' 8 | print(msg) 9 | sys.exit(1) 10 | include: snake_dir + 'bin/db_update/humann3/prepare_query/Snakefile' 11 | # query 12 | if (not skipped(config['params']['humann3']['mmseqs_search']['db']) and 13 | not skipped(config['params']['humann3']['mmseqs_search']['index']) and 14 | not skipped(config['params']['humann3']['mmseqs_search']['run'])): 15 | # checking on database 16 | if not re.search(config['uniref_name'], 17 | str(config['params']['humann3']['mmseqs_search']['db']).lower()): 18 | print('\33[35m * WARNING the uniref_name does not match the query database\x1b[0m') 19 | print('\33[35m * ({} <=> {})\x1b[0m'.format(config['uniref_name'], 20 | config['params']['humann3']['mmseqs_search']['db'])) 21 | # mmseqs search 22 | print('\33[36m * Annotating via "mmseqs search"\x1b[0m') 23 | include: snake_dir + 'bin/db_update/humann3/query_mmseqs/Snakefile' 24 | elif (not skipped(config['params']['humann3']['diamond']['db']) and 25 | not skipped(config['params']['humann3']['diamond']['run'])): 26 | # checking on database 27 | if not re.search(config['uniref_name'], 28 | str(config['params']['humann3']['diamond']['db']).lower()): 29 | print('\33[35m * WARNING the uniref_name does not match the query database\x1b[0m') 30 | print('\33[35m * ({} <=> {})\x1b[0m'.format(config['uniref_name'], 31 | config['params']['humann3']['diamond']['db'])) 32 | # diamond blastp 33 | print('\33[36m * Annotating via "diamond blastp"\x1b[0m') 34 | include: snake_dir + 'bin/db_update/humann3/query_dmnd/Snakefile' 35 | else: 36 | print('\33[31m ERROR: all query methods skipped!\x1b[0m') 37 | # database creation 38 | include: snake_dir + 'bin/db_update/humann3/db_create/Snakefile' 39 | -------------------------------------------------------------------------------- 
/bin/db_update/humann3/db_create/Snakefile: -------------------------------------------------------------------------------- 1 | def which_membership(wildcards): 2 | """ 3 | Which membership file to use as input 4 | """ 5 | if not skipped(config['databases']['genes']): 6 | return genes_dir + 'cluster/clusters_membership.tsv.gz' 7 | else: 8 | return config['humann_db']['cluster']['membership'] 9 | 10 | rule humann3_annotate_genes: 11 | """ 12 | Use search hits from clustered reps & index table to annotate all 13 | genome-derep genes. The annotation for each cluster rep is propagated 14 | to each member of the cluster. 15 | """ 16 | input: 17 | hits = config['tmp_dir'] + 'humann3/hits.txt', 18 | fna = config['tmp_dir'] + 'humann3/genome_reps_filtered.fna', 19 | faa = config['tmp_dir'] + 'humann3/genome_reps_filtered.faa', 20 | txt = config['tmp_dir'] + 'humann3/genome_reps_filtered.txt', 21 | tsv = which_membership 22 | output: 23 | fna = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.fna'), 24 | faa = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.faa'), 25 | tsv = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.tsv') 26 | params: 27 | exe = config['pipeline']['script_folder'] + 'propagate_annotations.py', 28 | params = config['params']['humann3']['propagate_annotations'] 29 | resources: 30 | time = lambda wildcards, attempt: attempt ** 3 * 59, 31 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 3 + 11 32 | log: 33 | log_dir + 'humann3_annotate_genes/all.log' 34 | benchmark: 35 | benchmark_dir + 'humann3_annotate_genes/all.txt' 36 | shell: 37 | """ 38 | {params.exe} {params.params} \ 39 | --in-nuc {input.fna} \ 40 | --out-nuc {output.fna} \ 41 | --out-prot {output.faa} \ 42 | {input.hits} {input.faa} \ 43 | {input.txt} {input.tsv} \ 44 | > {output.tsv} 2> {log} 45 | """ 46 | 47 | rule humann3_annotate_hits_copy: 48 | """ 49 | Copying query hits (diamond or mmseqs) to the final output directory 50 | """ 51 | input: 52 | hits = config['tmp_dir'] + 'humann3/hits.txt' 53 | output: 54 | hits = humann3_dir + 'annotation_hits.gz' 55 | params: 56 | ionice = config['params']['ionice'] 57 | resources: 58 | time = lambda wildcards, attempt: attempt ** 3 * 59 59 | log: 60 | log_dir + 'humann3_annotate_hits_copy/all.log' 61 | shell: 62 | """ 63 | ionice {params.ionice} gzip -c {input.hits} > {output.hits} 2> {log} 64 | """ 65 | 66 | rule humann3_alt_annotate: 67 | """ 68 | Re-annotating with different UniRef cluster resolution 69 | """ 70 | input: 71 | fna = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.fna', 72 | faa = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.faa', 73 | tsv = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.tsv', 74 | idx = ancient(config['cluster_idx']) 75 | output: 76 | fna = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 77 | '/genome_reps_filt_annot.fna'), 78 | faa = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 79 | '/genome_reps_filt_annot.faa'), 80 | tsv = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 81 | '/genome_reps_filt_annot.tsv') 82 | params: 83 | exe = config['pipeline']['script_folder'] + 'uniref_clst_trans.py' 84 | resources: 85 | time = lambda wildcards, attempt: attempt ** 3 * 59, 86 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 3 + 15 87 | log: 88 | log_dir + 'humann3_alt_annotate/all.log' 89 | benchmark: 90 | benchmark_dir + 'humann3_alt_annotate/all.txt' 91 | shell: 
92 | """ 93 | OUTDIR=`dirname {output.fna}` 94 | mkdir -p $OUTDIR 2> {log} 95 | {params.exe} {input.idx} \ 96 | --in-nuc {input.fna} \ 97 | --in-prot {input.faa} \ 98 | --in-tsv {input.tsv} \ 99 | --out-nuc {output.fna} \ 100 | --out-prot {output.faa} \ 101 | --out-tsv {output.tsv} \ 102 | 2>> {log} 1>&2 103 | """ 104 | 105 | rule humann3_annotate_genes_copy: 106 | """ 107 | Copying annotated gene files to the final directory 108 | """ 109 | input: 110 | fna = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.fna', 111 | faa = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.faa', 112 | tsv = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.tsv' 113 | output: 114 | fna = humann3_dir + '{uniref}/genome_reps_filt_annot.fna.gz', 115 | faa = humann3_dir + '{uniref}/genome_reps_filt_annot.faa.gz', 116 | tsv = humann3_dir + '{uniref}/genome_reps_filt_annot.tsv.gz' 117 | params: 118 | ionice = config['params']['ionice'] 119 | resources: 120 | time = lambda wildcards, attempt: attempt ** 3 * 59 121 | log: 122 | log_dir + 'humann3_annotate_genes_copy/{uniref}.log' 123 | benchmark: 124 | benchmark_dir + 'humann3_annotate_genes_copy/{uniref}.txt' 125 | shell: 126 | """ 127 | ionice {params.ionice} gzip -c {input.fna} > {output.fna} 2> {log} 128 | ionice {params.ionice} gzip -c {input.faa} > {output.faa} 2>> {log} 129 | ionice {params.ionice} gzip -c {input.tsv} > {output.tsv} 2>> {log} 130 | """ 131 | 132 | rule humann3_bowtie2_build: 133 | """ 134 | Running bowtie2 build on combined, annotated genes 135 | """ 136 | input: 137 | fna = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.fna' 138 | output: 139 | os.path.join(humann3_dir, '{uniref}/bowtie2_build.done') 140 | params: 141 | prefix = humann3_dir + '{uniref}/genome_reps_filt_annot' 142 | conda: 143 | '../../../envs/humann3.yaml' 144 | threads: 145 | 12 146 | resources: 147 | time = lambda wildcards, attempt: attempt * 2 * 60 * 24, 148 | n = lambda wildcards, attempt, threads: threads, 149 | mem_gb_pt = lambda wildcards, attempt: attempt ** 2 * 2 + 8, 150 | lg_idx = lambda wildcards, attempt: '--large-index' if attempt > 1 else '' 151 | log: 152 | log_dir + 'humann3_bowtie2_build/{uniref}.log' 153 | benchmark: 154 | benchmark_dir + 'humann3_bowtie2_build/{uniref}.txt' 155 | shell: 156 | """ 157 | bowtie2-build --threads {threads} {resources.lg_idx} \ 158 | {input.fna} {params.prefix} 2> {log} 1>&2 159 | 160 | # check that output exists 161 | OUTDIR=`dirname {params.prefix}` 162 | IDX_FILES=`find $OUTDIR -maxdepth 1 -name "*.bt2*"` 163 | IDX_FILES=`echo $IDX_FILES | perl -pe 's/ +/\n/g' | wc -l` 164 | if [ $IDX_FILES -lt 1 ]; then 165 | echo "ERROR: no bowtie2 index files found!" 
166 | exit 1 167 | fi 168 | touch {output} 2>> {log} 169 | """ 170 | 171 | rule humann3_diamond_makedb: 172 | """ 173 | Running diamond makedb on combined, annotated genes 174 | """ 175 | input: 176 | faa = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.faa' 177 | output: 178 | humann3_dir + '{uniref}/protein_database/{dmnd}' 179 | params: 180 | tmp_dir = config['tmp_dir'] 181 | conda: 182 | '../../../envs/humann3.yaml' 183 | resources: 184 | time = lambda wildcards, attempt: attempt * 2 * 60 * 24, 185 | mem_gb_pt = lambda wildcards, attempt: (attempt ** 3 + 2) * 12 186 | log: 187 | log_dir + 'humann3_diamond_makedb/{uniref}/{dmnd}.log' 188 | benchmark: 189 | benchmark_dir + 'humann3_diamond_makedb/{uniref}/{dmnd}.txt' 190 | shell: 191 | """ 192 | PREF=`echo {output} | perl -pe 's/\.[^.]+$//'` 193 | diamond makedb --in {input.faa} -d $PREF 2> {log} 1>&2 194 | """ 195 | 196 | 197 | -------------------------------------------------------------------------------- /bin/db_update/humann3/input_from_genes/Snakefile: -------------------------------------------------------------------------------- 1 | # copying from the genes-db-update pipeline output 2 | 3 | rule humann3_copy_gene_input: 4 | """ 5 | Copying/uncompressing the updated gene files (fna, faa, metadata) to the temp directory 6 | """ 7 | input: 8 | fna = genes_dir + 'genome_reps_filtered.fna.gz', 9 | faa = genes_dir + 'genome_reps_filtered.faa.gz', 10 | txt = genes_dir + 'genome_reps_filtered.txt.gz' 11 | output: 12 | fna = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.fna'), 13 | faa = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.faa'), 14 | txt = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.txt') 15 | params: 16 | ionice = config['params']['ionice'], 17 | exe = config['pipeline']['script_folder'] + 'cat_files.py' 18 | resources: 19 | time = lambda wildcards, attempt: attempt ** 3 * 59 20 | log: 21 | log_dir + 'humann3_copy_gene_input/all.log' 22 | shell: 23 | """ 24 | ionice {params.ionice} {params.exe} {input.fna} > {output.fna} 2> {log} 25 | ionice {params.ionice} {params.exe} {input.faa} > {output.faa} 2>> {log} 26 | ionice {params.ionice} {params.exe} {input.txt} > {output.txt} 2>> {log} 27 | """ 28 | 29 | rule humann3_copy_cluster_input: 30 | """ 31 | Copying/uncompressing the existing cluster data (query hits, cluster reps, cluster membership) to the temp directory 32 | """ 33 | input: 34 | hit_c = config['humann_db']['query']['hits'], 35 | faa_c = genes_dir + 'cluster/clusters_reps.faa.gz', 36 | mem_c = genes_dir + 'cluster/clusters_membership.tsv.gz', 37 | output: 38 | hit_c = temp(config['tmp_dir'] + 'humann3/query_hits.txt'), 39 | faa_c = temp(config['tmp_dir'] + 'humann3/clusters_reps.faa'), 40 | mem_c = temp(config['tmp_dir'] + 'humann3/clusters_membership.tsv') 41 | params: 42 | ionice = config['params']['ionice'], 43 | exe = config['pipeline']['script_folder'] + 'cat_files.py' 44 | resources: 45 | time = lambda wildcards, attempt: attempt ** 3 * 59 46 | log: 47 | log_dir + 'humann3_copy_cluster_input/all.log' 48 | shell: 49 | """ 50 | ionice {params.ionice} {params.exe} {input.hit_c} > {output.hit_c} 2> {log} 51 | ionice {params.ionice} {params.exe} {input.mem_c} > {output.mem_c} 2>> {log} 52 | ionice {params.ionice} {params.exe} {input.faa_c} > {output.faa_c} 2>> {log} 53 | """ 54 | 55 | -------------------------------------------------------------------------------- /bin/db_update/humann3/prepare_query/Snakefile: -------------------------------------------------------------------------------- 1 | rule humann3_which_to_query: 2 | """ 3 | Selecting which of the cluster reps need to be queried, 
given the existing query hits. 4 | """ 5 | input: 6 | mem_c = config['tmp_dir'] + 'humann3/clusters_membership.tsv', 7 | hit_c = config['tmp_dir'] + 'humann3/query_hits.txt', 8 | faa_c = config['tmp_dir'] + 'humann3/clusters_reps.faa' 9 | output: 10 | faa = temp(config['tmp_dir'] + 'humann3/clusters_reps_filt.faa') 11 | params: 12 | exe = config['pipeline']['script_folder'] + 'filter_cluster_reps.py', 13 | params = config['params']['humann3']['filter_existing'] 14 | resources: 15 | time = lambda wildcards, attempt: attempt ** 3 * 59, 16 | mem_gb_pt = lambda wildcards, attempt: attempt * 10 17 | log: 18 | log_dir + 'humann3_which_to_query/all.log' 19 | shell: 20 | """ 21 | {params.exe} {params.params} \ 22 | {input.mem_c} {input.hit_c} \ 23 | {input.faa_c} > {output.faa} 2> {log} 24 | """ 25 | 26 | rule humann_query_fasta_split: 27 | """ 28 | Splitting gene fasta for distributed searching 29 | """ 30 | input: 31 | faa = config['tmp_dir'] + 'humann3/clusters_reps_filt.faa' 32 | output: 33 | done = config['tmp_dir'] + 'humann3_search/split.done', 34 | splt = temp(expand(config['tmp_dir'] + \ 35 | 'humann3_search/stdin.part_{splitID}.fasta', 36 | splitID=config['params']['humann3']['splits'])) 37 | params: 38 | n_splits = config['params']['humann3']['batches'] 39 | threads: 40 | 4 41 | resources: 42 | time = lambda wildcards, attempt: int(round(attempt ** 4 * 59,0)), 43 | n = lambda wildcards, attempt, threads: threads, 44 | mem_gb_pt = lambda wildcards, attempt, threads: int(round(attempt ** 3 * 10.0 / threads,0)) 45 | conda: 46 | '../../../envs/genes.yaml' 47 | log: 48 | log_dir + 'mmseqs_search_batch_seqs/all.log' 49 | benchmark: 50 | benchmark_dir + 'mmseqs_search_batch_seqs/all.txt' 51 | shell: 52 | """ 53 | OUTDIR=`dirname {output.done} 2> {log}` 54 | seqkit shuffle -j {threads} {input.faa} 2>> {log} | \ 55 | seqkit split -j {threads} --by-part {params.n_splits} \ 56 | --out-dir $OUTDIR 2>> {log} 1>&2 57 | touch {output.done} 58 | """ 59 | 60 | -------------------------------------------------------------------------------- /bin/db_update/humann3/query_dmnd/Snakefile: -------------------------------------------------------------------------------- 1 | rule humann3_diamond_db_copy: 2 | """ 3 | Copying the user-provided DIAMOND database to the temp directory 4 | """ 5 | input: 6 | db = ancient(config['params']['humann3']['diamond']['db']) 7 | output: 8 | db = temp(config['tmp_dir'] + 'humann3/humann3_dmnd_db.dmnd') 9 | params: 10 | ionice = config['params']['ionice'] 11 | resources: 12 | time = lambda wildcards, attempt: attempt ** 3 * 59, 13 | mem_gb_pt = lambda wildcards, attempt: attempt * 4 14 | log: 15 | log_dir + 'humann3_diamond_db_copy/all.log' 16 | benchmark: 17 | benchmark_dir + 'humann3_diamond_db_copy/all.txt' 18 | shell: 19 | """ 20 | ionice {params.ionice} cp -f {input} {output} 2> {log} 1>&2 21 | """ 22 | 23 | def dmnd_start_mem(wildcards, attempt, threads=12): 24 | """ 25 | Estimating the baseline memory to use for jobs, given the diamond database size 26 | """ 27 | F = config['tmp_dir'] + 'humann3/humann3_dmnd_db.dmnd' 28 | prot_db_size = os.stat(F).st_size / 1e9 29 | mem = round(prot_db_size * 5 / threads + 1.499,0) 30 | mem = (attempt - 1) * 2 + mem 31 | return int(mem) 32 | 33 | rule humann3_diamond_pass1: 34 | """ 35 | Annotating genes via diamond search of UniRef DB 36 | """ 37 | input: 38 | faa = config['tmp_dir'] + 'humann3_search/stdin.part_{splitID}.fasta', 39 | dmnd_db = config['tmp_dir'] + 'humann3/humann3_dmnd_db.dmnd' 40 | output: 41 | hits = 
temp(config['tmp_dir'] + 'humann3/hits_pass1/{splitID}.txt'), 42 | unaln = temp(config['tmp_dir'] + 'humann3/unaln/{splitID}.faa') 43 | params: 44 | params = config['params']['humann3']['diamond']['run'], 45 | tmp_dir = config['tmp_dir'] + 'humann3_dmnd_TMP/{splitID}/' 46 | threads: 47 | 8 48 | resources: 49 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 12, 50 | n = lambda wildcards, attempt, threads: threads, 51 | mem_gb_pt = dmnd_start_mem 52 | conda: 53 | '../../../envs/humann3.yaml' 54 | log: 55 | log_dir + 'humann3_diamond_pass1/{splitID}.log' 56 | benchmark: 57 | benchmark_dir + 'humann3_diamond_pass1/{splitID}.txt' 58 | shell: 59 | """ 60 | TMPDIR="{params.tmp_dir}" 61 | mkdir -p $TMPDIR 2> {log} 62 | 63 | # diamond run 64 | diamond blastp {params.params} \ 65 | --tmpdir $TMPDIR --threads {threads} \ 66 | -q {input.faa} -d {input.dmnd_db} \ 67 | -o {output.hits} --un {output.unaln} \ 68 | --outfmt 6 qseqid sseqid evalue pident length slen \ 69 | 2>> {log} 1>&2 70 | """ 71 | 72 | rule humann3_diamond_pass2: 73 | """ 74 | Annotating genes via diamond search of UniRef DB (sensitive mode) 75 | """ 76 | input: 77 | faa = config['tmp_dir'] + 'humann3/unaln/{splitID}.faa', 78 | dmnd_db = config['tmp_dir'] + 'humann3/humann3_dmnd_db.dmnd' 79 | output: 80 | hits = temp(config['tmp_dir'] + 'humann3/hits_pass2/{splitID}.txt') 81 | params: 82 | params = config['params']['humann3']['diamond']['run'], 83 | tmp_dir = config['tmp_dir'] + 'humann3_dmnd_TMP/{splitID}/' 84 | threads: 85 | 8 86 | resources: 87 | time = lambda wildcards, attempt: attempt * 60 * 48, 88 | n = lambda wildcards, attempt, threads: threads, 89 | mem_gb_pt = dmnd_start_mem 90 | conda: 91 | '../../../envs/humann3.yaml' 92 | log: 93 | log_dir + 'humann3_diamond_pass2/{splitID}.log' 94 | benchmark: 95 | benchmark_dir + 'humann3_diamond_pass2/{splitID}.txt' 96 | shell: 97 | """ 98 | NSEQ=`seqkit seq -n {input.faa} | wc -l 2> {log}` 99 | if [[ "$NSEQ" -gt "0" ]]; then 100 | TMPDIR="{params.tmp_dir}" 101 | mkdir -p $TMPDIR 2>> {log} 102 | # diamond run 103 | diamond blastp --sensitive {params.params} \ 104 | --tmpdir $TMPDIR --threads {threads} \ 105 | -q {input.faa} -d {input.dmnd_db} -o {output.hits} \ 106 | --outfmt 6 qseqid sseqid evalue pident length slen \ 107 | 2>> {log} 1>&2 108 | else 109 | touch {output.hits} 2> {log} 1>&2 110 | echo "No unaligned sequences. Skipping DIAMOND" >> {log} 111 | fi 112 | """ 113 | 114 | localrules: humann3_diamond_merge 115 | rule humann3_diamond_merge: 116 | """ 117 | Merging the results of the 2 DIAMOND passes (all splits). 118 | Also including all original hits (prior to db update). 
119 | """ 120 | input: 121 | hits_orig = config['tmp_dir'] + 'humann3/query_hits.txt', 122 | hits1 = expand(config['tmp_dir'] + \ 123 | 'humann3/hits_pass1/{splitID}.txt', 124 | splitID=config['params']['humann3']['splits']), 125 | hits2 = expand(config['tmp_dir'] + \ 126 | 'humann3/hits_pass2/{splitID}.txt', 127 | splitID=config['params']['humann3']['splits']) 128 | output: 129 | hits = temp(config['tmp_dir'] + 'humann3/hits.txt') 130 | resources: 131 | time = lambda wildcards, attempt: attempt ** 3 * 59 132 | run: 133 | with open(output.hits, 'w') as outF: 134 | for F in [input.hits_orig] + input.hits1 + input.hits2: 135 | with open(F) as inF: 136 | for line in inF: 137 | outF.write(line) 138 | 139 | -------------------------------------------------------------------------------- /bin/db_update/kraken2/Snakefile: -------------------------------------------------------------------------------- 1 | #-- Kraken2 database update workflow --# 2 | rule kraken2_cp_to_tmp: 3 | """ 4 | Copying an existing kraken database to temp directory 5 | """ 6 | input: 7 | lib = ancient(config['kraken2_db']['library']), 8 | tax = ancient(config['kraken2_db']['taxonomy']) 9 | output: 10 | lib = temp(directory(config['tmp_dir'] + 'db_update/kraken2/library/')), 11 | tax = temp(directory(config['tmp_dir'] + 'db_update/kraken2/taxonomy/')), 12 | nodes = temp(config['tmp_dir'] + 'db_update/kraken2/taxonomy/nodes.dmp'), 13 | names = temp(config['tmp_dir'] + 'db_update/kraken2/taxonomy/names.dmp') 14 | params: 15 | ionice = config['params']['ionice'] 16 | resources: 17 | time = lambda wildcards, attempt: attempt ** 3 * 59 18 | log: 19 | log_dir + 'db_update/kraken2_cp_to_tmp/all.log' 20 | benchmark: 21 | benchmark_dir + 'db_update/kraken2_cp_to_tmp/all.txt' 22 | shell: 23 | """ 24 | rm -rf {output.lib} 2> {log} 25 | rm -rf {output.tax} 2>> {log} 26 | ionice {params.ionice} cp -rf {input.lib} {output.lib} 2>> {log} 27 | ionice {params.ionice} cp -rf {input.tax} {output.tax} 2>> {log} 28 | """ 29 | 30 | def kraken2_add_taxID_get_taxID(wildcards): 31 | """ 32 | Getting genome taxID from the user input table (genome metadata) 33 | """ 34 | taxID = config['samples'].loc[wildcards.sample, config['taxID_col']] 35 | try: 36 | taxID = taxID.astype(str) 37 | except AttributeError: 38 | pass 39 | return taxID 40 | 41 | rule kraken2_add_taxID: 42 | """ 43 | Adding a taxononmy ID to the header of each genome. 44 | Assuming the taxID is in the samples table. 45 | Writing edited genome to temp dir. 46 | 47 | Format: `kraken:taxid||` 48 | """ 49 | input: 50 | fasta = lambda wildcards: \ 51 | config['samples'].loc[wildcards.sample, config['fasta_file_path_col']] 52 | output: 53 | temp(config['tmp_dir'] + 'db_update/genomes/{sample}.fna') 54 | resources: 55 | time = lambda wildcards, attempt: attempt ** 2 * 59, 56 | mem_gb_pt = lambda wildcards, attempt: attempt * 6 57 | params: 58 | taxID = kraken2_add_taxID_get_taxID, 59 | exe = config['pipeline']['script_folder'] + 'kraken2_rename_genome.py' 60 | log: 61 | log_dir + 'db_update/kraken2_add_taxID/{sample}.log' 62 | benchmark: 63 | benchmark_dir + 'db_update/kraken2_add_taxID/{sample}.txt' 64 | shell: 65 | """ 66 | {params.exe} {input.fasta} {params.taxID} > {output} 2> {log} 67 | """ 68 | 69 | localrules: kraken2_build_add 70 | 71 | rule kraken2_build_add: 72 | """ 73 | Adding genome fasta files to the kraken database. 
74 | Using the --add-to-library flag 75 | """ 76 | input: 77 | lib = config['tmp_dir'] + 'db_update/kraken2/library/', 78 | tax = config['tmp_dir'] + 'db_update/kraken2/taxonomy/', 79 | fasta = config['tmp_dir'] + 'db_update/genomes/{sample}.fna', 80 | nodes = config['tmp_dir'] + 'db_update/kraken2/taxonomy/nodes.dmp', 81 | names = config['tmp_dir'] + 'db_update/kraken2/taxonomy/names.dmp' 82 | output: 83 | temp(config['tmp_dir'] + 'db_update/kraken2/added/{sample}.done') 84 | resources: 85 | time = lambda wildcards, attempt: attempt ** 2 * 59, 86 | mem_gb_pt = lambda wildcards, attempt: attempt * 6 87 | conda: 88 | '../../envs/kraken2.yaml' 89 | log: 90 | log_dir + 'db_update/kraken2_build_add/{sample}.log' 91 | benchmark: 92 | benchmark_dir + 'db_update/kraken2_build_add/{sample}.txt' 93 | shell: 94 | """ 95 | DB=`dirname {input.names}` 96 | DB=`dirname $DB` 97 | 98 | kraken2-build --db $DB --add-to-library {input.fasta} 2> {log} 1>&2 99 | touch {output} 2>> {log} 100 | """ 101 | 102 | rule kraken2_build: 103 | """ 104 | Building the kraken database 105 | """ 106 | input: 107 | expand(config['tmp_dir'] + 'db_update/kraken2/added/{sample}.done', 108 | sample = config['samples_unique']) 109 | output: 110 | hash = temp(config['tmp_dir'] + 'db_update/kraken2/hash.k2d'), 111 | opts = temp(config['tmp_dir'] + 'db_update/kraken2/opts.k2d'), 112 | map = temp(config['tmp_dir'] + 'db_update/kraken2/seqid2taxid.map'), 113 | taxo = temp(config['tmp_dir'] + 'db_update/kraken2/taxo.k2d') 114 | threads: 115 | 12 116 | resources: 117 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 24, 118 | n = lambda wildcards, attempt, threads: threads, 119 | mem_gb_pt = lambda wildcards, attempt: int(round(attempt * 18 + 4,0)) 120 | conda: 121 | '../../envs/kraken2.yaml' 122 | log: 123 | log_dir + 'db_update/kraken2_build/all.log' 124 | benchmark: 125 | benchmark_dir + 'db_update/kraken2_build/all.txt' 126 | shell: 127 | """ 128 | DB=`dirname {output.hash}` 129 | kraken2-build --build --threads {threads} --db $DB 2> {log} 1>&2 130 | """ 131 | 132 | rule kraken2_db_copy: 133 | """ 134 | Copying the Kraken database to the output directory 135 | """ 136 | input: 137 | lib = config['tmp_dir'] + 'db_update/kraken2/library/', 138 | tax = config['tmp_dir'] + 'db_update/kraken2/taxonomy/', 139 | nodes = config['tmp_dir'] + 'db_update/kraken2/taxonomy/nodes.dmp', 140 | names = config['tmp_dir'] + 'db_update/kraken2/taxonomy/names.dmp', 141 | hash = config['tmp_dir'] + 'db_update/kraken2/hash.k2d', 142 | opts = config['tmp_dir'] + 'db_update/kraken2/opts.k2d', 143 | taxo = config['tmp_dir'] + 'db_update/kraken2/taxo.k2d', 144 | map = config['tmp_dir'] + 'db_update/kraken2/seqid2taxid.map' 145 | output: 146 | hash = kraken2_dir + 'hash.k2d', 147 | opts = kraken2_dir + 'opts.k2d', 148 | taxo = kraken2_dir + 'taxo.k2d', 149 | map = kraken2_dir + 'seqid2taxid.map' 150 | params: 151 | keep = config['keep_intermediate'], 152 | ionice = config['params']['ionice'] 153 | resources: 154 | time = lambda wildcards, attempt: attempt ** 3 * 59 155 | conda: 156 | '../../envs/kraken2.yaml' 157 | log: 158 | log_dir + 'db_update/kraken2_db_copy/all.log' 159 | benchmark: 160 | benchmark_dir + 'db_update/kraken2_db_copy/all.txt' 161 | shell: 162 | """ 163 | if [ "{params.keep}" == "True" ]; then 164 | echo "# copying entire kraken db" > {log} 165 | DIR1=`dirname {input.hash}` 166 | DIR2=`dirname {output.hash}` 167 | rm -rf $DIR2 2>> {log} 1>&2 168 | ionice {params.ionice} cp -rf $DIR1 $DIR2 2>> {log} 1>&2 169 | else 170 | echo "# copying 
just built kraken index files" > {log} 171 | ionice {params.ionice} cp -f {input.hash} {output.hash} 2>> {log} 1>&2 172 | ionice {params.ionice} cp -f {input.opts} {output.opts} 2>> {log} 1>&2 173 | ionice {params.ionice} cp -f {input.map} {output.map} 2>> {log} 1>&2 174 | ionice {params.ionice} cp -f {input.taxo} {output.taxo} 2>> {log} 1>&2 175 | fi 176 | """ 177 | 178 | -------------------------------------------------------------------------------- /bin/dirs: -------------------------------------------------------------------------------- 1 | genes_dir = os.path.join(config['output_dir'], 'genes/') 2 | kraken2_dir = os.path.join(config['output_dir'], 'kraken2/') 3 | bracken_dir = os.path.join(config['output_dir'], 'bracken/') 4 | humann3_dir = os.path.join(config['output_dir'], 'humann3/') 5 | 6 | log_dir = os.path.join(config['output_dir'], 'logs/') 7 | benchmark_dir = os.path.join(config['output_dir'], 'benchmarks/') 8 | scripts_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'scripts/') 9 | -------------------------------------------------------------------------------- /bin/envs/genes.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - pigz 6 | - python=3 7 | - numpy 8 | - bioconda::seqkit>=0.16.1 9 | - bioconda::vsearch 10 | - bioconda::prodigal 11 | - bioconda::mmseqs2 -------------------------------------------------------------------------------- /bin/envs/humann2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - pigz 5 | - bioconda::bowtie2 6 | - bioconda::vsearch 7 | - bioconda::prodigal 8 | - bioconda::diamond=0.8.36 9 | -------------------------------------------------------------------------------- /bin/envs/humann3.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - biobakery 4 | - conda-forge 5 | dependencies: 6 | - pigz 7 | - bioconda::seqkit 8 | - bioconda::vsearch 9 | - bioconda::prodigal 10 | - bioconda::diamond=0.9.24 11 | - biobakery::humann 12 | -------------------------------------------------------------------------------- /bin/envs/kraken2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | dependencies: 5 | - libiconv 6 | - bioconda::kraken2 7 | - bioconda::bracken 8 | 9 | -------------------------------------------------------------------------------- /bin/envs/krakenuniq.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | dependencies: 5 | - bioconda::krakenuniq=0.6 6 | 7 | -------------------------------------------------------------------------------- /bin/scripts/add_user_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import sys,os 4 | import re 5 | import gzip 6 | import uuid 7 | import argparse 8 | import logging 9 | 10 | desc = 'Adding user-provided sequences' 11 | epi = """DESCRIPTION: 12 | """ 13 | parser = argparse.ArgumentParser(description=desc, 14 | epilog=epi, 15 | formatter_class=argparse.RawTextHelpFormatter) 16 | parser.add_argument('fasta', metavar='fasta', type=str, nargs='+', 17 | help='Fasta files (nuc, then prot)') 18 | parser.add_argument('--in-fna', type=str, 
default='genes.fna', 19 | help='Nucleotide input') 20 | parser.add_argument('--in-faa', type=str, default='genes.faa', 21 | help='Amino acid input') 22 | parser.add_argument('--in-txt', type=str, default='genes.txt', 23 | help='Names index input') 24 | parser.add_argument('--out-fna', type=str, default='wUser_genes.fna', 25 | help='Nucleotide output') 26 | parser.add_argument('--out-faa', type=str, default='wUser_genes.faa', 27 | help='Amino acid output') 28 | parser.add_argument('--out-txt', type=str, default='wUser_genes.txt', 29 | help='Names index output') 30 | parser.add_argument('--version', action='version', version='0.0.1') 31 | 32 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 33 | 34 | def read_fasta(infile): 35 | seqs = {} 36 | seq_name = '' 37 | with open(infile) as inF: 38 | for line in inF: 39 | line = line.rstrip() 40 | if line.startswith('>'): 41 | seq_name = line.lstrip('>') 42 | else: 43 | try: 44 | seqs[seq_name] += line 45 | except KeyError: 46 | seqs[seq_name] = line 47 | return seqs 48 | 49 | def make_index(input_fna, input_faa): 50 | # loading fasta files 51 | if input_fna.lower() != 'skip': 52 | fna = read_fasta(input_fna) 53 | else: 54 | fna = {} 55 | if input_faa.lower() != 'skip': 56 | faa = read_fasta(input_faa) 57 | else: 58 | faa = {} 59 | # intersection of names (genes present in both fasta files) 60 | seq_names = set(fna.keys()) & set(faa.keys()) 61 | seq_idx = {x:str(uuid.uuid4()) for x in seq_names} 62 | # return 63 | return fna, faa, seq_idx 64 | 65 | def seq_cat(seqs, in_fasta, out_fasta): 66 | with open(in_fasta) as inF, open(out_fasta, 'w') as outF: 67 | for line in inF: 68 | outF.write(line) 69 | for seqid,seq in seqs.items(): 70 | outF.write('>' + seqid + '\n' + seq + '\n') 71 | logging.info('File written: {}'.format(out_fasta)) 72 | 73 | def names_cat(seq_idx, in_txt, out_txt): 74 | with open(in_txt) as inF, open(out_txt, 'w') as outF: 75 | for line in inF: 76 | outF.write(line) 77 | for uuid,seqid in seq_idx.items(): 78 | outF.write(uuid + '\t' + seqid + '\n') 79 | logging.info('File written: {}'.format(out_txt)) 80 | 81 | def main(args): 82 | # getting overlap of user-provided nuc & prot gene names 83 | fna,faa,seq_idx = make_index(args.fasta[0], args.fasta[1]) 84 | 85 | # combining sequences 86 | seq_cat(fna, args.in_fna, args.out_fna) 87 | seq_cat(faa, args.in_faa, args.out_faa) 88 | names_cat(seq_idx, args.in_txt, args.out_txt) 89 | 90 | if __name__ == '__main__': 91 | args = parser.parse_args() 92 | main(args) 93 | -------------------------------------------------------------------------------- /bin/scripts/annotate_genes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import sys,os 4 | import re 5 | import gzip 6 | import logging 7 | import argparse 8 | from pprint import pprint 9 | 10 | desc = 'Annotate a genome based on mapping to UniRef via diamond' 11 | epi = """DESCRIPTION: 12 | Adding UniRef IDs and taxonomy to genes from a particular genome. 
13 | """ 14 | parser = argparse.ArgumentParser(description=desc, 15 | epilog=epi, 16 | formatter_class=argparse.RawTextHelpFormatter) 17 | parser.add_argument('diamond_hits', metavar='diamond_hits', type=str, 18 | help='tab-delim table of diamond hits') 19 | parser.add_argument('genes_fasta_nuc', metavar='genes_fasta_nuc', type=str, 20 | help='Genes in nucletide fasta format') 21 | parser.add_argument('genes_fasta_AA', metavar='genes_fasta_AA', type=str, 22 | help='Genes in amino acid fasta format') 23 | parser.add_argument('taxonomy', metavar='taxonomy', type=str, 24 | help='Taxonomy of the genome') 25 | parser.add_argument('taxID', metavar='taxID', type=str, 26 | help='NCBI TaxID of the genome') 27 | parser.add_argument('--columns', type=str, default='qseqid,sseqid,pident,length,qstart,qend,qlen,sstart,send,slen,evalue', 28 | help='Diamond output columns (default: %(default)s)') 29 | parser.add_argument('--outdir', type=str, default='genes_annotated', 30 | help='Output directory (default: %(default)s)') 31 | parser.add_argument('--dmnd-db', type=str, default='/ebio/abt3_projects2/databases_no-backup/humann2/uniref50/uniref50_annotated.1.1.dmnd', 32 | help='UniRef dmnd db for annotating genes (default: %(default)s)') 33 | parser.add_argument('--percid', type=float, default=50.0, 34 | help='Percent sequence ID cutoff for calling a hit (default: %(default)s)') 35 | parser.add_argument('--overlap', type=float, default=80.0, 36 | help='Perc. overlap cutoff (longest sequence) for calling a hit (default: %(default)s)') 37 | parser.add_argument('--skip', action='store_true', default=False, 38 | help='Skip diamond-based annotation if the diamond hits file exists (default: %(default)s)') 39 | parser.add_argument('--gzip', action='store_true', default=False, 40 | help='gzip output (default: %(default)s)') 41 | parser.add_argument('--threads', type=int, default=1, 42 | help='Threads used for diamond (default: %(default)s)') 43 | parser.add_argument('--version', action='version', version='0.0.1') 44 | 45 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 46 | 47 | 48 | def make_best_hit_index(dmnd_hit_file, outfmt_cols): 49 | """Making index of best diamond hits for each query 50 | """ 51 | # column index 52 | column_idx = {x:i for i,x in enumerate(outfmt_cols)} 53 | # streaming hits 54 | hits = {} 55 | with open(dmnd_hit_file) as inF: 56 | longest_seq_len = None 57 | for line in inF: 58 | line = line.rstrip().split('\t') 59 | # Perc ID >= cutoff? 60 | try: 61 | pident = float(line[column_idx['pident']]) 62 | except KeyError: 63 | raise KeyError('Cannot find "pident" column') 64 | if pident < args.percid: 65 | continue 66 | # overlap >= cutoff? 67 | ## longest sequence? 68 | try: 69 | query_len = float(line[column_idx['qlen']]) 70 | except KeyError: 71 | raise KeyError('Cannot find "qlen" column') 72 | try: 73 | subject_len = float(line[column_idx['slen']]) 74 | except KeyError: 75 | raise KeyError('Cannot find "slen" column') 76 | if query_len >= subject_len: 77 | longest_seq_len = query_len 78 | else: 79 | longest_seq_len = subject_len 80 | ## overlap vs longest sequence 81 | try: 82 | aln_len = float(line[column_idx['length']]) 83 | except KeyError: 84 | raise KeyError('Cannot find "length" column') 85 | perc_overlap = aln_len / longest_seq_len * 100.0 86 | if perc_overlap < args.overlap: 87 | continue 88 | # better than current best hit for query? 
89 | try: 90 | qseqid = line[column_idx['qseqid']] 91 | except KeyError: 92 | raise KeyError('Cannot find "qseqid" column') 93 | try: 94 | sseqid = line[column_idx['sseqid']] 95 | except KeyError: 96 | raise KeyError('Cannot find "sseqid" column') 97 | try: 98 | best_hit = hits[qseqid] 99 | except KeyError: 100 | best_hit = None 101 | if best_hit is None or (pident >= best_hit[1] and perc_overlap >= best_hit[2]): 102 | hits[qseqid] = [sseqid, pident, perc_overlap] 103 | return hits 104 | 105 | def rename_seqs(best_hits, fasta_file, taxonomy, outfile, gzip_output=False): 106 | """Renaming sequences based on uniref hits. 107 | Using naming format: `gene_family|gene_length|taxonomy` 108 | Taxonomy format: `g__{genus};s__{species}_taxID{taxID}` 109 | (see https://bitbucket.org/biobakery/humann2/wiki/Home). 110 | """ 111 | seq_name = None 112 | seq = '' 113 | annot_cnt = 0 114 | annot_skip_cnt = 0 115 | if gzip_output == True: 116 | _open = lambda x: gzip.open(x, 'ab') 117 | outfile += '.gz' 118 | else: 119 | _open = lambda x: open(x, 'a') 120 | 121 | with open(fasta_file) as inF, _open(outfile) as outF: 122 | for line in inF: 123 | if line.startswith('>'): 124 | # previous sequence 125 | if seq_name is not None and seq != '': 126 | seq = seq.rstrip().strip('*') 127 | x = '\n'.join(['>' + seq_name, seq]) + '\n' 128 | if gzip_output == True: 129 | x = x.encode() 130 | outF.write(x) 131 | annot_cnt += 1 132 | else: 133 | annot_skip_cnt += 1 134 | seq = '' 135 | # hit for sequence? 136 | query = line.rstrip().lstrip('>').split(' ')[0] 137 | try: 138 | best_hit = best_hits[query] 139 | except KeyError: 140 | best_hit = None 141 | # renaming 142 | if best_hit is None: 143 | seq_name = None 144 | else: 145 | seq_name = '|'.join([best_hit[0], taxonomy]) 146 | else: 147 | seq += line.rstrip() 148 | # final sequence 149 | if seq_name is not None: 150 | seq = seq.rstrip().strip('*') 151 | x = '\n'.join(['>' + seq_name, seq]) + '\n' 152 | if gzip_output == True: 153 | x = x.encode() 154 | outF.write(x) 155 | annot_cnt += 1 156 | else: 157 | annot_skip_cnt += 1 158 | 159 | logging.info('Number of genes with an annotation: {}'.format(annot_cnt)) 160 | logging.info('Number of genes skipped due to no annotation: {}'.format(annot_skip_cnt)) 161 | logging.info('File written: {}'.format(outfile)) 162 | 163 | def format_taxonomy(tax, taxID): 164 | """ 165 | Formatting taxonomy string 166 | """ 167 | logging.info('Taxonomy string provided {}'.format(tax)) 168 | logging.info('TaxID provided {}'.format(taxID)) 169 | 170 | try: 171 | taxID = int(float(taxID.strip())) 172 | except ValueError: 173 | msg = 'ERROR: taxID "{}" is not an integer!' 
174 | raise ValueError(msg.format(taxID)) 175 | tax = re.sub('[^A-Za-z0-9-_;]+', '_', tax).split(';') 176 | 177 | if not len(tax) == 7: 178 | species = 's__unclassified' 179 | else: 180 | species = tax[6] 181 | if not len(tax) >= 6: 182 | genus = 'g__unclassified' 183 | else: 184 | genus = tax[5] 185 | 186 | if genus.startswith('G__'): 187 | genus = genus[3:] 188 | if not genus.startswith('g__'): 189 | genus = 'g__' + genus 190 | 191 | if species.startswith('S__'): 192 | species = species[3:] 193 | if not species.startswith('s__'): 194 | species = 's__' + species 195 | 196 | if genus == 'g__': 197 | genus = 'g__unclassified' 198 | if species == 's__': 199 | species = 's__unclassified' 200 | 201 | tax = '.'.join([genus, species]) 202 | tax += '__taxID{}'.format(taxID) 203 | logging.info('Converted taxonomy string to {}'.format(tax)) 204 | return tax 205 | 206 | def main(args): 207 | # formatting taxonomy 208 | args.taxonomy = format_taxonomy(args.taxonomy, args.taxID) 209 | 210 | # filtering diamond hits 211 | logging.info('Finding best hit for each gene') 212 | outfmt_cols = args.columns.split(',') 213 | best_hits = make_best_hit_index(args.diamond_hits, outfmt_cols) 214 | #pprint(hits) 215 | 216 | logging.info('Renaming genes') 217 | # nuc 218 | outfile = os.path.join(args.outdir, 'annot.fna') 219 | rename_seqs(best_hits, args.genes_fasta_nuc, 220 | args.taxonomy, outfile=outfile, 221 | gzip_output=args.gzip) 222 | # AA 223 | outfile = os.path.join(args.outdir, 'annot.faa') 224 | rename_seqs(best_hits, args.genes_fasta_AA, 225 | args.taxonomy, outfile=outfile, 226 | gzip_output=args.gzip) 227 | 228 | 229 | if __name__ == '__main__': 230 | args = parser.parse_args() 231 | main(args) 232 | -------------------------------------------------------------------------------- /bin/scripts/bracken-build.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ##################################################################### 4 | #bracken_build.sh creates the kmer distribution file for a single Kraken database 5 | #Copyright (C) 2016-2017 Jennifer Lu, jlu26@jhmi.edu 6 | # 7 | #This file is part of Bracken. 8 | # 9 | #Bracken is free software; you can redistribute it and/or modify 10 | #it under the terms of the GNU General Public License as published by 11 | #the Free Software Foundation; either version 3 of the license, or 12 | #(at your option) any later version. 13 | # 14 | #This program is distributed in the hope that it will be useful, 15 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | #GNU General Public License for more details 18 | # 19 | #You should have received a copy of the GNU General Public License 20 | #along with this program; if not, see . 21 | # 22 | ##################################################################### 23 | 24 | set -eu 25 | THREADS=1 26 | KMER_LEN=35 27 | READ_LEN=100 28 | DATABASE="" 29 | KRAKEN="kraken" 30 | KINSTALL="" 31 | 32 | VERSION="2.2" 33 | while getopts "k:l:d:x:t:" OPTION 34 | do 35 | case $OPTION in 36 | t) 37 | THREADS=$OPTARG 38 | ;; 39 | k) 40 | KMER_LEN=$OPTARG 41 | ;; 42 | l) 43 | READ_LEN=$OPTARG 44 | ;; 45 | d) 46 | DATABASE=$OPTARG 47 | ;; 48 | x) 49 | KINSTALL=$OPTARG 50 | ;; 51 | \?) 
52 | echo "Usage: bracken_build -k KMER_LEN -l READ_LEN -d MY_DB -x K_INSTALLATION -t THREADS" 53 | echo " KMER_LEN kmer length used to build the kraken database (default: 35)" 54 | echo " THREADS the number of threads to use when running kraken classification and the bracken scripts" 55 | echo " READ_LEN read length to get all classifications for (default: 100)" 56 | echo " MY_DB location of Kraken database" 57 | echo " K_INSTALLATION location of the installed kraken/kraken-build scripts (default assumes scripts can be run from the user path)" 58 | echo 59 | echo "**Note that this script will try to use kraken2 as default. If kraken2 is not installed, kraken will be used instead" 60 | exit 61 | ;; 62 | esac 63 | done 64 | #Output command line options selected 65 | echo " >> Selected Options:" 66 | echo " kmer length = $KMER_LEN" 67 | echo " read length = $READ_LEN" 68 | echo " database = $DATABASE" 69 | echo " threads = $THREADS" 70 | if [[ "$DATABASE" =~ "/"$ ]] 71 | then 72 | DATABASE=${DATABASE:0:-1} 73 | fi 74 | #Check for Kraken version 75 | #echo ${KINSTALL}kraken2 76 | if [ "$KINSTALL" == "" ]; then 77 | if hash kraken2 &> /dev/null; then 78 | KRAKEN="kraken2" 79 | elif hash kraken &> /dev/null; then 80 | KRAKEN="kraken" 81 | else 82 | echo "User must first install kraken or kraken2 and/or specify installation directory of kraken/kraken2 using -x flag" 83 | exit 84 | fi 85 | else 86 | if [ -f ${KINSTALL}kraken2 ]; then 87 | KRAKEN="kraken2" 88 | elif [ -f ${KINSTALL}kraken ]; then 89 | KRAKEN="kraken" 90 | else 91 | echo "User must first install kraken or kraken2 and/or specify installation directory of kraken/kraken2 using -x flag" 92 | exit 93 | fi 94 | fi 95 | #Check if Kraken database exists 96 | echo " >> Checking for Valid Options..." 97 | if [ -d $DATABASE ] 98 | then 99 | #Directory exists, check for taxonomy/nodes.dmp, library/ and for hash.k2d file 100 | if [ ! -d $DATABASE/library ] 101 | then 102 | echo " ERROR: Database library $DATABASE/library does not exist" 103 | exit 104 | elif [ ! -d $DATABASE/taxonomy ] 105 | then 106 | echo " ERROR : Database taxonomy $DATABASE/taxonomy does not exist" 107 | exit 108 | elif [ ! -f $DATABASE/taxonomy/nodes.dmp ] 109 | then 110 | echo " ERROR: Database taxonomy $DATABASE/taxonomy/nodes.dmp does not exist" 111 | exit 112 | elif [ $KRAKEN == "kraken2" ] && [ ! -f $DATABASE/hash.k2d ] 113 | then 114 | echo " ERROR: Kraken2 Database incomplete: $DATABASE/hash.k2d does not exist" 115 | exit 116 | elif [ $KRAKEN == "kraken" ] && [ ! -f $DATABASE/database.kdb ] 117 | then 118 | echo " ERROR: Kraken Database incomplete: $DATABASE/database.kdb does not exist" 119 | exit 120 | fi 121 | else 122 | echo " ERROR: Kraken database $DATABASE" does not exist 123 | exit 124 | fi 125 | #See if database.kraken exists, if not, create 126 | echo " >> Creating database.kraken [if not found]" 127 | if [ -f $DATABASE/database.kraken ] 128 | then 129 | #database.kraken exists, skip 130 | echo " database.kraken exists, skipping creation...." 
131 | elif [ $KRAKEN == "kraken2" ] 132 | then 133 | #database.kraken not found, must create 134 | echo " >> ${KINSTALL}kraken2 --db $DATABASE --threads ${THREADS} <( find -L $DATABASE/library \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {} + ) > $DATABASE/database.kraken" 135 | 136 | ${KINSTALL}kraken2 --db $DATABASE --threads ${THREADS} <( find -L $DATABASE/library \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {} + ) > $DATABASE/database.kraken 137 | else 138 | #database.kraken not found, must create 139 | echo " >> ${KINSTALL}kraken --db $DATABASE --threads ${THREADS} <( find -L $DATABASE/library \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {} + ) > $DATABASE/database.kraken" 140 | ${KINSTALL}kraken --db $DATABASE --threads ${THREADS} <( find -L $DATABASE/library \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {} + ) > $DATABASE/database.kraken 141 | fi 142 | echo " Finished creating database.kraken [in DB folder]" 143 | #Generate databaseXmers.kmer_distrib 144 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 145 | #cd $DIR 146 | echo " >> Creating database${READ_LEN}mers.kmer_distrib " 147 | if [ -f $DIR/src/kmer2read_distr ]; then 148 | $DIR/src/kmer2read_distr --seqid2taxid $DATABASE/seqid2taxid.map --taxonomy $DATABASE/taxonomy/ --kraken $DATABASE/database.kraken --output $DATABASE/database${READ_LEN}mers.kraken -k ${KMER_LEN} -l ${READ_LEN} -t ${THREADS} 149 | python $DIR/src/generate_kmer_distribution.py -i $DATABASE/database${READ_LEN}mers.kraken -o $DATABASE/database${READ_LEN}mers.kmer_distrib 150 | # check if kmer2read_distr is in PATH 151 | elif [ -f $(command -v kmer2read_distr) ]; then 152 | kmer2read_distr --seqid2taxid $DATABASE/seqid2taxid.map --taxonomy $DATABASE/taxonomy/ --kraken $DATABASE/database.kraken --output $DATABASE/database${READ_LEN}mers.kraken -k ${KMER_LEN} -l ${READ_LEN} -t ${THREADS} 153 | if [ -f $(command -v generate_kmer_distribution.py) ]; then 154 | python $(command -v generate_kmer_distribution.py) -i $DATABASE/database${READ_LEN}mers.kraken -o $DATABASE/database${READ_LEN}mers.kmer_distrib 155 | else 156 | echo " ERROR: generate_kmer_distribution.py script not found. " 157 | echo " Run 'sh install_bracken.sh' to generate the kmer2read_distr script." 158 | echo " Alternatively, cd to BRACKEN_FOLDER/src/ and run 'make'" 159 | exit 160 | fi 161 | else 162 | echo " ERROR: kmer2read_distr program not found. " 163 | echo " Run 'sh install_bracken.sh' to generate the kmer2read_distr script." 164 | echo " Alternatively, cd to BRACKEN_FOLDER/src/ and run 'make'" 165 | exit 166 | fi 167 | echo " Finished creating database${READ_LEN}mers.kraken and database${READ_LEN}mers.kmer_distrib [in DB folder]" 168 | echo " *NOTE: to create read distribution files for multiple read lengths, " 169 | echo " rerun this script specifying the same database but a different read length" 170 | echo 171 | echo "Bracken build complete." 172 | -------------------------------------------------------------------------------- /bin/scripts/cat_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import gzip 6 | import argparse 7 | import logging 8 | 9 | desc = 'Simple cat of files; files can be gzip\'d' 10 | epi = """DESCRIPTION: 11 | Simple concatentation of files that allows for a mixture 12 | of gzip'd and uncompressed files. 
13 | Output written to STDOUT 14 | """ 15 | parser = argparse.ArgumentParser(description=desc, 16 | epilog=epi, 17 | formatter_class=argparse.RawTextHelpFormatter) 18 | parser.add_argument('input', metavar='input', type=str, nargs='+', 19 | help='Input file(s)') 20 | parser.add_argument('--header', action='store_true', default=False, 21 | help='Input files have headers, so just keep first (default: %(default)s)') 22 | parser.add_argument('--version', action='version', version='0.0.1') 23 | 24 | 25 | def main(args): 26 | for i,infile in enumerate(args.input): 27 | if infile.endswith('.gz'): 28 | _open = lambda x: gzip.open(x, 'rb') 29 | else: 30 | _open = lambda x: open(x, 'r') 31 | with _open(infile) as inF: 32 | for ii,line in enumerate(inF): 33 | # skipping header (except for first table) 34 | if i > 0 and ii == 0 and args.header is True: 35 | continue 36 | # writing line 37 | if infile.endswith('.gz'): 38 | line = line.decode('utf-8') 39 | print(line, end='') 40 | 41 | if __name__ == '__main__': 42 | args = parser.parse_args() 43 | main(args) 44 | -------------------------------------------------------------------------------- /bin/scripts/check_gene_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import gzip 6 | import argparse 7 | import logging 8 | 9 | desc = 'Simple check of gene input' 10 | epi = """DESCRIPTION: 11 | Check of overlapping UUIDs for fasta files & metadata table. 12 | Also, check that required data is provided in the metadata table. 13 | """ 14 | parser = argparse.ArgumentParser(description=desc, 15 | epilog=epi, 16 | formatter_class=argparse.RawTextHelpFormatter) 17 | parser.add_argument('prot_fasta', metavar='prot_fasta', type=str, 18 | help='Protein sequence fasta') 19 | parser.add_argument('metadata', metavar='metadata', type=str, 20 | help='gene metadata') 21 | parser.add_argument('-n', '--nuc-fasta', type=str, default=None, 22 | help='Nucleotide sequence fasta (default: %(default)s)') 23 | parser.add_argument('--version', action='version', version='0.0.1') 24 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 25 | 26 | def get_open(infile): 27 | if infile.endswith('.gz'): 28 | _open = lambda x: gzip.open(x, 'rb') 29 | else: 30 | _open = lambda x: open(x, 'r') 31 | return _open 32 | 33 | def read_fasta(infile): 34 | logging.info('Reading file: {}'.format(infile)) 35 | _open = get_open(infile) 36 | seqs = [] 37 | with _open(infile) as inF: 38 | for line in inF: 39 | if infile.endswith('.gz'): 40 | line = line.decode('utf-8') 41 | if line.startswith('>'): 42 | seqs.append(line.lstrip('>').rstrip()) 43 | return set(seqs) 44 | 45 | def read_meta(infile): 46 | logging.info('Reading file: {}'.format(infile)) 47 | _open = get_open(infile) 48 | meta = {} 49 | header = {} 50 | req_cols = ['seq_uuid', 'seq_orig_name', 'domain', 'phylum', 51 | 'class', 'order', 'family', 'genus', 'species', 52 | 'taxid', 'genome_name', 'genome_length_bp'] 53 | non_empty_cols = ['seq_uuid', 'seq_orig_name', 'genus', 'species'] 54 | entry_cnt = 0 55 | with _open(infile) as inF: 56 | for i,line in enumerate(inF): 57 | if infile.endswith('.gz'): 58 | line = line.decode('utf-8') 59 | line = line.rstrip().split('\t') 60 | if line == '': 61 | continue 62 | # header 63 | if i == 0: 64 | header = {x:ii for ii,x in enumerate(line)} 65 | missing = [] 66 | for x in req_cols: 67 | try: 68 | _ = header[x] 69 | except KeyError: 70 | 
missing.append(x) 71 | if len(missing) > 0: 72 | msg = 'Missing columns in metadata table: {}' 73 | raise ValueError(msg.format(','.join(missing))) 74 | else: 75 | logging.info('The metadata table has all required columns') 76 | continue 77 | # body 78 | entry_cnt += 1 79 | for col in non_empty_cols: 80 | if line[header[col]] == '': 81 | msg = 'Line {}: Column "{}" cannot be empty' 82 | raise ValueError(msg.format(i+1, col)) 83 | seq_uuid = line[header['seq_uuid']] 84 | seq_orig_name = line[header['seq_orig_name']] 85 | try: 86 | _ = meta[seq_uuid] 87 | raise ValueError('Value duplicated: {}'.format(seq_uuid)) 88 | except KeyError: 89 | pass 90 | meta[seq_uuid] = seq_orig_name 91 | 92 | return meta 93 | 94 | def main(args): 95 | # aa fasta 96 | aa_seqs = read_fasta(args.prot_fasta) 97 | # nuc fasta 98 | if args.nuc_fasta is not None: 99 | nuc_seqs = read_fasta(args.nuc_fasta) 100 | if not len(aa_seqs) == len(nuc_seqs): 101 | msg = 'WARNING: prot. & nuc. seqs differ in length!' 102 | logging.warning(msg) 103 | # compare fasta files 104 | logging.info('Comparing fasta files...') 105 | ## just in prot. 106 | just_aa = list(aa_seqs - nuc_seqs) 107 | if len(just_aa) > 0: 108 | just_aa = '\n '.join(just_aa) 109 | print('Genes just in the prot. fasta:\n {}'.format(just_aa)) 110 | ## just in nuc. 111 | just_nuc = list(nuc_seqs - aa_seqs) 112 | if len(just_nuc) > 0: 113 | just_nuc = '\n '.join(just_nuc) 114 | print('Genes just in the nuc. fasta:\n {}'.format(just_nuc)) 115 | raise ValueError('Exiting due to mismatches') 116 | # metadata 117 | meta = read_meta(args.metadata) 118 | ## comparing to fasta 119 | if len(meta.keys()) != len(aa_seqs): 120 | msg = 'WARNING: No. of metadata entries does not match no. of prot. seqs' 121 | logging.warning(msg) 122 | ### just in protein 123 | just_aa = list(aa_seqs - set(meta.keys())) 124 | if len(just_aa) > 0: 125 | just_aa = '\n '.join(just_aa) 126 | print('Genes just in the prot. fasta:\n {}'.format(just_aa)) 127 | just_txt = list(set(meta.keys()) - aa_seqs) 128 | if len(just_txt) > 0: 129 | msg = 'Genes just in the metadata table:\n {}' 130 | print(msg.format('\n '.join(just_txt))) 131 | just_txt = '\n '.join([meta[x] for x in just_txt]) 132 | print('Genes just in the metadata table (gene_orig_name):\n {}'.format(just_txt)) 133 | raise ValueError('Exiting due to mismatches') 134 | 135 | 136 | if __name__ == '__main__': 137 | args = parser.parse_args() 138 | main(args) 139 | -------------------------------------------------------------------------------- /bin/scripts/download_taxonomy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2013-2019, Derrick Wood 4 | # 5 | # This file is part of the Kraken 2 taxonomic sequence classification system. 6 | 7 | # Download NCBI taxonomy information for Kraken 2. 8 | # Designed to be called by kraken2-build 9 | 10 | set -u # Protect against uninitialized vars. 11 | set -e # Stop on error 12 | 13 | TAXONOMY_DIR="$KRAKEN2_DB_NAME/taxonomy" 14 | NCBI_SERVER="ftp.ncbi.nlm.nih.gov" 15 | RSYNC_SERVER="rsync://$NCBI_SERVER" 16 | FTP_SERVER="ftp://$NCBI_SERVER" 17 | 18 | mkdir -p "$TAXONOMY_DIR" 19 | cd "$TAXONOMY_DIR" 20 | 21 | function download_file() { 22 | file="$1" 23 | if [ -n "$KRAKEN2_USE_FTP" ] 24 | then 25 | wget -q ${FTP_SERVER}${file} 26 | else 27 | rsync --no-motd ${RSYNC_SERVER}${file} . 28 | fi 29 | } 30 | 31 | if [ ! 
-e "accmap.dlflag" ] && [ -z "$KRAKEN2_SKIP_MAPS" ] 32 | then 33 | if [ -z "$KRAKEN2_PROTEIN_DB" ] 34 | then 35 | for subsection in gb wgs 36 | do 37 | 1>&2 echo -n "Downloading nucleotide ${subsection} accession to taxon map..." 38 | download_file "/pub/taxonomy/accession2taxid/nucl_${subsection}.accession2taxid.gz" 39 | 1>&2 echo " done." 40 | done 41 | else 42 | 1>&2 echo -n "Downloading protein accession to taxon map..." 43 | download_file "/pub/taxonomy/accession2taxid/prot.accession2taxid.gz" 44 | 1>&2 echo " done." 45 | fi 46 | touch accmap.dlflag 47 | 1>&2 echo "Downloaded accession to taxon map(s)" 48 | fi 49 | 50 | if [ ! -e "taxdump.dlflag" ] 51 | then 52 | 1>&2 echo -n "Downloading taxonomy tree data..." 53 | download_file "/pub/taxonomy/taxdump.tar.gz" 54 | touch taxdump.dlflag 55 | 1>&2 echo " done." 56 | fi 57 | 58 | if ls | grep -q 'accession2taxid\.gz$' 59 | then 60 | 1>&2 echo -n "Uncompressing taxonomy data..." 61 | gunzip *accession2taxid.gz 62 | 1>&2 echo " done." 63 | fi 64 | 65 | if [ ! -e "taxdump.untarflag" ] 66 | then 67 | 1>&2 echo -n "Untarring taxonomy tree data..." 68 | tar zxf taxdump.tar.gz 69 | touch taxdump.untarflag 70 | 1>&2 echo " done." 71 | fi 72 | -------------------------------------------------------------------------------- /bin/scripts/filter_cluster_reps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import gzip 6 | import argparse 7 | import logging 8 | 9 | desc = 'Filtering the cluster reps to just those that lack current annotations' 10 | epi = """DESCRIPTION: 11 | Filtering cluster reps to just those that are lacking any 12 | annotation data. 13 | Output (filtered fasta) written to STDOUT 14 | """ 15 | parser = argparse.ArgumentParser(description=desc, 16 | epilog=epi, 17 | formatter_class=argparse.RawTextHelpFormatter) 18 | parser.add_argument('cluster_membership', metavar='cluster_membership', type=str, 19 | help='mmseqs cluster membership file (format: cluster_rep<tab>cluster_member)') 20 | parser.add_argument('query_hits', metavar='query_hits', type=str, 21 | help='blast-formatted table of hits (cluster_rep <=> target_db_seqs)') 22 | parser.add_argument('cluster_reps_fasta', metavar='cluster_reps_aa', type=str, 23 | help='mmseqs cluster representatives fasta file') 24 | parser.add_argument('--hit-columns', type=str, 25 | default='qseqid,sseqid,evalue,pident,alnlen,slen', 26 | help='Hit table output columns (default: %(default)s)') 27 | parser.add_argument('--min-pident', type=float, default=90, 28 | help='Min % identity of hit (default: %(default)s)') 29 | parser.add_argument('--min-cov', type=float, default=80, 30 | help='Min % alignment coverage of subject sequence length (default: %(default)s)') 31 | parser.add_argument('--reps-metadata', type=str, default=None, 32 | help='Cluster reps metadata in order to provide more summary info about filtering (default: %(default)s)') 33 | parser.add_argument('--version', action='version', version='0.0.1') 34 | 35 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 36 | 37 | def _open(infile): 38 | if infile.endswith('.gz'): 39 | return gzip.open(infile, 'rb') 40 | else: 41 | return open(infile) 42 | 43 | def read_membership(infile): 44 | """ 45 | Reading in cluster membership table: cluster_rep<tab>cluster_member 46 | Return: 47 | dict: {cluster_member : cluster_id} 48 | """ 49 | logging.info('Reading file: {}'.format(infile)) 50 | mem = {} 51
| with _open(infile) as inF: 52 | for line in inF: 53 | if infile.endswith('.gz'): 54 | line = line.decode('utf-8') 55 | line = line.rstrip().split('\t') 56 | if len(line) < 2: 57 | continue 58 | mem[line[1]] = line[0] 59 | logging.info(' No. of cluster members: {}'.format(len(mem.keys()))) 60 | n_clst = len(set(mem.values())) 61 | logging.info(' No. of clusters: {}'.format(n_clst)) 62 | return mem 63 | 64 | def read_hits(infile, mem, colnames, min_pident=0, min_cov=0): 65 | """ 66 | Loading query hits. 67 | Return: 68 | set(cluster rep) # clusters with hits 69 | """ 70 | logging.info('Loading hits table...') 71 | clusts = [] 72 | idx = {x:i for i,x in enumerate(colnames.split(','))} 73 | with _open(infile) as inF: 74 | for i,line in enumerate(inF): 75 | if infile.endswith('.gz'): 76 | line = line.decode('utf-8') 77 | line = line.rstrip().split('\t') 78 | if line[0] == '': 79 | continue 80 | if len(line) < 2: 81 | msg = 'Line {}: <2 values in hits table' 82 | raise ValueError(msg.format(i+1)) 83 | else: 84 | # filtering to just acceptable annotations 85 | ## percent identity 86 | pident = 0 87 | try: 88 | pident = float(line[idx['pident']]) 89 | except KeyError: 90 | pass 91 | if pident < min_pident: 92 | continue 93 | ## coverage of target seq 94 | cov = 0 95 | try: 96 | cov = float(line[idx['slen']]) / float(line[idx['alnlen']]) * 100 97 | except KeyError: 98 | pass 99 | if cov < min_cov: 100 | continue 101 | ## adding clusterID to set of genes w/ acceptable annotation 102 | qseqid = line[idx['qseqid']] 103 | try: 104 | clusts.append(mem[qseqid]) 105 | except KeyError: 106 | msg = 'Cannot find "{}" in cluster membership' 107 | raise KeyError(msg.format(qseqid)) 108 | clusts = set(clusts) 109 | msg = ' No of clusters with acceptable annotations: {}' 110 | logging.info(msg.format(len(clusts))) 111 | return clusts 112 | 113 | def filter_fasta(infile, clust_w_annot, meta=None): 114 | """ 115 | Filtering out sequences that are in the set(clust_w_annot) 116 | Return: 117 | None 118 | """ 119 | logging.info('Filtering input fasta...') 120 | cnts = {'all' : 0, 'filtered' : 0, 'kept' : 0} 121 | genome_cnt = {} 122 | to_keep = False 123 | with _open(infile) as inF: 124 | for line in inF: 125 | if infile.endswith('.gz'): 126 | line = line.decode('utf-8') 127 | line = line.rstrip() 128 | if line.startswith('>'): 129 | cnts['all'] += 1 130 | # already has annotation? 131 | line = line.lstrip('>').rstrip().split(' ')[0] 132 | if line in clust_w_annot: 133 | to_keep = False 134 | cnts['filtered'] += 1 135 | else: 136 | to_keep = True 137 | cnts['kept'] += 1 138 | print('>' + line) 139 | # metadata stats 140 | if meta is not None and to_keep is True: 141 | try: 142 | genome = meta[line] 143 | except KeyError: 144 | genome = 'OTHER' 145 | try: 146 | genome_cnt[genome] += 1 147 | except KeyError: 148 | genome_cnt[genome] = 1 149 | elif to_keep == True: 150 | print(line) 151 | # status 152 | logging.info(' No. of total seqs: {}'.format(cnts['all'])) 153 | logging.info(' No. of filtered seqs: {}'.format(cnts['filtered'])) 154 | logging.info(' No. of retained seqs: {}'.format(cnts['kept'])) 155 | ## w/ metadata 156 | if meta is not None: 157 | msg = ' No. 
of retained seqs for {}: {}' 158 | for genome,cnt in genome_cnt.items(): 159 | logging.info(msg.format(genome, cnt)) 160 | 161 | def read_metadata(infile): 162 | logging.info('Reading file: {}'.format(infile)) 163 | header = {} 164 | meta = {} 165 | with _open(infile) as inF: 166 | for i,line in enumerate(inF): 167 | if infile.endswith('.gz'): 168 | line = line.decode('utf-8') 169 | line = line.rstrip().split('\t') 170 | if i == 0: 171 | header = {x:ii for ii,x in enumerate(line)} 172 | continue 173 | seqid = line[header['seq_uuid']] 174 | genome = line[header['genome_name']] 175 | meta[seqid] = genome 176 | return meta 177 | 178 | def main(args): 179 | # cluster metadata 180 | if args.reps_metadata is not None: 181 | meta = read_metadata(args.reps_metadata) 182 | else: 183 | meta = None 184 | 185 | # determining which clusters already have acceptable annotations 186 | clust_w_annot = read_hits(args.query_hits, 187 | read_membership(args.cluster_membership), 188 | colnames = args.hit_columns, 189 | min_pident = args.min_pident, 190 | min_cov = args.min_cov) 191 | # filtering fasta to just those lacking acceptable annotations 192 | filter_fasta(args.cluster_reps_fasta, clust_w_annot, meta) 193 | 194 | 195 | if __name__ == '__main__': 196 | args = parser.parse_args() 197 | main(args) 198 | 199 | -------------------------------------------------------------------------------- /bin/scripts/filter_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import sys,os 4 | import re 5 | import gzip 6 | import bz2 7 | import uuid 8 | import argparse 9 | import logging 10 | 11 | desc = 'Filtering two fasta files down to just intersection' 12 | epi = """DESCRIPTION: 13 | Filtering 2 fasta files down to the intersection of their sequences. 14 | The sequence headers must perfectly match. 15 | If any duplicate headers, only the first will be selected. 16 | 17 | Output columns: 18 | * seq UUID 19 | * seq original name 20 | * domain 21 | * phylum 22 | * class 23 | * order 24 | * family 25 | * genus 26 | * species 27 | * taxid 28 | * genome ID 29 | * genome length 30 | 31 | Output written to STDOUT.
32 | """ 33 | parser = argparse.ArgumentParser(description=desc, 34 | epilog=epi, 35 | formatter_class=argparse.RawTextHelpFormatter) 36 | parser.add_argument('fasta1', metavar='fasta1', type=str, 37 | help='The first fasta file') 38 | parser.add_argument('fasta2', metavar='fasta2', type=str, 39 | help='The second fasta file') 40 | parser.add_argument('fasta1_output', metavar='fasta1_output', type=str, 41 | help='Name of the output fasta1 file') 42 | parser.add_argument('fasta2_output', metavar='fasta2_output', type=str, 43 | help='Name of the output fasta2 file') 44 | parser.add_argument('--taxonomy', type=str, default='', 45 | help='genome taxonomy') 46 | parser.add_argument('--taxID', type=str, default='', 47 | help='genome taxonomy') 48 | parser.add_argument('--accession', type=str, default='', 49 | help='genome accession') 50 | parser.add_argument('--genome-file', type=str, default='', 51 | help='genome fasta file (to get genome length)') 52 | parser.add_argument('--gzip', action='store_true', default=False, 53 | help='gzip output') 54 | parser.add_argument('--version', action='version', version='0.0.1') 55 | 56 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 57 | 58 | def _open(infile, mode='rb'): 59 | """ 60 | Openning of input, regardless of compression 61 | """ 62 | if infile.endswith('.bz2'): 63 | return bz2.open(infile, mode) 64 | elif infile.endswith('.gz'): 65 | return gzip.open(infile, mode) 66 | else: 67 | return open(infile) 68 | 69 | def _decode(line): 70 | """ 71 | Decoding input, depending on the file extension 72 | """ 73 | try: 74 | line = line.decode('utf-8') 75 | except AttributeError: 76 | pass 77 | return line 78 | 79 | def make_index(fasta): 80 | regex = re.compile(r' .+') 81 | if fasta.endswith('.gz'): 82 | _openR = lambda x: gzip.open(x, 'rb') 83 | else: 84 | _openR = lambda x: open(x, 'r') 85 | 86 | idx = {} 87 | with _open(fasta) as inF: 88 | for line in inF: 89 | line = _decode(line) 90 | if line.startswith('>'): 91 | line = line.lstrip('>').rstrip() 92 | idx[regex.sub('', line)] = 0 93 | return set(idx.keys()) 94 | 95 | def format_taxonomy(tax, taxID): 96 | """Formatting taxonomy string 97 | """ 98 | logging.info('Taxonomy string provided: {}'.format(tax)) 99 | logging.info('TaxID provided: {}'.format(taxID)) 100 | 101 | try: 102 | taxID = int(float(taxID.strip())) 103 | except ValueError: 104 | msg = 'WARNING: taxID "{}" is not an integer!' 
105 | logging.warning(msg.format(taxID)) 106 | #raise ValueError(msg) 107 | tax = [re.sub('[^A-Za-z0-9-_;]+', '_', x) for x in tax.split(';')] 108 | regex_d = re.compile(r'[Dd]__.+') 109 | regex_p = re.compile(r'[Pp]__.+') 110 | regex_c = re.compile(r'[Cc]__.+') 111 | regex_o = re.compile(r'[Oo]__.+') 112 | regex_f = re.compile(r'[Ff]__.+') 113 | regex_g = re.compile(r'[Gg]__.+') 114 | regex_s = re.compile(r'[Ss]__.+') 115 | 116 | domain = 'd__unclassified' 117 | phylum = 'p__unclassified' 118 | cls = 'c__unclassified' 119 | order = 'o__unclassified' 120 | family = 'f__unclassified' 121 | genus = 'g__unclassified' 122 | species = 's__unclassified' 123 | for lev in tax: 124 | if regex_d.match(lev): 125 | domain = lev 126 | elif regex_p.match(lev): 127 | phylum = lev 128 | if regex_c.match(lev): 129 | cls = lev 130 | elif regex_o.match(lev): 131 | order = lev 132 | if regex_f.match(lev): 133 | family = lev 134 | if regex_g.match(lev): 135 | genus = lev 136 | elif regex_s.match(lev): 137 | species = lev 138 | 139 | tax = [domain, phylum, cls, order, family, genus, species, str(taxID)] 140 | logging.info('Converted taxonomy string to {}'.format(';'.join(tax))) 141 | return tax 142 | 143 | def filter_fasta(fasta, idx, output, gzip_out=False): 144 | """ 145 | Filtering fasta to just those in idx 146 | """ 147 | if gzip_out is True: 148 | _openW = lambda x: gzip.open(x, 'wb') 149 | else: 150 | _openW = lambda x: open(x, 'w') 151 | 152 | found = {} 153 | hit = False 154 | regex = re.compile(r' .+') 155 | with _open(fasta) as inF, _openW(output) as outF: 156 | for line in inF: 157 | line = _decode(line) 158 | if line.startswith('>'): 159 | line = regex.sub('', line.lstrip('>').rstrip()) 160 | # filter is already seen 161 | try: 162 | found[line] 163 | continue 164 | except KeyError: 165 | pass 166 | # is seq in index? 167 | try: 168 | found[line] = idx[line] 169 | hit = True 170 | except KeyError: 171 | hit = False 172 | continue 173 | seq_name = '>' + idx[line] + '\n' 174 | try: 175 | outF.write(seq_name) 176 | except TypeError: 177 | outF.write(seq_name.encode('utf-8')) 178 | else: 179 | if hit: 180 | try: 181 | outF.write(line) 182 | except TypeError: 183 | outF.write(line.encode('utf-8')) 184 | 185 | logging.info('File written: {}'.format(output)) 186 | logging.info('Number of seqs written: {}'.format(len(found.keys()))) 187 | return found 188 | 189 | def idx_overlap(idx1, idx2, verbose=True): 190 | """ 191 | Getting overlapping keys 192 | """ 193 | idx = {} 194 | for x in set(idx1.keys()) & set(idx2.keys()): 195 | idx[x] = idx1[x] 196 | if verbose: 197 | logging.info('No. of seqIDs in idx1: {}'.format(len(idx1.keys()))) 198 | logging.info('No. of seqIDs in idx2: {}'.format(len(idx2.keys()))) 199 | logging.info('No. 
of overlapping seqIDs: {}'.format(len(idx.keys()))) 200 | return idx 201 | 202 | def write_name_idx(idx, tax, genome_id, genome_len): 203 | """ 204 | Writing gene metadata 205 | """ 206 | header = ['seq_uuid', 'seq_orig_name', 'domain', 'phylum', 207 | 'class', 'order', 'family', 'genus', 'species', 208 | 'taxid', 'genome_name', 'genome_length_bp'] 209 | print('\t'.join(header)) 210 | for k,v in idx.items(): 211 | print('\t'.join([v, k] + tax + [genome_id, str(genome_len)])) 212 | 213 | def get_genome_length(infile): 214 | """ 215 | Getting the length of the genome 216 | """ 217 | if infile.endswith('.gz'): 218 | _open = lambda x: gzip.open(x, 'rb') 219 | else: 220 | _open = lambda x: open(x) 221 | seq_len = 0 222 | with _open(infile) as inF: 223 | for line in inF: 224 | if infile.endswith('.gz'): 225 | line = line.decode('utf-8') 226 | if not line.startswith('>'): 227 | seq_len += len(line.rstrip()) 228 | return seq_len 229 | 230 | def main(args): 231 | """ 232 | Main interface 233 | """ 234 | tax = format_taxonomy(args.taxonomy, args.taxID) 235 | genome_len = get_genome_length(args.genome_file) 236 | genomeID = args.accession 237 | 238 | # creating the seq header index 239 | seq_idx = make_index(args.fasta1) & make_index(args.fasta2) 240 | seq_idx = {x:str(uuid.uuid4()).replace('-', '') for x in seq_idx} 241 | 242 | # filtering the fasta files 243 | idx = filter_fasta(args.fasta1, seq_idx, 244 | args.fasta1_output, gzip_out=args.gzip) 245 | idx = filter_fasta(args.fasta2, seq_idx, 246 | args.fasta2_output, gzip_out=args.gzip) 247 | # creating name index 248 | write_name_idx(idx, tax, genomeID, genome_len) 249 | 250 | if __name__ == '__main__': 251 | args = parser.parse_args() 252 | main(args) 253 | -------------------------------------------------------------------------------- /bin/scripts/kraken2-build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2013-2019, Derrick Wood 4 | # 5 | # This file is part of the Kraken 2 taxonomic sequence classification system. 6 | 7 | # General build process wrapper for Kraken 2. 8 | 9 | use strict; 10 | use warnings; 11 | use File::Basename; 12 | use Getopt::Long; 13 | 14 | my $PROG = basename $0; 15 | my $KRAKEN2_DIR = "#####=KRAKEN2_DIR=#####"; 16 | 17 | # Test to see if the executables got moved, try to recover if we can 18 | if (! 
-e "$KRAKEN2_DIR/classify") { 19 | use Cwd 'abs_path'; 20 | $KRAKEN2_DIR = dirname abs_path($0); 21 | } 22 | 23 | $ENV{"KRAKEN2_DIR"} = $KRAKEN2_DIR; 24 | $ENV{"PATH"} = "$KRAKEN2_DIR:$ENV{PATH}"; 25 | 26 | my $DEF_AA_MINIMIZER_LEN = 12; 27 | my $DEF_AA_KMER_LEN = 15; 28 | my $DEF_AA_MINIMIZER_SPACES = 0; 29 | my $DEF_NT_MINIMIZER_LEN = 31; 30 | my $DEF_NT_KMER_LEN = 35; 31 | my $DEF_NT_MINIMIZER_SPACES = 7; 32 | my $DEF_THREAD_CT = 1; 33 | 34 | my @VALID_LIBRARY_TYPES = qw/archaea bacteria plasmid viral plant 35 | protozoa fungi human nr nt env_nr env_nt 36 | UniVec UniVec_Core/; 37 | my @VALID_SPECIAL_DB_TYPES = qw/greengenes silva rdp/; 38 | 39 | # Option/task option variables 40 | my ( 41 | $db, 42 | $threads, 43 | $minimizer_len, 44 | $kmer_len, 45 | $minimizer_spaces, 46 | $is_protein, 47 | $no_masking, 48 | $max_db_size, 49 | $use_ftp, 50 | $skip_maps, 51 | 52 | $dl_taxonomy, 53 | $dl_library, 54 | $add_to_library, 55 | $build, 56 | $standard, 57 | $clean, 58 | $special, 59 | ); 60 | 61 | $threads = $DEF_THREAD_CT; 62 | $is_protein = 0; 63 | 64 | # variables corresponding to task options 65 | my @TASK_LIST = ( 66 | \$dl_taxonomy, 67 | \$dl_library, 68 | \$add_to_library, 69 | \$build, 70 | \$standard, 71 | \$clean, 72 | \$special, 73 | ); 74 | 75 | GetOptions( 76 | "help" => \&display_help, 77 | "version" => \&display_version, 78 | 79 | "db=s" => \$db, 80 | "threads=i" => \$threads, 81 | "minimizer-len=i" => \$minimizer_len, 82 | "kmer-len=i" => \$kmer_len, 83 | "minimizer-spaces=i", \$minimizer_spaces, 84 | "protein" => \$is_protein, 85 | "no-masking" => \$no_masking, 86 | "max-db-size=i" => \$max_db_size, 87 | "use-ftp" => \$use_ftp, 88 | "skip-maps" => \$skip_maps, 89 | 90 | "download-taxonomy" => \$dl_taxonomy, 91 | "download-library=s" => \$dl_library, 92 | "add-to-library=s" => \$add_to_library, 93 | "build" => \$build, 94 | "standard" => \$standard, 95 | "clean" => \$clean, 96 | "special=s" => \$special, 97 | ) or usage(); 98 | 99 | if ($is_protein) { 100 | $kmer_len = $DEF_AA_KMER_LEN if ! defined $kmer_len; 101 | $minimizer_len = $DEF_AA_MINIMIZER_LEN if ! defined $minimizer_len; 102 | $minimizer_spaces = $DEF_AA_MINIMIZER_SPACES if ! defined $minimizer_spaces; 103 | } 104 | else { 105 | $kmer_len = $DEF_NT_KMER_LEN if ! defined $kmer_len; 106 | $minimizer_len = $DEF_NT_MINIMIZER_LEN if ! defined $minimizer_len; 107 | $minimizer_spaces = $DEF_NT_MINIMIZER_SPACES if ! defined $minimizer_spaces; 108 | } 109 | 110 | if (@ARGV) { 111 | warn "Extra arguments on command line.\n"; 112 | usage(); 113 | } 114 | my $task_options = scalar grep defined $$_, @TASK_LIST; 115 | if ($task_options > 1) { 116 | warn "More than one task option selected.\n"; 117 | usage(); 118 | } 119 | if ($task_options == 0) { 120 | warn "Must select a task option.\n"; 121 | usage(); 122 | } 123 | 124 | if (! 
defined $db) { 125 | die "Must specify a database name\n"; 126 | } 127 | if ($threads <= 0) { 128 | die "Can't use nonpositive thread count of $threads\n"; 129 | } 130 | if ($minimizer_len > $kmer_len) { 131 | die "Minimizer length ($minimizer_len) must not be greater than k ($kmer_len)\n"; 132 | } 133 | if ($minimizer_len <= 0) { 134 | die "Can't use nonpositive minimizer length of $minimizer_len\n"; 135 | } 136 | if ($minimizer_len > 31) { 137 | die "Can't use minimizer len of $minimizer_len (must be <= 31)\n"; 138 | } 139 | 140 | $ENV{"KRAKEN2_DB_NAME"} = $db; 141 | $ENV{"KRAKEN2_THREAD_CT"} = $threads; 142 | $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len; 143 | $ENV{"KRAKEN2_KMER_LEN"} = $kmer_len; 144 | $ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces; 145 | $ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template(); 146 | $ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : ""; 147 | $ENV{"KRAKEN2_MASK_LC"} = $no_masking ? "" : 1; 148 | $ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : ""; 149 | $ENV{"KRAKEN2_USE_FTP"} = $use_ftp ? 1 : ""; 150 | $ENV{"KRAKEN2_SKIP_MAPS"} = $skip_maps ? 1 : ""; 151 | 152 | if ($dl_taxonomy) { 153 | download_taxonomy(); 154 | } 155 | elsif (defined($dl_library)) { 156 | download_library($dl_library); 157 | } 158 | elsif (defined($add_to_library)) { 159 | add_to_library($add_to_library); 160 | } 161 | elsif ($standard) { 162 | standard_installation(); 163 | } 164 | elsif ($build) { 165 | build_database(); 166 | } 167 | elsif ($clean) { 168 | clean_database(); 169 | } 170 | elsif ($special) { 171 | build_special_database($special); 172 | } 173 | else { 174 | usage(); 175 | } 176 | 177 | exit -1; 178 | # END OF MAIN CODE. 179 | 180 | sub usage { 181 | my $exit_code = @_ ? shift : 64; 182 | print STDERR <|` 11 | The taxid is provided by the user. 
12 | """ 13 | 14 | parser = argparse.ArgumentParser(description=desc, 15 | epilog=epi, 16 | formatter_class=argparse.RawTextHelpFormatter) 17 | parser.add_argument('genome_file', metavar='genome_file', type=str, 18 | help='genome file (can be gzip\'ed)') 19 | parser.add_argument('taxID', metavar='taxID', type=str, 20 | help='taxonomy ID used for renaming sequences') 21 | parser.add_argument('--version', action='version', version='0.0.1') 22 | 23 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 24 | 25 | 26 | def main(args): 27 | gz = args.genome_file.endswith('.gz') 28 | if gz: 29 | inF = gzip.open(args.genome_file) 30 | else: 31 | inF = open(args.genome_file) 32 | for line in inF: 33 | if gz: 34 | line = line.decode('utf8') 35 | line = line.rstrip() 36 | if line.startswith('>'): 37 | line = '>kraken:taxid|{}|{}'.format(args.taxID, line.lstrip('>')) 38 | print(line) 39 | 40 | inF.close() 41 | 42 | if __name__ == '__main__': 43 | args = parser.parse_args() 44 | main(args) 45 | -------------------------------------------------------------------------------- /bin/scripts/uncomp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import gzip 7 | import bz2 8 | import argparse 9 | import logging 10 | 11 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 13 | argparse.RawDescriptionHelpFormatter): 14 | pass 15 | 16 | desc = 'Uncompress gzip\'ed or bz2\'ed file' 17 | epi = """DESCRIPTION: 18 | Output written to STDOUT 19 | """ 20 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 21 | formatter_class=CustomFormatter) 22 | argparse.ArgumentDefaultsHelpFormatter 23 | parser.add_argument('input_file', metavar='input_file', type=str, 24 | help='Input file') 25 | parser.add_argument('--version', action='version', version='0.0.1') 26 | 27 | 28 | def _open(infile, mode='rb'): 29 | """ 30 | Openning of input, regardless of compression 31 | """ 32 | if infile.endswith('.bz2'): 33 | return bz2.open(infile, mode) 34 | elif infile.endswith('.gz'): 35 | return gzip.open(infile, mode) 36 | else: 37 | return open(infile) 38 | 39 | def _decode(line, infile): 40 | """ 41 | Decoding input, depending on the file extension 42 | """ 43 | if os.path.isfile(infile) and (infile.endswith('.gz') or infile.endswith('.bz2')): 44 | line = line.decode('utf-8') 45 | return line 46 | 47 | def main(args): 48 | with _open(args.input_file) as inF: 49 | for line in inF: 50 | print(_decode(line, args.input_file).rstrip()) 51 | 52 | 53 | if __name__ == '__main__': 54 | args = parser.parse_args() 55 | main(args) 56 | -------------------------------------------------------------------------------- /bin/scripts/uncomp_tarball.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import tarfile 6 | import argparse 7 | import logging 8 | 9 | desc = 'Uncompress tarball' 10 | epi = """DESCRIPTION: 11 | Simple script for smartly uncompressing a tarball. 12 | All files extracted the same output directory, regardless 13 | of the directory structure in the tarball. 
14 | """ 15 | parser = argparse.ArgumentParser(description=desc, 16 | epilog=epi, 17 | formatter_class=argparse.RawTextHelpFormatter) 18 | parser.add_argument('tarball', metavar='tarball', type=str, 19 | help='tarball file to extract') 20 | parser.add_argument('-o', '--output-directory', type=str, default='.', 21 | help='Output directory location (default: %(default)s)') 22 | parser.add_argument('--version', action='version', version='0.0.1') 23 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 24 | 25 | 26 | def main(args): 27 | # output location 28 | if not os.path.isdir(args.output_directory): 29 | os.makedirs(args.output_directory) 30 | # extracting 31 | ext = 'r:gz' if args.tarball.endswith('.gz') else 'r' 32 | with tarfile.open(args.tarball, ext) as inF: 33 | files = {k:v for k,v in zip(inF.getnames(), inF.getmembers())} 34 | for F,M in files.items(): 35 | outfile = os.path.split(F)[1] 36 | logging.info('Extracting file: {}'.format(outfile)) 37 | outfile = os.path.join(args.output_directory, outfile) 38 | with inF.extractfile(M) as inFa, open(outfile, 'w') as outF: 39 | for line in inFa: 40 | outF.write(line.decode('utf-8')) 41 | 42 | if __name__ == '__main__': 43 | args = parser.parse_args() 44 | main(args) 45 | -------------------------------------------------------------------------------- /bin/scripts/uniref_clst_trans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import gzip 7 | import bz2 8 | import argparse 9 | import logging 10 | import pickle 11 | from pickle import UnpicklingError 12 | 13 | desc = 'Transferring UniRef annotations by cluster cutoff' 14 | epi = """DESCRIPTION: 15 | Using an index of how UniRef50 clusters map to UniRef90 clusters. 16 | Renaming annotations with the other UniRef cluster level 17 | (eg., UniRef90_Q8WZ42-5 => UniRef50_Q8WZ42-5). 18 | Due to the multi-mapping when going from UniRef50 to UniRef90, 19 | it makes much more sense to go from UniRef90 to UniRef50. 20 | For multi-mappings, one mapping will be selected. 21 | 22 | The input file format is determined by the input file extension. 23 | Output is written to STDOUT. 
24 | """ 25 | parser = argparse.ArgumentParser(description=desc, 26 | epilog=epi, 27 | formatter_class=argparse.RawTextHelpFormatter) 28 | parser.add_argument('index_file', metavar='index_file', type=str, 29 | help='UniRef 50 <=> 90 index') 30 | parser.add_argument('--in-nuc', type=str, default='', 31 | help='Input nucleotide fasta file path (default: %(default)s)') 32 | parser.add_argument('--in-prot', type=str, default='', 33 | help='Input amino acid fasta file path (default: %(default)s)') 34 | parser.add_argument('--in-tsv', type=str, default='', 35 | help='Input gene metadata table path (default: %(default)s)') 36 | parser.add_argument('--out-nuc', type=str, default='', 37 | help='Output nucleotide fasta file path (default: %(default)s)') 38 | parser.add_argument('--out-prot', type=str, default='', 39 | help='Output amino acid fasta file path (default: %(default)s)') 40 | parser.add_argument('--out-tsv', type=str, default='', 41 | help='Output gene metadata table path (default: %(default)s)') 42 | parser.add_argument('-d', '--direction', type=str, default='90=>50', 43 | choices = ['90=>50', '50=>90'], 44 | help='Changing annotations from X to Y (default: %(default)s)') 45 | parser.add_argument('-p', '--pickle-idx', type=str, default='', 46 | help='Write the index as a pickled dict for faster loading (default: %(default)s)') 47 | parser.add_argument('--version', action='version', version='0.0.1') 48 | 49 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 50 | 51 | 52 | def _open(infile, mode='rb'): 53 | """ 54 | Opening of input, regardless of compression 55 | """ 56 | if infile.endswith('.bz2'): 57 | return bz2.open(infile, mode) 58 | elif infile.endswith('.gz'): 59 | return gzip.open(infile, mode) 60 | else: 61 | return open(infile) 62 | 63 | def _decode(line, infile): 64 | """ 65 | Decoding input, depending on the file extension 66 | """ 67 | if os.path.isfile(infile) and (infile.endswith('.gz') or infile.endswith('.bz2')): 68 | line = line.decode('utf-8') 69 | return line 70 | 71 | def read_index(infile, direction): 72 | logging.info('Loading file: {}'.format(infile)) 73 | # if pickle 74 | try: 75 | idx = pickle.load(open(infile, 'rb')) 76 | return idx 77 | except UnpicklingError: 78 | pass 79 | 80 | idx = {} 81 | with _open(infile) as inF: 82 | for i,line in enumerate(inF): 83 | line = _decode(line, infile) 84 | line = line.rstrip().split('\t') 85 | if line[0] == '': 86 | continue 87 | if len(line) < 2: 88 | msg = 'Line {}: <2 values' 89 | raise ValueError(msg.format(i)) 90 | # assuming UniRef50<tab>UniRef90 91 | if direction == '50=>90': 92 | idx[line[0]] = line[1] 93 | else: 94 | idx[line[1]] = line[0] 95 | logging.info(' No.
of index keys: {}'.format(len(idx.keys()))) 96 | return idx 97 | 98 | def which_ext(infile): 99 | infile = infile.rstrip('.gz') 100 | for x in ['.fasta', '.fna', '.faa', '.fa']: 101 | if infile.endswith(x): 102 | return 'fasta' 103 | for x in ['.txt', '.tsv']: 104 | if infile.endswith(x): 105 | return 'txt' 106 | msg = 'Cannot determine file type from extension: {}' 107 | raise IOError(msg.format(infile)) 108 | 109 | def make_dir(outfile): 110 | D = os.path.split(outfile)[0] 111 | if not os.path.isdir(D): 112 | os.makedirs(D) 113 | 114 | def rename_fasta(infile, outfile, idx): 115 | logging.info('Processing file: {}'.format(infile)) 116 | make_dir(outfile) 117 | to_write = False 118 | stats = {'in_index' : 0, 'not_in_index' : 0} 119 | with _open(infile) as inF, open(outfile, 'w') as outF: 120 | for line in inF: 121 | line = _decode(line, infile) 122 | # header 123 | if line.startswith('>'): 124 | line = line.lstrip('>').split('|') 125 | try: 126 | line[0] = idx[line[0]] 127 | to_write = True 128 | stats['in_index'] += 1 129 | except KeyError: 130 | msg = 'Cannot find "{}" in index' 131 | logging.warning(msg.format(line[0])) 132 | to_write = False 133 | stats['not_in_index'] += 1 134 | if to_write is True: 135 | line = '>' + '|'.join(line) 136 | # body 137 | if to_write is True: 138 | outF.write(line) 139 | # status 140 | msg = ' No. of genes found in UniRef50<=>90 index: {}' 141 | logging.info(msg.format(stats['in_index'])) 142 | msg = ' No. of genes NOT found in UniRef50<=>90 index: {}' 143 | logging.info(msg.format(stats['not_in_index'])) 144 | if stats['in_index'] == 0: 145 | raise ValueError('No genes were present in the UniRef50<=>90 index!') 146 | 147 | def rename_txt(infile, outfile, idx): 148 | logging.info('Processing file: {}'.format(infile)) 149 | make_dir(outfile) 150 | header = {} 151 | to_write = False 152 | stats = {'in_index' : 0, 'not_in_index' : 0} 153 | with _open(infile) as inF, open(outfile, 'w') as outF: 154 | for i,line in enumerate(inF): 155 | line = _decode(line, infile) 156 | line = line.rstrip().split('\t') 157 | if i == 0: 158 | header = {x:ii for ii,x in enumerate(line)} 159 | else: 160 | try: 161 | line[header['annotation']] = idx[line[header['annotation']]] 162 | to_write = True 163 | stats['in_index'] += 1 164 | except KeyError: 165 | to_write = False 166 | stats['not_in_index'] += 1 167 | if i == 0 or to_write is True: 168 | outF.write('\t'.join(line) + '\n') 169 | # status 170 | msg = ' No. of genes found in UniRef50<=>90 index: {}' 171 | logging.info(msg.format(stats['in_index'])) 172 | msg = ' No. of genes NOT found in UniRef50<=>90 index: {}' 173 | logging.info(msg.format(stats['not_in_index'])) 174 | 175 | def pickle_idx(idx, outfile): 176 | logging.info('Pickling index to {}'.format(outfile)) 177 | with open(outfile, 'wb') as outF: 178 | pickle.dump(idx, outF) 179 | logging.info(' File pickled. 
Exiting') 180 | sys.exit() 181 | 182 | def main(args): 183 | # loading UniRef cluster index 184 | idx = read_index(args.index_file, args.direction) 185 | # pickle 186 | if args.pickle_idx != '': 187 | pickle_idx(idx, args.pickle_idx) 188 | # renaming input 189 | for inpath,outpath in zip([args.in_nuc, args.in_prot, args.in_tsv], 190 | [args.out_nuc, args.out_prot, args.out_tsv]): 191 | if which_ext(inpath) == 'fasta': 192 | rename_fasta(inpath, outpath, idx) 193 | elif which_ext(inpath) == 'txt': 194 | rename_txt(inpath, outpath, idx) 195 | 196 | if __name__ == '__main__': 197 | args = parser.parse_args() 198 | main(args) 199 | -------------------------------------------------------------------------------- /bin/utils/Snakefile: -------------------------------------------------------------------------------- 1 | #-- utilty functions --# 2 | import gzip 3 | import bz2 4 | from itertools import chain 5 | 6 | 7 | def concatenate(*lists): 8 | """ 9 | Combine >1 list and/or strings 10 | """ 11 | new_list = [] 12 | for x in lists: 13 | new_list.extend(x) 14 | return new_list 15 | 16 | def _open(infile, mode='rb'): 17 | """ 18 | Openning of input, regardless of compression 19 | """ 20 | if infile.endswith('.bz2'): 21 | return bz2.open(infile, mode) 22 | elif infile.endswith('.gz'): 23 | return gzip.open(infile, mode) 24 | else: 25 | return open(infile) 26 | 27 | def _decode(line, infile): 28 | """ 29 | Decoding input, depending on the file extension 30 | """ 31 | if os.path.isfile(infile) and (infile.endswith('.gz') or infile.endswith('.bz2')): 32 | line = line.decode('utf-8') 33 | return line 34 | 35 | def cat_files(*args, outfile, header=False): 36 | """ 37 | Combining files (*args). File can be compressed. 38 | Combined files written to outfile. 39 | Input: 40 | args : tuple of lists, each list contains file paths 41 | output : str, output file path 42 | header : bool, just print the header line of the first input file? 
43 | """ 44 | infiles = concatenate(*args) 45 | with open(outfile, 'w') as outF: 46 | for i,infile in enumerate(infiles): 47 | with _open(infile) as inF: 48 | for ii,line in enumerate(inF): 49 | # skipping header (except for first table) 50 | if i > 0 and ii == 0 and header is True: 51 | continue 52 | # writing line 53 | line = _decode(line, infile) 54 | outF.write(line) 55 | -------------------------------------------------------------------------------- /conda_env.yaml: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # `conda env create --name --file ` 3 | channels: 4 | - conda-forge 5 | - bioconda 6 | dependencies: 7 | - python>=3.6 8 | - mamba>=0.11 9 | - pandas>=1.1.2 10 | - snakemake>=5.31.1 11 | - r-base>=3.6 12 | - r-argparse>=2.0.1 13 | - r-curl>=4.2 14 | - r-data.table>=1.12.4 15 | - r-dplyr>=0.8.3 16 | - r-r.utils 17 | - ncbi-genome-download>=0.2.10 18 | - newick_utils>=1.6 19 | - beautifulsoup4>=4.11 20 | -------------------------------------------------------------------------------- /config-update.yaml: -------------------------------------------------------------------------------- 1 | #-- email notifications of pipeline success/failure (use "Skip" to deactivate) --# 2 | email: None 3 | 4 | #-- databases to update --# 5 | # Replace "Create" with "Skip" to skip creation of any of these 6 | # Note that braken relies on the kraken2 database 7 | databases: 8 | kraken2: Create 9 | bracken: Create 10 | genes: Create 11 | humann3_bowtie2: Create 12 | humann3_diamond: Create 13 | 14 | #-- Input --# 15 | #--- If just a set of gene sequences to add ---# 16 | # If you have nucleotide/amino-acid gene sequences formatted for humann 17 | # If translate = True, missing nuc or AA seqs will be (rev)translated from the other, else seqs not used 18 | new_genes: # remove "Skip" to add these genes instead of from the genomes 19 | amino_acid: Skip #data/UniRef50/genome_reps_filtered.faa.gz 20 | nucleotide: Skip #data/UniRef50/genome_reps_filtered.fna.gz 21 | metadata: Skip #data/UniRef50/genome_reps_filtered.txt.gz 22 | translate: True 23 | 24 | #--- If a set of genomes to add ---# 25 | # file listing samples and associated data 26 | samples_file: data/GTDBr95_n5/GTDBr95_n5.tsv 27 | 28 | ## column names in samples table 29 | samples_col: 'ncbi_organism_name' 30 | accession_col: 'accession' 31 | fasta_file_path_col: 'fasta_file_path' 32 | taxID_col: 'gtdb_taxid' # or 'ncbi_species_taxid' 33 | taxonomy_col: 'gtdb_taxonomy' # or 'ncbi_taxonomy' 34 | 35 | # Saved databases that will be updated 36 | kraken2_db: 37 | library: tests/output/GTDBr95_n10/kraken2/library/ 38 | taxonomy: tests/output/GTDBr95_n10/kraken2/taxonomy/ 39 | genes_db: 40 | genes: 41 | mmseqs_db: tests/output/GTDBr95_n10/genes/genes_db.tar.gz 42 | amino_acid: tests/output/GTDBr95_n10/genes/genome_reps_filtered.faa.gz 43 | nucleotide: tests/output/GTDBr95_n10/genes/genome_reps_filtered.fna.gz 44 | metadata: tests/output/GTDBr95_n10/genes/genome_reps_filtered.txt.gz 45 | cluster: 46 | mmseqs_db: tests/output/GTDBr95_n10/genes/cluster/clusters_db.tar.gz 47 | humann_db: 48 | query: 49 | hits: tests/output/GTDBr95_n10/humann3/annotation_hits.gz 50 | cluster: 51 | reps: tests/output/GTDBr95_n10/genes/cluster/clusters_reps.faa.gz 52 | membership: tests/output/GTDBr95_n10/genes/cluster/clusters_membership.tsv.gz 53 | 54 | #-- Output --# 55 | # output location 56 | output_dir: tests/output/GTDBr95_n10-n5/ 57 | 58 | # Name of UniRef clustering 
(uniref90 or uniref50) 59 | ## "uniref90" highly recommended 60 | uniref_name: uniref90 61 | # Name of the humann3 diamond database to create 62 | ## This must match naming allowed by humann3 63 | dmnd_name: uniref90_201901.dmnd 64 | # Index mapping UniRef90 clusters to UniRef50 (saves time vs re-annotating) 65 | ## Skip if annotating with UniRef50 66 | cluster_idx: data/uniref50-90.pkl 67 | 68 | # temporary file directory (your username will be added automatically) 69 | tmp_dir: tmp/db_update_tmp/ 70 | 71 | #-- if custom NCBI/GTDB taxdump files, "Skip" if standard NCBI taxdump --# 72 | # Used for kraken taxonomy & metaphlan 73 | names_dmp: data/taxdump/names.dmp 74 | nodes_dmp: data/taxdump/nodes.dmp 75 | 76 | #-- keep intermediate files required for re-creating DBs (eg., w/ more genomes) --# 77 | # If "True", the intermediate files are saved to `output_dir` 78 | # Else, the intermediate files are temporarily stored in `temp_folder` 79 | keep_intermediate: True 80 | 81 | #-- software parameters --# 82 | # `vsearch_per_genome` = per-genome gene clustering 83 | # for humann3, use either mmseqs or diamond (mmseqs gets priority if neither skipped) 84 | # for humann3::mmseqs_search::run, --num-iterations must be >=2 85 | params: 86 | ionice: -c 3 87 | bracken: 88 | build_kmer: 35 89 | build_read_lens: 90 | - 100 91 | - 150 92 | genes: 93 | prodigal: "" 94 | vsearch_per_genome: --id 0.97 --strand both --qmask none --fasta_width 0 95 | mmseqs_cluster_update: --min-seq-id 0.9 -c 0.8 -s 4.0 96 | humann3: 97 | batches: 2 98 | filter_existing: --min-pident 0 # any existing genes w/ < cutoff with be re-queried 99 | mmseqs_search: 100 | db: data/UniRef90/uniref90 101 | index: -s 6 102 | run: -e 1e-3 --max-accept 1 --max-seqs 100 --num-iterations 2 --start-sens 1 --sens-steps 3 -s 6 103 | diamond: 104 | db: Skip #data/uniref90_ec-filtered/uniref90_ec_filt_201901.dmnd 105 | run: --evalue 1e-3 --query-cover 80 --id 90 --max-target-seqs 1 --block-size 4 --index-chunks 2 106 | propagate_annotations: --min-cov 80 --min-pident 90 107 | 108 | #-- snakemake pipeline --# 109 | pipeline: 110 | snakemake_folder: ./ 111 | script_folder: ./bin/scripts/ 112 | name: Struo2_db-update 113 | config: update -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | #-- email notifications of pipeline success/failure (use "Skip" to deactivate) --# 2 | email: None 3 | 4 | #-- I/O --# 5 | # file listing samples and associated data 6 | samples_file: data/GTDBr95_n10/GTDBr95_n10.tsv 7 | 8 | ## column names in samples table 9 | samples_col: 'ncbi_organism_name' 10 | accession_col: 'accession' 11 | fasta_file_path_col: 'fasta_file_path' 12 | taxID_col: 'gtdb_taxid' # or 'ncbi_species_taxid' 13 | taxonomy_col: 'gtdb_taxonomy' # or 'ncbi_taxonomy' 14 | 15 | # output location 16 | output_dir: tests/output/GTDBr95_n10/ 17 | 18 | # temporary file directory (your username will be added automatically) 19 | tmp_dir: /ebio/abt3_scratch/ 20 | 21 | #-- databases to create --# 22 | # Replace "Create" with "Skip" to skip creation of any of these 23 | # Note that braken relies on the kraken2 database 24 | databases: 25 | kraken2: Create 26 | bracken: Create 27 | genes: Create 28 | humann3_bowtie2: Create 29 | humann3_diamond: Create 30 | 31 | # Name of UniRef clustering (uniref90 or uniref50) 32 | ## "uniref90" highly recommended 33 | uniref_name: uniref90 34 | # Name of the humann3 diamond database to be created 35 | ## 
This must match the naming allowed by humann3 (eg., "uniref90_201901.dmnd") 36 | dmnd_name: uniref90_201901.dmnd 37 | # Index mapping UniRef90 clusters to UniRef50 (saves time vs re-annotating) 38 | ## This is skipped if annotating with UniRef50 instead of UniRef90 39 | cluster_idx: data/uniref50-90.pkl 40 | 41 | #-- if custom NCBI/GTDB taxdump files, "Skip" if standard NCBI taxdump --# 42 | # Used for kraken taxonomy & metaphlan 43 | names_dmp: data/taxdump/names.dmp 44 | nodes_dmp: data/taxdump/nodes.dmp 45 | 46 | #-- keep intermediate files required for re-creating DBs (eg., w/ more genomes) --# 47 | # If "True", the intermediate files are saved to `output_dir` 48 | # Else, the intermediate files are temporarily stored in `temp_folder` 49 | keep_intermediate: True 50 | 51 | #-- software parameters --# 52 | # `vsearch_per_genome` = per-genome gene clustering 53 | # use "Skip" at the start of any param to skip (if possible to skip) 54 | # for humann3, use either mmseqs or diamond (mmseqs gets priority if neither skipped) 55 | # for humann3::mmseqs_search::run, --num-iterations must be >=2 56 | params: 57 | ionice: -c 3 58 | bracken: 59 | build_kmer: 35 60 | build_read_lens: 61 | - 100 62 | - 150 63 | genes: 64 | prodigal: "" 65 | vsearch_per_genome: --id 0.97 --strand both --qmask none --fasta_width 0 66 | mmseqs_cluster: --min-seq-id 0.9 -c 0.8 67 | mmseqs_cluster_method: linclust # or "cluster", which is slower 68 | humann3: 69 | batches: 2 70 | mmseqs_search: 71 | db: data/UniRef90/uniref90 72 | index: -s 6 73 | run: -e 1e-3 --max-accept 1 --max-seqs 100 --num-iterations 2 --start-sens 1 --sens-steps 3 -s 6 74 | diamond: 75 | db: Skip #data/uniref90_ec-filtered/uniref90_ec_filt_201901.dmnd 76 | run: --evalue 1e-3 --query-cover 80 --id 90 --max-target-seqs 1 --block-size 4 --index-chunks 2 77 | propagate_annotations: --min-cov 80 --min-pident 90 78 | 79 | #-- snakemake pipeline --# 80 | pipeline: 81 | snakemake_folder: ./ 82 | script_folder: ./bin/scripts/ 83 | name: Struo2_db-create 84 | config: create -------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_000014945.1_ASM1494v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_000014945.1_ASM1494v1_genomic.fna.gz -------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_000720375.1_ASM72037v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_000720375.1_ASM72037v1_genomic.fna.gz -------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_002478565.1_ASM247856v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_002478565.1_ASM247856v1_genomic.fna.gz -------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_006715045.1_ASM671504v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_006715045.1_ASM671504v1_genomic.fna.gz 
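As a usage sketch tied to the config-update.yaml and config.yaml files shown above: both are plain Snakemake config files that are handed to the workflow via --configfile. A minimal local (non-cluster) invocation might look like the line below; the core count (-j 8) is an arbitrary example value, --use-conda assumes the per-rule conda environments should be built, and the working directory is assumed to be the repository root. The repository's own snakemake_sge.sh (shown further below) wraps an equivalent call for SGE cluster submission.

    # hypothetical example invocation, not a project-mandated command
    snakemake --use-conda --configfile config.yaml -j 8 --printshellcmds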
-------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_007116575.1_ASM711657v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_007116575.1_ASM711657v1_genomic.fna.gz -------------------------------------------------------------------------------- /data/GTDBr95_n5/GTDBr95_n5.tsv: -------------------------------------------------------------------------------- 1 | ncbi_organism_name accession ambiguous_bases checkm_completeness checkm_contamination checkm_marker_count checkm_marker_lineage checkm_marker_set_count checkm_strain_heterogeneity coding_bases coding_density contig_count gc_count gc_percentage genome_size gtdb_genome_representative gtdb_representative gtdb_taxonomy gtdb_type_designation gtdb_type_designation_sources gtdb_type_species_of_genus l50_contigs l50_scaffolds longest_contig longest_scaffold lsu_23s_contig_len lsu_23s_count lsu_23s_length lsu_23s_query_id lsu_5s_contig_len lsu_5s_count lsu_5s_length lsu_5s_query_id lsu_silva_23s_blast_align_len lsu_silva_23s_blast_bitscore lsu_silva_23s_blast_evalue lsu_silva_23s_blast_perc_identity lsu_silva_23s_blast_subject_id lsu_silva_23s_taxonomy mean_contig_length mean_scaffold_length mimag_high_quality mimag_low_quality mimag_medium_quality n50_contigs n50_scaffolds ncbi_assembly_level ncbi_assembly_name ncbi_assembly_type ncbi_bioproject ncbi_biosample ncbi_contig_count ncbi_contig_n50 ncbi_country ncbi_date ncbi_genbank_assembly_accession ncbi_genome_category ncbi_genome_representation ncbi_isolate ncbi_isolation_source ncbi_lat_lon ncbi_molecule_count ncbi_ncrna_count ncbi_protein_count ncbi_refseq_category ncbi_rrna_count ncbi_scaffold_count ncbi_scaffold_l50 ncbi_scaffold_n50 ncbi_scaffold_n75 ncbi_scaffold_n90 ncbi_seq_rel_date ncbi_spanned_gaps ncbi_species_taxid ncbi_ssu_count ncbi_strain_identifiers ncbi_submitter ncbi_taxid ncbi_taxonomy ncbi_taxonomy_unfiltered ncbi_total_gap_length ncbi_total_length ncbi_translation_table ncbi_trna_count ncbi_type_material_designation ncbi_ungapped_length ncbi_unspanned_gaps ncbi_wgs_master protein_count scaffold_count ssu_contig_len ssu_count ssu_gg_blast_align_len ssu_gg_blast_bitscore ssu_gg_blast_evalue ssu_gg_blast_perc_identity ssu_gg_blast_subject_id ssu_gg_taxonomy ssu_length ssu_query_id ssu_silva_blast_align_len ssu_silva_blast_bitscore ssu_silva_blast_evalue ssu_silva_blast_perc_identity ssu_silva_blast_subject_id ssu_silva_taxonomy total_gap_length trna_aa_count trna_count trna_selenocysteine_count SPECIES gtdb_taxid fasta_file_path 2 | RS_GCF_006715045.1_Amycolatopsis cihanbeyliensis RS_GCF_006715045.1 0 100 0.99 350 o__Actinomycetales (UID2014) 203 0 6692435 90.00726117 2 5213783 70.12071513 7435439 RS_GCF_006715045.1 t d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis type strain of species LPSN; DSMZ; StrainInfo f 1 1 6286745 6286745 6286745 2 3100 NZ_VFML01000001.1 6286745 2 108 NZ_VFML01000001.1 3118 4571 0 93.297 ARVW01000001.3175334.3178445 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Amycolatopsis;Amycolatopsis nigrescens CSC17Ta-90 3717719 3717719 t f f 6286745 6286745 Contig ASM671504v1 n/a PRJNA224116 SAMN11512385 2 6286745 none 7/8/19 GCA_006715045.1 none full none none none 0 0 none na 6 none none none none none 7/8/19 0 1128664 2 DSM 
45679 DOE Joint Genome Institute 1128664 d__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis d__Bacteria;x__Terrabacteria group;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis 0 7435439 11 49 assembly from type material 7435439 0 VFML00000000.1 6730 2 6286745 2 none none none none none none 1514 NZ_VFML01000001.1 1460 2697 0 100 JN989302.1.1460 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Amycolatopsis;Amycolatopsis cihanbeyliensis 0 20 49 0 Amycolatopsis cihanbeyliensis 202433 data/GTDBr95_n5/GCA_006715045.1_ASM671504v1_genomic.fna.gz 3 | GB_GCA_002478565.1_Flavobacteriales bacterium UBA7468 GB_GCA_002478565.1 0 96.74 0.54 277 k__Bacteria (UID2569) 185 0 2200517 95.42095689 46 952777 41.34812551 2306115 GB_GCA_002478565.1 t d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Crocinitomicaceae;g__UBA952;s__UBA952 sp002478565 not type material none f 6 5 378299 378299 none 0 none none none 0 none none none none none none none none 50093 88696 f f t 122097 148012 Scaffold ASM247856v1 n/a PRJNA348753 SAMN06451991 46 122097 none 10/6/17 GCA_002478565.1 derived from metagenome full UBA7468 none none 0 0 0 na 0 none none none none none 10/6/17 20 1951076 0 UBA7468 University of Queensland 1951076 d__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__;g__;s__ d__Bacteria;x__FCB group;x__Bacteroidetes/Chlorobi group;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;x__unclassified Flavobacteriales;x__unclassified Flavobacteriales (miscellaneous);s__Flavobacteriales bacterium UBA7468 1834 2306115 none 0 none 2304281 0 DLPC00000000.1 2062 26 none 0 none none none none none none none none none none none none none none 1834 17 28 0 UBA952 sp002478565 189650 data/GTDBr95_n5/GCA_002478565.1_ASM247856v1_genomic.fna.gz 4 | GB_GCA_007116575.1_Rhodobacteraceae bacterium GB_GCA_007116575.1 0 90.5 0.91 568 f__Rhodobacteraceae (UID3340) 330 0 2402398 90.58286563 142 1695274 63.92062304 2652155 GB_GCA_007116575.1 t d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Roseinatronobacter;s__Roseinatronobacter sp007116575 not type material none f 28 28 83077 83077 none 0 none none none 0 none none none none none none none none 18677 18677 f f t 27979 27979 Contig ASM711657v1 n/a PRJNA453733 SAMN10605132 142 27979 none 7/18/19 GCA_007116575.1 derived from metagenome full CSBr16_51 none none 0 none none na none none none none none none 7/18/19 0 1904441 none CSBr16_51 "Institute for Biodiversity and Ecosystem dynamics, Faculty of Science, University of Amsterdam" 1904441 d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__;s__ d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;x__unclassified Rhodobacteraceae;s__Rhodobacteraceae bacterium 0 2652155 none none none 2652155 0 SKIF00000000.1 2563 142 none 0 none none none none none none none none none none none none none none 0 18 34 0 Roseinatronobacter sp007116575 195713 data/GTDBr95_n5/GCA_007116575.1_ASM711657v1_genomic.fna.gz 5 | GB_GCA_000014945.1_Methanosaeta thermophila PT GB_GCA_000014945.1 0 100 0 228 p__Euryarchaeota (UID49) 153 0 1591668 84.68702098 1 1006368 53.5452795 1879471 GB_GCA_000014945.1 t 
d__Archaea;p__Halobacteriota;c__Methanosarcinia;o__Methanotrichales;f__Methanotrichaceae;g__Methanothrix_B;s__Methanothrix_B thermoacetophila type strain of heterotypic synonym LPSN; StrainInfo f 1 1 1879471 1879471 1879471 2 2898 CP000477.1 1879471 2 115 CP000477.1 2888 5334 0 100 CP000477.1614054.1616941 Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosaetaceae;Methanosaeta;Methanosaeta thermophila PT 1879471 1879471 t f f 1879471 1879471 Complete Genome ASM1494v1 n/a PRJNA15765 SAMN02598350 none none none 10/25/06 GCA_000014945.1 none full none none none 1 1 1696 na 6 none none none none none 10/25/06 0 2224 2 PT US DOE Joint Genome Institute 349307 d__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanotrichaceae;g__Methanothrix;s__Methanothrix thermoacetophila d__Archaea;p__Euryarchaeota;x__Stenosarchaea group;c__Methanomicrobia;o__Methanosarcinales;f__Methanotrichaceae;g__Methanothrix;s__Methanothrix thermoacetophila;x__Methanothrix thermoacetophila PT 0 1879471 11 44 assembly from synonym type material 1879471 0 none 1810 1 1879471 2 1472 2719 0 100 155726 k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanosaetaceae;g__Methanosaeta;s__ 1473 CP000477.1 1472 2719 0 100 CP000477.1617193.1618665 Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosaetaceae;Methanosaeta;Methanosaeta thermophila PT 0 18 43 0 Methanothrix_B thermoacetophila 5158 data/GTDBr95_n5/GCA_000014945.1_ASM1494v1_genomic.fna.gz 6 | GB_GCA_000720375.1_Rhodococcus rhodnii GB_GCA_000720375.1 0 99.47 3.85 495 o__Actinomycetales (UID2012) 282 0 11292191 90.93197579 182 8230553 66.27769988 12418284 GB_GCA_000720375.1 t d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;g__Kibdelosporangium;s__Kibdelosporangium sp000720375 type strain of species LPSN; DSMZ; StrainInfo f 14 14 746316 746316 1593 4 1525 JOAA01000102.1 1581 4 102 JOAA01000103.1 1525 2560 0 96.984 JNYM01000477.49.3165 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Kibdelosporangium;Kibdelosporangium aridum subsp. 
largum 68232 68232 f f t 319285 319285 Contig ASM72037v1 n/a PRJNA238534 SAMN02645355 182 319285 none 7/2/14 GCA_000720375.1 none full none none none 0 0 0 na 0 none none none none none 7/2/14 0 38312 0 NRRL B-16535 University of Illinois 38312 d__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Corynebacteriales;f__Nocardiaceae;g__Rhodococcus;s__Rhodococcus rhodnii d__Bacteria;x__Terrabacteria group;p__Actinobacteria;c__Actinobacteria;o__Corynebacteriales;f__Nocardiaceae;g__Rhodococcus;s__Rhodococcus rhodnii 0 12418284 none 0 assembly from type material 12418284 0 JOAA00000000.1 11452 182 1381 3 1301 2379 0 99.693 756603 k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinosynnemataceae;g__Kibdelosporangium;s__ 1319 JOAA01000108.1 1305 2410 0 100 JOAA01000108.1.1312 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Kibdelosporangium;Rhodococcus rhodnii 0 20 66 2 Kibdelosporangium sp000720375 193223 data/GTDBr95_n5/GCA_000720375.1_ASM72037v1_genomic.fna.gz -------------------------------------------------------------------------------- /img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/img/logo.png -------------------------------------------------------------------------------- /snakemake_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # user input 4 | if [ "$#" -lt 2 ]; then 5 | echo "Usage: snakemake_clean.sh config_file [preview|delete]" 6 | echo "Description: delete all snakemake-generated files" 7 | echo " preview => preview files to delete" 8 | echo " delete => delete files" 9 | echo "NOTE: this only deletes files that snakemake knows about, but that's all that's needed to fully restart the snakemake pipeline" 10 | exit 11 | fi 12 | 13 | 14 | FILES=`snakemake --summary --rerun-incomplete --configfile $1 | tail -n+2 | cut -f1` 15 | if [ "$2" == "preview" ]; then 16 | echo "#-- Files to delete --#" 17 | printf '%s\n' "${FILES[@]}" 18 | elif [ "$2" == "delete" ]; then 19 | echo "#-- Deleting the following files --#" 20 | printf '%s\n' "${FILES[@]}" 21 | rm -rf $FILES 22 | else 23 | echo "$2 not recognized" 24 | fi 25 | -------------------------------------------------------------------------------- /snakemake_conda-list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # check for snakemake 4 | command -v snakemake >/dev/null 2>&1 || { echo "snakemake is not in your PATH"; exit 1; } 5 | 6 | # check for conda envs 7 | if [[ ! -d .snakemake ]] || [[ -z "$(ls -A .snakemake/conda/)" ]]; then 8 | echo "No conda envs found!" 9 | echo "To create the envs, run: 'snakemake --use-conda --create-envs-only -F'" 10 | exit 1 11 | fi 12 | 13 | # list all conda envs 14 | for X in $(snakemake --list-conda-envs -F | tail -n +3) 15 | do 16 | conda list -p $X 2>/dev/null || echo "#--- conda env: $X ---#" 17 | done 18 | 19 | -------------------------------------------------------------------------------- /snakemake_sge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # user input 4 | if [ "$#" -lt 2 ]; then 5 | echo "snakemake_sge.sh config.yaml jobs ..." 6 | echo " config.yaml : snakemake config" 7 | echo " jobs : number of parallel qsub jobs" 8 | echo " ...
: additional arguments passed to snakemake" 9 | exit 10 | fi 11 | 12 | # check for snakemake 13 | command -v snakemake >/dev/null 2>&1 || { echo "snakemake is not in your PATH"; exit 1; } 14 | 15 | # set args 16 | CONFIG=$1 17 | JOBS=$2 18 | 19 | # snakemake call 20 | WORKDIR=`pwd` 21 | snakemake -f \ 22 | --profile bin/ll_pipeline_utils/profiles/sge/ \ 23 | --use-conda \ 24 | --configfile $CONFIG \ 25 | --jobs $JOBS \ 26 | --local-cores $JOBS \ 27 | --printshellcmds \ 28 | --resources temp=$JOBS \ 29 | --directory $WORKDIR \ 30 | "${@:3}" 31 | 32 | 33 | -------------------------------------------------------------------------------- /tests/samples/GTDBr95_n5.tsv: -------------------------------------------------------------------------------- 1 | ncbi_organism_name accession ambiguous_bases checkm_completeness checkm_contamination checkm_marker_count checkm_marker_lineage checkm_marker_set_count checkm_strain_heterogeneity coding_bases coding_density contig_count gc_count gc_percentage genome_size gtdb_genome_representative gtdb_representative gtdb_taxonomy gtdb_type_designation gtdb_type_designation_sources gtdb_type_species_of_genus l50_contigs l50_scaffolds longest_contig longest_scaffold lsu_23s_contig_len lsu_23s_count lsu_23s_length lsu_23s_query_id lsu_5s_contig_len lsu_5s_count lsu_5s_length lsu_5s_query_id lsu_silva_23s_blast_align_len lsu_silva_23s_blast_bitscore lsu_silva_23s_blast_evalue lsu_silva_23s_blast_perc_identity lsu_silva_23s_blast_subject_id lsu_silva_23s_taxonomy mean_contig_length mean_scaffold_length mimag_high_quality mimag_low_quality mimag_medium_quality n50_contigs n50_scaffolds ncbi_assembly_level ncbi_assembly_name ncbi_assembly_type ncbi_bioproject ncbi_biosample ncbi_contig_count ncbi_contig_n50 ncbi_country ncbi_date ncbi_genbank_assembly_accession ncbi_genome_category ncbi_genome_representation ncbi_isolate ncbi_isolation_source ncbi_lat_lon ncbi_molecule_count ncbi_ncrna_count ncbi_protein_count ncbi_refseq_category ncbi_rrna_count ncbi_scaffold_count ncbi_scaffold_l50 ncbi_scaffold_n50 ncbi_scaffold_n75 ncbi_scaffold_n90 ncbi_seq_rel_date ncbi_spanned_gaps ncbi_species_taxid ncbi_ssu_count ncbi_strain_identifiers ncbi_submitter ncbi_taxid ncbi_taxonomy ncbi_taxonomy_unfiltered ncbi_total_gap_length ncbi_total_length ncbi_translation_table ncbi_trna_count ncbi_type_material_designation ncbi_ungapped_length ncbi_unspanned_gaps ncbi_wgs_master protein_count scaffold_count ssu_contig_len ssu_count ssu_gg_blast_align_len ssu_gg_blast_bitscore ssu_gg_blast_evalue ssu_gg_blast_perc_identity ssu_gg_blast_subject_id ssu_gg_taxonomy ssu_length ssu_query_id ssu_silva_blast_align_len ssu_silva_blast_bitscore ssu_silva_blast_evalue ssu_silva_blast_perc_identity ssu_silva_blast_subject_id ssu_silva_taxonomy total_gap_length trna_aa_count trna_count trna_selenocysteine_count SPECIES gtdb_taxid fasta_file_path 2 | RS_GCF_006715045.1_Amycolatopsis cihanbeyliensis RS_GCF_006715045.1 0 100 0.99 350 o__Actinomycetales (UID2014) 203 0 6692435 90.0072611718 2 5213783 70.120715132 7435439 RS_GCF_006715045.1 t d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis type strain of species LPSN; DSMZ; StrainInfo f 1 1 6286745 6286745 6286745 2 3100 NZ_VFML01000001.1 6286745 2 108 NZ_VFML01000001.1 3118 4571 0 93.297 ARVW01000001.3175334.3178445 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Amycolatopsis;Amycolatopsis nigrescens CSC17Ta-90 3717719 3717719 t f f 
6286745 6286745 Contig ASM671504v1 n/a PRJNA224116 SAMN11512385 2 6286745 none 2019-07-08 GCA_006715045.1 none full none none none 0 0 none na 6 none none none none none 2019/07/08 0 1128664 2 DSM 45679 DOE Joint Genome Institute 1128664 d__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis d__Bacteria;x__Terrabacteria group;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis 0 7435439 11 49 assembly from type material 7435439 0 VFML00000000.1 6730 2 6286745 2 none none none none none none 1514 NZ_VFML01000001.1 1460 2697 0 100 JN989302.1.1460 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Amycolatopsis;Amycolatopsis cihanbeyliensis 0 20 49 0 Amycolatopsis cihanbeyliensis 202433 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/bacteria/GCA_006715045.1/GCA_006715045.1_ASM671504v1_genomic.fna.gz 3 | GB_GCA_002478565.1_Flavobacteriales bacterium UBA7468 GB_GCA_002478565.1 0 96.74 0.54 277 k__Bacteria (UID2569) 185 0 2200517 95.4209568907 46 952777 41.3481255107 2306115 GB_GCA_002478565.1 t d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Crocinitomicaceae;g__UBA952;s__UBA952 sp002478565 not type material none f 6 5 378299 378299 none 0 none none none 0 none none none none none none none none 50093 88696 f f t 122097 148012 Scaffold ASM247856v1 n/a PRJNA348753 SAMN06451991 46 122097 none 2017-10-6 GCA_002478565.1 derived from metagenome full UBA7468 none none 0 0 0 na 0 none none none none none 2017/10/06 20 1951076 0 UBA7468 University of Queensland 1951076 d__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__;g__;s__ d__Bacteria;x__FCB group;x__Bacteroidetes/Chlorobi group;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;x__unclassified Flavobacteriales;x__unclassified Flavobacteriales (miscellaneous);s__Flavobacteriales bacterium UBA7468 1834 2306115 none 0 none 2304281 0 DLPC00000000.1 2062 26 none 0 none none none none none none none none none none none none none none 1834 17 28 0 UBA952 sp002478565 189650 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/bacteria/GCA_002478565.1/GCA_002478565.1_ASM247856v1_genomic.fna.gz 4 | GB_GCA_007116575.1_Rhodobacteraceae bacterium GB_GCA_007116575.1 0 90.5 0.91 568 f__Rhodobacteraceae (UID3340) 330 0 2402398 90.5828656319 142 1695274 63.9206230405 2652155 GB_GCA_007116575.1 t d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Roseinatronobacter;s__Roseinatronobacter sp007116575 not type material none f 28 28 83077 83077 none 0 none none none 0 none none none none none none none none 18677 18677 f f t 27979 27979 Contig ASM711657v1 n/a PRJNA453733 SAMN10605132 142 27979 none 2019-07-18 GCA_007116575.1 derived from metagenome full CSBr16_51 none none 0 none none na none none none none none none 2019/07/18 0 1904441 none CSBr16_51 Institute for Biodiversity and Ecosystem dynamics, Faculty of Science, University of Amsterdam 1904441 d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__;s__ d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;x__unclassified Rhodobacteraceae;s__Rhodobacteraceae bacterium 0 2652155 none none none 2652155 0 SKIF00000000.1 2563 142 none 0 none none none none none none none none none none none none none none 0 18 34 
0 Roseinatronobacter sp007116575 195713 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/bacteria/GCA_007116575.1/GCA_007116575.1_ASM711657v1_genomic.fna.gz 5 | GB_GCA_000014945.1_Methanosaeta thermophila PT GB_GCA_000014945.1 0 100 0 228 p__Euryarchaeota (UID49) 153 0 1591668 84.6870209756 1 1006368 53.5452794962 1879471 GB_GCA_000014945.1 t d__Archaea;p__Halobacteriota;c__Methanosarcinia;o__Methanotrichales;f__Methanotrichaceae;g__Methanothrix_B;s__Methanothrix_B thermoacetophila type strain of heterotypic synonym LPSN; StrainInfo f 1 1 1879471 1879471 1879471 2 2898 CP000477.1 1879471 2 115 CP000477.1 2888 5334 0 100 CP000477.1614054.1616941 Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosaetaceae;Methanosaeta;Methanosaeta thermophila PT 1879471 1879471 t f f 1879471 1879471 Complete Genome ASM1494v1 n/a PRJNA15765 SAMN02598350 none none none 2006-10-25 GCA_000014945.1 none full none none none 1 1 1696 na 6 none none none none none 2006/10/25 0 2224 2 PT US DOE Joint Genome Institute 349307 d__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanotrichaceae;g__Methanothrix;s__Methanothrix thermoacetophila d__Archaea;p__Euryarchaeota;x__Stenosarchaea group;c__Methanomicrobia;o__Methanosarcinales;f__Methanotrichaceae;g__Methanothrix;s__Methanothrix thermoacetophila;x__Methanothrix thermoacetophila PT 0 1879471 11 44 assembly from synonym type material 1879471 0 none 1810 1 1879471 2 1472 2719 0 100 155726 k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanosaetaceae;g__Methanosaeta;s__ 1473 CP000477.1 1472 2719 0 100 CP000477.1617193.1618665 Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosaetaceae;Methanosaeta;Methanosaeta thermophila PT 0 18 43 0 Methanothrix_B thermoacetophila 5158 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/archaea/GCA_000014945.1/GCA_000014945.1_ASM1494v1_genomic.fna.gz 6 | GB_GCA_000720375.1_Rhodococcus rhodnii GB_GCA_000720375.1 0 99.47 3.85 495 o__Actinomycetales (UID2012) 282 0 11292191 90.9319757867 182 8230553 66.2776998819 12418284 GB_GCA_000720375.1 t d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;g__Kibdelosporangium;s__Kibdelosporangium sp000720375 type strain of species LPSN; DSMZ; StrainInfo f 14 14 746316 746316 1593 4 1525 JOAA01000102.1 1581 4 102 JOAA01000103.1 1525 2560 0 96.984 JNYM01000477.49.3165 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Kibdelosporangium;Kibdelosporangium aridum subsp. 
largum 68232 68232 f f t 319285 319285 Contig ASM72037v1 n/a PRJNA238534 SAMN02645355 182 319285 none 2014-7-2 GCA_000720375.1 none full none none none 0 0 0 na 0 none none none none none 2014/07/02 0 38312 0 NRRL B-16535 University of Illinois 38312 d__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Corynebacteriales;f__Nocardiaceae;g__Rhodococcus;s__Rhodococcus rhodnii d__Bacteria;x__Terrabacteria group;p__Actinobacteria;c__Actinobacteria;o__Corynebacteriales;f__Nocardiaceae;g__Rhodococcus;s__Rhodococcus rhodnii 0 12418284 none 0 assembly from type material 12418284 0 JOAA00000000.1 11452 182 1381 3 1301 2379 0 99.693 756603 k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinosynnemataceae;g__Kibdelosporangium;s__ 1319 JOAA01000108.1 1305 2410 0 100 JOAA01000108.1.1312 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Kibdelosporangium;Rhodococcus rhodnii 0 20 66 2 Kibdelosporangium sp000720375 193223 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/bacteria/GCA_000720375.1/GCA_000720375.1_ASM72037v1_genomic.fna.gz 7 | -------------------------------------------------------------------------------- /util_scripts/GTDB_metadata_filter.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # libraries 4 | suppressPackageStartupMessages(library("argparse")) 5 | suppressPackageStartupMessages(library("curl")) 6 | suppressPackageStartupMessages(library("data.table")) 7 | 8 | # create parser object 9 | parser <- ArgumentParser() 10 | 11 | # specifying options 12 | parser$add_argument("metadata_urls", nargs='+', help=">=1 url to GTDB metadata") 13 | parser$add_argument("-o", "--output", type='character', default='metadata.tsv', 14 | help="Output file name [default: %(default)s]") 15 | parser$add_argument("-c", "--columns", type='character', default='ncbi_organism_name,ncbi_genbank_assembly_accession,scaffold_count,contig_count,gc_percentage,genome_size,checkm_completeness,checkm_contamination,checkm_strain_heterogeneity,ncbi_assembly_level,ncbi_refseq_category,ncbi_species_taxid,ncbi_taxonomy,gtdb_taxonomy,mimag_high_quality,gtdb_representative', 16 | help="Table columns to keep [default: %(default)s]") 17 | parser$add_argument("-f", "--filter", type='character', default='gtdb_representative == "t" & checkm_completeness >= 50 & checkm_contamination < 5', 18 | help="Expression for filtering table rows [default: %(default)s]") 19 | #parser$add_argument("-t", "--tmpdir", type='character', default='GMF_TMP', 20 | # help="Directory for temporary output [default: %(default)s]") 21 | parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, 22 | help="Print extra output [default: %(default)s]") 23 | parser$add_argument("-q", "--quietly", action="store_false", 24 | dest="verbose", help="Print little output") 25 | args <- parser$parse_args() 26 | 27 | 28 | # reading in table(s) 29 | write(sprintf('Keeping columns: %s', args['columns']), stderr()) 30 | cols = unlist(strsplit(unlist(args['columns']), ',')) 31 | write('----', stderr()) 32 | 33 | df = list() 34 | tmpdir = NULL 35 | for(url in unlist(args['metadata_urls'])){ 36 | # download and uncompress tarball 37 | if(grepl('.tar.gz$', url)){ 38 | write('url points to tarball; downloading and uncompressing', stderr()) 39 | tmpdir = 'GTDB_metadata_filter_TMP' 40 | if(!
dir.exists(tmpdir)){ 41 | dir.create(tmpdir) 42 | } 43 | tmpfile = file.path(tmpdir, 'GTDB_metadata_filter_TMP.tar.gz') 44 | download.file(url, destfile=tmpfile) 45 | untar(tmpfile, exdir=tmpdir) 46 | url = file.path(tmpdir, gsub('.tar.gz$', '.tsv', basename(url))) 47 | } 48 | # read table 49 | write(sprintf('Reading in file: %s', url), stderr()) 50 | df[[url]] = fread(url, sep='\t', check.names=TRUE)[, ..cols] 51 | # clean up 52 | if(!is.null(tmpdir) & dir.exists(tmpdir)){ 53 | unlink(tmpdir, recursive = TRUE) 54 | } 55 | } 56 | 57 | df = do.call(rbind, df) 58 | x = as.character(nrow(df)) 59 | write(sprintf('Number of rows in the combined table: %s', x), stderr()) 60 | 61 | # Filtering 62 | x = unlist(args['filter'])[1] 63 | write(sprintf('Filtering rows by expression: %s', x), stderr()) 64 | df = df[eval(parse(text=x)),] 65 | x = as.character(nrow(df)) 66 | write(sprintf('Number of rows after filtering: %s', x), stderr()) 67 | 68 | # Writing table 69 | write(sprintf('Writing file to: %s', args['output']), stderr()) 70 | out_file = unlist(args['output']) 71 | fwrite(df, file=out_file, sep='\t', quote=FALSE, row.names=FALSE) -------------------------------------------------------------------------------- /util_scripts/database_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import gzip 7 | import bz2 8 | import resource 9 | import argparse 10 | import logging 11 | import functools 12 | import multiprocessing as mp 13 | # 3rd party 14 | import requests 15 | import bs4 16 | 17 | # logging 18 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 19 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 20 | argparse.RawDescriptionHelpFormatter): 21 | pass 22 | 23 | # argparse 24 | desc = 'Download Struo2 database files' 25 | epi = """DESCRIPTION: 26 | A helper script for downloading pre-built custom database files. 27 | Multiple GTDB releases & databases (eg., kraken2 or humann3) 28 | can be downloaded. 
29 | 30 | Note: use "--" to separate "--database" parameters from 31 | """ 32 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 33 | formatter_class=CustomFormatter) 34 | parser.add_argument('output_dir', type=str, 35 | help='Output directory') 36 | parser.add_argument('-r', '--release', type=str, nargs='+', 37 | choices = ['95', '202', '207'], default=['207'], 38 | help='GTDB release') 39 | parser.add_argument('-d', '--database', type=str, nargs='+', 40 | choices = ['kraken2', 'humann3', 'taxdump', 'phylogeny', 41 | 'metadata', 'genes'], default=['metadata'], 42 | help='Database(s) to download ') 43 | parser.add_argument('-u', '--base-url', type=str, 44 | default='http://ftp.tue.mpg.de/ebio/projects/struo2/', 45 | help='Base url for downloads') 46 | parser.add_argument('-t', '--threads', type=int, default=1, 47 | help='Parallel download processes') 48 | parser.add_argument('-m', '--max-recursion', type=int, default=1048576, 49 | help='Max recursion limit') 50 | parser.add_argument('--version', action='version', version='0.0.1') 51 | 52 | # functions 53 | def decode(x): 54 | """ 55 | Decoding input, if needed 56 | """ 57 | try: 58 | x = x.decode('utf-8') 59 | except AttributeError: 60 | pass 61 | return x 62 | 63 | def write_lines(url, l, out_dir): 64 | """" 65 | Writing lines obtained from requests 66 | """ 67 | with requests.get(url + '/' + l['href'].lstrip('/'), stream=True) as r: 68 | if r.status_code == 404: 69 | return None 70 | if l['href'] == 'database.kraken': # debug 71 | return None 72 | out_file = os.path.join(out_dir, l['href']) 73 | with open(out_file, 'w') as outF: 74 | for line in r.iter_lines(decode_unicode=True): 75 | outF.write(decode(line) + '\n') 76 | logging.info(f'File written: {out_file}') 77 | 78 | def write_chunks(url, l, out_dir): 79 | """ 80 | Writing chunks obtained from requests 81 | """ 82 | with requests.get(url + '/' + l['href'].lstrip('/'), stream=True) as r: 83 | if r.status_code == 404: 84 | return None 85 | out_file = os.path.join(out_dir, l['href']) 86 | with open(out_file, 'wb') as outF: 87 | for chunk in r.iter_content(chunk_size = 1024): 88 | if chunk: 89 | outF.write(chunk) 90 | logging.info(f'File written: {out_file}') 91 | 92 | def dl_file(l, url, out_dir): 93 | """ 94 | Download file from url 95 | """ 96 | if l['href'].startswith('?') or l['href'].endswith('/'): 97 | return None 98 | try: 99 | write_lines(url, l, out_dir) 100 | except UnicodeDecodeError: 101 | write_chunks(url, l, out_dir) 102 | return None 103 | 104 | def dl_files(base_url, release, database, out_dir, threads): 105 | """ 106 | List files from url and download all available 107 | """ 108 | # output directory 109 | out_dir = os.path.join(out_dir, release, database) 110 | if not os.path.isdir(out_dir): 111 | os.makedirs(out_dir) 112 | # base url: GET 113 | url = os.path.join(base_url, release, database) 114 | r = requests.get(url) 115 | if r.status_code == 404: 116 | logging.warning('WARNING: 404 status code for url: {}'.format(url)) 117 | return None 118 | # file urls: GET 119 | data = bs4.BeautifulSoup(r.text, 'html.parser') 120 | func = functools.partial(dl_file, url=url, out_dir=out_dir) 121 | bs4_list = [x for x in data.find_all('a')] 122 | if args.threads > 1: 123 | pool = mp.Pool(threads) 124 | pool.map(func, bs4_list) 125 | else: 126 | [x for x in map(func, bs4_list)] 127 | try: 128 | pool.close() 129 | except UnboundLocalError: 130 | pass 131 | 132 | def set_recursion(max_rec): 133 | """ 134 | max_rec = 0x100000 135 | """ 136 | 
resource.setrlimit(resource.RLIMIT_STACK, [0x100 * max_rec, resource.RLIM_INFINITY]) 137 | sys.setrecursionlimit(max_rec) 138 | logging.info(f'Max recursion set to: {max_rec}') 139 | 140 | def main(args): 141 | # args 142 | args.release = ['GTDB_release{}'.format(x) for x in args.release] 143 | # recursion 144 | set_recursion(args.max_recursion) 145 | # output 146 | if not os.path.isdir(args.output_dir): 147 | os.makedirs(args.output_dir) 148 | # list files 149 | for db in args.database: 150 | for release in args.release: 151 | dl_files(args.base_url, release, db, 152 | args.output_dir, args.threads) 153 | 154 | 155 | if __name__ == '__main__': 156 | args = parser.parse_args() 157 | main(args) 158 | -------------------------------------------------------------------------------- /util_scripts/genome_download.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # libraries 4 | suppressPackageStartupMessages(library("argparse")) 5 | suppressPackageStartupMessages(library("dplyr")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specifying options 11 | parser$add_argument("acc_table", nargs=1, help="Table containing assembly accessions (tab-delim with header)") 12 | parser$add_argument("-c", "--column", type='character', default='ncbi_genbank_assembly_accession', 13 | help="Column name containing accessions [default: %(default)s]") 14 | parser$add_argument("-o", "--output", type='character', default='.', 15 | help="Path for output [default: %(default)s]") 16 | parser$add_argument("-p", "--procs", type='integer', default=1, 17 | help="Number of parallel processes [default: %(default)s]") 18 | parser$add_argument("-r", "--retries", type='integer', default=3, 19 | help="Number of retries [default: %(default)s]") 20 | parser$add_argument("-d", "--database", type='character', default='genbank', 21 | help="database to download (-s flag for ncbi-genome-download) [default: %(default)s]") 22 | parser$add_argument("-x", "--params", type='character', default='archaea,bacteria', 23 | help="Filtering parameters for ncbi-genome-download [default: %(default)s]") 24 | parser$add_argument("-f", "--filter", action="store_true", default=FALSE, 25 | help="Check for 'fasta_file_path' and just download any accessions lacking values [default: %(default)s]") 26 | parser$add_argument("-s", "--skip", action="store_true", default=FALSE, 27 | help="Skip the genome downloading; useful if re-running to re-make the output table [default: %(default)s]") 28 | parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, 29 | help="Print extra output [default: %(default)s]") 30 | parser$add_argument("-q", "--quietly", action="store_false", 31 | dest="verbose", help="Print little output") 32 | args = parser$parse_args() 33 | 34 | 35 | # checking for executables 36 | exe = 'ncbi-genome-download' 37 | hits = unlist(Sys.which('ncbi-genome-download')) 38 | if(hits[1] == ''){ 39 | stop(sprintf('Cannot find executable: %s', exe)) 40 | } 41 | 42 | # reading in table 43 | x = unlist(args['acc_table'])[1] 44 | write(sprintf('Reading table: %s', x), stderr()) 45 | df = read.delim(x, sep='\t') 46 | write(sprintf('Number of rows: %s', nrow(df)), stderr()) 47 | 48 | # filtering table 49 | ## checking for "fasta_file_path" column 50 | filter_bool = unlist(args['filter'])[1] 51 | if(filter_bool == TRUE){ 52 | write('Filtering to just rows with NAs in `fasta_file_path` column', stderr()) 53 | if('fasta_file_path' %in% colnames(df)){ 54 | 
df_complete = filter(df, !(is.na(fasta_file_path) | fasta_file_path == '')) 55 | df = filter(df, is.na(fasta_file_path) | fasta_file_path == '') 56 | } else { 57 | stop('Cannot find column: "fasta_file_path"') 58 | } 59 | } 60 | 61 | ## Filtering based on user params & getting accessions 62 | write('Filtering out genomes lacking NCBI genbank assembly accession', stderr()) 63 | col = unlist(args['column'])[1] 64 | df = df[df[,col] != 'none',] 65 | write(sprintf('Number of rows after filtering: %s', nrow(df)), stderr()) 66 | ### just accessions 67 | df_acc = df[,col] 68 | df_acc = as.data.frame(df_acc) 69 | 70 | # creating temp file of accessions 71 | ## Creating output directory 72 | D = normalizePath(unlist(args['output'])[1]) 73 | dir.create(D, showWarnings = FALSE) 74 | ## writing table 75 | F = file.path(D, 'accession.txt') 76 | write(sprintf('Writing accessions to: %s', F), stderr()) 77 | write.table(df_acc, file=F, sep='\t', quote=FALSE, col.names=FALSE, row.names=FALSE) 78 | 79 | # calling ncbi genome download 80 | procs = as.character(unlist(args['procs'])[1]) 81 | retries = as.character(unlist(args['retries'])[1]) 82 | params = as.character(unlist(args['params'])[1]) 83 | database = as.character(unlist(args['database'])[1]) 84 | skip_bool = args['skip'][1] 85 | if(skip_bool != TRUE){ 86 | cmd = paste(c(exe, '-F', 'fasta', '-o', D, '-p', procs, '-r', retries, 87 | '-A', F, '-s', database, params), collapse=' ') 88 | write(sprintf('Running cmd: %s', cmd), stderr()) 89 | system(cmd) 90 | } else { 91 | write('Skipping genome download', stderr()) 92 | } 93 | 94 | # adding paths to genomes onto the table 95 | ## getting file paths 96 | D2 = file.path(D, database) 97 | fasta_files = list.files(D2, pattern='*.fna.gz', recursive=TRUE, full.names=TRUE) 98 | n_files = as.character(length(fasta_files)) 99 | write(sprintf('Number of fasta files found: %s', n_files), stderr()) 100 | ## Adding paths to input table 101 | write('Adding file paths to the input table', stderr()) 102 | fasta_files = data.frame(accession = gsub('.+/', '', fasta_files), 103 | fasta_file_path = fasta_files) 104 | fasta_files$accession = gsub('(GCA_[0-9]+\\.[0-9]+)_.+', '\\1', fasta_files$accession) 105 | df = left_join(df, fasta_files, by=setNames('accession', col)) 106 | 107 | # recombining tables (if --filter) 108 | if(filter_bool == TRUE){ 109 | df = rbind(df_complete, df) 110 | } 111 | 112 | # writing table 113 | write.table(df, file=stdout(), sep='\t', row.names=FALSE, quote=FALSE) 114 | 115 | # status 116 | n_rows = as.character(nrow(df)) 117 | write(sprintf('Number of rows in the output: %s', n_rows), stderr()) 118 | 119 | n_missing = as.character(nrow(df[is.na(df$fasta_file_path),])) 120 | write(sprintf('Number of rows with missing file paths: %s ', n_missing), stderr()) 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /util_scripts/genome_traitar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import gzip 7 | import bz2 8 | import argparse 9 | import logging 10 | from collections import defaultdict 11 | 12 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 13 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 14 | argparse.RawDescriptionHelpFormatter): 15 | pass 16 | 17 | desc = 'Converting traitar data to standardized table of traits' 18 | epi = """DESCRIPTION: 19 | 
Formatting traitar table with at least the following columns: 20 | [sample, phenotype, prediction] 21 | ... to the following: 22 | [genome, domain, phylum, class, order, family, genus, species, trait1, ..., traitN] 23 | 24 | A metadata file of all genomes is used to get the genome taxonomy data. 25 | 26 | Output written as tsv file to STDOUT. 27 | """ 28 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 29 | formatter_class=CustomFormatter) 30 | argparse.ArgumentDefaultsHelpFormatter 31 | parser.add_argument('traitar_output', metavar='traitar_output', type=str, 32 | help='Traitar output file') 33 | parser.add_argument('genome_metadata', metavar='genome_metadata', type=str, 34 | help='Tab-delim metadata file that contains at least 2 columns: "accession" & "gtdb_taxonomy') 35 | parser.add_argument('--version', action='version', version='0.0.1') 36 | 37 | 38 | def _open(infile, mode='rb'): 39 | """ 40 | Openning of input, regardless of compression 41 | """ 42 | if infile.endswith('.bz2'): 43 | return bz2.open(infile, mode) 44 | elif infile.endswith('.gz'): 45 | return gzip.open(infile, mode) 46 | else: 47 | return open(infile) 48 | 49 | def _decode(x): 50 | """ 51 | Decoding input, if needed 52 | """ 53 | try: 54 | x = x.decode('utf-8') 55 | except AttributeError: 56 | pass 57 | return x 58 | 59 | def parse_meta(infile): 60 | """ 61 | Return: {genome_accession : taxonomy} 62 | """ 63 | if infile is None: 64 | return None 65 | logging.info('Loading file: {}'.format(infile)) 66 | regex = re.compile(r'[^a-zA-Z0-9_-]+') 67 | header = {} 68 | meta = {} 69 | msg = 'Cannot find column "{}"' 70 | with _open(infile) as inF: 71 | for i,line in enumerate(inF): 72 | line = _decode(line).rstrip().split('\t') 73 | if i == 0: 74 | header = {x.lower():ii for ii,x in enumerate(line)} 75 | continue 76 | try: 77 | accession = regex.sub('_', str(line[header['accession']])) 78 | except KeyError: 79 | raise KeyError(msg.format('accession')) 80 | try: 81 | taxonomy = line[header['gtdb_taxonomy']].split(';') 82 | except KeyError: 83 | raise KeyError(msg.format('gtdb_taxonomy')) 84 | meta[accession] = taxonomy 85 | logging.info(' No. of accessions: {}'.format(len(meta.keys()))) 86 | return meta 87 | 88 | def parse_traitar(infile, model='phypat+PGL'): 89 | """ 90 | Parsing gene annotation file. 91 | Return: {genome : {trait : score}} 92 | """ 93 | logging.info('Loading file: {}'.format(infile)) 94 | trt = defaultdict(dict) 95 | header = {} 96 | status = {'records' : 0} 97 | with _open(infile) as inF: 98 | for i,line in enumerate(inF): 99 | # line parse 100 | line = _decode(line).rstrip().split('\t') 101 | if i == 0: 102 | header = {x:ii for ii,x in enumerate(line)} 103 | continue 104 | if len(line) < len(header.keys()): 105 | msg = 'line {}: less columns than header; skipping!' 
106 | logging.warning(msg.format(i+1)) 107 | continue 108 | # genome 109 | try: 110 | genome_name = line[header['genome']] 111 | except KeyError: 112 | msg = 'Cannot find "{}" column in "{}"' 113 | raise KeyError(msg.format('genome', infile)) 114 | # phenotype model 115 | try: 116 | phen_model = str(line[header['phenotype_model']]) 117 | except KeyError: 118 | msg = 'Cannot find "{}" column in "{}"' 119 | raise KeyError(msg.format('phenotype_model', infile)) 120 | except IndexError: 121 | msg = 'No model listed in line: {}' 122 | raise IndexError(msg.format(i+1)) 123 | if phen_model != model: 124 | continue 125 | # phenotype 126 | try: 127 | trt_name = str(line[header['phenotype']]) 128 | except KeyError: 129 | msg = 'Cannot find "{}" column in "{}"' 130 | raise KeyError(msg.format('phenotype', infile)) 131 | # phenotype score 132 | try: 133 | trt_score = line[header['prediction_score']] 134 | except KeyError: 135 | msg = 'Cannot find "{}" column in "{}"' 136 | raise KeyError(msg.format('prediction_score', infile)) 137 | # adding info 138 | trt[genome_name][trt_name] = trt_score 139 | status['records'] += 1 140 | # status 141 | logging.info(' No. of records: {}'.format(status['records'])) 142 | return trt 143 | 144 | def get_all_trt(trt): 145 | """ 146 | All phenotype names 147 | """ 148 | all_trt = set() 149 | for genome in trt.keys(): 150 | for trt_name in trt[genome].keys(): 151 | all_trt.add(trt_name) 152 | logging.info(' No. of trait columns: {}'.format(len(all_trt))) 153 | return sorted(all_trt) 154 | 155 | def write_trait_table(trt, meta): 156 | """ 157 | Writing table of annotations 158 | """ 159 | logging.info('Writing table to STDOUT...') 160 | # all annotations 161 | all_trt = get_all_trt(trt) 162 | header = ['genome', 'domain', 'phylum', 'class', 'order', 'family', 163 | 'genus', 'species'] 164 | print('\t'.join(header + all_trt)) 165 | status = {'records' : 0} 166 | for genome in meta.keys(): 167 | # genome taxonomy 168 | try: 169 | taxonomy = meta[genome] 170 | except KeyError: 171 | msg = 'Cannot find "{}" in metadata' 172 | raise KeyError(msg.format(genome)) 173 | # counts 174 | trt_cnts = [] 175 | for trt_name in all_trt: 176 | try: 177 | x = trt[genome][trt_name] 178 | except KeyError: 179 | x = 0 180 | trt_cnts.append(x) 181 | # writing line 182 | trt_cnts = [str(x) for x in trt_cnts] 183 | print('\t'.join([genome] + taxonomy + trt_cnts)) 184 | status['records'] += 1 185 | logging.info(' No. of records written: {}'.format(status['records'])) 186 | 187 | 188 | def main(args): 189 | # parsing genome metadata 190 | meta = parse_meta(args.genome_metadata) 191 | # loading traitar data 192 | trt = parse_traitar(args.traitar_output) 193 | # writing table 194 | write_trait_table(trt, meta) 195 | 196 | if __name__ == '__main__': 197 | args = parser.parse_args() 198 | main(args) 199 | -------------------------------------------------------------------------------- /util_scripts/tree_prune.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import sys,os 4 | import argparse 5 | import logging 6 | import tempfile 7 | import csv 8 | import urllib.request 9 | import codecs 10 | from distutils.spawn import find_executable 11 | import subprocess 12 | 13 | desc = 'Prune >=1 phylogeny' 14 | epi = """DESCRIPTION: 15 | Prune GTDB phylogeny (bacteria and/ archaea) 16 | to just the list of genome accessions provided. 
17 | 18 | `nw_prune` from the newick_utils toolset is used 19 | for pruning. 20 | 21 | Trees can be provided as files or urls to files. 22 | 23 | If >1 tree is provided, then the trees are merged; 24 | `--root-brlen` determines the brlens to the root. 25 | 26 | Output is written to STDOUT. 27 | """ 28 | parser = argparse.ArgumentParser(description=desc, 29 | epilog=epi, 30 | formatter_class=argparse.RawTextHelpFormatter) 31 | parser.add_argument('accs_to_keep', metavar='accs_to_keep', type=str, 32 | help='File of genome accessions to keep on the tree. 1 acc per line') 33 | parser.add_argument('tree_file', metavar='tree_file', type=str, nargs='+', 34 | help='>=1 newick file (or url to the file)') 35 | parser.add_argument('-r', '--root-brlen', type=float, default=0.0001, 36 | help='Root node branch length (default: %(default)s)') 37 | parser.add_argument('-s', '--skip-root-brlen', action='store_true', default=False, 38 | help='Don\'t add root node branch length (default: %(default)s)') 39 | parser.add_argument('--version', action='version', version='0.0.1') 40 | 41 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 42 | 43 | 44 | def read_tree(file_or_url, root_brlen, skip_root_brlen=False): 45 | """ Reading in tree file (downloading if url) """ 46 | logging.info('Reading tree: {}'.format(file_or_url)) 47 | try: 48 | line = urllib.request.urlopen(file_or_url).read().decode('utf-8') 49 | #inF = #csv.reader(codecs.iterdecode(ftpstream, 'utf-8')) 50 | except ValueError: 51 | line = open(file_or_url).read() 52 | 53 | line = line.rstrip().rstrip(';') 54 | if not skip_root_brlen: 55 | line += '100.0:{}'.format(root_brlen) 56 | 57 | return line 58 | 59 | def read_trees(tree_files, root_brlen, skip_root_brlen=False): 60 | """ reading in >= tree file (or url) """ 61 | trees = [] 62 | for F in tree_files: 63 | trees.append(read_tree(F, root_brlen, skip_root_brlen)) 64 | trees = '(' + ','.join(trees) + ')' 65 | trees += '100.0:{}'.format(root_brlen) 66 | return trees 67 | 68 | def prune_tree(tree_file, taxa_to_keep): 69 | """ Pruning trees via nw_prune """ 70 | cmd = ['nw_prune', '-v', '-f', tree_file, taxa_to_keep] 71 | cmd = ' '.join(cmd) 72 | logging.info('CMD: {}'.format(cmd)) 73 | try: 74 | res = subprocess.run(cmd, check=True, shell=True, 75 | stdout=subprocess.PIPE) 76 | except subprocess.CalledProcessError as e: 77 | raise e 78 | res = res.stdout.decode().rstrip() 79 | print(res) 80 | 81 | def main(args): 82 | # checking for newick_utils exe 83 | if find_executable('nw_prune') is None: 84 | msg = 'Cannot find "nw_prune" in PATH. Is newick_utils installed?' 85 | raise IOError(msg) 86 | 87 | # downloading/merging trees 88 | ## temp output file 89 | dirpath = tempfile.mkdtemp() 90 | tmpTree_name = os.path.join(dirpath, 'TMP.nwk') 91 | ## reading in trees 92 | with open(tmpTree_name, 'w') as tmpTree: 93 | if len(args.tree_file) > 1: 94 | tree = read_trees(args.tree_file, args.root_brlen, 95 | args.skip_root_brlen) 96 | else: 97 | tree = read_tree(args.tree_file[0], args.root_brlen, 98 | args.skip_root_brlen) 99 | tmpTree.write(tree + ';') 100 | 101 | # pruning 102 | prune_tree(tmpTree_name, args.accs_to_keep) 103 | 104 | 105 | if __name__ == '__main__': 106 | args = parser.parse_args() 107 | main(args) 108 | --------------------------------------------------------------------------------
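Example: chaining the util_scripts above into a small end-to-end run. This is a minimal sketch and not part of the repository; the GTDB metadata/tree URLs, output file names, and process counts are hypothetical placeholders, the accession list passed to tree_prune.py is assumed to contain one tip label per line matching the tree's tip names, and ncbi-genome-download plus newick_utils (nw_prune) are assumed to be installed and on PATH.

#!/bin/bash
# 1) filter the GTDB metadata down to representative, high-quality genomes
./util_scripts/GTDB_metadata_filter.R \
    -o metadata_filtered.tsv \
    https://data.gtdb.ecogenomic.org/releases/release95/95.0/bac120_metadata_r95.tar.gz

# 2) download the matching assemblies and append a fasta_file_path column
./util_scripts/genome_download.R \
    -o genomes -p 4 metadata_filtered.tsv \
    > metadata_with_paths.tsv

# 3) (optional) fetch pre-built Struo2 database files instead of building them locally;
#    "--" separates the multi-value --database option from the positional output directory
./util_scripts/database_download.py \
    -r 95 -d metadata taxdump -- struo2_dbs/

# 4) prune the GTDB phylogeny down to the genomes going into the custom database
./util_scripts/tree_prune.py \
    accs_to_keep.txt \
    https://data.gtdb.ecogenomic.org/releases/release95/95.0/bac120_r95.tree \
    > bac120_r95_pruned.nwk

The table written to STDOUT in step 2 (GTDB metadata columns plus a fasta_file_path column) has the same general layout as the tests/samples tables shown above and serves as the per-genome samples table for the Snakemake pipeline.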