├── .github └── workflows │ └── pythonpackage.yml ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── Snakefile ├── bin ├── Snakefile ├── db_create │ ├── Snakefile │ ├── bracken │ │ └── Snakefile │ ├── genes │ │ └── Snakefile │ ├── humann3 │ │ ├── Snakefile │ │ ├── db_create │ │ │ └── Snakefile │ │ ├── prepare_query │ │ │ └── Snakefile │ │ └── query │ │ │ ├── Snakefile │ │ │ ├── dmnd │ │ │ └── Snakefile │ │ │ └── mmseqs │ │ │ └── Snakefile │ └── kraken2 │ │ └── Snakefile ├── db_update │ ├── Snakefile │ ├── bracken │ │ └── Snakefile │ ├── genes │ │ ├── Snakefile │ │ ├── db_update │ │ │ └── Snakefile │ │ └── input │ │ │ ├── Snakefile │ │ │ ├── check │ │ │ └── Snakefile │ │ │ ├── from_gene_set │ │ │ └── Snakefile │ │ │ └── from_genomes │ │ │ └── Snakefile │ ├── humann3 │ │ ├── Snakefile │ │ ├── db_create │ │ │ └── Snakefile │ │ ├── input_from_genes │ │ │ └── Snakefile │ │ ├── prepare_query │ │ │ └── Snakefile │ │ ├── query_dmnd │ │ │ └── Snakefile │ │ └── query_mmseqs │ │ │ └── Snakefile │ └── kraken2 │ │ └── Snakefile ├── dirs ├── envs │ ├── genes.yaml │ ├── humann2.yaml │ ├── humann3.yaml │ ├── kraken2.yaml │ └── krakenuniq.yaml ├── scripts │ ├── add_user_seqs.py │ ├── annotate_genes.py │ ├── bracken-build.py │ ├── cat_files.py │ ├── check_gene_info.py │ ├── download_taxonomy.sh │ ├── filter_cluster_reps.py │ ├── filter_seqs.py │ ├── kraken2-build │ ├── kraken2_rename_genome.py │ ├── log_summarize.py │ ├── metaphlan_db_from_clusts.py │ ├── metaphlan_db_from_uniref.py │ ├── propagate_annotations.py │ ├── species_specific.py │ ├── uncomp.py │ ├── uncomp_tarball.py │ └── uniref_clst_trans.py └── utils │ └── Snakefile ├── conda_env.yaml ├── config-update.yaml ├── config.yaml ├── data └── GTDBr95_n5 │ ├── GCA_000014945.1_ASM1494v1_genomic.fna.gz │ ├── GCA_000720375.1_ASM72037v1_genomic.fna.gz │ ├── GCA_002478565.1_ASM247856v1_genomic.fna.gz │ ├── GCA_006715045.1_ASM671504v1_genomic.fna.gz │ ├── GCA_007116575.1_ASM711657v1_genomic.fna.gz │ └── GTDBr95_n5.tsv ├── img └── logo.png ├── notebooks ├── GTDB_release202 │ └── 01_metadata │ │ ├── 01_GTDB_metadata_summary.ipynb │ │ ├── 02_phylogeny.ipynb │ │ └── 03_Struo2.ipynb ├── GTDB_release95 │ ├── 01_metadata │ │ └── 01_GTDB_metadata_summary.ipynb │ ├── 02_struo_version_db-create_benchmarking │ │ ├── 01_benchmarking_UniRef50-90_db-create.ipynb │ │ └── 02_benchmarking_UniRef50-90_db-update.ipynb │ └── 03_GTDBr95_db │ │ ├── 01_phylogeny.ipynb │ │ └── 02_GTDBr95_create.ipynb ├── Misc │ └── 01_GTDB_release_summary.ipynb └── struo_dev │ └── 01_test_dataset.ipynb ├── snakemake_clean.sh ├── snakemake_conda-list.sh ├── snakemake_sge.sh ├── tests └── samples │ ├── GTDBr95_n10.tsv │ └── GTDBr95_n5.tsv └── util_scripts ├── GTDB_metadata_filter.R ├── database_download.py ├── genome_download.R ├── genome_gene_content.py ├── genome_mis-asmbl_sim.py ├── genome_traitar.py └── tree_prune.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Struo2 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | name: build (${{ matrix.python-version }}, ${{ matrix.os }}) 12 | runs-on: ubuntu-latest 13 | env: 14 | DATADIR: data 15 | strategy: 16 | matrix: 17 | python-version: [3.7] 18 | steps: 19 | - uses: conda-incubator/setup-miniconda@v2 20 | with: 21 | miniconda-version: 'latest' 22 | auto-update-conda: true 23 | python-version: ${{ matrix.python-version }} 24 | channels: conda-forge,bioconda 25 | channel-priority: strict 26 
| activate-environment: struo2 27 | - name: Checkout repository 28 | uses: actions/checkout@v2 29 | - name: Checkout submodules 30 | run: git submodule update --init --recursive 31 | - name: conda env setup 32 | shell: bash -l {0} 33 | run: | 34 | conda info -a 35 | conda env update -f conda_env.yaml python=${{ matrix.python-version }} 36 | conda list 37 | - name: taxdump db setup 38 | shell: bash -l {0} 39 | run: | 40 | mkdir -p $DATADIR 41 | wget --directory-prefix $DATADIR http://ftp.tue.mpg.de/ebio/projects/struo2/GTDB_release95/taxdump/taxdump.tar.gz 42 | tar -pzxvf $DATADIR/taxdump.tar.gz --directory $DATADIR 43 | - name: UniRef db setup 44 | shell: bash -l {0} 45 | run: | 46 | mkdir -p $DATADIR/UniRef90/ 47 | touch $DATADIR/UniRef90/uniref90 $DATADIR/UniRef90/uniref90.dbtype $DATADIR/UniRef90/uniref90.index 48 | touch $DATADIR/UniRef90/uniref90.lookup $DATADIR/UniRef90/uniref90.source 49 | touch $DATADIR/UniRef90/uniref90_h $DATADIR/UniRef90/uniref90_h.dbtype $DATADIR/UniRef90/uniref90_h.index 50 | wget --directory-prefix $DATADIR http://ftp.tue.mpg.de/ebio/projects/struo2/install/uniref_2019.01/uniref50-90.pkl 51 | - name: Reference genome download 52 | shell: bash -l {0} 53 | run: | 54 | wget --directory-prefix $DATADIR http://ftp.tue.mpg.de/ebio/projects/struo2/dev_data/genomes/GTDBr95_n10.tar.gz 55 | tar -pzxvf $DATADIR/GTDBr95_n10.tar.gz --directory $DATADIR 56 | - name: DB create tests 57 | shell: bash -l {0} 58 | run: | 59 | snakemake --use-conda --configfile config.yaml -j 1 -F --dryrun 60 | - name: DB update tests 61 | shell: bash -l {0} 62 | run: | 63 | echo "todo" #snakemake --use-conda --configfile config-update.yaml -j 1 -F --dryrun 64 | - name: Util script dependency tests 65 | shell: bash -l {0} 66 | run: | 67 | ./util_scripts/genome_download.R -h 68 | ./util_scripts/GTDB_metadata_filter.R -h 69 | ./util_scripts/tree_prune.py -h 70 | ./util_scripts/genome_mis-asmbl_sim.py -h 71 | - name: Tree pruning test 72 | shell: bash -l {0} 73 | run: | 74 | wget --directory-prefix $DATADIR http://ftp.tue.mpg.de/ebio/projects/struo2/dev_data/phylogeny.tar.gz 75 | tar -pzxvf $DATADIR/phylogeny.tar.gz --directory $DATADIR 76 | ./util_scripts/tree_prune.py data/phylogeny/accs_to_keep.txt data/phylogeny/ar122_r95.tree 77 | - name: Mis-assembly simulation test 78 | shell: bash -l {0} 79 | run: | 80 | ./util_scripts/genome_mis-asmbl_sim.py -b 2 -r 2 -c 2 -T ncbi_organism_name -F fasta_file_path data/GTDBr95_n10/GTDBr95_n10.tsv 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .snakemake/ 6 | tests/output*/ 7 | tmp/ 8 | .DS_Store 9 | ._.DS_Store 10 | .envrc 11 | ._* 12 | misc/ 13 | archive/ 14 | screenlog.* 15 | bin/scripts/metaphlan2/databases/ 16 | .ipynb_checkpoints/ 17 | no-log_jobs_sge.* 18 | *4test.yaml 19 | data/ 20 | notebooks_dev/ 21 | util_scripts/data 22 | 23 | # hg 24 | **/.hg/ 25 | **/.hg* 26 | 27 | 28 | # C extensions 29 | *.so 30 | 31 | # Distribution / packaging 32 | .Python 33 | env/ 34 | build/ 35 | develop-eggs/ 36 | #dist/ 37 | downloads/ 38 | eggs/ 39 | .eggs/ 40 | lib/ 41 | lib64/ 42 | parts/ 43 | sdist/ 44 | var/ 45 | *.egg-info/ 46 | .installed.cfg 47 | *.egg 48 | 49 | # PyInstaller 50 | # Usually these files are written by a python script from a template 51 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
52 | *.manifest 53 | *.spec 54 | 55 | # Installer logs 56 | pip-log.txt 57 | pip-delete-this-directory.txt 58 | 59 | # Unit test / coverage reports 60 | htmlcov/ 61 | .tox/ 62 | .coverage 63 | .coverage.* 64 | .cache 65 | nosetests.xml 66 | coverage.xml 67 | *,cover 68 | .hypothesis/ 69 | 70 | # Translations 71 | *.mo 72 | *.pot 73 | 74 | # Django stuff: 75 | *.log 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # pyenv python configuration file 84 | .python-version 85 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "bin/ll_pipeline_utils"] 2 | path = bin/ll_pipeline_utils 3 | url = https://github.com/leylabmpi/ll_pipeline_utils.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Nick Youngblut 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Snakefile: -------------------------------------------------------------------------------- 1 | # import 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import glob 7 | import socket 8 | import getpass 9 | import subprocess 10 | from distutils.spawn import find_executable 11 | import pandas as pd 12 | 13 | # load 14 | configfile: 'config.yaml' 15 | 16 | # general functions 17 | def ssw(x, *args, **kwargs): 18 | sys.stderr.write(x, *args, **kwargs) 19 | 20 | # setup 21 | ## pipeline utils 22 | snake_dir = config['pipeline']['snakemake_folder'] 23 | include: snake_dir + 'bin/ll_pipeline_utils/Snakefile' 24 | config_default(config, 'pipeline', 'name') 25 | ## custom functions 26 | def make_fasta_splits(n_jobs, zero_pad=3): 27 | if str(n_jobs).lstrip().startswith('Skip'): 28 | n_jobs = 1 29 | zero_pad = '{0:0' + str(zero_pad) + 'd}' 30 | return [str(zero_pad.format(x+1)) for x in range(n_jobs)] 31 | 32 | # setting paths 33 | config['samples_file'] = os.path.abspath(config['samples_file']) 34 | config['pipeline']['snakemake_folder'] = \ 35 | os.path.abspath(config['pipeline']['snakemake_folder']) + '/' 36 | 37 | # uniref 38 | config['uniref_name'] = str(config['uniref_name']).rstrip().lower() 39 | if config['uniref_name'] == 'uniref50': 40 | config['uniref_other_name'] = 'uniref90' 41 | elif config['uniref_name'] == 'uniref90': 42 | config['uniref_other_name'] = 'uniref50' 43 | else: 44 | msg = 'Only "uniref90" and "uniref50" supported for "uniref_name:". Value provided: {}' 45 | raise ValueError(msg.format(config['uniref_name'])) 46 | 47 | ## base of the snakefile hierarchy 48 | include: snake_dir + 'bin/Snakefile' 49 | include: snake_dir + 'bin/utils/Snakefile' 50 | 51 | ## pipeline main 52 | wildcard_constraints: 53 | sample="[^/]+", 54 | uniref="[^/]+" 55 | 56 | localrules: all 57 | 58 | rule all: 59 | input: 60 | all_which_input 61 | 62 | -------------------------------------------------------------------------------- /bin/Snakefile: -------------------------------------------------------------------------------- 1 | #-- settings for all workflows --# 2 | config['pipeline']['username'] = getpass.getuser() 3 | config['pipeline']['email'] = config['email'] 4 | 5 | #-- workflow selection --# 6 | if config['pipeline']['config'] == 'create': 7 | include: snake_dir + 'db_create/Snakefile' 8 | elif config['pipeline']['config'] == 'update': 9 | include: snake_dir + 'db_update/Snakefile' 10 | else: 11 | msg ='Pipeline "config" param not recognized: {}' 12 | raise ValueError(msg.format(config['pipeline']['config'])) 13 | 14 | 15 | # final output files (both db-create & db-update workflows) 16 | def all_which_input(wildcards): 17 | """ 18 | The final output files for both db_create & db_update 19 | """ 20 | F = [] 21 | # kraken2 22 | if (config['samples'] is not None and 23 | not skipped(config['databases']['kraken2'])): 24 | F.append(os.path.join(kraken2_dir, 'hash.k2d')) 25 | F.append(os.path.join(kraken2_dir, 'opts.k2d')) 26 | F.append(os.path.join(kraken2_dir, 'taxo.k2d')) 27 | F.append(os.path.join(kraken2_dir, 'seqid2taxid.map')) 28 | # bracken 29 | if not skipped(config['databases']['bracken']): 30 | F += expand(os.path.join(kraken2_dir, 'database{read_len}mers.kraken'), 31 | read_len = config['params']['bracken']['build_read_lens']) 32 | # genes 33 | if not skipped(config['databases']['genes']): 34 | F.append(genes_dir + 'genome_reps_filtered.fna.gz') 35 | 
F.append(genes_dir + 'genome_reps_filtered.faa.gz') 36 | F.append(genes_dir + 'genome_reps_filtered.txt.gz') 37 | if str(config['keep_intermediate']) == 'True': 38 | # mmseqs gene database 39 | F.append(genes_dir + 'genes_db.tar.gz') 40 | ## mmseqs cluster database 41 | F.append(genes_dir + 'cluster/clusters_db.tar.gz') 42 | ## cluster membership info 43 | F.append(genes_dir + 'cluster/clusters_membership.tsv.gz') 44 | ## cluster rep sequences 45 | F.append(genes_dir + 'cluster/clusters_reps.faa.gz') 46 | # humann3 47 | if (not skipped(config['databases']['humann3_bowtie2']) and 48 | not skipped(config['databases']['humann3_diamond'])): 49 | # multiple UniRef databases 50 | uniref_cutoffs = [config['uniref_name']] 51 | dmnd_names = [config['dmnd_name']] 52 | if config['uniref_name'] != 'uniref50' and not skipped(config['cluster_idx']): 53 | uniref_cutoffs.append('uniref50') 54 | x = re.sub('([Uu])ni([Rr])ef90', '\\1ni\\2ef50', config['dmnd_name']) 55 | dmnd_names.append(x) 56 | # intermediate files 57 | if str(config['keep_intermediate']) == 'True': 58 | # annotation hits 59 | F.append(humann3_dir + 'annotation_hits.gz') 60 | ## annotated genes (all) 61 | F += expand(humann3_dir + '{uniref}/genome_reps_filt_annot.fna.gz', 62 | uniref = uniref_cutoffs) 63 | F += expand(humann3_dir + '{uniref}/genome_reps_filt_annot.faa.gz', 64 | uniref = uniref_cutoffs) 65 | F += expand(humann3_dir + '{uniref}/genome_reps_filt_annot.tsv.gz', 66 | uniref = uniref_cutoffs) 67 | ## databases 68 | ### bowtie2 69 | if not skipped(config['databases']['humann3_bowtie2']): 70 | F += expand(os.path.join(humann3_dir + '{uniref}', 'bowtie2_build.done'), 71 | uniref = uniref_cutoffs) 72 | ### diamond 73 | if not skipped(config['databases']['humann3_diamond']): 74 | x = os.path.join(humann3_dir, '{uniref}', 'protein_database', '{dmnd}') 75 | for u,d in zip(uniref_cutoffs, dmnd_names): 76 | F.append(x.format(uniref=u,dmnd=d)) 77 | 78 | # metaphlan 79 | #if not skipped(config['databases']['metaphlan3']): 80 | # F += expand(config['tmp_dir'] + '{uniref}/metaphlan3/species_specific.txt', 81 | # uniref = uniref_cutoffs) 82 | 83 | # ret 84 | return F 85 | -------------------------------------------------------------------------------- /bin/db_create/Snakefile: -------------------------------------------------------------------------------- 1 | #-- Parsing input for db-create workflow --# 2 | 3 | # outdir 4 | config['output_dir'] = config['output_dir'].rstrip('/') + '/' 5 | print('\33[33mUsing output directory: {} \x1b[0m'.format(config['output_dir'])) 6 | 7 | # Samples table 8 | if not os.path.isfile(config['samples_file']): 9 | raise IOError('Cannot find file: {}'.format(config['samples_file'])) 10 | config['samples'] = pd.read_csv(config['samples_file'], sep='\t') 11 | ## Required columns 12 | for f in [config['samples_col'], config['accession_col'], config['fasta_file_path_col'], 13 | config['taxID_col'], config['taxonomy_col']]: 14 | if f not in config['samples'].columns: 15 | raise ValueError('Cannot find column: {}'.format(f)) 16 | config['samples'][config['samples_col']] = config['samples'][config['samples_col']].str.replace('[^A-Za-z0-9]+', '_', regex=True) 17 | config['samples'] = config['samples'].set_index(config['samples'][config['samples_col']]) 18 | 19 | ## check that files exist (skipping if not) 20 | rowID = 0 21 | to_rm = [] 22 | for index,row in config['samples'].iterrows(): 23 | rowID += 1 24 | file_cols = [config['fasta_file_path_col']] 25 | for f in file_cols: 26 | if not os.path.isfile(str(row[f])): 27 | 
msg = 'Samples table (Row {}): Cannot find file: {}; Skipping\n' 28 | sys.stderr.write(msg.format(rowID, row[f])) 29 | to_rm.append(row[config['samples_col']]) 30 | ssw('\33[33mNumber of skipped sample table entries: {}\n\x1b[0m'.format(len(to_rm))) 31 | config['samples'].drop(to_rm, inplace=True) 32 | if config['samples'].shape[0] < 1: 33 | raise ValueError('No genomes remaining after filtering!') 34 | config['samples_unique'] = config['samples'][config['samples_col']].unique().tolist() 35 | 36 | ## temp_folder 37 | config['tmp_dir'] = os.path.join(config['tmp_dir'], config['pipeline']['username']) 38 | config['tmp_dir'] = os.path.join(config['tmp_dir'], 'Struo2_' + str(os.stat('.').st_ino) + '/') 39 | print('\33[33mUsing temporary directory: {} \x1b[0m'.format(config['tmp_dir'])) 40 | 41 | ## batches 42 | config['params']['humann3']['splits'] = \ 43 | make_fasta_splits(config['params']['humann3']['batches']) 44 | 45 | ## including modular snakefiles 46 | print('\33[36m--Running db-create pipeline--\x1b[0m') 47 | snake_dir = config['pipeline']['snakemake_folder'] 48 | include: snake_dir + 'bin/dirs' 49 | ### kraken/bracken 50 | if not skipped(config['databases']['kraken2']): 51 | print('\33[36m* Creating kraken2 database\x1b[0m') 52 | include: snake_dir + 'bin/db_create/kraken2/Snakefile' 53 | if not skipped(config['databases']['bracken']): 54 | print('\33[36m* Creating bracken database\x1b[0m') 55 | include: snake_dir + 'bin/db_create/bracken/Snakefile' 56 | ### genes 57 | if not skipped(config['databases']['genes']): 58 | print('\33[36m* Creating genes database\x1b[0m') 59 | include: snake_dir + 'bin/db_create/genes/Snakefile' 60 | else: 61 | m = '\33[33m* Skipping creation of genes database;' 62 | m += ' assuming the database already exists!\x1b[0m' 63 | print(m) 64 | ### humann3 65 | if not skipped(config['databases']['humann3_bowtie2']) and \ 66 | not skipped(config['databases']['humann3_diamond']): 67 | print('\33[36m* Creating humann3 database\x1b[0m') 68 | include: snake_dir + 'bin/db_create/humann3/Snakefile' 69 | #if not skipped(config['databases']['metaphlan3']): 70 | # print('\33[36m* Creating metaphlan database\x1b[0m') 71 | # include: snake_dir + 'bin/db_create/metaphlan3/Snakefile' 72 | -------------------------------------------------------------------------------- /bin/db_create/bracken/Snakefile: -------------------------------------------------------------------------------- 1 | rule bracken_build: 2 | """ 3 | Build bracken database(s) from kraken2 database. 4 | One database per user-selected read length. 
5 | """ 6 | input: 7 | kraken2_db = kraken2_dir + 'hash.k2d', 8 | rm_done = kraken2_dir + 'tmp_db_rm.done' 9 | output: 10 | krk = kraken2_dir + 'database{read_len}mers.kraken', 11 | krkd = kraken2_dir + 'database{read_len}mers.kmer_distrib' 12 | params: 13 | kmer = config['params']['bracken']['build_kmer'], 14 | exe = config['pipeline']['script_folder'] + 'bracken-build.py', 15 | read_len = lambda wildcards: wildcards.read_len, 16 | threads: 17 | 12 18 | resources: 19 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 24, 20 | n = lambda wildcards, attempt, threads: threads, 21 | mem_gb_pt = lambda wildcards, attempt: attempt * 16 22 | conda: 23 | '../../envs/kraken2.yaml' 24 | log: 25 | log_dir + 'bracken_build/ReadLen{read_len}.log' 26 | benchmark: 27 | benchmark_dir + 'bracken_build/ReadLen{read_len}.txt' 28 | shell: 29 | """ 30 | # location of the kraken2 db files 31 | DB=`dirname {input.kraken2_db}` 32 | # removing existing files possibly created by bracken 33 | TMP_FILE=$DB"/database.kraken" 34 | rm -f $TMP_FILE 35 | # running bracken 36 | {params.exe} -t {threads} -d $DB \ 37 | -k {params.kmer} -l {params.read_len} \ 38 | 2> {log} 1>&2 39 | """ 40 | 41 | -------------------------------------------------------------------------------- /bin/db_create/humann3/Snakefile: -------------------------------------------------------------------------------- 1 | #-- humann database creation workflow --# 2 | 3 | # preparing of input sequences 4 | include: snake_dir + 'bin/db_create/humann3/prepare_query/Snakefile' 5 | # gene annotation 6 | include: snake_dir + 'bin/db_create/humann3/query/Snakefile' 7 | # creating the humann database 8 | include: snake_dir + 'bin/db_create/humann3/db_create/Snakefile' 9 | -------------------------------------------------------------------------------- /bin/db_create/humann3/db_create/Snakefile: -------------------------------------------------------------------------------- 1 | rule humann3_annotate_genes: 2 | """ 3 | Use search hits from clustered reps & index table to annotate all 4 | genome-derep genes. The annotation for each cluster rep is propagated 5 | to each member of the cluster. 
6 | """ 7 | input: 8 | hits = config['tmp_dir'] + 'humann3_search/hits.txt', 9 | tsv = config['tmp_dir'] + 'humann3/clusters_membership.tsv.gz', 10 | fna = config['tmp_dir'] + 'humann3/genome_reps_filtered.fna.gz', 11 | faa = config['tmp_dir'] + 'humann3/genome_reps_filtered.faa.gz', 12 | txt = config['tmp_dir'] + 'humann3/genome_reps_filtered.txt.gz' 13 | output: 14 | fna = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.fna'), 15 | faa = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.faa'), 16 | tsv = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.tsv'), 17 | params: 18 | exe = config['pipeline']['script_folder'] + 'propagate_annotations.py', 19 | params = config['params']['humann3']['propagate_annotations'] 20 | resources: 21 | time = lambda wildcards, attempt: attempt ** 3 * 59, 22 | mem_gb_pt = lambda wildcards, attempt: attempt ** 4 * 3 + 11, 23 | log: 24 | log_dir + 'humann3_annotate_genes/all.log' 25 | benchmark: 26 | benchmark_dir + 'humann3_annotate_genes/all.txt' 27 | shell: 28 | """ 29 | OUTDIR=`dirname {output.fna}` 30 | mkdir -p $OUTDIR 2> {log} 31 | {params.exe} {params.params} \ 32 | --in-nuc {input.fna} \ 33 | --out-nuc {output.fna} \ 34 | --out-prot {output.faa} \ 35 | {input.hits} {input.faa} \ 36 | {input.txt} {input.tsv} \ 37 | > {output.tsv} 2>> {log} 38 | """ 39 | 40 | rule humann3_annotate_hits_copy: 41 | """ 42 | Copying/compressing query hits (diamond or mmseqs) to the final output directory 43 | """ 44 | input: 45 | hits = config['tmp_dir'] + 'humann3_search/hits.txt' 46 | output: 47 | hits = humann3_dir + 'annotation_hits.gz' 48 | params: 49 | ionice = config['params']['ionice'] 50 | resources: 51 | time = lambda wildcards, attempt: attempt ** 3 * 59 52 | log: 53 | log_dir + 'humann3_annotate_hits_copy/all.log' 54 | shell: 55 | """ 56 | ionice {params.ionice} gzip -c {input.hits} > {output.hits} 2> {log} 57 | """ 58 | 59 | rule humann3_alt_annotate: 60 | """ 61 | Re-annotating with different UniRef cluster resolution 62 | """ 63 | input: 64 | fna = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.fna', 65 | faa = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.faa', 66 | tsv = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.tsv', 67 | idx = ancient(config['cluster_idx']) 68 | output: 69 | fna = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 70 | '/genome_reps_filt_annot.fna'), 71 | faa = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 72 | '/genome_reps_filt_annot.faa'), 73 | tsv = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 74 | '/genome_reps_filt_annot.tsv') 75 | params: 76 | exe = config['pipeline']['script_folder'] + 'uniref_clst_trans.py' 77 | resources: 78 | time = lambda wildcards, attempt: attempt ** 3 * 59, 79 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 3 + 9 80 | log: 81 | log_dir + 'humann3_annotate_genes/all.log' 82 | benchmark: 83 | benchmark_dir + 'humann3_annotate_genes/all.txt' 84 | shell: 85 | """ 86 | OUTDIR=`dirname {output.fna}` 87 | mkdir -p $OUTDIR 2> {log} 88 | {params.exe} {input.idx} \ 89 | --in-nuc {input.fna} \ 90 | --in-prot {input.faa} \ 91 | --in-tsv {input.tsv} \ 92 | --out-nuc {output.fna} \ 93 | --out-prot {output.faa} \ 94 | --out-tsv {output.tsv} \ 95 | 2>> {log} 1>&2 96 | """ 97 | 98 | rule humann3_annotate_genes_copy: 99 | """ 100 | Copying/compressing annotated gene files to the final directory 101 | """ 102 | input: 103 | fna = 
config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.fna', 104 | faa = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.faa', 105 | tsv = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.tsv' 106 | output: 107 | fna = humann3_dir + '{uniref}/genome_reps_filt_annot.fna.gz', 108 | faa = humann3_dir + '{uniref}/genome_reps_filt_annot.faa.gz', 109 | tsv = humann3_dir + '{uniref}/genome_reps_filt_annot.tsv.gz' 110 | params: 111 | ionice = config['params']['ionice'] 112 | resources: 113 | time = lambda wildcards, attempt: attempt ** 3 * 59 114 | log: 115 | log_dir + 'humann3_annotate_genes_copy/{uniref}.log' 116 | benchmark: 117 | benchmark_dir + 'humann3_annotate_genes_copy/{uniref}.txt' 118 | shell: 119 | """ 120 | ionice {params.ionice} gzip -c {input.fna} > {output.fna} 2> {log} 121 | ionice {params.ionice} gzip -c {input.faa} > {output.faa} 2>> {log} 122 | ionice {params.ionice} gzip -c {input.tsv} > {output.tsv} 2>> {log} 123 | """ 124 | 125 | rule humann3_bowtie2_build: 126 | """ 127 | Running bowtie2 build on combined, annotated genes (nucleotide) 128 | """ 129 | input: 130 | fna = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.fna' 131 | output: 132 | touch(os.path.join(humann3_dir, '{uniref}/bowtie2_build.done')) 133 | conda: 134 | '../../../envs/humann3.yaml' 135 | threads: 136 | 12 137 | resources: 138 | time = lambda wildcards, attempt: attempt * 2 * 60 * 24, 139 | n = lambda wildcards, attempt, threads: threads, 140 | mem_gb_pt = lambda wildcards, attempt: attempt ** 2 * 2 + 8, 141 | lg_idx = lambda wildcards, attempt: '--large-index' if attempt > 1 else '' 142 | log: 143 | log_dir + 'humann3_bowtie2_build/{uniref}.log' 144 | benchmark: 145 | benchmark_dir + 'humann3_bowtie2_build/{uniref}.txt' 146 | shell: 147 | """ 148 | OUTDIR=`dirname {output}` 149 | PREFIX="$OUTDIR/all_genes_annot" 150 | bowtie2-build --threads {threads} {resources.lg_idx} \ 151 | {input.fna} $PREFIX 2> {log} 1>&2 152 | 153 | # check that output exists 154 | IDX_FILES=`find $OUTDIR -maxdepth 1 -name "*.bt2*"` 155 | IDX_FILES=`echo $IDX_FILES | perl -pe 's/ +/\n/g' | wc -l` 156 | if [ $IDX_FILES -lt 1 ]; then 157 | echo "ERROR: no bowtie2 index files found!" 
>> {log} 158 | exit 1 159 | fi 160 | touch {output} 2>> {log} 161 | """ 162 | 163 | rule humann3_diamond_makedb: 164 | """ 165 | Running diamond makedb on combined, annotated genes (amino acid) 166 | """ 167 | input: 168 | faa = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.faa' 169 | output: 170 | humann3_dir + '{uniref}/protein_database/{dmnd}' 171 | params: 172 | tmp_dir = config['tmp_dir'] 173 | conda: 174 | '../../../envs/humann3.yaml' 175 | resources: 176 | time = lambda wildcards, attempt: attempt * 2 * 60 * 24 + 60 * 24, 177 | mem_gb_pt = lambda wildcards, attempt: (attempt ** 3 + 3) * 12 178 | log: 179 | log_dir + 'humann3_diamond_makedb/{uniref}/{dmnd}.log' 180 | benchmark: 181 | benchmark_dir + 'humann3_diamond_makedb/{uniref}/{dmnd}.txt' 182 | shell: 183 | """ 184 | PREF=`echo {output} | perl -pe 's/\.[^.]+$//'` 185 | diamond makedb --in {input.faa} -d $PREF 2> {log} 1>&2 186 | """ 187 | 188 | 189 | -------------------------------------------------------------------------------- /bin/db_create/humann3/prepare_query/Snakefile: -------------------------------------------------------------------------------- 1 | #-- preparing input for annotation & humann database construction --# 2 | 3 | rule humann3_copy_input: 4 | """ 5 | Copying input to temp directory 6 | """ 7 | input: 8 | faa_c = genes_dir + 'cluster/clusters_reps.faa.gz', 9 | mem_c = genes_dir + 'cluster/clusters_membership.tsv.gz', 10 | fna = genes_dir + 'genome_reps_filtered.fna.gz', 11 | faa = genes_dir + 'genome_reps_filtered.faa.gz', 12 | txt = genes_dir + 'genome_reps_filtered.txt.gz' 13 | output: 14 | faa_c = temp(config['tmp_dir'] + 'humann3/clusters_reps.faa.gz'), 15 | mem_c = temp(config['tmp_dir'] + 'humann3/clusters_membership.tsv.gz'), 16 | fna = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.fna.gz'), 17 | faa = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.faa.gz'), 18 | txt = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.txt.gz') 19 | params: 20 | ionice = config['params']['ionice'] 21 | resources: 22 | time = lambda wildcards, attempt: attempt ** 3 * 59 23 | log: 24 | log_dir + 'humann3_copy_input/all.log' 25 | shell: 26 | """ 27 | ionice {params.ionice} cp -f {input.faa_c} {output.faa_c} 2> {log} 1>&2 28 | ionice {params.ionice} cp -f {input.mem_c} {output.mem_c} 2>> {log} 1>&2 29 | ionice {params.ionice} cp -f {input.faa} {output.faa} 2>> {log} 1>&2 30 | ionice {params.ionice} cp -f {input.fna} {output.fna} 2>> {log} 1>&2 31 | ionice {params.ionice} cp -f {input.txt} {output.txt} 2>> {log} 1>&2 32 | """ 33 | 34 | rule humann3_batch_seqs: 35 | """ 36 | Splitting gene fasta for distributed searching (annotation) 37 | """ 38 | input: 39 | faa = config['tmp_dir'] + 'humann3/clusters_reps.faa.gz' 40 | output: 41 | done = config['tmp_dir'] + 'humann3_search/split.done', 42 | splt = temp(expand(config['tmp_dir'] + \ 43 | 'humann3_search/stdin.part_{splitID}.fasta', 44 | splitID=config['params']['humann3']['splits'])) 45 | params: 46 | n_splits = config['params']['humann3']['batches'] 47 | threads: 48 | 4 49 | resources: 50 | time = lambda wildcards, attempt: int(round(attempt ** 4 * 59,0)), 51 | n = lambda wildcards, attempt, threads: threads, 52 | mem_gb_pt = lambda wildcards, attempt, threads: int(round(attempt ** 3 * 10.0 / threads,0)) 53 | conda: 54 | '../../../envs/genes.yaml' 55 | log: 56 | log_dir + 'humann3_batch_seqs/all.log' 57 | benchmark: 58 | benchmark_dir + 'humann3_batch_seqs/all.txt' 59 | shell: 60 | """ 61 | OUTDIR=`dirname {output.done} 2> {log}` 62 | seqkit 
shuffle -j {threads} {input.faa} 2>> {log} | \ 63 | seqkit split -j {threads} --by-part {params.n_splits} \ 64 | --out-dir $OUTDIR 2>> {log} 1>&2 65 | touch {output.done} 66 | """ 67 | 68 | -------------------------------------------------------------------------------- /bin/db_create/humann3/query/Snakefile: -------------------------------------------------------------------------------- 1 | #-- gene annotation (for humann database) workflow --# 2 | 3 | if (not skipped(config['params']['humann3']['mmseqs_search']['db']) and 4 | not skipped(config['params']['humann3']['mmseqs_search']['index']) and 5 | not skipped(config['params']['humann3']['mmseqs_search']['run'])): 6 | # checking on database 7 | if not re.search(config['uniref_name'], 8 | str(config['params']['humann3']['mmseqs_search']['db']).lower()): 9 | print('\33[35m * WARNING the uniref_name does not match the query database\x1b[0m') 10 | print('\33[35m * ({} <=> {})\x1b[0m'.format(config['uniref_name'], config['params']['humann3']['mmseqs_search']['db'])) 11 | # mmseqs search 12 | print('\33[36m * Using "mmseqs search" for annotating genes\x1b[0m') 13 | include: snake_dir + 'bin/db_create/humann3/query/mmseqs/Snakefile' 14 | elif (not skipped(config['params']['humann3']['diamond']['db']) and 15 | not skipped(config['params']['humann3']['diamond']['run'])): 16 | # checking on database 17 | if not re.search(config['uniref_name'], 18 | str(config['params']['humann3']['diamond']['db']).lower()): 19 | print('\33[35m * WARNING the uniref_name does not match the query database\x1b[0m') 20 | print('\33[35m * ({} <=> {})\x1b[0m'.format(config['uniref_name'], config['params']['humann3']['diamond']['db'])) 21 | # diamond blastp 22 | print('\33[36m * Using "diamond blastp" for annotating genes\x1b[0m') 23 | include: snake_dir + 'bin/db_create/humann3/query/dmnd/Snakefile' 24 | else: 25 | print('\33[31m ERROR: all query methods skipped!\x1b[0m') 26 | 27 | -------------------------------------------------------------------------------- /bin/db_create/humann3/query/dmnd/Snakefile: -------------------------------------------------------------------------------- 1 | rule humann3_diamond_db_copy: 2 | """ 3 | Copying the user-provided DIAMOND database to the temp directory 4 | """ 5 | input: 6 | db = ancient(config['params']['humann3']['diamond']['db']) 7 | output: 8 | db = temp(config['tmp_dir'] + 'humann3_search/humann3_dmnd_db.dmnd') 9 | params: 10 | ionice = config['params']['ionice'] 11 | resources: 12 | time = lambda wildcards, attempt: attempt ** 3 * 59, 13 | mem_gb_pt = lambda wildcards, attempt: attempt * 4 14 | log: 15 | log_dir + 'humann3_diamond_db_copy/all.log' 16 | benchmark: 17 | benchmark_dir + 'humann3_diamond_db_copy/all.txt' 18 | shell: 19 | """ 20 | ionice {params.ionice} cp -f {input} {output} 2> {log} 1>&2 21 | """ 22 | 23 | def dmnd_start_mem(wildcards, attempt, threads=12): 24 | """ 25 | Estimating the baseline memory to use for jobs, given the diamond database size 26 | """ 27 | prot_db_size = os.stat(config['tmp_dir'] + 'humann3_search/humann3_dmnd_db.dmnd').st_size / 1e9 28 | mem = round(prot_db_size * 5 / threads + 1.499,0) 29 | mem = (attempt - 1) * 2 + mem 30 | return int(mem) 31 | 32 | rule humann3_diamond_pass1: 33 | """ 34 | Annotating genes via 'diamond blastp' search of UniRef DB 35 | """ 36 | input: 37 | faa = config['tmp_dir'] + 'humann3_search/stdin.part_{splitID}.fasta', 38 | dmnd_db = config['tmp_dir'] + 'humann3_search/humann3_dmnd_db.dmnd' 39 | output: 40 | hits = temp(config['tmp_dir'] + 
'humann3_search/hits_pass1/{splitID}.txt'), 41 | unaln = temp(config['tmp_dir'] + 'humann3_search/unaln/{splitID}.faa') 42 | params: 43 | params = config['params']['humann3']['diamond']['run'], 44 | tmp_dir = config['tmp_dir'] + 'humann3_search_TMP/{splitID}/' 45 | threads: 46 | 8 47 | resources: 48 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 12, 49 | n = lambda wildcards, attempt, threads: threads, 50 | mem_gb_pt = dmnd_start_mem 51 | conda: 52 | '../../../../envs/humann3.yaml' 53 | log: 54 | log_dir + 'humann3_diamond_pass1/{splitID}.log' 55 | benchmark: 56 | benchmark_dir + 'humann3_diamond_pass1/{splitID}.txt' 57 | shell: 58 | """ 59 | TMPDIR="{params.tmp_dir}" 60 | mkdir -p $TMPDIR 2> {log} 61 | 62 | # diamond run 63 | diamond blastp {params.params} \ 64 | --tmpdir $TMPDIR --threads {threads} \ 65 | -q {input.faa} -d {input.dmnd_db} \ 66 | -o {output.hits} --un {output.unaln} \ 67 | --outfmt 6 qseqid sseqid evalue pident length slen \ 68 | 2>> {log} 1>&2 69 | """ 70 | 71 | rule humann3_diamond_pass2: 72 | """ 73 | Annotating genes via diamond search of UniRef DB (sensitive mode). 74 | Just running on genes not annotated by Pass1. 75 | """ 76 | input: 77 | faa = config['tmp_dir'] + 'humann3_search/unaln/{splitID}.faa', 78 | dmnd_db = config['tmp_dir'] + 'humann3_search/humann3_dmnd_db.dmnd' 79 | output: 80 | hits = temp(config['tmp_dir'] + 'humann3_search/hits_pass2/{splitID}.txt') 81 | params: 82 | params = config['params']['humann3']['diamond']['run'], 83 | tmp_dir = config['tmp_dir'] + 'humann3_search_TMP/{splitID}/' 84 | threads: 85 | 8 86 | resources: 87 | time = lambda wildcards, attempt: attempt * 60 * 48, 88 | n = lambda wildcards, attempt, threads: threads, 89 | mem_gb_pt = dmnd_start_mem 90 | conda: 91 | '../../../../envs/humann3.yaml' 92 | log: 93 | log_dir + 'humann3_diamond_pass2/{splitID}.log' 94 | benchmark: 95 | benchmark_dir + 'humann3_diamond_pass2/{splitID}.txt' 96 | shell: 97 | """ 98 | NSEQ=`seqkit seq -n {input.faa} | wc -l 2> {log}` 99 | if [[ "$NSEQ" -gt "0" ]]; then 100 | TMPDIR="{params.tmp_dir}" 101 | mkdir -p $TMPDIR 2>> {log} 102 | # diamond run 103 | diamond blastp --sensitive {params.params} \ 104 | --tmpdir $TMPDIR --threads {threads} \ 105 | -q {input.faa} -d {input.dmnd_db} -o {output.hits} \ 106 | --outfmt 6 qseqid sseqid evalue pident length slen \ 107 | 2>> {log} 1>&2 108 | else 109 | touch {output.hits} 2> {log} 1>&2 110 | echo "No unaligned sequences. 
Skipping DIAMOND" >> {log} 111 | fi 112 | """ 113 | 114 | localrules: humann3_diamond_merge 115 | 116 | rule humann3_diamond_merge: 117 | """ 118 | Merging the results of the 2 DIAMOND passes (all query fasta splits) 119 | """ 120 | input: 121 | hits1 = expand(config['tmp_dir'] + \ 122 | 'humann3_search/hits_pass1/{splitID}.txt', 123 | splitID=config['params']['humann3']['splits']), 124 | hits2 = expand(config['tmp_dir'] + \ 125 | 'humann3_search/hits_pass2/{splitID}.txt', 126 | splitID=config['params']['humann3']['splits']) 127 | output: 128 | hits = temp(config['tmp_dir'] + 'humann3_search/hits.txt') 129 | resources: 130 | time = lambda wildcards, attempt: attempt ** 3 * 59 131 | run: 132 | with open(output.hits, 'w') as outF: 133 | for F in input.hits1 + input.hits2: 134 | with open(F) as inF: 135 | for line in inF: 136 | outF.write(line) 137 | 138 | -------------------------------------------------------------------------------- /bin/db_create/kraken2/Snakefile: -------------------------------------------------------------------------------- 1 | if not config['names_dmp'].startswith('Skip') and \ 2 | not config['nodes_dmp'].startswith('Skip'): 3 | localrules: kraken2_cp_dump 4 | rule kraken2_cp_dump: 5 | """ 6 | Copying names/nodes taxdump files to kraken2 db directory 7 | """ 8 | input: 9 | names = ancient(config['names_dmp']), 10 | nodes = ancient(config['nodes_dmp']) 11 | output: 12 | names = config['tmp_dir'] + 'kraken2/taxonomy/names.dmp', 13 | nodes = config['tmp_dir'] + 'kraken2/taxonomy/nodes.dmp' 14 | log: 15 | log_dir + 'kraken2_cp_dump/all.log' 16 | shell: 17 | """ 18 | cp -f {input.names} {output.names} 2> {log} 19 | cp -f {input.nodes} {output.nodes} 2>> {log} 20 | chmod u+w {output.names} {output.nodes} 2>> {log} 21 | """ 22 | else: 23 | localrules: kraken2_build_download_tax 24 | rule kraken2_build_download_tax: 25 | """ 26 | Downloading NCBI taxdump files 27 | """ 28 | output: 29 | gb = config['tmp_dir'] + 'kraken2/taxonomy/nucl_gb.accession2taxid', 30 | wgs = config['tmp_dir'] + 'kraken2/taxonomy/nucl_wgs.accession2taxid', 31 | dump = config['tmp_dir'] + 'kraken2/taxonomy/taxdump.tar.gz', 32 | nodes = config['tmp_dir'] + 'kraken2/taxonomy/nodes.dmp', 33 | names = config['tmp_dir'] + 'kraken2/taxonomy/names.dmp', 34 | merged = config['tmp_dir'] + 'kraken2/taxonomy/merged.dmp' 35 | params: 36 | exe = config['pipeline']['script_folder'] + 'kraken2-build' 37 | conda: 38 | '../../envs/kraken2.yaml' 39 | log: 40 | log_dir + 'kraken_build_download_tax/all.log' 41 | benchmark: 42 | benchmark_dir + 'kraken_build_download_tax/all.txt' 43 | shell: 44 | """ 45 | OUTDIR=`dirname {output.gb}` 46 | OUTDIR=`dirname $OUTDIR` 47 | rm -rf $OUTDIR 2> {log} 48 | mkdir -p $OUTDIR 2>> {log} 49 | echo "# Downloading NCBI taxonomy to $OUTDIR" >> {log} 50 | {params.exe} --use-ftp --download-taxonomy --db $OUTDIR 2>> {log} 1>&2 51 | """ 52 | 53 | def kraken2_add_taxID_get_taxID(wildcards): 54 | """ 55 | Getting genome taxID from the user input table 56 | """ 57 | taxID = config['samples'].loc[wildcards.sample, config['taxID_col']] 58 | try: 59 | taxID = taxID.astype(str) 60 | except AttributeError: 61 | pass 62 | return taxID 63 | 64 | rule kraken2_add_taxID: 65 | """ 66 | Adding a taxonomy ID to the header of each genome. 67 | Assuming the taxID is in the samples table. 68 | Writing edited genome to temp dir. 
69 | 70 | Format: `kraken:taxid||` 71 | """ 72 | input: 73 | fasta = lambda wildcards: \ 74 | config['samples'].loc[wildcards.sample, config['fasta_file_path_col']] 75 | output: 76 | temp(config['tmp_dir'] + 'genomes/{sample}.fna') 77 | resources: 78 | time = lambda wildcards, attempt: attempt ** 2 * 59, 79 | mem_gb_pt = lambda wildcards, attempt: attempt * 6 80 | params: 81 | taxID = kraken2_add_taxID_get_taxID, 82 | exe = config['pipeline']['script_folder'] + 'kraken2_rename_genome.py' 83 | log: 84 | log_dir + 'kraken2_add_taxID/{sample}.log' 85 | benchmark: 86 | benchmark_dir + 'kraken2_add_taxID/{sample}.txt' 87 | shell: 88 | """ 89 | {params.exe} {input.fasta} {params.taxID} > {output} 2> {log} 90 | """ 91 | 92 | rule kraken2_build_add: 93 | """ 94 | Adding genome fasta files to the kraken database. 95 | Using the --add-to-library flag 96 | """ 97 | input: 98 | fasta = config['tmp_dir'] + 'genomes/{sample}.fna', 99 | nodes = config['tmp_dir'] + 'kraken2/taxonomy/nodes.dmp', 100 | names = config['tmp_dir'] + 'kraken2/taxonomy/names.dmp' 101 | output: 102 | done = temp(config['tmp_dir'] + 'kraken2/added/{sample}.done') 103 | threads: 104 | 4 105 | resources: 106 | time = lambda wildcards, attempt: attempt ** 2 * 59, 107 | n = lambda wildcards, attempt, threads: threads, 108 | mem_gb_pt = lambda wildcards, attempt: attempt * 3 109 | conda: 110 | '../../envs/kraken2.yaml' 111 | log: 112 | log_dir + 'kraken2_build_add/{sample}.log' 113 | benchmark: 114 | benchmark_dir + 'kraken2_build_add/{sample}.txt' 115 | shell: 116 | """ 117 | DB=`dirname {input.names}` 118 | DB=`dirname $DB` 119 | 120 | kraken2-build --threads {threads} \ 121 | --db $DB --add-to-library {input.fasta} \ 122 | 2> {log} 1>&2 123 | touch {output.done} 2>> {log} 124 | """ 125 | 126 | rule kraken2_build: 127 | """ 128 | Building the kraken database 129 | """ 130 | input: 131 | expand(config['tmp_dir'] + 'kraken2/added/{sample}.done', 132 | sample = config['samples_unique']) 133 | output: 134 | hash = temp(config['tmp_dir'] + 'kraken2/hash.k2d'), 135 | opts = temp(config['tmp_dir'] + 'kraken2/opts.k2d'), 136 | map = temp(config['tmp_dir'] + 'kraken2/seqid2taxid.map'), 137 | taxo = temp(config['tmp_dir'] + 'kraken2/taxo.k2d') 138 | threads: 139 | 12 140 | resources: 141 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 24, 142 | n = lambda wildcards, attempt, threads: threads, 143 | mem_gb_pt = lambda wildcards, attempt: int(round(attempt * 18 + 4,0)) 144 | conda: 145 | '../../envs/kraken2.yaml' 146 | log: 147 | log_dir + 'kraken2_build/all.log' 148 | benchmark: 149 | benchmark_dir + 'kraken2_build/all.txt' 150 | shell: 151 | """ 152 | DB=`dirname {output.hash}` 153 | kraken2-build --build --threads {threads} --db $DB 2> {log} 1>&2 154 | """ 155 | 156 | rule kraken2_db_copy: 157 | """ 158 | Copying the kraken2 database to the output directory 159 | """ 160 | input: 161 | hash = config['tmp_dir'] + 'kraken2/hash.k2d', 162 | opts = config['tmp_dir'] + 'kraken2/opts.k2d', 163 | taxo = config['tmp_dir'] + 'kraken2/taxo.k2d', 164 | map = config['tmp_dir'] + 'kraken2/seqid2taxid.map' 165 | output: 166 | hash = kraken2_dir + 'hash.k2d', 167 | opts = kraken2_dir + 'opts.k2d', 168 | taxo = kraken2_dir + 'taxo.k2d', 169 | map = kraken2_dir + 'seqid2taxid.map' 170 | params: 171 | keep = config['keep_intermediate'], 172 | ionice = config['params']['ionice'] 173 | resources: 174 | time = lambda wildcards, attempt: attempt ** 3 * 59 175 | log: 176 | log_dir + 'kraken2_db_copy/all.log' 177 | benchmark: 178 | benchmark_dir + 
'kraken2_db_copy/all.txt' 179 | shell: 180 | """ 181 | if [ "{params.keep}" == "True" ]; then 182 | echo "# copying entire kraken db" > {log} 183 | DIR1=`dirname {input.hash}` 184 | DIR2=`dirname {output.hash}` 185 | rm -rf $DIR2 2>> {log} 1>&2 186 | ionice {params.ionice} cp -rf $DIR1 $DIR2 2>> {log} 1>&2 187 | else 188 | echo "# copying just built kraken index files" > {log} 189 | ionice {params.ionice} cp -f {input.hash} {output.hash} 2>> {log} 1>&2 190 | ionice {params.ionice} cp -f {input.opts} {output.opts} 2>> {log} 1>&2 191 | ionice {params.ionice} cp -f {input.map} {output.map} 2>> {log} 1>&2 192 | ionice {params.ionice} cp -f {input.taxo} {output.taxo} 2>> {log} 1>&2 193 | fi 194 | """ 195 | 196 | rule kraken2_tmp_db_rm: 197 | """ 198 | Removing temporary kraken2 db directory 199 | """ 200 | input: 201 | hash_tmp = config['tmp_dir'] + 'kraken2/hash.k2d', 202 | hash = kraken2_dir + 'hash.k2d', 203 | opts = kraken2_dir + 'opts.k2d', 204 | taxo = kraken2_dir + 'taxo.k2d', 205 | map = kraken2_dir + 'seqid2taxid.map' 206 | output: 207 | done = kraken2_dir + 'tmp_db_rm.done' 208 | resources: 209 | time = lambda wildcards, attempt: attempt ** 3 * 59 210 | log: 211 | log_dir + 'kraken2_tmp_db_rm/all.log' 212 | shell: 213 | """ 214 | rm -rf `dirname {input.hash_tmp}` 2> {log} 1>&2 215 | touch {output.done} 2>> {log} 216 | """ 217 | -------------------------------------------------------------------------------- /bin/db_update/Snakefile: -------------------------------------------------------------------------------- 1 | #-- database update workflow --# 2 | import gzip 3 | 4 | # input processing 5 | ## outdir 6 | config['output_dir'] = config['output_dir'].rstrip('/') + '/' 7 | print('\33[33mUsing output directory: {} \x1b[0m'.format(config['output_dir'])) 8 | 9 | ## Samples table 10 | if skipped(os.path.split(config['samples_file'])[1]): 11 | config['samples'] = None 12 | else: 13 | if not os.path.isfile(config['samples_file']): 14 | raise IOError('Cannot find file: {}'.format(config['samples_file'])) 15 | config['samples'] = pd.read_csv(config['samples_file'], sep='\t') 16 | ### required columns 17 | for f in [config['samples_col'], config['accession_col'], config['fasta_file_path_col'], 18 | config['taxID_col'], config['taxonomy_col']]: 19 | if f not in config['samples'].columns: 20 | raise ValueError('Cannot find column: {}'.format(f)) 21 | config['samples'][config['samples_col']] = config['samples'][config['samples_col']].str.replace('[^A-Za-z0-9]+', '_', regex=True) 22 | config['samples'] = config['samples'].set_index(config['samples'][config['samples_col']]) 23 | 24 | ### check that files exist (skipping if not) 25 | if config['samples'] is not None: 26 | rowID = 0 27 | to_rm = [] 28 | for index,row in config['samples'].iterrows(): 29 | rowID += 1 30 | file_cols = [config['fasta_file_path_col']] 31 | for f in file_cols: 32 | if not os.path.isfile(str(row[f])): 33 | msg = 'Samples table (Row {}): Cannot find file: {}; Skipping\n' 34 | sys.stderr.write(msg.format(rowID, row[f])) 35 | to_rm.append(row[config['samples_col']]) 36 | ssw('\33[33mNumber of skipped sample table entries: {}\n\x1b[0m'.format(len(to_rm))) 37 | config['samples'].drop(to_rm, inplace=True) 38 | if config['samples'].shape[0] < 1: 39 | raise ValueError('No genomes remaining after filtering!') 40 | config['samples_unique'] = config['samples'][config['samples_col']].unique().tolist() 41 | 42 | # check that user-gene info (if provided) has all of the required columns 43 | if (not 
skipped(config['new_genes']['amino_acid']) or 44 | not skipped(config['new_genes']['nucleotide'])): 45 | if skipped(config['new_genes']['metadata']) or config['new_genes']['metadata'] == '': 46 | raise IOError('User-provided genes metadata file is required, but was not provided!') 47 | req_cols = ['seq_uuid', 'seq_orig_name', 'genus', 'species', 'taxid'] 48 | if config['new_genes']['metadata'].endswith('.gz'): 49 | _open = lambda x: gzip.open(x, 'rb') 50 | else: 51 | _open = lambda x: open(x) 52 | with _open(config['new_genes']['metadata']) as inF: 53 | for i,line in enumerate(inF): 54 | if i > 0: 55 | break 56 | if config['new_genes']['metadata'].endswith('.gz'): 57 | line = line.decode('utf-8') 58 | line = line.rstrip().split('\t') 59 | missing = [x for x in req_cols if not x in line] 60 | if len(missing) > 0: 61 | msg = 'Missing required columns in user-provided genes metadata file: {}' 62 | raise ValueError(msg.format(','.join(missing))) 63 | if line[0] != 'seq_uuid': 64 | msg = 'The first column of the gene metadata table must be "seq_uuid"' 65 | raise ValueError(msg) 66 | 67 | ## temp_folder 68 | config['pipeline']['username'] = getpass.getuser() 69 | config['pipeline']['email'] = config['email'] 70 | config['tmp_dir'] = os.path.join(config['tmp_dir'], config['pipeline']['username']) 71 | config['tmp_dir'] = os.path.join(config['tmp_dir'], 'Struo2_' + str(os.stat('.').st_ino) + '/') 72 | print('\33[33mUsing temporary directory: {} \x1b[0m'.format(config['tmp_dir'])) 73 | 74 | ## batches 75 | config['params']['humann3']['splits'] = \ 76 | make_fasta_splits(config['params']['humann3']['batches']) 77 | 78 | ## including modular snakefiles 79 | print('\33[36m--Running db-update pipeline--\x1b[0m') 80 | snake_dir = config['pipeline']['snakemake_folder'] 81 | include: snake_dir + 'bin/dirs' 82 | ### Adding genomes to kraken/bracken (user cannot provide gene list) 83 | if (config['samples'] is not None and 84 | not skipped(config['databases']['kraken2'])): 85 | print('\33[36m* Updating kraken2 database\x1b[0m') 86 | include: snake_dir + 'bin/db_update/kraken2/Snakefile' 87 | if not skipped(config['databases']['bracken']): 88 | print('\33[36m* Updating bracken database\x1b[0m') 89 | include: snake_dir + 'bin/db_update/bracken/Snakefile' 90 | ### Updating genes db 91 | if not skipped(config['databases']['genes']): 92 | print('\33[36m* Updating genes database\x1b[0m') 93 | include: snake_dir + 'bin/db_update/genes/Snakefile' 94 | ### updating humann databases 95 | if not skipped(config['databases']['humann3_bowtie2']) and \ 96 | not skipped(config['databases']['humann3_diamond']): 97 | print('\33[36m* Updating humann database\x1b[0m') 98 | include: snake_dir + 'bin/db_update/humann3/Snakefile' 99 | 100 | -------------------------------------------------------------------------------- /bin/db_update/bracken/Snakefile: -------------------------------------------------------------------------------- 1 | rule bracken_build: 2 | """ 3 | Build bracken database from kraken2 database 4 | """ 5 | input: 6 | kraken2_db = kraken2_dir + 'hash.k2d' 7 | output: 8 | krk = kraken2_dir + 'database{read_len}mers.kraken', 9 | krkd = kraken2_dir + 'database{read_len}mers.kmer_distrib' 10 | params: 11 | kmer = config['params']['bracken']['build_kmer'], 12 | exe = config['pipeline']['script_folder'] + 'bracken-build.py', 13 | read_len = lambda wildcards: wildcards.read_len, 14 | threads: 15 | 12 16 | resources: 17 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 24, 18 | n = lambda wildcards, attempt, threads: 
threads, 19 | mem_gb_pt = lambda wildcards, attempt: attempt * 16 20 | conda: 21 | '../../envs/kraken2.yaml' 22 | log: 23 | log_dir + 'bracken_build/ReadLen{read_len}.log' 24 | benchmark: 25 | benchmark_dir + 'bracken_build/ReadLen{read_len}.txt' 26 | shell: 27 | """ 28 | # location of the kraken2 db files 29 | DB=`dirname {input.kraken2_db}` 30 | # removing existing files possibly created by bracken 31 | TMP_FILE=$DB"/database.kraken" 32 | rm -f $TMP_FILE 33 | # running bracken 34 | {params.exe} -t {threads} -d $DB \ 35 | -k {params.kmer} -l {params.read_len} \ 36 | 2> {log} 1>&2 37 | """ 38 | 39 | -------------------------------------------------------------------------------- /bin/db_update/genes/Snakefile: -------------------------------------------------------------------------------- 1 | #-- gene database update workflow --# 2 | 3 | # Adding genes to genes_db (user can provide genomes or gene list) 4 | include: snake_dir + 'bin/db_update/genes/input/Snakefile' 5 | # updating the mmseqs cluster database 6 | include: snake_dir + 'bin/db_update/genes/db_update/Snakefile' 7 | -------------------------------------------------------------------------------- /bin/db_update/genes/input/Snakefile: -------------------------------------------------------------------------------- 1 | #-- dealing with all possible gene inputs for this workflow --# 2 | 3 | # checking/validating input 4 | include: snake_dir + 'bin/db_update/genes/input/check/Snakefile' 5 | # gene input 6 | if (not skipped(config['new_genes']['amino_acid']) or 7 | not skipped(config['new_genes']['nucleotide'])): 8 | # assuming user provided a set of genes 9 | print('\33[36m * Using user-provided set of gene sequences\x1b[0m') 10 | include: snake_dir + 'bin/db_update/genes/input/from_gene_set/Snakefile' 11 | else: 12 | # assuming the user provided a list of genomes to extract genes from 13 | print('\33[36m * Extracting new genes from user-provided genomes\x1b[0m') 14 | include: snake_dir + 'bin/db_update/genes/input/from_genomes/Snakefile' 15 | -------------------------------------------------------------------------------- /bin/db_update/genes/input/check/Snakefile: -------------------------------------------------------------------------------- 1 | #-- checking/validating input formats --# 2 | 3 | localrules: genes_check_input 4 | 5 | if skipped(config['genes_db']['genes']['nucleotide']): 6 | rule genes_check_input: 7 | """ 8 | Checking that input is formatted correctly. 9 | Skipping nucleotide check. 
10 | """ 11 | input: 12 | faa = config['genes_db']['genes']['amino_acid'], 13 | txt = config['genes_db']['genes']['metadata'] 14 | output: 15 | fna = temp(config['tmp_dir'] + 'db_update/orig.fna'), 16 | faa = temp(config['tmp_dir'] + 'db_update/orig.faa'), 17 | txt = temp(config['tmp_dir'] + 'db_update/orig.txt'), 18 | done = genes_dir + 'genes_input.check.done' 19 | params: 20 | ionice = config['params']['ionice'], 21 | exe1 = config['pipeline']['script_folder'] + 'cat_files.py', 22 | exe2 = config['pipeline']['script_folder'] + 'check_gene_info.py' 23 | conda: 24 | '../../../../envs/genes.yaml' 25 | resources: 26 | time = lambda wildcards, attempt: attempt ** 2 * 59 27 | log: 28 | log_dir + 'genes_check_input/all.log' 29 | shell: 30 | """ 31 | # copy 32 | touch {output.fna} 2> {log} 1>&2 33 | ionice {params.ionice} seqkit seq -v {input.faa} > {output.faa} 2>> {log} 34 | ionice {params.ionice} {params.exe1} {input.txt} > {output.txt} 2>> {log} 35 | # check on data content 36 | {params.exe2} {input.faa} {input.txt} 2>> {log} 1>&2 37 | touch {output.done} 2>> {log} 38 | """ 39 | else: 40 | rule genes_check_input: 41 | """ 42 | Checking that input is formatted correctly. 43 | """ 44 | input: 45 | fna = config['genes_db']['genes']['nucleotide'], 46 | faa = config['genes_db']['genes']['amino_acid'], 47 | txt = config['genes_db']['genes']['metadata'] 48 | output: 49 | fna = temp(config['tmp_dir'] + 'db_update/orig.fna'), 50 | faa = temp(config['tmp_dir'] + 'db_update/orig.faa'), 51 | txt = temp(config['tmp_dir'] + 'db_update/orig.txt'), 52 | done = genes_dir + 'genes_input.check.done' 53 | params: 54 | ionice = config['params']['ionice'], 55 | exe1 = config['pipeline']['script_folder'] + 'cat_files.py', 56 | exe2 = config['pipeline']['script_folder'] + 'check_gene_info.py' 57 | conda: 58 | '../../../../envs/genes.yaml' 59 | resources: 60 | time = lambda wildcards, attempt: attempt ** 2 * 59 61 | log: 62 | log_dir + 'genes_check_input/all.log' 63 | shell: 64 | """ 65 | # copy 66 | ionice {params.ionice} seqkit seq -v {input.fna} > {output.fna} 2> {log} 67 | ionice {params.ionice} seqkit seq -v {input.faa} > {output.faa} 2>> {log} 68 | ionice {params.ionice} {params.exe1} {input.txt} > {output.txt} 2>> {log} 69 | # check on data content 70 | {params.exe2} -n {input.fna} {input.faa} {input.txt} 2>> {log} 1>&2 71 | touch {output.done} 2>> {log} 72 | """ 73 | 74 | -------------------------------------------------------------------------------- /bin/db_update/genes/input/from_genomes/Snakefile: -------------------------------------------------------------------------------- 1 | #-- obtaining new genes from a list of genomes & adding them to the existing genes database --# 2 | 3 | rule prodigal: 4 | """ 5 | For each genome, running prodigal to call genes 6 | """ 7 | input: 8 | fasta = lambda wildcards: \ 9 | config['samples'].loc[wildcards.sample, config['fasta_file_path_col']] 10 | output: 11 | fna = temp(config['tmp_dir'] + 'db_update/prodigal/{sample}.fna'), 12 | faa = temp(config['tmp_dir'] + 'db_update/prodigal/{sample}.faa'), 13 | gbk = temp(config['tmp_dir'] + 'db_update/prodigal/{sample}.gbk') 14 | params: 15 | params = config['params']['genes']['prodigal'] 16 | resources: 17 | time = lambda wildcards, attempt: attempt ** 2 * 59, 18 | n = lambda wildcards, attempt, threads: threads, 19 | mem_gb_pt = lambda wildcards, attempt: attempt * 8 + 14 20 | conda: 21 | '../../../../envs/genes.yaml' 22 | log: 23 | log_dir + 'prodigal/{sample}.log' 24 | benchmark: 25 | benchmark_dir + 
'prodigal/{sample}.txt' 26 | shell: 27 | """ 28 | gunzip -c {input.fasta} | \ 29 | prodigal {params.params} \ 30 | -o {output.gbk} -d {output.fna} -a {output.faa} \ 31 | 2> {log} 1>&2 32 | """ 33 | 34 | rule vsearch_per_genome: 35 | """ 36 | For each genome, clustering genes (at nuc level) and taking the centroid. 37 | """ 38 | input: 39 | fna = config['tmp_dir'] + 'db_update/prodigal/{sample}.fna', 40 | faa = config['tmp_dir'] + 'db_update/prodigal/{sample}.faa' 41 | output: 42 | reps = temp(config['tmp_dir'] + 'db_update/vsearch/{sample}_reps.fna') 43 | params: 44 | params = config['params']['genes']['vsearch_per_genome'] 45 | threads: 46 | 4 47 | resources: 48 | time = lambda wildcards, attempt: attempt ** 3 * 59, 49 | n = lambda wildcards, attempt, threads: threads, 50 | mem_gb_pt = lambda wildcards, attempt: attempt * 3 51 | conda: 52 | '../../../../envs/genes.yaml' 53 | log: 54 | log_dir + 'vsearch_per_genome/{sample}.log' 55 | benchmark: 56 | benchmark_dir + 'vsearch_per_genome/{sample}.txt' 57 | shell: 58 | """ 59 | vsearch {params.params} \ 60 | --threads {threads} \ 61 | --cluster_fast {input.fna} \ 62 | --centroids {output.reps} \ 63 | 2> {log} 1>&2 64 | """ 65 | 66 | rule filter_gene_seqs: 67 | """ 68 | Filtering the amino acid gene sequences to just the vsearch cluster reps (nucleotide). 69 | Renaming as [seq_name]|gene_length|taxonomy 70 | """ 71 | input: 72 | fasta = lambda wildcards: \ 73 | config['samples'].loc[wildcards.sample, config['fasta_file_path_col']], 74 | reps = config['tmp_dir'] + 'db_update/vsearch/{sample}_reps.fna', 75 | faa = config['tmp_dir'] + 'db_update/prodigal/{sample}.faa' 76 | output: 77 | fna = temp(config['tmp_dir'] + 'db_update/nuc_filtered/{sample}_reps.fna'), 78 | faa = temp(config['tmp_dir'] + 'db_update/prot_filtered/{sample}_reps.faa'), 79 | txt = temp(config['tmp_dir'] + 'db_update/names_filtered/{sample}_reps.txt') 80 | params: 81 | exe = config['pipeline']['script_folder'] + 'filter_seqs.py', 82 | tax = lambda wildcards: \ 83 | config['samples'].loc[wildcards.sample, config['taxonomy_col']], 84 | taxID = lambda wildcards: \ 85 | config['samples'].loc[wildcards.sample, config['taxID_col']], 86 | acc = lambda wildcards: \ 87 | config['samples'].loc[wildcards.sample, config['accession_col']] 88 | resources: 89 | time = lambda wildcards, attempt: attempt ** 2 * 59, 90 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 8 91 | conda: 92 | '../../../../envs/genes.yaml' 93 | log: 94 | log_dir + 'filter_gene_seqs/{sample}.log' 95 | benchmark: 96 | benchmark_dir + 'filter_gene_seqs/{sample}.txt' 97 | shell: 98 | """ 99 | {params.exe} --taxonomy "{params.tax}" \ 100 | --taxID {params.taxID} \ 101 | --accession {params.acc} \ 102 | --genome-file {input.fasta} \ 103 | {input.reps} {input.faa} \ 104 | {output.fna} {output.faa} \ 105 | > {output.txt} 2> {log} 106 | """ 107 | 108 | rule genes_combine_fna: 109 | """ 110 | For all per-genome de-replicated genes of all genomes, combining into 1 collection. 111 | Including original genes. 
112 | """ 113 | input: 114 | fna1 = config['tmp_dir'] + 'db_update/orig.fna', 115 | fna2 = expand(config['tmp_dir'] + 'db_update/nuc_filtered/{sample}_reps.fna', 116 | sample = config['samples_unique']) 117 | output: 118 | fna = temp(config['tmp_dir'] + 'db_update/filtered_reps.fna') 119 | resources: 120 | time = lambda wildcards, attempt: attempt ** 2 * 59 121 | run: 122 | cat_files([input.fna1], input.fna2, outfile=output.fna) 123 | 124 | rule genes_combine_faa: 125 | """ 126 | For all per-genome de-replicated genes of all genomes, combining into 1 collection. 127 | Including original genes. 128 | """ 129 | input: 130 | faa1 = config['tmp_dir'] + 'db_update/orig.faa', 131 | faa2 = expand(config['tmp_dir'] + 'db_update/prot_filtered/{sample}_reps.faa', 132 | sample = config['samples_unique']) 133 | output: 134 | faa = temp(config['tmp_dir'] + 'db_update/filtered_reps.faa') 135 | resources: 136 | time = lambda wildcards, attempt: attempt ** 2 * 59 137 | run: 138 | cat_files([input.faa1], input.faa2, outfile=output.faa) 139 | 140 | rule genes_combine_txt: 141 | """ 142 | For all per-genome de-replicated genes of all genomes, combining into 1 collection. 143 | Including original genes. 144 | """ 145 | input: 146 | txt1 = config['tmp_dir'] + 'db_update/orig.txt', 147 | txt2 = expand(config['tmp_dir'] + \ 148 | 'db_update/names_filtered/{sample}_reps.txt', 149 | sample = config['samples_unique']) 150 | output: 151 | txt = temp(config['tmp_dir'] + 'db_update/filtered_reps.txt') 152 | resources: 153 | time = lambda wildcards, attempt: attempt ** 2 * 59 154 | run: 155 | cat_files([input.txt1], input.txt2, outfile=output.txt, header=True) 156 | 157 | rule copy_gene_info: 158 | """ 159 | Copying/compressing gene data to the output directory. 160 | """ 161 | input: 162 | fna = config['tmp_dir'] + 'db_update/filtered_reps.fna', 163 | faa = config['tmp_dir'] + 'db_update/filtered_reps.faa', 164 | txt = config['tmp_dir'] + 'db_update/filtered_reps.txt' 165 | output: 166 | fna = genes_dir + 'genome_reps_filtered.fna.gz', 167 | faa = genes_dir + 'genome_reps_filtered.faa.gz', 168 | txt = genes_dir + 'genome_reps_filtered.txt.gz' 169 | params: 170 | ionice = config['params']['ionice'] 171 | resources: 172 | time = lambda wildcards, attempt: attempt ** 3 * 59, 173 | mem_gb_pt = lambda wildcards, attempt: attempt * 8 174 | log: 175 | log_dir + 'copy_gene_info/all.log' 176 | benchmark: 177 | benchmark_dir + 'copy_gene_info/all.txt' 178 | shell: 179 | """ 180 | ionice {params.ionice} gzip -c {input.fna} > {output.fna} 2> {log} 181 | ionice {params.ionice} gzip -c {input.faa} > {output.faa} 2>> {log} 182 | ionice {params.ionice} gzip -c {input.txt} > {output.txt} 2>> {log} 183 | """ 184 | 185 | rule mmseqs_db_create: 186 | """ 187 | Creating mmseqs2 database that will be used for updating the existing cluster database. 188 | This database includes all genes (original & new). 
189 | """ 190 | input: 191 | fna = config['tmp_dir'] + 'db_update/filtered_reps.fna', 192 | faa = config['tmp_dir'] + 'db_update/filtered_reps.faa', 193 | txt = config['tmp_dir'] + 'db_update/filtered_reps.txt' 194 | output: 195 | db = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db'), 196 | db_t = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db.dbtype'), 197 | db_i = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db.index'), 198 | db_l = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db.lookup'), 199 | db_s = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db.source'), 200 | db_h = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db_h'), 201 | db_ht = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db_h.dbtype'), 202 | db_hi = temp(config['tmp_dir'] + 'db_update/all_genes/genes_db_h.index') 203 | resources: 204 | time = lambda wildcards, attempt: attempt ** 3 * 59, 205 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 20 + 20 206 | conda: 207 | '../../../../envs/genes.yaml' 208 | log: 209 | log_dir + 'mmseqs_db_create/all.log' 210 | benchmark: 211 | benchmark_dir + 'mmseqs_db_create/all.txt' 212 | shell: 213 | """ 214 | mmseqs createdb {input.faa} {output.db} 2> {log} 1>&2 215 | """ 216 | 217 | -------------------------------------------------------------------------------- /bin/db_update/humann3/Snakefile: -------------------------------------------------------------------------------- 1 | # input 2 | if not skipped(config['databases']['genes']): 3 | print('\33[36m * Using updated genes database\x1b[0m') 4 | include: snake_dir + 'bin/db_update/humann3/input_from_genes/Snakefile' 5 | else: 6 | msg = '\33[35m X For user-provided gene sequences' 7 | msg += ' you must update the genes database!\x1b[0m' 8 | print(msg) 9 | sys.exit(1) 10 | include: snake_dir + 'bin/db_update/humann3/prepare_query/Snakefile' 11 | # query 12 | if (not skipped(config['params']['humann3']['mmseqs_search']['db']) and 13 | not skipped(config['params']['humann3']['mmseqs_search']['index']) and 14 | not skipped(config['params']['humann3']['mmseqs_search']['run'])): 15 | # checking on database 16 | if not re.search(config['uniref_name'], 17 | str(config['params']['humann3']['mmseqs_search']['db']).lower()): 18 | print('\33[35m * WARNING the uniref_name does not match the query database\x1b[0m') 19 | print('\33[35m * ({} <=> {})\x1b[0m'.format(config['uniref_name'], 20 | config['params']['humann3']['mmseqs_search']['db'])) 21 | # mmseqs search 22 | print('\33[36m * Annotating via "mmseqs search"\x1b[0m') 23 | include: snake_dir + 'bin/db_update/humann3/query_mmseqs/Snakefile' 24 | elif (not skipped(config['params']['humann3']['diamond']['db']) and 25 | not skipped(config['params']['humann3']['diamond']['run'])): 26 | # checking on database 27 | if not re.search(config['uniref_name'], 28 | str(config['params']['humann3']['diamond']['db']).lower()): 29 | print('\33[35m * WARNING the uniref_name does not match the query database\x1b[0m') 30 | print('\33[35m * ({} <=> {})\x1b[0m'.format(config['uniref_name'], 31 | config['params']['humann3']['diamond']['db'])) 32 | # diamond blastp 33 | print('\33[36m * Annotating via "diamond blastp"\x1b[0m') 34 | include: snake_dir + 'bin/db_update/humann3/query_dmnd/Snakefile' 35 | else: 36 | print('\33[31m ERROR: all query methods skipped!\x1b[0m') 37 | # database creation 38 | include: snake_dir + 'bin/db_update/humann3/db_create/Snakefile' 39 | -------------------------------------------------------------------------------- 
/bin/db_update/humann3/db_create/Snakefile: -------------------------------------------------------------------------------- 1 | def which_membership(wildcards): 2 | """ 3 | Which membership file to use as input 4 | """ 5 | if not skipped(config['databases']['genes']): 6 | return genes_dir + 'cluster/clusters_membership.tsv.gz' 7 | else: 8 | return config['humann_db']['cluster']['membership'] 9 | 10 | rule humann3_annotate_genes: 11 | """ 12 | Use search hits from clustered reps & index table to annotate all 13 | genome-derep genes. The annotation for each cluster rep is propagated 14 | to each member of the cluster. 15 | """ 16 | input: 17 | hits = config['tmp_dir'] + 'humann3/hits.txt', 18 | fna = config['tmp_dir'] + 'humann3/genome_reps_filtered.fna', 19 | faa = config['tmp_dir'] + 'humann3/genome_reps_filtered.faa', 20 | txt = config['tmp_dir'] + 'humann3/genome_reps_filtered.txt', 21 | tsv = which_membership 22 | output: 23 | fna = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.fna'), 24 | faa = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.faa'), 25 | tsv = temp(config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.tsv') 26 | params: 27 | exe = config['pipeline']['script_folder'] + 'propagate_annotations.py', 28 | params = config['params']['humann3']['propagate_annotations'] 29 | resources: 30 | time = lambda wildcards, attempt: attempt ** 3 * 59, 31 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 3 + 11 32 | log: 33 | log_dir + 'humann3_annotate_genes/all.log' 34 | benchmark: 35 | benchmark_dir + 'humann3_annotate_genes/all.txt' 36 | shell: 37 | """ 38 | {params.exe} {params.params} \ 39 | --in-nuc {input.fna} \ 40 | --out-nuc {output.fna} \ 41 | --out-prot {output.faa} \ 42 | {input.hits} {input.faa} \ 43 | {input.txt} {input.tsv} \ 44 | > {output.tsv} 2> {log} 45 | """ 46 | 47 | rule humann3_annotate_hits_copy: 48 | """ 49 | Copying query hits (diamond or mmseqs) to the final output directory 50 | """ 51 | input: 52 | hits = config['tmp_dir'] + 'humann3/hits.txt' 53 | output: 54 | hits = humann3_dir + 'annotation_hits.gz' 55 | params: 56 | ionice = config['params']['ionice'] 57 | resources: 58 | time = lambda wildcards, attempt: attempt ** 3 * 59 59 | log: 60 | log_dir + 'humann3_annotate_hits_copy/all.log' 61 | shell: 62 | """ 63 | ionice {params.ionice} gzip -c {input.hits} > {output.hits} 2> {log} 64 | """ 65 | 66 | rule humann3_alt_annotate: 67 | """ 68 | Re-annotating with different UniRef cluster resolution 69 | """ 70 | input: 71 | fna = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.fna', 72 | faa = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.faa', 73 | tsv = config['tmp_dir'] + config['uniref_name'] + '/genome_reps_filt_annot.tsv', 74 | idx = ancient(config['cluster_idx']) 75 | output: 76 | fna = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 77 | '/genome_reps_filt_annot.fna'), 78 | faa = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 79 | '/genome_reps_filt_annot.faa'), 80 | tsv = temp(config['tmp_dir'] + config['uniref_other_name'] + \ 81 | '/genome_reps_filt_annot.tsv') 82 | params: 83 | exe = config['pipeline']['script_folder'] + 'uniref_clst_trans.py' 84 | resources: 85 | time = lambda wildcards, attempt: attempt ** 3 * 59, 86 | mem_gb_pt = lambda wildcards, attempt: attempt ** 3 * 3 + 15 87 | log: 88 | log_dir + 'humann3_alt_annotate/all.log' 89 | benchmark: 90 | benchmark_dir + 'humann3_alt_annotate/all.txt' 91 | shell: 
92 | """ 93 | OUTDIR=`dirname {output.fna}` 94 | mkdir -p $OUTDIR 2> {log} 95 | {params.exe} {input.idx} \ 96 | --in-nuc {input.fna} \ 97 | --in-prot {input.faa} \ 98 | --in-tsv {input.tsv} \ 99 | --out-nuc {output.fna} \ 100 | --out-prot {output.faa} \ 101 | --out-tsv {output.tsv} \ 102 | 2>> {log} 1>&2 103 | """ 104 | 105 | rule humann3_annotate_genes_copy: 106 | """ 107 | Copying annotated gene files to the final directory 108 | """ 109 | input: 110 | fna = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.fna', 111 | faa = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.faa', 112 | tsv = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.tsv' 113 | output: 114 | fna = humann3_dir + '{uniref}/genome_reps_filt_annot.fna.gz', 115 | faa = humann3_dir + '{uniref}/genome_reps_filt_annot.faa.gz', 116 | tsv = humann3_dir + '{uniref}/genome_reps_filt_annot.tsv.gz' 117 | params: 118 | ionice = config['params']['ionice'] 119 | resources: 120 | time = lambda wildcards, attempt: attempt ** 3 * 59 121 | log: 122 | log_dir + 'humann3_annotate_genes_copy/{uniref}.log' 123 | benchmark: 124 | benchmark_dir + 'humann3_annotate_genes_copy/{uniref}.txt' 125 | shell: 126 | """ 127 | ionice {params.ionice} gzip -c {input.fna} > {output.fna} 2> {log} 128 | ionice {params.ionice} gzip -c {input.faa} > {output.faa} 2>> {log} 129 | ionice {params.ionice} gzip -c {input.tsv} > {output.tsv} 2>> {log} 130 | """ 131 | 132 | rule humann3_bowtie2_build: 133 | """ 134 | Running bowtie2 build on combined, annotated genes 135 | """ 136 | input: 137 | fna = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.fna' 138 | output: 139 | os.path.join(humann3_dir, '{uniref}/bowtie2_build.done') 140 | params: 141 | prefix = humann3_dir + '{uniref}/genome_reps_filt_annot' 142 | conda: 143 | '../../../envs/humann3.yaml' 144 | threads: 145 | 12 146 | resources: 147 | time = lambda wildcards, attempt: attempt * 2 * 60 * 24, 148 | n = lambda wildcards, attempt, threads: threads, 149 | mem_gb_pt = lambda wildcards, attempt: attempt ** 2 * 2 + 8, 150 | lg_idx = lambda wildcards, attempt: '--large-index' if attempt > 1 else '' 151 | log: 152 | log_dir + 'humann3_bowtie2_build/{uniref}.log' 153 | benchmark: 154 | benchmark_dir + 'humann3_bowtie2_build/{uniref}.txt' 155 | shell: 156 | """ 157 | bowtie2-build --threads {threads} {resources.lg_idx} \ 158 | {input.fna} {params.prefix} 2> {log} 1>&2 159 | 160 | # check that output exists 161 | OUTDIR=`dirname {params.prefix}` 162 | IDX_FILES=`find $OUTDIR -maxdepth 1 -name "*.bt2*"` 163 | IDX_FILES=`echo $IDX_FILES | perl -pe 's/ +/\n/g' | wc -l` 164 | if [ $IDX_FILES -lt 1 ]; then 165 | echo "ERROR: no bowtie2 index files found!" 
166 | exit 1 167 | fi 168 | touch {output} 2>> {log} 169 | """ 170 | 171 | rule humann3_diamond_makedb: 172 | """ 173 | Running diamond makedb on combined, annotated genes 174 | """ 175 | input: 176 | faa = config['tmp_dir'] + '{uniref}/genome_reps_filt_annot.faa' 177 | output: 178 | humann3_dir + '{uniref}/protein_database/{dmnd}' 179 | params: 180 | tmp_dir = config['tmp_dir'] 181 | conda: 182 | '../../../envs/humann3.yaml' 183 | resources: 184 | time = lambda wildcards, attempt: attempt * 2 * 60 * 24, 185 | mem_gb_pt = lambda wildcards, attempt: (attempt ** 3 + 2) * 12 186 | log: 187 | log_dir + 'humann3_diamond_makedb/{uniref}/{dmnd}.log' 188 | benchmark: 189 | benchmark_dir + 'humann3_diamond_makedb/{uniref}/{dmnd}.txt' 190 | shell: 191 | """ 192 | PREF=`echo {output} | perl -pe 's/\.[^.]+$//'` 193 | diamond makedb --in {input.faa} -d $PREF 2> {log} 1>&2 194 | """ 195 | 196 | 197 | -------------------------------------------------------------------------------- /bin/db_update/humann3/input_from_genes/Snakefile: -------------------------------------------------------------------------------- 1 | # copying from the genes-db-update pipeline output 2 | 3 | rule humann3_copy_gene_input: 4 | """ 5 | Copying/uncompressing the updated gene files (fna, faa, metadata) to the temp directory 6 | """ 7 | input: 8 | fna = genes_dir + 'genome_reps_filtered.fna.gz', 9 | faa = genes_dir + 'genome_reps_filtered.faa.gz', 10 | txt = genes_dir + 'genome_reps_filtered.txt.gz' 11 | output: 12 | fna = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.fna'), 13 | faa = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.faa'), 14 | txt = temp(config['tmp_dir'] + 'humann3/genome_reps_filtered.txt') 15 | params: 16 | ionice = config['params']['ionice'], 17 | exe = config['pipeline']['script_folder'] + 'cat_files.py' 18 | resources: 19 | time = lambda wildcards, attempt: attempt ** 3 * 59 20 | log: 21 | log_dir + 'humann3_copy_gene_input/all.log' 22 | shell: 23 | """ 24 | ionice {params.ionice} {params.exe} {input.fna} > {output.fna} 2> {log} 25 | ionice {params.ionice} {params.exe} {input.faa} > {output.faa} 2>> {log} 26 | ionice {params.ionice} {params.exe} {input.txt} > {output.txt} 2>> {log} 27 | """ 28 | 29 | rule humann3_copy_cluster_input: 30 | """ 31 | Copying/uncompressing the existing cluster data (query hits, cluster reps, cluster membership) to the temp directory 32 | """ 33 | input: 34 | hit_c = config['humann_db']['query']['hits'], 35 | faa_c = genes_dir + 'cluster/clusters_reps.faa.gz', 36 | mem_c = genes_dir + 'cluster/clusters_membership.tsv.gz', 37 | output: 38 | hit_c = temp(config['tmp_dir'] + 'humann3/query_hits.txt'), 39 | faa_c = temp(config['tmp_dir'] + 'humann3/clusters_reps.faa'), 40 | mem_c = temp(config['tmp_dir'] + 'humann3/clusters_membership.tsv') 41 | params: 42 | ionice = config['params']['ionice'], 43 | exe = config['pipeline']['script_folder'] + 'cat_files.py' 44 | resources: 45 | time = lambda wildcards, attempt: attempt ** 3 * 59 46 | log: 47 | log_dir + 'humann3_copy_cluster_input/all.log' 48 | shell: 49 | """ 50 | ionice {params.ionice} {params.exe} {input.hit_c} > {output.hit_c} 2> {log} 51 | ionice {params.ionice} {params.exe} {input.mem_c} > {output.mem_c} 2>> {log} 52 | ionice {params.ionice} {params.exe} {input.faa_c} > {output.faa_c} 2>> {log} 53 | """ 54 | 55 | -------------------------------------------------------------------------------- /bin/db_update/humann3/prepare_query/Snakefile: -------------------------------------------------------------------------------- 1 | rule humann3_which_to_query: 2 | """ 3 | Selecting which of the cluster reps need to be queried, 
given the existing query hits. 4 | """ 5 | input: 6 | mem_c = config['tmp_dir'] + 'humann3/clusters_membership.tsv', 7 | hit_c = config['tmp_dir'] + 'humann3/query_hits.txt', 8 | faa_c = config['tmp_dir'] + 'humann3/clusters_reps.faa' 9 | output: 10 | faa = temp(config['tmp_dir'] + 'humann3/clusters_reps_filt.faa') 11 | params: 12 | exe = config['pipeline']['script_folder'] + 'filter_cluster_reps.py', 13 | params = config['params']['humann3']['filter_existing'] 14 | resources: 15 | time = lambda wildcards, attempt: attempt ** 3 * 59, 16 | mem_gb_pt = lambda wildcards, attempt: attempt * 10 17 | log: 18 | log_dir + 'humann3_which_to_query/all.log' 19 | shell: 20 | """ 21 | {params.exe} {params.params} \ 22 | {input.mem_c} {input.hit_c} \ 23 | {input.faa_c} > {output.faa} 2> {log} 24 | """ 25 | 26 | rule humann_query_fasta_split: 27 | """ 28 | Splitting gene fasta for distributed searching 29 | """ 30 | input: 31 | faa = config['tmp_dir'] + 'humann3/clusters_reps_filt.faa' 32 | output: 33 | done = config['tmp_dir'] + 'humann3_search/split.done', 34 | splt = temp(expand(config['tmp_dir'] + \ 35 | 'humann3_search/stdin.part_{splitID}.fasta', 36 | splitID=config['params']['humann3']['splits'])) 37 | params: 38 | n_splits = config['params']['humann3']['batches'] 39 | threads: 40 | 4 41 | resources: 42 | time = lambda wildcards, attempt: int(round(attempt ** 4 * 59,0)), 43 | n = lambda wildcards, attempt, threads: threads, 44 | mem_gb_pt = lambda wildcards, attempt, threads: int(round(attempt ** 3 * 10.0 / threads,0)) 45 | conda: 46 | '../../../envs/genes.yaml' 47 | log: 48 | log_dir + 'mmseqs_search_batch_seqs/all.log' 49 | benchmark: 50 | benchmark_dir + 'mmseqs_search_batch_seqs/all.txt' 51 | shell: 52 | """ 53 | OUTDIR=`dirname {output.done} 2> {log}` 54 | seqkit shuffle -j {threads} {input.faa} 2>> {log} | \ 55 | seqkit split -j {threads} --by-part {params.n_splits} \ 56 | --out-dir $OUTDIR 2>> {log} 1>&2 57 | touch {output.done} 58 | """ 59 | 60 | -------------------------------------------------------------------------------- /bin/db_update/humann3/query_dmnd/Snakefile: -------------------------------------------------------------------------------- 1 | rule humann3_diamond_db_copy: 2 | """ 3 | Copying the user-provided DIAMOND database to the temp directory 4 | """ 5 | input: 6 | db = ancient(config['params']['humann3']['diamond']['db']) 7 | output: 8 | db = temp(config['tmp_dir'] + 'humann3/humann3_dmnd_db.dmnd') 9 | params: 10 | ionice = config['params']['ionice'] 11 | resources: 12 | time = lambda wildcards, attempt: attempt ** 3 * 59, 13 | mem_gb_pt = lambda wildcards, attempt: attempt * 4 14 | log: 15 | log_dir + 'humann3_diamond_db_copy/all.log' 16 | benchmark: 17 | benchmark_dir + 'humann3_diamond_db_copy/all.txt' 18 | shell: 19 | """ 20 | ionice {params.ionice} cp -f {input} {output} 2> {log} 1>&2 21 | """ 22 | 23 | def dmnd_start_mem(wildcards, attempt, threads=12): 24 | """ 25 | Estimating the baseline memory to use for jobs, given the diamond database size 26 | """ 27 | F = config['tmp_dir'] + 'humann3/humann3_dmnd_db.dmnd' 28 | prot_db_size = os.stat(F).st_size / 1e9 29 | mem = round(prot_db_size * 5 / threads + 1.499,0) 30 | mem = (attempt - 1) * 2 + mem 31 | return int(mem) 32 | 33 | rule humann3_diamond_pass1: 34 | """ 35 | Annotating genes via diamond search of UniRef DB 36 | """ 37 | input: 38 | faa = config['tmp_dir'] + 'humann3_search/stdin.part_{splitID}.fasta', 39 | dmnd_db = config['tmp_dir'] + 'humann3/humann3_dmnd_db.dmnd' 40 | output: 41 | hits = 
temp(config['tmp_dir'] + 'humann3/hits_pass1/{splitID}.txt'), 42 | unaln = temp(config['tmp_dir'] + 'humann3/unaln/{splitID}.faa') 43 | params: 44 | params = config['params']['humann3']['diamond']['run'], 45 | tmp_dir = config['tmp_dir'] + 'humann3_dmnd_TMP/{splitID}/' 46 | threads: 47 | 8 48 | resources: 49 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 12, 50 | n = lambda wildcards, attempt, threads: threads, 51 | mem_gb_pt = dmnd_start_mem 52 | conda: 53 | '../../../envs/humann3.yaml' 54 | log: 55 | log_dir + 'humann3_diamond_pass1/{splitID}.log' 56 | benchmark: 57 | benchmark_dir + 'humann3_diamond_pass1/{splitID}.txt' 58 | shell: 59 | """ 60 | TMPDIR="{params.tmp_dir}" 61 | mkdir -p $TMPDIR 2> {log} 62 | 63 | # diamond run 64 | diamond blastp {params.params} \ 65 | --tmpdir $TMPDIR --threads {threads} \ 66 | -q {input.faa} -d {input.dmnd_db} \ 67 | -o {output.hits} --un {output.unaln} \ 68 | --outfmt 6 qseqid sseqid evalue pident length slen \ 69 | 2>> {log} 1>&2 70 | """ 71 | 72 | rule humann3_diamond_pass2: 73 | """ 74 | Annotating genes via diamond search of UniRef DB (sensitive mode) 75 | """ 76 | input: 77 | faa = config['tmp_dir'] + 'humann3/unaln/{splitID}.faa', 78 | dmnd_db = config['tmp_dir'] + 'humann3/humann3_dmnd_db.dmnd' 79 | output: 80 | hits = temp(config['tmp_dir'] + 'humann3/hits_pass2/{splitID}.txt') 81 | params: 82 | params = config['params']['humann3']['diamond']['run'], 83 | tmp_dir = config['tmp_dir'] + 'humann3_dmnd_TMP/{splitID}/' 84 | threads: 85 | 8 86 | resources: 87 | time = lambda wildcards, attempt: attempt * 60 * 48, 88 | n = lambda wildcards, attempt, threads: threads, 89 | mem_gb_pt = dmnd_start_mem 90 | conda: 91 | '../../../envs/humann3.yaml' 92 | log: 93 | log_dir + 'humann3_diamond_pass2/{splitID}.log' 94 | benchmark: 95 | benchmark_dir + 'humann3_diamond_pass2/{splitID}.txt' 96 | shell: 97 | """ 98 | NSEQ=`seqkit seq -n {input.faa} | wc -l 2> {log}` 99 | if [[ "$NSEQ" -gt "0" ]]; then 100 | TMPDIR="{params.tmp_dir}" 101 | mkdir -p $TMPDIR 2>> {log} 102 | # diamond run 103 | diamond blastp --sensitive {params.params} \ 104 | --tmpdir $TMPDIR --threads {threads} \ 105 | -q {input.faa} -d {input.dmnd_db} -o {output.hits} \ 106 | --outfmt 6 qseqid sseqid evalue pident length slen \ 107 | 2>> {log} 1>&2 108 | else 109 | touch {output.hits} 2> {log} 1>&2 110 | echo "No unaligned sequences. Skipping DIAMOND" >> {log} 111 | fi 112 | """ 113 | 114 | localrules: humann3_diamond_merge 115 | rule humann3_diamond_merge: 116 | """ 117 | Merging the results of the 2 DIAMOND passes (all splits). 118 | Also including all original hits (prior to db update). 
119 | """ 120 | input: 121 | hits_orig = config['tmp_dir'] + 'humann3/query_hits.txt', 122 | hits1 = expand(config['tmp_dir'] + \ 123 | 'humann3/hits_pass1/{splitID}.txt', 124 | splitID=config['params']['humann3']['splits']), 125 | hits2 = expand(config['tmp_dir'] + \ 126 | 'humann3/hits_pass2/{splitID}.txt', 127 | splitID=config['params']['humann3']['splits']) 128 | output: 129 | hits = temp(config['tmp_dir'] + 'humann3/hits.txt') 130 | resources: 131 | time = lambda wildcards, attempt: attempt ** 3 * 59 132 | run: 133 | with open(output.hits, 'w') as outF: 134 | for F in [input.hits_orig] + input.hits1 + input.hits2: 135 | with open(F) as inF: 136 | for line in inF: 137 | outF.write(line) 138 | 139 | -------------------------------------------------------------------------------- /bin/db_update/kraken2/Snakefile: -------------------------------------------------------------------------------- 1 | #-- Kraken2 database update workflow --# 2 | rule kraken2_cp_to_tmp: 3 | """ 4 | Copying an existing kraken database to temp directory 5 | """ 6 | input: 7 | lib = ancient(config['kraken2_db']['library']), 8 | tax = ancient(config['kraken2_db']['taxonomy']) 9 | output: 10 | lib = temp(directory(config['tmp_dir'] + 'db_update/kraken2/library/')), 11 | tax = temp(directory(config['tmp_dir'] + 'db_update/kraken2/taxonomy/')), 12 | nodes = temp(config['tmp_dir'] + 'db_update/kraken2/taxonomy/nodes.dmp'), 13 | names = temp(config['tmp_dir'] + 'db_update/kraken2/taxonomy/names.dmp') 14 | params: 15 | ionice = config['params']['ionice'] 16 | resources: 17 | time = lambda wildcards, attempt: attempt ** 3 * 59 18 | log: 19 | log_dir + 'db_update/kraken2_cp_to_tmp/all.log' 20 | benchmark: 21 | benchmark_dir + 'db_update/kraken2_cp_to_tmp/all.txt' 22 | shell: 23 | """ 24 | rm -rf {output.lib} 2> {log} 25 | rm -rf {output.tax} 2>> {log} 26 | ionice {params.ionice} cp -rf {input.lib} {output.lib} 2>> {log} 27 | ionice {params.ionice} cp -rf {input.tax} {output.tax} 2>> {log} 28 | """ 29 | 30 | def kraken2_add_taxID_get_taxID(wildcards): 31 | """ 32 | Getting genome taxID from the user input table (genome metadata) 33 | """ 34 | taxID = config['samples'].loc[wildcards.sample, config['taxID_col']] 35 | try: 36 | taxID = taxID.astype(str) 37 | except AttributeError: 38 | pass 39 | return taxID 40 | 41 | rule kraken2_add_taxID: 42 | """ 43 | Adding a taxononmy ID to the header of each genome. 44 | Assuming the taxID is in the samples table. 45 | Writing edited genome to temp dir. 46 | 47 | Format: `kraken:taxid||` 48 | """ 49 | input: 50 | fasta = lambda wildcards: \ 51 | config['samples'].loc[wildcards.sample, config['fasta_file_path_col']] 52 | output: 53 | temp(config['tmp_dir'] + 'db_update/genomes/{sample}.fna') 54 | resources: 55 | time = lambda wildcards, attempt: attempt ** 2 * 59, 56 | mem_gb_pt = lambda wildcards, attempt: attempt * 6 57 | params: 58 | taxID = kraken2_add_taxID_get_taxID, 59 | exe = config['pipeline']['script_folder'] + 'kraken2_rename_genome.py' 60 | log: 61 | log_dir + 'db_update/kraken2_add_taxID/{sample}.log' 62 | benchmark: 63 | benchmark_dir + 'db_update/kraken2_add_taxID/{sample}.txt' 64 | shell: 65 | """ 66 | {params.exe} {input.fasta} {params.taxID} > {output} 2> {log} 67 | """ 68 | 69 | localrules: kraken2_build_add 70 | 71 | rule kraken2_build_add: 72 | """ 73 | Adding genome fasta files to the kraken database. 
74 | Using the --add-to-library flag 75 | """ 76 | input: 77 | lib = config['tmp_dir'] + 'db_update/kraken2/library/', 78 | tax = config['tmp_dir'] + 'db_update/kraken2/taxonomy/', 79 | fasta = config['tmp_dir'] + 'db_update/genomes/{sample}.fna', 80 | nodes = config['tmp_dir'] + 'db_update/kraken2/taxonomy/nodes.dmp', 81 | names = config['tmp_dir'] + 'db_update/kraken2/taxonomy/names.dmp' 82 | output: 83 | temp(config['tmp_dir'] + 'db_update/kraken2/added/{sample}.done') 84 | resources: 85 | time = lambda wildcards, attempt: attempt ** 2 * 59, 86 | mem_gb_pt = lambda wildcards, attempt: attempt * 6 87 | conda: 88 | '../../envs/kraken2.yaml' 89 | log: 90 | log_dir + 'db_update/kraken2_build_add/{sample}.log' 91 | benchmark: 92 | benchmark_dir + 'db_update/kraken2_build_add/{sample}.txt' 93 | shell: 94 | """ 95 | DB=`dirname {input.names}` 96 | DB=`dirname $DB` 97 | 98 | kraken2-build --db $DB --add-to-library {input.fasta} 2> {log} 1>&2 99 | touch {output} 2>> {log} 100 | """ 101 | 102 | rule kraken2_build: 103 | """ 104 | Building the kraken database 105 | """ 106 | input: 107 | expand(config['tmp_dir'] + 'db_update/kraken2/added/{sample}.done', 108 | sample = config['samples_unique']) 109 | output: 110 | hash = temp(config['tmp_dir'] + 'db_update/kraken2/hash.k2d'), 111 | opts = temp(config['tmp_dir'] + 'db_update/kraken2/opts.k2d'), 112 | map = temp(config['tmp_dir'] + 'db_update/kraken2/seqid2taxid.map'), 113 | taxo = temp(config['tmp_dir'] + 'db_update/kraken2/taxo.k2d') 114 | threads: 115 | 12 116 | resources: 117 | time = lambda wildcards, attempt: attempt ** 2 * 60 * 24, 118 | n = lambda wildcards, attempt, threads: threads, 119 | mem_gb_pt = lambda wildcards, attempt: int(round(attempt * 18 + 4,0)) 120 | conda: 121 | '../../envs/kraken2.yaml' 122 | log: 123 | log_dir + 'db_update/kraken2_build/all.log' 124 | benchmark: 125 | benchmark_dir + 'db_update/kraken2_build/all.txt' 126 | shell: 127 | """ 128 | DB=`dirname {output.hash}` 129 | kraken2-build --build --threads {threads} --db $DB 2> {log} 1>&2 130 | """ 131 | 132 | rule kraken2_db_copy: 133 | """ 134 | Copying the Kraken database to the output directory 135 | """ 136 | input: 137 | lib = config['tmp_dir'] + 'db_update/kraken2/library/', 138 | tax = config['tmp_dir'] + 'db_update/kraken2/taxonomy/', 139 | nodes = config['tmp_dir'] + 'db_update/kraken2/taxonomy/nodes.dmp', 140 | names = config['tmp_dir'] + 'db_update/kraken2/taxonomy/names.dmp', 141 | hash = config['tmp_dir'] + 'db_update/kraken2/hash.k2d', 142 | opts = config['tmp_dir'] + 'db_update/kraken2/opts.k2d', 143 | taxo = config['tmp_dir'] + 'db_update/kraken2/taxo.k2d', 144 | map = config['tmp_dir'] + 'db_update/kraken2/seqid2taxid.map' 145 | output: 146 | hash = kraken2_dir + 'hash.k2d', 147 | opts = kraken2_dir + 'opts.k2d', 148 | taxo = kraken2_dir + 'taxo.k2d', 149 | map = kraken2_dir + 'seqid2taxid.map' 150 | params: 151 | keep = config['keep_intermediate'], 152 | ionice = config['params']['ionice'] 153 | resources: 154 | time = lambda wildcards, attempt: attempt ** 3 * 59 155 | conda: 156 | '../../envs/kraken2.yaml' 157 | log: 158 | log_dir + 'db_update/kraken2_db_copy/all.log' 159 | benchmark: 160 | benchmark_dir + 'db_update/kraken2_db_copy/all.txt' 161 | shell: 162 | """ 163 | if [ "{params.keep}" == "True" ]; then 164 | echo "# copying entire kraken db" > {log} 165 | DIR1=`dirname {input.hash}` 166 | DIR2=`dirname {output.hash}` 167 | rm -rf $DIR2 2>> {log} 1>&2 168 | ionice {params.ionice} cp -rf $DIR1 $DIR2 2>> {log} 1>&2 169 | else 170 | echo "# copying 
just built kraken index files" > {log} 171 | ionice {params.ionice} cp -f {input.hash} {output.hash} 2>> {log} 1>&2 172 | ionice {params.ionice} cp -f {input.opts} {output.opts} 2>> {log} 1>&2 173 | ionice {params.ionice} cp -f {input.map} {output.map} 2>> {log} 1>&2 174 | ionice {params.ionice} cp -f {input.taxo} {output.taxo} 2>> {log} 1>&2 175 | fi 176 | """ 177 | 178 | -------------------------------------------------------------------------------- /bin/dirs: -------------------------------------------------------------------------------- 1 | genes_dir = os.path.join(config['output_dir'], 'genes/') 2 | kraken2_dir = os.path.join(config['output_dir'], 'kraken2/') 3 | bracken_dir = os.path.join(config['output_dir'], 'bracken/') 4 | humann3_dir = os.path.join(config['output_dir'], 'humann3/') 5 | 6 | log_dir = os.path.join(config['output_dir'], 'logs/') 7 | benchmark_dir = os.path.join(config['output_dir'], 'benchmarks/') 8 | scripts_dir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'scripts/') 9 | -------------------------------------------------------------------------------- /bin/envs/genes.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - pigz 6 | - python=3 7 | - numpy 8 | - bioconda::seqkit>=0.16.1 9 | - bioconda::vsearch 10 | - bioconda::prodigal 11 | - bioconda::mmseqs2 -------------------------------------------------------------------------------- /bin/envs/humann2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | dependencies: 4 | - pigz 5 | - bioconda::bowtie2 6 | - bioconda::vsearch 7 | - bioconda::prodigal 8 | - bioconda::diamond=0.8.36 9 | -------------------------------------------------------------------------------- /bin/envs/humann3.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - biobakery 4 | - conda-forge 5 | dependencies: 6 | - pigz 7 | - bioconda::seqkit 8 | - bioconda::vsearch 9 | - bioconda::prodigal 10 | - bioconda::diamond=0.9.24 11 | - biobakery::humann 12 | -------------------------------------------------------------------------------- /bin/envs/kraken2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | dependencies: 5 | - libiconv 6 | - bioconda::kraken2 7 | - bioconda::bracken 8 | 9 | -------------------------------------------------------------------------------- /bin/envs/krakenuniq.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | dependencies: 5 | - bioconda::krakenuniq=0.6 6 | 7 | -------------------------------------------------------------------------------- /bin/scripts/add_user_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import sys,os 4 | import re 5 | import gzip 6 | import uuid 7 | import argparse 8 | import logging 9 | 10 | desc = 'Adding user-provided sequences' 11 | epi = """DESCRIPTION: 12 | """ 13 | parser = argparse.ArgumentParser(description=desc, 14 | epilog=epi, 15 | formatter_class=argparse.RawTextHelpFormatter) 16 | parser.add_argument('fasta', metavar='fasta', type=str, nargs='+', 17 | help='Fasta files (nuc, then prot)') 18 | parser.add_argument('--in-fna', type=str, 
default='genes.fna', 19 | help='Nucleotide input') 20 | parser.add_argument('--in-faa', type=str, default='genes.faa', 21 | help='Amino acid input') 22 | parser.add_argument('--in-txt', type=str, default='genes.txt', 23 | help='Names index input') 24 | parser.add_argument('--out-fna', type=str, default='wUser_genes.fna', 25 | help='Nucleotide output') 26 | parser.add_argument('--out-faa', type=str, default='wUser_genes.faa', 27 | help='Amino acid output') 28 | parser.add_argument('--out-txt', type=str, default='wUser_genes.txt', 29 | help='Names index output') 30 | parser.add_argument('--version', action='version', version='0.0.1') 31 | 32 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 33 | 34 | def read_fasta(infile): 35 | seqs = {} 36 | seq_name = '' 37 | with open(infile) as inF: 38 | for line in inF: 39 | line = line.rstrip() 40 | if line.startswith('>'): 41 | seq_name = line.lstrip('>') 42 | else: 43 | try: 44 | seqs[seq_name] += line 45 | except KeyError: 46 | seqs[seq_name] = line 47 | return seqs 48 | 49 | def make_index(input_fna, input_faa): 50 | # loading fasta files 51 | if input_fna.lower() != 'skip': 52 | fna = read_fasta(input_fna) 53 | else: 54 | fna = {} 55 | if input_faa.lower() != 'skip': 56 | faa = read_fasta(input_faa) 57 | else: 58 | faa = {} 59 | # intersection of names (genes present in both fasta files) 60 | seq_names = set(fna.keys()) & set(faa.keys()) 61 | seq_idx = {x:str(uuid.uuid4()) for x in seq_names} 62 | # return 63 | return fna, faa, seq_idx 64 | 65 | def seq_cat(seqs, in_fasta, out_fasta): 66 | with open(in_fasta) as inF, open(out_fasta, 'w') as outF: 67 | for line in inF: 68 | outF.write(line) 69 | for seqid,seq in seqs.items(): 70 | outF.write('>' + seqid + '\n' + seq + '\n') 71 | logging.info('File written: {}'.format(out_fasta)) 72 | 73 | def names_cat(seq_idx, in_txt, out_txt): 74 | with open(in_txt) as inF, open(out_txt, 'w') as outF: 75 | for line in inF: 76 | outF.write(line) 77 | for uuid,seqid in seq_idx.items(): 78 | outF.write(uuid + '\t' + seqid + '\n') 79 | logging.info('File written: {}'.format(out_txt)) 80 | 81 | def main(args): 82 | # getting overlap of user-provided nuc & prot gene names 83 | fna,faa,seq_idx = make_index(args.fasta[0], args.fasta[1]) 84 | 85 | # combining sequences 86 | seq_cat(fna, args.in_fna, args.out_fna) 87 | seq_cat(faa, args.in_faa, args.out_faa) 88 | names_cat(seq_idx, args.in_txt, args.out_txt) 89 | 90 | if __name__ == '__main__': 91 | args = parser.parse_args() 92 | main(args) 93 | -------------------------------------------------------------------------------- /bin/scripts/annotate_genes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import sys,os 4 | import re 5 | import gzip 6 | import logging 7 | import argparse 8 | from pprint import pprint 9 | 10 | desc = 'Annotate a genome based on mapping to UniRef via diamond' 11 | epi = """DESCRIPTION: 12 | Adding UniRef IDs and taxonomy to genes from a particular genome. 
13 | """ 14 | parser = argparse.ArgumentParser(description=desc, 15 | epilog=epi, 16 | formatter_class=argparse.RawTextHelpFormatter) 17 | parser.add_argument('diamond_hits', metavar='diamond_hits', type=str, 18 | help='tab-delim table of diamond hits') 19 | parser.add_argument('genes_fasta_nuc', metavar='genes_fasta_nuc', type=str, 20 | help='Genes in nucletide fasta format') 21 | parser.add_argument('genes_fasta_AA', metavar='genes_fasta_AA', type=str, 22 | help='Genes in amino acid fasta format') 23 | parser.add_argument('taxonomy', metavar='taxonomy', type=str, 24 | help='Taxonomy of the genome') 25 | parser.add_argument('taxID', metavar='taxID', type=str, 26 | help='NCBI TaxID of the genome') 27 | parser.add_argument('--columns', type=str, default='qseqid,sseqid,pident,length,qstart,qend,qlen,sstart,send,slen,evalue', 28 | help='Diamond output columns (default: %(default)s)') 29 | parser.add_argument('--outdir', type=str, default='genes_annotated', 30 | help='Output directory (default: %(default)s)') 31 | parser.add_argument('--dmnd-db', type=str, default='/ebio/abt3_projects2/databases_no-backup/humann2/uniref50/uniref50_annotated.1.1.dmnd', 32 | help='UniRef dmnd db for annotating genes (default: %(default)s)') 33 | parser.add_argument('--percid', type=float, default=50.0, 34 | help='Percent sequence ID cutoff for calling a hit (default: %(default)s)') 35 | parser.add_argument('--overlap', type=float, default=80.0, 36 | help='Perc. overlap cutoff (longest sequence) for calling a hit (default: %(default)s)') 37 | parser.add_argument('--skip', action='store_true', default=False, 38 | help='Skip diamond-based annotation if the diamond hits file exists (default: %(default)s)') 39 | parser.add_argument('--gzip', action='store_true', default=False, 40 | help='gzip output (default: %(default)s)') 41 | parser.add_argument('--threads', type=int, default=1, 42 | help='Threads used for diamond (default: %(default)s)') 43 | parser.add_argument('--version', action='version', version='0.0.1') 44 | 45 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 46 | 47 | 48 | def make_best_hit_index(dmnd_hit_file, outfmt_cols): 49 | """Making index of best diamond hits for each query 50 | """ 51 | # column index 52 | column_idx = {x:i for i,x in enumerate(outfmt_cols)} 53 | # streaming hits 54 | hits = {} 55 | with open(dmnd_hit_file) as inF: 56 | longest_seq_len = None 57 | for line in inF: 58 | line = line.rstrip().split('\t') 59 | # Perc ID >= cutoff? 60 | try: 61 | pident = float(line[column_idx['pident']]) 62 | except KeyError: 63 | raise KeyError('Cannot find "pident" column') 64 | if pident < args.percid: 65 | continue 66 | # overlap >= cutoff? 67 | ## longest sequence? 68 | try: 69 | query_len = float(line[column_idx['qlen']]) 70 | except KeyError: 71 | raise KeyError('Cannot find "qlen" column') 72 | try: 73 | subject_len = float(line[column_idx['slen']]) 74 | except KeyError: 75 | raise KeyError('Cannot find "slen" column') 76 | if query_len >= subject_len: 77 | longest_seq_len = query_len 78 | else: 79 | longest_seq_len = subject_len 80 | ## overlap vs longest sequence 81 | try: 82 | aln_len = float(line[column_idx['length']]) 83 | except KeyError: 84 | raise KeyError('Cannot find "length" column') 85 | perc_overlap = aln_len / longest_seq_len * 100.0 86 | if perc_overlap < args.overlap: 87 | continue 88 | # better than current best hit for query? 
89 | try: 90 | qseqid = line[column_idx['qseqid']] 91 | except KeyError: 92 | raise KeyError('Cannot find "qseqid" column') 93 | try: 94 | sseqid = line[column_idx['sseqid']] 95 | except KeyError: 96 | raise KeyError('Cannot find "sseqid" column') 97 | try: 98 | best_hit = hits[qseqid] 99 | except KeyError: 100 | best_hit = None 101 | if best_hit is None or (pident >= best_hit[1] and perc_overlap >= best_hit[2]): 102 | hits[qseqid] = [sseqid, pident, perc_overlap] 103 | return hits 104 | 105 | def rename_seqs(best_hits, fasta_file, taxonomy, outfile, gzip_output=False): 106 | """Renaming sequences based on uniref hits. 107 | Using naming format: `gene_family|gene_length|taxonomy` 108 | Taxonomy format: `g__{genus};s__{species}_taxID{taxID}` 109 | (see https://bitbucket.org/biobakery/humann2/wiki/Home). 110 | """ 111 | seq_name = None 112 | seq = '' 113 | annot_cnt = 0 114 | annot_skip_cnt = 0 115 | if gzip_output == True: 116 | _open = lambda x: gzip.open(x, 'ab') 117 | outfile += '.gz' 118 | else: 119 | _open = lambda x: open(x, 'a') 120 | 121 | with open(fasta_file) as inF, _open(outfile) as outF: 122 | for line in inF: 123 | if line.startswith('>'): 124 | # previous sequence 125 | if seq_name is not None and seq != '': 126 | seq = seq.rstrip().strip('*') 127 | x = '\n'.join(['>' + seq_name, seq]) + '\n' 128 | if gzip_output == True: 129 | x = x.encode() 130 | outF.write(x) 131 | annot_cnt += 1 132 | else: 133 | annot_skip_cnt += 1 134 | seq = '' 135 | # hit for sequence? 136 | query = line.rstrip().lstrip('>').split(' ')[0] 137 | try: 138 | best_hit = best_hits[query] 139 | except KeyError: 140 | best_hit = None 141 | # renaming 142 | if best_hit is None: 143 | seq_name = None 144 | else: 145 | seq_name = '|'.join([best_hit[0], taxonomy]) 146 | else: 147 | seq += line.rstrip() 148 | # final sequence 149 | if seq_name is not None: 150 | seq = seq.rstrip().strip('*') 151 | x = '\n'.join(['>' + seq_name, seq]) + '\n' 152 | if gzip_output == True: 153 | x = x.encode() 154 | outF.write(x) 155 | annot_cnt += 1 156 | else: 157 | annot_skip_cnt += 1 158 | 159 | logging.info('Number of genes with an annotation: {}'.format(annot_cnt)) 160 | logging.info('Number of genes skipped due to no annotation: {}'.format(annot_skip_cnt)) 161 | logging.info('File written: {}'.format(outfile)) 162 | 163 | def format_taxonomy(tax, taxID): 164 | """ 165 | Formatting taxonomy string 166 | """ 167 | logging.info('Taxonomy string provided {}'.format(tax)) 168 | logging.info('TaxID provided {}'.format(taxID)) 169 | 170 | try: 171 | taxID = int(float(taxID.strip())) 172 | except ValueError: 173 | msg = 'ERROR: taxID "{}" is not an integer!' 
174 | raise ValueError(msg.format(taxID)) 175 | tax = re.sub('[^A-Za-z0-9-_;]+', '_', tax).split(';') 176 | 177 | if not len(tax) == 7: 178 | species = 's__unclassified' 179 | else: 180 | species = tax[6] 181 | if not len(tax) >= 6: 182 | genus = 'g__unclassified' 183 | else: 184 | genus = tax[5] 185 | 186 | if genus.startswith('G__'): 187 | genus = genus[3:] 188 | if not genus.startswith('g__'): 189 | genus = 'g__' + genus 190 | 191 | if species.startswith('S__'): 192 | species = species[3:] 193 | if not species.startswith('s__'): 194 | species = 's__' + species 195 | 196 | if genus == 'g__': 197 | genus = 'g__unclassified' 198 | if species == 's__': 199 | species = 's__unclassified' 200 | 201 | tax = '.'.join([genus, species]) 202 | tax += '__taxID{}'.format(taxID) 203 | logging.info('Converted taxonomy string to {}'.format(tax)) 204 | return tax 205 | 206 | def main(args): 207 | # formatting taxonomy 208 | args.taxonomy = format_taxonomy(args.taxonomy, args.taxID) 209 | 210 | # filtering diamond hits 211 | logging.info('Finding best hit for each gene') 212 | outfmt_cols = args.columns.split(',') 213 | best_hits = make_best_hit_index(args.diamond_hits, outfmt_cols) 214 | #pprint(hits) 215 | 216 | logging.info('Renaming genes') 217 | # nuc 218 | outfile = os.path.join(args.outdir, 'annot.fna') 219 | rename_seqs(best_hits, args.genes_fasta_nuc, 220 | args.taxonomy, outfile=outfile, 221 | gzip_output=args.gzip) 222 | # AA 223 | outfile = os.path.join(args.outdir, 'annot.faa') 224 | rename_seqs(best_hits, args.genes_fasta_AA, 225 | args.taxonomy, outfile=outfile, 226 | gzip_output=args.gzip) 227 | 228 | 229 | if __name__ == '__main__': 230 | args = parser.parse_args() 231 | main(args) 232 | -------------------------------------------------------------------------------- /bin/scripts/bracken-build.py: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ##################################################################### 4 | #bracken_build.sh creates the kmer distribution file for a single Kraken database 5 | #Copyright (C) 2016-2017 Jennifer Lu, jlu26@jhmi.edu 6 | # 7 | #This file is part of Bracken. 8 | # 9 | #Bracken is free software; you can redistribute it and/or modify 10 | #it under the terms of the GNU General Public License as published by 11 | #the Free Software Foundation; either version 3 of the license, or 12 | #(at your option) any later version. 13 | # 14 | #This program is distributed in the hope that it will be useful, 15 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 16 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 | #GNU General Public License for more details 18 | # 19 | #You should have received a copy of the GNU General Public License 20 | #along with this program; if not, see . 21 | # 22 | ##################################################################### 23 | 24 | set -eu 25 | THREADS=1 26 | KMER_LEN=35 27 | READ_LEN=100 28 | DATABASE="" 29 | KRAKEN="kraken" 30 | KINSTALL="" 31 | 32 | VERSION="2.2" 33 | while getopts "k:l:d:x:t:" OPTION 34 | do 35 | case $OPTION in 36 | t) 37 | THREADS=$OPTARG 38 | ;; 39 | k) 40 | KMER_LEN=$OPTARG 41 | ;; 42 | l) 43 | READ_LEN=$OPTARG 44 | ;; 45 | d) 46 | DATABASE=$OPTARG 47 | ;; 48 | x) 49 | KINSTALL=$OPTARG 50 | ;; 51 | \?) 
52 | echo "Usage: bracken_build -k KMER_LEN -l READ_LEN -d MY_DB -x K_INSTALLATION -t THREADS" 53 | echo " KMER_LEN kmer length used to build the kraken database (default: 35)" 54 | echo " THREADS the number of threads to use when running kraken classification and the bracken scripts" 55 | echo " READ_LEN read length to get all classifications for (default: 100)" 56 | echo " MY_DB location of Kraken database" 57 | echo " K_INSTALLATION location of the installed kraken/kraken-build scripts (default assumes scripts can be run from the user path)" 58 | echo 59 | echo "**Note that this script will try to use kraken2 as default. If kraken2 is not installed, kraken will be used instead" 60 | exit 61 | ;; 62 | esac 63 | done 64 | #Output command line options selected 65 | echo " >> Selected Options:" 66 | echo " kmer length = $KMER_LEN" 67 | echo " read length = $READ_LEN" 68 | echo " database = $DATABASE" 69 | echo " threads = $THREADS" 70 | if [[ "$DATABASE" =~ "/"$ ]] 71 | then 72 | DATABASE=${DATABASE:0:-1} 73 | fi 74 | #Check for Kraken version 75 | #echo ${KINSTALL}kraken2 76 | if [ "$KINSTALL" == "" ]; then 77 | if hash kraken2 &> /dev/null; then 78 | KRAKEN="kraken2" 79 | elif hash kraken &> /dev/null; then 80 | KRAKEN="kraken" 81 | else 82 | echo "User must first install kraken or kraken2 and/or specify installation directory of kraken/kraken2 using -x flag" 83 | exit 84 | fi 85 | else 86 | if [ -f ${KINSTALL}kraken2 ]; then 87 | KRAKEN="kraken2" 88 | elif [ -f ${KINSTALL}kraken ]; then 89 | KRAKEN="kraken" 90 | else 91 | echo "User must first install kraken or kraken2 and/or specify installation directory of kraken/kraken2 using -x flag" 92 | exit 93 | fi 94 | fi 95 | #Check if Kraken database exists 96 | echo " >> Checking for Valid Options..." 97 | if [ -d $DATABASE ] 98 | then 99 | #Directory exists, check for taxonomy/nodes.dmp, library/ and for hash.k2d file 100 | if [ ! -d $DATABASE/library ] 101 | then 102 | echo " ERROR: Database library $DATABASE/library does not exist" 103 | exit 104 | elif [ ! -d $DATABASE/taxonomy ] 105 | then 106 | echo " ERROR : Database taxonomy $DATABASE/taxonomy does not exist" 107 | exit 108 | elif [ ! -f $DATABASE/taxonomy/nodes.dmp ] 109 | then 110 | echo " ERROR: Database taxonomy $DATABASE/taxonomy/nodes.dmp does not exist" 111 | exit 112 | elif [ $KRAKEN == "kraken2" ] && [ ! -f $DATABASE/hash.k2d ] 113 | then 114 | echo " ERROR: Kraken2 Database incomplete: $DATABASE/hash.k2d does not exist" 115 | exit 116 | elif [ $KRAKEN == "kraken" ] && [ ! -f $DATABASE/database.kdb ] 117 | then 118 | echo " ERROR: Kraken Database incomplete: $DATABASE/database.kdb does not exist" 119 | exit 120 | fi 121 | else 122 | echo " ERROR: Kraken database $DATABASE" does not exist 123 | exit 124 | fi 125 | #See if database.kraken exists, if not, create 126 | echo " >> Creating database.kraken [if not found]" 127 | if [ -f $DATABASE/database.kraken ] 128 | then 129 | #database.kraken exists, skip 130 | echo " database.kraken exists, skipping creation...." 
131 | elif [ $KRAKEN == "kraken2" ] 132 | then 133 | #database.kraken not found, must create 134 | echo " >> ${KINSTALL}kraken2 --db $DATABASE --threads ${THREADS} <( find -L $DATABASE/library \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {} + ) > $DATABASE/database.kraken" 135 | 136 | ${KINSTALL}kraken2 --db $DATABASE --threads ${THREADS} <( find -L $DATABASE/library \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {} + ) > $DATABASE/database.kraken 137 | else 138 | #database.kraken not found, must create 139 | echo " >> ${KINSTALL}kraken --db $DATABASE --threads ${THREADS} <( find -L $DATABASE/library \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {} + ) > $DATABASE/database.kraken" 140 | ${KINSTALL}kraken --db $DATABASE --threads ${THREADS} <( find -L $DATABASE/library \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {} + ) > $DATABASE/database.kraken 141 | fi 142 | echo " Finished creating database.kraken [in DB folder]" 143 | #Generate databaseXmers.kmer_distrib 144 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" 145 | #cd $DIR 146 | echo " >> Creating database${READ_LEN}mers.kmer_distrib " 147 | if [ -f $DIR/src/kmer2read_distr ]; then 148 | $DIR/src/kmer2read_distr --seqid2taxid $DATABASE/seqid2taxid.map --taxonomy $DATABASE/taxonomy/ --kraken $DATABASE/database.kraken --output $DATABASE/database${READ_LEN}mers.kraken -k ${KMER_LEN} -l ${READ_LEN} -t ${THREADS} 149 | python $DIR/src/generate_kmer_distribution.py -i $DATABASE/database${READ_LEN}mers.kraken -o $DATABASE/database${READ_LEN}mers.kmer_distrib 150 | # check if kmer2read_distr is in PATH 151 | elif [ -f $(command -v kmer2read_distr) ]; then 152 | kmer2read_distr --seqid2taxid $DATABASE/seqid2taxid.map --taxonomy $DATABASE/taxonomy/ --kraken $DATABASE/database.kraken --output $DATABASE/database${READ_LEN}mers.kraken -k ${KMER_LEN} -l ${READ_LEN} -t ${THREADS} 153 | if [ -f $(command -v generate_kmer_distribution.py) ]; then 154 | python $(command -v generate_kmer_distribution.py) -i $DATABASE/database${READ_LEN}mers.kraken -o $DATABASE/database${READ_LEN}mers.kmer_distrib 155 | else 156 | echo " ERROR: generate_kmer_distribution.py script not found. " 157 | echo " Run 'sh install_bracken.sh' to generate the kmer2read_distr script." 158 | echo " Alternatively, cd to BRACKEN_FOLDER/src/ and run 'make'" 159 | exit 160 | fi 161 | else 162 | echo " ERROR: kmer2read_distr program not found. " 163 | echo " Run 'sh install_bracken.sh' to generate the kmer2read_distr script." 164 | echo " Alternatively, cd to BRACKEN_FOLDER/src/ and run 'make'" 165 | exit 166 | fi 167 | echo " Finished creating database${READ_LEN}mers.kraken and database${READ_LEN}mers.kmer_distrib [in DB folder]" 168 | echo " *NOTE: to create read distribution files for multiple read lengths, " 169 | echo " rerun this script specifying the same database but a different read length" 170 | echo 171 | echo "Bracken build complete." 172 | -------------------------------------------------------------------------------- /bin/scripts/cat_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import gzip 6 | import argparse 7 | import logging 8 | 9 | desc = 'Simple cat of files; files can be gzip\'d' 10 | epi = """DESCRIPTION: 11 | Simple concatentation of files that allows for a mixture 12 | of gzip'd and uncompressed files. 
13 | Output written to STDOUT 14 | """ 15 | parser = argparse.ArgumentParser(description=desc, 16 | epilog=epi, 17 | formatter_class=argparse.RawTextHelpFormatter) 18 | parser.add_argument('input', metavar='input', type=str, nargs='+', 19 | help='Input file(s)') 20 | parser.add_argument('--header', action='store_true', default=False, 21 | help='Input files have headers, so just keep first (default: %(default)s)') 22 | parser.add_argument('--version', action='version', version='0.0.1') 23 | 24 | 25 | def main(args): 26 | for i,infile in enumerate(args.input): 27 | if infile.endswith('.gz'): 28 | _open = lambda x: gzip.open(x, 'rb') 29 | else: 30 | _open = lambda x: open(x, 'r') 31 | with _open(infile) as inF: 32 | for ii,line in enumerate(inF): 33 | # skipping header (except for first table) 34 | if i > 0 and ii == 0 and args.header is True: 35 | continue 36 | # writing line 37 | if infile.endswith('.gz'): 38 | line = line.decode('utf-8') 39 | print(line, end='') 40 | 41 | if __name__ == '__main__': 42 | args = parser.parse_args() 43 | main(args) 44 | -------------------------------------------------------------------------------- /bin/scripts/check_gene_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import gzip 6 | import argparse 7 | import logging 8 | 9 | desc = 'Simple check of gene input' 10 | epi = """DESCRIPTION: 11 | Check of overlapping UUIDs for fasta files & metadata table. 12 | Also, check that required data is provided in the metadata table. 13 | """ 14 | parser = argparse.ArgumentParser(description=desc, 15 | epilog=epi, 16 | formatter_class=argparse.RawTextHelpFormatter) 17 | parser.add_argument('prot_fasta', metavar='prot_fasta', type=str, 18 | help='Protein sequence fasta') 19 | parser.add_argument('metadata', metavar='metadata', type=str, 20 | help='gene metadata') 21 | parser.add_argument('-n', '--nuc-fasta', type=str, default=None, 22 | help='Nucleotide sequence fasta (default: %(default)s)') 23 | parser.add_argument('--version', action='version', version='0.0.1') 24 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 25 | 26 | def get_open(infile): 27 | if infile.endswith('.gz'): 28 | _open = lambda x: gzip.open(x, 'rb') 29 | else: 30 | _open = lambda x: open(x, 'r') 31 | return _open 32 | 33 | def read_fasta(infile): 34 | logging.info('Reading file: {}'.format(infile)) 35 | _open = get_open(infile) 36 | seqs = [] 37 | with _open(infile) as inF: 38 | for line in inF: 39 | if infile.endswith('.gz'): 40 | line = line.decode('utf-8') 41 | if line.startswith('>'): 42 | seqs.append(line.lstrip('>').rstrip()) 43 | return set(seqs) 44 | 45 | def read_meta(infile): 46 | logging.info('Reading file: {}'.format(infile)) 47 | _open = get_open(infile) 48 | meta = {} 49 | header = {} 50 | req_cols = ['seq_uuid', 'seq_orig_name', 'domain', 'phylum', 51 | 'class', 'order', 'family', 'genus', 'species', 52 | 'taxid', 'genome_name', 'genome_length_bp'] 53 | non_empty_cols = ['seq_uuid', 'seq_orig_name', 'genus', 'species'] 54 | entry_cnt = 0 55 | with _open(infile) as inF: 56 | for i,line in enumerate(inF): 57 | if infile.endswith('.gz'): 58 | line = line.decode('utf-8') 59 | line = line.rstrip().split('\t') 60 | if line == '': 61 | continue 62 | # header 63 | if i == 0: 64 | header = {x:ii for ii,x in enumerate(line)} 65 | missing = [] 66 | for x in req_cols: 67 | try: 68 | _ = header[x] 69 | except KeyError: 70 | 
missing.append(x) 71 | if len(missing) > 0: 72 | msg = 'Missing columns in metadata table: {}' 73 | raise ValueError(msg.format(','.join(missing))) 74 | else: 75 | logging.info('The metadata table has all required columns') 76 | continue 77 | # body 78 | entry_cnt += 1 79 | for col in non_empty_cols: 80 | if line[header[col]] == '': 81 | msg = 'Line {}: Column "{}" cannot be empty' 82 | raise ValueError(msg.format(i+1, col)) 83 | seq_uuid = line[header['seq_uuid']] 84 | seq_orig_name = line[header['seq_orig_name']] 85 | try: 86 | _ = meta[seq_uuid] 87 | raise ValueError('Value duplicated: {}'.format(seq_uuid)) 88 | except KeyError: 89 | pass 90 | meta[seq_uuid] = seq_orig_name 91 | 92 | return meta 93 | 94 | def main(args): 95 | # aa fasta 96 | aa_seqs = read_fasta(args.prot_fasta) 97 | # nuc fasta 98 | if args.nuc_fasta is not None: 99 | nuc_seqs = read_fasta(args.nuc_fasta) 100 | if not len(aa_seqs) == len(nuc_seqs): 101 | msg = 'WARNING: prot. & nuc. seqs differ in length!' 102 | logging.warning(msg) 103 | # compare fasta files 104 | logging.info('Comparing fasta files...') 105 | ## just in prot. 106 | just_aa = list(aa_seqs - nuc_seqs) 107 | if len(just_aa) > 0: 108 | just_aa = '\n '.join(just_aa) 109 | print('Genes just in the prot. fasta:\n {}'.format(just_aa)) 110 | ## just in nuc. 111 | just_nuc = list(nuc_seqs - aa_seqs) 112 | if len(just_nuc) > 0: 113 | just_nuc = '\n '.join(just_nuc) 114 | print('Genes just in the nuc. fasta:\n {}'.format(just_nuc)) 115 | raise ValueError('Exiting due to mismatches') 116 | # metadata 117 | meta = read_meta(args.metadata) 118 | ## comparing to fasta 119 | if len(meta.keys()) != len(aa_seqs): 120 | msg = 'WARNING: No. of metadata entries does not match no. of prot. seqs' 121 | logging.warning(msg) 122 | ### just in protein 123 | just_aa = list(aa_seqs - set(meta.keys())) 124 | if len(just_aa) > 0: 125 | just_aa = '\n '.join(just_aa) 126 | print('Genes just in the prot. fasta:\n {}'.format(just_aa)) 127 | just_txt = list(set(meta.keys()) - aa_seqs) 128 | if len(just_txt) > 0: 129 | msg = 'Genes just in the metadata table:\n {}' 130 | print(msg.format('\n '.join(just_txt))) 131 | just_txt = '\n '.join([meta[x] for x in just_txt]) 132 | print('Genes just in the metadata table (gene_orig_name):\n {}'.format(just_txt)) 133 | raise ValueError('Exiting due to mismatches') 134 | 135 | 136 | if __name__ == '__main__': 137 | args = parser.parse_args() 138 | main(args) 139 | -------------------------------------------------------------------------------- /bin/scripts/download_taxonomy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2013-2019, Derrick Wood 4 | # 5 | # This file is part of the Kraken 2 taxonomic sequence classification system. 6 | 7 | # Download NCBI taxonomy information for Kraken 2. 8 | # Designed to be called by kraken2-build 9 | 10 | set -u # Protect against uninitialized vars. 11 | set -e # Stop on error 12 | 13 | TAXONOMY_DIR="$KRAKEN2_DB_NAME/taxonomy" 14 | NCBI_SERVER="ftp.ncbi.nlm.nih.gov" 15 | RSYNC_SERVER="rsync://$NCBI_SERVER" 16 | FTP_SERVER="ftp://$NCBI_SERVER" 17 | 18 | mkdir -p "$TAXONOMY_DIR" 19 | cd "$TAXONOMY_DIR" 20 | 21 | function download_file() { 22 | file="$1" 23 | if [ -n "$KRAKEN2_USE_FTP" ] 24 | then 25 | wget -q ${FTP_SERVER}${file} 26 | else 27 | rsync --no-motd ${RSYNC_SERVER}${file} . 28 | fi 29 | } 30 | 31 | if [ ! 
-e "accmap.dlflag" ] && [ -z "$KRAKEN2_SKIP_MAPS" ] 32 | then 33 | if [ -z "$KRAKEN2_PROTEIN_DB" ] 34 | then 35 | for subsection in gb wgs 36 | do 37 | 1>&2 echo -n "Downloading nucleotide ${subsection} accession to taxon map..." 38 | download_file "/pub/taxonomy/accession2taxid/nucl_${subsection}.accession2taxid.gz" 39 | 1>&2 echo " done." 40 | done 41 | else 42 | 1>&2 echo -n "Downloading protein accession to taxon map..." 43 | download_file "/pub/taxonomy/accession2taxid/prot.accession2taxid.gz" 44 | 1>&2 echo " done." 45 | fi 46 | touch accmap.dlflag 47 | 1>&2 echo "Downloaded accession to taxon map(s)" 48 | fi 49 | 50 | if [ ! -e "taxdump.dlflag" ] 51 | then 52 | 1>&2 echo -n "Downloading taxonomy tree data..." 53 | download_file "/pub/taxonomy/taxdump.tar.gz" 54 | touch taxdump.dlflag 55 | 1>&2 echo " done." 56 | fi 57 | 58 | if ls | grep -q 'accession2taxid\.gz$' 59 | then 60 | 1>&2 echo -n "Uncompressing taxonomy data..." 61 | gunzip *accession2taxid.gz 62 | 1>&2 echo " done." 63 | fi 64 | 65 | if [ ! -e "taxdump.untarflag" ] 66 | then 67 | 1>&2 echo -n "Untarring taxonomy tree data..." 68 | tar zxf taxdump.tar.gz 69 | touch taxdump.untarflag 70 | 1>&2 echo " done." 71 | fi 72 | -------------------------------------------------------------------------------- /bin/scripts/filter_cluster_reps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import gzip 6 | import argparse 7 | import logging 8 | 9 | desc = 'Filtering the cluster reps to just those that lack current annotations' 10 | epi = """DESCRIPTION: 11 | Filtering cluster reps to just those that are lacking any 12 | annotation data. 13 | Output (filtered fasta) written to STDOUT 14 | """ 15 | parser = argparse.ArgumentParser(description=desc, 16 | epilog=epi, 17 | formatter_class=argparse.RawTextHelpFormatter) 18 | parser.add_argument('cluster_membership', metavar='cluster_membership', type=str, 19 | help='mmseqs cluster membership file (format: cluster_rep<tab>cluster_member)') 20 | parser.add_argument('query_hits', metavar='query_hits', type=str, 21 | help='blast-formatted table of hits (cluster_rep <=> target_db_seqs)') 22 | parser.add_argument('cluster_reps_fasta', metavar='cluster_reps_aa', type=str, 23 | help='mmseqs cluster representatives fasta file') 24 | parser.add_argument('--hit-columns', type=str, 25 | default='qseqid,sseqid,evalue,pident,alnlen,slen', 26 | help='Hit table output columns (default: %(default)s)') 27 | parser.add_argument('--min-pident', type=float, default=90, 28 | help='Min % identity of hit (default: %(default)s)') 29 | parser.add_argument('--min-cov', type=float, default=80, 30 | help='Min % alignment coverage of subject sequence length (default: %(default)s)') 31 | parser.add_argument('--reps-metadata', type=str, default=None, 32 | help='Cluster reps metadata in order to provide more summary info about filtering (default: %(default)s)') 33 | parser.add_argument('--version', action='version', version='0.0.1') 34 | 35 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 36 | 37 | def _open(infile): 38 | if infile.endswith('.gz'): 39 | return gzip.open(infile, 'rb') 40 | else: 41 | return open(infile) 42 | 43 | def read_membership(infile): 44 | """ 45 | Reading in cluster membership table: cluster_rep<tab>cluster_member 46 | Return: 47 | dict: {cluster_member : cluster_id} 48 | """ 49 | logging.info('Reading file: {}'.format(infile)) 50 | mem = {} 51
| with _open(infile) as inF: 52 | for line in inF: 53 | if infile.endswith('.gz'): 54 | line = line.decode('utf-8') 55 | line = line.rstrip().split('\t') 56 | if len(line) < 2: 57 | continue 58 | mem[line[1]] = line[0] 59 | logging.info(' No. of cluster members: {}'.format(len(mem.keys()))) 60 | n_clst = len(set(mem.values())) 61 | logging.info(' No. of clusters: {}'.format(n_clst)) 62 | return mem 63 | 64 | def read_hits(infile, mem, colnames, min_pident=0, min_cov=0): 65 | """ 66 | Loading query hits. 67 | Return: 68 | set(cluster rep) # clusters with hits 69 | """ 70 | logging.info('Loading hits table...') 71 | clusts = [] 72 | idx = {x:i for i,x in enumerate(colnames.split(','))} 73 | with _open(infile) as inF: 74 | for i,line in enumerate(inF): 75 | if infile.endswith('.gz'): 76 | line = line.decode('utf-8') 77 | line = line.rstrip().split('\t') 78 | if line[0] == '': 79 | continue 80 | if len(line) < 2: 81 | msg = 'Line {}: <2 values in hits table' 82 | raise ValueError(msg.format(i+1)) 83 | else: 84 | # filtering to just acceptable annotations 85 | ## percent identity 86 | pident = 0 87 | try: 88 | pident = float(line[idx['pident']]) 89 | except KeyError: 90 | pass 91 | if pident < min_pident: 92 | continue 93 | ## coverage of target seq 94 | cov = 0 95 | try: 96 | cov = float(line[idx['slen']]) / float(line[idx['alnlen']]) * 100 97 | except KeyError: 98 | pass 99 | if cov < min_cov: 100 | continue 101 | ## adding clusterID to set of genes w/ acceptable annotation 102 | qseqid = line[idx['qseqid']] 103 | try: 104 | clusts.append(mem[qseqid]) 105 | except KeyError: 106 | msg = 'Cannot find "{}" in cluster membership' 107 | raise KeyError(msg.format(qseqid)) 108 | clusts = set(clusts) 109 | msg = ' No of clusters with acceptable annotations: {}' 110 | logging.info(msg.format(len(clusts))) 111 | return clusts 112 | 113 | def filter_fasta(infile, clust_w_annot, meta=None): 114 | """ 115 | Filtering out sequences that are in the set(clust_w_annot) 116 | Return: 117 | None 118 | """ 119 | logging.info('Filtering input fasta...') 120 | cnts = {'all' : 0, 'filtered' : 0, 'kept' : 0} 121 | genome_cnt = {} 122 | to_keep = False 123 | with _open(infile) as inF: 124 | for line in inF: 125 | if infile.endswith('.gz'): 126 | line = line.decode('utf-8') 127 | line = line.rstrip() 128 | if line.startswith('>'): 129 | cnts['all'] += 1 130 | # already has annotation? 131 | line = line.lstrip('>').rstrip().split(' ')[0] 132 | if line in clust_w_annot: 133 | to_keep = False 134 | cnts['filtered'] += 1 135 | else: 136 | to_keep = True 137 | cnts['kept'] += 1 138 | print('>' + line) 139 | # metadata stats 140 | if meta is not None and to_keep is True: 141 | try: 142 | genome = meta[line] 143 | except KeyError: 144 | genome = 'OTHER' 145 | try: 146 | genome_cnt[genome] += 1 147 | except KeyError: 148 | genome_cnt[genome] = 1 149 | elif to_keep == True: 150 | print(line) 151 | # status 152 | logging.info(' No. of total seqs: {}'.format(cnts['all'])) 153 | logging.info(' No. of filtered seqs: {}'.format(cnts['filtered'])) 154 | logging.info(' No. of retained seqs: {}'.format(cnts['kept'])) 155 | ## w/ metadata 156 | if meta is not None: 157 | msg = ' No. 
of retained seqs for {}: {}' 158 | for genome,cnt in genome_cnt.items(): 159 | logging.info(msg.format(genome, cnt)) 160 | 161 | def read_metadata(infile): 162 | logging.info('Reading file: {}'.format(infile)) 163 | header = {} 164 | meta = {} 165 | with _open(infile) as inF: 166 | for i,line in enumerate(inF): 167 | if infile.endswith('.gz'): 168 | line = line.decode('utf-8') 169 | line = line.rstrip().split('\t') 170 | if i == 0: 171 | header = {x:ii for ii,x in enumerate(line)} 172 | continue 173 | seqid = line[header['seq_uuid']] 174 | genome = line[header['genome_name']] 175 | meta[seqid] = genome 176 | return meta 177 | 178 | def main(args): 179 | # cluster metadata 180 | if args.reps_metadata is not None: 181 | meta = read_metadata(args.reps_metadata) 182 | else: 183 | meta = None 184 | 185 | # determining which clusters already have acceptable annotations 186 | clust_w_annot = read_hits(args.query_hits, 187 | read_membership(args.cluster_membership), 188 | colnames = args.hit_columns, 189 | min_pident = args.min_pident, 190 | min_cov = args.min_cov) 191 | # filtering fasta to just those lacking acceptable annotations 192 | filter_fasta(args.cluster_reps_fasta, clust_w_annot, meta) 193 | 194 | 195 | if __name__ == '__main__': 196 | args = parser.parse_args() 197 | main(args) 198 | 199 | -------------------------------------------------------------------------------- /bin/scripts/filter_seqs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import sys,os 4 | import re 5 | import gzip 6 | import bz2 7 | import uuid 8 | import argparse 9 | import logging 10 | 11 | desc = 'Filtering two fasta files down to just intersection' 12 | epi = """DESCRIPTION: 13 | Filtering 2 fasta files down to the intersection of their sequences. 14 | The sequence headers must perfectly match. 15 | If any duplicate headers, only the first will be selected. 16 | 17 | Output columns: 18 | * seq UUID 19 | * seq original name 20 | * domain 21 | * phylum 22 | * class 23 | * order 24 | * family 25 | * genus 26 | * species 27 | * taxid 28 | * genome ID 29 | * genome length 30 | 31 | Output written to STDOUT.
32 | """ 33 | parser = argparse.ArgumentParser(description=desc, 34 | epilog=epi, 35 | formatter_class=argparse.RawTextHelpFormatter) 36 | parser.add_argument('fasta1', metavar='fasta1', type=str, 37 | help='The first fasta file') 38 | parser.add_argument('fasta2', metavar='fasta2', type=str, 39 | help='The second fasta file') 40 | parser.add_argument('fasta1_output', metavar='fasta1_output', type=str, 41 | help='Name of the output fasta1 file') 42 | parser.add_argument('fasta2_output', metavar='fasta2_output', type=str, 43 | help='Name of the output fasta2 file') 44 | parser.add_argument('--taxonomy', type=str, default='', 45 | help='genome taxonomy') 46 | parser.add_argument('--taxID', type=str, default='', 47 | help='genome taxonomy') 48 | parser.add_argument('--accession', type=str, default='', 49 | help='genome accession') 50 | parser.add_argument('--genome-file', type=str, default='', 51 | help='genome fasta file (to get genome length)') 52 | parser.add_argument('--gzip', action='store_true', default=False, 53 | help='gzip output') 54 | parser.add_argument('--version', action='version', version='0.0.1') 55 | 56 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 57 | 58 | def _open(infile, mode='rb'): 59 | """ 60 | Openning of input, regardless of compression 61 | """ 62 | if infile.endswith('.bz2'): 63 | return bz2.open(infile, mode) 64 | elif infile.endswith('.gz'): 65 | return gzip.open(infile, mode) 66 | else: 67 | return open(infile) 68 | 69 | def _decode(line): 70 | """ 71 | Decoding input, depending on the file extension 72 | """ 73 | try: 74 | line = line.decode('utf-8') 75 | except AttributeError: 76 | pass 77 | return line 78 | 79 | def make_index(fasta): 80 | regex = re.compile(r' .+') 81 | if fasta.endswith('.gz'): 82 | _openR = lambda x: gzip.open(x, 'rb') 83 | else: 84 | _openR = lambda x: open(x, 'r') 85 | 86 | idx = {} 87 | with _open(fasta) as inF: 88 | for line in inF: 89 | line = _decode(line) 90 | if line.startswith('>'): 91 | line = line.lstrip('>').rstrip() 92 | idx[regex.sub('', line)] = 0 93 | return set(idx.keys()) 94 | 95 | def format_taxonomy(tax, taxID): 96 | """Formatting taxonomy string 97 | """ 98 | logging.info('Taxonomy string provided: {}'.format(tax)) 99 | logging.info('TaxID provided: {}'.format(taxID)) 100 | 101 | try: 102 | taxID = int(float(taxID.strip())) 103 | except ValueError: 104 | msg = 'WARNING: taxID "{}" is not an integer!' 
105 | logging.warning(msg.format(taxID)) 106 | #raise ValueError(msg) 107 | tax = [re.sub('[^A-Za-z0-9-_;]+', '_', x) for x in tax.split(';')] 108 | regex_d = re.compile(r'[Dd]__.+') 109 | regex_p = re.compile(r'[Pp]__.+') 110 | regex_c = re.compile(r'[Cc]__.+') 111 | regex_o = re.compile(r'[Oo]__.+') 112 | regex_f = re.compile(r'[Ff]__.+') 113 | regex_g = re.compile(r'[Gg]__.+') 114 | regex_s = re.compile(r'[Ss]__.+') 115 | 116 | domain = 'd__unclassified' 117 | phylum = 'p__unclassified' 118 | cls = 'c__unclassified' 119 | order = 'o__unclassified' 120 | family = 'f__unclassified' 121 | genus = 'g__unclassified' 122 | species = 's__unclassified' 123 | for lev in tax: 124 | if regex_d.match(lev): 125 | domain = lev 126 | elif regex_p.match(lev): 127 | phylum = lev 128 | if regex_c.match(lev): 129 | cls = lev 130 | elif regex_o.match(lev): 131 | order = lev 132 | if regex_f.match(lev): 133 | family = lev 134 | if regex_g.match(lev): 135 | genus = lev 136 | elif regex_s.match(lev): 137 | species = lev 138 | 139 | tax = [domain, phylum, cls, order, family, genus, species, str(taxID)] 140 | logging.info('Converted taxonomy string to {}'.format(';'.join(tax))) 141 | return tax 142 | 143 | def filter_fasta(fasta, idx, output, gzip_out=False): 144 | """ 145 | Filtering fasta to just those in idx 146 | """ 147 | if gzip_out is True: 148 | _openW = lambda x: gzip.open(x, 'wb') 149 | else: 150 | _openW = lambda x: open(x, 'w') 151 | 152 | found = {} 153 | hit = False 154 | regex = re.compile(r' .+') 155 | with _open(fasta) as inF, _openW(output) as outF: 156 | for line in inF: 157 | line = _decode(line) 158 | if line.startswith('>'): 159 | line = regex.sub('', line.lstrip('>').rstrip()) 160 | # filter is already seen 161 | try: 162 | found[line] 163 | continue 164 | except KeyError: 165 | pass 166 | # is seq in index? 167 | try: 168 | found[line] = idx[line] 169 | hit = True 170 | except KeyError: 171 | hit = False 172 | continue 173 | seq_name = '>' + idx[line] + '\n' 174 | try: 175 | outF.write(seq_name) 176 | except TypeError: 177 | outF.write(seq_name.encode('utf-8')) 178 | else: 179 | if hit: 180 | try: 181 | outF.write(line) 182 | except TypeError: 183 | outF.write(line.encode('utf-8')) 184 | 185 | logging.info('File written: {}'.format(output)) 186 | logging.info('Number of seqs written: {}'.format(len(found.keys()))) 187 | return found 188 | 189 | def idx_overlap(idx1, idx2, verbose=True): 190 | """ 191 | Getting overlapping keys 192 | """ 193 | idx = {} 194 | for x in set(idx1.keys()) & set(idx2.keys()): 195 | idx[x] = idx1[x] 196 | if verbose: 197 | logging.info('No. of seqIDs in idx1: {}'.format(len(idx1.keys()))) 198 | logging.info('No. of seqIDs in idx2: {}'.format(len(idx2.keys()))) 199 | logging.info('No. 
of overlapping seqIDs: {}'.format(len(idx.keys()))) 200 | return idx 201 | 202 | def write_name_idx(idx, tax, genome_id, genome_len): 203 | """ 204 | Writing gene metadata 205 | """ 206 | header = ['seq_uuid', 'seq_orig_name', 'domain', 'phylum', 207 | 'class', 'order', 'family', 'genus', 'species', 208 | 'taxid', 'genome_name', 'genome_length_bp'] 209 | print('\t'.join(header)) 210 | for k,v in idx.items(): 211 | print('\t'.join([v, k] + tax + [genome_id, str(genome_len)])) 212 | 213 | def get_genome_length(infile): 214 | """ 215 | Getting the length of the genome 216 | """ 217 | if infile.endswith('.gz'): 218 | _open = lambda x: gzip.open(x, 'rb') 219 | else: 220 | _open = lambda x: open(x) 221 | seq_len = 0 222 | with _open(infile) as inF: 223 | for line in inF: 224 | if infile.endswith('.gz'): 225 | line = line.decode('utf-8') 226 | if not line.startswith('>'): 227 | seq_len += len(line.rstrip()) 228 | return seq_len 229 | 230 | def main(args): 231 | """ 232 | Main interface 233 | """ 234 | tax = format_taxonomy(args.taxonomy, args.taxID) 235 | genome_len = get_genome_length(args.genome_file) 236 | genomeID = args.accession 237 | 238 | # creating the seq header index 239 | seq_idx = make_index(args.fasta1) & make_index(args.fasta2) 240 | seq_idx = {x:str(uuid.uuid4()).replace('-', '') for x in seq_idx} 241 | 242 | # filtering the fasta files 243 | idx = filter_fasta(args.fasta1, seq_idx, 244 | args.fasta1_output, gzip_out=args.gzip) 245 | idx = filter_fasta(args.fasta2, seq_idx, 246 | args.fasta2_output, gzip_out=args.gzip) 247 | # creating name index 248 | write_name_idx(idx, tax, genomeID, genome_len) 249 | 250 | if __name__ == '__main__': 251 | args = parser.parse_args() 252 | main(args) 253 | -------------------------------------------------------------------------------- /bin/scripts/kraken2-build: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | # Copyright 2013-2019, Derrick Wood 4 | # 5 | # This file is part of the Kraken 2 taxonomic sequence classification system. 6 | 7 | # General build process wrapper for Kraken 2. 8 | 9 | use strict; 10 | use warnings; 11 | use File::Basename; 12 | use Getopt::Long; 13 | 14 | my $PROG = basename $0; 15 | my $KRAKEN2_DIR = "#####=KRAKEN2_DIR=#####"; 16 | 17 | # Test to see if the executables got moved, try to recover if we can 18 | if (! 
-e "$KRAKEN2_DIR/classify") { 19 | use Cwd 'abs_path'; 20 | $KRAKEN2_DIR = dirname abs_path($0); 21 | } 22 | 23 | $ENV{"KRAKEN2_DIR"} = $KRAKEN2_DIR; 24 | $ENV{"PATH"} = "$KRAKEN2_DIR:$ENV{PATH}"; 25 | 26 | my $DEF_AA_MINIMIZER_LEN = 12; 27 | my $DEF_AA_KMER_LEN = 15; 28 | my $DEF_AA_MINIMIZER_SPACES = 0; 29 | my $DEF_NT_MINIMIZER_LEN = 31; 30 | my $DEF_NT_KMER_LEN = 35; 31 | my $DEF_NT_MINIMIZER_SPACES = 7; 32 | my $DEF_THREAD_CT = 1; 33 | 34 | my @VALID_LIBRARY_TYPES = qw/archaea bacteria plasmid viral plant 35 | protozoa fungi human nr nt env_nr env_nt 36 | UniVec UniVec_Core/; 37 | my @VALID_SPECIAL_DB_TYPES = qw/greengenes silva rdp/; 38 | 39 | # Option/task option variables 40 | my ( 41 | $db, 42 | $threads, 43 | $minimizer_len, 44 | $kmer_len, 45 | $minimizer_spaces, 46 | $is_protein, 47 | $no_masking, 48 | $max_db_size, 49 | $use_ftp, 50 | $skip_maps, 51 | 52 | $dl_taxonomy, 53 | $dl_library, 54 | $add_to_library, 55 | $build, 56 | $standard, 57 | $clean, 58 | $special, 59 | ); 60 | 61 | $threads = $DEF_THREAD_CT; 62 | $is_protein = 0; 63 | 64 | # variables corresponding to task options 65 | my @TASK_LIST = ( 66 | \$dl_taxonomy, 67 | \$dl_library, 68 | \$add_to_library, 69 | \$build, 70 | \$standard, 71 | \$clean, 72 | \$special, 73 | ); 74 | 75 | GetOptions( 76 | "help" => \&display_help, 77 | "version" => \&display_version, 78 | 79 | "db=s" => \$db, 80 | "threads=i" => \$threads, 81 | "minimizer-len=i" => \$minimizer_len, 82 | "kmer-len=i" => \$kmer_len, 83 | "minimizer-spaces=i", \$minimizer_spaces, 84 | "protein" => \$is_protein, 85 | "no-masking" => \$no_masking, 86 | "max-db-size=i" => \$max_db_size, 87 | "use-ftp" => \$use_ftp, 88 | "skip-maps" => \$skip_maps, 89 | 90 | "download-taxonomy" => \$dl_taxonomy, 91 | "download-library=s" => \$dl_library, 92 | "add-to-library=s" => \$add_to_library, 93 | "build" => \$build, 94 | "standard" => \$standard, 95 | "clean" => \$clean, 96 | "special=s" => \$special, 97 | ) or usage(); 98 | 99 | if ($is_protein) { 100 | $kmer_len = $DEF_AA_KMER_LEN if ! defined $kmer_len; 101 | $minimizer_len = $DEF_AA_MINIMIZER_LEN if ! defined $minimizer_len; 102 | $minimizer_spaces = $DEF_AA_MINIMIZER_SPACES if ! defined $minimizer_spaces; 103 | } 104 | else { 105 | $kmer_len = $DEF_NT_KMER_LEN if ! defined $kmer_len; 106 | $minimizer_len = $DEF_NT_MINIMIZER_LEN if ! defined $minimizer_len; 107 | $minimizer_spaces = $DEF_NT_MINIMIZER_SPACES if ! defined $minimizer_spaces; 108 | } 109 | 110 | if (@ARGV) { 111 | warn "Extra arguments on command line.\n"; 112 | usage(); 113 | } 114 | my $task_options = scalar grep defined $$_, @TASK_LIST; 115 | if ($task_options > 1) { 116 | warn "More than one task option selected.\n"; 117 | usage(); 118 | } 119 | if ($task_options == 0) { 120 | warn "Must select a task option.\n"; 121 | usage(); 122 | } 123 | 124 | if (! 
defined $db) { 125 | die "Must specify a database name\n"; 126 | } 127 | if ($threads <= 0) { 128 | die "Can't use nonpositive thread count of $threads\n"; 129 | } 130 | if ($minimizer_len > $kmer_len) { 131 | die "Minimizer length ($minimizer_len) must not be greater than k ($kmer_len)\n"; 132 | } 133 | if ($minimizer_len <= 0) { 134 | die "Can't use nonpositive minimizer length of $minimizer_len\n"; 135 | } 136 | if ($minimizer_len > 31) { 137 | die "Can't use minimizer len of $minimizer_len (must be <= 31)\n"; 138 | } 139 | 140 | $ENV{"KRAKEN2_DB_NAME"} = $db; 141 | $ENV{"KRAKEN2_THREAD_CT"} = $threads; 142 | $ENV{"KRAKEN2_MINIMIZER_LEN"} = $minimizer_len; 143 | $ENV{"KRAKEN2_KMER_LEN"} = $kmer_len; 144 | $ENV{"KRAKEN2_MINIMIZER_SPACES"} = $minimizer_spaces; 145 | $ENV{"KRAKEN2_SEED_TEMPLATE"} = construct_seed_template(); 146 | $ENV{"KRAKEN2_PROTEIN_DB"} = $is_protein ? 1 : ""; 147 | $ENV{"KRAKEN2_MASK_LC"} = $no_masking ? "" : 1; 148 | $ENV{"KRAKEN2_MAX_DB_SIZE"} = defined($max_db_size) ? $max_db_size : ""; 149 | $ENV{"KRAKEN2_USE_FTP"} = $use_ftp ? 1 : ""; 150 | $ENV{"KRAKEN2_SKIP_MAPS"} = $skip_maps ? 1 : ""; 151 | 152 | if ($dl_taxonomy) { 153 | download_taxonomy(); 154 | } 155 | elsif (defined($dl_library)) { 156 | download_library($dl_library); 157 | } 158 | elsif (defined($add_to_library)) { 159 | add_to_library($add_to_library); 160 | } 161 | elsif ($standard) { 162 | standard_installation(); 163 | } 164 | elsif ($build) { 165 | build_database(); 166 | } 167 | elsif ($clean) { 168 | clean_database(); 169 | } 170 | elsif ($special) { 171 | build_special_database($special); 172 | } 173 | else { 174 | usage(); 175 | } 176 | 177 | exit -1; 178 | # END OF MAIN CODE. 179 | 180 | sub usage { 181 | my $exit_code = @_ ? shift : 64; 182 | print STDERR <|` 11 | The taxid is provided by the user. 
12 | """ 13 | 14 | parser = argparse.ArgumentParser(description=desc, 15 | epilog=epi, 16 | formatter_class=argparse.RawTextHelpFormatter) 17 | parser.add_argument('genome_file', metavar='genome_file', type=str, 18 | help='genome file (can be gzip\'ed)') 19 | parser.add_argument('taxID', metavar='taxID', type=str, 20 | help='taxonomy ID used for renaming sequences') 21 | parser.add_argument('--version', action='version', version='0.0.1') 22 | 23 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 24 | 25 | 26 | def main(args): 27 | gz = args.genome_file.endswith('.gz') 28 | if gz: 29 | inF = gzip.open(args.genome_file) 30 | else: 31 | inF = open(args.genome_file) 32 | for line in inF: 33 | if gz: 34 | line = line.decode('utf8') 35 | line = line.rstrip() 36 | if line.startswith('>'): 37 | line = '>kraken:taxid|{}|{}'.format(args.taxID, line.lstrip('>')) 38 | print(line) 39 | 40 | inF.close() 41 | 42 | if __name__ == '__main__': 43 | args = parser.parse_args() 44 | main(args) 45 | -------------------------------------------------------------------------------- /bin/scripts/uncomp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import gzip 7 | import bz2 8 | import argparse 9 | import logging 10 | 11 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 12 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 13 | argparse.RawDescriptionHelpFormatter): 14 | pass 15 | 16 | desc = 'Uncompress gzip\'ed or bz2\'ed file' 17 | epi = """DESCRIPTION: 18 | Output written to STDOUT 19 | """ 20 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 21 | formatter_class=CustomFormatter) 22 | argparse.ArgumentDefaultsHelpFormatter 23 | parser.add_argument('input_file', metavar='input_file', type=str, 24 | help='Input file') 25 | parser.add_argument('--version', action='version', version='0.0.1') 26 | 27 | 28 | def _open(infile, mode='rb'): 29 | """ 30 | Openning of input, regardless of compression 31 | """ 32 | if infile.endswith('.bz2'): 33 | return bz2.open(infile, mode) 34 | elif infile.endswith('.gz'): 35 | return gzip.open(infile, mode) 36 | else: 37 | return open(infile) 38 | 39 | def _decode(line, infile): 40 | """ 41 | Decoding input, depending on the file extension 42 | """ 43 | if os.path.isfile(infile) and (infile.endswith('.gz') or infile.endswith('.bz2')): 44 | line = line.decode('utf-8') 45 | return line 46 | 47 | def main(args): 48 | with _open(args.input_file) as inF: 49 | for line in inF: 50 | print(_decode(line, args.input_file).rstrip()) 51 | 52 | 53 | if __name__ == '__main__': 54 | args = parser.parse_args() 55 | main(args) 56 | -------------------------------------------------------------------------------- /bin/scripts/uncomp_tarball.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import tarfile 6 | import argparse 7 | import logging 8 | 9 | desc = 'Uncompress tarball' 10 | epi = """DESCRIPTION: 11 | Simple script for smartly uncompressing a tarball. 12 | All files extracted the same output directory, regardless 13 | of the directory structure in the tarball. 
14 | """ 15 | parser = argparse.ArgumentParser(description=desc, 16 | epilog=epi, 17 | formatter_class=argparse.RawTextHelpFormatter) 18 | parser.add_argument('tarball', metavar='tarball', type=str, 19 | help='tarball file to extract') 20 | parser.add_argument('-o', '--output-directory', type=str, default='.', 21 | help='Output directory location (default: %(default)s)') 22 | parser.add_argument('--version', action='version', version='0.0.1') 23 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 24 | 25 | 26 | def main(args): 27 | # output location 28 | if not os.path.isdir(args.output_directory): 29 | os.makedirs(args.output_directory) 30 | # extracting 31 | ext = 'r:gz' if args.tarball.endswith('.gz') else 'r' 32 | with tarfile.open(args.tarball, ext) as inF: 33 | files = {k:v for k,v in zip(inF.getnames(), inF.getmembers())} 34 | for F,M in files.items(): 35 | outfile = os.path.split(F)[1] 36 | logging.info('Extracting file: {}'.format(outfile)) 37 | outfile = os.path.join(args.output_directory, outfile) 38 | with inF.extractfile(M) as inFa, open(outfile, 'w') as outF: 39 | for line in inFa: 40 | outF.write(line.decode('utf-8')) 41 | 42 | if __name__ == '__main__': 43 | args = parser.parse_args() 44 | main(args) 45 | -------------------------------------------------------------------------------- /bin/scripts/uniref_clst_trans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import gzip 7 | import bz2 8 | import argparse 9 | import logging 10 | import pickle 11 | from pickle import UnpicklingError 12 | 13 | desc = 'Transferring UniRef annotations by cluster cutoff' 14 | epi = """DESCRIPTION: 15 | Using an index of how UniRef50 clusters map to UniRef90 clusters. 16 | Renaming annotations with the other UniRef cluster level 17 | (eg., UniRef90_Q8WZ42-5 => UniRef50_Q8WZ42-5). 18 | Due to the multi-mapping when going from UniRef50 to UniRef90, 19 | it makes much more sense to go from UniRef90 to UniRef50. 20 | For multi-mappings, one mapping will be selected. 21 | 22 | The input file format is determined by the input file extension. 23 | Output is written to STDOUT. 
24 | """ 25 | parser = argparse.ArgumentParser(description=desc, 26 | epilog=epi, 27 | formatter_class=argparse.RawTextHelpFormatter) 28 | parser.add_argument('index_file', metavar='index_file', type=str, 29 | help='UniRef 50 <=> 90 index') 30 | parser.add_argument('--in-nuc', type=str, default='', 31 | help='Input nucleotide fasta file path (default: %(default)s)') 32 | parser.add_argument('--in-prot', type=str, default='', 33 | help='Input amino acid fasta file path (default: %(default)s)') 34 | parser.add_argument('--in-tsv', type=str, default='', 35 | help='Input gene metadata table path (default: %(default)s)') 36 | parser.add_argument('--out-nuc', type=str, default='', 37 | help='Output nucleotide fasta file path (default: %(default)s)') 38 | parser.add_argument('--out-prot', type=str, default='', 39 | help='Output amino acid fasta file path (default: %(default)s)') 40 | parser.add_argument('--out-tsv', type=str, default='', 41 | help='Output gene metadata table path (default: %(default)s)') 42 | parser.add_argument('-d', '--direction', type=str, default='90=>50', 43 | choices = ['90=>50', '50=>90'], 44 | help='Changing annotations from X to Y (default: %(default)s)') 45 | parser.add_argument('-p', '--pickle-idx', type=str, default='', 46 | help='Write the index as a pickled dict for faster loading (default: %(default)s)') 47 | parser.add_argument('--version', action='version', version='0.0.1') 48 | 49 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 50 | 51 | 52 | def _open(infile, mode='rb'): 53 | """ 54 | Opening of input, regardless of compression 55 | """ 56 | if infile.endswith('.bz2'): 57 | return bz2.open(infile, mode) 58 | elif infile.endswith('.gz'): 59 | return gzip.open(infile, mode) 60 | else: 61 | return open(infile) 62 | 63 | def _decode(line, infile): 64 | """ 65 | Decoding input, depending on the file extension 66 | """ 67 | if os.path.isfile(infile) and (infile.endswith('.gz') or infile.endswith('.bz2')): 68 | line = line.decode('utf-8') 69 | return line 70 | 71 | def read_index(infile, direction): 72 | logging.info('Loading file: {}'.format(infile)) 73 | # if pickle 74 | try: 75 | idx = pickle.load(open(infile, 'rb')) 76 | return idx 77 | except UnpicklingError: 78 | pass 79 | 80 | idx = {} 81 | with _open(infile) as inF: 82 | for i,line in enumerate(inF): 83 | line = _decode(line, infile) 84 | line = line.rstrip().split('\t') 85 | if line[0] == '': 86 | continue 87 | if len(line) < 2: 88 | msg = 'Line {}: <2 values' 89 | raise ValueError(msg.format(i)) 90 | # assuming UniRef50<tab>UniRef90 91 | if direction == '50=>90': 92 | idx[line[0]] = line[1] 93 | else: 94 | idx[line[1]] = line[0] 95 | logging.info(' No.
of index keys: {}'.format(len(idx.keys()))) 96 | return idx 97 | 98 | def which_ext(infile): 99 | infile = infile.rstrip('.gz') 100 | for x in ['.fasta', '.fna', '.faa', '.fa']: 101 | if infile.endswith(x): 102 | return 'fasta' 103 | for x in ['.txt', '.tsv']: 104 | if infile.endswith(x): 105 | return 'txt' 106 | msg = 'Cannot determine file type from extension: {}' 107 | raise IOError(msg.format(infile)) 108 | 109 | def make_dir(outfile): 110 | D = os.path.split(outfile)[0] 111 | if not os.path.isdir(D): 112 | os.makedirs(D) 113 | 114 | def rename_fasta(infile, outfile, idx): 115 | logging.info('Processing file: {}'.format(infile)) 116 | make_dir(outfile) 117 | to_write = False 118 | stats = {'in_index' : 0, 'not_in_index' : 0} 119 | with _open(infile) as inF, open(outfile, 'w') as outF: 120 | for line in inF: 121 | line = _decode(line, infile) 122 | # header 123 | if line.startswith('>'): 124 | line = line.lstrip('>').split('|') 125 | try: 126 | line[0] = idx[line[0]] 127 | to_write = True 128 | stats['in_index'] += 1 129 | except KeyError: 130 | msg = 'Cannot find "{}" in index' 131 | logging.warning(msg.format(line[0])) 132 | to_write = False 133 | stats['not_in_index'] += 1 134 | if to_write is True: 135 | line = '>' + '|'.join(line) 136 | # body 137 | if to_write is True: 138 | outF.write(line) 139 | # status 140 | msg = ' No. of genes found in UniRef50<=>90 index: {}' 141 | logging.info(msg.format(stats['in_index'])) 142 | msg = ' No. of genes NOT found in UniRef50<=>90 index: {}' 143 | logging.info(msg.format(stats['not_in_index'])) 144 | if stats['in_index'] == 0: 145 | raise ValueError('No genes were present in the UniRef50<=>90 index!') 146 | 147 | def rename_txt(infile, outfile, idx): 148 | logging.info('Processing file: {}'.format(infile)) 149 | make_dir(outfile) 150 | header = {} 151 | to_write = False 152 | stats = {'in_index' : 0, 'not_in_index' : 0} 153 | with _open(infile) as inF, open(outfile, 'w') as outF: 154 | for i,line in enumerate(inF): 155 | line = _decode(line, infile) 156 | line = line.rstrip().split('\t') 157 | if i == 0: 158 | header = {x:ii for ii,x in enumerate(line)} 159 | else: 160 | try: 161 | line[header['annotation']] = idx[line[header['annotation']]] 162 | to_write = True 163 | stats['in_index'] += 1 164 | except KeyError: 165 | to_write = False 166 | stats['not_in_index'] += 1 167 | if i == 0 or to_write is True: 168 | outF.write('\t'.join(line) + '\n') 169 | # status 170 | msg = ' No. of genes found in UniRef50<=>90 index: {}' 171 | logging.info(msg.format(stats['in_index'])) 172 | msg = ' No. of genes NOT found in UniRef50<=>90 index: {}' 173 | logging.info(msg.format(stats['not_in_index'])) 174 | 175 | def pickle_idx(idx, outfile): 176 | logging.info('Pickling index to {}'.format(outfile)) 177 | with open(outfile, 'wb') as outF: 178 | pickle.dump(idx, outF) 179 | logging.info(' File pickled. 
Exiting') 180 | sys.exit() 181 | 182 | def main(args): 183 | # loading UniRef cluster index 184 | idx = read_index(args.index_file, args.direction) 185 | # pickle 186 | if args.pickle_idx != '': 187 | pickle_idx(idx, args.pickle_idx) 188 | # renaming input 189 | for inpath,outpath in zip([args.in_nuc, args.in_prot, args.in_tsv], 190 | [args.out_nuc, args.out_prot, args.out_tsv]): 191 | if which_ext(inpath) == 'fasta': 192 | rename_fasta(inpath, outpath, idx) 193 | elif which_ext(inpath) == 'txt': 194 | rename_txt(inpath, outpath, idx) 195 | 196 | if __name__ == '__main__': 197 | args = parser.parse_args() 198 | main(args) 199 | -------------------------------------------------------------------------------- /bin/utils/Snakefile: -------------------------------------------------------------------------------- 1 | #-- utilty functions --# 2 | import gzip 3 | import bz2 4 | from itertools import chain 5 | 6 | 7 | def concatenate(*lists): 8 | """ 9 | Combine >1 list and/or strings 10 | """ 11 | new_list = [] 12 | for x in lists: 13 | new_list.extend(x) 14 | return new_list 15 | 16 | def _open(infile, mode='rb'): 17 | """ 18 | Openning of input, regardless of compression 19 | """ 20 | if infile.endswith('.bz2'): 21 | return bz2.open(infile, mode) 22 | elif infile.endswith('.gz'): 23 | return gzip.open(infile, mode) 24 | else: 25 | return open(infile) 26 | 27 | def _decode(line, infile): 28 | """ 29 | Decoding input, depending on the file extension 30 | """ 31 | if os.path.isfile(infile) and (infile.endswith('.gz') or infile.endswith('.bz2')): 32 | line = line.decode('utf-8') 33 | return line 34 | 35 | def cat_files(*args, outfile, header=False): 36 | """ 37 | Combining files (*args). File can be compressed. 38 | Combined files written to outfile. 39 | Input: 40 | args : tuple of lists, each list contains file paths 41 | output : str, output file path 42 | header : bool, just print the header line of the first input file? 
43 | """ 44 | infiles = concatenate(*args) 45 | with open(outfile, 'w') as outF: 46 | for i,infile in enumerate(infiles): 47 | with _open(infile) as inF: 48 | for ii,line in enumerate(inF): 49 | # skipping header (except for first table) 50 | if i > 0 and ii == 0 and header is True: 51 | continue 52 | # writing line 53 | line = _decode(line, infile) 54 | outF.write(line) 55 | -------------------------------------------------------------------------------- /conda_env.yaml: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # `conda env create --name --file ` 3 | channels: 4 | - conda-forge 5 | - bioconda 6 | dependencies: 7 | - python>=3.6 8 | - mamba>=0.11 9 | - pandas>=1.1.2 10 | - snakemake>=5.31.1 11 | - r-base>=3.6 12 | - r-argparse>=2.0.1 13 | - r-curl>=4.2 14 | - r-data.table>=1.12.4 15 | - r-dplyr>=0.8.3 16 | - r-r.utils 17 | - ncbi-genome-download>=0.2.10 18 | - newick_utils>=1.6 19 | - beautifulsoup4>=4.11 20 | -------------------------------------------------------------------------------- /config-update.yaml: -------------------------------------------------------------------------------- 1 | #-- email notifications of pipeline success/failure (use "Skip" to deactivate) --# 2 | email: None 3 | 4 | #-- databases to update --# 5 | # Replace "Create" with "Skip" to skip creation of any of these 6 | # Note that braken relies on the kraken2 database 7 | databases: 8 | kraken2: Create 9 | bracken: Create 10 | genes: Create 11 | humann3_bowtie2: Create 12 | humann3_diamond: Create 13 | 14 | #-- Input --# 15 | #--- If just a set of gene sequences to add ---# 16 | # If you have nucleotide/amino-acid gene sequences formatted for humann 17 | # If translate = True, missing nuc or AA seqs will be (rev)translated from the other, else seqs not used 18 | new_genes: # remove "Skip" to add these genes instead of from the genomes 19 | amino_acid: Skip #data/UniRef50/genome_reps_filtered.faa.gz 20 | nucleotide: Skip #data/UniRef50/genome_reps_filtered.fna.gz 21 | metadata: Skip #data/UniRef50/genome_reps_filtered.txt.gz 22 | translate: True 23 | 24 | #--- If a set of genomes to add ---# 25 | # file listing samples and associated data 26 | samples_file: data/GTDBr95_n5/GTDBr95_n5.tsv 27 | 28 | ## column names in samples table 29 | samples_col: 'ncbi_organism_name' 30 | accession_col: 'accession' 31 | fasta_file_path_col: 'fasta_file_path' 32 | taxID_col: 'gtdb_taxid' # or 'ncbi_species_taxid' 33 | taxonomy_col: 'gtdb_taxonomy' # or 'ncbi_taxonomy' 34 | 35 | # Saved databases that will be updated 36 | kraken2_db: 37 | library: tests/output/GTDBr95_n10/kraken2/library/ 38 | taxonomy: tests/output/GTDBr95_n10/kraken2/taxonomy/ 39 | genes_db: 40 | genes: 41 | mmseqs_db: tests/output/GTDBr95_n10/genes/genes_db.tar.gz 42 | amino_acid: tests/output/GTDBr95_n10/genes/genome_reps_filtered.faa.gz 43 | nucleotide: tests/output/GTDBr95_n10/genes/genome_reps_filtered.fna.gz 44 | metadata: tests/output/GTDBr95_n10/genes/genome_reps_filtered.txt.gz 45 | cluster: 46 | mmseqs_db: tests/output/GTDBr95_n10/genes/cluster/clusters_db.tar.gz 47 | humann_db: 48 | query: 49 | hits: tests/output/GTDBr95_n10/humann3/annotation_hits.gz 50 | cluster: 51 | reps: tests/output/GTDBr95_n10/genes/cluster/clusters_reps.faa.gz 52 | membership: tests/output/GTDBr95_n10/genes/cluster/clusters_membership.tsv.gz 53 | 54 | #-- Output --# 55 | # output location 56 | output_dir: tests/output/GTDBr95_n10-n5/ 57 | 58 | # Name of UniRef clustering 
(uniref90 or uniref50) 59 | ## "uniref90" highly recommended 60 | uniref_name: uniref90 61 | # Name of the humann3 diamond database to create 62 | ## This must match naming allowed by humann3 63 | dmnd_name: uniref90_201901.dmnd 64 | # Index mapping UniRef90 clusters to UniRef50 (saves time vs re-annotating) 65 | ## Skip if annotating with UniRef50 66 | cluster_idx: data/uniref50-90.pkl 67 | 68 | # temporary file directory (your username will be added automatically) 69 | tmp_dir: tmp/db_update_tmp/ 70 | 71 | #-- if custom NCBI/GTDB taxdump files, "Skip" if standard NCBI taxdump --# 72 | # Used for kraken taxonomy & metaphlan 73 | names_dmp: data/taxdump/names.dmp 74 | nodes_dmp: data/taxdump/nodes.dmp 75 | 76 | #-- keep intermediate files required for re-creating DBs (eg., w/ more genomes) --# 77 | # If "True", the intermediate files are saved to `output_dir` 78 | # Else, the intermediate files are temporarily stored in `temp_folder` 79 | keep_intermediate: True 80 | 81 | #-- software parameters --# 82 | # `vsearch_per_genome` = per-genome gene clustering 83 | # for humann3, use either mmseqs or diamond (mmseqs gets priority if neither skipped) 84 | # for humann3::mmseqs_search::run, --num-iterations must be >=2 85 | params: 86 | ionice: -c 3 87 | bracken: 88 | build_kmer: 35 89 | build_read_lens: 90 | - 100 91 | - 150 92 | genes: 93 | prodigal: "" 94 | vsearch_per_genome: --id 0.97 --strand both --qmask none --fasta_width 0 95 | mmseqs_cluster_update: --min-seq-id 0.9 -c 0.8 -s 4.0 96 | humann3: 97 | batches: 2 98 | filter_existing: --min-pident 0 # any existing genes w/ < cutoff with be re-queried 99 | mmseqs_search: 100 | db: data/UniRef90/uniref90 101 | index: -s 6 102 | run: -e 1e-3 --max-accept 1 --max-seqs 100 --num-iterations 2 --start-sens 1 --sens-steps 3 -s 6 103 | diamond: 104 | db: Skip #data/uniref90_ec-filtered/uniref90_ec_filt_201901.dmnd 105 | run: --evalue 1e-3 --query-cover 80 --id 90 --max-target-seqs 1 --block-size 4 --index-chunks 2 106 | propagate_annotations: --min-cov 80 --min-pident 90 107 | 108 | #-- snakemake pipeline --# 109 | pipeline: 110 | snakemake_folder: ./ 111 | script_folder: ./bin/scripts/ 112 | name: Struo2_db-update 113 | config: update -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | #-- email notifications of pipeline success/failure (use "Skip" to deactivate) --# 2 | email: None 3 | 4 | #-- I/O --# 5 | # file listing samples and associated data 6 | samples_file: data/GTDBr95_n10/GTDBr95_n10.tsv 7 | 8 | ## column names in samples table 9 | samples_col: 'ncbi_organism_name' 10 | accession_col: 'accession' 11 | fasta_file_path_col: 'fasta_file_path' 12 | taxID_col: 'gtdb_taxid' # or 'ncbi_species_taxid' 13 | taxonomy_col: 'gtdb_taxonomy' # or 'ncbi_taxonomy' 14 | 15 | # output location 16 | output_dir: tests/output/GTDBr95_n10/ 17 | 18 | # temporary file directory (your username will be added automatically) 19 | tmp_dir: /ebio/abt3_scratch/ 20 | 21 | #-- databases to create --# 22 | # Replace "Create" with "Skip" to skip creation of any of these 23 | # Note that braken relies on the kraken2 database 24 | databases: 25 | kraken2: Create 26 | bracken: Create 27 | genes: Create 28 | humann3_bowtie2: Create 29 | humann3_diamond: Create 30 | 31 | # Name of UniRef clustering (uniref90 or uniref50) 32 | ## "uniref90" highly recommended 33 | uniref_name: uniref90 34 | # Name of the humann3 diamond database to be created 35 | ## 
This must match the naming allowed by humann3 (eg., "uniref90_201901.dmnd") 36 | dmnd_name: uniref90_201901.dmnd 37 | # Index mapping UniRef90 clusters to UniRef50 (saves time vs re-annotating) 38 | ## This is skipped if annotating with UniRef50 instead of UniRef90 39 | cluster_idx: data/uniref50-90.pkl 40 | 41 | #-- if custom NCBI/GTDB taxdump files, "Skip" if standard NCBI taxdump --# 42 | # Used for kraken taxonomy & metaphlan 43 | names_dmp: data/taxdump/names.dmp 44 | nodes_dmp: data/taxdump/nodes.dmp 45 | 46 | #-- keep intermediate files required for re-creating DBs (eg., w/ more genomes) --# 47 | # If "True", the intermediate files are saved to `output_dir` 48 | # Else, the intermediate files are temporarily stored in `temp_folder` 49 | keep_intermediate: True 50 | 51 | #-- software parameters --# 52 | # `vsearch_per_genome` = per-genome gene clustering 53 | # use "Skip" at the start of any param to skip (if possible to skip) 54 | # for humann3, use either mmseqs or diamond (mmseqs gets priority if neither skipped) 55 | # for humann3::mmseqs_search::run, --num-iterations must be >=2 56 | params: 57 | ionice: -c 3 58 | bracken: 59 | build_kmer: 35 60 | build_read_lens: 61 | - 100 62 | - 150 63 | genes: 64 | prodigal: "" 65 | vsearch_per_genome: --id 0.97 --strand both --qmask none --fasta_width 0 66 | mmseqs_cluster: --min-seq-id 0.9 -c 0.8 67 | mmseqs_cluster_method: linclust # or "cluster", which is slower 68 | humann3: 69 | batches: 2 70 | mmseqs_search: 71 | db: data/UniRef90/uniref90 72 | index: -s 6 73 | run: -e 1e-3 --max-accept 1 --max-seqs 100 --num-iterations 2 --start-sens 1 --sens-steps 3 -s 6 74 | diamond: 75 | db: Skip #data/uniref90_ec-filtered/uniref90_ec_filt_201901.dmnd 76 | run: --evalue 1e-3 --query-cover 80 --id 90 --max-target-seqs 1 --block-size 4 --index-chunks 2 77 | propagate_annotations: --min-cov 80 --min-pident 90 78 | 79 | #-- snakemake pipeline --# 80 | pipeline: 81 | snakemake_folder: ./ 82 | script_folder: ./bin/scripts/ 83 | name: Struo2_db-create 84 | config: create -------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_000014945.1_ASM1494v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_000014945.1_ASM1494v1_genomic.fna.gz -------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_000720375.1_ASM72037v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_000720375.1_ASM72037v1_genomic.fna.gz -------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_002478565.1_ASM247856v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_002478565.1_ASM247856v1_genomic.fna.gz -------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_006715045.1_ASM671504v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_006715045.1_ASM671504v1_genomic.fna.gz 
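As a usage sketch tied to the config-update.yaml and config.yaml files shown above: both are plain Snakemake config files that are handed to the workflow via --configfile. A minimal local (non-cluster) invocation might look like the line below; the core count (-j 8) is an arbitrary example value, --use-conda assumes the per-rule conda environments should be built, and the working directory is assumed to be the repository root. The repository's own snakemake_sge.sh (shown further below) wraps an equivalent call for SGE cluster submission.

    # hypothetical example invocation, not a project-mandated command
    snakemake --use-conda --configfile config.yaml -j 8 --printshellcmds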
-------------------------------------------------------------------------------- /data/GTDBr95_n5/GCA_007116575.1_ASM711657v1_genomic.fna.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/data/GTDBr95_n5/GCA_007116575.1_ASM711657v1_genomic.fna.gz -------------------------------------------------------------------------------- /data/GTDBr95_n5/GTDBr95_n5.tsv: -------------------------------------------------------------------------------- 1 | ncbi_organism_name accession ambiguous_bases checkm_completeness checkm_contamination checkm_marker_count checkm_marker_lineage checkm_marker_set_count checkm_strain_heterogeneity coding_bases coding_density contig_count gc_count gc_percentage genome_size gtdb_genome_representative gtdb_representative gtdb_taxonomy gtdb_type_designation gtdb_type_designation_sources gtdb_type_species_of_genus l50_contigs l50_scaffolds longest_contig longest_scaffold lsu_23s_contig_len lsu_23s_count lsu_23s_length lsu_23s_query_id lsu_5s_contig_len lsu_5s_count lsu_5s_length lsu_5s_query_id lsu_silva_23s_blast_align_len lsu_silva_23s_blast_bitscore lsu_silva_23s_blast_evalue lsu_silva_23s_blast_perc_identity lsu_silva_23s_blast_subject_id lsu_silva_23s_taxonomy mean_contig_length mean_scaffold_length mimag_high_quality mimag_low_quality mimag_medium_quality n50_contigs n50_scaffolds ncbi_assembly_level ncbi_assembly_name ncbi_assembly_type ncbi_bioproject ncbi_biosample ncbi_contig_count ncbi_contig_n50 ncbi_country ncbi_date ncbi_genbank_assembly_accession ncbi_genome_category ncbi_genome_representation ncbi_isolate ncbi_isolation_source ncbi_lat_lon ncbi_molecule_count ncbi_ncrna_count ncbi_protein_count ncbi_refseq_category ncbi_rrna_count ncbi_scaffold_count ncbi_scaffold_l50 ncbi_scaffold_n50 ncbi_scaffold_n75 ncbi_scaffold_n90 ncbi_seq_rel_date ncbi_spanned_gaps ncbi_species_taxid ncbi_ssu_count ncbi_strain_identifiers ncbi_submitter ncbi_taxid ncbi_taxonomy ncbi_taxonomy_unfiltered ncbi_total_gap_length ncbi_total_length ncbi_translation_table ncbi_trna_count ncbi_type_material_designation ncbi_ungapped_length ncbi_unspanned_gaps ncbi_wgs_master protein_count scaffold_count ssu_contig_len ssu_count ssu_gg_blast_align_len ssu_gg_blast_bitscore ssu_gg_blast_evalue ssu_gg_blast_perc_identity ssu_gg_blast_subject_id ssu_gg_taxonomy ssu_length ssu_query_id ssu_silva_blast_align_len ssu_silva_blast_bitscore ssu_silva_blast_evalue ssu_silva_blast_perc_identity ssu_silva_blast_subject_id ssu_silva_taxonomy total_gap_length trna_aa_count trna_count trna_selenocysteine_count SPECIES gtdb_taxid fasta_file_path 2 | RS_GCF_006715045.1_Amycolatopsis cihanbeyliensis RS_GCF_006715045.1 0 100 0.99 350 o__Actinomycetales (UID2014) 203 0 6692435 90.00726117 2 5213783 70.12071513 7435439 RS_GCF_006715045.1 t d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis type strain of species LPSN; DSMZ; StrainInfo f 1 1 6286745 6286745 6286745 2 3100 NZ_VFML01000001.1 6286745 2 108 NZ_VFML01000001.1 3118 4571 0 93.297 ARVW01000001.3175334.3178445 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Amycolatopsis;Amycolatopsis nigrescens CSC17Ta-90 3717719 3717719 t f f 6286745 6286745 Contig ASM671504v1 n/a PRJNA224116 SAMN11512385 2 6286745 none 7/8/19 GCA_006715045.1 none full none none none 0 0 none na 6 none none none none none 7/8/19 0 1128664 2 DSM 
45679 DOE Joint Genome Institute 1128664 d__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis d__Bacteria;x__Terrabacteria group;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis 0 7435439 11 49 assembly from type material 7435439 0 VFML00000000.1 6730 2 6286745 2 none none none none none none 1514 NZ_VFML01000001.1 1460 2697 0 100 JN989302.1.1460 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Amycolatopsis;Amycolatopsis cihanbeyliensis 0 20 49 0 Amycolatopsis cihanbeyliensis 202433 data/GTDBr95_n5/GCA_006715045.1_ASM671504v1_genomic.fna.gz 3 | GB_GCA_002478565.1_Flavobacteriales bacterium UBA7468 GB_GCA_002478565.1 0 96.74 0.54 277 k__Bacteria (UID2569) 185 0 2200517 95.42095689 46 952777 41.34812551 2306115 GB_GCA_002478565.1 t d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Crocinitomicaceae;g__UBA952;s__UBA952 sp002478565 not type material none f 6 5 378299 378299 none 0 none none none 0 none none none none none none none none 50093 88696 f f t 122097 148012 Scaffold ASM247856v1 n/a PRJNA348753 SAMN06451991 46 122097 none 10/6/17 GCA_002478565.1 derived from metagenome full UBA7468 none none 0 0 0 na 0 none none none none none 10/6/17 20 1951076 0 UBA7468 University of Queensland 1951076 d__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__;g__;s__ d__Bacteria;x__FCB group;x__Bacteroidetes/Chlorobi group;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;x__unclassified Flavobacteriales;x__unclassified Flavobacteriales (miscellaneous);s__Flavobacteriales bacterium UBA7468 1834 2306115 none 0 none 2304281 0 DLPC00000000.1 2062 26 none 0 none none none none none none none none none none none none none none 1834 17 28 0 UBA952 sp002478565 189650 data/GTDBr95_n5/GCA_002478565.1_ASM247856v1_genomic.fna.gz 4 | GB_GCA_007116575.1_Rhodobacteraceae bacterium GB_GCA_007116575.1 0 90.5 0.91 568 f__Rhodobacteraceae (UID3340) 330 0 2402398 90.58286563 142 1695274 63.92062304 2652155 GB_GCA_007116575.1 t d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Roseinatronobacter;s__Roseinatronobacter sp007116575 not type material none f 28 28 83077 83077 none 0 none none none 0 none none none none none none none none 18677 18677 f f t 27979 27979 Contig ASM711657v1 n/a PRJNA453733 SAMN10605132 142 27979 none 7/18/19 GCA_007116575.1 derived from metagenome full CSBr16_51 none none 0 none none na none none none none none none 7/18/19 0 1904441 none CSBr16_51 "Institute for Biodiversity and Ecosystem dynamics, Faculty of Science, University of Amsterdam" 1904441 d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__;s__ d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;x__unclassified Rhodobacteraceae;s__Rhodobacteraceae bacterium 0 2652155 none none none 2652155 0 SKIF00000000.1 2563 142 none 0 none none none none none none none none none none none none none none 0 18 34 0 Roseinatronobacter sp007116575 195713 data/GTDBr95_n5/GCA_007116575.1_ASM711657v1_genomic.fna.gz 5 | GB_GCA_000014945.1_Methanosaeta thermophila PT GB_GCA_000014945.1 0 100 0 228 p__Euryarchaeota (UID49) 153 0 1591668 84.68702098 1 1006368 53.5452795 1879471 GB_GCA_000014945.1 t 
d__Archaea;p__Halobacteriota;c__Methanosarcinia;o__Methanotrichales;f__Methanotrichaceae;g__Methanothrix_B;s__Methanothrix_B thermoacetophila type strain of heterotypic synonym LPSN; StrainInfo f 1 1 1879471 1879471 1879471 2 2898 CP000477.1 1879471 2 115 CP000477.1 2888 5334 0 100 CP000477.1614054.1616941 Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosaetaceae;Methanosaeta;Methanosaeta thermophila PT 1879471 1879471 t f f 1879471 1879471 Complete Genome ASM1494v1 n/a PRJNA15765 SAMN02598350 none none none 10/25/06 GCA_000014945.1 none full none none none 1 1 1696 na 6 none none none none none 10/25/06 0 2224 2 PT US DOE Joint Genome Institute 349307 d__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanotrichaceae;g__Methanothrix;s__Methanothrix thermoacetophila d__Archaea;p__Euryarchaeota;x__Stenosarchaea group;c__Methanomicrobia;o__Methanosarcinales;f__Methanotrichaceae;g__Methanothrix;s__Methanothrix thermoacetophila;x__Methanothrix thermoacetophila PT 0 1879471 11 44 assembly from synonym type material 1879471 0 none 1810 1 1879471 2 1472 2719 0 100 155726 k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanosaetaceae;g__Methanosaeta;s__ 1473 CP000477.1 1472 2719 0 100 CP000477.1617193.1618665 Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosaetaceae;Methanosaeta;Methanosaeta thermophila PT 0 18 43 0 Methanothrix_B thermoacetophila 5158 data/GTDBr95_n5/GCA_000014945.1_ASM1494v1_genomic.fna.gz 6 | GB_GCA_000720375.1_Rhodococcus rhodnii GB_GCA_000720375.1 0 99.47 3.85 495 o__Actinomycetales (UID2012) 282 0 11292191 90.93197579 182 8230553 66.27769988 12418284 GB_GCA_000720375.1 t d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;g__Kibdelosporangium;s__Kibdelosporangium sp000720375 type strain of species LPSN; DSMZ; StrainInfo f 14 14 746316 746316 1593 4 1525 JOAA01000102.1 1581 4 102 JOAA01000103.1 1525 2560 0 96.984 JNYM01000477.49.3165 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Kibdelosporangium;Kibdelosporangium aridum subsp. 
largum 68232 68232 f f t 319285 319285 Contig ASM72037v1 n/a PRJNA238534 SAMN02645355 182 319285 none 7/2/14 GCA_000720375.1 none full none none none 0 0 0 na 0 none none none none none 7/2/14 0 38312 0 NRRL B-16535 University of Illinois 38312 d__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Corynebacteriales;f__Nocardiaceae;g__Rhodococcus;s__Rhodococcus rhodnii d__Bacteria;x__Terrabacteria group;p__Actinobacteria;c__Actinobacteria;o__Corynebacteriales;f__Nocardiaceae;g__Rhodococcus;s__Rhodococcus rhodnii 0 12418284 none 0 assembly from type material 12418284 0 JOAA00000000.1 11452 182 1381 3 1301 2379 0 99.693 756603 k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinosynnemataceae;g__Kibdelosporangium;s__ 1319 JOAA01000108.1 1305 2410 0 100 JOAA01000108.1.1312 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Kibdelosporangium;Rhodococcus rhodnii 0 20 66 2 Kibdelosporangium sp000720375 193223 data/GTDBr95_n5/GCA_000720375.1_ASM72037v1_genomic.fna.gz -------------------------------------------------------------------------------- /img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leylabmpi/Struo2/e3a73f5f656725054c96f0e829d187e0d25e302e/img/logo.png -------------------------------------------------------------------------------- /snakemake_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # user input 4 | if [ "$#" -lt 2 ]; then 5 | echo "Usage: snakemake_clean.sh config_file [preview|delete]" 6 | echo "Description: delete all snakemake-generated files" 7 | echo " preview => preview files to delete" 8 | echo " delete => delete files" 9 | echo "NOTE: this only deletes files that snakemake knows about, but that's all that's needed to fully restart the snakemake pipeline" 10 | exit 11 | fi 12 | 13 | 14 | FILES=`snakemake --summary --rerun-incomplete --configfile $1 | tail -n+2 | cut -f1` 15 | if [ "$2" == "preview" ]; then 16 | echo "#-- Files to delete --#" 17 | printf '%s\n' "${FILES[@]}" 18 | elif [ "$2" == "delete" ]; then 19 | echo "#-- Deleting the following files --#" 20 | printf '%s\n' "${FILES[@]}" 21 | rm -rf $FILES 22 | else 23 | echo "$2 not recognized" 24 | fi 25 | -------------------------------------------------------------------------------- /snakemake_conda-list.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # check for snakemake 4 | command -v snakemake >/dev/null 2>&1 || { echo "snakemake is not in your PATH"; exit 1; } 5 | 6 | # check for conda envs 7 | if [[ ! -d .snakemake ]] || [[ -z "$(ls -A .snakemake/conda/)" ]]; then 8 | echo "No conda envs found!" 9 | echo "To create the envs, run: 'snakemake --use-conda --create-envs-only -F'" 10 | exit 1 11 | fi 12 | 13 | # list all conda envs 14 | for X in $(snakemake --list-conda-envs -F | tail -n +3) 15 | do 16 | conda list -p $X 2>/dev/null || echo "#--- conda env: $X ---#" 17 | done 18 | 19 | -------------------------------------------------------------------------------- /snakemake_sge.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # user input 4 | if [ "$#" -lt 2 ]; then 5 | echo "snakemake_sge.sh config.yaml jobs ..." 6 | echo " config.yaml : snakemake config" 7 | echo " jobs : number of parallel qsub jobs" 8 | echo " ...
: additional arguments passed to snakemake" 9 | exit 10 | fi 11 | 12 | # check for snakemake 13 | command -v snakemake >/dev/null 2>&1 || { echo "snakemake is not in your PATH"; exit 1; } 14 | 15 | # set args 16 | CONFIG=$1 17 | JOBS=$2 18 | 19 | # snakemake call 20 | WORKDIR=`pwd` 21 | snakemake -f \ 22 | --profile bin/ll_pipeline_utils/profiles/sge/ \ 23 | --use-conda \ 24 | --configfile $CONFIG \ 25 | --jobs $JOBS \ 26 | --local-cores $JOBS \ 27 | --printshellcmds \ 28 | --resources temp=$JOBS \ 29 | --directory $WORKDIR \ 30 | "${@:3}" 31 | 32 | 33 | -------------------------------------------------------------------------------- /tests/samples/GTDBr95_n5.tsv: -------------------------------------------------------------------------------- 1 | ncbi_organism_name accession ambiguous_bases checkm_completeness checkm_contamination checkm_marker_count checkm_marker_lineage checkm_marker_set_count checkm_strain_heterogeneity coding_bases coding_density contig_count gc_count gc_percentage genome_size gtdb_genome_representative gtdb_representative gtdb_taxonomy gtdb_type_designation gtdb_type_designation_sources gtdb_type_species_of_genus l50_contigs l50_scaffolds longest_contig longest_scaffold lsu_23s_contig_len lsu_23s_count lsu_23s_length lsu_23s_query_id lsu_5s_contig_len lsu_5s_count lsu_5s_length lsu_5s_query_id lsu_silva_23s_blast_align_len lsu_silva_23s_blast_bitscore lsu_silva_23s_blast_evalue lsu_silva_23s_blast_perc_identity lsu_silva_23s_blast_subject_id lsu_silva_23s_taxonomy mean_contig_length mean_scaffold_length mimag_high_quality mimag_low_quality mimag_medium_quality n50_contigs n50_scaffolds ncbi_assembly_level ncbi_assembly_name ncbi_assembly_type ncbi_bioproject ncbi_biosample ncbi_contig_count ncbi_contig_n50 ncbi_country ncbi_date ncbi_genbank_assembly_accession ncbi_genome_category ncbi_genome_representation ncbi_isolate ncbi_isolation_source ncbi_lat_lon ncbi_molecule_count ncbi_ncrna_count ncbi_protein_count ncbi_refseq_category ncbi_rrna_count ncbi_scaffold_count ncbi_scaffold_l50 ncbi_scaffold_n50 ncbi_scaffold_n75 ncbi_scaffold_n90 ncbi_seq_rel_date ncbi_spanned_gaps ncbi_species_taxid ncbi_ssu_count ncbi_strain_identifiers ncbi_submitter ncbi_taxid ncbi_taxonomy ncbi_taxonomy_unfiltered ncbi_total_gap_length ncbi_total_length ncbi_translation_table ncbi_trna_count ncbi_type_material_designation ncbi_ungapped_length ncbi_unspanned_gaps ncbi_wgs_master protein_count scaffold_count ssu_contig_len ssu_count ssu_gg_blast_align_len ssu_gg_blast_bitscore ssu_gg_blast_evalue ssu_gg_blast_perc_identity ssu_gg_blast_subject_id ssu_gg_taxonomy ssu_length ssu_query_id ssu_silva_blast_align_len ssu_silva_blast_bitscore ssu_silva_blast_evalue ssu_silva_blast_perc_identity ssu_silva_blast_subject_id ssu_silva_taxonomy total_gap_length trna_aa_count trna_count trna_selenocysteine_count SPECIES gtdb_taxid fasta_file_path 2 | RS_GCF_006715045.1_Amycolatopsis cihanbeyliensis RS_GCF_006715045.1 0 100 0.99 350 o__Actinomycetales (UID2014) 203 0 6692435 90.0072611718 2 5213783 70.120715132 7435439 RS_GCF_006715045.1 t d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis type strain of species LPSN; DSMZ; StrainInfo f 1 1 6286745 6286745 6286745 2 3100 NZ_VFML01000001.1 6286745 2 108 NZ_VFML01000001.1 3118 4571 0 93.297 ARVW01000001.3175334.3178445 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Amycolatopsis;Amycolatopsis nigrescens CSC17Ta-90 3717719 3717719 t f f 
6286745 6286745 Contig ASM671504v1 n/a PRJNA224116 SAMN11512385 2 6286745 none 2019-07-08 GCA_006715045.1 none full none none none 0 0 none na 6 none none none none none 2019/07/08 0 1128664 2 DSM 45679 DOE Joint Genome Institute 1128664 d__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis d__Bacteria;x__Terrabacteria group;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Amycolatopsis;s__Amycolatopsis cihanbeyliensis 0 7435439 11 49 assembly from type material 7435439 0 VFML00000000.1 6730 2 6286745 2 none none none none none none 1514 NZ_VFML01000001.1 1460 2697 0 100 JN989302.1.1460 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Amycolatopsis;Amycolatopsis cihanbeyliensis 0 20 49 0 Amycolatopsis cihanbeyliensis 202433 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/bacteria/GCA_006715045.1/GCA_006715045.1_ASM671504v1_genomic.fna.gz 3 | GB_GCA_002478565.1_Flavobacteriales bacterium UBA7468 GB_GCA_002478565.1 0 96.74 0.54 277 k__Bacteria (UID2569) 185 0 2200517 95.4209568907 46 952777 41.3481255107 2306115 GB_GCA_002478565.1 t d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Flavobacteriales;f__Crocinitomicaceae;g__UBA952;s__UBA952 sp002478565 not type material none f 6 5 378299 378299 none 0 none none none 0 none none none none none none none none 50093 88696 f f t 122097 148012 Scaffold ASM247856v1 n/a PRJNA348753 SAMN06451991 46 122097 none 2017-10-6 GCA_002478565.1 derived from metagenome full UBA7468 none none 0 0 0 na 0 none none none none none 2017/10/06 20 1951076 0 UBA7468 University of Queensland 1951076 d__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__;g__;s__ d__Bacteria;x__FCB group;x__Bacteroidetes/Chlorobi group;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;x__unclassified Flavobacteriales;x__unclassified Flavobacteriales (miscellaneous);s__Flavobacteriales bacterium UBA7468 1834 2306115 none 0 none 2304281 0 DLPC00000000.1 2062 26 none 0 none none none none none none none none none none none none none none 1834 17 28 0 UBA952 sp002478565 189650 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/bacteria/GCA_002478565.1/GCA_002478565.1_ASM247856v1_genomic.fna.gz 4 | GB_GCA_007116575.1_Rhodobacteraceae bacterium GB_GCA_007116575.1 0 90.5 0.91 568 f__Rhodobacteraceae (UID3340) 330 0 2402398 90.5828656319 142 1695274 63.9206230405 2652155 GB_GCA_007116575.1 t d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Roseinatronobacter;s__Roseinatronobacter sp007116575 not type material none f 28 28 83077 83077 none 0 none none none 0 none none none none none none none none 18677 18677 f f t 27979 27979 Contig ASM711657v1 n/a PRJNA453733 SAMN10605132 142 27979 none 2019-07-18 GCA_007116575.1 derived from metagenome full CSBr16_51 none none 0 none none na none none none none none none 2019/07/18 0 1904441 none CSBr16_51 Institute for Biodiversity and Ecosystem dynamics, Faculty of Science, University of Amsterdam 1904441 d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__;s__ d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;x__unclassified Rhodobacteraceae;s__Rhodobacteraceae bacterium 0 2652155 none none none 2652155 0 SKIF00000000.1 2563 142 none 0 none none none none none none none none none none none none none none 0 18 34 
0 Roseinatronobacter sp007116575 195713 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/bacteria/GCA_007116575.1/GCA_007116575.1_ASM711657v1_genomic.fna.gz 5 | GB_GCA_000014945.1_Methanosaeta thermophila PT GB_GCA_000014945.1 0 100 0 228 p__Euryarchaeota (UID49) 153 0 1591668 84.6870209756 1 1006368 53.5452794962 1879471 GB_GCA_000014945.1 t d__Archaea;p__Halobacteriota;c__Methanosarcinia;o__Methanotrichales;f__Methanotrichaceae;g__Methanothrix_B;s__Methanothrix_B thermoacetophila type strain of heterotypic synonym LPSN; StrainInfo f 1 1 1879471 1879471 1879471 2 2898 CP000477.1 1879471 2 115 CP000477.1 2888 5334 0 100 CP000477.1614054.1616941 Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosaetaceae;Methanosaeta;Methanosaeta thermophila PT 1879471 1879471 t f f 1879471 1879471 Complete Genome ASM1494v1 n/a PRJNA15765 SAMN02598350 none none none 2006-10-25 GCA_000014945.1 none full none none none 1 1 1696 na 6 none none none none none 2006/10/25 0 2224 2 PT US DOE Joint Genome Institute 349307 d__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanotrichaceae;g__Methanothrix;s__Methanothrix thermoacetophila d__Archaea;p__Euryarchaeota;x__Stenosarchaea group;c__Methanomicrobia;o__Methanosarcinales;f__Methanotrichaceae;g__Methanothrix;s__Methanothrix thermoacetophila;x__Methanothrix thermoacetophila PT 0 1879471 11 44 assembly from synonym type material 1879471 0 none 1810 1 1879471 2 1472 2719 0 100 155726 k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanosaetaceae;g__Methanosaeta;s__ 1473 CP000477.1 1472 2719 0 100 CP000477.1617193.1618665 Archaea;Euryarchaeota;Methanomicrobia;Methanosarcinales;Methanosaetaceae;Methanosaeta;Methanosaeta thermophila PT 0 18 43 0 Methanothrix_B thermoacetophila 5158 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/archaea/GCA_000014945.1/GCA_000014945.1_ASM1494v1_genomic.fna.gz 6 | GB_GCA_000720375.1_Rhodococcus rhodnii GB_GCA_000720375.1 0 99.47 3.85 495 o__Actinomycetales (UID2012) 282 0 11292191 90.9319757867 182 8230553 66.2776998819 12418284 GB_GCA_000720375.1 t d__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;g__Kibdelosporangium;s__Kibdelosporangium sp000720375 type strain of species LPSN; DSMZ; StrainInfo f 14 14 746316 746316 1593 4 1525 JOAA01000102.1 1581 4 102 JOAA01000103.1 1525 2560 0 96.984 JNYM01000477.49.3165 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Kibdelosporangium;Kibdelosporangium aridum subsp. 
largum 68232 68232 f f t 319285 319285 Contig ASM72037v1 n/a PRJNA238534 SAMN02645355 182 319285 none 2014-7-2 GCA_000720375.1 none full none none none 0 0 0 na 0 none none none none none 2014/07/02 0 38312 0 NRRL B-16535 University of Illinois 38312 d__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Corynebacteriales;f__Nocardiaceae;g__Rhodococcus;s__Rhodococcus rhodnii d__Bacteria;x__Terrabacteria group;p__Actinobacteria;c__Actinobacteria;o__Corynebacteriales;f__Nocardiaceae;g__Rhodococcus;s__Rhodococcus rhodnii 0 12418284 none 0 assembly from type material 12418284 0 JOAA00000000.1 11452 182 1381 3 1301 2379 0 99.693 756603 k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinosynnemataceae;g__Kibdelosporangium;s__ 1319 JOAA01000108.1 1305 2410 0 100 JOAA01000108.1.1312 Bacteria;Actinobacteria;Actinobacteria;Pseudonocardiales;Pseudonocardiaceae;Kibdelosporangium;Rhodococcus rhodnii 0 20 66 2 Kibdelosporangium sp000720375 193223 /ebio/abt3_projects2/databases_no-backup/GTDB/release95/Struo/genomes/genbank/bacteria/GCA_000720375.1/GCA_000720375.1_ASM72037v1_genomic.fna.gz 7 | -------------------------------------------------------------------------------- /util_scripts/GTDB_metadata_filter.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # libraries 4 | suppressPackageStartupMessages(library("argparse")) 5 | suppressPackageStartupMessages(library("curl")) 6 | suppressPackageStartupMessages(library("data.table")) 7 | 8 | # create parser object 9 | parser <- ArgumentParser() 10 | 11 | # specifying options 12 | parser$add_argument("metadata_urls", nargs='+', help=">=1 url to GTDB metadata") 13 | parser$add_argument("-o", "--output", type='character', default='metadata.tsv', 14 | help="Output file name [default: %(default)s]") 15 | parser$add_argument("-c", "--columns", type='character', default='ncbi_organism_name,ncbi_genbank_assembly_accession,scaffold_count,contig_count,gc_percentage,genome_size,checkm_completeness,checkm_contamination,checkm_strain_heterogeneity,ncbi_assembly_level,ncbi_refseq_category,ncbi_species_taxid,ncbi_taxonomy,gtdb_taxonomy,mimag_high_quality,gtdb_representative', 16 | help="Table columns to keep [default: %(default)s]") 17 | parser$add_argument("-f", "--filter", type='character', default='gtdb_representative == "t" & checkm_completeness >= 50 & checkm_contamination < 5', 18 | help="Expression for filtering table rows [default: %(default)s]") 19 | #parser$add_argument("-t", "--tmpdir", type='character', default='GMF_TMP', 20 | # help="Directory for temporary output [default: %(default)s]") 21 | parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, 22 | help="Print extra output [default: %(default)s]") 23 | parser$add_argument("-q", "--quietly", action="store_false", 24 | dest="verbose", help="Print little output") 25 | args <- parser$parse_args() 26 | 27 | 28 | # reading in table(s) 29 | write(sprintf('Keeping columns: %s', args['columns']), stderr()) 30 | cols = unlist(strsplit(unlist(args['columns']), ',')) 31 | write('----', stderr()) 32 | 33 | df = list() 34 | tmpdir = NULL 35 | for(url in unlist(args['metadata_urls'])){ 36 | # download and uncompress tarball 37 | if(grepl('.tar.gz$', url)){ 38 | write('url points to tarball; downloading and uncompressing', stderr()) 39 | tmpdir = 'GTDB_metadata_filter_TMP' 40 | if(!
dir.exists(tmpdir)){ 41 | dir.create(tmpdir) 42 | } 43 | tmpfile = file.path(tmpdir, 'GTDB_metadata_filter_TMP.tar.gz') 44 | download.file(url, destfile=tmpfile) 45 | untar(tmpfile, exdir=tmpdir) 46 | url = file.path(tmpdir, gsub('.tar.gz$', '.tsv', basename(url))) 47 | } 48 | # read table 49 | write(sprintf('Reading in file: %s', url), stderr()) 50 | df[[url]] = fread(url, sep='\t', check.names=TRUE)[, ..cols] 51 | # clean up 52 | if(!is.null(tmpdir) & dir.exists(tmpdir)){ 53 | unlink(tmpdir, recursive = TRUE) 54 | } 55 | } 56 | 57 | df = do.call(rbind, df) 58 | x = as.character(nrow(df)) 59 | write(sprintf('Number of rows in the combined table: %s', x), stderr()) 60 | 61 | # Filtering 62 | x = unlist(args['filter'])[1] 63 | write(sprintf('Filtering rows by expression: %s', x), stderr()) 64 | df = df[eval(parse(text=x)),] 65 | x = as.character(nrow(df)) 66 | write(sprintf('Number of rows after filtering: %s', x), stderr()) 67 | 68 | # Writing table 69 | write(sprintf('Writing file to: %s', args['output']), stderr()) 70 | out_file = unlist(args['output']) 71 | fwrite(df, file=out_file, sep='\t', quote=FALSE, row.names=FALSE) -------------------------------------------------------------------------------- /util_scripts/database_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import gzip 7 | import bz2 8 | import resource 9 | import argparse 10 | import logging 11 | import functools 12 | import multiprocessing as mp 13 | # 3rd party 14 | import requests 15 | import bs4 16 | 17 | # logging 18 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 19 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 20 | argparse.RawDescriptionHelpFormatter): 21 | pass 22 | 23 | # argparse 24 | desc = 'Download Struo2 database files' 25 | epi = """DESCRIPTION: 26 | A helper script for downloading pre-built custom database files. 27 | Multiple GTDB releases & databases (eg., kraken2 or humann3) 28 | can be downloaded. 
29 | 30 | Note: use "--" to separate "--database" parameters from 31 | """ 32 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 33 | formatter_class=CustomFormatter) 34 | parser.add_argument('output_dir', type=str, 35 | help='Output directory') 36 | parser.add_argument('-r', '--release', type=str, nargs='+', 37 | choices = ['95', '202', '207'], default=['207'], 38 | help='GTDB release') 39 | parser.add_argument('-d', '--database', type=str, nargs='+', 40 | choices = ['kraken2', 'humann3', 'taxdump', 'phylogeny', 41 | 'metadata', 'genes'], default=['metadata'], 42 | help='Database(s) to download ') 43 | parser.add_argument('-u', '--base-url', type=str, 44 | default='http://ftp.tue.mpg.de/ebio/projects/struo2/', 45 | help='Base url for downloads') 46 | parser.add_argument('-t', '--threads', type=int, default=1, 47 | help='Parallel download processes') 48 | parser.add_argument('-m', '--max-recursion', type=int, default=1048576, 49 | help='Max recursion limit') 50 | parser.add_argument('--version', action='version', version='0.0.1') 51 | 52 | # functions 53 | def decode(x): 54 | """ 55 | Decoding input, if needed 56 | """ 57 | try: 58 | x = x.decode('utf-8') 59 | except AttributeError: 60 | pass 61 | return x 62 | 63 | def write_lines(url, l, out_dir): 64 | """" 65 | Writing lines obtained from requests 66 | """ 67 | with requests.get(url + '/' + l['href'].lstrip('/'), stream=True) as r: 68 | if r.status_code == 404: 69 | return None 70 | if l['href'] == 'database.kraken': # debug 71 | return None 72 | out_file = os.path.join(out_dir, l['href']) 73 | with open(out_file, 'w') as outF: 74 | for line in r.iter_lines(decode_unicode=True): 75 | outF.write(decode(line) + '\n') 76 | logging.info(f'File written: {out_file}') 77 | 78 | def write_chunks(url, l, out_dir): 79 | """ 80 | Writing chunks obtained from requests 81 | """ 82 | with requests.get(url + '/' + l['href'].lstrip('/'), stream=True) as r: 83 | if r.status_code == 404: 84 | return None 85 | out_file = os.path.join(out_dir, l['href']) 86 | with open(out_file, 'wb') as outF: 87 | for chunk in r.iter_content(chunk_size = 1024): 88 | if chunk: 89 | outF.write(chunk) 90 | logging.info(f'File written: {out_file}') 91 | 92 | def dl_file(l, url, out_dir): 93 | """ 94 | Download file from url 95 | """ 96 | if l['href'].startswith('?') or l['href'].endswith('/'): 97 | return None 98 | try: 99 | write_lines(url, l, out_dir) 100 | except UnicodeDecodeError: 101 | write_chunks(url, l, out_dir) 102 | return None 103 | 104 | def dl_files(base_url, release, database, out_dir, threads): 105 | """ 106 | List files from url and download all available 107 | """ 108 | # output directory 109 | out_dir = os.path.join(out_dir, release, database) 110 | if not os.path.isdir(out_dir): 111 | os.makedirs(out_dir) 112 | # base url: GET 113 | url = os.path.join(base_url, release, database) 114 | r = requests.get(url) 115 | if r.status_code == 404: 116 | logging.warning('WARNING: 404 status code for url: {}'.format(url)) 117 | return None 118 | # file urls: GET 119 | data = bs4.BeautifulSoup(r.text, 'html.parser') 120 | func = functools.partial(dl_file, url=url, out_dir=out_dir) 121 | bs4_list = [x for x in data.find_all('a')] 122 | if args.threads > 1: 123 | pool = mp.Pool(threads) 124 | pool.map(func, bs4_list) 125 | else: 126 | [x for x in map(func, bs4_list)] 127 | try: 128 | pool.close() 129 | except UnboundLocalError: 130 | pass 131 | 132 | def set_recursion(max_rec): 133 | """ 134 | max_rec = 0x100000 135 | """ 136 | 
resource.setrlimit(resource.RLIMIT_STACK, [0x100 * max_rec, resource.RLIM_INFINITY]) 137 | sys.setrecursionlimit(max_rec) 138 | logging.info(f'Max recursion set to: {max_rec}') 139 | 140 | def main(args): 141 | # args 142 | args.release = ['GTDB_release{}'.format(x) for x in args.release] 143 | # recursion 144 | set_recursion(args.max_recursion) 145 | # output 146 | if not os.path.isdir(args.output_dir): 147 | os.makedirs(args.output_dir) 148 | # list files 149 | for db in args.database: 150 | for release in args.release: 151 | dl_files(args.base_url, release, db, 152 | args.output_dir, args.threads) 153 | 154 | 155 | if __name__ == '__main__': 156 | args = parser.parse_args() 157 | main(args) 158 | -------------------------------------------------------------------------------- /util_scripts/genome_download.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # libraries 4 | suppressPackageStartupMessages(library("argparse")) 5 | suppressPackageStartupMessages(library("dplyr")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specifying options 11 | parser$add_argument("acc_table", nargs=1, help="Table containing assembly accessions (tab-delim with header)") 12 | parser$add_argument("-c", "--column", type='character', default='ncbi_genbank_assembly_accession', 13 | help="Column name containing accessions [default: %(default)s]") 14 | parser$add_argument("-o", "--output", type='character', default='.', 15 | help="Path for output [default: %(default)s]") 16 | parser$add_argument("-p", "--procs", type='integer', default=1, 17 | help="Number of parallel processes [default: %(default)s]") 18 | parser$add_argument("-r", "--retries", type='integer', default=3, 19 | help="Number of retries [default: %(default)s]") 20 | parser$add_argument("-d", "--database", type='character', default='genbank', 21 | help="database to download (-s flag for ncbi-genome-download) [default: %(default)s]") 22 | parser$add_argument("-x", "--params", type='character', default='archaea,bacteria', 23 | help="Filtering parameters for ncbi-genome-download [default: %(default)s]") 24 | parser$add_argument("-f", "--filter", action="store_true", default=FALSE, 25 | help="Check for 'fasta_file_path' and just download any accessions lacking values [default: %(default)s]") 26 | parser$add_argument("-s", "--skip", action="store_true", default=FALSE, 27 | help="Skip the genome downloading; useful if re-running to re-make the output table [default: %(default)s]") 28 | parser$add_argument("-v", "--verbose", action="store_true", default=TRUE, 29 | help="Print extra output [default: %(default)s]") 30 | parser$add_argument("-q", "--quietly", action="store_false", 31 | dest="verbose", help="Print little output") 32 | args = parser$parse_args() 33 | 34 | 35 | # checking for executables 36 | exe = 'ncbi-genome-download' 37 | hits = unlist(Sys.which('ncbi-genome-download')) 38 | if(hits[1] == ''){ 39 | stop(sprintf('Cannot find executable: %s', exe)) 40 | } 41 | 42 | # reading in table 43 | x = unlist(args['acc_table'])[1] 44 | write(sprintf('Reading table: %s', x), stderr()) 45 | df = read.delim(x, sep='\t') 46 | write(sprintf('Number of rows: %s', nrow(df)), stderr()) 47 | 48 | # filtering table 49 | ## checking for "fasta_file_path" column 50 | filter_bool = unlist(args['filter'])[1] 51 | if(filter_bool == TRUE){ 52 | write('Filtering to just rows with NAs in `fasta_file_path` column', stderr()) 53 | if('fasta_file_path' %in% colnames(df)){ 54 | 
df_complete = filter(df, !(is.na(fasta_file_path) | fasta_file_path == '')) 55 | df = filter(df, is.na(fasta_file_path) | fasta_file_path == '') 56 | } else { 57 | stop('Cannot find column: "fasta_file_path"') 58 | } 59 | } 60 | 61 | ## Filtering based on user params & getting accessions 62 | write('Filtering out genomes lacking NCBI genbank assembly accession', stderr()) 63 | col = unlist(args['column'])[1] 64 | df = df[df[,col] != 'none',] 65 | write(sprintf('Number of rows after filtering: %s', nrow(df)), stderr()) 66 | ### just accessions 67 | df_acc = df[,col] 68 | df_acc = as.data.frame(df_acc) 69 | 70 | # creating temp file of accessions 71 | ## Creating output directory 72 | D = normalizePath(unlist(args['output'])[1]) 73 | dir.create(D, showWarnings = FALSE) 74 | ## writing table 75 | F = file.path(D, 'accession.txt') 76 | write(sprintf('Writing accessions to: %s', F), stderr()) 77 | write.table(df_acc, file=F, sep='\t', quote=FALSE, col.names=FALSE, row.names=FALSE) 78 | 79 | # calling ncbi genome download 80 | procs = as.character(unlist(args['procs'])[1]) 81 | retries = as.character(unlist(args['retries'])[1]) 82 | params = as.character(unlist(args['params'])[1]) 83 | database = as.character(unlist(args['database'])[1]) 84 | skip_bool = args['skip'][1] 85 | if(skip_bool != TRUE){ 86 | cmd = paste(c(exe, '-F', 'fasta', '-o', D, '-p', procs, '-r', retries, 87 | '-A', F, '-s', database, params), collapse=' ') 88 | write(sprintf('Running cmd: %s', cmd), stderr()) 89 | system(cmd) 90 | } else { 91 | write('Skipping genome download', stderr()) 92 | } 93 | 94 | # adding paths to genomes onto the table 95 | ## getting file paths 96 | D2 = file.path(D, database) 97 | fasta_files = list.files(D2, pattern='*.fna.gz', recursive=TRUE, full.names=TRUE) 98 | n_files = as.character(length(fasta_files)) 99 | write(sprintf('Number of fasta files found: %s', n_files), stderr()) 100 | ## Adding paths to input table 101 | write('Adding file paths to the input table', stderr()) 102 | fasta_files = data.frame(accession = gsub('.+/', '', fasta_files), 103 | fasta_file_path = fasta_files) 104 | fasta_files$accession = gsub('(GCA_[0-9]+\\.[0-9]+)_.+', '\\1', fasta_files$accession) 105 | df = left_join(df, fasta_files, by=setNames('accession', col)) 106 | 107 | # recombining tables (if --filter) 108 | if(filter_bool == TRUE){ 109 | df = rbind(df_complete, df) 110 | } 111 | 112 | # writing table 113 | write.table(df, file=stdout(), sep='\t', row.names=FALSE, quote=FALSE) 114 | 115 | # status 116 | n_rows = as.character(nrow(df)) 117 | write(sprintf('Number of rows in the output: %s', n_rows), stderr()) 118 | 119 | n_missing = as.character(nrow(df[is.na(df$fasta_file_path),])) 120 | write(sprintf('Number of rows with missing file paths: %s ', n_missing), stderr()) 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /util_scripts/genome_traitar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import os 4 | import sys 5 | import re 6 | import gzip 7 | import bz2 8 | import argparse 9 | import logging 10 | from collections import defaultdict 11 | 12 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 13 | class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, 14 | argparse.RawDescriptionHelpFormatter): 15 | pass 16 | 17 | desc = 'Converting traitar data to standardized table of traits' 18 | epi = """DESCRIPTION: 19 | 
Formatting traitar table with at least the following columns: 20 | [sample, phenotype, prediction] 21 | ... to the following: 22 | [genome, domain, phylum, class, order, family, genus, species, trait1, ..., traitN] 23 | 24 | A metadata file of all genomes is used to get the genome taxonomy data. 25 | 26 | Output written as tsv file to STDOUT. 27 | """ 28 | parser = argparse.ArgumentParser(description=desc, epilog=epi, 29 | formatter_class=CustomFormatter) 30 | argparse.ArgumentDefaultsHelpFormatter 31 | parser.add_argument('traitar_output', metavar='traitar_output', type=str, 32 | help='Traitar output file') 33 | parser.add_argument('genome_metadata', metavar='genome_metadata', type=str, 34 | help='Tab-delim metadata file that contains at least 2 columns: "accession" & "gtdb_taxonomy') 35 | parser.add_argument('--version', action='version', version='0.0.1') 36 | 37 | 38 | def _open(infile, mode='rb'): 39 | """ 40 | Openning of input, regardless of compression 41 | """ 42 | if infile.endswith('.bz2'): 43 | return bz2.open(infile, mode) 44 | elif infile.endswith('.gz'): 45 | return gzip.open(infile, mode) 46 | else: 47 | return open(infile) 48 | 49 | def _decode(x): 50 | """ 51 | Decoding input, if needed 52 | """ 53 | try: 54 | x = x.decode('utf-8') 55 | except AttributeError: 56 | pass 57 | return x 58 | 59 | def parse_meta(infile): 60 | """ 61 | Return: {genome_accession : taxonomy} 62 | """ 63 | if infile is None: 64 | return None 65 | logging.info('Loading file: {}'.format(infile)) 66 | regex = re.compile(r'[^a-zA-Z0-9_-]+') 67 | header = {} 68 | meta = {} 69 | msg = 'Cannot find column "{}"' 70 | with _open(infile) as inF: 71 | for i,line in enumerate(inF): 72 | line = _decode(line).rstrip().split('\t') 73 | if i == 0: 74 | header = {x.lower():ii for ii,x in enumerate(line)} 75 | continue 76 | try: 77 | accession = regex.sub('_', str(line[header['accession']])) 78 | except KeyError: 79 | raise KeyError(msg.format('accession')) 80 | try: 81 | taxonomy = line[header['gtdb_taxonomy']].split(';') 82 | except KeyError: 83 | raise KeyError(msg.format('gtdb_taxonomy')) 84 | meta[accession] = taxonomy 85 | logging.info(' No. of accessions: {}'.format(len(meta.keys()))) 86 | return meta 87 | 88 | def parse_traitar(infile, model='phypat+PGL'): 89 | """ 90 | Parsing gene annotation file. 91 | Return: {genome : {trait : score}} 92 | """ 93 | logging.info('Loading file: {}'.format(infile)) 94 | trt = defaultdict(dict) 95 | header = {} 96 | status = {'records' : 0} 97 | with _open(infile) as inF: 98 | for i,line in enumerate(inF): 99 | # line parse 100 | line = _decode(line).rstrip().split('\t') 101 | if i == 0: 102 | header = {x:ii for ii,x in enumerate(line)} 103 | continue 104 | if len(line) < len(header.keys()): 105 | msg = 'line {}: less columns than header; skipping!' 
106 | logging.warning(msg.format(i+1)) 107 | continue 108 | # genome 109 | try: 110 | genome_name = line[header['genome']] 111 | except KeyError: 112 | msg = 'Cannot find "{}" column in "{}"' 113 | raise KeyError(msg.format('genome', infile)) 114 | # phenotype model 115 | try: 116 | phen_model = str(line[header['phenotype_model']]) 117 | except KeyError: 118 | msg = 'Cannot find "{}" column in "{}"' 119 | raise KeyError(msg.format('phenotype_model', infile)) 120 | except IndexError: 121 | msg = 'No model listed in line: {}' 122 | raise IndexError(msg.format(i+1)) 123 | if phen_model != model: 124 | continue 125 | # phenotype 126 | try: 127 | trt_name = str(line[header['phenotype']]) 128 | except KeyError: 129 | msg = 'Cannot find "{}" column in "{}"' 130 | raise KeyError(msg.format('phenotype', infile)) 131 | # phenotype score 132 | try: 133 | trt_score = line[header['prediction_score']] 134 | except KeyError: 135 | msg = 'Cannot find "{}" column in "{}"' 136 | raise KeyError(msg.format('prediction_score', infile)) 137 | # adding info 138 | trt[genome_name][trt_name] = trt_score 139 | status['records'] += 1 140 | # status 141 | logging.info(' No. of records: {}'.format(status['records'])) 142 | return trt 143 | 144 | def get_all_trt(trt): 145 | """ 146 | All phenotype names 147 | """ 148 | all_trt = set() 149 | for genome in trt.keys(): 150 | for trt_name in trt[genome].keys(): 151 | all_trt.add(trt_name) 152 | logging.info(' No. of trait columns: {}'.format(len(all_trt))) 153 | return sorted(all_trt) 154 | 155 | def write_trait_table(trt, meta): 156 | """ 157 | Writing table of annotations 158 | """ 159 | logging.info('Writing table to STDOUT...') 160 | # all annotations 161 | all_trt = get_all_trt(trt) 162 | header = ['genome', 'domain', 'phylum', 'class', 'order', 'family', 163 | 'genus', 'species'] 164 | print('\t'.join(header + all_trt)) 165 | status = {'records' : 0} 166 | for genome in meta.keys(): 167 | # genome taxonomy 168 | try: 169 | taxonomy = meta[genome] 170 | except KeyError: 171 | msg = 'Cannot find "{}" in metadata' 172 | raise KeyError(msg.format(genome)) 173 | # counts 174 | trt_cnts = [] 175 | for trt_name in all_trt: 176 | try: 177 | x = trt[genome][trt_name] 178 | except KeyError: 179 | x = 0 180 | trt_cnts.append(x) 181 | # writing line 182 | trt_cnts = [str(x) for x in trt_cnts] 183 | print('\t'.join([genome] + taxonomy + trt_cnts)) 184 | status['records'] += 1 185 | logging.info(' No. of records written: {}'.format(status['records'])) 186 | 187 | 188 | def main(args): 189 | # parsing genome metadata 190 | meta = parse_meta(args.genome_metadata) 191 | # loading traitar data 192 | trt = parse_traitar(args.traitar_output) 193 | # writing table 194 | write_trait_table(trt, meta) 195 | 196 | if __name__ == '__main__': 197 | args = parser.parse_args() 198 | main(args) 199 | -------------------------------------------------------------------------------- /util_scripts/tree_prune.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | import sys,os 4 | import argparse 5 | import logging 6 | import tempfile 7 | import csv 8 | import urllib.request 9 | import codecs 10 | from distutils.spawn import find_executable 11 | import subprocess 12 | 13 | desc = 'Prune >=1 phylogeny' 14 | epi = """DESCRIPTION: 15 | Prune GTDB phylogeny (bacteria and/ archaea) 16 | to just the list of genome accessions provided. 
17 | 18 | `nw_prune` from the newick_utils toolset is used 19 | for pruning. 20 | 21 | Trees can be provided as files or urls to files. 22 | 23 | If >1 tree is provided, then the trees are merged; 24 | `--root-brlen` determines the brlens to the root. 25 | 26 | Output is written to STDOUT. 27 | """ 28 | parser = argparse.ArgumentParser(description=desc, 29 | epilog=epi, 30 | formatter_class=argparse.RawTextHelpFormatter) 31 | parser.add_argument('accs_to_keep', metavar='accs_to_keep', type=str, 32 | help='File of genome accessions to keep on the tree. 1 acc per line') 33 | parser.add_argument('tree_file', metavar='tree_file', type=str, nargs='+', 34 | help='>=1 newick file (or url to the file)') 35 | parser.add_argument('-r', '--root-brlen', type=float, default=0.0001, 36 | help='Root node branch length (default: %(default)s)') 37 | parser.add_argument('-s', '--skip-root-brlen', action='store_true', default=False, 38 | help='Don\'t add root node branch length (default: %(default)s)') 39 | parser.add_argument('--version', action='version', version='0.0.1') 40 | 41 | logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.DEBUG) 42 | 43 | 44 | def read_tree(file_or_url, root_brlen, skip_root_brlen=False): 45 | """ Reading in tree file (downloading if url) """ 46 | logging.info('Reading tree: {}'.format(file_or_url)) 47 | try: 48 | line = urllib.request.urlopen(file_or_url).read().decode('utf-8') 49 | #inF = #csv.reader(codecs.iterdecode(ftpstream, 'utf-8')) 50 | except ValueError: 51 | line = open(file_or_url).read() 52 | 53 | line = line.rstrip().rstrip(';') 54 | if not skip_root_brlen: 55 | line += '100.0:{}'.format(root_brlen) 56 | 57 | return line 58 | 59 | def read_trees(tree_files, root_brlen, skip_root_brlen=False): 60 | """ reading in >= tree file (or url) """ 61 | trees = [] 62 | for F in tree_files: 63 | trees.append(read_tree(F, root_brlen, skip_root_brlen)) 64 | trees = '(' + ','.join(trees) + ')' 65 | trees += '100.0:{}'.format(root_brlen) 66 | return trees 67 | 68 | def prune_tree(tree_file, taxa_to_keep): 69 | """ Pruning trees via nw_prune """ 70 | cmd = ['nw_prune', '-v', '-f', tree_file, taxa_to_keep] 71 | cmd = ' '.join(cmd) 72 | logging.info('CMD: {}'.format(cmd)) 73 | try: 74 | res = subprocess.run(cmd, check=True, shell=True, 75 | stdout=subprocess.PIPE) 76 | except subprocess.CalledProcessError as e: 77 | raise e 78 | res = res.stdout.decode().rstrip() 79 | print(res) 80 | 81 | def main(args): 82 | # checking for newick_utils exe 83 | if find_executable('nw_prune') is None: 84 | msg = 'Cannot find "nw_prune" in PATH. Is newick_utils installed?' 85 | raise IOError(msg) 86 | 87 | # downloading/merging trees 88 | ## temp output file 89 | dirpath = tempfile.mkdtemp() 90 | tmpTree_name = os.path.join(dirpath, 'TMP.nwk') 91 | ## reading in trees 92 | with open(tmpTree_name, 'w') as tmpTree: 93 | if len(args.tree_file) > 1: 94 | tree = read_trees(args.tree_file, args.root_brlen, 95 | args.skip_root_brlen) 96 | else: 97 | tree = read_tree(args.tree_file[0], args.root_brlen, 98 | args.skip_root_brlen) 99 | tmpTree.write(tree + ';') 100 | 101 | # pruning 102 | prune_tree(tmpTree_name, args.accs_to_keep) 103 | 104 | 105 | if __name__ == '__main__': 106 | args = parser.parse_args() 107 | main(args) 108 | --------------------------------------------------------------------------------
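Example: chaining the util_scripts above into a small end-to-end run. This is a minimal sketch and not part of the repository; the GTDB metadata/tree URLs, output file names, and process counts are hypothetical placeholders, the accession list passed to tree_prune.py is assumed to contain one tip label per line matching the tree's tip names, and ncbi-genome-download plus newick_utils (nw_prune) are assumed to be installed and on PATH.

#!/bin/bash
# 1) filter the GTDB metadata down to representative, high-quality genomes
./util_scripts/GTDB_metadata_filter.R \
    -o metadata_filtered.tsv \
    https://data.gtdb.ecogenomic.org/releases/release95/95.0/bac120_metadata_r95.tar.gz

# 2) download the matching assemblies and append a fasta_file_path column
./util_scripts/genome_download.R \
    -o genomes -p 4 metadata_filtered.tsv \
    > metadata_with_paths.tsv

# 3) (optional) fetch pre-built Struo2 database files instead of building them locally;
#    "--" separates the multi-value --database option from the positional output directory
./util_scripts/database_download.py \
    -r 95 -d metadata taxdump -- struo2_dbs/

# 4) prune the GTDB phylogeny down to the genomes going into the custom database
./util_scripts/tree_prune.py \
    accs_to_keep.txt \
    https://data.gtdb.ecogenomic.org/releases/release95/95.0/bac120_r95.tree \
    > bac120_r95_pruned.nwk

The table written to STDOUT in step 2 (GTDB metadata columns plus a fasta_file_path column) has the same general layout as the tests/samples tables shown above and serves as the per-genome samples table for the Snakemake pipeline.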