├── .circleci └── config.yml ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── workflow-error.md └── workflows │ ├── codespell.yml │ ├── conventional-prs.yml │ ├── format.yml │ ├── python-package-conda.yml │ ├── release-please.yml │ └── stale.yml ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── atlas ├── __init__.py ├── _version.py ├── atlas.py ├── color_logger.py ├── config ├── default_values.py ├── init │ ├── __init__.py │ ├── atlas_init.py │ ├── create_sample_table.py │ ├── get_SRA_runinfo.py │ └── parse_sra.py ├── make_config.py ├── sample_table.py └── workflow ├── atlasenv.yml ├── config ├── default_config.yaml └── template_config.yaml ├── docker_run.sh ├── docs ├── Makefile ├── advanced │ ├── assembly.rst │ └── qc.rst ├── conf.py ├── index.rst ├── pyproject.toml ├── reports │ ├── QC_report.html │ ├── assembly_report.html │ ├── bin_report_DASTool.html │ ├── bin_report_SemiBin.html │ ├── bin_report_vamb.html │ ├── dram_product.html │ └── samples.tsv └── usage │ ├── changelog.md │ ├── configuration.rst │ ├── getting_started.rst │ └── output.rst ├── prepare.py ├── resources ├── images │ ├── atlas_image.png │ └── atlas_list.png └── report.css ├── setup.cfg ├── setup.py ├── test ├── dryrun.sh ├── test_assembly.sh ├── test_ci.sh ├── test_external_genomes.sh ├── test_init_many_samples.sh ├── test_local.sh └── test_sra.sh ├── versioneer.py └── workflow ├── Snakefile ├── annotate.smk ├── envs ├── DASTool.yaml ├── busco.yaml ├── cd-hit.yaml ├── checkm.yaml ├── checkm2.yaml ├── dram.yaml ├── eggNOG.yaml ├── fasta.yaml ├── grabseq.yaml ├── gtdbtk.yaml ├── gunc.yaml ├── hdf.yaml ├── instrain.yaml ├── maxbin.yaml ├── megahit.yaml ├── metabat.yaml ├── minimap.yaml ├── mmseqs.yaml ├── prodigal.yaml ├── report.yaml ├── required_packages.yaml ├── semibin.yaml ├── sequence_utils.yaml ├── skani.yaml ├── spades.yaml ├── species_clustering.yaml ├── sra.post-deploy.sh ├── sra.yaml ├── tree.yaml └── vamb.yaml ├── report ├── assembly_report.py ├── bin_report.py ├── common_report.py ├── qc_report.py ├── report.css ├── template_QC_report.html ├── template_assembly_report.html └── template_bin_report.html ├── rules ├── assemble.smk ├── bin_quality.smk ├── binning.smk ├── cdhit.smk ├── cobinning.smk ├── derep.smk ├── download.smk ├── dram.smk ├── genecatalog.smk ├── genomes.smk ├── gtdbtk.smk ├── patch.smk ├── predict_genes_of_genomes.py ├── qc.smk ├── sample_table.smk ├── scg_blank_diamond.rb ├── screen.smk ├── semibin.smk ├── sra.smk └── strains.smk └── scripts ├── DRAM_get_all_modules.py ├── cluster_species.py ├── combine_busco.py ├── combine_checkm.py ├── combine_checkm2.py ├── combine_contig_stats.py ├── combine_coverage_MAGs.py ├── combine_dram_gene_annotations.py ├── combine_gene_coverages.py ├── combine_taxonomy.py ├── convert_jgi2vamb_coverage.py ├── filter_genes.py ├── filter_genomes.py ├── gene2genome.py ├── generate_orf_info.py ├── get_fasta_of_bins.py ├── get_read_stats.py ├── parse_semibin.py ├── parse_vamb.py ├── rename_assembly.py ├── rename_genecatalog.py ├── rename_genomes.py ├── root_tree.py ├── split_genecatalog.py └── utils ├── __init__.py ├── fasta.py ├── gene_scripts.py ├── genome_dist.py ├── genome_stats.py ├── io.py ├── parsers.py ├── parsers_bbmap.py ├── taxonomy.py ├── tree.py └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | atlas/_version.py export-subst 2 | 
-------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: Feature request 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | 17 | 18 | **Additional context** 19 | Add any other context or screenshots about the feature request here. 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/workflow-error.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Workflow Error 3 | about: I got atlas running but then it encountered this error. For other errors see 4 | the other template. 5 | title: Error in rule 6 | labels: '' 7 | assignees: '' 8 | 9 | --- 10 | 11 | - [ ] I checked and didn't find a related issue, e.g. while typing the title 12 | - [ ] **I got an error in the following rule(s):** ` ` 13 | 14 | 17 | 18 | 19 | 20 | - [ ] I checked the log files indicated in the error message (and the cluster logs if submitted to a cluster) 21 | 22 | 44 | 45 | Here is the relevant log output: 46 | 47 | ``` 48 | 49 | 50 | 51 | ``` 52 | 53 | 54 | **Atlas version** 55 | 56 | **Additional context** 57 | Add any other context about the problem here. 58 | -------------------------------------------------------------------------------- /.github/workflows/codespell.yml: -------------------------------------------------------------------------------- 1 | # Codespell configuration is within pyproject.toml 2 | --- 3 | name: Codespell 4 | 5 | on: 6 | push: 7 | branches: [main] 8 | pull_request: 9 | branches: [main] 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | codespell: 16 | name: Check for spelling errors 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | - name: Checkout 21 | uses: actions/checkout@v4 22 | - name: Codespell 23 | uses: codespell-project/actions-codespell@v2 24 | with: 25 | check_filenames: true 26 | skip: ".git,*.pdf,*.svg,versioneer.py,*.css,*.html" 27 | check_hidden: true 28 | -------------------------------------------------------------------------------- /.github/workflows/conventional-prs.yml: -------------------------------------------------------------------------------- 1 | name: PR 2 | on: 3 | pull_request_target: 4 | types: 5 | - opened 6 | - reopened 7 | - edited 8 | - synchronize 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | title-format: 15 | permissions: 16 | pull-requests: read # for amannn/action-semantic-pull-request to analyze PRs 17 | statuses: write # for amannn/action-semantic-pull-request to mark status of analyzed PR 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: amannn/action-semantic-pull-request@v5.0.2 21 | env: 22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 23 | -------------------------------------------------------------------------------- /.github/workflows/format.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | concurrency: 10 | # Cancel concurrent flows on PRs 11 | group:
ci-${{ github.head_ref || github.run_id }} 12 | cancel-in-progress: true 13 | 14 | jobs: 15 | formatting: 16 | permissions: 17 | contents: read # for actions/checkout to fetch code 18 | pull-requests: write # for marocchino/sticky-pull-request-comment to create or update PR comment 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - uses: mamba-org/setup-micromamba@v1 24 | with: 25 | environment-name: formatting 26 | create-args: black snakefmt 27 | condarc: | 28 | channels: 29 | - conda-forge 30 | - bioconda 31 | cache-environment: true 32 | 33 | - name: Check Black formatting 34 | shell: bash -el {0} 35 | run: black --check --diff . 36 | 37 | - name: Check Snakefmt formatting 38 | shell: bash -el {0} 39 | run: snakefmt --check --diff . 40 | 41 | - name: Comment PR 42 | if: github.event_name == 'pull_request' && failure() 43 | uses: marocchino/sticky-pull-request-comment@v2.8.0 44 | with: 45 | message: | 46 | Please format your code with: 47 | - [black](https://black.readthedocs.io): `black .` 48 | - [snakefmt](https://github.com/snakemake/snakefmt): `snakefmt .` 49 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 50 | -------------------------------------------------------------------------------- /.github/workflows/release-please.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | 6 | name: release-please 7 | 8 | jobs: 9 | release-please: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: GoogleCloudPlatform/release-please-action@v3 13 | id: release 14 | with: 15 | release-type: python 16 | package-name: metagenome-atlas 17 | 18 | - uses: actions/checkout@v3 19 | if: ${{ steps.release.outputs.release_created }} 20 | with: 21 | fetch-depth: 0 22 | 23 | - name: Set up Python 24 | if: ${{ steps.release.outputs.release_created }} 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: "3.x" 28 | 29 | - name: Build and check package 30 | if: ${{ steps.release.outputs.release_created }} 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install build twine 34 | python -m build 35 | twine check --strict dist/* 36 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | schedule: 10 | - cron: '16 13 * * *' 11 | 12 | jobs: 13 | stale: 14 | 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | pull-requests: write 19 | 20 | steps: 21 | - uses: actions/stale@v7 22 | with: 23 | repo-token: ${{ secrets.GITHUB_TOKEN }} 24 | stale-issue-message: | 25 | There was no activity since some time. I hope your issue is solved in the mean time. 26 | This issue will automatically close soon if no further activity occurs. 27 | 28 | Thank you for your contributions. 29 | stale-pr-message: | 30 | This PR is stale because it has not had any recent activity. 31 | This PR will automatically close soon if no further activity occurs. 32 | 33 | Thank you for your contributions. 
34 | days-before-stale: 60 35 | days-before-close: 15 36 | stale-issue-label: stale 37 | stale-pr-label: stale 38 | exempt-issue-labels: 'keep,enhancement,bug,documentation,Feature-request,Known-issue' 39 | exempt-pr-labels: '' 40 | days-before-pr-stale: 1000 41 | exempt-assignees: silask 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | .snakemake/ 28 | .history 29 | # atlas specific 30 | databases 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | databases 38 | .test/* 39 | test/* 40 | !test/*.sh 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | .hypothesis/ 55 | .vscode 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | .venv/ 91 | venv/ 92 | ENV/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | 101 | 102 | # on mac 103 | .DS_Store 104 | example_data 105 | atlas/regex_formating.py 106 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | 2 | language: python 3 | cache: 4 | directories: 5 | - $HOME/miniconda 6 | python: 7 | # We don't actually use the Travis Python, but this keeps it organized. 
8 | - "3.6" 9 | before_install: 10 | - | 11 | if [ -d "$HOME/miniconda" ]; then 12 | echo "conda exists already"; 13 | source activate atlasenv 14 | conda list 15 | else 16 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; 17 | bash miniconda.sh -b -p $HOME/miniconda 18 | export PATH="$HOME/miniconda/bin:$PATH" 19 | hash -r 20 | conda config --set always_yes yes --set changeps1 no 21 | conda update -q conda 22 | # Useful for debugging any issues with conda 23 | conda info -a 24 | conda config --add channels defaults 25 | conda config --add channels bioconda 26 | conda config --add channels conda-forge 27 | conda env create -n atlasenv --file atlasenv.yml 28 | fi 29 | 30 | install: 31 | - source activate atlasenv 32 | - python setup.py install 33 | - atlas --help 34 | - atlas --version 35 | script: 36 | - N_THREADS=2 ; MEM=7 37 | - test/dryrun.sh 38 | - test/test_assembly.sh --resources mem=$MEM java_mem=$MEM --jobs=$N_THREADS --restart-times=2 39 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Submit a bug report 2 | 3 | *First look through all open and closed issues for the error message you got.* 4 | 5 | As atlas is based on snakemake, check if you find a similar bug already discussed for other snakemake-workflows. 6 | 7 | If you don't find any help, submit an issue: 8 | 9 | - specify the system you are working on: linux, cluster, shared filesystem? 10 | - copy the error message 11 | - add the log file of the rule which produced the error. 12 | - If you run atlas on a cluster, also add the log file of the cluster job. 13 | 14 | I hope we can help you... 15 | 16 | # Contribute to the metagenome-atlas code 17 | 18 | ## Prerequisites 19 | 20 | - know the basics of git and GitHub 21 | - know how snakemake works, otherwise check the [tutorial](https://snakemake.readthedocs.io/en/stable/tutorial/tutorial.html) 22 | 23 | ## Setup 24 | 25 | You can ask the maintainers to be added to the repository and work from a *branch* of the main atlas repository, or you can work from a fork of the atlas repository. 26 | 27 | Follow the [steps](https://github.com/metagenome-atlas/atlas#install-the-development-version-from-github) to set up the development version of atlas. This allows you to work with the code you have in the git repository. 28 | 29 | ## Test the code 30 | 31 | ### Locally 32 | 33 | Ideally you should have a test project on your local machine. 34 | When you have created a new rule and want to test the output of this rule, e.g. `my_target.tsv`, you can do so by running: 35 | 36 | ``` atlas run None my_target.tsv ``` 37 | 38 | ### Continuous integration 39 | 40 | When you make a pull request to the master branch, each change in your code gets checked by continuous integration (CI). The tests should make sure that your modifications don't break any other use of atlas. However, due to the requirements needed during the execution of atlas, it is not possible to test all functionalities via CI. If you add functionalities to atlas, they should also be tested. Have a look at the scripts in `.test`.
41 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Start with the Miniconda base image 2 | FROM continuumio/miniconda3:24.9.2-0 3 | 4 | # Set the working directory in the container 5 | WORKDIR /main 6 | 7 | # Copy the environment file and project code 8 | COPY atlasenv.yml . 9 | 10 | # Create a user with a specific UID and GID 11 | RUN groupadd -g 1000 atlasgroup && \ 12 | useradd -m -u 1000 -g atlasgroup -s /bin/bash atlasuser 13 | 14 | # Set the HOME environment variable 15 | ENV HOME=/home/atlasuser 16 | 17 | # Change ownership of the home directory 18 | RUN chown -R atlasuser:atlasgroup $HOME 19 | 20 | # Switch to the new user 21 | USER atlasuser 22 | 23 | # Create and activate the environment 24 | RUN conda env create -n atlas -f atlasenv.yml && \ 25 | conda clean -afy && \ 26 | echo "source activate atlas" > ~/.bashrc 27 | 28 | # Set the working directory 29 | WORKDIR /main 30 | 31 | 32 | # Set the default command 33 | CMD ["bash"] -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Battelle Memorial Institute 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | * Neither the name of Battelle Memorial Institute nor the names of its 13 | contributors may be used to endorse or promote products derived from 14 | this software without specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 20 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 | POSSIBILITY OF SUCH DAMAGE. 
27 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include atlas * 2 | include versioneer.py 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Metagenome-Atlas 2 | 3 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/metagenome-atlas/badges/latest_release_relative_date.svg)](https://anaconda.org/bioconda/metagenome-atlas) 4 | [![Bioconda](https://img.shields.io/conda/dn/bioconda/metagenome-atlas.svg?label=Bioconda )](https://anaconda.org/bioconda/metagenome-atlas) 5 | [![Documentation Status](https://readthedocs.org/projects/metagenome-atlas/badge/?version=latest)](https://metagenome-atlas.readthedocs.io/en/latest/?badge=latest) 6 | ![Mastodon Follow](https://img.shields.io/mastodon/follow/109273833677404282?domain=https%3A%2F%2Fmstdn.science&style=social) 7 | 8 | 9 | Metagenome-atlas is an easy-to-use metagenomic pipeline based on snakemake. It handles all steps from QC, Assembly, Binning, to Annotation. 10 | 11 | ![scheme of workflow](resources/images/atlas_list.png?raw=true) 12 | 13 | You can start using atlas with three commands: 14 | 15 | ```sh 16 | mamba install -y -c bioconda -c conda-forge metagenome-atlas={latest_version} 17 | atlas init --db-dir databases path/to/fastq/files 18 | atlas run all 19 | ``` 20 | 21 | where `{latest_version}` should be replaced by [![Version](https://anaconda.org/bioconda/metagenome-atlas/badges/version.svg)](https://anaconda.org/bioconda/metagenome-atlas) 22 | 23 | ## Webpage 24 | 25 | [metagenome-atlas.github.io](https://metagenome-atlas.github.io/) 26 | 27 | ## Documentation 28 | 29 | 30 | 31 | [Tutorial](https://github.com/metagenome-atlas/Tutorial) 32 | 33 | ## Citation 34 | 35 | > ATLAS: a Snakemake workflow for assembly, annotation, and genomic binning of metagenome sequence data. 36 | > Kieser, S., Brown, J., Zdobnov, E. M., Trajkovski, M. & McCue, L. A. 37 | > BMC Bioinformatics 21, 257 (2020). 38 | > doi: [10.1186/s12859-020-03585-4](https://doi.org/10.1186/s12859-020-03585-4) 39 | 40 | ## Development/Extensions 41 | 42 | Here are some ideas I am working on or want to work on when I have time. If you want to contribute or have some ideas, let me know via a feature request issue. 43 | 44 | - Optimized MAG recovery (e.g. [Spacegraphcats](https://github.com/spacegraphcats/spacegraphcats)) 45 | - Integration of viruses/plasmids that live for now as [extensions](https://github.com/metagenome-atlas/virome_atlas) 46 | - Add statistics and visualisations as in [atlas_analyze](https://github.com/metagenome-atlas/atlas_analyze) 47 | - Implementation of most rules as snakemake wrappers 48 | - Cloud execution 49 | - Update to new Snakemake version and use cool reports. 50 | -------------------------------------------------------------------------------- /atlas/__init__.py: -------------------------------------------------------------------------------- 1 | import snakemake 2 | from .
import _version 3 | import os 4 | 5 | from .workflow.scripts import utils 6 | 7 | 8 | TAX_LEVELS = ["superkingdom", "phylum", "class", "order", "family", "genus", "species"] 9 | BLAST6 = [ 10 | "qseqid", 11 | "sseqid", 12 | "pident", 13 | "length", 14 | "mismatch", 15 | "gapopen", 16 | "qstart", 17 | "qend", 18 | "sstart", 19 | "send", 20 | "evalue", 21 | "bitscore", 22 | ] 23 | 24 | 25 | __version__ = _version.get_versions()["version"] 26 | -------------------------------------------------------------------------------- /atlas/color_logger.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | # root logger 5 | logger = logging.getLogger() 6 | 7 | grey = "\x1b[38;21m" 8 | green = "\x1b[32;21m" 9 | yellow = "\x1b[33;21m" 10 | red = "\x1b[31;21m" 11 | bold_red = "\x1b[31;1m" 12 | reset = "\x1b[0m" 13 | 14 | prefix = "[Atlas] " 15 | 16 | 17 | class ColorFormatter(logging.Formatter): 18 | def __init__( 19 | self, 20 | format="%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)", 21 | ): 22 | self.FORMATS = { 23 | logging.DEBUG: prefix + grey + format + reset, 24 | logging.INFO: prefix + green + format + reset, 25 | logging.WARNING: prefix + yellow + format + reset, 26 | logging.ERROR: prefix + red + format + reset, 27 | logging.CRITICAL: prefix + red + format + reset, 28 | } 29 | 30 | def format(self, record): 31 | log_fmt = self.FORMATS.get(record.levelno) 32 | formatter = logging.Formatter(log_fmt) 33 | return formatter.format(record) 34 | 35 | 36 | # 37 | logging_format = "%(levelname)s: %(message)s" 38 | # datefmt="%Y-%m-%d %H:%M" 39 | # 40 | # 41 | # 42 | # 43 | # 44 | # fileHandler = logging.FileHandler("atlas.log",mode='w') 45 | # fileHandler.setFormatter(logging.Formatter(logging_format)) 46 | # fileHandler.setLevel(logging.DEBUG) 47 | # 48 | # 49 | # 50 | # creat console logging 51 | consoleHandler = logging.StreamHandler() 52 | consoleHandler.setLevel(logging.INFO) 53 | consoleHandler.setFormatter(ColorFormatter(logging_format)) 54 | 55 | 56 | # 57 | 58 | ## Define logging 59 | logging.basicConfig( 60 | level=logging.DEBUG, 61 | datefmt="%Y-%m-%d %H:%M", 62 | format=logging_format, 63 | handlers=[consoleHandler], 64 | ) 65 | logging.captureWarnings(True) 66 | 67 | 68 | # create logging for atlas 69 | 70 | 71 | def handle_exception(exc_type, exc_value, exc_traceback): 72 | if issubclass(exc_type, KeyboardInterrupt): 73 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 74 | return 75 | 76 | logger.error( 77 | "".join( 78 | [ 79 | "Uncaught exception: ", 80 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 81 | ] 82 | ) 83 | ) 84 | 85 | 86 | # Install exception handler 87 | sys.excepthook = handle_exception 88 | 89 | # root logger 90 | logger = logging.getLogger() 91 | 92 | # logger= logging 93 | -------------------------------------------------------------------------------- /atlas/config: -------------------------------------------------------------------------------- 1 | ../config/ -------------------------------------------------------------------------------- /atlas/default_values.py: -------------------------------------------------------------------------------- 1 | # global defaults 2 | MEM = 80 3 | JAVA_MEM_FRACTION = 0.85 4 | PREALLOCATE_RAM = "t" 5 | 6 | 7 | MERGING_FLAGS = "ecct iterations=1" 8 | MERGING_EXTEND2 = 50 9 | MERGING_K = 62 10 | 11 | CONTAMINANT_MAX_INDEL = 20 12 | CONTAMINANT_MIN_RATIO = 0.65 13 | CONTAMINANT_MINIMUM_HITS = 1 
14 | CONTAMINANT_AMBIGUOUS = "best" 15 | CONTAMINANT_KMER_LENGTH = 13 16 | 17 | DUPLICATES_ONLY_OPTICAL = False 18 | DUPLICATES_ALLOW_SUBSTITUTIONS = 2 19 | 20 | NORMALIZATION_KMER_LENGTH = 21 21 | 22 | # almost no filtering unless grossly over-represented 23 | NORMALIZATION_TARGET_DEPTH = 1000 # 500 24 | # allow very low represented kmers to remain 25 | NORMALIZATION_MINIMUM_KMERS = 3 # 15 26 | 27 | ASSEMBLY_MEMORY = 250 28 | ASSEMBLY_THREADS = 8 29 | MEGAHIT_MIN_COUNT = 2 30 | MEGAHIT_K_MIN = 21 31 | MEGAHIT_K_MAX = 121 32 | MEGAHIT_K_STEP = 20 33 | MEGAHIT_MERGE_LEVEL = "20,0.98" 34 | MEGAHIT_PRUNE_LEVEL = 2 35 | MEGAHIT_LOW_LOCAL_RATIO = 0.2 36 | SPADES_K = "auto" 37 | 38 | # this is bumped up slightly to filter non-merged R1 and R2 sequences 39 | MINIMUM_CONTIG_LENGTH = 300 # 2200 40 | 41 | # leave all contigs 42 | MINIMUM_AVERAGE_COVERAGE = 1 # 5 43 | MINIMUM_PERCENT_COVERED_BASES = 20 # 40 44 | MINIMUM_MAPPED_READS = 0 45 | CONTIG_TRIM_BP = 0 # 100 46 | 47 | # bases 48 | MINIMUM_REGION_OVERLAP = 1 49 | FEATURE_COUNTS_ALLOW_OVERLAP = True 50 | MAXIMUM_COUNTED_MAP_SITES = 10 51 | # default bbmap 52 | CONTIG_MIN_ID = 0.76 53 | CONTIG_MAP_PAIRED_ONLY = True 54 | CONTIG_MAX_DISTANCE_BETWEEN_PAIRS = 1000 55 | # only best 56 | CONTIG_COUNT_MULTI_MAPPED_READS = False 57 | # set minimum map quality 58 | MINIMUM_MAP_QUALITY = 0 59 | 60 | PROKKA_KINGDOM = "Bacteria" 61 | 62 | MAXBIN_MAX_ITERATION = 50 63 | MAXBIN_MIN_CONTIG_LENGTH = 1000 64 | MAXBIN_PROB_THRESHOLD = 0.9 65 | 66 | DIAMOND_TOP_SEQS = 2 67 | DIAMOND_E_VALUE = 0.000001 68 | DIAMOND_MIN_IDENTITY = 50 69 | DIAMOND_QUERY_COVERAGE = 60 70 | DIAMOND_GAP_OPEN = 11 71 | DIAMOND_GAP_EXTEND = 1 72 | DIAMOND_BLOCK_SIZE = 2 73 | DIAMOND_INDEX_CHUNKS = 4 74 | 75 | SUMMARY_METHOD = "lca" 76 | AGGREGATION_METHOD = "lca-majority" 77 | MAJORITY_THRESHOLD = 0.51 78 | MIN_BITSCORE = 0 79 | MIN_LENGTH = 20 80 | MAX_HITS = 100 81 | 82 | 83 | EGGNOG_HEADER = [ 84 | "Query", 85 | "Seed", 86 | "Seed_evalue", 87 | "Seed_Score", 88 | "eggNOG", 89 | "max_annot_lvl", 90 | "COG_cat", 91 | "Description", 92 | "Name", 93 | "GO_terms", 94 | "EC", 95 | "KO", 96 | "KEGG_Pathway", 97 | "KEGG_Module", 98 | "KEGG_Reaction", 99 | "KEGG_rclass", 100 | "BRITE", 101 | "KEGG_TC", 102 | "CAZy", 103 | "BiGG_Reaction", 104 | "PFAMs", 105 | ] 106 | -------------------------------------------------------------------------------- /atlas/init/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metagenome-atlas/atlas/b7694014f0bb3284255325b7abb93d2cf58224e6/atlas/init/__init__.py -------------------------------------------------------------------------------- /atlas/init/parse_sra.py: -------------------------------------------------------------------------------- 1 | # from ..color_logger import logger 2 | import logging 3 | 4 | logger = logging.getLogger(__file__) 5 | import pandas as pd 6 | 7 | 8 | Expected_library_values = { 9 | "LibrarySelection": "RANDOM", 10 | "LibraryStrategy": "WGS", 11 | "LibrarySource": "METAGENOMIC", 12 | "Platform": "ILLUMINA", 13 | } 14 | 15 | 16 | def load_and_validate_runinfo_table(path): 17 | RunTable = pd.read_csv(path, sep="\t", index_col=0) 18 | 19 | # validate sra table 20 | format_error = False 21 | 22 | # check if all headers are present 23 | Expected_headers = [ 24 | "LibraryLayout", 25 | "LibrarySource", 26 | "LibrarySelection", 27 | "LibraryStrategy", 28 | "BioSample", 29 | ] 30 | for header in Expected_headers: 31 | if not header in RunTable.columns: 32 | 
logger.error(f"Didn't found expected header {header}") 33 | format_error = True 34 | 35 | if not all(RunTable.index.str[1:2] == "R"): 36 | logger.error("Expect runs as index, e.g. [E,S,D]RR000") 37 | format_error = True 38 | 39 | if not RunTable.BioSample.str.startswith("SAM").all(): 40 | logger.error("BioSample should start with 'SAM'") 41 | format_error = True 42 | 43 | if not RunTable.LibraryLayout.isin(["PAIRED", "SINGLE"]).all(): 44 | logger.error("LibraryLayout should be 'PAIRED' or 'SINGLE'") 45 | format_error = True 46 | 47 | if format_error: 48 | logger.error("RunTable {} is not valid. Abort.".format(path)) 49 | exit(1) 50 | 51 | return RunTable 52 | 53 | 54 | def filter_runinfo(RunTable, ignore_paired=False): 55 | logger.info( 56 | f"Start with {RunTable.shape[0]} runs from {RunTable.BioSample.unique().shape[0]} samples" 57 | ) 58 | 59 | # Filter out reads that are not metagenomics 60 | 61 | for key in ["LibrarySource"]: 62 | Nruns_before = RunTable.shape[0] 63 | All_values = RunTable[key].unique() 64 | RunTable = RunTable.loc[RunTable[key] == Expected_library_values[key]] 65 | 66 | Difference = Nruns_before - RunTable.shape[0] 67 | 68 | if Difference > 0: 69 | logger.info( 70 | f"Runs have the following values for {key}: {', '.join(All_values)}\n" 71 | f"Select only runs {key} == {Expected_library_values[key]}, " 72 | f"Filtered out {Difference} runs" 73 | ) 74 | 75 | for key in ["LibrarySelection", "LibraryStrategy"]: 76 | Nruns_before = RunTable.shape[0] 77 | All_values = RunTable[key].unique() 78 | if any(RunTable[key] != Expected_library_values[key]): 79 | logger.warning( 80 | f"Runs have the following values for {key}: {', '.join(All_values)}\n" 81 | f"Usually I expect {key} == {Expected_library_values[key]} " 82 | ) 83 | 84 | # Handle single end reads if mixed 85 | 86 | if ("PAIRED" in RunTable.LibraryLayout) and ("SINGLE" in RunTable.LibraryLayout): 87 | N_library_layout = RunTable.LibraryLayout.value_counts() 88 | 89 | logger.info( 90 | f"Run table contains {N_library_layout['SINGLE']} single-end " 91 | f"and {N_library_layout['PAIRED']} paired-end libraries. " 92 | ) 93 | 94 | if ignore_paired: 95 | logger.info(f"I drop {N_library_layout['PAIRED']} paired end libraries") 96 | RunTable = RunTable.query("LibraryLayout == 'SINGLE'") 97 | 98 | else: 99 | logger.warning(f"I drop {N_library_layout['SINGLE']} single end libraries") 100 | 101 | RunTable = RunTable.query("LibraryLayout == 'PAIRED'") 102 | 103 | # Illumina or not 104 | 105 | if not RunTable.Platform.isin(["ILLUMINA"]).all(): 106 | Platforms = ", ".join(RunTable.Platform.unique()) 107 | 108 | logger.warning( 109 | f"Your samples are sequenced on the following platform: {Platforms}\n" 110 | "I don't know how well Atlas handles non-illumina reads.\n" 111 | "If you have long-reads, specify them via a the longreads, column in the sample table." 112 | ) 113 | 114 | # Final 115 | if RunTable.shape[0] > 0: 116 | logger.info( 117 | f"Selected {RunTable.shape[0]} runs from {RunTable.BioSample.unique().shape[0]} samples" 118 | ) 119 | 120 | else: 121 | logger.critical("No runs left after filtering. 
Abort.") 122 | exit(1) 123 | 124 | return RunTable 125 | 126 | 127 | def validate_merging_runinfo(path): 128 | RunTable = load_and_validate_runinfo_table(path) 129 | 130 | # If each run is from a different biosample, merging is not necessary 131 | if RunTable.shape[0] == RunTable.BioSample.unique().shape[0]: 132 | return RunTable 133 | 134 | # Cannot merge if different platforms 135 | problematic_samples = [] 136 | for sample, df in RunTable.groupby("BioSample"): 137 | if not all(df.Platform == df.Platform.iloc[0]): 138 | problematic_samples.append(sample) 139 | 140 | if len(problematic_samples) > 0: 141 | logger.error( 142 | f"You attempt to merge runs from the same sample. " 143 | f"But for {len(problematic_samples)} samples the runs are sequenced with different platforms and shouldn't be merged.\n" 144 | f"Please resolve the ambiguity in the table {path} and rerun the command.\n" 145 | ) 146 | 147 | exit(1) 148 | 149 | # Warn if samples are not identical for the following columns 150 | Expected_same_values = ["Experiment", "Model", "LibraryName"] 151 | for key in Expected_same_values: 152 | problematic_samples = [] 153 | for sample, df in RunTable.groupby("BioSample"): 154 | if not all(df[key] == df[key].iloc[0]): 155 | problematic_samples.append(sample) 156 | 157 | if len(problematic_samples) > 0: 158 | if len(problematic_samples) > 5: 159 | problematic_samples_list = " ".join(problematic_samples[:3] + ["..."]) 160 | else: 161 | problematic_samples_list = " ".join(problematic_samples) 162 | 163 | logger.warning( 164 | "You attempt to merge runs from the same sample. " 165 | f"But for {len(problematic_samples)} samples the runs have different {key}: {problematic_samples_list}\n" 166 | f"You can modify the table {path} and rerun the command.\n" 167 | ) 168 | 169 | logger.info("I will automatically merge runs from the same biosample.") 170 | 171 | return RunTable 172 | -------------------------------------------------------------------------------- /atlas/sample_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | import logging 4 | 5 | logger = logging.getLogger(__file__) 6 | 7 | 8 | def validate_sample_table(sampleTable): 9 | Expected_Headers = ["BinGroup"] 10 | for h in Expected_Headers: 11 | if not (h in sampleTable.columns): 12 | logger.error(f"expect '{h}' to be found in samples.tsv") 13 | exit(1) 14 | elif sampleTable[h].isnull().any(): 15 | logger.error(f"Found empty values in the sample table column '{h}'") 16 | exit(1) 17 | 18 | if not sampleTable.index.is_unique: 19 | duplicated_samples = ", ".join(sampleTable.index.duplicated()) 20 | logger.error( 21 | f"Expect Samples to be unique. Found {duplicated_samples} more than once" 22 | ) 23 | exit(1) 24 | 25 | if sampleTable.index.str.match("^\d").any(): 26 | logger.error( 27 | f"Sample names shouldn't start with a digit. This can lead to incompatibilities.\n {list(sampleTable.index)}" 28 | ) 29 | exit(1) 30 | 31 | if sampleTable.index.str.contains("_").any(): 32 | logger.error( 33 | f"Sample names shouldn't contain underscores. This can lead to incompatibilities. \n {list(sampleTable.index)}" 34 | ) 35 | exit(1) 36 | 37 | if sampleTable.index.str.count("-").max() > 1: 38 | logger.error( 39 | f"Sample names shouldn't have more than one hypo '-'. 
This can lead to incompatibilities.\n {list(sampleTable.index)}" 40 | ) 41 | exit(1) 42 | 43 | ### Validate BinGroup 44 | 45 | if sampleTable.BinGroup.isnull().any(): 46 | logger.warning(f"Found empty values in the sample table column 'BinGroup'") 47 | 48 | if sampleTable.BinGroup.str.contains("_").any(): 49 | logger.error( 50 | f"BinGroup names shouldn't contain underscores. This can lead to incompatibilities. \n {list(sampleTable.BinGroup)}" 51 | ) 52 | exit(1) 53 | 54 | if sampleTable.BinGroup.str.contains("-").any(): 55 | logger.error( 56 | f"BinGroup names shouldn't contain hyphens '-'. This can lead to incompatibilities.\n {list(sampleTable.BinGroup)}" 57 | ) 58 | exit(1) 59 | 60 | 61 | def load_sample_table(sample_table="samples.tsv"): 62 | sampleTable = pd.read_csv(sample_table, index_col=0, sep="\t") 63 | validate_sample_table(sampleTable) 64 | return sampleTable 65 | 66 | 67 | class BinGroupSizeError(Exception): 68 | """ 69 | Exception raised for invalid BinGroup sizes. 70 | """ 71 | 72 | def __init__(self, message): 73 | super(BinGroupSizeError, self).__init__(message) 74 | 75 | 76 | def validate_bingroup_size_cobinning(sampleTable, logger): 77 | """ 78 | Validate that the bingroups are not too large, nor too small for co-binning. 79 | 80 | e.g. vamb and SemiBin 81 | """ 82 | 83 | bin_group_sizes = sampleTable.BinGroup.value_counts() 84 | 85 | if bin_group_sizes.max() > 180: 86 | logger.warning( 87 | f"Found a bin group with more than 180 samples. This might lead to memory issues. \n {bin_group_sizes}" 88 | ) 89 | 90 | if bin_group_sizes.min() < 10: 91 | logger.error( 92 | "If you want to use co-binning, you should have at least 5-10 samples per bin group. \n" 93 | ) 94 | raise BinGroupSizeError("BinGroup too small") 95 | 96 | 97 | def validate_bingroup_size_metabat(sampleTable, logger): 98 | bin_group_sizes = sampleTable.BinGroup.value_counts() 99 | 100 | max_bin_group_size = bin_group_sizes.max() 101 | 102 | warn_message = ( 103 | "Co-binning with metabat uses cross-mapping, which scales quadratically. " 104 | f"You have a bingroup with {max_bin_group_size} samples, which already leads to {max_bin_group_size*max_bin_group_size} cross-mappings. " 105 | ) 106 | 107 | if max_bin_group_size > 50: 108 | logger.error( 109 | warn_message 110 | + "This is too much for metabat. Please use vamb or SemiBin, or split your samples into smaller groups." 111 | ) 112 | raise BinGroupSizeError("BinGroup too large") 113 | 114 | if max_bin_group_size > 15: 115 | logger.warning( 116 | warn_message 117 | + "This might be too much for metabat. Consider using vamb or SemiBin, or split your samples into smaller groups." 118 | ) 119 | 120 | elif max_bin_group_size == 1: 121 | logger.warning( 122 | "You have only one sample per bingroup. This doesn't use the co-abundance information."
123 | ) 124 | 125 | 126 | def validate_bingroup_size(sampleTable, config, logger): 127 | if config["final_binner"] == "DASTool": 128 | binners = config["binner"] 129 | 130 | logger.info(f"DASTool uses the following binners: {binners}") 131 | 132 | if ("vamb" in binners) or ("SemiBin" in binners): 133 | validate_bingroup_size_cobinning(sampleTable, logger) 134 | 135 | if "metabat" in binners: 136 | validate_bingroup_size_metabat(sampleTable, logger) 137 | 138 | elif config["final_binner"] == "metabat": 139 | validate_bingroup_size_metabat(sampleTable, logger) 140 | 141 | elif config["final_binner"] in ["vamb", "SemiBin"]: 142 | validate_bingroup_size_cobinning(sampleTable, logger) 143 | 144 | elif config["final_binner"] == "maxbin": 145 | logger.warning("maxbin doesn't use co-abundance for binning.") 146 | 147 | else: 148 | raise Exception(f"Unknown final binner: {config['final_binner']}") 149 | -------------------------------------------------------------------------------- /atlas/workflow: -------------------------------------------------------------------------------- 1 | ../workflow/ -------------------------------------------------------------------------------- /atlasenv.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - python >=3.8, < 3.12 7 | - mamba 8 | - bbmap >= 39.01, <40 9 | - snakemake-minimal >= 7.18.1, <7.26 10 | - pygments 11 | - networkx 12 | - graphviz 13 | - pandas >=1.2, <1.6 14 | - pyarrow # for parquet reading 15 | - click >=7 16 | - ruamel.yaml >=0.17 17 | - cookiecutter 18 | - wget 19 | -------------------------------------------------------------------------------- /config/default_config.yaml: -------------------------------------------------------------------------------- 1 | ## use just in time compilation for bbmap 2 | usejni: false 3 | 4 | gene_annotations: [] 5 | 6 | genome_filter_criteria: "(Completeness-5*Contamination >50 ) & (Length_scaffolds >=50000) & (Ambigious_bases <1e6) & (N50 > 5*1e3) & (N_scaffolds < 1e3)" 7 | exclude_unplacable_genomes: false 8 | 9 | genome_dereplication: 10 | ANI: 0.95 ## Genome dereplication threshold 11 | overlap: 0.2 # See more on https://drep.readthedocs.io/en/latest/module_descriptions.html 12 | greedy_clustering: "auto" # Add options for greedy clustering 'auto' when using more than 5k bins 13 | opt_parameters: "" 14 | score: 15 | completeness: 1 16 | contamination: 5 17 | N50: 0.5 18 | length: 0 19 | centrality: 1 20 | 21 | genome_aligner: "minimap" 22 | 23 | bin_quality_asesser: checkm2 #[ checkm2, busco, checkm] 24 | 25 | semibin_options: "" 26 | semibin_train_extra: "" 27 | 28 | filter_chimieric_bins: true 29 | gunc_database: "progenomes" # progenomes or gtdb 30 | 31 | binner: # If DASTool is used as final_binner, use predictions of these binners 32 | - metabat 33 | - maxbin 34 | # - vamb 35 | 36 | cobinning_readmapping_id: 0.95 # when mapping reads to contigs from different samples, use a less stringent alignment threshold 37 | 38 | preprocess_qtrim: "rl" 39 | preprocess_kmer_trim: "r" 40 | preprocess_minimum_base_quality: 10 41 | preprocess_allowable_kmer_mismatches: 1 42 | preprocess_reference_kmer_match_length: 27 43 | preprocess_minimum_passing_read_length: 51 44 | preprocess_minimum_base_frequency: 0.05 45 | preprocess_max_ns: -1 46 | preallocate_ram: "t" 47 | error_correction_overlapping_pairs: "t" 48 |
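The `genome_filter_criteria` string above is a boolean expression over the columns of the genome quality table, written in pandas query syntax. As a rough, hypothetical illustration (not the pipeline's actual filtering code), the sketch below shows how such an expression could be evaluated with `pandas.DataFrame.query`; the example table and MAG names are made up, and the column names are simply taken from the default criteria string:

```python
import pandas as pd

# Hypothetical genome-quality table; columns are named as in the default criteria above.
genome_info = pd.DataFrame(
    {
        "Completeness": [98.5, 62.0],
        "Contamination": [1.2, 8.0],
        "Length_scaffolds": [2_500_000, 40_000],
        "Ambigious_bases": [0, 150],  # spelled as in the config string
        "N50": [45_000, 3_000],
        "N_scaffolds": [120, 1_500],
    },
    index=["MAG001", "MAG002"],
)

filter_criteria = (
    "(Completeness-5*Contamination >50 ) & (Length_scaffolds >=50000) "
    "& (Ambigious_bases <1e6) & (N50 > 5*1e3) & (N_scaffolds < 1e3)"
)

# query() evaluates the expression row-wise and keeps only the passing genomes;
# here only MAG001 satisfies all criteria.
passing_genomes = genome_info.query(filter_criteria)
print(passing_genomes.index.tolist())  # ['MAG001']
```

Tightening or relaxing the criteria string (for example, requiring a higher N50) therefore only changes which rows of the quality table survive this filter.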
-------------------------------------------------------------------------------- /docker_run.sh: -------------------------------------------------------------------------------- 1 | docker run -it --name atlas-debug -v $(pwd):/main atlas-debug -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = Metagenome-Atlas 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/advanced/assembly.rst: -------------------------------------------------------------------------------- 1 | Pre-Assembly Processing 2 | ------------------------ 3 | 4 | Normalization Parameters 5 | `````````````````````````` 6 | 7 | To improve assembly time, and often the assemblies themselves, coverage is 8 | normalized across kmers to a target depth and can be set using:: 9 | 10 | # kmer length over which we calculate coverage 11 | normalization_kmer_length: 21 12 | # the normalized target coverage across kmers 13 | normalization_target_depth: 100 14 | # reads must have at least this many kmers over min depth to be retained 15 | normalization_minimum_kmers: 8 16 | 17 | 18 | 19 | Error Correction 20 | `````````````````````````` 21 | 22 | Optionally perform error correction using ``tadpole.sh`` from BBTools:: 23 | 24 | perform_error_correction: true 25 | 26 | 27 | 28 | Assembly Parameters 29 | ------------------------ 30 | 31 | 32 | Assembler 33 | `````````````````````````` 34 | 35 | Currently, the supported assemblers are 'spades' and 'megahit' with the 36 | default setting of:: 37 | 38 | assembler: megahit 39 | 40 | Both assemblers have settings that can be altered in the configuration:: 41 | 42 | # minimum multiplicity for filtering (k_min+1)-mers 43 | megahit_min_count: 2 44 | # minimum kmer size (<= 255), must be odd number 45 | megahit_k_min: 21 46 | # maximum kmer size (<= 255), must be odd number 47 | megahit_k_max: 121 48 | # increment of kmer size of each iteration (<= 28), must be even number 49 | megahit_k_step: 20 50 | # merge complex bubbles of length <= l*kmer_size and similarity >= s 51 | megahit_merge_level: 20,0.98 52 | # strength of low depth pruning (0-3) 53 | megahit_prune_level: 2 54 | # ratio threshold to define low local coverage contigs 55 | megahit_low_local_ratio: 0.2 56 | # minimum length of contigs (after contig trimming) 57 | minimum_contig_length: 200 58 | # comma-separated list of k-mer sizes (must be odd and less than 128) 59 | spades_k: auto 60 | 61 | 62 | Contig Filtering 63 | `````````````````````````` 64 | 65 | After assembly, contigs can be filtered based on several metrics:: 66 | 67 | # Discard contigs with lower average coverage. 68 | minimum_average_coverage: 5 69 | # Discard contigs with a lower percent covered bases. 70 | minimum_percent_covered_bases: 40 71 | # Discard contigs with fewer mapped reads.
72 | minimum_mapped_reads: 0 73 | # Trim the first and last X bases of each sequence. 74 | contig_trim_bp: 0 75 | -------------------------------------------------------------------------------- /docs/advanced/qc.rst: -------------------------------------------------------------------------------- 1 | Quality control of reads 2 | ------------------------- 3 | 4 | 5 | Adapter Trimming 6 | `````````````````````````` 7 | 8 | FASTA file paths for adapter sequences to be trimmed from the sequence ends. 9 | 10 | We provide the adapter reference FASTA included in `bbmap` for various 11 | 12 | :: 13 | 14 | preprocess_adapters: /database_dir/adapters.fa 15 | 16 | 17 | Quality Trimming 18 | `````````````````````````` 19 | 20 | Trim regions with an average quality below this threshold. Higher is more 21 | stringent. 22 | 23 | :: 24 | 25 | preprocess_minimum_base_quality: 10 26 | 27 | 28 | Adapter Trimming at Read Tips 29 | ```````````````````````````````````````````````````` 30 | 31 | Allow shorter kmer matches down to `mink` at the read ends. 0 disables. 32 | 33 | :: 34 | 35 | preprocess_adapter_min_k: 8 36 | 37 | 38 | Allowable Mismatches in Adapter Hits 39 | ```````````````````````````````````````````````````` 40 | 41 | Maximum number of substitutions between the target adapter kmer and the query 42 | sequence kmer. Lower is more stringent. 43 | 44 | :: 45 | 46 | preprocess_allowable_kmer_mismatches: 1 47 | 48 | 49 | Contaminant Kmer Length 50 | `````````````````````````` 51 | 52 | Kmer length used for finding contaminants. Contaminant matches shorter than 53 | this length will not be found. 54 | 55 | :: 56 | 57 | preprocess_reference_kmer_match_length: 27 58 | 59 | 60 | Read Length Threshold 61 | `````````````````````````` 62 | 63 | This is applied after quality and adapter trimming have been applied to the 64 | sequence. 65 | 66 | :: 67 | 68 | preprocess_minimum_passing_read_length: 51 69 | 70 | 71 | Sequence Complexity Filter 72 | `````````````````````````` 73 | 74 | Require this fraction of each nucleotide per sequence to eliminate low 75 | complexity reads. 76 | 77 | :: 78 | 79 | preprocess_minimum_base_frequency: 0.05 80 | 81 | 82 | Contamination Parameters 83 | `````````````````````````` 84 | 85 | Contamination reference sequences in the form of nucleotide FASTA files can be 86 | provided and filtered from the reads using the following parameters. 87 | 88 | If 'rRNA' is defined, it will be added back to metagenomes but not to metatranscriptomes. 
89 | Additional references can be added arbitrarily, such as:: 90 | :: 91 | 92 | contaminant_references: 93 | rRNA: /database_dir/silva_rfam_all_rRNAs.fa 94 | phiX: /database_dir/phiX174_virus.fa 95 | 96 | Don't look for indels longer than this:: 97 | 98 | contaminant_max_indel: 20 99 | 100 | 101 | Fraction of max alignment score required to keep a site:: 102 | 103 | contaminant_min_ratio: 0.65 104 | 105 | mapping kmer length; range 8-15; longer is faster but uses more memory; shorter is more sensitive:: 106 | 107 | contaminant_kmer_length: 12 108 | 109 | Minimum number of seed hits required for candidate sites:: 110 | 111 | contaminant_minimum_hits: 1 112 | 113 | Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations): 114 | 115 | - best (use the first best site) 116 | - toss (consider unmapped, retain in reads for assembly) 117 | - random (select one top-scoring site randomly) 118 | - all (retain all top-scoring sites) 119 | 120 | :: 121 | 122 | contaminant_ambiguous: best 123 | 124 | For host decontamination we suggest the following genomes, where contaminants and low complexity regions were masked. 125 | 126 | Many thanks to Brian Bushnell for providing the genomes of [human](https://drive.google.com/file/d/0B3llHR93L14wd0pSSnFULUlhcUk/edit?resourcekey=0-PsIKmg2q4EvTGWGOUjsKGQ),[mouse](https://drive.google.com/file/d/0B3llHR93L14wYmJYNm9EbkhMVHM/view?resourcekey=0-jSsdejBncqPu4eiFfJvf1w), 127 | [dog](https://drive.google.com/file/d/0B3llHR93L14wTHdWRG55c2hPUXM/view?resourcekey=0-nJ2WQzTQYrTizK0pllVRZg), and [cat](https://drive.google.com/file/d/0B3llHR93L14wOXJhWXRlZjBpVUU/view?resourcekey=0-xxh33oYWp5FGBpRzobD_uw). [Source](https://www.seqanswers.com/forum/bioinformatics/bioinformatics-aa/37175-introducing-removehuman-human-contaminant-removal?p=286481#post286481) 128 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ATLAS documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jan 20 12:31:40 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | # import os 21 | # import sys 22 | # sys.path.insert(0, os.path.abspath('.')) 23 | 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | "sphinx.ext.autodoc", 36 | "sphinx.ext.todo", 37 | "sphinx.ext.viewcode", 38 | "sphinx.ext.napoleon", 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ["_templates"] 43 | 44 | # The suffix(es) of source filenames. 
45 | # You can specify multiple suffix as a list of string: 46 | # 47 | source_suffix = [".rst", ".md"] 48 | # source_suffix = ".rst" 49 | 50 | # The master toctree document. 51 | master_doc = "index" 52 | 53 | # General information about the project. 54 | project = "Metagenome-atlas" 55 | copyright = "2021, Silas Kieser" 56 | author = "Joe Brown and Silas Kieser" 57 | 58 | # The version info for the project you're documenting, acts as replacement for 59 | # |version| and |release|, also used in various other places throughout the 60 | # built documents. 61 | # 62 | # The short X.Y version. 63 | version = "2.0" 64 | # The full version, including alpha/beta/rc tags. 65 | release = "2.0" 66 | 67 | # The language for content autogenerated by Sphinx. Refer to documentation 68 | # for a list of supported languages. 69 | # 70 | # This is also used if you do content translation via gettext catalogs. 71 | # Usually you set "language" from the command line for these cases. 72 | language = None 73 | 74 | # List of patterns, relative to source directory, that match files and 75 | # directories to ignore when looking for source files. 76 | # This patterns also effect to html_static_path and html_extra_path 77 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "old"] 78 | 79 | # The name of the Pygments (syntax highlighting) style to use. 80 | pygments_style = "sphinx" 81 | 82 | # If true, `todo` and `todoList` produce output, else they produce nothing. 83 | todo_include_todos = True 84 | 85 | 86 | # -- Options for HTML output ---------------------------------------------- 87 | 88 | # The theme to use for HTML and HTML Help pages. See the documentation for 89 | # a list of builtin themes. 90 | # 91 | html_theme = "sphinx_rtd_theme" 92 | 93 | # Theme options are theme-specific and customize the look and feel of a theme 94 | # further. For a list of options available for each theme, see the 95 | # documentation. 96 | # 97 | # html_theme_options = {} 98 | 99 | # Add any paths that contain custom static files (such as style sheets) here, 100 | # relative to this directory. They are copied after the builtin static files, 101 | # so a file named "default.css" will overwrite the builtin "default.css". 102 | html_static_path = ["reports"] 103 | 104 | 105 | # -- Options for HTMLHelp output ------------------------------------------ 106 | 107 | # Output file base name for HTML help builder. 108 | htmlhelp_basename = "ATLASdoc" 109 | 110 | 111 | # -- Options for LaTeX output --------------------------------------------- 112 | 113 | # latex_elements = { 114 | # # The paper size ('letterpaper' or 'a4paper'). 115 | # # 116 | # # 'papersize': 'letterpaper', 117 | # # The font size ('10pt', '11pt' or '12pt'). 118 | # # 119 | # # 'pointsize': '10pt', 120 | # # Additional stuff for the LaTeX preamble. 121 | # # 122 | # # 'preamble': '', 123 | # # Latex figure (float) alignment 124 | # # 125 | # # 'figure_align': 'htbp', 126 | # } 127 | # 128 | # # Grouping the document tree into LaTeX files. List of tuples 129 | # # (source start file, target name, title, 130 | # # author, documentclass [howto, manual, or own class]). 131 | # latex_documents = [ 132 | # (master_doc, "ATLAS.tex", "ATLAS Documentation", "Joe Brown", "manual"), 133 | # ] 134 | # 135 | 136 | # -- Options for manual page output --------------------------------------- 137 | 138 | # One entry per manual page. List of tuples 139 | # (source start file, name, description, authors, manual section). 
140 | man_pages = [(master_doc, "atlas", "ATLAS Documentation", [author], 1)] 141 | 142 | 143 | # -- Options for Texinfo output ------------------------------------------- 144 | 145 | # Grouping the document tree into Texinfo files. List of tuples 146 | # (source start file, target name, title, author, 147 | # dir menu entry, description, category) 148 | texinfo_documents = [ 149 | ( 150 | master_doc, 151 | "ATLAS", 152 | "ATLAS Documentation", 153 | author, 154 | "ATLAS", 155 | "One line description of project.", 156 | "Miscellaneous", 157 | ), 158 | ] 159 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. image:: https://anaconda.org/bioconda/metagenome-atlas/badges/version.svg 3 | :target: https://anaconda.org/bioconda/metagenome-atlas 4 | 5 | .. image:: https://img.shields.io/conda/dn/bioconda/metagenome-atlas.svg?label=Bioconda 6 | :target: https://bioconda.github.io/recipes/metagenome-atlas/README.html 7 | 8 | 9 | .. image:: https://img.shields.io/twitter/follow/SilasKieser.svg?style=social&label=Follow 10 | :target: https://twitter.com/search?f=tweets&q=%40SilasKieser%20%23metagenomeAtlas&src=typd 11 | 12 | 13 | .. |logo| image:: ../resources/images/atlas_image.png 14 | :alt: Metagenome-atlas logo 15 | 16 | 17 | 18 | 19 | Metagenome-Atlas 20 | **************** 21 | 22 | |logo| 23 | 24 | Metagenome-Atlas is an easy-to-use metagenomic pipeline based on `snakemake `_. 25 | It handles all steps from QC, Assembly, Binning, to Annotation. 26 | 27 | You can start using atlas with three commands:: 28 | 29 | mamba install -c bioconda -c conda-forge metagenome-atlas={latest_version} 30 | atlas init --db-dir databases path/to/fastq/files 31 | atlas run 32 | 33 | where `{latest_version}` should be replaced by 34 | 35 | 36 | .. image:: https://anaconda.org/bioconda/metagenome-atlas/badges/version.svg 37 | :target: https://anaconda.org/bioconda/metagenome-atlas 38 | 39 | 40 | .. _publication: 41 | 42 | Publication 43 | =========== 44 | 45 | ATLAS: a Snakemake workflow for assembly, annotation, and genomic binning of metagenome sequence data. 46 | Kieser, S., Brown, J., Zdobnov, E. M., Trajkovski, M. & McCue, L. A. 47 | BMC Bioinformatics 21, 257 (2020). 48 | doi: `10.1186/s12859-020-03585-4 `_ 49 | 50 | 51 | .. toctree:: 52 | :maxdepth: 2 53 | :caption: Documentation 54 | 55 | usage/getting_started 56 | usage/output 57 | usage/configuration 58 | usage/changelog 59 | -------------------------------------------------------------------------------- /docs/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.codespell] 2 | # Ref: https://github.com/codespell-project/codespell#using-a-config-file 3 | skip = '.git,*.pdf,*.svg,versioneer.py,*.css,test_*' 4 | check-hidden = true 5 | ignore-regex = '^\s*"image/\S+": ".*|\b[Mm]anuel[. ][Hh]oltgrewe\b' 6 | ignore-words-list = 'testin' 7 | 8 | 9 | [tool.versioneer] 10 | VCS = git 11 | style = pep440 12 | versionfile_source = atlas/_version.py 13 | versionfile_build = atlas/_version.py 14 | tag_prefix = v 15 |
-------------------------------------------------------------------------------- /docs/reports/samples.tsv: -------------------------------------------------------------------------------- 1 | Reads_raw_R1 Reads_raw_R2 Reads_QC_R1 Reads_QC_R2 BinGroup 2 | S001 /Users/silas/Documents/metagenomics/data/S001_R1.fastq.gz /Users/silas/Documents/metagenomics/data/S001_R2.fastq.gz Cage1 3 | S002 /Users/silas/Documents/metagenomics/data/S002_R1.fastq.gz /Users/silas/Documents/metagenomics/data/S002_R2.fastq.gz Cage1 4 | -------------------------------------------------------------------------------- /docs/usage/changelog.md: -------------------------------------------------------------------------------- 1 | 2 | (_changelog)= 3 | 4 | ```{include} ../../CHANGELOG.md 5 | ``` -------------------------------------------------------------------------------- /docs/usage/configuration.rst: -------------------------------------------------------------------------------- 1 | 2 | .. _configuration: 3 | 4 | 5 | Configure Atlas 6 | *************** 7 | 8 | .. _contaminants: 9 | 10 | 11 | Remove reads from Host 12 | ====================== 13 | 14 | One of the most important steps in quality control is to remove reads originating from the host genome. 15 | You can add any number of genomes to be removed. 16 | 17 | We recommend using genomes where repetitive sequences are masked. 18 | For more details, see the `human genome `_. 19 | 20 | 21 | Co-abundance Binning 22 | ==================== 23 | 24 | .. _cobinning: 25 | 26 | While binning each sample individually is faster, using co-abundance for binning is recommended. 27 | Quantifying the coverage of contigs across multiple samples provides valuable insights about contig co-variation. 28 | 29 | There are two primary strategies for co-abundance binning: 30 | 31 | 1. **Cross mapping:** Map the reads from multiple samples to each sample's contigs. 32 | 2. **Co-binning:** Concatenate contigs from multiple samples and map all the reads to these combined contigs. 33 | 34 | `final_binner: metabat2` is used for cross-mapping, while `vamb` or `SemiBin` is used for co-binning. 35 | 36 | The samples to be binned together are specified using the `BinGroup` in the `sample.tsv` file. 37 | The size of the BinGroup should be selected based on the binner and the co-binning strategy in use. 38 | 39 | Cross-mapping complexity scales quadratically with the size of the BinGroup, since each sample's reads are mapped to every other sample's contigs. 40 | This might yield better results for complex metagenomes, although no definitive benchmark is known. 41 | On the other hand, co-binning is more efficient, as it maps a sample's reads only once to a potentially large assembly. 42 | 43 | Default Behavior 44 | ---------------- 45 | 46 | Starting with version 2.18, Atlas places every sample in a single BinGroup and defaults to `vamb` as the binner unless there are very few samples. 47 | For fewer than 8 samples, `metabat` is the default binner. 48 | 49 | .. note:: 50 | This represents a departure from previous versions, where each sample had its own BinGroup. 51 | Running `vamb` in those versions would consider all samples, regardless of their BinGroup. 52 | This change might cause errors if using a `sample.tsv` file from an older Atlas version.
53 | Typically, you can resolve this by assigning a unique BinGroup to each sample. 54 | 55 | The mapping threshold has been adjusted to 95% identity (97% for single-sample binning) to allow reads from different strains, 56 | but not other species, to map to contigs from a different sample. 57 | 58 | If you're co-binning more than 150-200 samples or cross-mapping more than 50 samples, Atlas will issue a warning regarding excessive samples in a BinGroup. 59 | Although VAMB's official publication suggests it can handle up to 1000 samples, this demands substantial resources. 60 | 61 | Therefore, splitting your samples into multiple BinGroups is recommended. 62 | Ideally, related samples, or those where the same species are anticipated, should belong to the same BinGroup. 63 | 64 | Single-sample Binning 65 | --------------------- 66 | 67 | To employ single-sample binning, simply assign each sample to its own BinGroup and select `metabat` or `DASTool` as the `final_binner`. 68 | 69 | Although it's not recommended, it's feasible to use `DASTool` and feed it inputs from `metabat` and other co-abundance-based binners. 70 | 71 | Add the following lines to your `config.yaml`: 72 | 73 | 74 | .. code-block:: yaml 75 | 76 | final_binner: DASTool 77 | 78 | binner: 79 | - metabat 80 | - maxbin 81 | - vamb 82 | 83 | 84 | 85 | .. _longreads: 86 | 87 | Long reads 88 | ========== 89 | 90 | Limitation: Hybrid assembly of long and short reads is supported with spades and metaSpades. 91 | However, metaSpades needs a paired-end short-read library. 92 | 93 | The path of the (preprocessed) long reads should be added manually to the 94 | sample table under a new column heading 'longreads'. 95 | 96 | In addition, the type of the long reads should be defined in the config file: 97 | ``longread_type`` one of ["pacbio", "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"] 98 | 99 | 100 | Example config file 101 | =================== 102 | 103 | 104 | .. include:: ../../config/template_config.yaml 105 | :code: 106 | 107 | 108 | 109 | 110 | Detailed configuration 111 | ====================== 112 | 113 | 114 | .. toctree:: 115 | :maxdepth: 1 116 | 117 | ../advanced/qc 118 | ../advanced/assembly 119 |
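To make the long-read setup described above concrete, here is a purely illustrative sketch of the two additions the section mentions: the `longread_type` key in `config.yaml` and a manually added `longreads` column in the sample table. The sample name, file paths, the `All` BinGroup, and the choice of `nanopore` are assumptions for the example, not values taken from this repository; `assembler` is the same config key the commented-out snippet in test/test_sra.sh modifies.

```
# config.yaml (hypothetical excerpt) for a hybrid spades/metaSpades assembly
assembler: spades
longread_type: nanopore  # one of: pacbio, nanopore, sanger, trusted-contigs, untrusted-contigs

# samples.tsv (hypothetical excerpt, tab-separated): add the 'longreads' column by hand
#          Reads_QC_R1             Reads_QC_R2             BinGroup  longreads
# Sample1  Sample1_QC_R1.fastq.gz  Sample1_QC_R2.fastq.gz  All       Sample1_nanopore.fastq.gz
```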
-------------------------------------------------------------------------------- /prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import pandas as pd 4 | from collections import defaultdict 5 | 6 | 7 | def get_sample_files(path, outfile="samples.tsv"): 8 | samples = defaultdict(dict) 9 | seen = set() 10 | for dir_name, sub_dirs, files in os.walk(os.path.abspath(path)): 11 | for fname in files: 12 | if ".fastq" in fname or ".fq" in fname: 13 | sample_id = fname.split(".fastq")[0].split(".fq")[0] 14 | 15 | sample_id = ( 16 | sample_id.replace("_R1", "") 17 | .replace("_r1", "") 18 | .replace("_R2", "") 19 | .replace("_r2", "") 20 | ) 21 | sample_id = sample_id.replace("_", "-").replace(" ", "-") 22 | 23 | fq_path = os.path.join(dir_name, fname) 24 | 25 | if fq_path in seen: 26 | continue 27 | 28 | if "_R2" in fname or "_r2" in fname: 29 | if "R2" in samples[sample_id]: 30 | logging.error( 31 | f"Duplicate sample {sample_id} was found after renaming; skipping... \n Samples: \n{samples}" 32 | ) 33 | 34 | samples[sample_id]["R2"] = fq_path 35 | else: 36 | if "R1" in samples[sample_id]: 37 | logging.error( 38 | f"Duplicate sample {sample_id} was found after renaming; skipping... \n Samples: \n{samples}" 39 | ) 40 | 41 | samples[sample_id]["R1"] = fq_path 42 | 43 | samples = pd.DataFrame(samples).T 44 | 45 | if samples.isna().any().any(): 46 | logging.error(f"Missing files:\n {samples}") 47 | 48 | if os.path.exists(outfile): 49 | logging.error( 50 | f"Output file {outfile} already exists; I don't dare to overwrite it." 51 | ) 52 | else: 53 | samples.to_csv(outfile, sep="\t") 54 | 55 | return samples 56 | 57 | 58 | if __name__ == "__main__": 59 | import sys 60 | 61 | get_sample_files(sys.argv[1]) 62 | -------------------------------------------------------------------------------- /resources/images/atlas_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metagenome-atlas/atlas/b7694014f0bb3284255325b7abb93d2cf58224e6/resources/images/atlas_image.png -------------------------------------------------------------------------------- /resources/images/atlas_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metagenome-atlas/atlas/b7694014f0bb3284255325b7abb93d2cf58224e6/resources/images/atlas_list.png -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [versioneer] 2 | VCS = git 3 | style = pep440 4 | versionfile_source = atlas/_version.py 5 | versionfile_build = atlas/_version.py 6 | tag_prefix = v 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import versioneer # script in directory 3 | 4 | __author__ = "Silas Kieser, Joe Brown" 5 | __copyright__ = "Copyright 2021, Silas Kieser" 6 | __email__ = "silas.kieser@gmail.com, brwnjm@gmail.com" 7 | __license__ = "BSD-3" 8 | 9 | # read the contents of your README file 10 | from os import path 11 | 12 | this_directory = path.abspath(path.dirname(__file__)) 13 | with open(path.join(this_directory, "README.md"), encoding="utf-8") as f: 14 | long_description = f.read() 15 | 16 | 17 | setup( 18 | name="metagenome-atlas", 19 | version=versioneer.get_version(), 20 | cmdclass=versioneer.get_cmdclass(), 21 | url="https://github.com/metagenome-atlas/atlas", 22 | license=__license__, 23 | author=__author__, 24 | author_email=__email__, 25 | zip_safe=False, 26 | description="ATLAS - workflows for assembly, annotation, and genomic binning of metagenomic and metatranscriptomic data.", 27 | long_description=long_description, 28 | long_description_content_type="text/markdown", 29 | packages=["atlas", "atlas.init"], 30 | package_data={ 31 | "": [ 32 | "workflow", 33 | ] 34 | }, 35 | data_files=[(".", ["README.md", "LICENSE.txt"])], 36 | include_package_data=True, 37 | install_requires=[], 38 | # install via conda: click, pandas, pyyaml, snakemake 39 | entry_points={"console_scripts": ["atlas = atlas.atlas:cli"]}, 40 | classifiers=["Topic :: Scientific/Engineering :: Bio-Informatics"], 41 | ) 42 | -------------------------------------------------------------------------------- /test/dryrun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | 5 | NThreads=2 6 | MaxMem=3 7 | 8 | atlas --version 9 | atlas run --help 10 | 11 | 12 | databaseDir="test/databases" 13 | WD='test/Dryrun' 14 | reads_dir='test/reads/empty' 15 | snakemake_args="
--quiet rules $@ --dryrun " 16 | test_script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 17 | 18 | 19 | 20 | create_reads_dir() { 21 | 22 | local reads_dir="$1" 23 | local N=$2 24 | 25 | echo "touch reads dir: $reads_dir" 26 | 27 | rm -rf $reads_dir 28 | mkdir -p $reads_dir 29 | 30 | for (( i=1; i<=$N; i++ )); do 31 | sample="Sample$i" 32 | 33 | for fraction in R1 R2; 34 | do 35 | touch $reads_dir/${sample}_${fraction}.fastq.gz 36 | done 37 | done 38 | } 39 | 40 | # need at least 10 samples for cobinning 41 | 42 | create_reads_dir $reads_dir 10 43 | 44 | 45 | 46 | 47 | 48 | rm -fr $WD 49 | 50 | echo "Atlas download" 51 | atlas download --db-dir $databaseDir -n 52 | 53 | echo "Init" 54 | atlas init --db-dir $databaseDir --threads=$NThreads -w $WD $reads_dir 55 | 56 | 57 | 58 | 59 | echo "Dryrun all" 60 | atlas run all -w $WD $snakemake_args 61 | 62 | echo "Dryrun strains" 63 | atlas run genomes strains -w $WD $snakemake_args 64 | 65 | 66 | for binner in metabat SemiBin vamb DASTool ; do 67 | 68 | echo " 69 | Dryrun Binner $binner 70 | " 71 | 72 | atlas run binning -w $WD --config final_binner=$binner $snakemake_args 73 | 74 | done 75 | 76 | 77 | # 78 | 79 | echo " 80 | Dryrun with skip QC and megahit 81 | " 82 | # 83 | 84 | rm -fr $WD 85 | 86 | WD=${WD}/noQC 87 | rm -fr $WD 88 | 89 | atlas init --db-dir $databaseDir --skip-qc -w $WD --assembler megahit $reads_dir 90 | 91 | atlas run all -w $WD $snakemake_args 92 | 93 | 94 | echo " 95 | execution with profile 96 | " 97 | 98 | mkdir -p $WD/local 99 | printf 'cores: 2\n' > $WD/local/config.yaml 100 | 101 | atlas run qc -w $WD --profile $WD/local $snakemake_args 102 | 103 | 104 | # clean up 105 | rm -rf $WD $reads_dir 106 | 107 | 108 | 109 | 110 | 111 | 112 | echo " 113 | test with external genomes 114 | " 115 | 116 | bash $test_script_dir/test_external_genomes.sh $snakemake_args 117 | 118 | 119 | 120 | echo " 121 | test init with different samples 122 | " 123 | 124 | bash $test_script_dir/test_init_many_samples.sh $snakemake_args -------------------------------------------------------------------------------- /test/test_assembly.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -euo pipefail 3 | 4 | 5 | 6 | 7 | 8 | atlas --version 9 | 10 | 11 | 12 | samplenames="Mycoplasma Streptococcus" 13 | databaseDir="databases" 14 | WD='test/Test_assembly' 15 | reads_dir="example_data/reads/stub" 16 | 17 | 18 | rm -f $WD/samples.tsv 19 | # 20 | atlas init --db-dir $databaseDir -w $WD $reads_dir 21 | 22 | 23 | atlas run -w $WD qc $@ 24 | 25 | atlas run assembly -w $WD $@ 26 | 27 | echo "copy qc reads and assemble" 28 | 29 | WD2='test/Test_assembly_skipQC' 30 | reads_dir=$WD2/"reads" 31 | 32 | rm -f $WD2/samples.tsv 33 | mkdir -p $reads_dir 34 | cp $WD/*/sequence_quality_control/*_QC_R?.fastq.gz $reads_dir 35 | 36 | atlas init --db-dir $databaseDir --assembler megahit --skip-qc -w $WD2 $reads_dir 37 | 38 | atlas run -w $WD2 assembly $@ 39 | 40 | 41 | echo "start from interleaved QC reads" 42 | 43 | WD3='test/Test_assembly_interleved' 44 | reads_dir=$WD3/"reads" 45 | 46 | rm -f $WD3/samples.tsv 47 | mkdir -p $reads_dir 48 | 49 | for sample in $samplenames ; 50 | do 51 | reformat.sh in=$WD/$sample/sequence_quality_control/${sample}_QC_R1.fastq.gz \ 52 | in2=$WD/$sample/sequence_quality_control/${sample}_QC_R2.fastq.gz out=$reads_dir/${sample}.fastq.gz overwrite=true 53 | done 54 | 55 | atlas init --db-dir $databaseDir --skip-qc --interleaved-fastq -w $WD3 $reads_dir 56 | 57 | atlas run assembly --config threads=2 mem=4 java_mem=4 normalize_reads_before_assembly=true -w $WD3 $@ 58 | -------------------------------------------------------------------------------- /test/test_ci.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -exuo pipefail 4 | 5 | 6 | 7 | 8 | 9 | atlas --version 10 | 11 | # get test reads 12 | wget https://zenodo.org/record/3992790/files/test_reads.tar.gz 13 | tar -xzf test_reads.tar.gz 14 | 15 | 16 | ls -l test_reads 17 | 18 | databaseDir="databases" 19 | WD='test_ci' 20 | reads_dir="test_reads" #"example_data/reads/test" 21 | 22 | 23 | rm -f $WD/samples.tsv 24 | # 25 | atlas init $reads_dir --db-dir $databaseDir -w $WD #--interleaved-fastq 26 | 27 | atlas run None screen -w $WD qc $@ 28 | 29 | echo "\n\nFinished screen\n\n" 30 | 31 | atlas run -w $WD qc $@ 32 | 33 | echo "\n\nFinished qc\n\n" 34 | 35 | 36 | atlas run assembly -w $WD $@ 37 | 38 | echo "\n\nFinished assembly\n\n" 39 | 40 | atlas run binning -w $WD $@ 41 | 42 | echo "\n\nFinished binning\n\n" 43 | 44 | atlas run genecatalog --omit-from combine_egg_nogg_annotations combine_dram_genecatalog_annotations -w $WD $@ 45 | 46 | echo "\n\nFinished genecatalog\n\n" 47 | 48 | # atlas run genomes -w $WD $@ 49 | 50 | # echo "\n\nFinished genomes\n\n" 51 | 52 | # atlas run all -w $WD $@ 53 | 54 | # echo "\n\nFinished all\n\n" 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /test/test_external_genomes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | 5 | NThreads=2 6 | MaxMem=3 7 | 8 | 9 | 10 | 11 | 12 | databaseDir="test/databases" 13 | WD='test/genome_quant' 14 | reads_dir='test/reads/empty' 15 | snakemake_args=" --quiet rules $@ --dryrun " 16 | 17 | echo "touch reads dir" 18 | mkdir -p $reads_dir 19 | for sample in Sample1 Sample2 ; 20 | do 21 | for fraction in R1 R2; 22 | do 23 | touch $reads_dir/${sample}_${fraction}.fastq.gz 24 | done 25 | done 26 | 27 | 28 | rm -fr $WD 29 | 30 | # create genome dir 31 | genome_dir=$WD/other_genomes 32 | 33 | mkdir -p $genome_dir 34 | for i in 1::5 ; 35 | do 36 
| touch $genome_dir/Genome_$i.fasta 37 | done 38 | 39 | echo "Init" 40 | atlas init --db-dir $databaseDir --skip-qc -w $WD $reads_dir 41 | 42 | echo "Run quantify_genomes" 43 | 44 | atlas run quantify_genomes -w $WD --config genome_dir="other_genomes" $snakemake_args 45 | 46 | 47 | echo "Run strains" 48 | 49 | atlas run strains -w $WD --config genome_dir="other_genomes" $snakemake_args 50 | 51 | 52 | rm -rf $WD $reads_dir -------------------------------------------------------------------------------- /test/test_init_many_samples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | 5 | NThreads=2 6 | MaxMem=3 7 | 8 | atlas --version 9 | atlas run --help 10 | 11 | 12 | databaseDir="test/databases" 13 | 14 | 15 | 16 | 17 | create_reads_dir() { 18 | 19 | local reads_dir="$1" 20 | local N=$2 21 | 22 | echo "touch reads dir" 23 | 24 | rm -rf $reads_dir 25 | mkdir -p $reads_dir 26 | 27 | for (( i=1; i<=$N; i++ )); do 28 | sample="Sample$i" 29 | 30 | for fraction in R1 R2; 31 | do 32 | touch $reads_dir/${sample}_${fraction}.fastq.gz 33 | done 34 | done 35 | } 36 | 37 | 38 | 39 | 40 | 41 | for N in 5 10 50 300 ; 42 | do 43 | 44 | echo "test init with $N samples" 45 | 46 | WD="test/test_init/$N" 47 | reads_dir="test/test_init/reads_${N}_samples/" 48 | 49 | rm -rf $WD $reads_dir 50 | 51 | 52 | 53 | create_reads_dir $reads_dir $N 54 | 55 | atlas init --db-dir $databaseDir -w $WD $reads_dir 56 | 57 | done 58 | -------------------------------------------------------------------------------- /test/test_local.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -euo pipefail 4 | 5 | test_script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 6 | debug_dir="$test_script_dir/../../Debug_atlas" 7 | 8 | mkdir -p $debug_dir 9 | cd $debug_dir 10 | 11 | 12 | reads_dir="test_reads" 13 | 14 | snakemake_args=" --quiet rules $@ " 15 | 16 | # if test_reads doean't exist download it 17 | 18 | if [ ! -d "$reads_dir" ]; then 19 | echo "Downloading test reads" 20 | wget https://zenodo.org/record/3992790/files/test_reads.tar.gz 21 | tar -xzf test_reads.tar.gz 22 | rm test_reads.tar.gz 23 | fi 24 | 25 | WD='wd' 26 | 27 | 28 | rm -f $WD/samples.tsv $WD/config.yaml 29 | 30 | # 31 | atlas init $reads_dir -w $WD --assembler megahit 32 | 33 | #atlas run None screen -w $WD $snakemake_args 34 | 35 | # echo "\n\nFinished screen\n\n" 36 | 37 | atlas run -w $WD qc $snakemake_args 38 | 39 | echo "\n\nFinished qc\n\n" 40 | 41 | 42 | atlas run assembly -w $WD $snakemake_args 43 | 44 | echo "\n\nFinished assembly\n\n" 45 | 46 | # atlas run binning -w $WD $snakemake_args 47 | 48 | # echo "\n\nFinished binning\n\n" 49 | 50 | # atlas run genecatalog --omit-from combine_egg_nogg_annotations combine_dram_genecatalog_annotations -w $WD $snakemake_args 51 | 52 | # echo "\n\nFinished genecatalog\n\n" 53 | 54 | # atlas run genomes -w $WD $@ 55 | 56 | # echo "\n\nFinished genomes\n\n" 57 | 58 | # atlas run strains -w $WD $@ 59 | 60 | # echo "\n\nFinished strains\n\n" 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /test/test_sra.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -euo pipefail 3 | 4 | atlas --version 5 | 6 | Test_dir="test/Test_sra_init" 7 | 8 | rm -rf $Test_dir 9 | mkdir -p $Test_dir 10 | 11 | echo "Download reads from our library" 12 | 13 | WD=$Test_dir/"Mouse" 14 | echo "WD="$WD 15 | 16 | atlas init-public PRJEB20796 -w $WD 17 | 18 | echo "Run Atlas" 19 | 20 | atlas run qc -w $WD --dry-run $@ 21 | 22 | 23 | echo "Download reads from HMP" 24 | WD=$Test_dir/"HMP" 25 | echo "WD="$WD 26 | 27 | # this fails as HMP have samples sequenced with different platforms 28 | 29 | 30 | set +e 31 | atlas init-public SRP002423 -w $WD 32 | 33 | set -e 34 | echo "(expected errors)" 35 | 36 | 37 | echo "drop illumina samples" 38 | sed -i.bak '/ILLUMINA/d' $WD/RunInfo.tsv 39 | 40 | # modify assembler as spades cannot handle single end reads 41 | 42 | # python << END 43 | # from ruamel.yaml import YAML 44 | # yaml = YAML() 45 | # config_file="$WD/config.yaml" 46 | # config= yaml.load(open(config_file)) 47 | # config['assembler'] = 'megahit' 48 | # yaml.dump(config, open(config_file, 'w')) 49 | # END 50 | 51 | 52 | echo "create sample table" 53 | atlas init-public continue -w $WD 54 | 55 | echo "Run Atlas" 56 | 57 | atlas run qc -w $WD --dry-run $@ 58 | 59 | ## single end 60 | 61 | echo "Now with a single end sample" 62 | 63 | WD=$Test_dir/"SingleEnd" 64 | echo "WD="$WD 65 | 66 | atlas init-public SAMEA104416160 -w $WD 67 | 68 | atlas run None download_sra -w $WD $@ 69 | 70 | ## smal data 71 | 72 | 73 | echo "Download reads from small dataset for real test" 74 | 75 | WD=$Test_dir/"Small" 76 | echo "WD="$WD 77 | 78 | echo "gives warning as library is selected with PCR" 79 | 80 | atlas init-public SAMEA9831203 SAMEA9831204 -w $WD 81 | 82 | echo "Run Atlas" 83 | 84 | atlas run None download_sra -w $WD $@ -------------------------------------------------------------------------------- /workflow/annotate.smk: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import tempfile 5 | 6 | # import pandas as pd 7 | # import numpy as np 8 | 9 | from snakemake.utils import logger, min_version 10 | 11 | sys.path.append( 12 | os.path.join(os.path.dirname(os.path.abspath(workflow.snakefile)), "scripts") 13 | ) 14 | import utils 15 | 16 | from conf import update_config 17 | 18 | config = update_config(config) 19 | 20 | TMPDIR = config.get("tmpdir", tempfile.gettempdir()) 21 | 22 | # CONDAENV = "envs" # overwrite definition in download.smk 23 | 24 | 25 | include: "rules/dram.smk" 26 | -------------------------------------------------------------------------------- /workflow/envs/DASTool.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - ruby-lang 5 | - defaults 6 | dependencies: 7 | - das_tool=1.1.6 8 | -------------------------------------------------------------------------------- /workflow/envs/busco.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - busco=5.4 7 | -------------------------------------------------------------------------------- /workflow/envs/cd-hit.yaml: -------------------------------------------------------------------------------- 1 | name: cd-hit 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - cd-hit=4.6 8 | # - libgcc=7.2.0=h69d50b8_2 9 | # - libgcc-ng=7.2.0=h7cc24e2_2 10 | # - libstdcxx-ng=7.2.0=h7a57d05_2 11 | 
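A note on how the per-tool environment files in workflow/envs/ are consumed: the rules further down in workflow/rules/ pin them through Snakemake's `conda:` directive (for instance `cd-hit.yaml` in cdhit.smk and `skani.yaml` in derep.smk), so each step gets its own small environment when the workflow is run with `--use-conda`. The following is a minimal sketch of that pattern only; the rule name, file paths, thread count, and identity cutoff are illustrative and not taken from the repository.

```
rule cluster_genes_sketch:
    input:
        "Genecatalog/all_genes/predicted_genes.fna",
    output:
        "Genecatalog/representatives.fasta",
    conda:
        "../envs/cd-hit.yaml"  # path resolved relative to the .smk file
    threads: 4
    # cd-hit-est flags mirror the cluster_genes rule shown later in cdhit.smk
    shell:
        "cd-hit-est -i {input} -o {output} -c 0.95 -T {threads}"
```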
-------------------------------------------------------------------------------- /workflow/envs/checkm.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - checkm-genome =1.1.* 7 | - python =3.6 # needs to be 3.6 8 | -------------------------------------------------------------------------------- /workflow/envs/checkm2.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - checkm2>=1.0.1, <1.1 -------------------------------------------------------------------------------- /workflow/envs/dram.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | dependencies: 5 | - python >=3.8 6 | - altair >=4 7 | - networkx 8 | - numpy 9 | - openpyxl 10 | - pandas >=1.5, <2 11 | - scikit-bio >=0.5.8, <0.6 12 | - sqlalchemy 13 | - prodigal 14 | - scipy >=1.9 15 | - mmseqs2 >10.6d92c 16 | - hmmer 17 | - trnascan-se >=2 18 | - barrnap 19 | - ruby 20 | - parallel 21 | - wget 22 | - curl 23 | - pip 24 | - pip: 25 | - git+https://github.com/SilasK/DRAM.git -------------------------------------------------------------------------------- /workflow/envs/eggNOG.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - eggnog-mapper >=2.1.11, <2.2 7 | - python=3.11 8 | - diamond =2.1 9 | - wget # to download_eggnog_data on macOS 10 | - pandas>=1.5,<2 11 | -------------------------------------------------------------------------------- /workflow/envs/fasta.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - pyfastx=0.9 7 | - pandas=1.2 8 | - pyarrow 9 | - biopython 10 | -------------------------------------------------------------------------------- /workflow/envs/grabseq.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - louiejtaylor 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - grabseqs 8 | -------------------------------------------------------------------------------- /workflow/envs/gtdbtk.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - gtdbtk =2.4 7 | -------------------------------------------------------------------------------- /workflow/envs/gunc.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - gunc=1.0 7 | - pandas=1.5.1 8 | -------------------------------------------------------------------------------- /workflow/envs/hdf.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - python=3.10 7 | - pandas=1.5 8 | - h5py=3.8 9 | - psutil=5.9 10 | - biom-format >=2.1.14, <2.2 11 | - pyarrow 12 | -------------------------------------------------------------------------------- /workflow/envs/instrain.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - 
bioconda 4 | - defaults 5 | dependencies: 6 | - instrain =1.5.* 7 | - pandas>=1.5,<2.0 8 | -------------------------------------------------------------------------------- /workflow/envs/maxbin.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - maxbin2 =2.2.* 7 | -------------------------------------------------------------------------------- /workflow/envs/megahit.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - megahit=1.2 7 | -------------------------------------------------------------------------------- /workflow/envs/metabat.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - metabat2 =2.15 7 | -------------------------------------------------------------------------------- /workflow/envs/minimap.yaml: -------------------------------------------------------------------------------- 1 | name: minimap2 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - minimap2 6 | - samtools 7 | -------------------------------------------------------------------------------- /workflow/envs/mmseqs.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - mmseqs2=13 7 | -------------------------------------------------------------------------------- /workflow/envs/prodigal.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - prodigal =2.6.* 7 | -------------------------------------------------------------------------------- /workflow/envs/report.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - pandas=1.2 7 | - plotly=5.3 8 | -------------------------------------------------------------------------------- /workflow/envs/required_packages.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - python >=3.8, <3.11 7 | - bbmap >= 39.01, <40 8 | - pigz 9 | - bzip2 >=1.0 10 | - pandas >=1.2, <2 11 | - samtools >=1.13, <2 12 | - sambamba >=1.0.1 13 | -------------------------------------------------------------------------------- /workflow/envs/semibin.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - python>=3.8, <3.11 7 | - semibin=1.5 8 | - biopython 9 | -------------------------------------------------------------------------------- /workflow/envs/sequence_utils.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - biopython=1.74 7 | - pandas=1.2 8 | - matplotlib-base 9 | -------------------------------------------------------------------------------- /workflow/envs/skani.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | 
- skani=0.1 -------------------------------------------------------------------------------- /workflow/envs/spades.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - spades>=4.0 7 | -------------------------------------------------------------------------------- /workflow/envs/species_clustering.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - python=3.11 7 | - pandas=2 8 | - pyarrow=11 9 | - networkx=3.1 10 | - scipy=1.10 -------------------------------------------------------------------------------- /workflow/envs/sra.post-deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | vdb-config --quiet -------------------------------------------------------------------------------- /workflow/envs/sra.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | - bioconda 4 | - conda-forge 5 | dependencies: 6 | - sra-tools 7 | - pigz 8 | - parallel-fastq-dump 9 | -------------------------------------------------------------------------------- /workflow/envs/tree.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - defaults 5 | dependencies: 6 | - ete3=3.1.2 7 | -------------------------------------------------------------------------------- /workflow/envs/vamb.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - pytorch 5 | - defaults 6 | dependencies: 7 | - vamb>=3.0 8 | -------------------------------------------------------------------------------- /workflow/report/assembly_report.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | logging.captureWarnings(True) 12 | 13 | 14 | def handle_exception(exc_type, exc_value, exc_traceback): 15 | if issubclass(exc_type, KeyboardInterrupt): 16 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 17 | return 18 | 19 | logging.error( 20 | "".join( 21 | [ 22 | "Uncaught exception: ", 23 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 24 | ] 25 | ) 26 | ) 27 | 28 | 29 | # Install exception handler 30 | sys.excepthook = handle_exception 31 | 32 | #### Begining of scripts 33 | 34 | from common_report import * 35 | 36 | import os, sys 37 | import pandas as pd 38 | import plotly.express as px 39 | 40 | 41 | labels = { 42 | "Percent_Assembled_Reads": "Percent of Assembled Reads", 43 | "contig_bp": "Total BP", 44 | "n_contigs": "Contigs (count)", 45 | "N_Predicted_Genes": "Predicted Genes (count)", 46 | "N50": "N50-number", 47 | "L50": "N50-length (bp)", 48 | "N90": "N90-number", 49 | "L90": "N90-length (bp)", 50 | } 51 | 52 | 53 | PLOT_PARAMS = dict(labels=labels) 54 | 55 | 56 | def make_plots(combined_stats): 57 | ## Make figures with PLOTLY 58 | # load and rename data 59 | df = pd.read_csv(combined_stats, sep="\t", index_col=0) 60 | df.sort_index(ascending=True, inplace=True) 61 | df.index.name = "Sample" 62 | df["Sample"] = df.index 63 | 64 | # create plots 
store in div 65 | div = {} 66 | 67 | fig = px.strip(df, y="Percent_Assembled_Reads", hover_name="Sample", **PLOT_PARAMS) 68 | fig.update_yaxes(range=[0, 100]) 69 | div["Percent_Assembled_Reads"] = fig.to_html(**HTML_PARAMS) 70 | 71 | fig = px.strip(df, y="N_Predicted_Genes", hover_name="Sample", **PLOT_PARAMS) 72 | div["N_Predicted_Genes"] = fig.to_html(**HTML_PARAMS) 73 | 74 | fig = px.scatter(df, y="L50", x="N50", hover_name="Sample", **PLOT_PARAMS) 75 | div["N50"] = fig.to_html(**HTML_PARAMS) 76 | 77 | fig = px.scatter(df, y="L90", x="N90", hover_name="Sample", **PLOT_PARAMS) 78 | div["N90"] = fig.to_html(**HTML_PARAMS) 79 | 80 | fig = px.scatter( 81 | df, y="contig_bp", x="n_contigs", hover_name="Sample", **PLOT_PARAMS 82 | ) 83 | div["Total"] = fig.to_html(**HTML_PARAMS) 84 | 85 | return div 86 | 87 | 88 | # main 89 | 90 | 91 | div = make_plots(combined_stats=snakemake.input.combined_contig_stats) 92 | 93 | 94 | make_html( 95 | div=div, 96 | report_out=snakemake.output.report, 97 | html_template_file=os.path.join(reports_dir, "template_assembly_report.html"), 98 | ) 99 | -------------------------------------------------------------------------------- /workflow/report/bin_report.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | logging.captureWarnings(True) 12 | 13 | 14 | def handle_exception(exc_type, exc_value, exc_traceback): 15 | if issubclass(exc_type, KeyboardInterrupt): 16 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 17 | return 18 | 19 | logging.error( 20 | "".join( 21 | [ 22 | "Uncaught exception: ", 23 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 24 | ] 25 | ) 26 | ) 27 | 28 | 29 | # Install exception handler 30 | sys.excepthook = handle_exception 31 | 32 | #### Begining of scripts 33 | 34 | 35 | from common_report import * 36 | 37 | import pandas as pd 38 | import plotly.express as px 39 | 40 | 41 | from utils.taxonomy import tax2table 42 | 43 | 44 | def make_plots(bin_info): 45 | div = {} 46 | 47 | div["input_file"] = f"{bin_info} and {snakemake.input.bins2species}" 48 | 49 | # Prepare data 50 | df = pd.read_table(bin_info, index_col=0) 51 | df["Bin Id"] = df.index # need it also as column 52 | 53 | # add species info 54 | bin2species = pd.read_table(snakemake.input.bins2species, index_col=0) 55 | df = df.join(bin2species) 56 | 57 | logging.info(df.head()) 58 | 59 | logging.info(bin2species.head()) 60 | 61 | # calculate number of genomes/bins 62 | st = pd.DataFrame(columns=["Bins", "Species"]) 63 | 64 | def add_stats(name, d): 65 | st.loc[name, "Bins"] = d.shape[0] 66 | st.loc[name, "Species"] = d.Representative.unique().shape[0] 67 | 68 | add_stats("All", df) 69 | 70 | df.eval("Quality_score = Completeness - 5* Contamination", inplace=True) 71 | div["QualityScore"] = ( 72 | "

Quality score is calculated as: Completeness - 5 x Contamination.

" 73 | ) 74 | add_stats("Quality score >50 ", df.query("Quality_score>50")) 75 | add_stats("Good quality", df.query("Completeness>90 & Contamination <5")) 76 | add_stats("Quality score >90 ", df.query("Quality_score>90")) 77 | 78 | div["table"] = st.to_html() 79 | 80 | logging.info(df.describe()) 81 | 82 | # Bin Id Completeness completeness_general Contamination completeness_specific completeness_model_used translation_table_used coding_density contig_n50 average_gene_length genome_size gc_content total_coding_sequences additional_notes quality_score sample Ambigious_bases Length_contigs Length_scaffolds N50 N_contigs N_scaffolds logN50 83 | hover_data = [ 84 | "Completeness_Model_Used", 85 | "Coding_Density", 86 | "N50", 87 | "GC_Content", 88 | ] 89 | size_name = "Genome_Size" 90 | 91 | lineage_name = "Species" 92 | 93 | # 2D plot 94 | 95 | logging.info("make 2d plot") 96 | fig = px.scatter( 97 | data_frame=df, 98 | y="Completeness", 99 | x="Contamination", 100 | color=lineage_name, 101 | size=size_name, 102 | hover_data=hover_data, 103 | hover_name="Bin Id", 104 | ) 105 | fig.update_yaxes(range=(50, 102)) 106 | fig.update_xaxes(range=(-0.2, 10.1)) 107 | div["2D"] = fig.to_html(**HTML_PARAMS) 108 | 109 | # 2D plot 110 | 111 | logging.info("make 2d plot species") 112 | fig = px.scatter( 113 | data_frame=df.loc[df.Representative.unique()], 114 | y="Completeness", 115 | x="Contamination", 116 | color=lineage_name, 117 | size=size_name, 118 | hover_data=hover_data, 119 | hover_name="Bin Id", 120 | ) 121 | fig.update_yaxes(range=(50, 102)) 122 | fig.update_xaxes(range=(-0.2, 10.1)) 123 | div["2Dsp"] = fig.to_html(**HTML_PARAMS) 124 | 125 | ## By sample 126 | logging.info("plot by sample") 127 | fig = px.strip( 128 | data_frame=df, 129 | y="Quality_score", 130 | x="Sample", 131 | color=lineage_name, 132 | hover_data=hover_data, 133 | hover_name="Bin Id", 134 | ) 135 | fig.update_yaxes(range=(50, 102)) 136 | div["bySample"] = fig.to_html(**HTML_PARAMS) 137 | 138 | # # By species 139 | # logging.info("plot by species") 140 | # fig = px.strip( 141 | # data_frame=df, 142 | # y="Quality_score", 143 | # x=lineage_name, 144 | # hover_data=hover_data, 145 | # hover_name="Bin Id", 146 | # ) 147 | # fig.update_yaxes(range=(50, 102)) 148 | # div["byPhylum"] = fig.to_html(**HTML_PARAMS) 149 | 150 | return div 151 | 152 | 153 | # main 154 | 155 | 156 | div = make_plots(bin_info=snakemake.input.bin_info) 157 | 158 | 159 | make_html( 160 | div=div, 161 | report_out=snakemake.output.report, 162 | html_template_file=os.path.join(reports_dir, "template_bin_report.html"), 163 | wildcards=snakemake.wildcards, 164 | ) 165 | -------------------------------------------------------------------------------- /workflow/report/common_report.py: -------------------------------------------------------------------------------- 1 | import plotly.io as pio 2 | import os, sys 3 | 4 | atlas_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 5 | 6 | reports_dir = os.path.join(atlas_dir, "report") 7 | 8 | sys.path.append(os.path.join(atlas_dir, "scripts")) 9 | 10 | 11 | pio.templates.default = "simple_white" 12 | HTML_PARAMS = dict( 13 | include_plotlyjs=False, 14 | full_html=False, 15 | ) 16 | 17 | 18 | ## make html report 19 | 20 | 21 | def make_html( 22 | html_template_file, 23 | report_out, 24 | div, 25 | css_file=os.path.join(reports_dir, "report.css"), 26 | wildcards={}, 27 | ): 28 | html_template = open(html_template_file).read() 29 | css_content = open(css_file).read() 30 | 31 | html_string = 
html_template.format(div=div, css_content=css_content, **wildcards) 32 | 33 | with open(report_out, "w") as outf: 34 | outf.write(html_string) 35 | -------------------------------------------------------------------------------- /workflow/report/report.css: -------------------------------------------------------------------------------- 1 | /* Overrides of notebook CSS for static HTML export */ 2 | body { 3 | overflow: visible; 4 | font-size: 14pt; 5 | padding: 8px; 6 | margin:0 100; 7 | background:whitesmoke; 8 | } 9 | 10 | h1 { 11 | text-align: center 12 | } 13 | 14 | p { 15 | font-size: 14pt; 16 | } 17 | 18 | .float-container { 19 | padding: 2px; 20 | height:100%; 21 | width:100%; 22 | } 23 | 24 | .float-child { 25 | width: 50%; 26 | float: left; 27 | padding: 2px; 28 | } 29 | 30 | @media not print { 31 | #notebook-container { 32 | padding: 15px; 33 | background-color: #fff; 34 | min-height: 0; 35 | -webkit-box-shadow: 0px 0px 12px 1px rgba(87, 87, 87, 0.2); 36 | box-shadow: 0px 0px 12px 1px rgba(87, 87, 87, 0.2); 37 | } 38 | } 39 | @media print { 40 | #notebook-container { 41 | width: 100%; 42 | } 43 | -------------------------------------------------------------------------------- /workflow/report/template_QC_report.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Metagenome-Atlas - QC Report 5 | 6 | 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 |

Quality Control Report

20 | 21 | 22 |

Number of reads that went through the quality control process.

23 | 24 | {div[Reads]} 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 |
Step | Output
raw | the input reads
deduplicated | after (optional) deduplication step
filtered | trimmed, quality filtered
qc | final reads, contaminants removed
52 | 53 |

Total number of reads/bases after QC

54 | 55 |
56 | 57 |
58 | {div[Total_Reads]} 59 |
60 | 61 |
62 | {div[Total_Bases]} 63 |
64 | 65 |
66 | 67 | 68 |

Base quality values along reads

69 | 70 | {div[quality_QC]} 71 | 72 |

Read length

73 | 74 | 75 | {div[Length]} 76 | 77 |

Insert size

78 |

The length of the two reads plus the space between them. Ideally, the paired-end reads don't overlap.

79 | 80 | {div[Insert]} 81 | 82 |
83 | 84 | 85 | -------------------------------------------------------------------------------- /workflow/report/template_assembly_report.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Metagenome-Atlas - Assembly Report 5 | 6 | 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 |
18 | 19 |

Assembly Summary

20 | 21 | 22 |

Total assembly length

23 | 24 | {div[Total]} 25 | 26 | 27 |

Fragmentation

28 | 29 |

30 | N50/N90 is a measure of how fragmented an assembly is: 31 | 50%/90% of the assembly consists of contigs of length N50/N90 or longer. 32 | You need the N50/N90-number largest contigs to cover 50%/90% of the total assembly length. 33 |
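As an illustration with made-up numbers (not taken from any report): for a 100 Mb assembly, if the 250 largest contigs together reach 50 Mb and the smallest of those 250 contigs is 120 kb long, then the N50-number is 250 and the N50-length is 120 kb; the N90 values are obtained the same way at the 90 Mb mark.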

34 | 35 | 36 |
37 | 38 |
39 | {div[N50]} 40 |
41 | 42 |
43 | {div[N90]} 44 |
45 | 46 |
47 | 48 |

Genes / Reads

49 | 50 |
51 | 52 |
53 | {div[N_Predicted_Genes]} 54 |
55 | 56 |
57 | {div[Percent_Assembled_Reads]} 58 |
59 | 60 |
61 | 62 | 63 |

64 |
65 | 66 | 67 | -------------------------------------------------------------------------------- /workflow/report/template_bin_report.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {binner} Metagenome-Atlas - Bin Report 6 | 7 | 8 | 9 | 10 | 13 | 14 | 15 | 16 | 17 | 18 | 19 |
20 | 21 |

Bin Report for Binner {binner}

22 |

Genome completeness, contamination, and taxonomy were estimated using CheckM2.

23 | {div[QualityScore]} 24 |

For the complete information, see the file {div[input_file]}

25 | 26 |

Number of genomes

27 | {div[table]} 28 | 29 |

"Good quality" refers to the standard of Completeness > 90% and Contamination < 5%. Also called high-quality or near-complete. But t-RNA/r-RNA presence is not evaluated. It is less stingent than Quality Score > 90.

30 | 31 |

Quality for all bins

32 | {div[2D]} 33 | 34 | 35 |

Quality for Species representatives

36 | {div[2Dsp]} 37 | 38 | 39 | 40 |

Quality score by Sample

41 | 42 | 43 | 44 | {div[bySample]} 45 | 46 | 47 | 48 | 49 |
50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /workflow/rules/cdhit.smk: -------------------------------------------------------------------------------- 1 | def parse_cd_hit_file(clstr_file): 2 | """ 3 | 4 | >Cluster 0 5 | 0 342nt, >S1_83_1... * 6 | 1 342nt, >S2_82_1... at +/100.00% 7 | >Cluster 1 8 | 0 339nt, >S1_61_1... * 9 | 1 339nt, >S2_59_1... at +/100.00% 10 | 11 | 12 | """ 13 | import numpy as np 14 | 15 | def parse_line(line): 16 | _, length, name, identity = ( 17 | line.strip().replace("...", "\t").replace(", ", "\t").split("\t") 18 | ) 19 | 20 | length = int(length.replace("nt", "")) 21 | name = name[1:] 22 | if "*" in identity: 23 | identity = np.nan 24 | else: 25 | identity = float(identity[identity.rfind("/") + 1 : identity.rfind("%")]) 26 | 27 | return name, length, identity 28 | 29 | Clusters = [] 30 | with open(clstr_file) as f: 31 | for line in f: 32 | if line[0] == ">": # new cluster 33 | cluster = dict(elements=[], representative=None) 34 | Clusters.append(cluster) 35 | else: 36 | name, length, identity = parse_line(line) 37 | cluster["elements"].append((name, length, identity)) 38 | if np.isnan(identity): 39 | cluster["representative"] = name 40 | return Clusters 41 | 42 | 43 | def write_cd_hit_clusters(Clusters, file_handle): 44 | for cluster in Clusters: 45 | for element in cluster["elements"]: 46 | file_handle.write( 47 | f"{element[0]}\t{element[1]}\t{element[2]}\t{cluster['representative']}\n" 48 | ) 49 | 50 | 51 | localrules: 52 | parse_clstr_files, 53 | rename_gene_clusters, 54 | 55 | 56 | rule cluster_genes: 57 | input: 58 | fna_dir="Genecatalog/all_genes/predicted_genes.fna", 59 | output: 60 | temp("Genecatalog/representatives_of_clusters.fasta"), 61 | temp("Genecatalog/gene_catalog_oldnames.clstr"), 62 | conda: 63 | "%s/cd-hit.yaml" % CONDAENV 64 | log: 65 | "logs/Genecatalog/cluster_genes.log", 66 | threads: config.get("threads", 1) 67 | resources: 68 | mem_mb=config["mem"] * 1000, 69 | params: 70 | coverage=config["genecatalog"]["coverage"], 71 | identity=config["genecatalog"]["minid"], 72 | extra=config["genecatalog"]["extra"], 73 | prefix=lambda wc, output: os.path.splitext(output[1])[0], 74 | shell: 75 | """ 76 | cd-hit-est -i {input} -T {threads} \ 77 | -M {resources.mem}000 -o {params.prefix} \ 78 | -c {params.identity} -n 9 -d 0 {params.extra} \ 79 | -aS {params.coverage} -aL {params.coverage} &> {log} 80 | 81 | mv {params.prefix} {output[0]} 2>> {log} 82 | """ 83 | 84 | 85 | rule parse_clstr_files: 86 | input: 87 | clustered_dir="Genecatalog/gene_catalog_oldnames.clstr", 88 | output: 89 | temp("Genecatalog/orf2gene_oldnames.tsv"), 90 | run: 91 | with open(output[0], "w") as fout: 92 | fout.write(f"ORF\tLength\tIdentity\tRepresentative\n") 93 | Clusters = parse_cd_hit_file(input[0]) 94 | write_cd_hit_clusters(Clusters, fout) 95 | 96 | 97 | rule generate_orf_info: 98 | input: 99 | cluster_attribution="Genecatalog/orf2gene_oldnames.tsv", 100 | output: 101 | cluster_attribution="Genecatalog/clustering/orf_info.parquet", 102 | rep2genenr="Genecatalog/clustering/representative2genenr.tsv", 103 | threads: 1 104 | run: 105 | import pandas as pd 106 | import numpy as np 107 | 108 | from utils import gene_scripts 109 | 110 | # cd hit format ORF\tLength\tIdentity\tRepresentative\n 111 | orf2gene = pd.read_csv(input.orf2gene, sep="\t") 112 | 113 | # rename gene repr to Gene0000XX 114 | 115 | # split orf names in sample, contig_nr, and orf_nr 116 | orf_info = gene_scripts.split_orf_to_index(orf2gene.ORF) 117 
| 118 | # rename representative 119 | 120 | representative_names = orf2gene.Representative.unique() 121 | 122 | map_names = pd.Series( 123 | index=representative_names, 124 | data=np.arange(1, len(representative_names) + 1, dtype=np.uint), 125 | ) 126 | 127 | 128 | orf_info["GeneNr"] = orf2gene.Representative.map(map_names) 129 | 130 | 131 | orf_info.to_parquet(output.cluster_attribution) 132 | 133 | 134 | # Save name of representatives 135 | map_names.index.name = "Representative" 136 | map_names.name = "GeneNr" 137 | map_names.to_csv(output.rep2genenr, sep="\t") 138 | -------------------------------------------------------------------------------- /workflow/rules/derep.smk: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | rule run_skani: 5 | input: 6 | paths="Binning/{binner}/filtered_bins_paths.txt", 7 | output: 8 | temp("Intermediate/dereplication/{binner}_distance_matrix.txt"), 9 | log: 10 | "logs/binning/{binner}/dereplication/skani_calculation.log", 11 | resources: 12 | mem_mb=config["mem"] * 1000, 13 | time_min=60 * config["runtime"]["default"], 14 | params: 15 | #preset= "medium", # fast, medium or slow 16 | min_af=config["genome_dereplication"]["overlap"] * 100, 17 | extra="", 18 | threads: config["threads"] 19 | conda: 20 | "../envs/skani.yaml" 21 | shell: 22 | "skani triangle " 23 | " {params.extra} " 24 | " -l {input.paths} " 25 | " -o {output} " 26 | " -t {threads} " 27 | " --sparse --ci " 28 | " --min-af {params.min_af} " 29 | " &> {log} " 30 | 31 | 32 | rule skani_2_parquet: 33 | input: 34 | rules.run_skani.output, 35 | output: 36 | "Binning/{binner}/genome_similarities.parquet", 37 | resources: 38 | mem_mb=config["mem"] * 1000, 39 | time_min=60 * config["runtime"]["simplejob"], 40 | log: 41 | "logs/binning/{binner}/dereplication/skani_2_parquet.log", 42 | threads: 1 43 | run: 44 | try: 45 | skani_column_dtypes = { 46 | "Ref_file": "category", 47 | "Query_file": "category", 48 | "ANI": float, 49 | "Align_fraction_ref": float, 50 | "Align_fraction_query": float, 51 | "ANI_5_percentile": float, 52 | "ANI_95_percentile": float, 53 | } # Ref_name Query_name 54 | 55 | import pandas as pd 56 | 57 | import pandas as pd 58 | 59 | df = pd.read_table(input[0]) 60 | 61 | from utils.io import simplify_path 62 | 63 | df = pd.read_table( 64 | input[0], 65 | usecols=list(skani_column_dtypes.keys()), 66 | dtype=skani_column_dtypes, 67 | ) 68 | 69 | df["Ref"] = df.Ref_file.cat.rename_categories(simplify_path) 70 | df["Query"] = df.Query_file.cat.rename_categories(simplify_path) 71 | 72 | df.to_parquet(output[0]) 73 | 74 | except Exception as e: 75 | import traceback 76 | 77 | with open(log[0], "w") as logfile: 78 | traceback.print_exc(file=logfile) 79 | 80 | raise e 81 | 82 | 83 | rule cluster_species: 84 | input: 85 | dist="Binning/{binner}/genome_similarities.parquet", 86 | bin_info="Binning/{binner}/filtered_bin_info.tsv", 87 | params: 88 | linkage_method="average", 89 | pre_cluster_threshold=0.925, 90 | threshold=config["genome_dereplication"]["ANI"], 91 | conda: 92 | "../envs/species_clustering.yaml" 93 | log: 94 | "logs/binning/{binner}/dereplication/species_clustering.log", 95 | output: 96 | bin_info="Binning/{binner}/bin_info.tsv", 97 | bins2species="Binning/{binner}/bins2species.tsv", 98 | script: 99 | "../scripts/cluster_species.py" 100 | 101 | 102 | rule build_bin_report: 103 | input: 104 | bin_info="Binning/{binner}/bin_info.tsv", 105 | bins2species="Binning/{binner}/bins2species.tsv", 106 | output: 107 | 
report="reports/bin_report_{binner}.html", 108 | conda: 109 | "../envs/report.yaml" 110 | log: 111 | "logs/binning/report_{binner}.log", 112 | script: 113 | "../report/bin_report.py" 114 | -------------------------------------------------------------------------------- /workflow/rules/dram.smk: -------------------------------------------------------------------------------- 1 | DBDIR = config["database_dir"] 2 | 3 | 4 | def get_dram_config(wildcards): 5 | old_dram_path = f"{DBDIR}/Dram" 6 | if Path(old_dram_path).exists(): 7 | logger.error( 8 | f"Detected an old database for DRAM in {old_dram_path}. You can delete it." 9 | ) 10 | 11 | return config.get("dram_config_file", f"{DBDIR}/DRAM/DRAM.config") 12 | 13 | 14 | localrules: 15 | dram_download, 16 | concat_annotations, 17 | 18 | 19 | rule dram_download: 20 | output: 21 | dbdir=directory(f"{DBDIR}/DRAM/db/"), 22 | config=f"{DBDIR}/DRAM/DRAM.config", 23 | threads: config["threads"] 24 | resources: 25 | mem_mb=config["mem"] * 1000, 26 | time_min=60 * config["runtime"]["default"], 27 | log: 28 | "logs/dram/download_dram.log", 29 | benchmark: 30 | "logs/benchmarks/dram/download_dram.tsv" 31 | conda: 32 | "../envs/dram.yaml" 33 | shell: 34 | " DRAM-setup.py prepare_databases " 35 | " --output_dir {output.dbdir} " 36 | " --threads {threads} " 37 | " --verbose " 38 | " --skip_uniref " 39 | " &> {log} " 40 | " ; " 41 | " DRAM-setup.py export_config --output_file {output.config}" 42 | 43 | 44 | rule DRAM_annotate: 45 | input: 46 | fasta="genomes/genomes/{genome}.fasta", 47 | #checkm= "genomes/checkm/completeness.tsv", 48 | #gtdb_dir= "genomes/taxonomy/gtdb/classify", 49 | config=get_dram_config, 50 | output: 51 | outdir=directory("genomes/annotations/dram/intermediate_files/{genome}"), 52 | threads: config["simplejob_threads"] 53 | resources: 54 | mem_mb=config["simplejob_mem"] * 1000, 55 | time_min=60 * config["runtime"]["default"], 56 | conda: 57 | "../envs/dram.yaml" 58 | params: 59 | extra=config.get("dram_extra", ""), 60 | min_contig_size=config.get("minimum_contig_length", "1000"), 61 | log: 62 | "logs/dram/run_dram/{genome}.log", 63 | benchmark: 64 | "logs/benchmarks/dram/run_dram/{genome}.tsv" 65 | shell: 66 | " DRAM.py annotate " 67 | " --config_loc {input.config} " 68 | " --input_fasta {input.fasta}" 69 | " --output_dir {output.outdir} " 70 | " --threads {threads} " 71 | " --min_contig_size {params.min_contig_size} " 72 | " {params.extra} " 73 | " --verbose &> {log}" 74 | #" --gtdb_taxonomy {input.gtdb_dir}/{params.gtdb_file} " 75 | #" --checkm_quality {input.checkm} " 76 | 77 | 78 | def get_all_dram(wildcards): 79 | all_genomes = get_all_genomes(wildcards) 80 | 81 | return expand(rules.DRAM_annotate.output.outdir, genome=all_genomes) 82 | 83 | 84 | DRAM_ANNOTATON_FILES = ["annotations.tsv"] 85 | 86 | 87 | rule concat_annotations: 88 | input: 89 | get_all_dram, 90 | output: 91 | expand("genomes/annotations/dram/{annotation}", annotation=DRAM_ANNOTATON_FILES), 92 | resources: 93 | time_min=60 * config["runtime"]["default"], 94 | run: 95 | from utils import io 96 | 97 | for i, annotation_file in enumerate(DRAM_ANNOTATON_FILES): 98 | input_files = [ 99 | os.path.join(dram_folder, annotation_file) for dram_folder in input 100 | ] 101 | 102 | io.pandas_concat( 103 | input_files, output[i], sep="\t", index_col=0, axis=0, disk_based=True 104 | ) 105 | 106 | 107 | rule DRAM_destill: 108 | input: 109 | rules.concat_annotations.output, 110 | config=get_dram_config, 111 | output: 112 | outdir=directory("genomes/annotations/dram/distil"), 113 | 
threads: 1 114 | resources: 115 | mem_mb=config["simplejob_mem"] * 1000, 116 | ttime_min=60 * config["runtime"]["simplejob"], 117 | conda: 118 | "../envs/dram.yaml" 119 | log: 120 | "logs/dram/distil.log", 121 | shell: 122 | " DRAM.py distill " 123 | " --config_loc {input.config} " 124 | " --input_file {input[0]}" 125 | " --output_dir {output} " 126 | " &> {log}" 127 | 128 | 129 | rule get_all_modules: 130 | input: 131 | annotations="genomes/annotations/dram/annotations.tsv", 132 | config=get_dram_config, 133 | output: 134 | "genomes/annotations/dram/kegg_modules.tsv", 135 | threads: 1 136 | resources: 137 | mem_mb=config["simplejob_mem"] * 1000, 138 | time_min=60 * config["runtime"]["default"], 139 | conda: 140 | "../envs/dram.yaml" 141 | log: 142 | "logs/dram/get_all_modules.log", 143 | script: 144 | "../scripts/DRAM_get_all_modules.py" 145 | 146 | 147 | rule dram: 148 | input: 149 | "genomes/annotations/dram/distil", 150 | "genomes/annotations/dram/kegg_modules.tsv", 151 | -------------------------------------------------------------------------------- /workflow/rules/gtdbtk.smk: -------------------------------------------------------------------------------- 1 | gtdb_dir = "genomes/taxonomy/gtdb" 2 | 3 | 4 | rule identify: 5 | input: 6 | flag=rules.extract_gtdb.output, 7 | genes_flag="genomes/annotations/genes/predicted", 8 | output: 9 | directory(f"{gtdb_dir}/identify"), 10 | threads: config["threads"] 11 | conda: 12 | "../envs/gtdbtk.yaml" 13 | log: 14 | "logs/taxonomy/gtdbtk/identify.txt", 15 | f"{gtdb_dir}/gtdbtk.log", 16 | params: 17 | outdir=gtdb_dir, 18 | extension="faa", 19 | gene_dir=lambda wc, input: os.path.abspath(os.path.dirname(input.genes_flag)), 20 | shell: 21 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; ' 22 | "gtdbtk identify " 23 | "--genes --genome_dir {params.gene_dir} " 24 | " --out_dir {params.outdir} " 25 | "--extension {params.extension} " 26 | "--cpus {threads} &> {log[0]}" 27 | 28 | 29 | checkpoint align: 30 | input: 31 | f"{gtdb_dir}/identify", 32 | output: 33 | directory(f"{gtdb_dir}/align"), 34 | threads: config["threads"] 35 | resources: 36 | mem_mb=config["large_mem"] * 1000, 37 | conda: 38 | "../envs/gtdbtk.yaml" 39 | log: 40 | "logs/taxonomy/gtdbtk/align.txt", 41 | f"{gtdb_dir}/gtdbtk.log", 42 | params: 43 | outdir=gtdb_dir, 44 | shell: 45 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; ' 46 | "gtdbtk align --identify_dir {params.outdir} --out_dir {params.outdir} " 47 | "--cpus {threads} &> {log[0]}" 48 | 49 | 50 | rule classify: 51 | input: 52 | rules.align.output, 53 | genome_dir=genome_dir, 54 | output: 55 | directory(f"{gtdb_dir}/classify"), 56 | threads: config["threads"] #pplacer needs much memory for not many threads 57 | resources: 58 | mem_mb=config["large_mem"] * 1000, 59 | time_min=60 * config["runtime"]["long"], 60 | conda: 61 | "../envs/gtdbtk.yaml" 62 | log: 63 | "logs/taxonomy/gtdbtk/classify.txt", 64 | f"{gtdb_dir}/gtdbtk.log", 65 | params: 66 | outdir=gtdb_dir, 67 | extension="fasta", 68 | mashdir=Path(GTDBTK_DATA_PATH) / "mash_db", 69 | shell: 70 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; ' 71 | "gtdbtk classify --genome_dir {input.genome_dir} --align_dir {params.outdir} " 72 | " --mash_db {params.mashdir} " 73 | "--out_dir {params.outdir} " 74 | " --tmpdir {resources.tmpdir} " 75 | "--extension {params.extension} " 76 | "--cpus {threads} &> {log[0]}" 77 | 78 | 79 | rule combine_taxonomy: 80 | input: 81 | folder=f"{gtdb_dir}/classify", 82 | output: 83 | combined=f"{gtdb_dir}/gtdbtk.combined.summary.tsv", 84 | 
taxonomy="genomes/taxonomy/gtdb_taxonomy.tsv", 85 | log: 86 | "logs/taxonomy/gtdbtk/combine.txt", 87 | script: 88 | "../scripts/combine_taxonomy.py" 89 | 90 | 91 | rule build_tree: 92 | input: 93 | f"{gtdb_dir}/align/{{msa}}.user_msa.fasta.gz", 94 | output: 95 | temp("genomes/taxonomy/gtdb/{msa}.unrooted.tree"), 96 | log: 97 | "logs/genomes/tree/{msa}.log", 98 | "logs/genomes/tree/{msa}.err", 99 | threads: max(config["threads"], 3) 100 | params: 101 | outdir=lambda wc, output: Path(output[0]).parent, 102 | conda: 103 | "../envs/gtdbtk.yaml" 104 | shell: 105 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; ' 106 | "gtdbtk infer --msa_file {input} " 107 | " --out_dir {params.outdir} " 108 | " --prefix {wildcards.msa} " 109 | " --cpus {threads} " 110 | "--tmpdir {resources.tmpdir} > {log[0]} 2> {log[1]}" 111 | 112 | 113 | localrules: 114 | root_tree, 115 | 116 | 117 | rule root_tree: 118 | input: 119 | tree=rules.build_tree.output[0], 120 | wildcard_constraints: 121 | msa="((?!unrooted).)*", 122 | output: 123 | tree="genomes/tree/{msa}.nwk", 124 | conda: 125 | "../envs/tree.yaml" 126 | threads: 1 127 | resources: 128 | mem_mb=config["simplejob_mem"] * 1000, 129 | ttime_min=60 * config["runtime"]["simplejob"], 130 | log: 131 | "logs/genomes/tree/root_tree_{msa}.log", 132 | script: 133 | "../scripts/root_tree.py" 134 | 135 | 136 | def all_gtdb_trees_input(wildcards): 137 | dir = checkpoints.align.get().output[0] 138 | 139 | domains = glob_wildcards(f"{dir}/gtdbtk.{{domain}}.user_msa.fasta.gz").domain 140 | 141 | return expand("genomes/tree/gtdbtk.{domain}.nwk", domain=domains) 142 | 143 | 144 | rule all_gtdb_trees: 145 | input: 146 | all_gtdb_trees_input, 147 | output: 148 | touch("genomes/tree/finished_gtdb_trees"), 149 | -------------------------------------------------------------------------------- /workflow/rules/patch.smk: -------------------------------------------------------------------------------- 1 | localrules: 2 | copy_assembly, 3 | 4 | 5 | # Rules that are usefull temporarily to update to new version of atlas 6 | 7 | 8 | ruleorder: copy_assembly > finalize_contigs 9 | 10 | 11 | rule copy_assembly: 12 | input: 13 | "{sample}/{sample}_contigs.fasta", 14 | output: 15 | "Assembly/fasta/{sample}.fasta", 16 | shell: 17 | "cp {input} {output}" 18 | -------------------------------------------------------------------------------- /workflow/rules/predict_genes_of_genomes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os, sys 4 | import logging, traceback 5 | 6 | logging.basicConfig( 7 | filename=snakemake.log[0], 8 | level=logging.INFO, 9 | format="%(asctime)s %(message)s", 10 | datefmt="%Y-%m-%d %H:%M:%S", 11 | ) 12 | 13 | logging.captureWarnings(True) 14 | 15 | 16 | def handle_exception(exc_type, exc_value, exc_traceback): 17 | if issubclass(exc_type, KeyboardInterrupt): 18 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 19 | return 20 | 21 | logging.error( 22 | "".join( 23 | [ 24 | "Uncaught exception: ", 25 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 26 | ] 27 | ) 28 | ) 29 | 30 | 31 | # Install exception handler 32 | sys.excepthook = handle_exception 33 | 34 | #### Begining of scripts 35 | 36 | # python 3.5 without f strings 37 | 38 | import os, shutil, sys 39 | import uuid 40 | import itertools 41 | from glob import glob 42 | from snakemake.shell import shell 43 | from snakemake.io import glob_wildcards 44 | from multiprocessing import Pool 45 | 46 | 47 | def 
predict_genes(genome, fasta, out_dir, log): 48 | fna = "{}/{}.fna".format(out_dir, genome) 49 | faa = "{}/{}.faa".format(out_dir, genome) 50 | gff = "{}/{}.gff".format(out_dir, genome) 51 | 52 | shell('printf "{genome}:\n" > {log}'.format(genome=genome, log=log)) 53 | shell( 54 | "prodigal -i {fasta} -o {gff} -d {fna} -a {faa} -p single -c -m -f gff 2>> {log} ".format( 55 | fasta=fasta, log=log, gff=gff, fna=fna, faa=faa 56 | ) 57 | ) 58 | shell('printf "\n" >> {log}'.format(log=log)) 59 | 60 | 61 | def predict_genes_genomes(input_dir, out_dir, log, threads): 62 | genomes_fastas = glob(os.path.join(input_dir, "*.fasta")) 63 | 64 | os.makedirs(out_dir, exist_ok=True) 65 | 66 | temp_log_dir = os.path.join(os.path.dirname(log), "tmp_" + uuid.uuid4().hex) 67 | os.makedirs(temp_log_dir, exist_ok=False) 68 | 69 | genome_names = [] 70 | log_names = [] 71 | for fasta in genomes_fastas: 72 | genome_name = os.path.splitext(os.path.split(fasta)[-1])[0] 73 | genome_names.append(genome_name) 74 | log_names.append(os.path.join(temp_log_dir, genome_name + ".prodigal.tmp")) 75 | 76 | pool = Pool(threads) 77 | pool.starmap( 78 | predict_genes, 79 | zip(genome_names, genomes_fastas, itertools.repeat(out_dir), log_names), 80 | ) 81 | 82 | # cat in python 83 | with open(log, "ab") as f_out: 84 | for logfile in log_names: 85 | with open(logfile, "rb") as f_in: 86 | shutil.copyfileobj(f_in, f_out) 87 | 88 | shell("rm -r {temp_log_dir}".format(temp_log_dir=temp_log_dir)) 89 | 90 | 91 | if __name__ == "__main__": 92 | predict_genes_genomes( 93 | snakemake.input.dir, 94 | snakemake.output[0], 95 | snakemake.log[0], 96 | int(snakemake.threads), 97 | ) 98 | -------------------------------------------------------------------------------- /workflow/rules/scg_blank_diamond.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # The MIT License (MIT) 4 | # Copyright (c) 2016 Alexander J Probst 5 | 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
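# Overview of this helper (descriptive comment; the behaviour is read off the code below):
# it identifies single-copy genes (SCGs) in a protein FASTA using DIAMOND. It
# (1) builds a DIAMOND database of all proteins of all genomes and one of the input proteins,
# (2) searches the reference SCG set against the input proteins to collect candidate SCGs,
# (3) extracts the candidates with pullseq and searches them against the all-protein database,
# (4) keeps candidates whose best hit is a known SCG in the lookup table and whose alignment
#     covers more than half of that hit's length, writing "<gene>\t<annotation>" lines to "<input>.scg".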
11 | 12 | # https://github.com/AJProbst/sngl_cp_gn 13 | 14 | #1: $search_engine name 15 | #2: $proteins 16 | #3: $DIR\/db/bac.all.faa 17 | #4: $DIR\/db/bac.scg.faa 18 | #5: $DIR\/db/bac.scg.lookup 19 | #6: $threads 20 | 21 | d = ARGV[0] 22 | 23 | input_file = ARGV[1] 24 | output_dir = File.dirname(input_file) 25 | 26 | datab = ARGV[2] 27 | db_all = File.dirname(input_file) + "/all_prot" 28 | puts "database name of all proteins is #{datab}" 29 | 30 | db_name = ARGV[3] 31 | puts "database name of SCGs is #{db_name}" 32 | 33 | db_lookup = ARGV[4] 34 | puts "database lookup is #{db_lookup}" 35 | 36 | threads = ARGV[5] 37 | 38 | #build databases 39 | full_db = system "#{d} makedb --in #{datab} -d #{db_all}.dmnd" 40 | abort "makeblastdb did not work for #{datab}, please check your input file" unless full_db 41 | 42 | # find SCG candidates 43 | puts "finding SCG candidates..." 44 | input_blast_database = system "#{d} makedb --in #{input_file} -d #{input_file}.dmnd" 45 | input_blast_out = File.join(output_dir,File.basename(input_file) + ".findSCG.b6") 46 | abort "makeblastdb did not work for #{input_file}, please check your input file" unless input_blast_database 47 | input_blast_ok = system "#{d} blastp --query #{db_name} --db #{input_file}.dmnd --max-target-seqs 0 --outfmt 6 qseqid sseqid pident length qlen slen evalue bitscore --out #{input_blast_out} --evalue 0.01 --threads #{threads}" 48 | system "rm #{input_file}.dmnd" 49 | abort "blast did not work, please check your input file." unless input_blast_ok 50 | 51 | input_blast_out_whitelist = File.join(output_dir,File.basename(input_file) + ".findSCG.b6.whitelist") 52 | system "awk '{print$2}' #{input_blast_out} | sort -u > #{input_blast_out_whitelist}" 53 | scg_candidates = File.join(output_dir,File.basename(input_file) + ".scg.candidates.faa") 54 | system "pullseq -i #{input_file} -n #{input_blast_out_whitelist} > #{scg_candidates}" 55 | system "rm #{input_blast_out_whitelist}" 56 | 57 | # verify SCGs by blasting against all proteins of all genomes 58 | puts "verifying selected SCGs..." 59 | db_blast_out = File.join(output_dir,File.basename(input_file) + ".all.b6") 60 | db_blast_ok = system "#{d} blastp --query #{scg_candidates} --db #{db_all} --evalue 0.00001 --threads #{threads} --out #{db_blast_out} --outfmt 6 qseqid sseqid pident length qlen slen evalue bitscore --max-target-seqs 1" 61 | abort "verifying blast did not work" unless db_blast_ok 62 | system "rm #{db_all}.dmnd" 63 | puts "starting annotations of single copy cogs..." 64 | 65 | # Read db_lookup 66 | lookup_h = {} 67 | File.open(db_lookup).each do |line| 68 | sbj, annotation = line.chomp.split 69 | lookup_h[sbj]=annotation 70 | end 71 | 72 | # now compare and print 73 | File.open(File.join(output_dir,File.basename(input_file)+".scg"), "w") do |file| 74 | File.open(db_blast_out).each do |line| 75 | next if line =~ /^#/ 76 | line.chomp! 
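# Fields follow the "--outfmt 6 qseqid sseqid pident length qlen slen evalue bitscore"
# layout requested above: temp[0] = query id, temp[1] = subject id,
# temp[3] = alignment length, temp[5] = subject sequence length.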
77 | temp = line.split(/\t/) 78 | query, sbjct = temp[0], temp[1] 79 | aln_len, sbjct_len = temp[3], temp[5] 80 | if lookup_h[sbjct] && aln_len > (sbjct_len*0.5) 81 | file.puts "#{query.split[0]}\t#{lookup_h[sbjct]}" 82 | end 83 | end 84 | end 85 | 86 | puts "successfully finished" 87 | -------------------------------------------------------------------------------- /workflow/rules/screen.smk: -------------------------------------------------------------------------------- 1 | 2 | rule generate_sketch: 3 | input: 4 | unpack(get_input_fastq), 5 | output: 6 | "Intermediate/screen/sketches/{sample}.sketch.gz", 7 | log: 8 | "logs/screen/make_sketch/{sample}.log", 9 | conda: 10 | "../envs/required_packages.yaml" 11 | threads: 1 12 | resources: 13 | mem_mb=config["simplejob_mem"] * 1000, 14 | java_mem=int(config["simplejob_mem"] * JAVA_MEM_FRACTION), 15 | shell: 16 | "bbsketch.sh " 17 | "in={input[0]}" 18 | " samplerate=0.5" 19 | " minkeycount=2 " 20 | " out={output} " 21 | " blacklist=nt ssu=f name0={wildcards.sample} depth=t overwrite=t " 22 | " -Xmx{resources.java_mem}g " 23 | " &> {log}" 24 | # take only one read 25 | 26 | 27 | rule compare_sketch: 28 | input: 29 | expand(rules.generate_sketch.output, sample=SAMPLES), 30 | output: 31 | "QC/screen/sketch_comparison.tsv.gz", 32 | priority: 100 33 | log: 34 | "logs/screen/compare_sketch.log", 35 | conda: 36 | "../envs/required_packages.yaml" 37 | threads: 1 38 | resources: 39 | mem_mb=config["mem"] * 1000, 40 | java_mem=int(config["mem"] * JAVA_MEM_FRACTION), 41 | shell: 42 | "comparesketch.sh alltoall " 43 | " format=3 out={output} " 44 | " records=5000 " 45 | " {input} " 46 | " -Xmx{resources.java_mem}g " 47 | " &> {log}" 48 | 49 | 50 | # sendsketch.sh sample2.sketch printdepth2=t level=2 printqfname=f printvolume=t color=f out 51 | -------------------------------------------------------------------------------- /workflow/rules/semibin.smk: -------------------------------------------------------------------------------- 1 | 2 | rule semibin_generate_data_multi: 3 | input: 4 | fasta=rules.combine_contigs.output, 5 | bams=get_bams_of_bingroup, 6 | output: 7 | directory("Intermediate/cobinning/{bingroup}/semibin/data_multi"), 8 | # expand( 9 | # "Cobinning/SemiBin/samples/{sample}/{files}", 10 | # sample=SAMPLES, 11 | # files=["data.csv", "data_split.csv"], 12 | # ), 13 | conda: 14 | "../envs/semibin.yaml" 15 | threads: config["threads"] 16 | resources: 17 | mem_mb=config["mem"] * 1000, 18 | time_min=60 * config["runtime"]["default"], 19 | log: 20 | "logs/semibin/{bingroup}/generate_data_multi.log", 21 | benchmark: 22 | "logs/benchmarks/semibin/{bingroup}/generate_data_multi.tsv" 23 | params: 24 | # output_dir="Cobinning/SemiBin", 25 | separator=config["cobinning_separator"], 26 | shell: 27 | "SemiBin generate_sequence_features_multi" 28 | " --input-fasta {input.fasta} " 29 | " --input-bam {input.bams} " 30 | " --output {output} " 31 | " --threads {threads} " 32 | " --separator {params.separator} " 33 | " 2> {log}" 34 | 35 | 36 | rule semibin_train: 37 | input: 38 | flag=get_assembly, 39 | fasta_sample=rules.filter_contigs.output[0], 40 | bams=get_bams_of_bingroup, 41 | data_folder=rules.semibin_generate_data_multi.output[0], 42 | output: 43 | "Intermediate/cobinning/{bingroup}/semibin/models/{sample}/model.h5", 44 | conda: 45 | "../envs/semibin.yaml" 46 | threads: config["threads"] 47 | resources: 48 | mem_mb=config["mem"] * 1000, 49 | time_min=60 * config["runtime"]["default"], 50 | log: 51 | "logs/semibin/{bingroup}/train/{sample}.log", 52 | 
benchmark: 53 | "logs/benchmarks/semibin/{bingroup}/train/{sample}.tsv" 54 | params: 55 | output_dir=lambda wc, output: os.path.dirname(output[0]), 56 | data=lambda wc, input: Path(input.data_folder) 57 | / "samples" 58 | / wc.sample 59 | / "data.csv", 60 | data_split=lambda wc, input: Path(input.data_folder) 61 | / "samples" 62 | / wc.sample 63 | / "data_split.csv", 64 | extra=config["semibin_train_extra"], 65 | shell: 66 | "SemiBin train_self " 67 | " --output {params.output_dir} " 68 | " --threads {threads} " 69 | " --data {params.data} " 70 | " --data-split {params.data_split} " 71 | " {params.extra} " 72 | " 2> {log}" 73 | 74 | 75 | def semibin_input(wildcards): 76 | bingroup_of_sample = sampleTable.loc[wildcards.sample, "BinGroup"] 77 | samples_of_bingroup = sampleTable.query( 78 | f'BinGroup=="{bingroup_of_sample}"' 79 | ).index.tolist() 80 | 81 | assert len(samples_of_bingroup) > 1 82 | 83 | mapping = dict( 84 | fasta=rules.filter_contigs.output[0].format(**wildcards), 85 | bams=expand( 86 | "Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam", 87 | sample=samples_of_bingroup, 88 | bingroup=bingroup_of_sample, 89 | ), 90 | data_folder=rules.semibin_generate_data_multi.output[0].format( 91 | bingroup=bingroup_of_sample, **wildcards 92 | ), 93 | model=rules.semibin_train.output[0].format( 94 | bingroup=bingroup_of_sample, **wildcards 95 | ), 96 | ) 97 | 98 | return mapping 99 | 100 | 101 | rule run_semibin: 102 | input: 103 | unpack(semibin_input), 104 | output: 105 | # contains no info to bingroup 106 | directory( 107 | "Intermediate/cobinning/semibin_output/{sample}/output_recluster_bins/" 108 | ), 109 | conda: 110 | "../envs/semibin.yaml" 111 | threads: config["threads"] 112 | resources: 113 | mem_mb=config["mem"] * 1000, 114 | time_min=60 * config["runtime"]["default"], 115 | log: 116 | "logs/semibin/bin/{sample}.log", 117 | benchmark: 118 | "logs/benchmarks/semibin/bin/{sample}.tsv" 119 | params: 120 | output_dir=lambda wc, output: os.path.dirname(output[0]), 121 | data=lambda wc, input: Path(input.data_folder) 122 | / "samples" 123 | / wc.sample 124 | / "data.csv", 125 | min_bin_kbs=int(config["cobining_min_bin_size"] / 1000), 126 | extra=config["semibin_options"], 127 | shell: 128 | "SemiBin bin " 129 | " --input-fasta {input.fasta} " 130 | " --output {params.output_dir} " 131 | " --threads {threads} " 132 | " --data {params.data} " 133 | " --model {input.model} " 134 | " --minfasta-kbs {params.min_bin_kbs}" 135 | " {params.extra} " 136 | " 2> {log}" 137 | 138 | 139 | localrules: 140 | parse_semibin_output, 141 | 142 | 143 | ruleorder: parse_semibin_output > get_unique_cluster_attribution 144 | 145 | 146 | rule parse_semibin_output: 147 | input: 148 | rules.run_semibin.output[0], 149 | output: 150 | "{sample}/binning/SemiBin/cluster_attribution.tsv", 151 | conda: 152 | "../envs/semibin.yaml" 153 | log: 154 | "logs/semibin/parse_output/{sample}.log", 155 | params: 156 | extension=".fa", 157 | script: 158 | "../scripts/parse_semibin.py" 159 | 160 | 161 | rule semibin: 162 | input: 163 | expand("{sample}/binning/SemiBin/cluster_attribution.tsv", sample=SAMPLES), 164 | -------------------------------------------------------------------------------- /workflow/rules/sra.smk: -------------------------------------------------------------------------------- 1 | wildcard_constraints: 2 | sra_run="[S,E,D]RR[0-9]+", 3 | 4 | 5 | localrules: 6 | prefetch, 7 | 8 | 9 | SRA_read_fractions = ["_1", "_2"] if PAIRED_END else [""] 10 | SRA_SUBDIR_RUN = "SRA/Runs" 11 | 12 | 13 | rule 
prefetch: 14 | output: 15 | sra=temp(touch(SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}_downloaded")), 16 | # not givins sra file as output allows for continue from the same download 17 | params: 18 | outdir=SRA_SUBDIR_RUN, # prefetch creates file in subfolder with run name automatically 19 | log: 20 | "logs/SRAdownload/prefetch/{sra_run}.log", 21 | benchmark: 22 | "logs/benchmarks/SRAdownload/prefetch/{sra_run}.tsv" 23 | threads: 1 24 | resources: 25 | mem_mb=1000, 26 | time_min=60 * int(config["runtime"]["simplejob"]), 27 | internet_connection=1, 28 | conda: 29 | "%s/sra.yaml" % CONDAENV 30 | shell: 31 | " mkdir -p {params.outdir} 2> {log} " 32 | " ; " 33 | " prefetch " 34 | " --output-directory {params.outdir} " 35 | " -X 999999999 " 36 | " --progress " 37 | " --log-level info " 38 | " {wildcards.sra_run} &>> {log} " 39 | " ; " 40 | " vdb-validate {params.outdir}/{wildcards.sra_run}/{wildcards.sra_run}.sra &>> {log} " 41 | 42 | 43 | rule extract_run: 44 | input: 45 | flag=rules.prefetch.output, 46 | output: 47 | temp( 48 | expand( 49 | SRA_SUBDIR_RUN + "/{{sra_run}}/{{sra_run}}{fraction}.fastq.gz", 50 | fraction=SRA_read_fractions, 51 | ) 52 | ), 53 | params: 54 | outdir=os.path.abspath(SRA_SUBDIR_RUN + "/{sra_run}"), 55 | sra_file=SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}.sra", 56 | log: 57 | "logs/SRAdownload/extract/{sra_run}.log", 58 | benchmark: 59 | "logs/benchmarks/SRAdownload/fasterqdump/{sra_run}.tsv" 60 | threads: config["simplejob_threads"] 61 | resources: 62 | time_min=60 * int(config["runtime"]["simplejob"]), 63 | mem_mb=1000, #default 100Mb 64 | conda: 65 | "%s/sra.yaml" % CONDAENV 66 | shell: 67 | " vdb-validate {params.sra_file} &>> {log} " 68 | " ; " 69 | " parallel-fastq-dump " 70 | " --threads {threads} " 71 | " --gzip --split-files " 72 | " --outdir {params.outdir} " 73 | " --tmpdir {resources.tmpdir} " 74 | " --skip-technical --split-3 " 75 | " -s {params.sra_file} &>> {log} " 76 | " ; " 77 | " rm -f {params.sra_file} 2>> {log} " 78 | 79 | 80 | RunTable = None 81 | 82 | 83 | def get_runids_for_biosample(wildcards): 84 | global RunTable 85 | if RunTable is None: 86 | from atlas.init.parse_sra import load_and_validate_runinfo_table 87 | 88 | RunTable = load_and_validate_runinfo_table("RunInfo.tsv") 89 | 90 | run_ids = RunTable.query(f"BioSample == '{wildcards.sample}'").index.tolist() 91 | 92 | return run_ids 93 | 94 | 95 | def get_runs_for_biosample(wildcards): 96 | run_ids = get_runids_for_biosample(wildcards) 97 | 98 | ReadFiles = {} 99 | for fraction in SRA_read_fractions: 100 | if fraction == "": 101 | key = "se" 102 | else: 103 | key = fraction 104 | 105 | ReadFiles[key] = expand( 106 | SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}{fraction}.fastq.gz", 107 | fraction=fraction, 108 | sra_run=run_ids, 109 | ) 110 | 111 | return ReadFiles 112 | 113 | 114 | rule merge_runs_to_sample: 115 | input: 116 | unpack(get_runs_for_biosample), 117 | output: 118 | expand( 119 | "SRA/Samples/{{sample}}/{{sample}}{fraction}.fastq.gz", 120 | fraction=SRA_read_fractions, 121 | ), 122 | threads: 1 123 | run: 124 | from utils import io 125 | 126 | for i, fraction in enumerate(SRA_read_fractions): 127 | if fraction == "": 128 | fraction = "se" 129 | io.cat_files(input[fraction], output[i]) 130 | 131 | 132 | rule download_sra: 133 | input: 134 | expand( 135 | "SRA/Samples/{sample}/{sample}{fraction}.fastq.gz", 136 | fraction=SRA_read_fractions, 137 | sample=SAMPLES, 138 | ), 139 | -------------------------------------------------------------------------------- /workflow/rules/strains.smk: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | rule instrain_profile: 5 | input: 6 | bam="genomes/alignments/bams/{sample}.bam", 7 | genomes="genomes/all_contigs.fasta", 8 | # genes=lambda wc: get_all_genes(wc, extension=".fna"), 9 | scaffold_to_genome="genomes/clustering/contig2genome.tsv", 10 | output: 11 | directory("strains/intermediate_files/{sample}"), 12 | threads: config["threads"] 13 | params: 14 | extra=config.get("instrain_profile_extra", ""), 15 | log: 16 | "logs/strains/profile/{sample}.log", 17 | conda: 18 | "../envs/instrain.yaml" 19 | benchmark: 20 | "logs/benchmarks/strains/profile/{sample}.tsv" 21 | resources: 22 | mem_mb=config["mem"] * 1000, 23 | time_min=60 * config["runtime"]["long"], 24 | shell: 25 | #" cat {input.genes} > {resources.tmpdir}/all_genome_genes.fna 2> {log} " 26 | #" ; " 27 | "inStrain profile " 28 | " {input.bam} {input.genomes} " 29 | " -o {output} " 30 | " -p {threads} " 31 | 32 | " -s {input.scaffold_to_genome} " 33 | " --database_mode " 34 | " {params.extra} &>> {log}" 35 | #" -g {resources.tmpdir}/all_genome_genes.fna " 36 | 37 | 38 | rule instrain_compare: 39 | input: 40 | profiles=expand("strains/intermediate_files/{sample}", sample=SAMPLES), 41 | scaffold_to_genome="genomes/clustering/contig2genome.tsv", 42 | output: 43 | directory("strains/comparison"), 44 | threads: config["threads"] 45 | params: 46 | extra=config.get("instrain_compare_extra", ""), 47 | log: 48 | "logs/strains/compare.log", 49 | conda: 50 | "../envs/instrain.yaml" 51 | benchmark: 52 | "logs/benchmarks/strains/compare.tsv" 53 | resources: 54 | mem_mb=config["mem"] * 1000, 55 | time_min=60 * config["runtime"]["long"], 56 | shell: 57 | "inStrain compare " 58 | " --input {input.profiles} " 59 | " -o {output} " 60 | " -p {threads} " 61 | " -s {input.scaffold_to_genome} " 62 | " --database_mode " 63 | " {params.extra} &> {log}" 64 | 65 | 66 | # usage: inStrain compare -i [INPUT [INPUT ...]] [-o OUTPUT] [-p PROCESSES] [-d] 67 | # [-h] [--version] [-s [STB [STB ...]]] [-c MIN_COV] 68 | # [-f MIN_FREQ] [-fdr FDR] [--database_mode] 69 | # [--breadth BREADTH] [-sc SCAFFOLDS] [--genome GENOME] 70 | # [--store_coverage_overlap] 71 | # [--store_mismatch_locations] 72 | # [--include_self_comparisons] [--skip_plot_generation] 73 | # [--group_length GROUP_LENGTH] [--force_compress] 74 | # [-ani ANI_THRESHOLD] [-cov COVERAGE_TRESHOLD] 75 | # [--clusterAlg {ward,single,complete,average,weighted,median,centroid}] 76 | -------------------------------------------------------------------------------- /workflow/scripts/DRAM_get_all_modules.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | 4 | import sys, os 5 | import logging, traceback 6 | 7 | logging.basicConfig( 8 | filename=snakemake.log[0], 9 | level=logging.INFO, 10 | format="%(asctime)s %(message)s", 11 | datefmt="%Y-%m-%d %H:%M:%S", 12 | ) 13 | 14 | 15 | def handle_exception(exc_type, exc_value, exc_traceback): 16 | if issubclass(exc_type, KeyboardInterrupt): 17 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 18 | return 19 | 20 | logging.error( 21 | "".join( 22 | [ 23 | "Uncaught exception: ", 24 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 25 | ] 26 | ) 27 | ) 28 | 29 | 30 | # Install exception handler 31 | sys.excepthook = handle_exception 32 | 33 | 34 | import pandas as pd 35 | 36 | annotation_file = snakemake.input.annotations 37 | module_output_table = snakemake.output[0] 38 | 39 | from mag_annotator.database_handler import DatabaseHandler 40 | from mag_annotator.summarize_genomes import build_module_net, make_module_coverage_frame 41 | 42 | annotations = pd.read_csv(annotation_file, sep="\t", index_col=0) 43 | 44 | 45 | # get db_locs and read in dbs 46 | database_handler = DatabaseHandler(logger=logging, config_loc=snakemake.input.config) 47 | 48 | 49 | if "module_step_form" not in database_handler.config["dram_sheets"]: 50 | raise ValueError( 51 | "Module step form location must be set in order to summarize genomes" 52 | ) 53 | 54 | module_steps_form = pd.read_csv( 55 | database_handler.config["dram_sheets"]["module_step_form"], sep="\t" 56 | ) 57 | 58 | all_module_nets = { 59 | module: build_module_net(module_df) 60 | for module, module_df in module_steps_form.groupby("module") 61 | } 62 | 63 | module_coverage_frame = make_module_coverage_frame( 64 | annotations, all_module_nets, groupby_column="fasta" 65 | ) 66 | 67 | module_coverage_frame.to_csv(module_output_table, sep="\t") 68 | -------------------------------------------------------------------------------- /workflow/scripts/combine_busco.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | logging.captureWarnings(True) 12 | 13 | 14 | def handle_exception(exc_type, exc_value, exc_traceback): 15 | if issubclass(exc_type, KeyboardInterrupt): 16 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 17 | return 18 | 19 | logging.error( 20 | "".join( 21 | [ 22 | "Uncaught exception: ", 23 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 24 | ] 25 | ) 26 | ) 27 | 28 | 29 | # Install exception handler 30 | sys.excepthook = handle_exception 31 | 32 | #### Begining of scripts 33 | 34 | import pandas as pd 35 | from utils.parsers import read_busco_output 36 | 37 | 38 | def main(samples, completeness_files, bin_table): 39 | sample_data = {} 40 | div = {} 41 | 42 | df = pd.DataFrame() 43 | 44 | for i, sample in enumerate(samples): 45 | sample_data = read_busco_output(completeness_files[i]) 46 | sample_data["Sample"] = sample 47 | 48 | df = df.append(sample_data) 49 | 50 | # remove missing 51 | 52 | failed_genomes = df.index[df.Dataset.str.lower().str.contains("run failed")] 53 | 54 | if len(failed_genomes) > 0: 55 | logging.warning( 56 | "Following genomes didn't pass BUSCO. 
I ignore them, because " 57 | "I think theas means they are too bad to be quantified:\n" 58 | f"{failed_genomes}" 59 | ) 60 | 61 | df.loc[failed_genomes, ["Completeness", "Contamination", "Quality_score"]] = 0 62 | 63 | df.to_csv(bin_table, sep="\t") 64 | 65 | 66 | if __name__ == "__main__": 67 | main( 68 | samples=snakemake.params.samples, 69 | completeness_files=snakemake.input.completeness_files, 70 | bin_table=snakemake.output.bin_table, 71 | ) 72 | -------------------------------------------------------------------------------- /workflow/scripts/combine_checkm.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | logging.captureWarnings(True) 12 | 13 | 14 | def handle_exception(exc_type, exc_value, exc_traceback): 15 | if issubclass(exc_type, KeyboardInterrupt): 16 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 17 | return 18 | 19 | logging.error( 20 | "".join( 21 | [ 22 | "Uncaught exception: ", 23 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 24 | ] 25 | ) 26 | ) 27 | 28 | 29 | # Install exception handler 30 | sys.excepthook = handle_exception 31 | 32 | #### Begining of scripts 33 | 34 | import pandas as pd 35 | from utils.parsers import read_checkm_output 36 | 37 | 38 | def main(samples, completeness_files, taxonomy_files, bin_table): 39 | sample_data = {} 40 | div = {} 41 | 42 | df = pd.DataFrame() 43 | 44 | for i, sample in enumerate(samples): 45 | sample_data = read_checkm_output( 46 | taxonomy_table=taxonomy_files[i], completness_table=completeness_files[i] 47 | ) 48 | sample_data["Sample"] = sample 49 | 50 | df = df.append(sample_data) 51 | 52 | df.to_csv(bin_table, sep="\t") 53 | 54 | 55 | if __name__ == "__main__": 56 | main( 57 | samples=snakemake.params.samples, 58 | taxonomy_files=snakemake.input.taxonomy_files, 59 | completeness_files=snakemake.input.completeness_files, 60 | bin_table=snakemake.output.bin_table, 61 | ) 62 | -------------------------------------------------------------------------------- /workflow/scripts/combine_checkm2.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | logging.captureWarnings(True) 12 | 13 | 14 | def handle_exception(exc_type, exc_value, exc_traceback): 15 | if issubclass(exc_type, KeyboardInterrupt): 16 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 17 | return 18 | 19 | logging.error( 20 | "".join( 21 | [ 22 | "Uncaught exception: ", 23 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 24 | ] 25 | ) 26 | ) 27 | 28 | 29 | # Install exception handler 30 | sys.excepthook = handle_exception 31 | 32 | #### Begining of scripts 33 | 34 | import pandas as pd 35 | from utils.parsers import read_checkm2_output 36 | 37 | 38 | def main(samples, completeness_files, bin_table): 39 | sample_data = {} 40 | div = {} 41 | 42 | df_list = [] 43 | 44 | for i, sample in enumerate(samples): 45 | sample_data = read_checkm2_output(completness_table=completeness_files[i]) 46 | sample_data["Sample"] = sample 47 | 48 | df_list.append(sample_data) 49 | 50 | df = pd.concat(df_list, axis=0) 51 | 52 | 
df.to_csv(bin_table, sep="\t") 53 | 54 | 55 | if __name__ == "__main__": 56 | main( 57 | samples=snakemake.params.samples, 58 | completeness_files=snakemake.input.completeness_files, 59 | bin_table=snakemake.output.bin_table, 60 | ) 61 | -------------------------------------------------------------------------------- /workflow/scripts/combine_contig_stats.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | 12 | def handle_exception(exc_type, exc_value, exc_traceback): 13 | if issubclass(exc_type, KeyboardInterrupt): 14 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 15 | return 16 | 17 | logging.error( 18 | "".join( 19 | [ 20 | "Uncaught exception: ", 21 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 22 | ] 23 | ) 24 | ) 25 | 26 | 27 | # Install exception handler 28 | sys.excepthook = handle_exception 29 | 30 | 31 | import pandas as pd 32 | from utils.parsers_bbmap import parse_pileup_log_file 33 | 34 | 35 | def parse_map_stats(sample_data, out_tsv): 36 | sample_stats = {} 37 | for sample in sample_data.keys(): 38 | df = pd.read_csv(sample_data[sample]["contig_stats"], sep="\t") 39 | 40 | assert df.shape[0] == 1, "Assumed only one row in file {}; found {}".format( 41 | sample_data[sample]["contig_stats"], df.iloc[0] 42 | ) 43 | 44 | # n genes 45 | genes_df = pd.read_csv(sample_data[sample]["gene_table"], index_col=0, sep="\t") 46 | df["N_Predicted_Genes"] = genes_df.shape[0] 47 | 48 | # mappingt stats 49 | mapping_stats = parse_pileup_log_file(sample_data[sample]["mapping_log"]) 50 | df["Assembled_Reads"] = mapping_stats["Mapped reads"] 51 | df["Percent_Assembled_Reads"] = mapping_stats["Percent mapped"] 52 | 53 | logging.info(f"Stats for sample {sample}\n{df}") 54 | 55 | sample_stats[sample] = df 56 | 57 | stats_df = pd.concat(sample_stats, axis=0) 58 | stats_df.index = stats_df.index.get_level_values(0) 59 | # remove contig stats and keep only scaffold stats 60 | stats_df = stats_df.loc[:, ~stats_df.columns.str.startswith("scaf_")] 61 | stats_df.columns = stats_df.columns.str.replace("ctg_", "") 62 | # save 63 | stats_df.to_csv(out_tsv, sep="\t") 64 | return stats_df 65 | 66 | 67 | def main(samples, contig_stats, gene_tables, mapping_logs, combined_stats): 68 | sample_data = {} 69 | for sample in samples: 70 | sample_data[sample] = {} 71 | for c_stat in contig_stats: 72 | # underscore version was for simplified local testing 73 | # if "%s_" % sample in c_stat: 74 | if "%s/" % sample in c_stat: 75 | sample_data[sample]["contig_stats"] = c_stat 76 | for g_table in gene_tables: 77 | # if "%s_" % sample in g_table: 78 | if "%s/" % sample in g_table: 79 | sample_data[sample]["gene_table"] = g_table 80 | for mapping_log in mapping_logs: 81 | # if "%s_" % sample in mapping_log: 82 | if "%s/" % sample in mapping_log: 83 | sample_data[sample]["mapping_log"] = mapping_log 84 | 85 | parse_map_stats(sample_data, combined_stats) 86 | 87 | 88 | if __name__ == "__main__": 89 | main( 90 | samples=snakemake.params.samples, 91 | contig_stats=snakemake.input.contig_stats, 92 | gene_tables=snakemake.input.gene_tables, 93 | mapping_logs=snakemake.input.mapping_logs, 94 | combined_stats=snakemake.output.combined_contig_stats, 95 | ) 96 | -------------------------------------------------------------------------------- 
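The combined assembly-stats table written by combine_contig_stats.py above is a plain tab-separated file with one row per sample. A minimal downstream sketch for loading it (the path used here is only illustrative; in a real run it is whatever the Snakemake rule passes as combined_contig_stats):

    import pandas as pd

    # illustrative path; the real location comes from the Snakemake rule's output
    stats = pd.read_csv("stats/combined_contig_stats.tsv", sep="\t", index_col=0)

    # assembly metrics plus the gene and mapping columns added by the script
    print(stats[["N_Predicted_Genes", "Assembled_Reads", "Percent_Assembled_Reads"]])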
/workflow/scripts/combine_coverage_MAGs.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | 12 | def handle_exception(exc_type, exc_value, exc_traceback): 13 | if issubclass(exc_type, KeyboardInterrupt): 14 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 15 | return 16 | 17 | logging.error( 18 | "".join( 19 | [ 20 | "Uncaught exception: ", 21 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 22 | ] 23 | ) 24 | ) 25 | 26 | 27 | # Install exception handler 28 | sys.excepthook = handle_exception 29 | 30 | 31 | import pandas as pd 32 | import os, gc 33 | from utils.parsers_bbmap import read_coverage_binned, combine_coverages 34 | 35 | 36 | contig2genome = pd.read_csv( 37 | snakemake.input.contig2genome, header=None, index_col=0, sep="\t" 38 | ).iloc[:, 0] 39 | 40 | 41 | # sum counts 42 | logging.info("Loading counts and coverage per contig") 43 | 44 | combined_cov, Counts_contigs = combine_coverages( 45 | snakemake.input.coverage_files, snakemake.params.samples 46 | ) 47 | 48 | combined_cov = combined_cov.T 49 | 50 | combined_cov.insert( 51 | 0, "Genome", value=pd.Categorical(contig2genome.loc[combined_cov.index].values) 52 | ) 53 | 54 | logging.info(f"Saving coverage to {snakemake.output.coverage_contigs}") 55 | 56 | combined_cov.reset_index().to_parquet(snakemake.output.coverage_contigs) 57 | 58 | logging.info("Sum counts per genome") 59 | 60 | Counts_genome = Counts_contigs.groupby(contig2genome, axis=1).sum().T 61 | Counts_genome.index.name = "Sample" 62 | 63 | logging.info(f"Saving counts to {snakemake.output.counts}") 64 | 65 | Counts_genome.reset_index().to_parquet(snakemake.output.counts) 66 | del Counts_genome, combined_cov, Counts_contigs 67 | gc.collect() 68 | 69 | # Binned coverage 70 | logging.info("Loading binned coverage") 71 | binCov = {} 72 | for i, cov_file in enumerate(snakemake.input.binned_coverage_files): 73 | sample = snakemake.params.samples[i] 74 | 75 | binCov[sample] = read_coverage_binned(cov_file) 76 | 77 | binCov = pd.DataFrame.from_dict(binCov) 78 | 79 | logging.info("Add genome information to it") 80 | binCov.insert( 81 | 0, 82 | "Genome", 83 | value=pd.Categorical(contig2genome.loc[binCov.index.get_level_values(0)].values), 84 | ) 85 | 86 | gc.collect() 87 | logging.info(f"Saving combined binCov to {snakemake.output.binned_cov}") 88 | binCov.reset_index().to_parquet(snakemake.output.binned_cov) 89 | 90 | # Median coverage 91 | logging.info("Calculate median coverage") 92 | Median_abund = binCov.groupby("Genome").median().T 93 | del binCov 94 | gc.collect() 95 | logging.info(f"Saving mediuan coverage {snakemake.output.median_abund}") 96 | Median_abund.reset_index().to_parquet(snakemake.output.median_abund) 97 | -------------------------------------------------------------------------------- /workflow/scripts/combine_dram_gene_annotations.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | 12 | def handle_exception(exc_type, exc_value, exc_traceback): 13 | if issubclass(exc_type, KeyboardInterrupt): 14 | sys.__excepthook__(exc_type, 
exc_value, exc_traceback) 15 | return 16 | 17 | logging.error( 18 | "".join( 19 | [ 20 | "Uncaught exception: ", 21 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 22 | ] 23 | ) 24 | ) 25 | 26 | 27 | # Install exception handler 28 | sys.excepthook = handle_exception 29 | 30 | 31 | from pathlib import Path 32 | import numpy as np 33 | import pandas as pd 34 | from collections import defaultdict 35 | 36 | db_columns = { 37 | "kegg": ["ko_id", "kegg_hit"], 38 | "peptidase": [ 39 | "peptidase_id", 40 | "peptidase_family", 41 | "peptidase_hit", 42 | "peptidase_RBH", 43 | "peptidase_identity", 44 | "peptidase_bitScore", 45 | "peptidase_eVal", 46 | ], 47 | "pfam": ["pfam_hits"], 48 | "cazy": ["cazy_ids", "cazy_hits", "cazy_subfam_ec", "cazy_best_hit"], 49 | # "heme": ["heme_regulatory_motif_count"], 50 | } 51 | 52 | Tables = defaultdict(list) 53 | 54 | for file in snakemake.input: 55 | df = pd.read_csv(file, index_col=0, sep="\t") 56 | 57 | # drop un-annotated genes 58 | df = df.query("rank!='E'") 59 | 60 | # change index from 'subset1_Gene111' -> simply 'Gene111' 61 | # Gene name to nr 62 | df.index = ( 63 | df.index.str.split("_", n=1, expand=True) 64 | .get_level_values(1) 65 | .str[len("Gene") :] 66 | .astype(np.int64) 67 | ) 68 | df.index.name = "GeneNr" 69 | 70 | # select columns, drop na rows and append to list 71 | for db in db_columns: 72 | cols = db_columns[db] 73 | 74 | if not df.columns.intersection(cols).empty: 75 | Tables[db].append(df[cols].dropna(axis=0, how="all")) 76 | 77 | del df 78 | 79 | out_dir = Path(snakemake.output[0]) 80 | out_dir.mkdir() 81 | 82 | for db in Tables: 83 | combined = pd.concat(Tables[db], axis=0) 84 | 85 | combined.sort_index(inplace=True) 86 | 87 | combined.reset_index().to_parquet(out_dir / (db + ".parquet")) 88 | -------------------------------------------------------------------------------- /workflow/scripts/combine_gene_coverages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os, sys 3 | import logging, traceback 4 | 5 | logging.basicConfig( 6 | filename=snakemake.log[0], 7 | level=logging.INFO, 8 | format="%(asctime)s %(message)s", 9 | datefmt="%Y-%m-%d %H:%M:%S", 10 | ) 11 | 12 | 13 | def handle_exception(exc_type, exc_value, exc_traceback): 14 | if issubclass(exc_type, KeyboardInterrupt): 15 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 16 | return 17 | 18 | logging.error( 19 | "".join( 20 | [ 21 | "Uncaught exception: ", 22 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 23 | ] 24 | ) 25 | ) 26 | 27 | 28 | # Install exception handler 29 | sys.excepthook = handle_exception 30 | 31 | #### Begining of script 32 | import numpy as np 33 | import pandas as pd 34 | import gc, os 35 | 36 | 37 | import h5py 38 | 39 | import h5py 40 | 41 | import psutil 42 | 43 | 44 | def measure_memory(write_log_entry=True): 45 | mem_uage = psutil.Process().memory_info().rss / (1024 * 1024) 46 | 47 | if write_log_entry: 48 | logging.info(f"The process is currently using {mem_uage: 7.0f} MB of RAM") 49 | 50 | return mem_uage 51 | 52 | 53 | logging.info("Start") 54 | measure_memory() 55 | 56 | N_samples = len(snakemake.input.covstats) 57 | 58 | logging.info("Read gene info") 59 | 60 | gene_info = pd.read_table(snakemake.input.info) 61 | 62 | # Gene name is only first part of first column 63 | gene_info.index = gene_info["#Name"].str.split(" ", n=1, expand=True)[0] 64 | gene_info.index.name = "GeneName" 65 | gene_info.drop("#Name", axis=1, inplace=True) 
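# What follows streams one per-sample coverage parquet at a time into two
# pre-allocated (N_samples x N_genes) HDF5 datasets (median coverage and read counts),
# while accumulating a per-sample summary table and per-gene statistics
# (samples with non-zero counts/coverage, summed and maximum coverage), so the full
# gene matrix never has to be held in memory as a single pandas DataFrame.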
66 | 67 | gene_info.sort_index(inplace=True) 68 | N_genes = gene_info.shape[0] 69 | # gene_list= gene_info.index 70 | 71 | # Sort 72 | gene_info.sort_index(inplace=True) 73 | N_genes = gene_info.shape[0] 74 | 75 | gene_info[ 76 | ["Samples_nz_coverage", "Samples_nz_counts", "Sum_coverage", "Max_coverage"] 77 | ] = 0 78 | 79 | 80 | # gene_list= gene_info.index 81 | 82 | 83 | logging.info("Open hdf files for writing") 84 | 85 | gene_matrix_shape = (N_samples, N_genes) 86 | 87 | with h5py.File(snakemake.output.cov, "w") as hdf_cov_file, h5py.File( 88 | snakemake.output.counts, "w" 89 | ) as hdf_counts_file: 90 | combined_cov = hdf_cov_file.create_dataset( 91 | "data", shape=gene_matrix_shape, fillvalue=0, compression="gzip" 92 | ) 93 | combined_counts = hdf_counts_file.create_dataset( 94 | "data", shape=gene_matrix_shape, fillvalue=0, compression="gzip" 95 | ) 96 | 97 | # add Smaple names attribute 98 | sample_names = np.array(list(snakemake.params.samples)).astype("S") 99 | combined_cov.attrs["sample_names"] = sample_names 100 | combined_counts.attrs["sample_names"] = sample_names 101 | 102 | gc.collect() 103 | 104 | Summary = {} 105 | 106 | logging.info("Start reading files") 107 | initial_mem_uage = measure_memory() 108 | 109 | for i, sample in enumerate(snakemake.params.samples): 110 | logging.info(f"Read coverage file for sample {i+1} / {N_samples}") 111 | sample_cov_file = snakemake.input.covstats[i] 112 | 113 | data = pd.read_parquet( 114 | sample_cov_file, columns=["GeneName", "Reads", "Median_fold"] 115 | ).set_index("GeneName") 116 | 117 | assert ( 118 | data.shape[0] == N_genes 119 | ), f"I only have {data.shape[0]} /{N_genes} in the file {sample_cov_file}" 120 | 121 | # genes are not sorted :-() 122 | assert ( 123 | data.index.is_monotonic_increasing 124 | ), f"data is not sorted by index in {sample_cov_file}" 125 | 126 | # downcast data 127 | # median is int 128 | Median_fold = pd.to_numeric(data.Median_fold, downcast="integer") 129 | Reads = pd.to_numeric(data.Reads, downcast="integer") 130 | 131 | # delete interminate data and release mem 132 | del data 133 | 134 | # get summary statistics per sample 135 | logging.debug("Extract Summary statistics") 136 | 137 | Summary[sample] = { 138 | "Sum_coverage": Median_fold.sum(), 139 | "Total_counts": Reads.sum(), 140 | "Genes_nz_counts": (Reads > 0).sum(), 141 | "Genes_nz_coverage": (Median_fold > 0).sum(), 142 | } 143 | 144 | # get gene wise stats 145 | gene_info["Samples_nz_counts"] += (Reads > 0) * 1 146 | gene_info["Samples_nz_coverage"] += (Median_fold > 0) * 1 147 | gene_info["Sum_coverage"] += Median_fold 148 | 149 | gene_info["Max_coverage"] = np.fmax(gene_info["Max_coverage"], Median_fold) 150 | 151 | combined_cov[i, :] = Median_fold.values 152 | combined_counts[i, :] = Reads.values 153 | 154 | del Median_fold, Reads 155 | gc.collect() 156 | 157 | current_mem_uage = measure_memory() 158 | 159 | 160 | logging.info("All samples processed") 161 | gc.collect() 162 | 163 | logging.info("Save sample Summary") 164 | pd.DataFrame(Summary).T.to_csv(snakemake.output.sample_info, sep="\t") 165 | 166 | 167 | logging.info("Save gene Summary") 168 | 169 | # downcast 170 | for col in gene_info.columns: 171 | if col == "GC": 172 | gene_info[col] = pd.to_numeric(gene_info[col], downcast="float") 173 | else: 174 | gene_info[col] = pd.to_numeric(gene_info[col], downcast="integer") 175 | 176 | gene_info.reset_index().to_parquet(snakemake.output.gene_info) 177 | -------------------------------------------------------------------------------- 
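The two HDF5 files written by combine_gene_coverages.py above each contain a single dataset named "data" of shape (samples, genes), with the sample order stored in its "sample_names" attribute; the column order follows the sorted gene table written alongside. A minimal read-back sketch (the file name is illustrative, not a fixed Atlas path):

    import h5py
    import pandas as pd

    # open one of the matrices written above (illustrative file name)
    with h5py.File("gene_coverage.h5", "r") as hdf:
        data = hdf["data"]
        samples = [s.decode() for s in data.attrs["sample_names"]]
        # load the full matrix; for large catalogs, slice instead, e.g. data[:, :1000]
        coverage = pd.DataFrame(data[:], index=samples)

    print(coverage.shape)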
/workflow/scripts/combine_taxonomy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os, sys 3 | import logging, traceback 4 | 5 | logging.basicConfig( 6 | filename=snakemake.log[0], 7 | level=logging.INFO, 8 | format="%(asctime)s %(message)s", 9 | datefmt="%Y-%m-%d %H:%M:%S", 10 | ) 11 | 12 | 13 | def handle_exception(exc_type, exc_value, exc_traceback): 14 | if issubclass(exc_type, KeyboardInterrupt): 15 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 16 | return 17 | 18 | logging.error( 19 | "".join( 20 | [ 21 | "Uncaught exception: ", 22 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 23 | ] 24 | ) 25 | ) 26 | 27 | 28 | # Install exception handler 29 | sys.excepthook = handle_exception 30 | 31 | #### Begining of scripts 32 | 33 | import pandas as pd 34 | import numpy as np 35 | from utils.taxonomy import tax2table 36 | 37 | from glob import glob 38 | 39 | gtdb_classify_folder = snakemake.input.folder 40 | 41 | taxonomy_files = glob(f"{gtdb_classify_folder}/gtdbtk.*.summary.tsv") 42 | 43 | N_taxonomy_files = len(taxonomy_files) 44 | logging.info(f"Found {N_taxonomy_files} gtdb taxonomy files.") 45 | 46 | if (0 == N_taxonomy_files) or (N_taxonomy_files > 2): 47 | raise Exception( 48 | f"Found {N_taxonomy_files} number of taxonomy files 'gtdbtk.*.summary.tsv' in {gtdb_classify_folder} expect 1 or 2." 49 | ) 50 | 51 | 52 | DT = pd.concat([pd.read_table(file, index_col=0) for file in taxonomy_files], axis=0) 53 | 54 | DT.to_csv(snakemake.output.combined) 55 | 56 | Tax = tax2table(DT.classification, remove_prefix=True) 57 | Tax.to_csv(snakemake.output.taxonomy, sep="\t") 58 | -------------------------------------------------------------------------------- /workflow/scripts/convert_jgi2vamb_coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import re 5 | 6 | 7 | def main(jgi_file): 8 | # parsing input 9 | header = {} 10 | col2keep = ["contigName", "contigLen", "totalAvgDepth"] 11 | with open(jgi_file) as inF: 12 | for i, line in enumerate(inF): 13 | line = line.rstrip().split("\t") 14 | if i == 0: 15 | header = {x: ii for ii, x in enumerate(line)} 16 | col2keep += [x for x in line if x.endswith(".bam")] 17 | print("\t".join(col2keep)) 18 | continue 19 | elif line[0] == "": 20 | continue 21 | # contig ID 22 | contig = line[header["contigName"]] 23 | # collect per-sample info 24 | out = [] 25 | for col in col2keep: 26 | out.append(line[header[col]]) 27 | print("\t".join(out)) 28 | 29 | 30 | if __name__ == "__main__": 31 | if "snakemake" in globals(): 32 | with open(snakemake.log[0], "w") as log: 33 | sys.stderr = log 34 | 35 | with open(snakemake.output[0], "w") as outf: 36 | sys.stdout = outf 37 | 38 | main(snakemake.input[0]) 39 | 40 | else: 41 | import argparse 42 | import logging 43 | 44 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG) 45 | 46 | class CustomFormatter( 47 | argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter 48 | ): 49 | pass 50 | 51 | desc = ( 52 | "Converting jgi_summarize_bam_contig_depths output to format used by VAMB" 53 | ) 54 | epi = """DESCRIPTION: 55 | Output format: contigNamecontigLentotalAvgDepthSAMPLE1.sort.bamSample2.sort.bam... 
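(the columns are tab-separated: contigName, contigLen, totalAvgDepth, then one column per input *.bam sample)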
56 | Output written to STDOUT 57 | """ 58 | parser = argparse.ArgumentParser( 59 | description=desc, epilog=epi, formatter_class=CustomFormatter 60 | ) 61 | argparse.ArgumentDefaultsHelpFormatter 62 | parser.add_argument( 63 | "jgi_file", 64 | metavar="jgi_file", 65 | type=str, 66 | help="jgi_summarize_bam_contig_depths output table", 67 | ) 68 | parser.add_argument("--version", action="version", version="0.0.1") 69 | 70 | args = parser.parse_args() 71 | main(args.jgi_file) 72 | -------------------------------------------------------------------------------- /workflow/scripts/filter_genes.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | 4 | import sys, os 5 | import logging, traceback 6 | 7 | logging.basicConfig( 8 | filename=snakemake.log[0], 9 | level=logging.INFO, 10 | format="%(asctime)s %(message)s", 11 | datefmt="%Y-%m-%d %H:%M:%S", 12 | ) 13 | 14 | 15 | def handle_exception(exc_type, exc_value, exc_traceback): 16 | if issubclass(exc_type, KeyboardInterrupt): 17 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 18 | return 19 | 20 | logging.error( 21 | "".join( 22 | [ 23 | "Uncaught exception: ", 24 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 25 | ] 26 | ) 27 | ) 28 | 29 | 30 | # Install exception handler 31 | sys.excepthook = handle_exception 32 | 33 | 34 | import pyfastx 35 | 36 | 37 | faa_iterator = pyfastx.Fastx(snakemake.input.faa, format="fasta", comment=True) 38 | fna_iterator = pyfastx.Fastx(snakemake.input.fna, format="fasta", comment=True) 39 | 40 | 41 | with open(snakemake.output.faa, "w") as out_faa, open( 42 | snakemake.output.fna, "w" 43 | ) as out_fna, open(snakemake.output.short, "w") as out_short: 44 | for name, seq, comment in fna_iterator: 45 | protein = next(faa_iterator) 46 | 47 | # include gene and corresponding protein if gene passes length threshold 48 | # or annotation contains prodigal info that it's complete 49 | if (len(seq) >= snakemake.params.minlength_nt) or ("partial=00" in comment): 50 | out_fna.write(f">{name} {comment}\n{seq}\n") 51 | out_faa.write(">{0} {2}\n{1}\n".format(*protein)) 52 | 53 | else: 54 | out_short.write(">{0} {2}\n{1}\n".format(*protein)) 55 | -------------------------------------------------------------------------------- /workflow/scripts/filter_genomes.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | 4 | import sys, os 5 | import logging, traceback 6 | 7 | logging.basicConfig( 8 | filename=snakemake.log[0], 9 | level=logging.INFO, 10 | format="%(asctime)s %(message)s", 11 | datefmt="%Y-%m-%d %H:%M:%S", 12 | ) 13 | 14 | 15 | def handle_exception(exc_type, exc_value, exc_traceback): 16 | if issubclass(exc_type, KeyboardInterrupt): 17 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 18 | return 19 | 20 | logging.error( 21 | "".join( 22 | [ 23 | "Uncaught exception: ", 24 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 25 | ] 26 | ) 27 | ) 28 | 29 | 30 | # Install exception handler 31 | sys.excepthook = handle_exception 32 | 33 | 34 | import pandas as pd 35 | from glob import glob 36 | from numpy import log 37 | 38 | from utils.parsers import load_quality 39 | 40 | 41 | Q = load_quality(snakemake.input.quality) 42 | 43 | stats = pd.read_csv(snakemake.input.stats, index_col=0, sep="\t") 44 | stats["logN50"] = log(stats.N50) 45 | 46 | # merge tables, keeping only shared bins and non-overlapping columns 47 | Q = Q.join(stats.loc[Q.index, stats.columns.difference(Q.columns)]) 48 | del stats 49 | 50 | n_all_bins = Q.shape[0] 51 | 52 | filter_criteria = snakemake.params["filter_criteria"] 53 | logging.info(f"Filter genomes according to criteria:\n {filter_criteria}") 54 | 55 | 56 | Q = Q.query(filter_criteria) 57 | 58 | logging.info(f"Retain {Q.shape[0]} genomes from {n_all_bins}") 59 | 60 | 61 | ## GUNC 62 | 63 | if hasattr(snakemake.input, "gunc"): 64 | gunc = pd.read_table(snakemake.input.gunc, index_col=0) 65 | gunc = gunc.loc[Q.index] 66 | 67 | bad_genomes = gunc.index[gunc["pass.GUNC"] == False] 68 | logging.info(f"{len(bad_genomes)} genomes do not pass GUNC filtering") 69 | 70 | Q.drop(bad_genomes, inplace=True) 71 | else: 72 | logging.info("No GUNC filtering applied") 73 | 74 | 75 | if Q.shape[0] == 0: 76 | logging.error( 77 | f"No bins passed filtering criteria! Bad luck! You might want to tweak the filtering criteria. 
Also check the {snakemake.input.quality}" 78 | ) 79 | exit(1) 80 | 81 | # output Q together with quality 82 | Q.to_csv(snakemake.output.info, sep="\t") 83 | 84 | 85 | # filter path genomes for skani 86 | 87 | F = pd.read_table(snakemake.input.paths, index_col=0).squeeze() 88 | 89 | F = F.loc[Q.index].iloc[:, 0] 90 | F.to_csv(snakemake.output.paths, index=False, header=False) 91 | -------------------------------------------------------------------------------- /workflow/scripts/gene2genome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os, sys 3 | import logging, traceback 4 | 5 | logging.basicConfig( 6 | filename=snakemake.log[0], 7 | level=logging.INFO, 8 | format="%(asctime)s %(message)s", 9 | datefmt="%Y-%m-%d %H:%M:%S", 10 | ) 11 | 12 | 13 | def handle_exception(exc_type, exc_value, exc_traceback): 14 | if issubclass(exc_type, KeyboardInterrupt): 15 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 16 | return 17 | 18 | logging.error( 19 | "".join( 20 | [ 21 | "Uncaught exception: ", 22 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 23 | ] 24 | ) 25 | ) 26 | 27 | 28 | # Install exception handler 29 | sys.excepthook = handle_exception 30 | 31 | #### Begining of script 32 | 33 | import pandas as pd 34 | from utils import gene_scripts 35 | 36 | # if MAGs are renamed I need to obtain the old contig names 37 | # otherwise not 38 | if snakemake.params.renamed_contigs: 39 | contigs2bins = pd.read_csv( 40 | snakemake.input.contigs2bins, index_col=0, sep="\t", header=None 41 | ) 42 | 43 | contigs2bins.columns = ["Bin"] 44 | old2newID = pd.read_csv(snakemake.input.old2newID, index_col=0, sep="\t").squeeze() 45 | 46 | contigs2genome = contigs2bins.join(old2newID, on="Bin").dropna().drop("Bin", axis=1) 47 | else: 48 | contigs2genome = pd.read_csv( 49 | snakemake.input.contigs2mags, index_col=0, squeeze=False, sep="\t", header=None 50 | ) 51 | contigs2genome.columns = ["MAG"] 52 | 53 | # load orf_info 54 | orf_info = pd.read_parquet(snakemake.input.orf_info) 55 | 56 | 57 | # recreate Contig name `Sample_ContigNr` and Gene names `Gene0004` 58 | orf_info["Contig"] = orf_info.Sample + "_" + orf_info.ContigNr.astype(str) 59 | orf_info["Gene"] = gene_scripts.geneNr_to_string(orf_info.GeneNr) 60 | 61 | # Join genomes on contig 62 | orf_info = orf_info.join(contigs2genome, on="Contig") 63 | 64 | # remove genes not on genomes 65 | orf_info = orf_info.dropna(axis=0) 66 | 67 | 68 | # count genes per genome in a matrix 69 | gene2genome = pd.to_numeric( 70 | orf_info.groupby(["Gene", "MAG"]).size(), downcast="unsigned" 71 | ).unstack(fill_value=0) 72 | 73 | # save as parquet 74 | gene2genome.reset_index().to_parquet(snakemake.output[0]) 75 | -------------------------------------------------------------------------------- /workflow/scripts/generate_orf_info.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | 4 | import sys, os 5 | import logging, traceback 6 | 7 | logging.basicConfig( 8 | filename=snakemake.log[0], 9 | level=logging.INFO, 10 | format="%(asctime)s %(message)s", 11 | datefmt="%Y-%m-%d %H:%M:%S", 12 | ) 13 | 14 | 15 | def handle_exception(exc_type, exc_value, exc_traceback): 16 | if issubclass(exc_type, KeyboardInterrupt): 17 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 18 | return 19 | 20 | logging.error( 21 | "".join( 22 | [ 23 | "Uncaught exception: ", 24 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 25 | ] 26 | ) 27 | ) 28 | 29 | 30 | # Install exception handler 31 | sys.excepthook = handle_exception 32 | 33 | 34 | ## Start 35 | 36 | import pandas as pd 37 | import numpy as np 38 | 39 | from utils import gene_scripts 40 | 41 | # CLuterID GeneID empty third column 42 | orf2gene = pd.read_csv( 43 | snakemake.input.cluster_attribution, header=None, sep="\t", usecols=[0, 1] 44 | ) 45 | 46 | orf2gene.columns = ["Representative", "ORF"] 47 | 48 | # split orf names in sample, contig_nr, and orf_nr 49 | orf_info = gene_scripts.split_orf_to_index(orf2gene.ORF) 50 | 51 | # rename representative 52 | 53 | representative_names = orf2gene.Representative.unique() 54 | 55 | map_names = pd.Series( 56 | index=representative_names, 57 | data=np.arange(1, len(representative_names) + 1, dtype=np.uint), 58 | ) 59 | 60 | 61 | orf_info["GeneNr"] = orf2gene.Representative.map(map_names) 62 | 63 | 64 | orf_info.to_parquet(snakemake.output.cluster_attribution) 65 | 66 | 67 | # Save name of representatives 68 | map_names.index.name = "Representative" 69 | map_names.name = "GeneNr" 70 | map_names.to_csv(snakemake.output.rep2genenr, sep="\t") 71 | -------------------------------------------------------------------------------- /workflow/scripts/get_fasta_of_bins.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | 4 | import sys, os 5 | import logging, traceback 6 | 7 | logging.basicConfig( 8 | filename=snakemake.log[0], 9 | level=logging.DEBUG, 10 | format="%(asctime)s %(message)s", 11 | datefmt="%Y-%m-%d %H:%M:%S", 12 | ) 13 | 14 | 15 | def handle_exception(exc_type, exc_value, exc_traceback): 16 | if issubclass(exc_type, KeyboardInterrupt): 17 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 18 | return 19 | 20 | logging.error( 21 | "".join( 22 | [ 23 | "Uncaught exception: ", 24 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 25 | ] 26 | ) 27 | ) 28 | 29 | 30 | # Install exception handler 31 | sys.excepthook = handle_exception 32 | 33 | 34 | # start of script 35 | import argparse 36 | import os, sys 37 | import shutil 38 | import warnings 39 | 40 | import pandas as pd 41 | from Bio import SeqIO 42 | 43 | 44 | def get_fasta_of_bins(cluster_attribution, contigs_file, out_folder): 45 | """ 46 | Creates individual fasta files for each bin using the contigs fasta and the cluster attribution. 
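Note: an existing out_folder is deleted and recreated before the bin fastas are written.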
47 | 48 | input: 49 | - cluster attribution file: tab seperated file of "contig_fasta_header bin" 50 | - contigs: fasta file of contigs 51 | - out_prefix: output_prefix for bin fastas {out_folder}/{binid}.fasta 52 | """ 53 | # create outdir 54 | if os.path.exists(out_folder): 55 | shutil.rmtree(out_folder) 56 | os.makedirs(out_folder) 57 | 58 | CA = pd.read_csv(cluster_attribution, header=None, sep="\t", dtype=str) 59 | 60 | assert CA.shape[1] == 2, "File should have only two columns " + cluster_attribution 61 | 62 | CA.columns = ["Contig", "Bin"] 63 | 64 | # # assert that Contig is unique 65 | # assert CA.Contig.is_unique, ( 66 | # f"First column of file {cluster_attribution} should be contigs, hence unique" 67 | # f"I got\n{CA.head()}" 68 | # ) 69 | 70 | logging.info(f"index fasta file {contigs_file} for fast access") 71 | contig_fasta_dict = SeqIO.index(str(contigs_file), "fasta") 72 | 73 | assert len(contig_fasta_dict) > 0, "No contigs in your fasta" 74 | 75 | unique_bins = CA.Bin.unique() 76 | 77 | assert len(unique_bins) >= 1, "No bins found" 78 | 79 | for binid in unique_bins: 80 | bin_contig_names = CA.loc[CA.Bin == binid, "Contig"].tolist() 81 | out_file = os.path.join(out_folder, f"{binid}.fasta") 82 | 83 | assert ( 84 | len(bin_contig_names) >= 1 85 | ), f"No contigs found for bin {binid} in {cluster_attribution}" 86 | 87 | if len(bin_contig_names) == 1: 88 | warnings.warn(f"single contig bin Bin : {binid} {bin_contig_names}") 89 | 90 | logging.debug(f"Found {len(bin_contig_names)} contigs {bin_contig_names}") 91 | 92 | fasta_contigs = [contig_fasta_dict[c] for c in bin_contig_names] 93 | SeqIO.write(fasta_contigs, out_file, "fasta") 94 | 95 | 96 | if __name__ == "__main__": 97 | if "snakemake" not in globals(): 98 | p = argparse.ArgumentParser() 99 | p.add_argument("--cluster-attribution") 100 | p.add_argument("--contigs") 101 | p.add_argument("--out-folder") 102 | args = vars(p.parse_args()) 103 | get_fasta_of_bins(**args) 104 | else: 105 | get_fasta_of_bins( 106 | snakemake.input.cluster_attribution, 107 | snakemake.input.contigs, 108 | snakemake.output[0], 109 | ) 110 | -------------------------------------------------------------------------------- /workflow/scripts/get_read_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os, sys 3 | import logging, traceback 4 | 5 | 6 | logging.basicConfig( 7 | filename=snakemake.log[0], 8 | level=logging.INFO, 9 | format="%(asctime)s %(message)s", 10 | datefmt="%Y-%m-%d %H:%M:%S", 11 | ) 12 | 13 | 14 | def handle_exception(exc_type, exc_value, exc_traceback): 15 | if issubclass(exc_type, KeyboardInterrupt): 16 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 17 | return 18 | 19 | logging.error( 20 | "".join( 21 | [ 22 | "Uncaught exception: ", 23 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 24 | ] 25 | ) 26 | ) 27 | 28 | 29 | # Install exception handler 30 | sys.excepthook = handle_exception 31 | 32 | 33 | # begining of script 34 | 35 | import datetime 36 | import shutil 37 | import os 38 | 39 | 40 | timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%X") 41 | 42 | 43 | def get_read_stats(fraction, params_in): 44 | "get read stats by running reformat.sh" 45 | 46 | from snakemake.shell import shell 47 | 48 | subfolder = os.path.join(snakemake.params.folder, fraction) 49 | tmp_file = os.path.join(subfolder, "read_stats.tmp") 50 | shell( 51 | f" mkdir -p {subfolder} 2>> {snakemake.log[0]} " 52 | " ; " 53 | f" reformat.sh 
{params_in} " 54 | f" bhist={subfolder}/base_hist.txt " 55 | f" qhist={subfolder}/quality_by_pos.txt " 56 | f" lhist={subfolder}/readlength.txt " 57 | f" gchist={subfolder}/gc_hist.txt " 58 | " gcbins=auto " 59 | f" bqhist={subfolder}/boxplot_quality.txt " 60 | f" threads={snakemake.threads} " 61 | " overwrite=true " 62 | f" -Xmx{snakemake.resources.java_mem}G " 63 | f" 2>&1 | tee -a {snakemake.log[0]} {tmp_file} >/dev/null " 64 | ) 65 | content = open(tmp_file).read() 66 | pos = content.find("Input:") 67 | if pos == -1: 68 | raise Exception("Didn't find read number in file:\n\n" + content) 69 | else: 70 | content[pos:].split()[1:4] 71 | # Input: 123 reads 1234 bases 72 | n_reads, _, n_bases = content[pos:].split()[1:4] 73 | 74 | os.remove(tmp_file) 75 | return int(n_reads), int(n_bases) 76 | 77 | 78 | if len(snakemake.input) >= 2: 79 | n_reads_pe, n_bases_pe = get_read_stats( 80 | "pe", "in1={0} in2={1}".format(*snakemake.input) 81 | ) 82 | 83 | n_reads_pe = n_reads_pe / 2 84 | 85 | headers = [ 86 | "Sample", 87 | "Step", 88 | "Total_Reads", 89 | "Total_Bases", 90 | "Reads_pe", 91 | "Bases_pe", 92 | "Reads_se", 93 | "Bases_se", 94 | "Timestamp", 95 | ] 96 | 97 | if os.path.exists(snakemake.params.single_end_file): 98 | n_reads_se, n_bases_se = get_read_stats( 99 | "se", "in=" + snakemake.params.single_end_file 100 | ) 101 | else: 102 | n_reads_se, n_bases_se = 0, 0 103 | 104 | values = [ 105 | n_reads_pe + n_reads_se, 106 | n_bases_pe + n_bases_se, 107 | n_reads_pe, 108 | n_bases_pe, 109 | n_reads_se, 110 | n_bases_se, 111 | ] 112 | else: 113 | headers = [ 114 | "Sample", 115 | "Step", 116 | "Total_Reads", 117 | "Total_Bases", 118 | "Reads", 119 | "Bases", 120 | "Timestamp", 121 | ] 122 | values = 2 * get_read_stats("", "in=" + snakemake.input[0]) 123 | 124 | with open(snakemake.output.read_counts, "w") as f: 125 | f.write("\t".join(headers) + "\n") 126 | f.write( 127 | "\t".join( 128 | [snakemake.wildcards.sample, snakemake.wildcards.step] 129 | + [str(v) for v in values] 130 | + [timestamp] 131 | ) 132 | + "\n" 133 | ) 134 | 135 | shutil.make_archive(snakemake.params.folder, "zip", snakemake.params.folder) 136 | shutil.rmtree(snakemake.params.folder) 137 | -------------------------------------------------------------------------------- /workflow/scripts/parse_semibin.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | 12 | def handle_exception(exc_type, exc_value, exc_traceback): 13 | if issubclass(exc_type, KeyboardInterrupt): 14 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 15 | return 16 | 17 | logging.error( 18 | "".join( 19 | [ 20 | "Uncaught exception: ", 21 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 22 | ] 23 | ) 24 | ) 25 | 26 | 27 | # Install exception handler 28 | sys.excepthook = handle_exception 29 | 30 | from utils.fasta import parse_fasta_headers 31 | from utils.utils import gen_names_for_range 32 | from glob import glob 33 | import pandas as pd 34 | 35 | 36 | fasta_files = glob(f"{snakemake.input[0]}/*{snakemake.params.extension}") 37 | 38 | if len(fasta_files) > 0: 39 | Bin_names = gen_names_for_range( 40 | N=len(fasta_files), prefix=f"{snakemake.wildcards.sample}_SemiBin_" 41 | ) 42 | 43 | mappings = [] 44 | 45 | for bin_name, fasta in zip(Bin_names, fasta_files): 46 | contigs = 
parse_fasta_headers(fasta) 47 | 48 | mappings.append(pd.Series(data=bin_name, index=contigs)) 49 | 50 | pd.concat(mappings, axis=0).to_csv( 51 | snakemake.output[0], sep="\t", header=False, index=True 52 | ) 53 | 54 | else: 55 | logging.warning( 56 | f"No bins found in {snakemake.input[0]} add longest contig as bin to make atlas continue." 57 | ) 58 | 59 | with open(snakemake.output[0], "w") as outf: 60 | outf.write("{sample}_1\t{sample}_SemiBin_1\n".format(**snakemake.wildcards)) 61 | -------------------------------------------------------------------------------- /workflow/scripts/parse_vamb.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging, traceback 3 | 4 | logging.basicConfig( 5 | filename=snakemake.log[0], 6 | level=logging.INFO, 7 | format="%(asctime)s %(message)s", 8 | datefmt="%Y-%m-%d %H:%M:%S", 9 | ) 10 | 11 | 12 | def handle_exception(exc_type, exc_value, exc_traceback): 13 | if issubclass(exc_type, KeyboardInterrupt): 14 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 15 | return 16 | 17 | logging.error( 18 | "".join( 19 | [ 20 | "Uncaught exception: ", 21 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 22 | ] 23 | ) 24 | ) 25 | 26 | 27 | # Install exception handler 28 | sys.excepthook = handle_exception 29 | 30 | 31 | import pandas as pd 32 | from pathlib import Path 33 | from utils.utils import gen_names_for_range 34 | from utils.fasta import parse_fasta_headers 35 | 36 | 37 | fasta_extension = snakemake.params.fasta_extension 38 | separator = snakemake.params.separator 39 | 40 | # path with {sample} to replace 41 | cluster_output_path = snakemake.params.output_path 42 | 43 | 44 | # cluster.tsv.gz file for all samples of all bingroups 45 | output_culsters = snakemake.output.renamed_clusters 46 | 47 | 48 | all_clusters = [] 49 | 50 | 51 | for i in range(len(list(snakemake.input))): 52 | vamb_folder = Path(snakemake.input[i]) 53 | 54 | bingroup = snakemake.params.bingroups[i] 55 | 56 | logging.info(f"Parse vamb output for bingroup {bingroup}") 57 | 58 | # path to the bins folder 59 | bin_dir = vamb_folder / "bins" 60 | vamb_cluster_file = vamb_folder / "clusters.tsv" 61 | 62 | # Get a list of binds that are big enough. Not all bins in the vamb_cluster_file, pass the size filter 63 | big_bins = [] 64 | 65 | for file in os.listdir(bin_dir): 66 | bin_name, extension = os.path.splitext(file) 67 | 68 | logging.debug(f"Found file {bin_name} with extension {extension}") 69 | 70 | if extension == fasta_extension: 71 | big_bins.append(bin_name) 72 | 73 | logging.info( 74 | f"Found {len(big_bins)} bins created by Vamb (above size limit)\n" 75 | f"E.g. {big_bins[:5]}" 76 | ) 77 | 78 | logging.info(f"Load vamb cluster file {vamb_cluster_file}") 79 | clusters_contigs = pd.read_table(vamb_cluster_file, header=None) 80 | clusters_contigs.columns = ["OriginalName", "Contig"] 81 | 82 | # split contigs by separator. 
This is mainly done for compatibility with SemiBin 83 | clusters = clusters_contigs.Contig.str.rsplit(separator, n=1, expand=True) 84 | clusters.columns = ["Sample", "Contig"] 85 | 86 | # get number of BinID given by vamb, prefix with bingroup 87 | clusters["BinId"] = ( 88 | bingroup 89 | + clusters_contigs.OriginalName.str.rsplit(separator, n=1, expand=True)[1] 90 | ) 91 | 92 | # Add information if the bin is large enough 93 | clusters["OriginalName"] = clusters_contigs.OriginalName 94 | clusters["Large_enough"] = clusters.OriginalName.isin(big_bins) 95 | 96 | # Add information about the bingroup 97 | clusters["BinGroup"] = bingroup 98 | 99 | all_clusters.append(clusters) 100 | 101 | del clusters_contigs 102 | 103 | 104 | logging.info(f"Concatenate clusters of all bingroups") 105 | clusters = pd.concat(all_clusters, axis=0) 106 | 107 | 108 | n_bins = ( 109 | clusters.query("Large_enough").groupby(["BinGroup", "Sample"])["BinId"].nunique() 110 | ) 111 | logging.info( 112 | f"Number of bins per sample and bingroup passing the size filter:\n{n_bins}" 113 | ) 114 | 115 | 116 | clusters["SampleBin"] = clusters.Sample + "_vamb_" + clusters.BinId 117 | clusters.loc[~clusters.Large_enough, "SampleBin"] = "" 118 | 119 | 120 | logging.info(f"Write reformated table to {output_culsters}") 121 | clusters.to_csv(output_culsters, sep="\t", index=False) 122 | 123 | # filter for following 124 | clusters = clusters.query("Large_enough") 125 | 126 | logging.info(f"Write cluster_attribution for samples") 127 | for sample, cl in clusters.groupby("Sample"): 128 | sample_output_path = cluster_output_path.format(sample=sample) 129 | 130 | logging.debug(f"Write file {sample_output_path}") 131 | cl[["Contig", "SampleBin"]].to_csv( 132 | sample_output_path, sep="\t", index=False, header=False 133 | ) 134 | 135 | 136 | samples_without_bins = set(snakemake.params.samples).difference(set(clusters.Sample)) 137 | 138 | if len(samples_without_bins) > 0: 139 | logging.warning( 140 | "The following samples didn't yield bins, I add longest contig to make the pipeline continue:\n" 141 | + "\n".join(samples_without_bins) 142 | ) 143 | 144 | for sample in samples_without_bins: 145 | sample_output_path = cluster_output_path.format(sample=sample) 146 | with open(sample_output_path, "w") as fout: 147 | fout.write(f"{sample}_1\t{sample}_vamb_1\n") 148 | -------------------------------------------------------------------------------- /workflow/scripts/rename_assembly.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | 4 | import sys, os 5 | import logging, traceback 6 | 7 | logging.basicConfig( 8 | filename=snakemake.log[0], 9 | level=logging.INFO, 10 | format="%(asctime)s %(message)s", 11 | datefmt="%Y-%m-%d %H:%M:%S", 12 | ) 13 | 14 | 15 | def handle_exception(exc_type, exc_value, exc_traceback): 16 | if issubclass(exc_type, KeyboardInterrupt): 17 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 18 | return 19 | 20 | logging.error( 21 | "".join( 22 | [ 23 | "Uncaught exception: ", 24 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 25 | ] 26 | ) 27 | ) 28 | 29 | 30 | # Install exception handler 31 | sys.excepthook = handle_exception 32 | 33 | 34 | from Bio import SeqIO 35 | 36 | # Open the snakemake.output FASTA file and mapping table file for writing 37 | with open(snakemake.output.fasta, "w") as output_handle, open( 38 | snakemake.output.mapping_table, "w" 39 | ) as mapping_table_handle: 40 | i = 1 41 | 42 | for record in SeqIO.parse(snakemake.input[0], "fasta"): 43 | if len(record) < snakemake.params.minlength: 44 | break 45 | 46 | old_name = record.id 47 | new_name = f"{snakemake.wildcards.sample}_{i}" 48 | record.id = new_name 49 | record.description = "" 50 | 51 | SeqIO.write(record, output_handle, "fasta") 52 | 53 | mapping_table_handle.write(f"{new_name}\t{old_name}\n") 54 | 55 | i += 1 56 | -------------------------------------------------------------------------------- /workflow/scripts/rename_genecatalog.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | 4 | import sys, os 5 | import logging, traceback 6 | 7 | logging.basicConfig( 8 | filename=snakemake.log[0], 9 | level=logging.INFO, 10 | format="%(asctime)s %(message)s", 11 | datefmt="%Y-%m-%d %H:%M:%S", 12 | ) 13 | 14 | 15 | def handle_exception(exc_type, exc_value, exc_traceback): 16 | if issubclass(exc_type, KeyboardInterrupt): 17 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 18 | return 19 | 20 | logging.error( 21 | "".join( 22 | [ 23 | "Uncaught exception: ", 24 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 25 | ] 26 | ) 27 | ) 28 | 29 | 30 | # Install exception handler 31 | sys.excepthook = handle_exception 32 | 33 | 34 | import pandas as pd 35 | from utils.gene_scripts import geneNr_to_string 36 | 37 | 38 | # Start 39 | 40 | 41 | map_genenr = pd.read_csv(snakemake.input.rep2genenr, index_col=0, sep="\t").squeeze() 42 | 43 | 44 | # from gene Nr to gene name 45 | rep2gene = geneNr_to_string(map_genenr) 46 | 47 | logging.info( 48 | f"Collect and rename representative genes according to:\n {rep2gene.head()}" 49 | ) 50 | 51 | assert rep2gene.shape[0] > 0 52 | 53 | 54 | with open(snakemake.output[0], "w") as fout: 55 | with open(snakemake.input.fasta, "r") as fin: 56 | for line in fin: 57 | if line[0] == ">": 58 | gene_name = line[1:].strip().split(" ")[0] 59 | 60 | gene_id = rep2gene.loc[gene_name] 61 | 62 | fout.write(f">{gene_id} {gene_name}\n") 63 | 64 | else: 65 | fout.write(line) 66 | -------------------------------------------------------------------------------- /workflow/scripts/rename_genomes.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | import sys, os 4 | import logging, traceback 5 | 6 | logging.basicConfig( 7 | filename=snakemake.log[0], 8 | level=logging.INFO, 9 | format="%(asctime)s %(message)s", 10 | datefmt="%Y-%m-%d %H:%M:%S", 11 | ) 12 | 13 | 14 | def handle_exception(exc_type, exc_value, exc_traceback): 15 | if issubclass(exc_type, KeyboardInterrupt): 16 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 17 | return 18 | 19 | logging.error( 20 | "".join( 21 | [ 22 | "Uncaught exception: ", 23 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 24 | ] 25 | ) 26 | ) 27 | 28 | 29 | # Install exception handler 30 | sys.excepthook = handle_exception 31 | 32 | 33 | # start 34 | 35 | 36 | from atlas import utils 37 | import pandas as pd 38 | 39 | # Bin Filename Proteins 40 | paths = pd.read_csv(snakemake.input.paths, sep="\t", index_col=0).Filename 41 | # genome SpeciesNr Species Representative 42 | mapping = pd.read_csv( 43 | snakemake.input.mapping_file, 44 | sep="\t", 45 | index_col=0, 46 | ).squeeze() 47 | 48 | 49 | # standardize names of representatives 50 | # MAG001 .... 51 | representatives = mapping.Representative.unique() 52 | old2new_name = dict( 53 | zip(representatives, utils.gen_names_for_range(len(representatives), prefix="MAG")) 54 | ) 55 | mapping["MAG"] = mapping.Representative.map(old2new_name) 56 | 57 | 58 | # write cluster attribution 59 | mapping[["MAG", "Representative"]].to_csv( 60 | snakemake.output.mapfile_allbins2mag, sep="\t", header=True 61 | ) 62 | 63 | # write out old2new ids 64 | old2new = mapping.loc[representatives, "MAG"] 65 | old2new.index.name = "Representative" 66 | old2new.to_csv(snakemake.output.mapfile_old2mag, sep="\t", header=True) 67 | 68 | 69 | #### Write genomes and contig to genome mapping file 70 | output_dir = snakemake.output.dir 71 | mapfile_contigs = snakemake.output.mapfile_contigs 72 | rename_contigs = snakemake.params.rename_contigs 73 | 74 | 75 | os.makedirs(output_dir) 76 | 77 | with open(mapfile_contigs, "w") as out_contigs: 78 | for rep in representatives: 79 | fasta_in = paths.loc[rep] 80 | new_name = old2new.loc[rep] 81 | 82 | fasta_out = os.path.join(output_dir, f"{new_name}.fasta") 83 | 84 | # write names of contigs in mapping file 85 | with open(fasta_in) as ffi, open(fasta_out, "w") as ffo: 86 | Nseq = 0 87 | for line in ffi: 88 | # if header line 89 | if line[0] == ">": 90 | Nseq += 1 91 | 92 | if rename_contigs: 93 | new_header = f"{new_name}_{Nseq}" 94 | else: 95 | new_header = line[1:].strip().split()[0] 96 | 97 | # write to contig to mapping file 98 | out_contigs.write(f"{new_header}\t{new_name}\n") 99 | # write to fasta file 100 | ffo.write(f">{new_header}\n") 101 | else: 102 | ffo.write(line) 103 | 104 | 105 | # rename quality 106 | def rename_quality(quality_in, quality_out, old2new_name): 107 | Q = pd.read_csv(quality_in, index_col=0, sep="\t") 108 | 109 | Q = Q.loc[old2new_name.keys()].rename(index=old2new_name) 110 | 111 | Q.to_csv(quality_out, sep="\t") 112 | 113 | 114 | rename_quality( 115 | quality_in=snakemake.input.genome_info, 116 | quality_out=snakemake.output.genome_info, 117 | old2new_name=old2new_name, 118 | ) 119 | -------------------------------------------------------------------------------- /workflow/scripts/root_tree.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | import sys, os 3 | import logging, traceback 4 | 5 | logging.basicConfig( 6 | filename=snakemake.log[0], 7 | level=logging.INFO, 8 | format="%(asctime)s %(message)s", 9 | datefmt="%Y-%m-%d %H:%M:%S", 10 | ) 11 | 12 | 13 | def handle_exception(exc_type, exc_value, exc_traceback): 14 | if issubclass(exc_type, KeyboardInterrupt): 15 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 16 | return 17 | 18 | logging.error( 19 | "".join( 20 | [ 21 | "Uncaught exception: ", 22 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 23 | ] 24 | ) 25 | ) 26 | 27 | 28 | # Install exception handler 29 | sys.excepthook = handle_exception 30 | 31 | # start 32 | import ete3 33 | 34 | T = ete3.Tree(snakemake.input.tree, quoted_node_names=True, format=1) 35 | 36 | try: 37 | T.unroot() 38 | if len(T) > 2: 39 | T.set_outgroup(T.get_midpoint_outgroup()) 40 | 41 | except Exception as e: 42 | logging.error("Failed to root tree, keep unrooted. Reason was:\n\n" + str(e)) 43 | 44 | 45 | T.write(outfile=snakemake.output.tree) 46 | -------------------------------------------------------------------------------- /workflow/scripts/split_genecatalog.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | 4 | import sys, os 5 | import logging, traceback 6 | 7 | logging.basicConfig( 8 | filename=snakemake.log[0], 9 | level=logging.INFO, 10 | format="%(asctime)s %(message)s", 11 | datefmt="%Y-%m-%d %H:%M:%S", 12 | ) 13 | 14 | 15 | def handle_exception(exc_type, exc_value, exc_traceback): 16 | if issubclass(exc_type, KeyboardInterrupt): 17 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 18 | return 19 | 20 | logging.error( 21 | "".join( 22 | [ 23 | "Uncaught exception: ", 24 | *traceback.format_exception(exc_type, exc_value, exc_traceback), 25 | ] 26 | ) 27 | ) 28 | 29 | 30 | # Install exception handler 31 | sys.excepthook = handle_exception 32 | 33 | ## start 34 | 35 | 36 | from utils import fasta 37 | 38 | fasta.split( 39 | snakemake.input[0], 40 | snakemake.params.subset_size, 41 | snakemake.output[0], 42 | simplify_headers=True, 43 | ) 44 | -------------------------------------------------------------------------------- /workflow/scripts/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from . import parsers_bbmap, parsers 2 | from .utils import gen_names_for_range 3 | from . import taxonomy 4 | -------------------------------------------------------------------------------- /workflow/scripts/utils/fasta.py: -------------------------------------------------------------------------------- 1 | from Bio import SeqIO 2 | from numpy import ceil 3 | import os 4 | 5 | 6 | def _make_test_fasta(test_file="test_ABC.fasta"): 7 | with open(test_file, "w") as f: 8 | for Number, Letter in enumerate("ATCG"): 9 | f.write(f">contig_{Number+1} description\n{Letter}\n") 10 | 11 | 12 | def count_Nseq(fasta_file): 13 | """ 14 | Counts number of sequences in a fasta file. 15 | >>> fasta_file='test_ABC.fasta' 16 | >>> _make_test_fasta(fasta_file) # makes fasta with a seq for each nucleotide 17 | >>> count_Nseq(fasta_file) 18 | 4 19 | >>> os.remove(fasta_file) 20 | 21 | """ 22 | i = 0 23 | with open(fasta_file) as f: 24 | for line in f: 25 | if line[0] == ">": 26 | i += 1 27 | return i 28 | 29 | 30 | def split(fasta_file, maxSubsetSize, out_dir, simplify_headers=True): 31 | """ 32 | Splits a fasta in subsets of size max maxSubsetSize. 
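Subsets are size-balanced: the effective subset size is ceil(N / ceil(N / maxSubsetSize)), so for example four sequences with maxSubsetSize=3 are written as two subsets of two (see the doctest below).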
33 | >>> fasta_file='test_ABC.fasta' 34 | >>> out_dir = 'test_outdit_doctest' 35 | >>> _make_test_fasta(fasta_file) # makes fasta with a seq for each nucleotide 36 | >>> split(fasta_file,3,out_dir,simplify_headers=True) 37 | >>> len(os.listdir(out_dir)) 38 | 2 39 | >>> count_Nseq('test_outdit_doctest/subset1.fasta') 40 | 2 41 | >>> count_Nseq('test_outdit_doctest/subset2.fasta') 42 | 2 43 | >>> split(fasta_file,3,out_dir,simplify_headers=True) 44 | Traceback (most recent call last): 45 | ... 46 | FileExistsError: [Errno 17] File exists: 'test_outdit_doctest' 47 | >>> import shutil; shutil.rmtree(out_dir) 48 | >>> os.remove(fasta_file) 49 | """ 50 | 51 | N = count_Nseq(fasta_file) 52 | 53 | SubsetSize = int(ceil(N / ceil(N / maxSubsetSize))) 54 | extension = os.path.splitext(fasta_file)[-1] 55 | 56 | os.makedirs(out_dir) 57 | 58 | i, subset_n = 0, 0 59 | fout = None 60 | for i, seq in enumerate(SeqIO.parse(fasta_file, "fasta")): 61 | if (i % SubsetSize) == 0: 62 | subset_n += 1 63 | if fout is not None: 64 | fout.close() 65 | 66 | fout = open(f"{out_dir}/subset{subset_n}{extension}", "w") 67 | 68 | if simplify_headers: 69 | seq.description = "" 70 | SeqIO.write(seq, fout, "fasta") 71 | 72 | fout.close() 73 | 74 | 75 | def parse_fasta_headers(fasta_file, simplify_header=True): 76 | """ 77 | returns list of fasta headers 78 | """ 79 | 80 | headers = [] 81 | 82 | with open(fasta_file) as f: 83 | for line in f: 84 | if line[0] == ">": 85 | header = line[1:].strip() 86 | 87 | if simplify_header: 88 | header = header.split()[0] 89 | 90 | headers.append(header) 91 | 92 | return headers 93 | 94 | 95 | def header2origin(fasta_file, out, simplify_header=True): 96 | """ 97 | Annotates a fasta file to it's filename: 98 | genome.fasta: 99 | >contig1 description 100 | ACTAC 101 | >contig2 description 102 | ACTAC 103 | ... 
104 | 105 | becomes: 106 | contig1 genome 107 | contig2 genome 108 | 109 | input is a fasta filename: 110 | out is a filename or a stream 111 | 112 | """ 113 | 114 | if type(out) == str: 115 | out_stream = open(out, "w") 116 | else: 117 | out_stream = out 118 | 119 | name = os.path.splitext(os.path.split(fasta_file)[-1])[0] 120 | 121 | # write names of contigs in mapping file 122 | with open(fasta_file) as f: 123 | for line in f: 124 | if line[0] == ">": 125 | header = line[1:].strip() 126 | if simplify_header: 127 | header = header.split()[0] 128 | out_stream.write(f"{header}\t{name}\n") 129 | out_stream.flush() 130 | 131 | 132 | if __name__ == "__main__": 133 | import doctest, shutil 134 | 135 | doctest.testmod() 136 | -------------------------------------------------------------------------------- /workflow/scripts/utils/gene_scripts.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def split_orf_to_index(orf_names): 6 | """Split the typical prodigal orf name `Sample_contigNr_OrfNR` into three column dataset and conversts the numbers to the smallest unsigned number.""" 7 | result = ( 8 | pd.Series(orf_names) 9 | .str.rsplit("_", n=2, expand=True) 10 | .rename(columns={0: "Sample", 1: "ContigNr", 2: "OrfNr"}) 11 | ) 12 | result = result.apply(pd.to_numeric, errors="ignore", downcast="unsigned") 13 | 14 | are_numeric = result.dtypes.map(lambda x: np.issubdtype(x, np.number)) 15 | assert all( 16 | are_numeric == np.array([False, True, True]) 17 | ), f"datatypes are not as expected {result.dtypes}" 18 | 19 | return result 20 | 21 | 22 | def geneNr_to_string(GeneNrs, Ngenes=None): 23 | """Convert the array of gene number to the corresponding string, e.g.: 5 -> Gene0005 24 | The leading zeros depends on the number of genes 25 | """ 26 | 27 | assert np.issubdtype(GeneNrs.dtype, np.number) 28 | 29 | if Ngenes is None: 30 | Ngenes = GeneNrs.max() 31 | 32 | n_leading_zeros = len(str(Ngenes)) 33 | 34 | number_format = f"Gene{{:0{n_leading_zeros}d}}" 35 | 36 | return GeneNrs.apply(number_format.format) 37 | -------------------------------------------------------------------------------- /workflow/scripts/utils/genome_stats.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | import pandas as pd 3 | import os, sys 4 | from .io import simplify_path, simply_open 5 | from itertools import groupby 6 | import numpy as np 7 | import gzip as gz 8 | import logging 9 | 10 | logger = logging.getLogger(__file__) 11 | 12 | 13 | def verify_dna(sequence, is_upper): 14 | if not is_upper: 15 | sequence = sequence.upper() 16 | letters_used = set(sequence) 17 | 18 | alphabet = set(list("ATCGN")) 19 | 20 | additional_letters = letters_used - alphabet 21 | 22 | if len(additional_letters) > 0: 23 | raise Exception( 24 | f"Sequence contains additional letters that are not DNA {additional_letters}" 25 | ) 26 | 27 | 28 | def get_stats_from_lengths(lengths): 29 | sorted_lengths = sorted(lengths, reverse=True) 30 | csum = np.cumsum(sorted_lengths) 31 | 32 | Total_length = int(sum(lengths)) 33 | N = len(lengths) 34 | 35 | n2 = int(Total_length / 2) 36 | 37 | # get index for cumsum >= N/2 38 | csumn2 = min(csum[csum >= n2]) 39 | ind = int(np.where(csum == csumn2)[0][0]) 40 | 41 | N50 = sorted_lengths[ind] 42 | 43 | return Total_length, N, N50 44 | 45 | 46 | def genome_stats(fasta_file, number_of_n_for_split=10): 47 | """Get genome stats from a fasta file. 
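Scaffolds are additionally split at stretches of number_of_n_for_split Ns to derive contig-level statistics.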
Returns a dict with: 48 | File, Length_scaffolds, N_scaffolds, N50, Length_contigs, N_contigs and Ambigious_bases 49 | """ 50 | 51 | try: 52 | name = simplify_path(fasta_file) 53 | 54 | scaffold_lengths = [] 55 | contig_lengths = [] 56 | ambigious_bases = 0 57 | 58 | with simply_open(fasta_file, "r") as fasta: 59 | ## parse each sequence by header: groupby(data, key) 60 | faiter = (x[1] for x in groupby(fasta, lambda line: line[0] == ">")) 61 | 62 | for record in faiter: 63 | # record contains the header line(s) 64 | ## join sequence lines 65 | sequence = "".join(s.strip() for s in faiter.__next__()) 66 | sequence = sequence.upper() 67 | 68 | verify_dna(sequence, is_upper=True) 69 | 70 | # count ambiguous bases 71 | ambigious_bases += sequence.count("N") 72 | 73 | # get set of scaffold lengths 74 | scaffold_lengths.append(len(sequence)) 75 | 76 | # split on Ns and get set of contig lengths. 77 | contig_lengths += [ 78 | len(contig) 79 | for contig in sequence.split("N" * number_of_n_for_split) 80 | ] 81 | 82 | Length_scaffolds, N_scaffolds, N50 = get_stats_from_lengths(scaffold_lengths) 83 | 84 | Length_contigs, N_contigs, _ = get_stats_from_lengths(contig_lengths) 85 | 86 | except Exception as e: 87 | logger.critical(f"Error in calculating stats of {fasta_file}") 88 | logger.critical(e) 89 | raise Exception(f"Error in calculating stats of {fasta_file}") from e 90 | 91 | return { 92 | "File": name, 93 | "Length_scaffolds": Length_scaffolds, 94 | "N_scaffolds": N_scaffolds, 95 | "N50": N50, 96 | "Length_contigs": Length_contigs, 97 | "N_contigs": N_contigs, 98 | "Ambigious_bases": ambigious_bases, 99 | } 100 | 101 | 102 | def get_many_genome_stats(filenames, output_filename, threads=1): 103 | """Small function to calculate total genome length and N50""" 104 | 105 | pool = Pool(threads) 106 | 107 | results = pool.map(genome_stats, filenames) 108 | Stats = pd.DataFrame(results).rename(columns={"Length_scaffolds": "Length"}) 109 | 110 | Stats.to_csv(output_filename, sep="\t", index=False) 111 | -------------------------------------------------------------------------------- /workflow/scripts/utils/io.py: -------------------------------------------------------------------------------- 1 | import gzip as gz 2 | 3 | import logging 4 | import os 5 | from pathlib import Path 6 | 7 | logger = logging.getLogger("io") 8 | 9 | 10 | def simplify_path(path, remove_gz=True): 11 | """Removes dir and extension from a filepath. 12 | If remove_gz is set, a trailing .gz extension is stripped as well. 13 | """ 14 | 15 | path = Path(path) 16 | 17 | name = path.stem 18 | ext = path.suffix 19 | 20 | if remove_gz & (ext == ".gz"): 21 | name = Path(name).stem 22 | 23 | return name 24 | 25 | 26 | def simply_open(filename, mode="r", *args, **kwargs): 27 | """open a file irrespective of whether it is gz compressed or not""" 28 | 29 | filename = Path(filename) 30 | 31 | if filename.suffix == ".gz": 32 | # To read file in textmode 33 | if mode in ["r", "a", "w", "x"]: 34 | mode += "t" 35 | 36 | return gz.open(filename, mode, *args, **kwargs) 37 | else: 38 | return open(filename, mode, *args, **kwargs) 39 | 40 | 41 | def cat_files(files, outfilename, gzip=False): 42 | """cat files in python 43 | gzip: compress outfile 44 | set to False when concatenating files that are already gzipped.
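Byte-wise concatenation of gzipped inputs yields a valid multi-member gzip stream, so already-compressed files can simply be appended as-is.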
45 | """ 46 | 47 | import shutil 48 | 49 | if gzip: 50 | import gzip as gz 51 | 52 | outhandle = gz.open 53 | else: 54 | outhandle = open 55 | 56 | with outhandle(outfilename, "wb") as f_out: 57 | for f in files: 58 | with open(f, "rb") as f_in: 59 | shutil.copyfileobj(f_in, f_out) 60 | 61 | 62 | def convert_percentages(df): 63 | """Convet all columns with strings and % at the end to percentages""" 64 | for col in df.columns: 65 | if df.dtypes[col] == "object": 66 | if df[col].iloc[0].endswith("%"): 67 | df.loc[:, col] = df[col].str.rstrip("%").astype("float") / 100.0 68 | 69 | 70 | def symlink_relative(files, input_dir, output_dir): 71 | """create symlink with and adjust for relative path""" 72 | 73 | input_dir_rel = os.path.relpath(input_dir, output_dir) 74 | 75 | for f in files: 76 | os.symlink(os.path.join(input_dir_rel, f), os.path.join(output_dir, f)) 77 | 78 | 79 | def _pandas_concat_in_memory( 80 | input_tables, 81 | output_table, 82 | sep, 83 | index_col, 84 | axis, 85 | read_arguments, 86 | save_arguments, 87 | concat_arguments, 88 | ): 89 | import pandas as pd 90 | 91 | Tables = [ 92 | pd.read_csv(file, index_col=index_col, sep=sep, **read_arguments) 93 | for file in input_tables 94 | ] 95 | 96 | out = pd.concat(Tables, axis=axis, **concat_arguments).sort_index() 97 | 98 | del Tables 99 | 100 | out.to_csv(output_table, sep=sep, **save_arguments) 101 | 102 | 103 | def _pandas_concat_disck_based( 104 | input_tables, 105 | output_table, 106 | sep, 107 | index_col, 108 | read_arguments, 109 | save_arguments, 110 | selected_headers=None, 111 | ): 112 | """combine different tables but one after the other in disk based""" 113 | 114 | import pandas as pd 115 | 116 | try: 117 | from tqdm import tqdm 118 | except ImportError: 119 | tqdm = tuple 120 | 121 | if selected_headers is not None: 122 | try: 123 | selected_headers = list(selected_headers) 124 | except Exception as e: 125 | raise Exception("selected_headers should be a list-like") from e 126 | 127 | else: 128 | # read all_headers 129 | selected_headers = set() 130 | for file in input_tables: 131 | headers_of_file = pd.read_csv( 132 | file, index_col=index_col, sep=sep, nrows=2, dtype=str, **read_arguments 133 | ) 134 | 135 | selected_headers.update(list(headers_of_file.columns)) 136 | 137 | selected_headers = list(selected_headers) 138 | logger.info(f"Inferred following list of headers {selected_headers}") 139 | 140 | # parse one file after another 141 | 142 | logger.info("Read an append table by table") 143 | for file in tqdm(input_tables): 144 | # read full table 145 | table = pd.read_csv( 146 | file, index_col=index_col, sep=sep, dtype=str, **read_arguments 147 | ) 148 | # set to common header 149 | table = table.reindex(selected_headers, axis=1) 150 | 151 | if file == input_tables[0]: 152 | mode = "w" 153 | print_header = True 154 | else: 155 | mode = "a" 156 | print_header = False 157 | 158 | table.to_csv( 159 | output_table, sep=sep, mode=mode, header=print_header, **save_arguments 160 | ) 161 | 162 | 163 | def pandas_concat( 164 | input_tables, 165 | output_table, 166 | sep="\t", 167 | index_col=0, 168 | axis=0, 169 | read_arguments=None, 170 | save_arguments=None, 171 | concat_arguments=None, 172 | disk_based=False, 173 | selected_headers=None, # only used in disk based, not passed to usecols 174 | ): 175 | """ 176 | Uses pandas to read,concatenate and save tables using pandas.concat 177 | """ 178 | 179 | if read_arguments is None: 180 | read_arguments = {} 181 | if save_arguments is None: 182 | save_arguments = {} 183 | 
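# Note: a single table path is accepted as well as a list of paths. The arguments
# shared by both strategies are bundled below; disk_based appends the tables one
# after another on disk (axis=0 only, without concat_arguments), while the default
# path loads everything and concatenates in memory with pandas.concat.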
184 | if type(input_tables) == str: 185 | input_tables = [input_tables] 186 | 187 | common_arrguments = dict( 188 | input_tables=input_tables, 189 | output_table=output_table, 190 | sep=sep, 191 | index_col=index_col, 192 | read_arguments=read_arguments, 193 | save_arguments=save_arguments, 194 | ) 195 | 196 | if disk_based: 197 | if concat_arguments is not None: 198 | raise Exception( 199 | f"cannot hanndle concat arguments by disck based append, got {concat_arguments}" 200 | ) 201 | 202 | assert axis == 0, "Can only append on axis= 0" 203 | 204 | _pandas_concat_disck_based( 205 | selected_headers=selected_headers, **common_arrguments 206 | ) 207 | 208 | else: 209 | # in memory concat 210 | if concat_arguments is None: 211 | concat_arguments = {} 212 | 213 | if selected_headers is not None: 214 | raise Exception( 215 | "argument 'selected_headers' is not used in 'in memory' concat. Use read_arguments=dict(usecols=selected_headers) instead " 216 | ) 217 | 218 | _pandas_concat_in_memory( 219 | axis=axis, concat_arguments=concat_arguments, **common_arrguments 220 | ) 221 | -------------------------------------------------------------------------------- /workflow/scripts/utils/parsers.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from warnings import warn 3 | 4 | 5 | def read_checkm_output(taxonomy_table, completness_table): 6 | c_df = pd.read_csv(completness_table, index_col=0, sep="\t")[ 7 | ["Completeness", "Contamination", "Strain heterogeneity"] 8 | ] 9 | t_df = pd.read_csv(taxonomy_table, index_col=0, sep="\t")[ 10 | [ 11 | "# unique markers (of 43)", 12 | "# multi-copy", 13 | "Insertion branch UID", 14 | "Taxonomy (contained)", 15 | "Taxonomy (sister lineage)", 16 | "GC", 17 | "Genome size (Mbp)", 18 | "Gene count", 19 | "Coding density", 20 | ] 21 | ] 22 | df = pd.concat([c_df, t_df], axis=1) 23 | return df 24 | 25 | 26 | def read_busco_output( 27 | completness_table, quality_score_formula="Completeness - 5*Contamination" 28 | ): 29 | df = pd.read_table(completness_table, index_col=0) 30 | 31 | df.eval( 32 | "Completeness = Complete ", 33 | inplace=True, 34 | ) 35 | df.eval("Contamination = Duplicated", inplace=True) 36 | df.eval( 37 | "Quality_score = " + quality_score_formula, 38 | inplace=True, 39 | ) 40 | 41 | # remove extension from filename 42 | df.index = df.index.str.replace(".fasta", "", regex=False) 43 | df.index.name = "Bin Id" 44 | 45 | return df 46 | 47 | 48 | def read_checkm2_output( 49 | completness_table, quality_score_formula="Completeness - 5*Contamination" 50 | ): 51 | df = pd.read_table(completness_table, index_col=0) 52 | 53 | if not "Completeness" in df.columns: 54 | # create empty column 55 | df.insert(0, "Completeness", 0.0) 56 | 57 | # add completeness depending on selected model 58 | specific = df.Completeness_Model_Used.str.contains("Specific Model") 59 | df.loc[specific, "Completeness"] = df.loc[specific, "Completeness_Specific"] 60 | df.loc[~specific, "Completeness"] = df.loc[~specific, "Completeness_General"] 61 | 62 | df.eval( 63 | "Quality_score = " + quality_score_formula, 64 | inplace=True, 65 | ) 66 | 67 | df.index.name = "Bin Id" 68 | 69 | return df 70 | 71 | 72 | def load_quality(quality_file): 73 | Q = pd.read_csv(quality_file, index_col=0, sep="\t") 74 | 75 | # remove extension if present 76 | if Q.index.str.contains(".fa").all(): 77 | warn("Found fasta extension in index. 
I remove them") 78 | Q.index = Q.index.str.split(".fa", expand=True).to_frame()[0] 79 | 80 | # Q.columns = Q.columns.str.lower() 81 | 82 | necessary_columns = ["Completeness", "Contamination"] 83 | 84 | # rename lower and uppercase to necessary_columns 85 | Q = Q.rename( 86 | columns={ 87 | fun(s[0]) + s[1:]: s 88 | for s in necessary_columns 89 | for fun in (str.lower, str.upper) 90 | } 91 | ) 92 | 93 | if Q.columns.isin(necessary_columns).sum() != len(necessary_columns): 94 | raise Exception( 95 | f"{necessary_columns} should be in the quality table, only got {Q.columns}" 96 | ) 97 | 98 | assert not Q.index.duplicated().any(), f"duplicated indexes in {quality_file}" 99 | 100 | return Q 101 | -------------------------------------------------------------------------------- /workflow/scripts/utils/taxonomy.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import warnings 4 | 5 | TAXONMIC_LEVELS = ["Domain", "phylum", "class", "order", "family", "genus", "species"] 6 | 7 | 8 | def tax2table(Taxonomy_Series, split_character=";", remove_prefix=False): 9 | """ 10 | Transforms (green_genes) taxonomy to a table 11 | Expect the following input format: 12 | d__Bacteria;p__Bacteroidota;c__Bacteroidia;f__ 13 | Replaces empty values and can remove prefix 'c__' 14 | """ 15 | 16 | # drop missing values 17 | if Taxonomy_Series.isnull().any(): 18 | warnings.warn( 19 | "Some samples have no taxonomy asigned. Samples:\n" 20 | + ", ".join(Taxonomy_Series.index[Taxonomy_Series.isnull()].astype(str)) 21 | ) 22 | 23 | Tax = Taxonomy_Series.dropna().astype(str).str.split(split_character, expand=True) 24 | # Add headers as long as we have columns 25 | Tax.columns = TAXONMIC_LEVELS[: len(Tax.columns)] 26 | 27 | if remove_prefix: 28 | Tax = Tax.applymap(lambda s: s[3:], na_action="ignore").replace("", np.nan) 29 | else: 30 | Tax[Tax.applymap(len, na_action="ignore") == 3] = np.nan 31 | 32 | # add missing values again 33 | 34 | Tax = Tax.reindex(Taxonomy_Series.index) 35 | 36 | return Tax 37 | 38 | 39 | def load_checkm_tax(taxonomy_file, remove_prefix=False): 40 | D = pd.read_table(taxonomy_file, index_col=0) 41 | 42 | checkmTax = tax2table(D["Taxonomy (contained)"], remove_prefix=remove_prefix) 43 | 44 | return checkmTax 45 | 46 | 47 | def load_gtdb_tax(taxonomy_file, remove_prefix=False): 48 | D = pd.read_table(taxonomy_file, index_col=0) 49 | 50 | Tax = tax2table(D["classification"], remove_prefix=remove_prefix) 51 | 52 | return Tax 53 | -------------------------------------------------------------------------------- /workflow/scripts/utils/tree.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["QT_QPA_PLATFORM"] = "offscreen" # because we might not have a X server 4 | 5 | import ete3 6 | import pandas as pd 7 | import warnings 8 | 9 | 10 | def load_tree(netwik_file): 11 | return ete3.Tree(netwik_file, quoted_node_names=True, format=1) 12 | 13 | 14 | def root_tree_by_phyla(T, phyla): 15 | """Root the tree next to the phylum that is as far apart as possible from the other phyla""" 16 | phylum_LCA = {} 17 | 18 | for p in phyla.unique(): 19 | phylum_LCA[p] = T.get_common_ancestor(*tuple(phyla.index[phyla == p].values)) 20 | 21 | Dist = pd.DataFrame() 22 | for p1, lca1 in phylum_LCA.items(): 23 | for p2, lca2 in phylum_LCA.items(): 24 | Dist.loc[p1, p2] = T.get_distance(lca1, lca2) 25 | 26 | furthest_phylum = Dist.mean().idxmax() 27 | outgroup = 
phylum_LCA[furthest_phylum] 28 | 29 | if not outgroup == T: 30 | T.set_outgroup(outgroup) 31 | 32 | 33 | def layout_black_circles(node): 34 | # If node is a leaf 35 | if node.is_leaf(): 36 | node.img_style["fgcolor"] = "k" 37 | else: 38 | node.img_style["size"] = 0 39 | 40 | 41 | def render_tree(T, out): 42 | from ete3 import TreeStyle 43 | 44 | ts = TreeStyle() 45 | ts.show_leaf_name = False 46 | ts.mode = "c" 47 | ts.scale = 200 48 | ts.show_scale = False 49 | 50 | T.render(out, tree_style=ts, layout=layout_black_circles) 51 | -------------------------------------------------------------------------------- /workflow/scripts/utils/utils.py: -------------------------------------------------------------------------------- 1 | def gen_names_for_range(N, prefix="", start=1): 2 | """generates a range of IDs with leading zeros so that they sort correctly""" 3 | n_leading_zeros = len(str(N)) 4 | format_int = prefix + "{:0" + str(n_leading_zeros) + "d}" 5 | return [format_int.format(i) for i in range(start, N + start)] 6 | --------------------------------------------------------------------------------
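The scripts above lean on a handful of helpers in workflow/scripts/utils. The following minimal sketch, which is not part of the repository, illustrates how three of them behave on invented toy data; it assumes it is run from the workflow/scripts/ directory (so the utils package is importable) and that pandas and numpy are installed.

# Illustrative sketch only; the ORF names below are made up and the imports
# assume the current working directory is workflow/scripts/.
import pandas as pd

from utils.gene_scripts import geneNr_to_string, split_orf_to_index
from utils.utils import gen_names_for_range

# Prodigal-style ORF names follow the pattern Sample_ContigNr_OrfNr
orf_names = ["S1_1_1", "S1_1_2", "S1_2_1", "S2_1_1"]

# Split into a three-column table (Sample, ContigNr, OrfNr); the numeric columns are downcast
orf_info = split_orf_to_index(orf_names)
print(orf_info)

# Gene numbers are zero-padded according to the largest number in the series
print(geneNr_to_string(pd.Series([5, 42])).tolist())  # ['Gene05', 'Gene42']

# Sortable IDs with leading zeros, as used for MAG and bin names
print(gen_names_for_range(12, prefix="MAG"))  # ['MAG01', 'MAG02', ..., 'MAG12']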