Quality score is calculated as: Completeness - 5 x Contamination.
"
73 | )
74 | add_stats("Quality score >50 ", df.query("Quality_score>50"))
75 | add_stats("Good quality", df.query("Completeness>90 & Contamination <5"))
76 | add_stats("Quality score >90 ", df.query("Quality_score>90"))
77 |
78 | div["table"] = st.to_html()
79 |
80 | logging.info(df.describe())
81 |
82 | # Bin Id Completeness completeness_general Contamination completeness_specific completeness_model_used translation_table_used coding_density contig_n50 average_gene_length genome_size gc_content total_coding_sequences additional_notes quality_score sample Ambigious_bases Length_contigs Length_scaffolds N50 N_contigs N_scaffolds logN50
83 | hover_data = [
84 | "Completeness_Model_Used",
85 | "Coding_Density",
86 | "N50",
87 | "GC_Content",
88 | ]
89 | size_name = "Genome_Size"
90 |
91 | lineage_name = "Species"
92 |
93 | # 2D plot
94 |
95 | logging.info("make 2d plot")
96 | fig = px.scatter(
97 | data_frame=df,
98 | y="Completeness",
99 | x="Contamination",
100 | color=lineage_name,
101 | size=size_name,
102 | hover_data=hover_data,
103 | hover_name="Bin Id",
104 | )
105 | fig.update_yaxes(range=(50, 102))
106 | fig.update_xaxes(range=(-0.2, 10.1))
107 | div["2D"] = fig.to_html(**HTML_PARAMS)
108 |
109 | # 2D plot
110 |
111 | logging.info("make 2d plot species")
112 | fig = px.scatter(
113 | data_frame=df.loc[df.Representative.unique()],
114 | y="Completeness",
115 | x="Contamination",
116 | color=lineage_name,
117 | size=size_name,
118 | hover_data=hover_data,
119 | hover_name="Bin Id",
120 | )
121 | fig.update_yaxes(range=(50, 102))
122 | fig.update_xaxes(range=(-0.2, 10.1))
123 | div["2Dsp"] = fig.to_html(**HTML_PARAMS)
124 |
125 | ## By sample
126 | logging.info("plot by sample")
127 | fig = px.strip(
128 | data_frame=df,
129 | y="Quality_score",
130 | x="Sample",
131 | color=lineage_name,
132 | hover_data=hover_data,
133 | hover_name="Bin Id",
134 | )
135 | fig.update_yaxes(range=(50, 102))
136 | div["bySample"] = fig.to_html(**HTML_PARAMS)
137 |
138 | # # By species
139 | # logging.info("plot by species")
140 | # fig = px.strip(
141 | # data_frame=df,
142 | # y="Quality_score",
143 | # x=lineage_name,
144 | # hover_data=hover_data,
145 | # hover_name="Bin Id",
146 | # )
147 | # fig.update_yaxes(range=(50, 102))
148 | # div["byPhylum"] = fig.to_html(**HTML_PARAMS)
149 |
150 | return div
151 |
152 |
153 | # main
154 |
155 |
156 | div = make_plots(bin_info=snakemake.input.bin_info)
157 |
158 |
159 | make_html(
160 | div=div,
161 | report_out=snakemake.output.report,
162 | html_template_file=os.path.join(reports_dir, "template_bin_report.html"),
163 | wildcards=snakemake.wildcards,
164 | )
165 |
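
Note (illustration only, not part of the workflow file above): the Quality_score column used in the queries above corresponds to the definition quoted at the top of this excerpt, Completeness - 5 x Contamination. A minimal pandas sketch, assuming a toy bin table with the same column names:

import pandas as pd

# Hypothetical toy bin table using the column names from the script above.
df = pd.DataFrame(
    {
        "Bin Id": ["S1_bin1", "S1_bin2", "S2_bin1"],
        "Completeness": [98.5, 76.0, 92.3],
        "Contamination": [0.8, 9.5, 1.2],
    }
).set_index("Bin Id")

# Quality score as stated in the report text: Completeness - 5 x Contamination.
df["Quality_score"] = df["Completeness"] - 5 * df["Contamination"]

# Counts analogous to the add_stats(...) calls above.
print(df.query("Quality_score > 50").shape[0])                      # 2
print(df.query("Completeness > 90 & Contamination < 5").shape[0])   # 2
print(df.query("Quality_score > 90").shape[0])                      # 1
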
--------------------------------------------------------------------------------
/workflow/report/common_report.py:
--------------------------------------------------------------------------------
1 | import plotly.io as pio
2 | import os, sys
3 |
4 | atlas_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
5 |
6 | reports_dir = os.path.join(atlas_dir, "report")
7 |
8 | sys.path.append(os.path.join(atlas_dir, "scripts"))
9 |
10 |
11 | pio.templates.default = "simple_white"
12 | HTML_PARAMS = dict(
13 | include_plotlyjs=False,
14 | full_html=False,
15 | )
16 |
17 |
18 | ## make html report
19 |
20 |
21 | def make_html(
22 | html_template_file,
23 | report_out,
24 | div,
25 | css_file=os.path.join(reports_dir, "report.css"),
26 | wildcards={},
27 | ):
28 | html_template = open(html_template_file).read()
29 | css_content = open(css_file).read()
30 |
31 | html_string = html_template.format(div=div, css_content=css_content, **wildcards)
32 |
33 | with open(report_out, "w") as outf:
34 | outf.write(html_string)
35 |
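
Usage illustration (not part of the file above): make_html fills placeholders such as {div[Reads]} or {div[2D]} in the HTML templates below via str.format, with plotly figures exported as HTML fragments according to HTML_PARAMS. A hedged sketch, assuming make_html and HTML_PARAMS from the module above and using hypothetical temporary paths and a throwaway template:

import os, tempfile
import plotly.express as px

tmp = tempfile.mkdtemp()
template_file = os.path.join(tmp, "template.html")
css_file = os.path.join(tmp, "style.css")
report_file = os.path.join(tmp, "report.html")

# A minimal template: the {css_content} slot, one wildcard, and one {div[...]} slot.
with open(template_file, "w") as f:
    f.write("<style>{css_content}</style><h1>Bins from {binner}</h1>\n{div[Scatter]}\n")
with open(css_file, "w") as f:
    f.write("body { font-size: 14pt; }")

# Figures are exported as fragments (full_html=False, include_plotlyjs=False),
# matching HTML_PARAMS defined above, and collected in the div dictionary.
div = {"Scatter": px.scatter(x=[1, 2, 3], y=[3, 1, 2]).to_html(**HTML_PARAMS)}

make_html(
    html_template_file=template_file,
    report_out=report_file,
    div=div,
    css_file=css_file,
    wildcards={"binner": "DASTool"},
)
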
--------------------------------------------------------------------------------
/workflow/report/report.css:
--------------------------------------------------------------------------------
1 | /* Overrides of notebook CSS for static HTML export */
2 | body {
3 | overflow: visible;
4 | font-size: 14pt;
5 | padding: 8px;
6 | margin:0 100;
7 | background:whitesmoke;
8 | }
9 |
10 | h1 {
11 | text-align: center
12 | }
13 |
14 | p {
15 | font-size: 14pt;
16 | }
17 |
18 | .float-container {
19 | padding: 2px;
20 | height:100%;
21 | width:100%;
22 | }
23 |
24 | .float-child {
25 | width: 50%;
26 | float: left;
27 | padding: 2px;
28 | }
29 |
30 | @media not print {
31 | #notebook-container {
32 | padding: 15px;
33 | background-color: #fff;
34 | min-height: 0;
35 | -webkit-box-shadow: 0px 0px 12px 1px rgba(87, 87, 87, 0.2);
36 | box-shadow: 0px 0px 12px 1px rgba(87, 87, 87, 0.2);
37 | }
38 | }
39 | @media print {
40 | #notebook-container {
41 | width: 100%;
42 | }
43 | }
--------------------------------------------------------------------------------
/workflow/report/template_QC_report.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 | Quality Control Report
20 |
21 |
22 | Number of reads that went through the quality control process.
23 |
24 | {div[Reads]}
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 | Step |
34 | Output |
35 |
36 |
37 |
38 | raw |
39 | the input reads |
40 |
41 | deduplicated |
42 | after (optional) deduplication step |
43 |
44 | filtered |
45 | trimmed, quality filtered |
46 |
47 | qc |
48 | final reads, contaminants removed |
49 |
50 |
51 |
52 |
53 | Total number of reads/bases after QC
54 |
55 |
56 |
57 |
58 | {div[Total_Reads]}
59 |
60 |
61 |
62 | {div[Total_Bases]}
63 |
64 |
65 |
66 |
67 |
68 | Base quality values along reads
69 |
70 | {div[quality_QC]}
71 |
72 | Read length
73 |
74 |
75 | {div[Length]}
76 |
77 | Insert size
78 | The insert size is the length of the two paired reads plus the gap between them. Ideally, the paired-end reads do not overlap.
79 |
80 | {div[Insert]}
81 |
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/workflow/report/template_assembly_report.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 | Assembly Summary
20 |
21 |
22 | Total assembly length
23 |
24 | {div[Total]}
25 |
26 |
27 | Fragmentation
28 |
29 |
30 | N50/N90 is a measure of how fragmented an assembly is:
31 | 50%/90% of the assembly consists of contigs of length N50/N90 or longer.
32 | Equivalently, you need all contigs of length N50/N90 or longer to reach 50%/90% of the total assembly length.
33 |
34 |
35 |
36 |
37 |
38 |
39 | {div[N50]}
40 |
41 |
42 |
43 | {div[N90]}
44 |
45 |
46 |
47 |
48 | Genes / Reads
49 |
50 |
51 |
52 |
53 | {div[N_Predicted_Genes]}
54 |
55 |
56 |
57 | {div[Percent_Assembled_Reads]}
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
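
Side note (illustration only, not part of the template above): the N50/N90 definition given in the Fragmentation section can be computed in a few lines of Python; the contig lengths below are made up.

def nx(contig_lengths, fraction):
    """Length N such that contigs of length >= N make up at least
    `fraction` of the total assembly length (N50 for fraction=0.5)."""
    lengths = sorted(contig_lengths, reverse=True)
    target = fraction * sum(lengths)
    cumulative = 0
    for length in lengths:
        cumulative += length
        if cumulative >= target:
            return length

contigs = [5000, 3000, 2000, 1000, 500, 500]   # total length 12000
print(nx(contigs, 0.5))  # N50 = 3000 (5000 + 3000 >= 6000)
print(nx(contigs, 0.9))  # N90 = 1000 (5000 + 3000 + 2000 + 1000 >= 10800)
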
--------------------------------------------------------------------------------
/workflow/report/template_bin_report.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
20 |
21 | Bin Report for Binner {binner}
22 | Genome completeness and contamination, as well as taxonomy, were estimated using CheckM2.
23 | {div[QualityScore]}
24 | For the full information, see the file {div[input_file]}
25 |
26 | Number of genomes
27 | {div[table]}
28 |
29 | "Good quality" refers to the standard of Completeness > 90% and Contamination < 5%, also called high-quality or near-complete. However, tRNA/rRNA presence is not evaluated. It is less stringent than Quality score > 90.
30 |
31 | Quality for all bins
32 | {div[2D]}
33 |
34 |
35 | Quality for Species representatives
36 | {div[2Dsp]}
37 |
38 |
39 |
40 |
Quality score by Sample
41 |
42 |
43 |
44 | {div[bySample]}
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/workflow/rules/cdhit.smk:
--------------------------------------------------------------------------------
1 | def parse_cd_hit_file(clstr_file):
2 | """
3 |
4 | >Cluster 0
5 | 0 342nt, >S1_83_1... *
6 | 1 342nt, >S2_82_1... at +/100.00%
7 | >Cluster 1
8 | 0 339nt, >S1_61_1... *
9 | 1 339nt, >S2_59_1... at +/100.00%
10 |
11 |
12 | """
13 | import numpy as np
14 |
15 | def parse_line(line):
16 | _, length, name, identity = (
17 | line.strip().replace("...", "\t").replace(", ", "\t").split("\t")
18 | )
19 |
20 | length = int(length.replace("nt", ""))
21 | name = name[1:]
22 | if "*" in identity:
23 | identity = np.nan
24 | else:
25 | identity = float(identity[identity.rfind("/") + 1 : identity.rfind("%")])
26 |
27 | return name, length, identity
28 |
29 | Clusters = []
30 | with open(clstr_file) as f:
31 | for line in f:
32 | if line[0] == ">": # new cluster
33 | cluster = dict(elements=[], representative=None)
34 | Clusters.append(cluster)
35 | else:
36 | name, length, identity = parse_line(line)
37 | cluster["elements"].append((name, length, identity))
38 | if np.isnan(identity):
39 | cluster["representative"] = name
40 | return Clusters
41 |
42 |
43 | def write_cd_hit_clusters(Clusters, file_handle):
44 | for cluster in Clusters:
45 | for element in cluster["elements"]:
46 | file_handle.write(
47 | f"{element[0]}\t{element[1]}\t{element[2]}\t{cluster['representative']}\n"
48 | )
49 |
50 |
51 | localrules:
52 | parse_clstr_files,
53 | rename_gene_clusters,
54 |
55 |
56 | rule cluster_genes:
57 | input:
58 | fna_dir="Genecatalog/all_genes/predicted_genes.fna",
59 | output:
60 | temp("Genecatalog/representatives_of_clusters.fasta"),
61 | temp("Genecatalog/gene_catalog_oldnames.clstr"),
62 | conda:
63 | "%s/cd-hit.yaml" % CONDAENV
64 | log:
65 | "logs/Genecatalog/cluster_genes.log",
66 | threads: config.get("threads", 1)
67 | resources:
68 | mem_mb=config["mem"] * 1000,
69 | params:
70 | coverage=config["genecatalog"]["coverage"],
71 | identity=config["genecatalog"]["minid"],
72 | extra=config["genecatalog"]["extra"],
73 | prefix=lambda wc, output: os.path.splitext(output[1])[0],
74 | shell:
75 | """
76 | cd-hit-est -i {input} -T {threads} \
77 | -M {resources.mem_mb} -o {params.prefix} \
78 | -c {params.identity} -n 9 -d 0 {params.extra} \
79 | -aS {params.coverage} -aL {params.coverage} &> {log}
80 |
81 | mv {params.prefix} {output[0]} 2>> {log}
82 | """
83 |
84 |
85 | rule parse_clstr_files:
86 | input:
87 | clustered_dir="Genecatalog/gene_catalog_oldnames.clstr",
88 | output:
89 | temp("Genecatalog/orf2gene_oldnames.tsv"),
90 | run:
91 | with open(output[0], "w") as fout:
92 | fout.write(f"ORF\tLength\tIdentity\tRepresentative\n")
93 | Clusters = parse_cd_hit_file(input[0])
94 | write_cd_hit_clusters(Clusters, fout)
95 |
96 |
97 | rule generate_orf_info:
98 | input:
99 | cluster_attribution="Genecatalog/orf2gene_oldnames.tsv",
100 | output:
101 | cluster_attribution="Genecatalog/clustering/orf_info.parquet",
102 | rep2genenr="Genecatalog/clustering/representative2genenr.tsv",
103 | threads: 1
104 | run:
105 | import pandas as pd
106 | import numpy as np
107 |
108 | from utils import gene_scripts
109 |
110 | # cd hit format ORF\tLength\tIdentity\tRepresentative\n
111 | orf2gene = pd.read_csv(input.cluster_attribution, sep="\t")
112 |
113 | # rename gene repr to Gene0000XX
114 |
115 | # split orf names in sample, contig_nr, and orf_nr
116 | orf_info = gene_scripts.split_orf_to_index(orf2gene.ORF)
117 |
118 | # rename representative
119 |
120 | representative_names = orf2gene.Representative.unique()
121 |
122 | map_names = pd.Series(
123 | index=representative_names,
124 | data=np.arange(1, len(representative_names) + 1, dtype=np.uint),
125 | )
126 |
127 |
128 | orf_info["GeneNr"] = orf2gene.Representative.map(map_names)
129 |
130 |
131 | orf_info.to_parquet(output.cluster_attribution)
132 |
133 |
134 | # Save name of representatives
135 | map_names.index.name = "Representative"
136 | map_names.name = "GeneNr"
137 | map_names.to_csv(output.rep2genenr, sep="\t")
138 |
--------------------------------------------------------------------------------
/workflow/rules/derep.smk:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | rule run_skani:
5 | input:
6 | paths="Binning/{binner}/filtered_bins_paths.txt",
7 | output:
8 | temp("Intermediate/dereplication/{binner}_distance_matrix.txt"),
9 | log:
10 | "logs/binning/{binner}/dereplication/skani_calculation.log",
11 | resources:
12 | mem_mb=config["mem"] * 1000,
13 | time_min=60 * config["runtime"]["default"],
14 | params:
15 | #preset= "medium", # fast, medium or slow
16 | min_af=config["genome_dereplication"]["overlap"] * 100,
17 | extra="",
18 | threads: config["threads"]
19 | conda:
20 | "../envs/skani.yaml"
21 | shell:
22 | "skani triangle "
23 | " {params.extra} "
24 | " -l {input.paths} "
25 | " -o {output} "
26 | " -t {threads} "
27 | " --sparse --ci "
28 | " --min-af {params.min_af} "
29 | " &> {log} "
30 |
31 |
32 | rule skani_2_parquet:
33 | input:
34 | rules.run_skani.output,
35 | output:
36 | "Binning/{binner}/genome_similarities.parquet",
37 | resources:
38 | mem_mb=config["mem"] * 1000,
39 | time_min=60 * config["runtime"]["simplejob"],
40 | log:
41 | "logs/binning/{binner}/dereplication/skani_2_parquet.log",
42 | threads: 1
43 | run:
44 | try:
45 | skani_column_dtypes = {
46 | "Ref_file": "category",
47 | "Query_file": "category",
48 | "ANI": float,
49 | "Align_fraction_ref": float,
50 | "Align_fraction_query": float,
51 | "ANI_5_percentile": float,
52 | "ANI_95_percentile": float,
53 | } # Ref_name Query_name
54 |
55 | import pandas as pd
56 |
57 |
58 |
59 |
60 |
61 | from utils.io import simplify_path
62 |
63 | df = pd.read_table(
64 | input[0],
65 | usecols=list(skani_column_dtypes.keys()),
66 | dtype=skani_column_dtypes,
67 | )
68 |
69 | df["Ref"] = df.Ref_file.cat.rename_categories(simplify_path)
70 | df["Query"] = df.Query_file.cat.rename_categories(simplify_path)
71 |
72 | df.to_parquet(output[0])
73 |
74 | except Exception as e:
75 | import traceback
76 |
77 | with open(log[0], "w") as logfile:
78 | traceback.print_exc(file=logfile)
79 |
80 | raise e
81 |
82 |
83 | rule cluster_species:
84 | input:
85 | dist="Binning/{binner}/genome_similarities.parquet",
86 | bin_info="Binning/{binner}/filtered_bin_info.tsv",
87 | params:
88 | linkage_method="average",
89 | pre_cluster_threshold=0.925,
90 | threshold=config["genome_dereplication"]["ANI"],
91 | conda:
92 | "../envs/species_clustering.yaml"
93 | log:
94 | "logs/binning/{binner}/dereplication/species_clustering.log",
95 | output:
96 | bin_info="Binning/{binner}/bin_info.tsv",
97 | bins2species="Binning/{binner}/bins2species.tsv",
98 | script:
99 | "../scripts/cluster_species.py"
100 |
101 |
102 | rule build_bin_report:
103 | input:
104 | bin_info="Binning/{binner}/bin_info.tsv",
105 | bins2species="Binning/{binner}/bins2species.tsv",
106 | output:
107 | report="reports/bin_report_{binner}.html",
108 | conda:
109 | "../envs/report.yaml"
110 | log:
111 | "logs/binning/report_{binner}.log",
112 | script:
113 | "../report/bin_report.py"
114 |
--------------------------------------------------------------------------------
/workflow/rules/dram.smk:
--------------------------------------------------------------------------------
1 | DBDIR = config["database_dir"]
2 |
3 |
4 | def get_dram_config(wildcards):
5 | old_dram_path = f"{DBDIR}/Dram"
6 | if Path(old_dram_path).exists():
7 | logger.error(
8 | f"Detected an old database for DRAM in {old_dram_path}. You can delete it."
9 | )
10 |
11 | return config.get("dram_config_file", f"{DBDIR}/DRAM/DRAM.config")
12 |
13 |
14 | localrules:
15 | dram_download,
16 | concat_annotations,
17 |
18 |
19 | rule dram_download:
20 | output:
21 | dbdir=directory(f"{DBDIR}/DRAM/db/"),
22 | config=f"{DBDIR}/DRAM/DRAM.config",
23 | threads: config["threads"]
24 | resources:
25 | mem_mb=config["mem"] * 1000,
26 | time_min=60 * config["runtime"]["default"],
27 | log:
28 | "logs/dram/download_dram.log",
29 | benchmark:
30 | "logs/benchmarks/dram/download_dram.tsv"
31 | conda:
32 | "../envs/dram.yaml"
33 | shell:
34 | " DRAM-setup.py prepare_databases "
35 | " --output_dir {output.dbdir} "
36 | " --threads {threads} "
37 | " --verbose "
38 | " --skip_uniref "
39 | " &> {log} "
40 | " ; "
41 | " DRAM-setup.py export_config --output_file {output.config}"
42 |
43 |
44 | rule DRAM_annotate:
45 | input:
46 | fasta="genomes/genomes/{genome}.fasta",
47 | #checkm= "genomes/checkm/completeness.tsv",
48 | #gtdb_dir= "genomes/taxonomy/gtdb/classify",
49 | config=get_dram_config,
50 | output:
51 | outdir=directory("genomes/annotations/dram/intermediate_files/{genome}"),
52 | threads: config["simplejob_threads"]
53 | resources:
54 | mem_mb=config["simplejob_mem"] * 1000,
55 | time_min=60 * config["runtime"]["default"],
56 | conda:
57 | "../envs/dram.yaml"
58 | params:
59 | extra=config.get("dram_extra", ""),
60 | min_contig_size=config.get("minimum_contig_length", "1000"),
61 | log:
62 | "logs/dram/run_dram/{genome}.log",
63 | benchmark:
64 | "logs/benchmarks/dram/run_dram/{genome}.tsv"
65 | shell:
66 | " DRAM.py annotate "
67 | " --config_loc {input.config} "
68 | " --input_fasta {input.fasta}"
69 | " --output_dir {output.outdir} "
70 | " --threads {threads} "
71 | " --min_contig_size {params.min_contig_size} "
72 | " {params.extra} "
73 | " --verbose &> {log}"
74 | #" --gtdb_taxonomy {input.gtdb_dir}/{params.gtdb_file} "
75 | #" --checkm_quality {input.checkm} "
76 |
77 |
78 | def get_all_dram(wildcards):
79 | all_genomes = get_all_genomes(wildcards)
80 |
81 | return expand(rules.DRAM_annotate.output.outdir, genome=all_genomes)
82 |
83 |
84 | DRAM_ANNOTATON_FILES = ["annotations.tsv"]
85 |
86 |
87 | rule concat_annotations:
88 | input:
89 | get_all_dram,
90 | output:
91 | expand("genomes/annotations/dram/{annotation}", annotation=DRAM_ANNOTATON_FILES),
92 | resources:
93 | time_min=60 * config["runtime"]["default"],
94 | run:
95 | from utils import io
96 |
97 | for i, annotation_file in enumerate(DRAM_ANNOTATON_FILES):
98 | input_files = [
99 | os.path.join(dram_folder, annotation_file) for dram_folder in input
100 | ]
101 |
102 | io.pandas_concat(
103 | input_files, output[i], sep="\t", index_col=0, axis=0, disk_based=True
104 | )
105 |
106 |
107 | rule DRAM_destill:
108 | input:
109 | rules.concat_annotations.output,
110 | config=get_dram_config,
111 | output:
112 | outdir=directory("genomes/annotations/dram/distil"),
113 | threads: 1
114 | resources:
115 | mem_mb=config["simplejob_mem"] * 1000,
116 | time_min=60 * config["runtime"]["simplejob"],
117 | conda:
118 | "../envs/dram.yaml"
119 | log:
120 | "logs/dram/distil.log",
121 | shell:
122 | " DRAM.py distill "
123 | " --config_loc {input.config} "
124 | " --input_file {input[0]}"
125 | " --output_dir {output} "
126 | " &> {log}"
127 |
128 |
129 | rule get_all_modules:
130 | input:
131 | annotations="genomes/annotations/dram/annotations.tsv",
132 | config=get_dram_config,
133 | output:
134 | "genomes/annotations/dram/kegg_modules.tsv",
135 | threads: 1
136 | resources:
137 | mem_mb=config["simplejob_mem"] * 1000,
138 | time_min=60 * config["runtime"]["default"],
139 | conda:
140 | "../envs/dram.yaml"
141 | log:
142 | "logs/dram/get_all_modules.log",
143 | script:
144 | "../scripts/DRAM_get_all_modules.py"
145 |
146 |
147 | rule dram:
148 | input:
149 | "genomes/annotations/dram/distil",
150 | "genomes/annotations/dram/kegg_modules.tsv",
151 |
--------------------------------------------------------------------------------
/workflow/rules/gtdbtk.smk:
--------------------------------------------------------------------------------
1 | gtdb_dir = "genomes/taxonomy/gtdb"
2 |
3 |
4 | rule identify:
5 | input:
6 | flag=rules.extract_gtdb.output,
7 | genes_flag="genomes/annotations/genes/predicted",
8 | output:
9 | directory(f"{gtdb_dir}/identify"),
10 | threads: config["threads"]
11 | conda:
12 | "../envs/gtdbtk.yaml"
13 | log:
14 | "logs/taxonomy/gtdbtk/identify.txt",
15 | f"{gtdb_dir}/gtdbtk.log",
16 | params:
17 | outdir=gtdb_dir,
18 | extension="faa",
19 | gene_dir=lambda wc, input: os.path.abspath(os.path.dirname(input.genes_flag)),
20 | shell:
21 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
22 | "gtdbtk identify "
23 | "--genes --genome_dir {params.gene_dir} "
24 | " --out_dir {params.outdir} "
25 | "--extension {params.extension} "
26 | "--cpus {threads} &> {log[0]}"
27 |
28 |
29 | checkpoint align:
30 | input:
31 | f"{gtdb_dir}/identify",
32 | output:
33 | directory(f"{gtdb_dir}/align"),
34 | threads: config["threads"]
35 | resources:
36 | mem_mb=config["large_mem"] * 1000,
37 | conda:
38 | "../envs/gtdbtk.yaml"
39 | log:
40 | "logs/taxonomy/gtdbtk/align.txt",
41 | f"{gtdb_dir}/gtdbtk.log",
42 | params:
43 | outdir=gtdb_dir,
44 | shell:
45 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
46 | "gtdbtk align --identify_dir {params.outdir} --out_dir {params.outdir} "
47 | "--cpus {threads} &> {log[0]}"
48 |
49 |
50 | rule classify:
51 | input:
52 | rules.align.output,
53 | genome_dir=genome_dir,
54 | output:
55 | directory(f"{gtdb_dir}/classify"),
56 | threads: config["threads"] #pplacer needs much memory for not many threads
57 | resources:
58 | mem_mb=config["large_mem"] * 1000,
59 | time_min=60 * config["runtime"]["long"],
60 | conda:
61 | "../envs/gtdbtk.yaml"
62 | log:
63 | "logs/taxonomy/gtdbtk/classify.txt",
64 | f"{gtdb_dir}/gtdbtk.log",
65 | params:
66 | outdir=gtdb_dir,
67 | extension="fasta",
68 | mashdir=Path(GTDBTK_DATA_PATH) / "mash_db",
69 | shell:
70 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
71 | "gtdbtk classify --genome_dir {input.genome_dir} --align_dir {params.outdir} "
72 | " --mash_db {params.mashdir} "
73 | "--out_dir {params.outdir} "
74 | " --tmpdir {resources.tmpdir} "
75 | "--extension {params.extension} "
76 | "--cpus {threads} &> {log[0]}"
77 |
78 |
79 | rule combine_taxonomy:
80 | input:
81 | folder=f"{gtdb_dir}/classify",
82 | output:
83 | combined=f"{gtdb_dir}/gtdbtk.combined.summary.tsv",
84 | taxonomy="genomes/taxonomy/gtdb_taxonomy.tsv",
85 | log:
86 | "logs/taxonomy/gtdbtk/combine.txt",
87 | script:
88 | "../scripts/combine_taxonomy.py"
89 |
90 |
91 | rule build_tree:
92 | input:
93 | f"{gtdb_dir}/align/{{msa}}.user_msa.fasta.gz",
94 | output:
95 | temp("genomes/taxonomy/gtdb/{msa}.unrooted.tree"),
96 | log:
97 | "logs/genomes/tree/{msa}.log",
98 | "logs/genomes/tree/{msa}.err",
99 | threads: max(config["threads"], 3)
100 | params:
101 | outdir=lambda wc, output: Path(output[0]).parent,
102 | conda:
103 | "../envs/gtdbtk.yaml"
104 | shell:
105 | 'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
106 | "gtdbtk infer --msa_file {input} "
107 | " --out_dir {params.outdir} "
108 | " --prefix {wildcards.msa} "
109 | " --cpus {threads} "
110 | "--tmpdir {resources.tmpdir} > {log[0]} 2> {log[1]}"
111 |
112 |
113 | localrules:
114 | root_tree,
115 |
116 |
117 | rule root_tree:
118 | input:
119 | tree=rules.build_tree.output[0],
120 | wildcard_constraints:
121 | msa="((?!unrooted).)*",
122 | output:
123 | tree="genomes/tree/{msa}.nwk",
124 | conda:
125 | "../envs/tree.yaml"
126 | threads: 1
127 | resources:
128 | mem_mb=config["simplejob_mem"] * 1000,
129 | time_min=60 * config["runtime"]["simplejob"],
130 | log:
131 | "logs/genomes/tree/root_tree_{msa}.log",
132 | script:
133 | "../scripts/root_tree.py"
134 |
135 |
136 | def all_gtdb_trees_input(wildcards):
137 | dir = checkpoints.align.get().output[0]
138 |
139 | domains = glob_wildcards(f"{dir}/gtdbtk.{{domain}}.user_msa.fasta.gz").domain
140 |
141 | return expand("genomes/tree/gtdbtk.{domain}.nwk", domain=domains)
142 |
143 |
144 | rule all_gtdb_trees:
145 | input:
146 | all_gtdb_trees_input,
147 | output:
148 | touch("genomes/tree/finished_gtdb_trees"),
149 |
--------------------------------------------------------------------------------
/workflow/rules/patch.smk:
--------------------------------------------------------------------------------
1 | localrules:
2 | copy_assembly,
3 |
4 |
5 | # Rules that are temporarily useful for updating to a new version of atlas
6 |
7 |
8 | ruleorder: copy_assembly > finalize_contigs
9 |
10 |
11 | rule copy_assembly:
12 | input:
13 | "{sample}/{sample}_contigs.fasta",
14 | output:
15 | "Assembly/fasta/{sample}.fasta",
16 | shell:
17 | "cp {input} {output}"
18 |
--------------------------------------------------------------------------------
/workflow/rules/predict_genes_of_genomes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os, sys
4 | import logging, traceback
5 |
6 | logging.basicConfig(
7 | filename=snakemake.log[0],
8 | level=logging.INFO,
9 | format="%(asctime)s %(message)s",
10 | datefmt="%Y-%m-%d %H:%M:%S",
11 | )
12 |
13 | logging.captureWarnings(True)
14 |
15 |
16 | def handle_exception(exc_type, exc_value, exc_traceback):
17 | if issubclass(exc_type, KeyboardInterrupt):
18 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
19 | return
20 |
21 | logging.error(
22 | "".join(
23 | [
24 | "Uncaught exception: ",
25 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
26 | ]
27 | )
28 | )
29 |
30 |
31 | # Install exception handler
32 | sys.excepthook = handle_exception
33 |
34 | #### Beginning of scripts
35 |
36 | # python 3.5 without f strings
37 |
38 | import os, shutil, sys
39 | import uuid
40 | import itertools
41 | from glob import glob
42 | from snakemake.shell import shell
43 | from snakemake.io import glob_wildcards
44 | from multiprocessing import Pool
45 |
46 |
47 | def predict_genes(genome, fasta, out_dir, log):
48 | fna = "{}/{}.fna".format(out_dir, genome)
49 | faa = "{}/{}.faa".format(out_dir, genome)
50 | gff = "{}/{}.gff".format(out_dir, genome)
51 |
52 | shell('printf "{genome}:\n" > {log}'.format(genome=genome, log=log))
53 | shell(
54 | "prodigal -i {fasta} -o {gff} -d {fna} -a {faa} -p sinlge -c -m -f gff 2>> {log} ".format(
55 | fasta=fasta, log=log, gff=gff, fna=fna, faa=faa
56 | )
57 | )
58 | shell('printf "\n" >> {log}'.format(log=log))
59 |
60 |
61 | def predict_genes_genomes(input_dir, out_dir, log, threads):
62 | genomes_fastas = glob(os.path.join(input_dir, "*.fasta"))
63 |
64 | os.makedirs(out_dir, exist_ok=True)
65 |
66 | temp_log_dir = os.path.join(os.path.dirname(log), "tmp_" + uuid.uuid4().hex)
67 | os.makedirs(temp_log_dir, exist_ok=False)
68 |
69 | genome_names = []
70 | log_names = []
71 | for fasta in genomes_fastas:
72 | genome_name = os.path.splitext(os.path.split(fasta)[-1])[0]
73 | genome_names.append(genome_name)
74 | log_names.append(os.path.join(temp_log_dir, genome_name + ".prodigal.tmp"))
75 |
76 | pool = Pool(threads)
77 | pool.starmap(
78 | predict_genes,
79 | zip(genome_names, genomes_fastas, itertools.repeat(out_dir), log_names),
80 | )
81 |
82 | # cat in python
83 | with open(log, "ab") as f_out:
84 | for logfile in log_names:
85 | with open(logfile, "rb") as f_in:
86 | shutil.copyfileobj(f_in, f_out)
87 |
88 | shell("rm -r {temp_log_dir}".format(temp_log_dir=temp_log_dir))
89 |
90 |
91 | if __name__ == "__main__":
92 | predict_genes_genomes(
93 | snakemake.input.dir,
94 | snakemake.output[0],
95 | snakemake.log[0],
96 | int(snakemake.threads),
97 | )
98 |
--------------------------------------------------------------------------------
/workflow/rules/scg_blank_diamond.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | # The MIT License (MIT)
4 | # Copyright (c) 2016 Alexander J Probst
5 |
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
7 |
8 | # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
9 |
10 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
11 |
12 | # https://github.com/AJProbst/sngl_cp_gn
13 |
14 | #1: $search_engine name
15 | #2: $proteins
16 | #3: $DIR\/db/bac.all.faa
17 | #4: $DIR\/db/bac.scg.faa
18 | #5: $DIR\/db/bac.scg.lookup
19 | #6: $threads
20 |
21 | d = ARGV[0]
22 |
23 | input_file = ARGV[1]
24 | output_dir = File.dirname(input_file)
25 |
26 | datab = ARGV[2]
27 | db_all = File.dirname(input_file) + "/all_prot"
28 | puts "database name of all proteins is #{datab}"
29 |
30 | db_name = ARGV[3]
31 | puts "database name of SCGs is #{db_name}"
32 |
33 | db_lookup = ARGV[4]
34 | puts "database lookup is #{db_lookup}"
35 |
36 | threads = ARGV[5]
37 |
38 | #build databases
39 | full_db = system "#{d} makedb --in #{datab} -d #{db_all}.dmnd"
40 | abort "makeblastdb did not work for #{datab}, please check your input file" unless full_db
41 |
42 | # find SCG candidates
43 | puts "finding SCG candidates..."
44 | input_blast_database = system "#{d} makedb --in #{input_file} -d #{input_file}.dmnd"
45 | input_blast_out = File.join(output_dir,File.basename(input_file) + ".findSCG.b6")
46 | abort "makeblastdb did not work for #{input_file}, please check your input file" unless input_blast_database
47 | input_blast_ok = system "#{d} blastp --query #{db_name} --db #{input_file}.dmnd --max-target-seqs 0 --outfmt 6 qseqid sseqid pident length qlen slen evalue bitscore --out #{input_blast_out} --evalue 0.01 --threads #{threads}"
48 | system "rm #{input_file}.dmnd"
49 | abort "blast did not work, please check your input file." unless input_blast_ok
50 |
51 | input_blast_out_whitelist = File.join(output_dir,File.basename(input_file) + ".findSCG.b6.whitelist")
52 | system "awk '{print$2}' #{input_blast_out} | sort -u > #{input_blast_out_whitelist}"
53 | scg_candidates = File.join(output_dir,File.basename(input_file) + ".scg.candidates.faa")
54 | system "pullseq -i #{input_file} -n #{input_blast_out_whitelist} > #{scg_candidates}"
55 | system "rm #{input_blast_out_whitelist}"
56 |
57 | # verify SCGs by blasting against all proteins of all genomes
58 | puts "verifying selected SCGs..."
59 | db_blast_out = File.join(output_dir,File.basename(input_file) + ".all.b6")
60 | db_blast_ok = system "#{d} blastp --query #{scg_candidates} --db #{db_all} --evalue 0.00001 --threads #{threads} --out #{db_blast_out} --outfmt 6 qseqid sseqid pident length qlen slen evalue bitscore --max-target-seqs 1"
61 | abort "verifying blast did not work" unless db_blast_ok
62 | system "rm #{db_all}.dmnd"
63 | puts "starting annotations of single copy cogs..."
64 |
65 | # Read db_lookup
66 | lookup_h = {}
67 | File.open(db_lookup).each do |line|
68 | sbj, annotation = line.chomp.split
69 | lookup_h[sbj]=annotation
70 | end
71 |
72 | # now compare and print
73 | File.open(File.join(output_dir,File.basename(input_file)+".scg"), "w") do |file|
74 | File.open(db_blast_out).each do |line|
75 | next if line =~ /^#/
76 | line.chomp!
77 | temp = line.split(/\t/)
78 | query, sbjct = temp[0], temp[1]
79 | aln_len, sbjct_len = temp[3].to_f, temp[5].to_f # convert to numbers for the length comparison below
80 | if lookup_h[sbjct] && aln_len > (sbjct_len*0.5)
81 | file.puts "#{query.split[0]}\t#{lookup_h[sbjct]}"
82 | end
83 | end
84 | end
85 |
86 | puts "successfully finished"
87 |
--------------------------------------------------------------------------------
/workflow/rules/screen.smk:
--------------------------------------------------------------------------------
1 |
2 | rule generate_sketch:
3 | input:
4 | unpack(get_input_fastq),
5 | output:
6 | "Intermediate/screen/sketches/{sample}.sketch.gz",
7 | log:
8 | "logs/screen/make_sketch/{sample}.log",
9 | conda:
10 | "../envs/required_packages.yaml"
11 | threads: 1
12 | resources:
13 | mem_mb=config["simplejob_mem"] * 1000,
14 | java_mem=int(config["simplejob_mem"] * JAVA_MEM_FRACTION),
15 | shell:
16 | "bbsketch.sh "
17 | "in={input[0]}"
18 | " samplerate=0.5"
19 | " minkeycount=2 "
20 | " out={output} "
21 | " blacklist=nt ssu=f name0={wildcards.sample} depth=t overwrite=t "
22 | " -Xmx{resources.java_mem}g "
23 | " &> {log}"
24 | # only the first read file is used for the sketch
25 |
26 |
27 | rule compare_sketch:
28 | input:
29 | expand(rules.generate_sketch.output, sample=SAMPLES),
30 | output:
31 | "QC/screen/sketch_comparison.tsv.gz",
32 | priority: 100
33 | log:
34 | "logs/screen/compare_sketch.log",
35 | conda:
36 | "../envs/required_packages.yaml"
37 | threads: 1
38 | resources:
39 | mem_mb=config["mem"] * 1000,
40 | java_mem=int(config["mem"] * JAVA_MEM_FRACTION),
41 | shell:
42 | "comparesketch.sh alltoall "
43 | " format=3 out={output} "
44 | " records=5000 "
45 | " {input} "
46 | " -Xmx{resources.java_mem}g "
47 | " &> {log}"
48 |
49 |
50 | # sendsketch.sh sample2.sketch printdepth2=t level=2 printqfname=f printvolume=t color=f out
51 |
--------------------------------------------------------------------------------
/workflow/rules/semibin.smk:
--------------------------------------------------------------------------------
1 |
2 | rule semibin_generate_data_multi:
3 | input:
4 | fasta=rules.combine_contigs.output,
5 | bams=get_bams_of_bingroup,
6 | output:
7 | directory("Intermediate/cobinning/{bingroup}/semibin/data_multi"),
8 | # expand(
9 | # "Cobinning/SemiBin/samples/{sample}/{files}",
10 | # sample=SAMPLES,
11 | # files=["data.csv", "data_split.csv"],
12 | # ),
13 | conda:
14 | "../envs/semibin.yaml"
15 | threads: config["threads"]
16 | resources:
17 | mem_mb=config["mem"] * 1000,
18 | time_min=60 * config["runtime"]["default"],
19 | log:
20 | "logs/semibin/{bingroup}/generate_data_multi.log",
21 | benchmark:
22 | "logs/benchmarks/semibin/{bingroup}/generate_data_multi.tsv"
23 | params:
24 | # output_dir="Cobinning/SemiBin",
25 | separator=config["cobinning_separator"],
26 | shell:
27 | "SemiBin generate_sequence_features_multi"
28 | " --input-fasta {input.fasta} "
29 | " --input-bam {input.bams} "
30 | " --output {output} "
31 | " --threads {threads} "
32 | " --separator {params.separator} "
33 | " 2> {log}"
34 |
35 |
36 | rule semibin_train:
37 | input:
38 | flag=get_assembly,
39 | fasta_sample=rules.filter_contigs.output[0],
40 | bams=get_bams_of_bingroup,
41 | data_folder=rules.semibin_generate_data_multi.output[0],
42 | output:
43 | "Intermediate/cobinning/{bingroup}/semibin/models/{sample}/model.h5",
44 | conda:
45 | "../envs/semibin.yaml"
46 | threads: config["threads"]
47 | resources:
48 | mem_mb=config["mem"] * 1000,
49 | time_min=60 * config["runtime"]["default"],
50 | log:
51 | "logs/semibin/{bingroup}/train/{sample}.log",
52 | benchmark:
53 | "logs/benchmarks/semibin/{bingroup}/train/{sample}.tsv"
54 | params:
55 | output_dir=lambda wc, output: os.path.dirname(output[0]),
56 | data=lambda wc, input: Path(input.data_folder)
57 | / "samples"
58 | / wc.sample
59 | / "data.csv",
60 | data_split=lambda wc, input: Path(input.data_folder)
61 | / "samples"
62 | / wc.sample
63 | / "data_split.csv",
64 | extra=config["semibin_train_extra"],
65 | shell:
66 | "SemiBin train_self "
67 | " --output {params.output_dir} "
68 | " --threads {threads} "
69 | " --data {params.data} "
70 | " --data-split {params.data_split} "
71 | " {params.extra} "
72 | " 2> {log}"
73 |
74 |
75 | def semibin_input(wildcards):
76 | bingroup_of_sample = sampleTable.loc[wildcards.sample, "BinGroup"]
77 | samples_of_bingroup = sampleTable.query(
78 | f'BinGroup=="{bingroup_of_sample}"'
79 | ).index.tolist()
80 |
81 | assert len(samples_of_bingroup) > 1
82 |
83 | mapping = dict(
84 | fasta=rules.filter_contigs.output[0].format(**wildcards),
85 | bams=expand(
86 | "Intermediate/cobinning/{bingroup}/bams/{sample}.sorted.bam",
87 | sample=samples_of_bingroup,
88 | bingroup=bingroup_of_sample,
89 | ),
90 | data_folder=rules.semibin_generate_data_multi.output[0].format(
91 | bingroup=bingroup_of_sample, **wildcards
92 | ),
93 | model=rules.semibin_train.output[0].format(
94 | bingroup=bingroup_of_sample, **wildcards
95 | ),
96 | )
97 |
98 | return mapping
99 |
100 |
101 | rule run_semibin:
102 | input:
103 | unpack(semibin_input),
104 | output:
105 | # the output path contains no information about the bingroup
106 | directory(
107 | "Intermediate/cobinning/semibin_output/{sample}/output_recluster_bins/"
108 | ),
109 | conda:
110 | "../envs/semibin.yaml"
111 | threads: config["threads"]
112 | resources:
113 | mem_mb=config["mem"] * 1000,
114 | time_min=60 * config["runtime"]["default"],
115 | log:
116 | "logs/semibin/bin/{sample}.log",
117 | benchmark:
118 | "logs/benchmarks/semibin/bin/{sample}.tsv"
119 | params:
120 | output_dir=lambda wc, output: os.path.dirname(output[0]),
121 | data=lambda wc, input: Path(input.data_folder)
122 | / "samples"
123 | / wc.sample
124 | / "data.csv",
125 | min_bin_kbs=int(config["cobining_min_bin_size"] / 1000),
126 | extra=config["semibin_options"],
127 | shell:
128 | "SemiBin bin "
129 | " --input-fasta {input.fasta} "
130 | " --output {params.output_dir} "
131 | " --threads {threads} "
132 | " --data {params.data} "
133 | " --model {input.model} "
134 | " --minfasta-kbs {params.min_bin_kbs}"
135 | " {params.extra} "
136 | " 2> {log}"
137 |
138 |
139 | localrules:
140 | parse_semibin_output,
141 |
142 |
143 | ruleorder: parse_semibin_output > get_unique_cluster_attribution
144 |
145 |
146 | rule parse_semibin_output:
147 | input:
148 | rules.run_semibin.output[0],
149 | output:
150 | "{sample}/binning/SemiBin/cluster_attribution.tsv",
151 | conda:
152 | "../envs/semibin.yaml"
153 | log:
154 | "logs/semibin/parse_output/{sample}.log",
155 | params:
156 | extension=".fa",
157 | script:
158 | "../scripts/parse_semibin.py"
159 |
160 |
161 | rule semibin:
162 | input:
163 | expand("{sample}/binning/SemiBin/cluster_attribution.tsv", sample=SAMPLES),
164 |
--------------------------------------------------------------------------------
/workflow/rules/sra.smk:
--------------------------------------------------------------------------------
1 | wildcard_constraints:
2 | sra_run="[SED]RR[0-9]+",
3 |
4 |
5 | localrules:
6 | prefetch,
7 |
8 |
9 | SRA_read_fractions = ["_1", "_2"] if PAIRED_END else [""]
10 | SRA_SUBDIR_RUN = "SRA/Runs"
11 |
12 |
13 | rule prefetch:
14 | output:
15 | sra=temp(touch(SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}_downloaded")),
16 | # not giving the sra file as output allows resuming an interrupted download
17 | params:
18 | outdir=SRA_SUBDIR_RUN, # prefetch creates file in subfolder with run name automatically
19 | log:
20 | "logs/SRAdownload/prefetch/{sra_run}.log",
21 | benchmark:
22 | "logs/benchmarks/SRAdownload/prefetch/{sra_run}.tsv"
23 | threads: 1
24 | resources:
25 | mem_mb=1000,
26 | time_min=60 * int(config["runtime"]["simplejob"]),
27 | internet_connection=1,
28 | conda:
29 | "%s/sra.yaml" % CONDAENV
30 | shell:
31 | " mkdir -p {params.outdir} 2> {log} "
32 | " ; "
33 | " prefetch "
34 | " --output-directory {params.outdir} "
35 | " -X 999999999 "
36 | " --progress "
37 | " --log-level info "
38 | " {wildcards.sra_run} &>> {log} "
39 | " ; "
40 | " vdb-validate {params.outdir}/{wildcards.sra_run}/{wildcards.sra_run}.sra &>> {log} "
41 |
42 |
43 | rule extract_run:
44 | input:
45 | flag=rules.prefetch.output,
46 | output:
47 | temp(
48 | expand(
49 | SRA_SUBDIR_RUN + "/{{sra_run}}/{{sra_run}}{fraction}.fastq.gz",
50 | fraction=SRA_read_fractions,
51 | )
52 | ),
53 | params:
54 | outdir=os.path.abspath(SRA_SUBDIR_RUN + "/{sra_run}"),
55 | sra_file=SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}.sra",
56 | log:
57 | "logs/SRAdownload/extract/{sra_run}.log",
58 | benchmark:
59 | "logs/benchmarks/SRAdownload/fasterqdump/{sra_run}.tsv"
60 | threads: config["simplejob_threads"]
61 | resources:
62 | time_min=60 * int(config["runtime"]["simplejob"]),
63 | mem_mb=1000, #default 100Mb
64 | conda:
65 | "%s/sra.yaml" % CONDAENV
66 | shell:
67 | " vdb-validate {params.sra_file} &>> {log} "
68 | " ; "
69 | " parallel-fastq-dump "
70 | " --threads {threads} "
71 | " --gzip --split-files "
72 | " --outdir {params.outdir} "
73 | " --tmpdir {resources.tmpdir} "
74 | " --skip-technical --split-3 "
75 | " -s {params.sra_file} &>> {log} "
76 | " ; "
77 | " rm -f {params.sra_file} 2>> {log} "
78 |
79 |
80 | RunTable = None
81 |
82 |
83 | def get_runids_for_biosample(wildcards):
84 | global RunTable
85 | if RunTable is None:
86 | from atlas.init.parse_sra import load_and_validate_runinfo_table
87 |
88 | RunTable = load_and_validate_runinfo_table("RunInfo.tsv")
89 |
90 | run_ids = RunTable.query(f"BioSample == '{wildcards.sample}'").index.tolist()
91 |
92 | return run_ids
93 |
94 |
95 | def get_runs_for_biosample(wildcards):
96 | run_ids = get_runids_for_biosample(wildcards)
97 |
98 | ReadFiles = {}
99 | for fraction in SRA_read_fractions:
100 | if fraction == "":
101 | key = "se"
102 | else:
103 | key = fraction
104 |
105 | ReadFiles[key] = expand(
106 | SRA_SUBDIR_RUN + "/{sra_run}/{sra_run}{fraction}.fastq.gz",
107 | fraction=fraction,
108 | sra_run=run_ids,
109 | )
110 |
111 | return ReadFiles
112 |
113 |
114 | rule merge_runs_to_sample:
115 | input:
116 | unpack(get_runs_for_biosample),
117 | output:
118 | expand(
119 | "SRA/Samples/{{sample}}/{{sample}}{fraction}.fastq.gz",
120 | fraction=SRA_read_fractions,
121 | ),
122 | threads: 1
123 | run:
124 | from utils import io
125 |
126 | for i, fraction in enumerate(SRA_read_fractions):
127 | if fraction == "":
128 | fraction = "se"
129 | io.cat_files(input[fraction], output[i])
130 |
131 |
132 | rule download_sra:
133 | input:
134 | expand(
135 | "SRA/Samples/{sample}/{sample}{fraction}.fastq.gz",
136 | fraction=SRA_read_fractions,
137 | sample=SAMPLES,
138 | ),
139 |
--------------------------------------------------------------------------------
/workflow/rules/strains.smk:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | rule instrain_profile:
5 | input:
6 | bam="genomes/alignments/bams/{sample}.bam",
7 | genomes="genomes/all_contigs.fasta",
8 | # genes=lambda wc: get_all_genes(wc, extension=".fna"),
9 | scaffold_to_genome="genomes/clustering/contig2genome.tsv",
10 | output:
11 | directory("strains/intermediate_files/{sample}"),
12 | threads: config["threads"]
13 | params:
14 | extra=config.get("instrain_profile_extra", ""),
15 | log:
16 | "logs/strains/profile/{sample}.log",
17 | conda:
18 | "../envs/instrain.yaml"
19 | benchmark:
20 | "logs/benchmarks/strains/profile/{sample}.tsv"
21 | resources:
22 | mem_mb=config["mem"] * 1000,
23 | time_min=60 * config["runtime"]["long"],
24 | shell:
25 | #" cat {input.genes} > {resources.tmpdir}/all_genome_genes.fna 2> {log} "
26 | #" ; "
27 | "inStrain profile "
28 | " {input.bam} {input.genomes} "
29 | " -o {output} "
30 | " -p {threads} "
31 |
32 | " -s {input.scaffold_to_genome} "
33 | " --database_mode "
34 | " {params.extra} &>> {log}"
35 | #" -g {resources.tmpdir}/all_genome_genes.fna "
36 |
37 |
38 | rule instrain_compare:
39 | input:
40 | profiles=expand("strains/intermediate_files/{sample}", sample=SAMPLES),
41 | scaffold_to_genome="genomes/clustering/contig2genome.tsv",
42 | output:
43 | directory("strains/comparison"),
44 | threads: config["threads"]
45 | params:
46 | extra=config.get("instrain_compare_extra", ""),
47 | log:
48 | "logs/strains/compare.log",
49 | conda:
50 | "../envs/instrain.yaml"
51 | benchmark:
52 | "logs/benchmarks/strains/compare.tsv"
53 | resources:
54 | mem_mb=config["mem"] * 1000,
55 | time_min=60 * config["runtime"]["long"],
56 | shell:
57 | "inStrain compare "
58 | " --input {input.profiles} "
59 | " -o {output} "
60 | " -p {threads} "
61 | " -s {input.scaffold_to_genome} "
62 | " --database_mode "
63 | " {params.extra} &> {log}"
64 |
65 |
66 | # usage: inStrain compare -i [INPUT [INPUT ...]] [-o OUTPUT] [-p PROCESSES] [-d]
67 | # [-h] [--version] [-s [STB [STB ...]]] [-c MIN_COV]
68 | # [-f MIN_FREQ] [-fdr FDR] [--database_mode]
69 | # [--breadth BREADTH] [-sc SCAFFOLDS] [--genome GENOME]
70 | # [--store_coverage_overlap]
71 | # [--store_mismatch_locations]
72 | # [--include_self_comparisons] [--skip_plot_generation]
73 | # [--group_length GROUP_LENGTH] [--force_compress]
74 | # [-ani ANI_THRESHOLD] [-cov COVERAGE_TRESHOLD]
75 | # [--clusterAlg {ward,single,complete,average,weighted,median,centroid}]
76 |
--------------------------------------------------------------------------------
/workflow/scripts/DRAM_get_all_modules.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 |
4 | import sys, os
5 | import logging, traceback
6 |
7 | logging.basicConfig(
8 | filename=snakemake.log[0],
9 | level=logging.INFO,
10 | format="%(asctime)s %(message)s",
11 | datefmt="%Y-%m-%d %H:%M:%S",
12 | )
13 |
14 |
15 | def handle_exception(exc_type, exc_value, exc_traceback):
16 | if issubclass(exc_type, KeyboardInterrupt):
17 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
18 | return
19 |
20 | logging.error(
21 | "".join(
22 | [
23 | "Uncaught exception: ",
24 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
25 | ]
26 | )
27 | )
28 |
29 |
30 | # Install exception handler
31 | sys.excepthook = handle_exception
32 |
33 |
34 | import pandas as pd
35 |
36 | annotation_file = snakemake.input.annotations
37 | module_output_table = snakemake.output[0]
38 |
39 | from mag_annotator.database_handler import DatabaseHandler
40 | from mag_annotator.summarize_genomes import build_module_net, make_module_coverage_frame
41 |
42 | annotations = pd.read_csv(annotation_file, sep="\t", index_col=0)
43 |
44 |
45 | # get db_locs and read in dbs
46 | database_handler = DatabaseHandler(logger=logging, config_loc=snakemake.input.config)
47 |
48 |
49 | if "module_step_form" not in database_handler.config["dram_sheets"]:
50 | raise ValueError(
51 | "Module step form location must be set in order to summarize genomes"
52 | )
53 |
54 | module_steps_form = pd.read_csv(
55 | database_handler.config["dram_sheets"]["module_step_form"], sep="\t"
56 | )
57 |
58 | all_module_nets = {
59 | module: build_module_net(module_df)
60 | for module, module_df in module_steps_form.groupby("module")
61 | }
62 |
63 | module_coverage_frame = make_module_coverage_frame(
64 | annotations, all_module_nets, groupby_column="fasta"
65 | )
66 |
67 | module_coverage_frame.to_csv(module_output_table, sep="\t")
68 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_busco.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 | logging.captureWarnings(True)
12 |
13 |
14 | def handle_exception(exc_type, exc_value, exc_traceback):
15 | if issubclass(exc_type, KeyboardInterrupt):
16 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
17 | return
18 |
19 | logging.error(
20 | "".join(
21 | [
22 | "Uncaught exception: ",
23 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
24 | ]
25 | )
26 | )
27 |
28 |
29 | # Install exception handler
30 | sys.excepthook = handle_exception
31 |
32 | #### Beginning of scripts
33 |
34 | import pandas as pd
35 | from utils.parsers import read_busco_output
36 |
37 |
38 | def main(samples, completeness_files, bin_table):
39 | sample_data = {}
40 | div = {}
41 |
42 | df = pd.DataFrame()
43 |
44 | for i, sample in enumerate(samples):
45 | sample_data = read_busco_output(completeness_files[i])
46 | sample_data["Sample"] = sample
47 |
48 | df = df.append(sample_data)
49 |
50 | # remove missing
51 |
52 | failed_genomes = df.index[df.Dataset.str.lower().str.contains("run failed")]
53 |
54 | if len(failed_genomes) > 0:
55 | logging.warning(
56 | "Following genomes didn't pass BUSCO. I ignore them, because "
57 | "I think theas means they are too bad to be quantified:\n"
58 | f"{failed_genomes}"
59 | )
60 |
61 | df.loc[failed_genomes, ["Completeness", "Contamination", "Quality_score"]] = 0
62 |
63 | df.to_csv(bin_table, sep="\t")
64 |
65 |
66 | if __name__ == "__main__":
67 | main(
68 | samples=snakemake.params.samples,
69 | completeness_files=snakemake.input.completeness_files,
70 | bin_table=snakemake.output.bin_table,
71 | )
72 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_checkm.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 | logging.captureWarnings(True)
12 |
13 |
14 | def handle_exception(exc_type, exc_value, exc_traceback):
15 | if issubclass(exc_type, KeyboardInterrupt):
16 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
17 | return
18 |
19 | logging.error(
20 | "".join(
21 | [
22 | "Uncaught exception: ",
23 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
24 | ]
25 | )
26 | )
27 |
28 |
29 | # Install exception handler
30 | sys.excepthook = handle_exception
31 |
32 | #### Beginning of scripts
33 |
34 | import pandas as pd
35 | from utils.parsers import read_checkm_output
36 |
37 |
38 | def main(samples, completeness_files, taxonomy_files, bin_table):
39 | sample_data = {}
40 | div = {}
41 |
42 | df = pd.DataFrame()
43 |
44 | for i, sample in enumerate(samples):
45 | sample_data = read_checkm_output(
46 | taxonomy_table=taxonomy_files[i], completness_table=completeness_files[i]
47 | )
48 | sample_data["Sample"] = sample
49 |
50 | df = df.append(sample_data)
51 |
52 | df.to_csv(bin_table, sep="\t")
53 |
54 |
55 | if __name__ == "__main__":
56 | main(
57 | samples=snakemake.params.samples,
58 | taxonomy_files=snakemake.input.taxonomy_files,
59 | completeness_files=snakemake.input.completeness_files,
60 | bin_table=snakemake.output.bin_table,
61 | )
62 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_checkm2.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 | logging.captureWarnings(True)
12 |
13 |
14 | def handle_exception(exc_type, exc_value, exc_traceback):
15 | if issubclass(exc_type, KeyboardInterrupt):
16 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
17 | return
18 |
19 | logging.error(
20 | "".join(
21 | [
22 | "Uncaught exception: ",
23 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
24 | ]
25 | )
26 | )
27 |
28 |
29 | # Install exception handler
30 | sys.excepthook = handle_exception
31 |
32 | #### Beginning of scripts
33 |
34 | import pandas as pd
35 | from utils.parsers import read_checkm2_output
36 |
37 |
38 | def main(samples, completeness_files, bin_table):
39 | sample_data = {}
40 | div = {}
41 |
42 | df_list = []
43 |
44 | for i, sample in enumerate(samples):
45 | sample_data = read_checkm2_output(completness_table=completeness_files[i])
46 | sample_data["Sample"] = sample
47 |
48 | df_list.append(sample_data)
49 |
50 | df = pd.concat(df_list, axis=0)
51 |
52 | df.to_csv(bin_table, sep="\t")
53 |
54 |
55 | if __name__ == "__main__":
56 | main(
57 | samples=snakemake.params.samples,
58 | completeness_files=snakemake.input.completeness_files,
59 | bin_table=snakemake.output.bin_table,
60 | )
61 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_contig_stats.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 |
12 | def handle_exception(exc_type, exc_value, exc_traceback):
13 | if issubclass(exc_type, KeyboardInterrupt):
14 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
15 | return
16 |
17 | logging.error(
18 | "".join(
19 | [
20 | "Uncaught exception: ",
21 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
22 | ]
23 | )
24 | )
25 |
26 |
27 | # Install exception handler
28 | sys.excepthook = handle_exception
29 |
30 |
31 | import pandas as pd
32 | from utils.parsers_bbmap import parse_pileup_log_file
33 |
34 |
35 | def parse_map_stats(sample_data, out_tsv):
36 | sample_stats = {}
37 | for sample in sample_data.keys():
38 | df = pd.read_csv(sample_data[sample]["contig_stats"], sep="\t")
39 |
40 | assert df.shape[0] == 1, "Assumed only one row in file {}; found {}".format(
41 | sample_data[sample]["contig_stats"], df.iloc[0]
42 | )
43 |
44 | # n genes
45 | genes_df = pd.read_csv(sample_data[sample]["gene_table"], index_col=0, sep="\t")
46 | df["N_Predicted_Genes"] = genes_df.shape[0]
47 |
48 | # mapping stats
49 | mapping_stats = parse_pileup_log_file(sample_data[sample]["mapping_log"])
50 | df["Assembled_Reads"] = mapping_stats["Mapped reads"]
51 | df["Percent_Assembled_Reads"] = mapping_stats["Percent mapped"]
52 |
53 | logging.info(f"Stats for sample {sample}\n{df}")
54 |
55 | sample_stats[sample] = df
56 |
57 | stats_df = pd.concat(sample_stats, axis=0)
58 | stats_df.index = stats_df.index.get_level_values(0)
59 | # remove contig stats and keep only scaffold stats
60 | stats_df = stats_df.loc[:, ~stats_df.columns.str.startswith("scaf_")]
61 | stats_df.columns = stats_df.columns.str.replace("ctg_", "")
62 | # save
63 | stats_df.to_csv(out_tsv, sep="\t")
64 | return stats_df
65 |
66 |
67 | def main(samples, contig_stats, gene_tables, mapping_logs, combined_stats):
68 | sample_data = {}
69 | for sample in samples:
70 | sample_data[sample] = {}
71 | for c_stat in contig_stats:
72 | # underscore version was for simplified local testing
73 | # if "%s_" % sample in c_stat:
74 | if "%s/" % sample in c_stat:
75 | sample_data[sample]["contig_stats"] = c_stat
76 | for g_table in gene_tables:
77 | # if "%s_" % sample in g_table:
78 | if "%s/" % sample in g_table:
79 | sample_data[sample]["gene_table"] = g_table
80 | for mapping_log in mapping_logs:
81 | # if "%s_" % sample in mapping_log:
82 | if "%s/" % sample in mapping_log:
83 | sample_data[sample]["mapping_log"] = mapping_log
84 |
85 | parse_map_stats(sample_data, combined_stats)
86 |
87 |
88 | if __name__ == "__main__":
89 | main(
90 | samples=snakemake.params.samples,
91 | contig_stats=snakemake.input.contig_stats,
92 | gene_tables=snakemake.input.gene_tables,
93 | mapping_logs=snakemake.input.mapping_logs,
94 | combined_stats=snakemake.output.combined_contig_stats,
95 | )
96 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_coverage_MAGs.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 |
12 | def handle_exception(exc_type, exc_value, exc_traceback):
13 | if issubclass(exc_type, KeyboardInterrupt):
14 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
15 | return
16 |
17 | logging.error(
18 | "".join(
19 | [
20 | "Uncaught exception: ",
21 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
22 | ]
23 | )
24 | )
25 |
26 |
27 | # Install exception handler
28 | sys.excepthook = handle_exception
29 |
30 |
31 | import pandas as pd
32 | import os, gc
33 | from utils.parsers_bbmap import read_coverage_binned, combine_coverages
34 |
35 |
36 | contig2genome = pd.read_csv(
37 | snakemake.input.contig2genome, header=None, index_col=0, sep="\t"
38 | ).iloc[:, 0]
39 |
40 |
41 | # sum counts
42 | logging.info("Loading counts and coverage per contig")
43 |
44 | combined_cov, Counts_contigs = combine_coverages(
45 | snakemake.input.coverage_files, snakemake.params.samples
46 | )
47 |
48 | combined_cov = combined_cov.T
49 |
50 | combined_cov.insert(
51 | 0, "Genome", value=pd.Categorical(contig2genome.loc[combined_cov.index].values)
52 | )
53 |
54 | logging.info(f"Saving coverage to {snakemake.output.coverage_contigs}")
55 |
56 | combined_cov.reset_index().to_parquet(snakemake.output.coverage_contigs)
57 |
58 | logging.info("Sum counts per genome")
59 |
60 | Counts_genome = Counts_contigs.groupby(contig2genome, axis=1).sum().T
61 | Counts_genome.index.name = "Sample"
62 |
63 | logging.info(f"Saving counts to {snakemake.output.counts}")
64 |
65 | Counts_genome.reset_index().to_parquet(snakemake.output.counts)
66 | del Counts_genome, combined_cov, Counts_contigs
67 | gc.collect()
68 |
69 | # Binned coverage
70 | logging.info("Loading binned coverage")
71 | binCov = {}
72 | for i, cov_file in enumerate(snakemake.input.binned_coverage_files):
73 | sample = snakemake.params.samples[i]
74 |
75 | binCov[sample] = read_coverage_binned(cov_file)
76 |
77 | binCov = pd.DataFrame.from_dict(binCov)
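   | # one column per sample; the index comes from read_coverage_binned and is assumed to have
   | # the contig name as its first level (it is mapped to genomes below)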
78 |
79 | logging.info("Add genome information to it")
80 | binCov.insert(
81 | 0,
82 | "Genome",
83 | value=pd.Categorical(contig2genome.loc[binCov.index.get_level_values(0)].values),
84 | )
85 |
86 | gc.collect()
87 | logging.info(f"Saving combined binCov to {snakemake.output.binned_cov}")
88 | binCov.reset_index().to_parquet(snakemake.output.binned_cov)
89 |
90 | # Median coverage
91 | logging.info("Calculate median coverage")
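   | # per genome and sample: the median over all of that genome's coverage bins,
   | # transposed so rows are samples and columns are genomes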
92 | Median_abund = binCov.groupby("Genome").median().T
93 | del binCov
94 | gc.collect()
95 | logging.info(f"Saving mediuan coverage {snakemake.output.median_abund}")
96 | Median_abund.reset_index().to_parquet(snakemake.output.median_abund)
97 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_dram_gene_annotations.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import logging, traceback
3 |
4 | logging.basicConfig(
5 | filename=snakemake.log[0],
6 | level=logging.INFO,
7 | format="%(asctime)s %(message)s",
8 | datefmt="%Y-%m-%d %H:%M:%S",
9 | )
10 |
11 |
12 | def handle_exception(exc_type, exc_value, exc_traceback):
13 | if issubclass(exc_type, KeyboardInterrupt):
14 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
15 | return
16 |
17 | logging.error(
18 | "".join(
19 | [
20 | "Uncaught exception: ",
21 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
22 | ]
23 | )
24 | )
25 |
26 |
27 | # Install exception handler
28 | sys.excepthook = handle_exception
29 |
30 |
31 | from pathlib import Path
32 | import numpy as np
33 | import pandas as pd
34 | from collections import defaultdict
35 |
36 | db_columns = {
37 | "kegg": ["ko_id", "kegg_hit"],
38 | "peptidase": [
39 | "peptidase_id",
40 | "peptidase_family",
41 | "peptidase_hit",
42 | "peptidase_RBH",
43 | "peptidase_identity",
44 | "peptidase_bitScore",
45 | "peptidase_eVal",
46 | ],
47 | "pfam": ["pfam_hits"],
48 | "cazy": ["cazy_ids", "cazy_hits", "cazy_subfam_ec", "cazy_best_hit"],
49 | # "heme": ["heme_regulatory_motif_count"],
50 | }
51 |
52 | Tables = defaultdict(list)
53 |
54 | for file in snakemake.input:
55 | df = pd.read_csv(file, index_col=0, sep="\t")
56 |
57 | # drop un-annotated genes
58 | df = df.query("rank!='E'")
59 |
60 |     # convert the index from e.g. 'subset1_Gene111'
61 |     # to the integer gene number 111
62 | df.index = (
63 | df.index.str.split("_", n=1, expand=True)
64 | .get_level_values(1)
65 | .str[len("Gene") :]
66 | .astype(np.int64)
67 | )
68 | df.index.name = "GeneNr"
69 |
70 | # select columns, drop na rows and append to list
71 | for db in db_columns:
72 | cols = db_columns[db]
73 |
74 | if not df.columns.intersection(cols).empty:
75 | Tables[db].append(df[cols].dropna(axis=0, how="all"))
76 |
77 | del df
78 |
79 | out_dir = Path(snakemake.output[0])
80 | out_dir.mkdir()
81 |
82 | for db in Tables:
83 | combined = pd.concat(Tables[db], axis=0)
84 |
85 | combined.sort_index(inplace=True)
86 |
87 | combined.reset_index().to_parquet(out_dir / (db + ".parquet"))
88 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_gene_coverages.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os, sys
3 | import logging, traceback
4 |
5 | logging.basicConfig(
6 | filename=snakemake.log[0],
7 | level=logging.INFO,
8 | format="%(asctime)s %(message)s",
9 | datefmt="%Y-%m-%d %H:%M:%S",
10 | )
11 |
12 |
13 | def handle_exception(exc_type, exc_value, exc_traceback):
14 | if issubclass(exc_type, KeyboardInterrupt):
15 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
16 | return
17 |
18 | logging.error(
19 | "".join(
20 | [
21 | "Uncaught exception: ",
22 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
23 | ]
24 | )
25 | )
26 |
27 |
28 | # Install exception handler
29 | sys.excepthook = handle_exception
30 |
31 | #### Beginning of script
32 | import numpy as np
33 | import pandas as pd
34 | import gc, os
35 |
36 |
37 | import h5py
38 |
39 | import psutil
42 |
43 |
44 | def measure_memory(write_log_entry=True):
45 |     mem_usage = psutil.Process().memory_info().rss / (1024 * 1024)
46 |
47 |     if write_log_entry:
48 |         logging.info(f"The process is currently using {mem_usage: 7.0f} MB of RAM")
49 |
50 |     return mem_usage
51 |
52 |
53 | logging.info("Start")
54 | measure_memory()
55 |
56 | N_samples = len(snakemake.input.covstats)
57 |
58 | logging.info("Read gene info")
59 |
60 | gene_info = pd.read_table(snakemake.input.info)
61 |
62 | # Gene name is the part of the '#Name' column before the first space
63 | gene_info.index = gene_info["#Name"].str.split(" ", n=1, expand=True)[0]
64 | gene_info.index.name = "GeneName"
65 | gene_info.drop("#Name", axis=1, inplace=True)
66 |
67 | # sort by gene name so rows align with the per-sample coverage tables
68 | gene_info.sort_index(inplace=True)
69 | N_genes = gene_info.shape[0]
74 |
75 | gene_info[
76 | ["Samples_nz_coverage", "Samples_nz_counts", "Sum_coverage", "Max_coverage"]
77 | ] = 0
78 |
79 |
80 | # gene_list= gene_info.index
81 |
82 |
83 | logging.info("Open hdf files for writing")
84 |
85 | gene_matrix_shape = (N_samples, N_genes)
86 |
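   | # coverage and counts are stored as dense (samples x genes) gzip-compressed HDF5 datasets,
   | # filled one sample (row) at a time to keep memory usage bounded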
87 | with h5py.File(snakemake.output.cov, "w") as hdf_cov_file, h5py.File(
88 | snakemake.output.counts, "w"
89 | ) as hdf_counts_file:
90 | combined_cov = hdf_cov_file.create_dataset(
91 | "data", shape=gene_matrix_shape, fillvalue=0, compression="gzip"
92 | )
93 | combined_counts = hdf_counts_file.create_dataset(
94 | "data", shape=gene_matrix_shape, fillvalue=0, compression="gzip"
95 | )
96 |
97 |     # add sample names attribute
98 | sample_names = np.array(list(snakemake.params.samples)).astype("S")
99 | combined_cov.attrs["sample_names"] = sample_names
100 | combined_counts.attrs["sample_names"] = sample_names
101 |
102 | gc.collect()
103 |
104 | Summary = {}
105 |
106 | logging.info("Start reading files")
107 | initial_mem_uage = measure_memory()
108 |
109 | for i, sample in enumerate(snakemake.params.samples):
110 | logging.info(f"Read coverage file for sample {i+1} / {N_samples}")
111 | sample_cov_file = snakemake.input.covstats[i]
112 |
113 | data = pd.read_parquet(
114 | sample_cov_file, columns=["GeneName", "Reads", "Median_fold"]
115 | ).set_index("GeneName")
116 |
117 | assert (
118 | data.shape[0] == N_genes
119 | ), f"I only have {data.shape[0]} /{N_genes} in the file {sample_cov_file}"
120 |
121 |         # genes must already be sorted by index so values align across samples
122 | assert (
123 | data.index.is_monotonic_increasing
124 | ), f"data is not sorted by index in {sample_cov_file}"
125 |
126 | # downcast data
127 | # median is int
128 | Median_fold = pd.to_numeric(data.Median_fold, downcast="integer")
129 | Reads = pd.to_numeric(data.Reads, downcast="integer")
130 |
131 |         # delete intermediate data and release memory
132 | del data
133 |
134 | # get summary statistics per sample
135 | logging.debug("Extract Summary statistics")
136 |
137 | Summary[sample] = {
138 | "Sum_coverage": Median_fold.sum(),
139 | "Total_counts": Reads.sum(),
140 | "Genes_nz_counts": (Reads > 0).sum(),
141 | "Genes_nz_coverage": (Median_fold > 0).sum(),
142 | }
143 |
144 | # get gene wise stats
145 | gene_info["Samples_nz_counts"] += (Reads > 0) * 1
146 | gene_info["Samples_nz_coverage"] += (Median_fold > 0) * 1
147 | gene_info["Sum_coverage"] += Median_fold
148 |
149 | gene_info["Max_coverage"] = np.fmax(gene_info["Max_coverage"], Median_fold)
150 |
151 | combined_cov[i, :] = Median_fold.values
152 | combined_counts[i, :] = Reads.values
153 |
154 | del Median_fold, Reads
155 | gc.collect()
156 |
157 | current_mem_uage = measure_memory()
158 |
159 |
160 | logging.info("All samples processed")
161 | gc.collect()
162 |
163 | logging.info("Save sample Summary")
164 | pd.DataFrame(Summary).T.to_csv(snakemake.output.sample_info, sep="\t")
165 |
166 |
167 | logging.info("Save gene Summary")
168 |
169 | # downcast
170 | for col in gene_info.columns:
171 | if col == "GC":
172 | gene_info[col] = pd.to_numeric(gene_info[col], downcast="float")
173 | else:
174 | gene_info[col] = pd.to_numeric(gene_info[col], downcast="integer")
175 |
176 | gene_info.reset_index().to_parquet(snakemake.output.gene_info)
177 |
--------------------------------------------------------------------------------
/workflow/scripts/combine_taxonomy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os, sys
3 | import logging, traceback
4 |
5 | logging.basicConfig(
6 | filename=snakemake.log[0],
7 | level=logging.INFO,
8 | format="%(asctime)s %(message)s",
9 | datefmt="%Y-%m-%d %H:%M:%S",
10 | )
11 |
12 |
13 | def handle_exception(exc_type, exc_value, exc_traceback):
14 | if issubclass(exc_type, KeyboardInterrupt):
15 | sys.__excepthook__(exc_type, exc_value, exc_traceback)
16 | return
17 |
18 | logging.error(
19 | "".join(
20 | [
21 | "Uncaught exception: ",
22 | *traceback.format_exception(exc_type, exc_value, exc_traceback),
23 | ]
24 | )
25 | )
26 |
27 |
28 | # Install exception handler
29 | sys.excepthook = handle_exception
30 |
31 | #### Beginning of script
32 |
33 | import pandas as pd
34 | import numpy as np
35 | from utils.taxonomy import tax2table
36 |
37 | from glob import glob
38 |
39 | gtdb_classify_folder = snakemake.input.folder
40 |
41 | taxonomy_files = glob(f"{gtdb_classify_folder}/gtdbtk.*.summary.tsv")
42 |
43 | N_taxonomy_files = len(taxonomy_files)
44 | logging.info(f"Found {N_taxonomy_files} gtdb taxonomy files.")
45 |
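   | # GTDB-Tk typically writes one summary file per domain (bacteria and archaea),
   | # so one or two summary files are expected here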
46 | if (0 == N_taxonomy_files) or (N_taxonomy_files > 2):
47 | raise Exception(
48 | f"Found {N_taxonomy_files} number of taxonomy files 'gtdbtk.*.summary.tsv' in {gtdb_classify_folder} expect 1 or 2."
49 | )
50 |
51 |
52 | DT = pd.concat([pd.read_table(file, index_col=0) for file in taxonomy_files], axis=0)
53 |
54 | DT.to_csv(snakemake.output.combined)
55 |
56 | Tax = tax2table(DT.classification, remove_prefix=True)
57 | Tax.to_csv(snakemake.output.taxonomy, sep="\t")
58 |
--------------------------------------------------------------------------------
/workflow/scripts/convert_jgi2vamb_coverage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 | import re
5 |
6 |
7 | def main(jgi_file):
8 | # parsing input
9 | header = {}
10 | col2keep = ["contigName", "contigLen", "totalAvgDepth"]
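   |     # besides the three columns above, keep only the per-sample ".bam" depth columns;
   |     # any accompanying "*.bam-var" variance columns from jgi_summarize_bam_contig_depths are dropped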
11 | with open(jgi_file) as inF:
12 | for i, line in enumerate(inF):
13 | line = line.rstrip().split("\t")
14 | if i == 0:
15 | header = {x: ii for ii, x in enumerate(line)}
16 | col2keep += [x for x in line if x.endswith(".bam")]
17 | print("\t".join(col2keep))
18 | continue
19 | elif line[0] == "":
20 | continue
21 | # contig ID
22 | contig = line[header["contigName"]]
23 | # collect per-sample info
24 | out = []
25 | for col in col2keep:
26 | out.append(line[header[col]])
27 | print("\t".join(out))
28 |
29 |
30 | if __name__ == "__main__":
31 | if "snakemake" in globals():
32 | with open(snakemake.log[0], "w") as log:
33 | sys.stderr = log
34 |
35 | with open(snakemake.output[0], "w") as outf:
36 | sys.stdout = outf
37 |
38 | main(snakemake.input[0])
39 |
40 | else:
41 | import argparse
42 | import logging
43 |
44 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
45 |
46 | class CustomFormatter(
47 | argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
48 | ):
49 | pass
50 |
51 | desc = (
52 | "Converting jgi_summarize_bam_contig_depths output to format used by VAMB"
53 | )
54 | epi = """DESCRIPTION:
55 | Output format: contigName