├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md └── workflows │ ├── codeql.yml │ ├── pypi-publish.yml │ └── testing.yml ├── .gitignore ├── .readthedocs.yaml ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── build ├── environment.yml └── meta.yaml ├── container ├── Dockerfile └── README.md ├── docs ├── annotation.md ├── assemble.md ├── citation.md ├── comparison.md ├── faq.md ├── graph_stats.md ├── images │ ├── Phables_workflow.png │ ├── components.png │ ├── histogram_n_nodes.png │ ├── pearson_clustermap.png │ ├── pearson_heatmap.png │ ├── phables_logo.png │ ├── phage_comp_280_cycle_1_plot.png │ ├── qual_resolved_genome_unitig_boxen.png │ └── qual_resolved_genome_unitig_violin.png ├── index.md ├── install.md ├── quality.md ├── requirements.txt └── usage.md ├── mkdocs.yml ├── phables ├── __init__.py ├── __main__.py ├── config │ ├── config.yaml │ └── databases.yaml ├── phables.CITATION ├── phables.LICENSE ├── phables.VERSION ├── test_data │ ├── assembly_graph.gfa │ ├── edge_coverages.tsv │ ├── edges.fasta.hmmout │ ├── junction_pe_coverage.pickle │ └── phrogs_annotations.tsv └── workflow │ ├── envs │ ├── curl.yaml │ ├── koverage.yaml │ ├── mapping.yaml │ ├── mmseqs.yaml │ ├── phables.yaml │ └── smg.yaml │ ├── install.smk │ ├── phables.smk │ ├── rules │ ├── 00_database_preflight.smk │ ├── 02_phables_preflight.smk │ ├── 02_phables_targets.smk │ ├── 03_test_preflight.smk │ ├── 03_test_targets.smk │ ├── coverage.smk │ ├── genes.smk │ ├── gfa2fasta.smk │ ├── phables.smk │ └── postprocess.smk │ ├── scripts │ ├── combine_cov.py │ ├── format_koverage_results.py │ ├── gfa2fasta.py │ ├── phables.py │ └── phables_utils │ │ ├── FD_Inexact.py │ │ ├── __init__.py │ │ ├── component_utils.py │ │ ├── coverage_utils.py │ │ ├── edge_graph_utils.py │ │ ├── flow_utils.py │ │ ├── gene_utils.py │ │ ├── genome_utils.py │ │ ├── long_utils.py │ │ ├── output_utils.py │ │ ├── phrogs │ │ └── 
phrog_annot.tsv │ │ └── short_utils.py │ └── test_phables.smk ├── phables_logo.png ├── phables_logo_dark.png ├── phables_logo_light.png ├── setup.py └── tests └── test_phables.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behaviour, including the 15 | 1. Command executed 16 | 2. Error message 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Desktop (please complete the following information):** 25 | - OS: [e.g. iOS] 26 | - Browser [e.g. chrome, safari] 27 | - Version [e.g. 22] 28 | 29 | **Additional context** 30 | Add any other context about the problem here. 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[Feature request]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "develop" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "develop" ] 20 | schedule: 21 | - cron: '39 18 * * 0' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Use only 'java' to analyze code written in Java, Kotlin or both 38 | # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both 39 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 40 | 41 | steps: 42 | - name: Checkout repository 43 | uses: actions/checkout@v3 44 | 45 | # Initializes the CodeQL tools for scanning. 46 | - name: Initialize CodeQL 47 | uses: github/codeql-action/init@v2 48 | with: 49 | languages: ${{ matrix.language }} 50 | # If you wish to specify custom queries, you can do so here or in a config file. 51 | # By default, queries listed here will override any specified in a config file. 52 | # Prefix the list here with "+" to use these queries and those in the config file. 53 | 54 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 55 | # queries: security-extended,security-and-quality 56 | 57 | 58 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 59 | # If this step fails, then you should remove it and run the build manually (see below) 60 | - name: Autobuild 61 | uses: github/codeql-action/autobuild@v2 62 | 63 | # ℹ️ Command-line programs to run using the OS shell. 
64 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 65 | 66 | # If the Autobuild fails above, remove it and uncomment the following three lines. 67 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 68 | 69 | # - run: | 70 | # echo "Run, Build Application using script" 71 | # ./location_of_script_within_repo/buildscript.sh 72 | 73 | - name: Perform CodeQL Analysis 74 | uses: github/codeql-action/analyze@v2 75 | with: 76 | category: "/language:${{matrix.language}}" 77 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ develop ] 6 | pull_request: 7 | branches: [ develop ] 8 | 9 | 10 | jobs: 11 | tests: 12 | name: "Python ${{ matrix.python-version }}" 13 | runs-on: ${{ matrix.os }} 14 | 15 | defaults: 16 | run: 17 | shell: bash -el {0} 18 | 19 | strategy: 20 | matrix: 21 | os: [ubuntu-latest, macos-latest] 22 | python-version: ["3.9", "3.10"] 23 | 24 | steps: 25 | - uses: "actions/checkout@v3" 26 | with: 27 | fetch-depth: 0 28 | 29 | # Setup env 30 | - uses: conda-incubator/setup-miniconda@v3 31 | with: 32 | activate-environment: phables 33 | environment-file: build/environment.yml 34 | python-version: ${{ matrix.python-version }} 35 | auto-activate-base: false 36 | 37 | - name: "Setup Phables on ${{ matrix.os }} for Python ${{ matrix.python-version }}" 38 | run: | 39 | python -m pip install --upgrade pip 40 | pip install . 
41 | 42 | - name: "Generate coverage report on ${{ matrix.os }} for Python ${{ matrix.python-version }}" 43 | run: | 44 | pip install pytest pytest-cov 45 | pytest --cov=./ --cov-report xml --cov-report lcov --cov-append 46 | 47 | - name: Coveralls Parallel 48 | uses: coverallsapp/github-action@master 49 | with: 50 | parallel: true 51 | github-token: ${{ secrets.github_token }} 52 | flag-name: run-${{ matrix.test_number }} 53 | path-to-lcov: "coverage.lcov" 54 | 55 | finish: 56 | needs: tests 57 | runs-on: ${{ matrix.os }} 58 | strategy: 59 | matrix: 60 | os: [ubuntu-latest, macos-latest] 61 | python-version: ["3.9", "3.10"] 62 | steps: 63 | - name: Coveralls Finished 64 | uses: coverallsapp/github-action@master 65 | with: 66 | github-token: ${{ secrets.github_token }} 67 | parallel-finished: true 68 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | develop-eggs/ 12 | dist/ 13 | downloads/ 14 | eggs/ 15 | .eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | pip-wheel-metadata/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 91 | #Pipfile.lock 92 | 93 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 94 | __pypackages__/ 95 | 96 | # Celery stuff 97 | celerybeat-schedule 98 | celerybeat.pid 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | # Mac OS 131 | *.DS_Store 132 | ./**/.DS_Store 133 | 134 | # Snakemake 135 | .snakemake/ 136 | phables/workflow/conda/ 137 | phables.out/ 138 | 139 | # Databases 140 | databases/ 141 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.9" 13 | 14 | mkdocs: 15 | configuration: mkdocs.yml 16 | 17 | # Optionally declare the Python requirements required to build your docs 18 | python: 19 | install: 20 | - requirements: docs/requirements.txt 21 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite our article in Bioinformatics as below." 3 | authors: 4 | - family-names: "Mallawaarachchi" 5 | given-names: "Vijini" 6 | orcid: "https://orcid.org/0000-0002-2651-8719" 7 | - family-names: "Roach" 8 | given-names: "Michael J." 
9 | orcid: "https://orcid.org/0000-0003-1488-5148" 10 | - family-names: "Decewicz" 11 | given-names: "Przemyslaw" 12 | orcid: "https://orcid.org/0000-0002-5621-7124" 13 | - family-names: "Papudeshi" 14 | given-names: "Bhavya" 15 | orcid: "https://orcid.org/0000-0001-5359-3100" 16 | - family-names: "Giles" 17 | given-names: "Sarah K." 18 | orcid: "https://orcid.org/0000-0002-4395-060X" 19 | - family-names: "Grigson" 20 | given-names: "Susanna R." 21 | orcid: "https://orcid.org/0000-0003-4738-3451" 22 | - family-names: "Bouras" 23 | given-names: "George" 24 | orcid: "https://orcid.org/0000-0002-5885-4186" 25 | - family-names: "Hesse" 26 | given-names: "Ryan D." 27 | orcid: "https://orcid.org/0000-0001-9366-5631" 28 | - family-names: "Inglis" 29 | given-names: "Laura K." 30 | orcid: "https://orcid.org/0000-0001-7919-8563" 31 | - family-names: "Hutton" 32 | given-names: "Abbey LK." 33 | orcid: "https://orcid.org/0000-0002-2474-1327" 34 | - family-names: "Dinsdale" 35 | given-names: "Elizabeth A." 36 | orcid: "https://orcid.org/0000-0002-2177-203X" 37 | - family-names: "Edwards" 38 | given-names: "Robert A." 39 | orcid: "https://orcid.org/0000-0001-8383-8949" 40 | title: "Phables: from fragmented assemblies to high-quality bacteriophage genomes" 41 | doi: 10.1093/bioinformatics/btad586 42 | date-released: 2017-12-18 43 | url: "https://github.com/Vini2/phables" 44 | preferred-citation: 45 | type: article 46 | authors: 47 | - family-names: "Mallawaarachchi" 48 | given-names: "Vijini" 49 | orcid: "https://orcid.org/0000-0002-2651-8719" 50 | - family-names: "Roach" 51 | given-names: "Michael J." 52 | orcid: "https://orcid.org/0000-0003-1488-5148" 53 | - family-names: "Decewicz" 54 | given-names: "Przemyslaw" 55 | orcid: "https://orcid.org/0000-0002-5621-7124" 56 | - family-names: "Papudeshi" 57 | given-names: "Bhavya" 58 | orcid: "https://orcid.org/0000-0001-5359-3100" 59 | - family-names: "Giles" 60 | given-names: "Sarah K." 
61 | orcid: "https://orcid.org/0000-0002-4395-060X" 62 | - family-names: "Grigson" 63 | given-names: "Susanna R." 64 | orcid: "https://orcid.org/0000-0003-4738-3451" 65 | - family-names: "Bouras" 66 | given-names: "George" 67 | orcid: "https://orcid.org/0000-0002-5885-4186" 68 | - family-names: "Hesse" 69 | given-names: "Ryan D." 70 | orcid: "https://orcid.org/0000-0001-9366-5631" 71 | - family-names: "Inglis" 72 | given-names: "Laura K." 73 | orcid: "https://orcid.org/0000-0001-7919-8563" 74 | - family-names: "Hutton" 75 | given-names: "Abbey LK." 76 | orcid: "https://orcid.org/0000-0002-2474-1327" 77 | - family-names: "Dinsdale" 78 | given-names: "Elizabeth A." 79 | orcid: "https://orcid.org/0000-0002-2177-203X" 80 | - family-names: "Edwards" 81 | given-names: "Robert A." 82 | orcid: "https://orcid.org/0000-0001-8383-8949" 83 | doi: "10.1093/bioinformatics/btad586" 84 | journal: "Bioinformatics" 85 | month: 9 86 | title: "Phables: from fragmented assemblies to high-quality bacteriophage genomes" 87 | start: "btad586" 88 | issue: 10 89 | volume: 39 90 | year: 2023 -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | [Vijini Mallawaarachchi](mailto:viji.mallawaarachchi@gmail.com). 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Phables project 2 | 3 | We love to have your contributions to the Phables project, whether it's: 4 | * Reporting a bug 5 | * Submitting a fix 6 | * Proposing new features 7 | 8 | ## Clone and install Phables onto your machine 9 | 10 | First, make sure you have [git](https://github.com/git-guides/install-git) installed on your machine. 
11 | 12 | On GitHub, [fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the Phables repository and [clone](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) it to your machine. 13 | 14 | ```bash 15 | # clone repository to your local machine 16 | git clone https://github.com/Vini2/phables.git 17 | ``` 18 | 19 | Move to the Phables directory 20 | 21 | ```bash 22 | cd phables 23 | ``` 24 | 25 | Create and activate the conda environment. Make sure to have [`conda`](https://docs.conda.io/en/latest/) installed. 26 | 27 | ```bash 28 | # Create the phables environment 29 | conda env create -f build/environment.yml 30 | 31 | # Activate the phables environment 32 | conda activate phables 33 | ``` 34 | 35 | Now install Phables via [`pip`](https://pip.pypa.io/en/stable/). 36 | 37 | ```bash 38 | pip install -e . 39 | ``` 40 | 41 | ## Test Phables installation 42 | 43 | Print the help message using the following command. 44 | 45 | ```bash 46 | phables -h 47 | ``` 48 | 49 | Use the following command to launch the phables test run and all the tests should pass. 50 | 51 | ```bash 52 | phables test 53 | ``` 54 | 55 | ## Coding Style 56 | 57 | We adhere to the [PEP 8](https://peps.python.org/pep-0008/) style guide. 58 | 59 | Before committing, make sure to run [`black`](https://pypi.org/project/black/) and [`isort`](https://pypi.org/project/isort/). 60 | 61 | ```bash 62 | black phables/workflow/scripts 63 | isort --atomic phables/workflow/scripts 64 | ``` 65 | 66 | ## Report bugs using GitHub's issues 67 | 68 | We use GitHub issues to track public bugs. Report a bug by opening a new issue in GitHub [issues](https://github.com/Vini2/phables/issues). You will get to select between templates for bug report and feature request. If none of these templates match what you want to report, you can use the custom issue template. 
69 | 70 | ## Committing code 71 | 72 | Once you have finished coding and all the tests pass, commit your code and make a pull request. 73 | 74 | ```bash 75 | # Add changed/added files 76 | git add <files> 77 | 78 | # Commit changes 79 | git commit -m "<commit message>" 80 | 81 | # Push changes 82 | git push 83 | ``` 84 | 85 | Make sure to follow the commit style of [c3dev](https://github.com/cogent3/c3dev/wiki#style-for-commit-messages). Relevant prefixes are replicated below for convenience. 86 | 87 | | **Commit Prefix** | **For** | 88 | |-------------------|-----------------------------------------------| 89 | | DEV: | development tool or utility | 90 | | DOC: | documentation | 91 | | TST: | addition or modification of tests | 92 | | REL: | related to a release | 93 | | MAINT: | maintenance commit (refactoring, typos, etc.) | 94 | | BUG: | bug fix | 95 | | GIT: | git related | 96 | | REV: | revert an earlier commit | 97 | 98 | 99 | Your contribution will be reviewed before it is accepted. 100 | 101 | ## License 102 | 103 | By contributing, you agree that your contributions will be licensed under the MIT License. 104 | 105 | ## References 106 | 107 | This document was adapted from the open-source contribution guidelines for [Transcriptase](https://github.com/briandk/transcriptase-atom/blob/master/CONTRIBUTING.md) and [c3dev](https://github.com/cogent3/c3dev/wiki/How-to-Contribute-Code). 
108 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Vijini Mallawaarachchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include phables/phables.LICENSE 2 | include phables/phables.CITATION 3 | include phables/phables.VERSION 4 | recursive-include phables * 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | phables logo 3 | phables logo 4 |

5 | 6 | Phables: from fragmented assemblies to high-quality bacteriophage genomes 7 | =============== 8 | 9 | [![DOI](https://img.shields.io/badge/DOI-10.1093/bioinformatics/btad586-blue)](https://doi.org/10.1093/bioinformatics/btad586) 10 | ![GitHub](https://img.shields.io/github/license/Vini2/phables) 11 | [![](https://img.shields.io/static/v1?label=CLI&message=Snaketool&color=blueviolet)](https://github.com/beardymcjohnface/Snaketool) 12 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 13 | ![GitHub last commit (branch)](https://img.shields.io/github/last-commit/Vini2/phables/develop?color=8a35da) 14 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/phables/README.html) 15 | [![Conda](https://img.shields.io/conda/v/bioconda/phables)](https://anaconda.org/bioconda/phables) 16 | [![Conda](https://img.shields.io/conda/dn/bioconda/phables)](https://anaconda.org/bioconda/phables) 17 | [![PyPI version](https://badge.fury.io/py/phables.svg)](https://badge.fury.io/py/phables) 18 | [![Downloads](https://static.pepy.tech/badge/phables)](https://pepy.tech/project/phables) 19 | [![CI](https://github.com/Vini2/phables/actions/workflows/testing.yml/badge.svg)](https://github.com/Vini2/phables/actions/workflows/testing.yml) 20 | [![CodeQL](https://github.com/Vini2/phables/actions/workflows/codeql.yml/badge.svg)](https://github.com/Vini2/phables/actions/workflows/codeql.yml) 21 | [![Documentation Status](https://readthedocs.org/projects/phables/badge/?version=latest)](https://phables.readthedocs.io/en/latest/?badge=latest) 22 | 23 | Phables is a tool developed to resolve bacteriophage genomes using assembly graphs of viral metagenomic data. 
It models phage-like components in the viral metagenomic assembly as flow networks, formulates genome resolution as a minimum flow decomposition problem, and resolves the genomic paths corresponding to the flow paths determined. Phables uses the [Minimum Flow Decomposition via Integer Linear Programming](https://github.com/algbio/MFD-ILP) implementation to obtain the flow paths. 24 | 25 | For detailed instructions on installation and usage, please refer to the [**documentation hosted at Read the Docs**](https://phables.readthedocs.io/en/latest/). 26 | 27 | Phables is available on Bioconda at [https://anaconda.org/bioconda/phables](https://anaconda.org/bioconda/phables) and on PyPI at [https://pypi.org/project/phables/](https://pypi.org/project/phables/). Feel free to pick your package manager, but we recommend that you use [`conda`](https://docs.conda.io/en/latest/). 28 | 29 | **NEW:** Phables is now available as a Docker container from [Docker Hub](https://hub.docker.com/r/linsalrob/phables). Click [here](https://github.com/Vini2/phables/tree/develop/container) for more details. 30 | 31 | ## Setting up Phables 32 | 33 | ### Option 1: Installing Phables using conda (recommended) 34 | 35 | You can install Phables from Bioconda at [https://anaconda.org/bioconda/phables](https://anaconda.org/bioconda/phables). Make sure you have [`conda`](https://docs.conda.io/en/latest/) installed. 36 | 37 | ```bash 38 | # create conda environment and install phables 39 | conda create -n phables -c conda-forge -c anaconda -c bioconda phables 40 | 41 | # activate environment 42 | conda activate phables 43 | ``` 44 | 45 | Now you can go to [Setting up Gurobi](#setting-up-gurobi) to configure Gurobi. 46 | 47 | ### Option 2: Installing Phables using pip 48 | 49 | You can install Phables from PyPI at [https://pypi.org/project/phables/](https://pypi.org/project/phables/). Make sure you have [`pip`](https://pip.pypa.io/en/stable/) and [`mamba`](https://mamba.readthedocs.io/en/latest/index.html) installed. 
50 | 51 | ```bash 52 | pip install phables 53 | ``` 54 | 55 | Now you can go to [Setting up Gurobi](#setting-up-gurobi) to configure Gurobi. 56 | 57 | ### Setting up Gurobi 58 | 59 | The MFD implementation uses the linear programming solver [Gurobi](https://www.gurobi.com/). The `phables` conda environment and pip setup does not include Gurobi. You have to install Gurobi using one of the following commands depending on your package manager. 60 | 61 | ```bash 62 | # conda 63 | conda install -c gurobi gurobi 64 | 65 | # pip 66 | pip install gurobipy 67 | ``` 68 | 69 | To handle large models without any model size limitations, once you have installed Gurobi, you have to activate the (academic) license and add the key using the following command. You only have to do this once. 70 | 71 | ```bash 72 | grbgetkey 73 | ``` 74 | 75 | You can refer to further instructions at [https://www.gurobi.com/academia/academic-program-and-licenses/](https://www.gurobi.com/academia/academic-program-and-licenses/). 76 | 77 | ### Test the installation 78 | 79 | After setting up, run the following command to print out the Phables help message. 80 | 81 | ```bash 82 | phables --help 83 | ``` 84 | 85 | ## Quick Start Guide 86 | 87 | Phables is powered by [Snaketool](https://github.com/beardymcjohnface/Snaketool) which packs in all the setup, testing, preprocessing and running steps into an easy-to-use pipeline. 
88 | 89 | ### Setup the databases 90 | 91 | ```bash 92 | # Download and setup the databases - you only have to do this once 93 | phables install 94 | ``` 95 | 96 | ### Run on test data 97 | 98 | ```bash 99 | phables test 100 | ``` 101 | 102 | ### Run on your own data 103 | 104 | ```bash 105 | # Run Phables using short read data 106 | phables run --input assembly_graph.gfa --reads fastq/ --threads 8 107 | 108 | # Run Phables using long read data 109 | phables run --input assembly_graph.gfa --reads fastq/ --threads 8 --longreads 110 | ``` 111 | 112 | Please refer to the [**documentation hosted at Read the Docs**](https://phables.readthedocs.io/en/latest/) for further information on how to run Phables. 113 | 114 | 115 | ## Issues and Questions 116 | 117 | If you want to test (or break) Phables give it a try and report any issues and suggestions under [Phables Issues](https://github.com/Vini2/phables/issues). 118 | 119 | If you come across any questions, please have a look at the [Phables FAQ page](https://phables.readthedocs.io/en/latest/faq/). If your question is not here, feel free to post it under [Phables Issues](https://github.com/Vini2/phables/issues). 120 | 121 | 122 | ## Contributing to Phables 123 | 124 | Are you interested in contributing to the Phables project? If so, you can check out the contributing guidelines in [CONTRIBUTING.md](https://github.com/Vini2/phables/blob/develop/CONTRIBUTING.md). 125 | 126 | 127 | ## Acknowledgement 128 | 129 | Phables uses the [Gurobi](https://www.gurobi.com/) implementation of [MFD-ILP](https://github.com/algbio/MFD-ILP) and code snippets from [STRONG](https://github.com/chrisquince/STRONG), [METAMVGL](https://github.com/ZhangZhenmiao/METAMVGL), [GraphBin](https://github.com/metagentools/GraphBin), [MetaCoAG](https://github.com/metagentools/MetaCoAG) and [Hecatomb](https://hecatomb.readthedocs.io/en/latest/). 
Special thanks are owed to [Ryan Wick](https://github.com/rrwick) for developing [Bandage](https://rrwick.github.io/Bandage/) to visualise assembly graphs, which I heavily rely upon to investigate, develop and optimise my methods. The Phables logo was designed by [Amber Skye](https://fame.flinders.edu.au/people/2021/01/01/amber-cook). 130 | 131 | ## Citation 132 | Phables is published in [Bioinformatics](https://academic.oup.com/bioinformatics) at DOI: [10.1093/bioinformatics/btad586](https://doi.org/10.1093/bioinformatics/btad586). 133 | 134 | If you use Phables in your work, please cite Phables as, 135 | 136 | > Vijini Mallawaarachchi, Michael J Roach, Przemyslaw Decewicz, Bhavya Papudeshi, Sarah K Giles, Susanna R Grigson, George Bouras, Ryan D Hesse, Laura K Inglis, Abbey L K Hutton, Elizabeth A Dinsdale, Robert A Edwards, Phables: from fragmented assemblies to high-quality bacteriophage genomes, Bioinformatics, Volume 39, Issue 10, October 2023, btad586, https://doi.org/10.1093/bioinformatics/btad586 137 | 138 | ```bibtex 139 | @article{10.1093/bioinformatics/btad586, 140 | author = {Mallawaarachchi, Vijini and Roach, Michael J and Decewicz, Przemyslaw and Papudeshi, Bhavya and Giles, Sarah K and Grigson, Susanna R and Bouras, George and Hesse, Ryan D and Inglis, Laura K and Hutton, Abbey L K and Dinsdale, Elizabeth A and Edwards, Robert A}, 141 | title = "{Phables: from fragmented assemblies to high-quality bacteriophage genomes}", 142 | journal = {Bioinformatics}, 143 | volume = {39}, 144 | number = {10}, 145 | pages = {btad586}, 146 | year = {2023}, 147 | month = {09}, 148 | abstract = "{Microbial communities have a profound impact on both human health and various environments. Viruses infecting bacteria, known as bacteriophages or phages, play a key role in modulating bacterial communities within environments. 
High-quality phage genome sequences are essential for advancing our understanding of phage biology, enabling comparative genomics studies and developing phage-based diagnostic tools. Most available viral identification tools consider individual sequences to determine whether they are of viral origin. As a result of challenges in viral assembly, fragmentation of genomes can occur, and existing tools may recover incomplete genome fragments. Therefore, the identification and characterization of novel phage genomes remain a challenge, leading to the need of improved approaches for phage genome recovery. We introduce Phables, a new computational method to resolve phage genomes from fragmented viral metagenome assemblies. Phables identifies phage-like components in the assembly graph, models each component as a flow network, and uses graph algorithms and flow decomposition techniques to identify genomic paths. Experimental results of viral metagenomic samples obtained from different environments show that Phables recovers on average over 49\\% more high-quality phage genomes compared to existing viral identification tools. Furthermore, Phables can resolve variant phage genomes with over 99\\% average nucleotide identity, a distinction that existing tools are unable to make. Phables is available on GitHub at https://github.com/Vini2/phables.}", 149 | issn = {1367-4811}, 150 | doi = {10.1093/bioinformatics/btad586}, 151 | url = {https://doi.org/10.1093/bioinformatics/btad586}, 152 | eprint = {https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btad586/51972145/btad586.pdf}, 153 | } 154 | ``` 155 | 156 | Also, please cite the following tools/databases used by Phables. 157 | 158 | * Roach MJ, Pierce-Ward NT, Suchecki R, Mallawaarachchi V, Papudeshi B, et al. Ten simple rules and a template for creating workflows-as-applications. PLOS Computational Biology 18(12) (2022): e1010705.
[https://doi.org/10.1371/journal.pcbi.1010705](https://doi.org/10.1371/journal.pcbi.1010705) 159 | * Terzian P, Olo Ndela E, Galiez C, Lossouarn J, Pérez Bucio RE, Mom R, Toussaint A, Petit MA, Enault F. PHROG: families of prokaryotic virus proteins clustered using remote homology. NAR Genomics and Bioinformatics, Volume 3, Issue 3, lqab067 (2021). [https://doi.org/10.1093/nargab/lqab067](https://doi.org/10.1093/nargab/lqab067) 160 | * Steinegger M, Söding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol 35, 1026–1028 (2017). [https://doi.org/10.1038/nbt.3988](https://doi.org/10.1038/nbt.3988) 161 | * Li H. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34:3094-3100 (2018). [https://doi.org/10.1093/bioinformatics/bty191](https://doi.org/10.1093/bioinformatics/bty191) 162 | * Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools, Bioinformatics, Volume 25, Issue 16, Pages 2078–2079 (2009). [https://doi.org/10.1093/bioinformatics/btp352](https://doi.org/10.1093/bioinformatics/btp352) 163 | * Woodcroft BJ, Newell R, CoverM: Read coverage calculator for metagenomics (2017). [https://github.com/wwood/CoverM](https://github.com/wwood/CoverM) 164 | * Roach, M. J., Hart, B. J., Beecroft, S. J., Papudeshi, B., Inglis, L. K., Grigson, S. R., Mallawaarachchi, V., Bouras, G., & Edwards, R. A. Koverage: Read-coverage analysis for massive (meta)genomics datasets. Journal of Open Source Software, 9(94), 6235, (2024). [https://doi.org/10.21105/joss.06235](https://doi.org/10.21105/joss.06235) 165 | * Hagberg AA, Schult DA, and Swart PJ. Exploring network structure, dynamics, and function using NetworkX. In Proceedings of the 7th Python in Science Conference (SciPy2008), Gäel Varoquaux, Travis Vaught, and Jarrod Millman (Eds), (Pasadena, CA USA), pp. 11–15 (2008). 
166 | * Gurobi Optimization. [https://www.gurobi.com/](https://www.gurobi.com/). 167 | -------------------------------------------------------------------------------- /build/environment.yml: -------------------------------------------------------------------------------- 1 | name: phables 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | - bioconda 6 | dependencies: 7 | - python>=3.9, <3.11 8 | - snakemake>=7.14.0 9 | - pyyaml>=6.0 10 | - click>=8.1.3 11 | - jinja2>=3.0.2 12 | - mamba 13 | - metasnek>=0.0.5 14 | - snaketool-utils>=0.0.4 -------------------------------------------------------------------------------- /build/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "phables" %} 2 | {% set version = "1.1.0" %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ version }}" 7 | 8 | source: 9 | url: "https://github.com/Vini2/{{ name }}/archive/refs/tags/v{{ version }}.tar.gz" 10 | sha256: 3276a6372e41a679b73d533fcc416a70db39a5e8f1ee78ea9a96590e3acf00de 11 | 12 | build: 13 | number: 0 14 | noarch: python 15 | script: "{{ PYTHON }} -m pip install . -vv" 16 | 17 | requirements: 18 | host: 19 | - python 20 | - pip 21 | run: 22 | - python >=3.8,<3.11 23 | - snakemake >=7.14.0 24 | - pyyaml >=6.0 25 | - click >=8.1.3 26 | - jinja2 >=3.0.2 27 | - mamba <1.4.2 28 | 29 | test: 30 | commands: 31 | - phables --help 32 | 33 | about: 34 | home: "https://github.com/Vini2/phables" 35 | license: MIT 36 | license_family: MIT 37 | license_file: LICENSE 38 | summary: "Phables: from fragmented assemblies to high-quality bacteriophage genomes" 39 | description: | 40 | Phables resolves bacteriophage genomes using phage bubbles in viral metagenomic data. 
41 | doc_url: "https://phables.readthedocs.io/" 42 | dev_url: "https://github.com/Vini2/phables" 43 | 44 | extra: 45 | recipe-maintainers: 46 | - Vini2 -------------------------------------------------------------------------------- /container/Dockerfile: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # phables 4 | # 5 | 6 | FROM --platform=linux/amd64 ubuntu:20.04 7 | FROM gurobi/optimizer:latest 8 | 9 | ENV DEBIAN_FRONTEND="noninteractive" 10 | 11 | 12 | ARG LIBFABRIC_VERSION=1.18.1 13 | 14 | # Install required packages and dependencies 15 | RUN apt -y update \ 16 | && apt -y install build-essential wget doxygen gnupg gnupg2 curl apt-transport-https software-properties-common \ 17 | git vim gfortran libtool python3-venv ninja-build python3-pip \ 18 | libnuma-dev python3-dev \ 19 | && apt -y remove --purge --auto-remove cmake \ 20 | && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null\ 21 | | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null \ 22 | && apt-add-repository -y "deb https://apt.kitware.com/ubuntu/ jammy-rc main" \ 23 | && apt -y update 24 | 25 | # Build and install libfabric 26 | RUN (if [ -e /tmp/build ]; then rm -rf /tmp/build; fi;) \ 27 | && mkdir -p /tmp/build \ 28 | && cd /tmp/build \ 29 | && wget https://github.com/ofiwg/libfabric/archive/refs/tags/v${LIBFABRIC_VERSION}.tar.gz \ 30 | && tar xf v${LIBFABRIC_VERSION}.tar.gz \ 31 | && cd libfabric-${LIBFABRIC_VERSION} \ 32 | && ./autogen.sh \ 33 | && ./configure \ 34 | && make -j 16 \ 35 | && make install 36 | 37 | # 38 | # Install miniforge 39 | # 40 | RUN set -eux ; \ 41 | curl -LO https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh ; \ 42 | bash ./Miniforge3-* -b -p /opt/miniforge3 -s ; \ 43 | rm -rf ./Miniforge3-* 44 | ENV PATH /opt/miniforge3/bin:$PATH 45 | # 46 | # Install conda environment 47 | # 48 | ARG PHABLES_VERSION=1.3.2 49 | 50 | RUN set -eux ; \ 51 | 
mamba install -y -c conda-forge -c anaconda -c bioconda -c defaults \ 52 | phables=${PHABLES_VERSION} ; 53 | ENV PATH /opt/miniforge3/bin:$PATH 54 | RUN conda clean -af -y 55 | RUN mkdir -p /phables /opt/gurobi 56 | RUN ln -s /opt/miniforge3/lib/python3.10/site-packages/phables/workflow/conda /conda 57 | RUN phables install 58 | -------------------------------------------------------------------------------- /container/README.md: -------------------------------------------------------------------------------- 1 | # Docker container 2 | 3 | Please note that this container is hosted on [docker hub](https://hub.docker.com/r/linsalrob/phables) and we recommend you use the latest version there. 4 | 5 | # Installing Gurobi 6 | 7 | For the linear solver, you need the [Gurobi WLS](https://www.gurobi.com/features/academic-wls-license/) license. Get that file, which is called `gurobi.lic` by default, and put it in your home directory, or in another location that you can easily find again. 8 | 9 | # Running the container with singularity (recommended) 10 | 11 | We need to mount three locations that are writable for `phables` to work with singularity. 12 | 13 | 1. You need to mount the `gurobi.lic` file, and that needs to end up at `/opt/gurobi/gurobi.lic`. In the example here, it is in the current working directory, `$PWD`. 14 | 2. You need a temporary directory where conda can install some files. They are installed on the first run, and reused after that. In this example, I am using `/tmp`. You need to mount this to `/conda` which is actually a symlink to the correct location under the snakemake directory. 15 | 3. You need your `.gfa` and `.fastq` files, and a location for the output. In this example, I have a directory called `Sim_Phage`. You should mount this to `/phables`. An important point here is to add the `/` to the end of your directory name, but _not_ to the `/phables`, and then the `.gfa` and `reads` will be in the `/phables` directory.
16 | 17 | > **NOTE:** when you specify the paths, it is important that they are absolute paths (i.e. beginning with `$PWD` or `/`), as relative paths don't work. 18 | 19 | 20 | ## Create the `.sif` image 21 | 22 | The first step is to create the .sif image in a directory. 23 | 24 | Check [docker hub](https://hub.docker.com/r/linsalrob/phables) for the latest version. In this example, I'm using version 0.6 but it may have been updated after that. 25 | 26 | ``` 27 | IMAGE_DIR= 28 | mkdir -p $IMAGE_DIR 29 | singularity pull --dir $IMAGE_DIR docker://linsalrob/phables:v0.5_sneaky_sleeky 30 | ``` 31 | 32 | You can set `IMAGE_DIR` to any path you can write to. 33 | 34 | 35 | ## Run the container 36 | 37 | ``` 38 | singularity exec --bind /tmp:/conda,$PWD/Sim_Phage/:/phables,$PWD/gurobi.lic:/opt/gurobi/gurobi.lic singularity/phables_0.6_gogo phables run --input /phables/assembly_graph_after_simplification.gfa --reads /phables/reads/ --output /phables/phables --threads 32 39 | ``` 40 | 41 | # Running the container with docker 42 | 43 | The approach is very similar, except instead of the `--bind` you need to use `--volume`. Note that you will need to have root access for this to work. 
44 | 45 | ``` 46 | docker pull linsalrob/phables:v0.6_gogo 47 | 48 | sudo docker run --volume=$PWD/Sim_Phage/:/phables --volume=/tmp:/conda --volume=$PWD/gurobi.lic:/opt/gurobi/gurobi.lic:ro phables phables run --input /phables/assembly_graph_after_simplification.gfa --reads /phables/reads/ --output /phables/phables --threads 32 49 | ``` 50 | 51 | 52 | 53 | singularity exec --bind /scratch/pawsey1018/edwa0468/tmp:/opt/miniforge3/lib/python3.10/site-packages/phables/workflow/conda,$PWD/testy/Sim_Phage/:/phables,$PWD/gurobi.lic:/opt/gurobi/gurobi.lic testy/phables_v0.5_sneaky_sleeky.sif phables run --input /phables/assembly_graph_after_simplification.gfa --reads /phables/reads/ --output /phables/phables --threads 32 54 | -------------------------------------------------------------------------------- /docs/annotation.md: -------------------------------------------------------------------------------- 1 | # Phage Genome Annotation 2 | 3 | Once you have identified the high-quality and complete genomes from the [CheckV results](https://phables.readthedocs.io/en/latest/quality/), you can annotate them using a tool such as [**pharokka**](https://github.com/gbouras13/pharokka). The following sections will walk you through how to setup and run pharokka. 4 | 5 | ## Installing pharokka 6 | 7 | The recommended way to install pharokka is using [`conda`](https://docs.conda.io/en/latest/). 8 | 9 | ```bash 10 | # Create a new conda environment and install pharokka 11 | conda create -n pharokka -c bioconda pharokka 12 | 13 | # Activate pharokka conda environment 14 | conda activate pharokka 15 | ``` 16 | 17 | ## Download and install the pharokka databases 18 | 19 | ```bash 20 | install_databases.py -o 21 | ``` 22 | 23 | ## Running pharokka 24 | 25 | Here is an example command to run pharokka on the complete and high-quality resolved genomes. 
26 | 27 | ```bash 28 | pharokka.py -i complete_hq_genomes.fasta -o pharokka_output -t 16 -d 29 | ``` 30 | 31 | ## Circular genome plot 32 | 33 | You can use the `pharokka_plotter.py` implementation from pharokka to create circular genome plots with annotations. 34 | 35 | Let's assume that you have already run pharokka on all of the complete and high-quality resolved genomes and the output is available in `pharokka_output`. You can pick one genome to plot. For example, let's consider the genome `phage_comp_280_cycle_1.fasta` which is *phiX174*. 36 | 37 | We start by reorienting the genome to start from the `terminase large subunit`. You can look up the starting position and strand of the `terminase large subunit` from the output file `pharokka_output/pharokka_cds_final_merged_output.tsv`. For example, let's take the starting position as 617 on the positive strand. You can run pharokka again for this genome with reorientation as follows. 38 | 39 | ```bash 40 | pharokka.py -i resolved_phages/phage_comp_280_cycle_1.fasta -o pharokka_output_phage_comp_280_cycle_1 -d -t 16 --terminase --terminase_strand 'pos' --terminase_start 617 41 | ``` 42 | 43 | Then you can run the plotting command as follows. 44 | 45 | ```bash 46 | pharokka_plotter.py -i resolved_phages/phage_comp_280_cycle_1.fasta -n phage_comp_280_cycle_1_plot -o pharokka_output_phage_comp_280_cycle_1 -t "Escherichia phage phiX174" 47 | ``` 48 | 49 | ![](images/phage_comp_280_cycle_1_plot.png) -------------------------------------------------------------------------------- /docs/assemble.md: -------------------------------------------------------------------------------- 1 | # Assembly 2 | 3 | Phables requires either short or long read sequencing data from metagenomic samples to be assembled. The following sections explain the steps that need to be carried out beforehand. 4 | 5 | 6 | ## Paired-end read files for short read assembly 7 | 8 | Please make sure that the names of the read files are in the following format.
Assuming that your paired-end sequencing reads are in the folder `fastq`, please make sure that the reads are in the format `{sampleName}{pattern}{fileExtension}`. `fileExtension` can be `.fq`, `.fastq`, `.fq.gz` or `.fastq.gz`. 9 | 10 | Please make sure that your file `pattern` matches one of the following patterns. 11 | 12 | ``` 13 | _R1_ and _R2_ 14 | _R1. and _R2. 15 | .R1. and .R2. 16 | .R1_ and .R2_ 17 | _1_ and _2_ 18 | _1. and _2. 19 | .1. and .2. 20 | .1_ and .2_ 21 | ``` 22 | 23 | For example, your read files can be 24 | 25 | ``` 26 | sample1_R1.fastq.gz 27 | sample1_R2.fastq.gz 28 | sample2_R1.fastq.gz 29 | sample2_R2.fastq.gz 30 | ... 31 | ``` 32 | 33 | or 34 | 35 | ``` 36 | sample1_1.fq.gz 37 | sample1_2.fq.gz 38 | sample2_1.fq.gz 39 | sample2_2.fq.gz 40 | ... 41 | ``` 42 | 43 | ## Long read assemblies 44 | 45 | If you are using long read datasets, there is no specific naming format for the read files. 46 | 47 | ## Assemble the samples 48 | 49 | Phables requires the assembly graph file in **Graphical Fragment Assembly (GFA)** format. You can use any assembler that produces the assembly graph in GFA format to assemble your samples OR you can convert a FASTG file to GFA format. 50 | 51 | If you have multiple samples you can pool together reads and do a co-assembly. 52 | 53 | ## Recommended assemblers and tools 54 | 55 | ### MEGAHIT 56 | 57 | You can use [MEGAHIT](https://github.com/voutcn/megahit) to assemble your paired-end short read data. 58 | 59 | ```bash 60 | megahit -1 reads_1.fastq -2 reads_2.fastq -o megahit_out 61 | ``` 62 | 63 | By default, MEGAHIT does not produce an assembly graph file. You have to run the `contig2fastg` command from the MEGAHIT toolkit to build the assembly graph file. `contig2fastg` requires you to input the k-mer size used for the assembly. You can get the k-mer size from the contig IDs in the `final.contigs.fa` file. For example, you can use the `grep` command to print out the contig IDs as follows.
64 | 65 | ```bash 66 | grep "^>" final.contigs.fa 67 | ``` 68 | 69 | Imagine you get the output as follows. Here the k-mer size is 141 as denoted by `k141`. 70 | 71 | ```bash 72 | >k141_1456397 flag=0 multi=11.7570 len=1137 73 | >k141_1235266 flag=0 multi=13.6963 len=1254 74 | >k141_131192 flag=1 multi=47.8430 len=1510 75 | >k141_1566081 flag=0 multi=9.6645 len=1372 76 | ... 77 | ``` 78 | 79 | Using the `k` value as 141, now you can run the `contig2fastg` command as follows. 80 | 81 | ```bash 82 | megahit_toolkit contig2fastg 141 final.contigs.fa > final.graph.fastg 83 | ``` 84 | 85 | The MEGAHIT toolkit will produce a FASTG file which you can convert to GFA using [fastg2gfa](https://github.com/lh3/gfa1/blob/master/misc/fastg2gfa.c). 86 | 87 | ```bash 88 | fastg2gfa final.graph.fastg > final.graph.gfa 89 | ``` 90 | 91 | If you want to run Phables on an assembly from a different `k` value found in the MEGAHIT output folder `intermediate_contigs`, please make sure to build the `.fastg` file from the `.fa` file with the corresponding `k` value. For example, if you want to run Phables on the contigs from `k99.contigs.fa`, you should first build the corresponding `k99.graph.fastg` file and then run `fastg2gfa` as follows. 92 | 93 | ```bash 94 | megahit_toolkit contig2fastg 99 k99.contigs.fa > k99.graph.fastg 95 | fastg2gfa k99.graph.fastg > k99.graph.gfa 96 | ``` 97 | 98 | ### metaSPAdes 99 | 100 | You can use [metaSPAdes](https://github.com/ablab/spades) to assemble your paired-end short read data. 101 | 102 | ```bash 103 | spades.py --meta -1 reads_1.fastq -2 reads_2.fastq -o metaspades_output -t 16 104 | ``` 105 | 106 | After the assembly has finished, the output will contain the assembly graph file as `assembly_graph_after_simplification.gfa`. 107 | 108 | ### metaFlye 109 | 110 | You can use [metaFlye](https://github.com/fenderglass/Flye) to assemble your long read data.
111 | 112 | ```bash 113 | flye --meta --nano-raw reads.fasta --out-dir metaflye_output --threads 16 114 | ``` 115 | 116 | After the assembly has finished, the output will contain the assembly graph file as `assembly_graph.gfa`. 117 | 118 | ### Hecatomb 119 | 120 | You can use [Hecatomb](https://github.com/shandley/hecatomb), a viral analysis pipeline, to obtain a pooled assembly of your short read or long read data contained in a folder named `reads`. You can run hecatomb as follows. Note that you only need to run the assembly module to process your data for Phables. 121 | 122 | ```bash 123 | hecatomb run --reads reads/ assembly 124 | ``` 125 | 126 | After the assembly has finished, the output will contain the assembly graph file as `cross_assembly.gfa`. 127 | 128 | Now we are ready to run Phables. -------------------------------------------------------------------------------- /docs/citation.md: -------------------------------------------------------------------------------- 1 | # Phables Citation 2 | 3 | Phables is published in [Bioinformatics](https://academic.oup.com/bioinformatics) at DOI: [10.1093/bioinformatics/btad586](https://doi.org/10.1093/bioinformatics/btad586).
4 | 5 | If you use Phables in your work, please cite Phables as, 6 | 7 | > Vijini Mallawaarachchi, Michael J Roach, Przemyslaw Decewicz, Bhavya Papudeshi, Sarah K Giles, Susanna R Grigson, George Bouras, Ryan D Hesse, Laura K Inglis, Abbey L K Hutton, Elizabeth A Dinsdale, Robert A Edwards, Phables: from fragmented assemblies to high-quality bacteriophage genomes, Bioinformatics, Volume 39, Issue 10, October 2023, btad586, https://doi.org/10.1093/bioinformatics/btad586 8 | 9 | ```bibtex 10 | @article{10.1093/bioinformatics/btad586, 11 | author = {Mallawaarachchi, Vijini and Roach, Michael J and Decewicz, Przemyslaw and Papudeshi, Bhavya and Giles, Sarah K and Grigson, Susanna R and Bouras, George and Hesse, Ryan D and Inglis, Laura K and Hutton, Abbey L K and Dinsdale, Elizabeth A and Edwards, Robert A}, 12 | title = "{Phables: from fragmented assemblies to high-quality bacteriophage genomes}", 13 | journal = {Bioinformatics}, 14 | volume = {39}, 15 | number = {10}, 16 | pages = {btad586}, 17 | year = {2023}, 18 | month = {09}, 19 | abstract = "{Microbial communities have a profound impact on both human health and various environments. Viruses infecting bacteria, known as bacteriophages or phages, play a key role in modulating bacterial communities within environments. High-quality phage genome sequences are essential for advancing our understanding of phage biology, enabling comparative genomics studies and developing phage-based diagnostic tools. Most available viral identification tools consider individual sequences to determine whether they are of viral origin. As a result of challenges in viral assembly, fragmentation of genomes can occur, and existing tools may recover incomplete genome fragments. 
Therefore, the identification and characterization of novel phage genomes remain a challenge, leading to the need of improved approaches for phage genome recovery. We introduce Phables, a new computational method to resolve phage genomes from fragmented viral metagenome assemblies. Phables identifies phage-like components in the assembly graph, models each component as a flow network, and uses graph algorithms and flow decomposition techniques to identify genomic paths. Experimental results of viral metagenomic samples obtained from different environments show that Phables recovers on average over 49\\% more high-quality phage genomes compared to existing viral identification tools. Furthermore, Phables can resolve variant phage genomes with over 99\\% average nucleotide identity, a distinction that existing tools are unable to make. Phables is available on GitHub at https://github.com/Vini2/phables.}", 20 | issn = {1367-4811}, 21 | doi = {10.1093/bioinformatics/btad586}, 22 | url = {https://doi.org/10.1093/bioinformatics/btad586}, 23 | eprint = {https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btad586/51972145/btad586.pdf}, 24 | } 25 | ``` 26 | 27 | Also, please cite the following tools/databases used by Phables. 28 | 29 | * Roach MJ, Pierce-Ward NT, Suchecki R, Mallawaarachchi V, Papudeshi B, et al. Ten simple rules and a template for creating workflows-as-applications. PLOS Computational Biology 18(12) (2022): e1010705. [https://doi.org/10.1371/journal.pcbi.1010705](https://doi.org/10.1371/journal.pcbi.1010705) 30 | * Terzian P, Olo Ndela E, Galiez C, Lossouarn J, Pérez Bucio RE, Mom R, Toussaint A, Petit MA, Enault F. PHROG: families of prokaryotic virus proteins clustered using remote homology. NAR Genomics and Bioinformatics, Volume 3, Issue 3, lqab067 (2021). [https://doi.org/10.1093/nargab/lqab067](https://doi.org/10.1093/nargab/lqab067) 31 | * Steinegger M, Söding J.
MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol 35, 1026–1028 (2017). [https://doi.org/10.1038/nbt.3988](https://doi.org/10.1038/nbt.3988) 32 | * Li H. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34:3094-3100 (2018). [https://doi.org/10.1093/bioinformatics/bty191](https://doi.org/10.1093/bioinformatics/bty191) 33 | * Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools, Bioinformatics, Volume 25, Issue 16, Pages 2078–2079 (2009). [https://doi.org/10.1093/bioinformatics/btp352](https://doi.org/10.1093/bioinformatics/btp352) 34 | * Woodcroft BJ, Newell R, CoverM: Read coverage calculator for metagenomics (2017). [https://github.com/wwood/CoverM](https://github.com/wwood/CoverM) 35 | * Roach, M. J., Hart, B. J., Beecroft, S. J., Papudeshi, B., Inglis, L. K., Grigson, S. R., Mallawaarachchi, V., Bouras, G., & Edwards, R. A. Koverage: Read-coverage analysis for massive (meta)genomics datasets. Journal of Open Source Software, 9(94), 6235, (2024). [https://doi.org/10.21105/joss.06235](https://doi.org/10.21105/joss.06235) 36 | * Hagberg AA, Schult DA, and Swart PJ. Exploring network structure, dynamics, and function using NetworkX. In Proceedings of the 7th Python in Science Conference (SciPy2008), Gäel Varoquaux, Travis Vaught, and Jarrod Millman (Eds), (Pasadena, CA USA), pp. 11–15 (2008). 37 | * Gurobi Optimization. [https://www.gurobi.com/](https://www.gurobi.com/). 
38 | -------------------------------------------------------------------------------- /docs/comparison.md: -------------------------------------------------------------------------------- 1 | # Comparing the viral quality of resolved genomes and their constituent unitigs 2 | 3 | You can combine the resolved genomes (`resolved_paths.fasta`) and their constituent unitigs (`resolved_edges.fasta`), and compare the viral quality. 4 | 5 | ## Run CheckV 6 | 7 | You can combine the resolved genome sequences and unitig sequences and run CheckV as follows. 8 | 9 | ```bash 10 | # Combine resolved_paths.fasta and resolved_edges.fasta 11 | cat resolved_paths.fasta resolved_edges.fasta > all_sequences.fasta 12 | 13 | # Run CheckV 14 | checkv end_to_end all_sequences.fasta checkv_result 15 | ``` 16 | 17 | Now you can compare and visualise the quality of the resolved genomes and their constituent unitigs. The following example code shows how to visualise the results using Python. 18 | 19 | ## Importing Python packages 20 | 21 | Assuming you have installed Python and the packages `matplotlib`, `pandas` and `seaborn`, let's import the following. 22 | 23 | ```python 24 | import pandas as pd 25 | import seaborn as sns 26 | import matplotlib.pyplot as plt 27 | ``` 28 | 29 | ## Load the data 30 | 31 | Now we will load the `quality_summary.tsv` file into a dataframe called `checkv_res`. 32 | 33 | ```python 34 | # Load the quality_summary.tsv from the CheckV results 35 | checkv_res = pd.read_csv("checkv_result/quality_summary.tsv", delimiter="\t", header=0) 36 | ``` 37 | 38 | ## Format the data 39 | 40 | Now we will convert the sequence lengths into kilobases by dividing the lengths by 1000. 41 | 42 | ```python 43 | # Format the genome length to kb 44 | checkv_res['contig_length'] = checkv_res['contig_length'].div(1000) 45 | ``` 46 | 47 | Then we will add a new column to our dataframe called `Sequence type` to denote whether the sequence is a resolved genome or a unitig.
48 | 49 | ```python 50 | # Add a new column as "Sequence type" 51 | seq_type = [] 52 | 53 | for index, row in checkv_res.iterrows(): 54 | if row['contig_id'].startswith("phage"): 55 | seq_type.append("Resolved genomes") 56 | else: 57 | seq_type.append("Individual unitigs") 58 | 59 | checkv_res.insert(2, "Sequence type", seq_type, True) 60 | ``` 61 | 62 | ## Plot the data 63 | 64 | Now we can plot the viral quality (`Complete`, `High-quality`, `Medium-quality` or `Low-quality`) of the resolved genomes and their constituent unitigs using boxen plots and then save the figure as follows. 65 | 66 | ```python 67 | # Set the order of viral quality 68 | myorder=["Complete", "High-quality", "Medium-quality", "Low-quality"] 69 | 70 | # Plot using catplot 71 | ax = sns.catplot(y="checkv_quality", x="contig_length", hue="Sequence type", kind="boxen", data=checkv_res, height=5, aspect=1.5, order=myorder, showfliers=False) 72 | 73 | # Set axis titles 74 | ax.set(xlabel='Viral genome length (kbp)', ylabel='CheckV quality') 75 | 76 | # Save figure 77 | plt.savefig("checkv_qual_boxen.pdf", dpi=300, bbox_inches='tight', format='pdf') 78 | ``` 79 | 80 | ![](images/qual_resolved_genome_unitig_boxen.png) 81 | 82 | 83 | You can change the `kind` of the plot as you wish. For example, you can draw a violin plot by changing `kind="violin"` as follows. 84 | 85 | ```python 86 | ax = sns.catplot(y="checkv_quality", x="contig_length", hue="Sequence type", kind="violin", data=checkv_res, height=5, aspect=1.5, order=myorder, showfliers=False) 87 | ``` 88 | 89 | ![](images/qual_resolved_genome_unitig_violin.png) -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # Frequently Asked Questions 2 | 3 | ## General FAQs 4 | 5 | ### Q1: Where can I get help with issues?
6 | 7 | If you come across any issues while using Phables, you can open an [issue on GitHub](https://github.com/Vini2/phables/issues) and we will look into it. Phables is still under development and testing, so we expect that there will still be bugs and unhandled exceptions in the code. 8 | 9 | ### Q2: Can I use the assembly graph from any assembler? 10 | 11 | Phables supports any assembly graph in GFA (`.gfa`) format. You can use any assembler that produces the assembly graph in GFA format to assemble your samples OR you can convert an assembly graph in FASTG format to GFA format using a tool such as [fastg2gfa](https://github.com/lh3/gfa1/blob/master/misc/fastg2gfa.c). 12 | 13 | If you use metaSPAdes for assembly, you can use the `assembly_graph_after_simplification.gfa` as input for Phables. 14 | 15 | ### Q4: What can I do after running Phables? 16 | 17 | Once you have run Phables, check out the [EVALUATION](https://phables.readthedocs.io/en/latest/quality/) section where you can read on how to check and compare the quality of the resolved genomes, interpret graph statistics and visualise the results. 18 | 19 | ### Q5: How can I find out which contigs were included in the resolved phages? 20 | 21 | The `resolved_genome_info.txt` file contains the order of sequences in the assembly graph that were used to construct the genomes (refer to the example below). 22 | 23 | | Path | Case | Coverage | Length | GC content | Node order | 24 | |----------------------|-------|----------|--------|-------------------|------------------------------------------------------------------------------| 25 | | phage_comp_0_cycle_1 | case3 | 644 | 45659 | 34.86059703453864 | ['49-', '5524+', '24979-', '5556+', '55+', '5540-', '67+', '4490+', '5554-'] | 26 | | phage_comp_0_cycle_2 | case3 | 625 | 43427 | 35.03810993160937 | ['49-', '5522+', '24979-', '5558+', '55+', '65+', '67+', '4490+', '5498-'] | 27 | | ... 
| | | | | | 28 | 29 | The `Node order` column denotes the segment IDs from the assembly graph. 30 | 31 | The mapping between the contigs and assembly graph segments depends on the assembler you use. 32 | 33 | * If you use MEGAHIT, the segments in the assembly graph are the contigs themselves. You can directly relate the `Node order` information as the contigs that make the paths. 34 | * If you use an assembler such as SPAdes or Flye, the sequences represented in the assembly graph are **unitigs**, which make up contigs. The information on which unitigs make up the contigs can be found in, for example, `contigs.paths` file in SPAdes and `assembly_info.txt` file in Flye. 35 | 36 | ### Q6: Can I run Phables on mixed-microbial communities? 37 | 38 | Phables was originally designed to run on viromic data, but it can also be used to study mixed-microbial communities. However, the current implementation of Phables filters any component with at least a single unitig encoding any bacterial single-copy marker gene and hence, prophages might be omitted in the final result. Also, some plasmids or [phage-plasmids](https://doi.org/10.1128/mbio.01851-22), can be identified by Phables as phages. Hence, users should perform further downstream analysis to ensure that the predicted genomes are actual phages. One option is to use a tool such as [PPR-Meta](https://github.com/zhenchengfang/PPR-Meta) to classify the genomes resolved from Phables into phages and plasmids. 39 | 40 | ### Q7: Can Phables identify prophages? 41 | 42 | If a prophage is active, excises from the genome, and is replicating, Phables would identify it. However, if it is a cryptic prophage, Phables would not identify it as it will be integrated into the host genome and can be part of a larger bacterial component in the assembly graph. As Phables discards components having bacterial single-copy marker genes, such prophages will not be identified. 
43 | 44 | Users can use specific tools to either identify prophages in bacterial genomes such as [Phispy](https://academic.oup.com/nar/article/40/16/e126/1027055) or [hafeZ](https://www.biorxiv.org/content/10.1101/2021.07.21.453177v1) or validate recovered prophage sequences from host-genomes in metagenomic sequences such as [CheckV](https://www.nature.com/articles/s41587-020-00774-7). 45 | 46 | 47 | ## Gurobi FAQs 48 | 49 | ### Q1: Gurobi installation conflicts and `grbgetkey` fails to run 50 | 51 | If you come across conflicts when installing Gurobi in the `phables` environment and could not run the `grbgetkey` command properly, please follow the steps given below. 52 | 53 | ```bash 54 | # Deactivate the phables environment 55 | conda deactivate 56 | 57 | # Remove phables environment 58 | conda remove -n phables --all 59 | 60 | # Create conda environment with phables and gurobi 61 | conda create -n phables -c conda-forge -c anaconda -c bioconda -c gurobi phables gurobi 62 | ``` 63 | 64 | ### Q2: Model too large for size-limited license 65 | 66 | If you get the following error when running Phables, this means that you don't have a proper license to handle large models. 67 | 68 | ```bash 69 | Error code 10010: Model too large for size-limited license; visit https://www.gurobi.com/free-trial for a full license 70 | ``` 71 | 72 | You should get an academic license which is provided free of charge to your institutional email address. You can refer to further instructions at [https://www.gurobi.com/academia/academic-program-and-licenses/](https://www.gurobi.com/academia/academic-program-and-licenses/). 73 | 74 | ### Q3: HostID mismatch 75 | 76 | If you get the following error when running Phables as a job on a cluster, you cannot use your academic license which is a file-based host-locked license, meaning that you can only run Gurobi on the machine that the license was obtained for.
77 | 78 | ```bash 79 | Failed to set up a license 80 | Error 10009: HostID mismatch (licensed to , hostid is ) 81 | ``` 82 | 83 | You will have to contact your system admin and setup a floating network license. You can find more details at [https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-](https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-). 84 | 85 | ### Q4: License not valid for Gurobi version x 86 | 87 | If you get the following error when running Phables, this means that the version in your license does not match the installed version. You can install the correct version of Gurobi to match your license or you can get a new license for the latest version installed. 88 | 89 | ```bash 90 | ERROR - Error code 10009: Request denied: license not valid for Gurobi version 11 91 | ``` 92 | 93 | ### Q5: How can I get a Gurobi license for a cluster? 94 | 95 | If you want to run Phables on a cluster, your cluster should have a [floating network license](https://en.wikipedia.org/wiki/Floating_licensing) for Gurobi for the `run` subcommand to execute properly. 96 | 97 | **Gurobi license for a cluster:** You will have to contact your system admin and setup a floating network license for the cluster. You can find more details at [https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-](https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-). 98 | 99 | If your cluster has Gurobi already installed with the license setup, you can load the module as follows, prior to running Phables. 
100 | 101 | ```bash 102 | module load gurobi 103 | ``` -------------------------------------------------------------------------------- /docs/graph_stats.md: -------------------------------------------------------------------------------- 1 | # Graph Statistics 2 | 3 | Phables outputs a file named `resolved_component_info.txt` that contains the following information of the phage bubbles resolved. 4 | 5 | * Number of nodes 6 | * Number of paths resolved 7 | * Fraction of unitigs recovered in the paths 8 | * Maximum degree of the graph 9 | * Minimum degree of the graph 10 | * Maximum in degree of the graph 11 | * Maximum out degree of the graph 12 | * Average degree of the graph 13 | * Average in degree of the graph 14 | * Average out degree of the graph 15 | * Density of the graph 16 | * Maximum path length: length of the longest path 17 | * Minimum path length: length of the shortest path 18 | * Length ratio (long/short): (Maximum path length / Minimum path length) 19 | * Maximum coverage path length: length of the path with the highest coverage 20 | * Minimum coverage path length: length of the path with the lowest coverage 21 | * Length ratio (highest cov/lowest cov): (Maximum coverage path length / Minimum coverage path length) 22 | * Maximum coverage 23 | * Minimum coverage 24 | * Coverage ratio (highest/lowest): (Maximum coverage / Minimum coverage) 25 | 26 | You can compare and visualise the graph statistics of the resolved components using this information. The following example code shows how to visualise the results using Python. 27 | 28 | ## Importing Python packages 29 | 30 | Assuming you have installed Python and the packages `matplotlib`, `pandas` and `seaborn`, let's import the following. 31 | 32 | ```python 33 | import pandas as pd 34 | import seaborn as sns 35 | import matplotlib.pyplot as plt 36 | ``` 37 | 38 | ## Load the data 39 | 40 | Now we will load the `resolved_component_info.txt` file into a dataframe called `component_stats`. 
41 | 42 | ```python 43 | # Load the resolved_component_info.txt from Phables results 44 | component_stats = pd.read_csv("resolved_component_info.txt", delimiter="\t", header=0) 45 | ``` 46 | 47 | You can list the columns using `component_stats.columns`. The following columns will be listed. 48 | 49 | ```python 50 | Index(['Component', 'Number of nodes', 'Number of paths', 51 | 'Fraction of unitigs recovered', 'Maximum degree', 'Maximum in degree', 52 | 'Maximum out degree', 'Average degree', 'Average in degree', 53 | 'Average out degree', 'Density', 'Maximum path length', 54 | 'Minimum path length', 'Length ratio (long/short)', 55 | 'Maximum coverage path length', 'Minimum coverage path length', 56 | 'Length ratio (highest cov/lowest cov)', 'Maximum coverage', 57 | 'Minimum coverage', 'Coverage ratio (highest/lowest)'], 58 | dtype='object') 59 | ``` 60 | 61 | ## Plot histograms 62 | 63 | You can plot histograms of the different columns. The following code plots a histogram of the `Number of nodes` column. 64 | 65 | ```python 66 | # Get the column 67 | df = component_stats["Number of nodes"] 68 | 69 | # Plot the histogram 70 | ax = df.plot.hist(bins=100, alpha=0.5, figsize=(12, 8)) 71 | 72 | # Set axis titles 73 | ax.set(xlabel='Number of nodes', ylabel='Frequency') 74 | 75 | # Save figure 76 | plt.savefig("histogram_n_nodes.png", format='png', dpi=300, bbox_inches='tight') 77 | ``` 78 | 79 | ![](images/histogram_n_nodes.png) 80 | 81 | ## Plot heatmaps 82 | 83 | You can plot heatmaps for correlations of all the graph statistics as follows. 
84 | 85 | ```python 86 | # Use Pearson correlation 87 | df_cor = component_stats.corr(method='pearson') 88 | 89 | # Plot heatmap 90 | sns.heatmap(df_cor, cmap="Blues") 91 | 92 | # Save figure 93 | plt.savefig("pearson_heatmap.png", format='png', dpi=300, bbox_inches='tight') 94 | ``` 95 | 96 | ![](images/pearson_heatmap.png) 97 | 98 | ## Plot hierarchically-clustered heatmaps 99 | 100 | As the heatmap above looks a bit messy and hard to interpret, we can clean it up by clustering so we can observe some patterns. For this we can use the `clustermap` function from seaborn which produces a hierarchically-clustered heatmap. 101 | 102 | ```python 103 | # Plot the hierarchically-clustered heatmap 104 | pearson_clustermap = sns.clustermap(df_cor, cmap="Blues", method="ward") 105 | 106 | # Save figure 107 | pearson_clustermap.savefig("pearson_clustermap.png", format='png', dpi=300, bbox_inches='tight') 108 | ``` 109 | 110 | ![](images/pearson_clustermap.png) -------------------------------------------------------------------------------- /docs/images/Phables_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/Phables_workflow.png -------------------------------------------------------------------------------- /docs/images/components.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/components.png -------------------------------------------------------------------------------- /docs/images/histogram_n_nodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/histogram_n_nodes.png -------------------------------------------------------------------------------- 
/docs/images/pearson_clustermap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/pearson_clustermap.png -------------------------------------------------------------------------------- /docs/images/pearson_heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/pearson_heatmap.png -------------------------------------------------------------------------------- /docs/images/phables_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/phables_logo.png -------------------------------------------------------------------------------- /docs/images/phage_comp_280_cycle_1_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/phage_comp_280_cycle_1_plot.png -------------------------------------------------------------------------------- /docs/images/qual_resolved_genome_unitig_boxen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/qual_resolved_genome_unitig_boxen.png -------------------------------------------------------------------------------- /docs/images/qual_resolved_genome_unitig_violin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/qual_resolved_genome_unitig_violin.png -------------------------------------------------------------------------------- 
/docs/index.md: -------------------------------------------------------------------------------- 1 | ![](images/phables_logo.png) 2 | 3 | # Phables: from fragmented assemblies to high-quality bacteriophage genomes 4 | 5 | Phables is a tool developed to resolve bacteriophage genomes using phage bubbles in viral metagenomic data. 6 | It models phage-like components in a viral metagenomic assembly graph as flow networks, formulates genome recovery as a 7 | minimum flow decomposition problem and resolves the genomic paths corresponding to the flow paths determined. 8 | Phables uses the [Minimum Flow Decomposition via Integer Linear 9 | Programming](https://github.com/algbio/MFD-ILP) implementation to obtain the flow paths. 10 | 11 | ## Motivation 12 | 13 | Existing viral identification tools run contigs through a pre-trained model and predict whether or not they are of viral origin. However, contigs do not necessarily represent complete genomes as viral assemblies are not always perfect. Most of the existing metagenomic binning tools are optimised for bacterial metagenomes and cannot handle viral metagenomes efficiently. 14 | 15 | We observed circular and linear components in viral metagenome assembly graphs as shown below (visualisations obtained from [Bandage](https://rrwick.github.io/Bandage/)), suggesting that viral genomes are fragmented and variant genomes exist. 16 | 17 | ![](images/components.png) 18 | 19 | Phables was developed to recover phage-like components called "phage bubbles" that represent one or more bacteriophage genomes and resolve phage bubbles to obtain complete and high-quality genomes. 20 | 21 | ## Workflow 22 | 23 | Phables is powered by [Snaketool](https://github.com/beardymcjohnface/Snaketool) which packs in all the setup, testing, preprocessing and running steps into an easy-to-use pipeline. 24 | 25 | The following diagram shows an overview of Phables.
26 | 27 | ![](images/Phables_workflow.png) 28 | 29 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Setting up Phables 2 | 3 | Phables is available on bioconda at [https://anaconda.org/bioconda/phables](https://anaconda.org/bioconda/phables) and on PyPI at [https://pypi.org/project/phables/](https://pypi.org/project/phables/). Feel free to pick your package manager, but we recommend that you use [`conda`](https://docs.conda.io/en/latest/). 4 | 5 | ### Option 1: Installing Phables using conda (recommended) 6 | 7 | You can install Phables from bioconda at [https://anaconda.org/bioconda/phables](https://anaconda.org/bioconda/phables). Make sure you have [`conda`](https://docs.conda.io/en/latest/) installed. 8 | 9 | ```bash 10 | # create conda environment and install phables 11 | conda create -n phables -c conda-forge -c anaconda -c bioconda phables 12 | 13 | # activate environment 14 | conda activate phables 15 | ``` 16 | 17 | Now you can go to [Setting up Gurobi](#setting-up-gurobi) to configure Gurobi. 18 | 19 | ### Option 2: Installing Phables using pip 20 | 21 | You can install Phables from PyPI at [https://pypi.org/project/phables/](https://pypi.org/project/phables/). Make sure you have [`pip`](https://pip.pypa.io/en/stable/) and [`mamba`](https://mamba.readthedocs.io/en/latest/index.html) installed. 22 | 23 | ```bash 24 | pip install phables 25 | ``` 26 | 27 | Now you can go to [Setting up Gurobi](#setting-up-gurobi) to configure Gurobi. 28 | 29 | ## Setting up Gurobi 30 | 31 | The MFD implementation uses the linear programming solver [Gurobi](https://www.gurobi.com/). 
We chose Gurobi over open source solvers as Gurobi is fast and can solve large models (check the performance comparison at [https://www.gurobi.com/resources/open-source-linear-and-mixed-integer-programming-software-and-solvers/](https://www.gurobi.com/resources/open-source-linear-and-mixed-integer-programming-software-and-solvers/)). 32 | 33 | The `phables` conda environment and pip setup does not include Gurobi. You have to install Gurobi using one of the following commands depending on your package manager. 34 | 35 | ```bash 36 | # conda 37 | conda install -c gurobi gurobi 38 | 39 | # pip 40 | pip install gurobipy 41 | ``` 42 | 43 | To handle large models without any model size limitations, once you have installed Gurobi, you have to activate the (academic) license and add the key using the following command. You only have to do this once. 44 | 45 | ```bash 46 | grbgetkey 47 | ``` 48 | 49 | You can refer to further instructions at [https://www.gurobi.com/academia/academic-program-and-licenses/](https://www.gurobi.com/academia/academic-program-and-licenses/). Please note that this academic license is a file-based host-locked license, meaning that you can only run Gurobi on the machine that the license was obtained for. If you want to run on a cluster, you will have to contact your system admin and set up a floating network license. You can find more details at [https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-](https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-). 50 | 51 | ## Test the installation 52 | 53 | After setting up, run `phables --help` to print out the Phables help message. 54 | 55 | ```bash 56 | Usage: phables [OPTIONS] COMMAND [ARGS]... 57 | 58 | Phables: from fragmented assemblies to high-quality bacteriophage genomes.
59 | Please refer the full documentation available on Read the Docs at 60 | https://phables.readthedocs.io/ 61 | 62 | Options: 63 | -v, --version Show the version and exit. 64 | -h, --help Show this message and exit. 65 | 66 | Commands: 67 | run Run Phables 68 | install Install databases 69 | test Test Phables 70 | config Copy the system default config file 71 | citation Print the citation(s) for this tool 72 | ``` 73 | 74 | ## Setup the databases 75 | 76 | Now run the following command to download and setup the required databases. 77 | 78 | ```bash 79 | phables install 80 | ``` 81 | 82 | ## Run on the test data 83 | 84 | Then run the following command to launch the test run and ensure that Phables is working. 85 | 86 | ```bash 87 | phables test 88 | ``` 89 | 90 | If the test run completes without any issues, we are good to go. 91 | 92 | ## Build the docs 93 | 94 | Optionally, the complete documentation of Phables including these pages can be built using [MkDocs](https://www.mkdocs.org/) as follows. 95 | 96 | ```bash 97 | # install mkdocs 98 | pip install mkdocs 99 | 100 | # go to your installation directory 101 | cd /path/to/phables 102 | 103 | # build 104 | mkdocs build 105 | ``` -------------------------------------------------------------------------------- /docs/quality.md: -------------------------------------------------------------------------------- 1 | # Checking the quality of resolved genomes 2 | 3 | The sequences of the resolved genomic paths can be found in `resolved_paths.fasta`. Each entry in this FASTA file is a resolved genome (not a contig) and can be directly evaluated using a dedicated viral evaluation tool like [CheckV](https://bitbucket.org/berkeleylab/checkv/src/master/). The following sections will walk you through how to setup and run CheckV. 4 | 5 | ## Installing CheckV 6 | 7 | The recommended way to install CheckV is using [`conda`](https://docs.conda.io/en/latest/). 
8 | 9 | ```bash 10 | # Create a new conda environment and install checkv 11 | conda create -n checkv -c conda-forge -c bioconda checkv 12 | 13 | # Activate checkv conda environment 14 | conda activate checkv 15 | ``` 16 | 17 | You can also install using [`pip`](https://pip.pypa.io/en/stable/). 18 | 19 | ```bash 20 | pip install checkv 21 | ``` 22 | 23 | ## Download the CheckV database 24 | 25 | ```bash 26 | checkv download_database ./ 27 | ``` 28 | 29 | Now you need to specify the `CHECKVDB` location. 30 | 31 | ```bash 32 | export CHECKVDB=/path/to/checkv-db 33 | ``` 34 | 35 | ## Running CheckV 36 | 37 | Here is an example command to run CheckV on the resolved genomes. 38 | 39 | ```bash 40 | checkv end_to_end resolved_paths.fasta checkv_resolved_paths -t 16 41 | ``` 42 | 43 | The `end_to_end` option will run the full pipeline. 44 | 45 | You can also run individual commands for each step in the pipeline as follows. 46 | 47 | ```bash 48 | checkv contamination resolved_paths.fasta checkv_resolved_paths -t 16 49 | checkv completeness resolved_paths.fasta checkv_resolved_paths -t 16 50 | checkv complete_genomes resolved_paths.fasta checkv_resolved_paths 51 | checkv quality_summary resolved_paths.fasta checkv_resolved_paths 52 | ``` 53 | 54 | ## CheckV outputs 55 | 56 | CheckV will produce the following `.tsv` files.
57 | 58 | * `complete_genomes.tsv` - overview of putative complete genomes identified 59 | * `completeness.tsv` - overview of how completeness was estimated 60 | * `contamination.tsv` - overview of how contamination was estimated 61 | * `quality_summary.tsv` - integrated quality results -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | jinja2>=3.1.3 2 | mkdocs>=1.3.1 3 | babel>=2.9.0 4 | click>=7.0 5 | Markdown>=3.2.1,<3.4 6 | PyYAML>=5.2 7 | watchdog>=2.0.0 8 | mdx_gh_links>=0.2 9 | ghp-import>=1.0 10 | pyyaml_env_tag>=0.1 11 | mkdocs-redirects>=1.0.1 12 | importlib_metadata>=4.3 13 | packaging>=20.5 14 | mergedeep>=1.3.4 15 | pygments>=2.12 16 | pymdown-extensions 17 | mkdocs-material 18 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # Phables Usage 2 | 3 | Phables run options can be found using the `phables run -h` command. 4 | 5 | ``` 6 | Usage: phables run [OPTIONS] [SNAKE_ARGS]... 
7 | 8 | Run Phables 9 | 10 | Options: 11 | --output PATH Output directory [default: phables.out] 12 | --configfile TEXT Custom config file [default: 13 | (outputDir)/config.yaml] 14 | --threads INTEGER Number of threads to use [default: 1] 15 | --use-conda / --no-use-conda Use conda for Snakemake rules [default: use- 16 | conda] 17 | --conda-prefix PATH Custom conda env directory 18 | --profile TEXT Snakemake profile 19 | --snake-default TEXT Customise Snakemake runtime args [default: 20 | --rerun-incomplete, --printshellcmds, 21 | --nolock, --show-failed-logs] 22 | --input PATH Path to assembly graph file in .GFA format 23 | [required] 24 | --reads PATH Path to directory containing paired-end reads 25 | [required] 26 | --minlength INTEGER minimum length of circular unitigs to consider 27 | [default: 2000] 28 | --mincov INTEGER minimum coverage of paths to output [default: 29 | 10] 30 | --compcount INTEGER maximum unitig count to consider a component 31 | [default: 200] 32 | --maxpaths INTEGER maximum number of paths to resolve for a 33 | component [default: 10] 34 | --mgfrac FLOAT length threshold to consider single copy 35 | marker genes [default: 0.2] 36 | --evalue FLOAT maximum e-value for phrog annotations 37 | [default: 1e-10] 38 | --seqidentity FLOAT minimum sequence identity for phrog 39 | annotations [default: 0.3] 40 | --covtol INTEGER coverage tolerance for extending subpaths 41 | [default: 100] 42 | --alpha FLOAT coverage multiplier for flow interval 43 | modelling [default: 1.2] 44 | --longreads provide long reads as input (else defaults to 45 | short reads) 46 | --prefix TEXT prefix for genome identifier 47 | -h, --help Show this message and exit. 
48 | 49 | 50 | If you use Phables in your work, please cite Phables as, 51 | 52 | Vijini Mallawaarachchi, Michael J Roach, Przemyslaw Decewicz, 53 | Bhavya Papudeshi, Sarah K Giles, Susanna R Grigson, George Bouras, 54 | Ryan D Hesse, Laura K Inglis, Abbey L K Hutton, Elizabeth A Dinsdale, 55 | Robert A Edwards, Phables: from fragmented assemblies to high-quality 56 | bacteriophage genomes, Bioinformatics, Volume 39, Issue 10, 57 | October 2023, btad586, https://doi.org/10.1093/bioinformatics/btad586 58 | 59 | 60 | For more information on Phables please visit: 61 | https://phables.readthedocs.io/ 62 | 63 | 64 | CLUSTER EXECUTION: 65 | phables run ... --profile [profile] 66 | For information on Snakemake profiles see: 67 | https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles 68 | 69 | RUN EXAMPLES: 70 | Required: phables run --input [assembly graph file] 71 | Specify threads: phables run ... --threads [threads] 72 | Disable conda: phables run ... --no-use-conda 73 | Change defaults: phables run ... --snake-default="-k --nolock" 74 | Add Snakemake args: phables run ... --dry-run --keep-going --touch 75 | Specify targets: phables run ... 
print_stages 76 | Available targets: 77 | all Run everything (default) 78 | preprocess Run preprocessing only 79 | phables Run phables (and preprocessing if needed) 80 | postprocess Run postprocessing (with preprocessing and phables if needed) 81 | print_stages List available stages 82 | ``` 83 | 84 | ## Run options explained 85 | 86 | * `--input` - assembly graph file in .GFA format 87 | * `--reads` - folder containing paired-end read files 88 | * `--minlength` - minimum length of circular unitigs to consider [default: 2000] 89 | * `--mincov` - minimum coverage of paths to output [default: 10] 90 | * `--compcount` - maximum unitig count to consider a component [default: 200] 91 | * `--maxpaths` - maximum number of paths to resolve for a component [default: 10] 92 | * `--mgfrac` - length threshold to consider single copy marker genes [default: 0.2] 93 | * `--evalue` - maximum e-value for phrog annotations [default: 1e-10] 94 | * `--seqidentity` - minimum sequence identity for phrog annotations [default: 0.3] 95 | * `--covtol` - coverage tolerance for extending subpaths [default: 100] 96 | * `--alpha` - coverage multiplier for flow interval modelling [default: 1.2] 97 | * `--longreads` - provide long reads as input. 
If this flag is not provided phables defaults to short reads 98 | * `--prefix` - prefix for genome identifier [default: None] 99 | * `--output` - path to the output directory [default: `phables.out`] 100 | * `--configfile` - custom config file [default: `(outputDir)/config.yaml`] 101 | * `--threads` - number of threads to use [default: 1] 102 | * `--use-conda` / `--no-use-conda` - use conda for Snakemake rules [default: `use-conda`] 103 | * `--conda-prefix` - custom conda env directory 104 | * `--snake-default` - customise Snakemake runtime args [default: `--rerun-incomplete, --printshellcmds, --nolock, --show-failed-logs`] 105 | 106 | 107 | ## Example usage 108 | 109 | Assuming your assembly graph file is `assembly_graph.gfa` and reads folder as `fastq`, you can run `phables` as follows. 110 | 111 | ### Using short reads 112 | 113 | ```bash 114 | # Preprocess data using 8 threads (default is 1 thread) 115 | phables run --input assembly_graph.gfa --reads fastq --threads 8 116 | ``` 117 | 118 | ### Using long reads 119 | 120 | ```bash 121 | # Preprocess data using 8 threads (default is 1 thread) 122 | phables run --input assembly_graph.gfa --reads fastq --threads 8 --longreads 123 | ``` 124 | 125 | Note that you should provide the path to the GFA file to the `--input` parameter and the folder containing your sequencing reads to the `--reads` parameter. 126 | 127 | The output of Phables is set by default to `phables.out`. You can update the output path using the `--output` parameter for `phables run` as follows. 128 | 129 | ```bash 130 | # Preprocess data using 8 threads (default is 1 thread) 131 | phables run --input assembly_graph.gfa --reads fastq --output my_output_folder --threads 8 132 | ``` 133 | 134 | The `phables run` command will run preprocessing steps, perform genome resolution and then perform postprocessing steps. 135 | 136 | ## Output 137 | 138 | Following is the folder structure of the Phables complete run.
139 | 140 | ``` 141 | phables.out 142 | ├── config.yaml # config file 143 | ├── logs # all log files 144 | ├── phables # final phables results 145 | ├── phables.log # phables master log 146 | ├── postprocess # postprocessing results 147 | └── preprocess # preprocessing results 148 | ``` 149 | 150 | Phables will create 3 main folders `preprocess`, `phables` and `postprocess` for the different stages of execution. 151 | 152 | ### 1. `preprocess` - preprocessing results 153 | 154 | The following preprocessing steps will be run and their corresponding files and folders can be found in the `preprocess` folder. 155 | 156 | * Obtain unitig sequences from assembly graph - `edges.fasta` 157 | * Map reads to unitig sequences and get BAM files - `temp/*.bam` and `temp/*.bai` 158 | * Calculate coverage of unitig sequences - `coverage.tsv` 159 | * Scan unitig sequences for single-copy marker genes - `edges.fasta.hmmout` 160 | * Scan unitig sequences for Prokaryotic Virus Remote Homologous Groups ([PHROGs](https://phrogs.lmge.uca.fr/)) - `phrogs_annotations.tsv` 161 | 162 | ### 2. `phables` - genome resolution results 163 | 164 | The following files and folders can be found inside the `phables` folder which are the main outputs of Phables. 
165 | 166 | * `resolved_paths.fasta` containing the resolved genomes 167 | * `resolved_phages` folder containing the resolved genomes in individual FASTA files 168 | * `resolved_genome_info.txt` containing the path name, coverage, length, GC content and unitig order of the resolved genomes 169 | * `resolved_edges.fasta` containing the unitigs that make up the resolved genomes 170 | * `unresolved_phage_like_edges.fasta` containing all the unresolved phage-like unitigs 171 | * `all_phage_like_edges.fasta` containing sequences from all the phage-like components (both resolved and unresolved) 172 | * `resolved_component_info.txt` containing the details of the phage bubbles resolved 173 | * `component_phrogs.txt` containing PHROGs found in each component 174 | 175 | ### 3. `postprocess` - postprocessing results 176 | 177 | The following postprocessing steps will be run and their corresponding files and folders can be found in the `postprocess` folder. 178 | 179 | * Combine resolved genomes and unresolved edges - `genomes_and_unresolved_edges.fasta` 180 | * Obtain read counts for resolved genomes and unresolved edges - `sample_genome_read_counts.tsv` 181 | * Obtain mean coverage of resolved genomes and unresolved edges - `sample_genome_mean_coverage.tsv` 182 | * Obtain RPKM coverage of resolved genomes and unresolved edges - `sample_genome_rpkm.tsv` 183 | 184 | 185 | ## Step-wise usage 186 | 187 | You can execute each of the preprocessing, phables and postprocessing steps individually if you wish to do so as follows. 188 | 189 | ### Preprocessing only 190 | 191 | You can use the following command to **only run the preprocessing steps**. 192 | 193 | ```bash 194 | # Only preprocess data 195 | phables run --input assembly_graph.gfa --reads fastq --threads 8 preprocess 196 | ``` 197 | 198 | ### Genome resolution only 199 | 200 | You can use the following command to **only run the genome resolution steps**. 
Please make sure to have the preprocessing results in the output folder. 201 | 202 | ```bash 203 | # Only run phables core using short reads 204 | phables run --input assembly_graph.gfa --reads fastq --threads 8 phables 205 | 206 | # Only run phables core using long reads 207 | phables run --input assembly_graph.gfa --reads fastq --threads 8 --longreads phables 208 | ``` 209 | 210 | ### Postprocessing only 211 | 212 | You can use the following command to **only run the postprocessing steps**. 213 | 214 | ```bash 215 | # Only run postprocessing 216 | phables run --input assembly_graph.gfa --reads fastq --threads 8 postprocess 217 | ``` 218 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Phables 2 | site_url: "https://github.com/Vini2/phables" 3 | site_author: "Vijini Mallawaarachchi" 4 | repo_url: "https://github.com/Vini2/phables" 5 | repo_name: 'GitHub' 6 | theme: 7 | name: readthedocs 8 | highlightjs: true 9 | hljs_languages: 10 | - yaml 11 | - bash 12 | - shell 13 | - text 14 | nav: 15 | - HOME: 16 | - Introduction: index.md 17 | - Citation: citation.md 18 | - RUNNING: 19 | - Install: install.md 20 | - Assemble: assemble.md 21 | - Usage: usage.md 22 | - FAQ: faq.md 23 | - EVALUATION: 24 | - Running CheckV: quality.md 25 | - Quality comparison: comparison.md 26 | - Annotation with pharokka: annotation.md 27 | - Graph statistics: graph_stats.md 28 | -------------------------------------------------------------------------------- /phables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/phables/__init__.py -------------------------------------------------------------------------------- /phables/__main__.py: -------------------------------------------------------------------------------- 1 | """ 
2 | Entrypoint for phables 3 | 4 | Check out the wiki for a detailed look at customising this file: 5 | https://github.com/beardymcjohnface/Snaketool/wiki/Customising-your-Snaketool 6 | """ 7 | 8 | import os 9 | import click 10 | 11 | from snaketool_utils.cli_utils import OrderedCommands, run_snakemake, copy_config, echo_click 12 | 13 | 14 | def snake_base(rel_path): 15 | return os.path.join(os.path.dirname(os.path.realpath(__file__)), rel_path) 16 | 17 | 18 | def get_version(): 19 | with open(snake_base("phables.VERSION"), "r") as f: 20 | version = f.readline() 21 | return version 22 | 23 | 24 | def print_citation(): 25 | with open(snake_base("phables.CITATION"), "r") as f: 26 | for line in f: 27 | echo_click(line) 28 | 29 | 30 | def default_to_output(ctx, param, value): 31 | """Callback for click options; places value in output directory unless specified""" 32 | if param.default == value: 33 | return os.path.join(ctx.params["output"], value) 34 | return value 35 | 36 | 37 | def common_options(func): 38 | """Common command line args 39 | Define common command line args here, and include them with the @common_options decorator below. 
40 | """ 41 | options = [ 42 | click.option( 43 | "--output", 44 | help="Output directory", 45 | type=click.Path(dir_okay=True, writable=True, readable=True), 46 | default="phables.out", 47 | show_default=True, 48 | ), 49 | click.option( 50 | "--configfile", 51 | default="config.yaml", 52 | show_default=False, 53 | callback=default_to_output, 54 | help="Custom config file [default: (outputDir)/config.yaml]", 55 | ), 56 | click.option( 57 | "--threads", help="Number of threads to use", default=1, show_default=True 58 | ), 59 | click.option( 60 | "--use-conda/--no-use-conda", 61 | default=True, 62 | help="Use conda for Snakemake rules", 63 | show_default=True, 64 | ), 65 | click.option( 66 | "--conda-prefix", 67 | default=snake_base(os.path.join("workflow", "conda")), 68 | help="Custom conda env directory", 69 | type=click.Path(), 70 | show_default=False, 71 | ), 72 | click.option( 73 | "--profile", help="Snakemake profile", default=None, show_default=False 74 | ), 75 | click.option( 76 | "--snake-default", 77 | multiple=True, 78 | default=[ 79 | "--rerun-incomplete", 80 | "--printshellcmds", 81 | "--nolock", 82 | "--show-failed-logs", 83 | ], 84 | help="Customise Snakemake runtime args", 85 | show_default=True, 86 | ), 87 | click.option( 88 | "--log", 89 | default="phables.log", 90 | callback=default_to_output, 91 | hidden=True, 92 | ), 93 | click.option( 94 | "--system_config", 95 | default=snake_base(os.path.join("config", "config.yaml")), 96 | hidden=True, 97 | type=click.Path(), 98 | ), 99 | click.argument("snake_args", nargs=-1), 100 | ] 101 | for option in reversed(options): 102 | func = option(func) 103 | return func 104 | 105 | 106 | def run_options(func): 107 | """Command line args for run subcommand etc""" 108 | options = [ 109 | click.option( 110 | "--input", 111 | help="Path to assembly graph file in .GFA format", 112 | type=click.Path(), 113 | required=True, 114 | ), 115 | click.option( 116 | "--reads", 117 | help="Path to directory containing 
paired-end reads", 118 | type=click.Path(exists=True), 119 | required=True, 120 | ), 121 | click.option( 122 | "--minlength", 123 | default=2000, 124 | required=False, 125 | help="minimum length of circular unitigs to consider", 126 | type=int, 127 | show_default=True, 128 | ), 129 | click.option( 130 | "--mincov", 131 | default=10, 132 | required=False, 133 | help="minimum coverage of paths to output", 134 | type=int, 135 | show_default=True, 136 | ), 137 | click.option( 138 | "--compcount", 139 | default=200, 140 | required=False, 141 | help="maximum unitig count to consider a component", 142 | type=int, 143 | show_default=True, 144 | ), 145 | click.option( 146 | "--maxpaths", 147 | default=10, 148 | required=False, 149 | help="maximum number of paths to resolve for a component", 150 | type=int, 151 | show_default=True, 152 | ), 153 | click.option( 154 | "--mgfrac", 155 | default=0.2, 156 | required=False, 157 | help="length threshold to consider single copy marker genes", 158 | type=float, 159 | show_default=True, 160 | ), 161 | click.option( 162 | "--evalue", 163 | default=1e-10, 164 | required=False, 165 | help="maximum e-value for phrog annotations", 166 | type=float, 167 | show_default=True, 168 | ), 169 | click.option( 170 | "--seqidentity", 171 | default=0.3, 172 | required=False, 173 | help="minimum sequence identity for phrog annotations", 174 | type=float, 175 | show_default=True, 176 | ), 177 | click.option( 178 | "--covtol", 179 | default=100, 180 | required=False, 181 | help="coverage tolerance for extending subpaths", 182 | type=int, 183 | show_default=True, 184 | ), 185 | click.option( 186 | "--alpha", 187 | default=1.2, 188 | required=False, 189 | help="coverage multiplier for flow interval modelling", 190 | type=float, 191 | show_default=True, 192 | ), 193 | click.option( 194 | "--longreads", 195 | help="provide long reads as input (else defaults to short reads)", 196 | is_flag=True, 197 | default=False, 198 | show_default=True, 199 | 
required=False, 200 | ), 201 | click.option( 202 | "--prefix", 203 | help="prefix for genome identifier", 204 | type=str, 205 | required=False, 206 | ), 207 | ] 208 | for option in reversed(options): 209 | func = option(func) 210 | return func 211 | 212 | 213 | @click.group( 214 | cls=OrderedCommands, context_settings=dict(help_option_names=["-h", "--help"]) 215 | ) 216 | @click.version_option(get_version(), "-v", "--version", is_flag=True) 217 | def cli(): 218 | """ 219 | Phables: from fragmented assemblies to high-quality bacteriophage genomes. 220 | Please refer the full documentation available on Read the Docs at https://phables.readthedocs.io/ 221 | """ 222 | pass 223 | 224 | 225 | help_msg_extra = """ 226 | \b 227 | \b 228 | If you use Phables in your work, please cite Phables as, 229 | \b 230 | Vijini Mallawaarachchi, Michael J Roach, Przemyslaw Decewicz, 231 | Bhavya Papudeshi, Sarah K Giles, Susanna R Grigson, George Bouras, 232 | Ryan D Hesse, Laura K Inglis, Abbey L K Hutton, Elizabeth A Dinsdale, 233 | Robert A Edwards, Phables: from fragmented assemblies to high-quality 234 | bacteriophage genomes, Bioinformatics, Volume 39, Issue 10, 235 | October 2023, btad586, https://doi.org/10.1093/bioinformatics/btad586 236 | \b 237 | \b 238 | For more information on Phables please visit: 239 | https://phables.readthedocs.io/ 240 | \b 241 | \b 242 | CLUSTER EXECUTION: 243 | phables run ... --profile [profile] 244 | For information on Snakemake profiles see: 245 | https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles 246 | \b 247 | RUN EXAMPLES: 248 | Required: phables run --input [assembly graph file] 249 | Specify threads: phables run ... --threads [threads] 250 | Disable conda: phables run ... --no-use-conda 251 | Change defaults: phables run ... --snake-default="-k --nolock" 252 | Add Snakemake args: phables run ... --dry-run --keep-going --touch 253 | Specify targets: phables run ... 
print_stages 254 | Available targets: 255 | all Run everything (default) 256 | preprocess Run preprocessing only 257 | phables Run phables (and preprocessing if needed) 258 | postprocess Run postprocessing (with preprocessing and phables if needed) 259 | print_stages List available stages 260 | """ 261 | 262 | 263 | # Run command 264 | @click.command( 265 | epilog=help_msg_extra, 266 | context_settings=dict( 267 | help_option_names=["-h", "--help"], ignore_unknown_options=True 268 | ), 269 | ) 270 | @common_options 271 | @run_options 272 | def run(**kwargs): 273 | """Run Phables""" 274 | 275 | # run! 276 | run_snakemake( 277 | # Full path to Snakefile 278 | snakefile_path=snake_base(os.path.join("workflow", "phables.smk")), 279 | merge_config=kwargs, 280 | **kwargs 281 | ) 282 | 283 | 284 | # Install command 285 | @click.command( 286 | epilog=help_msg_extra, 287 | context_settings=dict( 288 | help_option_names=["-h", "--help"], ignore_unknown_options=True 289 | ), 290 | ) 291 | @common_options 292 | def install(output, **kwargs): 293 | """Install databases""" 294 | 295 | # run! 296 | run_snakemake( 297 | # Full path to Snakefile 298 | snakefile_path=snake_base(os.path.join("workflow", "install.smk")), 299 | **kwargs 300 | ) 301 | 302 | 303 | # Test command 304 | @click.command( 305 | epilog=help_msg_extra, 306 | context_settings=dict( 307 | help_option_names=["-h", "--help"], ignore_unknown_options=True 308 | ), 309 | ) 310 | @common_options 311 | def test(**kwargs): 312 | """Test Phables""" 313 | test_dir = snake_base("test_data") 314 | 315 | # Config to add or update in configfile 316 | merge_config = {"dir": test_dir} 317 | 318 | # run! 
319 | run_snakemake( 320 | # Full path to Snakefile 321 | snakefile_path=snake_base(os.path.join("workflow", "test_phables.smk")), 322 | merge_config=merge_config, 323 | **kwargs 324 | ) 325 | 326 | 327 | @click.command() 328 | @common_options 329 | def config(configfile, **kwargs): 330 | """Copy the system default config file""" 331 | copy_config(configfile) 332 | 333 | 334 | @click.command() 335 | def citation(**kwargs): 336 | """Print the citation(s) for this tool""" 337 | print_citation() 338 | 339 | 340 | cli.add_command(run) 341 | cli.add_command(install) 342 | cli.add_command(test) 343 | cli.add_command(config) 344 | cli.add_command(citation) 345 | 346 | 347 | def main(): 348 | cli() 349 | 350 | 351 | if __name__ == "__main__": 352 | main() 353 | -------------------------------------------------------------------------------- /phables/config/config.yaml: -------------------------------------------------------------------------------- 1 | # Snakemake config 2 | input: 3 | output: 'phables.out/' 4 | log: 'phables/phables.log' 5 | 6 | # Databases 7 | databases: 8 | 9 | # Profile 10 | profile: 11 | 12 | # Job resources for use with Snakemake profiles 13 | # jobCPU will be scaled down if running locally with fewer than 8 threads 14 | # jobMem is ignored when running locally 15 | resources: 16 | jobCPU: 8 17 | jobMem: 16000 # in MB 18 | 19 | # Phables parameters 20 | minlength: 2000 21 | mincov: 10 22 | compcount: 200 23 | maxpaths: 10 24 | mgfrac: 0.2 25 | evalue: 1E-10 26 | seqidentity: 0.3 27 | covtol: 100 28 | alpha: 1.2 29 | longreads: False 30 | prefix: -------------------------------------------------------------------------------- /phables/config/databases.yaml: -------------------------------------------------------------------------------- 1 | # Bacterial single-copy marker genes HMM file 2 | smg_hmm: "https://raw.githubusercontent.com/metagentools/MetaCoAG/develop/src/metacoag/metacoag_utils/auxiliary/marker.hmm" 3 | smg_hmm_file: "marker.hmm" 4 | 5 | # 
PHROGs mmseqs database 6 | phrogs_mmseqs: "https://phrogs.lmge.uca.fr/downloads_from_website/phrogs_mmseqs_db.tar.gz" 7 | phrogs_mmseqs_file: "phrogs_mmseqs_db.tar.gz" 8 | phrogs_mmseqs_folder: "phrogs_mmseqs_db/" 9 | 10 | # PHROGs annotations 11 | phrog_annot: "https://phrogs.lmge.uca.fr/downloads_from_website/phrog_annot_v4.tsv" 12 | phrog_annot_file: "phrog_annot_v4.tsv" -------------------------------------------------------------------------------- /phables/phables.CITATION: -------------------------------------------------------------------------------- 1 | Please cite phables in your paper using this link: 2 | https://doi.org/10.1093/bioinformatics/btad586 3 | 4 | 5 | Please consider also citing these dependencies: 6 | 7 | Snaketool: 8 | https://doi.org/10.31219/osf.io/8w5j3 9 | 10 | Snakemake: 11 | https://doi.org/10.12688/f1000research.29032.1 12 | 13 | PHROG: 14 | https://doi.org/10.1093/nargab/lqab067 15 | 16 | MMseqs2: 17 | https://doi.org/10.1038/nbt.3988 18 | 19 | Minimap2: 20 | https://doi.org/10.1093/bioinformatics/bty191 21 | 22 | SAMtools: 23 | https://doi.org/10.1093/bioinformatics/btp352 24 | 25 | CoverM 26 | https://github.com/wwood/CoverM 27 | 28 | Koverage 29 | https://github.com/beardymcjohnface/Koverage 30 | 31 | NetworkX 32 | https://conference.scipy.org/proceedings/scipy2008/paper_2/ 33 | 34 | Gurobi Optimization 35 | https://www.gurobi.com/ 36 | -------------------------------------------------------------------------------- /phables/phables.LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023, Vijini Mallawaarachchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the 
Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /phables/phables.VERSION: -------------------------------------------------------------------------------- 1 | 1.4.1 -------------------------------------------------------------------------------- /phables/test_data/edge_coverages.tsv: -------------------------------------------------------------------------------- 1 | Contig sample1 sample2 2 | edge_1 4742.0 0.0 3 | edge_2 5000.0 0.0 4 | edge_3 4858.0 0.0 5 | edge_4 102.0 303.0 6 | edge_5 12.0 33.0 7 | edge_6 200.0 100.0 8 | edge_7 6000.0 9000.0 9 | edge_8 2000.0 3000.0 10 | edge_9 4020.0 6010.0 11 | edge_10 7000.0 8070.0 12 | edge_11 3090.0 5090.0 13 | edge_12 2010.0 3040.0 14 | edge_13 6010.0 9020.0 15 | edge_14 50.0 700.0 16 | edge_15 50.0 50.0 17 | edge_16 150.0 150.0 -------------------------------------------------------------------------------- /phables/test_data/edges.fasta.hmmout: -------------------------------------------------------------------------------- 1 | # --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord 2 | # target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias 
from to from to from to acc description of target 3 | # ------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- --------------------- -------------------------------------------------------------------------------- /phables/test_data/junction_pe_coverage.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/phables/test_data/junction_pe_coverage.pickle -------------------------------------------------------------------------------- /phables/test_data/phrogs_annotations.tsv: -------------------------------------------------------------------------------- 1 | "edge_2" "phrog_2 ## NC_006953_p8" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 2 | "edge_4" "phrog_2 ## NC_006953_p8" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 3 | "edge_5" "phrog_2 ## NC_006953_p8" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 4 | "edge_13" "phrog_2 ## NC_006953_p8" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 5 | "edge_14" "phrog_94 ## p350580 VI_04431" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 6 | "edge_14" "phrog_30832 ## AP018399_p123" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 7 | "edge_14" "phrog_195 ## p126863 VI_01011" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 8 | "edge_14" "phrog_1858 ## p362065 VI_01943" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 9 | "edge_15" "phrog_30832 ## AP018399_p123" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 10 | "edge_15" "phrog_195 ## p126863 VI_01011" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 11 | "edge_15" "phrog_1858 ## p362065 VI_01943" 142 0.994 3.622E-37 835 215 9332 18 221 408 901 2 -1 -1 -------------------------------------------------------------------------------- 
/phables/workflow/envs/curl.yaml: -------------------------------------------------------------------------------- 1 | name: curl 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - curl -------------------------------------------------------------------------------- /phables/workflow/envs/koverage.yaml: -------------------------------------------------------------------------------- 1 | name: koverage 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | dependencies: 6 | - koverage>=0.1.8 7 | - numpy<2.0.0 8 | -------------------------------------------------------------------------------- /phables/workflow/envs/mapping.yaml: -------------------------------------------------------------------------------- 1 | name: mapping 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - minimap2 6 | - samtools 7 | -------------------------------------------------------------------------------- /phables/workflow/envs/mmseqs.yaml: -------------------------------------------------------------------------------- 1 | name: mmseqs 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - mmseqs2=13.45111 6 | -------------------------------------------------------------------------------- /phables/workflow/envs/phables.yaml: -------------------------------------------------------------------------------- 1 | name: phables 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | - bioconda 6 | - gurobi 7 | dependencies: 8 | - python>=3.9, <3.11 9 | - biopython 10 | - python-igraph 11 | - pysam 12 | - networkx>=2.8.6 13 | - scipy 14 | - numpy<2.0.0 15 | - pandas 16 | - gurobi>=10.0.0 17 | - more-itertools 18 | - tqdm 19 | - click 20 | - metasnek>=0.0.3 21 | -------------------------------------------------------------------------------- /phables/workflow/envs/smg.yaml: -------------------------------------------------------------------------------- 1 | name: smg 2 | channels: 3 | - bioconda 4 | dependencies: 5 | - fraggenescan 6 | - hmmer 7 | 
-------------------------------------------------------------------------------- /phables/workflow/install.smk: -------------------------------------------------------------------------------- 1 | """ 2 | Phables: from fragmented assemblies to high-quality bacteriophage genomes. 3 | 4 | 2023, Vijini Mallawaarachchi 5 | 6 | This is an auxiliary Snakefile to install databases or dependencies. 7 | """ 8 | 9 | 10 | """CONFIGURATION""" 11 | configfile: os.path.join(workflow.basedir, "..", "config", "config.yaml") 12 | configfile: os.path.join(workflow.basedir, "..", "config", "databases.yaml") 13 | 14 | include: "rules/00_database_preflight.smk" 15 | 16 | 17 | """TARGETS""" 18 | db_files = [] 19 | 20 | db_files.append(os.path.join(DBPATH, config['phrogs_mmseqs_folder'])) 21 | db_files.append(os.path.join(DBPATH, config['smg_hmm_file'])) 22 | db_files.append(os.path.join(DBPATH, config['phrog_annot_file'])) 23 | 24 | 25 | """RUN SNAKEMAKE""" 26 | rule all: 27 | input: 28 | db_files 29 | 30 | 31 | """RULES""" 32 | rule phrogs_mmseqs_download: 33 | params: 34 | url=os.path.join(config['phrogs_mmseqs']), 35 | file=os.path.join(DBPATH, config['phrogs_mmseqs_file']), 36 | db_path = DBPATH 37 | output: 38 | directory(os.path.join(DBPATH, config['phrogs_mmseqs_folder'])) 39 | conda: 40 | os.path.join("envs", "curl.yaml") 41 | shell: 42 | """ 43 | curl -Lko {params.file} {params.url} 44 | tar -xf {params.file} -C {params.db_path} 45 | rm -rf {params.file} 46 | """ 47 | 48 | rule smg_hmm_download: 49 | params: 50 | url=os.path.join(config['smg_hmm']) 51 | output: 52 | os.path.join(DBPATH, config['smg_hmm_file']) 53 | conda: 54 | os.path.join("envs", "curl.yaml") 55 | shell: 56 | """ 57 | curl -Lko {output} {params.url} 58 | """ 59 | 60 | rule phrog_annot_download: 61 | params: 62 | url=os.path.join(config['phrog_annot']) 63 | output: 64 | os.path.join(DBPATH, config['phrog_annot_file']) 65 | conda: 66 | os.path.join("envs", "curl.yaml") 67 | shell: 68 | """ 69 | curl -Lko 
{output} {params.url} 70 | """ -------------------------------------------------------------------------------- /phables/workflow/phables.smk: -------------------------------------------------------------------------------- 1 | """ 2 | Phables: from fragmented assemblies to high-quality bacteriophage genomes. 3 | 4 | 2023, Vijini Mallawaarachchi 5 | 6 | This is the main Snakefile to run phables. 7 | """ 8 | 9 | """CONFIGURATION""" 10 | configfile: os.path.join(workflow.basedir, "..", "config", "config.yaml") 11 | configfile: os.path.join(workflow.basedir, "..", "config", "databases.yaml") 12 | 13 | 14 | """PREFLIGHT CHECKS""" 15 | include: os.path.join("rules", "00_database_preflight.smk") 16 | include: os.path.join("rules", "02_phables_preflight.smk") 17 | 18 | 19 | """TARGETS""" 20 | include: os.path.join("rules", "02_phables_targets.smk") 21 | 22 | 23 | """Target rules""" 24 | target_rules = [] 25 | 26 | def targetRule(fn): 27 | assert fn.__name__.startswith("__") 28 | target_rules.append(fn.__name__[2:]) 29 | return fn 30 | 31 | localrules: all, preprocess, phables, print_stages, koverage_tsv, postprocess 32 | 33 | 34 | """Run stages""" 35 | @targetRule 36 | rule all: 37 | input: 38 | preprocessTargets, 39 | phablesTargets, 40 | postprocessTargets 41 | 42 | 43 | @targetRule 44 | rule preprocess: 45 | input: 46 | preprocessTargets 47 | 48 | 49 | @targetRule 50 | rule phables: 51 | input: 52 | phablesTargets 53 | 54 | 55 | @targetRule 56 | rule postprocess: 57 | input: 58 | postprocessTargets 59 | 60 | 61 | @targetRule 62 | rule print_stages: 63 | run: 64 | print("\nIndividual Phables stages to run: \n", file=sys.stderr) 65 | print("* " + "\n* ".join(target_rules) + "\n\n", file=sys.stderr) 66 | 67 | 68 | """RULES""" 69 | # Step 2: Obtain unitig sequences from assembly graph 70 | include: os.path.join("rules", "gfa2fasta.smk") 71 | 72 | 73 | # Step 3: Calculate coverage of unitig sequences 74 | include: os.path.join("rules", "coverage.smk") 75 | 76 | 77 | # Step 
4: Scan unitig sequences for single-copy marker genes and PHROGs 78 | include: os.path.join("rules", "genes.smk") 79 | 80 | 81 | # Step 5: Run Phables 82 | include: os.path.join("rules", "phables.smk") 83 | 84 | 85 | # Step 6: Postprocess genomes 86 | include: os.path.join("rules", "postprocess.smk") 87 | -------------------------------------------------------------------------------- /phables/workflow/rules/00_database_preflight.smk: -------------------------------------------------------------------------------- 1 | """ 2 | Add your preflight checks as pure Python code here. 3 | e.g. Configure the run, declare directories, validate the input files etc. 4 | This preflight check to confirm the database filepaths 5 | """ 6 | 7 | 8 | """CHECK IF CUSTOM DATABASE DIRECTORY""" 9 | DBPATH = "" 10 | try: 11 | if config['databases'] is None: 12 | DBPATH = os.path.join(workflow.basedir, '..', '..', 'databases') 13 | else: 14 | DBPATH = config['databases'] 15 | except KeyError: 16 | DBPATH = os.path.join(workflow.basedir,'..','..','databases') 17 | 18 | 19 | """ONSTART/END/ERROR 20 | Tasks to perform at various stages the start and end of a run. 21 | """ 22 | onsuccess: 23 | """Print a success message""" 24 | sys.stderr.write('\n\nDatabases are successfully setup!\n\n') 25 | 26 | onerror: 27 | """Print an error message""" 28 | sys.stderr.write('\n\nERROR: Databases were not setup.\n\n') 29 | -------------------------------------------------------------------------------- /phables/workflow/rules/02_phables_preflight.smk: -------------------------------------------------------------------------------- 1 | """ 2 | Add your preflight checks as pure Python code here. 3 | e.g. Configure the run, declare directories, validate the input files etc. 
4 | This preflight sets the input/output paths, finds the read samples and loads the Phables parameters 5 | """ 6 | 7 | from metasnek import fastq_finder 8 | 9 | """ 10 | Setting the directory variables 11 | """ 12 | 13 | # THREADS = config['threads'] 14 | INPUT = config['input'] 15 | OUTDIR = config['output'] 16 | print(f"Output files will be saved to directory, {OUTDIR}\n") 17 | 18 | 19 | ############################################################################ 20 | # Checking through the reads folder 21 | ############################################################################ 22 | 23 | SAMPLE_READS = fastq_finder.parse_samples_to_dictionary(config['reads']) 24 | SAMPLE_NAMES = list(SAMPLE_READS.keys()) 25 | 26 | 27 | ############################################################################ 28 | # Get Phables parameters 29 | ############################################################################ 30 | ML = config['minlength'] 31 | MC = config['mincov'] 32 | CC = config['compcount'] 33 | MP = config['maxpaths'] 34 | MGF = config['mgfrac'] 35 | EV = config['evalue'] 36 | SI = config['seqidentity'] 37 | CT = config['covtol'] 38 | AL = config['alpha'] 39 | LR = config['longreads'] 40 | PR = config['prefix'] 41 | 42 | 43 | """DIRECTORIES/FILES etc. 44 | Declare some directories for pipeline intermediates and outputs. 45 | """ 46 | LOGSDIR = os.path.join(OUTDIR, 'logs') 47 | 48 | 49 | """ONSTART/END/ERROR 50 | Tasks to perform at the start and end of a run. 
51 | """ 52 | onstart: 53 | """Cleanup old log files (names ending in .log) before starting""" 54 | if os.path.isdir(LOGSDIR): 55 | oldLogs = filter(re.compile(r'.*\.log$').match, os.listdir(LOGSDIR)) 56 | for logfile in oldLogs: 57 | os.unlink(os.path.join(LOGSDIR, logfile)) 58 | 59 | 60 | onsuccess: 61 | """Print a success message""" 62 | sys.stderr.write('\n\nPhables ran successfully!\n\n') 63 | 64 | 65 | onerror: 66 | """Print an error message""" 67 | sys.stderr.write('\n\nPhables run failed\n\n') 68 | -------------------------------------------------------------------------------- /phables/workflow/rules/02_phables_targets.smk: -------------------------------------------------------------------------------- 1 | 2 | preprocessTargets = [] 3 | phablesTargets = [] 4 | postprocessTargets = [] 5 | 6 | 7 | """PREPROCESSING TARGETS""" 8 | EDGES_FILE = os.path.join(OUTDIR, "preprocess", "edges.fasta") 9 | preprocessTargets.append(EDGES_FILE) 10 | 11 | BAM_PATH = os.path.join(OUTDIR, "preprocess", "temp") 12 | preprocessTargets.append(expand(os.path.join(BAM_PATH, "{sample}.bam"), sample=SAMPLE_NAMES)) 13 | preprocessTargets.append(expand(os.path.join(BAM_PATH, "{sample}.bam.bai"), sample=SAMPLE_NAMES)) 14 | 15 | COVERAGE_PATH = os.path.join(OUTDIR, "preprocess", "coverage_rpkm/") 16 | # preprocessTargets.append(expand(os.path.join(COVERAGE_PATH, "{sample}_rpkm.tsv"), sample=SAMPLE_NAMES)) 17 | preprocessTargets.append(os.path.join(OUTDIR, "preprocess", "coverage.tsv")) 18 | preprocessTargets.append(os.path.join(OUTDIR, "preprocess", "edges.fasta.hmmout")) 19 | 20 | preprocessTargets.append(os.path.join(OUTDIR, "preprocess", "phrogs_annotations.tsv")) 21 | 22 | 23 | """MISC""" 24 | COVERAGE_FILE = os.path.join(OUTDIR, "preprocess", "coverage.tsv") 25 | PHROG_ANNOT = os.path.join(OUTDIR, "preprocess", "phrogs_annotations.tsv") 26 | SMG_FILE = os.path.join(OUTDIR, "preprocess", "edges.fasta.hmmout") 27 | GRAPH_FILE = INPUT 28 | 29 | 30 | """PHABLES TARGETS""" 31 | RESOLVED_GENOMES = 
os.path.join(OUTDIR, "phables", "resolved_paths.fasta") 32 | 33 | RESOLVED_GENOME_INFO = os.path.join(OUTDIR, "phables", "resolved_genome_info.txt") 34 | phablesTargets.append(RESOLVED_GENOME_INFO) 35 | 36 | RESOLVED_COMP_INFO = os.path.join(OUTDIR, "phables", "resolved_component_info.txt") 37 | phablesTargets.append(RESOLVED_COMP_INFO) 38 | 39 | COMP_PHROGS = os.path.join(OUTDIR, "phables", "component_phrogs.txt") 40 | phablesTargets.append(COMP_PHROGS) 41 | 42 | 43 | """POSTPROCESSING TARGETS""" 44 | GENOME_READ_COUNTS = os.path.join(OUTDIR, "postprocess", "sample_genome_read_counts.tsv") 45 | postprocessTargets.append(GENOME_READ_COUNTS) -------------------------------------------------------------------------------- /phables/workflow/rules/03_test_preflight.smk: -------------------------------------------------------------------------------- 1 | """ 2 | Add your preflight checks as pure Python code here. 3 | e.g. Configure the run, declare directories, validate the input files etc. 4 | This preflight check to confirm the database filepaths 5 | """ 6 | 7 | 8 | """ 9 | Setting the directory variables 10 | """ 11 | 12 | TESTDIR = config['dir'] 13 | 14 | 15 | ############################################################################ 16 | # Get Phables parameters 17 | ############################################################################ 18 | ML = config['minlength'] 19 | MC = config['mincov'] 20 | CC = config['compcount'] 21 | MP = config['maxpaths'] 22 | MGF = config['mgfrac'] 23 | EV = config['evalue'] 24 | SI = config['seqidentity'] 25 | CT = config['covtol'] 26 | AL = config['alpha'] 27 | LR = config['longreads'] 28 | PR = config['prefix'] 29 | 30 | 31 | """ONSTART/END/ERROR 32 | Tasks to perform at various stages the start and end of a run. 
# Files the test workflow has to produce; every one of them lives in TESTDIR.
allTargets = [
    os.path.join(TESTDIR, target_name)
    for target_name in (
        "resolved_paths.fasta",
        "resolved_genome_info.txt",
        "resolved_edges.fasta",
        "resolved_component_info.txt",
        "phage_like_edges.fasta",
    )
]
rule koverage_tsv:
    """Write the sample/reads table (SAMPLE_READS) as a TSV for Koverage."""
    output:
        os.path.join(OUTDIR, "preprocess", "phables.samples.tsv")
    params:
        SAMPLE_READS
    run:
        from metasnek import fastq_finder
        fastq_finder.write_samples_tsv(params[0], output[0])


rule koverage:
    """Run `koverage run coverm` on the unitig sequences for all samples.

    Declares the per-sample BAM files and their indexes under preprocess/temp
    and the combined results/sample_coverm_coverage.tsv as outputs.
    """
    input:
        tsv = os.path.join(OUTDIR, "preprocess", "phables.samples.tsv"),
        edges = EDGES_FILE
    params:
        out_dir = os.path.join(OUTDIR, "preprocess"),
        # Forward a Snakemake profile to Koverage only when one is configured.
        profile = lambda wildcards: "--profile " + config["profile"] if config["profile"] else "",
    output:
        expand(os.path.join(OUTDIR, "preprocess", "temp", "{sample}.{ext}"),
               sample=SAMPLE_NAMES,
               ext=["bam","bam.bai"]),
        os.path.join(OUTDIR, "preprocess", "results", "sample_coverm_coverage.tsv")
    threads:
        config["resources"]["jobCPU"]
    resources:
        mem_mb = config["resources"]["jobMem"],
        mem = str(config["resources"]["jobMem"]) + "MB"
    conda:
        os.path.join("..", "envs", "koverage.yaml")
    shell:
        """
        koverage run coverm \
            --reads {input.tsv} \
            --ref {input.edges} \
            --threads {threads} \
            --output {params.out_dir} \
            {params.profile}
        """


rule run_combine_cov:
    """Sum column 6 per contig (column 2) over all samples.

    Input columns, as noted by the original authors:
    Sample, Contig, Count, RPKM, TPM, Mean, Covered_bases, Variance.
    Output lines are "<contig> <sum>" (awk default output separator).
    """
    input:
        os.path.join(OUTDIR, "preprocess", "results", "sample_coverm_coverage.tsv")
    output:
        os.path.join(OUTDIR, "preprocess", "coverage.tsv")
    shell:
        # The header is skipped inside awk (NR > 1) instead of `sed -i '1d'`:
        # editing the input in place modified another rule's output and
        # dropped one more line each time this rule was re-executed.
        """
        awk -F '\t' 'NR > 1 {{ sum[$2] += $6 }} END {{ for (key in sum) print key, sum[key] }}' {input} > {output}
        """
rule scan_smg:
    """Predict genes with FragGeneScan, then search them against marker.hmm with hmmsearch."""
    input:
        genome = EDGES_FILE,
        hmm = os.path.join(DBPATH, "marker.hmm"),
    threads:
        config["resources"]["jobCPU"]
    resources:
        mem_mb = config["resources"]["jobMem"],
        mem = str(config["resources"]["jobMem"]) + "MB"
    output:
        hmmout = os.path.join(OUTDIR, "preprocess", "edges.fasta.hmmout")
    params:
        # Output prefix handed to FragGeneScan, and the .faa file that is then
        # passed to hmmsearch (presumably written by FragGeneScan - confirm).
        frag = EDGES_FILE + ".frag",
        frag_faa = EDGES_FILE + ".frag.faa",
    log:
        frag_out=os.path.join(LOGSDIR, "smg_scan_frag_out.log"),
        frag_err=os.path.join(LOGSDIR, "smg_scan_frag_err.log"),
        hmm_out=os.path.join(LOGSDIR, "smg_scan_hmm_out.log"),
        hmm_err=os.path.join(LOGSDIR, "smg_scan_hmm_err.log")
    conda:
        os.path.join("..", "envs", "smg.yaml")
    shell:
        """
        run_FragGeneScan.pl -genome={input.genome} -out={params.frag} -complete=0 -train=complete -thread={threads} 1>{log.frag_out} 2>{log.frag_err}
        hmmsearch --domtblout {output.hmmout} --cut_tc --cpu {threads} {input.hmm} {params.frag_faa} 1>{log.hmm_out} 2> {log.hmm_err}
        """


rule scan_phrogs:
    """Search the unitig sequences against the PHROGs profile database with MMseqs2.

    Builds a temporary sequence DB from the unitigs, runs `mmseqs search`
    against the profile DB, exports the hits as TSV and removes the
    temporary working directory.
    """
    input:
        genome = EDGES_FILE,
        db = os.path.join(DBPATH,"phrogs_mmseqs_db","phrogs_profile_db")
    threads:
        config["resources"]["jobCPU"]
    resources:
        mem_mb = config["resources"]["jobMem"],
        mem = str(config["resources"]["jobMem"]) + "MB"
    output:
        os.path.join(OUTDIR, "preprocess", "phrogs_annotations.tsv")
    params:
        out_path = os.path.join(OUTDIR, "preprocess", "phrogs"),
        target_seq = os.path.join(OUTDIR, "preprocess", "phrogs", "target_seq"),
        results_mmseqs = os.path.join(OUTDIR, "preprocess", "phrogs", "results_mmseqs"),
        tmp = os.path.join(OUTDIR, "preprocess", "phrogs", "tmp"),
    log:
        os.path.join(LOGSDIR, "phrogs_scan.log")
    conda:
        os.path.join("..", "envs", "mmseqs.yaml")
    shell:
        # createdb gets {input.genome} only: a bare {input} expands to both
        # inputs and would feed the profile DB to createdb as a second FASTA.
        # Later steps append (>>) so the log keeps the output of every step.
        """
        mkdir -p {params.out_path}
        mmseqs createdb {input.genome} {params.target_seq} > {log}
        mmseqs search {params.target_seq} {input.db} {params.results_mmseqs} {params.tmp} --threads {threads} -s 7 >> {log}
        mmseqs createtsv {params.target_seq} {input.db} {params.results_mmseqs} {output} --threads {threads} --full-header >> {log}
        rm -rf {params.out_path}
        """
rule combine_genomes_and_unresolved_edges:
    """Concatenate the resolved genomes and the unresolved phage-like edges into one FASTA."""
    output:
        os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges.fasta")
    input:
        genomes = RESOLVED_GENOMES,
        unresolved_edges = os.path.join(OUTDIR, "phables", "unresolved_phage_like_edges.fasta")
    shell:
        """
        cat {input.genomes} {input.unresolved_edges} > {output}
        """


rule koverage_genomes:
    """Run Koverage (no report) on the combined genome/edge sequences for all samples."""
    input:
        tsv = os.path.join(OUTDIR, "preprocess", "phables.samples.tsv"),
        sequences = os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges.fasta")
    output:
        os.path.join(OUTDIR, "postprocess", "results", "sample_coverage.tsv")
    params:
        out_dir = os.path.join(OUTDIR, "postprocess"),
        # Empty string when no profile is configured, so nothing is passed on.
        profile = lambda wildcards: ("--profile " + config["profile"]) if config["profile"] else "",
    conda:
        os.path.join("..", "envs", "koverage.yaml")
    threads:
        config["resources"]["jobCPU"]
    resources:
        mem_mb = config["resources"]["jobMem"],
        mem = str(config["resources"]["jobMem"]) + "MB"
    shell:
        """
        koverage run \
            --no-report \
            --reads {input.tsv} \
            --ref {input.sequences} \
            --threads {threads} \
            --output {params.out_dir} \
            {params.profile}
        """


rule koverage_postprocess:
    """Reformat the Koverage results with format_koverage_results.py.

    The script reads its paths from `params`; `input` lists the same files so
    that Snakemake schedules this rule after they exist.
    """
    input:
        koverage_tsv = os.path.join(OUTDIR, "postprocess", "results", "sample_coverage.tsv"),
        samples_file = os.path.join(OUTDIR, "preprocess", "phables.samples.tsv"),
        seq_file = os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges.fasta")
    output:
        os.path.join(OUTDIR, "postprocess", "sample_genome_read_counts.tsv")
    log:
        os.path.join(LOGSDIR, "format_koverage_results_output.log")
    params:
        koverage_tsv = os.path.join(OUTDIR, "postprocess", "results", "sample_coverage.tsv"),
        samples_file = os.path.join(OUTDIR, "preprocess", "phables.samples.tsv"),
        seq_file = os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges.fasta"),
        info_file = os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges_info.tsv"),
        output_path = os.path.join(OUTDIR, "postprocess"),
        log = os.path.join(LOGSDIR, "format_koverage_results_output.log")
    conda:
        os.path.join("..", "envs", "phables.yaml")
    script:
        os.path.join("..", "scripts", "format_koverage_results.py")
def main():
    """Combine the per-sample coverage tables into one ``coverage.tsv``.

    Reads ``covpath``, ``output`` and ``log`` from ``snakemake.params`` (the
    ``snakemake`` object is injected when the file is run through Snakemake's
    ``script`` directive), loads every ``*.tsv`` file found in ``covpath`` and
    writes the combined table to ``<output>/coverage.tsv``.

    The first table is kept whole; from every further table only the second
    column is appended, aligned on the row index (inner join).
    """
    # Get arguments
    # -----------------------

    covpath = snakemake.params.covpath
    output_path = snakemake.params.output
    log = snakemake.params.log

    # Prepare output folder
    # ---------------------------------------------------

    # Handle for missing trailing forwardslash in output folder path
    if output_path[-1:] != "/":
        output_path = output_path + "/"

    # Create the folder before any file is opened in it (the fallback log file
    # lives there). os.makedirs avoids building a shell command from the path,
    # which broke on paths containing spaces or shell metacharacters.
    os.makedirs(output_path, exist_ok=True)

    # Setup logger
    # ----------------------------------------------------------------------

    logger = logging.getLogger("combine_cov")
    logger.setLevel(logging.DEBUG)
    logging.captureWarnings(True)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    consoleHeader = logging.StreamHandler()
    consoleHeader.setFormatter(formatter)
    consoleHeader.setLevel(logging.INFO)
    logger.addHandler(consoleHeader)

    # Setup output path for log file
    if log is None:
        fileHandler = logging.FileHandler(f"{output_path}combine_cov.log")
    else:
        fileHandler = logging.FileHandler(f"{log}")

    fileHandler.setLevel(logging.DEBUG)
    fileHandler.setFormatter(formatter)
    logger.addHandler(fileHandler)

    # Get coverage values from samples
    # ---------------------------------------------------

    # Sorted so that the column order of the result does not depend on the
    # order in which the file system happens to list the files.
    cov_files = sorted(glob.glob(f"{covpath}/*.tsv"))

    if not cov_files:
        logger.warning(f"No coverage files (*.tsv) found in {covpath}")

    final_df = pd.DataFrame()

    for cov_file in cov_files:
        logger.info(f"Reading file {cov_file}")
        df = pd.read_csv(cov_file, sep="\t", header=0)

        if final_df.empty:
            final_df = df
        else:
            # Only the second column (the coverage values) of later tables is
            # appended; rows are matched by position via the index.
            final_df = pd.concat([final_df, df.iloc[:, 1]], axis=1, join="inner")

    logger.info(f"Dataframe shape: {final_df.shape}")

    # Save dataframe to file
    final_df.to_csv(output_path + "coverage.tsv", sep="\t", index=False)
    logger.info(
        f"The combined coverage values can be found at {output_path}coverage.tsv"
    )

    # Exit program
    # --------------

    logger.info("Thank you for using combine_cov!")
logging.captureWarnings(True) 40 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 41 | consoleHeader = logging.StreamHandler() 42 | consoleHeader.setFormatter(formatter) 43 | consoleHeader.setLevel(logging.INFO) 44 | logger.addHandler(consoleHeader) 45 | 46 | # Setup output path for log file 47 | if log is None: 48 | fileHandler = logging.FileHandler(f"{log}") 49 | else: 50 | fileHandler = logging.FileHandler(f"{log}") 51 | 52 | fileHandler.setLevel(logging.DEBUG) 53 | fileHandler.setFormatter(formatter) 54 | logger.addHandler(fileHandler) 55 | 56 | # Validate inputs 57 | # --------------------------------------------------- 58 | 59 | # Handle for missing trailing forwardslash in output folder path 60 | if output_path[-1:] != "/": 61 | output_path = output_path + "/" 62 | 63 | # Create output folder if it does not exist 64 | if not os.path.isdir(output_path): 65 | subprocess.run("mkdir -p " + output_path, shell=True) 66 | 67 | # Get sample-wise genome coverage stats 68 | # ---------------------------------------------------------------------- 69 | 70 | # Log inputs 71 | logger.info(f"Samples file: {samples_file}") 72 | logger.info(f"Koverage results: {koverage_tsv}") 73 | logger.info(f"Output path: {output_path}") 74 | 75 | # Get sample names 76 | mysamples = [s.split("\t")[0] for s in open(samples_file, "r")] 77 | logger.debug(mysamples) 78 | 79 | # Initialise dataframe 80 | df_read_counts = pd.DataFrame(columns=["contig_phables"] + mysamples) 81 | df_rpkm = pd.DataFrame(columns=["contig_phables"] + mysamples) 82 | df_mean_cov = pd.DataFrame(columns=["contig_phables"] + mysamples) 83 | 84 | # Get coverage stats of genomes in each sample 85 | read_counts = defaultdict(lambda: defaultdict(list)) 86 | rpkm = defaultdict(lambda: defaultdict(list)) 87 | mean_cov = defaultdict(lambda: defaultdict(list)) 88 | 89 | with open(koverage_tsv, "r") as mf: 90 | for line in mf.readlines()[1:]: 91 | strings = line.strip().split("\t") 92 | 
read_counts[strings[1]][strings[0]] = int(float(strings[2])) 93 | rpkm[strings[1]][strings[0]] = float(strings[4]) 94 | mean_cov[strings[1]][strings[0]] = float(strings[7]) 95 | 96 | # Add records to dataframe 97 | counter = 0 98 | for genome in read_counts: 99 | read_counts_row = read_counts[genome] 100 | read_counts_row["contig_phables"] = genome 101 | read_counts_row = dict(read_counts_row) 102 | read_counts_row_df = pd.DataFrame(read_counts_row, index=[counter]) 103 | df_read_counts = pd.concat([df_read_counts, read_counts_row_df]) 104 | 105 | rpkm_row = rpkm[genome] 106 | rpkm_row["contig_phables"] = genome 107 | rpkm_row = dict(rpkm_row) 108 | rpkm_row_df = pd.DataFrame(rpkm_row, index=[counter]) 109 | df_rpkm = pd.concat([df_rpkm, rpkm_row_df]) 110 | 111 | mean_cov_row = mean_cov[genome] 112 | mean_cov_row["contig_phables"] = genome 113 | mean_cov_row = dict(mean_cov_row) 114 | mean_cov_row_df = pd.DataFrame(mean_cov_row, index=[counter]) 115 | df_mean_cov = pd.concat([df_mean_cov, mean_cov_row_df]) 116 | 117 | counter += 1 118 | 119 | # Save dataframe to file 120 | df_read_counts.to_csv( 121 | f"{output_path}sample_genome_read_counts.tsv", sep="\t", index=False 122 | ) 123 | df_rpkm.to_csv(f"{output_path}sample_genome_rpkm.tsv", sep="\t", index=False) 124 | df_mean_cov.to_csv( 125 | f"{output_path}sample_genome_mean_coverage.tsv", sep="\t", index=False 126 | ) 127 | 128 | logger.info( 129 | f"Raw read counts mapped to resolved genomes can be found in {output_path}sample_genome_read_counts.tsv" 130 | ) 131 | logger.info( 132 | f"RPKM values of resolved genomes can be found in {output_path}sample_genome_rpkm.tsv" 133 | ) 134 | logger.info( 135 | f"Estimated mean read depth of resolved genomes can be found in {output_path}sample_genome_mean_coverage.tsv" 136 | ) 137 | 138 | # Make sequence information file 139 | with open(info_file, "w") as myfile: 140 | myfile.write(f"contig_phables_name\tlength\tcontig_or_phables\n") 141 | for index, record in 
def main():
    """Write the segment sequences of a GFA assembly graph to ``edges.fasta``.

    Reads ``graph``, ``output`` and ``log`` from ``snakemake.params`` (the
    ``snakemake`` object is injected when the file is run through Snakemake's
    ``script`` directive). Every segment (``S``) record of the graph becomes
    one FASTA record named after the segment; the sequence is upper-cased and
    every character other than G, A, T, C is removed.

    Exits with status 1 when the graph file does not exist.
    """
    # Get arguments
    # -----------------------

    assembly_graph_file = snakemake.params.graph
    output_path = snakemake.params.output
    log = snakemake.params.log
    prefix = ""

    # Prepare output folder
    # ---------------------------------------------------

    # Handle for missing trailing forwardslash in output folder path
    if output_path[-1:] != "/":
        output_path = f"{output_path}/"

    # Create the folder before any file is opened in it (the fallback log file
    # lives there). os.makedirs avoids building a shell command from the path,
    # which broke on paths containing spaces or shell metacharacters.
    os.makedirs(output_path, exist_ok=True)

    # Setup logger
    # ----------------------------------------------------------------------

    logger = logging.getLogger("gfa2fasta")
    logger.setLevel(logging.DEBUG)
    logging.captureWarnings(True)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    consoleHeader = logging.StreamHandler()
    consoleHeader.setFormatter(formatter)
    consoleHeader.setLevel(logging.INFO)
    logger.addHandler(consoleHeader)

    # Setup output path for log file
    if log is None:
        fileHandler = logging.FileHandler(f"{output_path}gfa2fasta.log")
    else:
        fileHandler = logging.FileHandler(f"{log}")

    fileHandler.setLevel(logging.DEBUG)
    fileHandler.setFormatter(formatter)
    logger.addHandler(fileHandler)

    # Check assembly graph file
    if not os.path.isfile(assembly_graph_file):
        logger.error(
            "Failed to open the assembly graph file. Please make sure to provife the .gfa file."
        )
        logger.info("Exiting gfa2fasta.py...\nBye...!\n")
        sys.exit(1)

    # Get the sequences corresponding to edges of the graph.
    # ---------------------------------------------------

    logger.info("Obtaining edge sequences")

    non_nucleotide = re.compile("[^GATC]")
    sequenceset = []

    with open(assembly_graph_file) as graph_handle:
        for line in graph_handle:
            fields = line.rstrip("\n").split("\t")

            # Only segment records carry a sequence. The record type is the
            # first tab-separated field; testing `"S" in line` also picked up
            # link/path records whose names or tags contain an "S" and turned
            # them into bogus FASTA entries.
            if fields[0] != "S" or len(fields) < 3:
                continue

            segment_name = str(fields[1])
            sequenceset.append(
                SeqRecord(
                    Seq(non_nucleotide.sub("", str(fields[2]).upper())),
                    id=segment_name,
                    name=segment_name,
                    description="",
                )
            )

    logger.info("Writing edge sequences to FASTA file")

    with open(f"{output_path}{prefix}edges.fasta", "w") as output_handle:
        SeqIO.write(sequenceset, output_handle, "fasta")

    logger.info(
        f"The FASTA file with unitig sequences can be found at {output_handle.name}"
    )

    # Exit program
    # --------------

    logger.info("Thank you for using gfa2fasta!")
| __status__ = "Stable Release" 33 | 34 | 35 | # Phables main code 36 | # ---------------------------------------------------------------------- 37 | 38 | 39 | def main(): 40 | # Get arguments 41 | # ---------------------------------------------------------------------- 42 | graph = snakemake.params.graph 43 | coverage = snakemake.params.coverage 44 | bampath = snakemake.params.bampath 45 | hmmout = snakemake.params.hmmout 46 | phrogs = snakemake.params.phrogs 47 | minlength = int(snakemake.params.minlength) 48 | mincov = int(snakemake.params.mincov) 49 | compcount = int(snakemake.params.compcount) 50 | maxpaths = int(snakemake.params.maxpaths) 51 | mgfrac = float(snakemake.params.mgfrac) 52 | evalue = float(snakemake.params.evalue) 53 | seqidentity = float(snakemake.params.seqidentity) 54 | covtol = float(snakemake.params.covtol) 55 | alpha = float(snakemake.params.alpha) 56 | longreads = bool(snakemake.params.longreads) 57 | prefix = snakemake.params.prefix 58 | output = snakemake.params.output 59 | nthreads = int(snakemake.params.nthreads) 60 | log = snakemake.params.log 61 | 62 | # Setup logger 63 | # ---------------------------------------------------------------------- 64 | 65 | logger = logging.getLogger(f"phables {__version__}") 66 | logger.setLevel(logging.DEBUG) 67 | logging.captureWarnings(True) 68 | formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") 69 | consoleHeader = logging.StreamHandler() 70 | consoleHeader.setFormatter(formatter) 71 | consoleHeader.setLevel(logging.INFO) 72 | logger.addHandler(consoleHeader) 73 | 74 | # Setup output path for log file 75 | if log is None: 76 | fileHandler = logging.FileHandler(f"{output}/phables.log") 77 | else: 78 | fileHandler = logging.FileHandler(f"{log}") 79 | fileHandler.setLevel(logging.DEBUG) 80 | fileHandler.setFormatter(formatter) 81 | logger.addHandler(fileHandler) 82 | 83 | logger.info( 84 | "Welcome to Phables: from fragmented assemblies to high-quality bacteriophage genomes." 
85 | ) 86 | 87 | logger.info(f"Input arguments: ") 88 | logger.info(f"Assembly graph file: {graph}") 89 | logger.info(f"Unitig coverage file: {coverage}") 90 | logger.info(f"BAM files path: {bampath}") 91 | logger.info(f"Unitig .hmmout file: {hmmout}") 92 | logger.info(f"Unitig phrog annotations file: {phrogs}") 93 | logger.info(f"Minimum length of unitigs to consider: {minlength}") 94 | logger.info(f"Minimum coverage of paths to output: {mincov}") 95 | logger.info(f"Minimum unitig count to consider a component: {compcount}") 96 | logger.info(f"Maximum number of paths to resolve for a component: {maxpaths}") 97 | logger.info(f"Length threshold to consider single copy marker genes: {mgfrac}") 98 | logger.info(f"Maximum e-value for phrog annotations: {evalue}") 99 | logger.info(f"Minimum sequence identity for phrog annotations: {seqidentity}") 100 | logger.info(f"Coverage tolerance for extending subpaths: {covtol}") 101 | logger.info(f"Coverage multipler for flow interval modelling: {alpha}") 102 | logger.info(f"Input long reads: {longreads}") 103 | logger.info(f"Prefix for genome identifiers: {prefix}") 104 | logger.info(f"Number of threads to use: {nthreads}") 105 | logger.info(f"Output folder: {output}") 106 | 107 | if prefix is None or prefix == "": 108 | prefix = "" 109 | else: 110 | prefix = f"{prefix}_" 111 | 112 | start_time = time.time() 113 | 114 | # Init files 115 | # ---------------------------------------------------------------------- 116 | init_files(output) 117 | 118 | # Get assembly graph 119 | # ---------------------------------------------------------------------- 120 | ( 121 | assembly_graph, 122 | oriented_links, 123 | link_overlap, 124 | unitig_names, 125 | unitig_names_rev, 126 | graph_unitigs, 127 | self_looped_nodes, 128 | edges_lengths, 129 | ) = edge_graph_utils.build_assembly_graph(graph) 130 | 131 | logger.info( 132 | f"Total number of vertices in the assembly graph: {len(assembly_graph.vs)}" 133 | ) 134 | logger.info( 135 | f"Total 
number of links in the assembly graph: {len(assembly_graph.es)}" 136 | ) 137 | 138 | # Get single unitigs 139 | # ---------------------------------------------------------------------- 140 | circular = edge_graph_utils.get_circular(self_looped_nodes, graph_unitigs) 141 | 142 | # Get unitigs with bacterial single copy marker genes 143 | # ---------------------------------------------------------------------- 144 | smg_unitigs = gene_utils.get_smg_unitigs(hmmout, mgfrac) 145 | 146 | # Get unitigs with PHROGs 147 | # ---------------------------------------------------------------------- 148 | unitig_phrogs, phrog_dict = gene_utils.get_phrog_unitigs( 149 | phrogs, evalue, seqidentity 150 | ) 151 | 152 | # Get components with viral components 153 | # ---------------------------------------------------------------------- 154 | pruned_vs, comp_phrogs, likely_complete = component_utils.get_components( 155 | assembly_graph, 156 | unitig_names, 157 | smg_unitigs, 158 | unitig_phrogs, 159 | circular, 160 | edges_lengths, 161 | minlength, 162 | phrog_dict, 163 | ) 164 | logger.info(f"Total number of components found: {len(pruned_vs)}") 165 | 166 | # Get unitig coverages 167 | # ---------------------------------------------------------------------- 168 | 169 | unitig_coverages = get_unitig_coverage(coverage) 170 | 171 | # Resolve genomes 172 | # ---------------------------------------------------------------------- 173 | 174 | # If long reads are provided 175 | if longreads: 176 | logger.info(f"Long reads provided") 177 | 178 | # Get sub path coverages 179 | sub_path_cov = edge_graph_utils.get_all_sub_paths(assembly_graph, unitig_names) 180 | sub_path_cov = get_sub_path_coverage(sub_path_cov, bampath, output) 181 | 182 | # Resolve genomes 183 | ( 184 | resolved_edges, 185 | all_resolved_paths, 186 | all_components, 187 | cycle_components, 188 | linear_components, 189 | resolved_components, 190 | resolved_linear, 191 | single_unitigs, 192 | resolved_cyclic, 193 | case1_found, 
194 | case1_resolved, 195 | case2_found, 196 | case2_resolved, 197 | case3_found, 198 | case3_resolved, 199 | phage_like_edges, 200 | all_phage_like_edges, 201 | unresolved_phage_like_edges, 202 | ) = long_utils.resolve_long( 203 | assembly_graph, 204 | pruned_vs, 205 | unitig_names, 206 | unitig_names_rev, 207 | self_looped_nodes, 208 | graph_unitigs, 209 | minlength, 210 | link_overlap, 211 | unitig_coverages, 212 | compcount, 213 | oriented_links, 214 | sub_path_cov, 215 | likely_complete, 216 | alpha, 217 | mincov, 218 | covtol, 219 | maxpaths, 220 | prefix, 221 | output, 222 | nthreads, 223 | ) 224 | 225 | # Else default to short reads 226 | else: 227 | logger.info(f"Short reads provided") 228 | 229 | # Get junction pe coverages 230 | junction_pe_coverage = get_junction_pe_coverage(bampath, output) 231 | 232 | # Resolve genomes 233 | ( 234 | resolved_edges, 235 | all_resolved_paths, 236 | all_components, 237 | cycle_components, 238 | linear_components, 239 | resolved_components, 240 | resolved_linear, 241 | single_unitigs, 242 | resolved_cyclic, 243 | case1_found, 244 | case1_resolved, 245 | case2_found, 246 | case2_resolved, 247 | case3_found, 248 | case3_resolved, 249 | phage_like_edges, 250 | all_phage_like_edges, 251 | unresolved_phage_like_edges, 252 | ) = short_utils.resolve_short( 253 | assembly_graph, 254 | pruned_vs, 255 | unitig_names, 256 | unitig_names_rev, 257 | self_looped_nodes, 258 | graph_unitigs, 259 | minlength, 260 | link_overlap, 261 | unitig_coverages, 262 | compcount, 263 | oriented_links, 264 | junction_pe_coverage, 265 | likely_complete, 266 | alpha, 267 | mincov, 268 | covtol, 269 | maxpaths, 270 | prefix, 271 | output, 272 | nthreads, 273 | ) 274 | 275 | # Log final summary information 276 | # ---------------------------------------------------------------------- 277 | logger.info(f"Total number of cyclic components found: {len(cycle_components)}") 278 | logger.info(f"Total number of cyclic components resolved: 
{len(resolved_cyclic)}") 279 | logger.info(f"Single unitigs identified: {len(single_unitigs)}") 280 | logger.info(f"Total number of linear components found: {len(linear_components)}") 281 | logger.info(f"Total number of linear components resolved: {len(resolved_linear)}") 282 | logger.info( 283 | f"Total number of cyclic components found including single unitigs: {len(cycle_components) + len(single_unitigs)}" 284 | ) 285 | logger.info( 286 | f"Total number of components resolved: {len(single_unitigs)+len(resolved_cyclic)+len(resolved_linear)}" 287 | ) 288 | logger.info(f"Case 1 (resolved/found): {len(case1_resolved)}/{len(case1_found)}") 289 | logger.info(f"Case 2 (resolved/found): {len(case2_resolved)}/{len(case2_found)}") 290 | logger.info(f"Case 3 (resolved/found): {len(case3_resolved)}/{len(case3_found)}") 291 | logger.info(f"Total number of genomes resolved: {len(all_resolved_paths)}") 292 | 293 | if len(all_resolved_paths) == 0: 294 | logger.info(f"No genomes were resolved.") 295 | else: 296 | logger.info(f"Resolved genomes can be found in {output}/resolved_paths.fasta") 297 | 298 | # Write edges to file 299 | # ---------------------------------------------------------------------- 300 | 301 | write_unitigs( 302 | phage_like_edges, unitig_names, graph_unitigs, "phage_like_edges", output 303 | ) 304 | write_unitigs( 305 | all_phage_like_edges, 306 | unitig_names, 307 | graph_unitigs, 308 | "all_phage_like_edges", 309 | output, 310 | ) 311 | write_unitigs(resolved_edges, unitig_names, graph_unitigs, "resolved_edges", output) 312 | write_unitigs( 313 | unresolved_phage_like_edges, 314 | unitig_names, 315 | graph_unitigs, 316 | "unresolved_phage_like_edges", 317 | output, 318 | ) 319 | 320 | # Record path information 321 | # ---------------------------------------------------------------------- 322 | 323 | filename = write_res_genome_info(all_resolved_paths, output) 324 | if len(all_resolved_paths) > 0: 325 | logger.info(f"Resolved genome information can be found 
in {output}/{filename}") 326 | 327 | # Record component information 328 | # ---------------------------------------------------------------------- 329 | 330 | filename = write_component_info(all_components, output) 331 | if len(all_components) > 0: 332 | logger.info( 333 | f"Resolved component information can be found in {output}/{filename}" 334 | ) 335 | 336 | filename = write_component_phrog_info(resolved_components, comp_phrogs, output) 337 | if len(resolved_components) > 0: 338 | logger.info( 339 | f"PHROGs found in resolved components can be found in {output}/{filename}" 340 | ) 341 | 342 | # Get elapsed time 343 | # ---------------------------------------------------------------------- 344 | 345 | # Determine elapsed time 346 | elapsed_time = time.time() - start_time 347 | 348 | # Print elapsed time for the process 349 | logger.info(f"Elapsed time: {elapsed_time} seconds") 350 | 351 | # Exit program 352 | # ---------------------------------------------------------------------- 353 | 354 | logger.info("Thank you for using Phables!") 355 | 356 | 357 | if __name__ == "__main__": 358 | main() 359 | -------------------------------------------------------------------------------- /phables/workflow/scripts/phables_utils/FD_Inexact.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/algbio/MFD-ILP 2 | 3 | import logging 4 | 5 | import more_itertools 6 | import networkx as nx 7 | 8 | # create logger 9 | logger = logging.getLogger("phables 1.4.1") 10 | 11 | 12 | def read_input(graphfile, number_subpath): 13 | trip_data = open(graphfile, "r").read().split("\n") 14 | i = 0 15 | listOfGraphs = {} 16 | k = 0 17 | 18 | while True: 19 | if "#" in trip_data[i]: 20 | i = i + 1 21 | N = int(trip_data[i]) 22 | edges = list() 23 | subpaths = {} 24 | while True: 25 | i = i + 1 26 | if "#" in trip_data[i]: 27 | break 28 | if "" == trip_data[i]: 29 | break 30 | if "subpaths" in trip_data[i]: 31 | for j in range(0, 
# FD-Subpath-Inexact-Gurobi
# --------------------------------------------
def flowMultipleDecomposition(data, K, nthreads):
    """
    Try to decompose the flow network described by ``data`` into exactly
    ``K`` weighted source-to-sink paths using a Gurobi ILP.

    Reads from ``data``: "vertices", "edges", "maxFlow", "sources",
    "targets", "adj_in", "adj_out", "flows_low", "flows_up" and "subpaths".

    Writes to ``data``:
      * "message"  - "solved" when Gurobi reports an optimal solution,
                     "unsolved" for every other final status
      * "runtime"  - solver runtime (only when solved)
      * "weights"  - list with the weight of each of the K paths (only when solved)
      * "solution" - list of K edge lists, one per path (only when solved)

    Gurobi errors and attribute errors are logged and swallowed; in that
    case ``data`` is returned with whatever it contained before.

    Returns the (mutated) ``data`` dictionary.
    """

    # libraries
    import gurobipy as gp
    from gurobipy import GRB

    # calculate the minimal flow decomposition based on such graph
    V = data["vertices"]
    E = data["edges"]
    W = data["maxFlow"]
    S = data["sources"]
    D = data["targets"]
    AD_in = data["adj_in"]
    AD_out = data["adj_out"]
    f_low = data["flows_low"]
    f_up = data["flows_up"]
    subpaths = data["subpaths"]

    try:
        # create extra sets
        T = [(i, j, k) for (i, j) in E for k in range(0, K)]
        SC = list(range(0, K))
        R = [(k, s) for k in range(0, K) for s in range(0, len(subpaths))]

        # Create a new model
        model = gp.Model("MFD")
        model.setParam("LogToConsole", 0)
        model.setParam("Threads", nthreads)

        # Create variables
        # x[i, j, k] = 1 when edge (i, j) is used by path k
        x = model.addVars(T, vtype=GRB.BINARY, name="x")
        # w[k] = weight of path k
        w = model.addVars(SC, vtype=GRB.INTEGER, name="w", lb=0)
        # z[i, j, k] = linearised product w[k] * x[i, j, k]
        z = model.addVars(T, vtype=GRB.CONTINUOUS, name="z", lb=0)
        # r[k, s] = 1 when path k covers subpath s
        r = model.addVars(R, vtype=GRB.BINARY, name="r")

        # NOTE(review): only a sense constant is passed here, no objective
        # expression - presumably the model is used as a feasibility check
        # for exactly K paths. Kept as in the original; confirm.
        model.setObjective(GRB.MINIMIZE)

        # flow conservation
        for k in range(0, K):
            for i in V:
                if i in S:
                    model.addConstr(sum(x[i, j, k] for j in AD_out[i]) == 1)
                if i in D:
                    model.addConstr(sum(x[j, i, k] for j in AD_in[i]) == 1)
                if i not in S and i not in D:
                    model.addConstr(
                        sum(x[i, j, k] for j in AD_out[i])
                        - sum(x[j, i, k] for j in AD_in[i])
                        == 0
                    )

        # flow balance: the summed path weights on an edge must lie in
        # the interval [f_low, f_up] of that edge
        model.addConstrs(
            f_up[i, j] >= gp.quicksum(z[i, j, k] for k in range(0, K)) for (i, j) in E
        )
        model.addConstrs(
            f_low[i, j] <= gp.quicksum(z[i, j, k] for k in range(0, K)) for (i, j) in E
        )

        # linearization of z = w * x
        for i, j in E:
            for k in range(0, K):
                model.addConstr(z[i, j, k] <= W * x[i, j, k])
                model.addConstr(w[k] - (1 - x[i, j, k]) * W <= z[i, j, k])
                model.addConstr(z[i, j, k] <= w[k])

        # subpath constraints
        for k in range(0, K):
            for sp_len in range(0, len(subpaths)):
                subpath_edges = list(more_itertools.pairwise(subpaths[sp_len]))
                try:
                    model.addConstr(
                        gp.quicksum(x[i, j, k] for (i, j) in subpath_edges)
                        >= len(subpath_edges) * r[k, sp_len]
                    )
                except KeyError:
                    # The subpath uses an edge that is not in E, so there is
                    # no x variable for it; skip this constraint (best effort,
                    # as before) without hiding unrelated errors.
                    continue

        # every subpath has to be covered by at least one path
        model.addConstrs(
            gp.quicksum(r[k, sp_len] for k in range(0, K)) >= 1
            for sp_len in range(0, len(subpaths))
        )

        # objective function
        model.optimize()

        if model.status == GRB.OPTIMAL:
            data["message"] = "solved"
            data["runtime"] = model.Runtime

            # Read the solution straight from the variable handles. Matching
            # on variable *names* by substring (e.g. "1" in "w[11]") assigns
            # values to the wrong index as soon as indices have several digits.
            w_sol = [w[k].X for k in range(0, K)]

            paths = [list() for _ in range(0, K)]
            for i, j, k in T:
                # Binary variables are only integral up to the solver's
                # tolerance, so compare against 0.5 instead of == 1.
                if x[i, j, k].X > 0.5:
                    paths[k].append((i, j))

            data["weights"] = w_sol
            data["solution"] = paths
        else:
            # INFEASIBLE and any other non-optimal final status
            data["message"] = "unsolved"

    except gp.GurobiError as e:
        logger.error(f"Error code {e.errno}: {str(e)}")

    except AttributeError:
        logger.error(f"Encountered an attribute error")

    return data
def get_components(
    assembly_graph,
    unitig_names,
    smg_unitigs,
    unitig_phrogs,
    circular,
    edges_lengths,
    cicular_len,
    phrog_dict,
):
    """
    Select the connected components that look phage-like.

    A component with several unitigs is kept when at least one of the four
    PHROG categories (head and packaging, connector, tail, lysis) was seen
    while scanning its unitigs. A single-unitig component is kept when it has
    at least one of those categories and the unitig is longer than
    ``cicular_len``.

    Returns three dictionaries keyed by a running component number:
    the kept components, the PHROGs found in each of them, and (for
    single-unitig components only) a 1/0 flag telling whether all four
    categories were present. ``circular`` is accepted but not used here.
    """

    categories = ("head and packaging", "connector", "tail", "lysis")

    pruned_vs = {}
    comp_phrogs = {}
    likely_complete = {}
    next_id = 0

    for component in assembly_graph.components():
        phrogs_found = set()
        categories_seen = set()

        if len(component) > 1:
            for unitig in component:
                unitig_name = unitig_names[unitig]

                # NOTE(review): the scan stops at the first unitig carrying a
                # single-copy marker gene, but categories seen before that
                # point still count below - confirm this is intended.
                if unitig_name in smg_unitigs:
                    break

                if unitig_name in unitig_phrogs:
                    for phrog in unitig_phrogs[unitig_name]:
                        annotation = phrog_dict[phrog]
                        categories_seen.update(
                            cat for cat in categories if cat in annotation
                        )
                        phrogs_found.add(phrog)

            if categories_seen:
                pruned_vs[next_id] = component
                comp_phrogs[next_id] = phrogs_found
                next_id += 1

        elif len(component) == 1:
            unitig_name = unitig_names[component[0]]

            if unitig_name in unitig_phrogs:
                for phrog in unitig_phrogs[unitig_name]:
                    annotation = phrog_dict[phrog]
                    categories_seen.update(
                        cat for cat in categories if cat in annotation
                    )
                    phrogs_found.add(phrog)

            # Needs at least one category and a long enough unitig
            if categories_seen and edges_lengths[unitig_name] > cicular_len:
                pruned_vs[next_id] = component
                comp_phrogs[next_id] = phrogs_found

                # Flag unitigs carrying all four categories
                all_present = len(categories_seen) == len(categories)
                likely_complete[next_id] = 1 if all_present else 0

                next_id += 1

    return pruned_vs, comp_phrogs, likely_complete
def get_junction_pe_coverage(bam_path, output):
    """
    Get number of paired end reads supporting a junction

    Counts, over every ``*.bam`` file in ``bam_path``, the read pairs whose
    two mates map to different references. The result is cached in
    ``{output}/junction_pe_coverage.pickle``; when that file already exists
    it is loaded and returned without reading any BAM file.

    Returns a mapping ``(reference of read 1, reference of read 2) -> count``.
    """

    cache_file = f"{output}/junction_pe_coverage.pickle"

    if os.path.isfile(cache_file):
        # NOTE: unpickling can execute arbitrary code. The cache is expected
        # to be a file written by a previous run into the same output
        # folder; do not point `output` at an untrusted directory.
        with open(cache_file, "rb") as handle:
            return pickle.load(handle)

    link_counts = defaultdict(int)

    for bam_file in glob.glob(bam_path + "/*.bam"):
        # Context manager so the BAM handle is closed after each file
        with pysam.AlignmentFile(bam_file, "rb") as bam:
            for read1, read2 in read_pair_generator(bam):
                # The generator fills a [None, None] slot per read name, so
                # a pair can come back with one mate missing; skip those
                # instead of failing on `.reference_name`.
                if read1 is None or read2 is None:
                    continue

                if read1.reference_name != read2.reference_name:
                    link_counts[(read1.reference_name, read2.reference_name)] += 1

    with open(cache_file, "wb") as handle:
        pickle.dump(link_counts, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return link_counts
def get_graph_spanning_reads(gaf_path, output):
    """
    Get number of reads spanning across a junction

    Reads every ``*.gaf`` file in ``gaf_path`` and looks at the sixth
    tab-separated column (the alignment path). Paths made of exactly two
    forward (``>a>b``) or two reverse (``<b<a``) segments add one to the
    count of junction ``(a, b)``. The result is cached in
    ``{output}/graph_spanning_reads.pickle`` and loaded from there when the
    file already exists.

    Returns a mapping ``(segment, segment) -> number of reads``.
    """

    cache_file = f"{output}/graph_spanning_reads.pickle"

    if os.path.isfile(cache_file):
        # NOTE: unpickling can execute arbitrary code; the cache is expected
        # to come from a previous run writing into the same output folder.
        with open(cache_file, "rb") as handle:
            return pickle.load(handle)

    junction_reads = defaultdict(int)

    for gaf_file in glob.glob(gaf_path + "/*.gaf"):
        with open(gaf_file, "r") as myfile:
            # Stream the file instead of loading it whole with readlines()
            for line in myfile:
                strings = line.strip().split("\t")

                # Blank or truncated lines have no path column
                if len(strings) < 6:
                    continue

                path = strings[5]

                if path.count(">") == 2:
                    edges = path.split(">")[1:]
                    junction_reads[(edges[0], edges[1])] += 1

                elif path.count("<") == 2:
                    # Reverse path: segments are listed in reverse order
                    edges = path.split("<")[1:]
                    junction_reads[(edges[1], edges[0])] += 1

    with open(cache_file, "wb") as handle:
        pickle.dump(junction_reads, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return junction_reads
class BidirectionalError(Exception):
    """Raised when a value is assigned that is already present in the map."""

    def __init__(self, value):
        self.value = value
        msg = 'The value "{}" is already in the mapping.'
        super().__init__(msg.format(value))


class BidirectionalMap(dict):
    """
    Invertible map.

    ``m[key] = value`` also records ``m.inverse[value] = key``, and
    ``m.inverse`` is itself a BidirectionalMap whose ``inverse`` is ``m``.
    Values therefore have to be unique; assigning a value that is already
    present raises BidirectionalError.

    Only item assignment and ``del m[key]`` keep both directions in sync.
    Other dict mutators (``update``, ``pop``, ``setdefault``, ``clear``) are
    not overridden here and bypass the inverse.
    """

    def __init__(self, inverse=None):
        # The first map creates its partner; the partner receives `self`
        # so that both objects point at each other.
        if inverse is None:
            inverse = self.__class__(inverse=self)
        self.inverse = inverse

    def __setitem__(self, key, value):
        if value in self.inverse:
            raise BidirectionalError(value)

        # Re-assigning an existing key: drop the old value from the inverse
        # first, otherwise the inverse keeps a stale entry for it.
        if key in self:
            self.inverse._del_item(self[key])

        self.inverse._set_item(value, key)
        self._set_item(key, value)

    def __delitem__(self, key):
        self.inverse._del_item(self[key])
        self._del_item(key)

    def _del_item(self, key):
        # Plain dict deletion, without touching the partner map
        super().__delitem__(key)

    def _set_item(self, key, value):
        # Plain dict assignment, without touching the partner map
        super().__setitem__(key, value)
# Get links from .gfa file 76 | with open(assembly_graph_file) as file: 77 | for line in file.readlines(): 78 | # Identify lines with link information 79 | if line.startswith("L"): 80 | strings = line.split("\t") 81 | 82 | link1 = strings[1] 83 | link2 = strings[3] 84 | 85 | link1_orientation = strings[2] 86 | link2_orientation = strings[4] 87 | overlap = int(strings[5].strip()[:-1]) 88 | 89 | link = [] 90 | link.append(link1) 91 | link.append(link2) 92 | links.append(link) 93 | 94 | if link1 != link2: 95 | if link1_orientation == "+" and link2_orientation == "+": 96 | oriented_links[link1][link2].append(("+", "+")) 97 | link_overlap[(f"{link1}+", f"{link2}+")] = overlap 98 | oriented_links[link2][link1].append(("-", "-")) 99 | link_overlap[(f"{link2}-", f"{link1}-")] = overlap 100 | elif link1_orientation == "-" and link2_orientation == "-": 101 | oriented_links[link1][link2].append(("-", "-")) 102 | link_overlap[(f"{link1}-", f"{link2}-")] = overlap 103 | oriented_links[link2][link1].append(("+", "+")) 104 | link_overlap[(f"{link2}+", f"{link1}+")] = overlap 105 | elif link1_orientation == "+" and link2_orientation == "-": 106 | oriented_links[link1][link2].append(("+", "-")) 107 | link_overlap[(f"{link1}+", f"{link2}-")] = overlap 108 | oriented_links[link2][link1].append(("+", "-")) 109 | link_overlap[(f"{link2}+", f"{link1}-")] = overlap 110 | elif link1_orientation == "-" and link2_orientation == "+": 111 | oriented_links[link1][link2].append(("-", "+")) 112 | link_overlap[(f"{link1}-", f"{link2}+")] = overlap 113 | oriented_links[link2][link1].append(("-", "+")) 114 | link_overlap[(f"{link2}-", f"{link1}+")] = overlap 115 | 116 | elif line.startswith("S"): 117 | strings = line.strip().split() 118 | my_map[node_count] = strings[1] 119 | graph_contigs[strings[1]] = Seq(strings[2]) 120 | edges_lengths[strings[1]] = len(strings[2]) 121 | node_count += 1 122 | 123 | line = file.readline() 124 | 125 | return ( 126 | node_count, 127 | graph_contigs, 128 | links, 
def get_graph_edges(links, contig_names_rev):
    """
    Split the links into graph edges and self loops.

    Each link is a pair of contig names. A link joining a contig to itself
    is reported as a self-looped node (by name); every other link becomes an
    edge between the identifiers looked up in ``contig_names_rev``.

    Returns ``(edge_list, self_looped_nodes)``, both in link order.
    """

    edge_list = []
    self_looped_nodes = []

    for link in links:
        source, target = link[0], link[1]

        # Self loops are only recorded, never added as edges
        if source == target:
            self_looped_nodes.append(source)
            continue

        edge_list.append((contig_names_rev[source], contig_names_rev[target]))

    return edge_list, self_looped_nodes
def remove_dead_ends(G_edge):
    """
    Remove dead-ends from the component

    Works on a deep copy of ``G_edge`` (the input graph is not modified) and
    repeatedly removes every node that lacks an incoming or an outgoing
    edge, until no such node is left.

    Returns the set of all nodes that were removed.
    """

    new_G = copy.deepcopy(G_edge)

    dead_ends_to_remove = []

    while True:
        # A node survives only if it has both an incoming and an outgoing
        # edge. The previous condition had a misplaced parenthesis
        # (`out_degree()(node)) > 0`) and only gave this result by accident.
        to_remove = [
            node
            for node in list(new_G.nodes)
            if not (new_G.in_degree(node) > 0 and new_G.out_degree(node) > 0)
        ]

        if not to_remove:
            break

        # Removing nodes can expose new dead-ends, hence the outer loop
        new_G.remove_nodes_from(to_remove)
        logger.debug(f"Removing dead-ends: {to_remove}")

        dead_ends_to_remove += to_remove

    return set(dead_ends_to_remove)
def get_source_sink_circular(G_edge, graph_unitigs, minlength, self_looped_nodes):
    """
    Find nodes that can act as combined source/sink of a circular component.

    A node qualifies when its unitig is not self-looped, is longer than
    ``minlength``, and every non-self-looped node in the last BFS layer
    reached from it has successors, the first of which is the node itself.

    Returns the list of qualifying nodes, in graph node order.
    """

    source_sink_candidates = []

    for node in list(G_edge.nodes):
        # Node names carry a one-character suffix after the unitig name
        unitig_name = node[:-1]

        if (
            unitig_name in self_looped_nodes
            or len(graph_unitigs[unitig_name]) <= minlength
        ):
            continue

        # Only the last BFS layer is of interest
        layers = list(nx.bfs_layers(G_edge, node))
        last_layer = layers[-1]

        node_is_st = True

        # The walk must close back onto the starting node
        for item in last_layer:
            if item[:-1] in self_looped_nodes:
                continue

            item_successors = list(G_edge.successors(item))

            if not item_successors:
                node_is_st = False
            elif item_successors[0] != node:
                node_is_st = False
                break

        if not last_layer:
            node_is_st = False

        if node_is_st:
            source_sink_candidates.append(node)

    return source_sink_candidates
def get_smg_unitigs(hmmout, mg_frac):
    """
    Get unitigs containing bacterial single-copy marker genes

    Parses a whitespace-separated hmmsearch ``--domtblout`` table. For each
    hit, the marker gene length is column 6 and the mapped length is
    column 17 minus column 16 (1-based). A unitig is reported when the
    mapped length exceeds ``mg_frac`` times the marker gene length. The
    unitig name is the target name with its last three ``_``-separated
    fields removed.

    Returns the set of unitig names.
    """

    # Commands
    # run_FragGeneScan.pl -genome=edges.fasta -out=edges.fasta.frag -complete=0 -train=complete -thread=8 1>edges.fasta.frag.out 2>edges.fasta.frag.err
    # hmmsearch --domtblout edges.fasta.hmmout --cut_tc --cpu 8 /home/mall0133/software/MetaCoAG/metacoag_utils/auxiliary/marker.hmm edges.fasta.frag.faa 1>edges.fasta.hmmout.out 2> edges.fasta.hmmout.err

    smg_unitigs = set()

    with open(hmmout, "r") as myfile:
        for line in myfile:
            if line.startswith("#"):
                continue

            strings = line.strip().split()

            # Blank or truncated lines do not carry the columns read below
            if len(strings) < 17:
                continue

            unitig = strings[0]

            # Marker gene length
            marker_gene_length = int(strings[5])

            # Mapped marker gene length
            mapped_marker_length = int(strings[16]) - int(strings[15])

            # unitig name
            name_strings = unitig.split("_")
            unitig_name = "_".join(name_strings[: len(name_strings) - 3])

            if mapped_marker_length > marker_gene_length * mg_frac:
                smg_unitigs.add(unitig_name)

    return smg_unitigs
# Class for genome component
class GenomeComponent:
    """
    Plain record of the statistics reported for one component.

    Every constructor argument is stored unchanged on the instance under
    the same name.
    """

    def __init__(
        self,
        id,
        n_nodes,
        n_paths,
        max_degree,
        min_degree,
        max_in_degree,
        max_out_degree,
        avg_degree,
        avg_in_degree,
        avg_out_degree,
        density,
        max_path_length,
        min_path_length,
        min_max_len_ratio,
        max_cov_path_length,
        min_cov_path_length,
        min_max_cov_len_ratio,
        max_cov,
        min_cov,
        min_max_cov_ratio,
        frac_unitigs,
    ):
        # Identity and size
        self.id, self.n_nodes, self.n_paths = id, n_nodes, n_paths

        # Degree statistics
        self.max_degree, self.min_degree = max_degree, min_degree
        self.max_in_degree, self.max_out_degree = max_in_degree, max_out_degree
        self.avg_degree = avg_degree
        self.avg_in_degree, self.avg_out_degree = avg_in_degree, avg_out_degree
        self.density = density

        # Path length statistics
        self.max_path_length, self.min_path_length = max_path_length, min_path_length
        self.min_max_len_ratio = min_max_len_ratio
        self.max_cov_path_length = max_cov_path_length
        self.min_cov_path_length = min_cov_path_length
        self.min_max_cov_len_ratio = min_max_cov_len_ratio

        # Coverage statistics
        self.max_cov, self.min_cov = max_cov, min_cov
        self.min_max_cov_ratio = min_max_cov_ratio

        self.frac_unitigs = frac_unitigs
def write_component_info(all_components, output):
    """
    Write component information to file

    Writes ``{output}/resolved_component_info.txt`` as a tab-separated table:
    one header line, then one line per component. When ``all_components`` is
    empty, the header is followed by a single explanatory sentence instead.

    all_components: sized collection of objects exposing the attributes
                    named in ``columns`` below
    output:         directory the file is written to

    Returns the file name (not the full path).
    """

    # (column header, component attribute) pairs in output order; keeping
    # them together guarantees header and rows can never drift apart
    columns = (
        ("Component", "id"),
        ("Number of nodes", "n_nodes"),
        ("Number of paths", "n_paths"),
        ("Fraction of unitigs recovered", "frac_unitigs"),
        ("Maximum degree", "max_degree"),
        ("Minimum degree", "min_degree"),
        ("Maximum in degree", "max_in_degree"),
        ("Maximum out degree", "max_out_degree"),
        ("Average degree", "avg_degree"),
        ("Average in degree", "avg_in_degree"),
        ("Average out degree", "avg_out_degree"),
        ("Density", "density"),
        ("Maximum path length", "max_path_length"),
        ("Minimum path length", "min_path_length"),
        ("Length ratio (long/short)", "min_max_len_ratio"),
        ("Maximum coverage path length", "max_cov_path_length"),
        ("Minimum coverage path length", "min_cov_path_length"),
        ("Length ratio (highest cov/lowest cov)", "min_max_cov_len_ratio"),
        ("Maximum coverage", "max_cov"),
        ("Minimum coverage", "min_cov"),
        ("Coverage ratio (highest/lowest)", "min_max_cov_ratio"),
    )

    with open(f"{output}/resolved_component_info.txt", "w") as myfile:
        myfile.write("\t".join(header for header, _ in columns) + "\n")

        if len(all_components) > 0:
            for component in all_components:
                row = "\t".join(f"{getattr(component, attr)}" for _, attr in columns)
                myfile.write(row + "\n")
        else:
            # No trailing newline after this message
            myfile.write("No complex components were resolved.")

    return "resolved_component_info.txt"
def init_files(output):
    """
    Initialise files and folders

    Makes sure the result files exist inside ``output`` (existing content is
    left untouched because the files are opened in append mode) and that the
    ``resolved_phages`` sub-directory is present.

    output: path of an existing output directory
    """

    result_files = (
        "resolved_edges.fasta",
        "resolved_paths.fasta",
        "resolved_genome_info.txt",
        "resolved_component_info.txt",
        "component_phrogs.txt",
    )

    for filename in result_files:
        # Append mode creates the file when missing without truncating it
        open(f"{output}/{filename}", "a").close()

    # Create the folder directly instead of running "mkdir -p" through a
    # shell: works for paths containing spaces or shell metacharacters and
    # raises on failure instead of silently continuing
    os.makedirs(f"{output}/resolved_phages", exist_ok=True)
# Runs scripts/phables.py on the files found under TESTDIR.
# TESTDIR and the upper-case names used in params (ML, MC, CC, ...) are not
# defined in this file; presumably they come from the included preflight
# rules - verify there.
rule test_phables:
    input:
        g = os.path.join(TESTDIR, "assembly_graph.gfa"),
        c = os.path.join(TESTDIR, "edge_coverages.tsv"),
        b = TESTDIR,
        ph = os.path.join(TESTDIR, "phrogs_annotations.tsv"),
        hm = os.path.join(TESTDIR, "edges.fasta.hmmout")
    output:
        # Every result is wrapped in temp(), i.e. flagged as a temporary file
        genomes_fasta = temp(os.path.join(TESTDIR, "resolved_paths.fasta")),
        genomes_folder = temp(directory(os.path.join(TESTDIR, "resolved_phages"))),
        genome_info = temp(os.path.join(TESTDIR, "resolved_genome_info.txt")),
        phage_edges = temp(os.path.join(TESTDIR, "phage_like_edges.fasta")),
        all_phage_edges = temp(os.path.join(TESTDIR, "all_phage_like_edges.fasta")),
        unresolved_edges = temp(os.path.join(TESTDIR, "unresolved_phage_like_edges.fasta")),
        unitigs = temp(os.path.join(TESTDIR, "resolved_edges.fasta")),
        component_info = temp(os.path.join(TESTDIR, "resolved_component_info.txt")),
        phrog_comp_info = temp(os.path.join(TESTDIR, "component_phrogs.txt")),
        mfd = temp(os.path.join(TESTDIR, "results_MFD.txt")),
        mfd_details = temp(os.path.join(TESTDIR, "results_MFD_details.txt")),
        # Same path as the log directive below
        log = temp(os.path.join(TESTDIR, "phables_output.log"))
    params:
        # The first five entries repeat the input paths
        graph = os.path.join(TESTDIR, "assembly_graph.gfa"),
        hmmout = os.path.join(TESTDIR, "edges.fasta.hmmout"),
        phrogs = os.path.join(TESTDIR, "phrogs_annotations.tsv"),
        coverage = os.path.join(TESTDIR, "edge_coverages.tsv"),
        bampath = TESTDIR,
        minlength = ML,
        mincov = MC,
        compcount = CC,
        maxpaths = MP,
        mgfrac = MGF,
        evalue = EV,
        seqidentity = SI,
        covtol = CT,
        alpha = AL,
        longreads = LR,
        prefix = PR,
        # Results are written next to the test inputs
        output = TESTDIR,
        nthreads = 2,
        # NOTE(review): temp() inside params looks unintended; the same log
        # path is already declared under output and log - confirm
        log = temp(os.path.join(TESTDIR, "phables_output.log"))
    log:
        os.path.join(TESTDIR, "phables_output.log")
    conda:
        os.path.join("envs", "phables.yaml")
    script:
        os.path.join('scripts', 'phables.py')
# Read the long description from the README located next to this script
# (not the current working directory) and decode it explicitly as UTF-8 so
# the build does not depend on where it is started or on the locale.
with open(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), "README.md"),
    "r",
    encoding="utf-8",
) as fh:
    long_description = fh.read()


# Extra files installed alongside the package
data_files = [(".", ["LICENSE", "README.md"])]

setup(
    name="phables",
    packages=find_packages(),
    url="https://github.com/Vini2/phables",
    python_requires=">=3.9, <3.11",
    description="Phables: from fragmented assemblies to high-quality bacteriophage genomes",
    long_description=long_description,
    long_description_content_type="text/markdown",
    version=get_version(),
    author="Vijini Mallawaarachchi",
    author_email="viji.mallawaarachchi@gmail.com",
    data_files=data_files,
    py_modules=["phables"],
    install_requires=[
        "snakemake>=7.14.0",
        "pyyaml>=6.0",
        "click>=8.1.3",
        "metasnek>=0.0.5",
        "snaketool-utils>=0.0.4",
    ],
    # Installs the "phables" command, dispatching to phables.__main__:main
    entry_points={"console_scripts": ["phables=phables.__main__:main"]},
    include_package_data=True,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
        "Operating System :: MacOS",
        "Operating System :: POSIX",
    ],
)
def exec_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """executes shell command and returns stdout if completes exit code 0

    Parameters
    ----------
    cmnd : str
      shell command to be executed
    stdout, stderr : streams
      Default value (PIPE) intercepts process output, setting to None
      blocks this.

    Returns
    -------
    str or None
      captured standard output decoded as UTF-8, or None when stdout was
      not captured

    Raises
    ------
    RuntimeError
      when the command exits with a non-zero code; the message holds the
      command followed by the decoded standard error (empty when stderr
      was not captured)
    """

    proc = subprocess.run(cmnd, shell=True, stdout=stdout, stderr=stderr)

    if proc.returncode != 0:
        # Decode stderr so the message shows readable text rather than the
        # repr of a bytes object; undecodable bytes are replaced, not fatal
        details = (
            proc.stderr.decode("utf8", errors="replace")
            if proc.stderr is not None
            else ""
        )
        raise RuntimeError(f"FAILED: {cmnd}\n{details}")

    return proc.stdout.decode("utf8") if proc.stdout is not None else None