├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   ├── custom.md
    │   └── feature_request.md
    └── workflows
    │   ├── codeql.yml
    │   ├── pypi-publish.yml
    │   └── testing.yml
├── .gitignore
├── .readthedocs.yaml
├── CITATION.cff
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── build
    ├── environment.yml
    └── meta.yaml
├── container
    ├── Dockerfile
    └── README.md
├── docs
    ├── annotation.md
    ├── assemble.md
    ├── citation.md
    ├── comparison.md
    ├── faq.md
    ├── graph_stats.md
    ├── images
    │   ├── Phables_workflow.png
    │   ├── components.png
    │   ├── histogram_n_nodes.png
    │   ├── pearson_clustermap.png
    │   ├── pearson_heatmap.png
    │   ├── phables_logo.png
    │   ├── phage_comp_280_cycle_1_plot.png
    │   ├── qual_resolved_genome_unitig_boxen.png
    │   └── qual_resolved_genome_unitig_violin.png
    ├── index.md
    ├── install.md
    ├── quality.md
    ├── requirements.txt
    └── usage.md
├── mkdocs.yml
├── phables
    ├── __init__.py
    ├── __main__.py
    ├── config
    │   ├── config.yaml
    │   └── databases.yaml
    ├── phables.CITATION
    ├── phables.LICENSE
    ├── phables.VERSION
    ├── test_data
    │   ├── assembly_graph.gfa
    │   ├── edge_coverages.tsv
    │   ├── edges.fasta.hmmout
    │   ├── junction_pe_coverage.pickle
    │   └── phrogs_annotations.tsv
    └── workflow
    │   ├── envs
    │       ├── curl.yaml
    │       ├── koverage.yaml
    │       ├── mapping.yaml
    │       ├── mmseqs.yaml
    │       ├── phables.yaml
    │       └── smg.yaml
    │   ├── install.smk
    │   ├── phables.smk
    │   ├── rules
    │       ├── 00_database_preflight.smk
    │       ├── 02_phables_preflight.smk
    │       ├── 02_phables_targets.smk
    │       ├── 03_test_preflight.smk
    │       ├── 03_test_targets.smk
    │       ├── coverage.smk
    │       ├── genes.smk
    │       ├── gfa2fasta.smk
    │       ├── phables.smk
    │       └── postprocess.smk
    │   ├── scripts
    │       ├── combine_cov.py
    │       ├── format_koverage_results.py
    │       ├── gfa2fasta.py
    │       ├── phables.py
    │       └── phables_utils
    │       │   ├── FD_Inexact.py
    │       │   ├── __init__.py
    │       │   ├── component_utils.py
    │       │   ├── coverage_utils.py
    │       │   ├── edge_graph_utils.py
    │       │   ├── flow_utils.py
    │       │   ├── gene_utils.py
    │       │   ├── genome_utils.py
    │       │   ├── long_utils.py
    │       │   ├── output_utils.py
    │       │   ├── phrogs
    │       │       └── phrog_annot.tsv
    │       │   └── short_utils.py
    │   └── test_phables.smk
├── phables_logo.png
├── phables_logo_dark.png
├── phables_logo_light.png
├── setup.py
└── tests
    └── test_phables.py


/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: "[BUG]"
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behaviour, including:
15 | 1. Command executed
16 | 2. Error message
17 | 
18 | **Expected behavior**
19 | A clear and concise description of what you expected to happen.
20 | 
21 | **Screenshots**
22 | If applicable, add screenshots to help explain your problem.
23 | 
24 | **Desktop (please complete the following information):**
25 |  - OS: [e.g. iOS]
26 |  - Browser [e.g. chrome, safari]
27 |  - Version [e.g. 22]
28 | 
29 | **Additional context**
30 | Add any other context about the problem here.
31 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Custom issue template
 3 | about: Describe this issue template's purpose here.
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | 
11 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: "[Feature request]"
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ "develop" ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ "develop" ]
20 |   schedule:
21 |     - cron: '39 18 * * 0'
22 | 
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 | 
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: [ 'python' ]
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 |         # Use only 'java' to analyze code written in Java, Kotlin or both
38 |         # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both
39 |         # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
40 | 
41 |     steps:
42 |     - name: Checkout repository
43 |       uses: actions/checkout@v3
44 | 
45 |     # Initializes the CodeQL tools for scanning.
46 |     - name: Initialize CodeQL
47 |       uses: github/codeql-action/init@v2
48 |       with:
49 |         languages: ${{ matrix.language }}
50 |         # If you wish to specify custom queries, you can do so here or in a config file.
51 |         # By default, queries listed here will override any specified in a config file.
52 |         # Prefix the list here with "+" to use these queries and those in the config file.
53 | 
54 |         # For details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
55 |         # queries: security-extended,security-and-quality
56 | 
57 | 
58 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, Go, or Java).
59 |     # If this step fails, then you should remove it and run the build manually (see below)
60 |     - name: Autobuild
61 |       uses: github/codeql-action/autobuild@v2
62 | 
63 |     # ℹ️ Command-line programs to run using the OS shell.
64 |     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
65 | 
66 |     #   If the Autobuild fails above, remove it and uncomment the following three lines.
67 |     #   Modify them (or add more) to build your code; please refer to the EXAMPLE below for guidance.
68 | 
69 |     # - run: |
70 |     #   echo "Run, Build Application using script"
71 |     #   ./location_of_script_within_repo/buildscript.sh
72 | 
73 |     - name: Perform CodeQL Analysis
74 |       uses: github/codeql-action/analyze@v2
75 |       with:
76 |         category: "/language:${{matrix.language}}"
77 | 


--------------------------------------------------------------------------------
/.github/workflows/pypi-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
 3 | 
 4 | # This workflow uses actions that are not certified by GitHub.
 5 | # They are provided by a third-party and are governed by
 6 | # separate terms of service, privacy policy, and support
 7 | # documentation.
 8 | 
 9 | name: Upload Python Package
10 | 
11 | on:
12 |   release:
13 |     types: [published]
14 | 
15 | permissions:
16 |   contents: read
17 | 
18 | jobs:
19 |   deploy:
20 | 
21 |     runs-on: ubuntu-latest
22 | 
23 |     steps:
24 |     - uses: actions/checkout@v3
25 |     - name: Set up Python
26 |       uses: actions/setup-python@v3
27 |       with:
28 |         python-version: '3.x'
29 |     - name: Install dependencies
30 |       run: |
31 |         python -m pip install --upgrade pip
32 |         pip install build
33 |     - name: Build package
34 |       run: python -m build
35 |     - name: Publish package
36 |       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 |       with:
38 |         user: __token__
39 |         password: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/.github/workflows/testing.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ develop ]
 6 |   pull_request:
 7 |     branches: [ develop ]
 8 |     
 9 | 
10 | jobs:
11 |   tests:
12 |     name: "Python ${{ matrix.python-version }}"
13 |     runs-on: ${{ matrix.os }}
14 | 
15 |     defaults:
16 |       run:
17 |         shell: bash -el {0}
18 | 
19 |     strategy:
20 |       matrix:
21 |         os: [ubuntu-latest, macos-latest]
22 |         python-version: ["3.9", "3.10"]
23 | 
24 |     steps:
25 |       - uses: "actions/checkout@v3"
26 |         with:
27 |           fetch-depth: 0
28 | 
29 |       # Setup env
30 |       - uses: conda-incubator/setup-miniconda@v3
31 |         with:
32 |           activate-environment: phables
33 |           environment-file: build/environment.yml
34 |           python-version: ${{ matrix.python-version }}
35 |           auto-activate-base: false
36 | 
37 |       - name: "Setup Phables on ${{ matrix.os }} for Python ${{ matrix.python-version }}"
38 |         run: |
39 |           python -m pip install --upgrade pip
40 |           pip install .
41 |       
42 |       - name: "Generate coverage report on ${{ matrix.os }} for Python ${{ matrix.python-version }}"
43 |         run: |
44 |           pip install pytest pytest-cov
45 |           pytest --cov=./ --cov-report xml --cov-report lcov --cov-append
46 |           
47 |       - name: Coveralls Parallel
48 |         uses: coverallsapp/github-action@master
49 |         with:
50 |           parallel: true
51 |           github-token: ${{ secrets.github_token }}
52 |           flag-name: run-${{ matrix.os }}-${{ matrix.python-version }}
53 |           path-to-lcov: "coverage.lcov"
54 | 
55 |   finish:
56 |     needs: tests
57 |     runs-on: ${{ matrix.os }}
58 |     strategy:
59 |       matrix:
60 |         os: [ubuntu-latest, macos-latest]
61 |         python-version: ["3.9", "3.10"]
62 |     steps:
63 |     - name: Coveralls Finished
64 |       uses: coverallsapp/github-action@master
65 |       with:
66 |         github-token: ${{ secrets.github_token }}
67 |         parallel-finished: true
68 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | develop-eggs/
 12 | dist/
 13 | downloads/
 14 | eggs/
 15 | .eggs/
 16 | lib/
 17 | lib64/
 18 | parts/
 19 | sdist/
 20 | var/
 21 | wheels/
 22 | pip-wheel-metadata/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | db.sqlite3
 61 | db.sqlite3-journal
 62 | 
 63 | # Flask stuff:
 64 | instance/
 65 | .webassets-cache
 66 | 
 67 | # Scrapy stuff:
 68 | .scrapy
 69 | 
 70 | # Sphinx documentation
 71 | docs/_build/
 72 | 
 73 | # PyBuilder
 74 | target/
 75 | 
 76 | # Jupyter Notebook
 77 | .ipynb_checkpoints
 78 | 
 79 | # IPython
 80 | profile_default/
 81 | ipython_config.py
 82 | 
 83 | # pyenv
 84 | .python-version
 85 | 
 86 | # pipenv
 87 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 88 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 89 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 90 | #   install all needed dependencies.
 91 | #Pipfile.lock
 92 | 
 93 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 94 | __pypackages__/
 95 | 
 96 | # Celery stuff
 97 | celerybeat-schedule
 98 | celerybeat.pid
 99 | 
100 | # SageMath parsed files
101 | *.sage.py
102 | 
103 | # Environments
104 | .env
105 | .venv
106 | env/
107 | venv/
108 | ENV/
109 | env.bak/
110 | venv.bak/
111 | 
112 | # Spyder project settings
113 | .spyderproject
114 | .spyproject
115 | 
116 | # Rope project settings
117 | .ropeproject
118 | 
119 | # mkdocs documentation
120 | /site
121 | 
122 | # mypy
123 | .mypy_cache/
124 | .dmypy.json
125 | dmypy.json
126 | 
127 | # Pyre type checker
128 | .pyre/
129 | 
130 | # Mac OS
131 | *.DS_Store
132 | ./**/.DS_Store
133 | 
134 | # Snakemake
135 | .snakemake/
136 | phables/workflow/conda/
137 | phables.out/
138 | 
139 | # Databases
140 | databases/
141 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yaml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Set the version of Python and other tools you might need
 9 | build:
10 |   os: ubuntu-20.04
11 |   tools:
12 |     python: "3.9"
13 | 
14 | mkdocs:
15 |   configuration: mkdocs.yml
16 | 
17 | # Optionally declare the Python requirements required to build your docs
18 | python:
19 |    install:
20 |    - requirements: docs/requirements.txt
21 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite our article in Bioinformatics as below."
 3 | authors:
 4 | - family-names: "Mallawaarachchi"
 5 |   given-names: "Vijini"
 6 |   orcid: "https://orcid.org/0000-0002-2651-8719"
 7 | - family-names: "Roach"
 8 |   given-names: "Michael J."
 9 |   orcid: "https://orcid.org/0000-0003-1488-5148"
10 | - family-names: "Decewicz"
11 |   given-names: "Przemyslaw"
12 |   orcid: "https://orcid.org/0000-0002-5621-7124"
13 | - family-names: "Papudeshi"
14 |   given-names: "Bhavya"
15 |   orcid: "https://orcid.org/0000-0001-5359-3100"
16 | - family-names: "Giles"
17 |   given-names: "Sarah K."
18 |   orcid: "https://orcid.org/0000-0002-4395-060X"
19 | - family-names: "Grigson"
20 |   given-names: "Susanna R."
21 |   orcid: "https://orcid.org/0000-0003-4738-3451"
22 | - family-names: "Bouras"
23 |   given-names: "George"
24 |   orcid: "https://orcid.org/0000-0002-5885-4186"
25 | - family-names: "Hesse"
26 |   given-names: "Ryan D."
27 |   orcid: "https://orcid.org/0000-0001-9366-5631"
28 | - family-names: "Inglis"
29 |   given-names: "Laura K."
30 |   orcid: "https://orcid.org/0000-0001-7919-8563"
31 | - family-names: "Hutton"
32 |   given-names: "Abbey LK."
33 |   orcid: "https://orcid.org/0000-0002-2474-1327"
34 | - family-names: "Dinsdale"
35 |   given-names: "Elizabeth A."
36 |   orcid: "https://orcid.org/0000-0002-2177-203X"
37 | - family-names: "Edwards"
38 |   given-names: "Robert A."
39 |   orcid: "https://orcid.org/0000-0001-8383-8949"
40 | title: "Phables: from fragmented assemblies to high-quality bacteriophage genomes"
41 | doi: 10.1093/bioinformatics/btad586
42 | date-released: 2017-12-18
43 | url: "https://github.com/Vini2/phables"
44 | preferred-citation:
45 |   type: article
46 |   authors:
47 |   - family-names: "Mallawaarachchi"
48 |     given-names: "Vijini"
49 |     orcid: "https://orcid.org/0000-0002-2651-8719"
50 |   - family-names: "Roach"
51 |     given-names: "Michael J."
52 |     orcid: "https://orcid.org/0000-0003-1488-5148"
53 |   - family-names: "Decewicz"
54 |     given-names: "Przemyslaw"
55 |     orcid: "https://orcid.org/0000-0002-5621-7124"
56 |   - family-names: "Papudeshi"
57 |     given-names: "Bhavya"
58 |     orcid: "https://orcid.org/0000-0001-5359-3100"
59 |   - family-names: "Giles"
60 |     given-names: "Sarah K."
61 |     orcid: "https://orcid.org/0000-0002-4395-060X"
62 |   - family-names: "Grigson"
63 |     given-names: "Susanna R."
64 |     orcid: "https://orcid.org/0000-0003-4738-3451"
65 |   - family-names: "Bouras"
66 |     given-names: "George"
67 |     orcid: "https://orcid.org/0000-0002-5885-4186"
68 |   - family-names: "Hesse"
69 |     given-names: "Ryan D."
70 |     orcid: "https://orcid.org/0000-0001-9366-5631"
71 |   - family-names: "Inglis"
72 |     given-names: "Laura K."
73 |     orcid: "https://orcid.org/0000-0001-7919-8563"
74 |   - family-names: "Hutton"
75 |     given-names: "Abbey LK."
76 |     orcid: "https://orcid.org/0000-0002-2474-1327"
77 |   - family-names: "Dinsdale"
78 |     given-names: "Elizabeth A."
79 |     orcid: "https://orcid.org/0000-0002-2177-203X"
80 |   - family-names: "Edwards"
81 |     given-names: "Robert A."
82 |     orcid: "https://orcid.org/0000-0001-8383-8949"
83 |   doi: "10.1093/bioinformatics/btad586"
84 |   journal: "Bioinformatics"
85 |   month: 9
86 |   title: "Phables: from fragmented assemblies to high-quality bacteriophage genomes"
87 |   start: "btad586"
88 |   issue: 10
89 |   volume: 39
90 |   year: 2023


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
  1 | # Contributor Covenant Code of Conduct
  2 | 
  3 | ## Our Pledge
  4 | 
  5 | We as members, contributors, and leaders pledge to make participation in our
  6 | community a harassment-free experience for everyone, regardless of age, body
  7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
  8 | identity and expression, level of experience, education, socio-economic status,
  9 | nationality, personal appearance, race, religion, or sexual identity
 10 | and orientation.
 11 | 
 12 | We pledge to act and interact in ways that contribute to an open, welcoming,
 13 | diverse, inclusive, and healthy community.
 14 | 
 15 | ## Our Standards
 16 | 
 17 | Examples of behavior that contributes to a positive environment for our
 18 | community include:
 19 | 
 20 | * Demonstrating empathy and kindness toward other people
 21 | * Being respectful of differing opinions, viewpoints, and experiences
 22 | * Giving and gracefully accepting constructive feedback
 23 | * Accepting responsibility and apologizing to those affected by our mistakes,
 24 |   and learning from the experience
 25 | * Focusing on what is best not just for us as individuals, but for the
 26 |   overall community
 27 | 
 28 | Examples of unacceptable behavior include:
 29 | 
 30 | * The use of sexualized language or imagery, and sexual attention or
 31 |   advances of any kind
 32 | * Trolling, insulting or derogatory comments, and personal or political attacks
 33 | * Public or private harassment
 34 | * Publishing others' private information, such as a physical or email
 35 |   address, without their explicit permission
 36 | * Other conduct which could reasonably be considered inappropriate in a
 37 |   professional setting
 38 | 
 39 | ## Enforcement Responsibilities
 40 | 
 41 | Community leaders are responsible for clarifying and enforcing our standards of
 42 | acceptable behavior and will take appropriate and fair corrective action in
 43 | response to any behavior that they deem inappropriate, threatening, offensive,
 44 | or harmful.
 45 | 
 46 | Community leaders have the right and responsibility to remove, edit, or reject
 47 | comments, commits, code, wiki edits, issues, and other contributions that are
 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
 49 | decisions when appropriate.
 50 | 
 51 | ## Scope
 52 | 
 53 | This Code of Conduct applies within all community spaces, and also applies when
 54 | an individual is officially representing the community in public spaces.
 55 | Examples of representing our community include using an official e-mail address,
 56 | posting via an official social media account, or acting as an appointed
 57 | representative at an online or offline event.
 58 | 
 59 | ## Enforcement
 60 | 
 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
 62 | reported to the community leaders responsible for enforcement at
 63 | [Vijini Mallawaarachchi](mailto:viji.mallawaarachchi@gmail.com).
 64 | All complaints will be reviewed and investigated promptly and fairly.
 65 | 
 66 | All community leaders are obligated to respect the privacy and security of the
 67 | reporter of any incident.
 68 | 
 69 | ## Enforcement Guidelines
 70 | 
 71 | Community leaders will follow these Community Impact Guidelines in determining
 72 | the consequences for any action they deem in violation of this Code of Conduct:
 73 | 
 74 | ### 1. Correction
 75 | 
 76 | **Community Impact**: Use of inappropriate language or other behavior deemed
 77 | unprofessional or unwelcome in the community.
 78 | 
 79 | **Consequence**: A private, written warning from community leaders, providing
 80 | clarity around the nature of the violation and an explanation of why the
 81 | behavior was inappropriate. A public apology may be requested.
 82 | 
 83 | ### 2. Warning
 84 | 
 85 | **Community Impact**: A violation through a single incident or series
 86 | of actions.
 87 | 
 88 | **Consequence**: A warning with consequences for continued behavior. No
 89 | interaction with the people involved, including unsolicited interaction with
 90 | those enforcing the Code of Conduct, for a specified period of time. This
 91 | includes avoiding interactions in community spaces as well as external channels
 92 | like social media. Violating these terms may lead to a temporary or
 93 | permanent ban.
 94 | 
 95 | ### 3. Temporary Ban
 96 | 
 97 | **Community Impact**: A serious violation of community standards, including
 98 | sustained inappropriate behavior.
 99 | 
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 | 
106 | ### 4. Permanent Ban
107 | 
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 | 
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 | 
115 | ## Attribution
116 | 
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 | 
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 | 
124 | [homepage]: https://www.contributor-covenant.org
125 | 
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing to the Phables project
  2 | 
  3 | We would love to have your contributions to the Phables project, whether it's:
  4 | * Reporting a bug
  5 | * Submitting a fix
  6 | * Proposing new features
  7 | 
  8 | ## Clone and install Phables onto your machine
  9 | 
 10 | First, make sure you have [git](https://github.com/git-guides/install-git) installed on your machine.
 11 | 
 12 | On GitHub, [fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the Phables repository and [clone](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) it to your machine.
 13 | 
 14 | ```bash
 15 | # clone repository to your local machine
 16 | git clone https://github.com/Vini2/phables.git
 17 | ```
 18 | 
 19 | Move to the Phables directory.
 20 | 
 21 | ```bash
 22 | cd phables
 23 | ```
 24 | 
 25 | Create and activate the conda environment. Make sure to have [`conda`](https://docs.conda.io/en/latest/) installed.
 26 | 
 27 | ```bash
 28 | # Create the phables environment
 29 | conda env create -f build/environment.yml
 30 | 
 31 | # Activate the phables environment
 32 | conda activate phables
 33 | ```
 34 | 
 35 | Now install Phables via [`pip`](https://pip.pypa.io/en/stable/).
 36 | 
 37 | ```bash
 38 | pip install -e .
 39 | ```
 40 | 
 41 | ## Test Phables installation
 42 | 
 43 | Print the help message using the following command.
 44 | 
 45 | ```bash
 46 | phables -h
 47 | ```
 48 | 
 49 | Use the following command to launch the Phables test run. All the tests should pass.
 50 | 
 51 | ```bash
 52 | phables test
 53 | ```
 54 | 
 55 | ## Coding Style
 56 | 
 57 | We adhere to the [PEP 8](https://peps.python.org/pep-0008/) style guide. 
 58 | 
 59 | Before committing, make sure to run [`black`](https://pypi.org/project/black/) and [`isort`](https://pypi.org/project/isort/).
 60 | 
 61 | ```bash
 62 | black phables/workflow/scripts
 63 | isort --atomic phables/workflow/scripts 
 64 | ```
 65 | 
 66 | ## Report bugs using GitHub's issues
 67 | 
 68 | We use GitHub issues to track public bugs. Report a bug by opening a new issue in GitHub [issues](https://github.com/Vini2/phables/issues). You will get to select between templates for bug report and feature request. If none of these templates match what you want to report, you can use the custom issue template.
 69 | 
 70 | ## Committing code
 71 | 
 72 | Once you have finished coding and all the tests pass, commit your code and make a pull request. 
 73 | 
 74 | ```bash
 75 | # Add changed/added files
 76 | git add <file name>
 77 | 
 78 | # Commit changes
 79 | git commit -m "<commit message>"
 80 | 
 81 | # Push changes
 82 | git push
 83 | ```
 84 | 
 85 | Make sure to follow the commit style of [c3dev](https://github.com/cogent3/c3dev/wiki#style-for-commit-messages). Relevant prefixes are replicated below for convenience.
 86 | 
 87 | | **Commit Prefix** | **For**                                       |
 88 | |-------------------|-----------------------------------------------|
 89 | | DEV:              | development tool or utility                   |
 90 | | DOC:              | documentation                                 |
 91 | | TST:              | addition or modification of tests             |
 92 | | REL:              | related to a release                          |
 93 | | MAINT:            | maintenance commit (refactoring, typos, etc.) |
 94 | | BUG:              | bug fix                                       |
 95 | | GIT:              | git related                                   |
 96 | | REV:              | revert an earlier commit                      |
 97 | 
 98 | 
 99 | Your contribution will be reviewed before it is accepted.
100 | 
101 | ## License
102 | 
103 | By contributing, you agree that your contributions will be licensed under the MIT License.
104 | 
105 | ## References
106 | 
107 | This document was adapted from the open-source contribution guidelines for [Transcriptase](https://github.com/briandk/transcriptase-atom/blob/master/CONTRIBUTING.md) and [c3dev](https://github.com/cogent3/c3dev/wiki/How-to-Contribute-Code).
108 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Vijini Mallawaarachchi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include phables/phables.LICENSE
2 | include phables/phables.CITATION
3 | include phables/phables.VERSION
4 | recursive-include phables *
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |   <img src="https://raw.githubusercontent.com/Vini2/phables/master/phables_logo_light.png#gh-light-mode-only" width="500" title="phables logo" alt="phables logo">
  3 |   <img src="https://raw.githubusercontent.com/Vini2/phables/master/phables_logo_dark.png#gh-dark-mode-only" width="500" title="phables logo" alt="phables logo">
  4 | </p>
  5 | 
  6 | Phables: from fragmented assemblies to high-quality bacteriophage genomes
  7 | ===============
  8 | 
  9 | [![DOI](https://img.shields.io/badge/DOI-10.1093/bioinformatics/btad586-blue)](https://doi.org/10.1093/bioinformatics/btad586)
 10 | ![GitHub](https://img.shields.io/github/license/Vini2/phables)
 11 | [![](https://img.shields.io/static/v1?label=CLI&message=Snaketool&color=blueviolet)](https://github.com/beardymcjohnface/Snaketool)
 12 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 13 | ![GitHub last commit (branch)](https://img.shields.io/github/last-commit/Vini2/phables/develop?color=8a35da)
 14 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/phables/README.html)
 15 | [![Conda](https://img.shields.io/conda/v/bioconda/phables)](https://anaconda.org/bioconda/phables)
 16 | [![Conda](https://img.shields.io/conda/dn/bioconda/phables)](https://anaconda.org/bioconda/phables)
 17 | [![PyPI version](https://badge.fury.io/py/phables.svg)](https://badge.fury.io/py/phables)
 18 | [![Downloads](https://static.pepy.tech/badge/phables)](https://pepy.tech/project/phables)
 19 | [![CI](https://github.com/Vini2/phables/actions/workflows/testing.yml/badge.svg)](https://github.com/Vini2/phables/actions/workflows/testing.yml)
 20 | [![CodeQL](https://github.com/Vini2/phables/actions/workflows/codeql.yml/badge.svg)](https://github.com/Vini2/phables/actions/workflows/codeql.yml)
 21 | [![Documentation Status](https://readthedocs.org/projects/phables/badge/?version=latest)](https://phables.readthedocs.io/en/latest/?badge=latest)
 22 | 
 23 | Phables is a tool developed to resolve bacteriophage genomes using assembly graphs of viral metagenomic data. It models phage-like components in the viral metagenomic assembly as flow networks, formulates genome recovery as a minimum flow decomposition problem and resolves the genomic paths corresponding to the flow paths it determines. Phables uses the [Minimum Flow Decomposition via Integer Linear Programming](https://github.com/algbio/MFD-ILP) implementation to obtain the flow paths.
 24 | 
 25 | For detailed instructions on installation and usage, please refer to the [**documentation hosted at Read the Docs**](https://phables.readthedocs.io/en/latest/).
 26 | 
 27 | Phables is available on Bioconda at [https://anaconda.org/bioconda/phables](https://anaconda.org/bioconda/phables) and on PyPI at [https://pypi.org/project/phables/](https://pypi.org/project/phables/). Feel free to pick your package manager, but we recommend that you use [`conda`](https://docs.conda.io/en/latest/).
 28 | 
 29 | **NEW:** Phables is now available as a Docker container from [Docker hub](https://hub.docker.com/r/linsalrob/phables). Click [here](https://github.com/Vini2/phables/tree/develop/container) for more details.
 30 | 
 31 | ## Setting up Phables
 32 | 
 33 | ### Option 1: Installing Phables using conda (recommended)
 34 | 
 35 | You can install Phables from Bioconda at [https://anaconda.org/bioconda/phables](https://anaconda.org/bioconda/phables). Make sure you have [`conda`](https://docs.conda.io/en/latest/) installed.
 36 | 
 37 | ```bash
 38 | # create conda environment and install phables
 39 | conda create -n phables -c conda-forge -c anaconda -c bioconda phables
 40 | 
 41 | # activate environment
 42 | conda activate phables
 43 | ```
 44 | 
 45 | Now you can go to [Setting up Gurobi](#setting-up-gurobi) to configure Gurobi.
 46 | 
 47 | ### Option 2: Installing Phables using pip
 48 | 
 49 | You can install Phables from PyPI at [https://pypi.org/project/phables/](https://pypi.org/project/phables/). Make sure you have [`pip`](https://pip.pypa.io/en/stable/) and [`mamba`](https://mamba.readthedocs.io/en/latest/index.html) installed.
 50 | 
 51 | ```bash
 52 | pip install phables
 53 | ```
 54 | 
 55 | Now you can go to [Setting up Gurobi](#setting-up-gurobi) to configure Gurobi.
 56 | 
 57 | ### Setting up Gurobi
 58 | 
 59 | The MFD implementation uses the linear programming solver [Gurobi](https://www.gurobi.com/). The `phables` conda environment and pip setup do not include Gurobi. You have to install Gurobi using one of the following commands depending on your package manager.
 60 | 
 61 | ```bash
 62 | # conda
 63 | conda install -c gurobi gurobi
 64 | 
 65 | # pip
 66 | pip install gurobipy
 67 | ```
 68 | 
 69 | To handle large models without any model size limitations, once you have installed Gurobi, you have to activate the (academic) license and add the key using the following command. You only have to do this once.
 70 | 
 71 | ```bash
 72 | grbgetkey <KEY>
 73 | ```
 74 | 
 75 | You can refer to further instructions at [https://www.gurobi.com/academia/academic-program-and-licenses/](https://www.gurobi.com/academia/academic-program-and-licenses/). 
 76 | 
 77 | ### Test the installation
 78 | 
 79 | After setting up, run the following command to print out the Phables help message.
 80 | 
 81 | ```bash
 82 | phables --help
 83 | ```
 84 | 
 85 | ## Quick Start Guide
 86 | 
 87 | Phables is powered by [Snaketool](https://github.com/beardymcjohnface/Snaketool) which packs in all the setup, testing, preprocessing and running steps into an easy-to-use pipeline.
 88 | 
 89 | ### Setup the databases
 90 | 
 91 | ```bash
 92 | # Download and setup the databases - you only have to do this once
 93 | phables install
 94 | ```
 95 | 
 96 | ### Run on test data
 97 | 
 98 | ```bash
 99 | phables test
100 | ```
101 | 
102 | ### Run on your own data
103 | 
104 | ```bash
105 | # Run Phables using short read data
106 | phables run --input assembly_graph.gfa --reads fastq/ --threads 8
107 | 
108 | # Run Phables using long read data
109 | phables run --input assembly_graph.gfa --reads fastq/ --threads 8 --longreads
110 | ```
111 | 
112 | Please refer to the [**documentation hosted at Read the Docs**](https://phables.readthedocs.io/en/latest/) for further information on how to run Phables.
113 | 
114 | 
115 | ##  Issues and Questions
116 | 
117 | If you want to test (or break) Phables, give it a try and report any issues and suggestions under [Phables Issues](https://github.com/Vini2/phables/issues).
118 | 
119 | If you come across any questions, please have a look at the [Phables FAQ page](https://phables.readthedocs.io/en/latest/faq/). If your question is not here, feel free to post it under [Phables Issues](https://github.com/Vini2/phables/issues).
120 | 
121 | 
122 | ## Contributing to Phables
123 | 
124 | Are you interested in contributing to the Phables project? If so, you can check out the contributing guidelines in [CONTRIBUTING.md](https://github.com/Vini2/phables/blob/develop/CONTRIBUTING.md).
125 | 
126 | 
127 | ## Acknowledgement
128 | 
129 | Phables uses the [Gurobi](https://www.gurobi.com/) implementation of [MFD-ILP](https://github.com/algbio/MFD-ILP) and code snippets from [STRONG](https://github.com/chrisquince/STRONG), [METAMVGL](https://github.com/ZhangZhenmiao/METAMVGL), [GraphBin](https://github.com/metagentools/GraphBin), [MetaCoAG](https://github.com/metagentools/MetaCoAG) and [Hecatomb](https://hecatomb.readthedocs.io/en/latest/). Special thanks are owed to [Ryan Wick](https://github.com/rrwick) for developing [Bandage](https://rrwick.github.io/Bandage/) to visualise assembly graphs, which I heavily rely upon to investigate, develop and optimise my methods. The Phables logo was designed by [Amber Skye](https://fame.flinders.edu.au/people/2021/01/01/amber-cook).
130 | 
131 | ## Citation
132 | Phables is published in [Bioinformatics](https://academic.oup.com/bioinformatics) at DOI: [10.1093/bioinformatics/btad586](https://doi.org/10.1093/bioinformatics/btad586). 
133 | 
134 | If you use Phables in your work, please cite Phables as,
135 | 
136 | > Vijini Mallawaarachchi, Michael J Roach, Przemyslaw Decewicz, Bhavya Papudeshi, Sarah K Giles, Susanna R Grigson, George Bouras, Ryan D Hesse, Laura K Inglis, Abbey L K Hutton, Elizabeth A Dinsdale, Robert A Edwards, Phables: from fragmented assemblies to high-quality bacteriophage genomes, Bioinformatics, Volume 39, Issue 10, October 2023, btad586, https://doi.org/10.1093/bioinformatics/btad586
137 | 
138 | ```bibtex
139 | @article{10.1093/bioinformatics/btad586,
140 |     author = {Mallawaarachchi, Vijini and Roach, Michael J and Decewicz, Przemyslaw and Papudeshi, Bhavya and Giles, Sarah K and Grigson, Susanna R and Bouras, George and Hesse, Ryan D and Inglis, Laura K and Hutton, Abbey L K and Dinsdale, Elizabeth A and Edwards, Robert A},
141 |     title = "{Phables: from fragmented assemblies to high-quality bacteriophage genomes}",
142 |     journal = {Bioinformatics},
143 |     volume = {39},
144 |     number = {10},
145 |     pages = {btad586},
146 |     year = {2023},
147 |     month = {09},
148 |     abstract = "{Microbial communities have a profound impact on both human health and various environments. Viruses infecting bacteria, known as bacteriophages or phages, play a key role in modulating bacterial communities within environments. High-quality phage genome sequences are essential for advancing our understanding of phage biology, enabling comparative genomics studies and developing phage-based diagnostic tools. Most available viral identification tools consider individual sequences to determine whether they are of viral origin. As a result of challenges in viral assembly, fragmentation of genomes can occur, and existing tools may recover incomplete genome fragments. Therefore, the identification and characterization of novel phage genomes remain a challenge, leading to the need of improved approaches for phage genome recovery. We introduce Phables, a new computational method to resolve phage genomes from fragmented viral metagenome assemblies. Phables identifies phage-like components in the assembly graph, models each component as a flow network, and uses graph algorithms and flow decomposition techniques to identify genomic paths. Experimental results of viral metagenomic samples obtained from different environments show that Phables recovers on average over 49\% more high-quality phage genomes compared to existing viral identification tools. Furthermore, Phables can resolve variant phage genomes with over 99\% average nucleotide identity, a distinction that existing tools are unable to make. Phables is available on GitHub at https://github.com/Vini2/phables.}",
149 |     issn = {1367-4811},
150 |     doi = {10.1093/bioinformatics/btad586},
151 |     url = {https://doi.org/10.1093/bioinformatics/btad586},
152 |     eprint = {https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btad586/51972145/btad586.pdf},
153 | }
154 | ```
155 | 
156 | Also, please cite the following tools/databases used by Phables.
157 | 
158 | * Roach MJ, Pierce-Ward NT, Suchecki R, Mallawaarachchi V, Papudeshi B, et al. Ten simple rules and a template for creating workflows-as-applications. PLOS Computational Biology 18(12) (2022): e1010705. [https://doi.org/10.1371/journal.pcbi.1010705](https://doi.org/10.1371/journal.pcbi.1010705)
159 | * Terzian P, Olo Ndela E, Galiez C, Lossouarn J, Pérez Bucio RE, Mom R, Toussaint A, Petit MA, Enault F. PHROG: families of prokaryotic virus proteins clustered using remote homology. NAR Genomics and Bioinformatics, Volume 3, Issue 3, lqab067 (2021). [https://doi.org/10.1093/nargab/lqab067](https://doi.org/10.1093/nargab/lqab067)
160 | * Steinegger M, Söding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol 35, 1026–1028 (2017). [https://doi.org/10.1038/nbt.3988](https://doi.org/10.1038/nbt.3988)
161 | * Li H. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34:3094-3100 (2018). [https://doi.org/10.1093/bioinformatics/bty191](https://doi.org/10.1093/bioinformatics/bty191)
162 | * Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools, Bioinformatics, Volume 25, Issue 16, Pages 2078–2079 (2009). [https://doi.org/10.1093/bioinformatics/btp352](https://doi.org/10.1093/bioinformatics/btp352)
163 | * Woodcroft BJ, Newell R, CoverM: Read coverage calculator for metagenomics (2017). [https://github.com/wwood/CoverM](https://github.com/wwood/CoverM)
164 | * Roach, M. J., Hart, B. J., Beecroft, S. J., Papudeshi, B., Inglis, L. K., Grigson, S. R., Mallawaarachchi, V., Bouras, G., & Edwards, R. A. Koverage: Read-coverage analysis for massive (meta)genomics datasets. Journal of Open Source Software, 9(94), 6235, (2024). [https://doi.org/10.21105/joss.06235](https://doi.org/10.21105/joss.06235)
165 | * Hagberg AA, Schult DA, and Swart PJ. Exploring network structure, dynamics, and function using NetworkX. In Proceedings of the 7th Python in Science Conference (SciPy2008), Gaël Varoquaux, Travis Vaught, and Jarrod Millman (Eds), (Pasadena, CA USA), pp. 11–15 (2008).
166 | * Gurobi Optimization. [https://www.gurobi.com/](https://www.gurobi.com/).
167 | 


--------------------------------------------------------------------------------
/build/environment.yml:
--------------------------------------------------------------------------------
 1 | name: phables
 2 | channels:
 3 |   - conda-forge
 4 |   - anaconda
 5 |   - bioconda
 6 | dependencies:
 7 |   - python>=3.9, <3.11
 8 |   - snakemake>=7.14.0
 9 |   - pyyaml>=6.0
10 |   - click>=8.1.3
11 |   - jinja2>=3.0.2
12 |   - mamba
13 |   - metasnek>=0.0.5
14 |   - snaketool-utils>=0.0.4


--------------------------------------------------------------------------------
/build/meta.yaml:
--------------------------------------------------------------------------------
 1 | {% set name = "phables" %}
 2 | {% set version = "1.1.0" %}
 3 | 
 4 | package:
 5 |   name: "{{ name|lower }}"
 6 |   version: "{{ version }}"
 7 | 
 8 | source:
 9 |   url: "https://github.com/Vini2/{{ name }}/archive/refs/tags/v{{ version }}.tar.gz"
10 |   sha256: 3276a6372e41a679b73d533fcc416a70db39a5e8f1ee78ea9a96590e3acf00de
11 | 
12 | build:
13 |   number: 0
14 |   noarch: python
15 |   script: "{{ PYTHON }} -m pip install . -vv"
16 | 
17 | requirements:
18 |   host:
19 |     - python
20 |     - pip
21 |   run:
22 |     - python >=3.8,<3.11
23 |     - snakemake >=7.14.0
24 |     - pyyaml >=6.0
25 |     - click >=8.1.3
26 |     - jinja2 >=3.0.2
27 |     - mamba <1.4.2
28 | 
29 | test:
30 |   commands:
31 |     - phables --help
32 | 
33 | about:
34 |   home: "https://github.com/Vini2/phables"
35 |   license: MIT
36 |   license_family: MIT
37 |   license_file: LICENSE
38 |   summary: "Phables: from fragmented assemblies to high-quality bacteriophage genomes"
39 |   description: |
40 |     Phables resolves bacteriophage genomes using phage bubbles in viral metagenomic data.
41 |   doc_url: "https://phables.readthedocs.io/"
42 |   dev_url: "https://github.com/Vini2/phables"
43 | 
44 | extra:
45 |   recipe-maintainers:
46 |     - Vini2


--------------------------------------------------------------------------------
/container/Dockerfile:
--------------------------------------------------------------------------------
 1 | 
 2 | #
 3 | # phables
 4 | #
 5 | 
 6 | FROM --platform=linux/amd64 ubuntu:20.04
 7 | FROM gurobi/optimizer:latest
 8 | 
 9 | ENV DEBIAN_FRONTEND="noninteractive"
10 | 
11 | 
12 | ARG LIBFABRIC_VERSION=1.18.1
13 | 
14 | # Install required packages and dependencies
15 | RUN   apt -y update \
16 |       && apt -y install build-essential wget doxygen gnupg gnupg2 curl apt-transport-https software-properties-common  \
17 |  git vim gfortran libtool python3-venv ninja-build python3-pip \
18 |       libnuma-dev python3-dev \
19 |       && apt -y remove --purge --auto-remove cmake \
20 |       && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null\
21 |  | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null \
22 |       && apt-add-repository -y "deb https://apt.kitware.com/ubuntu/ jammy-rc main" \
23 |       && apt -y update 
24 | 
25 | # Build and install libfabric
26 | RUN (if [ -e /tmp/build ]; then rm -rf /tmp/build; fi;) \
27 |       && mkdir -p /tmp/build \
28 |       && cd /tmp/build \
29 |       && wget https://github.com/ofiwg/libfabric/archive/refs/tags/v${LIBFABRIC_VERSION}.tar.gz \
30 |       && tar xf v${LIBFABRIC_VERSION}.tar.gz \
31 |       && cd libfabric-${LIBFABRIC_VERSION} \ 
32 |       && ./autogen.sh \
33 |       && ./configure \
34 |       && make -j 16 \ 
35 |       && make install
36 | 
37 | #
38 | # Install miniforge
39 | #
40 | RUN set -eux ; \
41 |   curl -LO https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh ; \
42 |   bash ./Miniforge3-* -b -p /opt/miniforge3 -s ; \
43 |   rm -rf ./Miniforge3-*
44 | ENV PATH /opt/miniforge3/bin:$PATH
45 | #
46 | # Install conda environment
47 | # 
48 | ARG PHABLES_VERSION=1.3.2
49 | 
50 | RUN set -eux ; \
51 |   mamba install -y -c conda-forge -c anaconda -c bioconda -c defaults \
52 |   phables=${PHABLES_VERSION} ;
53 | ENV PATH /opt/miniforge3/bin:$PATH
54 | RUN conda clean -af -y
55 | RUN mkdir -p /phables /opt/gurobi
56 | RUN ln -s /opt/miniforge3/lib/python3.10/site-packages/phables/workflow/conda /conda
57 | RUN phables install
58 | 


--------------------------------------------------------------------------------
/container/README.md:
--------------------------------------------------------------------------------
 1 | # Docker container
 2 | 
 3 | Please note that this container is hosted on [docker hub](https://hub.docker.com/r/linsalrob/phables) and we recommend you use the latest version there.
 4 | 
 5 | # Installing Gurobi
 6 | 
 7 | For the linear solver, you need the [Gurobi WLS](https://www.gurobi.com/features/academic-wls-license/) license. Get that file, which is called `gurobi.lic` by default, and put it in your home directory or in another location that you can easily refer to.
 8 | 
 9 | # Running the container with singularity (recommended)
10 | 
11 | We need to mount three locations that are writable for `phables` to work with singularity.
12 | 
13 | 1. You need to mount the `gurobi.lic` file, and that needs to end up at `/opt/gurobi/gurobi.lic`. In the example here, it is in the current working directory, `$PWD`.
14 | 2. You need a temporary directory where conda can install some files. They are installed on the first run, and reused after that. In this example, I am using `/tmp`. You need to mount this to `/conda` which is actually a symlink to the correct location under the snakemake directory.
15 | 3. You need your `.gfa` and `.fastq` files, and a location for the output. In this example, I have a directory called `Sim_Phage`. You should mount this to `/phables`. An important point here is to add the `/` to the end of your directory name, but _not_ to the `/phables`, and then the `.gfa` and `reads` will be in the `/phables` directory. 
16 | 
17 | > **NOTE:** when you specify the paths, it is important that they are absolute paths (i.e. beginning with `$PWD` or `/`), as relative paths don't work.
18 | 
19 | 
20 | ## Create the `.sif` image
21 | 
22 | The first step is to create the .sif image in a directory.
23 | 
24 | Check [docker hub](https://hub.docker.com/r/linsalrob/phables) for the latest version. In this example, I'm using version 0.6 but it may have been updated after that.
25 | 
26 | ```
27 | IMAGE_DIR=<path to singularity image>
28 | mkdir -p $IMAGE_DIR
29 | singularity pull --dir $IMAGE_DIR docker://linsalrob/phables:v0.6_gogo
30 | ```
31 | 
32 | You can set `IMAGE_DIR` to any path you can write to.
33 | 
34 | 
35 | ## Run the container
36 | 
37 | ```
38 | singularity exec --bind /tmp:/conda,$PWD/Sim_Phage/:/phables,$PWD/gurobi.lic:/opt/gurobi/gurobi.lic $IMAGE_DIR/phables_v0.6_gogo.sif phables run --input /phables/assembly_graph_after_simplification.gfa --reads /phables/reads/ --output /phables/phables --threads 32
39 | ```
40 | 
41 | # Running the container with docker
42 | 
43 | The approach is very similar, except instead of the `--bind` you need to use `--volume`. Note that you will need to have root access for this to work.
44 | 
45 | ```
46 | docker pull linsalrob/phables:v0.6_gogo
47 | 
48 | sudo docker run --volume=$PWD/Sim_Phage/:/phables --volume=/tmp:/conda --volume=$PWD/gurobi.lic:/opt/gurobi/gurobi.lic:ro linsalrob/phables:v0.6_gogo phables run --input /phables/assembly_graph_after_simplification.gfa --reads /phables/reads/ --output /phables/phables --threads 32
49 | ```
50 | 
51 | 
52 | 
53 | singularity exec --bind /scratch/pawsey1018/edwa0468/tmp:/opt/miniforge3/lib/python3.10/site-packages/phables/workflow/conda,$PWD/testy/Sim_Phage/:/phables,$PWD/gurobi.lic:/opt/gurobi/gurobi.lic testy/phables_v0.5_sneaky_sleeky.sif phables run --input /phables/assembly_graph_after_simplification.gfa --reads /phables/reads/ --output /phables/phables --threads 32
54 | 


--------------------------------------------------------------------------------
/docs/annotation.md:
--------------------------------------------------------------------------------
 1 | # Phage Genome Annotation
 2 | 
 3 | Once you have identified the high-quality and complete genomes from the [CheckV results](https://phables.readthedocs.io/en/latest/quality/), you can annotate them using a tool such as [**pharokka**](https://github.com/gbouras13/pharokka). The following sections will walk you through how to setup and run pharokka.
 4 | 
 5 | ## Installing pharokka
 6 | 
 7 | The recommended way to install pharokka is using [`conda`](https://docs.conda.io/en/latest/).
 8 | 
 9 | ```bash
10 | # Create a new conda environment and install pharokka
11 | conda create -n pharokka -c bioconda pharokka
12 | 
13 | # Activate pharokka conda environment
14 | conda activate pharokka
15 | ```
16 | 
17 | ## Download and install the pharokka databases
18 | 
19 | ```bash
20 | install_databases.py -o <path/to/database_dir>
21 | ```
22 | 
23 | ## Running pharokka
24 | 
25 | Here is an example command to run pharokka on the complete and high-quality resolved genomes.
26 | 
27 | ```bash
28 | pharokka.py -i complete_hq_genomes.fasta -o pharokka_output  -t 16 -d <path/to/database_dir>
29 | ```
30 | 
31 | ## Circular genome plot
32 | 
33 | You can use the `pharokka_plotter.py` implementation from pharokka to create circular genome plots with annotations.
34 | 
35 | Let's assume that you have already run pharokka on all of the complete and high-quality resolved genomes and the output is available in `pharokka_output`. You can pick one genome to plot. For example, let's consider the genome `phage_comp_280_cycle_1.fasta` which is *phiX174*.
36 | 
37 | We start by reorienting the genome to start from the `terminase large subunit`. You can look up the starting position and strand of the `terminase large subunit` from the output file `pharokka_output/pharokka_cds_final_merged_output.tsv`. For example, let's take the starting position as 617 on the positive strand. You can run pharokka again for this genome with reorientation as follows.
38 | 
39 | ```bash
40 | pharokka.py -i resolved_phages/phage_comp_280_cycle_1.fasta -o pharokka_output_phage_comp_280_cycle_1 -d <path/to/database_dir> -t 16 --terminase --terminase_strand 'pos' --terminase_start 617
41 | ```
42 | 
43 | Then you can run the plotting command as follows.
44 | 
45 | ```bash
46 | pharokka_plotter.py -i resolved_phages/phage_comp_280_cycle_1.fasta -n phage_comp_280_cycle_1_plot -o pharokka_output_phage_comp_280_cycle_1 -t "Escherichia phage phiX174"
47 | ```
48 | 
49 | ![](images/phage_comp_280_cycle_1_plot.png)


--------------------------------------------------------------------------------
/docs/assemble.md:
--------------------------------------------------------------------------------
  1 | # Assembly
  2 | 
  3 | Phables requires either short or long read sequencing data from metagenomic samples to be assembled. The following sections explain the steps that need to be carried out beforehand.
  4 | 
  5 | 
  6 | ## Paired-end read files for short read assembly
  7 | 
  8 | Please make sure that the names of the read files are in the following format. Assuming that your paired-end sequencing reads are in the folder `fastq`, please make sure that the reads are in the format `{sampleName}{pattern}{fileExtension}`. `fileExtension` can be `.fq`, `.fastq`, `.fq.gz` or `.fastq.gz`.
  9 | 
 10 | Please make sure that your file `pattern` matches one of the following patterns.
 11 | 
 12 | ```
 13 | _R1_ and _R2_
 14 | _R1. and _R2.
 15 | .R1. and .R2.
 16 | .R1_ and .R2_
 17 | _1_ and _2_
 18 | _1. and _2.
 19 | .1. and .2.
 20 | .1_ and .2_
 21 | ```
 22 | 
 23 | For example, your read files can be
 24 | 
 25 | ```
 26 | sample1_R1.fastq.gz
 27 | sample1_R2.fastq.gz
 28 | sample2_R1.fastq.gz
 29 | sample2_R2.fastq.gz
 30 | ...
 31 | ```
 32 | 
 33 | or
 34 | 
 35 | ```
 36 | sample1_1.fq.gz
 37 | sample1_2.fq.gz
 38 | sample2_1.fq.gz
 39 | sample2_2.fq.gz
 40 | ...
 41 | ```
 42 | 
 43 | ## Long read assemblies
 44 | 
 45 | If you are using long read datasets, there is no specific naming format for the read files.
 46 | 
 47 | ## Assemble the samples
 48 | 
 49 | Phables requires the assembly graph file in **Graphical Fragment Assembly (GFA)** format. You can use any assembler that produces the assembly graph in GFA format to assemble your samples OR you can convert a FASTG file to GFA format.
 50 | 
 51 | If you have multiple samples you can pool together reads and do a co-assembly.
 52 | 
 53 | ## Recommended assemblers and tools
 54 | 
 55 | ### MEGAHIT
 56 | 
 57 | You can use [MEGAHIT](https://github.com/voutcn/megahit) to assemble your paired-end short read data.
 58 | 
 59 | ```bash
 60 | megahit -1 reads_1.fastq -2 reads_2.fastq -o megahit_out
 61 | ```
 62 | 
 63 | By default, MEGAHIT does not produce an assembly graph file. You have to run the `contig2fastg` command from the MEGAHIT toolkit to build the assembly graph file. `contig2fastg` requires you to input the k-mer size used for the assembly. You can get the k-mer size from the contig IDs in the `final.contigs.fa` file. For example, you can use the `grep` command to print out the contig IDs as follows.
 64 | 
 65 | ```bash
 66 | grep "^>" final.contigs.fa
 67 | ```
 68 | 
 69 | Imagine you get the output as follows. Here the k-mer size is 141 as denoted by `k141`.
 70 | 
 71 | ```bash
 72 | >k141_1456397 flag=0 multi=11.7570 len=1137
 73 | >k141_1235266 flag=0 multi=13.6963 len=1254
 74 | >k141_131192 flag=1 multi=47.8430 len=1510
 75 | >k141_1566081 flag=0 multi=9.6645 len=1372
 76 | ...
 77 | ```
 78 | 
 79 | Using the `k` value as 141, now you can run the `contig2fastg` command as follows.
 80 | 
 81 | ```bash
 82 | megahit_toolkit contig2fastg 141 final.contigs.fa > final.graph.fastg
 83 | ```
 84 | 
 85 | The MEGAHIT toolkit will result in a FASTG file which you can convert to GFA using [fastg2gfa](https://github.com/lh3/gfa1/blob/master/misc/fastg2gfa.c).
 86 | 
 87 | ```bash
 88 | fastg2gfa final.graph.fastg > final.graph.gfa
 89 | ```
 90 | 
 91 | If you want to run Phables on an assembly from a different `k` value found in the MEGAHIT output folder `intermediate_contigs`, please make sure to build the `.fastg` file from the `.fa` file with the corresponding `k` value. For example, if you want to run Phables on the contigs from `k99.contigs.fa`, you should first build the corresponding `k99.graph.fastg` file and then run `fastg2gfa` as follows.
 92 | 
 93 | ```bash
 94 | megahit_toolkit contig2fastg 99 k99.contigs.fa > k99.graph.fastg
 95 | fastg2gfa k99.graph.fastg > k99.graph.gfa
 96 | ```
 97 | 
 98 | ### metaSPAdes
 99 | 
100 | You can use [metaSPAdes](https://github.com/ablab/spades) to assemble your paired-end short read data. 
101 | 
102 | ```bash
103 | spades.py --meta -1 reads_1.fastq -2 reads_2.fastq -o metaspades_output -t 16
104 | ```
105 | 
106 | After the assembly has finished, the output will contain the assembly graph file as `assembly_graph_after_simplification.gfa`.
107 | 
108 | ### metaFlye
109 | 
110 | You can use [metaFlye](https://github.com/fenderglass/Flye) to assemble your long read data.
111 | 
112 | ```bash
113 | flye --meta --nano-raw reads.fasta --out-dir metaflye_output --threads 16
114 | ```
115 | 
116 | After the assembly has finished, the output will contain the assembly graph file as `assembly_graph.gfa`.
117 | 
118 | ### Hecatomb
119 | 
120 | You can use [Hecatomb](https://github.com/shandley/hecatomb) which is a viral analysis pipeline to obtain a pooled assembly of your short read or long read data contained in a folder named `reads`. You can run hecatomb as follows. Note that you only need to run the assembly module to process your data for Phables.
121 | 
122 | ```bash
123 | hecatomb run --reads reads/ assembly 
124 | ```
125 | 
126 | After the assembly has finished, the output will contain the assembly graph file as `cross_assembly.gfa`.
127 | 
128 | Now we are ready to run Phables.


--------------------------------------------------------------------------------
/docs/citation.md:
--------------------------------------------------------------------------------
 1 | # Phables Citation
 2 | 
 3 | Phables is published in [Bioinformatics](https://academic.oup.com/bioinformatics) at DOI: [10.1093/bioinformatics/btad586](https://doi.org/10.1093/bioinformatics/btad586). 
 4 | 
 5 | If you use Phables in your work, please cite Phables as,
 6 | 
 7 | > Vijini Mallawaarachchi, Michael J Roach, Przemyslaw Decewicz, Bhavya Papudeshi, Sarah K Giles, Susanna R Grigson, George Bouras, Ryan D Hesse, Laura K Inglis, Abbey L K Hutton, Elizabeth A Dinsdale, Robert A Edwards, Phables: from fragmented assemblies to high-quality bacteriophage genomes, Bioinformatics, Volume 39, Issue 10, October 2023, btad586, https://doi.org/10.1093/bioinformatics/btad586
 8 | 
 9 | ```bibtex
10 | @article{10.1093/bioinformatics/btad586,
11 |     author = {Mallawaarachchi, Vijini and Roach, Michael J and Decewicz, Przemyslaw and Papudeshi, Bhavya and Giles, Sarah K and Grigson, Susanna R and Bouras, George and Hesse, Ryan D and Inglis, Laura K and Hutton, Abbey L K and Dinsdale, Elizabeth A and Edwards, Robert A},
12 |     title = "{Phables: from fragmented assemblies to high-quality bacteriophage genomes}",
13 |     journal = {Bioinformatics},
14 |     volume = {39},
15 |     number = {10},
16 |     pages = {btad586},
17 |     year = {2023},
18 |     month = {09},
19 |     abstract = "{Microbial communities have a profound impact on both human health and various environments. Viruses infecting bacteria, known as bacteriophages or phages, play a key role in modulating bacterial communities within environments. High-quality phage genome sequences are essential for advancing our understanding of phage biology, enabling comparative genomics studies and developing phage-based diagnostic tools. Most available viral identification tools consider individual sequences to determine whether they are of viral origin. As a result of challenges in viral assembly, fragmentation of genomes can occur, and existing tools may recover incomplete genome fragments. Therefore, the identification and characterization of novel phage genomes remain a challenge, leading to the need of improved approaches for phage genome recovery. We introduce Phables, a new computational method to resolve phage genomes from fragmented viral metagenome assemblies. Phables identifies phage-like components in the assembly graph, models each component as a flow network, and uses graph algorithms and flow decomposition techniques to identify genomic paths. Experimental results of viral metagenomic samples obtained from different environments show that Phables recovers on average over 49\% more high-quality phage genomes compared to existing viral identification tools. Furthermore, Phables can resolve variant phage genomes with over 99\% average nucleotide identity, a distinction that existing tools are unable to make. Phables is available on GitHub at https://github.com/Vini2/phables.}",
20 |     issn = {1367-4811},
21 |     doi = {10.1093/bioinformatics/btad586},
22 |     url = {https://doi.org/10.1093/bioinformatics/btad586},
23 |     eprint = {https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btad586/51972145/btad586.pdf},
24 | }
25 | ```
26 | 
27 | Also, please cite the following tools/databases used by Phables.
28 | 
29 | * Roach MJ, Pierce-Ward NT, Suchecki R, Mallawaarachchi V, Papudeshi B, et al. Ten simple rules and a template for creating workflows-as-applications. PLOS Computational Biology 18(12) (2022): e1010705. [https://doi.org/10.1371/journal.pcbi.1010705](https://doi.org/10.1371/journal.pcbi.1010705)
30 | * Terzian P, Olo Ndela E, Galiez C, Lossouarn J, Pérez Bucio RE, Mom R, Toussaint A, Petit MA, Enault F. PHROG: families of prokaryotic virus proteins clustered using remote homology. NAR Genomics and Bioinformatics, Volume 3, Issue 3, lqab067 (2021). [https://doi.org/10.1093/nargab/lqab067](https://doi.org/10.1093/nargab/lqab067)
31 | * Steinegger M, Söding J. MMseqs2 enables sensitive protein sequence searching for the analysis of massive data sets. Nat Biotechnol 35, 1026–1028 (2017). [https://doi.org/10.1038/nbt.3988](https://doi.org/10.1038/nbt.3988)
32 | * Li H. Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34:3094-3100 (2018). [https://doi.org/10.1093/bioinformatics/bty191](https://doi.org/10.1093/bioinformatics/bty191)
33 | * Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools, Bioinformatics, Volume 25, Issue 16, Pages 2078–2079 (2009). [https://doi.org/10.1093/bioinformatics/btp352](https://doi.org/10.1093/bioinformatics/btp352)
34 | * Woodcroft BJ, Newell R, CoverM: Read coverage calculator for metagenomics (2017). [https://github.com/wwood/CoverM](https://github.com/wwood/CoverM)
35 | * Roach, M. J., Hart, B. J., Beecroft, S. J., Papudeshi, B., Inglis, L. K., Grigson, S. R., Mallawaarachchi, V., Bouras, G., & Edwards, R. A. Koverage: Read-coverage analysis for massive (meta)genomics datasets. Journal of Open Source Software, 9(94), 6235, (2024). [https://doi.org/10.21105/joss.06235](https://doi.org/10.21105/joss.06235)
36 | * Hagberg AA, Schult DA, and Swart PJ. Exploring network structure, dynamics, and function using NetworkX. In Proceedings of the 7th Python in Science Conference (SciPy2008), Gäel Varoquaux, Travis Vaught, and Jarrod Millman (Eds), (Pasadena, CA USA), pp. 11–15 (2008).
37 | * Gurobi Optimization. [https://www.gurobi.com/](https://www.gurobi.com/).
38 | 


--------------------------------------------------------------------------------
/docs/comparison.md:
--------------------------------------------------------------------------------
 1 | # Comparing the viral quality of resolved genomes and their constituent unitigs
 2 | 
 3 | You can combine the resolved genomes (`resolved_paths.fasta`) and their constituent unitigs (`resolved_edges.fasta`), and compare the viral quality.
 4 | 
 5 | ## Run CheckV
 6 | 
 7 | You can combine the resolved genome sequences and unitig sequences and run CheckV as follows.
 8 | 
 9 | ```bash
10 | # Combine resolved_paths.fasta and resolved_edges.fasta
11 | cat resolved_paths.fasta resolved_edges.fasta > all_sequences.fasta
12 | 
13 | # Run CheckV
14 | checkv end_to_end all_sequences.fasta checkv_result
15 | ```
16 | 
17 | Now you can compare and visualise the quality of the resolved genomes and their constituent unitigs. The following example code shows how to visualise the results using Python.
18 | 
19 | ## Importing Python packages
20 | 
21 | Assuming you have installed Python and the packages `matplotlib`, `pandas` and `seaborn`, let's import the following.
22 | 
23 | ```python
24 | import pandas as pd
25 | import seaborn as sns
26 | import matplotlib.pyplot as plt
27 | ```
28 | 
29 | ## Load the data
30 | 
31 | Now we will load the `quality_summary.tsv` file into a dataframe called `checkv_res`.
32 | 
33 | ```python
34 | # Load the quality_summary.tsv from the CheckV results
35 | checkv_res = pd.read_csv("checkv_result/quality_summary.tsv", delimiter="\t", header=0)
36 | ```
37 | 
38 | ## Format the data
39 | 
40 | Now we will convert the sequence lengths into kilobases by dividing the lengths by 1000.
41 | 
42 | ```python
43 | # Format the genome length to kb
44 | checkv_res['contig_length'] = checkv_res['contig_length'].div(1000)
45 | ```
46 | 
47 | Then we will add a new column to our dataframe called `Sequence type` to denote whether the sequence is a resolved genome or a unitig.
48 | 
49 | ```python
50 | # Add a new column as "Sequence type"
51 | seq_type = []
52 | 
53 | for index, row in checkv_res.iterrows():
54 |     if row['contig_id'].startswith("phage"):
55 |         seq_type.append("Resolved genomes")
56 |     else:
57 |         seq_type.append("Individual unitigs")
58 | 
59 | checkv_res.insert(2, "Sequence type", seq_type, True)
60 | ```
61 | 
62 | ## Plot the data
63 | 
64 | Now we can plot the viral quality (`Complete`, `High-quality`, `Medium-quality` or `Low-quality`) of the resolved genomes and their constituent unitigs using boxen plots and then save the figure as follows.
65 | 
66 | ```python
67 | # Set the order of viral quality
68 | myorder=["Complete", "High-quality", "Medium-quality", "Low-quality"]
69 | 
70 | # Plot using catplot
71 | ax = sns.catplot(y="checkv_quality", x="contig_length", hue="Sequence type", kind="boxen", data=checkv_res, height=5, aspect=1.5, order=myorder, showfliers=False)
72 | 
73 | # Set axis titles
74 | ax.set(xlabel='Viral genome length (kbp)', ylabel='CheckV quality')
75 | 
76 | # Save figure
77 | plt.savefig("checkv_qual_boxen.pdf", dpi=300, bbox_inches='tight', format='pdf') 
78 | ```
79 | 
80 | ![](images/qual_resolved_genome_unitig_boxen.png)
81 | 
82 | 
83 | You can change the `kind` of the plot as you wish. For example, you can draw a violin plot by changing `kind="violin"` as follows.
84 | 
85 | ```python
86 | ax = sns.catplot(y="checkv_quality", x="contig_length", hue="Sequence type", kind="violin", data=checkv_res, height=5, aspect=1.5, order=myorder)
87 | ```
88 | 
89 | ![](images/qual_resolved_genome_unitig_violin.png)


--------------------------------------------------------------------------------
/docs/faq.md:
--------------------------------------------------------------------------------
  1 | # Frequently Asked Questions
  2 | 
  3 | ## General FAQs
  4 | 
  5 | ### Q1: Where can I get help with issues?
  6 | 
  7 | If you come across any issues while using Phables, you can open an [issue on GitHub](https://github.com/Vini2/phables/issues) and we will look into it. Phables is still under development and testing, so we expect that there will still be bugs and unhandled exceptions in the code. 
  8 | 
  9 | ### Q2: Can I use the assembly graph from any assembler?
 10 | 
 11 | Phables supports any assembly graph in GFA (`.gfa`) format. You can use any assembler that produces the assembly graph in GFA format to assemble your samples OR you can convert an assembly graph in FASTG format to GFA format using a tool such as [fastg2gfa](https://github.com/lh3/gfa1/blob/master/misc/fastg2gfa.c).
 12 | 
 13 | If you use metaSPAdes for assembly, you can use the `assembly_graph_after_simplification.gfa` as input for Phables.
 14 | 
 15 | ### Q4: What can I do after running Phables?
 16 | 
 17 | Once you have run Phables, check out the [EVALUATION](https://phables.readthedocs.io/en/latest/quality/) section where you can read on how to check and compare the quality of the resolved genomes, interpret graph statistics and visualise the results.
 18 | 
 19 | ### Q5: How can I find out which contigs were included in the resolved phages?
 20 | 
 21 | The `resolved_genome_info.txt` file contains the order of sequences in the assembly graph that were used to construct the genomes (refer to the example below). 
 22 | 
 23 | | Path                 | Case  | Coverage | Length | GC content        | Node order                                                                   |
 24 | |----------------------|-------|----------|--------|-------------------|------------------------------------------------------------------------------|
 25 | | phage_comp_0_cycle_1 | case3 | 644      | 45659  | 34.86059703453864 | ['49-', '5524+', '24979-', '5556+', '55+', '5540-', '67+', '4490+', '5554-'] |
 26 | | phage_comp_0_cycle_2 | case3 | 625      | 43427  | 35.03810993160937 | ['49-', '5522+', '24979-', '5558+', '55+', '65+', '67+', '4490+', '5498-']   |
 27 | | ...                  |       |          |        |                   |                                                                              |
 28 | 
 29 | The `Node order` column denotes the segment IDs from the assembly graph.
 30 | 
 31 | The mapping between the contigs and assembly graph segments depends on the assembler you use. 
 32 | 
 33 | * If you use MEGAHIT, the segments in the assembly graph are the contigs themselves. You can directly relate the `Node order` information as the contigs that make the paths.
 34 | * If you use an assembler such as SPAdes or Flye, the sequences represented in the assembly graph are **unitigs**, which make up contigs. The information on which unitigs make up the contigs can be found in, for example, `contigs.paths` file in SPAdes and `assembly_info.txt` file in Flye.
 35 | 
 36 | ### Q6: Can I run Phables on mixed-microbial communities?
 37 | 
 38 | Phables was originally designed to run on viromic data, but it can also be used to study mixed-microbial communities. However, the current implementation of Phables filters out any component with at least a single unitig encoding any bacterial single-copy marker gene and hence, prophages might be omitted in the final result. Also, some plasmids or [phage-plasmids](https://doi.org/10.1128/mbio.01851-22) can be identified by Phables as phages. Hence, users should perform further downstream analysis to ensure that the predicted genomes are actual phages. One option is to use a tool such as [PPR-Meta](https://github.com/zhenchengfang/PPR-Meta) to classify the genomes resolved from Phables into phages and plasmids.
 39 | 
 40 | ### Q7: Can Phables identify prophages?
 41 | 
 42 | If a prophage is active, excises from the genome, and is replicating, Phables would identify it. However, if it is a cryptic prophage, Phables would not identify it as it will be integrated into the host genome and can be part of a larger bacterial component in the assembly graph. As Phables discards components having bacterial single-copy marker genes, such prophages will not be identified. 
 43 | 
 44 | Users can use dedicated tools such as [PhiSpy](https://academic.oup.com/nar/article/40/16/e126/1027055) or [hafeZ](https://www.biorxiv.org/content/10.1101/2021.07.21.453177v1) to identify prophages in bacterial genomes, or a tool such as [CheckV](https://www.nature.com/articles/s41587-020-00774-7) to validate prophage sequences recovered from host genomes in metagenomic data.
 45 | 
 46 | 
 47 | ## Gurobi FAQs
 48 | 
 49 | ### Q1: Gurobi installation conflicts and `grbgetkey` fails to run
 50 | 
 51 | If you come across conflicts when installing Gurobi in the `phables` environment and could not run the `grbgetkey` command properly, please follow the steps given below.
 52 | 
 53 | ```bash
 54 | # Deactivate the phables environment
 55 | conda deactivate
 56 | 
 57 | # Remove phables environment
 58 | conda remove -n phables --all
 59 | 
 60 | # Create conda environment with phables and gurobi
 61 | conda create -n phables -c conda-forge -c anaconda -c bioconda -c gurobi phables gurobi
 62 | ```
 63 | 
 64 | ### Q2: Model too large for size-limited license
 65 | 
 66 | If you get the following error when running Phables, this means that you don't have a proper license to handle large models. 
 67 | 
 68 | ```bash
 69 | Error code 10010: Model too large for size-limited license; visit https://www.gurobi.com/free-trial for a full license
 70 | ```
 71 | 
 72 | You should get an academic license which is provided free of charge to your institutional email address. You can refer to further instructions at [https://www.gurobi.com/academia/academic-program-and-licenses/](https://www.gurobi.com/academia/academic-program-and-licenses/).
 73 | 
 74 | ### Q3: HostID mismatch
 75 | 
 76 | If you get the following error when running Phables as a job on a cluster, you cannot use your academic license which is a file-based host-locked license, meaning that you can only run Gurobi on the machine that the license was obtained for.
 77 | 
 78 | ```bash
 79 | Failed to set up a license
 80 | Error 10009: HostID mismatch (licensed to <host_1>, hostid is <host_2>)
 81 | ```
 82 | 
 83 | You will have to contact your system admin and setup a floating network license. You can find more details at [https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-](https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-).
 84 | 
 85 | ### Q4: License not valid for Gurobi version x
 86 | 
 87 | If you get the following error when running Phables, this means that the version in your license does not match the installed version. You can install the correct version of Gurobi to match your license or you can get a new license for the latest version installed.
 88 | 
 89 | ```bash
 90 | ERROR - Error code 10009: Request denied: license not valid for Gurobi version 11
 91 | ```
 92 | 
 93 | ### Q5: How can I get a Gurobi license for a cluster?
 94 | 
 95 | If you want to run Phables on a cluster, your cluster should have a [floating network license](https://en.wikipedia.org/wiki/Floating_licensing) for Gurobi for the `run` subcommand to execute properly.
 96 | 
 97 | **Gurobi license for a cluster:** You will have to contact your system admin and setup a floating network license for the cluster. You can find more details at [https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-](https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-).
 98 | 
 99 | If your cluster has Gurobi already installed with the license setup, you can load the module as follows, prior to running Phables.
100 | 
101 | ```bash
102 | module load gurobi
103 | ```


--------------------------------------------------------------------------------
/docs/graph_stats.md:
--------------------------------------------------------------------------------
  1 | # Graph Statistics
  2 | 
  3 | Phables outputs a file named `resolved_component_info.txt` that contains the following information of the phage bubbles resolved.
  4 | 
  5 | * Number of nodes
  6 | * Number of paths resolved
  7 | * Fraction of unitigs recovered in the paths
  8 | * Maximum degree of the graph
  9 | * Minimum degree of the graph
 10 | * Maximum in degree of the graph
 11 | * Maximum out degree of the graph
 12 | * Average degree of the graph
 13 | * Average in degree of the graph
 14 | * Average out degree of the graph
 15 | * Density of the graph
 16 | * Maximum path length: length of the longest path
 17 | * Minimum path length: length of the shortest path
 18 | * Length ratio (long/short): (Maximum path length / Minimum path length)
 19 | * Maximum coverage path length: length of the path with the highest coverage
 20 | * Minimum coverage path length: length of the path with the lowest coverage
 21 | * Length ratio (highest cov/lowest cov): (Maximum coverage path length / Minimum coverage path length)
 22 | * Maximum coverage
 23 | * Minimum coverage
 24 | * Coverage ratio (highest/lowest): (Maximum coverage / Minimum coverage)
 25 | 
 26 | You can compare and visualise the graph statistics of the resolved components using this information. The following example code shows how to visualise the results using Python.
 27 | 
 28 | ## Importing Python packages
 29 | 
 30 | Assuming you have installed Python and the packages `matplotlib`, `pandas` and `seaborn`, let's import the following.
 31 | 
 32 | ```python
 33 | import pandas as pd
 34 | import seaborn as sns
 35 | import matplotlib.pyplot as plt
 36 | ```
 37 | 
 38 | ## Load the data
 39 | 
 40 | Now we will load the `resolved_component_info.txt` file into a dataframe called `component_stats`.
 41 | 
 42 | ```python
 43 | # Load the resolved_component_info.txt from Phables results
 44 | component_stats = pd.read_csv("resolved_component_info.txt", delimiter="\t", header=0)
 45 | ```
 46 | 
 47 | You can list the columns using `component_stats.columns`. The following columns will be listed.
 48 | 
 49 | ```python
 50 | Index(['Component', 'Number of nodes', 'Number of paths',
 51 |        'Fraction of unitigs recovered', 'Maximum degree', 'Maximum in degree',
 52 |        'Maximum out degree', 'Average degree', 'Average in degree',
 53 |        'Average out degree', 'Density', 'Maximum path length',
 54 |        'Minimum path length', 'Length ratio (long/short)',
 55 |        'Maximum coverage path length', 'Minimum coverage path length',
 56 |        'Length ratio (highest cov/lowest cov)', 'Maximum coverage',
 57 |        'Minimum coverage', 'Coverage ratio (highest/lowest)'],
 58 |       dtype='object')
 59 | ```
 60 | 
 61 | ## Plot histograms
 62 | 
 63 | You can plot histograms of the different columns. The following code plots a histogram of the `Number of nodes` column.
 64 | 
 65 | ```python
 66 | # Get the column
 67 | df = component_stats["Number of nodes"]
 68 | 
 69 | # Plot the histogram
 70 | ax = df.plot.hist(bins=100, alpha=0.5, figsize=(12, 8))
 71 | 
 72 | # Set axis titles
 73 | ax.set(xlabel='Number of nodes', ylabel='Frequency')
 74 | 
 75 | # Save figure
 76 | plt.savefig("histogram_n_nodes.png", format='png', dpi=300, bbox_inches='tight')
 77 | ```
 78 | 
 79 | ![](images/histogram_n_nodes.png)
 80 | 
 81 | ## Plot heatmaps
 82 | 
 83 | You can plot heatmaps for correlations of all the graph statistics as follows.
 84 | 
 85 | ```python
 86 | # Use Pearson correlation on the numeric columns only
 87 | df_cor = component_stats.corr(method='pearson', numeric_only=True)
 88 | 
 89 | # Plot heatmap
 90 | sns.heatmap(df_cor, cmap="Blues")
 91 | 
 92 | # Save figure
 93 | plt.savefig("pearson_heatmap.png", format='png', dpi=300, bbox_inches='tight')
 94 | ```
 95 | 
 96 | ![](images/pearson_heatmap.png)
 97 | 
 98 | ## Plot hierarchically-clustered heatmaps
 99 | 
100 | As the heatmap above looks a bit messy and hard to interpret, we can clean it up by clustering so we can observe some patterns. For this we can use the `clustermap` function from seaborn which produces a hierarchically-clustered heatmap.
101 | 
102 | ```python
103 | # Plot the hierarchically-clustered heatmap
104 | pearson_clustermap = sns.clustermap(df_cor, cmap="Blues", method="ward")
105 | 
106 | # Save figure
107 | pearson_clustermap.savefig("pearson_clustermap.png", format='png', dpi=300, bbox_inches='tight')
108 | ```
109 | 
110 | ![](images/pearson_clustermap.png)


--------------------------------------------------------------------------------
/docs/images/Phables_workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/Phables_workflow.png


--------------------------------------------------------------------------------
/docs/images/components.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/components.png


--------------------------------------------------------------------------------
/docs/images/histogram_n_nodes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/histogram_n_nodes.png


--------------------------------------------------------------------------------
/docs/images/pearson_clustermap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/pearson_clustermap.png


--------------------------------------------------------------------------------
/docs/images/pearson_heatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/pearson_heatmap.png


--------------------------------------------------------------------------------
/docs/images/phables_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/phables_logo.png


--------------------------------------------------------------------------------
/docs/images/phage_comp_280_cycle_1_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/phage_comp_280_cycle_1_plot.png


--------------------------------------------------------------------------------
/docs/images/qual_resolved_genome_unitig_boxen.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/qual_resolved_genome_unitig_boxen.png


--------------------------------------------------------------------------------
/docs/images/qual_resolved_genome_unitig_violin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/docs/images/qual_resolved_genome_unitig_violin.png


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | ![](images/phables_logo.png)
 2 | 
 3 | # Phables: from fragmented assemblies to high-quality bacteriophage genomes
 4 | 
 5 | Phables is a tool developed to resolve bacteriophage genomes using phage bubbles in viral metagenomic data. 
 6 | It models phage-like components in a viral metagenomic assembly graph as flow networks, formulates path recovery as a 
 7 | minimum flow decomposition problem and resolves genomic paths corresponding to the flow paths determined. 
 8 | Phables uses the [Minimum Flow Decomposition via  Integer Linear 
 9 | Programming](https://github.com/algbio/MFD-ILP) implementation to obtain the flow paths.
10 | 
11 | ## Motivation
12 | 
13 | Existing viral identification tools run contigs through a pre-trained model and predict whether or not they are of viral origin. However, contigs do not necessarily represent complete genomes as viral assemblies are not always perfect. Most of the existing metagenomic binning tools are optimised for bacterial metagenomes and cannot handle viral metagenomes efficiently.
14 | 
15 | We observed circular and linear components in viral metagenome assembly graphs as shown below (visualisations obtained from [Bandage](https://rrwick.github.io/Bandage/)), suggesting that viral genomes are fragmented and variant genomes exist.
16 | 
17 | ![](images/components.png)
18 | 
19 | Phables was developed to recover phage-like components called "phage bubbles" that represent one or more bacteriophage genomes and resolve phage bubbles to obtain complete and high-quality genomes.
20 | 
21 | ## Workflow
22 | 
23 | Phables is powered by [Snaketool](https://github.com/beardymcjohnface/Snaketool) which packs in all the setup, testing, preprocessing and running steps into an easy-to-use pipeline. 
24 | 
25 | The following diagram shows an overview of Phables.
26 | 
27 | ![](images/Phables_workflow.png)
28 | 
29 | 


--------------------------------------------------------------------------------
/docs/install.md:
--------------------------------------------------------------------------------
  1 | # Setting up Phables
  2 | 
  3 | Phables is available on bioconda at [https://anaconda.org/bioconda/phables](https://anaconda.org/bioconda/phables) and on PyPI at [https://pypi.org/project/phables/](https://pypi.org/project/phables/). Feel free to pick your package manager, but we recommend that you use [`conda`](https://docs.conda.io/en/latest/).
  4 | 
  5 | ### Option 1: Installing Phables using conda (recommended)
  6 | 
  7 | You can install Phables from bioconda at [https://anaconda.org/bioconda/phables](https://anaconda.org/bioconda/phables). Make sure you have [`conda`](https://docs.conda.io/en/latest/) installed.
  8 | 
  9 | ```bash
 10 | # create conda environment and install phables
 11 | conda create -n phables -c conda-forge -c anaconda -c bioconda phables
 12 | 
 13 | # activate environment
 14 | conda activate phables
 15 | ```
 16 | 
 17 | Now you can go to [Setting up Gurobi](#setting-up-gurobi) to configure Gurobi.
 18 | 
 19 | ### Option 2: Installing Phables using pip
 20 | 
 21 | You can install Phables from PyPI at [https://pypi.org/project/phables/](https://pypi.org/project/phables/). Make sure you have [`pip`](https://pip.pypa.io/en/stable/) and [`mamba`](https://mamba.readthedocs.io/en/latest/index.html) installed.
 22 | 
 23 | ```bash
 24 | pip install phables
 25 | ```
 26 | 
 27 | Now you can go to [Setting up Gurobi](#setting-up-gurobi) to configure Gurobi.
 28 | 
 29 | ## Setting up Gurobi
 30 | 
 31 | The MFD implementation uses the linear programming solver [Gurobi](https://www.gurobi.com/). We chose Gurobi over open source solvers as Gurobi is fast and can solve large models (check the performance comparison at [https://www.gurobi.com/resources/open-source-linear-and-mixed-integer-programming-software-and-solvers/](https://www.gurobi.com/resources/open-source-linear-and-mixed-integer-programming-software-and-solvers/)).
 32 | 
 33 | The `phables` conda environment and pip setup do not include Gurobi. You have to install Gurobi using one of the following commands depending on your package manager.
 34 | 
 35 | ```bash
 36 | # conda
 37 | conda install -c gurobi gurobi
 38 | 
 39 | # pip
 40 | pip install gurobipy
 41 | ```
 42 | 
 43 | To handle large models without any model size limitations, once you have installed Gurobi, you have to activate the (academic) license and add the key using the following command. You only have to do this once.
 44 | 
 45 | ```bash
 46 | grbgetkey <KEY>
 47 | ```
 48 | 
 49 | You can refer to further instructions at [https://www.gurobi.com/academia/academic-program-and-licenses/](https://www.gurobi.com/academia/academic-program-and-licenses/). Please note that this academic license is a file-based host-locked license, meaning that you can only run Gurobi on the machine that the license was obtained for. If you want to run on a cluster, you will have to contact your system admin and set up a floating network license. You can find more details at [https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-](https://support.gurobi.com/hc/en-us/articles/360013195412-How-do-I-obtain-a-free-academic-license-for-a-cluster-or-a-shared-computer-lab-).
 50 | 
 51 | ## Test the installation
 52 | 
 53 | After setting up, run `phables --help` to print out the Phables help message.
 54 | 
 55 | ```bash
 56 | Usage: phables [OPTIONS] COMMAND [ARGS]...
 57 | 
 58 |   Phables: from fragmented assemblies to high-quality bacteriophage genomes.
 59 |   Please refer the full documentation available on Read the Docs at
 60 |   https://phables.readthedocs.io/
 61 | 
 62 | Options:
 63 |   -v, --version  Show the version and exit.
 64 |   -h, --help     Show this message and exit.
 65 | 
 66 | Commands:
 67 |   run       Run Phables
 68 |   install   Install databases
 69 |   test      Test Phables
 70 |   config    Copy the system default config file
 71 |   citation  Print the citation(s) for this tool
 72 | ```
 73 | 
 74 | ## Setup the databases
 75 | 
 76 | Now run the following command to download and set up the required databases.
 77 | 
 78 | ```bash
 79 | phables install
 80 | ```
 81 | 
 82 | ## Run on the test data
 83 | 
 84 | Then run the following command to launch the test run and ensure that Phables is working.
 85 | 
 86 | ```bash
 87 | phables test
 88 | ```
 89 | 
 90 | If the test run completes without any issues, we are good to go.
 91 | 
 92 | ## Build the docs
 93 | 
 94 | Optionally, the complete documentation of Phables including these pages can be built using [MkDocs](https://www.mkdocs.org/) as follows.
 95 | 
 96 | ```bash
 97 | # install mkdocs
 98 | pip install mkdocs
 99 | 
100 | # go to your installation directory
101 | cd /path/to/phables
102 | 
103 | # build
104 | mkdocs build
105 | ```


--------------------------------------------------------------------------------
/docs/quality.md:
--------------------------------------------------------------------------------
 1 | # Checking the quality of resolved genomes
 2 | 
 3 | The sequences of the resolved genomic paths can be found in `resolved_paths.fasta`. Each entry in this FASTA file is a resolved genome (not a contig) and can be directly evaluated using a dedicated viral evaluation tool like [CheckV](https://bitbucket.org/berkeleylab/checkv/src/master/). The following sections will walk you through how to set up and run CheckV.
 4 | 
 5 | ## Installing CheckV
 6 | 
 7 | The recommended way to install CheckV is using [`conda`](https://docs.conda.io/en/latest/).
 8 | 
 9 | ```bash
10 | # Create a new conda environment and install checkv
11 | conda create -n checkv -c conda-forge -c bioconda checkv
12 | 
13 | # Activate checkv conda environment
14 | conda activate checkv
15 | ```
16 | 
17 | You can also install using [`pip`](https://pip.pypa.io/en/stable/).
18 | 
19 | ```bash
20 | pip install checkv
21 | ```
22 | 
23 | ## Download the CheckV database
24 | 
25 | ```bash
26 | checkv download_database ./
27 | ```
28 | 
29 | Now you need to specify the `CHECKVDB` location.
30 | 
31 | ```bash
32 | export CHECKVDB=/path/to/checkv-db
33 | ```
34 | 
35 | ## Running CheckV
36 | 
37 | Here is an example command to run CheckV on the resolved genomes.
38 | 
39 | ```bash
40 | checkv end_to_end resolved_paths.fasta checkv_resolved_paths -t 16
41 | ```
42 | 
43 | The `end_to_end` option will run the full pipeline.
44 | 
45 | You can also run individual commands for each step in the pipeline as follows.
46 | 
47 | ```bash
48 | checkv contamination resolved_paths.fasta checkv_resolved_paths -t 16
49 | checkv completeness resolved_paths.fasta checkv_resolved_paths -t 16
50 | checkv complete_genomes resolved_paths.fasta checkv_resolved_paths
51 | checkv quality_summary resolved_paths.fasta checkv_resolved_paths
52 | ```
53 | 
54 | ## CheckV outputs
55 | 
56 | CheckV will produce the following `.tsv` files.
57 | 
58 | * `complete_genomes.tsv` - overview of putative complete genomes identified
59 | * `completeness.tsv` - overview of how completeness was estimated
60 | * `contamination.tsv` - overview of how contamination was estimated
61 | * `quality_summary.tsv` - integrated quality results


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
 1 | jinja2>=3.1.3
 2 | mkdocs>=1.3.1
 3 | babel>=2.9.0
 4 | click>=7.0
 5 | Markdown>=3.2.1,<3.4
 6 | PyYAML>=5.2
 7 | watchdog>=2.0.0
 8 | mdx_gh_links>=0.2
 9 | ghp-import>=1.0
10 | pyyaml_env_tag>=0.1
11 | mkdocs-redirects>=1.0.1
12 | importlib_metadata>=4.3
13 | packaging>=20.5
14 | mergedeep>=1.3.4
15 | pygments>=2.12
16 | pymdown-extensions
17 | mkdocs-material
18 | 


--------------------------------------------------------------------------------
/docs/usage.md:
--------------------------------------------------------------------------------
  1 | # Phables Usage
  2 | 
  3 | Phables run options can be found using the `phables run -h` command.
  4 | 
  5 | ```
  6 | Usage: phables run [OPTIONS] [SNAKE_ARGS]...
  7 | 
  8 |   Run Phables
  9 | 
 10 | Options:
 11 |   --output PATH                 Output directory  [default: phables.out]
 12 |   --configfile TEXT             Custom config file [default:
 13 |                                 (outputDir)/config.yaml]
 14 |   --threads INTEGER             Number of threads to use  [default: 1]
 15 |   --use-conda / --no-use-conda  Use conda for Snakemake rules  [default: use-
 16 |                                 conda]
 17 |   --conda-prefix PATH           Custom conda env directory
 18 |   --profile TEXT                Snakemake profile
 19 |   --snake-default TEXT          Customise Snakemake runtime args  [default:
 20 |                                 --rerun-incomplete, --printshellcmds,
 21 |                                 --nolock, --show-failed-logs]
 22 |   --input PATH                  Path to assembly graph file in .GFA format
 23 |                                 [required]
 24 |   --reads PATH                  Path to directory containing paired-end reads
 25 |                                 [required]
 26 |   --minlength INTEGER           minimum length of circular unitigs to consider
 27 |                                 [default: 2000]
 28 |   --mincov INTEGER              minimum coverage of paths to output  [default:
 29 |                                 10]
 30 |   --compcount INTEGER           maximum unitig count to consider a component
 31 |                                 [default: 200]
 32 |   --maxpaths INTEGER            maximum number of paths to resolve for a
 33 |                                 component  [default: 10]
 34 |   --mgfrac FLOAT                length threshold to consider single copy
 35 |                                 marker genes  [default: 0.2]
 36 |   --evalue FLOAT                maximum e-value for phrog annotations
 37 |                                 [default: 1e-10]
 38 |   --seqidentity FLOAT           minimum sequence identity for phrog
 39 |                                 annotations  [default: 0.3]
 40 |   --covtol INTEGER              coverage tolerance for extending subpaths
 41 |                                 [default: 100]
 42 |   --alpha FLOAT                 coverage multiplier for flow interval
 43 |                                 modelling  [default: 1.2]
 44 |   --longreads                   provide long reads as input (else defaults to
 45 |                                 short reads)
 46 |   --prefix TEXT                 prefix for genome identifier
 47 |   -h, --help                    Show this message and exit.
 48 | 
 49 |   
 50 |   If you use Phables in your work, please cite Phables as,
 51 |   
 52 |   Vijini Mallawaarachchi, Michael J Roach, Przemyslaw Decewicz, 
 53 |   Bhavya Papudeshi, Sarah K Giles, Susanna R Grigson, George Bouras, 
 54 |   Ryan D Hesse, Laura K Inglis, Abbey L K Hutton, Elizabeth A Dinsdale, 
 55 |   Robert A Edwards, Phables: from fragmented assemblies to high-quality 
 56 |   bacteriophage genomes, Bioinformatics, Volume 39, Issue 10, 
 57 |   October 2023, btad586, https://doi.org/10.1093/bioinformatics/btad586
 58 |   
 59 |   
 60 |   For more information on Phables please visit:
 61 |   https://phables.readthedocs.io/
 62 |   
 63 |   
 64 |   CLUSTER EXECUTION:
 65 |   phables run ... --profile [profile]
 66 |   For information on Snakemake profiles see:
 67 |   https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles
 68 |   
 69 |   RUN EXAMPLES:
 70 |   Required:           phables run --input [assembly graph file]
 71 |   Specify threads:    phables run ... --threads [threads]
 72 |   Disable conda:      phables run ... --no-use-conda 
 73 |   Change defaults:    phables run ... --snake-default="-k --nolock"
 74 |   Add Snakemake args: phables run ... --dry-run --keep-going --touch
 75 |   Specify targets:    phables run ... print_stages
 76 |   Available targets:
 77 |       all             Run everything (default)
 78 |       preprocess      Run preprocessing only
 79 |       phables         Run phables (and preprocessing if needed)
 80 |       postprocess     Run postprocessing (with preprocessing and phables if needed)
 81 |       print_stages    List available stages
 82 | ```
 83 | 
 84 | ## Run options explained
 85 | 
 86 | * `--input` - assembly graph file in .GFA format
 87 | * `--reads` - folder containing paired-end read files
 88 | * `--minlength` - minimum length of circular unitigs to consider [default: 2000]
 89 | * `--mincov` - minimum coverage of paths to output [default: 10]
 90 | * `--compcount` - maximum unitig count to consider a component [default: 200]
 91 | * `--maxpaths` - maximum number of paths to resolve for a component [default: 10]
 92 | * `--mgfrac` - length threshold to consider single copy marker genes [default: 0.2]
 93 | * `--evalue` - maximum e-value for phrog annotations [default: 1e-10]
 94 | * `--seqidentity` - minimum sequence identity for phrog annotations [default: 0.3]
 95 | * `--covtol` - coverage tolerance for extending subpaths [default: 100]
 96 | * `--alpha` - coverage multiplier for flow interval modelling [default: 1.2]
 97 | * `--longreads` - provide long reads as input. If this flag is not provided phables defaults to short reads
 98 | * `--prefix` - prefix for genome identifier [default: None]
 99 | * `--output` - path to the output directory [default: `phables.out`]
100 | * `--configfile` - custom config file [default: `(outputDir)/config.yaml`]
101 | * `--threads` - number of threads to use  [default: 1]
102 | * `--use-conda` / `--no-use-conda` - use conda for Snakemake rules  [default: `use-conda`]
103 | * `--conda-prefix` - custom conda env directory
104 | * `--snake-default` - customise Snakemake runtime args  [default: `--rerun-incomplete, --printshellcmds, --nolock, --show-failed-logs`]
105 | 
106 | 
107 | ## Example usage
108 | 
109 | Assuming your assembly graph file is `assembly_graph.gfa` and your reads folder is `fastq`, you can run `phables` as follows.
110 | 
111 | ### Using short reads
112 | 
113 | ```bash
114 | # Run Phables using 8 threads (default is 1 thread)
115 | phables run --input assembly_graph.gfa --reads fastq --threads 8
116 | ```
117 | 
118 | ### Using long reads
119 | 
120 | ```bash
121 | # Run Phables with long reads using 8 threads (default is 1 thread)
122 | phables run --input assembly_graph.gfa --reads fastq --threads 8 --longreads
123 | ```
124 | 
125 | Note that you should provide the path to the GFA file to the `--input` parameter and the folder containing your sequencing reads to the `--reads` parameter. 
126 | 
127 | The output of Phables is set by default to `phables.out`. You can update the output path using the `--output` parameter for `phables run` as follows.
128 | 
129 | ```bash
130 | # Run Phables using 8 threads and write the results to a custom output folder
131 | phables run --input assembly_graph.gfa --reads fastq --output my_output_folder --threads 8
132 | ```
133 | 
134 | The `phables run` command will run the preprocessing steps, perform genome resolution and then perform the postprocessing steps.
135 | 
136 | ## Output
137 | 
138 | The following is the folder structure of a complete Phables run.
139 | 
140 | ```
141 | phables.out
142 | ├── config.yaml  # config file
143 | ├── logs         # all log files
144 | ├── phables      # final phables results
145 | ├── phables.log  # phables master log
146 | ├── postprocess  # postprocessing results
147 | └── preprocess   # preprocessing results
148 | ```
149 | 
150 | Phables will create 3 main folders `preprocess`, `phables` and `postprocess` for the different stages of execution.
151 | 
152 | ### 1. `preprocess` - preprocessing results
153 | 
154 | The following preprocessing steps will be run and their corresponding files and folders can be found in the `preprocess` folder.
155 | 
156 | * Obtain unitig sequences from assembly graph - `edges.fasta`
157 | * Map reads to unitig sequences and get BAM files - `temp/*.bam` and `temp/*.bai`
158 | * Calculate coverage of unitig sequences - `coverage.tsv`
159 | * Scan unitig sequences for single-copy marker genes - `edges.fasta.hmmout`
160 | * Scan unitig sequences for Prokaryotic Virus Remote Homologous Groups ([PHROGs](https://phrogs.lmge.uca.fr/)) - `phrogs_annotations.tsv`
161 | 
162 | ### 2. `phables` - genome resolution results
163 | 
164 | The following files and folders can be found inside the `phables` folder which are the main outputs of Phables.
165 | 
166 | * `resolved_paths.fasta` containing the resolved genomes
167 | * `resolved_phages` folder containing the resolved genomes in individual FASTA files
168 | * `resolved_genome_info.txt` containing the path name, coverage, length, GC content and unitig order of the resolved genomes
169 | * `resolved_edges.fasta` containing the unitigs that make up the resolved genomes
170 | * `unresolved_phage_like_edges.fasta` containing all the unresolved phage-like unitigs
171 | * `all_phage_like_edges.fasta` containing sequences from all the phage-like components (both resolved and unresolved)
172 | * `resolved_component_info.txt` containing the details of the phage bubbles resolved
173 | * `component_phrogs.txt` containing PHROGs found in each component
174 | 
175 | ### 3. `postprocess` - postprocessing results
176 | 
177 | The following postprocessing steps will be run and their corresponding files and folders can be found in the `postprocess` folder.
178 | 
179 | * Combine resolved genomes and unresolved edges - `genomes_and_unresolved_edges.fasta`
180 | * Obtain read counts for resolved genomes and unresolved edges - `sample_genome_read_counts.tsv`
181 | * Obtain mean coverage of resolved genomes and unresolved edges - `sample_genome_mean_coverage.tsv`
182 | * Obtain RPKM coverage of resolved genomes and unresolved edges - `sample_genome_rpkm.tsv`
183 | 
184 | 
185 | ## Step-wise usage
186 | 
187 | You can execute each of the preprocessing, phables and postprocessing steps individually if you wish to do so as follows.
188 | 
189 | ### Preprocessing only
190 | 
191 | You can use the following command to **only run the preprocessing steps**.
192 | 
193 | ```bash
194 | # Only preprocess data
195 | phables run --input assembly_graph.gfa --reads fastq --threads 8 preprocess
196 | ```
197 | 
198 | ### Genome resolution only
199 | 
200 | You can use the following command to **only run the genome resolution steps**. Please make sure to have the preprocessing results in the output folder.
201 | 
202 | ```bash
203 | # Only run phables core using short reads
204 | phables run --input assembly_graph.gfa --reads fastq --threads 8 phables
205 | 
206 | # Only run phables core using long reads
207 | phables run --input assembly_graph.gfa --reads fastq --threads 8 --longreads phables
208 | ```
209 | 
210 | ### Postprocessing only
211 | 
212 | You can use the following command to **only run the postprocessing steps**.
213 | 
214 | ```bash
215 | # Only run postprocessing
216 | phables run --input assembly_graph.gfa --reads fastq --threads 8 postprocess
217 | ```
218 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: Phables
 2 | site_url: "https://github.com/Vini2/phables"
 3 | site_author: "Vijini Mallawaarachchi"
 4 | repo_url: "https://github.com/Vini2/phables"
 5 | repo_name: 'GitHub'
 6 | theme:
 7 |     name: readthedocs
 8 |     highlightjs: true
 9 |     hljs_languages:
10 |         - yaml
11 |         - bash
12 |         - shell
13 |         - text
14 | nav:
15 |     - HOME:
16 |         - Introduction: index.md
17 |         - Citation: citation.md
18 |     - RUNNING: 
19 |         - Install: install.md
20 |         - Assemble: assemble.md
21 |         - Usage: usage.md
22 |         - FAQ: faq.md
23 |     - EVALUATION:
24 |         - Running CheckV: quality.md
25 |         - Quality comparison: comparison.md
26 |         - Annotation with pharokka: annotation.md
27 |         - Graph statistics: graph_stats.md
28 | 


--------------------------------------------------------------------------------
/phables/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/phables/__init__.py


--------------------------------------------------------------------------------
/phables/__main__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Entrypoint for phables
  3 | 
  4 | Check out the wiki for a detailed look at customising this file:
  5 | https://github.com/beardymcjohnface/Snaketool/wiki/Customising-your-Snaketool
  6 | """
  7 | 
  8 | import os
  9 | import click
 10 | 
 11 | from snaketool_utils.cli_utils import OrderedCommands, run_snakemake, copy_config, echo_click
 12 | 
 13 | 
 14 | def snake_base(rel_path):
 15 |     return os.path.join(os.path.dirname(os.path.realpath(__file__)), rel_path)
 16 | 
 17 | 
 18 | def get_version():
 19 |     with open(snake_base("phables.VERSION"), "r") as f:
 20 |         version = f.readline()
 21 |     return version
 22 | 
 23 | 
 24 | def print_citation():
 25 |     with open(snake_base("phables.CITATION"), "r") as f:
 26 |         for line in f:
 27 |             echo_click(line)
 28 | 
 29 | 
 30 | def default_to_output(ctx, param, value):
 31 |     """Callback for click options; places value in output directory unless specified"""
 32 |     if param.default == value:
 33 |         return os.path.join(ctx.params["output"], value)
 34 |     return value
 35 | 
 36 | 
 37 | def common_options(func):
 38 |     """Common command line args
 39 |     Define common command line args here, and include them with the @common_options decorator below.
 40 |     """
 41 |     options = [
 42 |         click.option(
 43 |             "--output",
 44 |             help="Output directory",
 45 |             type=click.Path(dir_okay=True, writable=True, readable=True),
 46 |             default="phables.out",
 47 |             show_default=True,
 48 |         ),
 49 |         click.option(
 50 |             "--configfile",
 51 |             default="config.yaml",
 52 |             show_default=False,
 53 |             callback=default_to_output,
 54 |             help="Custom config file [default: (outputDir)/config.yaml]",
 55 |         ),
 56 |         click.option(
 57 |             "--threads", help="Number of threads to use", default=1, show_default=True
 58 |         ),
 59 |         click.option(
 60 |             "--use-conda/--no-use-conda",
 61 |             default=True,
 62 |             help="Use conda for Snakemake rules",
 63 |             show_default=True,
 64 |         ),
 65 |         click.option(
 66 |             "--conda-prefix",
 67 |             default=snake_base(os.path.join("workflow", "conda")),
 68 |             help="Custom conda env directory",
 69 |             type=click.Path(),
 70 |             show_default=False,
 71 |         ),
 72 |         click.option(
 73 |             "--profile", help="Snakemake profile", default=None, show_default=False
 74 |         ),
 75 |         click.option(
 76 |             "--snake-default",
 77 |             multiple=True,
 78 |             default=[
 79 |                 "--rerun-incomplete",
 80 |                 "--printshellcmds",
 81 |                 "--nolock",
 82 |                 "--show-failed-logs",
 83 |             ],
 84 |             help="Customise Snakemake runtime args",
 85 |             show_default=True,
 86 |         ),
 87 |         click.option(
 88 |             "--log",
 89 |             default="phables.log",
 90 |             callback=default_to_output,
 91 |             hidden=True,
 92 |         ),
 93 |         click.option(
 94 |             "--system_config",
 95 |             default=snake_base(os.path.join("config", "config.yaml")),
 96 |             hidden=True,
 97 |             type=click.Path(),
 98 |         ),
 99 |         click.argument("snake_args", nargs=-1),
100 |     ]
101 |     for option in reversed(options):
102 |         func = option(func)
103 |     return func
104 | 
105 | 
def run_options(func):
    """Attach the options that are specific to the `run` subcommand.

    Wraps `func` with the input/reads options and the Phables tuning
    parameters declared below, then returns the wrapped callable.
    """

    def tunable(flag, kind, default, text):
        # Optional, typed parameter whose default is shown in the help output
        return click.option(
            flag,
            default=default,
            required=False,
            help=text,
            type=kind,
            show_default=True,
        )

    decorators = (
        # Mandatory inputs
        click.option(
            "--input",
            type=click.Path(),
            required=True,
            help="Path to assembly graph file in .GFA format",
        ),
        click.option(
            "--reads",
            type=click.Path(exists=True),
            required=True,
            help="Path to directory containing paired-end reads",
        ),
        # Tuning parameters
        tunable("--minlength", int, 2000, "minimum length of circular unitigs to consider"),
        tunable("--mincov", int, 10, "minimum coverage of paths to output"),
        tunable("--compcount", int, 200, "maximum unitig count to consider a component"),
        tunable("--maxpaths", int, 10, "maximum number of paths to resolve for a component"),
        tunable("--mgfrac", float, 0.2, "length threshold to consider single copy marker genes"),
        tunable("--evalue", float, 1e-10, "maximum e-value for phrog annotations"),
        tunable("--seqidentity", float, 0.3, "minimum sequence identity for phrog annotations"),
        tunable("--covtol", int, 100, "coverage tolerance for extending subpaths"),
        tunable("--alpha", float, 1.2, "coverage multiplier for flow interval modelling"),
        # Boolean switch, off unless given
        click.option(
            "--longreads",
            is_flag=True,
            default=False,
            required=False,
            show_default=True,
            help="provide long reads as input (else defaults to short reads)",
        ),
        click.option(
            "--prefix",
            type=str,
            required=False,
            help="prefix for genome identifier",
        ),
    )

    # Apply last-to-first so the first entry ends up as the outermost decorator
    wrapped = func
    for decorate in reversed(decorators):
        wrapped = decorate(wrapped)
    return wrapped
211 | 
212 | 
# Root command group. get_version() is called while this decorator stack is
# evaluated, i.e. when the module is imported.
@click.group(
    cls=OrderedCommands,
    context_settings={"help_option_names": ["-h", "--help"]},
)
@click.version_option(get_version(), "-v", "--version", is_flag=True)
def cli():
    """
    Phables: from fragmented assemblies to high-quality bacteriophage genomes.
    Please refer the full documentation available on Read the Docs at https://phables.readthedocs.io/
    """
223 | 
224 | 
# Text passed as `epilog=` to the run, install and test commands below.
# NOTE(review): the lone "\b" lines look like click's marker for leaving the
# following paragraph unwrapped -- confirm against the click help-formatting docs.
help_msg_extra = """
\b
\b
If you use Phables in your work, please cite Phables as,
\b
Vijini Mallawaarachchi, Michael J Roach, Przemyslaw Decewicz, 
Bhavya Papudeshi, Sarah K Giles, Susanna R Grigson, George Bouras, 
Ryan D Hesse, Laura K Inglis, Abbey L K Hutton, Elizabeth A Dinsdale, 
Robert A Edwards, Phables: from fragmented assemblies to high-quality 
bacteriophage genomes, Bioinformatics, Volume 39, Issue 10, 
October 2023, btad586, https://doi.org/10.1093/bioinformatics/btad586
\b
\b
For more information on Phables please visit:
https://phables.readthedocs.io/
\b
\b
CLUSTER EXECUTION:
phables run ... --profile [profile]
For information on Snakemake profiles see:
https://snakemake.readthedocs.io/en/stable/executing/cli.html#profiles
\b
RUN EXAMPLES:
Required:           phables run --input [assembly graph file]
Specify threads:    phables run ... --threads [threads]
Disable conda:      phables run ... --no-use-conda 
Change defaults:    phables run ... --snake-default="-k --nolock"
Add Snakemake args: phables run ... --dry-run --keep-going --touch
Specify targets:    phables run ... print_stages
Available targets:
    all             Run everything (default)
    preprocess      Run preprocessing only
    phables         Run phables (and preprocessing if needed)
    postprocess     Run postprocessing (with preprocessing and phables if needed)
    print_stages    List available stages
"""
261 | 
262 | 
# Run command
@click.command(
    epilog=help_msg_extra,
    context_settings={
        "help_option_names": ["-h", "--help"],
        "ignore_unknown_options": True,
    },
)
@common_options
@run_options
def run(**kwargs):
    """Run Phables"""
    # Absolute path of the main workflow shipped with the package
    workflow_file = snake_base(os.path.join("workflow", "phables.smk"))

    # The parsed options are handed over twice: once as the values to merge
    # into the config, and once as keyword arguments for run_snakemake itself.
    run_snakemake(snakefile_path=workflow_file, merge_config=kwargs, **kwargs)
282 | 
283 | 
# Install command
@click.command(
    epilog=help_msg_extra,
    context_settings={
        "help_option_names": ["-h", "--help"],
        "ignore_unknown_options": True,
    },
)
@common_options
def install(output, **kwargs):
    """Install databases"""
    # `output` is captured by name above, so it is not among the kwargs forwarded below
    install_workflow = snake_base(os.path.join("workflow", "install.smk"))
    run_snakemake(snakefile_path=install_workflow, **kwargs)
301 | 
302 | 
# Test command
@click.command(
    epilog=help_msg_extra,
    context_settings={
        "help_option_names": ["-h", "--help"],
        "ignore_unknown_options": True,
    },
)
@common_options
def test(**kwargs):
    """Test Phables"""
    # The bundled test data directory is passed to the workflow under the "dir" config key
    run_snakemake(
        snakefile_path=snake_base(os.path.join("workflow", "test_phables.smk")),
        merge_config={"dir": snake_base("test_data")},
        **kwargs
    )
325 | 
326 | 
@click.command()
@common_options
def config(configfile, system_config, **kwargs):
    """Copy the system default config file"""
    # copy_config needs the packaged default config as the file to copy from;
    # it arrives through the hidden --system_config option of common_options.
    copy_config(configfile, system_config=system_config)
332 | 
333 | 
@click.command()
def citation(**kwargs):
    """Print the citation(s) for this tool"""
    # No options are declared for this command; it only echoes the citation file
    print_citation()
338 | 
339 | 
# Register the subcommands on the root group, in this order
for _subcommand in (run, install, test, config, citation):
    cli.add_command(_subcommand)
345 | 
346 | 
def main():
    """Entry point: invoke the click command group."""
    cli()


# Allow running the module directly as a script
if __name__ == "__main__":
    main()
353 | 


--------------------------------------------------------------------------------
/phables/config/config.yaml:
--------------------------------------------------------------------------------
 1 | # Snakemake config
 2 | input:
 3 | output: 'phables.out/'
 4 | log: 'phables/phables.log'
 5 | 
 6 | # Databases 
 7 | databases:
 8 | 
 9 | # Profile
10 | profile: 
11 | 
12 | # Job resources for use with Snakemake profiles
13 |   # jobCPU will be scaled down if running locally with less than 8 threads
14 |   # jobMem is ignored when running locally
15 | resources:
16 |   jobCPU: 8
17 |   jobMem: 16000       # in Mb
18 | 
 19 | # Phables parameters
20 | minlength: 2000
21 | mincov: 10
22 | compcount: 200
23 | maxpaths: 10
24 | mgfrac: 0.2
25 | evalue: 1E-10
26 | seqidentity: 0.3
27 | covtol: 100
28 | alpha: 1.2
29 | longreads: False
30 | prefix: 


--------------------------------------------------------------------------------
/phables/config/databases.yaml:
--------------------------------------------------------------------------------
 1 | # Bacterial single-copy marker genes HMM file
 2 | smg_hmm: "https://raw.githubusercontent.com/metagentools/MetaCoAG/develop/src/metacoag/metacoag_utils/auxiliary/marker.hmm"
 3 | smg_hmm_file: "marker.hmm"
 4 | 
 5 | # PHROGs mmseqs database
 6 | phrogs_mmseqs: "https://phrogs.lmge.uca.fr/downloads_from_website/phrogs_mmseqs_db.tar.gz"
 7 | phrogs_mmseqs_file: "phrogs_mmseqs_db.tar.gz"
 8 | phrogs_mmseqs_folder: "phrogs_mmseqs_db/"
 9 | 
 10 | # PHROGs annotations
11 | phrog_annot: "https://phrogs.lmge.uca.fr/downloads_from_website/phrog_annot_v4.tsv"
12 | phrog_annot_file: "phrog_annot_v4.tsv"


--------------------------------------------------------------------------------
/phables/phables.CITATION:
--------------------------------------------------------------------------------
 1 | Please cite phables in your paper using this link:
 2 | https://doi.org/10.1093/bioinformatics/btad586
 3 | 
 4 | 
 5 | Please consider also citing these dependencies:
 6 | 
 7 | Snaketool:
 8 | https://doi.org/10.31219/osf.io/8w5j3
 9 | 
10 | Snakemake:
11 | https://doi.org/10.12688/f1000research.29032.1
12 | 
13 | PHROG:
14 | https://doi.org/10.1093/nargab/lqab067
15 | 
16 | MMseqs2:
17 | https://doi.org/10.1038/nbt.3988
18 | 
19 | Minimap2:
20 | https://doi.org/10.1093/bioinformatics/bty191
21 | 
22 | SAMtools:
23 | https://doi.org/10.1093/bioinformatics/btp352
24 | 
 25 | CoverM:
 26 | https://github.com/wwood/CoverM
 27 | 
 28 | Koverage:
 29 | https://github.com/beardymcjohnface/Koverage
 30 | 
 31 | NetworkX:
 32 | https://conference.scipy.org/proceedings/scipy2008/paper_2/
 33 | 
 34 | Gurobi Optimization:
35 | https://www.gurobi.com/
36 | 


--------------------------------------------------------------------------------
/phables/phables.LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023, Vijini Mallawaarachchi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/phables/phables.VERSION:
--------------------------------------------------------------------------------
1 | 1.4.1


--------------------------------------------------------------------------------
/phables/test_data/edge_coverages.tsv:
--------------------------------------------------------------------------------
 1 | Contig	sample1	sample2
 2 | edge_1	4742.0	0.0
 3 | edge_2	5000.0	0.0
 4 | edge_3	4858.0	0.0
 5 | edge_4	102.0	303.0
 6 | edge_5	12.0	33.0
 7 | edge_6	200.0	100.0
 8 | edge_7	6000.0	9000.0
 9 | edge_8	2000.0	3000.0
10 | edge_9	4020.0	6010.0
11 | edge_10	7000.0	8070.0
12 | edge_11	3090.0	5090.0
13 | edge_12	2010.0	3040.0
14 | edge_13	6010.0	9020.0
 15 | edge_14	50.0	700.0
 16 | edge_15	50.0	50.0
 17 | edge_16	150.0	150.0


--------------------------------------------------------------------------------
/phables/test_data/edges.fasta.hmmout:
--------------------------------------------------------------------------------
1 | #                                                                              --- full sequence --- -------------- this domain -------------   hmm coord   ali coord   env coord
2 | # target name          accession   tlen query name           accession   qlen   E-value  score  bias   #  of  c-Evalue  i-Evalue  score  bias  from    to  from    to  from    to  acc description of target
3 | #  ------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------


--------------------------------------------------------------------------------
/phables/test_data/junction_pe_coverage.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/phables/test_data/junction_pe_coverage.pickle


--------------------------------------------------------------------------------
/phables/test_data/phrogs_annotations.tsv:
--------------------------------------------------------------------------------
 1 | "edge_2"	"phrog_2 ## NC_006953_p8"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
 2 | "edge_4"	"phrog_2 ## NC_006953_p8"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
 3 | "edge_5"	"phrog_2 ## NC_006953_p8"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
 4 | "edge_13"	"phrog_2 ## NC_006953_p8"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
 5 | "edge_14"	"phrog_94 ## p350580 VI_04431"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
 6 | "edge_14"	"phrog_30832 ## AP018399_p123"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
 7 | "edge_14"	"phrog_195 ## p126863 VI_01011"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
 8 | "edge_14"	"phrog_1858 ## p362065 VI_01943"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
 9 | "edge_15"	"phrog_30832 ## AP018399_p123"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
10 | "edge_15"	"phrog_195 ## p126863 VI_01011"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1
11 | "edge_15"	"phrog_1858 ## p362065 VI_01943"	142	0.994	3.622E-37	835	215	9332	18	221	408	901	2	-1	-1


--------------------------------------------------------------------------------
/phables/workflow/envs/curl.yaml:
--------------------------------------------------------------------------------
1 | name: curl
2 | channels:
3 |   - conda-forge
4 | dependencies:
5 |   - curl


--------------------------------------------------------------------------------
/phables/workflow/envs/koverage.yaml:
--------------------------------------------------------------------------------
1 | name: koverage
2 | channels:
3 |     - conda-forge
4 |     - bioconda
5 | dependencies:
6 |     - koverage>=0.1.8
7 |     - numpy<2.0.0
8 | 


--------------------------------------------------------------------------------
/phables/workflow/envs/mapping.yaml:
--------------------------------------------------------------------------------
1 | name: mapping
2 | channels:
3 |   - bioconda
4 | dependencies:
5 |   - minimap2
6 |   - samtools
7 | 


--------------------------------------------------------------------------------
/phables/workflow/envs/mmseqs.yaml:
--------------------------------------------------------------------------------
1 | name: mmseqs
2 | channels:
3 |   - bioconda
4 | dependencies:
5 |   - mmseqs2=13.45111
6 | 


--------------------------------------------------------------------------------
/phables/workflow/envs/phables.yaml:
--------------------------------------------------------------------------------
 1 | name: phables
 2 | channels:
 3 |   - conda-forge
 4 |   - anaconda
 5 |   - bioconda
 6 |   - gurobi
 7 | dependencies:
 8 |   - python>=3.9, <3.11
 9 |   - biopython
10 |   - python-igraph
11 |   - pysam
12 |   - networkx>=2.8.6
13 |   - scipy
14 |   - numpy<2.0.0
15 |   - pandas
16 |   - gurobi>=10.0.0
17 |   - more-itertools
18 |   - tqdm
19 |   - click
20 |   - metasnek>=0.0.3
21 | 


--------------------------------------------------------------------------------
/phables/workflow/envs/smg.yaml:
--------------------------------------------------------------------------------
1 | name: smg
2 | channels:
3 |   - bioconda
4 | dependencies:
5 |   - fraggenescan
6 |   - hmmer
7 | 


--------------------------------------------------------------------------------
/phables/workflow/install.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Phables: from fragmented assemblies to high-quality bacteriophage genomes.
 3 | 
 4 | 2023, Vijini Mallawaarachchi
 5 | 
 6 | This is an auxiliary Snakefile to install databases or dependencies.
 7 | """
 8 | 
 9 | 
10 | """CONFIGURATION"""
11 | configfile: os.path.join(workflow.basedir, "..", "config", "config.yaml")
12 | configfile: os.path.join(workflow.basedir, "..", "config", "databases.yaml")
13 | 
14 | include: "rules/00_database_preflight.smk"
15 | 
16 | 
17 | """TARGETS"""
18 | db_files = [
19 |     os.path.join(DBPATH, config[key])  # one path per database artefact to install
20 |     for key in ('phrogs_mmseqs_folder', 'smg_hmm_file', 'phrog_annot_file')
21 | ]
22 | 
23 | 
24 | 
25 | """RUN SNAKEMAKE"""
26 | rule all:
27 |     input:
28 |         db_files
29 | 
30 | 
31 | """RULES"""
32 | rule phrogs_mmseqs_download:
33 |     params:
34 |         url=config['phrogs_mmseqs'],  # archive URL; curl -f below fails the job on an HTTP error
35 |         file=os.path.join(DBPATH, config['phrogs_mmseqs_file']),  # downloaded archive, removed after extraction
36 |         db_path = DBPATH  # directory the archive is extracted into
37 |     output:
38 |         directory(os.path.join(DBPATH, config['phrogs_mmseqs_folder']))
39 |     conda:
40 |         os.path.join("envs", "curl.yaml")
41 |     shell:
42 |         """
43 |             curl -fLko {params.file} {params.url}
44 |             tar -xf {params.file} -C {params.db_path}
45 |             rm -rf {params.file}
46 |         """
47 | 
48 | rule smg_hmm_download:
49 |     params:
50 |         url=config['smg_hmm']  # curl -f below turns an HTTP error into a failed job, not a bogus output file
51 |     output:
52 |         os.path.join(DBPATH, config['smg_hmm_file'])
53 |     conda:
54 |         os.path.join("envs", "curl.yaml")
55 |     shell:
56 |         """
57 |             curl -fLko {output} {params.url}
58 |         """
59 | 
60 | rule phrog_annot_download:
61 |     params:
62 |         url=config['phrog_annot']  # curl -f below turns an HTTP error into a failed job, not a bogus output file
63 |     output:
64 |         os.path.join(DBPATH, config['phrog_annot_file'])
65 |     conda:
66 |         os.path.join("envs", "curl.yaml")
67 |     shell:
68 |         """
69 |             curl -fLko {output} {params.url}
70 |         """


--------------------------------------------------------------------------------
/phables/workflow/phables.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Phables: from fragmented assemblies to high-quality bacteriophage genomes.
 3 | 
 4 | 2023, Vijini Mallawaarachchi
 5 | 
 6 | This is the main Snakefile to run phables.
 7 | """
 8 | 
 9 | """CONFIGURATION"""
10 | configfile: os.path.join(workflow.basedir, "..", "config", "config.yaml")
11 | configfile: os.path.join(workflow.basedir, "..", "config", "databases.yaml")
12 | 
13 | 
14 | """PREFLIGHT CHECKS"""
15 | include: os.path.join("rules", "00_database_preflight.smk")
16 | include: os.path.join("rules", "02_phables_preflight.smk")
17 | 
18 | 
19 | """TARGETS"""
20 | include: os.path.join("rules", "02_phables_targets.smk")
21 | 
22 | 
23 | """Target rules"""
24 | target_rules = []
25 | 
26 | def targetRule(fn):  # decorator applied to the stage rules; returns fn unchanged
27 |     assert fn.__name__.startswith("__")  # presumably Snakemake names rule functions "__<rule>" — TODO confirm
28 |     target_rules.append(fn.__name__[2:])  # register the name without its "__" prefix
29 |     return fn
30 | 
31 | localrules: all, preprocess, phables, print_stages, koverage_tsv, postprocess
32 | 
33 | 
34 | """Run stages"""
35 | @targetRule
36 | rule all:
37 |     input:
38 |         preprocessTargets,
39 |         phablesTargets,
40 |         postprocessTargets
41 | 
42 | 
43 | @targetRule
44 | rule preprocess:
45 |     input:
46 |         preprocessTargets
47 | 
48 | 
49 | @targetRule
50 | rule phables:
51 |     input:
52 |         phablesTargets
53 | 
54 | 
55 | @targetRule
56 | rule postprocess:
57 |     input:
58 |         postprocessTargets
59 | 
60 | 
61 | @targetRule
62 | rule print_stages:
63 |     run:
64 |         stage_lines = "* " + "\n* ".join(target_rules) + "\n\n"  # one bullet per registered stage
65 |         print("\nIndividual Phables stages to run: \n", stage_lines, sep="\n", file=sys.stderr)
66 | 
67 | 
68 | """RULES"""
69 | # Step 2: Obtain unitig sequences from assembly graph
70 | include: os.path.join("rules", "gfa2fasta.smk")
71 | 
72 | 
73 | # Step 3: Calculate coverage of unitig sequences
74 | include: os.path.join("rules", "coverage.smk")
75 | 
76 | 
77 | # Step 4: Scan unitig sequences for single-copy marker genes and PHROGs
78 | include: os.path.join("rules", "genes.smk")
79 | 
80 | 
81 | # Step 5: Run Phables
82 | include: os.path.join("rules", "phables.smk")
83 | 
84 | 
85 | # Step 6: Postprocess genomes
86 | include: os.path.join("rules", "postprocess.smk")
87 | 


--------------------------------------------------------------------------------
/phables/workflow/rules/00_database_preflight.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Add your preflight checks as pure Python code here.
 3 | e.g. Configure the run, declare directories, validate the input files etc.
 4 | This preflight check confirms the database file paths.
 5 | """
 6 | 
 7 | 
 8 | """CHECK IF CUSTOM DATABASE DIRECTORY"""
 9 | # Resolve the database directory.
10 | # A configured 'databases' value wins; when the key is absent or null we
11 | # fall back to the 'databases' folder two levels above the workflow dir.
12 | DBPATH = config.get('databases')
13 | if DBPATH is None:
14 |     DBPATH = os.path.join(workflow.basedir, '..', '..', 'databases')
15 | 
16 | 
17 | 
18 | 
19 | """ONSTART/END/ERROR
20 | Tasks to perform at the start and end of a run.
21 | """
22 | onsuccess:
23 |     """Print a success message"""
24 |     sys.stderr.write('\n\nDatabases are successfully setup!\n\n')
25 | 
26 | onerror:
27 |     """Print an error message"""
28 |     sys.stderr.write('\n\nERROR: Databases were not setup.\n\n')
29 | 


--------------------------------------------------------------------------------
/phables/workflow/rules/02_phables_preflight.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Add your preflight checks as pure Python code here.
 3 | e.g. Configure the run, declare directories, validate the input files etc.
 4 | This preflight reads the run configuration: input graph, output directory, reads and Phables parameters.
 5 | """
 6 | 
 7 | from metasnek import fastq_finder
 8 | 
 9 | """
10 | Setting the directory variables
11 | """
12 | 
13 | # THREADS = config['threads']
14 | INPUT = config['input']
15 | OUTDIR = config['output']
16 | print(f"Output files will be saved to directory, {OUTDIR}\n")
17 | 
18 | 
19 | ############################################################################
20 | # Checking through the reads folder
21 | ############################################################################
22 | 
23 | SAMPLE_READS = fastq_finder.parse_samples_to_dictionary(config['reads'])
24 | SAMPLE_NAMES = list(SAMPLE_READS.keys())
25 | 
26 | 
27 | ############################################################################
28 | # Get Phables parameters
29 | ############################################################################
30 | ML = config['minlength']
31 | MC = config['mincov']
32 | CC = config['compcount']
33 | MP = config['maxpaths']
34 | MGF = config['mgfrac']
35 | EV = config['evalue']
36 | SI = config['seqidentity']
37 | CT = config['covtol']
38 | AL = config['alpha']
39 | LR = config['longreads']
40 | PR = config['prefix']
41 | 
42 | 
43 | """DIRECTORIES/FILES etc.
44 | Declare some directories for pipeline intermediates and outputs.
45 | """
46 | LOGSDIR = os.path.join(OUTDIR, 'logs')
47 | 
48 | 
49 | """ONSTART/END/ERROR
50 | Tasks to perform at the start and end of a run.
51 | """
52 | onstart:
53 |     """Cleanup old log files before starting"""
54 |     if os.path.isdir(LOGSDIR):
55 |         # Only files whose name ends in a literal ".log" are removed (a loose regex would also hit e.g. "catalog.txt")
56 |         for logfile in os.listdir(LOGSDIR):
57 |             if logfile.endswith('.log'):
58 |                 os.unlink(os.path.join(LOGSDIR, logfile))
59 | 
60 | onsuccess:
61 |     """Print a success message"""
62 |     sys.stderr.write('\n\nPhables ran successfully!\n\n')
63 | 
64 | 
65 | onerror:
66 |     """Print an error message"""
67 |     sys.stderr.write('\n\nPhables run failed\n\n')
68 | 


--------------------------------------------------------------------------------
/phables/workflow/rules/02_phables_targets.smk:
--------------------------------------------------------------------------------
 1 | 
 2 | preprocessTargets = []
 3 | phablesTargets = []
 4 | postprocessTargets = []
 5 | 
 6 | 
 7 | """PREPROCESSING TARGETS"""
 8 | EDGES_FILE = os.path.join(OUTDIR, "preprocess", "edges.fasta")
 9 | preprocessTargets.append(EDGES_FILE)
10 | 
11 | BAM_PATH = os.path.join(OUTDIR, "preprocess", "temp")
12 | preprocessTargets.append(expand(os.path.join(BAM_PATH, "{sample}.bam"), sample=SAMPLE_NAMES))
13 | preprocessTargets.append(expand(os.path.join(BAM_PATH, "{sample}.bam.bai"), sample=SAMPLE_NAMES))
14 | 
15 | COVERAGE_PATH = os.path.join(OUTDIR, "preprocess", "coverage_rpkm/")
16 | # preprocessTargets.append(expand(os.path.join(COVERAGE_PATH, "{sample}_rpkm.tsv"), sample=SAMPLE_NAMES))
17 | preprocessTargets.append(os.path.join(OUTDIR, "preprocess", "coverage.tsv"))
18 | preprocessTargets.append(os.path.join(OUTDIR, "preprocess", "edges.fasta.hmmout"))
19 | 
20 | preprocessTargets.append(os.path.join(OUTDIR, "preprocess", "phrogs_annotations.tsv"))
21 | 
22 | 
23 | """MISC"""
24 | COVERAGE_FILE = os.path.join(OUTDIR, "preprocess", "coverage.tsv")
25 | PHROG_ANNOT = os.path.join(OUTDIR, "preprocess", "phrogs_annotations.tsv")
26 | SMG_FILE = os.path.join(OUTDIR, "preprocess", "edges.fasta.hmmout")
27 | GRAPH_FILE = INPUT
28 | 
29 | 
30 | """PHABLES TARGETS"""
31 | RESOLVED_GENOMES = os.path.join(OUTDIR, "phables",  "resolved_paths.fasta")
32 | 
33 | RESOLVED_GENOME_INFO = os.path.join(OUTDIR, "phables", "resolved_genome_info.txt")
34 | phablesTargets.append(RESOLVED_GENOME_INFO)
35 | 
36 | RESOLVED_COMP_INFO = os.path.join(OUTDIR, "phables", "resolved_component_info.txt")
37 | phablesTargets.append(RESOLVED_COMP_INFO)
38 | 
39 | COMP_PHROGS = os.path.join(OUTDIR, "phables", "component_phrogs.txt")
40 | phablesTargets.append(COMP_PHROGS)
41 | 
42 | 
43 | """POSTPROCESSING TARGETS"""
44 | GENOME_READ_COUNTS = os.path.join(OUTDIR, "postprocess", "sample_genome_read_counts.tsv")
45 | postprocessTargets.append(GENOME_READ_COUNTS)


--------------------------------------------------------------------------------
/phables/workflow/rules/03_test_preflight.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Add your preflight checks as pure Python code here.
 3 | e.g. Configure the run, declare directories, validate the input files etc.
 4 | This preflight reads the test directory and the Phables parameters from the config.
 5 | """
 6 | 
 7 | 
 8 | """
 9 | Setting the directory variables
10 | """
11 | 
12 | TESTDIR = config['dir']
13 | 
14 | 
15 | ############################################################################
16 | # Get Phables parameters
17 | ############################################################################
18 | ML = config['minlength']
19 | MC = config['mincov']
20 | CC = config['compcount']
21 | MP = config['maxpaths']
22 | MGF = config['mgfrac']
23 | EV = config['evalue']
24 | SI = config['seqidentity']
25 | CT = config['covtol']
26 | AL = config['alpha']
27 | LR = config['longreads']
28 | PR = config['prefix']
29 | 
30 | 
31 | """ONSTART/END/ERROR
32 | Tasks to perform at the start and end of a run.
33 | """
34 | onsuccess:
35 |     """Print a success message"""
36 |     sys.stderr.write('\n\nPhables test run was successful!\n\n')
37 | 
38 | onerror:
39 |     """Print an error message"""
40 |     sys.stderr.write('\n\nPhables test run failed! Please check.\n\n')


--------------------------------------------------------------------------------
/phables/workflow/rules/03_test_targets.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Declare your targets here!
 3 | A separate file is ideal if you have lots of target files to create, or need some python logic to determine
 4 | the targets to declare. This example shows targets that are dependent on the input file type.
 5 | """
 6 | 
 7 | allTargets = []
 8 | 
 9 | allTargets.append(os.path.join(TESTDIR, "resolved_paths.fasta"))
10 | allTargets.append(os.path.join(TESTDIR, "resolved_genome_info.txt"))
11 | allTargets.append(os.path.join(TESTDIR, "resolved_edges.fasta"))
12 | allTargets.append(os.path.join(TESTDIR, "resolved_component_info.txt"))
13 | allTargets.append(os.path.join(TESTDIR, "phage_like_edges.fasta"))


--------------------------------------------------------------------------------
/phables/workflow/rules/coverage.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Use Koverage (CoverM) to map reads and calculate the coverage of unitigs.
 3 | Use run_combine_cov to sum the per-sample coverage values of each unitig into one file.
 4 | """
 5 | 
 6 | rule koverage_tsv:
 7 |     """Generate TSV of samples and reads for Koverage"""
 8 |     output:
 9 |         os.path.join(OUTDIR, "preprocess", "phables.samples.tsv")
10 |     params:
11 |         SAMPLE_READS
12 |     run:
13 |         from metasnek import fastq_finder
14 |         fastq_finder.write_samples_tsv(params[0], output[0])
15 | 
16 | 
17 | rule koverage:
18 |     """Get coverage statistics with Koverage + CoverM"""
19 |     input:
20 |         tsv = os.path.join(OUTDIR, "preprocess", "phables.samples.tsv"),
21 |         edges = EDGES_FILE
22 |     params:
23 |         out_dir = os.path.join(OUTDIR, "preprocess"),
24 |         profile = lambda wildcards: "--profile " + config["profile"] if config["profile"] else "",
25 |     output:
26 |         expand(os.path.join(OUTDIR, "preprocess", "temp", "{sample}.{ext}"),
27 |                sample=SAMPLE_NAMES,
28 |                ext=["bam","bam.bai"]),
29 |         os.path.join(OUTDIR, "preprocess", "results", "sample_coverm_coverage.tsv")
30 |     threads:
31 |         config["resources"]["jobCPU"]
32 |     resources:
33 |         mem_mb = config["resources"]["jobMem"],
34 |         mem = str(config["resources"]["jobMem"]) + "MB"
35 |     conda:
36 |         os.path.join("..", "envs", "koverage.yaml")
37 |     shell:
38 |         """
39 |         koverage run coverm \
40 |             --reads {input.tsv} \
41 |             --ref {input.edges} \
42 |             --threads {threads} \
43 |             --output {params.out_dir} \
44 |             {params.profile}
45 |         """
46 | 
47 | 
48 | rule run_combine_cov:
49 |     """Sample\tContig\tCount\tRPKM\tTPM\tMean\tCovered_bases\tVariance\n"""
50 |     input:
51 |         os.path.join(OUTDIR, "preprocess", "results", "sample_coverm_coverage.tsv")
52 |     output:
53 |         os.path.join(OUTDIR, "preprocess", "coverage.tsv")
54 |     shell:
55 |         """
56 |         # Skip the header row inside awk instead of deleting it from the input file in place
57 |         awk -F '\t' 'NR > 1 {{ sum[$2] += $6 }} END {{ for (key in sum) print key, sum[key] }}' {input} > {output}
58 |         """
59 | 


--------------------------------------------------------------------------------
/phables/workflow/rules/genes.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Use FragGeneScan and HMMER to scan for bacterial single-copy marker genes in unitigs.
 3 | Use mmseqs2 to scan for PHROGs in unitigs.
 4 | """
 5 | 
 6 | rule scan_smg:
 7 |     input:
 8 |         genome = EDGES_FILE,
 9 |         hmm = os.path.join(DBPATH, "marker.hmm"),
10 |     threads:
11 |         config["resources"]["jobCPU"]
12 |     resources:
13 |         mem_mb = config["resources"]["jobMem"],
14 |         mem = str(config["resources"]["jobMem"]) + "MB"
15 |     output:
16 |         hmmout = os.path.join(OUTDIR, "preprocess", "edges.fasta.hmmout")
17 |     params:
18 |         frag = EDGES_FILE + ".frag",
19 |         frag_faa = EDGES_FILE + ".frag.faa",
20 |     log:
21 |         frag_out=os.path.join(LOGSDIR, "smg_scan_frag_out.log"),
22 |         frag_err=os.path.join(LOGSDIR, "smg_scan_frag_err.log"),
23 |         hmm_out=os.path.join(LOGSDIR, "smg_scan_hmm_out.log"),
24 |         hmm_err=os.path.join(LOGSDIR, "smg_scan_hmm_err.log")
25 |     conda: 
26 |         os.path.join("..", "envs", "smg.yaml")
27 |     shell:
28 |         """
29 |             run_FragGeneScan.pl -genome={input.genome} -out={params.frag} -complete=0 -train=complete -thread={threads} 1>{log.frag_out} 2>{log.frag_err}
30 |             hmmsearch --domtblout {output.hmmout} --cut_tc --cpu {threads} {input.hmm} {params.frag_faa} 1>{log.hmm_out} 2> {log.hmm_err}
31 |         """
32 | 
33 | 
34 | rule scan_phrogs:
35 |     input:
36 |         genome = EDGES_FILE,  # unitig sequences; the only file handed to createdb
37 |         db = os.path.join(DBPATH,"phrogs_mmseqs_db","phrogs_profile_db")  # profile DB searched against
38 |     threads:
39 |         config["resources"]["jobCPU"]
40 |     resources:
41 |         mem_mb = config["resources"]["jobMem"],
42 |         mem = str(config["resources"]["jobMem"]) + "MB"
43 |     output:
44 |         os.path.join(OUTDIR, "preprocess", "phrogs_annotations.tsv")
45 |     params:
46 |         out_path = os.path.join(OUTDIR, "preprocess", "phrogs"),  # scratch folder, removed at the end
47 |         target_seq = os.path.join(OUTDIR, "preprocess", "phrogs", "target_seq"),
48 |         results_mmseqs = os.path.join(OUTDIR, "preprocess", "phrogs", "results_mmseqs"),
49 |         tmp = os.path.join(OUTDIR, "preprocess", "phrogs", "tmp"),
50 |     log:
51 |         os.path.join(LOGSDIR, "phrogs_scan.log")  # first command truncates, later ones append
52 |     conda: 
53 |         os.path.join("..", "envs", "mmseqs.yaml")
54 |     shell:
55 |         """
56 |         mkdir -p {params.out_path}
57 |         mmseqs createdb {input.genome} {params.target_seq} > {log}
58 |         mmseqs search {params.target_seq} {input.db} {params.results_mmseqs} {params.tmp} --threads {threads} -s 7 >> {log}
59 |         mmseqs createtsv {params.target_seq} {input.db} {params.results_mmseqs} {output} --threads {threads} --full-header >> {log}
60 |         rm -rf {params.out_path}
61 |         """


--------------------------------------------------------------------------------
/phables/workflow/rules/gfa2fasta.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Run gfa2fasta to obtain the sequences corresponding to unitigs in the assembly graphs in FASTA format.
 3 | The assembly graph file (.gfa extension) should be provided as input.
 4 | """
 5 | 
 6 | rule run_gfa2fasta:
 7 |     input:
 8 |         GRAPH_FILE
 9 |     output:
10 |         EDGES_FILE
11 |     params:
12 |         graph = GRAPH_FILE,
13 |         output = os.path.join(OUTDIR, "preprocess"),
14 |         log = os.path.join(LOGSDIR, "gfa2fasta.log")
15 |     log:
16 |         os.path.join(LOGSDIR, "gfa2fasta.log")
17 |     conda: 
18 |         os.path.join("..", "envs", "phables.yaml")
19 |     script:
20 |         os.path.join('..', 'scripts', 'gfa2fasta.py')


--------------------------------------------------------------------------------
/phables/workflow/rules/phables.smk:
--------------------------------------------------------------------------------
 1 | rule run_phables:
 2 |     input:
 3 |         GRAPH_FILE,
 4 |         COVERAGE_FILE,
 5 |         PHROG_ANNOT,
 6 |         SMG_FILE,
 7 |         preprocessTargets
 8 |     output:
 9 |         genomes_fasta = os.path.join(OUTDIR, "phables", "resolved_paths.fasta"),
10 |         genomes_folder = directory(os.path.join(OUTDIR, "phables", "resolved_phages")),
11 |         genome_info = os.path.join(OUTDIR, "phables", "resolved_genome_info.txt"),
12 |         unitigs = os.path.join(OUTDIR, "phables", "resolved_edges.fasta"),
13 |         component_info = os.path.join(OUTDIR, "phables", "resolved_component_info.txt"),
14 |         phrog_comp_info = os.path.join(OUTDIR, "phables", "component_phrogs.txt"),
15 |         unresolved_edges = os.path.join(OUTDIR, "phables", "unresolved_phage_like_edges.fasta"),
16 |     params:
17 |         graph = GRAPH_FILE,
18 |         hmmout = SMG_FILE,
19 |         phrogs = PHROG_ANNOT,
20 |         coverage = COVERAGE_FILE,
21 |         bampath = BAM_PATH,
22 |         minlength = ML,
23 |         mincov = MC,
24 |         compcount = CC,
25 |         maxpaths = MP,
26 |         mgfrac = MGF,
27 |         evalue = EV,
28 |         seqidentity = SI,
29 |         covtol = CT,
30 |         alpha = AL,
31 |         longreads = LR,
32 |         prefix = PR,
33 |         output = os.path.join(OUTDIR, "phables"),
34 |         nthreads = config["resources"]["jobCPU"],
35 |         log = os.path.join(LOGSDIR, "phables_output.log")
36 |     threads:
37 |         config["resources"]["jobCPU"]
38 |     log:
39 |         os.path.join(LOGSDIR, "phables_output.log")
40 |     conda:
41 |         os.path.join("..", "envs", "phables.yaml")
42 |     script:
43 |         os.path.join("..", "scripts", "phables.py")
44 | 


--------------------------------------------------------------------------------
/phables/workflow/rules/postprocess.smk:
--------------------------------------------------------------------------------
 1 | rule combine_genomes_and_unresolved_edges:
 2 |     """Combine resolved genomes and unresolved edges"""
 3 |     input:
 4 |         genomes = RESOLVED_GENOMES,
 5 |         unresolved_edges = os.path.join(OUTDIR, "phables", "unresolved_phage_like_edges.fasta")
 6 |     output:
 7 |         os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges.fasta")
 8 |     shell:
 9 |         """
10 |         cat {input.genomes} {input.unresolved_edges} > {output}
11 |         """
12 | 
13 | 
14 | rule koverage_genomes:
15 |     """Get coverage statistics with Koverage"""
16 |     input:
17 |         tsv = os.path.join(OUTDIR, "preprocess", "phables.samples.tsv"),
18 |         sequences = os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges.fasta")
19 |     params:
20 |         out_dir = os.path.join(OUTDIR, "postprocess"),
21 |         profile = lambda wildcards: "--profile " + config["profile"] if config["profile"] else "",
22 |     output:
23 |         os.path.join(OUTDIR, "postprocess", "results", "sample_coverage.tsv")
24 |     threads:
25 |         config["resources"]["jobCPU"]
26 |     resources:
27 |         mem_mb = config["resources"]["jobMem"],
28 |         mem = str(config["resources"]["jobMem"]) + "MB"
29 |     conda:
30 |         os.path.join("..", "envs", "koverage.yaml")
31 |     shell:
32 |         """
33 |         koverage run \
34 |             --no-report \
35 |             --reads {input.tsv} \
36 |             --ref {input.sequences} \
37 |             --threads {threads} \
38 |             --output {params.out_dir} \
39 |             {params.profile}
40 |         """
41 | 
42 | 
43 | rule koverage_postprocess:
44 |     """Format TSV of samples and reads from Koverage"""
45 |     input:
46 |         koverage_tsv = os.path.join(OUTDIR, "postprocess", "results", "sample_coverage.tsv"),
47 |         samples_file = os.path.join(OUTDIR, "preprocess", "phables.samples.tsv"),
48 |         seq_file = os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges.fasta")
49 |     output:
50 |         os.path.join(OUTDIR, "postprocess", "sample_genome_read_counts.tsv")
51 |     params:
52 |         koverage_tsv = os.path.join(OUTDIR, "postprocess", "results", "sample_coverage.tsv"),
53 |         samples_file = os.path.join(OUTDIR, "preprocess", "phables.samples.tsv"),
54 |         seq_file = os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges.fasta"),
55 |         info_file = os.path.join(OUTDIR, "postprocess", "genomes_and_unresolved_edges_info.tsv"),
56 |         output_path = os.path.join(OUTDIR, "postprocess"),
57 |         log = os.path.join(LOGSDIR, "format_koverage_results_output.log")
58 |     log:
59 |         os.path.join(LOGSDIR, "format_koverage_results_output.log")
60 |     conda:
61 |         os.path.join("..", "envs", "phables.yaml")
62 |     script:
63 |         os.path.join("..", "scripts", "format_koverage_results.py")


--------------------------------------------------------------------------------
/phables/workflow/scripts/combine_cov.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | """combine_cov.py: Combine multiple coverage files of samples.
 4 | 
 5 | """
 6 | 
 7 | import glob
 8 | import logging
 9 | import os
10 | import subprocess
11 | 
12 | import pandas as pd
13 | 
14 | __author__ = "Vijini Mallawaarachchi"
15 | __copyright__ = "Copyright 2023, Phables Project"
16 | __license__ = "MIT"
17 | __type__ = "Support Script"
18 | __maintainer__ = "Vijini Mallawaarachchi"
19 | __email__ = "viji.mallawaarachchi@gmail.com"
20 | 
21 | 
22 | def main():
23 |     # Get arguments
24 |     # -----------------------
25 | 
26 |     covpath = snakemake.params.covpath
27 |     output_path = snakemake.params.output
28 |     log = snakemake.params.log
29 | 
30 |     # Setup logger
31 |     # ----------------------------------------------------------------------
32 | 
33 |     logger = logging.getLogger("combine_cov")
34 |     logger.setLevel(logging.DEBUG)
35 |     logging.captureWarnings(True)
36 |     formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
37 |     consoleHeader = logging.StreamHandler()
38 |     consoleHeader.setFormatter(formatter)
39 |     consoleHeader.setLevel(logging.INFO)
40 |     logger.addHandler(consoleHeader)
41 | 
42 |     # Setup output path for log file
43 |     if log is None:
44 |         fileHandler = logging.FileHandler(f"{output_path}/combine_cov.log")
45 |     else:
46 |         fileHandler = logging.FileHandler(f"{log}")
47 | 
48 |     fileHandler.setLevel(logging.DEBUG)
49 |     fileHandler.setFormatter(formatter)
50 |     logger.addHandler(fileHandler)
51 | 
52 |     # Validate inputs
53 |     # ---------------------------------------------------
54 | 
55 |     # Handle for missing trailing forwardslash in output folder path
56 |     if output_path[-1:] != "/":
57 |         output_path = output_path + "/"
58 | 
59 |     # Create output folder if it does not exist
60 |     if not os.path.isdir(output_path):
61 |         subprocess.run("mkdir -p " + output_path, shell=True)
62 | 
63 |     # Get coverage values from samples
64 |     # ---------------------------------------------------
65 | 
66 |     # Get coverage files
67 |     cov_files = glob.glob(f"{covpath}/*.tsv")
68 | 
69 |     final_df = pd.DataFrame()
70 | 
71 |     for file in cov_files:
72 |         logger.info(f"Reading file {file}")
73 |         df = pd.read_csv(file, sep="\t", header=0)
74 | 
75 |         if final_df.empty:
76 |             final_df = df
77 |         else:
78 |             final_df = pd.concat(
79 |                 [final_df, df[list(df.columns)[1]]], axis=1, join="inner"
80 |             )
81 | 
82 |     logger.info(f"Dataframe shape: {final_df.shape}")
83 | 
84 |     # Save dataframe to file
85 |     final_df.to_csv(output_path + "coverage.tsv", sep="\t", index=False)
86 |     logger.info(
87 |         f"The combined coverage values can be found at {output_path}coverage.tsv"
88 |     )
89 | 
90 |     # Exit program
91 |     # --------------
92 | 
93 |     logger.info("Thank you for using combine_cov!")
94 | 
95 | 
96 | if __name__ == "__main__":
97 |     main()
98 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/format_koverage_results.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | """format_koverage_results.py: Format koverage results.
  4 | 
  5 | """
  6 | 
  7 | import logging
  8 | import os
  9 | import subprocess
 10 | from collections import defaultdict
 11 | 
 12 | import pandas as pd
 13 | from Bio import SeqIO
 14 | 
 15 | __author__ = "Vijini Mallawaarachchi"
 16 | __copyright__ = "Copyright 2023, Phables Project"
 17 | __license__ = "MIT"
 18 | __type__ = "Support Script"
 19 | __maintainer__ = "Vijini Mallawaarachchi"
 20 | __email__ = "viji.mallawaarachchi@gmail.com"
 21 | 
 22 | 
 23 | def main():
 24 |     # Get arguments
 25 |     # -----------------------
 26 | 
 27 |     samples_file = snakemake.params.samples_file
 28 |     koverage_tsv = snakemake.params.koverage_tsv
 29 |     seq_file = snakemake.params.seq_file
 30 |     info_file = snakemake.params.info_file
 31 |     output_path = snakemake.params.output_path
 32 |     log = snakemake.params.log
 33 | 
 34 |     # Setup logger
 35 |     # ----------------------------------------------------------------------
 36 | 
 37 |     logger = logging.getLogger("format_coverage")
 38 |     logger.setLevel(logging.DEBUG)
 39 |     logging.captureWarnings(True)
 40 |     formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
 41 |     consoleHeader = logging.StreamHandler()
 42 |     consoleHeader.setFormatter(formatter)
 43 |     consoleHeader.setLevel(logging.INFO)
 44 |     logger.addHandler(consoleHeader)
 45 | 
 46 |     # Setup output path for log file
 47 |     if log is None:
 48 |         fileHandler = logging.FileHandler(f"{log}")
 49 |     else:
 50 |         fileHandler = logging.FileHandler(f"{log}")
 51 | 
 52 |     fileHandler.setLevel(logging.DEBUG)
 53 |     fileHandler.setFormatter(formatter)
 54 |     logger.addHandler(fileHandler)
 55 | 
 56 |     # Validate inputs
 57 |     # ---------------------------------------------------
 58 | 
 59 |     # Handle for missing trailing forwardslash in output folder path
 60 |     if output_path[-1:] != "/":
 61 |         output_path = output_path + "/"
 62 | 
 63 |     # Create output folder if it does not exist
 64 |     if not os.path.isdir(output_path):
 65 |         subprocess.run("mkdir -p " + output_path, shell=True)
 66 | 
 67 |     # Get sample-wise genome coverage stats
 68 |     # ----------------------------------------------------------------------
 69 | 
 70 |     # Log inputs
 71 |     logger.info(f"Samples file: {samples_file}")
 72 |     logger.info(f"Koverage results: {koverage_tsv}")
 73 |     logger.info(f"Output path: {output_path}")
 74 | 
 75 |     # Get sample names
 76 |     mysamples = [s.split("\t")[0] for s in open(samples_file, "r")]
 77 |     logger.debug(mysamples)
 78 | 
 79 |     # Initialise dataframe
 80 |     df_read_counts = pd.DataFrame(columns=["contig_phables"] + mysamples)
 81 |     df_rpkm = pd.DataFrame(columns=["contig_phables"] + mysamples)
 82 |     df_mean_cov = pd.DataFrame(columns=["contig_phables"] + mysamples)
 83 | 
 84 |     # Get coverage stats of genomes in each sample
 85 |     read_counts = defaultdict(lambda: defaultdict(list))
 86 |     rpkm = defaultdict(lambda: defaultdict(list))
 87 |     mean_cov = defaultdict(lambda: defaultdict(list))
 88 | 
 89 |     with open(koverage_tsv, "r") as mf:
 90 |         for line in mf.readlines()[1:]:
 91 |             strings = line.strip().split("\t")
 92 |             read_counts[strings[1]][strings[0]] = int(float(strings[2]))
 93 |             rpkm[strings[1]][strings[0]] = float(strings[4])
 94 |             mean_cov[strings[1]][strings[0]] = float(strings[7])
 95 | 
 96 |     # Add records to dataframe
 97 |     counter = 0
 98 |     for genome in read_counts:
 99 |         read_counts_row = read_counts[genome]
100 |         read_counts_row["contig_phables"] = genome
101 |         read_counts_row = dict(read_counts_row)
102 |         read_counts_row_df = pd.DataFrame(read_counts_row, index=[counter])
103 |         df_read_counts = pd.concat([df_read_counts, read_counts_row_df])
104 | 
105 |         rpkm_row = rpkm[genome]
106 |         rpkm_row["contig_phables"] = genome
107 |         rpkm_row = dict(rpkm_row)
108 |         rpkm_row_df = pd.DataFrame(rpkm_row, index=[counter])
109 |         df_rpkm = pd.concat([df_rpkm, rpkm_row_df])
110 | 
111 |         mean_cov_row = mean_cov[genome]
112 |         mean_cov_row["contig_phables"] = genome
113 |         mean_cov_row = dict(mean_cov_row)
114 |         mean_cov_row_df = pd.DataFrame(mean_cov_row, index=[counter])
115 |         df_mean_cov = pd.concat([df_mean_cov, mean_cov_row_df])
116 | 
117 |         counter += 1
118 | 
119 |     # Save dataframe to file
120 |     df_read_counts.to_csv(
121 |         f"{output_path}sample_genome_read_counts.tsv", sep="\t", index=False
122 |     )
123 |     df_rpkm.to_csv(f"{output_path}sample_genome_rpkm.tsv", sep="\t", index=False)
124 |     df_mean_cov.to_csv(
125 |         f"{output_path}sample_genome_mean_coverage.tsv", sep="\t", index=False
126 |     )
127 | 
128 |     logger.info(
129 |         f"Raw read counts mapped to resolved genomes can be found in {output_path}sample_genome_read_counts.tsv"
130 |     )
131 |     logger.info(
132 |         f"RPKM values of resolved genomes can be found in {output_path}sample_genome_rpkm.tsv"
133 |     )
134 |     logger.info(
135 |         f"Estimated mean read depth of resolved genomes can be found in {output_path}sample_genome_mean_coverage.tsv"
136 |     )
137 | 
138 |     # Make sequence information file
139 |     with open(info_file, "w") as myfile:
140 |         myfile.write(f"contig_phables_name\tlength\tcontig_or_phables\n")
141 |         for index, record in enumerate(SeqIO.parse(seq_file, "fasta")):
142 |             if "phage_comp" in record.id:
143 |                 myfile.write(f"{record.id}\t{len(record.seq)}\tphables\n")
144 |             else:
145 |                 myfile.write(f"{record.id}\t{len(record.seq)}\tcontig\n")
146 | 
147 |     logger.info(f"Sequence information file can be found in {info_file}")
148 | 
149 |     # Exit program
150 |     # --------------
151 | 
152 |     logger.info("Thank you for using format_koverage_results!")
153 | 
154 | 
155 | if __name__ == "__main__":
156 |     main()
157 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/gfa2fasta.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | """gfa2fasta.py: Obtain the sequences corresponding to edges in the Flye and Miniasm assembly graphs in FASTA format.
  4 | 
  5 | The assembly graph file of Flye (assembly_graph.gfa) should be provided as inputs.
  6 | 
  7 | """
  8 | 
  9 | import logging
 10 | import os
 11 | import re
 12 | import subprocess
 13 | import sys
 14 | 
 15 | from Bio import SeqIO
 16 | from Bio.Seq import Seq
 17 | from Bio.SeqRecord import SeqRecord
 18 | 
 19 | __author__ = "Vijini Mallawaarachchi"
 20 | __copyright__ = "Copyright 2023, Phables Project"
 21 | __license__ = "MIT"
 22 | __type__ = "Support Script"
 23 | __maintainer__ = "Vijini Mallawaarachchi"
 24 | __email__ = "viji.mallawaarachchi@gmail.com"
 25 | 
 26 | 
 27 | def main():
 28 |     # Get arguments
 29 |     # -----------------------
 30 | 
 31 |     assembly_graph_file = snakemake.params.graph
 32 |     output_path = snakemake.params.output
 33 |     log = snakemake.params.log
 34 |     prefix = ""
 35 | 
 36 |     # Setup logger
 37 |     # ----------------------------------------------------------------------
 38 | 
 39 |     logger = logging.getLogger("gfa2fasta")
 40 |     logger.setLevel(logging.DEBUG)
 41 |     logging.captureWarnings(True)
 42 |     formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
 43 |     consoleHeader = logging.StreamHandler()
 44 |     consoleHeader.setFormatter(formatter)
 45 |     consoleHeader.setLevel(logging.INFO)
 46 |     logger.addHandler(consoleHeader)
 47 | 
 48 |     # Setup output path for log file
 49 |     if log is None:
 50 |         fileHandler = logging.FileHandler(f"{output_path}/gfa2fasta.log")
 51 |     else:
 52 |         fileHandler = logging.FileHandler(f"{log}")
 53 | 
 54 |     fileHandler.setLevel(logging.DEBUG)
 55 |     fileHandler.setFormatter(formatter)
 56 |     logger.addHandler(fileHandler)
 57 | 
 58 |     # Check assembly graph file
 59 |     if not os.path.isfile(assembly_graph_file):
 60 |         logger.error(
 61 |             "Failed to open the assembly graph file. Please make sure to provife the .gfa file."
 62 |         )
 63 |         logger.info("Exiting gfa2fasta.py...\nBye...!\n")
 64 |         sys.exit(1)
 65 | 
 66 |     # Check if output folder exists
 67 |     # ---------------------------------------------------
 68 | 
 69 |     # Handle for missing trailing forwardslash in output folder path
 70 |     if output_path[-1:] != "/":
 71 |         output_path = f"{output_path}/"
 72 | 
 73 |     # Create output folder if it does not exist
 74 |     if not os.path.isdir(output_path):
 75 |         subprocess.run("mkdir -p " + output_path, shell=True)
 76 | 
 77 |     # Get the sequences corresponding to edges of the graph.
 78 |     # ---------------------------------------------------
 79 | 
 80 |     logger.info("Obtaining edge sequences")
 81 | 
 82 |     sequenceset = []
 83 | 
 84 |     with open(assembly_graph_file) as file:
 85 |         line = file.readline()
 86 | 
 87 |         while line != "":
 88 |             if "S" in line:
 89 |                 strings = line.split("\t")
 90 | 
 91 |                 record = SeqRecord(
 92 |                     Seq(re.sub("[^GATC]", "", str(strings[2]).upper())),
 93 |                     id=str(strings[1]),
 94 |                     name=str(strings[1]),
 95 |                     description="",
 96 |                 )
 97 | 
 98 |                 sequenceset.append(record)
 99 | 
100 |             line = file.readline()
101 | 
102 |     logger.info("Writing edge sequences to FASTA file")
103 | 
104 |     with open(f"{output_path}{prefix}edges.fasta", "w") as output_handle:
105 |         SeqIO.write(sequenceset, output_handle, "fasta")
106 | 
107 |     logger.info(
108 |         f"The FASTA file with unitig sequences can be found at {output_handle.name}"
109 |     )
110 | 
111 |     # Exit program
112 |     # --------------
113 | 
114 |     logger.info("Thank you for using gfa2fasta!")
115 | 
116 | 
117 | if __name__ == "__main__":
118 |     main()
119 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import logging
  4 | import time
  5 | 
  6 | from phables_utils import (
  7 |     component_utils,
  8 |     edge_graph_utils,
  9 |     gene_utils,
 10 |     long_utils,
 11 |     short_utils,
 12 | )
 13 | from phables_utils.coverage_utils import (
 14 |     get_junction_pe_coverage,
 15 |     get_sub_path_coverage,
 16 |     get_unitig_coverage,
 17 | )
 18 | from phables_utils.output_utils import (
 19 |     init_files,
 20 |     write_component_info,
 21 |     write_component_phrog_info,
 22 |     write_res_genome_info,
 23 |     write_unitigs,
 24 | )
 25 | 
 26 | __author__ = "Vijini Mallawaarachchi"
 27 | __copyright__ = "Copyright 2022, Phables Project"
 28 | __license__ = "MIT"
 29 | __version__ = "1.4.1"
 30 | __maintainer__ = "Vijini Mallawaarachchi"
 31 | __email__ = "viji.mallawaarachchi@gmail.com"
 32 | __status__ = "Stable Release"
 33 | 
 34 | 
 35 | # Phables main code
 36 | # ----------------------------------------------------------------------
 37 | 
 38 | 
 39 | def main():
 40 |     # Get arguments
 41 |     # ----------------------------------------------------------------------
 42 |     graph = snakemake.params.graph
 43 |     coverage = snakemake.params.coverage
 44 |     bampath = snakemake.params.bampath
 45 |     hmmout = snakemake.params.hmmout
 46 |     phrogs = snakemake.params.phrogs
 47 |     minlength = int(snakemake.params.minlength)
 48 |     mincov = int(snakemake.params.mincov)
 49 |     compcount = int(snakemake.params.compcount)
 50 |     maxpaths = int(snakemake.params.maxpaths)
 51 |     mgfrac = float(snakemake.params.mgfrac)
 52 |     evalue = float(snakemake.params.evalue)
 53 |     seqidentity = float(snakemake.params.seqidentity)
 54 |     covtol = float(snakemake.params.covtol)
 55 |     alpha = float(snakemake.params.alpha)
 56 |     longreads = bool(snakemake.params.longreads)
 57 |     prefix = snakemake.params.prefix
 58 |     output = snakemake.params.output
 59 |     nthreads = int(snakemake.params.nthreads)
 60 |     log = snakemake.params.log
 61 | 
 62 |     # Setup logger
 63 |     # ----------------------------------------------------------------------
 64 | 
 65 |     logger = logging.getLogger(f"phables {__version__}")
 66 |     logger.setLevel(logging.DEBUG)
 67 |     logging.captureWarnings(True)
 68 |     formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
 69 |     consoleHeader = logging.StreamHandler()
 70 |     consoleHeader.setFormatter(formatter)
 71 |     consoleHeader.setLevel(logging.INFO)
 72 |     logger.addHandler(consoleHeader)
 73 | 
 74 |     # Setup output path for log file
 75 |     if log is None:
 76 |         fileHandler = logging.FileHandler(f"{output}/phables.log")
 77 |     else:
 78 |         fileHandler = logging.FileHandler(f"{log}")
 79 |     fileHandler.setLevel(logging.DEBUG)
 80 |     fileHandler.setFormatter(formatter)
 81 |     logger.addHandler(fileHandler)
 82 | 
 83 |     logger.info(
 84 |         "Welcome to Phables: from fragmented assemblies to high-quality bacteriophage genomes."
 85 |     )
 86 | 
 87 |     logger.info(f"Input arguments: ")
 88 |     logger.info(f"Assembly graph file: {graph}")
 89 |     logger.info(f"Unitig coverage file: {coverage}")
 90 |     logger.info(f"BAM files path: {bampath}")
 91 |     logger.info(f"Unitig .hmmout file: {hmmout}")
 92 |     logger.info(f"Unitig phrog annotations file: {phrogs}")
 93 |     logger.info(f"Minimum length of unitigs to consider: {minlength}")
 94 |     logger.info(f"Minimum coverage of paths to output: {mincov}")
 95 |     logger.info(f"Minimum unitig count to consider a component: {compcount}")
 96 |     logger.info(f"Maximum number of paths to resolve for a component: {maxpaths}")
 97 |     logger.info(f"Length threshold to consider single copy marker genes: {mgfrac}")
 98 |     logger.info(f"Maximum e-value for phrog annotations: {evalue}")
 99 |     logger.info(f"Minimum sequence identity for phrog annotations: {seqidentity}")
100 |     logger.info(f"Coverage tolerance for extending subpaths: {covtol}")
101 |     logger.info(f"Coverage multipler for flow interval modelling: {alpha}")
102 |     logger.info(f"Input long reads: {longreads}")
103 |     logger.info(f"Prefix for genome identifiers: {prefix}")
104 |     logger.info(f"Number of threads to use: {nthreads}")
105 |     logger.info(f"Output folder: {output}")
106 | 
107 |     if prefix is None or prefix == "":
108 |         prefix = ""
109 |     else:
110 |         prefix = f"{prefix}_"
111 | 
112 |     start_time = time.time()
113 | 
114 |     # Init files
115 |     # ----------------------------------------------------------------------
116 |     init_files(output)
117 | 
118 |     # Get assembly graph
119 |     # ----------------------------------------------------------------------
120 |     (
121 |         assembly_graph,
122 |         oriented_links,
123 |         link_overlap,
124 |         unitig_names,
125 |         unitig_names_rev,
126 |         graph_unitigs,
127 |         self_looped_nodes,
128 |         edges_lengths,
129 |     ) = edge_graph_utils.build_assembly_graph(graph)
130 | 
131 |     logger.info(
132 |         f"Total number of vertices in the assembly graph: {len(assembly_graph.vs)}"
133 |     )
134 |     logger.info(
135 |         f"Total number of links in the assembly graph: {len(assembly_graph.es)}"
136 |     )
137 | 
138 |     # Get single unitigs
139 |     # ----------------------------------------------------------------------
140 |     circular = edge_graph_utils.get_circular(self_looped_nodes, graph_unitigs)
141 | 
142 |     # Get unitigs with bacterial single copy marker genes
143 |     # ----------------------------------------------------------------------
144 |     smg_unitigs = gene_utils.get_smg_unitigs(hmmout, mgfrac)
145 | 
146 |     # Get unitigs with PHROGs
147 |     # ----------------------------------------------------------------------
148 |     unitig_phrogs, phrog_dict = gene_utils.get_phrog_unitigs(
149 |         phrogs, evalue, seqidentity
150 |     )
151 | 
152 |     # Get components with viral components
153 |     # ----------------------------------------------------------------------
154 |     pruned_vs, comp_phrogs, likely_complete = component_utils.get_components(
155 |         assembly_graph,
156 |         unitig_names,
157 |         smg_unitigs,
158 |         unitig_phrogs,
159 |         circular,
160 |         edges_lengths,
161 |         minlength,
162 |         phrog_dict,
163 |     )
164 |     logger.info(f"Total number of components found: {len(pruned_vs)}")
165 | 
166 |     # Get unitig coverages
167 |     # ----------------------------------------------------------------------
168 | 
169 |     unitig_coverages = get_unitig_coverage(coverage)
170 | 
171 |     # Resolve genomes
172 |     # ----------------------------------------------------------------------
173 | 
174 |     # If long reads are provided
175 |     if longreads:
176 |         logger.info(f"Long reads provided")
177 | 
178 |         # Get sub path coverages
179 |         sub_path_cov = edge_graph_utils.get_all_sub_paths(assembly_graph, unitig_names)
180 |         sub_path_cov = get_sub_path_coverage(sub_path_cov, bampath, output)
181 | 
182 |         # Resolve genomes
183 |         (
184 |             resolved_edges,
185 |             all_resolved_paths,
186 |             all_components,
187 |             cycle_components,
188 |             linear_components,
189 |             resolved_components,
190 |             resolved_linear,
191 |             single_unitigs,
192 |             resolved_cyclic,
193 |             case1_found,
194 |             case1_resolved,
195 |             case2_found,
196 |             case2_resolved,
197 |             case3_found,
198 |             case3_resolved,
199 |             phage_like_edges,
200 |             all_phage_like_edges,
201 |             unresolved_phage_like_edges,
202 |         ) = long_utils.resolve_long(
203 |             assembly_graph,
204 |             pruned_vs,
205 |             unitig_names,
206 |             unitig_names_rev,
207 |             self_looped_nodes,
208 |             graph_unitigs,
209 |             minlength,
210 |             link_overlap,
211 |             unitig_coverages,
212 |             compcount,
213 |             oriented_links,
214 |             sub_path_cov,
215 |             likely_complete,
216 |             alpha,
217 |             mincov,
218 |             covtol,
219 |             maxpaths,
220 |             prefix,
221 |             output,
222 |             nthreads,
223 |         )
224 | 
225 |     # Else default to short reads
226 |     else:
227 |         logger.info(f"Short reads provided")
228 | 
229 |         # Get junction pe coverages
230 |         junction_pe_coverage = get_junction_pe_coverage(bampath, output)
231 | 
232 |         # Resolve genomes
233 |         (
234 |             resolved_edges,
235 |             all_resolved_paths,
236 |             all_components,
237 |             cycle_components,
238 |             linear_components,
239 |             resolved_components,
240 |             resolved_linear,
241 |             single_unitigs,
242 |             resolved_cyclic,
243 |             case1_found,
244 |             case1_resolved,
245 |             case2_found,
246 |             case2_resolved,
247 |             case3_found,
248 |             case3_resolved,
249 |             phage_like_edges,
250 |             all_phage_like_edges,
251 |             unresolved_phage_like_edges,
252 |         ) = short_utils.resolve_short(
253 |             assembly_graph,
254 |             pruned_vs,
255 |             unitig_names,
256 |             unitig_names_rev,
257 |             self_looped_nodes,
258 |             graph_unitigs,
259 |             minlength,
260 |             link_overlap,
261 |             unitig_coverages,
262 |             compcount,
263 |             oriented_links,
264 |             junction_pe_coverage,
265 |             likely_complete,
266 |             alpha,
267 |             mincov,
268 |             covtol,
269 |             maxpaths,
270 |             prefix,
271 |             output,
272 |             nthreads,
273 |         )
274 | 
275 |     # Log final summary information
276 |     # ----------------------------------------------------------------------
277 |     logger.info(f"Total number of cyclic components found: {len(cycle_components)}")
278 |     logger.info(f"Total number of cyclic components resolved: {len(resolved_cyclic)}")
279 |     logger.info(f"Single unitigs identified: {len(single_unitigs)}")
280 |     logger.info(f"Total number of linear components found: {len(linear_components)}")
281 |     logger.info(f"Total number of linear components resolved: {len(resolved_linear)}")
282 |     logger.info(
283 |         f"Total number of cyclic components found including single unitigs: {len(cycle_components) + len(single_unitigs)}"
284 |     )
285 |     logger.info(
286 |         f"Total number of components resolved: {len(single_unitigs)+len(resolved_cyclic)+len(resolved_linear)}"
287 |     )
288 |     logger.info(f"Case 1 (resolved/found): {len(case1_resolved)}/{len(case1_found)}")
289 |     logger.info(f"Case 2 (resolved/found): {len(case2_resolved)}/{len(case2_found)}")
290 |     logger.info(f"Case 3 (resolved/found): {len(case3_resolved)}/{len(case3_found)}")
291 |     logger.info(f"Total number of genomes resolved: {len(all_resolved_paths)}")
292 | 
293 |     if len(all_resolved_paths) == 0:
294 |         logger.info(f"No genomes were resolved.")
295 |     else:
296 |         logger.info(f"Resolved genomes can be found in {output}/resolved_paths.fasta")
297 | 
298 |     # Write edges to file
299 |     # ----------------------------------------------------------------------
300 | 
301 |     write_unitigs(
302 |         phage_like_edges, unitig_names, graph_unitigs, "phage_like_edges", output
303 |     )
304 |     write_unitigs(
305 |         all_phage_like_edges,
306 |         unitig_names,
307 |         graph_unitigs,
308 |         "all_phage_like_edges",
309 |         output,
310 |     )
311 |     write_unitigs(resolved_edges, unitig_names, graph_unitigs, "resolved_edges", output)
312 |     write_unitigs(
313 |         unresolved_phage_like_edges,
314 |         unitig_names,
315 |         graph_unitigs,
316 |         "unresolved_phage_like_edges",
317 |         output,
318 |     )
319 | 
320 |     # Record path information
321 |     # ----------------------------------------------------------------------
322 | 
323 |     filename = write_res_genome_info(all_resolved_paths, output)
324 |     if len(all_resolved_paths) > 0:
325 |         logger.info(f"Resolved genome information can be found in {output}/{filename}")
326 | 
327 |     # Record component information
328 |     # ----------------------------------------------------------------------
329 | 
330 |     filename = write_component_info(all_components, output)
331 |     if len(all_components) > 0:
332 |         logger.info(
333 |             f"Resolved component information can be found in {output}/{filename}"
334 |         )
335 | 
336 |     filename = write_component_phrog_info(resolved_components, comp_phrogs, output)
337 |     if len(resolved_components) > 0:
338 |         logger.info(
339 |             f"PHROGs found in resolved components can be found in {output}/{filename}"
340 |         )
341 | 
342 |     # Get elapsed time
343 |     # ----------------------------------------------------------------------
344 | 
345 |     # Determine elapsed time
346 |     elapsed_time = time.time() - start_time
347 | 
348 |     # Print elapsed time for the process
349 |     logger.info(f"Elapsed time: {elapsed_time} seconds")
350 | 
351 |     # Exit program
352 |     # ----------------------------------------------------------------------
353 | 
354 |     logger.info("Thank you for using Phables!")
355 | 
356 | 
357 | if __name__ == "__main__":
358 |     main()
359 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables_utils/FD_Inexact.py:
--------------------------------------------------------------------------------
  1 | # Source: https://github.com/algbio/MFD-ILP
  2 | 
  3 | import logging
  4 | 
  5 | import more_itertools
  6 | import networkx as nx
  7 | 
  8 | # create logger
  9 | logger = logging.getLogger("phables 1.4.1")
 10 | 
 11 | 
 12 | def read_input(graphfile, number_subpath):
 13 |     trip_data = open(graphfile, "r").read().split("\n")
 14 |     i = 0
 15 |     listOfGraphs = {}
 16 |     k = 0
 17 | 
 18 |     while True:
 19 |         if "#" in trip_data[i]:
 20 |             i = i + 1
 21 |             N = int(trip_data[i])
 22 |             edges = list()
 23 |             subpaths = {}
 24 |             while True:
 25 |                 i = i + 1
 26 |                 if "#" in trip_data[i]:
 27 |                     break
 28 |                 if "" == trip_data[i]:
 29 |                     break
 30 |                 if "subpaths" in trip_data[i]:
 31 |                     for j in range(0, number_subpath):
 32 |                         i = i + 1
 33 |                         line = trip_data[i].split(" ")
 34 |                         subpaths[j] = line[0 : len(line) - 1]
 35 |                     i = i + 4
 36 |                 if i >= len(trip_data) - 1:
 37 |                     break
 38 |                 line = trip_data[i].split(" ")
 39 |                 edges.append((line[0], line[1], line[2], line[3]))
 40 |             G = {"Nodes": N, "list of edges": edges, "subpaths": subpaths}
 41 |             listOfGraphs[k] = G
 42 |             k += 1
 43 |         if i >= len(trip_data) - 1:
 44 |             break
 45 | 
 46 |     return listOfGraphs
 47 | 
 48 | 
 49 | # FD-Subpath-Inexact-Gurobi
 50 | # --------------------------------------------
 51 | def flowMultipleDecomposition(data, K, nthreads):
 52 |     # libraries
 53 |     import gurobipy as gp
 54 |     from gurobipy import GRB
 55 | 
 56 |     # calculate the minimal flow decomposition based on such graph
 57 |     V = data["vertices"]
 58 |     E = data["edges"]
 59 |     W = data["maxFlow"]
 60 |     S = data["sources"]
 61 |     D = data["targets"]
 62 |     AD_in = data["adj_in"]
 63 |     AD_out = data["adj_out"]
 64 |     f_low = data["flows_low"]
 65 |     f_up = data["flows_up"]
 66 |     subpaths = data["subpaths"]
 67 | 
 68 |     try:
 69 |         # create extra sets
 70 |         T = [(i, j, k) for (i, j) in E for k in range(0, K)]
 71 |         SC = [k for k in range(0, K)]
 72 |         R = [(k, s) for k in range(0, K) for s in range(0, len(subpaths))]
 73 | 
 74 |         # Create a new model
 75 |         model = gp.Model("MFD")
 76 |         model.setParam("LogToConsole", 0)
 77 |         model.setParam("Threads", nthreads)
 78 | 
 79 |         # Create variables
 80 |         x = model.addVars(T, vtype=GRB.BINARY, name="x")
 81 |         w = model.addVars(SC, vtype=GRB.INTEGER, name="w", lb=0)
 82 |         z = model.addVars(T, vtype=GRB.CONTINUOUS, name="z", lb=0)
 83 |         r = model.addVars(R, vtype=GRB.BINARY, name="r")
 84 | 
 85 |         model.setObjective(GRB.MINIMIZE)
 86 | 
 87 |         # flow conservation
 88 |         for k in range(0, K):
 89 |             for i in V:
 90 |                 if i in S:
 91 |                     model.addConstr(sum(x[i, j, k] for j in AD_out[i]) == 1)
 92 |                 if i in D:
 93 |                     model.addConstr(sum(x[j, i, k] for j in AD_in[i]) == 1)
 94 |                 if i not in S and i not in D:
 95 |                     model.addConstr(
 96 |                         sum(x[i, j, k] for j in AD_out[i])
 97 |                         - sum(x[j, i, k] for j in AD_in[i])
 98 |                         == 0
 99 |                     )
100 | 
101 |         # flow balance
102 |         model.addConstrs(
103 |             f_up[i, j] >= gp.quicksum(z[i, j, k] for k in range(0, K)) for (i, j) in E
104 |         )
105 |         model.addConstrs(
106 |             f_low[i, j] <= gp.quicksum(z[i, j, k] for k in range(0, K)) for (i, j) in E
107 |         )
108 | 
109 |         # linearization
110 |         for i, j in E:
111 |             for k in range(0, K):
112 |                 model.addConstr(z[i, j, k] <= W * x[i, j, k])
113 |                 model.addConstr(w[k] - (1 - x[i, j, k]) * W <= z[i, j, k])
114 |                 model.addConstr(z[i, j, k] <= w[k])
115 | 
116 |         # subpath constraints
117 |         for k in range(0, K):
118 |             for sp_len in range(0, len(subpaths)):
119 |                 subpath_edges = list(more_itertools.pairwise(subpaths[sp_len]))
120 |                 try:
121 |                     model.addConstr(
122 |                         gp.quicksum(x[i, j, k] for (i, j) in subpath_edges)
123 |                         >= len(subpath_edges) * r[k, sp_len]
124 |                     )
125 |                 except:
126 |                     continue
127 | 
128 |         model.addConstrs(
129 |             gp.quicksum(r[k, sp_len] for k in range(0, K)) >= 1
130 |             for sp_len in range(0, len(subpaths))
131 |         )
132 | 
133 |         # objective function
134 |         model.optimize()
135 | 
136 |         w_sol = [0] * len(range(0, K))
137 |         x_sol = {}
138 |         paths = [list() for i in range(0, K)]
139 | 
140 |         if model.status == GRB.OPTIMAL:
141 |             data["message"] = "solved"
142 |             data["runtime"] = model.Runtime
143 | 
144 |             for v in model.getVars():
145 |                 if "w" in v.VarName:
146 |                     for k in range(0, K):
147 |                         if str(k) in v.VarName:
148 |                             w_sol[k] = v.x
149 | 
150 |                 if "x" in v.VarName:
151 |                     for i, j, k in T:
152 |                         if str(i) + "," + str(j) + "," + str(k) in v.VarName:
153 |                             x_sol[i, j, k] = v.x
154 | 
155 |             for i, j, k in T:
156 |                 if x_sol[i, j, k] == 1:
157 |                     paths[k].append((i, j))
158 | 
159 |             data["weights"] = w_sol
160 |             data["solution"] = paths
161 | 
162 |         if model.status == GRB.INFEASIBLE:
163 |             data["message"] = "unsolved"
164 | 
165 |     except gp.GurobiError as e:
166 |         logger.error(f"Error code {e.errno}: {str(e)}")
167 | 
168 |     except AttributeError:
169 |         logger.error(f"Encountered an attribute error")
170 | 
171 |     return data
172 | 
173 | 
def FD_Algorithm(data, max_paths, nthreads):
    """Find the smallest path count (1..max_paths) that solves the instance.

    Calls ``flowMultipleDecomposition`` with an increasing number of paths
    and stops at the first call that leaves ``data["message"] == "solved"``.

    Args:
        data: Instance dictionary; must contain ``edges``, ``graph`` and
            ``minK`` and is passed through the decomposition calls.
        max_paths: Largest number of paths to try.
        nthreads: Thread count forwarded to the decomposition.

    Returns:
        ``(data, solution_paths)`` where ``solution_paths`` maps a path
        index to ``{"weight": ..., "path": ...}``; it is empty when no
        attempt was solved.
    """
    # These lookups are not used below; they are kept so that a missing key
    # still raises KeyError at this point.
    edge_list = data["edges"]
    graph_obj = data["graph"]
    min_k = data["minK"]

    # 0 doubles as the "nothing found yet" marker
    found_paths = 0
    found_weights = 0

    n_paths = 1
    while n_paths <= max_paths:
        data = flowMultipleDecomposition(data, n_paths, nthreads)
        if data["message"] == "solved":
            found_paths = data["solution"]
            found_weights = data["weights"]
            break
        n_paths += 1

    # Get solution paths and weights
    if found_paths != 0:
        solution_paths = {
            idx: {"weight": found_weights[idx], "path": found_paths[idx]}
            for idx in range(len(found_paths))
        }
    else:
        solution_paths = {}

    return data, solution_paths
197 | 
198 | 
def SolveInstances(Graphs, max_paths, outfile, recfile, nthreads):
    """Build the flow instance for the first graph and decompose it.

    Only ``Graphs[0]`` is processed.

    Args:
        Graphs: Mapping whose entry ``0`` is a dict with ``"list of edges"``
            (4-tuples ``(a, b, low, up)``; the bounds are converted with
            ``int(float(...))``) and ``"subpaths"``.
        max_paths: Largest number of paths ``FD_Algorithm`` may try.
        outfile: Path that is created/truncated; nothing is written to it.
        recfile: Path that is created/truncated; nothing is written to it.
        nthreads: Thread count forwarded to the solver.

    Returns:
        The ``solution_paths`` dictionary produced by ``FD_Algorithm``.
    """
    # Both files were opened with "w+" and never written or closed. Keep the
    # create/truncate side effect but release the handles right away.
    for path in (outfile, recfile):
        with open(path, "w+"):
            pass

    instance = Graphs[0]

    f_low = {}
    f_up = {}
    Edges = set()
    V = set()

    for a, b, c, d in instance["list of edges"]:
        Edges.add((a, b))
        V.add(a)
        V.add(b)
        f_low[a, b] = int(float(c))
        f_up[a, b] = int(float(d))

    # creation of graphs
    G = nx.DiGraph()
    G.add_edges_from(Edges, weights=f_low)
    G.add_nodes_from(V)

    # creation of adjacency lists (a node is never listed as its own neighbour)
    AD_in = {}
    AD_out = {}

    for v in V:
        # out_edges(v) yields (v, successor) pairs
        AD_out[v] = list({head for _, head in G.out_edges(v) if head != v})
        # in_edges(v) yields (predecessor, v) pairs
        AD_in[v] = list({tail for tail, _ in G.in_edges(v) if tail != v})

    # calculating source, sinks and max flows
    S = [x for x in G.nodes() if G.out_degree(x) >= 1 and G.in_degree(x) == 0]
    D = [x for x in G.nodes() if G.out_degree(x) == 0 and G.in_degree(x) >= 1]
    maxW = max(f_up.values())

    # definition of data
    data = {
        "edges": Edges,
        "flows_low": f_low,
        "flows_up": f_up,
        "vertices": V,
        "graph": G,
        "Kmax": len(Edges),
        "weights": {},
        "sources": S,
        "targets": D,
        "message": {},
        "solution": 0,
        "maxFlow": maxW,
        "adj_in": AD_in,
        "adj_out": AD_out,
        "subpaths": instance["subpaths"],
        "minK": 2,
        "runtime": 0,
    }

    data, solution_paths = FD_Algorithm(data, max_paths, nthreads)

    return solution_paths
277 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/phables/workflow/scripts/phables_utils/__init__.py


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables_utils/component_utils.py:
--------------------------------------------------------------------------------
 1 | def get_components(
 2 |     assembly_graph,
 3 |     unitig_names,
 4 |     smg_unitigs,
 5 |     unitig_phrogs,
 6 |     circular,
 7 |     edges_lengths,
 8 |     cicular_len,
 9 |     phrog_dict,
10 | ):
11 |     """
12 |     Get connected components with PHROGs and no SMGs.
13 |     """
14 | 
15 |     pruned_vs = {}
16 |     likely_complete = {}
17 | 
18 |     i = 0
19 | 
20 |     comp_phrogs = {}
21 | 
22 |     for component in assembly_graph.components():
23 |         phrogs_found = set()
24 | 
25 |         head_present = False
26 |         connector_present = False
27 |         tail_present = False
28 |         lysis_present = False
29 | 
30 |         if len(component) > 1:
31 |             for unitig in component:
32 |                 if unitig_names[unitig] in smg_unitigs:
33 |                     break
34 |                 elif unitig_names[unitig] in unitig_phrogs:
35 |                     for phrog in unitig_phrogs[unitig_names[unitig]]:
36 |                         if "head and packaging" in phrog_dict[phrog]:
37 |                             head_present = True
38 |                         if "connector" in phrog_dict[phrog]:
39 |                             connector_present = True
40 |                         if "tail" in phrog_dict[phrog]:
41 |                             tail_present = True
42 |                         if "lysis" in phrog_dict[phrog]:
43 |                             lysis_present = True
44 | 
45 |                         phrogs_found.add(phrog)
46 | 
47 |             if head_present or connector_present or tail_present or lysis_present:
48 |                 pruned_vs[i] = component
49 |                 comp_phrogs[i] = phrogs_found
50 |                 i += 1
51 | 
52 |         if len(component) == 1:
53 |             unitig = component[0]
54 |             phrogs_present = False
55 | 
56 |             if unitig_names[unitig] in unitig_phrogs:
57 |                 for phrog in unitig_phrogs[unitig_names[unitig]]:
58 |                     if "head and packaging" in phrog_dict[phrog]:
59 |                         head_present = True
60 |                     if "connector" in phrog_dict[phrog]:
61 |                         connector_present = True
62 |                     if "tail" in phrog_dict[phrog]:
63 |                         tail_present = True
64 |                     if "lysis" in phrog_dict[phrog]:
65 |                         lysis_present = True
66 | 
67 |                     phrogs_found.add(phrog)
68 | 
69 |             # Check PHROG categories in unitig (should contain at least one)
70 |             if head_present or connector_present or tail_present or lysis_present:
71 |                 phrogs_present = True
72 | 
73 |             if phrogs_present and edges_lengths[unitig_names[unitig]] > cicular_len:
74 |                 pruned_vs[i] = component
75 |                 comp_phrogs[i] = phrogs_found
76 | 
77 |                 # Check if all PHROG categories are present in unitig
78 |                 if (
79 |                     head_present
80 |                     and connector_present
81 |                     and tail_present
82 |                     and lysis_present
83 |                 ):
84 |                     likely_complete[i] = 1
85 |                 else:
86 |                     likely_complete[i] = 0
87 | 
88 |                 i += 1
89 | 
90 |     return pruned_vs, comp_phrogs, likely_complete
91 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables_utils/coverage_utils.py:
--------------------------------------------------------------------------------
  1 | import gc
  2 | import glob
  3 | import os
  4 | import pickle
  5 | from collections import defaultdict
  6 | 
  7 | import pysam
  8 | 
  9 | 
 10 | def get_unitig_coverage(coverage):
 11 |     """
 12 |     Get coverage values of unitigs
 13 |     """
 14 | 
 15 |     unitig_coverages = {}
 16 | 
 17 |     with open(coverage, "r") as myfile:
 18 |         for line in myfile.readlines():
 19 |             if not line.startswith("Contig"):
 20 |                 strings = line.strip().split()
 21 | 
 22 |                 unitig_name = strings[0]
 23 | 
 24 |                 coverage_sum = sum([float(x) for x in strings[1:]])
 25 | 
 26 |                 unitig_coverages[unitig_name] = coverage_sum
 27 | 
 28 |     return unitig_coverages
 29 | 
 30 | 
 31 | def read_pair_generator(bam, region_string=None):
 32 |     """
 33 |     Generate read pairs in a BAM file or within a region string.
 34 |     Reads are added to read_dict until a pair is found.
 35 |     """
 36 |     read_dict = defaultdict(lambda: [None, None])
 37 | 
 38 |     for read in bam.fetch(region=region_string):
 39 |         if (
 40 |             read.is_secondary
 41 |             or read.is_supplementary
 42 |             or not read.is_paired
 43 |             or read.mapping_quality <= 30
 44 |         ):
 45 |             continue
 46 |         qname = read.query_name
 47 |         if qname not in read_dict:
 48 |             if read.is_read1:
 49 |                 read_dict[qname][0] = read
 50 |             else:
 51 |                 read_dict[qname][1] = read
 52 |         else:
 53 |             if read.is_read1:
 54 |                 yield read, read_dict[qname][1]
 55 |             else:
 56 |                 yield read_dict[qname][0], read
 57 |             del read_dict[qname]
 58 | 
 59 |     return read_dict
 60 | 
 61 | 
 62 | def get_junction_pe_coverage(bam_path, output):
 63 |     """
 64 |     Get number of paired end reads supporting a junction
 65 |     """
 66 | 
 67 |     link_counts = defaultdict(int)
 68 | 
 69 |     if os.path.isfile(f"{output}/junction_pe_coverage.pickle"):
 70 |         with open(f"{output}/junction_pe_coverage.pickle", "rb") as handle:
 71 |             link_counts = pickle.load(handle)
 72 | 
 73 |     else:
 74 |         bam_files = glob.glob(bam_path + "/*.bam")
 75 | 
 76 |         for bam_file in bam_files:
 77 |             bam = pysam.AlignmentFile(bam_file, "rb")
 78 | 
 79 |             read_pairs = read_pair_generator(bam)
 80 | 
 81 |             for read1, read2 in read_pairs:
 82 |                 if read1.reference_name != read2.reference_name:
 83 |                     link_counts[(read1.reference_name, read2.reference_name)] += 1
 84 | 
 85 |         with open(f"{output}/junction_pe_coverage.pickle", "wb") as handle:
 86 |             pickle.dump(link_counts, handle, protocol=pickle.HIGHEST_PROTOCOL)
 87 | 
 88 |     return link_counts
 89 | 
 90 | 
 91 | def get_sub_path_coverage(sub_path_cov, bam_path, output):
 92 |     """
 93 |     Get coverage values of sub paths using long reads
 94 |     """
 95 | 
 96 |     if os.path.isfile(f"{output}/sub_path_coverage.pickle"):
 97 |         sub_path_cov = defaultdict(int)
 98 |         with open(f"{output}/sub_path_coverage.pickle", "rb") as handle:
 99 |             sub_path_cov = pickle.load(handle)
100 | 
101 |     else:
102 |         bam_files = glob.glob(bam_path + "/*.bam")
103 | 
104 |         for bam_file in bam_files:
105 |             unitig_reads = defaultdict(set)
106 | 
107 |             bam = pysam.AlignmentFile(bam_file, "rb")
108 | 
109 |             for read in bam:
110 |                 if not read.is_unmapped:  # Only consider mapped reads
111 |                     query_id = read.query_name
112 |                     target_id = read.reference_name
113 |                     unitig_reads[target_id].add(query_id)
114 | 
115 |             for sub_path in sub_path_cov.keys():
116 |                 if len(sub_path) == 3:
117 |                     node1 = sub_path[0]
118 |                     node2 = sub_path[1]
119 |                     node3 = sub_path[2]
120 | 
121 |                     intersection_set = unitig_reads[node1].intersection(
122 |                         unitig_reads[node2], unitig_reads[node3]
123 |                     )
124 |                     sub_path_cov[sub_path] += len(intersection_set)
125 | 
126 |                 elif len(sub_path) == 2:
127 |                     node1 = sub_path[0]
128 |                     node2 = sub_path[1]
129 | 
130 |                     intersection_set = unitig_reads[node1].intersection(
131 |                         unitig_reads[node2]
132 |                     )
133 |                     sub_path_cov[sub_path] += len(intersection_set)
134 | 
135 |             del unitig_reads
136 |             del bam
137 |             gc.collect()
138 | 
139 |     return sub_path_cov
140 | 
141 | 
def get_graph_spanning_reads(gaf_path, output):
    """
    Get number of reads spanning across a junction

    If ``{output}/graph_spanning_reads.pickle`` exists it is loaded and
    returned. Otherwise every ``*.gaf`` file directly inside ``gaf_path`` is
    parsed: the sixth tab-separated column is treated as the path, and only
    paths made of exactly two segments in the same orientation are counted.
    ``>a>b`` counts towards ``(a, b)`` and ``<a<b`` counts towards ``(b, a)``.
    The counts are then pickled to the cache file.

    Lines with fewer than six columns (e.g. blank lines) are skipped, and
    paths mixing ``>`` and ``<`` are ignored so that they can no longer
    create keys containing an orientation character.

    Args:
        gaf_path: directory containing the GAF files.
        output: directory holding (or receiving) the pickle cache.

    Returns:
        Mapping of (segment, segment) -> number of supporting reads.
    """

    junction_reads = defaultdict(int)
    cache_file = f"{output}/graph_spanning_reads.pickle"

    if os.path.isfile(cache_file):
        # NOTE(security): pickle.load can execute arbitrary code; the cache
        # file must only ever come from a trusted previous run.
        with open(cache_file, "rb") as handle:
            junction_reads = pickle.load(handle)

    else:
        for gaf_file in glob.glob(gaf_path + "/*.gaf"):
            with open(gaf_file, "r") as myfile:
                # Stream the file: alignment files can be very large
                for line in myfile:
                    strings = line.strip().split("\t")

                    if len(strings) < 6:
                        continue

                    path = strings[5]

                    if path.count(">") == 2 and "<" not in path:
                        edges = path.split(">")[1:]
                        junction_reads[(edges[0], edges[1])] += 1

                    elif path.count("<") == 2 and ">" not in path:
                        edges = path.split("<")[1:]
                        junction_reads[(edges[1], edges[0])] += 1

        with open(cache_file, "wb") as handle:
            pickle.dump(junction_reads, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return junction_reads
173 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables_utils/edge_graph_utils.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import logging
  3 | from collections import defaultdict
  4 | 
  5 | from Bio import SeqIO
  6 | from Bio.Seq import Seq
  7 | from igraph import Graph
  8 | 
# Module-level logger used by the helpers in this file (remove_dead_ends logs
# the nodes it prunes); the logger name embeds the version string.
logger = logging.getLogger("phables 1.4.1")
 11 | 
 12 | 
 13 | class BidirectionalError(Exception):
 14 |     """Must set a unique value in a BijectiveMap."""
 15 | 
 16 |     def __init__(self, value):
 17 |         self.value = value
 18 |         msg = 'The value "{}" is already in the mapping.'
 19 |         super().__init__(msg.format(value))
 20 | 
 21 | 
 22 | class BidirectionalMap(dict):
 23 |     """Invertible map."""
 24 | 
 25 |     def __init__(self, inverse=None):
 26 |         if inverse is None:
 27 |             inverse = self.__class__(inverse=self)
 28 |         self.inverse = inverse
 29 | 
 30 |     def __setitem__(self, key, value):
 31 |         if value in self.inverse:
 32 |             raise BidirectionalError(value)
 33 | 
 34 |         self.inverse._set_item(value, key)
 35 |         self._set_item(key, value)
 36 | 
 37 |     def __delitem__(self, key):
 38 |         self.inverse._del_item(self[key])
 39 |         self._del_item(key)
 40 | 
 41 |     def _del_item(self, key):
 42 |         super().__delitem__(key)
 43 | 
 44 |     def _set_item(self, key, value):
 45 |         super().__setitem__(key, value)
 46 | 
 47 | 
 48 | def get_unitig_lengths(edge_file):
 49 |     """
 50 |     Get length of the unitigs
 51 |     """
 52 | 
 53 |     unitig_lengths = {}
 54 | 
 55 |     for index, record in enumerate(SeqIO.parse(edge_file, "fasta")):
 56 |         unitig_lengths[record.id] = len(record.seq)
 57 | 
 58 |     return unitig_lengths
 59 | 
 60 | 
 61 | def get_links(assembly_graph_file):
 62 |     """
 63 |     Get links from the assembly graph
 64 |     """
 65 | 
 66 |     node_count = 0
 67 |     graph_contigs = {}
 68 |     edges_lengths = {}
 69 |     oriented_links = defaultdict(lambda: defaultdict(list))
 70 |     link_overlap = defaultdict(int)
 71 |     links = []
 72 | 
 73 |     my_map = BidirectionalMap()
 74 | 
 75 |     # Get links from .gfa file
 76 |     with open(assembly_graph_file) as file:
 77 |         for line in file.readlines():
 78 |             # Identify lines with link information
 79 |             if line.startswith("L"):
 80 |                 strings = line.split("\t")
 81 | 
 82 |                 link1 = strings[1]
 83 |                 link2 = strings[3]
 84 | 
 85 |                 link1_orientation = strings[2]
 86 |                 link2_orientation = strings[4]
 87 |                 overlap = int(strings[5].strip()[:-1])
 88 | 
 89 |                 link = []
 90 |                 link.append(link1)
 91 |                 link.append(link2)
 92 |                 links.append(link)
 93 | 
 94 |                 if link1 != link2:
 95 |                     if link1_orientation == "+" and link2_orientation == "+":
 96 |                         oriented_links[link1][link2].append(("+", "+"))
 97 |                         link_overlap[(f"{link1}+", f"{link2}+")] = overlap
 98 |                         oriented_links[link2][link1].append(("-", "-"))
 99 |                         link_overlap[(f"{link2}-", f"{link1}-")] = overlap
100 |                     elif link1_orientation == "-" and link2_orientation == "-":
101 |                         oriented_links[link1][link2].append(("-", "-"))
102 |                         link_overlap[(f"{link1}-", f"{link2}-")] = overlap
103 |                         oriented_links[link2][link1].append(("+", "+"))
104 |                         link_overlap[(f"{link2}+", f"{link1}+")] = overlap
105 |                     elif link1_orientation == "+" and link2_orientation == "-":
106 |                         oriented_links[link1][link2].append(("+", "-"))
107 |                         link_overlap[(f"{link1}+", f"{link2}-")] = overlap
108 |                         oriented_links[link2][link1].append(("+", "-"))
109 |                         link_overlap[(f"{link2}+", f"{link1}-")] = overlap
110 |                     elif link1_orientation == "-" and link2_orientation == "+":
111 |                         oriented_links[link1][link2].append(("-", "+"))
112 |                         link_overlap[(f"{link1}-", f"{link2}+")] = overlap
113 |                         oriented_links[link2][link1].append(("-", "+"))
114 |                         link_overlap[(f"{link2}-", f"{link1}+")] = overlap
115 | 
116 |             elif line.startswith("S"):
117 |                 strings = line.strip().split()
118 |                 my_map[node_count] = strings[1]
119 |                 graph_contigs[strings[1]] = Seq(strings[2])
120 |                 edges_lengths[strings[1]] = len(strings[2])
121 |                 node_count += 1
122 | 
123 |             line = file.readline()
124 | 
125 |     return (
126 |         node_count,
127 |         graph_contigs,
128 |         links,
129 |         oriented_links,
130 |         link_overlap,
131 |         my_map,
132 |         edges_lengths,
133 |     )
134 | 
135 | 
def get_graph_edges(links, contig_names_rev):
    """
    Split links into graph edges and self-looped nodes

    Links joining two different names become ``(id, id)`` tuples looked up in
    ``contig_names_rev``; for links joining a name to itself, the name is
    collected in the second returned list instead.
    """

    edge_list, self_looped_nodes = [], []

    for link in links:
        source, target = link[0], link[1]

        if source == target:
            # Self loops are reported separately rather than added as edges
            self_looped_nodes.append(source)
            continue

        edge_list.append((contig_names_rev[source], contig_names_rev[target]))

    return edge_list, self_looped_nodes
155 | 
156 | 
def build_assembly_graph(assembly_graph_file):
    """
    Build an undirected graph from the links of a GFA file

    One vertex is created per segment returned by ``get_links``; each vertex
    gets the attributes "id" (its index), "name" (segment name) and "label"
    (name and index). Links between different segments become edges, after
    which ``simplify`` is called with ``multiple=True, loops=False``.

    Returns the graph followed by the oriented links, link overlaps, the
    index -> name map, its inverse, the segment sequences, the self-looped
    segment names and the segment lengths.
    """

    parsed = get_links(assembly_graph_file)
    node_count, graph_contigs, links = parsed[0], parsed[1], parsed[2]
    oriented_links, link_overlap = parsed[3], parsed[4]
    contig_names, edges_lengths = parsed[5], parsed[6]

    # Name -> index view of the same mapping
    contig_names_rev = contig_names.inverse

    assembly_graph = Graph(directed=False)
    assembly_graph.add_vertices(node_count)

    # Annotate every vertex with its index and contig identifier
    for idx in range(node_count):
        name = contig_names[idx]
        assembly_graph.vs[idx]["id"] = idx
        assembly_graph.vs[idx]["name"] = name
        assembly_graph.vs[idx]["label"] = name + "\nID:" + str(idx)

    edge_list, self_looped_nodes = get_graph_edges(
        links=links, contig_names_rev=contig_names_rev
    )

    assembly_graph.add_edges(edge_list)
    assembly_graph.simplify(multiple=True, loops=False, combine_edges=None)

    return (
        assembly_graph,
        oriented_links,
        link_overlap,
        contig_names,
        contig_names_rev,
        graph_contigs,
        self_looped_nodes,
        edges_lengths,
    )
207 | 
208 | 
def get_circular(self_looped_nodes, graph_unitigs):
    """
    Map every self-looped unitig to the length of its sequence string
    """

    return {
        unitig: len(str(graph_unitigs[unitig])) for unitig in self_looped_nodes
    }
231 | 
232 | 
def remove_dead_ends(G_edge):
    """
    Find the dead-ends of the component

    Works on a deep copy of ``G_edge`` (the input graph is not modified).
    A node is a dead-end when it lacks an incoming edge or lacks an outgoing
    edge. Dead-ends are removed from the copy repeatedly, because removing
    one node can turn its neighbours into dead-ends, until none remain.

    Args:
        G_edge: directed graph exposing ``nodes``, ``in_degree(node)``,
            ``out_degree(node)`` and ``remove_nodes_from``.

    Returns:
        Set of every node removed over all rounds.
    """

    new_G = copy.deepcopy(G_edge)

    dead_ends_to_remove = []

    while True:
        # The previous condition, `not (in > 0 and out_degree()(node)) > 0`,
        # only gave this result through operator precedence; this is the
        # intended test written out explicitly.
        to_remove = [
            node
            for node in list(new_G.nodes)
            if not (new_G.in_degree(node) > 0 and new_G.out_degree(node) > 0)
        ]

        if not to_remove:
            break

        new_G.remove_nodes_from(to_remove)
        logger.debug(f"Removing dead-ends: {to_remove}")
        dead_ends_to_remove += to_remove

    return set(dead_ends_to_remove)
260 | 
261 | 
def get_all_sub_paths(assembly_graph, unitig_names):
    """
    Collect every simple path of two or three vertices as a zero-count entry

    For each vertex, the simple paths returned with ``cutoff=2`` are taken;
    paths of two or three vertices are converted to tuples of unitig names
    and registered with the value 0.
    """

    sub_paths = defaultdict(int)

    for start in range(assembly_graph.vcount()):
        for path in assembly_graph.get_all_simple_paths(start, cutoff=2):
            # 3 vertices = 2 edges, 2 vertices = 1 edge; anything else is skipped
            if len(path) in (2, 3):
                sub_paths[tuple(unitig_names[vertex] for vertex in path)] = 0

    return sub_paths
286 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables_utils/flow_utils.py:
--------------------------------------------------------------------------------
 1 | import networkx as nx
 2 | 
 3 | from .FD_Inexact import SolveInstances
 4 | 
 5 | 
 6 | def get_source_sink_circular(G_edge, graph_unitigs, minlength, self_looped_nodes):
 7 |     """
 8 |     Identify source/sink vertex for circular components
 9 |     """
10 | 
11 |     source_sink_candidates = []
12 | 
13 |     for node in list(G_edge.nodes):
14 |         unitig_name = node[:-1]
15 | 
16 |         if (
17 |             unitig_name not in self_looped_nodes
18 |             and len(graph_unitigs[unitig_name]) > minlength
19 |         ):
20 |             # Get BFS layers
21 |             bfs_layers = dict(enumerate(nx.bfs_layers(G_edge, node)))
22 | 
23 |             # Get last later
24 |             last_layer = list(bfs_layers.keys())[-1]
25 | 
26 |             node_is_st = True
27 | 
28 |             # Check if successors of those in last_layer is same as the node
29 |             for item in bfs_layers[last_layer]:
30 |                 if item[:-1] not in self_looped_nodes:
31 |                     item_successors = list(G_edge.successors(item))
32 | 
33 |                     if (
34 |                         len(item_successors) > 0
35 |                         and list(G_edge.successors(item))[0] != node
36 |                     ):
37 |                         node_is_st = False
38 |                         break
39 |                     if len(item_successors) == 0:
40 |                         node_is_st = False
41 | 
42 |             if len(bfs_layers[last_layer]) == 0:
43 |                 node_is_st = False
44 | 
45 |             if node_is_st:
46 |                 source_sink_candidates.append(node)
47 | 
48 |     return source_sink_candidates
49 | 
50 | 
def get_source_sink_linear(G_edge, self_looped_nodes):
    """
    Identify source and sink candidates for linear components

    Nodes whose unitig name (the node without its last character) is
    self-looped are ignored. Of the rest, nodes with successors but no
    predecessors are sources, and nodes with predecessors but no successors
    are sinks.
    """

    sources, sinks = [], []

    for node in list(G_edge.nodes):
        if node[:-1] in self_looped_nodes:
            continue

        n_in = sum(1 for _ in G_edge.predecessors(node))
        n_out = sum(1 for _ in G_edge.successors(node))

        if n_in == 0 and n_out > 0:
            sources.append(node)
        elif n_in > 0 and n_out == 0:
            sinks.append(node)

    return sources, sinks
71 | 
72 | 
def solve_mfd(G, max_paths, output, nthreads):
    """
    Get paths by solving MFD

    Wraps ``G`` as instance 0 and hands it to ``SolveInstances`` together
    with the two result-file paths located in ``output``.
    """

    return SolveInstances(
        {0: G},
        max_paths,
        f"{output}/results_MFD.txt",
        f"{output}/results_MFD_details.txt",
        nthreads,
    )
89 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables_utils/gene_utils.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | from pathlib import Path
 3 | 
 4 | 
 5 | def get_smg_unitigs(hmmout, mg_frac):
 6 |     """
 7 |     Get unitigs containing bacterial single-copy marker genes
 8 |     """
 9 | 
10 |     # Commands
11 |     # run_FragGeneScan.pl -genome=edges.fasta -out=edges.fasta.frag -complete=0 -train=complete -thread=8 1>edges.fasta.frag.out 2>edges.fasta.frag.err
12 |     # hmmsearch --domtblout edges.fasta.hmmout --cut_tc --cpu 8 /home/mall0133/software/MetaCoAG/metacoag_utils/auxiliary/marker.hmm edges.fasta.frag.faa 1>edges.fasta.hmmout.out 2> edges.fasta.hmmout.err
13 | 
14 |     smg_unitigs = set()
15 | 
16 |     unitig_smgs = {}
17 | 
18 |     with open(hmmout, "r") as myfile:
19 |         for line in myfile.readlines():
20 |             if not line.startswith("#"):
21 |                 strings = line.strip().split()
22 | 
23 |                 unitig = strings[0]
24 | 
25 |                 # Marker gene name
26 |                 marker_gene = strings[3]
27 | 
28 |                 # Marker gene length
29 |                 marker_gene_length = int(strings[5])
30 | 
31 |                 # Mapped marker gene length
32 |                 mapped_marker_length = int(strings[16]) - int(strings[15])
33 | 
34 |                 name_strings = unitig.split("_")
35 |                 name_strings = name_strings[: len(name_strings) - 3]
36 | 
37 |                 # unitig name
38 |                 unitig_name = "_".join(name_strings)
39 | 
40 |                 if mapped_marker_length > marker_gene_length * mg_frac:
41 |                     smg_unitigs.add(unitig_name)
42 | 
43 |                     if unitig_name not in unitig_smgs:
44 |                         unitig_smgs[unitig_name] = set()
45 |                         unitig_smgs[unitig_name].add(marker_gene)
46 |                     else:
47 |                         unitig_smgs[unitig_name].add(marker_gene)
48 | 
49 |     return smg_unitigs
50 | 
51 | 
def get_phrog_unitigs(phrogs, e_value, seq_identity):
    """
    Get unitigs containing PHROGs

    Loads the PHROG table shipped next to this module
    (``phrogs/phrog_annot.tsv``) into a mapping of ``phrog_<id>`` to the
    third and fourth table columns joined by a space. Then reads the
    tab-separated hits file ``phrogs`` and keeps, per unitig, the PHROG ids
    of hits whose e-value is below ``e_value`` and whose sequence identity is
    above ``seq_identity``.

    Returns ``(unitig_phrogs, phrog_dict)``.
    """

    # PHROG id -> "<column 2> <column 3>" of the annotation table
    phrog_table_file = Path(__file__).parent / "phrogs" / "phrog_annot.tsv"

    phrog_dict = defaultdict(str)

    with open(phrog_table_file, "r") as table:
        for row in table:
            # Skip the header row
            if row.startswith("phrog"):
                continue
            cols = row.strip().split("\t")
            phrog_dict[f"phrog_{cols[0]}"] = f"{cols[2]} {cols[3]}"

    # Unitig name -> set of PHROG ids passing both thresholds
    unitig_phrogs = {}

    with open(phrogs, "r") as hits:
        for row in hits:
            cols = row.strip().split("\t")

            # The first two columns are stripped of their first and last
            # character; the PHROG id is the first word of the second column
            name = cols[0][1:-1]
            phrog_id = cols[1][1:-1].split()[0]
            identity = float(cols[3])
            evalue = float(cols[4])

            if evalue < e_value and identity > seq_identity:
                unitig_phrogs.setdefault(name, set()).add(phrog_id)

    return unitig_phrogs, phrog_dict
89 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables_utils/genome_utils.py:
--------------------------------------------------------------------------------
 1 | # Class for genome path
 2 | class GenomePath:
 3 |     def __init__(
 4 |         self,
 5 |         id,
 6 |         bubble_case,
 7 |         node_order,
 8 |         node_order_human,
 9 |         node_id_order,
10 |         path,
11 |         coverage,
12 |         length,
13 |         gc,
14 |     ):
15 |         self.id = id
16 |         self.bubble_case = bubble_case
17 |         self.path = path
18 |         self.coverage = coverage
19 |         self.length = length
20 |         self.node_order = node_order
21 |         self.node_order_human = node_order_human
22 |         self.node_id_order = node_id_order
23 |         self.gc = gc
24 | 
25 | 
26 | # Class for genome component
class GenomeComponent:
    """Plain record holding the summary statistics of one component.

    Every constructor argument is stored unchanged on the attribute of the
    same name.
    """

    def __init__(
        self,
        id,
        n_nodes,
        n_paths,
        max_degree,
        min_degree,
        max_in_degree,
        max_out_degree,
        avg_degree,
        avg_in_degree,
        avg_out_degree,
        density,
        max_path_length,
        min_path_length,
        min_max_len_ratio,
        max_cov_path_length,
        min_cov_path_length,
        min_max_cov_len_ratio,
        max_cov,
        min_cov,
        min_max_cov_ratio,
        frac_unitigs,
    ):
        # Size of the component
        self.id, self.n_nodes, self.n_paths = id, n_nodes, n_paths

        # Degree statistics
        self.max_degree, self.min_degree = max_degree, min_degree
        self.max_in_degree, self.max_out_degree = max_in_degree, max_out_degree
        self.avg_degree = avg_degree
        self.avg_in_degree, self.avg_out_degree = avg_in_degree, avg_out_degree
        self.density = density

        # Path length statistics
        self.max_path_length, self.min_path_length = max_path_length, min_path_length
        self.min_max_len_ratio = min_max_len_ratio
        self.max_cov_path_length = max_cov_path_length
        self.min_cov_path_length = min_cov_path_length
        self.min_max_cov_len_ratio = min_max_cov_len_ratio

        # Coverage statistics
        self.max_cov, self.min_cov = max_cov, min_cov
        self.min_max_cov_ratio = min_max_cov_ratio
        self.frac_unitigs = frac_unitigs
73 | 


--------------------------------------------------------------------------------
/phables/workflow/scripts/phables_utils/output_utils.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import os
  3 | import subprocess
  4 | 
# Maximum number of sequence characters written per FASTA line by write_unitigs
FASTA_LINE_LEN = 60

# Module-level logger; the logger name embeds the version string
logger = logging.getLogger("phables 1.4.1")
  9 | 
 10 | 
 11 | def write_unitigs(nodes, unitig_names, graph_unitigs, filename, output):
 12 |     """
 13 |     Write unitigs to FASTA file
 14 |     """
 15 | 
 16 |     with open(f"{output}/{filename}.fasta", "w+") as myfile:
 17 |         for node in nodes:
 18 |             unitig_name = unitig_names[node]
 19 |             edge_seq = str(graph_unitigs[unitig_name])
 20 |             myfile.write(f">{unitig_name}\n")
 21 | 
 22 |             chunks = [
 23 |                 edge_seq[i : i + FASTA_LINE_LEN]
 24 |                 for i in range(0, len(edge_seq), FASTA_LINE_LEN)
 25 |             ]
 26 | 
 27 |             for chunk in chunks:
 28 |                 myfile.write(f"{chunk}\n")
 29 | 
 30 | 
 31 | def write_component_info(all_components, output):
 32 |     """
 33 |     Write component information to file
 34 |     """
 35 | 
 36 |     with open(f"{output}/resolved_component_info.txt", "w") as myfile:
 37 |         myfile.write(f"Component\t")
 38 |         myfile.write(f"Number of nodes\t")
 39 |         myfile.write(f"Number of paths\t")
 40 |         myfile.write(f"Fraction of unitigs recovered\t")
 41 |         myfile.write(f"Maximum degree\t")
 42 |         myfile.write(f"Minimum degree\t")
 43 |         myfile.write(f"Maximum in degree\t")
 44 |         myfile.write(f"Maximum out degree\t")
 45 |         myfile.write(f"Average degree\t")
 46 |         myfile.write(f"Average in degree\t")
 47 |         myfile.write(f"Average out degree\t")
 48 |         myfile.write(f"Density\t")
 49 |         myfile.write(f"Maximum path length\t")
 50 |         myfile.write(f"Minimum path length\t")
 51 |         myfile.write(f"Length ratio (long/short)\t")
 52 |         myfile.write(f"Maximum coverage path length\t")
 53 |         myfile.write(f"Minimum coverage path length\t")
 54 |         myfile.write(f"Length ratio (highest cov/lowest cov)\t")
 55 |         myfile.write(f"Maximum coverage\t")
 56 |         myfile.write(f"Minimum coverage\t")
 57 |         myfile.write(f"Coverage ratio (highest/lowest)\n")
 58 | 
 59 |         if len(all_components) > 0:
 60 |             for component in all_components:
 61 |                 myfile.write(f"{component.id}\t")
 62 |                 myfile.write(f"{component.n_nodes}\t")
 63 |                 myfile.write(f"{component.n_paths}\t")
 64 |                 myfile.write(f"{component.frac_unitigs}\t")
 65 |                 myfile.write(f"{component.max_degree}\t")
 66 |                 myfile.write(f"{component.min_degree}\t")
 67 |                 myfile.write(f"{component.max_in_degree}\t")
 68 |                 myfile.write(f"{component.max_out_degree}\t")
 69 |                 myfile.write(f"{component.avg_degree}\t")
 70 |                 myfile.write(f"{component.avg_in_degree}\t")
 71 |                 myfile.write(f"{component.avg_out_degree}\t")
 72 |                 myfile.write(f"{component.density}\t")
 73 |                 myfile.write(f"{component.max_path_length}\t")
 74 |                 myfile.write(f"{component.min_path_length}\t")
 75 |                 myfile.write(f"{component.min_max_len_ratio}\t")
 76 |                 myfile.write(f"{component.max_cov_path_length}\t")
 77 |                 myfile.write(f"{component.min_cov_path_length}\t")
 78 |                 myfile.write(f"{component.min_max_cov_len_ratio}\t")
 79 |                 myfile.write(f"{component.max_cov}\t")
 80 |                 myfile.write(f"{component.min_cov}\t")
 81 |                 myfile.write(f"{component.min_max_cov_ratio}\n")
 82 |         else:
 83 |             myfile.write(f"No complex components were resolved.")
 84 | 
 85 |     return "resolved_component_info.txt"
 86 | 
 87 | 
 88 | def write_res_genome_info(all_resolved_paths, output):
 89 |     """
 90 |     Write resolved genome information to file
 91 |     """
 92 | 
 93 |     with open(f"{output}/resolved_genome_info.txt", "w") as myfile:
 94 |         myfile.write(
 95 |             f"Path\tCase\tCoverage\tLength\tGC content\tNode order (gfa link format)\tNode order (human readable)\n"
 96 |         )
 97 |         for genomic_path in all_resolved_paths:
 98 |             myfile.write(
 99 |                 f"{genomic_path.id}\t{genomic_path.bubble_case}\t{genomic_path.coverage}\t{genomic_path.length}\t{genomic_path.gc}\t{genomic_path.node_order}\t{genomic_path.node_order_human}\n"
100 |             )
101 | 
102 |     return "resolved_genome_info.txt"
103 | 
104 | 
def write_path(final_genomic_paths, output):
    """
    Append every genomic path as a FASTA record to
    <output>/resolved_paths.fasta
    """

    # Opened in append mode: records from earlier calls are kept
    with open(f"{output}/resolved_paths.fasta", "a+") as fasta_file:
        for genome in final_genomic_paths:
            fasta_file.write(f">{genome.id}\n")

            # Slice boundaries are driven by the path's own `length` attribute
            for start in range(0, genome.length, FASTA_LINE_LEN):
                fasta_file.write(f"{genome.path[start : start + FASTA_LINE_LEN]}\n")
121 | 
122 | 
def write_path_fasta(final_genomic_paths, output_genomes_path):
    """
    Write genomic paths to individual FASTA files

    Each path is written to <output_genomes_path>/<path id>.fasta with its
    sequence wrapped at FASTA_LINE_LEN characters per line.

    Parameters
    ----------
    final_genomic_paths : iterable
        Objects exposing `id`, `path` (the sequence) and `length`.
    output_genomes_path : str
        Directory to write into; created (with parents) if missing.
    """

    # Create the directory in-process rather than through `mkdir -p` in a
    # shell: a string-built shell command splits paths containing spaces,
    # interprets shell metacharacters, and ignores failures.
    os.makedirs(output_genomes_path, exist_ok=True)

    for genomic_path in final_genomic_paths:
        with open(f"{output_genomes_path}/{genomic_path.id}.fasta", "w") as myfile:
            myfile.write(f">{genomic_path.id}\n")

            # Slice boundaries follow the path's own `length` attribute
            for i in range(0, genomic_path.length, FASTA_LINE_LEN):
                myfile.write(f"{genomic_path.path[i : i + FASTA_LINE_LEN]}\n")
142 | 
143 | 
def write_component_phrog_info(resolved_components, comp_phrogs, output):
    """
    Write the PHROGs recorded for each resolved component to
    <output>/component_phrogs.txt and return the file name
    """

    with open(f"{output}/component_phrogs.txt", "w") as phrog_file:
        phrog_file.write("Phage component\tPHROG\n")

        # Rows are produced lazily, so they reach the file one at a time
        phrog_file.writelines(
            f"phage_{comp}\t{comp_phrogs[comp]}\n" for comp in resolved_components
        )

    return "component_phrogs.txt"
155 | 
156 | 
def init_files(output):
    """
    Initialise files and folders

    Ensures that the result files exist under `output` (existing content is
    left untouched because the files are opened in append mode) and that the
    `resolved_phages` sub-directory exists.

    Parameters
    ----------
    output : str
        Existing output directory.
    """

    result_files = (
        "resolved_edges.fasta",
        "resolved_paths.fasta",
        "resolved_genome_info.txt",
        "resolved_component_info.txt",
        "component_phrogs.txt",
    )

    for name in result_files:
        # Append mode creates the file if missing without truncating it
        with open(f"{output}/{name}", "a"):
            pass

    # Create the directory in-process rather than through `mkdir -p` in a
    # shell: a string-built shell command splits paths containing spaces,
    # interprets shell metacharacters, and ignores failures.
    os.makedirs(f"{output}/resolved_phages", exist_ok=True)
170 | 


--------------------------------------------------------------------------------
/phables/workflow/test_phables.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Phables: from fragmented assemblies to high-quality bacteriophage genomes.
 3 | 
 4 | 2023, Vijini Mallawaarachchi
 5 | 
 6 | This is an auxiliary Snakefile to test phables.
 7 | """
 8 | 
 9 | """CONFIGURATION"""
10 | configfile: os.path.join(workflow.basedir, "..", "config", "config.yaml")
11 | configfile: os.path.join(workflow.basedir, "..", "config", "databases.yaml")
12 | 
13 | 
14 | """PREFLIGHT CHECKS
15 | Validate your inputs, set up directories, parse your config, etc.
16 | """
17 | include: "rules/00_database_preflight.smk"
18 | include: "rules/03_test_preflight.smk"
19 | 
20 | 
21 | """TARGETS
22 | Declare your targets, either here, or in a separate file.
23 | """
24 | include: "rules/03_test_targets.smk"
25 | 
26 | 
27 | """RUN SNAKEMAKE!"""
# Default target rule: its only input is allTargets, so requesting `all`
# asks Snakemake to produce everything listed there.
# allTargets is not defined in this file; presumably it comes from the
# included rules/03_test_targets.smk (confirm there).
rule all:
    input:
        allTargets
31 | 
32 | 
33 | """RULES
34 | Add rules files with the include directive here, or add rules AFTER rule 'all'.
35 | """
36 | 
# Run scripts/phables.py on the bundled test data.
# Inputs are read from TESTDIR and every result is written back into TESTDIR.
# TESTDIR and the short upper-case names used under params (ML, MC, CC, ...)
# are not defined in this file; presumably they are set by the included
# preflight rules (confirm in rules/03_test_preflight.smk).
rule test_phables:
    input:
        g = os.path.join(TESTDIR, "assembly_graph.gfa"),
        c = os.path.join(TESTDIR, "edge_coverages.tsv"),
        b = TESTDIR,
        ph = os.path.join(TESTDIR, "phrogs_annotations.tsv"),
        hm = os.path.join(TESTDIR, "edges.fasta.hmmout")
    output:
        # All outputs are wrapped in temp(), so Snakemake treats them as
        # temporary files rather than results to keep.
        genomes_fasta = temp(os.path.join(TESTDIR, "resolved_paths.fasta")),
        genomes_folder = temp(directory(os.path.join(TESTDIR, "resolved_phages"))),
        genome_info = temp(os.path.join(TESTDIR, "resolved_genome_info.txt")),
        phage_edges = temp(os.path.join(TESTDIR, "phage_like_edges.fasta")),
        all_phage_edges = temp(os.path.join(TESTDIR, "all_phage_like_edges.fasta")),
        unresolved_edges = temp(os.path.join(TESTDIR, "unresolved_phage_like_edges.fasta")),
        unitigs = temp(os.path.join(TESTDIR, "resolved_edges.fasta")),
        component_info = temp(os.path.join(TESTDIR, "resolved_component_info.txt")),
        phrog_comp_info = temp(os.path.join(TESTDIR, "component_phrogs.txt")),
        mfd = temp(os.path.join(TESTDIR, "results_MFD.txt")),
        mfd_details = temp(os.path.join(TESTDIR, "results_MFD_details.txt")),
        log = temp(os.path.join(TESTDIR, "phables_output.log"))
    params:
        # The same paths as the input section, passed again as plain parameters.
        graph = os.path.join(TESTDIR, "assembly_graph.gfa"),
        hmmout = os.path.join(TESTDIR, "edges.fasta.hmmout"),
        phrogs = os.path.join(TESTDIR, "phrogs_annotations.tsv"),
        coverage = os.path.join(TESTDIR, "edge_coverages.tsv"),
        bampath = TESTDIR,
        minlength = ML,
        mincov = MC,
        compcount = CC,
        maxpaths = MP,
        mgfrac = MGF,
        evalue = EV,
        seqidentity = SI,
        covtol = CT,
        alpha = AL,
        longreads = LR,
        prefix = PR,
        output = TESTDIR,
        # Thread count is fixed at 2 here rather than taken from configuration.
        nthreads = 2,
        # NOTE(review): phables_output.log is declared three times in this rule
        # (output, params and the log directive), and temp() is applied to a
        # params value here. Confirm this is intentional.
        log = temp(os.path.join(TESTDIR, "phables_output.log"))
    log:
        os.path.join(TESTDIR, "phables_output.log")
    conda: 
        os.path.join("envs", "phables.yaml")
    script:
        os.path.join('scripts', 'phables.py')


--------------------------------------------------------------------------------
/phables_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/phables_logo.png


--------------------------------------------------------------------------------
/phables_logo_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/phables_logo_dark.png


--------------------------------------------------------------------------------
/phables_logo_light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Vini2/phables/7f7da96dc8683e77f49e7a7bb909379e6a66a944/phables_logo_light.png


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from setuptools import find_packages, setup
 4 | 
 5 | 
 6 | def get_version():
 7 |     with open(
 8 |         os.path.join(
 9 |             os.path.dirname(os.path.realpath(__file__)),
10 |             "phables",
11 |             "phables.VERSION",
12 |         )
13 |     ) as f:
14 |         return f.readline().strip()
15 | 
16 | 
# Read the long description from the README that sits next to this file.
# The path is anchored on this file rather than the current working directory,
# and the encoding is fixed to UTF-8 so the build does not depend on where it
# is invoked from or on the platform's default locale encoding.
with open(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), "README.md"),
    "r",
    encoding="utf-8",
) as fh:
    long_description = fh.read()
19 | 
20 | 
# Extra non-package files to install: LICENSE and README.md, with "." as the
# target directory.
data_files = [(".", ["LICENSE", "README.md"])]

# Package metadata and build configuration.
setup(
    name="phables",
    packages=find_packages(),
    url="https://github.com/Vini2/phables",
    # Supported interpreters: Python 3.9 and 3.10 only.
    python_requires=">=3.9, <3.11",
    description="Phables: from fragmented assemblies to high-quality bacteriophage genomes",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # Version string is read from phables/phables.VERSION by get_version().
    version=get_version(),
    author="Vijini Mallawaarachchi",
    author_email="viji.mallawaarachchi@gmail.com",
    data_files=data_files,
    # NOTE(review): "phables" is a package directory (it has __init__.py) and
    # is already covered by find_packages(); confirm py_modules is still needed.
    py_modules=["phables"],
    install_requires=[
        "snakemake>=7.14.0",
        "pyyaml>=6.0",
        "click>=8.1.3",
        "metasnek>=0.0.5",
        "snaketool-utils>=0.0.4",
    ],
    # Installs a `phables` console command that calls main() in phables/__main__.py.
    entry_points={"console_scripts": ["phables=phables.__main__:main"]},
    include_package_data=True,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Natural Language :: English",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
        "Operating System :: MacOS",
        "Operating System :: POSIX",
    ],
)
55 | 


--------------------------------------------------------------------------------
/tests/test_phables.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | from pathlib import Path
 3 | 
 4 | import pytest
 5 | 
 6 | __author__ = "Vijini Mallawaarachchi"
 7 | __copyright__ = "Copyright 2023, Phables Project"
 8 | __license__ = "MIT"
 9 | __type__ = "Test Script"
10 | __maintainer__ = "Vijini Mallawaarachchi"
11 | __email__ = "viji.mallawaarachchi@gmail.com"
12 | 
13 | 
14 | TEST_ROOTDIR = Path(__file__).parent
15 | EXEC_ROOTDIR = Path(__file__).parent.parent
16 | 
17 | 
@pytest.fixture(scope="session")
def tmp_dir(tmpdir_factory):
    """Session-scoped temporary directory created via tmpdir_factory.mktemp("tmp")."""
    session_dir = tmpdir_factory.mktemp("tmp")
    return session_dir
21 | 
22 | 
@pytest.fixture(autouse=True)
def workingdir(tmp_dir, monkeypatch):
    """Applied automatically to every test: change directory into tmp_dir."""
    target_dir = tmp_dir
    monkeypatch.chdir(target_dir)
27 | 
28 | 
def exec_command(cmnd, stdout=subprocess.PIPE, stderr=subprocess.PIPE):
    """executes shell command and returns stdout if completes exit code 0

    Parameters
    ----------
    cmnd : str
      shell command to be executed
    stdout, stderr : streams
      Default value (PIPE) intercepts process output, setting to None
      blocks this.

    Returns
    -------
    str or None
      Captured stdout decoded as UTF-8, or None when stdout was not captured.

    Raises
    ------
    RuntimeError
      If the command exits with a non-zero status. The message contains the
      command and, when captured, the decoded stderr text.
    """

    proc = subprocess.run(cmnd, shell=True, stdout=stdout, stderr=stderr)

    if proc.returncode != 0:
        # Decode stderr so the failure message shows readable text instead of
        # a bytes repr (b'...\n'); undecodable bytes are replaced, not fatal.
        err_text = (
            proc.stderr.decode("utf8", errors="replace")
            if proc.stderr is not None
            else ""
        )
        raise RuntimeError(f"FAILED: {cmnd}\n{err_text}")

    return proc.stdout.decode("utf8") if proc.stdout is not None else None
44 | 
45 | 
def test_phables(tmp_dir):
    """Smoke test: running `phables --help` must not raise."""
    exec_command("phables --help")
50 | 


--------------------------------------------------------------------------------