├── gretel
│   ├── __init__.py
│   ├── snpper.py
│   ├── gretel.py
│   ├── cmd.py
│   └── util.py
├── tests
│   ├── __init__.py
│   ├── data
│   │   ├── test.bam
│   │   ├── test.vcf.gz
│   │   ├── test.bam.bai
│   │   ├── test.vcf.gz.tbi
│   │   └── test.sam
│   └── test_test.py
├── docs
│   ├── readme.rst
│   ├── changelog.rst
│   ├── modules.rst
│   ├── source
│   │   ├── modules.rst
│   │   └── gretel.rst
│   ├── index.rst
│   ├── gretel.rst
│   ├── protocol.rst
│   ├── make.bat
│   ├── Makefile
│   └── conf.py
├── gretel-logo.png
├── .gitignore
├── LICENSE
├── setup.py
├── Makefile
├── CHANGELOG.rst
└── README.md

/gretel/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
.. include:: ../README.rst
--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
.. include:: ../CHANGELOG.rst
--------------------------------------------------------------------------------
/gretel-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/gretel-logo.png
--------------------------------------------------------------------------------
/tests/data/test.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/tests/data/test.bam
--------------------------------------------------------------------------------
/tests/data/test.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/tests/data/test.vcf.gz
--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
gretel
======

.. toctree::
   :maxdepth: 4

   gretel
--------------------------------------------------------------------------------
/tests/data/test.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/tests/data/test.bam.bai
--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
gretel
======

.. toctree::
   :maxdepth: 4

   gretel
--------------------------------------------------------------------------------
/tests/data/test.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/tests/data/test.vcf.gz.tbi
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
Gretel
======

An algorithm for recovering haplotypes from metagenomes. Sister to `Hansel <https://github.com/SamStudio8/hansel>`_.

.. 
toctree:: 7 | :maxdepth: 2 8 | 9 | readme 10 | protocol 11 | changelog 12 | 13 | 14 | Indices and tables 15 | ================== 16 | 17 | * :ref:`genindex` 18 | * :ref:`modindex` 19 | * :ref:`search` 20 | 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | htmlcov 29 | coverage.xml 30 | 31 | # Translations 32 | *.mo 33 | 34 | # Mr Developer 35 | .mr.developer.cfg 36 | .project 37 | .pydevproject 38 | 39 | # Complexity 40 | output/*.html 41 | output/*/index.html 42 | 43 | # Sphinx 44 | docs/_build 45 | 46 | # Vim 47 | *.swp 48 | *.swo 49 | -------------------------------------------------------------------------------- /docs/gretel.rst: -------------------------------------------------------------------------------- 1 | gretel package 2 | ============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | gretel.cmd module 8 | ----------------- 9 | 10 | .. automodule:: gretel.cmd 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | gretel.gretel module 16 | -------------------- 17 | 18 | .. automodule:: gretel.gretel 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | gretel.util module 24 | ------------------ 25 | 26 | .. automodule:: gretel.util 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: gretel 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/source/gretel.rst: -------------------------------------------------------------------------------- 1 | gretel package 2 | ============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | gretel.cmd module 8 | ----------------- 9 | 10 | .. automodule:: gretel.cmd 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | gretel.gretel module 16 | -------------------- 17 | 18 | .. automodule:: gretel.gretel 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | gretel.util module 24 | ------------------ 25 | 26 | .. automodule:: gretel.util 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: gretel 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /tests/data/test.sam: -------------------------------------------------------------------------------- 1 | @HD VN:1.0 SO:coordinate 2 | @SQ SN:hoot LN:20 3 | @SQ SN:meow LN:20 4 | read1 0 hoot 1 42 10M * 0 10 AANNNNNNNA ~~~~~~~~~~ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:5 YT:Z:UU RG:Z:hoot 5 | read2 0 hoot 1 42 10M * 0 10 CCNNNNNNNC ~~~~~~~~~~ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:5 YT:Z:UU RG:Z:hoot 6 | read3 0 hoot 1 42 5M * 0 5 TTNNN ~~~~~ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:5 YT:Z:UU RG:Z:hoot 7 | read4 0 hoot 1 42 5M * 0 5 TTNNN !!!!! 
AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:5 YT:Z:UU RG:Z:hoot 8 | read5 0 hoot 10 42 11M * 0 11 GNNNNNNNNNG ~~~~~~~~~~~ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:5 YT:Z:UU RG:Z:hoot 9 | read6 0 meow 1 42 10M * 0 10 AANNNNNNNA ~~~~~~~~~~ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:5 YT:Z:UU RG:Z:hoot 10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright © 2016 Sam Nicholls <sam@samnicholls.net>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import setuptools

requirements = [
    "numpy",
    "hanselx==0.0.92",
    "pysam",
    "PyVCF",
]

test_requirements = [

]

setuptools.setup(
    name="gretel",
    version="0.0.94",
    url="https://github.com/samstudio8/gretel",

    description="An algorithm for recovering potential haplotypes from metagenomes",
    long_description="",

    author="Sam Nicholls",
    author_email="sam@samnicholls.net",

    maintainer="Sam Nicholls",
    maintainer_email="sam@samnicholls.net",

    packages=setuptools.find_packages(),
    include_package_data=True,

    install_requires=requirements,

    entry_points = {
        "console_scripts": [
            "gretel=gretel.cmd:main",
            "gretel-snpper=gretel.snpper:main",
        ]
    },

    classifiers = [
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Science/Research',
        'Topic :: Scientific/Engineering',
        'Topic :: Scientific/Engineering :: Bio-Informatics',
        'License :: OSI Approved :: MIT License',
    ],

    test_suite="tests",
    tests_require=test_requirements
)
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: clean-pyc clean-build docs clean

help:
	@echo "clean-build - remove build artifacts"
	@echo "clean-pyc - remove Python file artifacts"
	@echo "lint - check style with flake8"
	@echo "test - run tests quickly with the default Python"
	@echo "test-all - run tests on every Python version with tox"
	@echo "coverage - check code coverage quickly with the default Python"
	@echo "docs - generate Sphinx HTML documentation, including API docs"
	@echo "release - package and upload a release"
	@echo "dist - package"

clean: clean-build clean-pyc
	rm -fr htmlcov/

clean-build:
	rm -fr build/
	rm -fr dist/
	rm -fr *.egg-info

clean-pyc:
	find . -name '*.pyc' -exec rm -f {} +
	find . -name '*.pyo' -exec rm -f {} +
	find . -name '*~' -exec rm -f {} +

lint:
	flake8 gretel tests

test:
	python setup.py test

test-all:
	tox

coverage:
	coverage run --source gretel setup.py test
	coverage report -m
	coverage html
	open htmlcov/index.html

docs:
	rm -f docs/gretel.rst
	rm -f docs/modules.rst
	sphinx-apidoc -o docs/ gretel
	$(MAKE) -C docs clean
	$(MAKE) -C docs html
	open docs/_build/html/index.html

release: clean
	python3 -m twine upload dist/*

dist: clean
	python3 setup.py sdist bdist_wheel
	ls -l dist
--------------------------------------------------------------------------------
/gretel/snpper.py:
--------------------------------------------------------------------------------
"""Given a BAM, a contig, and an ending genomic position, aggressively call for
variants and generate a placeholder VCF.

Thanks to @linsalrob for the initial argparsification of gretel-snpper.
"""
import sys

import numpy as np
import pysam
import argparse

def main():
    parser = argparse.ArgumentParser(description='Aggressively call for variants and generate a VCF', epilog='NOTE: Coordinates are 1-based as they are for samtools')
    parser.add_argument('--bam', help='bam of reads aligned to (pseudo)-reference', required=True)
    parser.add_argument('--contig', help='name of contig to generate a VCF for', required=True)
    parser.add_argument('-s', help='start (default = 1)', type=int, default=1)
    parser.add_argument('-e', help='end (default = length of the reference)', type=int)
    parser.add_argument('--depth', help='number of reads that must support a base to call it as a possible variant (default = 0)', type=int, default=0)
    args = parser.parse_args()

    bam = pysam.AlignmentFile(args.bam)

    if not args.e:
        args.e = bam.lengths[bam.references.index(args.contig)]

    # convert 1-indexed numbers to 0-indexed numbers for pysam
    args.s = args.s - 1

    counts = np.array(bam.count_coverage(contig=args.contig, start=args.s, stop=args.e, quality_threshold=0, read_callback='nofilter'))

    COUNT_SENSITIVITY = args.depth

    vcf_h = [
        "##fileformat=VCFv4.2",
    ]
    vcf = []

    # A position is a candidate variant site if more than one nucleotide
    # is supported by more reads than the depth threshold
    sites = (counts > COUNT_SENSITIVITY).sum(axis=0)
    for i, s in enumerate(sites):
        if s > 1:
            vcf.append([
                args.contig,
                i+1+args.s,
                '.',
                'A',
                'C,T,G',
                0,
                '.',
                "INFO"
            ])


    for r in vcf_h:
        print(r)
    for r in vcf:
        print("\t".join([str(s) for s in r]))
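As a usage sketch: setup.py installs this module as the `gretel-snpper` console script, which writes its naive VCF to stdout. A typical invocation (file names here are illustrative), followed by the compression and indexing Gretel expects, might look like:

    gretel-snpper --bam my.bam --contig my_contig --depth 10 > my.vcf
    bgzip my.vcf
    tabix -p vcf my.vcf.gz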
--------------------------------------------------------------------------------
/tests/test_test.py:
--------------------------------------------------------------------------------
import unittest

EXP = [1,2,10]
def load_vcf(path, contig, start_pos, end_pos):
    from gretel import util
    return util.process_vcf(path, contig, start_pos, end_pos)

def load_bam(path, vcf_path, contig, start_pos, end_pos, threads):
    from gretel import util
    return util.load_from_bam(path, contig, start_pos, end_pos, load_vcf(vcf_path, contig, start_pos, end_pos), n_threads=threads)

class BasicRegressionTest(unittest.TestCase):

    def test_test(self):
        self.assertTrue(True)

    def test_vcf(self):
        VCF_h = load_vcf('tests/data/test.vcf.gz', 'hoot', 1, 19)
        self.assertEqual(VCF_h["N"], 3)
        self.assertEqual(VCF_h["snp_rev"], {0:1, 1:2, 2:10})
        self.assertEqual(VCF_h["snp_fwd"], {1:0, 2:1, 10:2})

        self.assertEqual(len(VCF_h["region"]), 19+1)
        for i in range(len(VCF_h["region"])):
            if i in EXP:
                self.assertEqual(VCF_h["region"][i], 1)
            else:
                self.assertEqual(VCF_h["region"][i], 0)
        return VCF_h

    def test_bam(self):

        for thread in [1,2]:
            hansel = load_bam('tests/data/test.bam', 'tests/data/test.vcf.gz', 'hoot', 1, 20, thread)

            self.assertEqual(hansel.n_slices, 5) # n reads
            self.assertEqual(hansel.n_crumbs, 9) # n snp connections
            self.assertTrue(hansel.L > 0) # Check L is actually set

            # Test a couple of elements in the Hansel matrix
            self.assertEqual(hansel.get_observation('_', 'A', 0, 1), 1)
            self.assertEqual(hansel.get_observation('A', 'A', 1, 2), 1)
            self.assertEqual(hansel.get_observation('A', 'A', 1, 3), 1)
            self.assertEqual(hansel.get_observation('A', 'A', 1, 4), 0)
            self.assertEqual(hansel.get_observation('C', 'C', 1, 2), 1)
            self.assertEqual(hansel.get_observation('C', 'C', 1, 3), 1)
            self.assertEqual(hansel.get_observation('C', 'C', 1, 4), 0)
            self.assertEqual(hansel.get_observation('T', 'T', 1, 2), 2)
            self.assertEqual(hansel.get_observation('G', 'G', 1, 2), 0)
            self.assertEqual(hansel.get_observation('G', 'G', 2, 3), 0)
            self.assertEqual(hansel.get_observation('G', 'G', 3, 4), 1)
            self.assertEqual(hansel.get_observation('G', '_', 4, 5), 1)


if __name__ == '__main__':
    unittest.main()
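The test above doubles as a minimal example of driving Gretel's `util` API from Python. A sketch against the bundled test data, using the same arguments the test exercises:

    from gretel import util

    # Index the SNP sites called on contig 'hoot' between positions 1 and 19 (1-based)
    vcf_h = util.process_vcf('tests/data/test.vcf.gz', 'hoot', 1, 19)

    # Fill the Hansel matrix with pairwise SNP observations from the aligned reads
    hansel = util.load_from_bam('tests/data/test.bam', 'hoot', 1, 20, vcf_h, n_threads=1)

    print(hansel.n_slices)  # reads contributing at least one observation (5 here)
    print(hansel.n_crumbs)  # pairwise SNP observations (9 here)
    print(hansel.L)         # chosen order of the Markov chain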
--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
History
=======

0.0.94
------
* Added `--pepper` option for permissive pileups by overriding the pysam pileup stepper to `all` instead of `samtools`.

0.0.93
------

* Move `process_vcf` to `util` module. I may drop use of `pyvcf` in future as I don't like the API.
* Dropped pointless `append_path` stub.
* Fixed an edge case where reads beginning with a SNP that aligned to the start of a parallel parsing window were counted twice.
* Added a small test package to help detect future regressions.
* Added `--version` argument to print program version number.
* Removed `--lorder` argument as users should not need to select the chain order.

0.0.92
------

* Adds `--dumpmatrix` and `--dumpsnps` debugging options.
* Clean up Hansel matrix initialisation.
* Add `gretel-snpper` command for generating a naive VCF.
* Fix a regression where the `L` parameter of the matrix is incorrectly left unset.

0.0.90
------
Resolves a bug whereby SNPs are incorrectly parsed from the BAM if either:
* its quality score is below 13
* the read is overlapped by its primary mate

Well-covered data sets need not be overly affected by the additional noise that
may have been introduced, but the problem is more noticeable with low coverage
and you may wish to reapply Gretel to affected data. Sorry.


0.0.81
------
* Add warning and advice when an entry in Hansel is missing evidence.
* Make the 'Unable to select' warning sound much less bad because it is normal.

0.0.8
-----
* Docs
* Deprecate `gretel-crumbs` command

0.0.7
-----
* Further improvements to parallel read processing
* Add `-` symbol to enable support for deletions

0.0.6b
------
* Fix setting of `L` parameter

0.0.6
-----
* MULTIPROCESSING
* Re-write read handling, again

0.0.5
-----
* `-s` and `-e` introduced to allow specification of positions between which
  to recover haplotypes
* Attempt some basic indel handling
* Fix a bug where the master sequence was altered by the output of each
  reported haplotype

0.0.4
-----
* Add experimental `--sentinels` option
* Improve docs

0.0.3
-----
* Hansel is now separate from Gretel
* [Hansel] `get_marginal_at` is now `get_counts_at`
* [Hansel] `selext_next_edge_at` deprecated
* Gene recovery and likelihood plots are now on separate panels
* Re-write methods to add observations to matrix to be less awful to read
* Drop `--hit` and `--gene` options to verification
* Replace verification script with `gretel-crumbs` command

0.0.2
-----
* Improve documentation.
* Provide `util` subpackage for filling `Hansel` structure with BAM observations.
* Explicitly provide possible symbols to `Hansel`.
* Improve plotting
* Remove `process_hits` and `process_refs` as these are no longer needed.
* Rename `establish_path` to `generate_path`
* Rename `add_ignore_support3` to `reweight_hansel_from_graph` so we have some sort of indication of what it does.
* Altered Sphinx configuration.

0.0.1
-----
* Import repository from `claw`.
--------------------------------------------------------------------------------
/docs/protocol.rst:
--------------------------------------------------------------------------------
Protocol
========

**Gretel** provides a command line tool for the recovery of haplotypes.
We recommend the following protocol.

Read Alignment
--------------

**Gretel** requires your reads to be aligned to a common reference. This is to
ensure that reads share a co-ordinate system, on which we can call for variants
and recover haplotypes. The reference itself is of little consequence, though
reads dropped from the alignment will leave their evidence unavailable to Gretel.

Construction of a *de novo* consensus assembly for a metagenome is left as an exercise
for the reader. Align the reads to your assembly (`bowtie2`, `minimap2` etc.).
Sort and index the alignment BAM.
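For example, with `minimap2` and `samtools` (file names here are illustrative): ::

    minimap2 -ax sr assembly.fa reads_1.fastq reads_2.fastq | samtools sort -o my.bam
    samtools index my.bam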
Variant Calling
---------------

**Gretel** is robust to sequencing error and misalignment noise, thus the
calling of variants need not be carefully conducted. Typically we have used `samtools`,
but for our own Gretel pipeline, we have aggressively called all heterogeneous sites
in an alignment as a SNP using the `snpper` tool in our `gretel-test repository
<https://github.com/SamStudio8/gretel-test>`_.

For somewhat questionable reasoning, we currently require a compressed and indexed VCF: ::

    bgzip <my.vcf>
    tabix <my.vcf.gz>

Invocation of Gretel
--------------------
As described in the README, Gretel is invoked as follows: ::

    gretel <bam> <vcf.gz> <contig> [-s 1startpos] [-e 1endpos] [--master master.fa] [-o output_dir]

You must provide your sorted BAM, compressed VCF, and the name of the contig on which
to recover haplotypes. Use `-s` and `-e` to specify the positions on the aligned reads between which
to recover haplotypes from your metagenome.

By default, Gretel will output a FASTA containing the recovered SNPs, in order, for each haplotype.
Providing an optional "master" FASTA sequence will permit Gretel to "fill in" the non-SNP positions
(*i.e.* the positions between `-s` and `-e` that do not appear in the VCF) with the nucleotide from
the pseudo-reference.

Gretel Outputs
--------------

out.fasta
~~~~~~~~~
A **FASTA** containing each of the recovered sequences, in the order they were found.
Each sequence is named `__-`. Sequences are not wrapped.

gretel.crumbs
~~~~~~~~~~~~~

Additionally, Gretel outputs a whimsically named *crumbs* file, containing some potentially
interesting metadata, as well as a record of each recovered haplotype.
The first row is a comment containing the following (in order):

* The number of SNPs across the region of interest
* The number of 'crumbs': paired observations added to the Hansel matrix
* The number of 'slices': reads with at least one observation added to the Hansel matrix
* The chosen value of `L` for the `L`'th order Markov chain

The rest of the file contains tab-delimited metadata for each recovered haplotype:

* The iteration number, starting from 0
* The number of times this haplotype was returned
* The *weighted* likelihood of the haplotype, given the Hansel matrix at the time the haplotype was recovered (comma-sep for each time the haplotype was returned)
* The *unweighted* likelihood of the haplotype, given the Hansel matrix at the time the reads were parsed (comma-sep for each time the haplotype was returned)
* The haplotype magnitude: the total number of observations removed from the Hansel matrix by the reweighting mechanism

In practice, we rank with the **weighted** likelihoods to discern the haplotypes most likely to exist in the metagenome.
One may attempt to use the *unweighted* likelihoods as a means to compare the abundance, or read support, **between the returned haplotypes** (*i.e.* not necessarily the metagenome as a whole).
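To illustrate consuming the crumbs file downstream, here is a minimal sketch that ranks
recovered haplotypes by their weighted likelihood (assuming the leading metadata row is
``#``-prefixed, which is an assumption for illustration): ::

    import csv

    haplotypes = []
    with open("gretel.crumbs") as crumbs_fh:
        for row in csv.reader(crumbs_fh, delimiter="\t"):
            if row[0].startswith("#"):
                continue  # skip the metadata comment row
            iteration, n_returned = int(row[0]), int(row[1])
            # likelihood fields are comma-separated, one value per return
            weighted = float(row[2].split(",")[0])
            haplotypes.append((weighted, iteration, n_returned))

    # Likelihoods are log10 values (see generate_path in gretel/gretel.py),
    # so less negative ranks higher
    for weighted, iteration, n_returned in sorted(haplotypes, reverse=True):
        print(iteration, n_returned, weighted)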
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
![gretel-logo](gretel-logo.png)

Gretel
======

An algorithm for recovering haplotypes from metagenomes. Sister to [Hansel](https://github.com/SamStudio8/hansel).

[License: MIT](LICENSE) | Available via bioconda
What is it?
-----------

**Gretel** is a Python package providing a command line tool for the recovery of haplotypes
from metagenomic data sets. **Gretel** parses an alignment of reads into a **Hansel** matrix
and uses the evidence of SNP pairs observed to appear on the same reads to probabilistically
reconstruct the most likely haplotypes.

**Gretel** uses an L'th order Markov chain model to reconstruct likely sequences
of variants that constitute haplotypes in the real metagenome.
Our approach involves graph-like traversal of the data within the **Hansel** matrix.
Edges are probabilistically weighted based on the evidence on the reads, as well as
the haplotype as it has been reconstructed so far (see the illustrative sketch at the
end of this README).

What can I use it for?
----------------------

**Gretel** is designed to recover haplotypes from your data set, without the need for
setting (or optimisation) of any parameters.
**Gretel** does not require a priori knowledge of your input data (such as its contents,
or the true number of haplotypes), makes no assumptions regarding the distributions of
alleles at variant sites, and uses the available evidence from the aligned reads without
altering or discarding the observed variations.

Why should I use it?
--------------------

**Gretel** is the first tool capable of recovering haplotypes from metagenomes.
Whilst tools exist for analogous haplotyping problems, such as the assembly of
viral quasispecies, typically these tools rely on overlap approaches that create
too many unranked haplotypes. **Gretel** is capable of ranking the haplotypes it
outputs by their likelihood.

**Gretel** requires no parameters and our approach is robust to sequencing error
and misalignment noise.

Requirements
------------

    $ pip install numpy hanselx pysam PyVCF

Install
-------

    $ pip install gretel

Alternatively, Gretel has been packaged for bioconda (Thanks [@johnne](https://github.com/johnne)!):

    $ conda install -c bioconda gretel

Usage
-----
You will require a sorted BAM containing your reads, aligned to some pseudo-reference.
You can use any sequence as your reference, such as a consensus assembly of the
metagenomic reads, or a known strain reference (such as HIV-1).
You must bgzip and tabix your VCF.

    $ gretel <bam> <vcf.gz> <contig> -s <1-start> -e <1-end> --master <master.fa> -o <output_dir>

Citation
--------
```
@article{10.1093/bioinformatics/btaa977,
    author = {Nicholls, Samuel M and Aubrey, Wayne and De Grave, Kurt and Schietgat, Leander and Creevey, Christopher J and Clare, Amanda},
    title = "{On the complexity of haplotyping a microbial community}",
    journal = {Bioinformatics},
    volume = {37},
    number = {10},
    pages = {1360-1366},
    year = {2021},
    month = {01},
    issn = {1367-4803},
    doi = {10.1093/bioinformatics/btaa977},
    url = {https://doi.org/10.1093/bioinformatics/btaa977},
    eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/10/1360/38663805/btaa977.pdf},
}
```
[Read more on Twitter](https://twitter.com/samstudio8/status/1329406136592834564)

License
-------
Hansel and Gretel are distributed under the MIT license, see LICENSE.
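Appendix: an illustrative sketch
--------------------------------

To make the graph-like traversal described above concrete, here is a deliberately tiny,
self-contained sketch of the core idea: greedily extending a haplotype using pairwise SNP
evidence. The counts are made up for illustration, and this is not Gretel's actual
implementation (which conditions on up to `L` previous variants and reweights the matrix
after each haplotype is recovered):

```python
# counts[(i, a, j, b)]: times symbol a at SNP i was observed on the
# same read as symbol b at SNP j (illustrative numbers only)
counts = {
    (0, 'A', 1, 'A'): 9, (0, 'A', 1, 'C'): 1,
    (0, 'C', 1, 'C'): 8, (0, 'C', 1, 'A'): 2,
    (1, 'A', 2, 'G'): 7, (1, 'A', 2, 'T'): 1,
    (1, 'C', 2, 'T'): 6, (1, 'C', 2, 'G'): 1,
}

def next_symbol(path, i):
    """Pick the symbol at SNP i best supported by its pairing with SNP i-1."""
    support = {b: counts.get((i - 1, path[-1], i, b), 0) for b in "ACGT"}
    return max(support, key=support.get)

path = ['A']  # seed with a choice of symbol at SNP 0
for i in range(1, 3):
    path.append(next_symbol(path, i))
print("".join(path))  # -> AAG, the best-supported chain of variants
```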
100 | -------------------------------------------------------------------------------- /gretel/gretel.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from math import log,log10,exp 3 | import random 4 | 5 | import numpy as np 6 | 7 | from hansel import Hansel 8 | from . import util 9 | 10 | #TODO Should the denom of the conditional use the unique variants at i-l or i? 11 | #TODO Util to parse known input and return SNP seq 12 | 13 | def reweight_hansel_from_path(hansel, path, ratio): 14 | """ 15 | Given a completed path, reweight the applicable pairwise observations in the Hansel structure. 16 | 17 | Parameters 18 | ---------- 19 | hansel : :py:class:`hansel.hansel.Hansel` 20 | The Hansel structure currently being explored by Gretel. 21 | 22 | path : list{str} 23 | The ordered sequence of selected variants. 24 | 25 | ratio : float 26 | The proportion of evidence to remove from each paired observation that 27 | was considered to recover the provided path. 28 | 29 | It is recommended this be the smallest marginal distribution observed across selected variants. 30 | 31 | *i.e.* For each selected variant in the path, note the value of the 32 | marginal distribution for the probability of observing that particular 33 | variant at that genomic position. Parameterise the minimum value of 34 | those marginals. 35 | 36 | Returns 37 | ------- 38 | Spent Observations : float 39 | The sum of removed observations from the Hansel structure. 40 | """ 41 | 42 | size = 0 43 | 44 | """ 45 | # Old re-implementation sans flip 46 | for i in range(0, len(path)-1): 47 | for j in range(0, i+1+1): 48 | # Reduce read supports 49 | if i == j: 50 | continue 51 | size += hansel.reweight_observation(path[i], path[j], i, j, ratio) 52 | return size 53 | 54 | # Reduce adjacent evidence pairs 55 | for i in range(len(path)-1): 56 | size += hansel.reweight_observation(path[i], path[i+1], i, i+1, ratio) 57 | 58 | # Reduce other evidence pairs 59 | for j in range(1, len(path)): 60 | for i in range(0, j-1): 61 | size += hansel.reweight_observation(path[i], path[j], i, j, ratio) 62 | 63 | # Reduce other non-evidence pairs 64 | # I have no idea why this works so well, so we'll need to have a think about it 65 | # before we put it in Gretel proper... 66 | #for j in range(1, len(path)): 67 | # for i in range(0, j-1): 68 | # size += hansel.reweight_observation(path[j], path[i], j, i, ratio) 69 | # pass 70 | 71 | # Reweight the rest of the matrix because we can at least explain that 72 | hansel.reweight_matrix( ratio / (hansel.L/10) ) 73 | 74 | sys.stderr.write("[RWGT] Ratio %.3f, Removed %.1f\n" % (ratio, size)) 75 | return size 76 | """ 77 | 78 | # Let's keep the RW system as-is for now... 79 | size = 0 80 | for i in range(0, len(path)): 81 | for j in range(0, i+1+1): 82 | # Reduce read supports 83 | if i >= len(path)-1: 84 | size += hansel.reweight_observation(path[i], path[j], i, i+1, ratio) 85 | break #??? 
86 | else: 87 | if j < i: 88 | # This isn't just a case of j < i, but means that we are looking 89 | # at the two SNPs the wrong way around, we must switch them before 90 | # handing them over to reweight_observation 91 | t_i = j 92 | t_j = i 93 | else: 94 | t_i = i 95 | t_j = j 96 | size += hansel.reweight_observation(path[t_i], path[t_j], t_i, t_j, ratio) 97 | sys.stderr.write("[RWGT] Ratio %.3f, Removed %.1f\n" % (ratio, size)) 98 | return size 99 | 100 | ## PATH GENERATION ############################################################ 101 | 102 | def generate_path(n_snps, hansel, original_hansel, debug_hpos=None): 103 | """ 104 | Explore and generate the most likely path (haplotype) through the observed Hansel structure. 105 | 106 | Parameters 107 | ---------- 108 | n_snps : int 109 | The number of variants. 110 | 111 | hansel : :py:class:`hansel.hansel.Hansel` 112 | The Hansel structure currently being explored by Gretel. 113 | 114 | original_hansel : :py:class:`hansel.hansel.Hansel` 115 | A copy of the Hansel structure created by Gretel, before any reweighting. 116 | 117 | Returns 118 | ------- 119 | Path : list{str} or None 120 | The sequence of variants that represent the completed path (or haplotype), or None 121 | if one could not be successfully constructed. 122 | 123 | Path Probabilities : dict{str, float} 124 | The `hp_original` (original Hansel) and `hp_current` (current Hansel) joint 125 | probabilities of the variants in the returned path occurring together 126 | in the given order. 127 | 128 | Minimum Marginal : float 129 | The smallest marginal distribution observed across selected variants. 130 | """ 131 | 132 | # Cross the metahaplome in a greedy, naive fashion to establish a base path 133 | # This seeds the rest of the path generation (we might want to just select 134 | # a random path here in future) 135 | 136 | running_prob = 0.0 137 | running_prob_uw = 0.0 138 | current_path = [ hansel.symbols_d['_'] ] # start with the dummy 139 | marginals = [] 140 | 141 | # Find path 142 | sys.stderr.write("[NOTE] *Establishing next path\n") 143 | for snp in range(1, n_snps+1): 144 | #sys.stderr.write("\t*** ***\n") 145 | #sys.stderr.write("\t[SNP_] SNP %d\n" % snp) 146 | 147 | dh_flag = False 148 | if debug_hpos: 149 | if snp in debug_hpos: 150 | dh_flag = True 151 | 152 | # Get marginal and calculate branch probabilities for each available 153 | # mallele, given the current path seen so far 154 | # Select the next branch and append it to the path 155 | curr_branches = hansel.get_edge_weights_at(snp, current_path, debug=dh_flag) 156 | #sys.stderr.write("\t[TREE] %s\n" % curr_branches) 157 | # Return the symbol and probability of the next base to add to the 158 | # current path based on the best marginal 159 | next_v = 0.0 160 | next_m = None 161 | 162 | if debug_hpos: 163 | if snp in debug_hpos: 164 | print(curr_branches) 165 | 166 | for symbol in curr_branches: 167 | if str(symbol) == "total": 168 | continue 169 | if next_m is None: 170 | next_v = curr_branches[symbol] 171 | next_m = symbol 172 | elif curr_branches[symbol] > next_v: 173 | next_v = curr_branches[symbol] 174 | next_m = symbol 175 | 176 | if next_m is None: 177 | sys.stderr.write('''[NOTE] Unable to select next branch from SNP %d to %d 178 | By design, Gretel will attempt to recover haplotypes until a hole in the graph has been found. 
179 | Recovery will intentionally terminate now.\n''' % (snp-1, snp)) 180 | return None, None, None 181 | 182 | selected_edge_weight = hansel.get_marginal_of_at(next_m, snp) 183 | marginals.append(selected_edge_weight) #NOTE This isn't a log, as it is used as a ratio later 184 | 185 | running_prob += log10(selected_edge_weight) 186 | running_prob_uw += log10(original_hansel.get_marginal_of_at(next_m, snp)) 187 | current_path.append(next_m) 188 | 189 | return current_path, {"hp_original": running_prob_uw, "hp_current": running_prob}, min(marginals) 190 | 191 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 
80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Gretel.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Gretel.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 
201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help 23 | help: 24 | @echo "Please use \`make ' where is one of" 25 | @echo " html to make standalone HTML files" 26 | @echo " dirhtml to make HTML files named index.html in directories" 27 | @echo " singlehtml to make a single large HTML file" 28 | @echo " pickle to make pickle files" 29 | @echo " json to make JSON files" 30 | @echo " htmlhelp to make HTML files and a HTML help project" 31 | @echo " qthelp to make HTML files and a qthelp project" 32 | @echo " applehelp to make an Apple Help Book" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | @echo " coverage to run coverage check of the documentation (if enabled)" 49 | 50 | .PHONY: clean 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | .PHONY: html 55 | html: 56 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 57 | @echo 58 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 59 | 60 | .PHONY: dirhtml 61 | dirhtml: 62 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 63 | @echo 64 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 65 | 66 | .PHONY: singlehtml 67 | singlehtml: 68 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 69 | @echo 70 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 71 | 72 | .PHONY: pickle 73 | pickle: 74 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 75 | @echo 76 | @echo "Build finished; now you can process the pickle files." 77 | 78 | .PHONY: json 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | .PHONY: htmlhelp 85 | htmlhelp: 86 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 87 | @echo 88 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 89 | ".hhp project file in $(BUILDDIR)/htmlhelp." 90 | 91 | .PHONY: qthelp 92 | qthelp: 93 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 94 | @echo 95 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 96 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 97 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Gretel.qhcp" 98 | @echo "To view the help file:" 99 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Gretel.qhc" 100 | 101 | .PHONY: applehelp 102 | applehelp: 103 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 104 | @echo 105 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 106 | @echo "N.B. 
You won't be able to view it unless you put it in" \ 107 | "~/Library/Documentation/Help or install it in your application" \ 108 | "bundle." 109 | 110 | .PHONY: devhelp 111 | devhelp: 112 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 113 | @echo 114 | @echo "Build finished." 115 | @echo "To view the help file:" 116 | @echo "# mkdir -p $$HOME/.local/share/devhelp/Gretel" 117 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Gretel" 118 | @echo "# devhelp" 119 | 120 | .PHONY: epub 121 | epub: 122 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 123 | @echo 124 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 125 | 126 | .PHONY: latex 127 | latex: 128 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 129 | @echo 130 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 131 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 132 | "(use \`make latexpdf' here to do that automatically)." 133 | 134 | .PHONY: latexpdf 135 | latexpdf: 136 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 137 | @echo "Running LaTeX files through pdflatex..." 138 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 139 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 140 | 141 | .PHONY: latexpdfja 142 | latexpdfja: 143 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 144 | @echo "Running LaTeX files through platex and dvipdfmx..." 145 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 146 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 147 | 148 | .PHONY: text 149 | text: 150 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 151 | @echo 152 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 153 | 154 | .PHONY: man 155 | man: 156 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 157 | @echo 158 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 159 | 160 | .PHONY: texinfo 161 | texinfo: 162 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 163 | @echo 164 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 165 | @echo "Run \`make' in that directory to run these through makeinfo" \ 166 | "(use \`make info' here to do that automatically)." 167 | 168 | .PHONY: info 169 | info: 170 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 171 | @echo "Running Texinfo files through makeinfo..." 172 | make -C $(BUILDDIR)/texinfo info 173 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 174 | 175 | .PHONY: gettext 176 | gettext: 177 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 178 | @echo 179 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 180 | 181 | .PHONY: changes 182 | changes: 183 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 184 | @echo 185 | @echo "The overview file is in $(BUILDDIR)/changes." 186 | 187 | .PHONY: linkcheck 188 | linkcheck: 189 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 190 | @echo 191 | @echo "Link check complete; look for any errors in the above output " \ 192 | "or in $(BUILDDIR)/linkcheck/output.txt." 193 | 194 | .PHONY: doctest 195 | doctest: 196 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 197 | @echo "Testing of doctests in the sources finished, look at the " \ 198 | "results in $(BUILDDIR)/doctest/output.txt." 
199 | 200 | .PHONY: coverage 201 | coverage: 202 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 203 | @echo "Testing of coverage in the sources finished, look at the " \ 204 | "results in $(BUILDDIR)/coverage/python.txt." 205 | 206 | .PHONY: xml 207 | xml: 208 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 209 | @echo 210 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 211 | 212 | .PHONY: pseudoxml 213 | pseudoxml: 214 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 215 | @echo 216 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 217 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Gretel documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jun 13 13:40:08 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | import os 17 | 18 | # If extensions (or modules to document with autodoc) are in another directory, 19 | # add these directories to sys.path here. If the directory is relative to the 20 | # documentation root, use os.path.abspath to make it absolute, like shown here. 21 | sys.path.insert(0, os.path.abspath('.')) 22 | sys.path.insert(0, os.path.abspath('../')) 23 | 24 | import mock 25 | MOCK_MODULES = ['numpy', 'matplotlib', 'pysam', 'vcf', 'hansel'] 26 | for mod_name in MOCK_MODULES: 27 | sys.modules[mod_name] = mock.Mock() 28 | 29 | # -- General configuration ------------------------------------------------ 30 | 31 | # If your documentation needs a minimal Sphinx version, state it here. 32 | #needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = [ 38 | 'sphinx.ext.autodoc', 39 | 'sphinx.ext.doctest', 40 | 'sphinx.ext.todo', 41 | 'sphinx.ext.coverage', 42 | 'sphinx.ext.mathjax', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.intersphinx', 45 | 'sphinx.ext.napoleon', 46 | ] 47 | napoleon_google_docstring = False 48 | napoleon_use_param = False 49 | napoleon_use_ivar = True 50 | 51 | # Add any paths that contain templates here, relative to this directory. 52 | templates_path = ['_templates'] 53 | 54 | # The suffix(es) of source filenames. 55 | # You can specify multiple suffix as a list of string: 56 | # source_suffix = ['.rst', '.md'] 57 | source_suffix = '.rst' 58 | 59 | # The encoding of source files. 60 | #source_encoding = 'utf-8-sig' 61 | 62 | # The master toctree document. 63 | master_doc = 'index' 64 | 65 | # General information about the project. 66 | project = u'Gretel' 67 | copyright = u'2016, Sam Nicholls' 68 | author = u'Sam Nicholls' 69 | 70 | # The version info for the project you're documenting, acts as replacement for 71 | # |version| and |release|, also used in various other places throughout the 72 | # built documents. 73 | # 74 | # The short X.Y version. 75 | version = u'0.0.1a' 76 | # The full version, including alpha/beta/rc tags. 77 | release = u'0.0.1a' 78 | 79 | # The language for content autogenerated by Sphinx. 
Refer to documentation 80 | # for a list of supported languages. 81 | # 82 | # This is also used if you do content translation via gettext catalogs. 83 | # Usually you set "language" from the command line for these cases. 84 | language = None 85 | 86 | # There are two options for replacing |today|: either, you set today to some 87 | # non-false value, then it is used: 88 | #today = '' 89 | # Else, today_fmt is used as the format for a strftime call. 90 | #today_fmt = '%B %d, %Y' 91 | 92 | # List of patterns, relative to source directory, that match files and 93 | # directories to ignore when looking for source files. 94 | exclude_patterns = ['_build'] 95 | 96 | # The reST default role (used for this markup: `text`) to use for all 97 | # documents. 98 | #default_role = None 99 | 100 | # If true, '()' will be appended to :func: etc. cross-reference text. 101 | #add_function_parentheses = True 102 | 103 | # If true, the current module name will be prepended to all description 104 | # unit titles (such as .. function::). 105 | #add_module_names = True 106 | 107 | # If true, sectionauthor and moduleauthor directives will be shown in the 108 | # output. They are ignored by default. 109 | #show_authors = False 110 | 111 | # The name of the Pygments (syntax highlighting) style to use. 112 | pygments_style = 'sphinx' 113 | 114 | # A list of ignored prefixes for module index sorting. 115 | #modindex_common_prefix = [] 116 | 117 | # If true, keep warnings as "system message" paragraphs in the built documents. 118 | #keep_warnings = False 119 | 120 | # If true, `todo` and `todoList` produce output, else they produce nothing. 121 | todo_include_todos = False 122 | 123 | 124 | # -- Options for HTML output ---------------------------------------------- 125 | 126 | # The theme to use for HTML and HTML Help pages. See the documentation for 127 | # a list of builtin themes. 128 | html_theme = 'default' 129 | 130 | # Theme options are theme-specific and customize the look and feel of a theme 131 | # further. For a list of options available for each theme, see the 132 | # documentation. 133 | #html_theme_options = {} 134 | 135 | # Add any paths that contain custom themes here, relative to this directory. 136 | #html_theme_path = [] 137 | 138 | # The name for this set of Sphinx documents. If None, it defaults to 139 | # " v documentation". 140 | #html_title = None 141 | 142 | # A shorter title for the navigation bar. Default is the same as html_title. 143 | #html_short_title = None 144 | 145 | # The name of an image file (relative to this directory) to place at the top 146 | # of the sidebar. 147 | #html_logo = None 148 | 149 | # The name of an image file (within the static path) to use as favicon of the 150 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 151 | # pixels large. 152 | #html_favicon = None 153 | 154 | # Add any paths that contain custom static files (such as style sheets) here, 155 | # relative to this directory. They are copied after the builtin static files, 156 | # so a file named "default.css" will overwrite the builtin "default.css". 157 | html_static_path = ['_static'] 158 | 159 | # Add any extra paths that contain custom files (such as robots.txt or 160 | # .htaccess) here, relative to this directory. These files are copied 161 | # directly to the root of the documentation. 162 | #html_extra_path = [] 163 | 164 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 165 | # using the given strftime format. 
166 | #html_last_updated_fmt = '%b %d, %Y' 167 | 168 | # If true, SmartyPants will be used to convert quotes and dashes to 169 | # typographically correct entities. 170 | #html_use_smartypants = True 171 | 172 | # Custom sidebar templates, maps document names to template names. 173 | #html_sidebars = {} 174 | 175 | # Additional templates that should be rendered to pages, maps page names to 176 | # template names. 177 | #html_additional_pages = {} 178 | 179 | # If false, no module index is generated. 180 | #html_domain_indices = True 181 | 182 | # If false, no index is generated. 183 | #html_use_index = True 184 | 185 | # If true, the index is split into individual pages for each letter. 186 | #html_split_index = False 187 | 188 | # If true, links to the reST sources are added to the pages. 189 | #html_show_sourcelink = True 190 | 191 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 192 | #html_show_sphinx = True 193 | 194 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 195 | #html_show_copyright = True 196 | 197 | # If true, an OpenSearch description file will be output, and all pages will 198 | # contain a tag referring to it. The value of this option must be the 199 | # base URL from which the finished HTML is served. 200 | #html_use_opensearch = '' 201 | 202 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 203 | #html_file_suffix = None 204 | 205 | # Language to be used for generating the HTML full-text search index. 206 | # Sphinx supports the following languages: 207 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 208 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 209 | #html_search_language = 'en' 210 | 211 | # A dictionary with options for the search language support, empty by default. 212 | # Now only 'ja' uses this config value 213 | #html_search_options = {'type': 'default'} 214 | 215 | # The name of a javascript file (relative to the configuration directory) that 216 | # implements a search results scorer. If empty, the default will be used. 217 | #html_search_scorer = 'scorer.js' 218 | 219 | # Output file base name for HTML help builder. 220 | htmlhelp_basename = 'Greteldoc' 221 | 222 | # -- Options for LaTeX output --------------------------------------------- 223 | 224 | latex_elements = { 225 | # The paper size ('letterpaper' or 'a4paper'). 226 | #'papersize': 'letterpaper', 227 | 228 | # The font size ('10pt', '11pt' or '12pt'). 229 | #'pointsize': '10pt', 230 | 231 | # Additional stuff for the LaTeX preamble. 232 | #'preamble': '', 233 | 234 | # Latex figure (float) alignment 235 | #'figure_align': 'htbp', 236 | } 237 | 238 | # Grouping the document tree into LaTeX files. List of tuples 239 | # (source start file, target name, title, 240 | # author, documentclass [howto, manual, or own class]). 241 | latex_documents = [ 242 | (master_doc, 'Gretel.tex', u'Gretel Documentation', 243 | u'Sam Nicholls', 'manual'), 244 | ] 245 | 246 | # The name of an image file (relative to this directory) to place at the top of 247 | # the title page. 248 | #latex_logo = None 249 | 250 | # For "manual" documents, if this is true, then toplevel headings are parts, 251 | # not chapters. 252 | #latex_use_parts = False 253 | 254 | # If true, show page references after internal links. 255 | #latex_show_pagerefs = False 256 | 257 | # If true, show URL addresses after external links. 258 | #latex_show_urls = False 259 | 260 | # Documents to append as an appendix to all manuals. 
261 | #latex_appendices = []
262 | 
263 | # If false, no module index is generated.
264 | #latex_domain_indices = True
265 | 
266 | 
267 | # -- Options for manual page output ---------------------------------------
268 | 
269 | # One entry per manual page. List of tuples
270 | # (source start file, name, description, authors, manual section).
271 | man_pages = [
272 |     (master_doc, 'gretel', u'Gretel Documentation',
273 |      [author], 1)
274 | ]
275 | 
276 | # If true, show URL addresses after external links.
277 | #man_show_urls = False
278 | 
279 | 
280 | # -- Options for Texinfo output -------------------------------------------
281 | 
282 | # Grouping the document tree into Texinfo files. List of tuples
283 | # (source start file, target name, title, author,
284 | #  dir menu entry, description, category)
285 | texinfo_documents = [
286 |     (master_doc, 'Gretel', u'Gretel Documentation',
287 |      author, 'Gretel', 'A metagenomic haplotyper.',
288 |      'Miscellaneous'),
289 | ]
290 | 
291 | # Documents to append as an appendix to all manuals.
292 | #texinfo_appendices = []
293 | 
294 | # If false, no module index is generated.
295 | #texinfo_domain_indices = True
296 | 
297 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
298 | #texinfo_show_urls = 'footnote'
299 | 
300 | # If true, do not generate a @detailmenu in the "Top" node's menu.
301 | #texinfo_no_detailmenu = False
302 | 
303 | 
304 | # Example configuration for intersphinx: refer to the Python standard library.
305 | intersphinx_mapping = {
306 |     "python": ('https://docs.python.org/', None),
307 |     "pysam": ('http://pysam.readthedocs.io/en/latest', None),
308 |     "hansel": ('http://hansel.readthedocs.io/en/latest/', None),
309 | }
310 | 
--------------------------------------------------------------------------------
/gretel/cmd.py:
--------------------------------------------------------------------------------
import argparse
import numpy as np
import sys
import os

from . import gretel
from . import util

__version__ = "0.0.94"

def main():
    """Gretel: A metagenomic haplotyper."""
    parser = argparse.ArgumentParser(description="Gretel: A metagenomic haplotyper.")
    parser.add_argument("bam")
    parser.add_argument("vcf")
    parser.add_argument("contig")
    parser.add_argument("-s", "--start", type=int, default=1, help="1-indexed included start base position [default: 1]")
    parser.add_argument("-e", "--end", type=int, default=-1, help="1-indexed included end base position [default: reference length]")

    #parser.add_argument("-l", "--lorder", type=int, default=0, help="Order of markov chain to predict next nucleotide [default calculated from read data]")
    parser.add_argument("-p", "--paths", type=int, default=100, help="Maximum number of paths to generate [default:100]")

    parser.add_argument("--master", default=None, help="Master sequence (will be used to fill in homogeneous gaps in haplotypes, otherwise --gapchar)") #TODO Use something other than N? Should probably be a valid IUPAC
    parser.add_argument("--gapchar", default="N", help="Character to fill homogeneous gaps in haplotypes if no --master [default N]")
    parser.add_argument("--delchar", default="", help="Character to output in haplotype for a deletion (e.g. -) [default is blank]")

    parser.add_argument("--quiet", default=False, action='store_true', help="Don't output anything other than a single summary line.")
    #parser.add_argument("--sentinels", default=False, action='store_true', help="Add additional sentinels for read ends [default:False][EXPERIMENTAL]")
    parser.add_argument("-o", "--out", default=".", help="Output directory [default .]")
    parser.add_argument("-@", "--threads", type=int, default=1, help="Number of BAM iterators [default 1]")

    parser.add_argument("--debugreads", type=str, default="", help="A newline delimited list of read names to output debug data when parsing the BAM")
    parser.add_argument("--debugpos", type=str, default="", help="A newline delimited list of 1-indexed genomic positions to output debug data when parsing the BAM")
    parser.add_argument("--debughpos", type=str, default="", help="A comma delimited list of 1-indexed SNP positions to output debug data when predicting haplotypes")

    parser.add_argument("--dumpmatrix", type=str, default=None, help="Location to dump the Hansel matrix to disk")
    parser.add_argument("--dumpsnps", type=str, default=None, help="Location to dump the SNP positions to disk")

    parser.add_argument("--pepper", action="store_true", help="Enable a more permissive pileup by setting the pysam pileup stepper to 'all', instead of 'samtools'.\nNote that this will allow improper pairs.")

    parser.add_argument("--version", action="version", version="%(prog)s " + __version__)

    ARGS = parser.parse_args()

    debug_hpos = []
    if ARGS.debughpos:
        for x in ARGS.debughpos.split(","):
            try:
                debug_hpos.append( int(x) )
            except ValueError:
                pass  # skip tokens that are not integers

    if ARGS.end == -1:
        ARGS.end = util.get_ref_len_from_bam(ARGS.bam, ARGS.contig)
        sys.stderr.write("[NOTE] Setting end_pos to %d\n" % ARGS.end)

    debug_reads = set([])
    if ARGS.debugreads:
        debug_fofn = open(ARGS.debugreads)
        for line in debug_fofn:
            debug_reads.add(line.strip())

    debug_pos = set([])
    if ARGS.debugpos:
        debug_fofn = open(ARGS.debugpos)
        for line in debug_fofn:
            debug_pos.add(int(line.strip()))

    VCF_h = util.process_vcf(ARGS.vcf, ARGS.contig, ARGS.start, ARGS.end)
    if ARGS.dumpsnps:
        snp_fh = open(ARGS.dumpsnps, 'w')
        for k in sorted(VCF_h["snp_fwd"].keys()):
            snp_fh.write("%d\t%d\t%d\n" % (VCF_h["snp_fwd"][k]+1, k, k-ARGS.start+1))
        snp_fh.close()

    # Could we optimise for lower triangle by collapsing one of the dimensions
    # such that Z[m][n][i][j] == Z[m][n][i + ((j-1)*(j))/2]
    # e.g. (i,j) = (0,1)->0, (0,2)->1, (1,2)->2, (0,3)->3, (1,3)->4, (2,3)->5
    hansel = util.load_from_bam(ARGS.bam, ARGS.contig, ARGS.start, ARGS.end, VCF_h, n_threads=ARGS.threads, debug_reads=debug_reads, debug_pos=debug_pos, stepper="all" if ARGS.pepper else "samtools")
    original_hansel = hansel.copy()

    if ARGS.dumpmatrix:
        hansel.save_hansel_dump(ARGS.dumpmatrix)

    # Check if there is a gap in the matrix
    for i in range(0, VCF_h["N"]+1):
        marginal = hansel.get_counts_at(i)

        if i > 0:
            snp_rev = VCF_h["snp_rev"][i-1]
        else:
            snp_rev = 0
        if marginal.get("total", 0) == 0:
            sys.stderr.write('''[FAIL] Unable to recover pairwise evidence concerning SNP #%d at position %d
Gretel needs every SNP to appear on a read with at least one other SNP, at least once.
There is no read in your data set that bridges SNP #%d with any of its neighbours.

* If you are trying to run Gretel along an entire contig or genome, please note that
  this is not the recommended usage for Gretel, as it was intended to uncover the
  variation in a metahaplome: the set of haplotypes for a specific gene.
  See our pre-print https://doi.org/10.1101/223404 for more information.

  Consider running a prediction tool such as `prokka` on your assembly or reference
  and using the CDS regions in the GFF for corresponding genes of interest to
  uncover haplotypes with Gretel instead.

* If you are already doing this, consider calling for SNPs more aggressively.
  We use `snpper` (https://github.com/SamStudio8/gretel-test/blob/master/snpper.py)
  to call any site in a BAM that has at least one read in disagreement with
  the reference as a SNP. Although this introduces noise from alignment and sequence
  error, Gretel is fairly robust. Importantly, this naive calling method will
  likely close gaps between SNPs and permit recovery.

* Finally, consider that the gaps indicate that your reads do not support
  one or more parts of your assembly or reference. You could try to find or construct
  a more suitable reference, or reduce the size of the recovery window.

Sorry :(\n''' % (i, snp_rev, i))
            sys.exit(1)

    PATHS = {}

    # Spew out exciting information about the SNPs
    if not ARGS.quiet:
        print("i\tpos\tgap\tA\tC\tG\tT\tN\t-\t_\ttot")
        last_rev = 0
        for i in range(0, VCF_h["N"]+1):
            marginal = hansel.get_counts_at(i)
            marginal = {str(x): marginal[x] for x in marginal}
            snp_rev = 0
            if i > 0:
                snp_rev = VCF_h["snp_rev"][i-1]
            print("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (
                i,
                snp_rev,
                snp_rev - last_rev,
                marginal.get("A", 0),
                marginal.get("C", 0),
                marginal.get("G", 0),
                marginal.get("T", 0),
                marginal.get("N", 0),
                marginal.get("-", 0),
                marginal.get("_", 0),
                marginal.get("total", 0),
            ))
            last_rev = snp_rev


    # Make some genes
    SPINS = ARGS.paths
    ongoing_mag = 0
    for i in range(0, SPINS):
        init_path, init_prob, init_min = gretel.generate_path(VCF_h["N"], hansel, original_hansel, debug_hpos=debug_hpos)
        if init_path is None:
            break
        current_path = init_path

        MIN_REMOVE = 0.01 # 1%
        if init_min < MIN_REMOVE:
            sys.stderr.write("[RWGT] Ratio %.10f too small, adjusting to %.3f\n" % (init_min, MIN_REMOVE))
            init_min = MIN_REMOVE
        rw_magnitude = gretel.reweight_hansel_from_path(hansel, init_path, init_min)

        #TODO Horribly inefficient.
        current_path_str = "".join([str(x) for x in current_path])
        if current_path_str not in PATHS:
            PATHS[current_path_str] = {
                "hp_current": [],
                "hp_original": [],
                "i": [],
                "i_0": i,
                "n": 0,
                "magnitude": 0,
                "hansel_path": current_path,
            }
        PATHS[current_path_str]["n"] += 1
        PATHS[current_path_str]["i"].append(i)
        PATHS[current_path_str]["magnitude"] += rw_magnitude
        PATHS[current_path_str]["hp_current"].append(init_prob["hp_current"])
        PATHS[current_path_str]["hp_original"].append(init_prob["hp_original"])

    # Output FASTA
    dirn = ARGS.out + "/"
    fasta_out_fh = open(dirn+"out.fasta", "w")
    hfasta_out_fh = open(dirn+"snp.fasta", "w")

    if ARGS.master:
        master_fa = util.load_fasta(ARGS.master)
        master_seq = master_fa.fetch(master_fa.references[0])
    else:
        master_seq = [' '] * ARGS.end

    for p in sorted(PATHS, key=lambda x: PATHS[x]["i_0"]):
        p = PATHS[p]
        path = p["hansel_path"]
        i = p["i_0"]

        seq = list(master_seq[:])
        for j, mallele in enumerate(path[1:]):
            snp_pos_on_master = VCF_h["snp_rev"][j]
            try:
                if mallele == hansel.symbols_d["-"]:
                    # It's a deletion, don't print a SNP
                    seq[snp_pos_on_master-1] = ARGS.delchar
                else:
                    seq[snp_pos_on_master-1] = mallele
            except IndexError:
                print(path, len(seq), snp_pos_on_master-1)
                sys.exit(1)

        # Coerce HanselSymbols to str
        to_write = "".join(str(x) for x in seq[ARGS.start-1 : ARGS.end])
        if not ARGS.master:
            to_write = to_write.replace(' ', ARGS.gapchar)

        fasta_out_fh.write(">%d__%.2f\n" % (i, p["hp_current"][0])) #TODO hp_current or hp_original?
        fasta_out_fh.write("%s\n" % to_write)

        hfasta_out_fh.write(">%d__%.2f\n" % (i, p["hp_current"][0])) #TODO hp_current or hp_original?
        hfasta_out_fh.write("%s\n" % "".join([str(x) for x in path[1:]]))
    fasta_out_fh.close()
    hfasta_out_fh.close()

    #TODO datetime, n_obs, n_slices, avg_obs_len, L, n_paths, n_avg_loglik
    crumb_file = open(dirn+"gretel.crumbs", "w")
    crumb_file.write("# %d\t%d\t%d\t%.2f\n" % (
        VCF_h["N"],
        hansel.n_crumbs,
        hansel.n_slices,
        hansel.L,
    ))

    for p in sorted(PATHS, key=lambda x: PATHS[x]["hp_current"][0], reverse=True):
        p = PATHS[p]
        crumb_file.write("%d\t%d\t%s\t%s\t%.2f\n" % (
            p["i_0"],
            p["n"],
            ",".join(["%.2f" % x for x in p["hp_current"]]),
            ",".join(["%.2f" % x for x in p["hp_original"]]),
            p["magnitude"],
        ))
    crumb_file.close()
--------------------------------------------------------------------------------
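
The parser above is the whole of Gretel's command line surface. As a minimal
sketch (the `gretel` entry point name and every file name below are assumptions,
not taken from this listing), an invocation such as
`gretel reads.bam calls.vcf.gz contig_A -s 2000 -e 2900 -o out` can equivalently
be driven from Python, since main() parses sys.argv:

    # Hypothetical driver; reads.bam, calls.vcf.gz and contig_A are placeholders.
    # The BAM should be indexed, and the VCF bgzipped and tabix-indexed.
    import sys
    from gretel import cmd

    sys.argv = ["gretel", "reads.bam", "calls.vcf.gz", "contig_A",
                "-s", "2000", "-e", "2900", "-o", "out"]
    cmd.main()  # writes out/out.fasta, out/snp.fasta and out/gretel.crumbs

--------------------------------------------------------------------------------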
/gretel/util.py:
--------------------------------------------------------------------------------
import pysam
import numpy as np
from math import ceil
from hansel import Hansel
import vcf

from multiprocessing import Process, Queue, Value
import sys

def get_ref_len_from_bam(bam_path, target_contig):
    """
    Fetch the length of a given reference sequence from a :py:class:`pysam.AlignmentFile`.

    Parameters
    ----------
    bam_path : str
        Path to the BAM alignment

    target_contig : str
        The name of the contig for which to recover haplotypes.

    Returns
    -------
    end_pos : int
        The length of the target contig, suitable for use as the default
        1-indexed (inclusive) end position.
    """
    bam = pysam.AlignmentFile(bam_path)
    end = bam.lengths[bam.get_tid(target_contig)]
    bam.close()

    return end

def load_from_bam(bam_path, target_contig, start_pos, end_pos, vcf_handler, use_end_sentinels=False, n_threads=1, debug_reads=None, debug_pos=None, stepper="samtools"):
    """
    Load variants observed in a :py:class:`pysam.AlignmentFile` to
    an instance of :py:class:`hansel.hansel.Hansel`.

    Parameters
    ----------
    bam_path : str
        Path to the BAM alignment

    target_contig : str
        The name of the contig for which to recover haplotypes.

    start_pos : int
        The 1-indexed genomic position from which to begin considering variants.

    end_pos : int
        The 1-indexed genomic position at which to stop considering variants.

    vcf_handler : dict{str, any}
        Variant metadata, as provided by :py:func:`gretel.util.process_vcf`.

    use_end_sentinels : boolean, optional(default=False)
        Whether or not to append an additional pairwise observation between
        the final variant on a read towards a sentinel.

        .. note:: Experimental
           This feature is for testing purposes, currently it is recommended
           that the flag be left at the default of `False`. However, some
           data sets report minor performance improvements for some haplotypes
           when set to `True`.
           This flag may be removed at any time without warning.

    n_threads : int, optional(default=1)
        Number of threads to spawn for reading the BAM

    debug_reads : set{str}, optional
        A set of read names for which to print out debugging information

    debug_pos : set{int}, optional
        A set of positions for which to print out debugging information

    stepper : str, optional(default=samtools)
        The pysam pileup stepper to use

    Returns
    -------
    Hansel : :py:class:`hansel.hansel.Hansel`
    """

    hansel = Hansel.init_matrix(['A', 'C', 'G', 'T', 'N', "-", "_"], ['N', "_"], vcf_handler["N"])

    if not debug_reads:
        debug_reads = set([])
    if not debug_pos:
        debug_pos = set([])

    import random  # used only to rate-limit the progress display below
    def progress_worker(progress_q, n_workers, slices, total_snps, crumbs):
        worker_pos = []
        worker_done = []
        for _ in range(0, n_workers):
            worker_pos.append(0)
            worker_done.append(0)

        while sum(worker_done) < n_workers:
            work_block = progress_q.get()
            worker_pos[work_block["worker_i"]] = work_block["pos"]
            if work_block["pos"] is None:
                worker_done[work_block["worker_i"]] = 1

            # Per-read progress updates carry only "pos" and "worker_i", so
            # fall back to 0 for the counters, which arrive with the sentinel
            crumbs.value += work_block.get("crumbs", 0)
            slices.value += work_block.get("slices", 0)
            total_snps.value += work_block.get("covered_snps", 0)
            # Rate-limit the progress display to roughly 10% of updates
            if random.random() < 0.1:
                sys.stderr.write("%s\n" % ([ worker_pos[i] if status != 1 else None for (i, status) in enumerate(worker_done)]))
        return (slices, total_snps, crumbs)

    def bam_worker(bam_q, progress_q, worker_i):

        worker = worker_i

        slices = 0
        crumbs = 0
        covered_snps = 0

        bam = pysam.AlignmentFile(bam_path)

        while True:
            work_block = bam_q.get()
            if work_block is None:
                progress_q.put({
                    "pos": None,
                    "worker_i": worker_i,
                    "slices": slices,
                    "crumbs": crumbs,
                    "covered_snps": covered_snps,
                })
                break
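
            # Each work_block describes one window of the target contig as a
            # dict, e.g. {"start": 1, "end": 500, "i": 0, "region_end": end_pos}
            # (see the queueing loop below); a None block is the sentinel that
            # tells this worker to flush its counts and exit, handled above.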
            reads = {}
            dreads = set([])

            for p_col in bam.pileup(reference=target_contig, start=work_block["start"]-1, stop=work_block["end"], ignore_overlaps=False, min_base_quality=0, stepper=stepper):

                if p_col.reference_pos + 1 > end_pos:
                    # Ignore positions beyond the end_pos
                    break

                if vcf_handler["region"][p_col.reference_pos+1] != 1:
                    # Ignore non-SNPs
                    continue

                for p_read in p_col.pileups:

                    curr_read_1or2 = 0
                    if p_read.alignment.is_paired:
                        if p_read.alignment.is_read1:
                            curr_read_1or2 = 1
                        elif p_read.alignment.is_read2:
                            curr_read_1or2 = 2
                        else:
                            #TODO Probably indicative of bad data
                            pass


                    curr_read_name = "%s_%s_%d" % (p_read.alignment.query_name, str(p_read.alignment.flag), curr_read_1or2)

                    LEFTMOST_1pos = p_read.alignment.reference_start + 1 # Convert 0-based reference_start to 1-based position (to match region array and 1-based VCF)

                    # Special case: Consider reads that begin before the start_pos, but overlap the 0th block
                    if work_block["i"] == 0:
                        if LEFTMOST_1pos < start_pos:
                            # Read starts before the start_pos
                            if p_read.alignment.reference_start + 1 + p_read.alignment.query_alignment_length < start_pos:
                                # Read ends before the start_pos
                                continue
                            LEFTMOST_1pos = start_pos
                    else:
                        # This read begins before the start of the current (non-0) block
                        # and will have already been covered by the block that preceded it
                        if LEFTMOST_1pos < work_block["start"]:
                            continue

                    sequence = None
                    qual = None
                    if p_read.is_del:
                        # TODO Unsure how best to estimate the quality of a deletion
                        sequence = "-" * (abs(p_read.indel) + 1)
                        qual = p_read.alignment.query_qualities[p_read.query_position_or_next] * (abs(p_read.indel) + 1)
                    elif p_read.indel > 0:
                        # p_read.indel peeks at the next CIGAR and determines whether the base FOLLOWING this one is an insertion or not
                        sequence = p_read.alignment.query_sequence[p_read.query_position : p_read.query_position + p_read.indel + 1]
                        qual = p_read.alignment.query_qualities[p_read.query_position : p_read.query_position + p_read.indel + 1]
                    else:
                        sequence = p_read.alignment.query_sequence[p_read.query_position]
                        qual = p_read.alignment.query_qualities[p_read.query_position]

                    if not sequence:
                        print("[WARN] Could not recover sequence data from read %s" % p_read.alignment.query_name)
                        continue

                    if curr_read_name not in reads:
                        reads[curr_read_name] = {
                            "rank": np.sum(vcf_handler["region"][1 : LEFTMOST_1pos]), # non-inclusive 1pos end
                            "seq": [],
                            "quals": [],
                            "refs_1pos": [],
                            "read_variants_0pos": [],
                        }
                        if p_read.alignment.query_name in debug_reads:
                            dreads.add(curr_read_name)
                    reads[curr_read_name]["seq"].append(sequence)
                    reads[curr_read_name]["quals"].append(qual)
                    reads[curr_read_name]["refs_1pos"].append(p_col.reference_pos+1)
                    reads[curr_read_name]["read_variants_0pos"].append(p_read.query_position)


            for dread in sorted(dreads):
                r = reads[dread]
                if r:
                    for snp_i, ref_pos in enumerate(r["refs_1pos"]):
                        print(dread, ref_pos, r["seq"][snp_i])
                    print("RANK", dread, r["rank"])

            if debug_pos:
                for read in reads:
                    for d_pos in set(reads[read]["refs_1pos"]) & debug_pos:
                        i = reads[read]["refs_1pos"].index(d_pos)
                        print(read, d_pos, reads[read]["seq"][i])


            num_reads = len(reads)
            for qi, qname in enumerate(reads):
                progress_q.put({"pos": num_reads-(qi+1), "worker_i": worker_i})

                if len(reads[qname]["seq"]) <= 1:
                    # Ignore reads without evidence (fewer than two covered SNPs)
                    continue
                slices += 1

                rank = reads[qname]["rank"]
                support_len = len(reads[qname]["seq"])

                support_seq = "".join([b[0] for b in reads[qname]["seq"]]) # b[0] has the effect of capturing the base before any insertion
                covered_snps += len(support_seq.replace("N", "").replace("_", ""))
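
                # As a worked illustration (a hypothetical read, not one from
                # this repository): a read whose support is "ACG" and whose
                # first covered SNP is the 3rd SNP overall (rank == 2) emits
                # the pairwise observations (A,C,3,4), (A,G,3,5) and (C,G,4,5)
                # via add_observation() below, assuming it touches neither the
                # first nor the last SNP (which add sentinel crumbs instead).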
                # For each position in the supporting sequence (that is, each covered SNP)
                for i in range(0, support_len):
                    snp_a = support_seq[i]

                    #if support_len == 1:
                    #    if rank == 0:
                    #        hansel.add_observation('_', snp_a, 0, 1)
                    #        hansel.add_observation(snp_a, '_', 1, 2)
                    #    else:
                    #        hansel.add_observation(snp_a, '_', rank+1, rank+2)


                    # For each position in the supporting sequence following i
                    for j in range(i+1, support_len):
                        snp_b = support_seq[j]

                        # Ignore observations that would form an invalid transition
                        if snp_a in ['_', 'N']:
                            continue

                        # Sentinel->A
                        if i==0 and j==1 and rank==0:
                            # If this is the first position in the support (i == 0),
                            # the read covers the first SNP (rank == 0),
                            # and SNPs a, b are adjacent (j == 1)
                            hansel.add_observation('_', snp_a, 0, 1)
                            hansel.add_observation(snp_a, snp_b, 1, 2)
                            crumbs += 1

                        # B->Sentinel
                        elif (j+rank+1) == vcf_handler["N"] and abs(i-j)==1:
                            # Last observation (abs(i-j)==1),
                            # that ends on the final SNP (j+rank+1 == N)
                            hansel.add_observation(snp_a, snp_b, vcf_handler["N"]-1, vcf_handler["N"])
                            hansel.add_observation(snp_b, '_', vcf_handler["N"], vcf_handler["N"]+1)
                            crumbs += 1

                        # A regular observation (A->B)
                        else:
                            hansel.add_observation(snp_a, snp_b, i+rank+1, j+rank+1)
                            crumbs += 1

                        if use_end_sentinels:
                            if j==(support_len-1) and abs(i-j)==1:
                                # The last SNP on a read needs a sentinel afterward
                                hansel.add_observation(snp_b, '_', j+rank+1, j+rank+2)

    bam_queue = Queue()
    progress_queue = Queue()

    # Queue the workers
    # TODO Evenly divide, but in future, consider the distribution of work
    # TODO Also consider in general block0 has more work to do
    window_l = round((end_pos - start_pos) / float(n_threads))
    for window_i, window_pos in enumerate(range(start_pos, end_pos+1, window_l)):
        bam_queue.put({
            "start": window_pos,
            "end": window_pos + window_l - 1, # add -1 to stop end of window colliding with next window
            "i": window_i,
            "region_end": end_pos,
        })

    processes = []
    for _ in range(n_threads):
        p = Process(target=bam_worker,
                args=(bam_queue, progress_queue, _))
        processes.append(p)

    # ...and a progress process
    n_reads = Value('i', 0)
    n_observations = Value('i', 0)
    total_covered_snps = Value('i', 0)
    p = Process(target=progress_worker,
            args=(progress_queue, n_threads, n_reads, total_covered_snps, n_observations))
    processes.append(p)

    for p in processes:
        p.start()

    # Add sentinels
    for _ in range(n_threads):
        bam_queue.put(None)

    # Wait for processes to complete work
    for p in processes:
        p.join()


    hansel.n_slices = n_reads.value
    hansel.n_crumbs = n_observations.value
    sys.stderr.write("[NOTE] Loaded %d breadcrumbs from %d bread slices.\n" % (hansel.n_crumbs, hansel.n_slices))

    hansel.L = int(ceil(float(total_covered_snps.value)/n_reads.value))
    sys.stderr.write("[NOTE] Setting Gretel.L to %d\n" % hansel.L)
    return hansel

def load_fasta(fa_path):
    """
    A convenient wrapper function for constructing a :py:class:`pysam.FastaFile`.

    Parameters
    ----------
    fa_path : str
        Path to FASTA

    Returns
    -------
    FASTA File Interface : :py:class:`pysam.FastaFile`
    """
    return pysam.FastaFile(fa_path)


def process_vcf(vcf_path, contig_name, start_pos, end_pos):
    """
    Parse a VCF to extract the genomic positions of called variants.

    Parameters
    ----------
    vcf_path : str
        Path to the VCF file.

    contig_name : str
        Name of the target contig on which variants were called.

    start_pos : int
        The 1-indexed genomic position from which to begin considering variants.

    end_pos : int
        The 1-indexed genomic position at which to stop considering variants.

    Returns
    -------
    Gretel Metastructure : dict
        A collection of structures used for the execution of Gretel.
        The currently used keys are:

        N : int
            The number of observed SNPs
        snp_fwd : dict{int, int}
            A forward lookup to translate a variant's 1-indexed genomic
            position on the contig to its 0-indexed SNP rank
        snp_rev : dict{int, int}
            A reverse lookup to translate the i'th SNP (0-indexed rank)
            to its 1-indexed genomic position on the contig
        region : numpy.ndarray{int}
            A mask over the target contig: positions holding a called
            variant are set to 1
    """

    # Open the VCF
    fp = open(vcf_path, 'rb') # assumes bgzip and tabix
    vcf_records = vcf.Reader(fp)
    n_snps = 0
    snp_reverse = {}
    snp_forward = {}
    region = np.zeros(end_pos + 1, dtype=int)
    i = 0
    for record in vcf_records.fetch(contig_name, 0, end_pos): # 0-based half-open fetch; covers 1-based positions [1, end_pos]
        # record.POS is 1-indexed
        if record.POS < start_pos:
            continue
        if record.POS > end_pos:
            continue

        n_snps += 1
        region[record.POS] = 1
        snp_reverse[i] = record.POS
        snp_forward[record.POS] = i
        i += 1
    fp.close()

    return {
        "N": n_snps,
        "snp_fwd": snp_forward,
        "snp_rev": snp_reverse,
        "region": region,
    }
--------------------------------------------------------------------------------
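
The utility functions above can also be driven programmatically, mirroring the
steps cmd.main() takes. A minimal sketch, assuming an indexed BAM and a
bgzipped, tabix-indexed VCF; the file and contig names are placeholders:

    # Sketch: programmatic use of gretel.util with hypothetical inputs.
    from gretel import util

    bam_path, vcf_path, contig = "reads.bam", "calls.vcf.gz", "contig_A"

    end = util.get_ref_len_from_bam(bam_path, contig)  # the -e default in cmd.main()
    vcf_h = util.process_vcf(vcf_path, contig, 1, end)
    print("%d SNPs called on %s" % (vcf_h["N"], contig))

    # snp_fwd and snp_rev are inverse lookups between a SNP's 1-indexed
    # genomic position and its 0-indexed rank
    for rank in range(vcf_h["N"]):
        pos = vcf_h["snp_rev"][rank]
        assert vcf_h["snp_fwd"][pos] == rank

    # Pile up pairwise SNP observations from the reads into a Hansel matrix
    hansel = util.load_from_bam(bam_path, contig, 1, end, vcf_h, n_threads=1)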