├── gretel
    ├── __init__.py
    ├── snpper.py
    ├── gretel.py
    ├── cmd.py
    └── util.py
├── tests
    ├── __init__.py
    ├── data
    │   ├── test.bam
    │   ├── test.vcf.gz
    │   ├── test.bam.bai
    │   ├── test.vcf.gz.tbi
    │   └── test.sam
    └── test_test.py
├── docs
    ├── readme.rst
    ├── changelog.rst
    ├── modules.rst
    ├── source
    │   ├── modules.rst
    │   └── gretel.rst
    ├── index.rst
    ├── gretel.rst
    ├── protocol.rst
    ├── make.bat
    ├── Makefile
    └── conf.py
├── gretel-logo.png
├── .gitignore
├── LICENSE
├── setup.py
├── Makefile
├── CHANGELOG.rst
└── README.md


/gretel/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 | 


--------------------------------------------------------------------------------
/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CHANGELOG.rst
2 | 


--------------------------------------------------------------------------------
/gretel-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/gretel-logo.png


--------------------------------------------------------------------------------
/tests/data/test.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/tests/data/test.bam


--------------------------------------------------------------------------------
/tests/data/test.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/tests/data/test.vcf.gz


--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
1 | gretel
2 | ======
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    gretel
8 | 


--------------------------------------------------------------------------------
/tests/data/test.bam.bai:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/tests/data/test.bam.bai


--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | gretel
2 | ======
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    gretel
8 | 


--------------------------------------------------------------------------------
/tests/data/test.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SamStudio8/gretel/HEAD/tests/data/test.vcf.gz.tbi


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Gretel
 2 | ======
 3 | 
 4 | An algorithm for recovering haplotypes from metagenomes. Sister to `Hansel <http://hansel.readthedocs.io/en/latest/>`_.
 5 | 
 6 | .. toctree::
 7 |     :maxdepth: 2
 8 | 
 9 |     readme
10 |     protocol
11 |     changelog
12 | 
13 | 
14 | Indices and tables
15 | ==================
16 | 
17 | * :ref:`genindex`
18 | * :ref:`modindex`
19 | * :ref:`search`
20 | 
21 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | 
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 | htmlcov
29 | coverage.xml
30 | 
31 | # Translations
32 | *.mo
33 | 
34 | # Mr Developer
35 | .mr.developer.cfg
36 | .project
37 | .pydevproject
38 | 
39 | # Complexity
40 | output/*.html
41 | output/*/index.html
42 | 
43 | # Sphinx
44 | docs/_build
45 | 
46 | # Vim
47 | *.swp
48 | *.swo
49 | 


--------------------------------------------------------------------------------
/docs/gretel.rst:
--------------------------------------------------------------------------------
 1 | gretel package
 2 | ==============
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | gretel.cmd module
 8 | -----------------
 9 | 
10 | .. automodule:: gretel.cmd
11 |     :members:
12 |     :undoc-members:
13 |     :show-inheritance:
14 | 
15 | gretel.gretel module
16 | --------------------
17 | 
18 | .. automodule:: gretel.gretel
19 |     :members:
20 |     :undoc-members:
21 |     :show-inheritance:
22 | 
23 | gretel.util module
24 | ------------------
25 | 
26 | .. automodule:: gretel.util
27 |     :members:
28 |     :undoc-members:
29 |     :show-inheritance:
30 | 
31 | 
32 | Module contents
33 | ---------------
34 | 
35 | .. automodule:: gretel
36 |     :members:
37 |     :undoc-members:
38 |     :show-inheritance:
39 | 


--------------------------------------------------------------------------------
/docs/source/gretel.rst:
--------------------------------------------------------------------------------
 1 | gretel package
 2 | ==============
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | gretel.cmd module
 8 | -----------------
 9 | 
10 | .. automodule:: gretel.cmd
11 |     :members:
12 |     :undoc-members:
13 |     :show-inheritance:
14 | 
15 | gretel.gretel module
16 | --------------------
17 | 
18 | .. automodule:: gretel.gretel
19 |     :members:
20 |     :undoc-members:
21 |     :show-inheritance:
22 | 
23 | gretel.util module
24 | ------------------
25 | 
26 | .. automodule:: gretel.util
27 |     :members:
28 |     :undoc-members:
29 |     :show-inheritance:
30 | 
31 | 
32 | Module contents
33 | ---------------
34 | 
35 | .. automodule:: gretel
36 |     :members:
37 |     :undoc-members:
38 |     :show-inheritance:
39 | 


--------------------------------------------------------------------------------
/tests/data/test.sam:
--------------------------------------------------------------------------------
 1 | @HD	VN:1.0	SO:coordinate
 2 | @SQ	SN:hoot	LN:20
 3 | @SQ	SN:meow	LN:20
 4 | read1	0	hoot	1	42	10M	*	0	10	AANNNNNNNA	~~~~~~~~~~	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:5	YT:Z:UU	RG:Z:hoot
 5 | read2	0	hoot	1	42	10M	*	0	10	CCNNNNNNNC	~~~~~~~~~~	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:5	YT:Z:UU	RG:Z:hoot
 6 | read3	0	hoot	1	42	5M	*	0	5	TTNNN	~~~~~	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:5	YT:Z:UU	RG:Z:hoot
 7 | read4	0	hoot	1	42	5M	*	0	5	TTNNN	!!!!!	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:5	YT:Z:UU	RG:Z:hoot
 8 | read5	0	hoot	10	42	11M	*	0	11	GNNNNNNNNNG	~~~~~~~~~~~	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:5	YT:Z:UU	RG:Z:hoot
 9 | read6	0	meow	1	42	10M	*	0	10	AANNNNNNNA	~~~~~~~~~~	AS:i:0	XN:i:0	XM:i:0	XO:i:0	XG:i:0	NM:i:0	MD:Z:5	YT:Z:UU	RG:Z:hoot
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright © 2016 Sam Nicholls <sam@samnicholls.net>
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | import setuptools
 5 | 
 6 | requirements = [
 7 |     "numpy",
 8 |     "hanselx==0.0.92",
 9 |     "pysam",
10 |     "PyVCF",
11 | ]
12 | 
13 | test_requirements = [
14 | 
15 | ]
16 | 
17 | setuptools.setup(
18 |     name="gretel",
19 |     version="0.0.94",
20 |     url="https://github.com/samstudio8/gretel",
21 | 
22 |     description="An algorithm for recovering potential haplotypes from metagenomes",
23 |     long_description="",
24 | 
25 |     author="Sam Nicholls",
26 |     author_email="sam@samnicholls.net",
27 | 
28 |     maintainer="Sam Nicholls",
29 |     maintainer_email="sam@samnicholls.net",
30 | 
31 |     packages=setuptools.find_packages(),
32 |     include_package_data=True,
33 | 
34 |     install_requires=requirements,
35 | 
36 |     entry_points = {
37 |         "console_scripts": [
38 |             "gretel=gretel.cmd:main",
39 |             "gretel-snpper=gretel.snpper:main",
40 |         ]
41 |     },
42 | 
43 |     classifiers = [
44 |         'Development Status :: 2 - Pre-Alpha',
45 |         'Intended Audience :: Science/Research',
46 |         'Topic :: Scientific/Engineering',
47 |         'Topic :: Scientific/Engineering :: Bio-Informatics',
48 |         'License :: OSI Approved :: MIT License',
49 |     ],
50 | 
51 |     test_suite="tests",
52 |     tests_require=test_requirements
53 | )
54 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: clean-pyc clean-build docs clean
 2 | 
 3 | help:
 4 | 	@echo "clean-build - remove build artifacts"
 5 | 	@echo "clean-pyc - remove Python file artifacts"
 6 | 	@echo "lint - check style with flake8"
 7 | 	@echo "test - run tests quickly with the default Python"
 8 | 	@echo "test-all - run tests on every Python version with tox"
 9 | 	@echo "coverage - check code coverage quickly with the default Python"
10 | 	@echo "docs - generate Sphinx HTML documentation, including API docs"
11 | 	@echo "release - package and upload a release"
12 | 	@echo "dist - package"
13 | 
14 | clean: clean-build clean-pyc
15 | 	rm -fr htmlcov/
16 | 
17 | clean-build:
18 | 	rm -fr build/
19 | 	rm -fr dist/
20 | 	rm -fr *.egg-info
21 | 
22 | clean-pyc:
23 | 	find . -name '*.pyc' -exec rm -f {} +
24 | 	find . -name '*.pyo' -exec rm -f {} +
25 | 	find . -name '*~' -exec rm -f {} +
26 | 
27 | lint:
28 | 	flake8 gretel tests
29 | 
30 | test:
31 | 	python setup.py test
32 | 
33 | test-all:
34 | 	tox
35 | 
36 | coverage:
37 | 	coverage run --source gretel setup.py test
38 | 	coverage report -m
39 | 	coverage html
40 | 	open htmlcov/index.html
41 | 
42 | docs:
43 | 	rm -f docs/gretel.rst
44 | 	rm -f docs/modules.rst
45 | 	sphinx-apidoc -o docs/ gretel
46 | 	$(MAKE) -C docs clean
47 | 	$(MAKE) -C docs html
48 | 	open docs/_build/html/index.html
49 | 
50 | release: clean
51 | 	python3 -m twine upload dist/*
52 | 
53 | dist: clean
54 | 	python3 setup.py sdist bdist_wheel
55 | 	ls -l dist
56 | 


--------------------------------------------------------------------------------
/gretel/snpper.py:
--------------------------------------------------------------------------------
 1 | """Given a BAM, a contig, and an ending genomic position, aggressively call for
 2 | variants and generate a placeholder VCF.
 3 | 
 4 | Thanks to @linsalrob for the initial argparsification of gretel-snpper.
 5 | """
 6 | import sys
 7 | 
 8 | import numpy as np
 9 | import pysam
10 | import argparse
11 | 
12 | def main():
13 |     parser = argparse.ArgumentParser('Aggressively call for variants and generate a VCF', epilog='NOTE: Coordinates are 1-based as they are for samtools')
14 |     parser.add_argument('--bam', help='bam of reads aligned to (psuedo)-reference', required=True)
15 |     parser.add_argument('--contig', help='name of contig to generate a VCF for', required=True)
16 |     parser.add_argument('-s', help='start (default = 1)', type=int, default=1)
17 |     parser.add_argument('-e', help='end (default = length of the reference)', type=int)
18 |     parser.add_argument('--depth', help='number of reads that must feature a read to call that base as a possible variant (default = 0)', type=int, default=0)
19 |     args = parser.parse_args()
20 | 
21 |     bam = pysam.AlignmentFile(args.bam)
22 | 
23 |     if not args.e:
24 |         args.e = bam.lengths[bam.references.index(args.contig)]
25 | 
26 |     # convert 1-indexed numbers to 0 indexed numbers for pysam
27 |     args.s = args.s - 1
28 | 
29 |     counts = np.array(bam.count_coverage(contig=args.contig, start=args.s, stop=args.e, quality_threshold=0, read_callback='nofilter'))
30 | 
31 |     COUNT_SENSITIVITY = args.depth
32 | 
33 |     vcf_h = [
34 |         "##fileformat=VCFv4.2",
35 |     ]
36 |     vcf = []
37 | 
38 |     sites = (counts > COUNT_SENSITIVITY).sum(axis=0)
39 |     for i, s in enumerate(sites):
40 |         if s > 1:
41 |             vcf.append([
42 |                 args.contig,
43 |                 i+1+args.s,
44 |                 '.',
45 |                 'A',
46 |                 'C,T,G',
47 |                 0,
48 |                 '.',
49 |                 "INFO"
50 |             ])
51 | 
52 | 
53 |     for r in vcf_h:
54 |         print(r)
55 |     for r in vcf:
56 |         print("\t".join([str(s) for s in r]))
57 | 


--------------------------------------------------------------------------------
/tests/test_test.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | EXP = [1,2,10]
 4 | def load_vcf(path, contig, start_pos, end_pos):
 5 |     from gretel import util
 6 |     return util.process_vcf(path, contig, start_pos, end_pos)
 7 | 
 8 | def load_bam(path, vcf_path, contig, start_pos, end_pos, threads):
 9 |     from gretel import util
10 |     return util.load_from_bam(path, contig, start_pos, end_pos, load_vcf(vcf_path, contig, start_pos, end_pos), n_threads=threads)
11 | 
12 | class BasicRegressionTest(unittest.TestCase):
13 | 
14 |     def test_test(self):
15 |         self.assertTrue(True)
16 | 
17 |     def test_vcf(self):
18 |         VCF_h = load_vcf('tests/data/test.vcf.gz', 'hoot', 1, 19)
19 |         self.assertEqual(VCF_h["N"], 3)
20 |         self.assertEqual(VCF_h["snp_rev"], {0:1, 1:2, 2:10})
21 |         self.assertEqual(VCF_h["snp_fwd"], {1:0, 2:1, 10:2})
22 | 
23 |         self.assertEqual(len(VCF_h["region"]), 19+1)
24 |         for i in range(len(VCF_h["region"])):
25 |             if i in EXP:
26 |                 self.assertEqual(VCF_h["region"][i], 1)
27 |             else:
28 |                 self.assertEqual(VCF_h["region"][i], 0)
29 |         return VCF_h
30 | 
31 |     def test_bam(self):
32 | 
33 |         for thread in [1,2]:
34 |             hansel = load_bam('tests/data/test.bam', 'tests/data/test.vcf.gz', 'hoot', 1, 20, thread)
35 | 
36 |             self.assertEqual(hansel.n_slices, 5) # n reads
37 |             self.assertEqual(hansel.n_crumbs, 9) # n snp connections
38 |             self.assertTrue(hansel.L > 0) # Check L is actually set
39 | 
40 |             # Test a couple of elements in the Hansel matrix
41 |             self.assertEqual(hansel.get_observation('_', 'A', 0, 1), 1)
42 |             self.assertEqual(hansel.get_observation('A', 'A', 1, 2), 1)
43 |             self.assertEqual(hansel.get_observation('A', 'A', 1, 3), 1)
44 |             self.assertEqual(hansel.get_observation('A', 'A', 1, 4), 0)
45 |             self.assertEqual(hansel.get_observation('C', 'C', 1, 2), 1)
46 |             self.assertEqual(hansel.get_observation('C', 'C', 1, 3), 1)
47 |             self.assertEqual(hansel.get_observation('C', 'C', 1, 4), 0)
48 |             self.assertEqual(hansel.get_observation('T', 'T', 1, 2), 2)
49 |             self.assertEqual(hansel.get_observation('G', 'G', 1, 2), 0)
50 |             self.assertEqual(hansel.get_observation('G', 'G', 2, 3), 0)
51 |             self.assertEqual(hansel.get_observation('G', 'G', 3, 4), 1)
52 |             self.assertEqual(hansel.get_observation('G', '_', 4, 5), 1)
53 | 
54 | 
55 | if __name__ == '__main__':
56 |     unittest.main()
57 | 


--------------------------------------------------------------------------------
/CHANGELOG.rst:
--------------------------------------------------------------------------------
 1 | History
 2 | =======
 3 | 
 4 | 0.0.94
 5 | ------
 6 | * Added `--pepper` option to for permissive pileups by overriding the pysam pileup stepper to `all` instead of `samtools`.
 7 | 
 8 | 0.0.93
 9 | ------
10 | 
11 | * Move `process_vcf` to `util` module. I may drop use of `pyvcf` in future as I don't like the API.
12 | * Dropped pointless `append_path` stub.
13 | * Fixed an edge case where reads beginning with a SNP that aligned to the start of a parallel parsing window are counted twice.
14 | * Added a small test package to help detect future regressions.
15 | * Added `--version` argument to print program version number.
16 | * Removed `--lorder` argument as users should not need to select the chain order.
17 | 
18 | 0.0.92
19 | ------
20 | 
21 | * Adds `--dumpmatrix` and `--dumpsnps` debugging options.
22 | * Clean up Hansel matrix initialisation.
23 | * Add `gretel-snpper` command for generating naive VCF.
24 | * Fix a regression where the `L` parameter of the matrix is incorrectly left unset.
25 | 
26 | 0.0.90
27 | ------
28 | Resolves a bug whereby SNPs are incorrectly parsed from the BAM if either:
29 |     * its quality score is below 13
30 |     * the read is overlapped by its primary mate
31 | 
32 | Well covered data sets need not be overly affected by the additional noise that
33 | may have been introduced, but the problem is more noticeable with low coverage
34 | and you may wish to reapply Gretel to affected data. Sorry.
35 | 
36 | 
37 | 0.0.81
38 | ------
39 | * Add warning and advice when an entry in Hansel is missing evidence.
40 | * Make the 'Unable to select' warning sound much less bad because it is normal.
41 | 
42 | 0.0.8
43 | -----
44 | * Docs
45 | * Deprecate `gretel-crumbs` command
46 | 
47 | 0.0.7
48 | -----
49 | * Further improvements to parallel read processing
50 | * Add `-` symbol to enable support for deletions
51 | 
52 | 0.0.6b
53 | ------
54 | * Fix setting of `L` parameter
55 | 
56 | 0.0.6
57 | -----
58 | * MULTIPROCESSING
59 | * Re-write read handling, again
60 | 
61 | 0.0.5
62 | -----
63 | * `-s` and `-e` introduced to allow specification of positions between which
64 |   to recover haplotypes
65 | * Attempt some basic indel handling
66 | * Fix a bug where the master sequence was altered by the output of each
67 |   reported haplotype
68 | 
69 | 0.0.4
70 | -----
71 | * Add experimental `--sentinels` option
72 | * Improve docs
73 | 
74 | 0.0.3
75 | -----
76 | * Hansel is now seperate from Gretel
77 | * [Hansel] `get_marginal_at` is `now get_counts_at`
78 | * [Hansel] `selext_next_edge_at` deprecated
79 | * Gene recovery and likelihood plots are now on seperate panels
80 | * Re-write methods to add observations to matrix to be less awful to read
81 | * Drop `--hit` and `--gene` options to verification
82 | * Replace verification script to `gretel-crumbs` command
83 | 
84 | 0.0.2
85 | -----
86 | * Improve documentation.
87 | * Provide `util` subpackage for filling `Hansel` structure with BAM observations.
88 | * Explicitly provide possible symbols to `Hansel`.
89 | * Improve plotting
90 | * Remove `process_hits` and `process_refs` as these are no longer needed.
91 | * Rename `establish_path` to `generate_path`
92 | * Rename `add_ignore_support3` to `reweight_hansel_from_graph` so we have some sort of indication of what it does.
93 | * Altered Sphinx configuation.
94 | 
95 | 0.0.1
96 | -----
97 | * Import repository from `claw`.
98 | 


--------------------------------------------------------------------------------
/docs/protocol.rst:
--------------------------------------------------------------------------------
 1 | Protocol
 2 | ========
 3 | 
 4 | **Gretel** provides a command line tool for the recovery of haplotypes.
 5 | We recommend the following protocol.
 6 | 
 7 | Read Alignment
 8 | --------------
 9 | 
10 | **Gretel** requires your reads to be aligned to a common reference. This is to
11 | ensure that reads share a co-ordinate system, on which we can call for variants
12 | and recover haplotypes. The reference itself is of little consequence, though
13 | dropped reads will lead to evidence to be unavailable to Gretel.
14 | 
15 | Construction of a *de novo* consensus assembly for a metagenome is left as an exercise
16 | for the reader. Align the reads to your assembly (`bowtie2`, `minimap2` etc.).
17 | Sort and index the alignment BAM.
18 | 
19 | Variant Calling
20 | ---------------
21 | 
22 | **Gretel** is robust to sequencing error and misalignment noise, thus the
23 | calling of variants need not be carefully conducted. Typically we have used `samtools`,
24 | but for our own Gretel pipeline, we have aggressively called all heterogenous sites
25 | in an alignment as a SNP using the `snpper` tool in our `gretel-test repository
26 | <https://github.com/SamStudio8/gretel-test>`_.
27 | 
28 | For somewhat questionable reasoning, we currently require a compressed and indexed VCF: ::
29 | 
30 |     bgzip <my.vcf>
31 |     tabix <my.vcf.gz>
32 | 
33 | Invocation of Gretel
34 | --------------------
35 | As described in the README, Gretel is invoked as follows: ::
36 | 
37 |     gretel <my.sort.bam> <my.vcf.gz> <contig> [-s 1startpos] [-e 1endpos] [--master master.fa] [-o output_dir]
38 | 
39 | You must provide your sorted BAM, compressed VCF, and the name of the contig on which
40 | to recover haplotypes. Use `-s` and `-e` to specify the positions on the aligned reads between which
41 | to recover haplotypes from your metagenome.
42 | 
43 | By default, Gretel will output a FASTA containing the recovered SNPs, in order, for each haplotype.
44 | Providing an optional "master" FASTA sequence will permit Gretel to "fill in" the non-SNP positions
45 | (*i.e.* the positions between `-s` and `-e` that do not appear in the VCF) with the nucleotide from
46 | the pseudo-reference.
47 | 
48 | Gretel Outputs
49 | --------------
50 | 
51 | out.fasta
52 | ~~~~~~~~~
53 | A **FASTA** containing each of the recovered sequences, in the order they were found.
54 | Each sequence is named `<iteration>__-<log10 likelihood>`. Sequences are not wrapped.
55 | 
56 | gretel.crumbs
57 | ~~~~~~~~~~~~~
58 | 
59 | Additionally, Gretel outputs a whimsically named *crumbs* file, containing some potentially
60 | interesting metadata, as well as a record of each recovered haplotype.
61 | The first row is a comment containing the following (in order):
62 | 
63 | * The number of SNPs across the region of interest
64 | * The number of 'crumbs': paired observations added to the Hansel matrix
65 | * The number of 'slices': reads with at least one observation added to the Hansel matrix
66 | * The chosen value of `L` for the `L`'th order Markov chain
67 | 
68 | The rest of the file contains tab-delimited metadata for each recovered haplotype:
69 | 
70 | * The iteration number, starting from 0
71 | * The number of times this haplotype was returned
72 | * The *weighted* likelihood of the haplotype, given the Hansel matrix at the time the haplotype was recovered (comma-sep for each time the haplotype was returned)
73 | * The *unweighted* likelihood of the haplotype, given the Hansel matrix at the time the reads were parsed (comma-sep for each time the haplotype was returned)
74 | * The haplotype magnitude: total number of observations removed from the Hansel matrix by the reweighting mechanism
75 | 
76 | In practice, we rank with the **weighted** likelihoods to discern the haplotypes most likely to exist in the metagenome.
77 | One may attempt to use the *unweighted* likelihoods as a means to compare the abundance, or read support, **between the returned haplotypes** (*i.e.* not necessarily the metagenome as a whole).
78 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | <p align="center">
  3 |     <img src="gretel-logo.png?raw=true?" alt="gretel-logo" width="200">
  4 | </p>
  5 | <h1 align="center">Gretel</h1>
  6 | <h3 align="center">An algorithm for recovering haplotypes from metagenomes. Sister to <a href="https://github.com/SamStudio8/hansel">Hansel</a>.
  7 | </h3>
  8 | <p align="center">
  9 | <a href="https://github.com/samstudio8/gretel/blob/master/LICENSE"><img src="https://img.shields.io/badge/license-MIT-orange.svg" alt="License"></a>
 10 | <a href="https://bioconda.github.io/recipes/gretel/README.html"><img src="https://anaconda.org/bioconda/gretel/badges/downloads.svg" alt="bioconda"></a>
 11 | </p>
 12 | </div>
 13 | 
 14 | 
 15 | What is it?
 16 | -----------
 17 | 
 18 | **Gretel** is a Python package providing a command line tool for the recovery of haplotypes
 19 | from metagenomic data sets. **Gretel** parses an alignment of reads into a **Hansel** matrix
 20 | and uses the evidence of SNP pairs observed to appear on the same reads to probabilistically
 21 | reconstruct the most likely haplotypes.
 22 | 
 23 | **Gretel** uses an L'th order Markov chain model to reconstruct likely sequences
 24 | of variants that constitute haplotypes in the real metagenome.
 25 | Our approach involves graph-like traversal of the data within the **Hansel** matrix.
 26 | Edges are probabilitically weighted based on the evidence on the reads, as well as
 27 | the haplotype as it has been reconstructed so far.
 28 | 
 29 | What can I use it for?
 30 | ----------------------
 31 | 
 32 | **Gretel** is designed to recover haplotypes from your data set, without the need for
 33 | setting (or optimisation) of any parameters.
 34 | **Gretel** does not require a priori knowledge of your input data (such as its contents, or
 35 | the true number of haplotypes) and makes no assumptions
 36 | regarding the distributions of alleles at variant sites and uses the available evidence
 37 | from the aligned reads without altering or discarding the observed varations.
 38 | 
 39 | Why should I use it?
 40 | --------------------
 41 | 
 42 | **Gretel** is the first tool capable of recovering haplotypes from metagenomes.
 43 | Whilst tools exist for analogous haplotyping problems, such as the assembly of
 44 | viral quasispecies, typically these tools rely on overlap approaches that create
 45 | too many unranked haplotypes. **Gretel** is capable of ranking the haplotypes it
 46 | outputs by their likelihood.
 47 | 
 48 | **Gretel** requires no parameters and our approach is robust to sequencing error
 49 | and misalignment noise.
 50 | 
 51 | Requirements
 52 | ------------
 53 | 
 54 | 
 55 |     $ pip install numpy hanselx pysam PyVCF
 56 | 
 57 | Install
 58 | -------
 59 | 
 60 | 
 61 |     $ pip install gretel
 62 | 
 63 | Alternatively, Gretel has been packaged for bioconda (Thanks [@johnne!](https://github.com/johnne)):
 64 | 
 65 |     $ conda install -c bioconda gretel
 66 | 
 67 | Usage
 68 | -----
 69 | You will require a sorted BAM containing your reads, aligned to some pseudo-reference.
 70 | You can use any sequence as your reference, such as a consensus assembly of the
 71 | metagenomic reads, or a known strain reference (such as HIV-1).
 72 | You must bgzip and tabix your VCF.
 73 | 
 74 |     $ gretel <bam> <vcf.gz> <contig> -s <1-start> -e <1-end> --master <master.fa> -o <outdir>
 75 | 
 76 | 
 77 | Citation
 78 | --------
 79 | ```
 80 | @article{10.1093/bioinformatics/btaa977,
 81 |     author = {Nicholls, Samuel M and Aubrey, Wayne and De Grave, Kurt and Schietgat, Leander and Creevey, Christopher J and Clare, Amanda},
 82 |     title = "{On the complexity of haplotyping a microbial community}",
 83 |     journal = {Bioinformatics},
 84 |     volume = {37},
 85 |     number = {10},
 86 |     pages = {1360-1366},
 87 |     year = {2021},
 88 |     month = {01},
 89 |     issn = {1367-4803},
 90 |     doi = {10.1093/bioinformatics/btaa977},
 91 |     url = {https://doi.org/10.1093/bioinformatics/btaa977},
 92 |     eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/10/1360/38663805/btaa977.pdf},
 93 | }
 94 | ```
 95 | [Read more on Twitter](https://twitter.com/samstudio8/status/1329406136592834564)
 96 | 
 97 | License
 98 | -------
 99 | Hansel and Gretel are distributed under the MIT license, see LICENSE.
100 | 


--------------------------------------------------------------------------------
/gretel/gretel.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from math import log,log10,exp
  3 | import random
  4 | 
  5 | import numpy as np
  6 | 
  7 | from hansel import Hansel
  8 | from . import util
  9 | 
 10 | #TODO Should the denom of the conditional use the unique variants at i-l or i?
 11 | #TODO Util to parse known input and return SNP seq
 12 | 
 13 | def reweight_hansel_from_path(hansel, path, ratio):
 14 |     """
 15 |     Given a completed path, reweight the applicable pairwise observations in the Hansel structure.
 16 | 
 17 |     Parameters
 18 |     ----------
 19 |     hansel : :py:class:`hansel.hansel.Hansel`
 20 |         The Hansel structure currently being explored by Gretel.
 21 | 
 22 |     path : list{str}
 23 |         The ordered sequence of selected variants.
 24 | 
 25 |     ratio : float
 26 |         The proportion of evidence to remove from each paired observation that
 27 |         was considered to recover the provided path.
 28 | 
 29 |         It is recommended this be the smallest marginal distribution observed across selected variants.
 30 | 
 31 |         *i.e.* For each selected variant in the path, note the value of the
 32 |         marginal distribution for the probability of observing that particular
 33 |         variant at that genomic position. Parameterise the minimum value of
 34 |         those marginals.
 35 | 
 36 |     Returns
 37 |     -------
 38 |     Spent Observations : float
 39 |         The sum of removed observations from the Hansel structure.
 40 |     """
 41 | 
 42 |     size = 0
 43 | 
 44 |     """
 45 |     # Old re-implementation sans flip
 46 |     for i in range(0, len(path)-1):
 47 |         for j in range(0, i+1+1):
 48 |             # Reduce read supports
 49 |             if i == j:
 50 |                 continue
 51 |             size += hansel.reweight_observation(path[i], path[j], i, j, ratio)
 52 |     return size
 53 | 
 54 |     # Reduce adjacent evidence pairs
 55 |     for i in range(len(path)-1):
 56 |         size += hansel.reweight_observation(path[i], path[i+1], i, i+1, ratio)
 57 | 
 58 |     # Reduce other evidence pairs
 59 |     for j in range(1, len(path)):
 60 |         for i in range(0, j-1):
 61 |             size += hansel.reweight_observation(path[i], path[j], i, j, ratio)
 62 | 
 63 |     # Reduce other non-evidence pairs
 64 |     # I have no idea why this works so well, so we'll need to have a think about it
 65 |     # before we put it in Gretel proper...
 66 |     #for j in range(1, len(path)):
 67 |     #    for i in range(0, j-1):
 68 |     #        size += hansel.reweight_observation(path[j], path[i], j, i, ratio)
 69 |     #        pass
 70 | 
 71 |     # Reweight the rest of the matrix because we can at least explain that
 72 |     hansel.reweight_matrix( ratio / (hansel.L/10) )
 73 | 
 74 |     sys.stderr.write("[RWGT] Ratio %.3f, Removed %.1f\n" % (ratio, size))
 75 |     return size
 76 |     """
 77 | 
 78 |     # Let's keep the RW system as-is for now...
 79 |     size = 0
 80 |     for i in range(0, len(path)):
 81 |         for j in range(0, i+1+1):
 82 |             # Reduce read supports
 83 |             if i >= len(path)-1:
 84 |                 size += hansel.reweight_observation(path[i], path[j], i, i+1, ratio)
 85 |                 break #???
 86 |             else:
 87 |                 if j < i:
 88 |                     # This isn't just a case of j < i, but means that we are looking
 89 |                     # at the two SNPs the wrong way around, we must switch them before
 90 |                     # handing them over to reweight_observation
 91 |                     t_i = j
 92 |                     t_j = i
 93 |                 else:
 94 |                     t_i = i
 95 |                     t_j = j
 96 |                 size += hansel.reweight_observation(path[t_i], path[t_j], t_i, t_j, ratio)
 97 |     sys.stderr.write("[RWGT] Ratio %.3f, Removed %.1f\n" % (ratio, size))
 98 |     return size
 99 | 
100 | ## PATH GENERATION ############################################################
101 | 
102 | def generate_path(n_snps, hansel, original_hansel, debug_hpos=None):
103 |     """
104 |     Explore and generate the most likely path (haplotype) through the observed Hansel structure.
105 | 
106 |     Parameters
107 |     ----------
108 |     n_snps : int
109 |         The number of variants.
110 | 
111 |     hansel : :py:class:`hansel.hansel.Hansel`
112 |         The Hansel structure currently being explored by Gretel.
113 | 
114 |     original_hansel : :py:class:`hansel.hansel.Hansel`
115 |         A copy of the Hansel structure created by Gretel, before any reweighting.
116 | 
117 |     Returns
118 |     -------
119 |     Path : list{str} or None
120 |         The sequence of variants that represent the completed path (or haplotype), or None
121 |         if one could not be successfully constructed.
122 | 
123 |     Path Probabilities : dict{str, float}
124 |         The `hp_original` (orignal Hansel) and `hp_current` (current Hansel) joint
125 |         probabilities of the variants in the returned path occurring together
126 |         in the given order.
127 | 
128 |     Minimum Marginal : float
129 |         The smallest marginal distribution observed across selected variants.
130 |     """
131 | 
132 |     # Cross the metahaplome in a greedily, naive fashion to establish a base path
133 |     # This seeds the rest of the path generation (we might want to just select
134 |     #   a random path here in future)
135 | 
136 |     running_prob = 0.0
137 |     running_prob_uw = 0.0
138 |     current_path = [ hansel.symbols_d['_'] ] # start with the dummy
139 |     marginals = []
140 | 
141 |     # Find path
142 |     sys.stderr.write("[NOTE] *Establishing next path\n")
143 |     for snp in range(1, n_snps+1):
144 |         #sys.stderr.write("\t*** ***\n")
145 |         #sys.stderr.write("\t[SNP_] SNP %d\n" % snp)
146 | 
147 |         dh_flag = False
148 |         if debug_hpos:
149 |             if snp in debug_hpos:
150 |                 dh_flag = True
151 | 
152 |         # Get marginal and calculate branch probabilities for each available
153 |         # mallele, given the current path seen so far
154 |         # Select the next branch and append it to the path
155 |         curr_branches = hansel.get_edge_weights_at(snp, current_path, debug=dh_flag)
156 |         #sys.stderr.write("\t[TREE] %s\n" % curr_branches)
157 |         # Return the symbol and probability of the next base to add to the
158 |         # current path based on the best marginal
159 |         next_v = 0.0
160 |         next_m = None
161 | 
162 |         if debug_hpos:
163 |             if snp in debug_hpos:
164 |                 print(curr_branches)
165 | 
166 |         for symbol in curr_branches:
167 |             if str(symbol) == "total":
168 |                 continue
169 |             if next_m is None:
170 |                 next_v = curr_branches[symbol]
171 |                 next_m = symbol
172 |             elif curr_branches[symbol] > next_v:
173 |                 next_v = curr_branches[symbol]
174 |                 next_m = symbol
175 | 
176 |         if next_m == None:
177 |             sys.stderr.write('''[NOTE] Unable to select next branch from SNP %d to %d
178 |        By design, Gretel will attempt to recover haplotypes until a hole in the graph has been found.
179 |        Recovery will intentionally terminate now.\n''' % (snp-1, snp))
180 |             return None, None, None
181 | 
182 |         selected_edge_weight = hansel.get_marginal_of_at(next_m, snp)
183 |         marginals.append(selected_edge_weight) #NOTE This isn't a log, as it is used as a ratio later
184 | 
185 |         running_prob += log10(selected_edge_weight)
186 |         running_prob_uw += log10(original_hansel.get_marginal_of_at(next_m, snp))
187 |         current_path.append(next_m)
188 | 
189 |     return current_path, {"hp_original": running_prob_uw, "hp_current": running_prob}, min(marginals)
190 | 
191 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | if "%SPHINXBUILD%" == "" (
  6 | 	set SPHINXBUILD=sphinx-build
  7 | )
  8 | set BUILDDIR=_build
  9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
 10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
 11 | if NOT "%PAPER%" == "" (
 12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
 14 | )
 15 | 
 16 | if "%1" == "" goto help
 17 | 
 18 | if "%1" == "help" (
 19 | 	:help
 20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 21 | 	echo.  html       to make standalone HTML files
 22 | 	echo.  dirhtml    to make HTML files named index.html in directories
 23 | 	echo.  singlehtml to make a single large HTML file
 24 | 	echo.  pickle     to make pickle files
 25 | 	echo.  json       to make JSON files
 26 | 	echo.  htmlhelp   to make HTML files and a HTML help project
 27 | 	echo.  qthelp     to make HTML files and a qthelp project
 28 | 	echo.  devhelp    to make HTML files and a Devhelp project
 29 | 	echo.  epub       to make an epub
 30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 31 | 	echo.  text       to make text files
 32 | 	echo.  man        to make manual pages
 33 | 	echo.  texinfo    to make Texinfo files
 34 | 	echo.  gettext    to make PO message catalogs
 35 | 	echo.  changes    to make an overview over all changed/added/deprecated items
 36 | 	echo.  xml        to make Docutils-native XML files
 37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
 38 | 	echo.  linkcheck  to check all external links for integrity
 39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
 40 | 	echo.  coverage   to run coverage check of the documentation if enabled
 41 | 	goto end
 42 | )
 43 | 
 44 | if "%1" == "clean" (
 45 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 46 | 	del /q /s %BUILDDIR%\*
 47 | 	goto end
 48 | )
 49 | 
 50 | 
 51 | REM Check if sphinx-build is available and fallback to Python version if any
 52 | %SPHINXBUILD% 1>NUL 2>NUL
 53 | if errorlevel 9009 goto sphinx_python
 54 | goto sphinx_ok
 55 | 
 56 | :sphinx_python
 57 | 
 58 | set SPHINXBUILD=python -m sphinx.__init__
 59 | %SPHINXBUILD% 2> nul
 60 | if errorlevel 9009 (
 61 | 	echo.
 62 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 63 | 	echo.installed, then set the SPHINXBUILD environment variable to point
 64 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 65 | 	echo.may add the Sphinx directory to PATH.
 66 | 	echo.
 67 | 	echo.If you don't have Sphinx installed, grab it from
 68 | 	echo.http://sphinx-doc.org/
 69 | 	exit /b 1
 70 | )
 71 | 
 72 | :sphinx_ok
 73 | 
 74 | 
 75 | if "%1" == "html" (
 76 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 77 | 	if errorlevel 1 exit /b 1
 78 | 	echo.
 79 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 80 | 	goto end
 81 | )
 82 | 
 83 | if "%1" == "dirhtml" (
 84 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 85 | 	if errorlevel 1 exit /b 1
 86 | 	echo.
 87 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 88 | 	goto end
 89 | )
 90 | 
 91 | if "%1" == "singlehtml" (
 92 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
 93 | 	if errorlevel 1 exit /b 1
 94 | 	echo.
 95 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
 96 | 	goto end
 97 | )
 98 | 
 99 | if "%1" == "pickle" (
100 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
101 | 	if errorlevel 1 exit /b 1
102 | 	echo.
103 | 	echo.Build finished; now you can process the pickle files.
104 | 	goto end
105 | )
106 | 
107 | if "%1" == "json" (
108 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
109 | 	if errorlevel 1 exit /b 1
110 | 	echo.
111 | 	echo.Build finished; now you can process the JSON files.
112 | 	goto end
113 | )
114 | 
115 | if "%1" == "htmlhelp" (
116 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
117 | 	if errorlevel 1 exit /b 1
118 | 	echo.
119 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
120 | .hhp project file in %BUILDDIR%/htmlhelp.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "qthelp" (
125 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
129 | .qhcp project file in %BUILDDIR%/qthelp, like this:
130 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\Gretel.qhcp
131 | 	echo.To view the help file:
132 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\Gretel.ghc
133 | 	goto end
134 | )
135 | 
136 | if "%1" == "devhelp" (
137 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
138 | 	if errorlevel 1 exit /b 1
139 | 	echo.
140 | 	echo.Build finished.
141 | 	goto end
142 | )
143 | 
144 | if "%1" == "epub" (
145 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
146 | 	if errorlevel 1 exit /b 1
147 | 	echo.
148 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
149 | 	goto end
150 | )
151 | 
152 | if "%1" == "latex" (
153 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
154 | 	if errorlevel 1 exit /b 1
155 | 	echo.
156 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
157 | 	goto end
158 | )
159 | 
160 | if "%1" == "latexpdf" (
161 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
162 | 	cd %BUILDDIR%/latex
163 | 	make all-pdf
164 | 	cd %~dp0
165 | 	echo.
166 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
167 | 	goto end
168 | )
169 | 
170 | if "%1" == "latexpdfja" (
171 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
172 | 	cd %BUILDDIR%/latex
173 | 	make all-pdf-ja
174 | 	cd %~dp0
175 | 	echo.
176 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
177 | 	goto end
178 | )
179 | 
180 | if "%1" == "text" (
181 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
182 | 	if errorlevel 1 exit /b 1
183 | 	echo.
184 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
185 | 	goto end
186 | )
187 | 
188 | if "%1" == "man" (
189 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
190 | 	if errorlevel 1 exit /b 1
191 | 	echo.
192 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
193 | 	goto end
194 | )
195 | 
196 | if "%1" == "texinfo" (
197 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
198 | 	if errorlevel 1 exit /b 1
199 | 	echo.
200 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
201 | 	goto end
202 | )
203 | 
204 | if "%1" == "gettext" (
205 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
206 | 	if errorlevel 1 exit /b 1
207 | 	echo.
208 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
209 | 	goto end
210 | )
211 | 
212 | if "%1" == "changes" (
213 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
214 | 	if errorlevel 1 exit /b 1
215 | 	echo.
216 | 	echo.The overview file is in %BUILDDIR%/changes.
217 | 	goto end
218 | )
219 | 
220 | if "%1" == "linkcheck" (
221 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | 	if errorlevel 1 exit /b 1
223 | 	echo.
224 | 	echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | 	goto end
227 | )
228 | 
229 | if "%1" == "doctest" (
230 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | 	if errorlevel 1 exit /b 1
232 | 	echo.
233 | 	echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | 	goto end
236 | )
237 | 
238 | if "%1" == "coverage" (
239 | 	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
240 | 	if errorlevel 1 exit /b 1
241 | 	echo.
242 | 	echo.Testing of coverage in the sources finished, look at the ^
243 | results in %BUILDDIR%/coverage/python.txt.
244 | 	goto end
245 | )
246 | 
247 | if "%1" == "xml" (
248 | 	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
249 | 	if errorlevel 1 exit /b 1
250 | 	echo.
251 | 	echo.Build finished. The XML files are in %BUILDDIR%/xml.
252 | 	goto end
253 | )
254 | 
255 | if "%1" == "pseudoxml" (
256 | 	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
257 | 	if errorlevel 1 exit /b 1
258 | 	echo.
259 | 	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
260 | 	goto end
261 | )
262 | 
263 | :end
264 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 21 | 
 22 | .PHONY: help
 23 | help:
 24 | 	@echo "Please use \`make <target>' where <target> is one of"
 25 | 	@echo "  html       to make standalone HTML files"
 26 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 27 | 	@echo "  singlehtml to make a single large HTML file"
 28 | 	@echo "  pickle     to make pickle files"
 29 | 	@echo "  json       to make JSON files"
 30 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 31 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 32 | 	@echo "  applehelp  to make an Apple Help Book"
 33 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 34 | 	@echo "  epub       to make an epub"
 35 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 36 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 37 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 38 | 	@echo "  text       to make text files"
 39 | 	@echo "  man        to make manual pages"
 40 | 	@echo "  texinfo    to make Texinfo files"
 41 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 42 | 	@echo "  gettext    to make PO message catalogs"
 43 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 44 | 	@echo "  xml        to make Docutils-native XML files"
 45 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 46 | 	@echo "  linkcheck  to check all external links for integrity"
 47 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 48 | 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
 49 | 
 50 | .PHONY: clean
 51 | clean:
 52 | 	rm -rf $(BUILDDIR)/*
 53 | 
 54 | .PHONY: html
 55 | html:
 56 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 57 | 	@echo
 58 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 59 | 
 60 | .PHONY: dirhtml
 61 | dirhtml:
 62 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 63 | 	@echo
 64 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 65 | 
 66 | .PHONY: singlehtml
 67 | singlehtml:
 68 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 69 | 	@echo
 70 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 71 | 
 72 | .PHONY: pickle
 73 | pickle:
 74 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 75 | 	@echo
 76 | 	@echo "Build finished; now you can process the pickle files."
 77 | 
 78 | .PHONY: json
 79 | json:
 80 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 81 | 	@echo
 82 | 	@echo "Build finished; now you can process the JSON files."
 83 | 
 84 | .PHONY: htmlhelp
 85 | htmlhelp:
 86 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 87 | 	@echo
 88 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 89 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 90 | 
 91 | .PHONY: qthelp
 92 | qthelp:
 93 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 94 | 	@echo
 95 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 96 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 97 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/Gretel.qhcp"
 98 | 	@echo "To view the help file:"
 99 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/Gretel.qhc"
100 | 
101 | .PHONY: applehelp
102 | applehelp:
103 | 	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
104 | 	@echo
105 | 	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
106 | 	@echo "N.B. You won't be able to view it unless you put it in" \
107 | 	      "~/Library/Documentation/Help or install it in your application" \
108 | 	      "bundle."
109 | 
110 | .PHONY: devhelp
111 | devhelp:
112 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
113 | 	@echo
114 | 	@echo "Build finished."
115 | 	@echo "To view the help file:"
116 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/Gretel"
117 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Gretel"
118 | 	@echo "# devhelp"
119 | 
120 | .PHONY: epub
121 | epub:
122 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
123 | 	@echo
124 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
125 | 
126 | .PHONY: latex
127 | latex:
128 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
129 | 	@echo
130 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
131 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
132 | 	      "(use \`make latexpdf' here to do that automatically)."
133 | 
134 | .PHONY: latexpdf
135 | latexpdf:
136 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
137 | 	@echo "Running LaTeX files through pdflatex..."
138 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
139 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
140 | 
141 | .PHONY: latexpdfja
142 | latexpdfja:
143 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
144 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
145 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
146 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
147 | 
148 | .PHONY: text
149 | text:
150 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
151 | 	@echo
152 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
153 | 
154 | .PHONY: man
155 | man:
156 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
157 | 	@echo
158 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
159 | 
160 | .PHONY: texinfo
161 | texinfo:
162 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
163 | 	@echo
164 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
165 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
166 | 	      "(use \`make info' here to do that automatically)."
167 | 
168 | .PHONY: info
169 | info:
170 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
171 | 	@echo "Running Texinfo files through makeinfo..."
172 | 	make -C $(BUILDDIR)/texinfo info
173 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
174 | 
175 | .PHONY: gettext
176 | gettext:
177 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
178 | 	@echo
179 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
180 | 
181 | .PHONY: changes
182 | changes:
183 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
184 | 	@echo
185 | 	@echo "The overview file is in $(BUILDDIR)/changes."
186 | 
187 | .PHONY: linkcheck
188 | linkcheck:
189 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
190 | 	@echo
191 | 	@echo "Link check complete; look for any errors in the above output " \
192 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
193 | 
194 | .PHONY: doctest
195 | doctest:
196 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
197 | 	@echo "Testing of doctests in the sources finished, look at the " \
198 | 	      "results in $(BUILDDIR)/doctest/output.txt."
199 | 
200 | .PHONY: coverage
201 | coverage:
202 | 	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
203 | 	@echo "Testing of coverage in the sources finished, look at the " \
204 | 	      "results in $(BUILDDIR)/coverage/python.txt."
205 | 
206 | .PHONY: xml
207 | xml:
208 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
209 | 	@echo
210 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
211 | 
212 | .PHONY: pseudoxml
213 | pseudoxml:
214 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
215 | 	@echo
216 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
217 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # Gretel documentation build configuration file, created by
  4 | # sphinx-quickstart on Mon Jun 13 13:40:08 2016.
  5 | #
  6 | # This file is execfile()d with the current directory set to its
  7 | # containing dir.
  8 | #
  9 | # Note that not all possible configuration values are present in this
 10 | # autogenerated file.
 11 | #
 12 | # All configuration values have a default; values that are commented out
 13 | # serve to show the default.
 14 | 
 15 | import sys
 16 | import os
 17 | 
 18 | # If extensions (or modules to document with autodoc) are in another directory,
 19 | # add these directories to sys.path here. If the directory is relative to the
 20 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 21 | sys.path.insert(0, os.path.abspath('.'))
 22 | sys.path.insert(0, os.path.abspath('../'))
 23 | 
 24 | import mock
 25 | MOCK_MODULES = ['numpy', 'matplotlib', 'pysam', 'vcf', 'hansel']
 26 | for mod_name in MOCK_MODULES:
 27 |     sys.modules[mod_name] = mock.Mock()
 28 | 
 29 | # -- General configuration ------------------------------------------------
 30 | 
 31 | # If your documentation needs a minimal Sphinx version, state it here.
 32 | #needs_sphinx = '1.0'
 33 | 
 34 | # Add any Sphinx extension module names here, as strings. They can be
 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 36 | # ones.
 37 | extensions = [
 38 |     'sphinx.ext.autodoc',
 39 |     'sphinx.ext.doctest',
 40 |     'sphinx.ext.todo',
 41 |     'sphinx.ext.coverage',
 42 |     'sphinx.ext.mathjax',
 43 |     'sphinx.ext.viewcode',
 44 |     'sphinx.ext.intersphinx',
 45 |     'sphinx.ext.napoleon',
 46 | ]
 47 | napoleon_google_docstring = False
 48 | napoleon_use_param = False
 49 | napoleon_use_ivar = True
 50 | 
 51 | # Add any paths that contain templates here, relative to this directory.
 52 | templates_path = ['_templates']
 53 | 
 54 | # The suffix(es) of source filenames.
 55 | # You can specify multiple suffix as a list of string:
 56 | # source_suffix = ['.rst', '.md']
 57 | source_suffix = '.rst'
 58 | 
 59 | # The encoding of source files.
 60 | #source_encoding = 'utf-8-sig'
 61 | 
 62 | # The master toctree document.
 63 | master_doc = 'index'
 64 | 
 65 | # General information about the project.
 66 | project = u'Gretel'
 67 | copyright = u'2016, Sam Nicholls'
 68 | author = u'Sam Nicholls'
 69 | 
 70 | # The version info for the project you're documenting, acts as replacement for
 71 | # |version| and |release|, also used in various other places throughout the
 72 | # built documents.
 73 | #
 74 | # The short X.Y version.
 75 | version = u'0.0.1a'
 76 | # The full version, including alpha/beta/rc tags.
 77 | release = u'0.0.1a'
 78 | 
 79 | # The language for content autogenerated by Sphinx. Refer to documentation
 80 | # for a list of supported languages.
 81 | #
 82 | # This is also used if you do content translation via gettext catalogs.
 83 | # Usually you set "language" from the command line for these cases.
 84 | language = None
 85 | 
 86 | # There are two options for replacing |today|: either, you set today to some
 87 | # non-false value, then it is used:
 88 | #today = ''
 89 | # Else, today_fmt is used as the format for a strftime call.
 90 | #today_fmt = '%B %d, %Y'
 91 | 
 92 | # List of patterns, relative to source directory, that match files and
 93 | # directories to ignore when looking for source files.
 94 | exclude_patterns = ['_build']
 95 | 
 96 | # The reST default role (used for this markup: `text`) to use for all
 97 | # documents.
 98 | #default_role = None
 99 | 
100 | # If true, '()' will be appended to :func: etc. cross-reference text.
101 | #add_function_parentheses = True
102 | 
103 | # If true, the current module name will be prepended to all description
104 | # unit titles (such as .. function::).
105 | #add_module_names = True
106 | 
107 | # If true, sectionauthor and moduleauthor directives will be shown in the
108 | # output. They are ignored by default.
109 | #show_authors = False
110 | 
111 | # The name of the Pygments (syntax highlighting) style to use.
112 | pygments_style = 'sphinx'
113 | 
114 | # A list of ignored prefixes for module index sorting.
115 | #modindex_common_prefix = []
116 | 
117 | # If true, keep warnings as "system message" paragraphs in the built documents.
118 | #keep_warnings = False
119 | 
120 | # If true, `todo` and `todoList` produce output, else they produce nothing.
121 | todo_include_todos = False
122 | 
123 | 
124 | # -- Options for HTML output ----------------------------------------------
125 | 
126 | # The theme to use for HTML and HTML Help pages.  See the documentation for
127 | # a list of builtin themes.
128 | html_theme = 'default'
129 | 
130 | # Theme options are theme-specific and customize the look and feel of a theme
131 | # further.  For a list of options available for each theme, see the
132 | # documentation.
133 | #html_theme_options = {}
134 | 
135 | # Add any paths that contain custom themes here, relative to this directory.
136 | #html_theme_path = []
137 | 
138 | # The name for this set of Sphinx documents.  If None, it defaults to
139 | # "<project> v<release> documentation".
140 | #html_title = None
141 | 
142 | # A shorter title for the navigation bar.  Default is the same as html_title.
143 | #html_short_title = None
144 | 
145 | # The name of an image file (relative to this directory) to place at the top
146 | # of the sidebar.
147 | #html_logo = None
148 | 
149 | # The name of an image file (within the static path) to use as favicon of the
150 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
151 | # pixels large.
152 | #html_favicon = None
153 | 
154 | # Add any paths that contain custom static files (such as style sheets) here,
155 | # relative to this directory. They are copied after the builtin static files,
156 | # so a file named "default.css" will overwrite the builtin "default.css".
157 | html_static_path = ['_static']
158 | 
159 | # Add any extra paths that contain custom files (such as robots.txt or
160 | # .htaccess) here, relative to this directory. These files are copied
161 | # directly to the root of the documentation.
162 | #html_extra_path = []
163 | 
164 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
165 | # using the given strftime format.
166 | #html_last_updated_fmt = '%b %d, %Y'
167 | 
168 | # If true, SmartyPants will be used to convert quotes and dashes to
169 | # typographically correct entities.
170 | #html_use_smartypants = True
171 | 
172 | # Custom sidebar templates, maps document names to template names.
173 | #html_sidebars = {}
174 | 
175 | # Additional templates that should be rendered to pages, maps page names to
176 | # template names.
177 | #html_additional_pages = {}
178 | 
179 | # If false, no module index is generated.
180 | #html_domain_indices = True
181 | 
182 | # If false, no index is generated.
183 | #html_use_index = True
184 | 
185 | # If true, the index is split into individual pages for each letter.
186 | #html_split_index = False
187 | 
188 | # If true, links to the reST sources are added to the pages.
189 | #html_show_sourcelink = True
190 | 
191 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
192 | #html_show_sphinx = True
193 | 
194 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
195 | #html_show_copyright = True
196 | 
197 | # If true, an OpenSearch description file will be output, and all pages will
198 | # contain a <link> tag referring to it.  The value of this option must be the
199 | # base URL from which the finished HTML is served.
200 | #html_use_opensearch = ''
201 | 
202 | # This is the file name suffix for HTML files (e.g. ".xhtml").
203 | #html_file_suffix = None
204 | 
205 | # Language to be used for generating the HTML full-text search index.
206 | # Sphinx supports the following languages:
207 | #   'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
208 | #   'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
209 | #html_search_language = 'en'
210 | 
211 | # A dictionary with options for the search language support, empty by default.
212 | # Now only 'ja' uses this config value
213 | #html_search_options = {'type': 'default'}
214 | 
215 | # The name of a javascript file (relative to the configuration directory) that
216 | # implements a search results scorer. If empty, the default will be used.
217 | #html_search_scorer = 'scorer.js'
218 | 
219 | # Output file base name for HTML help builder.
220 | htmlhelp_basename = 'Greteldoc'
221 | 
222 | # -- Options for LaTeX output ---------------------------------------------
223 | 
224 | latex_elements = {
225 | # The paper size ('letterpaper' or 'a4paper').
226 | #'papersize': 'letterpaper',
227 | 
228 | # The font size ('10pt', '11pt' or '12pt').
229 | #'pointsize': '10pt',
230 | 
231 | # Additional stuff for the LaTeX preamble.
232 | #'preamble': '',
233 | 
234 | # Latex figure (float) alignment
235 | #'figure_align': 'htbp',
236 | }
237 | 
238 | # Grouping the document tree into LaTeX files. List of tuples
239 | # (source start file, target name, title,
240 | #  author, documentclass [howto, manual, or own class]).
241 | latex_documents = [
242 |     (master_doc, 'Gretel.tex', u'Gretel Documentation',
243 |      u'Sam Nicholls', 'manual'),
244 | ]
245 | 
246 | # The name of an image file (relative to this directory) to place at the top of
247 | # the title page.
248 | #latex_logo = None
249 | 
250 | # For "manual" documents, if this is true, then toplevel headings are parts,
251 | # not chapters.
252 | #latex_use_parts = False
253 | 
254 | # If true, show page references after internal links.
255 | #latex_show_pagerefs = False
256 | 
257 | # If true, show URL addresses after external links.
258 | #latex_show_urls = False
259 | 
260 | # Documents to append as an appendix to all manuals.
261 | #latex_appendices = []
262 | 
263 | # If false, no module index is generated.
264 | #latex_domain_indices = True
265 | 
266 | 
267 | # -- Options for manual page output ---------------------------------------
268 | 
269 | # One entry per manual page. List of tuples
270 | # (source start file, name, description, authors, manual section).
271 | man_pages = [
272 |     (master_doc, 'gretel', u'Gretel Documentation',
273 |      [author], 1)
274 | ]
275 | 
276 | # If true, show URL addresses after external links.
277 | #man_show_urls = False
278 | 
279 | 
280 | # -- Options for Texinfo output -------------------------------------------
281 | 
282 | # Grouping the document tree into Texinfo files. List of tuples
283 | # (source start file, target name, title, author,
284 | #  dir menu entry, description, category)
285 | texinfo_documents = [
286 |     (master_doc, 'Gretel', u'Gretel Documentation',
287 |      author, 'Gretel', 'One line description of project.',
288 |      'Miscellaneous'),
289 | ]
290 | 
291 | # Documents to append as an appendix to all manuals.
292 | #texinfo_appendices = []
293 | 
294 | # If false, no module index is generated.
295 | #texinfo_domain_indices = True
296 | 
297 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
298 | #texinfo_show_urls = 'footnote'
299 | 
300 | # If true, do not generate a @detailmenu in the "Top" node's menu.
301 | #texinfo_no_detailmenu = False
302 | 
303 | 
304 | # Example configuration for intersphinx: refer to the Python standard library.
305 | intersphinx_mapping = {
306 |         "python": ('https://docs.python.org/', None),
307 |         "pysam": ('http://pysam.readthedocs.io/en/latest', None),
308 |         "hansel": ('http://hansel.readthedocs.io/en/latest/', None),
309 | }
310 | 


--------------------------------------------------------------------------------
/gretel/cmd.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import numpy as np
  3 | import sys
  4 | import os
  5 | 
  6 | from . import gretel
  7 | from . import util
  8 | 
  9 | __version__ = "0.0.94"
 10 | 
 11 | def main():
 12 |     """Gretel: A metagenomic haplotyper."""
 13 |     parser = argparse.ArgumentParser(description="Gretel: A metagenomic haplotyper.")
 14 |     parser.add_argument("bam")
 15 |     parser.add_argument("vcf")
 16 |     parser.add_argument("contig")
 17 |     parser.add_argument("-s", "--start", type=int, default=1, help="1-indexed included start base position [default: 1]")
 18 |     parser.add_argument("-e", "--end", type=int, default=-1, help="1-indexed inlcuded end base position [default: reference length]")
 19 | 
 20 |     #parser.add_argument("-l", "--lorder", type=int, default=0, help="Order of markov chain to predict next nucleotide [default calculated from read data]")
 21 |     parser.add_argument("-p", "--paths", type=int, default=100, help="Maximum number of paths to generate [default:100]")
 22 | 
 23 |     parser.add_argument("--master", default=None, help="Master sequence (will be used to fill in homogeneous gaps in haplotypes, otherwise --gapchar)") #TODO Use something other than N? Should probably be a valid IUPAC
 24 |     parser.add_argument("--gapchar", default="N", help="Character to fill homogeneous gaps in haplotypes if no --master [default N]")
 25 |     parser.add_argument("--delchar", default="", help="Character to output in haplotype for deletion (eg. -) [default is blank]")
 26 | 
 27 |     parser.add_argument("--quiet", default=False, action='store_true', help="Don't output anything other than a single summary line.")
 28 |     #parser.add_argument("--sentinels", default=False, action='store_true', help="Add additional sentinels for read ends [default:False][EXPERIMENTAL]")
 29 |     parser.add_argument("-o", "--out", default=".", help="Output directory [default .]")
 30 |     parser.add_argument("-@", "--threads", type=int, default=1, help="Number of BAM iterators [default 1]")
 31 | 
 32 |     parser.add_argument("--debugreads", type=str, default="", help="A newline delimited list of read names to output debug data when parsing the BAM")
 33 |     parser.add_argument("--debugpos", type=str, default="", help="A newline delimited list of 1-indexed genomic positions to output debug data when parsing the BAM")
 34 |     parser.add_argument("--debughpos", type=str, default=",", help="A comma delimited list of 1-indexed SNP positions to output debug data when predicting haplotypes")
 35 | 
 36 |     parser.add_argument("--dumpmatrix", type=str, default=None, help="Location to dump the Hansel matrix to disk")
 37 |     parser.add_argument("--dumpsnps", type=str, default=None, help="Location to dump the SNP positions to disk")
 38 | 
 39 |     parser.add_argument("--pepper", action="store_true", help="enable a more permissive pileup by setting the pysam pileup stepper to 'all', instead of 'samtools'.\nNote that this will allow improper pairs.")
 40 | 
 41 |     parser.add_argument("--version", action="version", version="%(prog)s " + __version__)
 42 | 
 43 |     ARGS = parser.parse_args()
 44 | 
 45 |     debug_hpos = []
 46 |     if ARGS.debughpos:
 47 |         for x in ARGS.debughpos.split(","):
 48 |             try:
 49 |                 debug_hpos.append( int(x) )
 50 |             except:
 51 |                 pass
 52 | 
 53 |     if ARGS.end == -1:
 54 |         ARGS.end = util.get_ref_len_from_bam(ARGS.bam, ARGS.contig)
 55 |         sys.stderr.write("[NOTE] Setting end_pos to %d" % ARGS.end)
 56 | 
 57 |     debug_reads = set([])
 58 |     if ARGS.debugreads:
 59 |         debug_fofn = open(ARGS.debugreads)
 60 |         for line in debug_fofn:
 61 |             debug_reads.add(line.strip())
 62 | 
 63 |     debug_pos = set([])
 64 |     if ARGS.debugpos:
 65 |         debug_fofn = open(ARGS.debugpos)
 66 |         for line in debug_fofn:
 67 |             debug_pos.add(int(line.strip()))
 68 | 
 69 |     VCF_h = util.process_vcf(ARGS.vcf, ARGS.contig, ARGS.start, ARGS.end)
 70 |     if ARGS.dumpsnps:
 71 |         snp_fh = open(ARGS.dumpsnps, 'w')
 72 |         for k in sorted(VCF_h["snp_fwd"].keys()):
 73 |             snp_fh.write("%d\t%d\t%d\n" % (VCF_h["snp_fwd"][k]+1, k, k-ARGS.start+1))
 74 |         snp_fh.close()
 75 | 
 76 |     # Could we optimise for lower triangle by collapsing one of the dimensions
 77 |     # such that Z[m][n][i][j] == Z[m][n][i + ((j-1)*(j))/2]
 78 |     hansel = util.load_from_bam(ARGS.bam, ARGS.contig, ARGS.start, ARGS.end, VCF_h, n_threads=ARGS.threads, debug_reads=debug_reads, debug_pos=debug_pos, stepper="all" if ARGS.pepper else "samtools")
 79 |     original_hansel = hansel.copy()
 80 | 
 81 |     if ARGS.dumpmatrix:
 82 |         hansel.save_hansel_dump(ARGS.dumpmatrix)
 83 | 
 84 |     # Check if there is a gap in the matrix
 85 |     for i in range(0, VCF_h["N"]+1):
 86 |         marginal = hansel.get_counts_at(i)
 87 | 
 88 |         if i > 0:
 89 |             snp_rev = VCF_h["snp_rev"][i-1]
 90 |         else:
 91 |             snp_rev = 0
 92 |         if marginal.get("total", 0) == 0:
 93 |             sys.stderr.write('''[FAIL] Unable to recover pairwise evidence concerning SNP #%d at position %d
 94 |        Gretel needs every SNP to appear on a read with at least one other SNP, at least once.
 95 |        There is no read in your data set that bridges SNP #%d with any of its neighbours.
 96 | 
 97 |        * If you are trying to run Gretel along an entire contig or genome, please note that
 98 |        this is not the recommended usage for Gretel, as it was intended to uncover the
 99 |        variation in a metahaplome: the set of haplotypes for a specific gene.
100 |            See our pre-print https://doi.org/10.1101/223404 for more information
101 | 
102 |        Consider running a prediction tool such as `prokka` on your assembly or reference
103 |        and using the CDS regions in the GFF for corresponding genes of interest to
104 |        uncover haplotypes with Gretel instead.
105 | 
106 |        * If you are already doing this, consider calling for SNPs more aggressively.
107 |        We use `snpper` (https://github.com/SamStudio8/gretel-test/blob/master/snpper.py)
108 |        to determine any site in a BAM that has at least one read in disagreement with
109 |        the reference as a SNP. Although this introduces noise from alignment and sequence
110 |        error, Gretel is fairly robust. Importantly, this naive calling method will
111 |        likely close gaps between SNPs and permit recovery.
112 | 
113 |        * Finally, consider that the gaps are indicative that your reads do not support
114 |        one or more parts of your assembly or reference. You could try and find or construct
115 |        a more suitable reference, or reduce the size of the recovery window.
116 | 
117 |        Sorry :(\n''' % (i, snp_rev, i))
118 |             sys.exit(1)
119 | 
120 |     PATHS = {}
121 | 
122 |     # Spew out exciting information about the SNPs
123 |     if not ARGS.quiet:
124 |         print ("i\tpos\tgap\tA\tC\tG\tT\tN\t-\t_\ttot")
125 |         last_rev = 0
126 |         for i in range(0, VCF_h["N"]+1):
127 |             marginal = hansel.get_counts_at(i)
128 |             marginal = {str(x): marginal[x] for x in marginal}
129 |             snp_rev = 0
130 |             if i > 0:
131 |                 snp_rev = VCF_h["snp_rev"][i-1]
132 |             print ("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (
133 |                 i,
134 |                 snp_rev,
135 |                 snp_rev - last_rev,
136 |                 marginal.get("A", 0),
137 |                 marginal.get("C", 0),
138 |                 marginal.get("G", 0),
139 |                 marginal.get("T", 0),
140 |                 marginal.get("N", 0),
141 |                 marginal.get("-", 0),
142 |                 marginal.get("_", 0),
143 |                 marginal.get("total", 0),
144 |             ))
145 |             last_rev = snp_rev
146 | 
147 | 
148 |     # Make some genes
149 |     SPINS = ARGS.paths
150 |     ongoing_mag = 0
151 |     for i in range(0, SPINS):
152 |         init_path, init_prob, init_min = gretel.generate_path(VCF_h["N"], hansel, original_hansel, debug_hpos=debug_hpos)
153 |         if init_path == None:
154 |             break
155 |         current_path = init_path
156 | 
157 |         MIN_REMOVE = 0.01 # 1%
158 |         if init_min < MIN_REMOVE:
159 |             sys.stderr.write("[RWGT] Ratio %.10f too small, adjusting to %.3f\n" % (init_min, MIN_REMOVE))
160 |             init_min = MIN_REMOVE
161 |         rw_magnitude = gretel.reweight_hansel_from_path(hansel, init_path, init_min)
162 | 
163 |         #TODO Horribly inefficient.
164 |         current_path_str = "".join([str(x) for x in current_path])
165 |         if current_path_str not in PATHS:
166 |             PATHS[current_path_str] = {
167 |                 "hp_current": [],
168 |                 "hp_original": [],
169 |                 "i": [],
170 |                 "i_0": i,
171 |                 "n": 0,
172 |                 "magnitude": 0,
173 |                 "hansel_path": current_path,
174 |             }
175 |         PATHS[current_path_str]["n"] += 1
176 |         PATHS[current_path_str]["i"].append(i)
177 |         PATHS[current_path_str]["magnitude"] += rw_magnitude
178 |         PATHS[current_path_str]["hp_current"].append(init_prob["hp_current"])
179 |         PATHS[current_path_str]["hp_original"].append(init_prob["hp_original"])
180 | 
181 |     # Output FASTA
182 |     dirn = ARGS.out + "/"
183 |     fasta_out_fh = open(dirn+"out.fasta", "w")
184 |     hfasta_out_fh = open(dirn+"snp.fasta", "w")
185 | 
186 |     if ARGS.master:
187 |         master_fa = util.load_fasta(ARGS.master)
188 |         master_seq = master_fa.fetch(master_fa.references[0])
189 |     else:
190 |         master_seq = [' '] * ARGS.end
191 | 
192 |     for p in sorted(PATHS, key=lambda x: PATHS[x]["i_0"]):
193 |         p = PATHS[p]
194 |         path = p["hansel_path"]
195 |         i = p["i_0"]
196 | 
197 |         seq = list(master_seq[:])
198 |         for j, mallele in enumerate(path[1:]):
199 |             snp_pos_on_master = VCF_h["snp_rev"][j]
200 |             try:
201 |                 if mallele == hansel.symbols_d["-"]:
202 |                     # It's a deletion, don't print a SNP
203 |                     seq[snp_pos_on_master-1] = ARGS.delchar
204 |                 else:
205 |                     seq[snp_pos_on_master-1] = mallele
206 |             except IndexError:
207 |                 print (path, len(seq), snp_pos_on_master-1)
208 |                 sys.exit(1)
209 | 
210 |         # Coerce HanselSymbols to str
211 |         to_write = "".join(str(x) for x in seq[ARGS.start-1 : ARGS.end])
212 |         if not ARGS.master:
213 |             to_write = to_write.replace(' ', ARGS.gapchar)
214 | 
215 |         fasta_out_fh.write(">%d__%.2f\n" % (i, p["hp_current"][0])) #TODO hp_current or hp_original?
216 |         fasta_out_fh.write("%s\n" % to_write)
217 | 
218 |         hfasta_out_fh.write(">%d__%.2f\n" % (i, p["hp_current"][0])) #TODO hp_current or hp_original?
219 |         hfasta_out_fh.write("%s\n" % "".join([str(x) for x in path[1:]]))
220 |     fasta_out_fh.close()
221 |     hfasta_out_fh.close()
222 | 
223 |     #TODO datetime, n_obs, n_slices, avg_obs_len, L, n_paths, n_avg_loglik
224 |     crumb_file = open(dirn+"gretel.crumbs", "w")
225 |     crumb_file.write("# %d\t%d\t%d\t%.2f\n" % (
226 |         VCF_h["N"],
227 |         hansel.n_crumbs,
228 |         hansel.n_slices,
229 |         hansel.L,
230 |     ))
231 | 
232 |     for p in sorted(PATHS, key=lambda x: PATHS[x]["hp_current"][0], reverse=True):
233 |         p = PATHS[p]
234 |         crumb_file.write("%d\t%d\t%s\t%s\t%.2f\n" % (
235 |                 p["i_0"],
236 |                 p["n"],
237 |                 ",".join(["%.2f" % x for x in p["hp_current"]]),
238 |                 ",".join(["%.2f" % x for x in p["hp_original"]]),
239 |                 p["magnitude"],
240 |         ))
241 | 


--------------------------------------------------------------------------------
/gretel/util.py:
--------------------------------------------------------------------------------
  1 | import pysam
  2 | import numpy as np
  3 | from math import ceil
  4 | from hansel import Hansel
  5 | import vcf
  6 | 
  7 | from multiprocessing import Process, Queue, Value
  8 | import sys
  9 | 
 10 | def get_ref_len_from_bam(bam_path, target_contig):
 11 |     """
 12 |     Fetch the length of a given reference sequence from a :py:class:`pysam.AlignmentFile`.
 13 | 
 14 |     Parameters
 15 |     ----------
 16 |     bam_path : str
 17 |         Path to the BAM alignment
 18 | 
 19 |     target_contig : str
 20 |         The name of the contig for which to recover haplotypes.
 21 | 
 22 |     Returns
 23 |     -------
 24 |     end_pos : int
 25 |         The 1-indexed genomic position at which to stop considering variants.
 26 |     """
 27 |     bam = pysam.AlignmentFile(bam_path)
 28 |     end = bam.lengths[bam.get_tid(target_contig)]
 29 |     bam.close()
 30 | 
 31 |     return end
 32 | 
 33 | def load_from_bam(bam_path, target_contig, start_pos, end_pos, vcf_handler, use_end_sentinels=False, n_threads=1, debug_reads=False, debug_pos=False, stepper="samtools"):
 34 |     """
 35 |     Load variants observed in a :py:class:`pysam.AlignmentFile` to
 36 |     an instance of :py:class:`hansel.hansel.Hansel`.
 37 | 
 38 |     Parameters
 39 |     ----------
 40 |     bam_path : str
 41 |         Path to the BAM alignment
 42 | 
 43 |     target_contig : str
 44 |         The name of the contig for which to recover haplotypes.
 45 | 
 46 |     start_pos : int
 47 |         The 1-indexed genomic position from which to begin considering variants.
 48 | 
 49 |     end_pos : int
 50 |         The 1-indexed genomic position at which to stop considering variants.
 51 | 
 52 |     vcf_handler : dict{str, any}
 53 |         Variant metadata, as provided by :py:func:`gretel.gretel.process_vcf`.
 54 | 
 55 |     use_end_sentinels : boolean, optional(default=False)
 56 |         Whether or not to append an additional pairwise observation between
 57 |         the final variant on a read towards a sentinel.
 58 | 
 59 |         .. note:: Experimental
 60 |           This feature is for testing purposes, currently it is recommended
 61 |           that the flag be left at the default of `False`. However, some
 62 |           data sets report minor performance improvements for some haplotypes
 63 |           when set to `True`.
 64 |           This flag may be removed at any time without warning.
 65 | 
 66 |     n_threads : int, optional(default=1)
 67 |         Number of threads to spawn for reading the BAM
 68 | 
 69 |     debug_reads : list{str}, optional
 70 |         A list of read names for which to print out debugging information
 71 | 
 72 |     debug_pos : list{int}, optional
 73 |         A list of positions for which to print out debugging information
 74 | 
 75 |     stepper : str, optional(default=samtools)
 76 |         The pysam pileup stepper to use
 77 | 
 78 |     Returns
 79 |     -------
 80 |     Hansel : :py:class:`hansel.hansel.Hansel`
 81 |     """
 82 | 
 83 |     hansel = Hansel.init_matrix(['A', 'C', 'G', 'T', 'N', "-", "_"], ['N', "_"], vcf_handler["N"])
 84 | 
 85 |     if not debug_reads:
 86 |         debug_reads = set([])
 87 |     if not debug_pos:
 88 |         debug_pos = set([])
 89 | 
 90 |     import random
 91 |     def progress_worker(progress_q, n_workers, slices, total_snps, crumbs):
 92 |         worker_pos = []
 93 |         worker_done = []
 94 |         for _ in range(0, n_workers):
 95 |             worker_pos.append(0)
 96 |             worker_done.append(0)
 97 | 
 98 |         while sum(worker_done) < n_workers:
 99 |             work_block = progress_q.get()
100 |             worker_pos[work_block["worker_i"]] = work_block["pos"]
101 |             if work_block["pos"] is None:
102 |                 worker_done[work_block["worker_i"]] = 1
103 | 
104 |                 crumbs.value += work_block["crumbs"]
105 |                 slices.value += work_block["slices"]
106 |                 total_snps.value += work_block["covered_snps"]
107 |                 sys.stderr.write("%s\n" % ([ worker_pos[i] if status != 1 else None for (i, status) in enumerate(worker_done)]))
108 |             if random.random() < 0.1:
109 |                 sys.stderr.write("%s\n" % ([ worker_pos[i] if status != 1 else None for (i, status) in enumerate(worker_done)]))
110 |         return (slices, total_snps, crumbs)
111 | 
112 |     def bam_worker(bam_q, progress_q, worker_i):
113 | 
114 |         worker = worker_i
115 | 
116 |         slices = 0
117 |         crumbs = 0
118 |         covered_snps = 0
119 | 
120 |         bam = pysam.AlignmentFile(bam_path)
121 | 
122 |         while True:
123 |             work_block = bam_q.get()
124 |             if work_block is None:
125 |                 progress_q.put({
126 |                     "pos": None,
127 |                     "worker_i": worker_i,
128 |                     "slices": slices,
129 |                     "crumbs": crumbs,
130 |                     "covered_snps": covered_snps,
131 |                 })
132 |                 break
133 | 
134 |             reads = {}
135 |             dreads = set([])
136 | 
137 |             for p_col in bam.pileup(reference=target_contig, start=work_block["start"]-1, stop=work_block["end"], ignore_overlaps=False, min_base_quality=0, stepper=stepper):
138 | 
139 |                 if p_col.reference_pos + 1 > end_pos:
140 |                     # Ignore positions beyond the end_pos
141 |                     break
142 | 
143 |                 if vcf_handler["region"][p_col.reference_pos+1] != 1:
144 |                     # Ignore non-SNPs
145 |                     continue
146 | 
147 |                 for p_read in p_col.pileups:
148 | 
149 |                     curr_read_1or2 = 0
150 |                     if p_read.alignment.is_paired:
151 |                         if p_read.alignment.is_read1:
152 |                             curr_read_1or2 = 1
153 |                         elif p_read.alignment.is_read2:
154 |                             curr_read_1or2 = 2
155 |                         else:
156 |                             #TODO Probably indicative of bad data
157 |                             pass
158 | 
159 | 
160 |                     curr_read_name = "%s_%s_%d" % (p_read.alignment.query_name, str(p_read.alignment.flag), curr_read_1or2)
161 | 
162 |                     LEFTMOST_1pos = p_read.alignment.reference_start + 1 # Convert 0-based reference_start to 1-based position (to match region array and 1-based VCF)
163 | 
164 |                     # Special case: Consider reads that begin before the start_pos, but overlap the 0th block
165 |                     if work_block["i"] == 0:
166 |                         if LEFTMOST_1pos < start_pos:
167 |                             # Read starts before the start_pos
168 |                             if p_read.alignment.reference_start + 1 + p_read.alignment.query_alignment_length < start_pos:
169 |                                 # Read ends before the start_pos
170 |                                 continue
171 |                             LEFTMOST_1pos = start_pos
172 |                     else:
173 |                         # This read begins before the start of the current (non-0) block
174 |                         # and will have already been covered by the block that preceded it
175 |                         if LEFTMOST_1pos < work_block["start"]:
176 |                             continue
177 | 
178 |                     sequence = None
179 |                     qual = None
180 |                     if p_read.is_del:
181 |                         # TODO Not sure about how to estimate quality of deletion?
182 |                         sequence = "-" * (abs(p_read.indel) + 1)
183 |                         qual = p_read.alignment.query_qualities[p_read.query_position_or_next] * (abs(p_read.indel) + 1)
184 |                     elif p_read.indel > 0:
185 |                         # p_read.indel peeks to next CIGAR and determines whether the base FOLLOWING this one is an insertion or not
186 |                         sequence = p_read.alignment.query_sequence[p_read.query_position : p_read.query_position + p_read.indel + 1]
187 |                         qual = p_read.alignment.query_qualities[p_read.query_position : p_read.query_position + p_read.indel + 1]
188 |                     else:
189 |                         sequence = p_read.alignment.query_sequence[p_read.query_position]
190 |                         qual = p_read.alignment.query_qualities[p_read.query_position]
191 | 
192 |                     if not sequence:
193 |                         print("[WARN] Sequence data seems to not be correctly salvaged from read %s" % p_read.alignment.query_name)
194 |                         continue
195 | 
196 |                     if curr_read_name not in reads:
197 |                         reads[curr_read_name] = {
198 |                             "rank": np.sum(vcf_handler["region"][1 : LEFTMOST_1pos]), # non-inclusive 1pos end
199 |                             "seq": [],
200 |                             "quals": [],
201 |                             "refs_1pos": [],
202 |                             "read_variants_0pos": [],
203 |                         }
204 |                         if p_read.alignment.query_name in debug_reads:
205 |                             dreads.add(curr_read_name)
206 |                     reads[curr_read_name]["seq"].append(sequence)
207 |                     reads[curr_read_name]["quals"].append(qual)
208 |                     reads[curr_read_name]["refs_1pos"].append(p_col.reference_pos+1)
209 |                     reads[curr_read_name]["read_variants_0pos"].append(p_read.query_position)
210 | 
211 | 
212 |             for dread in sorted(dreads):
213 |                 r = reads[dread]
214 |                 if r:
215 |                     for snp_i, ref_pos in enumerate(r["refs_1pos"]):
216 |                         print (dread, ref_pos, r["seq"][snp_i])
217 |                     print ("RANK", dread, r["rank"])
218 | 
219 |             if debug_pos:
220 |                 for read in reads:
221 |                     for d_pos in set(reads[read]["refs_1pos"]) & debug_pos:
222 |                         i = reads[read]["refs_1pos"].index(d_pos)
223 |                         print (read, d_pos, reads[read]["seq"][i])
224 | 
225 | 
226 |             num_reads = len(reads)
227 |             for qi, qname in enumerate(reads):
228 |                 progress_q.put({"pos": num_reads-(qi+1), "worker_i": worker_i})
229 | 
230 |                 if not len(reads[qname]["seq"]) > 1:
231 |                     # Ignore reads without evidence
232 |                     continue
233 |                 slices += 1
234 | 
235 |                 rank = reads[qname]["rank"]
236 |                 support_len = len(reads[qname]["seq"])
237 | 
238 |                 support_seq = "".join([b[0] for b in reads[qname]["seq"]]) # b[0] has the affect of capturing the base before any insertion
239 |                 covered_snps += len(support_seq.replace("N", "").replace("_", ""))
240 | 
241 |                 # For each position in the supporting sequence (that is, each covered SNP)
242 |                 for i in range(0, support_len):
243 |                     snp_a = support_seq[i]
244 | 
245 |                     #if support_len == 1:
246 |                     #    if rank == 0:
247 |                     #        hansel.add_observation('_', snp_a, 0, 1)
248 |                     #        hansel.add_observation(snp_a, '_', 1, 2)
249 |                     #    else:
250 |                     #        hansel.add_observation(snp_a, '_', rank+1, rank+2)
251 | 
252 | 
253 |                     # For each position in the supporting sequence following i
254 |                     for j in range(i+1, support_len):
255 |                         snp_b = support_seq[j]
256 | 
257 |                         # Ignore observations who are from an invalid transition
258 |                         if snp_a in ['_', 'N']:
259 |                             continue
260 | 
261 |                         # Sentinel->A
262 |                         if i==0 and j==1 and rank==0:
263 |                             # If this is the first position in the support (support_pos == 0)
264 |                             # and rank > 0 (that is, this is not the first SNP)
265 |                             # and SNPs a, b are adjacent
266 |                             hansel.add_observation('_', snp_a, 0, 1)
267 |                             hansel.add_observation(snp_a, snp_b, 1, 2)
268 |                             crumbs += 1
269 | 
270 |                         # B->Sentinel
271 |                         elif (j+rank+1) == vcf_handler["N"] and abs(i-j)==1:
272 |                             # Last observation (abs(i-j)==1),
273 |                             # that ends on the final SNP (j+rank+1 == N)
274 |                             hansel.add_observation(snp_a, snp_b, vcf_handler["N"]-1, vcf_handler["N"])
275 |                             hansel.add_observation(snp_b, '_', vcf_handler["N"], vcf_handler["N"]+1)
276 |                             crumbs += 1
277 | 
278 |                         # A regular observation (A->B)
279 |                         else:
280 |                             hansel.add_observation(snp_a, snp_b, i+rank+1, j+rank+1)
281 |                             crumbs += 1
282 | 
283 |                             if use_end_sentinels:
284 |                                 if j==(support_len-1) and abs(i-j)==1:
285 |                                     # The last SNP on a read, needs a sentinel afterward
286 |                                     hansel.add_observation(snp_b, '_', j+rank+1, j+rank+2)
287 | 
288 |     bam_queue = Queue()
289 |     progress_queue = Queue()
290 | 
291 |     # Queue the wokers
292 |     # TODO Evenly divide, but in future, consider the distn
293 |     # TODO Also consider in general block0 has more work to do
294 |     window_l = round((end_pos - start_pos) / float(n_threads))
295 |     for window_i, window_pos in enumerate(range(start_pos, end_pos+1, window_l)):
296 |         bam_queue.put({
297 |             "start": window_pos,
298 |             "end": window_pos + window_l - 1, # add -1 to stop end of window colliding with next window
299 |             "i": window_i,
300 |             "region_end": end_pos,
301 |         })
302 | 
303 |     processes = []
304 |     for _ in range(n_threads):
305 |         p = Process(target=bam_worker,
306 |                     args=(bam_queue, progress_queue, _))
307 |         processes.append(p)
308 | 
309 |     # ...and a progress process
310 |     n_reads = Value('i', 0)
311 |     n_observations = Value('i', 0)
312 |     total_covered_snps = Value('i', 0)
313 |     p = Process(target=progress_worker,
314 |                 args=(progress_queue, n_threads, n_reads, total_covered_snps, n_observations))
315 |     processes.append(p)
316 | 
317 |     for p in processes:
318 |         p.start()
319 | 
320 |     # Add sentinels
321 |     for _ in range(n_threads):
322 |         bam_queue.put(None)
323 | 
324 |     # Wait for processes to complete work
325 |     for p in processes:
326 |         p.join()
327 | 
328 | 
329 |     hansel.n_slices = n_reads.value
330 |     hansel.n_crumbs = n_observations.value
331 |     sys.stderr.write("[NOTE] Loaded %d breadcrumbs from %d bread slices.\n" % (hansel.n_crumbs, hansel.n_slices))
332 | 
333 |     hansel.L = int(ceil(float(total_covered_snps.value)/n_reads.value))
334 |     sys.stderr.write("[NOTE] Setting Gretel.L to %d\n" % hansel.L)
335 |     return hansel
336 | 
337 | def load_fasta(fa_path):
338 |     """
339 |     A convenient wrapper function for constructing a :py:class:`pysam.FastaFile`
340 | 
341 |     Parameters
342 |     ----------
343 |     fa_path : str
344 |         Path to FASTA
345 | 
346 |     Returns
347 |     -------
348 | 
349 |     FASTA File Interface : :py:class:`pysam.FastaFile`
350 |     """
351 |     return pysam.FastaFile(fa_path)
352 | 
353 | 
354 | def process_vcf(vcf_path, contig_name, start_pos, end_pos):
355 |     """
356 |     Parse a VCF to extract the genomic positions of called variants.
357 | 
358 |     Parameters
359 |     ----------
360 |     vcf_path : str
361 |         Path to the VCF file.
362 | 
363 |     contig_name : str
364 |         Name of the target contig on which variants were called.
365 | 
366 |     start_pos : int
367 |         The 1-indexed genomic position from which to begin considering variants.
368 | 
369 |     end_pos : int
370 |         The 1-indexed genomic position at which to stop considering variants.
371 | 
372 |     Returns
373 |     -------
374 |     Gretel Metastructure : dict
375 |         A collection of structures used for the execution of Gretel.
376 |         The currently used keys are:
377 |             N : int
378 |                 The number of observed SNPs
379 |             snp_fwd : dict{int, int}
380 |                 A reverse lookup from the n'th variant, to its genomic position on the contig
381 |             snp_rev : dict{int, int}
382 |                 A forward lookup to translate the n'th genomic position to its i'th SNP rank
383 |             region : list{int}
384 |                 A masked representation of the target contig, positive values are variant positions
385 |     """
386 | 
387 |     # Open the VCF
388 |     fp = open(vcf_path, 'rb') # assumes bgzip and tabix
389 |     vcf_records = vcf.Reader(fp)
390 |     n_snps = 0
391 |     snp_reverse = {}
392 |     snp_forward = {}
393 |     region = np.zeros(end_pos + 1, dtype=int)
394 |     i = 0
395 |     for record in vcf_records.fetch(contig_name, 0, end_pos): # [0, 1end)
396 |         # record.POS is 1-indexed
397 |         if record.POS < start_pos:
398 |             continue
399 |         if record.POS > end_pos:
400 |             continue
401 | 
402 |         n_snps += 1
403 |         region[record.POS] = 1
404 |         snp_reverse[i] = record.POS
405 |         snp_forward[record.POS] = i
406 |         i += 1
407 |     fp.close()
408 | 
409 |     return {
410 |         "N": n_snps,
411 |         "snp_fwd": snp_forward,
412 |         "snp_rev": snp_reverse,
413 |         "region": region,
414 |     }
415 | 


--------------------------------------------------------------------------------