├── test
    ├── __init__.py
    ├── 1kg.vcf.gz
    ├── README.md
    ├── tb.vcf.gz
    ├── tb.vcf.gz.tbi
    ├── prof.py
    ├── issue-16.vcf
    ├── example-4.0.vcf
    ├── walk_left.vcf
    ├── example-4.1.vcf
    ├── example-4.1-sv.vcf
    ├── samtools.vcf
    ├── test.vcf
    ├── dbsnp.vcf
    ├── null_genotype_mono.vcf
    ├── test-gl.vcf
    ├── gatk.vcf
    └── test_vcf.py
├── cyvcf
    ├── version.py
    ├── utils.py
    ├── filters.py
    ├── __init__.py
    └── parser.pyx
├── docs
    ├── INTRO.rst
    ├── index.rst
    ├── API.rst
    ├── HISTORY.rst
    ├── FILTERS.rst
    ├── Makefile
    └── conf.py
├── .gitignore
├── MANIFEST.in
├── LICENSE
├── scripts
    ├── vcf_melt
    └── vcf_filter.py
├── README.rst
├── setup.py
└── ez_setup.py


/test/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/cyvcf/version.py:
--------------------------------------------------------------------------------
1 | __version__="0.1.16"
2 | 


--------------------------------------------------------------------------------
/test/1kg.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arq5x/cyvcf/HEAD/test/1kg.vcf.gz


--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
1 | Test data from GATK 1.4-9 and freebayes 0.9.4.
2 | 
3 | 
4 | 


--------------------------------------------------------------------------------
/test/tb.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arq5x/cyvcf/HEAD/test/tb.vcf.gz


--------------------------------------------------------------------------------
/docs/INTRO.rst:
--------------------------------------------------------------------------------
1 | Introduction
2 | ============
3 | 
4 | .. automodule:: vcf
5 | 
6 | 


--------------------------------------------------------------------------------
/test/tb.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arq5x/cyvcf/HEAD/test/tb.vcf.gz.tbi


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | build
3 | cyvcf.egg-info
4 | dist
5 | cyvcf/parser.c
6 | cyvcf/parser.so
7 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include ez_setup.py
2 | include LICENSE
3 | include README.rst
4 | include cyvcf/parser.c


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | PyVCF - A Variant Call Format Parser for Python 
 3 | ===============================================
 4 | 
 5 | Contents:
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    INTRO
11 |    API
12 |    FILTERS
13 |    HISTORY
14 |    
15 | 
16 | Indices and tables
17 | ==================
18 | 
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 | 
23 | 


--------------------------------------------------------------------------------
/docs/API.rst:
--------------------------------------------------------------------------------
 1 | API
 2 | ===
 3 | 
 4 | vcf.Reader
 5 | ----------
 6 | 
 7 | .. autoclass:: vcf.Reader
 8 |    :members:
 9 | 
10 | vcf.Writer
11 | ----------
12 | 
13 | .. autoclass:: vcf.Writer
14 |    :members:
15 | 
16 | vcf._Record
17 | -----------
18 | 
19 | .. autoclass:: vcf.parser._Record
20 |    :members:
21 | 
22 | vcf._Call
23 | ---------
24 | 
25 | .. autoclass:: vcf.parser._Call
26 |    :members:
27 | 
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/test/prof.py:
--------------------------------------------------------------------------------
 1 | import vcf
 2 | import cProfile
 3 | import timeit
 4 | import pstats
 5 | import sys
 6 | 
 7 | def parse_1kg():
 8 |     for line in vcf.Reader(filename='test/1kg.vcf.gz'):
 9 |         pass
10 | 
11 | if len(sys.argv) == 1:
12 |     sys.argv.append(None)
13 | 
14 | if sys.argv[1] == 'profile':
15 |     cProfile.run('parse_1kg()', '1kg.prof')
16 |     p = pstats.Stats('1kg.prof')
17 |     p.strip_dirs().sort_stats('time').print_stats()
18 | 
19 | elif sys.argv[1] == 'time':
20 |     n = 5
21 |     t = timeit.timeit('parse_1kg()',  "from __main__ import parse_1kg", number=n)
22 |     print t/n
23 | else:
24 |     print 'prof.py profile/time'
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2011 John Dougherty
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 4 | this software and associated documentation files (the "Software"), to deal in
 5 | the Software without restriction, including without limitation the rights to
 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 7 | the Software, and to permit persons to whom the Software is furnished to do so,
 8 | subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19 | 


--------------------------------------------------------------------------------
/cyvcf/utils.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | def walk_together(*readers):
 4 |     """ Walk a set of readers and return lists of records from each
 5 |         reader, with None if no record present.  Caller must check the
 6 |         inputs are sorted in the same way and use the same reference
 7 |         otherwise behaviour is undefined.
 8 |     """
 9 |     nexts = [reader.next() for reader in readers]
10 | 
11 |     while True:
12 |         min_next = min([x for x in nexts if x is not None])
13 | 
14 |         # this line uses equality on Records, which checks the ALTs
15 |         # not sure what to do with records that have overlapping but different
16 |         # variation
17 |         yield [x if x is None or x == min_next else None for x in nexts]
18 | 
19 |         # update nexts that we just yielded
20 |         for i, n in enumerate(nexts):
21 | 
22 |             if n is not None and n == min_next:
23 |                 try:
24 |                     nexts[i] = readers[i].next()
25 |                 except StopIteration:
26 |                     nexts[i] = None
27 | 
28 |         if all([x is None for x in nexts]):
29 |             break
30 | 


--------------------------------------------------------------------------------
/test/issue-16.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.0
 2 | ##fileDate=20090805
 3 | ##source=myImputationProgramV3.1
 4 | ##reference=1000GenomesPilot-NCBI36
 5 | ##phasing=partial
 6 | ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
 7 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
 8 | ##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
 9 | ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
10 | ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
11 | ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
12 | ##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
13 | ##FILTER=<ID=q10,Description="Quality below 10">
14 | ##FILTER=<ID=s50,Description="Less than 50% of samples have data">
15 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
16 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
17 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
18 | ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
19 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
20 | 20	1234568	.	G	.	.	PASS	NS=3;DP=9;AA=G	GT	./.	./.	./.
21 | 


--------------------------------------------------------------------------------
/scripts/vcf_melt:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ Melt a VCF file into a tab delimited set of calls, one per line
 3 | 
 4 | VCF files have all the calls from different samples on one line.  This 
 5 | script reads vcf on stdin and writes all calls to stdout in tab delimited 
 6 | format with one call in one sample per line.  This makes it easy to find 
 7 | a given sample's genotype with, say, grep.
 8 | """
 9 | 
10 | import sys
11 | import csv
12 | import vcf
13 | 
out = csv.writer(sys.stdout, delimiter='\t')
reader = vcf.VCFReader(sys.stdin)

# Materialise the FORMAT and INFO ids as lists: they are concatenated with
# other lists below and reused for every record, so the column order must be
# fixed once and must not depend on ``keys()`` returning a list.
formats = list(reader.formats.keys())
infos = list(reader.infos.keys())

# one column per FORMAT id, then the fixed site columns, then one
# ``info.<ID>`` column per INFO id
header = ["SAMPLE"] + formats + ['FILTER', 'CHROM', 'POS', 'REF', 'ALT', 'ID'] + ['info.'+ x for x in infos]


out.writerow(header)
24 | 
def flatten(x):
    """ Collapse a list or tuple into a comma separated string.

        Any other value (including None) is returned unchanged, so the
        function can be applied to every field before it is written out.
    """
    # isinstance also accepts list subclasses, which an exact type
    # comparison would silently skip
    if isinstance(x, (list, tuple)):
        x = ','.join(map(str, x))
    return x
29 | 
for record in reader:
    # the site-level values are identical for every sample of the record,
    # so build them once per record
    info_row = [flatten(record.INFO.get(x, None)) for x in infos]
    # ALT can hold several alleles; flatten it like the other multi-valued
    # fields so the column contains "G,T" rather than a Python list repr
    fixed = [record.CHROM, record.POS, record.REF, flatten(record.ALT), record.ID]

    # one output row per sample, in the same column order as the header
    for sample in record.samples:
        row = [sample.sample]
        row += [flatten(sample.data.get(x, None)) for x in formats]
        row += [record.FILTER or '.']
        row += fixed
        row += info_row
        out.writerow(row)
41 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | CyVCF
 2 | ======
 3 | 
 4 | A Cython port of the PyVCF library maintained by @jamescasbon.
 5 | 
 6 | The goal of this project is to provide a very fast Python library for parsing and manipulating large VCF files.
 7 | Cython has been used to optimize speed.  This version is approximately 4 times faster than PyVCF,
 8 | and the parsing speed is essentially identical to that of C/C++ libraries provided by PLINKSEQ and VCFLIB.
 9 | 
10 | The functionality and interface are currently the same as documented here: http://pyvcf.rtfd.org/
11 | 
12 | Installation
13 | ============
14 | 
15 |     python setup.py build
16 |     python setup.py install
17 | 
18 | 
19 | Testing
20 | =======
21 | 
22 |     python setup.py test
23 | 
24 | 
25 | Basic usage
26 | ===========
27 | 
28 |     >>> import cyvcf
29 |     >>> vcf_reader = cyvcf.Reader(open('test/example-4.0.vcf', 'rb'))
30 |     >>> for record in vcf_reader:
31 |     ...     print record
32 |     20	14370	G	A	29.0	.	H2=True;NS=3;DB=True;DP=14;AF=0.5	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.
33 |     20	17330	T	A	3.0	q10	NS=3;DP=11;AF=0.017	GT:GQ:DP:HQ	0|0:49:3:58,50	0|1:3:5:65,3	0/0:41:3:.
34 |     20	1110696	A	G,T	67.0	.	AA=T;NS=2;DB=True;DP=10;AF=0.333,0.667	GT:GQ:DP:HQ	1|2:21:6:23,27	2|1:2:0:18,2	2/2:35:4:.
35 |     20	1230237	T	.	47.0	.	AA=T;NS=3;DP=13	GT:GQ:DP:HQ	0|0:54:7:56,60	0|0:48:4:51,51	0/0:61:2:.
36 |     20	1234567	GTCT	G,GTACT	50.0	.	AA=G;NS=3;DP=9	GT:GQ:DP	./.	0/2:17:2	1/1:40:3
37 | 


--------------------------------------------------------------------------------
/test/example-4.0.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.0
 2 | ##fileDate=20090805
 3 | ##source=myImputationProgramV3.1
 4 | ##reference=1000GenomesPilot-NCBI36
 5 | ##phasing=partial
 6 | ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
 7 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
 8 | ##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
 9 | ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
10 | ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
11 | ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
12 | ##FILTER=<ID=q10,Description="Quality below 10">
13 | ##FILTER=<ID=s50,Description="Less than 50% of samples have data">
14 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
15 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
16 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
17 | ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
18 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
19 | 20	14370	rs6054257	G	A	29	PASS	NS=3;DP=14;AF=0.5;DB;H2	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.
20 | 20	17330	.	T	A	3	q10	NS=3;DP=11;AF=0.017	GT:GQ:DP:HQ	0|0:49:3:58,50	0|1:3:5:65,3	0/0:41:3
21 | 20	1110696	rs6040355	A	G,T	67	PASS	NS=2;DP=10;AF=0.333,0.667;AA=T;DB	GT:GQ:DP:HQ	1|2:21:6:23,27	2|1:2:0:18,2	2/2:35:4
22 | 20	1230237	.	T	.	47	PASS	NS=3;DP=13;AA=T	GT:GQ:DP:HQ	0|0:54:7:56,60	0|0:48:4:51,51	0/0:61:2
23 | 20	1234567	microsat1	GTCT	G,GTACT	50	PASS	NS=3;DP=9;AA=G	GT:GQ:DP	./.:35:4	0/2:17:2	1/1:40:3
24 | 


--------------------------------------------------------------------------------
/test/walk_left.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.0
 2 | ##fileDate=20090805
 3 | ##source=myImputationProgramV3.1
 4 | ##reference=1000GenomesPilot-NCBI36
 5 | ##phasing=partial
 6 | ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
 7 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
 8 | ##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
 9 | ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
10 | ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
11 | ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
12 | ##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
13 | ##FILTER=<ID=q10,Description="Quality below 10">
14 | ##FILTER=<ID=s50,Description="Less than 50% of samples have data">
15 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
16 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
17 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
18 | ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
19 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
20 | 19	14370	rs6054257	G	A	29	PASS	NS=3;DP=14;AF=0.5;DB;H2	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.
21 | 19	17330	.	T	A	3	q10	NS=3;DP=11;AF=0.017	GT:GQ:DP:HQ	0|0:49:3:58,50	0|1:3:5:65,3	0/0:41:3:65,3
22 | 20	1110696	rs6040355	A	G,T	67	PASS	NS=2;DP=10;AF=0.333,0.667;AA=T;DB	GT:GQ:DP:HQ	1|2:21:6:23,27	2|1:2:0:18,2	2/2:35:4:65,4
23 | 20	1230237	.	T	.	47	PASS	NS=3;DP=13;AA=T	GT:GQ:DP:HQ	0|0:54:7:56,60	0|0:48:4:51,51	0/0:61:2:65,3
24 | 21	1234567	microsat1	GTCT	G,GTACT	50	PASS	NS=3;DP=9;AA=G	GT:GQ:DP	./.:35:4	0/2:17:2	1/1:40:3:65,3
25 | 


--------------------------------------------------------------------------------
/test/example-4.1.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##fileDate=20090805
 3 | ##source=myImputationProgramV3.1
 4 | ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta
 5 | ##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species="Homo sapiens",taxonomy=x>
 6 | ##phasing=partial
 7 | ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
 8 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
 9 | ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
10 | ##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
11 | ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
12 | ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
13 | ##FILTER=<ID=q10,Description="Quality below 10">
14 | ##FILTER=<ID=s50,Description="Less than 50% of samples have data">
15 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
16 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
17 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
18 | ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
19 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
20 | 20	14370	rs6054257	G	A	29	PASS	NS=3;DP=14;AF=0.5;DB;H2	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.
21 | 20	17330	.	T	A	3	q10	NS=3;DP=11;AF=0.017	GT:GQ:DP:HQ	0|0:49:3:58,50	0|1:3:5:65,3	0/0:41:3
22 | 20	1110696	rs6040355	A	G,T	67	PASS	NS=2;DP=10;AF=0.333,0.667;AA=T;DB	GT:GQ:DP:HQ	1|2:21:6:23,27	2|1:2:0:18,2	2/2:35:4
23 | 20	1230237	.	T	.	47	PASS	NS=3;DP=13;AA=T	GT:GQ:DP:HQ	0|0:54:7:56,60	0|0:48:4:51,51	0/0:61:2
24 | 20	1234567	microsat1	GTC	G,GTCT	50	PASS	NS=3;DP=9;AA=G	GT:GQ:DP	0/1:35:4	0/2:17:2	1/1:40:3
25 | 


--------------------------------------------------------------------------------
/cyvcf/filters.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | class Base(object):
 4 |     """ Base class for vcf_filter.py filters """
 5 | 
 6 |     name = 'f'
 7 |     """ name used to activate filter and in VCF headers """
 8 | 
 9 |     description = 'VCF filter base class'
10 |     """ descrtiption used in vcf headers """
11 | 
12 |     @classmethod
13 |     def customize_parser(self, parser):
14 |         """ hook to extend argparse parser with custom arguments """
15 |         pass
16 | 
17 |     def __init__(self, args):
18 |         """ create the filter using argparse ``args`` """
19 |         self.threshold = 0
20 | 
21 |     def __call__(self):
22 |         """ filter a site, return not None if the site should be filtered """
23 |         raise NotImplementedError('Filters must implement this method')
24 | 
25 | 
26 |     def filter_name(self):
27 |         """ return the name to put in the VCF header, default is ``name`` + ``threshold`` """
28 |         return '%s%s' % (self.name, self.threshold)
29 | 
30 | 
class SiteQuality(Base):
    """ Filter sites whose QUAL is below the ``--site-quality`` threshold """

    description = 'Filter sites by quality'
    name = 'sq'

    @classmethod
    def customize_parser(cls, parser):
        """ register the ``--site-quality`` option (default 30) """
        parser.add_argument('--site-quality', type=int, default=30,
                help='Filter sites below this quality')

    def __init__(self, args):
        """ read the threshold from the parsed ``--site-quality`` option """
        self.threshold = args.site_quality

    def __call__(self, record):
        """ return the site quality if it is below the threshold, else None """
        qual = record.QUAL
        # A site without a quality value has nothing to compare against and
        # is never filtered; checking this explicitly avoids ordering None
        # against an integer.
        if qual is None:
            return None
        if qual < self.threshold:
            return qual
47 | 
48 | 
class VariantGenotypeQuality(Base):
    """ Filter sites where no variant call reaches ``--genotype-quality`` """

    description = 'Demand a minimum quality associated with a non reference call'
    name = 'mgq'

    @classmethod
    def customize_parser(cls, parser):
        """ register the ``--genotype-quality`` option (default 50) """
        parser.add_argument('--genotype-quality', type=int, default=50,
                help='Filter sites with no genotypes above this quality')

    def __init__(self, args):
        """ read the threshold from the parsed ``--genotype-quality`` option """
        self.threshold = args.genotype_quality

    def __call__(self, record):
        """ return the best variant genotype quality if it is below the
            threshold, else None

            Monomorphic sites are never filtered.  Sites without any variant
            call carrying a GQ value are not filtered either, because there
            is no quality to judge them by.
        """
        if record.is_monomorphic:
            return None

        # GQ of every non reference call, skipping calls without a value
        quals = [call['GQ'] for call in record if call.is_variant]
        quals = [gq for gq in quals if gq is not None]
        if not quals:
            # max() of an empty list would raise ValueError
            return None

        vgq = max(quals)
        if vgq < self.threshold:
            return vgq
67 | 
68 | 
69 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import ez_setup
 2 | ez_setup.use_setuptools()
 3 | 
 4 | import glob
 5 | import os
 6 | import sys
 7 | from setuptools import setup
 8 | from distutils.extension import Extension
 9 | 
10 | # optional cython
try:
    from Cython.Distutils import build_ext
except ImportError:
    from distutils.command import build_ext as _build_ext

    # Fallback used when Cython is not installed: compile the .c file that
    # sits next to each .pyx source instead of cythonizing the .pyx itself.
    class build_ext(_build_ext.build_ext):

        description = "change pyx files to corresponding .c/.cpp (fallback when cython is not installed)"

        def build_extensions(self):
            # First, sanity-check the 'extensions' list
            self.check_extensions_list(self.extensions)

            c_suffix = '.c'
            for extension in self.extensions:
                swapped = []
                for path in extension.sources:
                    stem, suffix = os.path.splitext(path)
                    # only .pyx sources are redirected; everything else is
                    # passed through untouched
                    swapped.append(stem + c_suffix if suffix == '.pyx' else path)

                extension.sources = swapped
                self.build_extension(extension)
36 |   
37 | 
# NOTE(review): this swaps the attribute dict of setuptools' Extension for the
# one of its ``_Extension`` base - presumably so that .pyx sources are not
# rewritten by setuptools when Cython/Pyrex detection fails; confirm it is
# still needed with current setuptools.
if 'setuptools.extension' in sys.modules:
    m = sys.modules['setuptools.extension']
    m.Extension.__dict__ = m._Extension.__dict__

# Read the version string out of cyvcf/version.py without importing the
# package (the extension module is not built yet at this point).
version_py = os.path.join(os.path.dirname(__file__), 'cyvcf', 'version.py')
with open(version_py) as version_file:
    # take everything after the last '=' and drop whitespace and quotes
    version = version_file.read().strip().split('=')[-1].strip().strip('"\'')

sources=["cyvcf/parser.pyx"]
exts = [ Extension("cyvcf.parser", sources=sources)]

setup(
        cmdclass= {'build_ext': build_ext},
        name="cyvcf",
        version=version,
        ext_modules=exts,
        test_suite='test.test_vcf.suite',
        packages=['cyvcf'],
        author="Aaron Quinlan, James Casbon, John Dougherty, Martin Vermaat, Brent Pedersen",
        description='A fast Python library for VCF files using Cython for speed.',
        url="none",
        package_dir = {"cyvcf": "cyvcf"},
        author_email="arq5x@virginia.edu",
        classifiers=[
            'Development Status :: 4 - Beta',
            'Intended Audience :: Science/Research',
            # NOTE(review): the LICENSE file shipped with the project contains
            # MIT-style permissive terms; confirm which license is intended
            # before the next release.
            'License :: OSI Approved :: GNU General Public License (GPL)',
            'Topic :: Scientific/Engineering :: Bio-Informatics']

    )


--------------------------------------------------------------------------------
/scripts/vcf_filter.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | import argparse
 4 | import pkg_resources
 5 | 
 6 | import vcf
 7 | from vcf.parser import _Filter
 8 | 
 9 | parser = argparse.ArgumentParser(description='Filter a VCF file',
10 |         formatter_class=argparse.RawDescriptionHelpFormatter,
11 |         )
12 | parser.add_argument('input', metavar='input', type=str, nargs=1,
13 |         help='File to process (use - for STDIN)')
14 | parser.add_argument('filters', metavar='filter', type=str, nargs='+',
15 |         help='Filters to use')
16 | parser.add_argument('--no-short-circuit', action='store_true',
17 |         help='Do not stop filter processing on a site if a single filter fails.')
18 | parser.add_argument('--output', action='store', default=sys.stdout,
19 |         help='Filename to output (default stdout)')
20 | parser.add_argument('--no-filtered', action='store_true',
21 |         help='Remove failed sites')
22 | 
23 | 
if __name__ == '__main__':
    # TODO: allow filter specification by short name
    # TODO: flag that writes filter output into INFO column
    # TODO: argument use implies filter use
    # TODO: parallelize
    # TODO: prevent plugins raising an exception from crashing the script


    # dynamically build the list of available filters
    filters = {}
    filter_help = '\n\navailable filters:'

    for p in pkg_resources.iter_entry_points('vcf.filters'):
        filt = p.load()
        filters[filt.name] = filt
        # let every filter register its own command line options
        filt.customize_parser(parser)
        filter_help += '\n  %s:\t%s' % (filt.name, filt.description)

    parser.description += filter_help

    # parse command line args
    args = parser.parse_args()

    # '-' selects STDIN, as promised by the help text of ``input``
    input_name = args.input[0]
    if input_name == '-':
        inp = vcf.Reader(sys.stdin)
    else:
        inp = vcf.Reader(open(input_name))

    # build filter chain
    chain = []
    for name in args.filters:
        f = filters[name](args)
        chain.append(f)
        # declare the filter in the header metadata of the output
        inp.filters[f.filter_name()] = _Filter(f.filter_name(), f.description)

    oup = vcf.Writer(args.output, inp)

    # apply filters
    short_circuit = not args.no_short_circuit

    for record in inp:
        for filt in chain:
            result = filt(record)
            # A filter signals failure by returning anything that is not
            # None; a falsy value such as a quality of 0 still counts.
            if result is not None:
                record.add_filter(filt.filter_name())
                if short_circuit:
                    break

        if (not args.no_filtered) or (record.FILTER == '.'):
            oup.write_record(record)
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------
/docs/HISTORY.rst:
--------------------------------------------------------------------------------
 1 | Development
 2 | ===========
 3 | 
 4 | Please use the repository at github: https://github.com/jamescasbon/PyVCF/
 5 | Pull requests gladly accepted. 
 6 | Issues should be reported at the github issue tracker.
 7 | 
 8 | Changes
 9 | =======
10 | 
11 | 0.4.3 Release
12 | -------------
13 | 
14 | * Single floats in Reader._sample_parser not being converted to float #35
15 | * Handle String INFO values when Number=1 in header #34
16 | 
17 | 0.4.2 Release
18 | -------------
19 | 
20 | * Installation problems
21 | 
22 | 0.4.1 Release
23 | -------------
24 | 
25 | * Installation problems
26 | 
27 | 0.4.0 Release
28 | -------------
29 | 
30 | * Package structure 
31 | * add ``vcf.utils`` module with ``walk_together`` method
32 | * samtools tests 
33 | * support Freebayes' non standard '.' for no call
34 | * fix vcf_melt  
35 | * support monomorphic sites, add ``is_monomorphic`` method, handle null QUALs
36 | * filter support for files with monomorphic calls 
37 | * Values declared as single are no longer returned in lists
38 | * several performance improvements 
39 | 
40 | 
41 | 0.3.0 Release
42 | -------------
43 | 
44 | * Fix setup.py for python < 2.7
45 | * Add ``__eq__`` to ``_Record`` and ``_Call``
46 | * Add ``is_het`` and ``is_variant`` to ``_Call``
47 | * Drop aggressive parse mode: we're always aggressive.
48 | * Add tabix fetch for single calls, fix one->zero based indexing
49 | * add prepend_chr mode for ``Reader`` to add `chr` to CHROM attributes
50 | 
51 | 0.2.2 Release
52 | -------------
53 | 
54 | Documentation release
55 | 
56 | 0.2.1 Release
57 | -------------
58 | 
59 | * Add shebang to vcf_filter.py
60 | 
61 | 0.2 Release 
62 | -----------
63 | 
64 | * Replace genotype dictionary with a ``Call`` object
65 | * Methods on ``Record`` and ``Call`` (thanks @arq5x)
66 | * Shortcut parse_sample when genotype is None
67 | 
68 | 0.1 Release 
69 | -----------
70 | 
71 | * Added test code
72 | * Added Writer class
73 | * Allow negative number in ``INFO`` and ``FORMAT`` fields (thanks @martijnvermaat)
74 | * Prefer ``vcf.Reader`` to ``vcf.VCFReader``
75 | * Support compressed files with guessing where filename is available on fsock
76 | * Allow opening by filename as well as filesocket
77 | * Support fetching rows for tabix-indexed files
78 | * Performance improvements (see ``test/prof.py``)
79 | * Added extensible filter script (see FILTERS.rst), vcf_filter.py
80 | 
81 | Contributions
82 | -------------
83 | 
84 | Project started by @jdoughertyii and taken over by @jamescasbon on 12th January 2011.
85 | Contributions from @arq5x, @brentp, @martijnvermaat, @ian1roberts.
86 | 
87 | 
88 | 


--------------------------------------------------------------------------------
/test/example-4.1-sv.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##fileDate=20100501
 3 | ##reference=1000GenomesPilot-NCBI36
 4 | ##assembly=ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/sv/breakpoint_assemblies.fasta
 5 | ##INFO=<ID=BKPTID,Number=.,Type=String,Description="ID of the assembled alternate allele in the assembly file">
 6 | ##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
 7 | ##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
 8 | ##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
 9 | ##INFO=<ID=HOMLEN,Number=.,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints">
10 | ##INFO=<ID=HOMSEQ,Number=.,Type=String,Description="Sequence of base pair identical micro-homology at event breakpoints">
11 | ##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
12 | ##INFO=<ID=MEINFO,Number=4,Type=String,Description="Mobile element info of the form NAME,START,END,POLARITY">
13 | ##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
14 | ##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
15 | ##ALT=<ID=DEL,Description="Deletion">
16 | ##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
17 | ##ALT=<ID=DEL:ME:L1,Description="Deletion of L1 element">
18 | ##ALT=<ID=DUP,Description="Duplication">
19 | ##ALT=<ID=DUP:TANDEM,Description="Tandem Duplication">
20 | ##ALT=<ID=INS,Description="Insertion of novel sequence">
21 | ##ALT=<ID=INS:ME:ALU,Description="Insertion of ALU element">
22 | ##ALT=<ID=INS:ME:L1,Description="Insertion of L1 element">
23 | ##ALT=<ID=INV,Description="Inversion">
24 | ##ALT=<ID=CNV,Description="Copy number variable region">
25 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
26 | ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype quality">
27 | ##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">
28 | ##FORMAT=<ID=CNQ,Number=1,Type=Float,Description="Copy number genotype quality for imprecise events">
29 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001
30 | 1	2827693	.	CCGTGGATGCGGGGACCCGCATCCCCTCTCCCTTCACAGCTGAGTGACCCACATCCCCTCTCCCCTCGCA	C	.	PASS	SVTYPE=DEL;END=2827680;BKPTID=Pindel_LCS_D1099159;HOMLEN=1;HOMSEQ=C;SVLEN=-66	GT:GQ	1/1:13.9
31 | 2	321682	.	T	<DEL>	6	PASS	IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62	GT:GQ	0/1:12
32 | 2	14477084	.	C	<DEL:ME:ALU>	12	PASS	IMPRECISE;SVTYPE=DEL;END=14477381;SVLEN=-297;MEINFO=AluYa5,5,307,+;CIPOS=-22,18;CIEND=-12,32	GT:GQ	0/1:12
33 | 3	9425916	.	C	<INS:ME:L1>	23	PASS	IMPRECISE;SVTYPE=INS;END=9425916;SVLEN=6027;CIPOS=-16,22;MIINFO=L1HS,1,6025,-	GT:GQ	1/1:15
34 | 3	12665100	.	A	<DUP>	14	PASS	IMPRECISE;SVTYPE=DUP;END=12686200;SVLEN=21100;CIPOS=-500,500;CIEND=-500,500	GT:GQ:CN:CNQ	./.:0:3:16.2
35 | 4	18665128	.	T	<DUP:TANDEM>	11	PASS	IMPRECISE;SVTYPE=DUP;END=18665204;SVLEN=76;CIPOS=-10,10;CIEND=-10,10	GT:GQ:CN:CNQ	./.:0:5:8.3
36 | 


--------------------------------------------------------------------------------
/docs/FILTERS.rst:
--------------------------------------------------------------------------------
 1 | Filtering VCF files
 2 | ===================
 3 | 
 4 | The filter script: vcf_filter.py
 5 | --------------------------------
 6 | 
 7 | Filtering a VCF file based on some properties of interest is a common enough 
 8 | operation that PyVCF offers an extensible script.  ``vcf_filter.py`` does 
 9 | the work of reading input, updating the metadata and filtering the records.
10 | 
11 | 
12 | Adding a filter
13 | ---------------
14 | 
15 | You can reuse this work by providing a filter class, rather than writing your own filtering script.
16 | For example, let's say I want to filter each site based on the quality of the site.
17 | I can create a class like this::
18 |     
19 |     class SiteQuality(vcf.Filter):
20 | 
21 |         description = 'Filter sites by quality'
22 |         name = 'sq'
23 | 
24 |         @classmethod
25 |         def customize_parser(self, parser):
26 |             parser.add_argument('--site-quality', type=int, default=30,
27 |                     help='Filter sites below this quality')
28 | 
29 |         def __init__(self, args):
30 |             self.threshold = args.site_quality
31 | 
32 |         def __call__(self, record):
33 |             if record.QUAL < self.threshold:
34 |                 return record.QUAL
35 | 
36 | 
37 | This class subclasses ``vcf.Filter`` which provides the interface for VCF filters.
38 | The ``description`` and ``name`` are metadata about the parser.
39 | The ``customize_parser`` method allows you to add arguments to the script.
40 | We use the ``__init__`` method to grab the argument of interest from the parser.
41 | Finally, the ``__call__`` method processes each record and returns a value if the 
42 | filter failed.  The base class uses the ``name`` and ``threshold`` to create
43 | the filter ID in the VCF file.
44 | 
45 | To make vcf_filter.py aware of the filter, you need to declare a ``vcf.filters`` entry 
46 | point in your ``setup``::
47 | 
48 |     setup(
49 |         ...
50 |         entry_points = {
51 |             'vcf.filters': [
52 |                 'site_quality = module.path:SiteQuality',
53 |             ]
54 |         }
55 |     )
56 | 
57 | Now when you call vcf_filter.py, you should see your filter in the list of available filters::
58 | 
59 |     >$ vcf_filter.py --help
60 |     usage: vcf_filter.py [-h] [--no-short-circuit] [--output OUTPUT]
61 |                          [--site-quality SITE_QUALITY]
62 |                          [--genotype-quality GENOTYPE_QUALITY]
63 |                          input filter [filter ...]
64 | 
65 |     Filter a VCF file
66 | 
67 |     available filters:
68 |       sq:	Filter sites by quality
69 | 
70 |     positional arguments:
71 |       input                 File to process (use - for STDIN)
72 |       filter                Filters to use
73 | 
74 |     optional arguments:
75 |       -h, --help            show this help message and exit
76 |       --no-short-circuit    Do not stop filter processing on a site if a single
77 |                             filter fails.
78 |       --output OUTPUT       Filename to output (default stdout)
79 |       --site-quality SITE_QUALITY
80 |                             Filter sites below this quality
81 |       --genotype-quality GENOTYPE_QUALITY
82 |                             Filter sites with no genotypes above this quality
83 | 
84 | 
85 | The filter base class: vcf.Filter
86 | ---------------------------------
87 | 
88 | .. autoclass:: vcf.Filter
89 |    :members:
90 | 
91 | 


--------------------------------------------------------------------------------
/test/samtools.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##samtoolsVersion=0.1.16 (r963:234)
 3 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Raw read depth">
 4 | ##INFO=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
 5 | ##INFO=<ID=MQ,Number=1,Type=Integer,Description="Root-mean-square mapping quality of covering reads">
 6 | ##INFO=<ID=FQ,Number=1,Type=Float,Description="Phred probability of all samples being the same">
 7 | ##INFO=<ID=AF1,Number=1,Type=Float,Description="Max-likelihood estimate of the site allele frequency of the first ALT allele">
 8 | ##INFO=<ID=G3,Number=3,Type=Float,Description="ML estimate of genotype frequencies">
 9 | ##INFO=<ID=HWE,Number=1,Type=Float,Description="Chi^2 based HWE test P-value based on G3">
10 | ##INFO=<ID=CI95,Number=2,Type=Float,Description="Equal-tail Bayesian credible interval of the site allele frequency at the 95% level">
11 | ##INFO=<ID=PV4,Number=4,Type=Float,Description="P-values for strand bias, baseQ bias, mapQ bias and tail distance bias">
12 | ##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Indicates that the variant is an INDEL.">
13 | ##INFO=<ID=PC2,Number=2,Type=Integer,Description="Phred probability of the nonRef allele frequency in group1 samples being larger (,smaller) than in group2.">
14 | ##INFO=<ID=PCHI2,Number=1,Type=Float,Description="Posterior weighted chi^2 P-value for testing the association between group1 and group2 samples.">
15 | ##INFO=<ID=QCHI2,Number=1,Type=Integer,Description="Phred scaled PCHI2.">
16 | ##INFO=<ID=PR,Number=1,Type=Integer,Description="# permutations yielding a smaller PCHI2.">
17 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
18 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
19 | ##FORMAT=<ID=GL,Number=3,Type=Float,Description="Likelihoods for RR,RA,AA genotypes (R=ref,A=alt)">
20 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="# high-quality bases">
21 | ##FORMAT=<ID=SP,Number=1,Type=Integer,Description="Phred-scaled strand bias P-value">
22 | ##FORMAT=<ID=PL,Number=-1,Type=Integer,Description="List of Phred-scaled genotype likelihoods, number of values is (#ALT+1)*(#ALT+2)/2">
23 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	-
24 | chrX	2774478	.	A	G	61.3	.	DP=8;AF1=1;CI95=0.5,1;DP4=0,0,5,0;MQ=60;FQ=-42	GT:PL:GQ	1/1:94,15,0:27
25 | chrX	2832661	.	A	T	29	.	DP=25;AF1=0.5;CI95=0.5,0.5;DP4=14,1,6,1;MQ=56;FQ=32;PV4=1,0.00034,0.00019,1	GT:PL:GQ	0/1:59,0,144:62
26 | chrX	2832880	.	AAT	AATAT	109	.	INDEL;DP=20;AF1=0.5;CI95=0.5,0.5;DP4=0,7,0,8;MQ=47;FQ=112;PV4=1,1,0.0069,0.091	GT:PL:GQ	0/1:147,0,172:99
27 | chrX	2832920	.	TTAT	TTATAT	85.5	.	INDEL;DP=12;AF1=0.5;CI95=0.5,0.5;DP4=0,6,0,5;MQ=56;FQ=88.5;PV4=1,0.36,0.052,0.017	GT:PL:GQ	0/1:123,0,150:99
28 | chrX	2833534	.	TTACGCCCT	T	8.18	.	INDEL;DP=15;AF1=0.5;CI95=0.5,0.5;DP4=10,0,2,0;MQ=60;FQ=10.8;PV4=1,0.0041,1,0.3	GT:PL:GQ	0/1:45,0,255:47
29 | chrX	2833580	.	A	G	80	.	DP=20;AF1=0.5;CI95=0.5,0.5;DP4=10,2,7,1;MQ=58;FQ=83;PV4=1,1,0.06,1	GT:PL:GQ	0/1:110,0,141:99
30 | chr1	10363194	.	cca	cCAca	57.5	.	INDEL;DP=19;AF1=0.5;CI95=0.5,0.5;DP4=16,0,3,0;MQ=59;FQ=60.5;PV4=1,1,1,0.0008	GT:PL:GQ	0/1:95,0,214:98
31 | chr1	11292952	.	T	A,C	41	.	DP=17;AF1=1;CI95=1,1;DP4=0,0,17,0;MQ=57;FQ=-75	GT:PL:GQ	1/1:74,48,0,66,28,63:85
32 | chr1	38304491	.	t	tTTTTTTTTTTTTTTTTTTTTTT,tTTTTTTTTTTTTT,tTTTTTTT	16.3	.	INDEL;DP=9;AF1=1;CI95=0.5,1;DP4=0,0,0,4;MQ=41;FQ=-40.5	GT:PL:GQ	1/1:105,56,50,54,0,51,98,45,44,95:10
33 | chr1	152195728	.	ATTTTTTTTTTT	ATTTTTTTTTT,ATTTTTTTTT	36.5	.	INDEL;DP=39;AF1=1;CI95=1,1;DP4=1,1,12,19;MQ=59;FQ=-104;PV4=1,0.42,0.4,0.2	GT:PL:GQ	1/1:77,69,0,77,75,73:99
34 | chr1	152276149	.	C	T	134	.	DP=30;AF1=0.5;CI95=0.5,0.5;DP4=2,7,8,11;MQ=33;FQ=45;PV4=0.42,0.23,0.33,1	GT:PL:GQ	0/1:164,0,72:75
35 | 


--------------------------------------------------------------------------------
/test/test.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##FILTER=<ID=PASS,Description="All filters passed">
 3 | ##source=VarScan2
 4 | ##INFO=<ID=ADP,Number=1,Type=Integer,Description="Average per-sample depth of bases with Phred score >= 15">
 5 | ##INFO=<ID=WT,Number=1,Type=Integer,Description="Number of samples called reference (wild-type)">
 6 | ##INFO=<ID=HET,Number=1,Type=Integer,Description="Number of samples called heterozygous-variant">
 7 | ##INFO=<ID=HOM,Number=1,Type=Integer,Description="Number of samples called homozygous-variant">
 8 | ##INFO=<ID=NC,Number=1,Type=Integer,Description="Number of samples not called">
 9 | ##FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand">
10 | ##FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position">
11 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
12 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
13 | ##FORMAT=<ID=SDP,Number=1,Type=Integer,Description="Raw Read Depth as reported by SAMtools">
14 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Quality Read Depth of bases with Phred score >= 15">
15 | ##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)">
16 | ##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)">
17 | ##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency">
18 | ##FORMAT=<ID=PVAL,Number=1,Type=String,Description="P-value from Fisher's Exact Test">
19 | ##FORMAT=<ID=RBQ,Number=1,Type=Integer,Description="Average quality of reference-supporting bases (qual1)">
20 | ##FORMAT=<ID=ABQ,Number=1,Type=Integer,Description="Average quality of variant-supporting bases (qual2)">
21 | ##FORMAT=<ID=RDF,Number=1,Type=Integer,Description="Depth of reference-supporting bases on forward strand (reads1plus)">
22 | ##FORMAT=<ID=RDR,Number=1,Type=Integer,Description="Depth of reference-supporting bases on reverse strand (reads1minus)">
23 | ##FORMAT=<ID=ADF,Number=1,Type=Integer,Description="Depth of variant-supporting bases on forward strand (reads2plus)">
24 | ##FORMAT=<ID=ADR,Number=1,Type=Integer,Description="Depth of variant-supporting bases on reverse strand (reads2minus)">
25 | ##contig=<ID=chr1>
26 | ##contig=<ID=chr2>
27 | ##contig=<ID=chr3>
28 | ##contig=<ID=chr4>
29 | ##contig=<ID=chr5>
30 | ##contig=<ID=chr6>
31 | ##contig=<ID=chr7>
32 | ##contig=<ID=chr8>
33 | ##contig=<ID=chr9>
34 | ##contig=<ID=chr10>
35 | ##contig=<ID=chr11>
36 | ##contig=<ID=chr12>
37 | ##contig=<ID=chr13>
38 | ##contig=<ID=chr14>
39 | ##contig=<ID=chr15>
40 | ##contig=<ID=chr16>
41 | ##contig=<ID=chr17>
42 | ##contig=<ID=chr18>
43 | ##contig=<ID=chr19>
44 | ##contig=<ID=chr20>
45 | ##contig=<ID=chr21>
46 | ##contig=<ID=chr22>
47 | ##contig=<ID=chrX>
48 | ##contig=<ID=chrY>
49 | ##INFO=<ID=OLD_MULTIALLELIC,Number=1,Type=String,Description="Original chr:pos:ref:alt encoding">
50 | ##INFO=<ID=OLD_VARIANT,Number=1,Type=String,Description="Original chr:pos:ref:alt encoding">
51 | ##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE||HGVSp|HGVSc|CANONICAL">
52 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Sample_01-14G01394	Sample_02-14G04079	Sample_03-14G19847	Sample_04-15G00251	Sample_05-14G24871	Sample_06-04G03785	Sample_07-05G00575	Sample_08-04G05168	Sample_09-12G05170	Sample_10-14G19446-1	Sample_11-15G00148	Sample_12-11G13366
53 | chr1	949608	.	G	A	.	PASS	ADP=231;WT=5;HET=6;HOM=1;NC=0;CSQ=missense_variant|aGc/aAc|S/N|ENSG00000187608|ISG15|ENST00000379389|2/2|benign(0.009)|tolerated(0.25)|83/165|protein_coding||ENSP00000368699.4:p.Ser83Asn|ENST00000379389.4:c.248G>A|YES,upstream_gene_variant|||ENSG00000224969|RP11-54O7.11|ENST00000458555|||||antisense||||YES	GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR	0/0:395:240:240:235:3:1.25%:1.2421E-1:39:15:125:110:3:0	0/1:252:178:178:106:71:39.89%:6.0237E-26:39:38:56:50:32:39	0/1:255:193:193:88:105:54.4%:5.7979E-41:40:38:51:37:53:52	0/1:255:219:219:107:112:51.14%:7.0426E-43:41:39:62:45:60:52	0/1:255:239:239:128:110:46.03%:4.7111E-41:38:37:69:59:55:55	0/0:484:277:277:275:2:0.72%:2.4955E-1:38:15:135:140:2:0	0/1:255:227:227:126:101:44.49%:2.0201E-37:39:38:69:57:50:51	1/1:255:262:262:2:259:98.85%:7.1891E-152:40:38:2:0:138:121	0/1:255:246:246:137:108:43.9%:6.9049E-40:39:39:69:68:48:60	0/0:456:243:243:243:0:0%:1E0:38:0:127:116:0:0	0/0:348:195:195:194:1:0.51%:5E-1:37:15:93:101:1:0	0/0:458:264:264:261:2:0.76%:2.4952E-1:40:38:139:122:0:2
54 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # Internal variables.
 11 | PAPEROPT_a4     = -D latex_paper_size=a4
 12 | PAPEROPT_letter = -D latex_paper_size=letter
 13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 14 | 
 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest
 16 | 
 17 | help:
 18 | 	@echo "Please use \`make <target>' where <target> is one of"
 19 | 	@echo "  html       to make standalone HTML files"
 20 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 21 | 	@echo "  singlehtml to make a single large HTML file"
 22 | 	@echo "  pickle     to make pickle files"
 23 | 	@echo "  json       to make JSON files"
 24 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 25 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 26 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 27 | 	@echo "  epub       to make an epub"
 28 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 29 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 30 | 	@echo "  text       to make text files"
 31 | 	@echo "  man        to make manual pages"
 32 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 33 | 	@echo "  linkcheck  to check all external links for integrity"
 34 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 35 | 
 36 | clean:
 37 | 	-rm -rf $(BUILDDIR)/*
 38 | 
 39 | html:
 40 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 41 | 	@echo
 42 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 43 | 
 44 | dirhtml:
 45 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 46 | 	@echo
 47 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 48 | 
 49 | singlehtml:
 50 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 51 | 	@echo
 52 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 53 | 
 54 | pickle:
 55 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 56 | 	@echo
 57 | 	@echo "Build finished; now you can process the pickle files."
 58 | 
 59 | json:
 60 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 61 | 	@echo
 62 | 	@echo "Build finished; now you can process the JSON files."
 63 | 
 64 | htmlhelp:
 65 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 66 | 	@echo
 67 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 68 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 69 | 
 70 | qthelp:
 71 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 72 | 	@echo
 73 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 74 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 75 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PyVCF.qhcp"
 76 | 	@echo "To view the help file:"
 77 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PyVCF.qhc"
 78 | 
 79 | devhelp:
 80 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 81 | 	@echo
 82 | 	@echo "Build finished."
 83 | 	@echo "To view the help file:"
 84 | 	@echo "# mkdir -p $$HOME/.local/share/devhelp/PyVCF"
 85 | 	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PyVCF"
 86 | 	@echo "# devhelp"
 87 | 
 88 | epub:
 89 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
 90 | 	@echo
 91 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
 92 | 
 93 | latex:
 94 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
 95 | 	@echo
 96 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
 97 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
 98 | 	      "(use \`make latexpdf' here to do that automatically)."
 99 | 
100 | latexpdf:
101 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
102 | 	@echo "Running LaTeX files through pdflatex..."
103 | 	make -C $(BUILDDIR)/latex all-pdf
104 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
105 | 
106 | text:
107 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
108 | 	@echo
109 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
110 | 
111 | man:
112 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
113 | 	@echo
114 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
115 | 
116 | changes:
117 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
118 | 	@echo
119 | 	@echo "The overview file is in $(BUILDDIR)/changes."
120 | 
121 | linkcheck:
122 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
123 | 	@echo
124 | 	@echo "Link check complete; look for any errors in the above output " \
125 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
126 | 
127 | doctest:
128 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
129 | 	@echo "Testing of doctests in the sources finished, look at the " \
130 | 	      "results in $(BUILDDIR)/doctest/output.txt."
131 | 


--------------------------------------------------------------------------------
/cyvcf/__init__.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | '''A VCFv4.0 parser for Python.
  3 | 
  4 | Online version of PyVCF documentation is available at http://pyvcf.rtfd.org/
  5 | 
  6 | The intent of this module is to mimic the ``csv`` module in the Python stdlib,
  7 | as opposed to more flexible serialization formats like JSON or YAML.  ``vcf``
  8 | will attempt to parse the content of each record based on the data types
  9 | specified in the meta-information lines --  specifically the ##INFO and
 10 | ##FORMAT lines.  If these lines are missing or incomplete, it will check
 11 | against the reserved types mentioned in the spec.  Failing that, it will just
 12 | return strings.
 13 | 
 14 | The main interface is the class: ``Reader``.  It takes a file-like
 15 | object and acts as a reader::
 16 | 
 17 |     >>> import vcf
 18 |     >>> vcf_reader = vcf.Reader(open('test/example-4.0.vcf', 'rb'))
 19 |     >>> for record in vcf_reader:
 20 |     ...     print record
 21 |     Record(CHROM=20, POS=14370, REF=G, ALT=['A'])
 22 |     Record(CHROM=20, POS=17330, REF=T, ALT=['A'])
 23 |     Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
 24 |     Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
 25 |     Record(CHROM=20, POS=1234567, REF=GTCT, ALT=['G', 'GTACT'])
 26 | 
 27 | 
 28 | This produces a great deal of information, but it is conveniently accessed.
 29 | The attributes of a Record are the 8 fixed fields from the VCF spec::
 30 | 
 31 |     * ``Record.CHROM``
 32 |     * ``Record.POS``
 33 |     * ``Record.ID``
 34 |     * ``Record.REF``
 35 |     * ``Record.ALT``
 36 |     * ``Record.QUAL``
 37 |     * ``Record.FILTER``
 38 |     * ``Record.INFO``
 39 | 
 40 | plus attributes to handle genotype information:
 41 | 
 42 |     * ``Record.FORMAT``
 43 |     * ``Record.samples``
 44 |     * ``Record.genotype``
 45 | 
 46 | ``samples`` and ``genotype``, not being the title of any column, are left lowercase.  The format
 47 | of the fixed fields is from the spec.  Comma-separated lists in the VCF are
 48 | converted to lists.  In particular, one-entry VCF lists are converted to
 49 | one-entry Python lists (see, e.g., ``Record.ALT``).  Semicolon-delimited lists
 50 | of key=value pairs are converted to Python dictionaries, with flags being given
 51 | a ``True`` value. Integers and floats are handled exactly as you'd expect::
 52 | 
 53 |     >>> vcf_reader = vcf.Reader(open('test/example-4.0.vcf', 'rb'))
 54 |     >>> record = vcf_reader.next()
 55 |     >>> print record.POS
 56 |     14370
 57 |     >>> print record.ALT
 58 |     ['A']
 59 |     >>> print record.INFO['AF']
 60 |     [0.5]
 61 | 
 62 | There are a number of convenience methods and properties for each ``Record`` allowing you to
 63 | examine properties of interest::
 64 | 
 65 |     >>> print record.num_called, record.call_rate, record.num_unknown
 66 |     3 1.0 0
 67 |     >>> print record.num_hom_ref, record.num_het, record.num_hom_alt
 68 |     1 1 1
 69 |     >>> print record.nucl_diversity, record.aaf
 70 |     0.6 0.5
 71 |     >>> print record.get_hets()
 72 |     [Call(sample=NA00002, GT=1|0, GQ=48)]
 73 |     >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion
 74 |     True False True False
 75 |     >>> print record.var_type, record.var_subtype
 76 |     snp ts
 77 |     >>> print record.is_monomorphic
 78 |     False
 79 | 
 80 | ``record.FORMAT`` will be a string specifying the format of the genotype
 81 | fields.  In case the FORMAT column does not exist, ``record.FORMAT`` is
 82 | ``None``.  Finally, ``record.samples`` is a list of dictionaries containing the
 83 | parsed sample column and ``record.genotype`` is a way of looking up genotypes
 84 | by sample name::
 85 | 
 86 |     >>> record = vcf_reader.next()
 87 |     >>> for sample in record.samples:
 88 |     ...     print sample['GT']
 89 |     0|0
 90 |     0|1
 91 |     0/0
 92 |     >>> print record.genotype('NA00001')['GT']
 93 |     0|0
 94 | 
 95 | The genotypes are represented by ``Call`` objects, which have three attributes: the
 96 | corresponding Record ``site``, the sample name in ``sample`` and a dictionary of
 97 | call data in ``data``::
 98 | 
 99 |      >>> call = record.genotype('NA00001')
100 |      >>> print call.site
101 |      Record(CHROM=20, POS=17330, REF=T, ALT=['A'])
102 |      >>> print call.sample
103 |      NA00001
104 |      >>> print call.data
105 |      {'GT': '0|0', 'HQ': [58, 50], 'DP': 3, 'GQ': 49}
106 | 
107 | Please note that as of release 0.4.0, attributes known to have single values (such as
108 | ``DP`` and ``GQ`` above) are returned as values.  Other attributes are returned
109 | as lists (such as ``HQ`` above).
110 | 
111 | There are also a number of methods::
112 | 
113 |     >>> print call.called, call.gt_type, call.gt_bases, call.phased
114 |     True 0 T|T True
115 | 
116 | 
117 | Metadata regarding the VCF file itself can be investigated through the
118 | following attributes:
119 | 
120 |     * ``Reader.metadata``
121 |     * ``Reader.infos``
122 |     * ``Reader.filters``
123 |     * ``Reader.formats``
124 |     * ``Reader.samples``
125 | 
126 | For example::
127 | 
128 |     >>> vcf_reader.metadata['fileDate']
129 |     '20090805'
130 |     >>> vcf_reader.samples
131 |     ['NA00001', 'NA00002', 'NA00003']
132 |     >>> vcf_reader.filters
133 |     {'q10': Filter(id='q10', desc='Quality below 10'), 's50': Filter(id='s50', desc='Less than 50% of samples have data')}
134 |     >>> vcf_reader.infos['AA'].desc
135 |     'Ancestral Allele'
136 | 
137 | Random access is supported for files with tabix indexes.  Simply call fetch for the
138 | region you are interested in::
139 | 
140 |     >>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
141 |     >>> for record in vcf_reader.fetch('20', 1110696, 1230237):
142 |     ...     print record
143 |     Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
144 |     Record(CHROM=20, POS=1230237, REF=T, ALT=[None])
145 | 
146 | Or extract a single row::
147 | 
148 |     >>> print vcf_reader.fetch('20', 1110696)
149 |     Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T'])
150 | 
151 | 
152 | The ``Writer`` class provides a way of writing a VCF file.  Currently, you must specify a
153 | template ``Reader`` which provides the metadata::
154 | 
155 |     >>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz')
156 |     >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'), vcf_reader)
157 |     >>> for record in vcf_reader:
158 |     ...     vcf_writer.write_record(record)
159 | 
160 | 
161 | An extensible script is available to filter vcf files in vcf_filter.py.  VCF filters
162 | declared by other packages will be available for use in this script.  Please
163 | see :doc:`FILTERS` for full description.
164 | 
165 | '''
166 | from .parser import VCFReader, Reader, VCFWriter, Writer
167 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # PyVCF documentation build configuration file, created by
  4 | # sphinx-quickstart on Wed Jan 25 12:29:23 2012.
  5 | #
  6 | # This file is execfile()d with the current directory set to its containing dir.
  7 | #
  8 | # Note that not all possible configuration values are present in this
  9 | # autogenerated file.
 10 | #
 11 | # All configuration values have a default; values that are commented out
 12 | # serve to show the default.
 13 | 
 14 | import sys, os
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | sys.path.insert(0, os.path.abspath('..'))
 20 | 
 21 | # -- General configuration -----------------------------------------------------
 22 | 
 23 | # If your documentation needs a minimal Sphinx version, state it here.
 24 | #needs_sphinx = '1.0'
 25 | 
 26 | # Add any Sphinx extension module names here, as strings. They can be extensions
 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.viewcode']
 29 | 
 30 | # Add any paths that contain templates here, relative to this directory.
 31 | templates_path = ['.templates']
 32 | 
 33 | # The suffix of source filenames.
 34 | source_suffix = '.rst'
 35 | 
 36 | # The encoding of source files.
 37 | #source_encoding = 'utf-8-sig'
 38 | 
 39 | # The master toctree document.
 40 | master_doc = 'index'
 41 | 
 42 | # General information about the project.
 43 | project = u'PyVCF'
 44 | copyright = u'2012, James Casbon, @jdoughertyii'
 45 | 
 46 | # The version info for the project you're documenting, acts as replacement for
 47 | # |version| and |release|, also used in various other places throughout the
 48 | # built documents.
 49 | #
 50 | # The short X.Y version.
 51 | import vcf
 52 | version = vcf.VERSION
 53 | # The full version, including alpha/beta/rc tags.
 54 | release = vcf.VERSION
 55 | 
 56 | # The language for content autogenerated by Sphinx. Refer to documentation
 57 | # for a list of supported languages.
 58 | #language = None
 59 | 
 60 | # There are two options for replacing |today|: either, you set today to some
 61 | # non-false value, then it is used:
 62 | #today = ''
 63 | # Else, today_fmt is used as the format for a strftime call.
 64 | #today_fmt = '%B %d, %Y'
 65 | 
 66 | # List of patterns, relative to source directory, that match files and
 67 | # directories to ignore when looking for source files.
 68 | exclude_patterns = ['.build']
 69 | 
 70 | # The reST default role (used for this markup: `text`) to use for all documents.
 71 | #default_role = None
 72 | 
 73 | # If true, '()' will be appended to :func: etc. cross-reference text.
 74 | #add_function_parentheses = True
 75 | 
 76 | # If true, the current module name will be prepended to all description
 77 | # unit titles (such as .. function::).
 78 | #add_module_names = True
 79 | 
 80 | # If true, sectionauthor and moduleauthor directives will be shown in the
 81 | # output. They are ignored by default.
 82 | #show_authors = False
 83 | 
 84 | # The name of the Pygments (syntax highlighting) style to use.
 85 | pygments_style = 'sphinx'
 86 | 
 87 | # A list of ignored prefixes for module index sorting.
 88 | #modindex_common_prefix = []
 89 | 
 90 | 
 91 | # -- Options for HTML output ---------------------------------------------------
 92 | 
 93 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 94 | # a list of builtin themes.
 95 | html_theme = 'default'
 96 | 
 97 | # Theme options are theme-specific and customize the look and feel of a theme
 98 | # further.  For a list of options available for each theme, see the
 99 | # documentation.
100 | #html_theme_options = {}
101 | 
102 | # Add any paths that contain custom themes here, relative to this directory.
103 | #html_theme_path = []
104 | 
105 | # The name for this set of Sphinx documents.  If None, it defaults to
106 | # "<project> v<release> documentation".
107 | #html_title = None
108 | 
109 | # A shorter title for the navigation bar.  Default is the same as html_title.
110 | #html_short_title = None
111 | 
112 | # The name of an image file (relative to this directory) to place at the top
113 | # of the sidebar.
114 | #html_logo = None
115 | 
116 | # The name of an image file (within the static path) to use as favicon of the
117 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
118 | # pixels large.
119 | #html_favicon = None
120 | 
121 | # Add any paths that contain custom static files (such as style sheets) here,
122 | # relative to this directory. They are copied after the builtin static files,
123 | # so a file named "default.css" will overwrite the builtin "default.css".
124 | html_static_path = ['.static']
125 | 
126 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
127 | # using the given strftime format.
128 | #html_last_updated_fmt = '%b %d, %Y'
129 | 
130 | # If true, SmartyPants will be used to convert quotes and dashes to
131 | # typographically correct entities.
132 | #html_use_smartypants = True
133 | 
134 | # Custom sidebar templates, maps document names to template names.
135 | #html_sidebars = {}
136 | 
137 | # Additional templates that should be rendered to pages, maps page names to
138 | # template names.
139 | #html_additional_pages = {}
140 | 
141 | # If false, no module index is generated.
142 | #html_domain_indices = True
143 | 
144 | # If false, no index is generated.
145 | #html_use_index = True
146 | 
147 | # If true, the index is split into individual pages for each letter.
148 | #html_split_index = False
149 | 
150 | # If true, links to the reST sources are added to the pages.
151 | #html_show_sourcelink = True
152 | 
153 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
154 | #html_show_sphinx = True
155 | 
156 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
157 | #html_show_copyright = True
158 | 
159 | # If true, an OpenSearch description file will be output, and all pages will
160 | # contain a <link> tag referring to it.  The value of this option must be the
161 | # base URL from which the finished HTML is served.
162 | #html_use_opensearch = ''
163 | 
164 | # This is the file name suffix for HTML files (e.g. ".xhtml").
165 | #html_file_suffix = None
166 | 
167 | # Output file base name for HTML help builder.
168 | htmlhelp_basename = 'PyVCFdoc'
169 | 
170 | 
171 | # -- Options for LaTeX output --------------------------------------------------
172 | 
173 | # The paper size ('letter' or 'a4').
174 | #latex_paper_size = 'letter'
175 | 
176 | # The font size ('10pt', '11pt' or '12pt').
177 | #latex_font_size = '10pt'
178 | 
179 | # Grouping the document tree into LaTeX files. List of tuples
180 | # (source start file, target name, title, author, documentclass [howto/manual]).
181 | latex_documents = [
182 |   ('index', 'PyVCF.tex', u'PyVCF Documentation',
183 |    u'James Casbon, @jdoughertyii', 'manual'),
184 | ]
185 | 
186 | # The name of an image file (relative to this directory) to place at the top of
187 | # the title page.
188 | #latex_logo = None
189 | 
190 | # For "manual" documents, if this is true, then toplevel headings are parts,
191 | # not chapters.
192 | #latex_use_parts = False
193 | 
194 | # If true, show page references after internal links.
195 | #latex_show_pagerefs = False
196 | 
197 | # If true, show URL addresses after external links.
198 | #latex_show_urls = False
199 | 
200 | # Additional stuff for the LaTeX preamble.
201 | #latex_preamble = ''
202 | 
203 | # Documents to append as an appendix to all manuals.
204 | #latex_appendices = []
205 | 
206 | # If false, no module index is generated.
207 | #latex_domain_indices = True
208 | 
209 | 
210 | # -- Options for manual page output --------------------------------------------
211 | 
212 | # One entry per manual page. List of tuples
213 | # (source start file, name, description, authors, manual section).
214 | man_pages = [
215 |     ('index', 'pyvcf', u'PyVCF Documentation',
216 |      [u'James Casbon, @jdoughertyii'], 1)
217 | ]
218 | 


--------------------------------------------------------------------------------
/test/dbsnp.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.0
 2 | ##fileDate=20120118
 3 | ##source=dbSNP
 4 | ##dbSNP_BUILD_ID=135
 5 | ##reference=GRCh37.p5
 6 | ##phasing=partial
 7 | ##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf	
 8 | ##INFO=<ID=RSPOS,Number=1,Type=Integer,Description="Chr position reported in dbSNP">
 9 | ##INFO=<ID=RV,Number=0,Type=Flag,Description="RS orientation is reversed">
10 | ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
11 | ##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
12 | ##INFO=<ID=VP,Number=1,Type=String,Description="Variation Property">
13 | ##INFO=<ID=GENEINFO,Number=1,Type=String,Description="Pairs each of gene symbol:gene id.  The gene symbol and id are delimited by a colon (:) and each pair is delimited by a vertical bar (|)">
14 | ##INFO=<ID=dbSNPBuildID,Number=1,Type=Integer,Description="First dbSNP Build for RS">
15 | ##INFO=<ID=SAO,Number=1,Type=Integer,Description="Variant Allele Origin: 0 - unspecified, 1 - Germline, 2 - Somatic, 3 - Both">
16 | ##INFO=<ID=SSR,Number=1,Type=Integer,Description="Variant Suspect Reason Code, 0 - unspecified, 1 - Paralog, 2 - byEST, 3 - Para_EST, 4 - oldAlign, 5 - other">
17 | ##INFO=<ID=SCS,Number=1,Type=Integer,Description="Variant Clinical Significance, 0 - unknown, 1 - untested, 2 - non-pathogenic, 3 - probable-non-pathogenic, 4 - probable-pathogenic, 5 - pathogenic, 6 - drug-response, 7 - histocompatibility, 255 - other">
18 | ##INFO=<ID=GMAF,Number=1,Type=Float,Description="Global Minor Allele Frequency [0, 0.5]; global population is 1000GenomesProject phase 1 genotype data from 629 individuals, released in the 11-23-2010 dataset">
19 | ##INFO=<ID=WGT,Number=1,Type=Integer,Description="Weight, 00 - unmapped, 1 - weight 1, 2 - weight 2, 3 - weight 3 or more">
20 | ##INFO=<ID=VC,Number=1,Type=String,Description="Variation Class">
21 | ##INFO=<ID=CLN,Number=0,Type=Flag,Description="Variant is Clinical(LSDB,OMIM,TPA,Diagnostic)">
22 | ##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
23 | ##INFO=<ID=TPA,Number=0,Type=Flag,Description="Provisional Third Party Annotation(TPA) (currently rs from PHARMGKB who will give phenotype data)">
24 | ##INFO=<ID=PMC,Number=0,Type=Flag,Description="Links exist to PubMed Central article">
25 | ##INFO=<ID=S3D,Number=0,Type=Flag,Description="Has 3D structure - SNP3D table">
26 | ##INFO=<ID=SLO,Number=0,Type=Flag,Description="Has SubmitterLinkOut - From SNP->SubSNP->Batch.link_out">
27 | ##INFO=<ID=NSF,Number=0,Type=Flag,Description="Has non-synonymous frameshift A coding region variation where one allele in the set changes all downstream amino acids. FxnClass = 44">
28 | ##INFO=<ID=NSM,Number=0,Type=Flag,Description="Has non-synonymous missense A coding region variation where one allele in the set changes protein peptide. FxnClass = 42">
29 | ##INFO=<ID=NSN,Number=0,Type=Flag,Description="Has non-synonymous nonsense A coding region variation where one allele in the set changes to STOP codon (TER). FxnClass = 41">
30 | ##INFO=<ID=REF,Number=0,Type=Flag,Description="Has reference A coding region variation where one allele in the set is identical to the reference sequence. FxnCode = 8">
31 | ##INFO=<ID=SYN,Number=0,Type=Flag,Description="Has synonymous A coding region variation where one allele in the set does not change the encoded amino acid. FxnCode = 3">
32 | ##INFO=<ID=U3,Number=0,Type=Flag,Description="In 3' UTR Location is in an untranslated region (UTR). FxnCode = 53">
33 | ##INFO=<ID=U5,Number=0,Type=Flag,Description="In 5' UTR Location is in an untranslated region (UTR). FxnCode = 55">
34 | ##INFO=<ID=ASS,Number=0,Type=Flag,Description="In acceptor splice site FxnCode = 73">
35 | ##INFO=<ID=DSS,Number=0,Type=Flag,Description="In donor splice-site FxnCode = 75">
36 | ##INFO=<ID=INT,Number=0,Type=Flag,Description="In Intron FxnCode = 6">
37 | ##INFO=<ID=R3,Number=0,Type=Flag,Description="In 3' gene region FxnCode = 13">
38 | ##INFO=<ID=R5,Number=0,Type=Flag,Description="In 5' gene region FxnCode = 15">
39 | ##INFO=<ID=OTH,Number=0,Type=Flag,Description="Has other variant with exactly the same set of mapped positions on NCBI refernce assembly.">
40 | ##INFO=<ID=CFL,Number=0,Type=Flag,Description="Has Assembly conflict. This is for weight 1 and 2 variant that maps to different chromosomes on different assemblies.">
41 | ##INFO=<ID=ASP,Number=0,Type=Flag,Description="Is Assembly specific. This is set if the variant only maps to one assembly">
42 | ##INFO=<ID=MUT,Number=0,Type=Flag,Description="Is mutation (journal citation, explicit fact): a low frequency variation that is cited in journal and other reputable sources">
43 | ##INFO=<ID=VLD,Number=0,Type=Flag,Description="Is Validated.  This bit is set if the variant has 2+ minor allele count based on frequency or genotype data.">
44 | ##INFO=<ID=G5A,Number=0,Type=Flag,Description=">5% minor allele frequency in each and all populations">
45 | ##INFO=<ID=G5,Number=0,Type=Flag,Description=">5% minor allele frequency in 1+ populations">
46 | ##INFO=<ID=HD,Number=0,Type=Flag,Description="Marker is on high density genotyping kit (50K density or greater).  The variant may have phenotype associations present in dbGaP.">
47 | ##INFO=<ID=GNO,Number=0,Type=Flag,Description="Genotypes available. The variant has individual genotype (in SubInd table).">
48 | ##INFO=<ID=KGValidated,Number=0,Type=Flag,Description="1000 Genome validated">
49 | ##INFO=<ID=KGPhase1,Number=0,Type=Flag,Description="1000 Genome phase 1 (incl. June Interim phase 1)">
50 | ##INFO=<ID=KGPilot123,Number=0,Type=Flag,Description="1000 Genome discovery all pilots 2010(1,2,3)">
51 | ##INFO=<ID=KGPROD,Number=0,Type=Flag,Description="Has 1000 Genome submission">
52 | ##INFO=<ID=OTHERKG,Number=0,Type=Flag,Description="non-1000 Genome submission">
53 | ##INFO=<ID=PH3,Number=0,Type=Flag,Description="HAP_MAP Phase 3 genotyped: filtered, non-redundant">
54 | ##INFO=<ID=CDA,Number=0,Type=Flag,Description="Variation is interrogated in a clinical diagnostic assay">
55 | ##INFO=<ID=LSD,Number=0,Type=Flag,Description="Submitted from a locus-specific database">
56 | ##INFO=<ID=MTP,Number=0,Type=Flag,Description="Microattribution/third-party annotation(TPA:GWAS,PAGE)">
57 | ##INFO=<ID=OM,Number=0,Type=Flag,Description="Has OMIM/OMIA">
58 | ##INFO=<ID=NOC,Number=0,Type=Flag,Description="Contig allele not present in variant allele list. The reference sequence allele at the mapped position is not present in the variant allele list, adjusted for orientation.">
59 | ##INFO=<ID=WTD,Number=0,Type=Flag,Description="Is Withdrawn by submitter If one member ss is withdrawn by submitter, then this bit is set.  If all member ss' are withdrawn, then the rs is deleted to SNPHistory">
60 | ##INFO=<ID=NOV,Number=0,Type=Flag,Description="Rs cluster has non-overlapping allele sets. True when rs set has more than 2 alleles from different submissions and these sets share no alleles in common.">
61 | ##INFO=<ID=GCF,Number=0,Type=Flag,Description="Has Genotype Conflict Same (rs, ind), different genotype.  N/N is not included.">
62 | ##FILTER=<ID=NC,Description="Inconsistent Genotype Submission For At Least One Sample">
63 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
64 | 1	10144	rs144773400	TA	T	.	.	RSPOS=10145;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000000005000002000200;WGT=1;VC=DIV;ASP;OTHERKG
65 | 1	10228	rs143255646	TA	T	.	.	RSPOS=10229;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG
66 | 1	10234	rs145599635	C	T	.	.	RSPOS=10234;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG
67 | 1	10248	rs148908337	A	T	.	.	RSPOS=10248;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG
68 | 1	10254	rs140194106	TA	T	.	.	RSPOS=10255;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG
69 | 1	10291	rs145427775	C	T	.	.	RSPOS=10291;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG
70 | 1	10327	rs112750067	T	C	.	.	RSPOS=10327;dbSNPBuildID=132;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG
71 | 1	10329	rs150969722	AC	A	.	.	RSPOS=10330;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG
72 | 1	10351	rs145072688	CTA	C,CA	.	.	RSPOS=10352;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000210;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG;NOC
73 | 1	10382	rs147093981	AAC	A,AC	.	.	RSPOS=10383;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000210;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG;NOC
74 | 1	10433	rs56289060	A	AC	.	.	RSPOS=10433;dbSNPBuildID=129;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG
75 | 1	10439	rs112766696	AC	A	.	.	RSPOS=10440;dbSNPBuildID=132;SSR=0;SAO=0;VP=050100020015000102000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;SLO;R5;OTH;ASP;GNO;OTHERKG
76 | 1	10439	rs138941843	AC	A	.	.	RSPOS=10440;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG
77 | 1	10440	rs112155239	C	A	.	.	RSPOS=10440;dbSNPBuildID=132;SSR=0;SAO=0;VP=050000020015000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;OTH;ASP;OTHERKG
78 | 1	10492	rs55998931	C	T	.	.	RSPOS=10492;GMAF=0.0617001828153565;dbSNPBuildID=129;SSR=0;SAO=0;VP=050000020005040002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;VLD;OTHERKG
79 | 1	10519	rs62636508	G	C	.	.	RSPOS=10519;dbSNPBuildID=129;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG
80 | 1	10583	rs58108140	G	A	.	.	RSPOS=10583;GMAF=0.270566727605119;dbSNPBuildID=129;SSR=0;SAO=0;VP=050000020005040016000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;VLD;KGPhase1;KGPROD;OTHERKG
81 | 


--------------------------------------------------------------------------------
/test/null_genotype_mono.vcf:
--------------------------------------------------------------------------------
  1 | ##fileformat=VCFv4.1
  2 | ##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
  3 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
  4 | ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">
  5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  6 | ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
  7 | ##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
  8 | ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
  9 | ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
 10 | ##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
 11 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
 12 | ##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
 13 | ##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
 14 | ##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
 15 | ##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">
 16 | ##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
 17 | ##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
 18 | ##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
 19 | ##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
 20 | ##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
 21 | ##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
 22 | ##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
 23 | ##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[np_control.bam] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=[tests/read_chr.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/data/reference/ucsc/hg19/ucsc.hg19.fasta rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=info log_to_file=null help=false genotype_likelihoods_model=SNP p_nonref_model=EXACT heterozygosity=0.001 pcr_error_rate=1.0E-4 genotyping_mode=DISCOVERY output_mode=EMIT_ALL_SITES standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 computeSLOD=false alleles=(RodBinding name= source=UNBOUND) min_base_quality_score=17 max_deletion_fraction=0.05 multiallelic=false max_alternate_alleles=5 min_indel_count_for_genotyping=5 indel_heterozygosity=1.25E-4 indelGapContinuationPenalty=10.0 indelGapOpenPenalty=45.0 indelHaplotypeSize=80 bandedIndel=false indelDebug=false ignoreSNPAlleles=false dbsnp=(RodBinding name= source=UNBOUND) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub debug_file=null metrics_file=null annotation=[] excludeAnnotation=[] filter_mismatching_base_and_quals=false"
 24 | ##contig=<ID=chr1,length=249250621,assembly=hg19>
 25 | ##contig=<ID=chr10,length=135534747,assembly=hg19>
 26 | ##contig=<ID=chr11,length=135006516,assembly=hg19>
 27 | ##contig=<ID=chr11_gl000202_random,length=40103,assembly=hg19>
 28 | ##contig=<ID=chr12,length=133851895,assembly=hg19>
 29 | ##contig=<ID=chr13,length=115169878,assembly=hg19>
 30 | ##contig=<ID=chr14,length=107349540,assembly=hg19>
 31 | ##contig=<ID=chr15,length=102531392,assembly=hg19>
 32 | ##contig=<ID=chr16,length=90354753,assembly=hg19>
 33 | ##contig=<ID=chr17,length=81195210,assembly=hg19>
 34 | ##contig=<ID=chr17_ctg5_hap1,length=1680828,assembly=hg19>
 35 | ##contig=<ID=chr17_gl000203_random,length=37498,assembly=hg19>
 36 | ##contig=<ID=chr17_gl000204_random,length=81310,assembly=hg19>
 37 | ##contig=<ID=chr17_gl000205_random,length=174588,assembly=hg19>
 38 | ##contig=<ID=chr17_gl000206_random,length=41001,assembly=hg19>
 39 | ##contig=<ID=chr18,length=78077248,assembly=hg19>
 40 | ##contig=<ID=chr18_gl000207_random,length=4262,assembly=hg19>
 41 | ##contig=<ID=chr19,length=59128983,assembly=hg19>
 42 | ##contig=<ID=chr19_gl000208_random,length=92689,assembly=hg19>
 43 | ##contig=<ID=chr19_gl000209_random,length=159169,assembly=hg19>
 44 | ##contig=<ID=chr1_gl000191_random,length=106433,assembly=hg19>
 45 | ##contig=<ID=chr1_gl000192_random,length=547496,assembly=hg19>
 46 | ##contig=<ID=chr2,length=243199373,assembly=hg19>
 47 | ##contig=<ID=chr20,length=63025520,assembly=hg19>
 48 | ##contig=<ID=chr21,length=48129895,assembly=hg19>
 49 | ##contig=<ID=chr21_gl000210_random,length=27682,assembly=hg19>
 50 | ##contig=<ID=chr22,length=51304566,assembly=hg19>
 51 | ##contig=<ID=chr3,length=198022430,assembly=hg19>
 52 | ##contig=<ID=chr4,length=191154276,assembly=hg19>
 53 | ##contig=<ID=chr4_ctg9_hap1,length=590426,assembly=hg19>
 54 | ##contig=<ID=chr4_gl000193_random,length=189789,assembly=hg19>
 55 | ##contig=<ID=chr4_gl000194_random,length=191469,assembly=hg19>
 56 | ##contig=<ID=chr5,length=180915260,assembly=hg19>
 57 | ##contig=<ID=chr6,length=171115067,assembly=hg19>
 58 | ##contig=<ID=chr6_apd_hap1,length=4622290,assembly=hg19>
 59 | ##contig=<ID=chr6_cox_hap2,length=4795371,assembly=hg19>
 60 | ##contig=<ID=chr6_dbb_hap3,length=4610396,assembly=hg19>
 61 | ##contig=<ID=chr6_mann_hap4,length=4683263,assembly=hg19>
 62 | ##contig=<ID=chr6_mcf_hap5,length=4833398,assembly=hg19>
 63 | ##contig=<ID=chr6_qbl_hap6,length=4611984,assembly=hg19>
 64 | ##contig=<ID=chr6_ssto_hap7,length=4928567,assembly=hg19>
 65 | ##contig=<ID=chr7,length=159138663,assembly=hg19>
 66 | ##contig=<ID=chr7_gl000195_random,length=182896,assembly=hg19>
 67 | ##contig=<ID=chr8,length=146364022,assembly=hg19>
 68 | ##contig=<ID=chr8_gl000196_random,length=38914,assembly=hg19>
 69 | ##contig=<ID=chr8_gl000197_random,length=37175,assembly=hg19>
 70 | ##contig=<ID=chr9,length=141213431,assembly=hg19>
 71 | ##contig=<ID=chr9_gl000198_random,length=90085,assembly=hg19>
 72 | ##contig=<ID=chr9_gl000199_random,length=169874,assembly=hg19>
 73 | ##contig=<ID=chr9_gl000200_random,length=187035,assembly=hg19>
 74 | ##contig=<ID=chr9_gl000201_random,length=36148,assembly=hg19>
 75 | ##contig=<ID=chrM,length=16571,assembly=hg19>
 76 | ##contig=<ID=chrUn_gl000211,length=166566,assembly=hg19>
 77 | ##contig=<ID=chrUn_gl000212,length=186858,assembly=hg19>
 78 | ##contig=<ID=chrUn_gl000213,length=164239,assembly=hg19>
 79 | ##contig=<ID=chrUn_gl000214,length=137718,assembly=hg19>
 80 | ##contig=<ID=chrUn_gl000215,length=172545,assembly=hg19>
 81 | ##contig=<ID=chrUn_gl000216,length=172294,assembly=hg19>
 82 | ##contig=<ID=chrUn_gl000217,length=172149,assembly=hg19>
 83 | ##contig=<ID=chrUn_gl000218,length=161147,assembly=hg19>
 84 | ##contig=<ID=chrUn_gl000219,length=179198,assembly=hg19>
 85 | ##contig=<ID=chrUn_gl000220,length=161802,assembly=hg19>
 86 | ##contig=<ID=chrUn_gl000221,length=155397,assembly=hg19>
 87 | ##contig=<ID=chrUn_gl000222,length=186861,assembly=hg19>
 88 | ##contig=<ID=chrUn_gl000223,length=180455,assembly=hg19>
 89 | ##contig=<ID=chrUn_gl000224,length=179693,assembly=hg19>
 90 | ##contig=<ID=chrUn_gl000225,length=211173,assembly=hg19>
 91 | ##contig=<ID=chrUn_gl000226,length=15008,assembly=hg19>
 92 | ##contig=<ID=chrUn_gl000227,length=128374,assembly=hg19>
 93 | ##contig=<ID=chrUn_gl000228,length=129120,assembly=hg19>
 94 | ##contig=<ID=chrUn_gl000229,length=19913,assembly=hg19>
 95 | ##contig=<ID=chrUn_gl000230,length=43691,assembly=hg19>
 96 | ##contig=<ID=chrUn_gl000231,length=27386,assembly=hg19>
 97 | ##contig=<ID=chrUn_gl000232,length=40652,assembly=hg19>
 98 | ##contig=<ID=chrUn_gl000233,length=45941,assembly=hg19>
 99 | ##contig=<ID=chrUn_gl000234,length=40531,assembly=hg19>
100 | ##contig=<ID=chrUn_gl000235,length=34474,assembly=hg19>
101 | ##contig=<ID=chrUn_gl000236,length=41934,assembly=hg19>
102 | ##contig=<ID=chrUn_gl000237,length=45867,assembly=hg19>
103 | ##contig=<ID=chrUn_gl000238,length=39939,assembly=hg19>
104 | ##contig=<ID=chrUn_gl000239,length=33824,assembly=hg19>
105 | ##contig=<ID=chrUn_gl000240,length=41933,assembly=hg19>
106 | ##contig=<ID=chrUn_gl000241,length=42152,assembly=hg19>
107 | ##contig=<ID=chrUn_gl000242,length=43523,assembly=hg19>
108 | ##contig=<ID=chrUn_gl000243,length=43341,assembly=hg19>
109 | ##contig=<ID=chrUn_gl000244,length=39929,assembly=hg19>
110 | ##contig=<ID=chrUn_gl000245,length=36651,assembly=hg19>
111 | ##contig=<ID=chrUn_gl000246,length=38154,assembly=hg19>
112 | ##contig=<ID=chrUn_gl000247,length=36422,assembly=hg19>
113 | ##contig=<ID=chrUn_gl000248,length=39786,assembly=hg19>
114 | ##contig=<ID=chrUn_gl000249,length=38502,assembly=hg19>
115 | ##contig=<ID=chrX,length=155270560,assembly=hg19>
116 | ##contig=<ID=chrY,length=59373566,assembly=hg19>
117 | ##reference=file:///data/reference/ucsc/hg19/ucsc.hg19.fasta
118 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA19239
119 | chr1	65312657	.	G	.	.	.	.	GT	./.
120 | 


--------------------------------------------------------------------------------
/ez_setup.py:
--------------------------------------------------------------------------------
  1 | #!python
  2 | """Bootstrap setuptools installation
  3 | 
  4 | If you want to use setuptools in your package's setup.py, just include this
  5 | file in the same directory with it, and add this to the top of your setup.py::
  6 | 
  7 |     from ez_setup import use_setuptools
  8 |     use_setuptools()
  9 | 
 10 | If you want to require a specific version of setuptools, set a download
 11 | mirror, or use an alternate download directory, you can do so by supplying
 12 | the appropriate options to ``use_setuptools()``.
 13 | 
 14 | This file can also be run as a script to install or upgrade setuptools.
 15 | """
 16 | import sys
 17 | DEFAULT_VERSION = "0.6c11"
 18 | DEFAULT_URL     = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3]
 19 | 
# Registry of expected MD5 hex digests, keyed by setuptools egg file name.
# _validate_md5() compares downloaded data against these entries, and
# update_md5() adds entries computed from local files.
md5_data = {
    'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca',
    'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb',
    'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b',
    'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a',
    'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618',
    'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac',
    'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5',
    'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4',
    'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c',
    'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b',
    'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090',
    'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4',
    'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7',
    'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5',
    'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de',
    'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b',
    'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2',
    'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086',
    'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27',
    'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277',
    'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa',
    'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e',
    'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e',
    'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f',
    'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2',
    'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc',
    'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167',
    'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64',
    'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d',
    'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20',
    'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab',
    'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53',
    'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2',
    'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e',
    'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372',
    'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902',
    'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de',
    'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b',
    'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03',
    'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a',
    'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6',
    'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a',
}
 64 | 
 65 | import sys, os
 66 | try: from hashlib import md5
 67 | except ImportError: from md5 import md5
 68 | 
 69 | def _validate_md5(egg_name, data):
 70 |     if egg_name in md5_data:
 71 |         digest = md5(data).hexdigest()
 72 |         if digest != md5_data[egg_name]:
 73 |             print >>sys.stderr, (
 74 |                 "md5 validation of %s failed!  (Possible download problem?)"
 75 |                 % egg_name
 76 |             )
 77 |             sys.exit(2)
 78 |     return data
 79 | 
 80 | def use_setuptools(
 81 |     version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
 82 |     download_delay=15
 83 | ):
 84 |     """Automatically find/download setuptools and make it available on sys.path
 85 | 
 86 |     `version` should be a valid setuptools version number that is available
 87 |     as an egg for download under the `download_base` URL (which should end with
 88 |     a '/').  `to_dir` is the directory where setuptools will be downloaded, if
 89 |     it is not already available.  If `download_delay` is specified, it should
 90 |     be the number of seconds that will be paused before initiating a download,
 91 |     should one be required.  If an older version of setuptools is installed,
 92 |     this routine will print a message to ``sys.stderr`` and raise SystemExit in
 93 |     an attempt to abort the calling script.
 94 |     """
 95 |     was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules
 96 |     def do_download():
 97 |         egg = download_setuptools(version, download_base, to_dir, download_delay)
 98 |         sys.path.insert(0, egg)
 99 |         import setuptools; setuptools.bootstrap_install_from = egg
100 |     try:
101 |         import pkg_resources
102 |     except ImportError:
103 |         return do_download()       
104 |     try:
105 |         pkg_resources.require("setuptools>="+version); return
106 |     except pkg_resources.VersionConflict, e:
107 |         if was_imported:
108 |             print >>sys.stderr, (
109 |             "The required version of setuptools (>=%s) is not available, and\n"
110 |             "can't be installed while this script is running. Please install\n"
111 |             " a more recent version first, using 'easy_install -U setuptools'."
112 |             "\n\n(Currently using %r)"
113 |             ) % (version, e.args[0])
114 |             sys.exit(2)
115 |     except pkg_resources.DistributionNotFound:
116 |         pass
117 | 
118 |     del pkg_resources, sys.modules['pkg_resources']    # reload ok
119 |     return do_download()
120 | 
def download_setuptools(
    version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
    delay = 15
):
    """Download setuptools from a specified location and return its filename

    `version` should be a valid setuptools version number that is available
    as an egg for download under the `download_base` URL (which should end
    with a '/'). `to_dir` is the directory where the egg will be downloaded.
    `delay` is the number of seconds to pause before an actual download attempt.

    If the egg file already exists in `to_dir` nothing is downloaded.  The
    return value is the real (symlink-resolved) path of the egg file.
    """
    # urlopen lives in urllib.request on Python 3 and in urllib2 on Python 2.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib2 import urlopen

    # Use version_info, not sys.version[:3]: the string slice would turn
    # "3.10" into "3.1" and request the wrong egg.
    py_tag = "%d.%d" % sys.version_info[:2]
    egg_name = "setuptools-%s-py%s.egg" % (version, py_tag)
    url = download_base + egg_name
    saveto = os.path.join(to_dir, egg_name)
    src = dst = None
    if not os.path.exists(saveto):  # Avoid repeated downloads
        try:
            try:
                from distutils import log
                warn = log.warn
            except ImportError:
                # distutils is not available on every interpreter; fall back
                # to writing the same %-formatted message to stderr.
                def warn(msg, *args):
                    if args:
                        msg = msg % args
                    sys.stderr.write(msg + "\n")
            if delay:
                warn("""
---------------------------------------------------------------------------
This script requires setuptools version %s to run (even to display
help).  I will attempt to download it for you (from
%s), but
you may need to enable firewall access for this script first.
I will start the download in %d seconds.

(Note: if this machine does not have network access, please obtain the file

   %s

and place it in this directory before rerunning this script.)
---------------------------------------------------------------------------""",
                    version, download_base, delay, url
                )
                from time import sleep
                sleep(delay)
            warn("Downloading %s", url)
            src = urlopen(url)
            # Read/write all in one block, so we don't create a corrupt file
            # if the download is interrupted.
            data = _validate_md5(egg_name, src.read())
            dst = open(saveto, "wb")
            dst.write(data)
        finally:
            # Close whichever handles were opened, even on failure.
            if src: src.close()
            if dst: dst.close()
    return os.path.realpath(saveto)
167 | 
168 | 
169 | 
170 | 
171 | 
172 | 
173 | 
174 | 
175 | 
176 | 
177 | 
178 | 
179 | 
180 | 
181 | 
182 | 
183 | 
184 | 
185 | 
186 | 
187 | 
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | 
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
def main(argv, version=DEFAULT_VERSION):
    """Install or upgrade setuptools and EasyInstall

    `argv` is the list of command-line arguments to hand to easy_install;
    `version` is the minimum setuptools version required.

    When setuptools is not importable, the egg is downloaded, passed to
    easy_install and the downloaded file is removed afterwards.  When an
    installed setuptools is too old, a fresh egg is installed and the process
    exits with status 0.  An obsolete '0.0.1' install aborts with status 2.
    """
    try:
        import setuptools
    except ImportError:
        egg = None
        try:
            egg = download_setuptools(version, delay=0)
            sys.path.insert(0, egg)
            from setuptools.command.easy_install import main as easy_install_main
            return easy_install_main(list(argv) + [egg])   # we're done here
        finally:
            # The egg was only needed for bootstrapping; clean it up.
            if egg and os.path.exists(egg):
                os.unlink(egg)
    else:
        if setuptools.__version__ == '0.0.1':
            # sys.stderr.write() instead of the Python-2-only print statement.
            sys.stderr.write(
            "You have an obsolete version of setuptools installed.  Please\n"
            "remove it from your system entirely before rerunning this script."
            "\n")
            sys.exit(2)

    req = "setuptools>=" + version
    import pkg_resources
    try:
        pkg_resources.require(req)
    except pkg_resources.VersionConflict:
        try:
            from setuptools.command.easy_install import main as easy_install_main
        except ImportError:
            from easy_install import main as easy_install_main
        # Download the version the caller asked for; previously the default
        # version was fetched here even when `version` was overridden.
        easy_install_main(list(argv) + [download_setuptools(version, delay=0)])
        sys.exit(0) # try to force an exit
    else:
        if argv:
            from setuptools.command.easy_install import main as easy_install_main
            easy_install_main(argv)
        else:
            sys.stdout.write(
                "Setuptools version %s or greater has been installed.\n"
                % (version,)
            )
            sys.stdout.write(
                '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)\n'
            )
243 | 
def update_md5(filenames):
    """Update our built-in md5 registry.

    For every path in `filenames`, the MD5 hex digest of the file's contents
    is stored in ``md5_data`` under the file's base name.  The
    ``md5_data = {...}`` literal in this module's own source file is then
    rewritten in place with the sorted registry entries.

    Exits with status 2 if that literal cannot be found in the source file.
    """

    import re

    for name in filenames:
        base = os.path.basename(name)
        f = open(name, 'rb')
        try:
            md5_data[base] = md5(f.read()).hexdigest()
        finally:
            # Close the handle even if reading or hashing fails.
            f.close()

    data = ["    %r: %r,\n" % it for it in md5_data.items()]
    data.sort()
    repl = "".join(data)

    import inspect
    srcfile = inspect.getsourcefile(sys.modules[__name__])
    f = open(srcfile, 'rb')
    try:
        src = f.read()
    finally:
        f.close()

    match = re.search("\nmd5_data = {\n([^}]+)}", src)
    if not match:
        sys.stderr.write("Internal error!\n")
        sys.exit(2)

    # Splice the regenerated entries between the braces of the literal.
    src = src[:match.start(1)] + repl + src[match.end(1):]
    # The source was read in binary mode, so write it back in binary mode as
    # well; text mode would translate line endings on some platforms and
    # alter parts of the file that were never meant to change.
    f = open(srcfile, 'wb')
    try:
        f.write(src)
    finally:
        f.close()
272 | 
273 | 
if __name__ == '__main__':
    # Script entry point: "--md5update FILE..." refreshes the embedded md5
    # registry; any other argument list goes to the installer.
    cli_args = sys.argv[1:]
    wants_md5_update = len(cli_args) > 1 and cli_args[0] == '--md5update'
    if wants_md5_update:
        update_md5(cli_args[1:])
    else:
        main(cli_args)
279 | 
280 | 
281 | 
282 | 
283 | 
284 | 
285 | 


--------------------------------------------------------------------------------
/test/test-gl.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##fileDate=20141103
 3 | ##source=freeBayes v0.9.16
 4 | ##reference=/shared/genomes/b37/full/human_g1k_v37.fasta
 5 | ##phasing=none
 6 | ##commandline="/mnt/thor_pool1/user_data/cc2qe/code/speedseq/bin/freebayes -f /shared/genomes/b37/full/human_g1k_v37.fasta --pooled-discrete --genotype-qualities --min-repeat-entropy 1 --min-alternate-fraction 0.05 --min-alternate-count 2 --region MT:12136..12498 TCGA-E2-A14P-10A-01D-A19H-09.l1.bam TCGA-E2-A14P-10A-01D-A19H-09.l2.bam TCGA-E2-A14P-01A-31D-A19H-09.l1.bam TCGA-E2-A14P-01A-31D-A19H-09.l2.bam"
 7 | ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
 8 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
 9 | ##INFO=<ID=DPB,Number=1,Type=Float,Description="Total read depth per bp at the locus; bases in reads overlapping / bases in haplotype">
10 | ##INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
11 | ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
12 | ##INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1]">
13 | ##INFO=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count, with partial observations recorded fractionally">
14 | ##INFO=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observations, with partial observations recorded fractionally">
15 | ##INFO=<ID=PRO,Number=1,Type=Float,Description="Reference allele observation count, with partial observations recorded fractionally">
16 | ##INFO=<ID=PAO,Number=A,Type=Float,Description="Alternate allele observations, with partial observations recorded fractionally">
17 | ##INFO=<ID=QR,Number=1,Type=Integer,Description="Reference allele quality sum in phred">
18 | ##INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
19 | ##INFO=<ID=PQR,Number=1,Type=Float,Description="Reference allele quality sum in phred for partial observations">
20 | ##INFO=<ID=PQA,Number=A,Type=Float,Description="Alternate allele quality sum in phred for partial observations">
21 | ##INFO=<ID=SRF,Number=1,Type=Integer,Description="Number of reference observations on the forward strand">
22 | ##INFO=<ID=SRR,Number=1,Type=Integer,Description="Number of reference observations on the reverse strand">
23 | ##INFO=<ID=SAF,Number=A,Type=Integer,Description="Number of alternate observations on the forward strand">
24 | ##INFO=<ID=SAR,Number=A,Type=Integer,Description="Number of alternate observations on the reverse strand">
25 | ##INFO=<ID=SRP,Number=1,Type=Float,Description="Strand balance probability for the reference allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SRF and SRR given E(SRF/SRR) ~ 0.5, derived using Hoeffding's inequality">
26 | ##INFO=<ID=SAP,Number=A,Type=Float,Description="Strand balance probability for the alternate allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SAF and SAR given E(SAF/SAR) ~ 0.5, derived using Hoeffding's inequality">
27 | ##INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
28 | ##INFO=<ID=ABP,Number=A,Type=Float,Description="Allele balance probability at heterozygous sites: Phred-scaled upper-bounds estimate of the probability of observing the deviation between ABR and ABA given E(ABR/ABA) ~ 0.5, derived using Hoeffding's inequality">
29 | ##INFO=<ID=RUN,Number=A,Type=Integer,Description="Run length: the number of consecutive repeats of the alternate allele in the reference genome">
30 | ##INFO=<ID=RPP,Number=A,Type=Float,Description="Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
31 | ##INFO=<ID=RPPR,Number=1,Type=Float,Description="Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
32 | ##INFO=<ID=RPL,Number=A,Type=Float,Description="Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele">
33 | ##INFO=<ID=RPR,Number=A,Type=Float,Description="Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele">
34 | ##INFO=<ID=EPP,Number=A,Type=Float,Description="End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
35 | ##INFO=<ID=EPPR,Number=1,Type=Float,Description="End Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
36 | ##INFO=<ID=DPRA,Number=A,Type=Float,Description="Alternate allele depth ratio.  Ratio between depth in samples with each called alternate allele and those without.">
37 | ##INFO=<ID=ODDS,Number=1,Type=Float,Description="The log odds ratio of the best genotype combination to the second-best.">
38 | ##INFO=<ID=GTI,Number=1,Type=Integer,Description="Number of genotyping iterations required to reach convergence or bailout.">
39 | ##INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
40 | ##INFO=<ID=CIGAR,Number=A,Type=String,Description="The extended CIGAR representation of each alternate allele, with the exception that '=' is replaced by 'M' to ease VCF parsing.  Note that INDEL alleles do not have the first matched base (which is provided by default, per the spec) referred to by the CIGAR.">
41 | ##INFO=<ID=NUMALT,Number=1,Type=Integer,Description="Number of unique non-reference alleles in called genotypes at this position.">
42 | ##INFO=<ID=MEANALT,Number=A,Type=Float,Description="Mean number of unique non-reference allele observations per sample with the corresponding alternate alleles.">
43 | ##INFO=<ID=LEN,Number=A,Type=Integer,Description="allele length">
44 | ##INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
45 | ##INFO=<ID=MQMR,Number=1,Type=Float,Description="Mean mapping quality of observed reference alleles">
46 | ##INFO=<ID=PAIRED,Number=A,Type=Float,Description="Proportion of observed alternate alleles which are supported by properly paired read fragments">
47 | ##INFO=<ID=PAIREDR,Number=1,Type=Float,Description="Proportion of observed reference alleles which are supported by properly paired read fragments">
48 | ##INFO=<ID=technology.illumina,Number=A,Type=Float,Description="Fraction of observations supporting the alternate observed in reads from illumina">
49 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
50 | ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
51 | ##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
52 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
53 | ##FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
54 | ##FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
55 | ##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
56 | ##FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
57 | ##INFO=<ID=SSC,Number=1,Type=Float,Description="Somatic score">
58 | ##VEP=v76 cache=/shared/external_bin/ensembl-tools-release-76/cache/homo_sapiens/76_GRCh37 db=.
59 | ##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: Consequence|Codons|Amino_acids|Gene|SYMBOL|Feature|EXON|PolyPhen|SIFT|Protein_position|BIOTYPE">
60 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	H_LS-E2-A14P-10A-01D-A19H-09	H_LS-E2-A14P-01A-31D-A19H-09
61 | 2	128046289	.	C	T	272.753	PASS	SSC=42.1572;AB=0.1875;ABP=57.2971;AC=1;AF=0.25;AN=4;AO=12;CIGAR=1X;DP=94;DPB=94;DPRA=2.13333;EPP=9.52472;EPPR=3.11623;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=46.7056;PAIRED=1;PAIREDR=1;PAO=0;PQA=0;PQR=0;PRO=0;QA=439;QR=2940;RO=82;RPL=5;RPP=3.73412;RPPR=5.65844;RPR=7;RUN=1;SAF=4;SAP=5.9056;SAR=8;SRF=46;SRP=5.65844;SRR=36;TYPE=snp;technology.illumina=1;CSQ=missense_variant|cGa/cAa|R/Q|ENSG00000163161|ERCC3|ENST00000493187|7/15|possibly_damaging(0.862)|deleterious(0.05)|261/718|protein_coding,missense_variant|cGa/cAa|R/Q|ENSG00000163161|ERCC3|ENST00000456257|4/4|probably_damaging(0.967)|tolerated(0.09)|175/188|protein_coding,3_prime_UTR_variant&NMD_transcript_variant|||ENSG00000163161|ERCC3|ENST00000445889|7/15|||-/71|nonsense_mediated_decay,3_prime_UTR_variant&NMD_transcript_variant|||ENSG00000163161|ERCC3|ENST00000426778|7/15|||-/71|nonsense_mediated_decay,downstream_gene_variant|||ENSG00000163161|ERCC3|ENST00000490062|||||retained_intron,non_coding_exon_variant&nc_transcript_variant|||ENSG00000163161|ERCC3|ENST00000494464|6/7||||retained_intron,downstream_gene_variant|||ENSG00000163161|ERCC3|ENST00000460485|||||retained_intron,missense_variant|cGa/cAa|R/Q|ENSG00000163161|ERCC3|ENST00000285398|7/15|possibly_damaging(0.862)|deleterious(0.02)|325/782|protein_coding,downstream_gene_variant|||ENSG00000163161|ERCC3|ENST00000462306|||||retained_intron	GT:GQ:DP:RO:QR:AO:QA:GL	0/0:151.949:30:30:1104:0:0:0	0/1:151.949:64:52:1836:12:439:-33.1263,0,-158.844
62 | 17	7578461	.	C	A	257.251	PASS	SSC=35.826;AB=0.555556;ABP=3.49285;AC=1;AF=0.25;AN=4;AO=10;CIGAR=1X;DP=35;DPB=35;DPRA=1.05882;EPP=6.48466;EPPR=3.09716;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=15.9695;PAIRED=1;PAIREDR=1;PAO=0;PQA=0;PQR=0;PRO=0;QA=346;QR=872;RO=25;RPL=4;RPP=3.87889;RPPR=7.26639;RPR=6;RUN=1;SAF=5;SAP=3.0103;SAR=5;SRF=14;SRP=3.79203;SRR=11;TYPE=snp;technology.illumina=1;CSQ=non_coding_exon_variant&nc_transcript_variant|||ENSG00000141510|TP53|ENST00000504937|1/7||||retained_intron,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000508793|5/5|probably_damaging(0.997)|deleterious(0)|157/165|protein_coding,non_coding_exon_variant&nc_transcript_variant|||ENSG00000141510|TP53|ENST00000505014|4/5||||retained_intron,non_coding_exon_variant&nc_transcript_variant|||ENSG00000141510|TP53|ENST00000504290|1/8||||retained_intron,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000413465|4/7|probably_damaging(0.995)|deleterious(0)|157/285|protein_coding,downstream_gene_variant|||ENSG00000141510|TP53|ENST00000604348||||-/143|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000420246|5/12|probably_damaging(0.994)|deleterious(0)|157/341|protein_coding,upstream_gene_variant|||ENSG00000141510|TP53|ENST00000574684|||||processed_transcript,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000269305|5/11|probably_damaging(0.997)|deleterious(0)|157/393|protein_coding,upstream_gene_variant|||ENSG00000141510|TP53|ENST00000576024||||-/31|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000509690|2/6|probably_damaging(0.997)|deleterious(0)|25/199|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000359597|4/9|probably_damaging(0.994)|deleterious(0)|157/343|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000514944|4/6|probably_damaging(0.993)|deleterious(0)|64/155|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000445888|5/11|probably_
damaging(0.997)|deleterious(0)|157/393|protein_coding,downstream_gene_variant|||ENSG00000141510|TP53|ENST00000503591||||-/128|protein_coding,non_coding_exon_variant&nc_transcript_variant|||ENSG00000141510|TP53|ENST00000510385|1/8||||retained_intron,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000455263|5/12|probably_damaging(0.987)|deleterious(0)|157/346|protein_coding	GT:GQ:DP:RO:QR:AO:QA:GL	0/0:69.3546:17:17:582:0:0:0,-5.11751,-52.7224	0/1:160.002:18:8:290:10:346:-30.7085,0,-25.685
63 | 17	59861631	.	CTGCTATTTTG	CG	162.924	PASS	SSC=46.502;AB=0.22;ABP=37.059;AC=1;AF=0.25;AN=4;AO=11;CIGAR=1M9D1M;DP=121;DPB=121.818;DPRA=0.704225;EPP=3.20771;EPPR=13.5489;GTI=0;LEN=9;MEANALT=1;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=37.5146;PAIRED=1;PAIREDR=1;PAO=2;PQA=14.5;PQR=604.5;PRO=20;QA=329;QR=4041;RO=109;RPL=7;RPP=4.78696;RPPR=3.03022;RPR=4;RUN=1;SAF=6;SAP=3.20771;SAR=5;SRF=62;SRP=7.4927;SRR=47;TYPE=del;technology.illumina=1;CSQ=inframe_deletion&splice_region_variant|CAAAATAGCAga/Cga|QNSR/R|ENSG00000136492|BRIP1|ENST00000259008|11/20|||540-543/1249|protein_coding,upstream_gene_variant|||ENSG00000136492|BRIP1|ENST00000583837|||||processed_transcript,inframe_deletion&splice_region_variant&NMD_transcript_variant|CAAAATAGCAga/Cga|QNSR/R|ENSG00000136492|BRIP1|ENST00000579028|2/6|||75-78/84|nonsense_mediated_decay,inframe_deletion&splice_region_variant|CAAAATAGCAga/Cga|QNSR/R|ENSG00000136492|BRIP1|ENST00000577598|10/18|||540-543/994|protein_coding	GT:GQ:DP:RO:QR:AO:QA:GL	0/0:145.893:71:70:2598:0:0:0,-21.0721,-234.097	0/1:140.118:50:39:1443:11:329:-25.4299,0,-125.761
64 | 6	132856480	.	GTTTTTTTTTTTTTGTATTTTTAGTAG	GTTTTTTTTTTTTGTATTTTTAGTAG	39.3609	.	SSC=10.4744;AB=0.121212;ABP=44.1367;AC=1;AF=0.25;AN=4;AO=5;CIGAR=1M1D25M;DP=52;DPB=77.8148;DPRA=0;EPP=3.44459;EPPR=14.8483;GTI=0;LEN=1;MEANALT=7.5;MQM=60;MQMR=58.6774;NS=2;NUMALT=1;ODDS=9.06305;PAIRED=1;PAIREDR=1;PAO=14.6667;PQA=465.083;PQR=759.583;PRO=23.6667;QA=153;QR=1074;RO=31;RPL=4;RPP=6.91895;RPPR=4.76149;RPR=1;RUN=1;SAF=1;SAP=6.91895;SAR=4;SRF=8;SRP=18.771;SRR=23;TYPE=del;technology.illumina=1;CSQ=upstream_gene_variant|||ENSG00000237110|TAAR9|ENST00000434551||||-/347|polymorphic_pseudogene	GT:GQ:DP:RO:QR:AO:QA:GL	0/0:50.0188:19:11:368:1:15:0,-1.16979,-31.9349	0/1:39.3609:33:20:706:4:138:-9.30456,0,-60.533
65 | 6	132857169	.	T	C	2.77482	.	SSC=9.94317;AB=0.0972222;ABP=104.466;AC=1;AF=0.25;AN=4;AO=10;CIGAR=1X;DP=115;DPB=115;DPRA=0;EPP=3.0103;EPPR=3.19643;GTI=1;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=0.11155;PAIRED=1;PAIREDR=1;PAO=0;PQA=0;PQR=0;PRO=0;QA=371;QR=3950;RO=105;RPL=10;RPP=24.725;RPPR=12.1305;RPR=0;RUN=1;SAF=5;SAP=3.0103;SAR=5;SRF=43;SRP=10.476;SRR=62;TYPE=snp;technology.illumina=1;CSQ=upstream_gene_variant|||ENSG00000237110|TAAR9|ENST00000434551||||-/347|polymorphic_pseudogene	GT:GQ:DP:RO:QR:AO:QA:GL	0/0:1.01456e-11:43:40:1520:3:110:-1.41373,0,-128.327	0/1:2.77482:72:65:2430:7:261:-11.3569,0,-206.568
66 | 6	132922238	.	A	G	2.67302e-14	PASS	SSC=35.3617;AB=0.0963855;ABP=237.896;AC=1;AF=0.25;AN=4;AO=17;CIGAR=1X;DP=319;DPB=319;DPRA=0;EPP=18.4661;EPPR=16.3493;GTI=0;LEN=1;MEANALT=1.5;MQM=37.4118;MQMR=42.5615;NS=2;NUMALT=1;ODDS=78.4706;PAIRED=0.764706;PAIREDR=0.923588;PAO=0;PQA=0;PQR=0;PRO=0;QA=243;QR=7960;RO=301;RPL=13;RPP=13.3567;RPPR=33.4903;RPR=4;RUN=1;SAF=10;SAP=4.1599;SAR=7;SRF=132;SRP=12.8865;SRR=169;TYPE=snp;technology.illumina=1;CSQ=intergenic_variant||||||||||	GT:GQ:DP:RO:QR:AO:QA:GL	0/0:142.108:153:152:3816:1:14:0,-42.4729,-342.291	0/1:142.108:166:149:4144:16:229:0,-7.11118,-352.491
67 | X	132838305	.	GAAAAAAAAAAAAAGGTGAAAATT	GAAAAAAAAAAAAGGTGAAAATT	70.8213	PASS	SSC=19.9712;AB=0.15625;ABP=35.8538;AC=1;AF=0.25;AN=4;AO=5;CIGAR=1M1D22M;DP=67;DPB=84.4167;DPRA=0.914286;EPP=3.44459;EPPR=3.0608;GTI=1;LEN=1;MEANALT=6;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=8.14546;PAIRED=1;PAIREDR=1;PAO=11.5;PQA=372.167;PQR=489.167;PRO=15.5;QA=164;QR=1476;RO=43;RPL=2;RPP=3.44459;RPPR=3.46479;RPR=3;RUN=1;SAF=4;SAP=6.91895;SAR=1;SRF=12;SRP=21.2406;SRR=31;TYPE=del;technology.illumina=1;CSQ=intron_variant&feature_truncation|||ENSG00000147257|GPC3|ENST00000406757||||-/256|protein_coding,splice_region_variant&intron_variant&feature_truncation|||ENSG00000147257|GPC3|ENST00000394299||||-/603|protein_coding,intron_variant&feature_truncation|||ENSG00000147257|GPC3|ENST00000370818||||-/580|protein_coding,intron_variant&feature_truncation|||ENSG00000147257|GPC3|ENST00000543339||||-/526|protein_coding	GT:GQ:DP:RO:QR:AO:QA:GL	0/0:0.00125948:35:25:842:0:0:0,-7.52575,-75.9606	0/1:70.8147:32:18:634:5:164:-12.4455,0,-54.8329
68 | 


--------------------------------------------------------------------------------
/test/gatk.vcf:
--------------------------------------------------------------------------------
  1 | ##fileformat=VCFv4.1
  2 | ##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
  3 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
  4 | ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality">
  5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  6 | ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
  7 | ##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
  8 | ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
  9 | ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
 10 | ##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
 11 | ##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
 12 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
 13 | ##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
 14 | ##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
 15 | ##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
 16 | ##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">
 17 | ##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
 18 | ##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
 19 | ##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
 20 | ##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
 21 | ##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
 22 | ##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
 23 | ##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
 24 | ##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[reads.bam] read_buffer_size=null phone_home=NO_ET read_filter=[] intervals=[chr22:42020321-42527953] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/data/reference/ucsc/hg19/ucsc.hg19.fasta rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=CALCULATE_AS_NECESSARY baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=2 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=INFO log_to_file=gatk.log help=false genotype_likelihoods_model=SNP p_nonref_model=EXACT heterozygosity=0.001 pcr_error_rate=1.0E-4 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 computeSLOD=false alleles=(RodBinding name= source=UNBOUND) min_base_quality_score=17 max_deletion_fraction=0.15 multiallelic=false max_alternate_alleles=5 min_indel_count_for_genotyping=5 indel_heterozygosity=1.25E-4 indelGapContinuationPenalty=10.0 indelGapOpenPenalty=45.0 indelHaplotypeSize=80 bandedIndel=false indelDebug=false ignoreSNPAlleles=false dbsnp=(RodBinding name=dbsnp source=/data/reference/dbSNP_132/dbsnp_132.hg19.vcf) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub debug_file=null metrics_file=null annotation=[] excludeAnnotation=[] filter_mismatching_base_and_quals=false"
 25 | ##contig=<ID=chr1,length=249250621,assembly=hg19>
 26 | ##contig=<ID=chr10,length=135534747,assembly=hg19>
 27 | ##contig=<ID=chr11,length=135006516,assembly=hg19>
 28 | ##contig=<ID=chr11_gl000202_random,length=40103,assembly=hg19>
 29 | ##contig=<ID=chr12,length=133851895,assembly=hg19>
 30 | ##contig=<ID=chr13,length=115169878,assembly=hg19>
 31 | ##contig=<ID=chr14,length=107349540,assembly=hg19>
 32 | ##contig=<ID=chr15,length=102531392,assembly=hg19>
 33 | ##contig=<ID=chr16,length=90354753,assembly=hg19>
 34 | ##contig=<ID=chr17,length=81195210,assembly=hg19>
 35 | ##contig=<ID=chr17_ctg5_hap1,length=1680828,assembly=hg19>
 36 | ##contig=<ID=chr17_gl000203_random,length=37498,assembly=hg19>
 37 | ##contig=<ID=chr17_gl000204_random,length=81310,assembly=hg19>
 38 | ##contig=<ID=chr17_gl000205_random,length=174588,assembly=hg19>
 39 | ##contig=<ID=chr17_gl000206_random,length=41001,assembly=hg19>
 40 | ##contig=<ID=chr18,length=78077248,assembly=hg19>
 41 | ##contig=<ID=chr18_gl000207_random,length=4262,assembly=hg19>
 42 | ##contig=<ID=chr19,length=59128983,assembly=hg19>
 43 | ##contig=<ID=chr19_gl000208_random,length=92689,assembly=hg19>
 44 | ##contig=<ID=chr19_gl000209_random,length=159169,assembly=hg19>
 45 | ##contig=<ID=chr1_gl000191_random,length=106433,assembly=hg19>
 46 | ##contig=<ID=chr1_gl000192_random,length=547496,assembly=hg19>
 47 | ##contig=<ID=chr2,length=243199373,assembly=hg19>
 48 | ##contig=<ID=chr20,length=63025520,assembly=hg19>
 49 | ##contig=<ID=chr21,length=48129895,assembly=hg19>
 50 | ##contig=<ID=chr21_gl000210_random,length=27682,assembly=hg19>
 51 | ##contig=<ID=chr22,length=51304566,assembly=hg19>
 52 | ##contig=<ID=chr3,length=198022430,assembly=hg19>
 53 | ##contig=<ID=chr4,length=191154276,assembly=hg19>
 54 | ##contig=<ID=chr4_ctg9_hap1,length=590426,assembly=hg19>
 55 | ##contig=<ID=chr4_gl000193_random,length=189789,assembly=hg19>
 56 | ##contig=<ID=chr4_gl000194_random,length=191469,assembly=hg19>
 57 | ##contig=<ID=chr5,length=180915260,assembly=hg19>
 58 | ##contig=<ID=chr6,length=171115067,assembly=hg19>
 59 | ##contig=<ID=chr6_apd_hap1,length=4622290,assembly=hg19>
 60 | ##contig=<ID=chr6_cox_hap2,length=4795371,assembly=hg19>
 61 | ##contig=<ID=chr6_dbb_hap3,length=4610396,assembly=hg19>
 62 | ##contig=<ID=chr6_mann_hap4,length=4683263,assembly=hg19>
 63 | ##contig=<ID=chr6_mcf_hap5,length=4833398,assembly=hg19>
 64 | ##contig=<ID=chr6_qbl_hap6,length=4611984,assembly=hg19>
 65 | ##contig=<ID=chr6_ssto_hap7,length=4928567,assembly=hg19>
 66 | ##contig=<ID=chr7,length=159138663,assembly=hg19>
 67 | ##contig=<ID=chr7_gl000195_random,length=182896,assembly=hg19>
 68 | ##contig=<ID=chr8,length=146364022,assembly=hg19>
 69 | ##contig=<ID=chr8_gl000196_random,length=38914,assembly=hg19>
 70 | ##contig=<ID=chr8_gl000197_random,length=37175,assembly=hg19>
 71 | ##contig=<ID=chr9,length=141213431,assembly=hg19>
 72 | ##contig=<ID=chr9_gl000198_random,length=90085,assembly=hg19>
 73 | ##contig=<ID=chr9_gl000199_random,length=169874,assembly=hg19>
 74 | ##contig=<ID=chr9_gl000200_random,length=187035,assembly=hg19>
 75 | ##contig=<ID=chr9_gl000201_random,length=36148,assembly=hg19>
 76 | ##contig=<ID=chrM,length=16571,assembly=hg19>
 77 | ##contig=<ID=chrUn_gl000211,length=166566,assembly=hg19>
 78 | ##contig=<ID=chrUn_gl000212,length=186858,assembly=hg19>
 79 | ##contig=<ID=chrUn_gl000213,length=164239,assembly=hg19>
 80 | ##contig=<ID=chrUn_gl000214,length=137718,assembly=hg19>
 81 | ##contig=<ID=chrUn_gl000215,length=172545,assembly=hg19>
 82 | ##contig=<ID=chrUn_gl000216,length=172294,assembly=hg19>
 83 | ##contig=<ID=chrUn_gl000217,length=172149,assembly=hg19>
 84 | ##contig=<ID=chrUn_gl000218,length=161147,assembly=hg19>
 85 | ##contig=<ID=chrUn_gl000219,length=179198,assembly=hg19>
 86 | ##contig=<ID=chrUn_gl000220,length=161802,assembly=hg19>
 87 | ##contig=<ID=chrUn_gl000221,length=155397,assembly=hg19>
 88 | ##contig=<ID=chrUn_gl000222,length=186861,assembly=hg19>
 89 | ##contig=<ID=chrUn_gl000223,length=180455,assembly=hg19>
 90 | ##contig=<ID=chrUn_gl000224,length=179693,assembly=hg19>
 91 | ##contig=<ID=chrUn_gl000225,length=211173,assembly=hg19>
 92 | ##contig=<ID=chrUn_gl000226,length=15008,assembly=hg19>
 93 | ##contig=<ID=chrUn_gl000227,length=128374,assembly=hg19>
 94 | ##contig=<ID=chrUn_gl000228,length=129120,assembly=hg19>
 95 | ##contig=<ID=chrUn_gl000229,length=19913,assembly=hg19>
 96 | ##contig=<ID=chrUn_gl000230,length=43691,assembly=hg19>
 97 | ##contig=<ID=chrUn_gl000231,length=27386,assembly=hg19>
 98 | ##contig=<ID=chrUn_gl000232,length=40652,assembly=hg19>
 99 | ##contig=<ID=chrUn_gl000233,length=45941,assembly=hg19>
100 | ##contig=<ID=chrUn_gl000234,length=40531,assembly=hg19>
101 | ##contig=<ID=chrUn_gl000235,length=34474,assembly=hg19>
102 | ##contig=<ID=chrUn_gl000236,length=41934,assembly=hg19>
103 | ##contig=<ID=chrUn_gl000237,length=45867,assembly=hg19>
104 | ##contig=<ID=chrUn_gl000238,length=39939,assembly=hg19>
105 | ##contig=<ID=chrUn_gl000239,length=33824,assembly=hg19>
106 | ##contig=<ID=chrUn_gl000240,length=41933,assembly=hg19>
107 | ##contig=<ID=chrUn_gl000241,length=42152,assembly=hg19>
108 | ##contig=<ID=chrUn_gl000242,length=43523,assembly=hg19>
109 | ##contig=<ID=chrUn_gl000243,length=43341,assembly=hg19>
110 | ##contig=<ID=chrUn_gl000244,length=39929,assembly=hg19>
111 | ##contig=<ID=chrUn_gl000245,length=36651,assembly=hg19>
112 | ##contig=<ID=chrUn_gl000246,length=38154,assembly=hg19>
113 | ##contig=<ID=chrUn_gl000247,length=36422,assembly=hg19>
114 | ##contig=<ID=chrUn_gl000248,length=39786,assembly=hg19>
115 | ##contig=<ID=chrUn_gl000249,length=38502,assembly=hg19>
116 | ##contig=<ID=chrX,length=155270560,assembly=hg19>
117 | ##contig=<ID=chrY,length=59373566,assembly=hg19>
118 | ##reference=file:///data/reference/ucsc/hg19/ucsc.hg19.fasta
119 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	BLANK	NA12878	NA12891	NA12892	NA19238	NA19239	NA19240
120 | chr22	42522392	rs28371738	G	A	2951.95	.	AC=2;AF=0.143;AN=14;BaseQRankSum=0.375;DB;DP=1506;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=123.5516;MQ=253.92;MQ0=0;MQRankSum=0.685;QD=5.90;ReadPosRankSum=0.590	GT:AD:DP:GQ:PL	0/0:6,0:6:18.04:0,18,211	0/1:138,107:250:99:1961,0,3049	0/1:169,77:250:99:1038,0,3533	0/0:249,0:250:99:0,600,5732	0/0:248,1:250:99:0,627,6191	0/0:250,0:250:99:0,615,5899	0/0:250,0:250:99:0,579,5674
121 | chr22	42522613	rs1135840	G	C	11611.03	.	AC=6;AF=0.429;AN=14;BaseQRankSum=16.289;DB;DP=1518;DS;Dels=0.03;FS=0.000;HRun=0;HaplotypeScore=142.5716;MQ=242.46;MQ0=0;MQRankSum=2.010;QD=9.16;ReadPosRankSum=-1.731	GT:AD:DP:GQ:PL	0/1:13,4:17:62.64:63,0,296	0/1:118,127:246:99:2396,0,1719	0/0:241,0:244:99:0,459,4476	0/1:161,85:246:99:1489,0,2353	0/1:110,132:242:99:2561,0,1488	0/1:106,135:242:99:2613,0,1389	0/1:116,126:243:99:2489,0,1537
122 | chr22	42522755	.	C	G	36.98	.	AC=1;AF=0.071;AN=14;BaseQRankSum=-14.866;DP=1527;DS;Dels=0.01;FS=0.000;HRun=0;HaplotypeScore=253.4254;MQ=197.36;MQ0=2;MQRankSum=-10.810;QD=0.15;ReadPosRankSum=-17.244	GT:AD:DP:GQ:PL	0/0:26,1:27:51.08:0,51,570	0/0:208,40:248:99:0,236,4169	0/0:192,56:249:99:0,114,4292	0/1:179,66:245:75.42:75,0,3683	0/0:214,32:246:99:0,172,4235	0/0:200,49:249:61.05:0,61,4049	0/0:195,50:246:32.07:0,32,3757
123 | chr22	42523003	rs116917064	A	G	7113.55	.	AC=8;AF=0.571;AN=14;BaseQRankSum=6.026;DB;DP=1433;DS;Dels=0.00;FS=0.000;HRun=1;HaplotypeScore=101.7894;MQ=182.04;MQ0=0;MQRankSum=-2.501;QD=4.96;ReadPosRankSum=8.294	GT:AD:DP:GQ:PL	0/1:10,2:12:0.62:1,0,257	1/1:9,173:183:99:2385,273,0	0/1:153,95:249:99:355,0,2355	0/1:140,110:250:99:1334,0,2242	0/1:164,85:249:99:1070,0,2279	0/1:160,90:250:99:1245,0,2300	0/1:156,81:238:99:724,0,2764
124 | chr22	42523077	.	A	G	54.31	.	AC=1;AF=0.071;AN=14;BaseQRankSum=-0.563;DP=1521;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=54.8434;MQ=164.04;MQ0=1;MQRankSum=-2.419;QD=2.59;ReadPosRankSum=-1.229	GT:AD:DP:GQ:PL	0/1:17,4:21:92.74:93,0,533	0/0:249,1:250:99:0,544,6985	0/0:250,0:250:99:0,577,6968	0/0:248,2:250:99:0,605,7687	0/0:248,1:249:99:0,583,7300	0/0:246,2:249:99:0,626,7473	0/0:248,1:249:99:0,594,7553
125 | chr22	42523209	rs28371730	T	C	15556.89	.	AC=8;AF=0.571;AN=14;BaseQRankSum=3.458;DB;DP=1509;DS;Dels=0.01;FS=0.000;HRun=0;HaplotypeScore=120.8206;MQ=221.07;MQ0=0;MQRankSum=-4.945;QD=10.31;ReadPosRankSum=0.639	GT:AD:DP:GQ:PL	0/1:3,6:9:99:154,0,101	1/1:6,237:247:99:4532,308,0	0/1:130,117:248:99:1399,0,3147	0/1:112,129:244:99:2641,0,2556	0/1:115,127:247:99:2320,0,2526	0/1:115,128:248:99:2546,0,2520	0/1:143,104:249:99:1965,0,3288
126 | chr22	42523211	rs2004511	T	C	2445.52	.	AC=2;AF=0.143;AN=14;BaseQRankSum=10.587;DB;DP=1509;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=102.7564;MQ=221.50;MQ0=0;MQRankSum=-6.926;QD=4.89;ReadPosRankSum=2.057	GT:AD:DP:GQ:PL	0/0:9,0:9:24.06:0,24,289	0/1:136,113:250:99:1384,0,2176	0/1:146,104:250:99:1108,0,2809	0/0:247,3:250:99:0,439,5546	0/0:245,2:249:99:0,459,5316	0/0:248,2:250:99:0,459,5404	0/0:248,1:250:99:0,533,6069
127 | chr22	42523409	rs1985842	G	T	6801.90	.	AC=6;AF=0.429;AN=14;BaseQRankSum=20.509;DB;DP=1454;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=150.8967;MQ=200.12;MQ0=0;MQRankSum=4.472;QD=5.65;ReadPosRankSum=9.396	GT:AD:DP:GQ:PL	0/1:1,3:4:25.84:53,0,26	0/1:153,95:249:99:1597,0,1798	0/0:245,4:250:99:0,336,4079	0/1:168,82:250:99:1339,0,1880	0/1:147,103:250:99:1522,0,1805	0/1:156,94:250:99:1341,0,2322	0/1:129,71:201:99:949,0,2082
128 | chr22	42523805	rs28371725	C	T	1637.33	.	AC=1;AF=0.071;AN=14;BaseQRankSum=-0.379;DB;DP=1516;DS;Dels=0.00;FS=0.000;HRun=2;HaplotypeScore=77.2321;MQ=226.05;MQ0=0;MQRankSum=2.862;QD=6.55;ReadPosRankSum=0.064	GT:AD:DP:GQ:PL	0/0:16,0:16:39.09:0,39,475	0/0:248,1:249:99:0,613,7187	0/1:132,116:248:99:1676,0,2916	0/0:248,0:248:99:0,625,7171	0/0:248,2:250:99:0,604,7252	0/0:250,0:250:99:0,631,7426	0/0:248,1:249:99:0,584,6964
129 | chr22	42523943	rs16947	A	G	23661.10	.	AC=8;AF=0.571;AN=14;BaseQRankSum=4.602;DB;DP=1514;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=38.3217;MQ=238.64;MQ0=0;MQRankSum=2.485;QD=15.63;ReadPosRankSum=3.749	GT:AD:DP:GQ:PL	0/1:9,5:14:99:163,0,303	1/1:3,246:250:99:8092,667,0	0/1:129,116:246:99:3190,0,2852	0/1:149,98:247:99:2429,0,3588	0/1:129,118:247:99:3267,0,3052	0/1:122,123:245:99:3428,0,3052	0/1:124,119:244:99:3092,0,2845
130 | chr22	42524150	.	C	G	3758.65	.	AC=8;AF=0.571;AN=14;BaseQRankSum=24.314;DP=1506;DS;Dels=0.00;FS=0.000;HRun=1;HaplotypeScore=172.5901;MQ=242.92;MQ0=0;MQRankSum=11.537;QD=2.50;ReadPosRankSum=-9.185	GT:AD:DP:GQ:PL	1/1:3,3:6:5.98:46,6,0	0/1:161,88:250:99:708,0,300	0/1:161,88:250:99:635,0,308	0/1:160,90:250:99:658,0,229	0/1:180,69:250:99:478,0,113	0/1:176,73:250:99:530,0,271	0/1:170,79:249:99:704,0,133
131 | chr22	42524435	rs1807313	T	A	5252.25	.	AC=3;AF=0.214;AN=14;BaseQRankSum=-0.192;DB;DP=1526;DS;Dels=0.01;FS=0.000;HRun=1;HaplotypeScore=152.3866;MQ=242.06;MQ0=0;MQRankSum=1.923;QD=9.99;ReadPosRankSum=3.008	GT:AD:DP:GQ:PL	0/1:7,19:26:99:456,0,195	0/0:250,0:250:99:0,698,8167	0/0:246,2:249:99:0,673,7735	0/0:248,2:250:99:0,685,7919	0/0:250,0:250:99:0,688,7814	0/1:120,126:247:99:2539,0,3250	0/1:131,110:246:99:2257,0,3278
132 | chr22	42524696	rs58440431	T	C	6423.61	.	AC=2;AF=0.143;AN=14;BaseQRankSum=3.119;DB;DP=1509;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=53.0005;MQ=230.78;MQ0=0;MQRankSum=2.825;QD=12.85;ReadPosRankSum=2.051	GT:AD:DP:GQ:PL	0/0:9,0:9:27.08:0,27,351	0/1:132,116:250:99:3341,0,3914	0/1:141,108:250:99:3082,0,3917	0/0:248,1:250:99:0,692,8578	0/0:250,0:250:99:0,743,8836	0/0:247,2:250:99:0,695,8726	0/0:249,1:250:99:0,699,8650
133 | chr22	42524947	rs3892097	C	T	731.18	.	AC=2;AF=0.143;AN=14;BaseQRankSum=0.602;DB;DP=1495;DS;Dels=0.01;FS=0.000;HRun=1;HaplotypeScore=154.5421;MQ=217.65;MQ0=0;MQRankSum=4.304;QD=1.47;ReadPosRankSum=1.019	GT:AD:DP:GQ:PL	0/0:3,0:3:8.99:0,9,89	0/1:108,75:244:99:403,0,1684	0/1:125,74:242:99:375,0,2335	0/0:227,1:249:99:0,460,5036	0/0:226,1:247:99:0,448,4884	0/0:192,1:247:99:0,400,4405	0/0:194,1:247:99:0,405,4694
134 | chr22	42525132	rs1058164	G	C	14639.91	.	AC=5;AF=0.357;AN=14;BaseQRankSum=4.944;DB;DP=1508;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=38.1229;MQ=207.02;MQ0=6;MQRankSum=2.510;QD=11.71;ReadPosRankSum=0.306	GT:AD:DP:GQ:PL	0/0:8,0:8:24.05:0,24,309	0/1:125,125:250:99:3147,0,3294	0/0:245,1:248:99:0,549,7172	0/1:139,109:248:99:2470,0,3232	0/1:136,107:243:99:2545,0,3408	0/1:116,130:247:99:3206,0,2926	0/1:122,124:247:99:3271,0,3300
135 | chr22	42525772	rs28371706	G	A	7552.52	.	AC=4;AF=0.286;AN=14;BaseQRankSum=12.028;DB;DP=1506;DS;Dels=0.01;FS=0.000;HRun=0;HaplotypeScore=89.8512;MQ=222.09;MQ0=0;MQRankSum=5.200;QD=9.99;ReadPosRankSum=2.275	GT:AD:DP:GQ:PL	0/1:4,2:6:29.34:29,0,147	0/0:249,0:249:99:0,592,6835	0/0:249,1:250:99:0,590,7041	0/0:248,0:248:99:0,652,7316	0/1:126,120:248:99:2668,0,2833	0/1:134,113:247:99:2453,0,2485	0/1:137,113:250:99:2403,0,2988
136 | chr22	42525798	rs28371705	G	C	1954.58	.	AC=2;AF=0.143;AN=14;BaseQRankSum=6.229;DB;DP=1509;DS;Dels=0.00;FS=0.000;HRun=1;HaplotypeScore=36.0442;MQ=228.55;MQ0=0;MQRankSum=0.852;QD=3.91;ReadPosRankSum=6.520	GT:AD:DP:GQ:PL	0/0:9,0:9:27.08:0,27,342	0/1:164,85:250:99:981,0,3519	0/1:171,79:250:99:1020,0,3665	0/0:249,1:250:99:0,526,6474	0/0:249,1:250:99:0,550,6481	0/0:248,2:250:99:0,542,6933	0/0:250,0:250:99:0,604,7282
137 | chr22	42525811	rs28371704	T	C	3688.26	.	AC=2;AF=0.143;AN=14;BaseQRankSum=4.752;DB;DP=1510;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=36.9902;MQ=210.28;MQ0=0;MQRankSum=2.309;QD=7.38;ReadPosRankSum=6.262	GT:AD:DP:GQ:PL	0/0:10,0:10:27.06:0,27,333	0/1:163,86:249:99:1958,0,3391	0/1:167,78:245:99:1730,0,3945	0/0:248,1:249:99:0,542,6887	0/0:246,1:247:99:0,550,6569	0/0:247,1:250:99:0,548,6954	0/0:249,1:250:99:0,557,7079
138 | chr22	42525821	rs28371703	G	T	3940.90	.	AC=2;AF=0.143;AN=14;BaseQRankSum=4.652;DB;DP=1510;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=34.0483;MQ=210.28;MQ0=0;MQRankSum=2.924;QD=7.88;ReadPosRankSum=5.487	GT:AD:DP:GQ:PL	0/0:10,0:10:24.08:0,24,317	0/1:164,85:250:99:2033,0,3659	0/1:167,79:249:99:1907,0,4271	0/0:249,1:250:99:0,565,7321	0/0:249,1:250:99:0,545,7102	0/0:248,2:250:99:0,536,7254	0/0:249,0:250:99:0,605,7633
139 | chr22	42525952	rs71328650	C	A	5872.92	.	AC=7;AF=0.500;AN=14;BaseQRankSum=25.986;DB;DP=1505;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=144.2979;MQ=173.55;MQ0=0;MQRankSum=3.660;QD=4.68;ReadPosRankSum=7.152	GT:AD:DP:GQ:PL	1/1:2,3:5:6:53,6,0	0/1:132,117:250:99:1397,0,702	0/0:248,1:250:99:0,245,2219	0/1:166,83:250:99:1151,0,934	0/1:164,86:250:99:1070,0,1147	0/1:170,80:250:99:1009,0,1141	0/1:162,87:250:99:1194,0,1085
140 | chr22	42526049	.	C	G	8544.41	.	AC=10;AF=0.714;AN=14;BaseQRankSum=-8.121;DP=1505;DS;Dels=0.01;FS=0.000;HRun=0;HaplotypeScore=241.7335;MQ=162.18;MQ0=2;MQRankSum=-1.399;QD=6.81;ReadPosRankSum=2.132	GT:AD:DP:GQ:PL	1/1:0,5:5:3:26,3,0	0/1:86,162:248:99:1053,0,1167	0/0:235,12:248:99:0,378,3886	0/1:108,137:245:99:782,0,1662	1/1:3,242:245:99:2351,264,0	1/1:5,245:250:99:2193,222,0	1/1:4,242:246:99:2140,240,0
141 | chr22	42526449	.	T	A	151.47	.	AC=1;AF=0.071;AN=14;BaseQRankSum=2.662;DP=1226;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=41.2083;MQ=240.47;MQ0=0;MQRankSum=0.578;QD=4.89;ReadPosRankSum=3.611	GT:AD:DP:GQ:PL	0/1:23,8:31:99:190,0,694	0/0:188,0:190:99:0,478,5376	0/0:187,0:187:99:0,493,5322	0/0:247,0:249:99:0,634,6728	0/0:185,0:185:99:0,487,5515	0/0:202,0:202:99:0,520,5857	0/0:181,1:182:99:0,440,5362
142 | chr22	42526484	rs28371699	A	C	4220.99	.	AC=6;AF=0.429;AN=14;BaseQRankSum=-17.855;DB;DP=1532;DS;Dels=0.02;FS=0.000;HRun=0;HaplotypeScore=136.8893;MQ=233.92;MQ0=0;MQRankSum=3.448;QD=3.29;ReadPosRankSum=-2.663	GT:AD:DP:GQ:PL	0/1:16,15:31:99:238,0,428	0/1:112,135:247:99:796,0,1908	0/0:227,13:241:99:0,433,4747	0/1:108,133:242:99:588,0,2014	0/1:90,154:245:99:1055,0,1892	0/1:112,131:246:99:741,0,2222	0/1:108,137:246:99:803,0,2266
143 | chr22	42526549	rs56011157	C	T	14276.31	.	AC=8;AF=0.571;AN=14;BaseQRankSum=17.750;DB;DP=1537;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=87.3394;MQ=231.34;MQ0=0;MQRankSum=4.781;QD=9.29;ReadPosRankSum=7.463	GT:AD:DP:GQ:PL	0/1:22,15:37:99:251,0,330	1/1:23,227:250:99:5404,430,0	0/1:151,98:250:99:1878,0,2475	0/1:153,97:250:99:1769,0,2410	0/1:149,100:250:99:1792,0,2569	0/1:164,84:250:99:1440,0,2646	0/1:149,98:248:99:1742,0,2601
144 | chr22	42526561	rs28695233	G	T	4524.61	.	AC=7;AF=0.500;AN=14;BaseQRankSum=9.714;DB;DP=1538;DS;Dels=0.00;FS=0.000;HRun=1;HaplotypeScore=98.8415;MQ=220.45;MQ0=0;MQRankSum=9.430;QD=3.02;ReadPosRankSum=7.682	GT:AD:DP:GQ:PL	0/0:22,15:38:15.74:0,16,609	1/1:4,240:249:99:2685,237,0	0/1:142,108:250:99:505,0,3133	0/1:138,109:249:99:521,0,3281	0/1:150,99:249:99:336,0,3601	0/1:153,93:250:99:194,0,3695	0/1:148,97:249:99:283,0,3093
145 | chr22	42526562	rs75276289	G	C	3780.51	.	AC=6;AF=0.429;AN=14;BaseQRankSum=15.200;DB;DP=1540;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=116.4370;MQ=215.67;MQ0=0;MQRankSum=9.072;QD=2.52;ReadPosRankSum=10.863	GT:AD:DP:GQ:PL	0/0:25,15:40:17.73:0,18,633	0/1:50,199:250:99:1522,0,283	0/1:143,106:250:99:600,0,2844	0/1:143,107:250:99:605,0,3002	0/1:151,99:250:99:432,0,3352	0/1:157,93:250:99:254,0,3483	0/1:149,99:248:99:368,0,2999
146 | chr22	42526567	rs76312385	G	A	434.33	.	AC=1;AF=0.071;AN=14;BaseQRankSum=18.089;DB;DP=1540;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=89.3746;MQ=219.80;MQ0=0;MQRankSum=6.196;QD=1.74;ReadPosRankSum=7.564	GT:AD:DP:GQ:PL	0/0:22,18:40:4.68:0,5,427	0/1:34,215:250:56.26:473,0,56	0/0:142,108:250:20.78:0,21,2288	0/0:142,108:250:49.48:0,49,2451	0/0:152,97:250:99:0,210,2801	0/0:150,100:250:34.96:0,35,2515	0/0:148,102:250:77.19:0,77,2590
147 | chr22	42526571	rs74644586	C	G	339.60	.	AC=1;AF=0.071;AN=14;BaseQRankSum=-11.480;DB;DP=1540;DS;Dels=0.02;FS=0.000;HRun=4;HaplotypeScore=93.3402;MQ=218.52;MQ0=0;MQRankSum=3.709;QD=1.36;ReadPosRankSum=6.322	GT:AD:DP:GQ:PL	0/0:22,18:40:36.46:0,36,689	0/1:4,232:239:30.49:378,0,30	0/0:138,110:249:99:0,295,4017	0/0:137,111:249:99:0,250,4041	0/0:147,97:245:99:0,321,4348	0/0:150,97:247:99:0,358,4657	0/0:144,101:247:99:0,275,4123
148 | chr22	42526573	rs1080996	T	G	12579.34	.	AC=8;AF=0.571;AN=14;BaseQRankSum=6.163;DB;DP=1540;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=76.6550;MQ=224.49;MQ0=0;MQRankSum=1.355;QD=8.17;ReadPosRankSum=5.794	GT:AD:DP:GQ:PL	0/1:22,18:40:99:200,0,668	1/1:4,244:248:99:5175,439,0	0/1:136,110:250:99:1862,0,3521	0/1:136,113:249:99:1734,0,3677	0/1:144,99:250:99:1119,0,3818	0/1:150,99:250:99:1196,0,4178	0/1:145,104:250:99:1293,0,3628
149 | chr22	42526580	rs1080995	G	C	16619.47	.	AC=8;AF=0.571;AN=14;BaseQRankSum=7.991;DB;DP=1541;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=56.1489;MQ=221.29;MQ0=0;MQRankSum=2.223;QD=10.78;ReadPosRankSum=4.443	GT:AD:DP:GQ:PL	0/1:22,19:41:99:335,0,664	1/1:15,234:250:99:5895,337,0	0/1:137,113:250:99:2421,0,3301	0/1:134,116:250:99:2262,0,3430	0/1:144,105:250:99:1929,0,3421	0/1:148,101:250:99:1778,0,3867	0/1:142,108:250:99:1999,0,3334
150 | chr22	42526634	.	T	C	32.60	.	AC=1;AF=0.071;AN=14;BaseQRankSum=1.147;DP=1225;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=50.0151;MQ=240.65;MQ0=0;MQRankSum=1.151;QD=1.30;ReadPosRankSum=1.276	GT:AD:DP:GQ:PL	0/1:21,4:25:71.04:71,0,702	0/0:187,2:189:99:0,481,6080	0/0:233,0:233:99:0,667,7351	0/0:230,0:230:99:0,667,7394	0/0:174,1:175:99:0,446,5469	0/0:194,2:196:99:0,498,6239	0/0:174,0:175:99:0,511,5894
151 | chr22	42526679	.	G	C	60.60	.	AC=1;AF=0.071;AN=14;BaseQRankSum=-12.425;DP=1525;DS;Dels=0.09;FS=0.000;HRun=1;HaplotypeScore=331.3182;MQ=215.48;MQ0=0;MQRankSum=-14.680;QD=0.24;ReadPosRankSum=-13.323	GT:AD:DP:GQ:PL	0/0:23,0:23:66.17:0,66,829	0/1:175,56:232:99:99,0,4273	0/0:199,26:226:76.45:0,76,5104	0/0:196,37:233:41.98:0,42,5109	0/0:170,47:218:99:0,162,4505	0/0:188,36:224:99:0,230,4974	0/0:177,47:225:99:0,167,4592
152 | chr22	42526694	rs1065852	G	A	4420.63	.	AC=2;AF=0.143;AN=14;BaseQRankSum=8.566;DB;DP=1529;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=117.6833;MQ=214.96;MQ0=0;MQRankSum=5.852;QD=8.84;ReadPosRankSum=1.454	GT:AD:DP:GQ:PL	0/0:29,0:29:81.24:0,81,1040	0/1:136,114:250:99:2333,0,3170	0/1:145,104:250:99:2087,0,2794	0/0:250,0:250:99:0,586,6963	0/0:247,2:250:99:0,497,6185	0/0:248,2:250:99:0,544,6640	0/0:250,0:250:99:0,571,6444
153 | chr22	42527471	rs28633410	T	C	26831.16	.	AC=10;AF=0.833;AN=12;BaseQRankSum=-1.092;DB;DP=1501;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=79.3853;MQ=176.86;MQ0=0;MQRankSum=-2.644;QD=17.89;ReadPosRankSum=2.185	GT:AD:DP:GQ:PL	./.	1/1:1,249:250:99:5741,478,0	0/1:102,148:250:99:3026,0,1748	0/1:115,132:250:99:2716,0,1896	1/1:1,249:250:99:5040,392,0	1/1:1,248:250:99:5109,427,0	1/1:4,245:249:99:5199,306,0
154 | chr22	42527533	rs28624811	A	G	13619.46	.	AC=7;AF=0.583;AN=12;BaseQRankSum=-8.893;DB;DP=1501;DS;Dels=0.01;FS=0.000;HRun=1;HaplotypeScore=86.1948;MQ=179.18;MQ0=0;MQRankSum=0.472;QD=9.08;ReadPosRankSum=0.778	GT:AD:DP:GQ:PL	./.	1/1:2,241:243:99:4171,416,0	0/1:113,132:245:99:2000,0,2018	0/1:120,126:246:99:1781,0,1970	0/1:131,118:249:99:1885,0,1784	0/1:122,126:248:99:1893,0,1807	0/1:122,127:249:99:1890,0,2119
155 | chr22	42527793	rs1080989	C	T	3454.66	.	AC=2;AF=0.167;AN=12;BaseQRankSum=-3.007;DB;DP=1074;DS;Dels=0.01;FS=0.000;HRun=1;HaplotypeScore=75.7865;MQ=209.00;MQ0=0;MQRankSum=3.014;QD=9.36;ReadPosRankSum=0.618	GT:AD:DP:GQ:PL	./.	0/1:72,90:162:99:1699,0,1767	0/1:103,96:202:99:1756,0,2532	0/0:188,0:188:99:0,526,5889	0/0:160,0:160:99:0,457,4983	0/0:197,0:198:99:0,544,6100	0/0:156,0:156:99:0,439,5041
156 | chr22	42527891	.	T	A	109.83	.	AC=5;AF=0.417;AN=12;BaseQRankSum=11.235;DP=1500;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=638.4601;MQ=166.82;MQ0=0;MQRankSum=1.444;QD=0.09;ReadPosRankSum=0.839	GT:AD:DP:GQ:PL	./.	0/1:238,7:248:13.70:14,0,38	0/0:246,3:250:5.97:0,6,45	0/1:239,11:250:31.42:31,0,54	0/1:232,16:250:49.09:49,0,76	0/1:233,14:249:52.10:52,0,53	0/1:238,11:250:12.71:13,0,36
157 | 


--------------------------------------------------------------------------------
/test/test_vcf.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import doctest
  3 | import os
  4 | import commands
  5 | from StringIO import StringIO
  6 | 
  7 | import cyvcf
  8 | from cyvcf import utils
  9 | 
# Collect the doctests embedded in cyvcf.parser into a unittest-compatible
# suite object.
# NOTE(review): nothing in this part of the file runs `suite` explicitly --
# confirm how (or whether) the test runner picks it up.
suite = doctest.DocTestSuite(cyvcf.parser)
 11 | 
 12 | 
 13 | def fh(fname):
 14 |     return file(os.path.join(os.path.dirname(__file__), fname))
 15 | 
 16 | 
 17 | class TestVcfSpecs(unittest.TestCase):
 18 | 
 19 |     def test_vcf_4_0(self):
 20 |         reader = cyvcf.Reader(fh('example-4.0.vcf'))
 21 |         assert reader.metadata['fileformat'] == 'VCFv4.0'
 22 | 
 23 |         # test we can walk the file at least
 24 |         for r in reader:
 25 | 
 26 |             if r.POS == 1230237:
 27 |                 assert r.is_monomorphic
 28 |             else:
 29 |                 assert not r.is_monomorphic
 30 | 
 31 |             if 'AF' in r.INFO:
 32 |                 self.assertEqual(type(r.INFO['AF']),  type([]))
 33 | 
 34 |             for c in r:
 35 |                 assert c
 36 | 
 37 |                 # issue 19, in the example ref the GQ is length 1
 38 |                 if c.called:
 39 |                     self.assertEqual(type(c.data['GQ']),  type(1))
 40 |                     if 'HQ' in c.data and c.data['HQ'] is not None:
 41 |                         self.assertEqual(type(c.data['HQ']),  type([]))
 42 | 
 43 | 
 44 | 
 45 |     def test_vcf_4_1(self):
 46 |         return
 47 |         reader = cyvcf.Reader(fh('example-4.1.vcf'))
 48 |         self.assertEqual(reader.metadata['fileformat'],  'VCFv4.1')
 49 | 
 50 |         # contigs were added in vcf4.1
 51 |         # probably need to add a reader.contigs attribute
 52 |         assert 'contig' in reader.metadata
 53 | 
 54 |         # test we can walk the file at least
 55 |         for r in reader:
 56 |             for c in r:
 57 |                 assert c
 58 | 
 59 |         # asserting False while I work out what to check
 60 |         assert False
 61 | 
 62 |     def test_vcf_4_1_sv(self):
 63 |         return
 64 | 
 65 |         reader = cyvcf.Reader(fh('example-4.1-sv.vcf'))
 66 | 
 67 |         assert 'SVLEN' in reader.infos
 68 | 
 69 |         # test we can walk the file at least
 70 |         for r in reader:
 71 |             print r
 72 |             for c in r:
 73 |                 print c
 74 |                 assert c
 75 | 
 76 |         # asserting False while I work out what to check
 77 |         assert False
 78 | 
 79 | 
 80 | class TestGatkOutput(unittest.TestCase):
 81 | 
 82 |     filename = 'gatk.vcf'
 83 | 
 84 |     samples = ['BLANK', 'NA12878', 'NA12891', 'NA12892',
 85 |             'NA19238', 'NA19239', 'NA19240']
 86 |     formats = ['AD', 'DP', 'GQ', 'GT', 'PL']
 87 |     infos = ['AC', 'AF', 'AN', 'BaseQRankSum', 'DB', 'DP', 'DS',
 88 |             'Dels', 'FS', 'HRun', 'HaplotypeScore', 'InbreedingCoeff',
 89 |             'MQ', 'MQ0', 'MQRankSum', 'QD', 'ReadPosRankSum']
 90 | 
 91 |     n_calls = 37
 92 | 
 93 |     def setUp(self):
 94 |         self.reader = cyvcf.Reader(fh(self.filename))
 95 | 
 96 |     def testSamples(self):
 97 |         self.assertEqual(self.reader.samples, self.samples)
 98 | 
 99 |     def testFormats(self):
100 |         self.assertEqual(set(self.reader.formats), set(self.formats))
101 | 
102 |     def testInfos(self):
103 |         self.assertEqual(set(self.reader.infos), set(self.infos))
104 | 
105 | 
106 |     def testCalls(self):
107 |         n = 0
108 | 
109 |         for site in self.reader:
110 |             n += 1
111 |             self.assertEqual(len(site.samples), len(self.samples))
112 | 
113 | 
114 |             # check sample name lookup
115 |             for s in self.samples:
116 |                 assert site.genotype(s)
117 | 
118 |             # check ordered access
119 |             self.assertEqual([x.sample for x in site.samples], self.samples)
120 |         self.assertEqual(len(site.gt_phred_likelihoods), len(self.samples))
121 |         self.assertEqual(n,  self.n_calls)
122 | 
123 | 
class TestFreebayesOutput(TestGatkOutput):
    """Run the TestGatkOutput checks against the freebayes fixture.

    Only the expected header contents and record count differ; the sample
    list is inherited from TestGatkOutput.
    """

    filename = 'freebayes.vcf'
    formats = ['AO', 'DP', 'GL', 'GLE', 'GQ', 'GT', 'QA', 'QR', 'RO']
    infos = ['AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'BVAR', 'CIGAR',
            'DB', 'DP', 'DPRA', 'EPP', 'EPPR', 'HWE', 'LEN', 'MEANALT',
            'NUMALT', 'RPP', 'MQMR', 'ODDS', 'MQM', 'PAIREDR', 'PAIRED',
            'SAP', 'XRM', 'RO', 'REPEAT', 'XRI', 'XAS', 'XAI', 'SRP',
            'XAM', 'XRS', 'RPPR', 'NS', 'RUN', 'CpG', 'TYPE']
    n_calls = 104

    def testParse(self):
        # Use the class-level fixture name instead of repeating the literal,
        # so a subclass overriding ``filename`` is honoured here as well.
        reader = cyvcf.Reader(fh(self.filename))
        self.assertEqual(len(reader.samples), len(self.samples))

        n = 0
        for r in reader:
            n += 1
            for x in r:
                self.assertTrue(x)
        self.assertEqual(n, self.n_calls)
146 | 
class TestSamtoolsOutput(unittest.TestCase):
    """Check sample and record counts for the samtools fixture."""

    def testParse(self):
        vcf = cyvcf.Reader(fh('samtools.vcf'))

        n_samples = len(vcf.samples)
        self.assertEqual(n_samples, 1)

        # count the records by walking the reader to exhaustion
        n_records = 0
        for _record in vcf:
            n_records += 1
        self.assertEqual(n_records, 11)
154 | 
155 | 
class Test1kg(unittest.TestCase):
    """Check that the gzipped 1kg fixture can be read end to end."""

    def testParse(self):
        vcf = cyvcf.Reader(fh('1kg.vcf.gz'))

        n_samples = len(vcf.samples)
        self.assertEqual(n_samples, 629)

        # drain the reader; the only expectation is that nothing raises
        for _record in vcf:
            continue
164 | 
165 | 
class TestWriter(unittest.TestCase):
    """Round-trip test: write gatk.vcf through cyvcf.Writer and re-read it."""

    def testWrite(self):

        reader = cyvcf.Reader(fh('gatk.vcf'))
        out = StringIO()
        writer = cyvcf.Writer(out, reader)

        records = list(reader)

        # Explicit loop rather than map(): the calls are made purely for
        # their side effect, and map() is lazy on Python 3, where it would
        # write nothing at all.
        for record in records:
            writer.write_record(record)
        out.seek(0)
        reader2 = cyvcf.Reader(out)

        # assertEqual instead of the deprecated assertEquals alias
        self.assertEqual(reader.samples, reader2.samples)
        self.assertEqual(reader.formats, reader2.formats)

        for k in reader.infos:
            self.assertEqual(reader.infos[k], reader2.infos[k],
                             (reader.infos[k], reader2.infos[k]))

        for l, r in zip(records, reader2):
            self.assertEqual(l.samples, r.samples)
188 | 
class TestRecord(unittest.TestCase):
    """Per-record property checks, keyed by record position (POS).

    Each test reads a fixture, evaluates one record attribute on every
    record, and compares it with the expected value for the positions
    listed in the test.  Positions not listed are read but not checked.
    """

    _SPEC_VCF = 'example-4.0.vcf'
    _SV_VCF = 'example-4.1-sv.vcf'

    # positions with expectations in example-4.0.vcf
    _SPEC_POS = (14370, 17330, 1110696, 1230237, 1234567)
    # positions with expectations in example-4.1-sv.vcf
    _SV_POS = (2827693, 321682, 14477084, 9425916, 12665100, 18665128)

    def _check(self, filename, attr, expected):
        # Compare `attr` of each record in `filename` with expected[POS].
        for var in cyvcf.Reader(fh(filename)):
            # the attribute is evaluated for every record, listed or not
            actual = getattr(var, attr)
            if var.POS in expected:
                self.assertEqual(expected[var.POS], actual)

    def _check_spec(self, attr, values):
        # `values` are given in the order of _SPEC_POS.
        self._check(self._SPEC_VCF, attr, dict(zip(self._SPEC_POS, values)))

    def _check_sv(self, attr, values):
        # `values` are given in the order of _SV_POS.
        self._check(self._SV_VCF, attr, dict(zip(self._SV_POS, values)))

    def test_num_calls(self):
        for var in cyvcf.Reader(fh(self._SPEC_VCF)):
            # the four genotype categories must account for every sample
            num_calls = sum((var.num_hom_ref, var.num_hom_alt,
                             var.num_het, var.num_unknown))
            self.assertEqual(len(var.samples), num_calls)

    def test_call_rate(self):
        self._check_spec('call_rate',
                         [3.0/3.0, 3.0/3.0, 3.0/3.0, 3.0/3.0, 2.0/3.0])

    def test_aaf(self):
        self._check_spec('aaf', [3.0/6.0, 1.0/6.0, None, 0.0/6.0, None])

    def test_pi(self):
        self._check_spec('nucl_diversity',
                         [6.0/10.0, 1.0/3.0, None, 0.0/6.0, None])

    def test_is_snp(self):
        self._check_spec('is_snp', [True, True, True, False, False])

    def test_is_indel(self):
        self._check_spec('is_indel', [False, False, False, True, True])

    def test_is_transition(self):
        self._check_spec('is_transition', [True, False, False, False, False])

    def test_is_deletion(self):
        self._check_spec('is_deletion', [False, False, False, True, False])

    def test_var_type(self):
        self._check_spec('var_type', ["snp", "snp", "snp", "indel", "indel"])
        # SV tests
        self._check_sv('var_type', ["sv"] * len(self._SV_POS))

    def test_var_subtype(self):
        self._check_spec('var_subtype',
                         ["ts", "tv", "unknown", "del", "unknown"])
        # SV tests
        self._check_sv('var_subtype',
                       ["DEL", "DEL", "DEL:ME:ALU", "INS:ME:L1",
                        "DUP", "DUP:TANDEM"])

    def test_is_sv(self):
        self._check_sv('is_sv', [True] * len(self._SV_POS))
        self._check_spec('is_sv', [False] * len(self._SPEC_POS))

    def test_is_sv_precise(self):
        self._check_sv('is_sv_precise',
                       [True, False, False, False, False, False])
        self._check_spec('is_sv_precise', [False] * len(self._SPEC_POS))

    def test_sv_end(self):
        self._check_sv('sv_end',
                       [2827680, 321887, 14477381, 9425916,
                        12686200, 18665204])
        self._check_spec('sv_end', [None] * len(self._SPEC_POS))
458 | 
459 | 
class TestCall(unittest.TestCase):
    """Checks the genotype accessors against known fixture records."""

    def _check_by_position(self, filename, extract, expected):
        """Assert ``extract(record) == expected[record.POS]`` for listed sites.

        ``extract`` is evaluated for every record of the fixture, so accessor
        errors surface anywhere in the file; records whose POS is not a key
        of ``expected`` are otherwise ignored.
        """
        for var in cyvcf.Reader(fh(filename)):
            observed = extract(var)
            if var.POS in expected:
                self.assertEqual(expected[var.POS], observed)

    @staticmethod
    def _per_sample(attr):
        """Build an extractor listing ``attr`` for each sample of a record."""
        return lambda var: [getattr(s, attr) for s in var.samples]

    def test_phased(self):
        self._check_by_position('example-4.0.vcf', lambda var: var.gt_phases, {
            14370: [True, True, False],
            17330: [True, True, False],
            1110696: [True, True, False],
            1230237: [True, True, False],
            1234567: [False, False, False],
        })

    def test_gt_bases(self):
        self._check_by_position('example-4.0.vcf', self._per_sample('gt_bases'), {
            14370: ['G|G', 'A|G', 'A/A'],
            17330: ['T|T', 'T|A', 'T/T'],
            1110696: ['G|T', 'T|G', 'T/T'],
            1230237: ['T|T', 'T|T', 'T/T'],
            1234567: [None, 'GTCT/GTACT', 'G/G'],
        })

    def test_gt_types(self):
        self._check_by_position('example-4.0.vcf', self._per_sample('gt_type'), {
            14370: [0, 1, 3],
            17330: [0, 1, 0],
            1110696: [1, 1, 3],
            1230237: [0, 0, 0],
            1234567: [None, 1, 3],
        })

    def test_gt_depths(self):
        self._check_by_position('example-4.0.vcf', self._per_sample('gt_depth'), {
            14370: [1, 8, 5],
            17330: [3, 5, 3],
            1110696: [6, 0, 4],
            1230237: [7, 4, 2],
            1234567: [4, 2, 3],
        })

    def test_gt_ref_depths(self):
        self._check_by_position('gatk.vcf', self._per_sample('gt_ref_depth'), {
            42522392: [6, 138, 169, 249, 248, 250, 250],
            42522613: [13, 118, 241, 161, 110, 106, 116],
            42527891: [-1, 238, 246, 239, 232, 233, 238],
        })

    def test_gt_alt_depths(self):
        self._check_by_position('gatk.vcf', self._per_sample('gt_alt_depth'), {
            42522392: [0, 107, 77, 0, 1, 0, 0],
            42522613: [4, 127, 0, 85, 132, 135, 126],
            42527891: [-1, 7, 3, 11, 16, 14, 11],
        })

    def test_gt_quals(self):
        self._check_by_position('gatk.vcf', self._per_sample('gt_qual'), {
            42522392: [18.04, 99, 99, 99, 99, 99, 99],
            42522613: [62.64, 99, 99, 99, 99, 99, 99],
            42527891: [-1, 13.70, 5.97, 31.42, 49.09, 52.10, 12.71],
        })
562 | 
563 | 
class TestTabix(unittest.TestCase):
    """Region and single-site queries on the tabix-indexed fixture.

    The fetch tests need pysam; they are reported as skipped when it is
    not available.
    """

    def setUp(self):
        self.reader = cyvcf.Reader(fh('tb.vcf.gz'))
        # Not stored as ``self.run``: that name would shadow TestCase.run().
        self.has_pysam = cyvcf.parser.pysam is not None

    def _require_pysam(self):
        """Skip the current test when pysam could not be imported."""
        if not self.has_pysam:
            self.skipTest('pysam is not installed')

    def testFetchRange(self):
        self._require_pysam()
        lines = list(self.reader.fetch('20', 14370, 14370))
        self.assertEqual(len(lines), 1)
        self.assertEqual(lines[0].POS, 14370)

        lines = list(self.reader.fetch('20', 14370, 17330))
        self.assertEqual(len(lines), 2)
        self.assertEqual(lines[0].POS, 14370)
        self.assertEqual(lines[1].POS, 17330)

        lines = list(self.reader.fetch('20', 1110695, 1234567))
        self.assertEqual(len(lines), 3)

    def testFetchSite(self):
        self._require_pysam()
        site = self.reader.fetch('20', 14370)
        self.assertEqual(site.POS, 14370)

        # no record at this position
        site = self.reader.fetch('20', 14369)
        self.assertIsNone(site)
596 | 
597 | 
598 | 
599 | 
class TestOpenMethods(unittest.TestCase):
    """A Reader can be opened from a file handle or from a filename."""

    samples = 'NA00001 NA00002 NA00003'.split()

    @staticmethod
    def _fixture_path(name):
        """Return the path of a fixture stored next to this test module.

        Resolving against ``__file__`` keeps the filename-based tests
        independent of the current working directory.
        """
        return os.path.join(os.path.dirname(__file__), name)

    def testOpenFilehandle(self):
        r = cyvcf.Reader(fh('example-4.0.vcf'))
        self.assertEqual(self.samples, r.samples)
        self.assertEqual('example-4.0.vcf', os.path.split(r.filename)[1])

    def testOpenFilename(self):
        r = cyvcf.Reader(filename=self._fixture_path('example-4.0.vcf'))
        self.assertEqual(self.samples, r.samples)

    def testOpenFilehandleGzipped(self):
        r = cyvcf.Reader(fh('tb.vcf.gz'))
        self.assertEqual(self.samples, r.samples)

    def testOpenFilenameGzipped(self):
        r = cyvcf.Reader(filename=self._fixture_path('tb.vcf.gz'))
        self.assertEqual(self.samples, r.samples)
620 | 
621 | 
622 | class TestFilter(unittest.TestCase):
623 | 
624 | 
625 |     def testApplyFilter(self):
626 |         s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 test/example-4.0.vcf sq')
627 |         #print out
628 |         assert s == 0
629 |         buf = StringIO()
630 |         buf.write(out)
631 |         buf.seek(0)
632 | 
633 |         print buf.getvalue()
634 |         reader = cyvcf.Reader(buf)
635 | 
636 | 
637 |         # check filter got into output file
638 |         assert 'sq30' in reader.filters
639 | 
640 |         print reader.filters
641 | 
642 |         # check sites were filtered
643 |         n = 0
644 |         for r in reader:
645 |             if r.QUAL < 30:
646 |                 assert 'sq30' in r.FILTER
647 |                 n += 1
648 |             else:
649 |                 assert r.FILTER is None or 'sq30' not in r.FILTER
650 |         assert n == 2
651 | 
652 | 
653 |     def testApplyMultipleFilters(self):
654 |         s, out = commands.getstatusoutput('python scripts/vcf_filter.py --site-quality 30 '
655 |         '--genotype-quality 50 test/example-4.0.vcf sq mgq')
656 |         assert s == 0
657 |         #print out
658 |         buf = StringIO()
659 |         buf.write(out)
660 |         buf.seek(0)
661 |         reader = cyvcf.Reader(buf)
662 | 
663 |         print reader.filters
664 | 
665 |         assert 'mgq50' in reader.filters
666 |         assert 'sq30' in reader.filters
667 | 
668 | 
class TestRegression(unittest.TestCase):
    """Regression tests for previously reported parsing/writing problems."""

    def test_issue_16(self):
        # the first record of this fixture must parse with a null QUAL
        reader = cyvcf.Reader(fh('issue-16.vcf'))
        self.assertIsNone(next(reader).QUAL)

    def test_null_mono(self):
        # null qualities were written as blank, causing subsequent parse to fail
        p = cyvcf.Reader(fh('null_genotype_mono.vcf'))
        assert p.samples
        out = StringIO()
        writer = cyvcf.Writer(out, p)
        # explicit loop: map() is for building results, and is lazy on Python 3
        for record in p:
            writer.write_record(record)
        out.seek(0)
        # the written output must be readable again, samples included
        p2 = cyvcf.Reader(out)
        rec = next(p2)
        assert rec.samples
688 | 
689 | 
class TestUtils(unittest.TestCase):
    """Tests for ``utils.walk_together``."""

    def test_walk(self):
        # easy case: three readers over the same file walk in lockstep
        readers = [cyvcf.Reader(fh('example-4.0.vcf')) for _ in range(3)]

        steps = 0
        for group in utils.walk_together(*readers):
            assert len(group) == 3
            assert (group[0] == group[1]) and (group[1] == group[2])
            steps += 1
        assert steps == 5

        # artificial case 2 from the left, 2 from the right, 2 together, 1 from the right, 1 from the left
        # l = only the left reader has a record, r = only the right, t = both
        expected = 'llrrttrl'
        left = cyvcf.Reader(fh('walk_left.vcf'))
        right = cyvcf.Reader(fh('example-4.0.vcf'))

        for code, recs in zip(expected, utils.walk_together(left, right)):
            if code == 'l':
                assert recs[0] is not None
                assert recs[1] is None
            elif code == 'r':
                assert recs[1] is not None
                assert recs[0] is None
            elif code == 't':
                assert recs[0] is not None
                assert recs[1] is not None
722 | 
723 | 
class TestAD(unittest.TestCase):
    """Reference-depth accessor on the ``test.vcf`` fixture."""

    def setUp(self):
        self.reader = cyvcf.Reader(fh('test.vcf'))

    def testRefDepth(self):
        """The first sample of the first record reports a ref depth of -1."""
        first_record = self.reader.next()
        first_sample = first_record.samples[0]
        self.assertEqual(first_sample.gt_ref_depth, -1)
731 | 
class TestGLInt(unittest.TestCase):
    """Phred-likelihood accessor on the ``test-gl.vcf`` fixture."""

    def setUp(self):
        self.reader = cyvcf.Reader(fh('test-gl.vcf'))

    def testGLInt(self):
        """The first sample of the first record has no usable likelihoods."""
        first_record = next(self.reader)
        first_sample = first_record.samples[0]
        self.assertEqual(first_sample.gt_phred_likelihoods, None)
738 | 
739 | 
740 | 
# Register the test cases with the module-level ``suite``, in this order.
for _case in (
        TestAD,
        TestGatkOutput,
        TestFreebayesOutput,
        TestSamtoolsOutput,
        TestWriter,
        TestTabix,
        TestOpenMethods,
        # TestFilter,  (currently disabled)
        Test1kg,
        TestRecord,
        TestCall,
        TestRegression,
        TestGLInt,
):
    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(_case))
754 | 


--------------------------------------------------------------------------------
/cyvcf/parser.pyx:
--------------------------------------------------------------------------------
   1 | import collections
   2 | import re
   3 | import csv
   4 | import gzip
   5 | import sys
   6 | import itertools
   7 | 
   8 | from . import utils
   9 | 
  10 | try:
  11 |     import pysam
  12 | except ImportError:
  13 |     pysam = None
  14 | 
  15 | # Metadata parsers/constants
# Reserved INFO keys mapped to the name of their value type.
RESERVED_INFO = {
    'AA': 'String', 'AC': 'Integer', 'AF': 'Float', 'AN': 'Integer',
    'BQ': 'Float', 'CIGAR': 'String', 'DB': 'Flag', 'DP': 'Integer',
    'END': 'Integer', 'H2': 'Flag', 'MQ': 'Float', 'MQ0': 'Integer',
    'NS': 'Integer', 'SB': 'String', 'SOMATIC': 'Flag', 'VALIDATED': 'Flag'
}

# Reserved per-sample FORMAT keys mapped to the name of their value type.
RESERVED_FORMAT = {
    'GT': 'String', 'DP': 'Integer', 'FT': 'String', 'GL': 'Float',
    'GQ': 'Float', 'HQ': 'Float'
}

# Records produced by _vcf_metadata_parser for ##INFO, ##FILTER and ##FORMAT
# header lines.
Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc'])
Filter = collections.namedtuple('Filter', ['id', 'desc'])
Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc'])

# Genotype type codes; _Call.gt_type returns HOM_REF, HET or HOM_ALT.
# NOTE(review): _Call.gt_type returns None (not UNKNOWN) for uncalled
# genotypes -- confirm where UNKNOWN is meant to be used.
HOM_REF = 0
HET = 1
HOM_ALT = 3
UNKNOWN = 2
  36 | 
  37 | cdef _Call _parse_sample(char *sample, list samp_fmt,
  38 |                                list samp_fmt_types, list samp_fmt_nums,
  39 |                                char *name, Record rec):
  40 | 
  41 |     cdef dict sampdict = {x: None for x in samp_fmt}
  42 |     cdef list lvals
  43 | 
  44 |     cdef list svals = sample.split(":")
  45 | 
  46 |     cdef int i
  47 |     cdef int N = len(svals)
  48 |     for i in range(N):
  49 |         fmt = samp_fmt[i]
  50 |         entry_type = samp_fmt_types[i]
  51 |         entry_num = samp_fmt_nums[i]
  52 |         vals = svals[i]
  53 | 
  54 |         # short circuit the most common
  55 |         if vals == ".":
  56 |             continue
  57 |         if vals == "./.":
  58 |             continue
  59 |         if vals == "":
  60 |             continue
  61 | 
  62 | 
  63 |         # we don't need to split single entries
  64 |         if entry_num == 1 or (entry_num is None and ',' not in vals):
  65 |             if entry_type == 'Integer':
  66 |                 if vals.isdigit():
  67 |                     sampdict[fmt] = int(vals)
  68 |                     continue
  69 |                 try:
  70 |                     sampdict[fmt] = float(vals)
  71 |                 except ValueError:
  72 |                     sampdict[fmt] = vals
  73 |                 continue
  74 |             elif entry_type == 'Float':
  75 |                 sampdict[fmt] = float(vals)
  76 |             else:
  77 |                 sampdict[fmt] = vals
  78 | 
  79 |             continue
  80 | 
  81 |         if entry_num == 1 and entry_type == 'String':
  82 |             sampdict[fmt] = vals
  83 |             continue
  84 | 
  85 |         lvals = vals.split(',')
  86 | 
  87 |         if entry_type == 'Integer':
  88 |             sampdict[fmt] = [int(x) if x != '.' else '.' for x in lvals]
  89 |         elif entry_type in ('Float', 'Numeric'):
  90 |             sampdict[fmt] = [float(x) if x != '.' else '.' for x in lvals]
  91 |         else:
  92 |             sampdict[fmt] = vals
  93 |     return _Call(rec, name, sampdict)
  94 | 
  95 | cdef inline list _map(func, list iterable, char *bad='.'):
  96 |     '''``map``, but make bad values None.'''
  97 |     return [func(x) if x != bad else None for x in iterable]
  98 | 
  99 | class _vcf_metadata_parser(object):
 100 |     '''Parse the metadat in the header of a VCF file.'''
 101 |     def __init__(self):
 102 |         super(_vcf_metadata_parser, self).__init__()
 103 |         self.info_pattern = re.compile(r'''\#\#INFO=<
 104 |             ID=(?P<id>[^,]+),
 105 |             Number=(?P<number>-?\d+|\.|[ARG]),
 106 |             Type=(?P<type>Integer|Float|Flag|Character|String),
 107 |             Description="(?P<desc>[^"]*)"
 108 |             >''', re.VERBOSE)
 109 |         self.filter_pattern = re.compile(r'''\#\#FILTER=<
 110 |             ID=(?P<id>[^,]+),
 111 |             Description="(?P<desc>[^"]*)"
 112 |             >''', re.VERBOSE)
 113 |         self.format_pattern = re.compile(r'''\#\#FORMAT=<
 114 |             ID=(?P<id>.+),
 115 |             Number=(?P<number>-?\d+|\.|[ARG]),
 116 |             Type=(?P<type>.+),
 117 |             Description="(?P<desc>.*)"
 118 |             >''', re.VERBOSE)
 119 |         self.meta_pattern = re.compile(r'''##(?P<key>.+)=(?P<val>.+)''')
 120 | 
 121 |     def read_info(self, info_string):
 122 |         '''Read a meta-information INFO line.'''
 123 |         match = self.info_pattern.match(info_string)
 124 |         if not match:
 125 |             raise SyntaxError(
 126 |                 "One of the INFO lines is malformed: %s" % info_string)
 127 | 
 128 |         try:
 129 |             num = int(match.group('number'))
 130 |             if num < 0:
 131 |                 num = None
 132 |         except ValueError:
 133 |             num = None
 134 | 
 135 |         info = Info(match.group('id'), num,
 136 |                      match.group('type'), match.group('desc'))
 137 | 
 138 |         return (match.group('id'), info)
 139 | 
 140 |     def read_filter(self, filter_string):
 141 |         '''Read a meta-information FILTER line.'''
 142 |         match = self.filter_pattern.match(filter_string)
 143 |         if not match:
 144 |             raise SyntaxError(
 145 |                 "One of the FILTER lines is malformed: %s" % filter_string)
 146 | 
 147 |         filt = Filter(match.group('id'), match.group('desc'))
 148 | 
 149 |         return (match.group('id'), filt)
 150 | 
 151 |     def read_format(self, format_string):
 152 |         '''Read a meta-information FORMAT line.'''
 153 |         match = self.format_pattern.match(format_string)
 154 |         if not match:
 155 |             raise SyntaxError(
 156 |                 "One of the FORMAT lines is malformed: %s" % format_string)
 157 | 
 158 |         try:
 159 |             num = int(match.group('number'))
 160 |             if num < 0:
 161 |                 num = None
 162 |         except ValueError:
 163 |             num = None
 164 | 
 165 |         form = Format(match.group('id'), num,
 166 |                        match.group('type'), match.group('desc'))
 167 | 
 168 |         return (match.group('id'), form)
 169 | 
 170 |     def read_meta(self, meta_string):
 171 |         match = self.meta_pattern.match(meta_string)
 172 |         return match.group('key'), match.group('val')
 173 | 
 174 | 
 175 | cdef class _Call(object):
 176 |     """ A genotype call, a cell entry in a VCF file"""
 177 | 
 178 |     cdef public bytes sample   #NA12878
 179 |     cdef bytes gt_nums  #'0/1'
 180 |     # use bytes instead of char * because of C -> Python string complications
 181 |     # see: http://docs.cython.org/src/tutorial/strings.html
 182 |     cdef public Record site   #instance of Record
 183 |     cdef public dict data
 184 |     cdef public bint called, phased
 185 |     cdef list alleles
 186 | 
 187 |     def __cinit__(self, Record site, char *sample, dict data):
 188 |         #: The ``Record`` for this ``_Call``
 189 |         self.site = site
 190 |         #: The sample name
 191 |         self.sample = sample
 192 |         #: Dictionary of data from the VCF file
 193 |         self.data = data
 194 |         # '0/1', '0/0', etc.
 195 |         self.gt_nums = self.data['GT']
 196 |         # True if the GT is not ./.
 197 |         self.called = self.gt_nums is not None
 198 |         # True if the GT is phased (A|G, not A/G)
 199 |         self.phased = self.called and '|' in self.data['GT']
 200 | 
 201 |         if self.called:
 202 |             self.alleles = self.gt_nums.split('|' if self.phased else '/')
 203 |         else:
 204 |             self.alleles = []
 205 | 
 206 |     def __repr__(self):
 207 |         return "Call(sample=%s, GT=%s, GQ=%s)" % (self.sample, self.gt_nums, self.data.get('GQ', ''))
 208 | 
 209 |     def __richcmp__(self, other, int op):
 210 |         """ Two _Calls are equal if their Records are equal
 211 |             and the samples and ``gt_type``s are the same
 212 |         """
 213 |         # < 0 | <= 1 | == 2 | != 3 |  > 4 | >= 5
 214 |         if op == 2: # 2
 215 |             return (self.site == other.site
 216 |                     and self.sample == other.sample
 217 |                     and self.gt_type == other.gt_type)
 218 | 
 219 |     def __getitem__(self, key):
 220 |         """ Lookup value, backwards compatibility """
 221 |         return self.data[key]
 222 | 
 223 |     property gt_bases:
 224 |         def __get__(self):
 225 |             '''The actual genotype alleles.
 226 |                E.g. if VCF genotype is 0/1, return A/G
 227 |             '''
 228 |             # nothing to do if no genotype call
 229 |             if self.called:
 230 |                 # grab the numeric alleles of the gt string; tokenize by phasing
 231 |                 # lookup and return the actual DNA alleles
 232 |                 phase_char = ['/', '|'][self.phased]
 233 |                 try:
 234 |                     return phase_char.join([self.site.alleles[int(a)] \
 235 |                                            if a != '.' else '.' for a in
 236 |                                            self.alleles])
 237 |                 except KeyError:
 238 |                     sys.stderr.write("Allele number not found in list of alleles\n")
 239 |             else:
 240 |                 return None
 241 | 
 242 |     property gt_type:
 243 | 
 244 |         def __get__(self):
 245 |             '''The type of genotype.
 246 |                0 / 00000000 hom ref
 247 |                1 / 00000001 het
 248 |                2 / 00000010 missing
 249 |                3 / 00000011 hom alt
 250 |                hom_ref  = 0
 251 |                het      = 1
 252 |                hom_alt  = 3  (we don;t track _which+ ALT)
 253 |                uncalled = 2
 254 |             '''
 255 |             # extract the numeric alleles of the gt string
 256 |             gt_type = None
 257 |             if self.called:
 258 |                 # grab the numeric alleles of the gt string; tokenize by phasing
 259 | 
 260 |                 if len(self.alleles) == 2:
 261 |                     if self.alleles[0] == self.alleles[1]:
 262 |                         if self.alleles[0] == "0":
 263 |                             gt_type = HOM_REF
 264 |                         else:
 265 |                             gt_type = HOM_ALT
 266 |                     else:
 267 |                         gt_type = HET
 268 |                 elif len(self.alleles) == 1:
 269 |                     if self.alleles[0] == "0":
 270 |                         gt_type = HOM_REF
 271 |                     else:
 272 |                         gt_type = HOM_ALT
 273 | 
 274 |             return gt_type
 275 | 
 276 |     property gt_depth:
 277 |         def __get__(self):
 278 |             '''The depth of aligned sequences that led to the genotype
 279 |             call for this sample.
 280 |             '''
 281 |             # extract the numeric alleles of the gt string
 282 |             try:
 283 |                 depth = self.data['DP']
 284 |                 if depth is not None:
 285 |                     return depth
 286 |                 else:
 287 |                     return -1
 288 |             except KeyError:
 289 |                 return -1
 290 | 
 291 |     property gt_ref_depth:
 292 |         def __get__(self):
 293 |             '''The depth of aligned sequences that supported the
 294 |             reference allele for this sample.
 295 |             '''
 296 |             # extract the numeric alleles of the gt string
 297 |             if 'AD' in self.data:
 298 |                 depths = self.data['AD']
 299 |                 if depths is not None:
 300 |                     # require bi-allelic
 301 |                     if isinstance(depths, (list, tuple)) and len(depths) == 2:
 302 |                         d = depths[0]
 303 |                         if d is None:
 304 |                             return -1
 305 |                         return d
 306 |                     else:
 307 |                         # ref allele is first
 308 |                         return -1
 309 |                 else:
 310 |                     return -1
 311 |             elif 'RO' in self.data:
 312 |                 if self.data['RO'] is not None:
 313 |                     return self.data['RO']
 314 |                 else:
 315 |                     return -1
 316 |             else:
 317 |                 return -1
 318 | 
 319 |     property gt_phred_likelihoods:
 320 |         def __get__(self):
 321 |             if 'PL' in self.data:
 322 |                 return self.data['PL']
 323 |                 # phred-scaled.
 324 |             elif 'GL' in self.data and self.data['GL'] is not None:
 325 |                 # it's not usable anyway, so return None
 326 |                 if not isinstance(self.data["GL"], list):
 327 |                     return None
 328 |                 return [int(round(-10 * g)) if g is not None and g != '.' else None for g in self.data['GL']]
 329 |             else:
 330 |                 return []
 331 | 
 332 | 
 333 |     property gt_alt_depth:
 334 |         def __get__(self):
 335 |             '''The depth of aligned sequences that supported the
 336 |             alternate allele for this sample.
 337 |             '''
 338 |             # extract the numeric alleles of the gt string
 339 | 
 340 |             # GATK style
 341 |             if 'AD' in self.data:
 342 |                 depths = self.data['AD']
 343 |                 if depths is not None:
 344 |                     # require bi-allelic
 345 |                     if not isinstance(depths, (list, tuple)) or len(depths) != 2:
 346 |                         return -1
 347 |                     else:
 348 |                         # alt allele is second
 349 |                         d = depths[1]
 350 |                         if d is None:
 351 |                             return -1
 352 |                         return d
 353 |                 else:
 354 |                     return -1
 355 |             # Freebayes style
 356 |             elif 'AO' in self.data:
 357 |                 depth = self.data['AO']
 358 |                 if depth is not None:
 359 |                     # require bi-allelic
 360 |                     if isinstance(depth, list):
 361 |                         return -1
 362 |                     else:
 363 |                         return depth
 364 |                 else:
 365 |                     return -1
 366 |             else:
 367 |                 return -1
 368 | 
 369 |     @property
 370 |     def gt_qual(self):
 371 |         '''The PHRED-scaled quality of genotype
 372 |         call for this sample.
 373 |         '''
 374 |         # extract the numeric alleles of the gt string
 375 |         try:
 376 |             qual = self.data['GQ']
 377 |             if qual is not None:
 378 |                 return qual
 379 |             else:
 380 |                 return -1
 381 |         except KeyError:
 382 |             return -1
 383 | 
 384 |     property gt_copy_number:
 385 |         def __get__(self):
 386 |             '''The copy number prediction for this sample.
 387 |             '''
 388 |             # extract the numeric alleles of the gt string
 389 |             if not 'CN' in self.data:
 390 |                 return -1
 391 |             qual = self.data['CN']
 392 |             if qual is not None:
 393 |                 return qual
 394 |             else:
 395 |                 return -1
 396 | 
 397 |     @property
 398 |     def is_variant(self):
 399 |         """ Return True if not a reference call """
 400 |         if not self.called:
 401 |             return None
 402 |         return self.gt_type != HOM_REF
 403 | 
 404 |     @property
 405 |     def is_het(self):
 406 |         """ Return True for heterozygous calls """
 407 |         if not self.called:
 408 |             return None
 409 |         return self.gt_type == HET
 410 | 
 411 | 
 412 | cdef class Record(object):
 413 |     """ A set of calls at a site.  Equivalent to a line in a VCF file.
 414 | 
 415 |         The standard VCF fields:
 416 |         CHROM, POS, ID,
 417 |         REF, ALT, QUAL,
 418 |         FILTER, INFO, & FORMAT are available as properties.
 419 | 
 420 |         The list of genotype calls is in the ``samples`` property.
 421 |     """
 422 | 
 423 |     # initialize Cython variables for all of the base attrs.
 424 |     cdef public list alleles, samples, ALT, gt_bases, gt_types, gt_phases, \
 425 |               gt_depths, gt_ref_depths, gt_alt_depths, gt_quals, gt_copy_numbers,\
 426 |               gt_phred_likelihoods
 427 |     # use bytes instead of char * because of C -> Python string complications
 428 |     # see: http://docs.cython.org/src/tutorial/strings.html
 429 |     cdef readonly bytes CHROM, ID, FORMAT
 430 |     cdef public REF
 431 |     cdef readonly object FILTER, QUAL
 432 |     cdef public int POS, start, end, num_hom_ref, num_het, num_hom_alt, \
 433 |              num_unknown, num_called
 434 |     cdef public dict INFO
 435 |     cdef public dict _sample_indexes
 436 |     cdef public bint has_genotypes
 437 | 
 438 |     def __cinit__(self, char *CHROM, int POS, char *ID,
 439 |                         char *REF, list ALT, object QUAL=None,
 440 |                         object FILTER=None, dict INFO=None, object FORMAT=None,
 441 |                         dict sample_indexes=None, list samples=None,
 442 |                         list gt_bases=None, list gt_types=None,
 443 |                         list gt_phases=None, list gt_depths=None,
 444 |                         list gt_ref_depths=None, list gt_alt_depths=None,
 445 |                         list gt_quals=None, list gt_copy_numbers=None, list gt_phred_likelihoods=None,
 446 |                         int num_hom_ref=0, int num_het=0, int num_hom_alt=0, int num_unknown=0, int num_called=0):
 447 |         # CORE VCF fields
 448 |         self.CHROM = CHROM
 449 |         self.POS = POS
 450 |         self.ID = ID
 451 |         self.REF = REF
 452 |         self.ALT = ALT
 453 |         self.QUAL = QUAL
 454 |         self.FILTER = FILTER
 455 |         self.INFO = INFO
 456 |         self.FORMAT = FORMAT
 457 |         # DERIVED fields
 458 |         self.start = self.POS - 1
 459 |         self.end = self.start + len(self.REF)
 460 |         if 'END' in self.INFO:
 461 |              self.end = self.INFO['END']
 462 |         else:
 463 |              self.end = self.start + len(self.REF)
 464 |         self.alleles = [self.REF]
 465 |         self.alleles.extend(self.ALT)
 466 |         self.samples = samples
 467 |         self._sample_indexes = sample_indexes
 468 |         self.gt_bases = gt_bases
 469 |         self.gt_types = gt_types
 470 |         self.gt_phases = gt_phases
 471 |         self.gt_depths = gt_depths
 472 |         self.gt_ref_depths = gt_ref_depths
 473 |         self.gt_alt_depths = gt_alt_depths
 474 |         self.gt_quals = gt_quals
 475 |         self.gt_copy_numbers = gt_copy_numbers
 476 |         self.gt_phred_likelihoods = gt_phred_likelihoods
 477 |         self.num_hom_ref = num_hom_ref
 478 |         self.num_het = num_het
 479 |         self.num_hom_alt = num_hom_alt
 480 |         self.num_unknown = num_unknown
 481 |         self.num_called = num_called
 482 |         if self.FORMAT is not None and sample_indexes is not None:
 483 |             self.has_genotypes = True
 484 |         else:
 485 |             self.has_genotypes = False
 486 | 
 487 |     def __richcmp__(self, other, int op):
 488 |         """ Records are equal if they describe the same variant (same position, alleles) """
 489 | 
 490 |         # < 0 | <= 1 | == 2 | != 3 |  > 4 | >= 5
 491 |         if op == 2: # 2
 492 |             return (self.CHROM == other.CHROM and
 493 |                     self.POS == other.POS and
 494 |                     self.REF == other.REF and
 495 |                     self.ALT == other.ALT)
 496 | 
    def __iter__(self):
        """Iterate over ``self.samples``."""
        return iter(self.samples)
 499 | 
 500 |     def _format_alt(self):
 501 |         return ','.join([x or '.' for x in self.ALT])
 502 | 
 503 |     def _format_qual(self):
 504 |         return str(self.QUAL) if self.QUAL is not None else None
 505 | 
 506 |     def _format_info(self):
 507 |         if not self.INFO:
 508 |             return '.'
 509 |         return ';'.join(["%s=%s" % (x, self._stringify(y)) for x, y in self.INFO.items()])
 510 | 
 511 |     def _format_sample(self, sample):
 512 |         if sample.data["GT"] is None:
 513 |             return "./."
 514 |         return ':'.join(self._stringify(sample.data[f]) for f in self.FORMAT.split(':'))
 515 | 
 516 |     def _stringify(self, x, none='.'):
 517 |         if type(x) == type([]):
 518 |             return ','.join(self._map(str, x, none))
 519 |         return str(x) if x is not None else none
 520 | 
 521 |     def _map(self, func, iterable, none='.'):
 522 |         '''``map``, but make None values none.'''
 523 |         return [func(x) if x is not None else none
 524 |                 for x in iterable]
 525 | 
 526 |     def __repr__(self):
 527 |         if self.has_genotypes == True:
 528 |             core = "\t".join([self.CHROM, str(self.POS), str(self.ID), str(self.REF), self._format_alt(),
 529 |                           self._format_qual() or '.', self.FILTER or '.', self._format_info(), self.FORMAT])
 530 |             samples = "\t".join([self._format_sample(sample) for sample in self.samples])
 531 |             return core + "\t" + samples
 532 |         else:
 533 |             return "\t".join([self.CHROM, str(self.POS), str(self.ID), str(self.REF), self._format_alt(),
 534 |                           self._format_qual() or '.', self.FILTER or '.', self._format_info()])
 535 | 
 536 | 
 537 |     def __cmp__(self, other):
 538 |         return cmp( (self.CHROM, self.POS), (other.CHROM, other.POS))
 539 | 
 540 |     def add_format(self, fmt):
 541 |         tmp = self.FORMAT + ':' + fmt
 542 |         self.FORMAT = tmp
 543 | 
 544 |     def add_filter(self, flt):
 545 |         if self.FILTER is None or self.FILTER == b'PASS':
 546 |             self.FILTER = b''
 547 |         else:
 548 |             tmp = self.FILTER + ';'
 549 |             self.FILTER = tmp
 550 |         tmp = self.FILTER + flt
 551 |         self.FILTER = tmp
 552 | 
 553 |     def add_info(self, info, value=True):
 554 |         self.INFO[info] = value
 555 | 
 556 |     def genotype(self, name):
 557 |         """ Lookup a ``_Call`` for the sample given in ``name`` """
 558 |         return self.samples[self._sample_indexes[name]]
 559 | 
 560 |     @property
 561 |     def call_rate(self):
 562 |         """ The fraction of genotypes that were actually called. """
 563 |         return float(self.num_called) / float(len(self.samples))
 564 | 
 565 |     @property
 566 |     def aaf(self):
 567 |         """ The allele frequency of the alternate allele.
 568 |            NOTE 1: Punt if more than one alternate allele.
 569 |            NOTE 2: Denominator calc'ed from _called_ genotypes.
 570 |         """
 571 |         # skip if more than one alternate allele. assumes bi-allelic
 572 |         if len(self.ALT) > 1:
 573 |             return None
 574 |         hom_ref = self.num_hom_ref
 575 |         het = self.num_het
 576 |         hom_alt = self.num_hom_alt
 577 |         num_chroms = float(2.0*self.num_called)
 578 |         if num_chroms == 0.0:
 579 |             return 0.0
 580 |         else:
 581 |             return float(het + 2*hom_alt)/float(num_chroms)
 582 | 
 583 |     @property
 584 |     def nucl_diversity(self):
 585 |         """
 586 |         pi_hat (estimation of nucleotide diversity) for the site.
 587 |         This metric can be summed across multiple sites to compute regional
 588 |         nucleotide diversity estimates.  For example, pi_hat for all variants
 589 |         in a given gene.
 590 | 
 591 |         Derived from:
 592 |         \"Population Genetics: A Concise Guide, 2nd ed., p.45\"
 593 |           John Gillespie.
 594 |         """
 595 |         # skip if more than one alternate allele. assumes bi-allelic
 596 |         if len(self.ALT) > 1:
 597 |             return None
 598 |         p = self.aaf
 599 |         q = 1.0-p
 600 |         num_chroms = float(2.0*self.num_called)
 601 |         return float(num_chroms/(num_chroms-1.0)) * (2.0 * p * q)
 602 | 
 603 |     def get_hom_refs(self):
 604 |         """ The list of hom ref genotypes"""
 605 |         return [s for s in self.samples if s.gt_type == 0]
 606 | 
 607 |     def get_hom_alts(self):
 608 |         """ The list of hom alt genotypes"""
 609 |         return [s for s in self.samples if s.gt_type == 3]
 610 | 
 611 |     def get_hets(self):
 612 |         """ The list of het genotypes"""
 613 |         return [s for s in self.samples if s.gt_type == 1]
 614 | 
 615 |     def get_unknowns(self):
 616 |         """ The list of unknown genotypes"""
 617 |         return [s for s in self.samples if s.gt_type is None]
 618 | 
 619 |     @property
 620 |     def is_snp(self):
 621 |         """ Return whether or not the variant is a SNP """
 622 |         if len(self.REF) > 1: return False
 623 |         for alt in self.ALT:
 624 |             if alt not in ['A', 'C', 'G', 'T']:
 625 |                 return False
 626 |         return True
 627 | 
 628 |     @property
 629 |     def is_indel(self):
 630 |         """ Return whether or not the variant is an INDEL """
 631 |         is_sv = self.is_sv
 632 | 
 633 |         if len(self.REF) > 1 and not is_sv: return True
 634 |         for alt in self.ALT:
 635 |             if alt is None:
 636 |                 return True
 637 |             elif len(alt) != len(self.REF):
 638 |                 # the diff. b/w INDELs and SVs can be murky.
 639 |                 if not is_sv:
 640 |                     # 1	2827693	.	CCCCTCGCA	C	.	PASS	AC=10;
 641 |                     return True
 642 |                 else:
 643 |                     # 1	2827693	.	CCCCTCGCA	C	.	PASS	SVTYPE=DEL;
 644 |                     return False
 645 |         return False
 646 | 
 647 |     @property
 648 |     def is_sv(self):
 649 |         """ Return whether or not the variant is a structural variant """
 650 |         if self.INFO.get('SVTYPE') is None:
 651 |             return False
 652 |         return True
 653 | 
 654 |     @property
 655 |     def is_transition(self):
 656 |         """ Return whether or not the SNP is a transition """
 657 |         # if multiple alts, it is unclear if we have a transition
 658 |         if len(self.ALT) > 1: return False
 659 | 
 660 |         if self.is_snp:
 661 |             # just one alt allele
 662 |             alt_allele = self.ALT[0]
 663 |             if ((self.REF == b'A' and alt_allele == b'G') or
 664 |                 (self.REF == b'G' and alt_allele == b'A') or
 665 |                 (self.REF == b'C' and alt_allele == b'T') or
 666 |                 (self.REF == b'T' and alt_allele == b'C')):
 667 |                 return True
 668 |             else: return False
 669 |         else: return False
 670 | 
 671 |     @property
 672 |     def is_deletion(self):
 673 |         """ Return whether or not the INDEL is a deletion """
 674 |         # if multiple alts, it is unclear if we have a transition
 675 |         if len(self.ALT) > 1: return False
 676 | 
 677 |         if self.is_indel:
 678 |             # just one alt allele
 679 |             alt_allele = self.ALT[0]
 680 |             if alt_allele is None:
 681 |                 return True
 682 |             if len(self.REF) > len(alt_allele):
 683 |                 return True
 684 |             else: return False
 685 |         else: return False
 686 | 
 687 |     @property
 688 |     def var_type(self):
 689 |         """
 690 |         Return the type of variant [snp, indel, unknown]
 691 |         TO DO: support SVs
 692 |         """
 693 |         if self.is_snp:
 694 |             return "snp"
 695 |         elif self.is_indel:
 696 |             return "indel"
 697 |         elif self.is_sv:
 698 |             return "sv"
 699 |         else:
 700 |             return "unknown"
 701 | 
 702 |     @property
 703 |     def var_subtype(self):
 704 |         """
 705 |         Return the subtype of variant.
 706 |         - For SNPs and INDELs, yeild one of: [ts, tv, ins, del]
 707 |         - For SVs yield either "complex" or the SV type defined
 708 |           in the ALT fields (removing the brackets).
 709 |           E.g.:
 710 |                <DEL>       -> DEL
 711 |                <INS:ME:L1> -> INS:ME:L1
 712 |                <DUP>       -> DUP
 713 | 
 714 |         The logic is meant to follow the rules outlined in the following
 715 |         paragraph at:
 716 | 
 717 |         http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41
 718 | 
 719 |         "For precisely known variants, the REF and ALT fields should contain
 720 |         the full sequences for the alleles, following the usual VCF conventions.
 721 |         For imprecise variants, the REF field may contain a single base and the
 722 |         ALT fields should contain symbolic alleles (e.g. <ID>), described in more
 723 |         detail below. Imprecise variants should also be marked by the presence
 724 |         of an IMPRECISE flag in the INFO field."
 725 |         """
 726 |         if self.is_snp:
 727 |             if self.is_transition:
 728 |                 return "ts"
 729 |             elif len(self.ALT) == 1:
 730 |                 return "tv"
 731 |             else: # multiple ALT alleles.  unclear
 732 |                 return "unknown"
 733 |         elif self.is_indel:
 734 |             if self.is_deletion:
 735 |                 return "del"
 736 |             elif len(self.ALT) == 1:
 737 |                 return "ins"
 738 |             else: # multiple ALT alleles.  unclear
 739 |                 return "unknown"
 740 |         elif self.is_sv:
 741 |             if self.INFO['SVTYPE'] == "BND":
 742 |                 return "complex"
 743 |             elif self.is_sv_precise:
 744 |                 return self.INFO['SVTYPE']
 745 |             else:
 746 |                 # first remove both "<" and ">" from ALT
 747 |                 return self.ALT[0].strip('<>')
 748 |         else:
 749 |             return "unknown"
 750 | 
 751 |     @property
 752 |     def sv_end(self):
 753 |         """ Return the end position for the SV """
 754 |         if self.is_sv:
 755 |             return self.INFO['END']
 756 |         return None
 757 | 
 758 |     @property
 759 |     def is_sv_precise(self):
 760 |         """ Return whether the SV cordinates are mapped
 761 |             to 1 b.p. resolution.
 762 |         """
 763 |         if self.INFO.get('IMPRECISE') is None and not self.is_sv:
 764 |             return False
 765 |         elif self.INFO.get('IMPRECISE') is not None and self.is_sv:
 766 |             return False
 767 |         elif self.INFO.get('IMPRECISE') is None and self.is_sv:
 768 |             return True
 769 | 
 770 |     @property
 771 |     def is_monomorphic(self):
 772 |         """ Return True for reference calls """
 773 |         return len(self.ALT) == 1 and self.ALT[0] is None
 774 | 
 775 | cdef class Reader(object):
 776 | 
 777 |     """ Reader for a VCF v 4.1 file, an iterator returning ``Record objects`` """
 778 |     cdef bytes _col_defn_line
 779 |     cdef char _prepend_chr
 780 |     cdef public object reader
 781 |     cdef bint compressed, prepend_chr
 782 |     cdef public dict metadata, infos, filters, formats,
 783 |     cdef readonly dict _sample_indexes
 784 |     cdef list _header_lines, samp_data
 785 |     cdef public list samples
 786 |     cdef object _tabix
 787 |     cdef public object filename
 788 |     cdef int num_samples
 789 | 
 790 |     def __init__(self, fsock=None, filename=None,
 791 |                         bint compressed=False, bint prepend_chr=False):
 792 |         """ Create a new Reader for a VCF file.
 793 | 
 794 |             You must specify a filename.  Gzipped streams
 795 |             or files are attempted to be recogized by the file extension, or gzipped
 796 |             can be forced with ``compressed=True``
 797 |         """
 798 |         super(VCFReader, self).__init__()
 799 | 
 800 |         if not (fsock or filename):
 801 |             raise Exception('You must provide at least fsock or filename')
 802 | 
 803 |         if filename:
 804 |             self.filename = filename
 805 |             if fsock is None:
 806 |                 self.reader = file(filename)
 807 | 
 808 |         if fsock:
 809 |             self.reader = fsock
 810 |             if filename is None:
 811 |                 if hasattr(fsock, 'name'):
 812 |                     filename = fsock.name
 813 |             self.filename = filename
 814 | 
 815 |         if compressed or (filename and filename.endswith('.gz')):
 816 |             self.reader = gzip.GzipFile(fileobj=self.reader)
 817 | 
 818 |         #: metadata fields from header
 819 |         self.metadata = {}
 820 |         #: INFO fields from header
 821 |         self.infos = {}
 822 |         #: FILTER fields from header
 823 |         self.filters = {}
 824 |         #: FORMAT fields from header
 825 |         self.formats = {}
 826 |         self.samples = []
 827 |         self._sample_indexes = {}
 828 |         self._header_lines = []
 829 |         self._col_defn_line = None
 830 |         self._tabix = None
 831 |         self._prepend_chr = prepend_chr
 832 |         self._parse_metainfo()
 833 | 
 834 |     def __iter__(self):
 835 |         return self
 836 | 
 837 |     def seek(self, offset):
 838 |         self.reader.seek(offset)
 839 | 
 840 |     def tell(self):
 841 |         return self.reader.tell()
 842 | 
 843 |     property raw_header:
 844 |         """Dump the raw, unparsed header lines"""
 845 |         def __get__(self):
 846 |             return ''.join(self._header_lines)
 847 | 
 848 |     def _parse_metainfo(self):
 849 |         '''Parse the information stored in the metainfo of the VCF.
 850 | 
 851 |         The end user shouldn't have to use this.  She can access the metainfo
 852 |         directly with ``self.metadata``.
 853 |         '''
 854 |         # NOTE: Not sure why this was necessary in PyVCF
 855 |         # for attr in ('metadata', 'infos', 'filters', 'formats'):
 856 |         #     setattr(self, attr, {})
 857 | 
 858 |         parser = _vcf_metadata_parser()
 859 | 
 860 |         line = self.reader.next()
 861 |         while line.startswith('##'):
 862 |             self._header_lines.append(line)
 863 |             line = line.rstrip('\n')
 864 | 
 865 |             if line.startswith('##INFO'):
 866 |                 key, val = parser.read_info(line)
 867 |                 self.infos[key] = val
 868 | 
 869 |             elif line.startswith('##FILTER'):
 870 |                 key, val = parser.read_filter(line)
 871 |                 self.filters[key] = val
 872 | 
 873 |             elif line.startswith('##FORMAT'):
 874 |                 key, val = parser.read_format(line)
 875 |                 self.formats[key] = val
 876 | 
 877 |             else:
 878 |                 key, val = parser.read_meta(line.strip())
 879 |                 self.metadata[key] = val
 880 | 
 881 |             line = self.reader.next()
 882 | 
 883 |         if line.startswith('#'):  # the column def'n line - REQ'D
 884 |             self._col_defn_line = line
 885 |             self._header_lines.append(line)
 886 |             fields = line.split()
 887 |             self.samples = fields[9:]
 888 |             self.num_samples = len(self.samples)
 889 |             self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
 890 |         else:
 891 |              sys.exit("Expected column definition line beginning with #.  Not found - exiting.")
 892 | 
 893 | 
 894 |     cdef list _map(Reader self, func, iterable, char *bad='.'):
 895 |         '''``map``, but make bad values None.'''
 896 |         return [func(x) if x != bad else None for x in iterable]
 897 | 
 898 | 
 899 |     def _parse_info(self, info_str):
 900 |         '''Parse the INFO field of a VCF entry into a dictionary of Python
 901 |         types.
 902 | 
 903 |         '''
 904 |         if info_str == '.':
 905 |             return {}
 906 | 
 907 |         cdef list entries = info_str.split(';')
 908 |         cdef dict retdict = {}
 909 | 
 910 |         cdef int i = 0
 911 |         cdef int n = len(entries)
 912 |         cdef char *entry_type
 913 |         cdef list entry
 914 |         # for entry in entries:
 915 |         for i in xrange(n):
 916 |             entry = entries[i].split('=')
 917 |             # entry = entry.split('=')
 918 |             ID = entry[0]
 919 |             if ID in self.infos:
 920 |                 entry_type = self.infos[ID].type
 921 |             elif ID in RESERVED_INFO:
 922 |                 entry_type = RESERVED_INFO[ID]
 923 |             else:
 924 |                 if len(entry) == 1:
 925 |                     entry_type = 'Flag'
 926 |                 else:
 927 |                     entry_type = 'String'
 928 | 
 929 |             if entry_type == b'Integer':
 930 |                 vals = entry[1].split(',')
 931 |                 try:
 932 |                     val = _map(int, vals)
 933 |                 except ValueError:
 934 |                     val = _map(float, vals)
 935 |             elif entry_type == b'Float':
 936 |                 vals = entry[1].split(',')
 937 |                 val = _map(float, vals)
 938 |             elif entry_type == b'Flag':
 939 |                 val = True
 940 |             elif entry_type == b'String':
 941 |                 if len(entry) > 1:
 942 |                     val = entry[1]
 943 |                 else:
 944 |                     val = True
 945 |             elif entry_type == b'Character':
 946 |                 val = entry[1]
 947 |             else:
 948 |                 print >>sys.stderr, "XXXXXXXXXXXXXXXX"
 949 |                 print >>sys.stderr, entry_type, entry
 950 | 
 951 |             try:
 952 |                 if isinstance(val, list) and self.infos[ID].num == 1 and entry_type != b'String':
 953 |                     val = val[0]
 954 |             except KeyError:
 955 |                 pass
 956 | 
 957 |             retdict[ID] = val
 958 | 
 959 |         return retdict
 960 | 
 961 | 
 962 |     def _parse_samples(self, Record rec, list samples, char *samp_fmt_s):
 963 |         '''Parse a sample entry according to the format specified in the FORMAT
 964 |         column.'''
 965 |         cdef list samp_fmt = samp_fmt_s.split(':')
 966 |         cdef int n = len(samp_fmt)
 967 |         cdef list samp_fmt_types = [None] * n
 968 |         cdef list samp_fmt_nums = [None] * n
 969 | 
 970 |         cdef int i = 0
 971 |         cdef char *fmt
 972 |         # for fmt in samp_fmt:
 973 |         for i in xrange(n):
 974 |             fmt = samp_fmt[i]
 975 |             try:
 976 |                 entry_type = self.formats[fmt].type
 977 |                 entry_num = self.formats[fmt].num
 978 |             except KeyError:
 979 |                 entry_num = None
 980 |                 try:
 981 |                     entry_type = RESERVED_FORMAT[fmt]
 982 |                 except KeyError:
 983 |                     entry_type = 'String'
 984 |             samp_fmt_types[i] = entry_type
 985 |             samp_fmt_nums[i] = entry_num
 986 | 
 987 |         cdef int num_hom_ref = 0
 988 |         cdef int num_het = 0
 989 |         cdef int num_hom_alt = 0
 990 |         cdef int num_unknown = 0
 991 |         cdef int num_called = 0
 992 |         rec.samples  = [None] * self.num_samples# list of _Call objects for each sample
 993 |         rec.gt_bases = [None] * self.num_samples# A/A, A|G, G/G, etc.
 994 |         rec.gt_types   = [None] * self.num_samples# 0, 1, 2, etc.
 995 |         rec.gt_phases  = [None] * self.num_samples# T, F, T, etc.
 996 |         rec.gt_depths  = [None] * self.num_samples# 10, 37, 0, etc.
 997 |         rec.gt_ref_depths  = [None] * self.num_samples# 3, 32, 0, etc.
 998 |         rec.gt_alt_depths  = [None] * self.num_samples# 7, 5, 0, etc.
 999 |         rec.gt_quals  = [None] * self.num_samples# 10, 30, 20, etc.
1000 |         rec.gt_copy_numbers  = [None] * self.num_samples# 2, 1, 4, etc.
1001 |         rec.gt_phred_likelihoods = [None] * self.num_samples
1002 | 
1003 |         for i in xrange(self.num_samples):
1004 | 
1005 |             call = _parse_sample(samples[i], samp_fmt, \
1006 |                                  samp_fmt_types, samp_fmt_nums,
1007 |                                  self.samples[i], rec)
1008 | 
1009 |             rec.samples[i] = call
1010 | 
1011 |             alleles = call.gt_bases
1012 |             type = call.gt_type
1013 | 
1014 |             # add to the "all-samples" lists of GT info
1015 |             if alleles is not None:
1016 |                 rec.gt_bases[i] = alleles
1017 |                 rec.gt_types[i] = type if type is not None else 2
1018 |             else:
1019 |                 rec.gt_bases[i] = './.'
1020 |                 rec.gt_types[i] = 2
1021 |             rec.gt_phases[i] = call.phased
1022 |             rec.gt_depths[i] = call.gt_depth
1023 |             rec.gt_ref_depths[i] = call.gt_ref_depth
1024 |             rec.gt_alt_depths[i] = call.gt_alt_depth
1025 |             rec.gt_quals[i] = call.gt_qual
1026 |             rec.gt_copy_numbers[i] = call.gt_copy_number
1027 |             rec.gt_phred_likelihoods[i] = call.gt_phred_likelihoods
1028 | 
1029 |             # 0 / 00000000 hom ref
1030 |             # 1 / 00000001 het
1031 |             # 2 / 00000010 missing
1032 |             # 3 / 00000011 hom alt
1033 | 
1034 |             # tally the appropriate GT count
1035 |             if type == HOM_REF: num_hom_ref += 1
1036 |             elif type == HET: num_het += 1
1037 |             elif type == HOM_ALT: num_hom_alt += 1
1038 |             elif type == None: num_unknown += 1
1039 | 
1040 |         rec.num_called = num_hom_ref + num_het + num_hom_alt
1041 |         rec.num_hom_alt = num_hom_alt
1042 |         rec.num_het = num_het
1043 |         rec.num_hom_ref = num_hom_ref
1044 |         rec.num_unknown = num_unknown
1045 | 
1046 |     def __next__(self):
1047 |         '''Return the next record in the file.'''
1048 |         line = self.reader.next().rstrip()
1049 |         return self.parse(line)
1050 | 
1051 |     def parse(self, line):
1052 |         '''Return the next record in the file.'''
1053 |         cdef list row = line.split('\t')
1054 | 
1055 |         #CHROM
1056 |         cdef bytes chrom = row[0]
1057 |         if self._prepend_chr:
1058 |             chrom = 'chr' + str(chrom)
1059 |         # POS
1060 |         cdef int pos = int(row[1])
1061 |         # ID
1062 |         cdef bytes id = row[2]
1063 |         #REF
1064 |         cdef bytes ref = row[3]
1065 |         #ALT
1066 |         cdef list alt = self._map(str, row[4].split(','))
1067 |         #QUAL
1068 |         cdef object qual
1069 |         if row[5] == b'.':
1070 |             qual = None
1071 |         else:
1072 |             qual = float(row[5])
1073 |         #FILT
1074 |         cdef object filt = row[6].split(';') if ';' in row[6] else row[6]
1075 |         if filt == b'PASS' or filt == b'.':
1076 |              filt = None
1077 |         #INFO
1078 |         cdef dict info = self._parse_info(row[7])
1079 |         #FORMAT
1080 |         cdef bytes fmt
1081 |         try:
1082 |             fmt = row[8]
1083 |         except IndexError:
1084 |             fmt = None
1085 | 
1086 |         rec = Record(chrom, pos, id, ref, alt, qual, filt, info, fmt, self._sample_indexes)
1087 | 
1088 |         # collect GENOTYPE information for the current VCF record 
1089 |         if fmt is not None:
1090 |             self._parse_samples(rec, row[9:], fmt)
1091 |         return rec
1092 | 
1093 |     def fetch(self, chrom, start, end=None):
1094 |         """ fetch records from a Tabix indexed VCF, requires pysam
1095 |             if start and end are specified, return iterator over positions
1096 |             if end not specified, return individual ``_Call`` at start or None
1097 |         """
1098 |         if not pysam:
1099 |             raise Exception('pysam not available, try "pip install pysam"?')
1100 | 
1101 |         if not self.filename:
1102 |             raise Exception('Please provide a filename (or a "normal" fsock)')
1103 | 
1104 |         if not self._tabix:
1105 |             self._tabix = pysam.Tabixfile(self.filename)
1106 | 
1107 |         if self._prepend_chr and chrom[:3] == 'chr':
1108 |             chrom = chrom[3:]
1109 | 
1110 |         # not sure why tabix needs position -1
1111 |         start = start - 1
1112 | 
1113 |         if end is None:
1114 |             self.reader = self._tabix.fetch(chrom, start, start+1)
1115 |             try:
1116 |                 return self.next()
1117 |             except StopIteration:
1118 |                 return None
1119 | 
1120 |         self.reader = self._tabix.fetch(chrom, start, end)
1121 |         return self
1122 | 
1123 | 
class Writer(object):
    """ VCF Writer

    Writes the header of ``template`` (an object exposing ``metadata``,
    ``infos``, ``formats``, ``filters`` and ``samples``, such as a Reader) to
    ``stream`` on construction; records are then added with
    ``write_record``.
    """

    fixed_fields = "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT".split()

    def __init__(self, stream, template):
        self.stream = stream
        # "\n" keeps the rows consistent with the "##" lines written below;
        # the csv default would terminate rows with "\r\n"
        self.writer = csv.writer(stream, delimiter="\t", lineterminator="\n")
        self.template = template

        for line in template.metadata.items():
            stream.write('##%s=%s\n' % line)
        for line in template.infos.values():
            stream.write('##INFO=<ID=%s,Number=%s,Type=%s,Description="%s">\n' %
                    tuple(self._map(str, line)))
        for line in template.formats.values():
            stream.write('##FORMAT=<ID=%s,Number=%s,Type=%s,Description="%s">\n' %
                    tuple(self._map(str, line)))
        for line in template.filters.values():
            stream.write('##FILTER=<ID=%s,Description="%s">\n' %
                    tuple(self._map(str, line)))

        self._write_header()

    def _write_header(self):
        """Write the ``#CHROM ...`` column line including the sample names."""
        self.writer.writerow(self.fixed_fields + self.template.samples)

    def write_record(self, record):
        """ write a record to the file """
        # only a missing QUAL becomes '.'; a QUAL of 0 is a real value
        qual = '.' if record.QUAL is None else record.QUAL
        ffs = self._map(str, [record.CHROM, record.POS, record.ID, record.REF]) \
              + [self._format_alt(record.ALT), qual,
                 self._format_filter(record.FILTER),
                 self._format_info(record.INFO), record.FORMAT]

        samples = [self._format_sample(record.FORMAT, sample)
            for sample in record.samples]
        self.writer.writerow(ffs + samples)

    def _format_alt(self, alt):
        """Join the ALT alleles with commas; empty/missing alleles become '.'."""
        return ','.join([x or '.' for x in alt])

    def _format_filter(self, flt):
        """Render FILTER; a list of filter names is joined with ';'."""
        if not flt:
            return '.'
        if isinstance(flt, (list, tuple)):
            return ';'.join(self._map(str, flt))
        return flt

    def _format_info(self, info):
        """Render the INFO dict; values that are True are written as bare flags."""
        if not info:
            return '.'
        fields = []
        for key, value in info.items():
            if value is True:
                fields.append("%s" % (key,))
            else:
                fields.append("%s=%s" % (key, self._stringify(value)))
        return ';'.join(fields)

    def _format_sample(self, fmt, sample):
        """Render one sample column in the order of the FORMAT string ``fmt``."""
        if sample.data["GT"] is None:
            return "./."
        return ':'.join(self._stringify(sample.data[f]) for f in fmt.split(':'))

    def _stringify(self, x, none='.'):
        """Convert a value to text; lists are comma-joined, None becomes ``none``."""
        if isinstance(x, list):
            return ','.join(self._map(str, x, none))
        return str(x) if x is not None else none

    def _map(self, func, iterable, none='.'):
        '''``map``, but make None values none.'''
        return [func(x) if x is not None else none
                for x in iterable]
1182 | 
def __update_readme():
    """Overwrite README.rst with the docstring of the ``vcf`` module."""
    import vcf
    # the context manager guarantees the file is flushed and closed
    with open('README.rst', 'w') as readme:
        readme.write(vcf.__doc__)
1186 | 
# Aliases kept so that code written against the older
# ``VCFReader`` / ``VCFWriter`` names keeps working.
VCFReader, VCFWriter = Reader, Writer
1190 | 


--------------------------------------------------------------------------------