├── test ├── __init__.py ├── 1kg.vcf.gz ├── README.md ├── tb.vcf.gz ├── tb.vcf.gz.tbi ├── prof.py ├── issue-16.vcf ├── example-4.0.vcf ├── walk_left.vcf ├── example-4.1.vcf ├── example-4.1-sv.vcf ├── samtools.vcf ├── test.vcf ├── dbsnp.vcf ├── null_genotype_mono.vcf ├── test-gl.vcf ├── gatk.vcf └── test_vcf.py ├── cyvcf ├── version.py ├── utils.py ├── filters.py ├── __init__.py └── parser.pyx ├── docs ├── INTRO.rst ├── index.rst ├── API.rst ├── HISTORY.rst ├── FILTERS.rst ├── Makefile └── conf.py ├── .gitignore ├── MANIFEST.in ├── LICENSE ├── scripts ├── vcf_melt └── vcf_filter.py ├── README.rst ├── setup.py └── ez_setup.py /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cyvcf/version.py: -------------------------------------------------------------------------------- 1 | __version__="0.1.16" 2 | -------------------------------------------------------------------------------- /test/1kg.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arq5x/cyvcf/HEAD/test/1kg.vcf.gz -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | Test data from GATK 1.4-9 and freebayes 0.9.4. 2 | 3 | 4 | -------------------------------------------------------------------------------- /test/tb.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arq5x/cyvcf/HEAD/test/tb.vcf.gz -------------------------------------------------------------------------------- /docs/INTRO.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | .. 
automodule:: vcf 5 | 6 | -------------------------------------------------------------------------------- /test/tb.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arq5x/cyvcf/HEAD/test/tb.vcf.gz.tbi -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build 3 | cyvcf.egg-info 4 | dist 5 | cyvcf/parser.c 6 | cyvcf/parser.so 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ez_setup.py 2 | include LICENSE 3 | include README.rst 4 | include cyvcf/parser.c -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | PyVCF - A Variant Call Format Parser for Python 3 | =============================================== 4 | 5 | Contents: 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | INTRO 11 | API 12 | FILTERS 13 | HISTORY 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | 23 | -------------------------------------------------------------------------------- /docs/API.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | vcf.Reader 5 | ---------- 6 | 7 | .. autoclass:: vcf.Reader 8 | :members: 9 | 10 | vcf.Writer 11 | ---------- 12 | 13 | .. autoclass:: vcf.Writer 14 | :members: 15 | 16 | vcf._Record 17 | ----------- 18 | 19 | .. autoclass:: vcf.parser._Record 20 | :members: 21 | 22 | vcf._Call 23 | --------- 24 | 25 | .. 
"""Profile or time parsing of the bundled 1000 Genomes test file.

Usage (the data path is relative, so run from the repository root)::

    python test/prof.py profile   # cProfile run, stats sorted by time
    python test/prof.py time      # mean seconds per parse over 5 runs
"""
import vcf
import cProfile
import timeit
import pstats
import sys


def parse_1kg():
    """Iterate over every record of ``test/1kg.vcf.gz`` and discard it."""
    for _record in vcf.Reader(filename='test/1kg.vcf.gz'):
        pass


# Read the requested mode without mutating sys.argv; None means "not given".
mode = sys.argv[1] if len(sys.argv) > 1 else None

if mode == 'profile':
    # Dump raw profile data to disk, then print it sorted by internal time.
    cProfile.run('parse_1kg()', '1kg.prof')
    stats = pstats.Stats('1kg.prof')
    stats.strip_dirs().sort_stats('time').print_stats()

elif mode == 'time':
    n = 5
    # timeit runs the statement in its own namespace, hence the explicit
    # import of parse_1kg from the running script.
    t = timeit.timeit('parse_1kg()', "from __main__ import parse_1kg", number=n)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(t / n)
else:
    print('prof.py profile/time')
def walk_together(*readers):
    """ Walk a set of readers and return lists of records from each
        reader, with None if no record present.  Caller must check the
        inputs are sorted in the same way and use the same reference
        otherwise behaviour is undefined.

        Each yielded list has one slot per reader, in argument order.  A
        slot holds that reader's current record when it compares equal to
        the smallest pending record, and None otherwise.  Readers that are
        empty or already exhausted simply contribute None; with no readers
        (or only empty ones) nothing is yielded.
    """
    # Prime one pending record per reader.  next(it, None) maps exhaustion
    # to None, so an empty reader cannot abort the walk of the others.
    nexts = [next(reader, None) for reader in readers]

    while any(x is not None for x in nexts):
        min_next = min(x for x in nexts if x is not None)

        # this line uses equality on Records, which checks the ALTs
        # not sure what to do with records that have overlapping but different
        # variation
        yield [x if x is None or x == min_next else None for x in nexts]

        # advance only the readers whose record was just yielded
        for i, n in enumerate(nexts):
            if n is not None and n == min_next:
                nexts[i] = next(readers[i], None)
def flatten(x):
    """Return ``x`` ready for one tab-delimited cell.

    Lists (including list subclasses) are collapsed into a single
    comma-separated string of their ``str()``-converted items; any other
    value, including None, is returned unchanged.
    """
    if isinstance(x, list):
        return ','.join(map(str, x))
    return x
This version is approximately 4 times faster than PyVCF, 8 | and the parsing speed is essentially identical to that of C/C++ libraries provided by PLINKSEQ and VCFLIB. 9 | 10 | The functionality and interface are currently the same as documented here: http://pyvcf.rtfd.org/ 11 | 12 | Installation 13 | ============ 14 | 15 | python setup.py build 16 | python setup.py install 17 | 18 | 19 | Testing 20 | ======= 21 | 22 | python setup.py test 23 | 24 | 25 | Basic usage 26 | =========== 27 | 28 | >>> import cyvcf 29 | >>> vcf_reader = cyvcf.Reader(open('test/example-4.0.vcf', 'rb')) 30 | >>> for record in vcf_reader: 31 | ... print record 32 | 20 14370 G A 29.0 . H2=True;NS=3;DB=True;DP=14;AF=0.5 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 33 | 20 17330 T A 3.0 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:. 34 | 20 1110696 A G,T 67.0 . AA=T;NS=2;DB=True;DP=10;AF=0.333,0.667 GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:. 35 | 20 1230237 T . 47.0 . AA=T;NS=3;DP=13 GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:. 36 | 20 1234567 GTCT G,GTACT 50.0 . AA=G;NS=3;DP=9 GT:GQ:DP ./. 0/2:17:2 1/1:40:3 37 | -------------------------------------------------------------------------------- /test/example-4.0.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | ##fileDate=20090805 3 | ##source=myImputationProgramV3.1 4 | ##reference=1000GenomesPilot-NCBI36 5 | ##phasing=partial 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##FILTER= 13 | ##FILTER= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 19 | 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 20 | 20 17330 . 
T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 21 | 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 22 | 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 23 | 20 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP ./.:35:4 0/2:17:2 1/1:40:3 24 | -------------------------------------------------------------------------------- /test/walk_left.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | ##fileDate=20090805 3 | ##source=myImputationProgramV3.1 4 | ##reference=1000GenomesPilot-NCBI36 5 | ##phasing=partial 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##FILTER= 14 | ##FILTER= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 | 19 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 21 | 19 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:65,3 22 | 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:65,4 23 | 20 1230237 . T . 
47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2:65,3 24 | 21 1234567 microsat1 GTCT G,GTACT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP ./.:35:4 0/2:17:2 1/1:40:3:65,3 25 | -------------------------------------------------------------------------------- /test/example-4.1.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##fileDate=20090805 3 | ##source=myImputationProgramV3.1 4 | ##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta 5 | ##contig= 6 | ##phasing=partial 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##FILTER= 14 | ##FILTER= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 20 | 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 21 | 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 22 | 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 23 | 20 1230237 . T . 
class Base(object):
    """ Base class for vcf_filter.py filters.

        A filter is built once from the parsed command-line ``args`` and is
        then called with each record.  Returning None lets the site pass;
        returning anything else marks the site as filtered.
    """

    name = 'f'
    """ name used to activate filter and in VCF headers """

    description = 'VCF filter base class'
    """ description used in vcf headers """

    @classmethod
    def customize_parser(cls, parser):
        """ hook to extend argparse parser with custom arguments """
        pass

    def __init__(self, args):
        """ create the filter using argparse ``args`` """
        self.threshold = 0

    def __call__(self, record=None):
        """ filter a site, return not None if the site should be filtered """
        # ``record`` is accepted so the base signature matches how
        # subclasses are invoked; it defaults to None for old callers.
        raise NotImplementedError('Filters must implement this method')

    def filter_name(self):
        """ return the name to put in the VCF header, default is ``name`` + ``threshold`` """
        return '%s%s' % (self.name, self.threshold)


class SiteQuality(Base):
    """ Filter sites whose QUAL is below ``--site-quality``. """

    description = 'Filter sites by quality'
    name = 'sq'

    @classmethod
    def customize_parser(cls, parser):
        """ register the ``--site-quality`` option (int, default 30) """
        parser.add_argument('--site-quality', type=int, default=30,
                help='Filter sites below this quality')

    def __init__(self, args):
        """ take the threshold from ``args.site_quality`` """
        self.threshold = args.site_quality

    def __call__(self, record):
        """ return the site's QUAL if it is below the threshold, else None """
        qual = record.QUAL
        # A null QUAL cannot be ordered against the threshold (TypeError on
        # Python 3); treat it as "nothing to report" and let the site pass.
        if qual is not None and qual < self.threshold:
            return qual
        return None


class VariantGenotypeQuality(Base):
    """ Filter sites where no variant call reaches ``--genotype-quality``. """

    description = 'Demand a minimum quality associated with a non reference call'
    name = 'mgq'

    @classmethod
    def customize_parser(cls, parser):
        """ register the ``--genotype-quality`` option (int, default 50) """
        parser.add_argument('--genotype-quality', type=int, default=50,
                help='Filter sites with no genotypes above this quality')

    def __init__(self, args):
        """ take the threshold from ``args.genotype_quality`` """
        self.threshold = args.genotype_quality

    def __call__(self, record):
        """ return the best variant-call GQ if it is below the threshold

            Monomorphic sites, and sites with no variant call carrying a
            GQ value, have no non-reference quality to judge and pass
            (None is returned).
        """
        if record.is_monomorphic:
            return None

        quals = [call['GQ'] for call in record if call.is_variant]
        # Drop missing GQ values: None is not orderable on Python 3.
        quals = [gq for gq in quals if gq is not None]
        if not quals:
            # max() of an empty list would raise ValueError.
            return None

        vgq = max(quals)
        if vgq < self.threshold:
            return vgq
        return None
#!/usr/bin/env python
"""Filter a VCF file through a chain of pluggable filters.

Filter classes are discovered through the ``vcf.filters`` entry point
group.  Each named filter is called with every record; a result other
than None marks the site as filtered under the filter's name.
"""
import sys
import argparse
import pkg_resources

import vcf
from vcf.parser import _Filter

parser = argparse.ArgumentParser(description='Filter a VCF file',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        )
parser.add_argument('input', metavar='input', type=str, nargs=1,
        help='File to process (use - for STDIN)')
parser.add_argument('filters', metavar='filter', type=str, nargs='+',
        help='Filters to use')
parser.add_argument('--no-short-circuit', action='store_true',
        help='Do not stop filter processing on a site if a single filter fails.')
parser.add_argument('--output', action='store', default=sys.stdout,
        help='Filename to output (default stdout)')
parser.add_argument('--no-filtered', action='store_true',
        help='Remove failed sites')


def _open_input(path):
    """Return a readable stream for ``path``; ``-`` selects standard input."""
    if path == '-':
        return sys.stdin
    return open(path)


def _open_output(target):
    """Return a writable stream for ``--output``.

    The option's default is the ``sys.stdout`` stream itself, while a value
    given on the command line arrives as a filename that must be opened.
    """
    if hasattr(target, 'write'):
        return target
    return open(target, 'w')


if __name__ == '__main__':
    # TODO: allow filter specification by short name
    # TODO: flag that writes filter output into INFO column
    # TODO: argument use implies filter use
    # TODO: parallelize
    # TODO: prevent plugins raising an exception from crashing the script

    # dynamically build the list of available filters
    filters = {}
    filter_help = '\n\navailable filters:'

    for p in pkg_resources.iter_entry_points('vcf.filters'):
        filt = p.load()
        filters[filt.name] = filt
        # let each filter register its own options before parsing
        filt.customize_parser(parser)
        filter_help += '\n %s:\t%s' % (filt.name, filt.description)

    parser.description += filter_help

    # parse command line args
    args = parser.parse_args()

    # report unknown filter names as a usage error, not a KeyError traceback
    unknown = [name for name in args.filters if name not in filters]
    if unknown:
        parser.error('unknown filter(s): %s' % ', '.join(unknown))

    in_stream = _open_input(args.input[0])
    out_stream = None
    try:
        inp = vcf.Reader(in_stream)

        # build filter chain and declare each filter in the header
        chain = []
        for name in args.filters:
            f = filters[name](args)
            chain.append(f)
            inp.filters[f.filter_name()] = _Filter(f.filter_name(), f.description)

        out_stream = _open_output(args.output)
        oup = vcf.Writer(out_stream, inp)

        # apply filters
        short_circuit = not args.no_short_circuit

        for record in inp:
            for filt in chain:
                result = filt(record)
                # Filters signal failure with any non-None value, so a
                # falsy result such as a QUAL of 0 still counts.
                if result is not None:
                    record.add_filter(filt.filter_name())
                    if short_circuit:
                        break

            # NOTE(review): assumes an unfiltered record has FILTER == '.';
            # confirm against the parser's FILTER representation.
            if (not args.no_filtered) or (record.FILTER == '.'):
                oup.write_record(record)
    finally:
        # close only the streams this script opened itself
        if out_stream is not None and out_stream is not args.output:
            out_stream.close()
        if in_stream is not sys.stdin:
            in_stream.close()
7 | 8 | Changes 9 | ======= 10 | 11 | 0.4.3 Release 12 | ------------- 13 | 14 | * Single floats in Reader._sample_parser not being converted to float #35 15 | * Handle String INFO values when Number=1 in header #34 16 | 17 | 0.4.2 Release 18 | ------------- 19 | 20 | * Installation problems 21 | 22 | 0.4.1 Release 23 | ------------- 24 | 25 | * Installation problems 26 | 27 | 0.4.0 Release 28 | ------------- 29 | 30 | * Package structure 31 | * add ``vcf.utils`` module with ``walk_together`` method 32 | * samtools tests 33 | * support Freebayes' non standard '.' for no call 34 | * fix vcf_melt 35 | * support monomorphic sites, add ``is_monomorphic`` method, handle null QUALs 36 | * filter support for files with monomorphic calls 37 | * Values declared as single are no-longer returned in lists 38 | * several performance improvements 39 | 40 | 41 | 0.3.0 Release 42 | ------------- 43 | 44 | * Fix setup.py for python < 2.7 45 | * Add ``__eq__`` to ``_Record`` and ``_Call`` 46 | * Add ``is_het`` and ``is_variant`` to ``_Call`` 47 | * Drop aggressive parse mode: we're always aggressive. 
48 | * Add tabix fetch for single calls, fix one->zero based indexing 49 | * add prepend_chr mode for ``Reader`` to add `chr` to CHROM attributes 50 | 51 | 0.2.2 Release 52 | ------------- 53 | 54 | Documentation release 55 | 56 | 0.2.1 Release 57 | ------------- 58 | 59 | * Add shebang to vcf_filter.py 60 | 61 | 0.2 Release 62 | ----------- 63 | 64 | * Replace genotype dictionary with a ``Call`` object 65 | * Methods on ``Record`` and ``Call`` (thanks @arq5x) 66 | * Shortcut parse_sample when genotype is None 67 | 68 | 0.1 Release 69 | ----------- 70 | 71 | * Added test code 72 | * Added Writer class 73 | * Allow negative number in ``INFO`` and ``FORMAT`` fields (thanks @martijnvermaat) 74 | * Prefer ``vcf.Reader`` to ``vcf.VCFReader`` 75 | * Support compressed files with guessing where filename is available on fsock 76 | * Allow opening by filename as well as filesocket 77 | * Support fetching rows for tabixed indexed files 78 | * Performance improvements (see ``test/prof.py``) 79 | * Added extensible filter script (see FILTERS.md), vcf_filter.py 80 | 81 | Contributions 82 | ------------- 83 | 84 | Project started by @jdoughertyii and taken over by @jamescasbon on 12th January 2011. 85 | Contributions from @arq5x, @brentp, @martijnvermaat, @ian1roberts. 
86 | 87 | 88 | -------------------------------------------------------------------------------- /test/example-4.1-sv.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##fileDate=20100501 3 | ##reference=1000GenomesPilot-NCBI36 4 | ##assembly=ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/sv/breakpoint_assemblies.fasta 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##ALT= 16 | ##ALT= 17 | ##ALT= 18 | ##ALT= 19 | ##ALT= 20 | ##ALT= 21 | ##ALT= 22 | ##ALT= 23 | ##ALT= 24 | ##ALT= 25 | ##FORMAT= 26 | ##FORMAT= 27 | ##FORMAT= 28 | ##FORMAT= 29 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 30 | 1 2827693 . CCGTGGATGCGGGGACCCGCATCCCCTCTCCCTTCACAGCTGAGTGACCCACATCCCCTCTCCCCTCGCA C . PASS SVTYPE=DEL;END=2827680;BKPTID=Pindel_LCS_D1099159;HOMLEN=1;HOMSEQ=C;SVLEN=-66 GT:GQ 1/1:13.9 31 | 2 321682 . T 6 PASS IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62 GT:GQ 0/1:12 32 | 2 14477084 . C 12 PASS IMPRECISE;SVTYPE=DEL;END=14477381;SVLEN=-297;MEINFO=AluYa5,5,307,+;CIPOS=-22,18;CIEND=-12,32 GT:GQ 0/1:12 33 | 3 9425916 . C 23 PASS IMPRECISE;SVTYPE=INS;END=9425916;SVLEN=6027;CIPOS=-16,22;MIINFO=L1HS,1,6025,- GT:GQ 1/1:15 34 | 3 12665100 . A 14 PASS IMPRECISE;SVTYPE=DUP;END=12686200;SVLEN=21100;CIPOS=-500,500;CIEND=-500,500 GT:GQ:CN:CNQ ./.:0:3:16.2 35 | 4 18665128 . 
T 11 PASS IMPRECISE;SVTYPE=DUP;END=18665204;SVLEN=76;CIPOS=-10,10;CIEND=-10,10 GT:GQ:CN:CNQ ./.:0:5:8.3 36 | -------------------------------------------------------------------------------- /docs/FILTERS.rst: -------------------------------------------------------------------------------- 1 | Filtering VCF files 2 | =================== 3 | 4 | The filter script: vcf_filter.py 5 | -------------------------------- 6 | 7 | Filtering a VCF file based on some properties of interest is a common enough 8 | operation that PyVCF offers an extensible script. ``vcf_filter.py`` does 9 | the work of reading input, updating the metadata and filtering the records. 10 | 11 | 12 | Adding a filter 13 | --------------- 14 | 15 | You can reuse this work by providing a filter class, rather than writing your own filter. 16 | For example, lets say I want to filter each site based on the quality of the site. 17 | I can create a class like this:: 18 | 19 | class SiteQuality(vcf.Filter): 20 | 21 | description = 'Filter sites by quality' 22 | name = 'sq' 23 | 24 | @classmethod 25 | def customize_parser(self, parser): 26 | parser.add_argument('--site-quality', type=int, default=30, 27 | help='Filter sites below this quality') 28 | 29 | def __init__(self, args): 30 | self.threshold = args.site_quality 31 | 32 | def __call__(self, record): 33 | if record.QUAL < self.threshold: 34 | return record.QUAL 35 | 36 | 37 | This class subclasses ``vcf.Filter`` which provides the interface for VCF filters. 38 | The ``description``` and ``name`` are metadata about the parser. 39 | The ``customize_parser`` method allows you to add arguments to the script. 40 | We use the ``__init__`` method to grab the argument of interest from the parser. 41 | Finally, the ``__call__`` method processes each record and returns a value if the 42 | filter failed. The base class uses the ``name`` and ``threshold`` to create 43 | the filter ID in the VCF file. 
44 | 45 | To make vcf_filter.py aware of the filter, you need to declare a ``vcf.filters`` entry 46 | point in your ``setup``:: 47 | 48 | setup( 49 | ... 50 | entry_points = { 51 | 'vcf.filters': [ 52 | 'site_quality = module.path:SiteQuality', 53 | ] 54 | } 55 | ) 56 | 57 | Now when you call vcf_filter.py, you should see your filter in the list of available filters:: 58 | 59 | >$ vcf_filter.py --help 60 | usage: vcf_filter.py [-h] [--no-short-circuit] [--output OUTPUT] 61 | [--site-quality SITE_QUALITY] 62 | [--genotype-quality GENOTYPE_QUALITY] 63 | input filter [filter ...] 64 | 65 | Filter a VCF file 66 | 67 | available filters: 68 | sq: Filter sites by quality 69 | 70 | positional arguments: 71 | input File to process (use - for STDIN) 72 | filter Filters to use 73 | 74 | optional arguments: 75 | -h, --help show this help message and exit 76 | --no-short-circuit Do not stop filter processing on a site if a single 77 | filter fails. 78 | --output OUTPUT Filename to output (default stdout) 79 | --site-quality SITE_QUALITY 80 | Filter sites below this quality 81 | --genotype-quality GENOTYPE_QUALITY 82 | Filter sites with no genotypes above this quality 83 | 84 | 85 | The filter base class: vcf.Filter 86 | --------------------------------- 87 | 88 | .. autoclass:: vcf.Filter 89 | :members: 90 | 91 | -------------------------------------------------------------------------------- /test/samtools.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##samtoolsVersion=0.1.16 (r963:234) 3 | ##INFO= 4 | ##INFO= 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##INFO= 16 | ##INFO= 17 | ##FORMAT= 18 | ##FORMAT= 19 | ##FORMAT= 20 | ##FORMAT= 21 | ##FORMAT= 22 | ##FORMAT= 23 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT - 24 | chrX 2774478 . A G 61.3 . 
DP=8;AF1=1;CI95=0.5,1;DP4=0,0,5,0;MQ=60;FQ=-42 GT:PL:GQ 1/1:94,15,0:27 25 | chrX 2832661 . A T 29 . DP=25;AF1=0.5;CI95=0.5,0.5;DP4=14,1,6,1;MQ=56;FQ=32;PV4=1,0.00034,0.00019,1 GT:PL:GQ 0/1:59,0,144:62 26 | chrX 2832880 . AAT AATAT 109 . INDEL;DP=20;AF1=0.5;CI95=0.5,0.5;DP4=0,7,0,8;MQ=47;FQ=112;PV4=1,1,0.0069,0.091 GT:PL:GQ 0/1:147,0,172:99 27 | chrX 2832920 . TTAT TTATAT 85.5 . INDEL;DP=12;AF1=0.5;CI95=0.5,0.5;DP4=0,6,0,5;MQ=56;FQ=88.5;PV4=1,0.36,0.052,0.017 GT:PL:GQ 0/1:123,0,150:99 28 | chrX 2833534 . TTACGCCCT T 8.18 . INDEL;DP=15;AF1=0.5;CI95=0.5,0.5;DP4=10,0,2,0;MQ=60;FQ=10.8;PV4=1,0.0041,1,0.3 GT:PL:GQ 0/1:45,0,255:47 29 | chrX 2833580 . A G 80 . DP=20;AF1=0.5;CI95=0.5,0.5;DP4=10,2,7,1;MQ=58;FQ=83;PV4=1,1,0.06,1 GT:PL:GQ 0/1:110,0,141:99 30 | chr1 10363194 . cca cCAca 57.5 . INDEL;DP=19;AF1=0.5;CI95=0.5,0.5;DP4=16,0,3,0;MQ=59;FQ=60.5;PV4=1,1,1,0.0008 GT:PL:GQ 0/1:95,0,214:98 31 | chr1 11292952 . T A,C 41 . DP=17;AF1=1;CI95=1,1;DP4=0,0,17,0;MQ=57;FQ=-75 GT:PL:GQ 1/1:74,48,0,66,28,63:85 32 | chr1 38304491 . t tTTTTTTTTTTTTTTTTTTTTTT,tTTTTTTTTTTTTT,tTTTTTTT 16.3 . INDEL;DP=9;AF1=1;CI95=0.5,1;DP4=0,0,0,4;MQ=41;FQ=-40.5 GT:PL:GQ 1/1:105,56,50,54,0,51,98,45,44,95:10 33 | chr1 152195728 . ATTTTTTTTTTT ATTTTTTTTTT,ATTTTTTTTT 36.5 . INDEL;DP=39;AF1=1;CI95=1,1;DP4=1,1,12,19;MQ=59;FQ=-104;PV4=1,0.42,0.4,0.2 GT:PL:GQ 1/1:77,69,0,77,75,73:99 34 | chr1 152276149 . C T 134 . 
DP=30;AF1=0.5;CI95=0.5,0.5;DP4=2,7,8,11;MQ=33;FQ=45;PV4=0.42,0.23,0.33,1 GT:PL:GQ 0/1:164,0,72:75 35 | -------------------------------------------------------------------------------- /test/test.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##FILTER= 3 | ##source=VarScan2 4 | ##INFO== 15"> 5 | ##INFO= 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##FILTER= 10 | ##FILTER= 11 | ##FORMAT= 12 | ##FORMAT= 13 | ##FORMAT= 14 | ##FORMAT== 15"> 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | ##FORMAT= 20 | ##FORMAT= 21 | ##FORMAT= 22 | ##FORMAT= 23 | ##FORMAT= 24 | ##FORMAT= 25 | ##contig= 26 | ##contig= 27 | ##contig= 28 | ##contig= 29 | ##contig= 30 | ##contig= 31 | ##contig= 32 | ##contig= 33 | ##contig= 34 | ##contig= 35 | ##contig= 36 | ##contig= 37 | ##contig= 38 | ##contig= 39 | ##contig= 40 | ##contig= 41 | ##contig= 42 | ##contig= 43 | ##contig= 44 | ##contig= 45 | ##contig= 46 | ##contig= 47 | ##contig= 48 | ##contig= 49 | ##INFO= 50 | ##INFO= 51 | ##INFO= 52 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_01-14G01394 Sample_02-14G04079 Sample_03-14G19847 Sample_04-15G00251 Sample_05-14G24871 Sample_06-04G03785 Sample_07-05G00575 Sample_08-04G05168 Sample_09-12G05170 Sample_10-14G19446-1 Sample_11-15G00148 Sample_12-11G13366 53 | chr1 949608 . G A . 
PASS ADP=231;WT=5;HET=6;HOM=1;NC=0;CSQ=missense_variant|aGc/aAc|S/N|ENSG00000187608|ISG15|ENST00000379389|2/2|benign(0.009)|tolerated(0.25)|83/165|protein_coding||ENSP00000368699.4:p.Ser83Asn|ENST00000379389.4:c.248G>A|YES,upstream_gene_variant|||ENSG00000224969|RP11-54O7.11|ENST00000458555|||||antisense||||YES GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR 0/0:395:240:240:235:3:1.25%:1.2421E-1:39:15:125:110:3:0 0/1:252:178:178:106:71:39.89%:6.0237E-26:39:38:56:50:32:39 0/1:255:193:193:88:105:54.4%:5.7979E-41:40:38:51:37:53:52 0/1:255:219:219:107:112:51.14%:7.0426E-43:41:39:62:45:60:52 0/1:255:239:239:128:110:46.03%:4.7111E-41:38:37:69:59:55:55 0/0:484:277:277:275:2:0.72%:2.4955E-1:38:15:135:140:2:0 0/1:255:227:227:126:101:44.49%:2.0201E-37:39:38:69:57:50:51 1/1:255:262:262:2:259:98.85%:7.1891E-152:40:38:2:0:138:121 0/1:255:246:246:137:108:43.9%:6.9049E-40:39:39:69:68:48:60 0/0:456:243:243:243:0:0%:1E0:38:0:127:116:0:0 0/0:348:195:195:194:1:0.51%:5E-1:37:15:93:101:1:0 0/0:458:264:264:261:2:0.76%:2.4952E-1:40:38:139:122:0:2 54 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
14 | 15 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " singlehtml to make a single large HTML file" 22 | @echo " pickle to make pickle files" 23 | @echo " json to make JSON files" 24 | @echo " htmlhelp to make HTML files and a HTML help project" 25 | @echo " qthelp to make HTML files and a qthelp project" 26 | @echo " devhelp to make HTML files and a Devhelp project" 27 | @echo " epub to make an epub" 28 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 29 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 30 | @echo " text to make text files" 31 | @echo " man to make manual pages" 32 | @echo " changes to make an overview of all changed/added/deprecated items" 33 | @echo " linkcheck to check all external links for integrity" 34 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 35 | 36 | clean: 37 | -rm -rf $(BUILDDIR)/* 38 | 39 | html: 40 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 41 | @echo 42 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 43 | 44 | dirhtml: 45 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 48 | 49 | singlehtml: 50 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 51 | @echo 52 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 53 | 54 | pickle: 55 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 56 | @echo 57 | @echo "Build finished; now you can process the pickle files." 
58 | 59 | json: 60 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 61 | @echo 62 | @echo "Build finished; now you can process the JSON files." 63 | 64 | htmlhelp: 65 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 66 | @echo 67 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 68 | ".hhp project file in $(BUILDDIR)/htmlhelp." 69 | 70 | qthelp: 71 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 72 | @echo 73 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 74 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 75 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PyVCF.qhcp" 76 | @echo "To view the help file:" 77 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PyVCF.qhc" 78 | 79 | devhelp: 80 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 81 | @echo 82 | @echo "Build finished." 83 | @echo "To view the help file:" 84 | @echo "# mkdir -p $$HOME/.local/share/devhelp/PyVCF" 85 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PyVCF" 86 | @echo "# devhelp" 87 | 88 | epub: 89 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 90 | @echo 91 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 92 | 93 | latex: 94 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 95 | @echo 96 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 97 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 98 | "(use \`make latexpdf' here to do that automatically)." 99 | 100 | latexpdf: 101 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 102 | @echo "Running LaTeX files through pdflatex..." 103 | make -C $(BUILDDIR)/latex all-pdf 104 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 105 | 106 | text: 107 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 108 | @echo 109 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 
110 | 111 | man: 112 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 113 | @echo 114 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 115 | 116 | changes: 117 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 118 | @echo 119 | @echo "The overview file is in $(BUILDDIR)/changes." 120 | 121 | linkcheck: 122 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 123 | @echo 124 | @echo "Link check complete; look for any errors in the above output " \ 125 | "or in $(BUILDDIR)/linkcheck/output.txt." 126 | 127 | doctest: 128 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 129 | @echo "Testing of doctests in the sources finished, look at the " \ 130 | "results in $(BUILDDIR)/doctest/output.txt." 131 | -------------------------------------------------------------------------------- /cyvcf/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''A VCFv4.0 parser for Python. 3 | 4 | Online version of PyVCF documentation is available at http://pyvcf.rtfd.org/ 5 | 6 | The intent of this module is to mimic the ``csv`` module in the Python stdlib, 7 | as opposed to more flexible serialization formats like JSON or YAML. ``vcf`` 8 | will attempt to parse the content of each record based on the data types 9 | specified in the meta-information lines -- specifically the ##INFO and 10 | ##FORMAT lines. If these lines are missing or incomplete, it will check 11 | against the reserved types mentioned in the spec. Failing that, it will just 12 | return strings. 13 | 14 | The main interface is the class: ``Reader``. It takes a file-like 15 | object and acts as a reader:: 16 | 17 | >>> import vcf 18 | >>> vcf_reader = vcf.Reader(open('test/example-4.0.vcf', 'rb')) 19 | >>> for record in vcf_reader: 20 | ... 
print record 21 | Record(CHROM=20, POS=14370, REF=G, ALT=['A']) 22 | Record(CHROM=20, POS=17330, REF=T, ALT=['A']) 23 | Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T']) 24 | Record(CHROM=20, POS=1230237, REF=T, ALT=[None]) 25 | Record(CHROM=20, POS=1234567, REF=GTCT, ALT=['G', 'GTACT']) 26 | 27 | 28 | This produces a great deal of information, but it is conveniently accessed. 29 | The attributes of a Record are the 8 fixed fields from the VCF spec:: 30 | 31 | * ``Record.CHROM`` 32 | * ``Record.POS`` 33 | * ``Record.ID`` 34 | * ``Record.REF`` 35 | * ``Record.ALT`` 36 | * ``Record.QUAL`` 37 | * ``Record.FILTER`` 38 | * ``Record.INFO`` 39 | 40 | plus attributes to handle genotype information: 41 | 42 | * ``Record.FORMAT`` 43 | * ``Record.samples`` 44 | * ``Record.genotype`` 45 | 46 | ``samples`` and ``genotype``, not being the title of any column, are left lowercase. The format 47 | of the fixed fields is from the spec. Comma-separated lists in the VCF are 48 | converted to lists. In particular, one-entry VCF lists are converted to 49 | one-entry Python lists (see, e.g., ``Record.ALT``). Semicolon-delimited lists 50 | of key=value pairs are converted to Python dictionaries, with flags being given 51 | a ``True`` value. 
Integers and floats are handled exactly as you'd expect:: 52 | 53 | >>> vcf_reader = vcf.Reader(open('test/example-4.0.vcf', 'rb')) 54 | >>> record = vcf_reader.next() 55 | >>> print record.POS 56 | 14370 57 | >>> print record.ALT 58 | ['A'] 59 | >>> print record.INFO['AF'] 60 | [0.5] 61 | 62 | There are a number of convenience methods and properties for each ``Record`` allowing you to 63 | examine properties of interest:: 64 | 65 | >>> print record.num_called, record.call_rate, record.num_unknown 66 | 3 1.0 0 67 | >>> print record.num_hom_ref, record.num_het, record.num_hom_alt 68 | 1 1 1 69 | >>> print record.nucl_diversity, record.aaf 70 | 0.6 0.5 71 | >>> print record.get_hets() 72 | [Call(sample=NA00002, GT=1|0, GQ=48)] 73 | >>> print record.is_snp, record.is_indel, record.is_transition, record.is_deletion 74 | True False True False 75 | >>> print record.var_type, record.var_subtype 76 | snp ts 77 | >>> print record.is_monomorphic 78 | False 79 | 80 | ``record.FORMAT`` will be a string specifying the format of the genotype 81 | fields. In case the FORMAT column does not exist, ``record.FORMAT`` is 82 | ``None``. Finally, ``record.samples`` is a list of dictionaries containing the 83 | parsed sample column and ``record.genotype`` is a way of looking up genotypes 84 | by sample name:: 85 | 86 | >>> record = vcf_reader.next() 87 | >>> for sample in record.samples: 88 | ... 
print sample['GT'] 89 | 0|0 90 | 0|1 91 | 0/0 92 | >>> print record.genotype('NA00001')['GT'] 93 | 0|0 94 | 95 | The genotypes are represented by ``Call`` objects, which have three attributes: the 96 | corresponding Record ``site``, the sample name in ``sample`` and a dictionary of 97 | call data in ``data``:: 98 | 99 | >>> call = record.genotype('NA00001') 100 | >>> print call.site 101 | Record(CHROM=20, POS=17330, REF=T, ALT=['A']) 102 | >>> print call.sample 103 | NA00001 104 | >>> print call.data 105 | {'GT': '0|0', 'HQ': [58, 50], 'DP': 3, 'GQ': 49} 106 | 107 | Please note that as of release 0.4.0, attributes known to have single values (such as 108 | ``DP`` and ``GQ`` above) are returned as values. Other attributes are returned 109 | as lists (such as ``HQ`` above). 110 | 111 | There are also a number of methods:: 112 | 113 | >>> print call.called, call.gt_type, call.gt_bases, call.phased 114 | True 0 T|T True 115 | 116 | 117 | Metadata regarding the VCF file itself can be investigated through the 118 | following attributes: 119 | 120 | * ``Reader.metadata`` 121 | * ``Reader.infos`` 122 | * ``Reader.filters`` 123 | * ``Reader.formats`` 124 | * ``Reader.samples`` 125 | 126 | For example:: 127 | 128 | >>> vcf_reader.metadata['fileDate'] 129 | '20090805' 130 | >>> vcf_reader.samples 131 | ['NA00001', 'NA00002', 'NA00003'] 132 | >>> vcf_reader.filters 133 | {'q10': Filter(id='q10', desc='Quality below 10'), 's50': Filter(id='s50', desc='Less than 50% of samples have data')} 134 | >>> vcf_reader.infos['AA'].desc 135 | 'Ancestral Allele' 136 | 137 | Random access is supported for files with tabix indexes. Simply call fetch for the 138 | region you are interested in:: 139 | 140 | >>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz') 141 | >>> for record in vcf_reader.fetch('20', 1110696, 1230237): 142 | ... 
print record 143 | Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T']) 144 | Record(CHROM=20, POS=1230237, REF=T, ALT=[None]) 145 | 146 | Or extract a single row:: 147 | 148 | >>> print vcf_reader.fetch('20', 1110696) 149 | Record(CHROM=20, POS=1110696, REF=A, ALT=['G', 'T']) 150 | 151 | 152 | The ``Writer`` class provides a way of writing a VCF file. Currently, you must specify a 153 | template ``Reader`` which provides the metadata:: 154 | 155 | >>> vcf_reader = vcf.Reader(filename='test/tb.vcf.gz') 156 | >>> vcf_writer = vcf.Writer(file('/dev/null', 'w'), vcf_reader) 157 | >>> for record in vcf_reader: 158 | ... vcf_writer.write_record(record) 159 | 160 | 161 | An extensible script is available to filter vcf files in vcf_filter.py. VCF filters 162 | declared by other packages will be available for use in this script. Please 163 | see :doc:`FILTERS` for full description. 164 | 165 | ''' 166 | from .parser import VCFReader, Reader, VCFWriter, Writer 167 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # PyVCF documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Jan 25 12:29:23 2012. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
19 | sys.path.insert(0, os.path.abspath('..')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.viewcode'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['.templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'PyVCF' 44 | copyright = u'2012, James Casbon, @jdoughertyii' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | import vcf 52 | version = vcf.VERSION 53 | # The full version, including alpha/beta/rc tags. 54 | release = vcf.VERSION 55 | 56 | # The language for content autogenerated by Sphinx. Refer to documentation 57 | # for a list of supported languages. 58 | #language = None 59 | 60 | # There are two options for replacing |today|: either, you set today to some 61 | # non-false value, then it is used: 62 | #today = '' 63 | # Else, today_fmt is used as the format for a strftime call. 64 | #today_fmt = '%B %d, %Y' 65 | 66 | # List of patterns, relative to source directory, that match files and 67 | # directories to ignore when looking for source files. 68 | exclude_patterns = ['.build'] 69 | 70 | # The reST default role (used for this markup: `text`) to use for all documents. 
71 | #default_role = None 72 | 73 | # If true, '()' will be appended to :func: etc. cross-reference text. 74 | #add_function_parentheses = True 75 | 76 | # If true, the current module name will be prepended to all description 77 | # unit titles (such as .. function::). 78 | #add_module_names = True 79 | 80 | # If true, sectionauthor and moduleauthor directives will be shown in the 81 | # output. They are ignored by default. 82 | #show_authors = False 83 | 84 | # The name of the Pygments (syntax highlighting) style to use. 85 | pygments_style = 'sphinx' 86 | 87 | # A list of ignored prefixes for module index sorting. 88 | #modindex_common_prefix = [] 89 | 90 | 91 | # -- Options for HTML output --------------------------------------------------- 92 | 93 | # The theme to use for HTML and HTML Help pages. See the documentation for 94 | # a list of builtin themes. 95 | html_theme = 'default' 96 | 97 | # Theme options are theme-specific and customize the look and feel of a theme 98 | # further. For a list of options available for each theme, see the 99 | # documentation. 100 | #html_theme_options = {} 101 | 102 | # Add any paths that contain custom themes here, relative to this directory. 103 | #html_theme_path = [] 104 | 105 | # The name for this set of Sphinx documents. If None, it defaults to 106 | # " v documentation". 107 | #html_title = None 108 | 109 | # A shorter title for the navigation bar. Default is the same as html_title. 110 | #html_short_title = None 111 | 112 | # The name of an image file (relative to this directory) to place at the top 113 | # of the sidebar. 114 | #html_logo = None 115 | 116 | # The name of an image file (within the static path) to use as favicon of the 117 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 118 | # pixels large. 119 | #html_favicon = None 120 | 121 | # Add any paths that contain custom static files (such as style sheets) here, 122 | # relative to this directory. 
They are copied after the builtin static files, 123 | # so a file named "default.css" will overwrite the builtin "default.css". 124 | html_static_path = ['.static'] 125 | 126 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 127 | # using the given strftime format. 128 | #html_last_updated_fmt = '%b %d, %Y' 129 | 130 | # If true, SmartyPants will be used to convert quotes and dashes to 131 | # typographically correct entities. 132 | #html_use_smartypants = True 133 | 134 | # Custom sidebar templates, maps document names to template names. 135 | #html_sidebars = {} 136 | 137 | # Additional templates that should be rendered to pages, maps page names to 138 | # template names. 139 | #html_additional_pages = {} 140 | 141 | # If false, no module index is generated. 142 | #html_domain_indices = True 143 | 144 | # If false, no index is generated. 145 | #html_use_index = True 146 | 147 | # If true, the index is split into individual pages for each letter. 148 | #html_split_index = False 149 | 150 | # If true, links to the reST sources are added to the pages. 151 | #html_show_sourcelink = True 152 | 153 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 154 | #html_show_sphinx = True 155 | 156 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 157 | #html_show_copyright = True 158 | 159 | # If true, an OpenSearch description file will be output, and all pages will 160 | # contain a tag referring to it. The value of this option must be the 161 | # base URL from which the finished HTML is served. 162 | #html_use_opensearch = '' 163 | 164 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 165 | #html_file_suffix = None 166 | 167 | # Output file base name for HTML help builder. 168 | htmlhelp_basename = 'PyVCFdoc' 169 | 170 | 171 | # -- Options for LaTeX output -------------------------------------------------- 172 | 173 | # The paper size ('letter' or 'a4'). 
174 | #latex_paper_size = 'letter' 175 | 176 | # The font size ('10pt', '11pt' or '12pt'). 177 | #latex_font_size = '10pt' 178 | 179 | # Grouping the document tree into LaTeX files. List of tuples 180 | # (source start file, target name, title, author, documentclass [howto/manual]). 181 | latex_documents = [ 182 | ('index', 'PyVCF.tex', u'PyVCF Documentation', 183 | u'James Casbon, @jdoughertyii', 'manual'), 184 | ] 185 | 186 | # The name of an image file (relative to this directory) to place at the top of 187 | # the title page. 188 | #latex_logo = None 189 | 190 | # For "manual" documents, if this is true, then toplevel headings are parts, 191 | # not chapters. 192 | #latex_use_parts = False 193 | 194 | # If true, show page references after internal links. 195 | #latex_show_pagerefs = False 196 | 197 | # If true, show URL addresses after external links. 198 | #latex_show_urls = False 199 | 200 | # Additional stuff for the LaTeX preamble. 201 | #latex_preamble = '' 202 | 203 | # Documents to append as an appendix to all manuals. 204 | #latex_appendices = [] 205 | 206 | # If false, no module index is generated. 207 | #latex_domain_indices = True 208 | 209 | 210 | # -- Options for manual page output -------------------------------------------- 211 | 212 | # One entry per manual page. List of tuples 213 | # (source start file, name, description, authors, manual section). 
214 | man_pages = [ 215 | ('index', 'pyvcf', u'PyVCF Documentation', 216 | [u'James Casbon, @jdoughertyii'], 1) 217 | ] 218 | -------------------------------------------------------------------------------- /test/dbsnp.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | ##fileDate=20120118 3 | ##source=dbSNP 4 | ##dbSNP_BUILD_ID=135 5 | ##reference=GRCh37.p5 6 | ##phasing=partial 7 | ##variationPropertyDocumentationUrl=ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##INFO= 16 | ##INFO= 17 | ##INFO= 18 | ##INFO= 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | ##INFO= 24 | ##INFO= 25 | ##INFO= 26 | ##INFO=SubSNP->Batch.link_out"> 27 | ##INFO= 28 | ##INFO= 29 | ##INFO= 30 | ##INFO= 31 | ##INFO= 32 | ##INFO= 33 | ##INFO= 34 | ##INFO= 35 | ##INFO= 36 | ##INFO= 37 | ##INFO= 38 | ##INFO= 39 | ##INFO= 40 | ##INFO= 41 | ##INFO= 42 | ##INFO= 43 | ##INFO= 44 | ##INFO=5% minor allele frequency in each and all populations"> 45 | ##INFO=5% minor allele frequency in 1+ populations"> 46 | ##INFO= 47 | ##INFO= 48 | ##INFO= 49 | ##INFO= 50 | ##INFO= 51 | ##INFO= 52 | ##INFO= 53 | ##INFO= 54 | ##INFO= 55 | ##INFO= 56 | ##INFO= 57 | ##INFO= 58 | ##INFO= 59 | ##INFO= 60 | ##INFO= 61 | ##INFO= 62 | ##FILTER= 63 | #CHROM POS ID REF ALT QUAL FILTER INFO 64 | 1 10144 rs144773400 TA T . . RSPOS=10145;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000000005000002000200;WGT=1;VC=DIV;ASP;OTHERKG 65 | 1 10228 rs143255646 TA T . . RSPOS=10229;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG 66 | 1 10234 rs145599635 C T . . RSPOS=10234;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG 67 | 1 10248 rs148908337 A T . . 
RSPOS=10248;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG 68 | 1 10254 rs140194106 TA T . . RSPOS=10255;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG 69 | 1 10291 rs145427775 C T . . RSPOS=10291;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG 70 | 1 10327 rs112750067 T C . . RSPOS=10327;dbSNPBuildID=132;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG 71 | 1 10329 rs150969722 AC A . . RSPOS=10330;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG 72 | 1 10351 rs145072688 CTA C,CA . . RSPOS=10352;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000210;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG;NOC 73 | 1 10382 rs147093981 AAC A,AC . . RSPOS=10383;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000210;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG;NOC 74 | 1 10433 rs56289060 A AC . . RSPOS=10433;dbSNPBuildID=129;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG 75 | 1 10439 rs112766696 AC A . . RSPOS=10440;dbSNPBuildID=132;SSR=0;SAO=0;VP=050100020015000102000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;SLO;R5;OTH;ASP;GNO;OTHERKG 76 | 1 10439 rs138941843 AC A . . RSPOS=10440;dbSNPBuildID=134;SSR=0;SAO=0;VP=050000020005000002000200;GENEINFO=LOC100652771:100652771;WGT=1;VC=DIV;R5;ASP;OTHERKG 77 | 1 10440 rs112155239 C A . . RSPOS=10440;dbSNPBuildID=132;SSR=0;SAO=0;VP=050000020015000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;OTH;ASP;OTHERKG 78 | 1 10492 rs55998931 C T . . 
RSPOS=10492;GMAF=0.0617001828153565;dbSNPBuildID=129;SSR=0;SAO=0;VP=050000020005040002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;VLD;OTHERKG 79 | 1 10519 rs62636508 G C . . RSPOS=10519;dbSNPBuildID=129;SSR=0;SAO=0;VP=050000020005000002000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;OTHERKG 80 | 1 10583 rs58108140 G A . . RSPOS=10583;GMAF=0.270566727605119;dbSNPBuildID=129;SSR=0;SAO=0;VP=050000020005040016000100;GENEINFO=LOC100652771:100652771;WGT=1;VC=SNV;R5;ASP;VLD;KGPhase1;KGPROD;OTHERKG 81 | -------------------------------------------------------------------------------- /test/null_genotype_mono.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##FORMAT= 3 | ##FORMAT= 4 | ##FORMAT= 5 | ##FORMAT= 6 | ##FORMAT= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##INFO= 16 | ##INFO= 17 | ##INFO= 18 | ##INFO= 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | ##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[np_control.bam] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=[tests/read_chr.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/data/reference/ucsc/hg19/ucsc.hg19.fasta rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=1 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=info log_to_file=null help=false genotype_likelihoods_model=SNP p_nonref_model=EXACT heterozygosity=0.001 pcr_error_rate=1.0E-4 genotyping_mode=DISCOVERY output_mode=EMIT_ALL_SITES 
standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 computeSLOD=false alleles=(RodBinding name= source=UNBOUND) min_base_quality_score=17 max_deletion_fraction=0.05 multiallelic=false max_alternate_alleles=5 min_indel_count_for_genotyping=5 indel_heterozygosity=1.25E-4 indelGapContinuationPenalty=10.0 indelGapOpenPenalty=45.0 indelHaplotypeSize=80 bandedIndel=false indelDebug=false ignoreSNPAlleles=false dbsnp=(RodBinding name= source=UNBOUND) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub debug_file=null metrics_file=null annotation=[] excludeAnnotation=[] filter_mismatching_base_and_quals=false" 24 | ##contig= 25 | ##contig= 26 | ##contig= 27 | ##contig= 28 | ##contig= 29 | ##contig= 30 | ##contig= 31 | ##contig= 32 | ##contig= 33 | ##contig= 34 | ##contig= 35 | ##contig= 36 | ##contig= 37 | ##contig= 38 | ##contig= 39 | ##contig= 40 | ##contig= 41 | ##contig= 42 | ##contig= 43 | ##contig= 44 | ##contig= 45 | ##contig= 46 | ##contig= 47 | ##contig= 48 | ##contig= 49 | ##contig= 50 | ##contig= 51 | ##contig= 52 | ##contig= 53 | ##contig= 54 | ##contig= 55 | ##contig= 56 | ##contig= 57 | ##contig= 58 | ##contig= 59 | ##contig= 60 | ##contig= 61 | ##contig= 62 | ##contig= 63 | ##contig= 64 | ##contig= 65 | ##contig= 66 | ##contig= 67 | ##contig= 68 | ##contig= 69 | ##contig= 70 | ##contig= 71 | ##contig= 72 | ##contig= 73 | ##contig= 74 | ##contig= 75 | ##contig= 76 | ##contig= 77 | ##contig= 78 | ##contig= 79 | ##contig= 80 | ##contig= 81 | ##contig= 82 | ##contig= 83 | ##contig= 84 | ##contig= 85 | ##contig= 86 | ##contig= 87 | ##contig= 88 | ##contig= 89 | ##contig= 90 | ##contig= 91 | ##contig= 92 | ##contig= 93 | ##contig= 94 | ##contig= 95 | ##contig= 96 | ##contig= 97 | ##contig= 98 | ##contig= 99 | ##contig= 100 | ##contig= 101 | ##contig= 102 | ##contig= 103 | ##contig= 
104 | ##contig= 105 | ##contig= 106 | ##contig= 107 | ##contig= 108 | ##contig= 109 | ##contig= 110 | ##contig= 111 | ##contig= 112 | ##contig= 113 | ##contig= 114 | ##contig= 115 | ##contig= 116 | ##contig= 117 | ##reference=file:///data/reference/ucsc/hg19/ucsc.hg19.fasta 118 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19239 119 | chr1 65312657 . G . . . . GT ./. 120 | -------------------------------------------------------------------------------- /ez_setup.py: -------------------------------------------------------------------------------- 1 | #!python 2 | """Bootstrap setuptools installation 3 | 4 | If you want to use setuptools in your package's setup.py, just include this 5 | file in the same directory with it, and add this to the top of your setup.py:: 6 | 7 | from ez_setup import use_setuptools 8 | use_setuptools() 9 | 10 | If you want to require a specific version of setuptools, set a download 11 | mirror, or use an alternate download directory, you can do so by supplying 12 | the appropriate options to ``use_setuptools()``. 13 | 14 | This file can also be run as a script to install or upgrade setuptools. 
import os
import sys

try:
    from hashlib import md5
except ImportError:  # very old interpreters ship md5 as its own module
    from md5 import md5

DEFAULT_VERSION = "0.6c11"

# Build the "X.Y" tag from version_info: slicing sys.version[:3] turns
# "3.10.4" into "3.1" and would request the wrong egg.
_PY_TAG = "%d.%d" % sys.version_info[:2]

DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % _PY_TAG

# NOTE: update_md5() rewrites this literal in place with a regex that looks
# for "md5_data = {" at the start of a line and stops at the first "}".
# Keep the layout as is and do not put a "}" inside the braces.
md5_data = {
    'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca',
    'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb',
    'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b',
    'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a',
    'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618',
    'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac',
    'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5',
    'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4',
    'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c',
    'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b',
    'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090',
    'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4',
    'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7',
    'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5',
    'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de',
    'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b',
    'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2',
    'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086',
    'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27',
    'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277',
    'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa',
    'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e',
    'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e',
    'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f',
    'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2',
    'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc',
    'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167',
    'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64',
    'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d',
    'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20',
    'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab',
    'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53',
    'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2',
    'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e',
    'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372',
    'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902',
    'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de',
    'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b',
    'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03',
    'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a',
    'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6',
    'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a',
}


def _warn(msg, *args):
    """Log a warning via distutils when available, else write it to stderr.

    distutils is imported lazily (as the original code did) and is optional
    because it no longer ships with Python 3.12 and later.
    """
    try:
        from distutils import log
    except ImportError:
        if args:
            msg = msg % args
        sys.stderr.write(msg + "\n")
    else:
        log.warn(msg, *args)


def _validate_md5(egg_name, data):
    """Return `data` unchanged after checking it against the md5 registry.

    Eggs that have no entry in `md5_data` are passed through unchecked.  On a
    digest mismatch a message is written to ``sys.stderr`` and the process
    exits with status 2.
    """
    if egg_name in md5_data:
        digest = md5(data).hexdigest()
        if digest != md5_data[egg_name]:
            sys.stderr.write(
                "md5 validation of %s failed! (Possible download problem?)\n"
                % egg_name
            )
            sys.exit(2)
    return data


def use_setuptools(
    version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
    download_delay=15
):
    """Automatically find/download setuptools and make it available on sys.path

    `version` should be a valid setuptools version number that is available
    as an egg for download under the `download_base` URL (which should end with
    a '/').  `to_dir` is the directory where setuptools will be downloaded, if
    it is not already available.  If `download_delay` is specified, it should
    be the number of seconds that will be paused before initiating a download,
    should one be required.  If an older version of setuptools is installed,
    this routine will print a message to ``sys.stderr`` and raise SystemExit in
    an attempt to abort the calling script.
    """
    # Must be sampled before the import attempt below alters sys.modules.
    was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules

    def do_download():
        egg = download_setuptools(version, download_base, to_dir, download_delay)
        sys.path.insert(0, egg)
        import setuptools
        setuptools.bootstrap_install_from = egg

    try:
        import pkg_resources
    except ImportError:
        return do_download()

    try:
        pkg_resources.require("setuptools>=" + version)
        return
    except pkg_resources.VersionConflict:
        # sys.exc_info() instead of "except ... as e" / "except ..., e" so the
        # same source parses on both Python 2 and Python 3.
        e = sys.exc_info()[1]
        if was_imported:
            # An already-imported setuptools cannot be swapped out in-process.
            sys.stderr.write((
                "The required version of setuptools (>=%s) is not available, and\n"
                "can't be installed while this script is running. Please install\n"
                " a more recent version first, using 'easy_install -U setuptools'."
                "\n\n(Currently using %r)"
            ) % (version, e.args[0]) + "\n")
            sys.exit(2)
    except pkg_resources.DistributionNotFound:
        pass

    del pkg_resources, sys.modules['pkg_resources']    # reload ok
    return do_download()


def download_setuptools(
    version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir,
    delay=15
):
    """Download setuptools from a specified location and return its filename

    `version` should be a valid setuptools version number that is available
    as an egg for download under the `download_base` URL (which should end
    with a '/'). `to_dir` is the directory where the egg will be downloaded.
    `delay` is the number of seconds to pause before an actual download attempt.

    If the egg file already exists in `to_dir`, nothing is downloaded and the
    real path of the existing file is returned.
    """
    try:
        from urllib.request import urlopen    # Python 3
    except ImportError:
        from urllib2 import urlopen           # Python 2

    egg_name = "setuptools-%s-py%s.egg" % (version, _PY_TAG)
    url = download_base + egg_name
    saveto = os.path.join(to_dir, egg_name)
    src = dst = None
    if not os.path.exists(saveto):  # Avoid repeated downloads
        try:
            if delay:
                _warn("""
---------------------------------------------------------------------------
This script requires setuptools version %s to run (even to display
help). I will attempt to download it for you (from
%s), but
you may need to enable firewall access for this script first.
I will start the download in %d seconds.

(Note: if this machine does not have network access, please obtain the file

   %s

and place it in this directory before rerunning this script.)
---------------------------------------------------------------------------""",
                    version, download_base, delay, url
                )
                from time import sleep
                sleep(delay)
            _warn("Downloading %s", url)
            src = urlopen(url)
            # Read/write all in one block, so we don't create a corrupt file
            # if the download is interrupted.
            data = _validate_md5(egg_name, src.read())
            dst = open(saveto, "wb")
            dst.write(data)
        finally:
            if src:
                src.close()
            if dst:
                dst.close()
    return os.path.realpath(saveto)


def main(argv, version=DEFAULT_VERSION):
    """Install or upgrade setuptools and EasyInstall

    `argv` is the list of command-line arguments to hand to easy_install;
    `version` is the minimum setuptools version required.
    """
    try:
        import setuptools
    except ImportError:
        egg = None
        try:
            egg = download_setuptools(version, delay=0)
            sys.path.insert(0, egg)
            from setuptools.command.easy_install import main as easy_install_main
            return easy_install_main(list(argv) + [egg])   # we're done here
        finally:
            # The downloaded egg is only a bootstrap vehicle; remove it.
            if egg and os.path.exists(egg):
                os.unlink(egg)
    else:
        if setuptools.__version__ == '0.0.1':
            sys.stderr.write(
                "You have an obsolete version of setuptools installed. Please\n"
                "remove it from your system entirely before rerunning this script."
                "\n"
            )
            sys.exit(2)

    req = "setuptools>=" + version
    import pkg_resources
    try:
        pkg_resources.require(req)
    except pkg_resources.VersionConflict:
        try:
            from setuptools.command.easy_install import main as easy_install_main
        except ImportError:
            from easy_install import main as easy_install_main
        easy_install_main(list(argv) + [download_setuptools(delay=0)])
        sys.exit(0)  # try to force an exit
    else:
        if argv:
            from setuptools.command.easy_install import main as easy_install_main
            easy_install_main(argv)
        else:
            sys.stdout.write(
                "Setuptools version %s or greater has been installed.\n" % version
            )
            sys.stdout.write(
                '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)\n'
            )


def update_md5(filenames):
    """Update our built-in md5 registry

    Computes the md5 digest of every file in `filenames`, stores it in
    `md5_data` under the file's base name, and rewrites the ``md5_data``
    literal in this module's own source file with the sorted entries.
    """
    import inspect
    import re

    for name in filenames:
        base = os.path.basename(name)
        f = open(name, 'rb')
        try:
            md5_data[base] = md5(f.read()).hexdigest()
        finally:
            f.close()

    data = ["    %r: %r,\n" % it for it in md5_data.items()]
    data.sort()
    repl = "".join(data)

    srcfile = inspect.getsourcefile(sys.modules[__name__])
    # Text mode for both the read and the write below: mixing 'rb' with 'w'
    # doubles line endings on Windows and feeds bytes to a str regex on
    # Python 3.
    f = open(srcfile, 'r')
    try:
        src = f.read()
    finally:
        f.close()

    match = re.search("\nmd5_data = {\n([^}]+)}", src)
    if not match:
        sys.stderr.write("Internal error!\n")
        sys.exit(2)

    src = src[:match.start(1)] + repl + src[match.end(1):]
    f = open(srcfile, 'w')
    try:
        f.write(src)
    finally:
        f.close()


if __name__ == '__main__':
    if len(sys.argv) > 2 and sys.argv[1] == '--md5update':
        update_md5(sys.argv[2:])
    else:
        main(sys.argv[1:])
##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | ##INFO= 24 | ##INFO= 25 | ##INFO= 26 | ##INFO= 27 | ##INFO= 28 | ##INFO= 29 | ##INFO= 30 | ##INFO= 31 | ##INFO= 32 | ##INFO= 33 | ##INFO= 34 | ##INFO= 35 | ##INFO= 36 | ##INFO= 37 | ##INFO= 38 | ##INFO= 39 | ##INFO= 40 | ##INFO= 41 | ##INFO= 42 | ##INFO= 43 | ##INFO= 44 | ##INFO= 45 | ##INFO= 46 | ##INFO= 47 | ##INFO= 48 | ##INFO= 49 | ##FORMAT= 50 | ##FORMAT= 51 | ##FORMAT= 52 | ##FORMAT= 53 | ##FORMAT= 54 | ##FORMAT= 55 | ##FORMAT= 56 | ##FORMAT= 57 | ##INFO= 58 | ##VEP=v76 cache=/shared/external_bin/ensembl-tools-release-76/cache/homo_sapiens/76_GRCh37 db=. 59 | ##INFO= 60 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT H_LS-E2-A14P-10A-01D-A19H-09 H_LS-E2-A14P-01A-31D-A19H-09 61 | 2 128046289 . C T 272.753 PASS SSC=42.1572;AB=0.1875;ABP=57.2971;AC=1;AF=0.25;AN=4;AO=12;CIGAR=1X;DP=94;DPB=94;DPRA=2.13333;EPP=9.52472;EPPR=3.11623;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=46.7056;PAIRED=1;PAIREDR=1;PAO=0;PQA=0;PQR=0;PRO=0;QA=439;QR=2940;RO=82;RPL=5;RPP=3.73412;RPPR=5.65844;RPR=7;RUN=1;SAF=4;SAP=5.9056;SAR=8;SRF=46;SRP=5.65844;SRR=36;TYPE=snp;technology.illumina=1;CSQ=missense_variant|cGa/cAa|R/Q|ENSG00000163161|ERCC3|ENST00000493187|7/15|possibly_damaging(0.862)|deleterious(0.05)|261/718|protein_coding,missense_variant|cGa/cAa|R/Q|ENSG00000163161|ERCC3|ENST00000456257|4/4|probably_damaging(0.967)|tolerated(0.09)|175/188|protein_coding,3_prime_UTR_variant&NMD_transcript_variant|||ENSG00000163161|ERCC3|ENST00000445889|7/15|||-/71|nonsense_mediated_decay,3_prime_UTR_variant&NMD_transcript_variant|||ENSG00000163161|ERCC3|ENST00000426778|7/15|||-/71|nonsense_mediated_decay,downstream_gene_variant|||ENSG00000163161|ERCC3|ENST00000490062|||||retained_intron,non_coding_exon_variant&nc_transcript_variant|||ENSG00000163161|ERCC3|ENST00000494464|6/7||||retained_intron,downstream_gene_variant|||ENSG00000163161|ERCC3|ENST00000460485|||||retained_intron,missense_variant|cGa/cAa|R/Q|ENSG00000163161|ERCC3|ENST0000028
5398|7/15|possibly_damaging(0.862)|deleterious(0.02)|325/782|protein_coding,downstream_gene_variant|||ENSG00000163161|ERCC3|ENST00000462306|||||retained_intron GT:GQ:DP:RO:QR:AO:QA:GL 0/0:151.949:30:30:1104:0:0:0 0/1:151.949:64:52:1836:12:439:-33.1263,0,-158.844 62 | 17 7578461 . C A 257.251 PASS SSC=35.826;AB=0.555556;ABP=3.49285;AC=1;AF=0.25;AN=4;AO=10;CIGAR=1X;DP=35;DPB=35;DPRA=1.05882;EPP=6.48466;EPPR=3.09716;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=15.9695;PAIRED=1;PAIREDR=1;PAO=0;PQA=0;PQR=0;PRO=0;QA=346;QR=872;RO=25;RPL=4;RPP=3.87889;RPPR=7.26639;RPR=6;RUN=1;SAF=5;SAP=3.0103;SAR=5;SRF=14;SRP=3.79203;SRR=11;TYPE=snp;technology.illumina=1;CSQ=non_coding_exon_variant&nc_transcript_variant|||ENSG00000141510|TP53|ENST00000504937|1/7||||retained_intron,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000508793|5/5|probably_damaging(0.997)|deleterious(0)|157/165|protein_coding,non_coding_exon_variant&nc_transcript_variant|||ENSG00000141510|TP53|ENST00000505014|4/5||||retained_intron,non_coding_exon_variant&nc_transcript_variant|||ENSG00000141510|TP53|ENST00000504290|1/8||||retained_intron,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000413465|4/7|probably_damaging(0.995)|deleterious(0)|157/285|protein_coding,downstream_gene_variant|||ENSG00000141510|TP53|ENST00000604348||||-/143|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000420246|5/12|probably_damaging(0.994)|deleterious(0)|157/341|protein_coding,upstream_gene_variant|||ENSG00000141510|TP53|ENST00000574684|||||processed_transcript,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000269305|5/11|probably_damaging(0.997)|deleterious(0)|157/393|protein_coding,upstream_gene_variant|||ENSG00000141510|TP53|ENST00000576024||||-/31|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000509690|2/6|probably_damaging(0.997)|deleterious(0)|25/199|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000359597|4/9|probably_dam
aging(0.994)|deleterious(0)|157/343|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000514944|4/6|probably_damaging(0.993)|deleterious(0)|64/155|protein_coding,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000445888|5/11|probably_damaging(0.997)|deleterious(0)|157/393|protein_coding,downstream_gene_variant|||ENSG00000141510|TP53|ENST00000503591||||-/128|protein_coding,non_coding_exon_variant&nc_transcript_variant|||ENSG00000141510|TP53|ENST00000510385|1/8||||retained_intron,missense_variant|Gtc/Ttc|V/F|ENSG00000141510|TP53|ENST00000455263|5/12|probably_damaging(0.987)|deleterious(0)|157/346|protein_coding GT:GQ:DP:RO:QR:AO:QA:GL 0/0:69.3546:17:17:582:0:0:0,-5.11751,-52.7224 0/1:160.002:18:8:290:10:346:-30.7085,0,-25.685 63 | 17 59861631 . CTGCTATTTTG CG 162.924 PASS SSC=46.502;AB=0.22;ABP=37.059;AC=1;AF=0.25;AN=4;AO=11;CIGAR=1M9D1M;DP=121;DPB=121.818;DPRA=0.704225;EPP=3.20771;EPPR=13.5489;GTI=0;LEN=9;MEANALT=1;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=37.5146;PAIRED=1;PAIREDR=1;PAO=2;PQA=14.5;PQR=604.5;PRO=20;QA=329;QR=4041;RO=109;RPL=7;RPP=4.78696;RPPR=3.03022;RPR=4;RUN=1;SAF=6;SAP=3.20771;SAR=5;SRF=62;SRP=7.4927;SRR=47;TYPE=del;technology.illumina=1;CSQ=inframe_deletion&splice_region_variant|CAAAATAGCAga/Cga|QNSR/R|ENSG00000136492|BRIP1|ENST00000259008|11/20|||540-543/1249|protein_coding,upstream_gene_variant|||ENSG00000136492|BRIP1|ENST00000583837|||||processed_transcript,inframe_deletion&splice_region_variant&NMD_transcript_variant|CAAAATAGCAga/Cga|QNSR/R|ENSG00000136492|BRIP1|ENST00000579028|2/6|||75-78/84|nonsense_mediated_decay,inframe_deletion&splice_region_variant|CAAAATAGCAga/Cga|QNSR/R|ENSG00000136492|BRIP1|ENST00000577598|10/18|||540-543/994|protein_coding GT:GQ:DP:RO:QR:AO:QA:GL 0/0:145.893:71:70:2598:0:0:0,-21.0721,-234.097 0/1:140.118:50:39:1443:11:329:-25.4299,0,-125.761 64 | 6 132856480 . GTTTTTTTTTTTTTGTATTTTTAGTAG GTTTTTTTTTTTTGTATTTTTAGTAG 39.3609 . 
SSC=10.4744;AB=0.121212;ABP=44.1367;AC=1;AF=0.25;AN=4;AO=5;CIGAR=1M1D25M;DP=52;DPB=77.8148;DPRA=0;EPP=3.44459;EPPR=14.8483;GTI=0;LEN=1;MEANALT=7.5;MQM=60;MQMR=58.6774;NS=2;NUMALT=1;ODDS=9.06305;PAIRED=1;PAIREDR=1;PAO=14.6667;PQA=465.083;PQR=759.583;PRO=23.6667;QA=153;QR=1074;RO=31;RPL=4;RPP=6.91895;RPPR=4.76149;RPR=1;RUN=1;SAF=1;SAP=6.91895;SAR=4;SRF=8;SRP=18.771;SRR=23;TYPE=del;technology.illumina=1;CSQ=upstream_gene_variant|||ENSG00000237110|TAAR9|ENST00000434551||||-/347|polymorphic_pseudogene GT:GQ:DP:RO:QR:AO:QA:GL 0/0:50.0188:19:11:368:1:15:0,-1.16979,-31.9349 0/1:39.3609:33:20:706:4:138:-9.30456,0,-60.533 65 | 6 132857169 . T C 2.77482 . SSC=9.94317;AB=0.0972222;ABP=104.466;AC=1;AF=0.25;AN=4;AO=10;CIGAR=1X;DP=115;DPB=115;DPRA=0;EPP=3.0103;EPPR=3.19643;GTI=1;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=0.11155;PAIRED=1;PAIREDR=1;PAO=0;PQA=0;PQR=0;PRO=0;QA=371;QR=3950;RO=105;RPL=10;RPP=24.725;RPPR=12.1305;RPR=0;RUN=1;SAF=5;SAP=3.0103;SAR=5;SRF=43;SRP=10.476;SRR=62;TYPE=snp;technology.illumina=1;CSQ=upstream_gene_variant|||ENSG00000237110|TAAR9|ENST00000434551||||-/347|polymorphic_pseudogene GT:GQ:DP:RO:QR:AO:QA:GL 0/0:1.01456e-11:43:40:1520:3:110:-1.41373,0,-128.327 0/1:2.77482:72:65:2430:7:261:-11.3569,0,-206.568 66 | 6 132922238 . A G 2.67302e-14 PASS SSC=35.3617;AB=0.0963855;ABP=237.896;AC=1;AF=0.25;AN=4;AO=17;CIGAR=1X;DP=319;DPB=319;DPRA=0;EPP=18.4661;EPPR=16.3493;GTI=0;LEN=1;MEANALT=1.5;MQM=37.4118;MQMR=42.5615;NS=2;NUMALT=1;ODDS=78.4706;PAIRED=0.764706;PAIREDR=0.923588;PAO=0;PQA=0;PQR=0;PRO=0;QA=243;QR=7960;RO=301;RPL=13;RPP=13.3567;RPPR=33.4903;RPR=4;RUN=1;SAF=10;SAP=4.1599;SAR=7;SRF=132;SRP=12.8865;SRR=169;TYPE=snp;technology.illumina=1;CSQ=intergenic_variant|||||||||| GT:GQ:DP:RO:QR:AO:QA:GL 0/0:142.108:153:152:3816:1:14:0,-42.4729,-342.291 0/1:142.108:166:149:4144:16:229:0,-7.11118,-352.491 67 | X 132838305 . 
GAAAAAAAAAAAAAGGTGAAAATT GAAAAAAAAAAAAGGTGAAAATT 70.8213 PASS SSC=19.9712;AB=0.15625;ABP=35.8538;AC=1;AF=0.25;AN=4;AO=5;CIGAR=1M1D22M;DP=67;DPB=84.4167;DPRA=0.914286;EPP=3.44459;EPPR=3.0608;GTI=1;LEN=1;MEANALT=6;MQM=60;MQMR=60;NS=2;NUMALT=1;ODDS=8.14546;PAIRED=1;PAIREDR=1;PAO=11.5;PQA=372.167;PQR=489.167;PRO=15.5;QA=164;QR=1476;RO=43;RPL=2;RPP=3.44459;RPPR=3.46479;RPR=3;RUN=1;SAF=4;SAP=6.91895;SAR=1;SRF=12;SRP=21.2406;SRR=31;TYPE=del;technology.illumina=1;CSQ=intron_variant&feature_truncation|||ENSG00000147257|GPC3|ENST00000406757||||-/256|protein_coding,splice_region_variant&intron_variant&feature_truncation|||ENSG00000147257|GPC3|ENST00000394299||||-/603|protein_coding,intron_variant&feature_truncation|||ENSG00000147257|GPC3|ENST00000370818||||-/580|protein_coding,intron_variant&feature_truncation|||ENSG00000147257|GPC3|ENST00000543339||||-/526|protein_coding GT:GQ:DP:RO:QR:AO:QA:GL 0/0:0.00125948:35:25:842:0:0:0,-7.52575,-75.9606 0/1:70.8147:32:18:634:5:164:-12.4455,0,-54.8329 68 | -------------------------------------------------------------------------------- /test/gatk.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##FORMAT= 3 | ##FORMAT= 4 | ##FORMAT= 5 | ##FORMAT= 6 | ##FORMAT= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##INFO= 16 | ##INFO= 17 | ##INFO= 18 | ##INFO= 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | ##INFO= 24 | ##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[reads.bam] read_buffer_size=null phone_home=NO_ET read_filter=[] intervals=[chr22:42020321-42527953] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/data/reference/ucsc/hg19/ucsc.hg19.fasta rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=CALCULATE_AS_NECESSARY baqGapOpenPenalty=40.0 performanceLog=null 
useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=2 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=INFO log_to_file=gatk.log help=false genotype_likelihoods_model=SNP p_nonref_model=EXACT heterozygosity=0.001 pcr_error_rate=1.0E-4 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 computeSLOD=false alleles=(RodBinding name= source=UNBOUND) min_base_quality_score=17 max_deletion_fraction=0.15 multiallelic=false max_alternate_alleles=5 min_indel_count_for_genotyping=5 indel_heterozygosity=1.25E-4 indelGapContinuationPenalty=10.0 indelGapOpenPenalty=45.0 indelHaplotypeSize=80 bandedIndel=false indelDebug=false ignoreSNPAlleles=false dbsnp=(RodBinding name=dbsnp source=/data/reference/dbSNP_132/dbsnp_132.hg19.vcf) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub debug_file=null metrics_file=null annotation=[] excludeAnnotation=[] filter_mismatching_base_and_quals=false" 25 | ##contig= 26 | ##contig= 27 | ##contig= 28 | ##contig= 29 | ##contig= 30 | ##contig= 31 | ##contig= 32 | ##contig= 33 | ##contig= 34 | ##contig= 35 | ##contig= 36 | ##contig= 37 | ##contig= 38 | ##contig= 39 | ##contig= 40 | ##contig= 41 | ##contig= 42 | ##contig= 43 | ##contig= 44 | ##contig= 45 | ##contig= 46 | ##contig= 47 | ##contig= 48 | ##contig= 49 | ##contig= 50 | ##contig= 51 | ##contig= 52 | ##contig= 53 | ##contig= 54 | ##contig= 55 | ##contig= 56 | ##contig= 57 | ##contig= 58 | ##contig= 59 | ##contig= 60 | ##contig= 61 | ##contig= 62 | ##contig= 63 | ##contig= 64 | ##contig= 65 | ##contig= 66 | ##contig= 67 | ##contig= 68 | ##contig= 
69 | ##contig= 70 | ##contig= 71 | ##contig= 72 | ##contig= 73 | ##contig= 74 | ##contig= 75 | ##contig= 76 | ##contig= 77 | ##contig= 78 | ##contig= 79 | ##contig= 80 | ##contig= 81 | ##contig= 82 | ##contig= 83 | ##contig= 84 | ##contig= 85 | ##contig= 86 | ##contig= 87 | ##contig= 88 | ##contig= 89 | ##contig= 90 | ##contig= 91 | ##contig= 92 | ##contig= 93 | ##contig= 94 | ##contig= 95 | ##contig= 96 | ##contig= 97 | ##contig= 98 | ##contig= 99 | ##contig= 100 | ##contig= 101 | ##contig= 102 | ##contig= 103 | ##contig= 104 | ##contig= 105 | ##contig= 106 | ##contig= 107 | ##contig= 108 | ##contig= 109 | ##contig= 110 | ##contig= 111 | ##contig= 112 | ##contig= 113 | ##contig= 114 | ##contig= 115 | ##contig= 116 | ##contig= 117 | ##contig= 118 | ##reference=file:///data/reference/ucsc/hg19/ucsc.hg19.fasta 119 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT BLANK NA12878 NA12891 NA12892 NA19238 NA19239 NA19240 120 | chr22 42522392 rs28371738 G A 2951.95 . AC=2;AF=0.143;AN=14;BaseQRankSum=0.375;DB;DP=1506;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=123.5516;MQ=253.92;MQ0=0;MQRankSum=0.685;QD=5.90;ReadPosRankSum=0.590 GT:AD:DP:GQ:PL 0/0:6,0:6:18.04:0,18,211 0/1:138,107:250:99:1961,0,3049 0/1:169,77:250:99:1038,0,3533 0/0:249,0:250:99:0,600,5732 0/0:248,1:250:99:0,627,6191 0/0:250,0:250:99:0,615,5899 0/0:250,0:250:99:0,579,5674 121 | chr22 42522613 rs1135840 G C 11611.03 . AC=6;AF=0.429;AN=14;BaseQRankSum=16.289;DB;DP=1518;DS;Dels=0.03;FS=0.000;HRun=0;HaplotypeScore=142.5716;MQ=242.46;MQ0=0;MQRankSum=2.010;QD=9.16;ReadPosRankSum=-1.731 GT:AD:DP:GQ:PL 0/1:13,4:17:62.64:63,0,296 0/1:118,127:246:99:2396,0,1719 0/0:241,0:244:99:0,459,4476 0/1:161,85:246:99:1489,0,2353 0/1:110,132:242:99:2561,0,1488 0/1:106,135:242:99:2613,0,1389 0/1:116,126:243:99:2489,0,1537 122 | chr22 42522755 . C G 36.98 . 
AC=1;AF=0.071;AN=14;BaseQRankSum=-14.866;DP=1527;DS;Dels=0.01;FS=0.000;HRun=0;HaplotypeScore=253.4254;MQ=197.36;MQ0=2;MQRankSum=-10.810;QD=0.15;ReadPosRankSum=-17.244 GT:AD:DP:GQ:PL 0/0:26,1:27:51.08:0,51,570 0/0:208,40:248:99:0,236,4169 0/0:192,56:249:99:0,114,4292 0/1:179,66:245:75.42:75,0,3683 0/0:214,32:246:99:0,172,4235 0/0:200,49:249:61.05:0,61,4049 0/0:195,50:246:32.07:0,32,3757 123 | chr22 42523003 rs116917064 A G 7113.55 . AC=8;AF=0.571;AN=14;BaseQRankSum=6.026;DB;DP=1433;DS;Dels=0.00;FS=0.000;HRun=1;HaplotypeScore=101.7894;MQ=182.04;MQ0=0;MQRankSum=-2.501;QD=4.96;ReadPosRankSum=8.294 GT:AD:DP:GQ:PL 0/1:10,2:12:0.62:1,0,257 1/1:9,173:183:99:2385,273,0 0/1:153,95:249:99:355,0,2355 0/1:140,110:250:99:1334,0,2242 0/1:164,85:249:99:1070,0,2279 0/1:160,90:250:99:1245,0,2300 0/1:156,81:238:99:724,0,2764 124 | chr22 42523077 . A G 54.31 . AC=1;AF=0.071;AN=14;BaseQRankSum=-0.563;DP=1521;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=54.8434;MQ=164.04;MQ0=1;MQRankSum=-2.419;QD=2.59;ReadPosRankSum=-1.229 GT:AD:DP:GQ:PL 0/1:17,4:21:92.74:93,0,533 0/0:249,1:250:99:0,544,6985 0/0:250,0:250:99:0,577,6968 0/0:248,2:250:99:0,605,7687 0/0:248,1:249:99:0,583,7300 0/0:246,2:249:99:0,626,7473 0/0:248,1:249:99:0,594,7553 125 | chr22 42523209 rs28371730 T C 15556.89 . AC=8;AF=0.571;AN=14;BaseQRankSum=3.458;DB;DP=1509;DS;Dels=0.01;FS=0.000;HRun=0;HaplotypeScore=120.8206;MQ=221.07;MQ0=0;MQRankSum=-4.945;QD=10.31;ReadPosRankSum=0.639 GT:AD:DP:GQ:PL 0/1:3,6:9:99:154,0,101 1/1:6,237:247:99:4532,308,0 0/1:130,117:248:99:1399,0,3147 0/1:112,129:244:99:2641,0,2556 0/1:115,127:247:99:2320,0,2526 0/1:115,128:248:99:2546,0,2520 0/1:143,104:249:99:1965,0,3288 126 | chr22 42523211 rs2004511 T C 2445.52 . 
AC=2;AF=0.143;AN=14;BaseQRankSum=10.587;DB;DP=1509;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=102.7564;MQ=221.50;MQ0=0;MQRankSum=-6.926;QD=4.89;ReadPosRankSum=2.057 GT:AD:DP:GQ:PL 0/0:9,0:9:24.06:0,24,289 0/1:136,113:250:99:1384,0,2176 0/1:146,104:250:99:1108,0,2809 0/0:247,3:250:99:0,439,5546 0/0:245,2:249:99:0,459,5316 0/0:248,2:250:99:0,459,5404 0/0:248,1:250:99:0,533,6069 127 | chr22 42523409 rs1985842 G T 6801.90 . AC=6;AF=0.429;AN=14;BaseQRankSum=20.509;DB;DP=1454;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=150.8967;MQ=200.12;MQ0=0;MQRankSum=4.472;QD=5.65;ReadPosRankSum=9.396 GT:AD:DP:GQ:PL 0/1:1,3:4:25.84:53,0,26 0/1:153,95:249:99:1597,0,1798 0/0:245,4:250:99:0,336,4079 0/1:168,82:250:99:1339,0,1880 0/1:147,103:250:99:1522,0,1805 0/1:156,94:250:99:1341,0,2322 0/1:129,71:201:99:949,0,2082 128 | chr22 42523805 rs28371725 C T 1637.33 . AC=1;AF=0.071;AN=14;BaseQRankSum=-0.379;DB;DP=1516;DS;Dels=0.00;FS=0.000;HRun=2;HaplotypeScore=77.2321;MQ=226.05;MQ0=0;MQRankSum=2.862;QD=6.55;ReadPosRankSum=0.064 GT:AD:DP:GQ:PL 0/0:16,0:16:39.09:0,39,475 0/0:248,1:249:99:0,613,7187 0/1:132,116:248:99:1676,0,2916 0/0:248,0:248:99:0,625,7171 0/0:248,2:250:99:0,604,7252 0/0:250,0:250:99:0,631,7426 0/0:248,1:249:99:0,584,6964 129 | chr22 42523943 rs16947 A G 23661.10 . AC=8;AF=0.571;AN=14;BaseQRankSum=4.602;DB;DP=1514;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=38.3217;MQ=238.64;MQ0=0;MQRankSum=2.485;QD=15.63;ReadPosRankSum=3.749 GT:AD:DP:GQ:PL 0/1:9,5:14:99:163,0,303 1/1:3,246:250:99:8092,667,0 0/1:129,116:246:99:3190,0,2852 0/1:149,98:247:99:2429,0,3588 0/1:129,118:247:99:3267,0,3052 0/1:122,123:245:99:3428,0,3052 0/1:124,119:244:99:3092,0,2845 130 | chr22 42524150 . C G 3758.65 . 
AC=8;AF=0.571;AN=14;BaseQRankSum=24.314;DP=1506;DS;Dels=0.00;FS=0.000;HRun=1;HaplotypeScore=172.5901;MQ=242.92;MQ0=0;MQRankSum=11.537;QD=2.50;ReadPosRankSum=-9.185 GT:AD:DP:GQ:PL 1/1:3,3:6:5.98:46,6,0 0/1:161,88:250:99:708,0,300 0/1:161,88:250:99:635,0,308 0/1:160,90:250:99:658,0,229 0/1:180,69:250:99:478,0,113 0/1:176,73:250:99:530,0,271 0/1:170,79:249:99:704,0,133 131 | chr22 42524435 rs1807313 T A 5252.25 . AC=3;AF=0.214;AN=14;BaseQRankSum=-0.192;DB;DP=1526;DS;Dels=0.01;FS=0.000;HRun=1;HaplotypeScore=152.3866;MQ=242.06;MQ0=0;MQRankSum=1.923;QD=9.99;ReadPosRankSum=3.008 GT:AD:DP:GQ:PL 0/1:7,19:26:99:456,0,195 0/0:250,0:250:99:0,698,8167 0/0:246,2:249:99:0,673,7735 0/0:248,2:250:99:0,685,7919 0/0:250,0:250:99:0,688,7814 0/1:120,126:247:99:2539,0,3250 0/1:131,110:246:99:2257,0,3278 132 | chr22 42524696 rs58440431 T C 6423.61 . AC=2;AF=0.143;AN=14;BaseQRankSum=3.119;DB;DP=1509;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=53.0005;MQ=230.78;MQ0=0;MQRankSum=2.825;QD=12.85;ReadPosRankSum=2.051 GT:AD:DP:GQ:PL 0/0:9,0:9:27.08:0,27,351 0/1:132,116:250:99:3341,0,3914 0/1:141,108:250:99:3082,0,3917 0/0:248,1:250:99:0,692,8578 0/0:250,0:250:99:0,743,8836 0/0:247,2:250:99:0,695,8726 0/0:249,1:250:99:0,699,8650 133 | chr22 42524947 rs3892097 C T 731.18 . AC=2;AF=0.143;AN=14;BaseQRankSum=0.602;DB;DP=1495;DS;Dels=0.01;FS=0.000;HRun=1;HaplotypeScore=154.5421;MQ=217.65;MQ0=0;MQRankSum=4.304;QD=1.47;ReadPosRankSum=1.019 GT:AD:DP:GQ:PL 0/0:3,0:3:8.99:0,9,89 0/1:108,75:244:99:403,0,1684 0/1:125,74:242:99:375,0,2335 0/0:227,1:249:99:0,460,5036 0/0:226,1:247:99:0,448,4884 0/0:192,1:247:99:0,400,4405 0/0:194,1:247:99:0,405,4694 134 | chr22 42525132 rs1058164 G C 14639.91 . 
AC=5;AF=0.357;AN=14;BaseQRankSum=4.944;DB;DP=1508;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=38.1229;MQ=207.02;MQ0=6;MQRankSum=2.510;QD=11.71;ReadPosRankSum=0.306 GT:AD:DP:GQ:PL 0/0:8,0:8:24.05:0,24,309 0/1:125,125:250:99:3147,0,3294 0/0:245,1:248:99:0,549,7172 0/1:139,109:248:99:2470,0,3232 0/1:136,107:243:99:2545,0,3408 0/1:116,130:247:99:3206,0,2926 0/1:122,124:247:99:3271,0,3300 135 | chr22 42525772 rs28371706 G A 7552.52 . AC=4;AF=0.286;AN=14;BaseQRankSum=12.028;DB;DP=1506;DS;Dels=0.01;FS=0.000;HRun=0;HaplotypeScore=89.8512;MQ=222.09;MQ0=0;MQRankSum=5.200;QD=9.99;ReadPosRankSum=2.275 GT:AD:DP:GQ:PL 0/1:4,2:6:29.34:29,0,147 0/0:249,0:249:99:0,592,6835 0/0:249,1:250:99:0,590,7041 0/0:248,0:248:99:0,652,7316 0/1:126,120:248:99:2668,0,2833 0/1:134,113:247:99:2453,0,2485 0/1:137,113:250:99:2403,0,2988 136 | chr22 42525798 rs28371705 G C 1954.58 . AC=2;AF=0.143;AN=14;BaseQRankSum=6.229;DB;DP=1509;DS;Dels=0.00;FS=0.000;HRun=1;HaplotypeScore=36.0442;MQ=228.55;MQ0=0;MQRankSum=0.852;QD=3.91;ReadPosRankSum=6.520 GT:AD:DP:GQ:PL 0/0:9,0:9:27.08:0,27,342 0/1:164,85:250:99:981,0,3519 0/1:171,79:250:99:1020,0,3665 0/0:249,1:250:99:0,526,6474 0/0:249,1:250:99:0,550,6481 0/0:248,2:250:99:0,542,6933 0/0:250,0:250:99:0,604,7282 137 | chr22 42525811 rs28371704 T C 3688.26 . AC=2;AF=0.143;AN=14;BaseQRankSum=4.752;DB;DP=1510;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=36.9902;MQ=210.28;MQ0=0;MQRankSum=2.309;QD=7.38;ReadPosRankSum=6.262 GT:AD:DP:GQ:PL 0/0:10,0:10:27.06:0,27,333 0/1:163,86:249:99:1958,0,3391 0/1:167,78:245:99:1730,0,3945 0/0:248,1:249:99:0,542,6887 0/0:246,1:247:99:0,550,6569 0/0:247,1:250:99:0,548,6954 0/0:249,1:250:99:0,557,7079 138 | chr22 42525821 rs28371703 G T 3940.90 . 
AC=2;AF=0.143;AN=14;BaseQRankSum=4.652;DB;DP=1510;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=34.0483;MQ=210.28;MQ0=0;MQRankSum=2.924;QD=7.88;ReadPosRankSum=5.487 GT:AD:DP:GQ:PL 0/0:10,0:10:24.08:0,24,317 0/1:164,85:250:99:2033,0,3659 0/1:167,79:249:99:1907,0,4271 0/0:249,1:250:99:0,565,7321 0/0:249,1:250:99:0,545,7102 0/0:248,2:250:99:0,536,7254 0/0:249,0:250:99:0,605,7633 139 | chr22 42525952 rs71328650 C A 5872.92 . AC=7;AF=0.500;AN=14;BaseQRankSum=25.986;DB;DP=1505;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=144.2979;MQ=173.55;MQ0=0;MQRankSum=3.660;QD=4.68;ReadPosRankSum=7.152 GT:AD:DP:GQ:PL 1/1:2,3:5:6:53,6,0 0/1:132,117:250:99:1397,0,702 0/0:248,1:250:99:0,245,2219 0/1:166,83:250:99:1151,0,934 0/1:164,86:250:99:1070,0,1147 0/1:170,80:250:99:1009,0,1141 0/1:162,87:250:99:1194,0,1085 140 | chr22 42526049 . C G 8544.41 . AC=10;AF=0.714;AN=14;BaseQRankSum=-8.121;DP=1505;DS;Dels=0.01;FS=0.000;HRun=0;HaplotypeScore=241.7335;MQ=162.18;MQ0=2;MQRankSum=-1.399;QD=6.81;ReadPosRankSum=2.132 GT:AD:DP:GQ:PL 1/1:0,5:5:3:26,3,0 0/1:86,162:248:99:1053,0,1167 0/0:235,12:248:99:0,378,3886 0/1:108,137:245:99:782,0,1662 1/1:3,242:245:99:2351,264,0 1/1:5,245:250:99:2193,222,0 1/1:4,242:246:99:2140,240,0 141 | chr22 42526449 . T A 151.47 . AC=1;AF=0.071;AN=14;BaseQRankSum=2.662;DP=1226;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=41.2083;MQ=240.47;MQ0=0;MQRankSum=0.578;QD=4.89;ReadPosRankSum=3.611 GT:AD:DP:GQ:PL 0/1:23,8:31:99:190,0,694 0/0:188,0:190:99:0,478,5376 0/0:187,0:187:99:0,493,5322 0/0:247,0:249:99:0,634,6728 0/0:185,0:185:99:0,487,5515 0/0:202,0:202:99:0,520,5857 0/0:181,1:182:99:0,440,5362 142 | chr22 42526484 rs28371699 A C 4220.99 . 
AC=6;AF=0.429;AN=14;BaseQRankSum=-17.855;DB;DP=1532;DS;Dels=0.02;FS=0.000;HRun=0;HaplotypeScore=136.8893;MQ=233.92;MQ0=0;MQRankSum=3.448;QD=3.29;ReadPosRankSum=-2.663 GT:AD:DP:GQ:PL 0/1:16,15:31:99:238,0,428 0/1:112,135:247:99:796,0,1908 0/0:227,13:241:99:0,433,4747 0/1:108,133:242:99:588,0,2014 0/1:90,154:245:99:1055,0,1892 0/1:112,131:246:99:741,0,2222 0/1:108,137:246:99:803,0,2266 143 | chr22 42526549 rs56011157 C T 14276.31 . AC=8;AF=0.571;AN=14;BaseQRankSum=17.750;DB;DP=1537;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=87.3394;MQ=231.34;MQ0=0;MQRankSum=4.781;QD=9.29;ReadPosRankSum=7.463 GT:AD:DP:GQ:PL 0/1:22,15:37:99:251,0,330 1/1:23,227:250:99:5404,430,0 0/1:151,98:250:99:1878,0,2475 0/1:153,97:250:99:1769,0,2410 0/1:149,100:250:99:1792,0,2569 0/1:164,84:250:99:1440,0,2646 0/1:149,98:248:99:1742,0,2601 144 | chr22 42526561 rs28695233 G T 4524.61 . AC=7;AF=0.500;AN=14;BaseQRankSum=9.714;DB;DP=1538;DS;Dels=0.00;FS=0.000;HRun=1;HaplotypeScore=98.8415;MQ=220.45;MQ0=0;MQRankSum=9.430;QD=3.02;ReadPosRankSum=7.682 GT:AD:DP:GQ:PL 0/0:22,15:38:15.74:0,16,609 1/1:4,240:249:99:2685,237,0 0/1:142,108:250:99:505,0,3133 0/1:138,109:249:99:521,0,3281 0/1:150,99:249:99:336,0,3601 0/1:153,93:250:99:194,0,3695 0/1:148,97:249:99:283,0,3093 145 | chr22 42526562 rs75276289 G C 3780.51 . AC=6;AF=0.429;AN=14;BaseQRankSum=15.200;DB;DP=1540;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=116.4370;MQ=215.67;MQ0=0;MQRankSum=9.072;QD=2.52;ReadPosRankSum=10.863 GT:AD:DP:GQ:PL 0/0:25,15:40:17.73:0,18,633 0/1:50,199:250:99:1522,0,283 0/1:143,106:250:99:600,0,2844 0/1:143,107:250:99:605,0,3002 0/1:151,99:250:99:432,0,3352 0/1:157,93:250:99:254,0,3483 0/1:149,99:248:99:368,0,2999 146 | chr22 42526567 rs76312385 G A 434.33 . 
AC=1;AF=0.071;AN=14;BaseQRankSum=18.089;DB;DP=1540;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=89.3746;MQ=219.80;MQ0=0;MQRankSum=6.196;QD=1.74;ReadPosRankSum=7.564 GT:AD:DP:GQ:PL 0/0:22,18:40:4.68:0,5,427 0/1:34,215:250:56.26:473,0,56 0/0:142,108:250:20.78:0,21,2288 0/0:142,108:250:49.48:0,49,2451 0/0:152,97:250:99:0,210,2801 0/0:150,100:250:34.96:0,35,2515 0/0:148,102:250:77.19:0,77,2590 147 | chr22 42526571 rs74644586 C G 339.60 . AC=1;AF=0.071;AN=14;BaseQRankSum=-11.480;DB;DP=1540;DS;Dels=0.02;FS=0.000;HRun=4;HaplotypeScore=93.3402;MQ=218.52;MQ0=0;MQRankSum=3.709;QD=1.36;ReadPosRankSum=6.322 GT:AD:DP:GQ:PL 0/0:22,18:40:36.46:0,36,689 0/1:4,232:239:30.49:378,0,30 0/0:138,110:249:99:0,295,4017 0/0:137,111:249:99:0,250,4041 0/0:147,97:245:99:0,321,4348 0/0:150,97:247:99:0,358,4657 0/0:144,101:247:99:0,275,4123 148 | chr22 42526573 rs1080996 T G 12579.34 . AC=8;AF=0.571;AN=14;BaseQRankSum=6.163;DB;DP=1540;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=76.6550;MQ=224.49;MQ0=0;MQRankSum=1.355;QD=8.17;ReadPosRankSum=5.794 GT:AD:DP:GQ:PL 0/1:22,18:40:99:200,0,668 1/1:4,244:248:99:5175,439,0 0/1:136,110:250:99:1862,0,3521 0/1:136,113:249:99:1734,0,3677 0/1:144,99:250:99:1119,0,3818 0/1:150,99:250:99:1196,0,4178 0/1:145,104:250:99:1293,0,3628 149 | chr22 42526580 rs1080995 G C 16619.47 . AC=8;AF=0.571;AN=14;BaseQRankSum=7.991;DB;DP=1541;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=56.1489;MQ=221.29;MQ0=0;MQRankSum=2.223;QD=10.78;ReadPosRankSum=4.443 GT:AD:DP:GQ:PL 0/1:22,19:41:99:335,0,664 1/1:15,234:250:99:5895,337,0 0/1:137,113:250:99:2421,0,3301 0/1:134,116:250:99:2262,0,3430 0/1:144,105:250:99:1929,0,3421 0/1:148,101:250:99:1778,0,3867 0/1:142,108:250:99:1999,0,3334 150 | chr22 42526634 . T C 32.60 . 
AC=1;AF=0.071;AN=14;BaseQRankSum=1.147;DP=1225;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=50.0151;MQ=240.65;MQ0=0;MQRankSum=1.151;QD=1.30;ReadPosRankSum=1.276 GT:AD:DP:GQ:PL 0/1:21,4:25:71.04:71,0,702 0/0:187,2:189:99:0,481,6080 0/0:233,0:233:99:0,667,7351 0/0:230,0:230:99:0,667,7394 0/0:174,1:175:99:0,446,5469 0/0:194,2:196:99:0,498,6239 0/0:174,0:175:99:0,511,5894 151 | chr22 42526679 . G C 60.60 . AC=1;AF=0.071;AN=14;BaseQRankSum=-12.425;DP=1525;DS;Dels=0.09;FS=0.000;HRun=1;HaplotypeScore=331.3182;MQ=215.48;MQ0=0;MQRankSum=-14.680;QD=0.24;ReadPosRankSum=-13.323 GT:AD:DP:GQ:PL 0/0:23,0:23:66.17:0,66,829 0/1:175,56:232:99:99,0,4273 0/0:199,26:226:76.45:0,76,5104 0/0:196,37:233:41.98:0,42,5109 0/0:170,47:218:99:0,162,4505 0/0:188,36:224:99:0,230,4974 0/0:177,47:225:99:0,167,4592 152 | chr22 42526694 rs1065852 G A 4420.63 . AC=2;AF=0.143;AN=14;BaseQRankSum=8.566;DB;DP=1529;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=117.6833;MQ=214.96;MQ0=0;MQRankSum=5.852;QD=8.84;ReadPosRankSum=1.454 GT:AD:DP:GQ:PL 0/0:29,0:29:81.24:0,81,1040 0/1:136,114:250:99:2333,0,3170 0/1:145,104:250:99:2087,0,2794 0/0:250,0:250:99:0,586,6963 0/0:247,2:250:99:0,497,6185 0/0:248,2:250:99:0,544,6640 0/0:250,0:250:99:0,571,6444 153 | chr22 42527471 rs28633410 T C 26831.16 . AC=10;AF=0.833;AN=12;BaseQRankSum=-1.092;DB;DP=1501;DS;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=79.3853;MQ=176.86;MQ0=0;MQRankSum=-2.644;QD=17.89;ReadPosRankSum=2.185 GT:AD:DP:GQ:PL ./. 1/1:1,249:250:99:5741,478,0 0/1:102,148:250:99:3026,0,1748 0/1:115,132:250:99:2716,0,1896 1/1:1,249:250:99:5040,392,0 1/1:1,248:250:99:5109,427,0 1/1:4,245:249:99:5199,306,0 154 | chr22 42527533 rs28624811 A G 13619.46 . AC=7;AF=0.583;AN=12;BaseQRankSum=-8.893;DB;DP=1501;DS;Dels=0.01;FS=0.000;HRun=1;HaplotypeScore=86.1948;MQ=179.18;MQ0=0;MQRankSum=0.472;QD=9.08;ReadPosRankSum=0.778 GT:AD:DP:GQ:PL ./. 
def fh(fname):
    """Open a test fixture stored in the same directory as this module.

    ``fname`` is joined onto the directory of ``__file__``. The open file
    object is returned as-is; nothing here closes it, so the caller owns it.
    """
    # ``open`` is the documented way to create file objects; the ``file``
    # builtin used previously exists only on Python 2.
    return open(os.path.join(os.path.dirname(__file__), fname))
class TestGatkOutput(unittest.TestCase):
    """Header and per-site checks for a VCF produced by GATK.

    Subclasses override the class attributes below to rerun the same tests
    against a different fixture.
    """

    # Fixture name, opened through the module-level ``fh`` helper.
    filename = 'gatk.vcf'

    # Values the reader is expected to report for the fixture.
    samples = [
        'BLANK', 'NA12878', 'NA12891', 'NA12892',
        'NA19238', 'NA19239', 'NA19240',
    ]
    formats = ['AD', 'DP', 'GQ', 'GT', 'PL']
    infos = [
        'AC', 'AF', 'AN', 'BaseQRankSum', 'DB', 'DP', 'DS',
        'Dels', 'FS', 'HRun', 'HaplotypeScore', 'InbreedingCoeff',
        'MQ', 'MQ0', 'MQRankSum', 'QD', 'ReadPosRankSum',
    ]

    # Number of records the reader must yield.
    n_calls = 37

    def setUp(self):
        handle = fh(self.filename)
        self.reader = cyvcf.Reader(handle)

    def testSamples(self):
        self.assertEqual(self.reader.samples, self.samples)

    def testFormats(self):
        # Declaration order is irrelevant, so compare as sets.
        found = set(self.reader.formats)
        self.assertEqual(found, set(self.formats))

    def testInfos(self):
        found = set(self.reader.infos)
        self.assertEqual(found, set(self.infos))

    def testCalls(self):
        seen = 0
        for site in self.reader:
            seen += 1
            self.assertEqual(len(site.samples), len(self.samples))

            # every sample must be reachable by name
            for name in self.samples:
                assert site.genotype(name)

            # positional access must follow the header order
            ordered = [call.sample for call in site.samples]
            self.assertEqual(ordered, self.samples)
            self.assertEqual(len(site.gt_phred_likelihoods),
                             len(self.samples))
        self.assertEqual(seen, self.n_calls)


class TestFreebayesOutput(TestGatkOutput):
    """Same checks as the GATK case, against freebayes output."""

    filename = 'freebayes.vcf'
    formats = ['AO', 'DP', 'GL', 'GLE', 'GQ', 'GT', 'QA', 'QR', 'RO']
    infos = [
        'AB', 'ABP', 'AC', 'AF', 'AN', 'AO', 'BVAR', 'CIGAR',
        'DB', 'DP', 'DPRA', 'EPP', 'EPPR', 'HWE', 'LEN', 'MEANALT',
        'NUMALT', 'RPP', 'MQMR', 'ODDS', 'MQM', 'PAIREDR', 'PAIRED',
        'SAP', 'XRM', 'RO', 'REPEAT', 'XRI', 'XAS', 'XAI', 'SRP',
        'XAM', 'XRS', 'RPPR', 'NS', 'RUN', 'CpG', 'TYPE',
    ]
    n_calls = 104

    def testParse(self):
        reader = cyvcf.Reader(fh('freebayes.vcf'))
        print(reader.samples)
        self.assertEqual(len(reader.samples), 7)

        # every call of every record must be truthy
        seen = 0
        for record in reader:
            seen += 1
            for call in record:
                assert call
        assert seen == self.n_calls


class TestSamtoolsOutput(unittest.TestCase):
    """Smoke test for a single-sample samtools VCF."""

    def testParse(self):
        reader = cyvcf.Reader(fh('samtools.vcf'))

        self.assertEqual(len(reader.samples), 1)
        n_records = sum(1 for _ in reader)
        self.assertEqual(n_records, 11)


class Test1kg(unittest.TestCase):
    """Walk the gzipped 1000 Genomes fixture end to end."""

    def testParse(self):
        reader = cyvcf.Reader(fh('1kg.vcf.gz'))

        self.assertEqual(len(reader.samples), 629)
        # iterating is the test: parsing any record must not raise
        for _ in reader:
            pass
class TestRecord(unittest.TestCase):
    """Site-level record properties compared with hand-computed values."""

    # POS of each record in example-4.0.vcf that has an expectation, in the
    # order the value lists in the tests below are written.
    _SMALL_POS = (14370, 17330, 1110696, 1230237, 1234567)
    # The same for the structural-variant fixture example-4.1-sv.vcf.
    _SV_POS = (2827693, 321682, 14477084, 9425916, 12665100, 18665128)

    def _check(self, fixture, positions, attr, values):
        # Read ``attr`` from every record of ``fixture``; when the record's
        # POS is one of ``positions`` compare against the paired value.
        expected = dict(zip(positions, values))
        for var in cyvcf.Reader(fh(fixture)):
            actual = getattr(var, attr)
            if var.POS in expected:
                self.assertEqual(expected[var.POS], actual)

    def _check_small(self, attr, values):
        self._check('example-4.0.vcf', self._SMALL_POS, attr, values)

    def _check_sv(self, attr, values):
        self._check('example-4.1-sv.vcf', self._SV_POS, attr, values)

    def test_num_calls(self):
        # the four genotype counters must account for every sample
        for var in cyvcf.Reader(fh('example-4.0.vcf')):
            counted = var.num_hom_ref + var.num_hom_alt
            counted = counted + var.num_het + var.num_unknown
            self.assertEqual(len(var.samples), counted)

    def test_call_rate(self):
        self._check_small(
            'call_rate',
            [3.0/3.0, 3.0/3.0, 3.0/3.0, 3.0/3.0, 2.0/3.0])

    def test_aaf(self):
        self._check_small(
            'aaf',
            [3.0/6.0, 1.0/6.0, None, 0.0/6.0, None])

    def test_pi(self):
        self._check_small(
            'nucl_diversity',
            [6.0/10.0, 1.0/3.0, None, 0.0/6.0, None])

    def test_is_snp(self):
        self._check_small('is_snp', [True, True, True, False, False])

    def test_is_indel(self):
        self._check_small('is_indel', [False, False, False, True, True])

    def test_is_transition(self):
        self._check_small(
            'is_transition', [True, False, False, False, False])

    def test_is_deletion(self):
        self._check_small(
            'is_deletion', [False, False, False, True, False])

    def test_var_type(self):
        self._check_small(
            'var_type', ["snp", "snp", "snp", "indel", "indel"])
        # SV tests
        self._check_sv('var_type', ["sv", "sv", "sv", "sv", "sv", "sv"])

    def test_var_subtype(self):
        self._check_small(
            'var_subtype', ["ts", "tv", "unknown", "del", "unknown"])
        # SV tests
        self._check_sv(
            'var_subtype',
            ["DEL", "DEL", "DEL:ME:ALU", "INS:ME:L1", "DUP", "DUP:TANDEM"])

    def test_is_sv(self):
        self._check_sv('is_sv', [True, True, True, True, True, True])
        self._check_small('is_sv', [False, False, False, False, False])

    def test_is_sv_precise(self):
        self._check_sv(
            'is_sv_precise', [True, False, False, False, False, False])
        self._check_small(
            'is_sv_precise', [False, False, False, False, False])

    def test_sv_end(self):
        self._check_sv(
            'sv_end',
            [2827680, 321887, 14477381, 9425916, 12686200, 18665204])
        self._check_small('sv_end', [None, None, None, None, None])
def test_gt_types(self):
    # expected genotype-type codes per sample, keyed by record POS
    expected = {
        14370: [0, 1, 3],
        17330: [0, 1, 0],
        1110696: [1, 1, 3],
        1230237: [0, 0, 0],
        1234567: [None, 1, 3],
    }
    for var in cyvcf.Reader(fh('example-4.0.vcf')):
        for call in var:
            print(call.data)
        gt_types = [call.gt_type for call in var.samples]
        if var.POS in expected:
            self.assertEqual(expected[var.POS], gt_types)

def test_gt_depths(self):
    # expected per-sample depth, keyed by record POS
    expected = {
        14370: [1, 8, 5],
        17330: [3, 5, 3],
        1110696: [6, 0, 4],
        1230237: [7, 4, 2],
        1234567: [4, 2, 3],
    }
    for var in cyvcf.Reader(fh('example-4.0.vcf')):
        for call in var:
            print(call.data)
        gt_depths = [call.gt_depth for call in var.samples]
        if var.POS in expected:
            self.assertEqual(expected[var.POS], gt_depths)

def test_gt_ref_depths(self):
    # expected per-sample reference depth, keyed by record POS; the
    # first sample of the last site is expected to report -1
    expected = {
        42522392: [6, 138, 169, 249, 248, 250, 250],
        42522613: [13, 118, 241, 161, 110, 106, 116],
        42527891: [-1, 238, 246, 239, 232, 233, 238],
    }
    for var in cyvcf.Reader(fh('gatk.vcf')):
        gt_ref_depths = [call.gt_ref_depth for call in var.samples]
        if var.POS in expected:
            self.assertEqual(expected[var.POS], gt_ref_depths)
class TestTabix(unittest.TestCase):
    """``Reader.fetch`` queries against the bgzipped fixture tb.vcf.gz.

    Both tests need pysam. When ``cyvcf.parser.pysam`` is None they are
    reported as skipped instead of silently passing.
    """

    def setUp(self):
        self.reader = cyvcf.Reader(fh('tb.vcf.gz'))
        # Deliberately not named ``run``: an instance attribute with that
        # name would shadow ``unittest.TestCase.run`` on this object.
        self.has_pysam = cyvcf.parser.pysam is not None

    def _require_pysam(self):
        # Skip the calling test when the optional dependency is missing.
        if not self.has_pysam:
            self.skipTest('pysam is not installed')

    def testFetchRange(self):
        self._require_pysam()

        # a range that starts and ends on one site
        lines = list(self.reader.fetch('20', 14370, 14370))
        self.assertEqual(len(lines), 1)
        self.assertEqual(lines[0].POS, 14370)

        # a range spanning two adjacent sites
        lines = list(self.reader.fetch('20', 14370, 17330))
        self.assertEqual(len(lines), 2)
        self.assertEqual(lines[0].POS, 14370)
        self.assertEqual(lines[1].POS, 17330)

        lines = list(self.reader.fetch('20', 1110695, 1234567))
        self.assertEqual(len(lines), 3)

    def testFetchSite(self):
        self._require_pysam()

        # without an end coordinate fetch returns one record ...
        site = self.reader.fetch('20', 14370)
        self.assertEqual(site.POS, 14370)

        # ... or None when nothing sits at that position
        site = self.reader.fetch('20', 14369)
        self.assertIsNone(site)
class TestFilter(unittest.TestCase):
    """Run scripts/vcf_filter.py through the shell and parse what it prints."""

    def _run(self, command):
        # Execute ``command``, require exit status 0 and return the captured
        # output as an in-memory file rewound to its start.
        status, output = commands.getstatusoutput(command)
        assert status == 0
        captured = StringIO()
        captured.write(output)
        captured.seek(0)
        return captured

    def testApplyFilter(self):
        buf = self._run('python scripts/vcf_filter.py --site-quality 30 test/example-4.0.vcf sq')

        print(buf.getvalue())
        reader = cyvcf.Reader(buf)

        # check filter got into output file
        assert 'sq30' in reader.filters

        print(reader.filters)

        # check sites were filtered
        flagged = 0
        for record in reader:
            if record.QUAL < 30:
                assert 'sq30' in record.FILTER
                flagged += 1
            else:
                assert record.FILTER is None or 'sq30' not in record.FILTER
        assert flagged == 2

    def testApplyMultipleFilters(self):
        buf = self._run('python scripts/vcf_filter.py --site-quality 30 '
                        '--genotype-quality 50 test/example-4.0.vcf sq mgq')
        reader = cyvcf.Reader(buf)

        print(reader.filters)

        # both filters must be declared in the output
        assert 'mgq50' in reader.filters
        assert 'sq30' in reader.filters
class TestUtils(unittest.TestCase):
    """Checks for ``utils.walk_together``."""

    def test_walk(self):
        # easy case: all same sites
        readers = [cyvcf.Reader(fh('example-4.0.vcf')) for _ in range(3)]

        n = 0
        for recs in utils.walk_together(*readers):
            assert len(recs) == 3
            assert recs[0] == recs[1] and recs[1] == recs[2]
            n += 1
        assert n == 5

        # artificial case 2 from the left, 2 from the right, 2 together,
        # 1 from the right, 1 from the left
        expected = 'llrrttrl'
        left = cyvcf.Reader(fh('walk_left.vcf'))
        right = cyvcf.Reader(fh('example-4.0.vcf'))

        # NOTE(review): zip stops at the shorter input, so a walk that yields
        # fewer than len(expected) steps is not detected here.
        for side, recs in zip(expected, utils.walk_together(left, right)):
            if side == 'l':
                assert recs[0] is not None
                assert recs[1] is None
            elif side == 'r':
                assert recs[1] is not None
                assert recs[0] is None
            elif side == 't':
                assert recs[0] is not None
                assert recs[1] is not None


class TestAD(unittest.TestCase):
    """Reference depth of the first sample in test.vcf."""

    def setUp(self):
        self.reader = cyvcf.Reader(fh('test.vcf'))

    def testRefDepth(self):
        # The ``next`` builtin matches what TestGLInt does and does not rely
        # on the Python 2-only ``.next()`` method name.
        v = next(self.reader)
        self.assertEqual(v.samples[0].gt_ref_depth, -1)


class TestGLInt(unittest.TestCase):
    """Phred likelihoods of the first sample in test-gl.vcf."""

    def setUp(self):
        self.reader = cyvcf.Reader(fh('test-gl.vcf'))

    def testGLInt(self):
        v = next(self.reader)
        self.assertEqual(v.samples[0].gt_phred_likelihoods, None)
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFreebayesOutput)) 744 | suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestSamtoolsOutput)) 745 | suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestWriter)) 746 | suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestTabix)) 747 | suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestOpenMethods)) 748 | #suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestFilter)) 749 | suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Test1kg)) 750 | suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRecord)) 751 | suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCall)) 752 | suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestRegression)) 753 | suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestGLInt)) 754 | -------------------------------------------------------------------------------- /cyvcf/parser.pyx: -------------------------------------------------------------------------------- 1 | import collections 2 | import re 3 | import csv 4 | import gzip 5 | import sys 6 | import itertools 7 | 8 | from . 
import utils 9 | 10 | try: 11 | import pysam 12 | except ImportError: 13 | pysam = None 14 | 15 | # Metadata parsers/constants 16 | RESERVED_INFO = { 17 | 'AA': 'String', 'AC': 'Integer', 'AF': 'Float', 'AN': 'Integer', 18 | 'BQ': 'Float', 'CIGAR': 'String', 'DB': 'Flag', 'DP': 'Integer', 19 | 'END': 'Integer', 'H2': 'Flag', 'MQ': 'Float', 'MQ0': 'Integer', 20 | 'NS': 'Integer', 'SB': 'String', 'SOMATIC': 'Flag', 'VALIDATED': 'Flag' 21 | } 22 | 23 | RESERVED_FORMAT = { 24 | 'GT': 'String', 'DP': 'Integer', 'FT': 'String', 'GL': 'Float', 25 | 'GQ': 'Float', 'HQ': 'Float' 26 | } 27 | 28 | Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc']) 29 | Filter = collections.namedtuple('Filter', ['id', 'desc']) 30 | Format = collections.namedtuple('Format', ['id', 'num', 'type', 'desc']) 31 | 32 | HOM_REF = 0 33 | HET = 1 34 | HOM_ALT = 3 35 | UNKNOWN = 2 36 | 37 | cdef _Call _parse_sample(char *sample, list samp_fmt, 38 | list samp_fmt_types, list samp_fmt_nums, 39 | char *name, Record rec): 40 | 41 | cdef dict sampdict = {x: None for x in samp_fmt} 42 | cdef list lvals 43 | 44 | cdef list svals = sample.split(":") 45 | 46 | cdef int i 47 | cdef int N = len(svals) 48 | for i in range(N): 49 | fmt = samp_fmt[i] 50 | entry_type = samp_fmt_types[i] 51 | entry_num = samp_fmt_nums[i] 52 | vals = svals[i] 53 | 54 | # short circuit the most common 55 | if vals == ".": 56 | continue 57 | if vals == "./.": 58 | continue 59 | if vals == "": 60 | continue 61 | 62 | 63 | # we don't need to split single entries 64 | if entry_num == 1 or (entry_num is None and ',' not in vals): 65 | if entry_type == 'Integer': 66 | if vals.isdigit(): 67 | sampdict[fmt] = int(vals) 68 | continue 69 | try: 70 | sampdict[fmt] = float(vals) 71 | except ValueError: 72 | sampdict[fmt] = vals 73 | continue 74 | elif entry_type == 'Float': 75 | sampdict[fmt] = float(vals) 76 | else: 77 | sampdict[fmt] = vals 78 | 79 | continue 80 | 81 | if entry_num == 1 and entry_type == 'String': 82 | 
class _vcf_metadata_parser(object):
    '''Parse the metadata lines in the header of a VCF file.

    Every ``read_*`` method takes one raw ``##`` header line and returns a
    ``(key, value)`` tuple.  INFO, FILTER and FORMAT lines that do not match
    the expected layout raise ``SyntaxError``; so do generic ``##key=value``
    lines passed to ``read_meta``.
    '''
    def __init__(self):
        super(_vcf_metadata_parser, self).__init__()
        # The group names below are the ones the read_* methods look up.
        self.info_pattern = re.compile(r'''\#\#INFO=<
            ID=(?P<id>[^,]+),
            Number=(?P<number>-?\d+|\.|[ARG]),
            Type=(?P<type>Integer|Float|Flag|Character|String),
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)
        self.filter_pattern = re.compile(r'''\#\#FILTER=<
            ID=(?P<id>[^,]+),
            Description="(?P<desc>[^"]*)"
            >''', re.VERBOSE)
        self.format_pattern = re.compile(r'''\#\#FORMAT=<
            ID=(?P<id>.+),
            Number=(?P<number>-?\d+|\.|[ARG]),
            Type=(?P<type>.+),
            Description="(?P<desc>.*)"
            >''', re.VERBOSE)
        # Greedy on purpose: the key runs up to the LAST '=' on the line.
        self.meta_pattern = re.compile(r'''##(?P<key>.+)=(?P<val>.+)''')

    @staticmethod
    def _parse_number(number):
        '''Return ``number`` as an int, or None when it is not a fixed count.

        Non-numeric values ('.', 'A', 'R', 'G') and negative numbers all
        map to None.
        '''
        try:
            num = int(number)
        except ValueError:
            return None
        return num if num >= 0 else None

    def read_info(self, info_string):
        '''Read a meta-information INFO line; return ``(id, Info)``.'''
        match = self.info_pattern.match(info_string)
        if not match:
            raise SyntaxError(
                "One of the INFO lines is malformed: %s" % info_string)

        info = Info(match.group('id'),
                    self._parse_number(match.group('number')),
                    match.group('type'), match.group('desc'))

        return (match.group('id'), info)

    def read_filter(self, filter_string):
        '''Read a meta-information FILTER line; return ``(id, Filter)``.'''
        match = self.filter_pattern.match(filter_string)
        if not match:
            raise SyntaxError(
                "One of the FILTER lines is malformed: %s" % filter_string)

        filt = Filter(match.group('id'), match.group('desc'))

        return (match.group('id'), filt)

    def read_format(self, format_string):
        '''Read a meta-information FORMAT line; return ``(id, Format)``.'''
        match = self.format_pattern.match(format_string)
        if not match:
            raise SyntaxError(
                "One of the FORMAT lines is malformed: %s" % format_string)

        form = Format(match.group('id'),
                      self._parse_number(match.group('number')),
                      match.group('type'), match.group('desc'))

        return (match.group('id'), form)

    def read_meta(self, meta_string):
        '''Read a generic ``##key=value`` line; return ``(key, value)``.'''
        match = self.meta_pattern.match(meta_string)
        if not match:
            # Fail the same way the other readers do instead of raising
            # AttributeError on a None match.
            raise SyntaxError(
                "One of the metadata lines is malformed: %s" % meta_string)
        return match.group('key'), match.group('val')
property gt_bases:
    def __get__(self):
        '''The actual genotype alleles.

        E.g. if the VCF genotype is 0/1, return A/G.  Missing alleles
        ('.') are passed through unchanged.  Returns None when the
        genotype is uncalled or refers to an allele number that the
        site does not have.
        '''
        # nothing to do if no genotype call
        if not self.called:
            return None

        # keep the separator that was used in the GT string
        phase_char = '|' if self.phased else '/'
        try:
            # look up the DNA allele behind every numeric allele
            return phase_char.join([self.site.alleles[int(a)]
                                    if a != '.' else '.'
                                    for a in self.alleles])
        except (IndexError, KeyError):
            # site.alleles is a list, so an out-of-range allele number
            # raises IndexError; KeyError is kept for mapping-like sites.
            sys.stderr.write("Allele number not found in list of alleles\n")
            return None
property gt_ref_depth:
    def __get__(self):
        '''Depth of aligned sequences supporting the reference allele.

        Reads ``AD`` when that key is present (first of exactly two
        values), otherwise ``RO``.  Returns -1 whenever no usable
        value is found.
        '''
        data = self.data

        if 'AD' in data:
            depths = data['AD']
            # require bi-allelic: anything but a two-item list/tuple
            # (including None) yields no usable depth
            if not isinstance(depths, (list, tuple)) or len(depths) != 2:
                return -1
            # ref allele is first
            ref_depth = depths[0]
            return -1 if ref_depth is None else ref_depth

        if 'RO' in data:
            ref_obs = data['RO']
            return -1 if ref_obs is None else ref_obs

        return -1
@property
def gt_qual(self):
    '''Genotype quality (the ``GQ`` value) for this sample.

    Returns -1 when ``GQ`` is absent from the sample data or is None.
    '''
    try:
        gq = self.data['GQ']
    except KeyError:
        return -1
    return -1 if gq is None else gq
387 | ''' 388 | # extract the numeric alleles of the gt string 389 | if not 'CN' in self.data: 390 | return -1 391 | qual = self.data['CN'] 392 | if qual is not None: 393 | return qual 394 | else: 395 | return -1 396 | 397 | @property 398 | def is_variant(self): 399 | """ Return True if not a reference call """ 400 | if not self.called: 401 | return None 402 | return self.gt_type != HOM_REF 403 | 404 | @property 405 | def is_het(self): 406 | """ Return True for heterozygous calls """ 407 | if not self.called: 408 | return None 409 | return self.gt_type == HET 410 | 411 | 412 | cdef class Record(object): 413 | """ A set of calls at a site. Equivalent to a line in a VCF file. 414 | 415 | The standard VCF fields: 416 | CHROM, POS, ID, 417 | REF, ALT, QUAL, 418 | FILTER, INFO, & FORMAT are available as properties. 419 | 420 | The list of genotype calls is in the ``samples`` property. 421 | """ 422 | 423 | # initialize Cython variables for all of the base attrs. 424 | cdef public list alleles, samples, ALT, gt_bases, gt_types, gt_phases, \ 425 | gt_depths, gt_ref_depths, gt_alt_depths, gt_quals, gt_copy_numbers,\ 426 | gt_phred_likelihoods 427 | # use bytes instead of char * because of C -> Python string complications 428 | # see: http://docs.cython.org/src/tutorial/strings.html 429 | cdef readonly bytes CHROM, ID, FORMAT 430 | cdef public REF 431 | cdef readonly object FILTER, QUAL 432 | cdef public int POS, start, end, num_hom_ref, num_het, num_hom_alt, \ 433 | num_unknown, num_called 434 | cdef public dict INFO 435 | cdef public dict _sample_indexes 436 | cdef public bint has_genotypes 437 | 438 | def __cinit__(self, char *CHROM, int POS, char *ID, 439 | char *REF, list ALT, object QUAL=None, 440 | object FILTER=None, dict INFO=None, object FORMAT=None, 441 | dict sample_indexes=None, list samples=None, 442 | list gt_bases=None, list gt_types=None, 443 | list gt_phases=None, list gt_depths=None, 444 | list gt_ref_depths=None, list gt_alt_depths=None, 445 | list 
def __richcmp__(self, other, int op):
    """ Records are equal if they describe the same variant
        (same CHROM, POS, REF and ALT).

        Only ``==`` and ``!=`` are answered here.  For every other
        operator, and for operands that are not Records, NotImplemented
        is returned so Python applies its normal fallback instead of
        receiving None as the comparison result.
    """
    # NOTE(review): on Python 2 the fallback for ordering is expected to be
    # the __cmp__ defined on this class -- confirm with the Cython version
    # in use.

    # op codes: < 0 | <= 1 | == 2 | != 3 | > 4 | >= 5
    if op != 2 and op != 3:
        return NotImplemented
    if not isinstance(other, Record):
        return NotImplemented

    same = (self.CHROM == other.CHROM and
            self.POS == other.POS and
            self.REF == other.REF and
            self.ALT == other.ALT)
    return same if op == 2 else not same
def __repr__(self):
    """ Render the record as a tab-separated VCF data line.

        Missing QUAL and FILTER are written as '.'.  FILTER may be a
        single string or a list of filter names; a list is joined with
        ';' so that records with several filters can be printed.
        FORMAT and the per-sample columns are appended only when the
        record has genotypes.
    """
    filt = self.FILTER
    if isinstance(filt, (list, tuple)):
        # several filters are stored as a list; "\t".join() needs a string
        filt = ';'.join(filt)

    fields = [self.CHROM, str(self.POS), str(self.ID), str(self.REF),
              self._format_alt(), self._format_qual() or '.',
              filt or '.', self._format_info()]

    if not self.has_genotypes:
        return "\t".join(fields)

    fields.append(self.FORMAT)
    core = "\t".join(fields)
    samples = "\t".join([self._format_sample(sample)
                         for sample in self.samples])
    return core + "\t" + samples
@property
def call_rate(self):
    """ The fraction of genotypes that were actually called.

        Returns 0.0 when the record has no samples, instead of failing
        on a division by zero or on ``len(None)``.
    """
    if not self.samples:
        return 0.0
    return float(self.num_called) / float(len(self.samples))
@property
def is_snp(self):
    """ True when REF is at most one character long and every ALT
        allele is exactly one of A, C, G or T.
    """
    single_bases = ['A', 'C', 'G', 'T']
    return (len(self.REF) <= 1 and
            all(allele in single_bases for allele in self.ALT))
@property
def is_transition(self):
    """ True when the record is a SNP with a single ALT allele and the
        REF/ALT pair is A<->G or C<->T.
    """
    # with several alts it is unclear whether we have a transition
    if len(self.ALT) > 1 or not self.is_snp:
        return False

    # just one alt allele
    change = (self.REF, self.ALT[0])
    return change in ((b'A', b'G'), (b'G', b'A'),
                      (b'C', b'T'), (b'T', b'C'))
@property
def is_sv_precise(self):
    """ Return whether the SV coordinates are mapped to 1 b.p.
        resolution: True only for a structural variant whose INFO has
        no IMPRECISE entry, False in every other case.
    """
    # A single expression covers all four combinations; a non-SV record
    # that carries IMPRECISE now yields False rather than None.
    return self.is_sv and self.INFO.get('IMPRECISE') is None
Gzipped streams 795 | or files are attempted to be recogized by the file extension, or gzipped 796 | can be forced with ``compressed=True`` 797 | """ 798 | super(VCFReader, self).__init__() 799 | 800 | if not (fsock or filename): 801 | raise Exception('You must provide at least fsock or filename') 802 | 803 | if filename: 804 | self.filename = filename 805 | if fsock is None: 806 | self.reader = file(filename) 807 | 808 | if fsock: 809 | self.reader = fsock 810 | if filename is None: 811 | if hasattr(fsock, 'name'): 812 | filename = fsock.name 813 | self.filename = filename 814 | 815 | if compressed or (filename and filename.endswith('.gz')): 816 | self.reader = gzip.GzipFile(fileobj=self.reader) 817 | 818 | #: metadata fields from header 819 | self.metadata = {} 820 | #: INFO fields from header 821 | self.infos = {} 822 | #: FILTER fields from header 823 | self.filters = {} 824 | #: FORMAT fields from header 825 | self.formats = {} 826 | self.samples = [] 827 | self._sample_indexes = {} 828 | self._header_lines = [] 829 | self._col_defn_line = None 830 | self._tabix = None 831 | self._prepend_chr = prepend_chr 832 | self._parse_metainfo() 833 | 834 | def __iter__(self): 835 | return self 836 | 837 | def seek(self, offset): 838 | self.reader.seek(offset) 839 | 840 | def tell(self): 841 | return self.reader.tell() 842 | 843 | property raw_header: 844 | """Dump the raw, unparsed header lines""" 845 | def __get__(self): 846 | return ''.join(self._header_lines) 847 | 848 | def _parse_metainfo(self): 849 | '''Parse the information stored in the metainfo of the VCF. 850 | 851 | The end user shouldn't have to use this. She can access the metainfo 852 | directly with ``self.metadata``. 
def _parse_info(self, info_str):
    '''Parse the INFO field of a VCF entry into a dictionary of Python
    types.

    The type of each key comes from the header (``self.infos``), then
    from ``RESERVED_INFO``; unknown keys are a Flag when they have no
    value and a String otherwise.  Integer and Float values become
    lists ('.' items become None) unless the header declares exactly
    one value, in which case the single item is stored.  A key written
    without a value is always stored as True.
    '''
    if info_str == '.':
        return {}

    cdef list entries = info_str.split(';')
    cdef dict retdict = {}

    cdef int i = 0
    cdef int n = len(entries)
    cdef char *entry_type
    cdef list entry
    for i in xrange(n):
        # a trailing or doubled ';' leaves an empty entry with no key
        if not entries[i]:
            continue

        # split on the first '=' only, so values containing '=' survive
        entry = entries[i].split('=', 1)
        ID = entry[0]
        if ID in self.infos:
            entry_type = self.infos[ID].type
        elif ID in RESERVED_INFO:
            entry_type = RESERVED_INFO[ID]
        elif len(entry) == 1:
            entry_type = 'Flag'
        else:
            entry_type = 'String'

        if len(entry) == 1:
            # no value present: treat as a flag whatever the declared
            # type is, rather than indexing past the end of ``entry``
            val = True
        elif entry_type == b'Integer':
            vals = entry[1].split(',')
            try:
                val = _map(int, vals)
            except ValueError:
                # tolerate floats in a field declared as Integer
                val = _map(float, vals)
        elif entry_type == b'Float':
            vals = entry[1].split(',')
            val = _map(float, vals)
        elif entry_type == b'Flag':
            val = True
        elif entry_type == b'String' or entry_type == b'Character':
            val = entry[1]
        else:
            # unknown declared type: keep the raw text instead of reusing
            # whatever ``val`` held from the previous entry
            sys.stderr.write(
                "Unknown INFO type for %s; keeping the raw value\n" % ID)
            val = entry[1]

        try:
            # unwrap single-valued numeric fields declared with Number=1
            if isinstance(val, list) and self.infos[ID].num == 1 and entry_type != b'String':
                val = val[0]
        except KeyError:
            # key not declared in the header: leave the list as it is
            pass

        retdict[ID] = val

    return retdict
self.formats[fmt].num 978 | except KeyError: 979 | entry_num = None 980 | try: 981 | entry_type = RESERVED_FORMAT[fmt] 982 | except KeyError: 983 | entry_type = 'String' 984 | samp_fmt_types[i] = entry_type 985 | samp_fmt_nums[i] = entry_num 986 | 987 | cdef int num_hom_ref = 0 988 | cdef int num_het = 0 989 | cdef int num_hom_alt = 0 990 | cdef int num_unknown = 0 991 | cdef int num_called = 0 992 | rec.samples = [None] * self.num_samples# list of _Call objects for each sample 993 | rec.gt_bases = [None] * self.num_samples# A/A, A|G, G/G, etc. 994 | rec.gt_types = [None] * self.num_samples# 0, 1, 2, etc. 995 | rec.gt_phases = [None] * self.num_samples# T, F, T, etc. 996 | rec.gt_depths = [None] * self.num_samples# 10, 37, 0, etc. 997 | rec.gt_ref_depths = [None] * self.num_samples# 3, 32, 0, etc. 998 | rec.gt_alt_depths = [None] * self.num_samples# 7, 5, 0, etc. 999 | rec.gt_quals = [None] * self.num_samples# 10, 30, 20, etc. 1000 | rec.gt_copy_numbers = [None] * self.num_samples# 2, 1, 4, etc. 1001 | rec.gt_phred_likelihoods = [None] * self.num_samples 1002 | 1003 | for i in xrange(self.num_samples): 1004 | 1005 | call = _parse_sample(samples[i], samp_fmt, \ 1006 | samp_fmt_types, samp_fmt_nums, 1007 | self.samples[i], rec) 1008 | 1009 | rec.samples[i] = call 1010 | 1011 | alleles = call.gt_bases 1012 | type = call.gt_type 1013 | 1014 | # add to the "all-samples" lists of GT info 1015 | if alleles is not None: 1016 | rec.gt_bases[i] = alleles 1017 | rec.gt_types[i] = type if type is not None else 2 1018 | else: 1019 | rec.gt_bases[i] = './.' 
1020 | rec.gt_types[i] = 2 1021 | rec.gt_phases[i] = call.phased 1022 | rec.gt_depths[i] = call.gt_depth 1023 | rec.gt_ref_depths[i] = call.gt_ref_depth 1024 | rec.gt_alt_depths[i] = call.gt_alt_depth 1025 | rec.gt_quals[i] = call.gt_qual 1026 | rec.gt_copy_numbers[i] = call.gt_copy_number 1027 | rec.gt_phred_likelihoods[i] = call.gt_phred_likelihoods 1028 | 1029 | # 0 / 00000000 hom ref 1030 | # 1 / 00000001 het 1031 | # 2 / 00000010 missing 1032 | # 3 / 00000011 hom alt 1033 | 1034 | # tally the appropriate GT count 1035 | if type == HOM_REF: num_hom_ref += 1 1036 | elif type == HET: num_het += 1 1037 | elif type == HOM_ALT: num_hom_alt += 1 1038 | elif type == None: num_unknown += 1 1039 | 1040 | rec.num_called = num_hom_ref + num_het + num_hom_alt 1041 | rec.num_hom_alt = num_hom_alt 1042 | rec.num_het = num_het 1043 | rec.num_hom_ref = num_hom_ref 1044 | rec.num_unknown = num_unknown 1045 | 1046 | def __next__(self): 1047 | '''Return the next record in the file.''' 1048 | line = self.reader.next().rstrip() 1049 | return self.parse(line) 1050 | 1051 | def parse(self, line): 1052 | '''Return the next record in the file.''' 1053 | cdef list row = line.split('\t') 1054 | 1055 | #CHROM 1056 | cdef bytes chrom = row[0] 1057 | if self._prepend_chr: 1058 | chrom = 'chr' + str(chrom) 1059 | # POS 1060 | cdef int pos = int(row[1]) 1061 | # ID 1062 | cdef bytes id = row[2] 1063 | #REF 1064 | cdef bytes ref = row[3] 1065 | #ALT 1066 | cdef list alt = self._map(str, row[4].split(',')) 1067 | #QUAL 1068 | cdef object qual 1069 | if row[5] == b'.': 1070 | qual = None 1071 | else: 1072 | qual = float(row[5]) 1073 | #FILT 1074 | cdef object filt = row[6].split(';') if ';' in row[6] else row[6] 1075 | if filt == b'PASS' or filt == b'.': 1076 | filt = None 1077 | #INFO 1078 | cdef dict info = self._parse_info(row[7]) 1079 | #FORMAT 1080 | cdef bytes fmt 1081 | try: 1082 | fmt = row[8] 1083 | except IndexError: 1084 | fmt = None 1085 | 1086 | rec = Record(chrom, pos, id, ref, 
class Writer(object):
    """VCF writer.

    Constructing a ``Writer`` immediately writes the ``##`` meta-information
    lines and the ``#CHROM`` column header described by ``template`` to
    ``stream``; each :meth:`write_record` call then appends one
    tab-separated data line.
    """

    # Mandatory VCF columns; the template's sample names are appended to them.
    fixed_fields = "#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT".split()

    def __init__(self, stream, template, lineterminator="\n"):
        """Write the header taken from ``template`` to ``stream``.

        :param stream: writable text file-like object.
        :param template: object exposing ``metadata``, ``infos``, ``formats``
            and ``filters`` mappings plus a ``samples`` list.
        :param lineterminator: line ending used for the column header and
            the data lines.  Defaults to the same ``"\\n"`` that terminates
            the ``##`` lines, so the file does not mix line endings.
        """
        self.stream = stream
        self.writer = csv.writer(stream, delimiter="\t",
                                 lineterminator=lineterminator)
        self.template = template

        for line in template.metadata.items():
            stream.write('##%s=%s\n' % line)
        # NOTE(review): INFO/FORMAT entries are assumed to be 4-item
        # sequences ordered (ID, Number, Type, Description) and FILTER
        # entries 2-item (ID, Description) -- confirm against the tuples
        # the Reader builds.
        for line in template.infos.values():
            stream.write('##INFO=<ID=%s,Number=%s,Type=%s,Description="%s">\n'
                         % tuple(self._map(str, line)))
        for line in template.formats.values():
            stream.write('##FORMAT=<ID=%s,Number=%s,Type=%s,Description="%s">\n'
                         % tuple(self._map(str, line)))
        for line in template.filters.values():
            stream.write('##FILTER=<ID=%s,Description="%s">\n'
                         % tuple(self._map(str, line)))

        self._write_header()

    def _write_header(self):
        """Write the ``#CHROM ...`` column header followed by sample names."""
        self.writer.writerow(self.fixed_fields + self.template.samples)

    def write_record(self, record):
        """Write ``record`` to the stream as one tab-separated line.

        ``None`` in CHROM/POS/ID/REF, QUAL, FILTER or INFO is written as the
        VCF missing marker ``.``.
        """
        # Compare against None explicitly: a QUAL of 0 is a real value and
        # must not collapse to the missing marker.
        qual = record.QUAL if record.QUAL is not None else '.'
        ffs = self._map(str, [record.CHROM, record.POS, record.ID, record.REF]) \
            + [self._format_alt(record.ALT), qual,
               self._format_filter(record.FILTER),
               self._format_info(record.INFO), record.FORMAT]

        samples = [self._format_sample(record.FORMAT, sample)
                   for sample in record.samples]
        self.writer.writerow(ffs + samples)

    def _format_alt(self, alt):
        """Join ALT alleles with ``,``; empty/None alleles become ``.``."""
        return ','.join([x or '.' for x in alt])

    def _format_filter(self, filt):
        """Render FILTER: ``.`` when empty/None, ``;``-joined when a sequence.

        A single filter name (a plain string) is passed through unchanged.
        """
        if not filt:
            return '.'
        if isinstance(filt, (list, tuple)):
            return ';'.join(self._map(str, filt))
        return filt

    def _format_info(self, info):
        """Render the INFO mapping as ``key=value`` pairs joined by ``;``."""
        if not info:
            return '.'
        return ';'.join("%s=%s" % (x, self._stringify(y)) for x, y in info.items())

    def _format_sample(self, fmt, sample):
        """Render one sample column in the field order given by ``fmt``.

        A sample whose ``GT`` entry is ``None`` is written as ``./.``.
        """
        if sample.data["GT"] is None:
            return "./."
        return ':'.join(self._stringify(sample.data[f]) for f in fmt.split(':'))

    def _stringify(self, x, none='.'):
        """Return ``x`` as text; lists are ``,``-joined, ``None`` -> ``none``."""
        if isinstance(x, list):
            return ','.join(self._map(str, x, none))
        return str(x) if x is not None else none

    def _map(self, func, iterable, none='.'):
        '''``map``, but make None values none.'''
        return [func(x) if x is not None else none
                for x in iterable]