├── test2.vcf ├── VERSION ├── ragel ├── .gitignore ├── generate.sh └── gen_vcfheaderline_parser.rl ├── features ├── step_definitions │ ├── bio-vcf_steps.rb │ ├── cli-feature.rb │ ├── vcf_header.rb │ ├── diff_count.rb │ ├── somaticsniper.rb │ ├── sfilter.rb │ └── multisample.rb ├── support │ └── env.rb ├── filter.feature ├── diff_count.feature ├── vcf_header.feature ├── sfilter.feature ├── somaticsniper.feature ├── cli.feature └── multisample.feature ├── .travis.yml ├── lib ├── bio-vcf │ ├── vcf.rb │ ├── vcfline.rb │ ├── utils.rb │ ├── vcfstatistics.rb │ ├── variant.rb │ ├── bedfilter.rb │ ├── vcffile.rb │ ├── template.rb │ ├── vcfrdf.rb │ ├── vcfsample.rb │ ├── vcfheader.rb │ ├── vcfgenotypefield.rb │ ├── vcfrecord.rb │ └── pcows.rb ├── regressiontest.rb ├── bio-vcf.rb └── regressiontest │ └── cli_exec.rb ├── doc ├── json.png ├── pcows.org ├── Compare_VCFs.md ├── Using_RDF.md ├── GATK_comparison.md └── Using_Mongo.md ├── test ├── data │ ├── input │ │ ├── empty.vcf │ │ └── somaticsniper.vcf │ └── regression │ │ ├── empty.ref │ │ ├── thread4_4_failed_filter-stderr.ref │ │ ├── ifilter_s.dp.ref │ │ ├── sfilter_seval_s.dp.ref │ │ ├── seval_s.dp.ref │ │ ├── eval_once.ref │ │ ├── eval_r.info.dp.ref │ │ └── pass1.ref ├── stress │ └── stress_test.sh └── performance │ └── metrics.md ├── Gemfile ├── .gitignore ├── template ├── vcf2rdf.erb ├── vcf2json.erb ├── vcf2json_full_header.erb ├── vcf2rdf_header.erb ├── vcf2json_expanded.erb ├── gatk_vcf2rdf.erb └── vcf2json_use_meta.erb ├── Rakefile ├── LICENSE ├── RELEASE_NOTES.md ├── bio-vcf.gemspec ├── guix.scm └── bin └── bio-vcf /test2.vcf: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.9.6 2 | -------------------------------------------------------------------------------- /ragel/.gitignore: 
-------------------------------------------------------------------------------- 1 | *.rb 2 | -------------------------------------------------------------------------------- /features/step_definitions/bio-vcf_steps.rb: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | 3 | arch: arm64 4 | -------------------------------------------------------------------------------- /lib/bio-vcf/vcf.rb: -------------------------------------------------------------------------------- 1 | 2 | module BioVcf 3 | end 4 | -------------------------------------------------------------------------------- /doc/json.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vcflib/bio-vcf/HEAD/doc/json.png -------------------------------------------------------------------------------- /test/data/input/empty.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | #CHROM POS ID REF ALT QUAL FILTER INFO 3 | -------------------------------------------------------------------------------- /test/data/regression/empty.ref: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | #CHROM POS ID REF ALT QUAL FILTER INFO 3 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "http://rubygems.org" 2 | 3 | group :development do 4 | gem "rake" 5 | gem "rspec" 6 | gem "cucumber" 7 | end 8 | 9 | 10 | -------------------------------------------------------------------------------- /ragel/generate.sh: 
-------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | ragel -R gen_vcfheaderline_parser.rl 4 | [ $? -ne 0 ] && exit 1 5 | 6 | ruby gen_vcfheaderline_parser.rb 7 | 8 | cp gen_vcfheaderline_parser.rb ../lib/bio-vcf/vcfheader_line.rb 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | file.vcf 2 | Gemfile.lock 3 | notes.txt 4 | .bundle 5 | vendor/ 6 | orig/ 7 | pkg/ 8 | scripts/timings.sh 9 | TAGS 10 | test/data/regression/*.new 11 | data/ 12 | fedor.txt 13 | test.vcf 14 | out.bed 15 | rdoc/ 16 | *.gem 17 | -------------------------------------------------------------------------------- /test/data/regression/thread4_4_failed_filter-stderr.ref: -------------------------------------------------------------------------------- 1 | Unknown field name in record, did you mean r.info.t? 2 | Unknown field name in record, did you mean r.info.t? 3 | Unknown field name in record, did you mean r.info.t? 4 | Unknown field name in record, did you mean r.info.t? 5 | ERROR: execution expired 6 | -------------------------------------------------------------------------------- /template/vcf2rdf.erb: -------------------------------------------------------------------------------- 1 | <% 2 | id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_')) 3 | %> 4 | :<%= id %> 5 | :query_id "<%= id %>"; 6 | seq:chr "<%= rec.chrom %>" ; 7 | seq:pos <%= rec.pos %> ; 8 | seq:ref "<%= rec.ref %>" ; 9 | seq:alt "<%= rec.alt[0] %>" ; 10 | seq:dp <%= rec.info.dp %> ; 11 | db:vcf true . 
12 | 13 | -------------------------------------------------------------------------------- /features/support/env.rb: -------------------------------------------------------------------------------- 1 | 2 | $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib') 3 | require 'bio-vcf' 4 | 5 | require 'rspec/expectations' 6 | 7 | # Add the regression module if in the path (it can also be a gem) 8 | rootdir = File.dirname(__FILE__) + '/../..' 9 | $LOAD_PATH.unshift(rootdir+'/lib/regressiontest',rootdir+'/../regressiontest/lib') 10 | require 'regressiontest' 11 | 12 | include BioVcf 13 | -------------------------------------------------------------------------------- /lib/regressiontest.rb: -------------------------------------------------------------------------------- 1 | # Please require your code below, respecting the naming conventions in the 2 | # bioruby directory tree. 3 | # 4 | # For example, say you have a plugin named bio-plugin, the only uncommented 5 | # line in this file would be 6 | # 7 | # require 'bio/bio-plugin/plugin' 8 | # 9 | # In this file only require other files. Avoid other source code. 
module BioVcf
  # Helpers for tokenising a raw VCF record line.
  module VcfLine

    # Split a tab-delimited line into fields and check the field count.
    #
    # Surrounding whitespace is stripped before splitting.
    #
    # @param line [String] one raw line of a VCF body
    # @param expected_size [Integer, nil] exact number of fields required;
    #   when nil only the minimum of 6 fields is enforced
    # @return [Array<String>] the fields of the line
    # @raise [RuntimeError] when the line is blank, has fewer than 6 fields,
    #   or does not have exactly +expected_size+ fields
    def self.parse(line, expected_size = nil)
      # Strip once and reuse for both the split and the emptiness check.
      stripped = line.strip
      fields = stripped.split(/\t/)
      raise "Unexpected line #{line}" if stripped.empty? || fields.size < 6
      if expected_size && fields.size != expected_size
        raise "Expected #{expected_size} fields but got #{fields.size} in #{fields}"
      end
      fields
    end
  end
end
module BioVcf

  # Converts raw VCF field text into Ruby values (Integer, Float, String or
  # an Array of those for comma-separated lists).
  module ConvertStringToValue
    # Radix prefixes that Kernel#Integer / Kernel#Float may accept but that
    # String#to_i / String#to_f do not understand (they would yield 0).
    RADIX_PREFIX = /\A\s*[+-]?0[xXbBoOdD]/

    # @param str [Object] candidate value, normally a String
    # @return [Boolean] true when +str+ is a decimal integer
    def self.integer?(str)
      if str.is_a?(String)
        return false if str =~ RADIX_PREFIX
        Integer(str, 10) # base 10: "010" means ten, matching String#to_i
      else
        Integer(str)
      end
      true
    rescue ArgumentError, TypeError, RangeError
      false
    end

    # @param str [Object] candidate value, normally a String
    # @return [Boolean] true when +str+ is a decimal floating point number
    def self.float?(str)
      return false if str.is_a?(String) && str =~ RADIX_PREFIX
      Float(str)
      true
    rescue ArgumentError, TypeError, RangeError
      false
    end

    # Convert a field string to the most specific value it represents.
    #
    # @param str [String, nil] raw field text
    # @return [Integer, Float, String, Array, nil] an Array (converted
    #   element-wise) when +str+ contains commas, otherwise an Integer, a
    #   Float, or +str+ itself when it is not numeric
    def self.convert(str)
      return str.split(/,/).map { |item| convert(item) } if str =~ /,/
      return str.to_i if integer?(str)
      return str.to_f if float?(str)
      str
    end
  end

end
8 | 9 | Scenario: Test the info filter using dp and threads 10 | Given I have input file(s) named "test/data/input/somaticsniper.vcf" 11 | When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'" 12 | Then I expect the named output to match the named output "pass1" 13 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | # require 'rubygems' 4 | require 'rake' 5 | # require 'cucumber/rake/task' 6 | 7 | # Cucumber::Rake::Task.new(:features) do |t| 8 | # t.cucumber_opts = "--bundler false" 9 | # end 10 | 11 | desc 'Run cucumber' # without bundler 12 | task :features do 13 | sh 'cucumber features' 14 | end 15 | 16 | task :default => :features 17 | 18 | task :test => [ :features ] 19 | 20 | require 'rdoc/task' 21 | Rake::RDocTask.new do |rdoc| 22 | version = File.exist?('VERSION') ? File.read('VERSION') : "" 23 | 24 | rdoc.rdoc_dir = 'rdoc' 25 | rdoc.title = "bio-vcf #{version}" 26 | rdoc.rdoc_files.include('README*') 27 | rdoc.rdoc_files.include('lib/**/*.rb') 28 | end 29 | -------------------------------------------------------------------------------- /lib/bio-vcf.rb: -------------------------------------------------------------------------------- 1 | # Please require your code below, respecting the naming conventions in the 2 | # bioruby directory tree. 3 | # 4 | # For example, say you have a plugin named bio-plugin, the only uncommented 5 | # line in this file would be 6 | # 7 | # require 'bio/bio-plugin/plugin' 8 | # 9 | # In this file only require other files. Avoid other source code. 
module BioVcf

  # Tallies how often each "REF>ALT" substitution occurs across the records
  # passed to #add and prints a summary table.
  class VcfStatistics

    def initialize
      @count = 0
      @ref_alt_count = Hash.new(0)
    end

    # Count one record, keyed on its reference allele and first alternate
    # allele.
    #
    # @param rec [#ref, #alt] record whose +ref+ is a String and whose
    #   +alt+ is indexable with the first alternate allele at position 0
    # @return [Integer] the updated count for this substitution
    def add(rec)
      @count += 1
      change = rec.ref + ">" + rec.alt[0]
      @ref_alt_count[change] += 1
    end

    # Write the table to standard output, most frequent substitution first,
    # with each count and its rounded percentage of all records.
    def print
      puts "## ==== Statistics =================================="
      @ref_alt_count.sort_by { |_change, n| n }.reverse_each do |change, n|
        # The key is passed as an argument, never as part of the format
        # string, so a '%' in the allele text cannot break the output.
        printf "%s\t%d\t%2.0d%%\n", change, n, (n.to_f / @count * 100).round
      end
      puts "Total\t#{@count}"
      puts "## =================================================="
    end
  end

end
/bin/bash 2 | # 3 | # Stress test bio-vcf by running it on large files and comparing 4 | # results using threads 5 | 6 | input=test/data/input/multisample.vcf 7 | filter='--sfilter 's.dp>70' --seval s.dp' 8 | 9 | echo "cat $input | ./bin/bio-vcf --num-threads 1 $filter > stress_simple01.vcf" 10 | cat $input | ./bin/bio-vcf --num-threads 1 $filter > stress_simple01.vcf 11 | cat $input | ./bin/bio-vcf --num-threads 2 $filter > stress_simple02.vcf 12 | cat $input | ./bin/bio-vcf --num-threads 4 $filter > stress_simple03.vcf 13 | cat $input | ./bin/bio-vcf $filter > stress_simple04.vcf 14 | cat $input | ./bin/bio-vcf --thread-lines 3 $filter > stress_simple05.vcf 15 | cat $input | ./bin/bio-vcf --thread-lines 1 $filter > stress_simple06.vcf 16 | -------------------------------------------------------------------------------- /template/vcf2rdf_header.erb: -------------------------------------------------------------------------------- 1 | =HEADER 2 | @prefix rdf: . 3 | @prefix rdfs: . 4 | @prefix dc: . 5 | @prefix hgnc: . 6 | @prefix doi: . 7 | @prefix seq: . 8 | @prefix db: . 9 | @prefix : . 10 | 11 | =BODY 12 | <% 13 | id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_')) 14 | %> 15 | :<%= id %> 16 | :query_id "<%= id %>"; 17 | seq:chr "<%= rec.chrom %>" ; 18 | seq:pos <%= rec.pos %> ; 19 | seq:ref "<%= rec.ref %>" ; 20 | seq:alt "<%= rec.alt[0] %>" ; 21 | seq:dp <%= rec.info.dp %> ; 22 | db:vcf true . 
23 | 24 | =FOOTER -------------------------------------------------------------------------------- /template/vcf2json_expanded.erb: -------------------------------------------------------------------------------- 1 | =HEADER 2 | <% require 'json' %> 3 | { 4 | "HEADER": { 5 | "options": <%= options.to_h.to_json %>, 6 | "files": <%= ARGV %>, 7 | "version": "<%= BIOVCF_VERSION %>" 8 | }, 9 | "BODY": [ 10 | =BODY 11 | { 12 | "CHR": "<%= rec.chrom %>", 13 | "POS": <%= rec.pos %>, 14 | "REF": "<%= rec.ref %>", 15 | "ALT": <%= rec.alt %>, 16 | "QUAL": <%= rec.qual %>, 17 | 18 | "DP": <%= rec.info.dp %>, 19 | "AF": <%= rec.info.af %>, 20 | "AN": <%= rec.info.an %>, 21 | "MQ": <%= rec.info.mq %>, 22 | "QD": <%= rec.info.qd %>, 23 | "BaseQRankSum": <%= rec.info.baseqranksum %>, 24 | "HaplotypeScore": <%= rec.info.HaplotypeScore %>, 25 | 26 | "samples" : { <% rec.each_sample do |s| %> 27 | "<%= s.name %>": { 28 | "GT": "<%= s.gt %>", 29 | "AD": <%= s.ad %>, 30 | "DP": <%= s.dp %> 31 | } <%= (s.is_last? ? "" : ",") %> 32 | <% end %> 33 | } 34 | 35 | } 36 | =FOOTER 37 | ] 38 | } 39 | -------------------------------------------------------------------------------- /template/gatk_vcf2rdf.erb: -------------------------------------------------------------------------------- 1 | <% 2 | id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_')) 3 | sample_num = 0 4 | %> 5 | :<%= id %> 6 | :query_id "<%= id %>"; 7 | seq:chr "<%= rec.chrom %>" ; 8 | seq:pos <%= rec.pos %> ; 9 | seq:ref "<%= rec.ref %>" ; 10 | seq:alt "<%= rec.alt[0] %>" ; 11 | db:gatk true . 12 | 13 | <% rec.each_sample do | s | %> 14 | <% if not s.empty? 
15 | sample_name = header.samples[sample_num] 16 | sample_id = id + '_' + Turtle::mangle_identifier(sample_name) 17 | sample_num += 1 18 | if s.ad[0]+s.ad[1] != 0 19 | alt_bias = (s.ad[1].to_f/(s.ad[0]+s.ad[1])).round(2) 20 | end 21 | %> 22 | :<%= sample_id %> 23 | :call_id :<%= id %> ; 24 | sample:name "<%= sample_name %>" ; 25 | sample:gt "<%= s.gt %>" ; 26 | <% s.gti.each do | index | %> 27 | sample:ad<%= index %> <%= s.ad[index] %> ; 28 | sample:gts<%= index %> "<%= s.gts[index] %>" ; 29 | <% end %> 30 | sample:dp <%= s.dp %> ; 31 | sample:alt_bias <%= alt_bias %> . 32 | <% end %> 33 | <% end %> 34 | 35 | 36 | -------------------------------------------------------------------------------- /features/step_definitions/cli-feature.rb: -------------------------------------------------------------------------------- 1 | 2 | Given /^I have input file\(s\) named "(.*?)"$/ do |arg1| 3 | @filenames = arg1.split(/,/) 4 | end 5 | 6 | When /^I execute "(.*?)"$/ do |arg1| 7 | @cmd = arg1 + ' < ' + @filenames[0] 8 | end 9 | 10 | # Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1| 11 | # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy 12 | # end 13 | 14 | # Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2| 15 | # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy 16 | # end 17 | 18 | 19 | # Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2| 20 | # RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy 21 | # end 22 | 23 | # Then(/^I expect no errors$/) do 24 | # RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy 25 | # end 26 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013-2021 Pjotr Prins 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /RELEASE_NOTES.md: -------------------------------------------------------------------------------- 1 | ## ChangeLog v0.9.6 (?) 
2 | 3 | + Added JSON VCF header output with --json switch 4 | 5 | ## ChangeLog v0.9.5 (20210118) 6 | 7 | + Improved README and installation instructions 8 | + Added guix.scm build and instructions (no need for bundler) 9 | + Moved regressiontest into tree 10 | 11 | ## ChangeLog v0.9.4 (20201222) 12 | 13 | This is an important maintenance release of bio-vcf: 14 | 15 | + Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf) 16 | + Fixed tests to match recent Ruby updates 17 | 18 | ## Older release notes 19 | 20 | + Getting ready for a 1.0 release 21 | + Released 0.9.2 as a gem 22 | + 0.9.1 removed a rare threading bug and cleanup on error 23 | + Added support for soft filters (request by Brad Chapman) 24 | + The outputter now writes (properly) in parallel with the parser 25 | + bio-vcf turns any VCF into JSON with header information, and 26 | allows you to pipe that JSON directly into any JSON supporting 27 | language, including Python and Javascript! 28 | 29 | ## Older changes 30 | 31 | For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master). 
module BioVcf

  # Arithmetic on per-nucleotide count arrays for a normal and a tumor
  # sample. Both arrays are expected to be the same length and in the same
  # order.
  module Variant

    # Element-wise difference tumor - normal.
    #
    # @param normal [Array<Numeric>]
    # @param tumor [Array<Numeric>]
    # @return [Array<Numeric>]
    def self.diff(normal, tumor)
      tumor.zip(normal).map { |t, n| t - n }
    end

    # Like .diff, after applying .apply_threshold with threshold +t+.
    def self.threshold_diff(t, normal, tumor)
      normal2, tumor2 = apply_threshold(t, normal, tumor)
      diff(normal2, tumor2)
    end

    # Difference relative to the combined count, rounded to two decimals:
    # (tumor - normal) / (tumor + normal). Positions whose combined count is
    # zero yield 0.
    #
    # @return [Array<Numeric>]
    def self.relative_diff(normal, tumor)
      tumor.zip(normal).map do |t, n|
        total = t + n
        total == 0 ? 0 : ((t - n).to_f / total * 100.0).round / 100.0
      end
    end

    # Like .relative_diff, after applying .apply_threshold with threshold +t+.
    def self.relative_threshold_diff(t, normal, tumor)
      normal2, tumor2 = apply_threshold(t, normal, tumor)
      relative_diff(normal2, tumor2)
    end

    # Position of the largest relative difference.
    #
    # The maximum is floored at 0, so when no position has a positive
    # relative difference the first position equal to 0 is returned, or nil
    # when there is none.
    #
    # @return [Integer, nil]
    def self.index(normal, tumor)
      rd = relative_diff(normal, tumor)
      rd.index([0, *rd].max)
    end

    # Zero every normal count above threshold +t+, then zero every tumor
    # count whose (thresholded) normal count is zero.
    #
    # NOTE(review): a normal count that was already 0 also zeroes the
    # matching tumor count, not only counts removed by the threshold;
    # confirm this is intended.
    #
    # @return [Array(Array<Numeric>, Array<Numeric>)] normal and tumor counts
    def self.apply_threshold(t, normal, tumor)
      normal2 = normal.map { |v| v > t ? 0 : v }
      tumor2 = tumor.zip(normal2).map { |v, n| n == 0 ? 0 : v }
      [normal2, tumor2]
    end
  end

end
module BioVcf

  # Looks up whether a record position falls inside an interval of a BED
  # file. Intervals are indexed per chromosome by their stop coordinate.
  class BedFilter
    # Parse a BED file and build the per-chromosome search arrays.
    #
    # Only the first four tab-separated columns (chrom, start, stop, name)
    # are used. Blank lines are skipped.
    #
    # @param bedfilen [String] path of the BED file
    # @raise [RuntimeError] when a non-blank line has fewer than 3 columns
    def initialize(bedfilen)
      # Array#bsearch is part of core Ruby, so no external binary-search
      # library needs to be loaded here.
      stops = {}
      @info = {}
      # File.foreach closes the file when iteration finishes.
      File.foreach(bedfilen) do |line|
        stripped = line.strip
        next if stripped.empty?

        chr, start, stop, gene = stripped.split(/\t/)[0..3]
        raise "Malformed BED line: #{stripped}" if stop.nil?

        stop_pos = stop.to_i
        (stops[chr] ||= []) << stop_pos
        # Key on the integer stop so the lookup in #contains, which builds
        # the key from an Integer, always matches.
        @info["#{chr}:#{stop_pos}"] = [chr, start.to_i, stop_pos, gene]
      end
      # bsearch needs every stop list sorted.
      @chrs = {}
      stops.each { |chr, list| @chrs[chr] = list.sort }
    end

    # Find the BED interval covering a record.
    #
    # Only the interval with the smallest stop >= the record position is
    # examined. The match is inclusive on both ends (start <= pos <= stop).
    # NOTE(review): BED starts are conventionally 0-based and VCF positions
    # 1-based; confirm the inclusive start is intended.
    #
    # @param rec [#chrom, #pos] record with chromosome name and position
    # @return [Array, nil] [chrom, start, stop, name] of the interval, or
    #   nil when the position is not covered
    # @raise [RuntimeError] when the index is inconsistent
    def contains(rec)
      stop_list = @chrs[rec.chrom]
      return nil unless stop_list

      pos = rec.pos
      stop = stop_list.bsearch { |bedstop| bedstop >= pos }
      return nil unless stop

      rinfo = @info["#{rec.chrom}:#{stop}"]
      raise "Unexpected error in BED record for #{rec.chrom}:#{stop} position" if rinfo.nil?

      pos >= rinfo[1] ? rinfo : nil
    end
  end

end
18 | 19 | Scenario: Diffing nucleotide counts 20 | 21 | Given normal and tumor counts [0,25,0,1] and [0,40,0,12] 22 | When I look for the difference 23 | Then I expect the diff to be [0,15,0,11] 24 | And I expect the defining tumor nucleotide to be "T" 25 | And I expect the tumor count to be 12 26 | When I set an inclusion threshold for the reference 27 | Then I expect the diff for threshold 2 to be [0,0,0,11] 28 | And the relative diff to be [0,0,0,0.85] 29 | 30 | -------------------------------------------------------------------------------- /test/data/regression/seval_s.dp.ref: -------------------------------------------------------------------------------- 1 | 1 10257 159 242 249 249 186 212 218 2 | 1 10291 165 249 249 247 161 163 189 3 | 1 10297 182 246 250 246 165 158 183 4 | 1 10303 198 247 248 248 172 157 182 5 | 1 10315 212 246 242 245 190 157 189 6 | 1 10321 218 246 248 248 193 164 196 7 | 1 10327 237 238 229 237 209 183 210 8 | 1 10583 8 24 21 23 15 19 19 9 | 1 10665 7 5 2 7 10 | 1 10694 5 5 11 | 1 10723 4 5 6 12 | 1 12783 58 164 144 182 126 103 158 13 | 1 13116 32 131 102 152 104 88 109 14 | 1 13118 34 129 101 145 99 85 108 15 | 1 13178 52 172 137 172 129 119 148 16 | 1 13302 36 136 99 146 90 65 117 17 | 1 13757 53 201 181 250 152 130 182 18 | 1 13868 75 192 182 224 142 111 167 19 | 1 13896 62 135 143 175 112 81 121 20 | 1 14354 43 158 115 145 72 119 21 | 1 14464 51 155 141 150 83 89 140 22 | 1 14673 36 142 117 157 95 76 131 23 | 1 14699 43 128 109 147 98 78 114 24 | 1 14907 57 216 162 205 153 118 158 25 | 1 14930 68 216 170 210 136 125 164 26 | 1 14933 68 216 169 212 132 128 164 27 | 1 14948 63 192 181 211 129 121 153 28 | 1 14976 56 166 161 196 109 116 135 29 | 1 15118 46 198 129 230 113 126 158 30 | 1 15190 53 208 170 200 126 145 179 31 | 1 15211 54 183 161 171 120 134 168 32 | 1 15274 37 121 102 137 71 67 98 33 | 1 15447 46 242 183 226 137 173 175 34 | 1 15688 37 182 147 184 100 101 148 35 | 1 16068 33 57 68 81 49 49 58 36 | 1 16103 50 79 86 106 60 
61 84 37 | -------------------------------------------------------------------------------- /bio-vcf.gemspec: -------------------------------------------------------------------------------- 1 | # No longer generated by jeweler 2 | # -*- encoding: utf-8 -*- 3 | 4 | Gem::Specification.new do |s| 5 | s.name = "bio-vcf" 6 | s.version = File.read("VERSION") 7 | 8 | s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version= 9 | s.authors = ["Pjotr Prins"] 10 | s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)" 11 | s.email = "pjotr.public01@thebird.nl" 12 | s.executables = ["bio-vcf"] 13 | s.extra_rdoc_files = [ 14 | "LICENSE", 15 | "README.md" 16 | ] 17 | s.files = [ 18 | ".travis.yml", 19 | "Gemfile", 20 | "LICENSE", 21 | "README.md", 22 | "Rakefile", 23 | "VERSION", 24 | "bin/bio-vcf", 25 | "bio-vcf.gemspec", 26 | "ragel/gen_vcfheaderline_parser.rl", 27 | "ragel/generate.sh", 28 | ] 29 | s.files += Dir['lib/**/*.rb'] + Dir['bin/*'] 30 | s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] + 31 | Dir['template/**/*'] 32 | 33 | s.homepage = "http://github.com/vcflib/bio-vcf" 34 | s.licenses = ["MIT"] 35 | s.require_paths = ["lib"] 36 | s.required_ruby_version = Gem::Requirement.new(">= 2.0.0") 37 | # s.rubygems_version = "2.0.3" 38 | s.summary = "Fast multi-purpose multi-threaded VCF parser" 39 | 40 | end 41 | 42 | -------------------------------------------------------------------------------- /features/step_definitions/vcf_header.rb: -------------------------------------------------------------------------------- 1 | Given(/^the VCF header lines$/) do |string| 2 | header = VcfHeader.new 3 | header.add string 4 | @vcf = header 5 | end 6 | 7 | When(/^I parse the VCF header$/) do 8 | end 9 | 10 | Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do 11 | 
require 'zlib'

module BioVcf
  # This class abstracts a VCF file that can be iterated.
  # The VCF can be plain text or compressed with gzip.
  # Note that files compressed with bgzip will not work, as this Ruby
  # implementation of Zlib does not allow concatenated files.
  class VCFfile

    # Matches a VCF "##INFO=<ID=..,Number=..,Type=..,Description="..">" meta
    # line and captures the four standard attributes.
    INFO_LINE = /##INFO=<ID=([^,]+),Number=([^,]+),Type=([^,]+),Description="([^"]*)"/

    # file::  path of the VCF file to read
    # is_gz:: true when the file is gzip compressed (the default)
    def initialize(file: "", is_gz: true)
      @file = file
      @is_gz = is_gz
    end

    # Parse a single ##INFO header line.
    #
    # Returns a Hash with the keys :id, :number, :type and :desc, or nil when
    # +head_line+ is not an INFO line (previously this crashed with a
    # NoMethodError on nil).
    def parseVCFheader(head_line="")
      m = INFO_LINE.match(head_line)
      return nil unless m
      {:id=>m[1],:number=>m[2],:type=>m[3],:desc=>m[4]}
    end

    # Yields a BioVcf::VcfRecord for every data line in the file. All lines
    # starting with '#' are fed to a BioVcf::VcfHeader that is shared by the
    # yielded records.
    #
    # Returns an enum that can be used as an iterator when no block is given.
    def each
      return enum_for(:each) unless block_given?
      # File.open/GzipReader.open (rather than Kernel#open) never interpret
      # the path as a command; the reader owns the underlying file handle.
      io = @is_gz ? Zlib::GzipReader.open(@file) : File.open(@file)
      begin
        header = BioVcf::VcfHeader.new
        io.each_line do |line|
          line.chomp!
          if line.start_with?('#')
            # covers both the '##' meta lines and the '#CHROM' column line
            header.add(line)
            next
          end
          fields = BioVcf::VcfLine.parse(line)
          yield BioVcf::VcfRecord.new(fields,header)
        end
      ensure
        # Release the handle even when the caller breaks out early or raises.
        io.close
      end
    end
  end
end
to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5| 31 | @t = arg1.to_i 32 | @t_diff = Variant.threshold_diff(@t,@normal,@tumor) 33 | expect(@t_diff).to eq [arg2.to_i,arg3.to_i,arg4.to_i,arg5.to_i] 34 | end 35 | 36 | Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5| 37 | res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f] 38 | expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res 39 | end 40 | 41 | 42 | -------------------------------------------------------------------------------- /features/vcf_header.feature: -------------------------------------------------------------------------------- 1 | @meta 2 | Feature: Parsing VCF meta information from the header 3 | 4 | Take a header and parse that information as defined by the VCF standard. 5 | 6 | Scenario: When parsing a header line 7 | 8 | Given the VCF header lines 9 | """ 10 | ##fileformat=VCFv4.1 11 | ##fileDate=20140121 12 | ##phasing=none 13 | ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##INFO= 18 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR 19 | """ 20 | When I parse the VCF header 21 | Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'] 22 | And I expect vcf.fileformat to be "VCFv4.1" 23 | And I expect vcf.fileDate to be "20140121" 24 | And I expect vcf.field['fileDate'] to be "20140121" 25 | And I expect vcf.phasing to be "none" 26 | And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta" 27 | And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"} 28 | And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"} 29 | And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", 
;; To use this file to build HEAD of bio-vcf:
;;
;;   guix build -f guix.scm
;;
;; To get a development container (emacs shell will work)
;;
;;   rm Gemfile.lock # remove any dependencies
;;   guix environment -C -l guix.scm
;;   ruby ./bin/bio-vcf
;;
;;   rake test   # for testing
;;   rake rdoc   # for generating docs

(use-modules
  ((guix licenses) #:prefix license:)
  (guix gexp)
  (guix packages)
  (guix git-download)
  (guix build-system ruby)
  (guix build-system trivial)
  (gnu packages ruby)
  (gn packages ruby)
  (srfi srfi-1)
  (ice-9 popen)
  (ice-9 rdelim))

;; Directory that contains this file, i.e. the checkout being built.
(define %source-dir (dirname (current-filename)))

;; Hash of the current HEAD commit, as printed by `git show`.
(define %git-commit
  (read-string (open-pipe "git show HEAD | head -1 | cut -d ' ' -f 2" OPEN_READ)))

;; Base version, read from the VERSION file next to this script so that it
;; cannot drift from the version the gemspec reports.
(define %version
  (string-trim-both
   (call-with-input-file (string-append %source-dir "/VERSION") read-string)))

(define-public bio-vcf-source
  (package
    (name "bio-vcf-source")
    (version (git-version %version "HEAD" %git-commit))
    (source (local-file %source-dir #:recursive? #t))
    (build-system trivial-build-system)
    (propagated-inputs
     `(("ruby" ,ruby)
       ("ruby-rake" ,ruby-rake)))
    (native-inputs
     `(("ruby-cucumber" ,ruby-cucumber)
       ))
    (arguments
     `(#:modules ((guix build utils))
       #:builder
       (begin
         (use-modules (guix build utils))
         ;; The trivial builder only creates <out>/share.
         (let ((target (string-append (assoc-ref %outputs "out")
                                      "/share")))
           (write target)
           (mkdir-p target)
           #t))))
    (synopsis "Smart VCF parser DSL")
    (description
     "Bio-vcf provides a @acronym{DSL, domain specific language} for processing
the VCF format.  Record named fields can be queried with regular expressions.
Bio-vcf is a new generation VCF parser, filter and converter.  Bio-vcf is not
only very fast for genome-wide (WGS) data, it also comes with a filtering,
evaluation and rewrite language and can output any type of textual data,
including VCF header and contents in RDF and JSON.")
    (home-page "http://github.com/vcflib/bio-vcf")
    (license license:expat)))


bio-vcf-source
require 'erb'

module Bio

  # An ERB template file that may be divided into HEADER, BODY and FOOTER
  # sections by marker lines containing "=HEADER", "=BODY" and "=FOOTER".
  # The header and footer are meant to be rendered once, the body once per
  # record. A template without a "=BODY" section is treated as body only.
  class Template

    # fn::           path of the template file (raises when it does not exist)
    # handle_comma:: when true, a trailing comma in the body template is
    #                removed and a "," is emitted *between* rendered bodies
    #                instead (useful for JSON-style lists)
    def initialize(fn, handle_comma = false)
      @handle_comma = handle_comma
      raise "Can not find template #{fn}!" unless File.exist?(fn)
      parse(File.read(fn))
      @is_first = true
    end

    # Parse the template and split into HEADER, BODY and FOOTER sections
    def parse(buf)
      header = []
      body = []
      footer = []
      where = :header
      buf.split("\n").each do |line|
        case where
        when :header
          next if line =~ /=HEADER/
          if line =~ /=BODY/
            where = :body
            next
          end
          header << line
        when :body
          if line =~ /=FOOTER/
            where = :footer
            next
          end
          body << line
        else
          footer << line
        end
      end
      # No body collected: everything read so far is the body.
      if body.empty?
        body = header
        header = []
      end
      # The three templates are always built; an empty section simply
      # renders as "". (The former `if x.size` guards were always true.)
      @erb_header = ERB.new(header.join("\n"))
      @erb_body   = ERB.new(body_source(body))
      @erb_footer = ERB.new(footer.join("\n"))
    end

    # Render the template body without any comma bookkeeping. For a template
    # without section markers this is the whole template. (This used to
    # dereference @erb, which is never assigned.)
    def result(env)
      @erb_body.result(env)
    end

    # Call the HEADER template (once)
    def header(env)
      @erb_header.result(env)
    end

    # For every record call the BODY template. With handle_comma set, every
    # call but the first is prefixed with a ",".
    def body(env)
      separator = @handle_comma && !@is_first ? "," : ""
      @is_first = false
      separator + @erb_body.result(env)
    end

    # Call the FOOTER template (once)
    def footer(env)
      @erb_footer.result(env)
    end

    private

    # Join the body lines. When commas are handled, blank out a trailing
    # comma: looking backwards from the end, the first of ']', '}' or ','
    # decides - only a ',' is replaced (by a space).
    def body_source(lines)
      text = lines.join("\n")
      if @handle_comma
        idx = text.rindex(/[\]},]/)
        text[idx] = " " if idx && text[idx] == ","
      end
      text
    end
  end
end
Copy-on-write can be offered by an operating system to provide 4 | efficient parallelisation for streaming operations typical in biology 5 | where chunks of data can be processed independently from the same 6 | starting point. 7 | 8 | PCOWS compares to a regular thread pool with the difference that data 9 | is not communicated through messages or pipes, but simply through 10 | memory. The only catch is that PCOWS threads can not easily communicate 11 | back. The Ruby parallel gem communicates back through pipes but that 12 | means the main thread can not proceed until all the pipes complete. 13 | 14 | PCOWS was first explored in the bio-ruby VCF tool (bio-vcf 0.7 series) 15 | and then replaced by the parallel gem (bio-vcf 0.8 16 | series). Performance, unfortunately, deteriorated to the extent that 17 | PCOWS got retrofitted. 18 | 19 | PCOWS basically reads a file and chunks it on the main thread. Every 20 | chunk gets fed to a copy-on-write version of the Ruby interpreter 21 | which contains the full state up to the point of forking - this is an 22 | inexpensive procedure. Each chunk gets processed to a file. The main 23 | thread tracks these files and issues a separate thread for harvesting 24 | the ordered files so each gets piped to STDOUT in order. 25 | 26 | This means the reader thread is only held up 27 | by the number of allowed forks running at a time. 28 | 29 | Based on this description the interface can be expressed as a reader 30 | creating chunks of state that get passed on to a chunk processor named 31 | 'run' which gets a callback 'func'. Every time a chunk gets processed 32 | the reader checks the size of the thread pool and also checks whether 33 | output has become available. 34 | 35 | The threads communicate simply through a file. Each thread writes to 36 | STDOUT which has been redirected to a temporary file with extension 37 | '.part'. When the thread is complete, the file gets renamed by 38 | removing the '.part'.
require 'fileutils'
require 'timeout'

module RegressionTest

  DEFAULT_TESTDIR = "test/data/regression"

  # Regression test runner compares output in ./test/data/regression
  # (by default). The convention is to have a file with names .ref
  # (reference) and create .new
  #
  # You can add an :ignore regex option which ignores lines in the
  # comparison files matching a regex
  #
  # :timeout sets the time out for calling a system command
  #
  # :should_fail expects the system command to return a non-zero
  module CliExec
    FilePair = Struct.new(:outfn,:reffn)

    # Run +command+ through the shell with stdout/stderr redirected to
    # <base>.new and <base>-stderr.new, then compare these against the
    # matching .ref files (only those that exist).
    #
    # +testname+ is looked up as given first and below DEFAULT_TESTDIR
    # second; a RuntimeError is raised when no reference file is found.
    #
    # Returns true when the exit status is as expected and the outputs
    # match, false otherwise (including on timeout).
    def CliExec::exec(command, testname, options = {})
      # ---- Find .ref file
      fullname = DEFAULT_TESTDIR + "/" + testname
      basefn = if File.exist?(testname+".ref") || File.exist?(testname+"-stderr.ref")
                 testname
               elsif File.exist?(fullname + ".ref") || File.exist?(fullname+"-stderr.ref")
                 fullname
               else
                 raise "Can not find reference file for #{testname} - expected #{fullname}.ref"
               end
      std_out = FilePair.new(basefn + ".new", basefn + ".ref")
      std_err = FilePair.new(basefn + "-stderr.new", basefn + "-stderr.ref")
      files = [std_out,std_err]
      # ---- Create .new file
      cmd = command + " > #{std_out.outfn} 2>#{std_err.outfn}"
      $stderr.print cmd,"\n"
      begin
        exec_ret = run_command(cmd, options[:timeout])
      rescue Timeout::Error
        # The rescue has to wrap Timeout.timeout: the error is raised from
        # that call, not from within its block.
        $stderr.print cmd, " failed to finish in under #{options[:timeout]}\n"
        return false
      end
      # Kernel.system returns true (exit 0), false (non-zero exit) or nil
      # (could not execute) - never 0 - so test for truthiness.
      expect_fail = options[:should_fail] ? true : false
      if !expect_fail && !exec_ret
        $stderr.print cmd," returned an error\n"
        return false
      end
      if expect_fail && exec_ret
        $stderr.print cmd," did not return an error\n"
        return false
      end
      strip_ignored_lines(files, options[:ignore]) if options[:ignore]
      # ---- Compare the two files
      files.each do |f|
        next unless File.exist?(f.reffn)
        return false unless compare_files(f.outfn,f.reffn,options[:ignore])
      end
      true
    end

    # Compare output file +fn1+ with reference file +fn2+ using diff(1).
    # A missing reference is created from the output. A mismatch is retried
    # once after a short pause because worker threads may not have finished
    # writing. +ignore+ is accepted for compatibility and not used here.
    def CliExec::compare_files(fn1, fn2, ignore = nil)
      unless File.exist?(fn2)
        FileUtils::cp(fn1,fn2)
        return true
      end
      $stderr.print "diff #{fn2} #{fn1}\n"
      # Argument-list form: the file names are not re-parsed by a shell.
      return true if Kernel.system("diff", fn2, fn1)
      # Hmmm. We have a different result. We are going to try again
      # because sometimes threads have not completed
      sleep 0.25
      return true if Kernel.system("diff", fn2, fn1)
      $stderr.print "If it is correct, execute \"cp #{fn1} #{fn2}\", and run again\n"
      false
    end

    # Execute +cmd+ through the shell, bounded by +timeout+ seconds when a
    # positive timeout is given. Raises Timeout::Error when the limit hits.
    def CliExec::run_command(cmd, timeout)
      return Kernel.system(cmd) unless timeout && timeout > 0
      Timeout.timeout(timeout) { Kernel.system(cmd) }
    end

    # Rewrite every output file in place without the lines matching +regex+.
    def CliExec::strip_ignored_lines(files, regex)
      pattern = /#{regex}/
      files.each do |f|
        kept = File.readlines(f.outfn).reject { |line| line =~ pattern }
        File.write(f.outfn, kept.join)
      end
    end

    private_class_method :run_command, :strip_ignored_lines
  end

end
. GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/2:0/2:14:0,12,0,2:12,0,2,0:14:35:14:14,35:37:37,37:1:. 0/1:0/1:10:0,8,0,2:8,2,0,0:18:35:18:20,51:37:37,37:2:33 26 | """ 27 | When I parse the record 28 | Then I expect rec.chrom to contain "1" 29 | Then I expect rec.pos to contain 27691244 30 | Then I expect rec.ref to contain "A" 31 | And I expect rec.alt to contain ["C","G"] 32 | And I expect rec.tumor.dp to be 10 33 | And I expect rec.tumor.dp4 to be [0,8,0,2] 34 | And I expect rec.tumor.bcount.to_ary to be [8,2,0,0] 35 | And I expect rec.tumor.bcount[rec.alt] to be [2,0] 36 | And I expect rec.tumor.bcount["G"] to be 0 37 | And I expect rec.tumor.bcount[1] to be 2 38 | And I expect rec.tumor.bcount[3] to be 0 39 | And I expect rec.tumor.bcount.sum to be 2 40 | And I expect rec.tumor.bcount.max to be 2 41 | And I expect rec.tumor.bq.to_ary to be [20,51] 42 | And I expect rec.tumor.bq["G"] to be 51 43 | And I expect rec.tumor.bq[1] to be 51 44 | And I expect rec.tumor.bq.min to be 20 45 | And I expect rec.tumor.bq.max to be 51 46 | And I expect rec.tumor.amq.to_ary to be [37,37] 47 | And I expect rec.tumor.mq to be 37 48 | And I expect rec.tumor.ss to be 2 49 | And I expect rec.tumor.ssc to be 33 50 | And I expect rec.normal.ssc to be nil 51 | # The following are additional functions 52 | And I expect rec.call_diff to be [-4,2,-2,0] 53 | And I expect rec.call_nuc to be "C" 54 | And I expect rec.call_tumor_count to be 2 55 | And I expect rec.call_normal_count to be 0 56 | And I expect rec.call_tumor_relative_count to be 1.0 57 | 58 | Given the somatic sniper vcf line 59 | """ 60 | 1 27686841 . A T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:15:3,12,0,0:15,0,0,0:66:37:0:25:37:37:0:. 
0/1:0/1:16:2,11,0,3:13,0,0,3:30:37:30:34,55:37:37,37:2:37 61 | """ 62 | When I parse the record 63 | Then I expect rec.chrom to contain "1" 64 | Then I expect rec.pos to contain 27686841 65 | Then I expect rec.ref to contain "A" 66 | And I expect rec.alt to contain one ["T"] 67 | And I expect rec.tumor.dp to be 16 68 | And I expect rec.tumor.dp4 to be [2,11,0,3] 69 | And I expect rec.tumor.bcount.to_ary to be [13,0,0,3] 70 | And I expect rec.tumor.bcount[rec.alt] to be one [3] 71 | And I expect rec.tumor.bcount["G"] to be 0 72 | And I expect rec.tumor.bcount["T"] to be 3 73 | And I expect rec.tumor.bcount[1] to be 0 74 | And I expect rec.tumor.bcount[3] to be 3 75 | And I expect rec.tumor.bcount.sum to be 3 76 | And I expect rec.tumor.bcount.max to be 3 77 | And I expect rec.tumor.bq.to_ary to be [34,55] 78 | And I expect rec.tumor.bq["T"] to be 34 79 | And I expect rec.tumor.bq[1] to be 55 80 | And I expect rec.tumor.bq.min to be 34 81 | And I expect rec.tumor.bq.max to be 55 82 | And I expect rec.tumor.amq.to_ary to be [37,37] 83 | And I expect rec.tumor.mq to be 37 84 | And I expect rec.tumor.ss to be 2 85 | 86 | 87 | -------------------------------------------------------------------------------- /doc/Compare_VCFs.md: -------------------------------------------------------------------------------- 1 | # Comparing VCF files 2 | 3 | Between two different pipeline runs we ended up with different VCF 4 | files. The starting point (BAMs) was the same, but in each pipeline 5 | different procedures may have been followed, i.e. the processing steps 6 | for variant calling were not exactly the same. The first 7 | freebayes+varscan2 pipeline (P1) I wrote after testing many callers 8 | including somatic sniper and strelka, so I should know that well 9 | enough. The second pipeline (P2) includes more variant callers. To 10 | find out how they compared and which output was preferred I decided to 11 | do some analysis using bio-vcf. 
12 | 13 | ## Comparing freebayes output 14 | 15 | The freebayes somatic variant calling files differed in size. Just 16 | looking at one sample P1 had 479 lines and P2 had 729. The germline 17 | calls, however, were comparable in size. But when I ran a diff it 18 | showed these differed significantly too: 19 | 20 | wc -l germline*vcf 21 | 3527 germline1.vcf 22 | 3500 germline2.vcf 23 | cat germline1.vcf|bio-vcf -e "[r.chr,r.pos]"|sort > germline1.txt 24 | cat germline2.vcf|bio-vcf -e "[r.chr,r.pos]"|sort > germline2.txt 25 | diff germline1.txt germline2.txt |wc 26 | 1751 27 | 28 | To zoom in on settings, let's look at read depth on chromosome 7 (-v 29 | and --num-threads=1 options I typically use when trying new filters 30 | because they give digestible output): 31 | 32 | cat germline1.vcf|bio-vcf -v --num-threads 1 --filter 'rec.chr=="7"' -e '[r.chr,r.pos,r.info.dp]'|sort 33 | 7 90225928 34 34 | 7 95216394 69 35 | 7 97821397 97 36 | 7 97821398 98 37 | 7 97822115 96 38 | 7 97822210 94 39 | 7 98503849 109 40 | 7 98543545 68 41 | 7 98543546 69 42 | 7 98582562 38 43 | 7 98650051 48 44 | 7 99690690 78 45 | 7 99690747 27 46 | 7 99693552 34 47 | 48 | cat germline2.vcf|bio-vcf -v --num-threads 1 --filter 'rec.chr=="7"' -e '[r.chr,r.pos,r.info.dp]'|sort 49 | 7 90225928 121 50 | 7 95216394 534 51 | 7 97822115 1053 52 | 7 97822210 1044 53 | 7 97834704 249 54 | 7 98503849 1579 55 | 7 98547176 59 56 | 7 98553739 21 57 | 7 98648517 75 58 | 7 98650051 344 59 | 7 99690690 455 60 | 7 99690747 168 61 | 7 99693552 107 62 | 63 | OK, this is informative. P1 called variants after removing duplicate reads. P2 64 | did not. That explains the difference in the number of variants called. 65 | 66 | Unfortunately the sequencing concerns an FFPE dataset. FFPE degrades 67 | over time and DNA changes. In itself this is not a problem because we 68 | sequence many cells and the changed ones do not necessarily 69 | dominate.
We do, however, amplify the DNA before sequencing through a 70 | PCR-type process. This means that randomly these FFPE changes may 71 | become dominant at a certain position and variant callers score them 72 | as genuine variants. I have studied this data and there is ample 73 | evidence of this effect. The only way to address this is by removing 74 | duplicate reads - so the amplified reads get compressed into one 75 | (theoretically, because sometimes there are multiple errors confusing 76 | things a bit). Removing duplicates is the *only* way and this can not 77 | happen *after* variant calling. 78 | 79 | This means P2 is out of the window. It is useless data also for the 80 | other variant callers. I don't even have to check the somatic calling. 81 | 82 | ## Conclusion 83 | 84 | A simple read depth check with bio-vcf proved that P2 had no 85 | merit. Either we rerun it after removing duplicates or we rely on P1. 86 | -------------------------------------------------------------------------------- /test/data/regression/eval_once.ref: -------------------------------------------------------------------------------- 1 | {"UnifiedGenotyper"=>{"ID"=>"UnifiedGenotyper", "Version"=>"2.8-1-g932cd3a", "Date"=>"Sat Jan 25 10:33:56 CET 2014", "Epoch"=>1390642436187, "CommandLineOptions"=>"analysis_type=UnifiedGenotyper input_file=[/data_fedor12/BAM/sander/Liver_clones/BIOPSY17513D/mapping/BIOPSY17513D_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/clone3/mapping/clone3_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/clone4/mapping/clone4_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/clone10/mapping/clone10_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/subclone33/mapping/subclone33_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/subclone46/mapping/subclone46_dedup_realigned_recalibrated.bam, 
/data_fedor12/BAM/sander/Liver_clones/subclone105/mapping/subclone105_dedup_realigned_recalibrated.bam] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/data_fedor13/sander/variant_calling/Liver_clones/.queue/scatterGather/UnifiedGenotyper_noref-1-sg/temp_001_of_500/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/data_fedor13/common_data/references/H_sapiens/GATK_b37_bundle_reference/basespace/human_g1k_v37.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=DYNAMIC_SEEK variant_index_parameter=-1 logging_level=INFO log_to_file=null help=false version=false genotype_likelihoods_model=SNP pcr_error_rate=1.0E-4 computeSLOD=false annotateNDA=false pair_hmm_implementation=LOGLESS_CACHING min_base_quality_score=17 max_deletion_fraction=0.05 allSitePLs=false min_indel_count_for_genotyping=5 min_indel_fraction_per_sample=0.25 indelGapContinuationPenalty=10 indelGapOpenPenalty=45 
indelHaplotypeSize=80 indelDebug=false ignoreSNPAlleles=false allReadsSP=false ignoreLaneInfo=false reference_sample_calls=(RodBinding name= source=UNBOUND) reference_sample_name=null sample_ploidy=2 min_quality_score=1 max_quality_score=40 site_quality_prior=20 min_power_threshold_for_calling=0.95 min_reference_depth=100 exclude_filtered_reference_sites=false output_mode=EMIT_VARIANTS_ONLY heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=6 input_prior=[] contamination_fraction_to_filter=0.0 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null dbsnp=(RodBinding name= source=UNBOUND) comp=[] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub onlyEmitSamples=[] debug_file=null metrics_file=null annotation=[] excludeAnnotation=[] filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"}} 2 | -------------------------------------------------------------------------------- /lib/bio-vcf/vcfrdf.rb: -------------------------------------------------------------------------------- 1 | module BioVcf 2 | 3 | # This is some primarily RDF support - which may be moved to another gem 4 | # 5 | # Note that this functionality is superceded by the --template command! Though 6 | # this can be useful for one-liners. 7 | 8 | module VcfRdf 9 | 10 | def VcfRdf::header 11 | print < . 13 | @prefix rdfs: . 14 | @prefix dc: . 15 | @prefix hgnc: . 16 | @prefix doi: . 17 | @prefix db: . 18 | @prefix seq: . 19 | @prefix : . 
20 | EOB 21 | end 22 | 23 | def VcfRdf::record id,rec,tags = "{}" 24 | id2 = [id,'ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_') 25 | print <\] 56 | if s =~ /[^\\]\\[^\\]/ 57 | s2 = [] 58 | s.each_char.with_index { |c,i| 59 | res = c 60 | if i>0 and c == '\\' and s[i-1] != '\\' and s[i+1] !~ /^[uUtnr\\"]/ 61 | res = '\\' + c 62 | end 63 | # p [i,c,s[i+1],res] 64 | s2 << res 65 | } 66 | s = s2.join('') 67 | end 68 | s 69 | end 70 | 71 | def RDF::stringify_literal(literal) 72 | RDF::escape_string_literal(literal.to_s) 73 | end 74 | 75 | def RDF::quoted_stringify_literal(literal) 76 | '"' + stringify_literal(literal) + '"' 77 | end 78 | end 79 | 80 | module Turtle 81 | 82 | def Turtle::stringify_literal(literal) 83 | RDF::stringify_literal(literal) 84 | end 85 | 86 | def Turtle::identifier(id) 87 | raise "Illegal identifier #{id}" if id != Turtle::mangle_identifier(id) 88 | end 89 | 90 | # Replace letters/symbols that are not allowed in a Turtle identifier 91 | # (short hand URI). This should be the definitive mangler and replace the 92 | # ones in bioruby-table and bio-exominer. Manglers are useful when using 93 | # data from other sources and trying to transform them into simple RDF 94 | # identifiers. 95 | 96 | def Turtle::mangle_identifier(s) 97 | id = s.strip.gsub(/[^[:print:]]/, '').gsub(/[#)(,]/,"").gsub(/[%]/,"perc").gsub(/(\s|\.|\$|\/|\\|\>)+/,"_") 98 | id = id.gsub(/\[|\]/,'') 99 | # id = URI::escape(id) 100 | id = id.gsub(/\|/,'_') 101 | id = id.gsub(/\-|:/,'_') 102 | if id != s 103 | # Don't want Bio dependency in templates! 104 | # logger = Bio::Log::LoggerPlus.new 'bio-rdf' 105 | # logger.warn "\nWARNING: Changed identifier <#{s}> to <#{id}>" 106 | # $stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>" 107 | end 108 | if not RDF::valid_uri?(id) 109 | raise "Invalid URI after mangling <#{s}> to <#{id}>!" 110 | end 111 | valid_id = if id =~ /^\d/ 112 | 'r' + id 113 | else 114 | id 115 | end 116 | valid_id # we certainly hope so! 
117 | end 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /features/cli.feature: -------------------------------------------------------------------------------- 1 | @cli 2 | Feature: Command-line interface (CLI) 3 | 4 | bio-vcf has a powerful command line interface. Here we regression test features. 5 | 6 | Scenario: Test the info filter using dp 7 | Given I have input file(s) named "test/data/input/multisample.vcf" 8 | When I execute "./bin/bio-vcf -i --filter 'r.info.dp>100'" 9 | Then I expect the named output to match the named output "r.info.dp" 10 | 11 | Scenario: Test the info filter using dp and threads 12 | Given I have input file(s) named "test/data/input/multisample.vcf" 13 | When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'" 14 | Then I expect the named output to match the named output "thread4" in under 30 seconds 15 | 16 | Scenario: Test the info filter using dp and threads with lines 17 | Given I have input file(s) named "test/data/input/multisample.vcf" 18 | When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'" 19 | Then I expect the named output to match the named output "thread4_4" in under 30 seconds 20 | 21 | Scenario: Test the sample filter using dp 22 | Given I have input file(s) named "test/data/input/multisample.vcf" 23 | When I execute "./bin/bio-vcf -i --sfilter 's.dp>20'" 24 | Then I expect the named output to match the named output "s.dp" 25 | 26 | Scenario: Test the include sample filter using dp 27 | Given I have input file(s) named "test/data/input/multisample.vcf" 28 | When I execute "./bin/bio-vcf -i --ifilter 's.dp>100' --seval s.dp" 29 | Then I expect the named output to match the named output "ifilter_s.dp" 30 | 31 | Scenario: Test the info eval using dp 32 | Given I have input file(s) named "test/data/input/multisample.vcf" 33 | When I execute "./bin/bio-vcf -i --eval 'r.info.dp'" 34 | Then I expect the named 
output to match the named output "eval_r.info.dp" 35 | 36 | Scenario: Test the sample eval using dp 37 | Given I have input file(s) named "test/data/input/multisample.vcf" 38 | When I execute "./bin/bio-vcf -i --seval 's.dp'" 39 | Then I expect the named output to match the named output "seval_s.dp" 40 | 41 | Scenario: Test the sample filter + eval using dp 42 | Given I have input file(s) named "test/data/input/multisample.vcf" 43 | When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'" 44 | Then I expect the named output to match the named output "sfilter_seval_s.dp" 45 | 46 | Scenario: Rewrite an info field 47 | Given I have input file(s) named "test/data/input/multisample.vcf" 48 | When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'" 49 | Then I expect the named output to match the named output "rewrite.info.sample" 50 | 51 | Scenario: Test eval-once 52 | Given I have input file(s) named "test/data/input/multisample.vcf" 53 | When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']" 54 | Then I expect the named output to match the named output "eval_once" 55 | 56 | Scenario: Test JSON output with header meta data 57 | Given I have input file(s) named "test/data/input/multisample.vcf" 58 | When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb" 59 | Then I expect the named output to match the named output "vcf2json_full_header" 60 | 61 | Scenario: Test JSON output with header meta data and query samples 62 | Given I have input file(s) named "test/data/input/multisample.vcf" 63 | When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb" 64 | Then I expect the named output to match the named output "vcf2json_use_meta" 65 | 66 | Scenario: Test deadlock on failed filter with threads 67 | Given I have input file(s) named "test/data/input/multisample.vcf" 68 | When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'" 69 | Then I expect an 
error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds 70 | 71 | Scenario: Test VCF with no records 72 | Given I have input file(s) named "test/data/input/empty.vcf" 73 | When I execute "./bin/bio-vcf --timeout=5" 74 | Then I expect no errors 75 | -------------------------------------------------------------------------------- /features/step_definitions/somaticsniper.rb: -------------------------------------------------------------------------------- 1 | Given(/^the somatic sniper vcf line$/) do |string| 2 | @fields = VcfLine.parse(string.split(/\s+/).join("\t")) 3 | end 4 | 5 | When(/^I parse the record$/) do 6 | header = VcfHeader.new 7 | @rec = VcfRecord.new(@fields,header) 8 | end 9 | 10 | Then(/^I expect rec\.chrom to contain "(.*?)"$/) do |arg1| 11 | expect(@rec.chrom).to eq "1" 12 | end 13 | 14 | Then(/^I expect rec\.pos to contain (\d+)$/) do |arg1| 15 | expect(@rec.pos).to eq arg1.to_i 16 | end 17 | 18 | Then(/^I expect rec\.ref to contain "(.*?)"$/) do |arg1| 19 | expect(@rec.ref).to eq arg1 20 | end 21 | 22 | Then(/^I expect rec\.alt to contain \["(.*?)","(.*?)"\]$/) do |arg1, arg2| 23 | expect(@rec.alt).to eq [arg1,arg2] 24 | end 25 | 26 | Then(/^I expect rec\.alt to contain one \["(.*?)"\]$/) do |arg1| 27 | expect(@rec.alt).to eq [arg1] 28 | end 29 | 30 | Then(/^I expect rec\.tumor\.dp to be (\d+)$/) do |arg1| 31 | expect(@rec.tumor.dp).to eq arg1.to_i 32 | end 33 | 34 | Then(/^I expect rec\.tumor\.dp(\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5| 35 | expect(@rec.tumor.dp4).to eq [arg2.to_i,arg3.to_i,arg4.to_i,arg5.to_i] 36 | end 37 | 38 | 39 | Then(/^I expect rec\.tumor\.bcount.to_ary to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4| 40 | expect(@rec.tumor.bcount.to_ary).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i] 41 | end 42 | 43 | Then(/^I expect rec\.tumor\.bcount\[rec\.alt\] to be \[(\d+),(\d+)\]$/) do |arg1, arg2| 44 | 
expect(@rec.tumor.bcount[@rec.alt]).to eq [arg1.to_i,arg2.to_i] 45 | end 46 | 47 | Then(/^I expect rec\.tumor\.bcount\[rec\.alt\] to be one \[(\d+)\]$/) do |arg1| 48 | expect(@rec.tumor.bcount[@rec.alt]).to eq [arg1.to_i] 49 | end 50 | 51 | Then(/^I expect rec\.tumor\.bcount\["(.*?)"\] to be (\d+)$/) do |arg1, arg2| 52 | expect(@rec.tumor.bcount[arg1]).to eq arg2.to_i 53 | end 54 | 55 | Then(/^I expect rec\.tumor\.bcount\[(\d+)\] to be (\d+)$/) do |arg1, arg2| 56 | expect(@rec.tumor.bcount[arg1.to_i]).to eq arg2.to_i 57 | end 58 | 59 | Then(/^I expect rec\.tumor\.bcount\.sum to be (\d+)$/) do |arg1| 60 | expect(@rec.tumor.bcount.sum).to eq arg1.to_i 61 | end 62 | 63 | Then(/^I expect rec\.tumor\.bcount\.max to be (\d+)$/) do |arg1| 64 | expect(@rec.tumor.bcount.max).to eq arg1.to_i 65 | end 66 | 67 | 68 | Then(/^I expect rec\.tumor\.bq\.to_ary to be \[(\d+),(\d+)\]$/) do |arg1, arg2| 69 | expect(@rec.tumor.bq.to_ary).to eq [arg1.to_i,arg2.to_i] 70 | end 71 | 72 | Then(/^I expect rec\.tumor\.bq\["(.*?)"\] to be (\d+)$/) do |arg1, arg2| 73 | expect(@rec.tumor.bq[arg1]).to eq arg2.to_i 74 | end 75 | 76 | Then(/^I expect rec\.tumor\.bq\[(\d+)\] to be (\d+)$/) do |arg1, arg2| 77 | expect(@rec.tumor.bq[arg1.to_i]).to eq arg2.to_i 78 | end 79 | 80 | Then(/^I expect rec\.tumor\.bq\.min to be (\d+)$/) do |arg1| 81 | expect(@rec.tumor.bq.min).to eq arg1.to_i 82 | end 83 | 84 | Then(/^I expect rec\.tumor\.bq\.max to be (\d+)$/) do |arg1| 85 | expect(@rec.tumor.bq.max).to eq arg1.to_i 86 | end 87 | 88 | 89 | Then(/^I expect rec\.tumor\.amq.to_ary to be \[(\d+),(\d+)\]$/) do |arg1, arg2| 90 | expect(@rec.tumor.amq.to_ary).to eq [arg1.to_i,arg2.to_i] 91 | end 92 | 93 | Then(/^I expect rec\.tumor\.mq to be (\d+)$/) do |arg1| 94 | expect(@rec.tumor.mq).to eq arg1.to_i 95 | end 96 | 97 | Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1| 98 | expect(@rec.tumor.ss).to eq arg1.to_i 99 | end 100 | 101 | 102 | Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1| 103 | 
expect(@rec.tumor.ssc).to be 33 104 | end 105 | 106 | Then(/^I expect rec\.normal\.ssc to be nil$/) do 107 | expect(@rec.normal.ssc).to be nil 108 | end 109 | 110 | Then(/^I expect rec.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4| 111 | expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i] 112 | end 113 | 114 | Then(/^I expect rec.call_nuc to be "(.*?)"$/) do |arg1| 115 | expect(@rec.call_nuc).to eq arg1 116 | end 117 | 118 | Then(/^I expect rec.call_tumor_count to be (\d+)$/) do |arg1| 119 | expect(@rec.call_tumor_count).to eq arg1.to_i 120 | end 121 | 122 | Then(/^I expect rec.call_normal_count to be (\d+)$/) do |arg1| 123 | expect(@rec.call_normal_count).to eq arg1.to_i 124 | end 125 | 126 | Then(/^I expect rec.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2| 127 | expect(@rec.call_tumor_relative_count).to eq (arg1.to_s+'.'+arg2.to_s).to_f 128 | end 129 | 130 | 131 | -------------------------------------------------------------------------------- /features/multisample.feature: -------------------------------------------------------------------------------- 1 | @multi 2 | Feature: Multi-sample VCF 3 | 4 | Here we take a VCF line and parse the information for multiple named 5 | samples 6 | 7 | Scenario: When parsing a record 8 | 9 | Given the multi sample header line 10 | """ 11 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Original s1t1 s2t1 s3t1 s1t2 s2t2 s3t2 12 | """ 13 | When I parse the header 14 | Given multisample vcf line 15 | """ 16 | 1 10321 . C T 106.30 . 
AC=5;AF=0.357;AN=14;BaseQRankSum=3.045;DP=1537;Dels=0.01;FS=5.835;HaplotypeScore=220.1531;MLEAC=5;MLEAF=0.357;MQ=26.69;MQ0=258;MQRankSum=-4.870;QD=0.10;ReadPosRankSum=0.815 GT:AD:DP:GQ:PL 0/1:189,25:218:30:30,0,810 0/0:219,22:246:24:0,24,593 0/1:218,27:248:34:34,0,1134 0/0:220,22:248:56:0,56,1207 0/1:168,23:193:19:19,0,493 0/1:139,22:164:46:46,0,689 0/1:167,26:196:20:20,0,522 17 | """ 18 | When I parse the record 19 | Then I expect rec.valid? to be true 20 | Then I expect rec.chrom to contain "1" 21 | Then I expect rec.pos to contain 10321 22 | Then I expect rec.ref to contain "C" 23 | And I expect multisample rec.alt to contain ["T"] 24 | And I expect rec.qual to be 106.30 25 | And I expect rec.info.ac to be 5 26 | And I expect rec.info.af to be 0.357 27 | And I expect rec.info.dp to be 1537 28 | And I expect rec.info['dp'] to be 1537 29 | And I expect rec.info.readposranksum to be 0.815 30 | And I expect rec.info['ReadPosRankSum'] to be 0.815 31 | And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"] 32 | And I expect rec.sample['Original'].ad to be [189,25] 33 | And I expect rec.sample['Original'].gt to be "0/1" 34 | And I expect rec.sample['s3t2'].ad to be [167,26] 35 | And I expect rec.sample['s3t2'].dp to be 196 36 | And I expect rec.sample['s3t2'].gq to be 20 37 | And I expect rec.sample['s3t2'].pl to be [20,0,522] 38 | # And the nicer self resolving 39 | And I expect rec.sample.original.gt to be "0/1" 40 | And I expect rec.sample.s3t2.pl to be [20,0,522] 41 | # And the even better 42 | And I expect rec.original.gt? to be true 43 | And I expect rec.original.gt to be "0/1" 44 | And I expect rec.s3t2.pl to be [20,0,522] 45 | # Check for missing data 46 | And I expect test rec.missing_samples? to be false 47 | And I expect test rec.original? to be true 48 | # Special functions 49 | And I expect r.original? 
to be true 50 | And I expect r.original.gti? to be true 51 | And I expect r.original.gti to be [0,1] 52 | And I expect r.original.gti[1] to be 1 53 | And I expect r.original.gts? to be true 54 | And I expect r.original.gts to be ["C","T"] 55 | And I expect r.original.gts[1] to be "T" 56 | 57 | Given multisample vcf line 58 | """ 59 | 1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL ./. ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33 60 | """ 61 | When I parse the record 62 | Then I expect rec.pos to contain 10723 63 | Then I expect rec.valid? to be true 64 | And I expect rec.original? to be false 65 | And I expect rec.sample.s1t1? to be false 66 | And I expect rec.sample.s3t2? to be true 67 | And I expect rec.missing_samples? to be true 68 | 69 | # Phased genotype 70 | Given multisample vcf line 71 | """ 72 | 1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 0/0:6,0:6:3:0,3,33 73 | """ 74 | When I parse the record 75 | Then I expect rec.pos to contain 10723 76 | Then I expect rec.valid? to be true 77 | And I expect r.original? to be true 78 | And I expect r.original.gts? to be true 79 | And I expect r.original.gts to be ["C","G"] 80 | And I expect r.original.gts[0] to be "C" 81 | And I expect r.original.gts[1] to be "G" 82 | 83 | # INFO fields with matching tails 84 | Given multisample vcf line 85 | """ 86 | 1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL 0|1 ./. 1/1:2,2:4:6:66,6,0 1/1:4,1:5:3:36,3,0 ./. ./. 
0/0:6,0:6:3:0,3,33 87 | """ 88 | When I parse the record 89 | Then I expect r.info.end to be 111 90 | And I expect r.info.ciend to be 999 91 | -------------------------------------------------------------------------------- /features/step_definitions/sfilter.rb: -------------------------------------------------------------------------------- 1 | Given(/^the VCF line$/) do |string| 2 | @header = VcfHeader.new 3 | @header.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample") 4 | @vcfline = string 5 | end 6 | 7 | When(/^I evaluate '([^']+)'$/) do |arg1| 8 | # concat VCF line with sample (arg1) 9 | @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t")) 10 | @rec = VcfRecord.new(@fields,@header) 11 | p @rec 12 | @g = @rec.sample['Sample'] 13 | p @g 14 | expect(@g).not_to be nil 15 | @s = VcfSample::Sample.new(@rec,@g) 16 | @ignore_missing = false 17 | end 18 | 19 | Then(/^I expect s\.empty\? to be false$/) do 20 | expect(@s.empty?).to be false 21 | expect(@s.sfilter("s.empty?",do_cache: false)).to be false 22 | end 23 | 24 | Then(/^I expect s\.dp\? 
to be true$/) do 25 | p ['eval s.dp?',@s.eval("s.dp?",do_cache: false)] 26 | p ['eval s.dp',@s.eval("s.dp",do_cache: false)] 27 | p @g.dp 28 | p @s.dp 29 | p @s.sfilter("s.dp?",do_cache: false) 30 | expect(@s.eval("s.dp?",do_cache: false)).to be true 31 | end 32 | 33 | Then(/^I expect s\.dp to be (\d+)$/) do |arg1| 34 | # p @s.eval("s.dp") 35 | p :now 36 | p ['eval s.dp?',@s.eval("s.dp?",do_cache: false)] 37 | p ['eval s.dp',@s.eval("s.dp",do_cache: false)] 38 | expect(@s.eval("s.dp",do_cache: false)).to equal arg1.to_i 39 | end 40 | 41 | Then(/^sfilter 's\.dp>(\d+)' to be true$/) do |arg1| 42 | expect(@s.sfilter("dp>#{arg1}",do_cache: false)).to be true 43 | end 44 | 45 | When(/^I evaluate missing '([^']+)'$/) do |arg1| 46 | # concat VCF line with sample (arg1) 47 | @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t")) 48 | @rec = VcfRecord.new(@fields,@header) 49 | p @rec 50 | @g = @rec.sample['Sample'] 51 | @s = VcfSample::Sample.new(@rec,@g) 52 | p @s 53 | expect(@s).not_to be nil 54 | @ignore_missing = false 55 | end 56 | 57 | Then(/^I expect s\.dp\? 
to be false$/) do 58 | expect(@s.eval("s.dp?",do_cache: false)).to be false 59 | end 60 | 61 | Then(/^I expect s\.dp to be nil$/) do 62 | expect(@s.eval("s.dp",ignore_missing_data: @ignore_missing, do_cache: false)).to be nil 63 | end 64 | 65 | Then(/^sfilter 's\.dp>(\d+)' to throw an error$/) do |arg1| 66 | expect { @s.eval("s.dp>#{arg1}",do_cache: false) }.to raise_error NoMethodError 67 | end 68 | 69 | Then(/^sfilter 's\.dp>(\d+)' to be false$/) do |arg1| 70 | expect(@s.sfilter("s.dp>#{arg1}",ignore_missing_data: @ignore_missing, do_cache: false)).to be false 71 | end 72 | 73 | When(/^I evaluate empty '\.\/\.'$/) do 74 | # concat VCF line with sample (arg1) 75 | @fields = VcfLine.parse((@vcfline.split(/\s+/)+['./.']).join("\t")) 76 | @rec = VcfRecord.new(@fields,@header) 77 | p @rec 78 | @g = @rec.sample['Sample'] 79 | @s = VcfSample::Sample.new(@rec,@g) 80 | p @s 81 | expect(@s).not_to be nil 82 | @ignore_missing = false 83 | end 84 | 85 | When(/^I evaluate missing '([^']+)' with ignore missing$/) do |arg1| 86 | # concat VCF line with sample (arg1) 87 | @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t")) 88 | @rec = VcfRecord.new(@fields,@header) 89 | p @rec 90 | @g = @rec.sample['Sample'] 91 | @s = VcfSample::Sample.new(@rec,@g) 92 | p @s 93 | expect(@s).not_to be nil 94 | @ignore_missing = true 95 | end 96 | 97 | Then(/^I expect s\.empty\? to be true$/) do 98 | expect(@s.sfilter("s.empty?",do_cache: false)).to be true 99 | end 100 | 101 | Then(/^I expect s\.dp to throw an error$/) do 102 | # @s.instance_eval { undef :dp } 103 | p @s.eval("s.dp",do_cache: false) 104 | expect { @s.eval("s.dp",do_cache: false) }.to raise_error NoMethodError 105 | end 106 | 107 | When(/^I evaluate empty '\.\/\.' 
with ignore missing$/) do 108 | # concat VCF line with sample (arg1) 109 | @fields = VcfLine.parse((@vcfline.split(/\s+/)+['./.']).join("\t")) 110 | @rec = VcfRecord.new(@fields,@header) 111 | p @rec 112 | @g = @rec.sample['Sample'] 113 | @s = VcfSample::Sample.new(@rec,@g) 114 | p @s 115 | expect(@s).not_to be nil 116 | @ignore_missing = true 117 | end 118 | 119 | Then(/^I expect s\.what\? to throw an error$/) do 120 | expect { @s.eval("s.what?",do_cache: false) }.to raise_error NoMethodError 121 | end 122 | 123 | Then(/^I expect s\.what to throw an error$/) do 124 | expect { @s.eval("s.what",do_cache: false) }.to raise_error NoMethodError 125 | end 126 | 127 | Then(/^I expect r\.chrom to be "(.*?)"$/) do |arg1| 128 | expect(@s.eval("r.chrom",do_cache: false)).to eq "1" 129 | end 130 | 131 | Then(/^I expect r\.alt to be \["(.*?)"\]$/) do |arg1| 132 | expect(@s.eval("r.alt",do_cache: false)).to eq ["G"] 133 | end 134 | 135 | Then(/^I expect r\.info\.af to be (\d+)\.(\d+)$/) do |arg1, arg2| 136 | expect(@s.eval("r.info.af",do_cache: false)).to eq 0.667 137 | end 138 | 139 | 140 | -------------------------------------------------------------------------------- /doc/Using_RDF.md: -------------------------------------------------------------------------------- 1 | # Using bio-vcf with RDF 2 | 3 | bio-vcf can output many types of formats. In this exercise we will load 4 | a triple store (4store) with VCF data and do some queries on that. 
5 | 6 | ## Install and start 4store 7 | 8 | ### On GNU Guix 9 | 10 | See https://github.com/pjotrp/guix-notes/blob/master/packages/4store.org 11 | 12 | ### On Debian 13 | 14 | Get root 15 | 16 | ```sh 17 | su 18 | apt-get install avahi-daemon 19 | apt-get install raptor-utils 20 | exit 21 | ``` 22 | 23 | As normal user 24 | 25 | ```sh 26 | guix package -i sparql-query curl 27 | ``` 28 | 29 | Initialize and start the server again as root (or another user) 30 | 31 | ``` 32 | su 33 | export PATH=/home/user/.guix-profile/bin:$PATH 34 | mkdir -p /var/lib/4store 35 | dbname=test 36 | 4s-backend-setup $dbname 37 | 4s-backend $dbname 38 | 4s-httpd -p 8000 $dbname 39 | ``` 40 | 41 | Try the web browser and point it to http://localhost:8000/status/ 42 | 43 | Open a new terminal as user. 44 | 45 | 46 | Generate rdf with bio-vcf template 47 | 48 | ```ruby 49 | =HEADER 50 | @prefix : . 51 | =BODY 52 | <% 53 | id = ['chr'+rec.chr,rec.pos,rec.alt].join('_') 54 | %> 55 | :<%= id %> 56 | :query_id "<%= id %>"; 57 | :chr "<%= rec.chr %>" ; 58 | :alt "<%= rec.alt.join("") %>" ; 59 | :pos <%= rec.pos %> . 60 | 61 | 62 | ``` 63 | 64 | so it looks like 65 | 66 | ``` 67 | :chrX_134713855_A 68 | :query_id "chrX_134713855_A"; 69 | :chr "X" ; 70 | :alt "A" ; 71 | :pos 134713855 . 
72 | ``` 73 | 74 | and test with rapper using [gatk_exome.vcf](https://github.com/pjotrp/bioruby-vcf/blob/master/test/data/input/gatk_exome.vcf) 75 | 76 | ```sh 77 | cat gatk_exome.vcf |bio-vcf -v --template rdf_template.erb 78 | cat gatk_exome.vcf |bio-vcf -v --template rdf_template.erb > my.rdf 79 | rapper -i turtle my.rdf 80 | ``` 81 | 82 | Load into 4store (when no errors) 83 | 84 | ```bash 85 | rdf=my.rdf 86 | uri=http://localhost:8000/data/http://biobeat.org/data/$rdf 87 | curl -X DELETE $uri 88 | curl -T $rdf -H 'Content-Type: application/x-turtle' $uri 89 | 201 imported successfully 90 | This is a 4store SPARQL server 91 | ``` 92 | 93 | First SPARQL query 94 | 95 | ```sh 96 | SELECT ?id 97 | WHERE 98 | { 99 | ?id "X". 100 | } 101 | ``` 102 | 103 | ``` 104 | cat sparql1.rq |sparql-query "http://localhost:8000/sparql/" -p 105 | ┌──────────────────────────────────────────────┐ 106 | │ ?id │ 107 | ├──────────────────────────────────────────────┤ 108 | │ │ 109 | │ │ 110 | │ │ 111 | └──────────────────────────────────────────────┘ 112 | ``` 113 | 114 | A simple python query may look like 115 | 116 | ```python 117 | import requests 118 | import subprocess 119 | 120 | host = "http://localhost:8000/" 121 | 122 | query = """ 123 | SELECT ?s ?p ?o WHERE { 124 | ?s ?p ?o . 125 | } LIMIT 10 126 | """ 127 | 128 | r = requests.post(host, data={ "query": query, "output": "text" }) 129 | # print r.url 130 | 131 | print r.text 132 | ``` 133 | 134 | renders 135 | 136 | ``` 137 | ?id 138 | 139 | 140 | 141 | ``` 142 | 143 | A working example if you are using the server 144 | http://guix.genenetwork.org and the correct PREFIX: 145 | 146 | ```python 147 | #! /usr/bin/env python 148 | import requests 149 | import subprocess 150 | 151 | host = "http://guix.genenetwork.org/sparql/" 152 | query = """ 153 | PREFIX : 154 | SELECT ?id ?chr ?pos ?alt 155 | WHERE 156 | { 157 | { ?id :chr "X" . } 158 | UNION 159 | { ?id :chr "1" . } 160 | ?id :chr ?chr . 161 | ?id :alt ?alt . 
162 | ?id :pos ?pos . 163 | FILTER (?pos > 107911705) . 164 | } 165 | """ 166 | r = requests.post(host, data={ "query": query, "output": "text" }) 167 | print r.text 168 | ``` 169 | 170 | ## EBI 171 | 172 | 173 | EBI SPARQL has some advanced examples of queries, such as 174 | 175 | ``` 176 | https://www.ebi.ac.uk/rdf/services/ensembl/sparql 177 | PREFIX rdf: 178 | PREFIX rdfs: 179 | PREFIX dcterms: 180 | PREFIX dc: 181 | PREFIX obo: 182 | PREFIX skos: 183 | PREFIX sio: 184 | PREFIX faldo: 185 | PREFIX identifiers: 186 | PREFIX ensembl: 187 | PREFIX ensembltranscript: 188 | PREFIX ensemblexon: 189 | PREFIX ensemblprotein: 190 | PREFIX ensemblterms: 191 | 192 | SELECT DISTINCT ?transcript ?id ?typeLabel ?reference ?begin ?end ?location { 193 | ?transcript obo:SO_transcribed_from ensembl:ENSG00000139618 ; 194 | a ?type; 195 | dc:identifier ?id . 196 | OPTIONAL { 197 | ?transcript faldo:location ?location . 198 | ?location faldo:begin [faldo:position ?begin] . 199 | ?location faldo:end [faldo:position ?end ] . 200 | ?location faldo:reference ?reference . 201 | } 202 | OPTIONAL {?type rdfs:label ?typeLabel} 203 | } 204 | ``` 205 | 206 | See https://www.ebi.ac.uk/rdf/services/ensembl/sparql 207 | 208 | # Exercise 209 | 210 | Today's exercise is to create a graph using bio-vcf and/or a small program using 211 | RDF triples and define a SPARQL query. 212 | 213 | The more interesting the graph/SPARQL the better. 214 | -------------------------------------------------------------------------------- /lib/bio-vcf/vcfsample.rb: -------------------------------------------------------------------------------- 1 | module BioVcf 2 | module VcfSample 3 | 4 | # Check whether a sample is empty (on the raw string value) 5 | def VcfSample::empty? s 6 | s==nil or s == './.' or s == '' or s[0..2]=='./.' 
or s[0..1] == '.:' 7 | end 8 | 9 | class Sample 10 | # Initialized sample with rec and genotypefield 11 | # 12 | # #0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4}, 13 | def initialize num,rec,genotypefield 14 | @num = num 15 | @rec = rec 16 | @sample = genotypefield 17 | @format = @sample.format 18 | @values = @sample.values 19 | end 20 | 21 | def empty? 22 | cache_empty ||= VcfSample::empty?(@sample.to_s) 23 | end 24 | 25 | def is_last? 26 | # $stderr.print(@num,@rec.header.num_samples) 27 | @num == @rec.header.num_samples-1 28 | end 29 | 30 | def name 31 | @sample.name 32 | end 33 | 34 | def eval expr, ignore_missing_data: false, quiet: false, do_cache: true 35 | caching_eval :eval, :call_cached_eval, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache 36 | end 37 | 38 | def sfilter expr, ignore_missing_data: false, quiet: true, do_cache: true 39 | caching_eval :sfilter, :call_cached_sfilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache 40 | end 41 | 42 | def ifilter expr, ignore_missing_data: false, quiet: false, do_cache: true 43 | caching_eval :ifilter, :call_cached_ifilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache 44 | end 45 | 46 | def efilter expr, ignore_missing_data: false, quiet: false, do_cache: true 47 | caching_eval :efilter, :call_cached_efilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache 48 | end 49 | 50 | # Split GT into index values 51 | def gti 52 | v = fetch_values("GT") 53 | v = './.' if v == '.' #In case that you have a single missing value, make both as missing. 54 | v.split(/[\/\|]/).map{ |v| (v=='.' ? nil : v.to_i) } 55 | end 56 | 57 | def gtindex 58 | v = fetch_values("GT") 59 | return case v 60 | when nil then nil 61 | when '.' then nil 62 | when './.' 
then nil 63 | when '0/0' then 0 64 | when '0/1' then 1 65 | when '1/1' then 2 66 | else 67 | raise "Unknown genotype #{v}" 68 | end 69 | end 70 | 71 | # Split GT into a nucleotide sequence 72 | def gts 73 | gti.map { |i| (i ? @rec.get_gt(i) : nil) } 74 | end 75 | 76 | def cache_method(name, &block) 77 | self.class.send(:define_method, name, &block) 78 | end 79 | 80 | def [] name 81 | if @format[name] 82 | v = fetch_values(name) 83 | return nil if VcfValue::empty?(v) 84 | return ConvertStringToValue::convert(v) 85 | end 86 | nil 87 | end 88 | 89 | def method_missing(m, *args, &block) 90 | name = m.to_s.upcase 91 | # p [:here,name,m ,@values] 92 | # p [:respond_to_call_cached_eval,respond_to?(:call_cached_eval)] 93 | if name =~ /\?$/ 94 | # test for valid field 95 | return !VcfValue::empty?(fetch_values(name.chop)) 96 | else 97 | if @format[name] 98 | cache_method(m) { 99 | v = fetch_values(name) 100 | return nil if VcfValue::empty?(v) 101 | ConvertStringToValue::convert(v) 102 | } 103 | self.send(m) 104 | else 105 | super(m, *args, &block) 106 | end 107 | end 108 | end 109 | 110 | private 111 | 112 | def fetch_values name 113 | n = @format[name] 114 | raise NoMethodError.new("Unknown sample field <#{name}>") if not n 115 | @values[n] # <-- save names with upcase! 
116 | end 117 | 118 | def caching_eval method, cached_method, expr, ignore_missing_data: false, quiet: false, do_cache: true 119 | begin 120 | if do_cache 121 | if not respond_to?(cached_method) 122 | code = 123 | """ 124 | def #{cached_method}(rec,sample) 125 | r = rec 126 | s = sample 127 | #{expr} 128 | end 129 | """ 130 | self.class.class_eval(code) 131 | end 132 | self.send(cached_method,@rec,self) 133 | else 134 | # This is used for testing mostly 135 | print "WARNING: NOT CACHING #{method}\n" 136 | self.class.class_eval { undef :call_cached_eval } if respond_to?(:call_cached_eval) 137 | self.class.class_eval { undef :call_cached_sfilter } if respond_to?(:call_cached_sfilter) 138 | r = @rec 139 | s = @sample 140 | eval(expr) 141 | end 142 | rescue NoMethodError => e 143 | $stderr.print "\n#{method} trying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet 144 | if not quiet 145 | $stderr.print [:format,@format,:sample,@values],"\n" 146 | $stderr.print [:filter,expr],"\n" 147 | end 148 | if ignore_missing_data 149 | $stderr.print e.message if not quiet and not empty? 150 | return false 151 | else 152 | raise NoMethodError.new(e.message + ". Can not evaluate empty sample data by default: test for s.empty? 
or use the -i switch!") 153 | end 154 | end 155 | end 156 | 157 | end 158 | 159 | end 160 | end 161 | -------------------------------------------------------------------------------- /test/performance/metrics.md: -------------------------------------------------------------------------------- 1 | Round of testing on Macbook PRO running Linux with Intel(R) Core(TM) i5-3210M CPU @ 2.50GHz 2 | and 3 | 4 | ruby -v 5 | ruby 2.1.0p0 (2013-12-25 revision 44422) [x86_64-linux] 6 | 7 | wc test/tmp/test.vcf 8 | 12469 137065 2053314 test/tmp/test.vcf 9 | 10 | time ./bin/bio-vcf -i --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null 11 | vcf 0.0.3-pre4 (biogem Ruby 2.1.0) by Pjotr Prins 2014 12 | Options: {:show_help=>false, :ignore_missing=>true, :filter=>"r.info.dp>20", :sfilter=>"s.dp>10"} 13 | real 0m1.215s 14 | user 0m1.208s 15 | sys 0m0.004s 16 | 17 | Reload 18 | 19 | time ./bin/bio-vcf -i --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null 20 | vcf 0.0.3-pre4 (biogem Ruby 2.1.0) by Pjotr Prins 2014 21 | Options: {:show_help=>false, :ignore_missing=>true, :filter=>"r.info.dp>20", :sfilter=>"s.dp>10"} 22 | real 0m1.194s 23 | user 0m1.172s 24 | sys 0m0.016s 25 | 26 | Introduced method caching 27 | 28 | real 0m1.190s 29 | user 0m1.180s 30 | sys 0m0.004s 31 | 32 | Introduce !!Float test 33 | 34 | real 0m1.187s 35 | user 0m1.180s 36 | sys 0m0.004s 37 | 38 | Cache sample index 39 | 40 | real 0m1.156s 41 | user 0m1.148s 42 | sys 0m0.004s 43 | 44 | Run the profiler 45 | 46 | ruby -rprofile ./bin/bio-vcf -i --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null 47 | vcf 0.0.3-pre4 (biogem Ruby 2.1.0) by Pjotr Prins 2014 48 | Options: {:show_help=>false, :ignore_missing=>true, :filter=>"r.info.dp>20", :sfilter=>"s.dp>10"} 49 | % cumulative self self total 50 | time seconds seconds calls ms/call ms/call name 51 | 9.45 2.19 2.19 34968 0.06 0.76 Object#parse_line 52 | 7.25 3.87 1.68 75031 0.02 0.03 
BioVcf::VcfRecordInfo#[]= 53 | 7.12 5.52 1.65 34968 0.05 0.29 Kernel.eval 54 | 6.86 7.11 1.59 87481 0.02 0.10 BioVcf::VcfRecordInfo#initialize 55 | 5.57 8.40 1.29 35994 0.04 0.47 Array#each 56 | 4.14 9.36 0.96 34253 0.03 0.65 BioVcf::VcfRecord#each_sample 57 | 3.93 10.27 0.91 93880 0.01 0.03 BioVcf::VcfRecordParser.get_format 58 | 3.88 11.17 0.90 145920 0.01 0.01 String#split 59 | 60 | Late parsing of info field without split: 61 | 62 | real 0m1.124s 63 | user 0m1.120s 64 | sys 0m0.008s 65 | 66 | Global sample info caching 67 | 68 | real 0m1.032s 69 | user 0m1.020s 70 | sys 0m0.008s 71 | 72 | Assign some repeated Hash queries 73 | 74 | real 0m1.028s 75 | user 0m1.024s 76 | sys 0m0.000s 77 | 78 | Profiler now picking out eval for further optimization 79 | 80 | % cumulative self self total 81 | time seconds seconds calls ms/call ms/call name 82 | 10.45 1.80 1.80 34968 0.05 0.59 Object#parse_line 83 | 7.89 3.16 1.36 34968 0.04 0.17 Kernel.eval 84 | 5.69 4.14 0.98 34253 0.03 0.57 BioVcf::VcfRecord#each_sample 85 | 4.93 4.99 0.85 12497 0.07 1.37 nil# 86 | 87 | Compiling sample eval 88 | 89 | real 0m0.820s 90 | user 0m0.812s 91 | sys 0m0.004s 92 | 93 | Compiling record eval 94 | 95 | real 0m0.647s 96 | user 0m0.644s 97 | sys 0m0.000s 98 | 99 | Walk examples by index, rather than by name 100 | 101 | real 0m0.612s 102 | user 0m0.596s 103 | sys 0m0.012s 104 | 105 | More caching 106 | 107 | real 0m0.600s 108 | user 0m0.592s 109 | sys 0m0.004s 110 | 111 | And the latest profiling 112 | 113 | % cumulative self self total 114 | time seconds seconds calls ms/call ms/call name 115 | 12.98 2.02 2.02 34968 0.06 0.51 Object#parse_line 116 | 7.78 3.23 1.21 22518 0.05 0.14 BioVcf::VcfRecord#sample_by_index 117 | 5.59 4.10 0.87 34253 0.03 0.47 BioVcf::VcfRecord#each_sample 118 | 4.82 4.85 0.75 34968 0.02 0.03 BioVcf::ConvertStringToValue.integer? 
119 | 4.50 5.55 0.70 12450 0.06 0.13 BioVcf::VcfRecordInfo#method_missing 120 | 4.31 6.22 0.67 69974 0.01 0.03 Class#new 121 | 4.24 6.88 0.66 12499 0.05 1.23 nil# 122 | 3.79 7.47 0.59 12450 0.05 0.06 BioVcf::VcfLine.parse 123 | 124 | Introduced --num-threads 125 | 126 | time ./bin/bio-vcf -i --num-threads --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null 127 | 128 | on a dual-core running Linux 129 | 130 | real 0m0.389s 131 | user 0m1.132s 132 | sys 0m0.148s 133 | 134 | Latest 135 | 136 | time ./bin/bio-vcf -i --num-threads 4 --thread-lines 2000 --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null 137 | vcf 0.8.3-pre1 (biogem Ruby 2.1.0) by Pjotr Prins 2014 138 | Options: {:show_help=>false, :source=>"https://github.com/CuppenResearch/bioruby-vcf", :version=>"0.8.3-pre1 (Pjotr Prins)", :date=>"2014-12-31 13:30:32 +0300", :thread_lines=>2000, :ignore_missing=>true, :num_threads=>4, :filter=>"r.info.dp>20", :sfilter=>"s.dp>10"} 139 | real 0m0.600s 140 | user 0m1.472s 141 | sys 0m0.068s 142 | 143 | Profiling: 144 | 145 | . % cumulative self self total 146 | time seconds seconds calls ms/call ms/call name 147 | 15.01 3.23 3.23 34968 0.09 0.82 Object#parse_line 148 | 8.22 5.00 1.77 22518 0.08 0.22 BioVcf::VcfRecord#sample_by_index 149 | 4.97 6.07 1.07 22518 0.05 0.27 BioVcf::VcfSample::Sample#sfilter 150 | -------------------------------------------------------------------------------- /lib/bio-vcf/vcfheader.rb: -------------------------------------------------------------------------------- 1 | # This module parses the VCF header. A header consists of lines 2 | # containing fields. Most fields are of 'key=value' type and appear 3 | # only once. These can be retrieved with the find_field method. 4 | # 5 | # INFO, FORMAT and contig fields are special as they appear multiple times 6 | # and contain multiple key values (identified by an ID field). 
7 | # To retrieve these call 'info' and 'format' functions respectively, 8 | # which return a hash on the contained ID. 9 | # 10 | # For the INFO and FORMAT fields a Ragel parser is used, mostly to 11 | # deal with embedded quoted fields. 12 | 13 | module BioVcf 14 | 15 | module VcfHeaderParser 16 | def VcfHeaderParser.get_column_names(lines) 17 | lines.each do | line | 18 | if line =~ /^#[^#]/ 19 | # the first line that starts with a single hash 20 | names = line.split 21 | names[0].sub!(/^#/,'') 22 | return names 23 | end 24 | end 25 | nil 26 | end 27 | 28 | def VcfHeaderParser.parse_field(line, debug) 29 | BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(line, debug: debug) 30 | end 31 | end 32 | 33 | class VcfHeader 34 | 35 | attr_reader :lines, :field 36 | 37 | def initialize(debug = false) 38 | @debug = debug 39 | @lines = [] 40 | @field = {} 41 | @meta = nil 42 | @cached_filter_index = {} 43 | end 44 | 45 | # Add a new field to the header 46 | def add line 47 | @lines += line.split(/\n/) 48 | end 49 | 50 | # Push a special key value list to the header 51 | def tag h 52 | h2 = h.dup 53 | [:show_help,:skip_header,:verbose,:quiet,:debug].each { |key| h2.delete(key) } 54 | info = h2.map { |k,v| k.to_s.capitalize+'='+'"'+v.to_s+'"' }.join(',') 55 | line = '##BioVcf=<'+info+'>' 56 | @lines.insert(-2,line) 57 | line 58 | end 59 | 60 | def version 61 | @version ||= lines[0].scan(/##fileformat=VCFv(\d+\.\d+)/)[0][0] 62 | end 63 | 64 | def column_names 65 | @column_names ||= VcfHeaderParser::get_column_names(@lines) 66 | end 67 | 68 | def columns 69 | @column ||= column_names.size 70 | end 71 | 72 | def printable_header_line(fields) 73 | fields.map { | field | 74 | if field == '#samples' 75 | samples 76 | else 77 | field 78 | end 79 | }.join("\t") 80 | end 81 | 82 | def samples 83 | @samples ||= if column_names.size > 8 84 | column_names[9..-1] 85 | else 86 | [] 87 | end 88 | end 89 | 90 | def samples_index_array 91 | @all_samples_index ||= 
column_names[9..-1].fill{|i| i} 92 | end 93 | 94 | def num_samples 95 | @num_samples ||= ( samples == nil ? 0 : samples.size ) 96 | end 97 | 98 | # Returns the field number for a sample (starting with 9) 99 | def sample_index 100 | return @sample_index if @sample_index 101 | index = {} 102 | samples.each_with_index { |k,i| index[k] = i+9 ; index[k.downcase] = i+9 } 103 | @sample_index = index 104 | index 105 | end 106 | 107 | # Give a list of samples (by index and/or name) and return 0-based index values 108 | # The cache has to be able to hanle multiple lists - that is why it is a hash. 109 | def sample_subset_index list 110 | cached = @cached_filter_index[list] 111 | if cached 112 | l = cached 113 | else 114 | l = [] 115 | list = samples_index_array() if not list 116 | list.each { |i| 117 | value = 118 | begin 119 | Integer(i) 120 | rescue 121 | idx = samples.index(i) 122 | if idx != nil 123 | idx 124 | else 125 | raise "Unknown sample name '#{i}'" 126 | end 127 | end 128 | l << value 129 | } 130 | @cached_filter_index[list] = l 131 | end 132 | l 133 | end 134 | 135 | # Look for a line in the header with the field name and return the 136 | # value, otherwise return nil 137 | def find_field name 138 | return field[name] if field[name] 139 | @lines.each do | line | 140 | value = line.scan(/###{name}=(.*)/) 141 | if value[0] 142 | v = value[0][0] 143 | field[name] = v 144 | return v 145 | end 146 | end 147 | nil 148 | end 149 | 150 | # Look for all the lines that match the field name and return 151 | # a hash of hashes. An empty hash is returned when there are 152 | # no matches. 
153 | def find_fields name 154 | res = {} 155 | @lines.each do | line | 156 | value = line.scan(/###{name}=<(.*)>/) 157 | if value[0] 158 | str = value[0][0] 159 | # p str 160 | v = VcfHeaderParser.parse_field(line,@debug) 161 | id = v['ID'] 162 | res[id] = v 163 | end 164 | end 165 | # p res 166 | res 167 | end 168 | 169 | def format 170 | find_fields('FORMAT') 171 | end 172 | 173 | def filter 174 | find_fields('FILTER') 175 | end 176 | 177 | def contig 178 | find_fields('contig') 179 | end 180 | 181 | def info 182 | find_fields('INFO') 183 | end 184 | 185 | def gatkcommandline 186 | find_fields('GATKCommandLine') 187 | end 188 | 189 | def meta 190 | return @meta if @meta 191 | res = { 'INFO' => {}, 'FORMAT' => {}, 'FILTER' => {}, 'contig' => {}, 'GATKCommandLine' => {} } 192 | @lines.each do | line | 193 | value = line.scan(/##(.*?)=(.*)/) 194 | if value[0] 195 | k,v = value[0] 196 | if k != 'FORMAT' and k != 'INFO' and k != 'FILTER' and k != 'contig' and k != 'GATKCommandLine' 197 | # p [k,v] 198 | res[k] = v 199 | end 200 | end 201 | end 202 | res['INFO'] = info() 203 | res['FORMAT'] = format() 204 | res['FILTER'] = filter() 205 | res['contig'] = contig() 206 | res['GATKCommandLine'] = gatkcommandline() 207 | # p [:res, res] 208 | @meta = res # cache values 209 | res 210 | end 211 | 212 | def method_missing(m, *args, &block) 213 | name = m.to_s 214 | value = find_field(name) 215 | return value if value 216 | raise "Unknown VCF header query '#{name}'" 217 | end 218 | 219 | end 220 | end 221 | -------------------------------------------------------------------------------- /features/step_definitions/multisample.rb: -------------------------------------------------------------------------------- 1 | Given(/^the multi sample header line$/) do |string| 2 | @header = VcfHeader.new 3 | @header.add(string) 4 | end 5 | 6 | When(/^I parse the header$/) do 7 | expect(@header.column_names.size).to eq 16 8 | expect(@header.samples.size).to eq 7 9 | 
expect(@header.samples).to eq ["Original", "s1t1", "s2t1", "s3t1", "s1t2", "s2t2", "s3t2"] 10 | end 11 | 12 | Given(/^multisample vcf line$/) do |string| 13 | @fields = VcfLine.parse(string.split(/\s+/).join("\t")) 14 | @rec1 = VcfRecord.new(@fields,@header) 15 | end 16 | 17 | Then(/^I expect multisample rec\.alt to contain \["(.*?)"\]$/) do |arg1| 18 | expect(@rec1.alt).to eq ["T"] 19 | end 20 | 21 | Then(/^I expect rec\.qual to be (\d+)\.(\d+)$/) do |arg1, arg2| 22 | expect(@rec1.qual).to eq 106.3 23 | end 24 | 25 | Then(/^I expect rec\.info\.ac to be (\d+)$/) do |arg1| 26 | expect(@rec1.info.ac).to eq arg1.to_i 27 | end 28 | Then(/^I expect rec\.info\.af to be (\d+)\.(\d+)$/) do |arg1, arg2| 29 | expect(@rec1.info.af).to eq 0.357 30 | end 31 | 32 | Then(/^I expect rec\.info\.dp to be (\d+)$/) do |arg1| 33 | expect(@rec1.info.dp).to eq 1537 34 | end 35 | 36 | Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2| 37 | expect(@rec1.info.readposranksum).to eq 0.815 38 | end 39 | 40 | Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1| 41 | expect(@rec1.info['dp']).to eq 1537 42 | end 43 | 44 | Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2| 45 | expect(@rec1.info['ReadPosRankSum']).to eq 0.815 46 | end 47 | 48 | Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15| 49 | expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"] 50 | end 51 | 52 | Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1| 53 | expect(@rec1.sample['Original'].gt).to eq "0/1" 54 | end 55 | 56 | Then(/^I expect rec\.original\.gt to be "(.*?)"$/) do |arg1| 57 | 
expect(@rec1.original.gt).to eq "0/1" 58 | end 59 | 60 | Then(/^I expect rec\.sample\['Original'\]\.gt to be "(.*?)"$/) do |arg1| 61 | expect(@rec1.sample['Original'].gt).to eq "0/1" 62 | end 63 | 64 | Then(/^I expect rec\.sample\['Original'\]\.ad to be \[(\d+),(\d+)\]$/) do |arg1, arg2| 65 | expect(@rec1.sample['Original'].ad).to eq [189,25] 66 | end 67 | 68 | Then(/^I expect rec\.sample\['Original'\]\.gt to be \[(\d+),(\d+)\]$/) do |arg1, arg2| 69 | expect(@rec1.sample['Original'].gt).to eq "0/1" 70 | end 71 | 72 | Then(/^I expect rec\.sample\['s(\d+)t(\d+)'\]\.ad to be \[(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4| 73 | expect(@rec1.sample['s3t2'].ad).to eq [167,26] 74 | end 75 | 76 | Then(/^I expect rec\.sample\['s(\d+)t(\d+)'\]\.dp to be (\d+)$/) do |arg1, arg2, arg3| 77 | expect(@rec1.sample['s3t2'].dp).to eq 196 78 | end 79 | 80 | Then(/^I expect rec\.sample\['s(\d+)t(\d+)'\]\.gq to be (\d+)$/) do |arg1, arg2, arg3| 81 | expect(@rec1.sample['s3t2'].gq).to eq 20 82 | end 83 | 84 | Then(/^I expect rec\.sample\['s(\d+)t(\d+)'\]\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5| 85 | expect(@rec1.sample['s3t2'].pl).to eq [20,0,522] 86 | end 87 | 88 | Then(/^I expect rec\.sample\.original\.gt to be \[(\d+),(\d+)\]$/) do |arg1, arg2| 89 | expect(@rec1.sample.original.gt).to eq "0/1" 90 | end 91 | 92 | Then(/^I expect rec\.sample\.s(\d+)t(\d+)\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5| 93 | expect(@rec1.sample.s3t2.pl).to eq [20,0,522] 94 | end 95 | 96 | Then(/^I expect rec\.original\.gt to be \[(\d+),(\d+)\]$/) do |arg1, arg2| 97 | expect(@rec1.original.gt).to eq "0/1" 98 | end 99 | 100 | Then(/^I expect rec\.s(\d+)t(\d+)\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5| 101 | expect(@rec1.s3t2.pl).to eq [20,0,522] 102 | end 103 | 104 | Then(/^I expect test rec\.missing_samples\? 
to be false$/) do 105 | expect(@rec1.missing_samples?).to be false 106 | end 107 | 108 | Then(/^I expect test rec\.original\? to be true$/) do 109 | expect(@rec1.original?).to be true 110 | end 111 | 112 | Then(/^I expect rec\.missing_samples\? to be true$/) do 113 | expect(@rec1.missing_samples?).to be true 114 | end 115 | 116 | Then(/^I expect rec\.original\? to be true$/) do 117 | expect(@rec1.original?).to be true 118 | end 119 | 120 | Then(/^I expect rec\.original\? to be false$/) do 121 | expect(@rec1.original?).to eq false 122 | end 123 | 124 | Then(/^I expect rec\.sample\.s(\d+)t(\d+)\? to be false$/) do |arg1, arg2| 125 | expect(@rec1.sample.s1t1?).to eq false 126 | end 127 | 128 | Then(/^I expect rec\.sample\.s(\d+)t(\d+)\? to be true$/) do |arg1, arg2| 129 | expect(@rec1.sample.s3t2?).to eq true 130 | end 131 | 132 | Then(/^I expect rec\.valid\? to be true$/) do 133 | expect(@rec1.valid?).to eq true 134 | end 135 | 136 | Then(/^I expect r\.original\.gt\? to be true$/) do 137 | expect(@rec1.original.gt?).to be true 138 | end 139 | 140 | Then(/^I expect r\.original\? to be true$/) do 141 | expect(@rec1.original?).to be true 142 | end 143 | 144 | Then(/^I expect rec\.original\? to be true$/) do 145 | expect(@rec1.original?).to be true 146 | end 147 | 148 | Then(/^I expect rec\.original\.gt\? to be true$/) do 149 | expect(@rec1.original.gt?).to be true 150 | end 151 | 152 | Then(/^I expect r\.original\.gti\? to be true$/) do 153 | expect(@rec1.original.gti?).to eq true 154 | end 155 | 156 | Then(/^I expect r\.original\.gti to be \[(\d+),(\d+)\]$/) do |arg1, arg2| 157 | expect(@rec1.original.gti).to eq [arg1.to_i,arg2.to_i] 158 | end 159 | 160 | Then(/^I expect r\.original\.gti\[(\d+)\] to be (\d+)$/) do |arg1, arg2| 161 | expect(@rec1.original.gti[arg1.to_i]).to eq arg2.to_i 162 | end 163 | 164 | Then(/^I expect r\.original\.gts\? 
to be true$/) do 165 | expect(@rec1.original.gts?).to eq true 166 | end 167 | 168 | Then(/^I expect r\.original\.gts to be \["(.*?)","(.*?)"\]$/) do |arg1, arg2| 169 | expect(@rec1.original.gts).to eq [arg1,arg2] 170 | end 171 | 172 | Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2| 173 | expect(@rec1.original.gts[arg1.to_i]).to eq arg2 174 | end 175 | 176 | Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1| 177 | expect(@rec1.info.end).to eq arg1.to_i 178 | end 179 | 180 | Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1| 181 | expect(@rec1.info.ciend).to eq arg1.to_i 182 | end 183 | -------------------------------------------------------------------------------- /lib/bio-vcf/vcfgenotypefield.rb: -------------------------------------------------------------------------------- 1 | module BioVcf 2 | 3 | MAXINT=100_000 4 | 5 | class ValueError < Exception 6 | end 7 | 8 | module VcfValue 9 | def VcfValue::empty? v 10 | v == nil or v == '' or v == '.' 11 | end 12 | end 13 | 14 | # Helper class for a list of (variant) values, such as A,G. 15 | # The [] function does the hard work. You can pass in an index (integer) 16 | # or nucleotide which translates to an index. 17 | # (see ./features for examples) 18 | class VcfNucleotideCount4 19 | def initialize alt,list 20 | @alt = alt 21 | @list = list.split(/,/).map{|i| i.to_i} 22 | end 23 | 24 | def [] idx 25 | if idx.kind_of?(Integer) 26 | # return a value 27 | @list[idx] 28 | elsif idx.kind_of?(String) 29 | # return a value 30 | @list[["A","C","G","T"].index(idx)] 31 | else idx.kind_of?(Array) 32 | # return a list of values 33 | idx.map { |nuc| 34 | idx2 = ["A","C","G","T"].index(nuc) 35 | # p [idx,nuc,idx2,@list] 36 | @list[idx2] 37 | } 38 | end 39 | end 40 | 41 | def to_ary 42 | @list 43 | end 44 | 45 | # Return the max value on the nucleotides in the list (typically rec.alt) 46 | def max list = @alt 47 | values = self[list] 48 | values.reduce(0){ |memo,v| (v>memo ? 
v : memo) } 49 | end 50 | 51 | def min list = @alt 52 | values = self[list] 53 | values.reduce(MAXINT){ |memo,v| (vmemo ? v : memo) } 92 | end 93 | 94 | def min 95 | @list.reduce(MAXINT){ |memo,v| (v does not exist!" if !@format[name] 197 | @format[name] 198 | end 199 | 200 | def fetch_value name 201 | values[fetch(name)] 202 | end 203 | 204 | # Return an integer list 205 | def ilist name 206 | v = fetch_value(name) 207 | return nil if not v 208 | v.split(',').map{|i| i.to_i} 209 | end 210 | 211 | end 212 | 213 | # Holds all samples 214 | class VcfGenotypeFields 215 | def initialize fields, format, header, ref, alt 216 | @fields = fields 217 | @format = format 218 | @header = header 219 | @ref = ref 220 | @alt = alt 221 | @samples = {} # lazy cache 222 | @sample_index = @header.sample_index() 223 | end 224 | 225 | def [] name 226 | begin 227 | if name.is_a? String 228 | field_num = @sample_index[name] 229 | else 230 | field_num = name + 9 # assume integer 231 | end 232 | @samples[name] ||= VcfGenotypeField.new(field_num-9,@fields[field_num],@format,@header,@ref,@alt) 233 | rescue TypeError 234 | $stderr.print "Unknown field name <#{name}> in record, did you mean r.info.#{name}?\n" 235 | raise 236 | end 237 | end 238 | 239 | def method_missing(m, *args, &block) 240 | name = m.to_s 241 | if name =~ /\?$/ 242 | # test for valid sample 243 | return !VcfSample::empty?(@fields[@sample_index[name.chop]]) 244 | else 245 | num = @sample_index[name]-9 246 | @samples[name] ||= VcfGenotypeField.new(num,@fields[@sample_index[name]],@format,@header,@ref,@alt) 247 | end 248 | end 249 | 250 | end 251 | end 252 | -------------------------------------------------------------------------------- /lib/bio-vcf/vcfrecord.rb: -------------------------------------------------------------------------------- 1 | module BioVcf 2 | 3 | class VcfRecordInfo 4 | def initialize s 5 | @info = s 6 | end 7 | 8 | def to_s 9 | if @h 10 | @h.map { |k,v| (v ? 
@original_key[k] + '=' + v : @original_key[k]) }.join(';') 11 | else 12 | @info 13 | end 14 | end 15 | 16 | def [] k 17 | # split_fields if not @h 18 | # /#{m}=(?[^;])/.@info 19 | kupper = k.upcase 20 | v = if @h 21 | @h[kupper] 22 | else 23 | @info =~ /[\A;]#{k}=([^;]+)/i 24 | value = $1 25 | # p [m,value] 26 | # m = @info.match(/#{m.to_s.upcase}=(?[^;]+)/) slower! 27 | # value = m[:value] 28 | if value == nil 29 | split_fields # no option but to split 30 | @h[kupper] 31 | else 32 | value 33 | end 34 | end 35 | ConvertStringToValue::convert(v) 36 | end 37 | 38 | # Set INFO fields (used by --rewrite) 39 | def []= k, v 40 | split_fields if not @h 41 | kupper = k.upcase 42 | @h[kupper] = v 43 | @original_key[kupper] = k 44 | end 45 | 46 | def fields 47 | split_fields 48 | @h.keys 49 | end 50 | 51 | def method_missing(m, *args, &block) 52 | self[m.to_s] 53 | end 54 | 55 | private 56 | 57 | def split_fields 58 | return @h if @h 59 | @h = {} 60 | @original_key = {} 61 | @info.split(/;/).each do |f| 62 | k,v = f.split(/=/) 63 | kupper = k.upcase 64 | @h[kupper] = v 65 | @original_key[kupper] = k 66 | end 67 | end 68 | end 69 | 70 | module VcfRecordParser 71 | # Parse the format field into a Hash 72 | def VcfRecordParser.get_format s 73 | if s==$cached_sample_format_s 74 | $cached_sample_format 75 | else 76 | h = {} 77 | s.split(/:/).each_with_index { |v,i| h[v] = i } 78 | $cached_sample_format = h 79 | $cached_sample_format_s = s 80 | h 81 | end 82 | end 83 | def VcfRecordParser.get_info s 84 | VcfRecordInfo.new(s) 85 | end 86 | end 87 | 88 | module VcfRecordCall 89 | def call_diff 90 | Variant.diff(normal.bcount.to_ary,tumor.bcount.to_ary) 91 | end 92 | 93 | def call_nuc 94 | ['A','C','G','T'][index()] 95 | end 96 | 97 | # Get the GT when 0 is REF and >0 is ALT 98 | def get_gt(index) 99 | if index == 0 100 | ref() 101 | else 102 | alt[index-1] 103 | end 104 | end 105 | 106 | def call_tumor_count 107 | tumor.bcount.to_ary[index()] 108 | end 109 | 110 | def 
call_tumor_relative_count 111 | Variant.relative_diff(normal.bcount.to_ary,tumor.bcount.to_ary)[index()] 112 | end 113 | 114 | def call_normal_count 115 | normal.bcount.to_ary[index()] 116 | end 117 | 118 | def index 119 | Variant.index(self.normal.bcount.to_ary,self.tumor.bcount.to_ary) 120 | end 121 | end 122 | 123 | class VcfRecord 124 | 125 | include VcfRecordCall 126 | 127 | attr_reader :header 128 | 129 | def initialize fields, header 130 | @fields = fields 131 | @header = header 132 | @sample_by_index = [] 133 | end 134 | 135 | def chrom 136 | @fields[0] 137 | end 138 | 139 | alias :chr :chrom 140 | 141 | def pos 142 | @pos ||= @fields[1].to_i 143 | end 144 | 145 | def ids 146 | @ids ||= @fields[2].split(';') 147 | end 148 | 149 | def id 150 | ids[0] 151 | end 152 | 153 | def ref 154 | @refs ||= @fields[3] 155 | end 156 | 157 | def alt 158 | @alt ||= @fields[4].split(/,/) 159 | end 160 | 161 | def qual 162 | @qual ||= @fields[5].to_f 163 | end 164 | 165 | def filter 166 | @filter ||= @fields[6] 167 | end 168 | 169 | def info 170 | @info ||= VcfRecordParser.get_info(@fields[7]) 171 | end 172 | 173 | def format 174 | @format ||= VcfRecordParser.get_format(@fields[8]) 175 | end 176 | 177 | # Return the first (single) sample (used in one sample VCF) 178 | def first 179 | @first ||= VcfGenotypeField.new(0,@fields[9],format,@header,ref,alt) 180 | end 181 | 182 | # Return the normal sample (used in two sample VCF) 183 | def normal 184 | first 185 | end 186 | 187 | # Return the tumor sample (used in two sample VCF) 188 | def tumor 189 | @tumor ||= VcfGenotypeField.new(1,@fields[10],format,@header,ref,alt) 190 | end 191 | 192 | # Return the sample as a named hash 193 | def sample 194 | @sample ||= VcfGenotypeFields.new(@fields,format,@header,ref,alt) 195 | end 196 | 197 | def sample_by_name name 198 | sample[name] 199 | end 200 | 201 | def sample_by_index i 202 | raise "Can not index sample on parameter <#{i}>" if not i.kind_of?(Integer) 203 | @sample_by_index[i] ||= 
VcfGenotypeField.new(i,@fields[i+9],format,@header,ref,alt) 204 | end 205 | 206 | # Walk the samples. list contains an Array of int (the index) 207 | def each_sample(list = nil) 208 | @header.sample_subset_index(list).each { |i| 209 | yield VcfSample::Sample.new(i,self,sample_by_index(i)) 210 | } 211 | end 212 | 213 | def samples 214 | list = [] 215 | each_sample { |s| list << s } 216 | list 217 | end 218 | 219 | def missing_samples? 220 | @fields[9..-1].each { |sample| 221 | return true if VcfSample::empty?(sample) 222 | } 223 | false 224 | end 225 | 226 | def valid? 227 | @fields.size == @header.column_names.size 228 | end 229 | 230 | def eval expr, ignore_missing_data: true, quiet: false 231 | begin 232 | if not respond_to?(:call_cached_eval) 233 | code = 234 | """ 235 | def call_cached_eval(rec,fields) 236 | r = rec 237 | #{expr} 238 | end 239 | """ 240 | self.class.class_eval(code) 241 | end 242 | res = call_cached_eval(self,@fields) 243 | if res.kind_of?(Array) 244 | res.join("\t") 245 | else 246 | res 247 | end 248 | rescue NoMethodError => e 249 | if not quiet 250 | $stderr.print "RECORD ERROR!\n" 251 | $stderr.print [@fields],"\n" 252 | $stderr.print expr,"\n" 253 | $stderr.print "To ignore this error use the -i switch!\n" 254 | end 255 | if ignore_missing_data 256 | $stderr.print e.message if not quiet 257 | return false 258 | else 259 | raise 260 | end 261 | end 262 | end 263 | 264 | def gfilter expr, ignore_missing_data: true, quiet: false 265 | begin 266 | if not respond_to?(:call_cached_filter) 267 | code = 268 | """ 269 | def call_cached_gfilter(rec,fields) 270 | r = rec 271 | #{expr} 272 | end 273 | """ 274 | self.class.class_eval(code) 275 | end 276 | res = call_cached_gfilter(self,@fields) 277 | if res.kind_of?(Array) 278 | res.join("\t") 279 | else 280 | res 281 | end 282 | rescue NoMethodError => e 283 | if not quiet 284 | $stderr.print "RECORD ERROR!\n" 285 | $stderr.print [@fields],"\n" 286 | $stderr.print expr,"\n" 287 | $stderr.print "To 
ignore this error use the -i switch!\n" 288 | end 289 | if ignore_missing_data 290 | $stderr.print e.message if not quiet 291 | return false 292 | else 293 | raise 294 | end 295 | end 296 | end 297 | 298 | def add_to_filter_field str 299 | filter = @fields[6] 300 | if not filter or filter == '.' or filter == 'PASS' 301 | filter = str 302 | else 303 | values = filter.split(/;/) 304 | if not values.include?(str) 305 | filter = filter +';'+str 306 | end 307 | end 308 | filter = '.' if filter == nil or filter == '' 309 | @fields[6] = filter 310 | filter 311 | end 312 | 313 | # Return the sample 314 | def method_missing(m, *args, &block) 315 | name = m.to_s 316 | if name =~ /\?$/ 317 | # Query for empty sample name 318 | @sample_index ||= @header.sample_index 319 | return !VcfSample::empty?(@fields[@sample_index[name.chop]]) 320 | else 321 | sample[name] 322 | end 323 | end 324 | 325 | end 326 | end 327 | -------------------------------------------------------------------------------- /doc/GATK_comparison.md: -------------------------------------------------------------------------------- 1 | # Comparing two large GATK files 2 | 3 | This is the exercise to explore the differences in the full BWA-GATK 4 | pipeline vs. a chunking-scatter-gather approach that is magnitudes 5 | faster. Using my tools 6 | [bio-vcf](https://github.com/pjotrp/bioruby-vcf), 7 | [bio-table](https://github.com/pjotrp/bioruby-table) and 8 | [bio-locus](https://github.com/pjotrp/bio-locus) interesting 9 | edge effects were found. 10 | 11 | The GATK output variant files are similar in size: 12 | 13 | -rw-r--r-- 1 pjotr users 992725762 Aug 29 11:22 HiSeqX_R1.fastq.sorted_dedup_realigned.bam.realigned.raw_variants.vcf 14 | -rw-r--r-- 1 pjotr users 987147441 Aug 29 11:26 HiSeqX_R1.fastq_dedup_realigned.bam.realigned.raw_variants.vcf 15 | 16 | Naming suggests the second one (scatter) is unsorted but it is actually sorted. 
17 | 18 | Install bio-vcf, add it to the path if required, and you should see 19 | 20 | gem env 21 | export PATH=$GEM_HOME/bin:$PATH 22 | gem install bio-vcf 23 | bio-vcf 24 | bio-vcf 0.9.0 (biogem Ruby 2.1.2 with pcows) by Pjotr Prins 2015 25 | 26 | Create simple position files with calls 27 | 28 | /usr/bin/time -v bio-vcf -e '[r.chrom,r.pos,r.alt]' < scatter/HiSeqX_R1.fastq_dedup_realigned.bam.realigned.raw_variants.vcf > scatter_calls.vcf 29 | /usr/bin/time -v bio-vcf -e '[r.chrom,r.pos,r.alt]' < full/HiSeqX_R1.fastq.sorted_dedup_realigned.bam.realigned.raw_variants.vcf > full_calls.vcf 30 | 31 | Count the calls (we are ignoring the limited header info) 32 | 33 | wc -l scatter_calls.vcf 34 | 4773423 scatter_calls.vcf 35 | 36 | wc -l full_calls.vcf 37 | 4795998 full_calls.vcf 38 | 39 | (4795998-4773423)/4795998*100 or 0.5%. Do a diff and count the diffs 40 | 41 | egrep -c '^>' calls.diff 42 | 30401 43 | egrep -c '^<' calls.diff 44 | 52976 45 | 46 | 52976/4795998*100 or 1.1% of different calls, hmmm. Remove the GL contig 47 | 48 | grep -v GL00 calls.diff > calls_wo_GL00.diff 49 | egrep -c '^<' calls_wo_GL00.diff 50 | 48797 51 | 52 | Now install bio-table and bio-locus 53 | 54 | gem install bio-table 55 | bio-table 56 | bio-table 1.0.0 Copyright (C) 2012-2014 Pjotr Prins 57 | gem install bio-locus 58 | bio-locus 59 | bio-locus 0.0.6 (biogem Ruby 2.1.2) by Pjotr Prins 2014 60 | 61 | Create a new VCF file using the diff information. 
Find all the calls 62 | 63 | egrep '^(<|>)' calls_wo_GL00.diff |grep -v GATKCommandLine| grep -v CHROM > all_diff.txt 64 | bio-table --columns 0,1 < all_diff.txt > chrom_pos_diff.txt 65 | 66 | with vi remove first > and < and make sure there is a tab: 67 | 68 | %s/^..//g 69 | %s/$/^INA/g 70 | 71 | Now use bio-locus to create full VCF files containing only these entries 72 | 73 | bio-locus --store --alt exclude < chrom_pos_diff.txt 74 | bio-locus 0.0.6 (biogem Ruby 2.1.2) by Pjotr Prins 2014 75 | Stored 73644 positions out of 75414 in locus.db (1770 duplicate hits) 76 | bio-locus --match --alt exclude < ../full/HiSeqX_R1.fastq.sorted_dedup_realigned.bam.realigned.raw_variants.vcf > diff_full.vcf 77 | bio-locus --match --verbose -d < ../scatter/HiSeqX_R1.fastq_dedup_realigned.bam.realigned.raw_variants.vcf > diff_scatter.vcf 78 | (note: bio-locus is - still - a slow tool) 79 | 80 | Interestingly there is very little overlap between the call positions (only 1770 are shared)... 81 | 82 | wc -l diff*.vcf 83 | 48903 diff_full.vcf 84 | 26731 diff_scatter.vcf 85 | 86 | So, arguably the difference is 87 | (48903+26731)/4795998*100 or 1.6% of calls. 88 | 89 | Now we can compare the call contents 90 | 91 | bio-vcf -i -e '[r.chrom,r.pos,r.info.af]' --seval 's.dp' < diff_scatter.vcf > diff_scatter_af_sdp.txt 92 | 93 | For those that do match we see a difference in sample read depth 94 | pointing out the two methods differ in placing reads. So, let's see 95 | if we can find significant differences in frequency and read depth. 
96 | 97 | First I am reducing the data to one chromosome to be able to work a bit 98 | faster 99 | 100 | bio-vcf --filter 'r.chrom=="3"' < ../full/HiSeqX_R1.fastq.sorted_dedup_realigned.bam.realigned.raw_variants.vcf > full.vcf 101 | bio-vcf --filter 'r.chrom=="3"' < ../scatter/HiSeqX_R1.fastq_dedup_realigned.bam.realigned.raw_variants.vcf > scatter.vcf 102 | wc -l *.vcf 103 | 301922 full.vcf 104 | 300326 scatter.vcf 105 | 106 | bio-vcf -i -e '[r.pos,r.alt,r.info.af]' --seval 's.dp' < full.vcf > full_af_dp.txt 107 | bio-vcf -i -e '[r.pos,r.alt, r.info.af]' --seval 's.dp' < scatter.vcf > scatter_af_dp.txt 108 | wc -l *.txt 109 | 40001 full_af_dp.txt 110 | 40001 scatter_af_dp.txt 111 | diff scatter_af_dp.txt full_af_dp.txt |grep -c '>' 112 | 2189 113 | 114 | Differences typically look like 115 | 116 | < 402884 G 0.5 26 117 | < 403020 A 0.5 21 118 | --- 119 | > 402884 G 0.5 29 120 | > 403020 A 0.5 25 121 | 122 | i.e. the scatter approach has a different read depth 26 instead of 29 123 | and 21 instead of 25 - reads end up in other places. Frequency-wise we 124 | don't see much difference. More intriguing, a difference would be 125 | 126 | 1525d1529 127 | < 663038 CATATGTTATATGTGTATGTATTGTATACAT 1.0 4 128 | 129 | where an extra call for an insertion was made in the scatter approach 130 | with a DP of 4. 131 | 132 | Now let's quantify how much DP differs 133 | 134 | Combine the tables 135 | 136 | bio-table --columns 0,3 < full_af_dp.txt > f_dp.txt 137 | bio-table --columns 0,3 < scatter_af_dp.txt > s_dp.txt 138 | bio-table --merge f_dp.txt s_dp.txt > merged.txt 139 | 140 | After editing the table header to show 'pos\tfull\tscatter' 141 | 142 | bio-table --num-filter 'value[1]!=value[0]' < merged.txt |wc -l 143 | 2416 144 | 145 | it shows that for chromosome 3, 2416 calls out of 301922 (0.8%) have a 146 | different read depth. 
147 | 148 | bio-table --verbose --debug --num-filter '(value[1]-value[0]).abs > 3' < merged_no_NA.txt|wc -l 149 | 356 150 | 151 | 356 (0.1% of total calls) showed a read depth difference larger than 4 152 | reads. And 61 showed a read depth difference larger than 10 reads: 153 | 154 | fedor13:~/bcosc/gvcf/chr3$ bio-table --verbose --debug --num-filter '(value[1]-value[0]).abs > 10' < merged_no_NA.txt 155 | bio-table 1.0.0 Copyright (C) 2012-2014 Pjotr Prins 156 | INFO bio-table: Array: [{:show_help=>false, :write_header=>true, :skip=>0, :debug=>true, :num_filter=>"(value[1]-value[0]).abs > 10"}] 157 | DEBUG bio-table: Filtering on (value[1]-value[0]).abs > 10 158 | INFO bio-table: Array: ["pos", "full", "scatter"] 159 | pos full scatter 160 | 855606 24 9 161 | 855609 24 10 162 | 855610 24 10 163 | 855617 22 9 164 | 1432738 17 3 165 | 1434184 35 22 166 | 3173353 37 7 167 | 3713421 42 11 168 | 3713601 31 13 169 | 3713646 39 24 170 | 3713647 39 24 171 | 3713669 43 29 172 | 3714214 33 19 173 | 3762444 18 3 174 | 3764753 29 15 175 | 3764808 29 15 176 | 3764830 26 15 177 | 3764904 22 11 178 | 3764918 24 13 179 | 3937379 26 13 180 | 3937468 19 7 181 | 4005161 17 4 182 | 4005568 35 22 183 | 4297382 27 13 184 | 4297644 34 20 185 | 4958959 31 10 186 | 4958960 31 10 187 | 4959272 35 11 188 | 4959609 27 9 189 | 4960175 27 5 190 | 6159995 17 0.5 191 | 8432601 34 23 192 | 8432973 24 13 193 | 8432986 26 15 194 | 11414301 12 28 195 | 11414307 12 28 196 | 11414313 12 28 197 | 11414323 12 28 198 | 11528593 21 6 199 | 12070044 39 23 200 | 12070272 34 18 201 | 12071772 34 22 202 | 15010075 27 16 203 | 17740441 43 27 204 | 17740468 52 33 205 | 18367833 26 13 206 | 18367863 29 16 207 | 18367950 32 19 208 | 18973094 39 27 209 | 18976962 30 16 210 | 18977299 20 8 211 | 19000198 42 29 212 | 19000214 40 26 213 | 19898931 15 2 214 | 19898945 13 2 215 | 19898997 16 5 216 | 19899206 37 24 217 | 21293536 40 25 218 | 21293836 26 15 219 | 21294685 36 24 220 | 221 | Based on this information (you 
can see clustering at certain 'hot 222 | spots') I suspect that these are border effects in the regions where 223 | chunking took place. Note also that scatter more often has *fewer* 224 | reads, which means we are missing reads because of edges (it is a BWA 225 | thing). 226 | 227 | Even so, even *with* these downsides, the method may work for rapid 228 | diagnostics, provided the chunking-affected FN/FP calls do not fall in 229 | the regions of interest. For this setup, where time to diagnosis 230 | counts (including cancer), it may prove a valuable approach. I also 231 | suggest we find a way of setting chunking in regions of little 232 | interest (outside coding genes and/or regions of low variation as Brad 233 | suggested). 234 | 235 | -------------------------------------------------------------------------------- /ragel/gen_vcfheaderline_parser.rl: -------------------------------------------------------------------------------- 1 | # Ragel lexer for VCF-header 2 | # 3 | # This is a compact parser/lexer for the VCF header format. Bio-vcf 4 | # uses the parser to generate meta information that can be output to 5 | # (for example) JSON format. The advantage of using ragel as a state 6 | # engine is that it allows for easy parsing of key-value pairs with 7 | # syntax checking and, for example, escaped quotes in quoted string 8 | # values. This ragel parser/lexer generates valid Ruby; it should be 9 | # fairly trivial to generate Python/C/Java instead. Note that this 10 | # edition validates ID and Number fields only. Other fields are 11 | # dumped 'AS IS'. 
12 | # 13 | # Note the .rb version is generated from ./ragel/gen_vcfheaderline_parser.rl 14 | # 15 | # by Pjotr Prins (c) 2014/2015 16 | 17 | module BioVcf 18 | 19 | module VcfHeaderParser 20 | 21 | module RagelKeyValues 22 | 23 | def self.debug msg 24 | # nothing 25 | # $stderr.print "DEBUG: ",msg,"\n" 26 | end 27 | 28 | =begin 29 | %%{ 30 | 31 | machine simple_lexer; 32 | 33 | action mark { ts=p } 34 | action endquoted { 35 | emit.call(:value,data,ts,p) 36 | } 37 | 38 | action kw { 39 | emit.call(:kw,data,ts,p) 40 | } 41 | 42 | squote = "'"; 43 | dquote = '"'; 44 | not_squote_or_escape = [^'\\]; 45 | not_dquote_or_escape = [^"\\]; 46 | escaped_something = /\\./; 47 | ss = squote ( not_squote_or_escape | escaped_something )* >mark %endquoted squote; 48 | dd = dquote ( not_dquote_or_escape | escaped_something )* >mark %endquoted dquote; 49 | 50 | integer = ('+'|'-')?digit+; 51 | float = ('+'|'-')?digit+'.'digit+; 52 | assignment = '='; 53 | identifier = ( alnum (alnum|'.'|'_')* ); 54 | version = ( digit (alnum|'.'|'_'|'-')* ); 55 | str = (ss|dd)* ; 56 | boolean = '.'; 57 | date = str; 58 | key_word = ( ('Type'|'Description'|'Source'|identifier - ('ID'|'Number'|'length'|'Version'|'assembly'|'Date'|'CommandLineOptions')) >mark %{ emit.call(:key_word,data,ts,p) } ); 59 | any_value = ( str|( integer|float|boolean|identifier >mark %{ emit.call(:value,data,ts,p) } )); 60 | id_value = ( identifier >mark %{ emit.call(:value,data,ts,p) } ); 61 | 62 | version_value = ( str| ( version >mark %{ emit.call(:value,data,ts,p) } )); 63 | date_value = ( date ); 64 | gatk_value = ( str ); 65 | number_value = ( ( integer|boolean|'A'|'R'|'G' ) >mark %{ emit.call(:value,data,ts,p) } ); 66 | 67 | id_kv = ( ( ('ID'|'assembly') %kw '=' id_value ) %{ debug("ID FOUND") } @!{ error_code="Malformed ID"} ); 68 | version_kv = ( ( ('Version') %kw '=' version_value ) @!{ error_code="Version"} ); 69 | number_kv = ( ( ('Number'|'length') %kw '=' number_value ) @!{ error_code="Number"} ); 70 | 
date_kv = ( ( ('Date') %kw '=' date_value ) %{ debug("DATE FOUND") } @!{ error_code="Date"} ); 71 | gatk_kv = ( ( ('CommandLineOptions') %kw '=' gatk_value ) @!{ error_code="GATK"} ); 72 | key_value = ( id_kv | version_kv | date_kv | number_kv | gatk_kv | (key_word '=' any_value) ) %{ debug("KEY_VALUE found") } >mark @!{ error_code="unknown key-value " }; 73 | 74 | main := ( '##' ('FILTER'|'FORMAT'|'contig'|'INFO'|'ALT'|'GATKCommandLine') '=') (('<'|',') key_value )* '>'; 75 | }%% 76 | =end 77 | 78 | %% write data; 79 | # %% this just fixes syntax highlighting... 80 | 81 | def self.run_lexer(buf, options = {}) 82 | do_debug = (options[:debug] == true) 83 | $stderr.print "---> ",buf,"\n" if do_debug 84 | data = buf.unpack("c*") if(buf.is_a?(String)) 85 | eof = data.length 86 | values = [] 87 | stack = [] 88 | 89 | emit = lambda { |type, data, ts, p| 90 | # Print the type and text of the last read token 91 | # p ts,p 92 | $stderr.print "EMITTED: #{type}: #{data[ts...p].pack('c*')}\n" if do_debug 93 | values << [type,data[ts...p].pack('c*')] 94 | } 95 | 96 | error_code = nil 97 | 98 | %% write init; 99 | %% write exec; 100 | 101 | raise "ERROR: "+error_code+" in "+buf if error_code 102 | 103 | begin 104 | res = {} 105 | # p values 106 | values.each_slice(2) do | a,b | 107 | $stderr.print '*',a,b if do_debug 108 | keyword = a[1] 109 | value = b[1] 110 | value = value.to_i if ['length','Epoch'].index(keyword) 111 | res[keyword] = value 112 | # p h[:value] if h[:name]==:identifier or h[:name]==:value or h[:name]==:string 113 | end 114 | rescue 115 | print "ERROR: " 116 | p values 117 | raise 118 | end 119 | $stderr.print(res,"\n") if do_debug 120 | res 121 | end 122 | end 123 | end 124 | end 125 | 126 | if __FILE__ == $0 127 | 128 | gatkcommandline = < 130 | LINE1 131 | 132 | h = {} 133 | s = gatkcommandline.strip 134 | # print s,"\n" 135 | result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true) 136 | # h[result['ID']] = result 137 | # p result 138 | 
139 | lines = < 13.0"> 141 | ##FORMAT= 142 | ##FORMAT= 143 | ##FORMAT= 144 | ##INFO= 145 | ##INFO= 146 | ##INFO= 147 | ##INFO= 148 | ##INFO= 149 | ##contig= 150 | ##contig= 151 | LINES 152 | 153 | h = {} 154 | lines.strip.split("\n").each { |s| 155 | # print s,"\n" 156 | result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true) 157 | h[result['ID']] = result 158 | p result 159 | } 160 | p h 161 | 162 | raise "ERROR" if h != {"HaplotypeScoreHigh"=>{"ID"=>"HaplotypeScoreHigh", "Description"=>"HaplotypeScore > 13.0"}, "GT"=>{"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}, "DP"=>{"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth", "Extra"=>"Yes?"}, "DP4"=>{"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}, "PM"=>{"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}, "VP"=>{"ID"=>"VP", "Number"=>"1", "Type"=>"String", "Description"=>"Variation Property. Documentation is at ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf", "Source"=>"dbsnp", "Version"=>"138"}, "GENEINFO"=>{"ID"=>"GENEINFO", "Number"=>"1", "Type"=>"String", "Description"=>"Pairs each of gene symbol:gene id. The gene symbol and id are delimited by a colon (:), and each pair is delimited by a vertical bar (|)"}, "CLNHGVS"=>{"ID"=>"CLNHGVS", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from HGVS. The order of these variants corresponds to the order of the info in the other clinical INFO tags."}, "CLNHGVS1"=>{"ID"=>"CLNHGVS1", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from \\\"HGVS\\\". 
The order of these 'variants' corresponds to the order of the info in the other clinical INFO tags."}, "XXXY12"=>{"ID"=>"XXXY12"}, "Y"=>{"ID"=>"Y", "length"=>59373566}} 163 | 164 | 165 | end # test 166 | -------------------------------------------------------------------------------- /lib/bio-vcf/pcows.rb: -------------------------------------------------------------------------------- 1 | # Parallel copy-on-write streaming (PCOWS) 2 | 3 | require 'tempfile' 4 | 5 | class PCOWS 6 | 7 | RUNNINGEXT = 'part' # file extension 8 | 9 | def initialize(num_threads,chunk_size,name=File.basename(__FILE__),timeout=180,quiet=false,debug=false) 10 | num_threads = cpu_count() if not num_threads # FIXME: set to cpu_num by default 11 | # $stderr.print "Using ",num_threads,"threads \n" 12 | @num_threads = num_threads 13 | @chunk_size = chunk_size 14 | @pid_list = [] 15 | @name = name 16 | @timeout = timeout 17 | @quiet = quiet 18 | @debug = debug 19 | if @debug 20 | $stderr.print "PCOWS running in DEBUG MODE\n" 21 | end 22 | if multi_threaded 23 | @tmpdir = Dir::mktmpdir(@name+'_') 24 | end 25 | @last_output = 0 # counter 26 | @output_locked = false 27 | end 28 | 29 | # Feed the worker 'func and state' to COWS. Note that func is a 30 | # lambda closure so it can pick up surrounding scope at invocation 31 | # in addition to the data captured in 'state'. 
32 | 33 | def submit_worker(func,state) 34 | pid = nil 35 | if multi_threaded 36 | count = @pid_list.size+1 37 | fn = mktmpfilename(count) 38 | pid = fork do 39 | # ---- This is running a new copy-on-write process 40 | tempfn = fn+'.'+RUNNINGEXT 41 | STDOUT.reopen(File.open(tempfn, 'w+')) 42 | func.call(state).each { | line | print line } 43 | STDOUT.flush 44 | STDOUT.close 45 | # sleep 0.1 46 | # f.flush 47 | # f.close 48 | # sleep 0.2 # interval to make sure we are done writing, 49 | # otherwise there may be misses at the end of a 50 | # block (maybe the f.close fixed it) 51 | 52 | FileUtils::mv(tempfn,fn) 53 | exit(0) 54 | end 55 | Process.detach(pid) 56 | else 57 | # ---- Single threaded: call in main process and output immediately 58 | func.call(state).each { | line | print line } 59 | end 60 | @pid_list << [ pid,count,fn ] 61 | return true 62 | end 63 | 64 | def submit_final_worker(func,state) 65 | @final_worker = true 66 | submit_worker(func,state) 67 | end 68 | 69 | # Make sure no more than num_threads are running at the same time - 70 | # this is achieved by checking the PID table and the running files 71 | # in the tmpdir 72 | 73 | def wait_for_worker_slot() 74 | return if single_threaded 75 | Timeout.timeout(@timeout) do 76 | printed_timeout_message = false 77 | while true 78 | # ---- count running pids 79 | running = @pid_list.reduce(0) do | sum, info | 80 | (pid,count,fn) = info 81 | if pid_or_file_running?(pid,fn) 82 | sum+1 83 | else 84 | sum 85 | end 86 | end 87 | return if running < @num_threads 88 | if not printed_timeout_message 89 | $stderr.print "Waiting for slot (timeout=#{@timeout})\n" if not @quiet 90 | printed_timeout_message = true 91 | end 92 | sleep 0.1 93 | end 94 | end 95 | end 96 | 97 | # ---- In this section the output gets collected and passed on to a 98 | # printer thread. This function makes sure the printing is 99 | # ordered and that no printers are running at the same 100 | # time. 
The printer thread should be doing as little processing 101 | # as possible. 102 | # 103 | # In this implementation type==:by_line will call func for 104 | # each line. Otherwise it is called once with the filename. 105 | def process_output(func=nil,type=:by_line, blocking=false) 106 | return if single_threaded 107 | output = lambda { |fn| 108 | if type == :by_line 109 | File.new(fn).each_line { |buf| 110 | print buf 111 | } 112 | else 113 | func.call(fn) 114 | end 115 | } 116 | if @output_locked 117 | # ---- is the other thread still running? We wait until it 118 | # is finished to start the next one 119 | (pid,count,fn) = @output_locked 120 | $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet 121 | return if File.exist?(fn) # continue because thread still processing 122 | # Now we should remove the .keep file 123 | cleanup_keep_file(fn) 124 | @last_output += 1 # get next one in line 125 | @output_locked = false 126 | end 127 | # ---- process the next output chunk. After completion it 128 | # gets renamed to chunk.keep. This to avoid missing 129 | # output (if we unlink the file prematurely) 130 | if info = @pid_list[@last_output] 131 | (pid,count,fn) = info 132 | $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug 133 | if File.exist?(fn) 134 | # Yes! We have the next output, create outputter 135 | @output_locked = info 136 | $stderr.print "Set lock on ",[info],"\n" if not @quiet 137 | if not blocking 138 | $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet 139 | pid = fork do 140 | output.call(fn) 141 | # after finishing output move it to .keep 142 | FileUtils::mv(fn,fn+'.keep') 143 | exit(0) 144 | end 145 | Process.detach(pid) 146 | else 147 | $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet 148 | output.call(fn) 149 | FileUtils::mv(fn,fn+'.keep') 150 | end 151 | else 152 | sleep 0.2 153 | end 154 | end 155 | end 156 | 157 | # Wait for a worker slot to appear. 
When working the pid is writing 158 | # a file with extension .part(ial). After completion the file is 159 | # renamed without .part and a slot is free. 160 | def wait_for_worker(info) 161 | (pid,count,fn) = info 162 | if pid_or_file_running?(pid,fn) 163 | $stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete #{fn}\n" if not @quiet 164 | begin 165 | Timeout.timeout(@timeout) do 166 | while not File.exist?(fn) # wait for the result to appear 167 | sleep 0.2 168 | return if not pid_or_file_running?(pid,fn) # worker is gone 169 | end 170 | end 171 | # Partial file should have been renamed: 172 | raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn) 173 | $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet 174 | rescue Timeout::Error 175 | # Kill it to speed up exit 176 | Process.kill 9, pid 177 | Process.wait pid 178 | $stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}, fn = #{fn}, count = #{count}\n" 179 | $stderr.print "Bailing out" 180 | raise 181 | end 182 | end 183 | end 184 | 185 | # This is the final cleanup after the reader thread is done. All workers 186 | # need to complete. 
187 | 188 | def wait_for_workers() 189 | return if single_threaded 190 | @pid_list.each do |info| 191 | wait_for_worker(info) 192 | end 193 | end 194 | 195 | def process_remaining_output() 196 | return if single_threaded 197 | $stderr.print "Processing remaining output...\n" if not @quiet 198 | while @output_locked 199 | sleep 0.2 200 | process_output() # keep trying 201 | end 202 | @pid_list.each do |info| 203 | (pid,count,fn) = info 204 | while pid_or_file_running?(pid,fn) or File.exist?(fn) 205 | $stderr.print "Trying: ",[info],"\n" if not @quiet 206 | process_output(nil,:by_line,true) 207 | sleep 0.2 208 | end 209 | end 210 | while @output_locked 211 | sleep 0.1 212 | process_output(nil,:by_line,true) 213 | end 214 | cleanup_tmpdir() 215 | end 216 | 217 | def cleanup() 218 | @pid_list.each do |info| 219 | (pid,count,fn) = info 220 | if pid_running?(pid) 221 | $stderr.print "Killing child ",[info],"\n" 222 | begin 223 | Process.kill 9, pid 224 | Process.wait pid 225 | rescue Errno::ENOENT 226 | $stdout.puts "INFO: #{pidfile} did not exist: Errno::ENOENT" if not @quiet 227 | rescue Errno::ESRCH 228 | $stdout.puts "INFO: The process #{opid} did not exist: Errno::ESRCH" if not @quiet 229 | end 230 | end 231 | File.unlink(fn) if File.exist?(fn) 232 | cleanup_keep_file(fn,wait: false) 233 | tempfn = fn+'.'+RUNNINGEXT 234 | File.unlink(tempfn) if File.exist?(tempfn) 235 | end 236 | cleanup_tmpdir() 237 | end 238 | 239 | private 240 | 241 | def mktmpfilename(num,ext=nil) 242 | @tmpdir+sprintf("/%0.6d-",num)+@name+(ext ? '.'+ext : '') 243 | end 244 | 245 | def pid_or_file_running?(pid,fn) 246 | (pid && pid_running?(pid)) or File.exist?(fn+'.'+RUNNINGEXT) 247 | end 248 | 249 | def pid_running?(pid) 250 | begin 251 | fpid,status=Process.waitpid2(pid,Process::WNOHANG) 252 | rescue Errno::ECHILD, Errno::ESRCH 253 | return false 254 | end 255 | return true if nil == fpid && nil == status 256 | return ! (status.exited? || status.signaled?) 
257 | end 258 | 259 | def single_threaded 260 | @num_threads == 1 261 | end 262 | 263 | def multi_threaded 264 | @num_threads > 1 265 | end 266 | 267 | def cpu_count 268 | begin 269 | return File.read('/proc/cpuinfo').scan(/^processor\s*:/).size if File.exist? '/proc/cpuinfo' 270 | # Actually, the JVM does not allow fork... 271 | return Java::Java.lang.Runtime.getRuntime.availableProcessors if defined? Java::Java 272 | rescue LoadError 273 | # Count on MAC 274 | return Integer `sysctl -n hw.ncpu 2>/dev/null` 275 | end 276 | $stderr.print "Could not determine number of CPUs" if not @quiet 277 | 1 278 | end 279 | 280 | def cleanup_keep_file(fn, opts = { wait: true }) 281 | if not @debug 282 | keep = fn+'.keep' 283 | return if not opts[:wait] and !File.exist?(keep) 284 | $stderr.print "Trying to remove #{keep}\n" if not @quiet 285 | while true 286 | if File.exist?(keep) 287 | $stderr.print "Removing #{keep}\n" if not @quiet 288 | File.unlink(keep) 289 | break # forever loop 290 | end 291 | sleep 0.1 292 | end #forever 293 | end 294 | end 295 | 296 | def cleanup_tmpdir 297 | if not @debug 298 | $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet 299 | Dir.unlink(@tmpdir) if @tmpdir 300 | end 301 | end 302 | 303 | end 304 | -------------------------------------------------------------------------------- /test/data/regression/eval_r.info.dp.ref: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##FILTER= 3 | ##FORMAT= 4 | ##FORMAT= 5 | ##FORMAT= 6 | ##FORMAT= 7 | ##FORMAT= 8 | ##GATKCommandLine= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##INFO= 15 | ##INFO= 16 | ##INFO= 17 | ##INFO= 18 | ##INFO= 19 | ##INFO= 20 | ##INFO= 21 | ##INFO= 22 | ##INFO= 23 | ##INFO= 24 | ##INFO= 25 | ##INFO= 26 | ##INFO= 27 | ##INFO= 28 | ##INFO= 29 | ##contig= 30 | ##contig= 31 | ##contig= 32 | ##contig= 33 | ##contig= 34 | ##contig= 35 | ##contig= 36 | ##contig= 37 | ##contig= 38 | ##contig= 39 | 
##contig= 40 | ##contig= 41 | ##contig= 42 | ##contig= 43 | ##contig= 44 | ##contig= 45 | ##contig= 46 | ##contig= 47 | ##contig= 48 | ##contig= 49 | ##contig= 50 | ##contig= 51 | ##contig= 52 | ##contig= 53 | ##contig= 54 | ##contig= 55 | ##contig= 56 | ##contig= 57 | ##contig= 58 | ##contig= 59 | ##contig= 60 | ##contig= 61 | ##contig= 62 | ##contig= 63 | ##contig= 64 | ##contig= 65 | ##contig= 66 | ##contig= 67 | ##contig= 68 | ##contig= 69 | ##contig= 70 | ##contig= 71 | ##contig= 72 | ##contig= 73 | ##contig= 74 | ##contig= 75 | ##contig= 76 | ##contig= 77 | ##contig= 78 | ##contig= 79 | ##contig= 80 | ##contig= 81 | ##contig= 82 | ##contig= 83 | ##contig= 84 | ##contig= 85 | ##contig= 86 | ##contig= 87 | ##contig= 88 | ##contig= 89 | ##contig= 90 | ##contig= 91 | ##contig= 92 | ##contig= 93 | ##contig= 94 | ##contig= 95 | ##contig= 96 | ##contig= 97 | ##contig= 98 | ##contig= 99 | ##contig= 100 | ##contig= 101 | ##contig= 102 | ##contig= 103 | ##contig= 104 | ##contig= 105 | ##contig= 106 | ##contig= 107 | ##contig= 108 | ##contig= 109 | ##contig= 110 | ##contig= 111 | ##contig= 112 | ##contig= 113 | ##reference=file:human_g1k_v37.fasta 114 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Original s1t1 s2t1 s3t1 s1t2 s2t2 s3t2 115 | 1518 116 | 1433 117 | 1440 118 | 1460 119 | 1500 120 | 1537 121 | 1641 122 | 129 123 | 28 124 | 22 125 | 18 126 | 939 127 | 721 128 | 703 129 | 929 130 | 692 131 | 1149 132 | 1108 133 | 830 134 | 764 135 | 809 136 | 754 137 | 719 138 | 1083 139 | 1092 140 | 1089 141 | 1050 142 | 939 143 | 1004 144 | 1090 145 | 992 146 | 636 147 | 1213 148 | 904 149 | 398 150 | 526 151 | -------------------------------------------------------------------------------- /doc/Using_Mongo.md: -------------------------------------------------------------------------------- 1 | # Using bio-vcf with MongoDB 2 | 3 | bio-vcf can output many types of formats. In this exercise we will load 4 | Mongo with VCF data and do some queries on that. 
5 | 6 | ## Install Mongo (Debian) 7 | 8 | With su (password 'bioinformatics') 9 | 10 | ```sh 11 | su 12 | apt-get install mongodb 13 | ``` 14 | 15 | ## Install Mongo in $HOME 16 | 17 | Mongo comes with many distributions. Here we installed with guix. Check 18 | 19 | ```sh 20 | guix package -A mongodb 21 | mongodb 3.3.3 out gn/packages/mongodb.scm:31:2 22 | ``` 23 | 24 | Create a directory for the database 25 | 26 | ```sh 27 | mkdir -p ~/opt/var/mongodb 28 | mkdir -p ~/opt/etc 29 | ``` 30 | 31 | And create a configuration file ~/opt/etc/mongo.conf 32 | 33 | ``` 34 | verbose = true 35 | port = 27017 36 | dbpath = /home/user/opt/var/mongodb/ 37 | noauth = true 38 | maxConns = 5 39 | rest = true 40 | ``` 41 | 42 | and run Mongo 43 | 44 | ```sh 45 | env LC_ALL=C mongod --config ~/opt/etc/mongo.conf 46 | ``` 47 | 48 | ```ruby 49 | use admin 50 | db.createUser({user:"admin", pwd:"admin123", roles:[{role:"root", db:"admin"}]}) 51 | ``` 52 | 53 | ## Use client 54 | 55 | ```python 56 | mongo 57 | use admin 58 | db.createUser({user:"admin", pwd:"admin123", roles:[{role:"root", db:"admin"}]}) 59 | ``` 60 | 61 | or on a different host 62 | 63 | ```python 64 | mongo --host 192.168.1.24 65 | ``` 66 | 67 | ## Tutorial Mongo 68 | 69 | Using the example on MongoDB's [website](https://docs.mongodb.org/getting-started/shell/import-data/) 70 | 71 | ### Load data 72 | 73 | Records look like: 74 | 75 | ```javascript 76 | {"address": {"building": "2780", "coord": [-73.98241999999999, 40.579505], "street": "Stillwell Avenue", "zipcode": "11224"}, "borough": "Brooklyn", "cuisine": "American ", "grades": [{"date": {"$date": 1402358400000}, "grade": "A", "score": 5}, {"date": {"$date": 1370390400000}, "grade": "A", "score": 7}, {"date": {"$date": 1334275200000}, "grade": "A", "score": 12}, {"date": {"$date": 1318377600000}, "grade": "A", "score": 12}], "name": "Riviera Caterer", "restaurant_id": "40356018"} 77 | {"address": {"building": "351", "coord": [-73.98513559999999, 40.7676919], 
"street": "West 57 Street", "zipcode": "10019"}, "borough": "Manhattan", "cuisine": "Irish", "grades": [{"date": {"$date": 1409961600000}, "grade": "A", "score": 2}, {"date": {"$date": 1374451200000}, "grade": "A", "score": 11}, {"date": {"$date": 1343692800000}, "grade": "A", "score": 12}, {"date": {"$date": 1325116800000}, "grade": "A", "score": 12}], "name": "Dj Reynolds Pub And Restaurant", "restaurant_id": "30191841"} 78 | ``` 79 | 80 | Note there are no specific identifiers. Or are there? 81 | 82 | ```sh 83 | wget https://raw.githubusercontent.com/mongodb/docs-assets/primer-dataset/primer-dataset.json 84 | mongoimport --db test --collection restaurants --drop --file primer-dataset.json 85 | Mon Apr 11 00:24:50.963 dropping: test.restaurants 86 | Mon Apr 11 00:24:52.375 check 9 25359 87 | Mon Apr 11 00:24:52.448 imported 25359 objects 88 | ``` 89 | 90 | ### Use the shell 91 | 92 | Run the mongo shell with 93 | 94 | ```sh 95 | mongo 96 | ``` 97 | 98 | ```ruby 99 | use test 100 | db.restaurants.find() 101 | db.restaurants.find( { "borough": "Manhattan" } ) 102 | db.restaurants.find( { "grades.score": { $gt: 30 } } ) 103 | ... AND ... 104 | db.restaurants.find( { "cuisine": "Italian", "address.zipcode": "10075" ,"grades.score": { $gt: 30 }} ) 105 | ... OR ... 106 | db.restaurants.find( 107 | { $or: [ { "cuisine": "Italian" }, { "address.zipcode": "10075" } ] } 108 | ) 109 | ... SORT ... 110 | db.restaurants.find().sort( { "borough": 1, "address.zipcode": 1 } ) 111 | ... Count ... 112 | db.restaurants.aggregate( 113 | [ 114 | { $group: { "_id": "$borough", "count": { $sum: 1 } } } 115 | ] 116 | ); 117 | 118 | db.restaurants.aggregate( 119 | [ 120 | { $match: { "borough": "Queens", "cuisine": "Brazilian" } }, 121 | { $group: { "_id": "$address.zipcode" , "count": { $sum: 1 } } } 122 | ] 123 | ); 124 | ... Index ... 
125 | db.restaurants.createIndex( { "cuisine": 1, "address.zipcode": -1 } ) 126 | ``` 127 | 128 | ### Prepare template with bio-vcf 129 | 130 | ```sh 131 | wget https://github.com/pjotrp/bioruby-vcf/raw/master/test/data/input/gatk_exome.vcf 132 | cat gatk_exome.vcf |bio-vcf --eval '[r.chr,r.pos]' 133 | ``` 134 | 135 | Let's create a template named gatk_template.json 136 | 137 | ```ruby 138 | 139 | { 140 | "rec": { 141 | "chr": "<%= rec.chrom %>", 142 | "pos": <%= rec.pos %>, 143 | "ref": "<%= rec.ref %>", 144 | "alt": "<%= rec.alt[0] %>", 145 | "dp": <%= rec.info.dp %> 146 | } 147 | } 148 | ``` 149 | 150 | And run it 151 | 152 | ```sh 153 | cat gatk_exome.vcf |bio-vcf --template gatk_template.json |less 154 | cat gatk_exome.vcf |bio-vcf --template gatk_template.json > gatk_exome.json 155 | ``` 156 | 157 | Looks like 158 | 159 | ``` 160 | { 161 | "rec": { 162 | "chr": "X", 163 | "pos": 134713855, 164 | "ref": "G", 165 | "alt": "A", 166 | "dp": 4 167 | } 168 | } 169 | ``` 170 | 171 | Import into mongo 172 | 173 | mongo v. 2.0.6 174 | ```sh 175 | mongoimport --db gatk --collection vcf --drop --file gatk_exome.json --jsonArray 176 | ``` 177 | mongo v. 
3.2.3 178 | ```sh 179 | mongoimport --db gatk --collection vcf --drop --file gatk_exome.json 180 | ``` 181 | 182 | 183 | 184 | ```ruby 185 | use gatk 186 | db.vcf.find() 187 | db.vcf.find( { "rec.chr": "X" } ) 188 | db.vcf.find( { "rec.chr": "X" } ).count() 189 | 3 190 | db.vcf.find( { "rec.dp": { $gt: 5 }} ) 191 | db.vcf.find( { "rec.dp": { $gt: 5 }} ).count() 192 | 25 193 | ``` 194 | 195 | Comparable bio-vcf statements 196 | 197 | ``` 198 | cat gatk_exome.vcf |bio-vcf --eval '[r.chr,r.pos,r.ref,r.alt,r.info.dp]' --filter "r.chr=='X'"|grep -v '#' |wc -l 199 | =>"[r.chr,r.pos,r.ref,r.alt,r.info.dp]", :filter=>"r.chr=='X'"} 200 | 3 201 | cat gatk_exome.vcf |bio-vcf --eval '[r.chr,r.pos,r.ref,r.alt,r.info.dp]' --filter "r.info.dp>5"|grep -v '#' |wc -l 202 | =>"[r.chr,r.pos,r.ref,r.alt,r.info.dp]", :filter=>"r.info.dp>5"} 203 | 25 204 | ``` 205 | 206 | Exercise 1. 207 | 208 | With bio-vcf take the field "Variant Confidence/Quality by Depth" and 209 | filter on QD>12.0. How many matches? Answer 112 out of 175 210 | 211 | Exercise 2. 212 | 213 | Do the same with MongoDB. 
So you can do 214 | 215 | ```ruby 216 | db.vcf.find( { "rec.qd": { $gt: 12.0 }} ).count() 217 | 112 218 | ``` 219 | 220 | ## Now for some real data 221 | 222 | Let's use our PIK3CA data in two samples 223 | 224 | ``` 225 | cat gene_PIK3CA.vcf |bio-vcf --samples 2,3 --seval s.dp 226 | cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3 --seval s.dp --sfilter "s.dp>7" 227 | cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 0,3 --sfilter 's.dp>20' --seval s.dp 228 | 3 178916645 24 39 229 | 3 178916651 30 31 230 | 3 178921407 32 43 231 | 3 178936082 24 24 232 | 3 178936091 27 32 233 | 3 178947904 23 33 234 | 3 178952072 38 45 235 | 3 178952085 35 45 236 | 3 178952088 34 45 237 | ``` 238 | 239 | Looking at annotations 240 | 241 | ``` 242 | cat gene_PIK3CA.vcf |bio-vcf --eval [r.chr,r.pos,r.info.ann] |grep ENST00000263967|wc -l 243 | 30 244 | ``` 245 | 246 | alternative 247 | 248 | ``` 249 | cat gene_PIK3CA.vcf |bio-vcf --eval '[r.chr,r.pos,r.info.ann]' --filter 'r.info.ann =~ /ENST00000263967/' --seval 's.dp' 250 | 3 178921407 T|synonymous_variant|LOW|PIK3CA|ENSG00000121879|transcript|ENST00000263967|protein_coding|5/21|c.889C>T|p.Leu297Leu|1046/9093|889/3207|297/1068|| 32 32 38 43 27 34 30 37 32 36 44 37 25 27 43 30 11 23 19 37 28 17 13 ... 
251 | ``` 252 | 253 | Let's try and do the same with Mongo 254 | 255 | ``` 256 | 257 | { 258 | "rec": { 259 | "chr": "<%= rec.chrom %>", 260 | "pos": <%= rec.pos %>, 261 | "ref": "<%= rec.ref %>", 262 | "alt": "<%= rec.alt[0] %>", 263 | "dp": <%= rec.info.dp %>, 264 | "ann": '"<%= rec.info.ann %>"' 265 | } 266 | } 267 | ``` 268 | 269 | ```sh 270 | mongoimport --db PIK3CA --collection vcf --drop --file PIK3CA.json --jsonArray 271 | ``` 272 | 273 | ```ruby 274 | db.vcf.find({"rec.ann": /ENST00000263967/i }).count() 275 | 30 276 | ``` 277 | 278 | ## Load results into Python 279 | 280 | ```sh 281 | guix package -i python2-pip 282 | export PYTHONPATH="/home/user/.guix-profile/lib/python2.7/site-packages" 283 | pip install --install-option="--prefix=$HOME/opt/python" pymongo 284 | export PYTHONPATH="/home/user/.guix-profile/lib/python2.7/site-packages:$HOME/opt/python/lib/python2.7/site-packages" 285 | ``` 286 | 287 | Now start python: 288 | 289 | ```python 290 | from pymongo import MongoClient 291 | 292 | client = MongoClient() 293 | db = client.test 294 | # cursor = db.restaurants.find() 295 | cursor = db.restaurants.find({"borough": "Manhattan"}) 296 | for document in cursor: 297 | print(document) 298 | print(document["cuisine"]) 299 | print(document["grades"][0]["score"]>10) 300 | 301 | ``` 302 | 303 | ## Exercise 1 304 | 305 | Write a Python script which queries the PIK3CA VCF file for the annotation as in 306 | 307 | 308 | ```ruby 309 | db.vcf.find({"rec.ann": /ENST00000263967/i }).count() 310 | 30 311 | ``` 312 | 313 | ## Exercise 2 314 | 315 | Write a Python mongo script which queries the PIK3CA file for something 316 | similar to 317 | 318 | ```sh 319 | cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3 --seval s.dp --sfilter "s.dp>7" 320 | ``` 321 | 322 | when the bio-vcf template is 323 | 324 | ```ruby 325 | 326 | { 327 | "rec": { 328 | "chr": "<%= rec.chrom %>", 329 | "pos": <%= rec.pos %>, 330 | "ref": "<%= rec.ref %>", 331 | "alt": "<%= rec.alt[0] %>", 332 
| "dp": <%= rec.info.dp %>, 333 | "samples": [ 334 | <%= 335 | a = [] 336 | rec.each_sample { |s| a.push s.dp } 337 | a.join(',') 338 | %> 339 | ] 340 | } 341 | } 342 | ``` 343 | 344 | So output looks like 345 | 346 | ```ruby 347 | { 348 | "rec": { 349 | "chr": "3", 350 | "pos": 178916581, 351 | "ref": "T", 352 | "alt": "C", 353 | "dp": 2345, 354 | "samples": [ 355 | 11,11,21,20,27,10,16,17,19,15,18,20,16,9,18,22,6,2,6,9,8,7,7,10,11,12,4,9,7,9,8,10,7,18,8,7 356 | ,7,4,11,4,8,8,8,14,13,23,13,11,12,3,10,27,31,16,12,1,3,4,15,10,20,8,4,0,25,2,10,9,13,20,17,14,25,15,19,16 357 | ,29,13,10,7,4,5,1,1,2,26,17,16,8,4,5,14,14,6,5,0,5,5,11,10,17,8,5,20,9,16,5,21,14,5,4,3,13,7,0,9,5,12,0,2 358 | ,9,14,2,4,7,1,15,7,14,12,4,14,16,26,7,22,5,4,7,10,11,14,19,25,11,2,28,25,29,30,23,30,35,33,32,27,4,30,25, 359 | 33,32,5,9,19,13,13,16,17,8,1,19,8,6,1,20,1,21,6,8,12,33,22,2,16,9,26,23 360 | ] 361 | } 362 | } 363 | 364 | ``` 365 | 366 | Hint: the answer is 25 367 | 368 | ```sh 369 | user@debian:~$ cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3 --seval s.dp --sfilter "s.dp>7" 370 | bio-vcf 0.9.2 (biogem Ruby 2.3.0 with pcows) by Pjotr Prins 2015 371 | Options: {:show_help=>false, :source=>"https://github.com/pjotrp/bioruby-vcf", :version=>"0.9.2 (Pjotr Prins)", :date=>"2016-04-11 12:11:27 +0200", :thread_lines=>40000, :timeout=>180, :sfilter_samples=>["2", "3"], :seval=>"s.dp", :skip_header=>true, :sfilter=>"s.dp>7"} 372 | 3 178916581 21 20 373 | 3 178916644 35 29 374 | 3 178916645 35 39 375 | 3 178916651 36 31 376 | 3 178916931 26 23 377 | 3 178917478 21 19 378 | 3 178919190 21 24 379 | 3 178921407 38 43 380 | 3 178921525 13 18 381 | 3 178921553 14 11 382 | 3 178922274 15 16 383 | 3 178922277 17 17 384 | 3 178922364 19 24 385 | 3 178927401 32 26 386 | 3 178927410 36 31 387 | 3 178927969 13 11 388 | 3 178928098 18 26 389 | 3 178936082 22 24 390 | 3 178936091 24 32 391 | 3 178938747 12 12 392 | 3 178941853 15 15 393 | 3 178947904 25 33 394 | 3 178952072 46 45 395 | 3 178952085 51 45 
396 | 3 178952088 47 45 397 | user@debian:~$ cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3 --seval s.dp --sfilter "s.dp>7"|wc -l 398 | bio-vcf 0.9.2 (biogem Ruby 2.3.0 with pcows) by Pjotr Prins 2015 399 | Options: {:show_help=>false, :source=>"https://github.com/pjotrp/bioruby-vcf", :version=>"0.9.2 (Pjotr Prins)", :date=>"2016-04-11 12:11:42 +0200", :thread_lines=>40000, :timeout=>180, :sfilter_samples=>["2", "3"], :seval=>"s.dp", :skip_header=>true, :sfilter=>"s.dp>7"} 400 | 25 401 | ``` 402 | -------------------------------------------------------------------------------- /test/data/input/somaticsniper.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##fileDate=20140121 3 | ##phasing=none 4 | ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta 5 | ##FORMAT= 6 | ##FORMAT= 7 | ##FORMAT= 8 | ##FORMAT= 9 | ##FORMAT= 10 | ##FORMAT= 11 | ##FORMAT= 12 | ##FORMAT= 13 | ##FORMAT= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR 19 | 1 1636394 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:17:12,5,0,0:0,17,0,0:75:36:0:56:37:37:0:. 0/1:0/1:6:3,1,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:36 20 | 1 36217006 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:21:9,12,0,0:0,21,0,0:90:31:0:60:37:37:0:. 0/1:0/1:6:2,2,0,2:0,4,0,2:24:31:24:60,53:37:37,37:2:31 21 | 1 46527674 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:6,3,0,0:0,0,9,0:54:37:0:60:37:37:0:. 0/1:0/1:4:2,0,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:34 22 | 1 108417572 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,7,0,0:48:42:0:41:37:37:0:. 0/1:0/1:4:2,0,1,1:0,2,0,2:30:42:35:60,60:37:37,37:2:31 23 | 1 155170305 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:5,4,0,0:0,9,0,0:54:34:0:59:37:37:0:. 
0/1:0/1:6:3,1,2,0:0,4,0,2:27:34:27:44,60:37:37,37:2:32 24 | 1 155449089 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:5,4,0,0:0,0,9,0:54:37:0:60:37:37:0:. 0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:34 25 | 1 169847826 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:6,3,0,0:0,0,9,0:54:37:0:60:37:37:0:. 0/1:0/1:4:2,0,2,0:2,0,2,0:30:37:30:60,60:37:37,37:2:34 26 | 1 203098164 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:3,7,0,0:0,10,0,0:57:40:0:53:37:37:0:. 0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:37 27 | 2 39213209 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:16:8,8,0,0:0,0,16,0:75:42:0:56:37:37:0:. 0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,59:37:37,37:2:42 28 | 2 86691250 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:6,2,0,0:0,0,8,0:51:42:0:59:37:37:0:. 0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,60:37:37,37:2:33 29 | 2 88874243 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:17:12,5,0,0:0,0,17,0:78:34:0:59:37:37:0:. 0/1:0/1:5:2,1,0,2:2,0,3,0:27:34:27:60,60:37:37,37:2:34 30 | 2 121728044 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:6,3,0,0:0,0,9,0:54:42:0:55:37:37:0:. 0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,57:37:37,37:2:36 31 | 2 170062591 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:4,5,0,0:0,0,9,0:54:39:0:60:37:37:0:. 0/1:0/1:5:2,1,1,1:2,0,3,0:32:39:32:60,60:37:37,37:2:35 32 | 2 216257844 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:2,5,0,0:0,0,7,0:48:40:0:60:37:37:0:. 0/1:0/1:3:1,0,0,2:2,0,1,0:1:40:33:60,60:37:37,37:2:30 33 | 2 222322623 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:15:6,9,0,0:0,15,0,0:72:33:0:60:37:37:0:. 0/1:0/1:7:4,1,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:33 34 | 3 25675413 . A T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:6,2,0,0:8,0,0,0:51:32:0:59:37:37:0:. 
0/1:0/1:4:1,1,0,2:2,0,0,2:6:32:25:34,37:37:37,37:2:30 35 | 3 36779638 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:13:8,5,0,0:0,13,0,0:66:36:0:60:37:37:0:. 0/1:0/1:6:1,3,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:36 36 | 3 123458847 . T C . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:4,6,0,0:0,0,0,10:54:39:0:41:37:37:0:. 0/1:0/1:5:1,2,1,1:0,2,0,3:32:39:32:55,45:37:37,37:2:35 37 | 3 124351308 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:7,1,0,0:0,0,8,0:51:42:0:58:37:37:0:. 0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,60:37:37,37:2:33 38 | 3 142171996 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:8,2,0,0:0,10,0,0:57:37:0:59:37:37:0:. 0/1:0/1:4:0,2,2,0:0,2,0,2:30:37:30:60,60:37:37,37:2:35 39 | 3 189526168 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:23:11,12,0,0:0,0,23,0:96:36:0:59:37:37:0:. 0/1:0/1:6:1,3,1,1:2,0,4,0:29:36:29:60,60:37:37,37:2:36 40 | 4 82058553 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:6,1,0,0:0,7,0,0:48:37:0:60:37:37:0:. 0/1:0/1:4:2,0,2,0:0,2,0,2:30:37:30:60,60:37:37,37:2:30 41 | 4 122769998 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:7,0,0,0:0,7,0,0:48:40:0:54:37:37:0:. 0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:30 42 | 5 13850856 . G C . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:3,5,0,0:0,0,8,0:51:40:0:51:37:37:0:. 0/1:0/1:3:0,1,2,0:0,2,1,0:1:40:33:49,60:37:37,37:2:33 43 | 5 132038609 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:12:10,2,0,0:0,12,0,0:63:33:0:58:37:37:0:. 0/1:0/1:7:3,2,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:33 44 | 5 137756599 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:13:7,6,0,0:0,13,0,0:66:31:0:60:37:37:0:. 0/1:0/1:6:2,2,2,0:0,4,0,2:24:31:24:60,60:37:37,37:2:31 45 | 5 141974902 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:9,5,0,0:0,14,0,0:69:40:0:60:37:37:0:. 
0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:40 46 | 6 2749400 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:8,2,0,0:0,10,0,0:57:35:0:60:37:37:0:. 0/1:0/1:4:2,0,0,2:0,2,0,2:28:35:28:60,39:37:37,37:2:34 47 | 7 95217113 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,0,7,0:48:36:0:60:37:37:0:. 0/1:0/1:6:1,3,1,1:2,0,4,0:29:36:29:60,60:37:37,37:2:30 48 | 7 140434525 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:8,6,0,0:0,0,14,0:69:31:0:59:37:37:0:. 0/1:0/1:6:3,1,0,2:2,0,4,0:24:31:24:60,58:37:37,37:2:31 49 | 7 151856059 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:20:9,10,1,0:0,19,0,1:50:33:0:59:37:37:0:. 0/1:0/1:7:1,4,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:30 50 | 8 42958817 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:12:6,6,0,0:0,12,0,0:60:36:0:55:37:37:0:. 0/1:0/1:7:4,1,1,1:0,5,0,2:29:36:29:48,60:37:37,37:2:35 51 | 8 131070237 . A G . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:13:9,4,0,0:13,0,0,0:66:39:0:60:37:37:0:. 0/1:0/1:5:2,1,1,1:3,0,2,0:32:39:32:60,60:37:37,37:2:39 52 | 8 141711010 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:8,2,0,0:0,10,0,0:57:40:0:59:37:37:0:. 0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,55:37:37,37:2:37 53 | 8 145059674 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:7,7,0,0:0,14,0,0:69:34:0:56:37:37:0:. 0/1:0/1:5:2,1,0,2:0,3,0,2:27:34:27:60,60:37:37,37:2:34 54 | 9 111651620 . A T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:4,3,0,0:7,0,0,0:48:92:0:60:37:37:0:. 0/1:0/1:7:1,2,3,1:3,0,0,4:58:92:85:60,60:37:37,37:2:31 55 | 9 111685156 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:13:9,4,0,0:0,0,13,0:66:37:0:60:37:37:0:. 0/1:0/1:4:1,1,2,0:2,0,2,0:12:37:30:60,37:37:37,37:2:37 56 | 10 6525571 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:0,8,0,0:0,0,8,0:51:37:0:55:37:37:0:. 
0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:32 57 | 10 97197246 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:7,3,0,0:0,10,0,0:57:39:0:58:37:37:0:. 0/1:0/1:5:3,0,1,1:0,3,0,2:32:39:32:60,53:37:37,37:2:36 58 | 11 58949455 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:12:8,4,0,0:0,0,12,0:60:33:0:54:37:37:0:. 0/1:0/1:7:3,2,1,1:2,0,5,0:26:33:26:60,36:37:37,37:2:33 59 | 11 65481082 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:17:12,5,0,0:0,0,17,0:78:45:0:58:37:37:0:. 0/1:0/1:3:0,1,1,1:2,0,1,0:1:45:34:60,60:37:37,37:2:45 60 | 11 94180424 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:23:17,6,0,0:0,0,23,0:96:34:0:56:37:37:0:. 0/1:0/1:5:2,1,2,0:2,0,3,0:27:34:27:60,59:37:37,37:2:34 61 | 11 121036021 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:12:3,9,0,0:0,12,0,0:63:31:0:59:37:37:0:. 0/1:0/1:6:4,0,0,2:0,4,0,2:24:31:24:59,60:37:37,37:2:31 62 | 12 994952 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:1,8,0,0:0,9,0,0:54:33:0:59:37:37:0:. 0/1:0/1:7:2,3,1,1:0,5,0,2:26:33:26:51,55:37:37,37:2:32 63 | 12 69233187 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:5,4,0,0:0,9,0,0:51:33:0:53:37:37:0:. 0/1:0/1:7:4,1,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:30 64 | 12 77436879 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:4,6,0,0:0,0,10,0:57:34:0:60:37:37:0:. 0/1:0/1:5:2,1,0,2:2,0,3,0:27:34:27:60,60:37:37,37:2:33 65 | 12 96641273 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:4,4,0,0:0,0,8,0:51:33:0:60:37:37:0:. 0/1:0/1:7:2,3,1,1:2,0,5,0:26:33:26:60,50:37:37,37:2:30 66 | 12 110813986 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:4,4,0,0:0,0,8,0:51:39:0:59:37:37:0:. 0/1:0/1:5:1,2,1,1:2,0,3,0:32:39:32:60,60:37:37,37:2:33 67 | 12 122825587 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:21:12,9,0,0:0,21,0,0:90:34:0:59:37:37:0:. 
0/1:0/1:5:3,0,2,0:0,3,0,2:27:34:27:60,60:37:37,37:2:34 68 | 14 30135337 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,7,0,0:48:40:0:60:37:37:0:. 0/1:0/1:3:1,0,0,2:0,1,0,2:1:40:33:60,60:37:37,37:2:30 69 | 14 51398458 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:4,5,0,0:0,0,9,0:54:36:0:55:37:37:0:. 0/1:0/1:6:0,4,1,1:2,0,4,0:29:36:29:60,59:37:37,37:2:33 70 | 15 43170722 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:11:6,5,0,0:0,11,0,0:60:31:0:60:37:37:0:. 0/1:0/1:6:4,0,2,0:0,4,0,2:24:31:24:56,54:37:37,37:2:31 71 | 15 50862183 . C A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:7,2,0,0:0,9,0,0:54:35:0:60:37:37:0:. 0/1:0/1:4:0,2,0,2:2,2,0,0:28:35:28:45,59:37:37,37:2:33 72 | 15 64332347 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:6,1,0,0:0,7,0,0:48:39:0:60:37:37:0:. 0/1:0/1:5:3,0,1,1:0,3,0,2:32:39:32:58,56:37:37,37:2:30 73 | 15 80845030 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:1,8,0,0:0,9,0,0:54:31:0:60:37:37:0:. 0/1:0/1:6:2,2,2,0:0,4,0,2:24:31:24:60,60:37:37,37:2:30 74 | 16 1812938 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:11:8,3,0,0:0,0,11,0:60:34:0:46:37:37:0:. 0/1:0/1:5:1,2,1,1:2,0,3,0:27:34:27:46,55:37:37,37:2:33 75 | 16 3582808 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:6,4,0,0:0,0,10,0:57:34:0:59:37:37:0:. 0/1:0/1:5:0,3,2,0:2,0,3,0:27:34:27:60,60:37:37,37:2:33 76 | 16 14042032 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:6,1,0,0:0,7,0,0:48:37:0:60:37:37:0:. 0/1:0/1:4:0,2,2,0:0,2,0,2:30:37:30:57,60:37:37,37:2:30 77 | 16 23619204 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,0,7,0:48:42:0:60:37:37:0:. 0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,60:37:37,37:2:31 78 | 17 41256142 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:5,3,0,0:0,0,8,0:51:40:0:60:37:37:0:. 
0/1:0/1:3:0,1,2,0:2,0,1,0:1:40:33:57,60:37:37,37:2:33 79 | 17 61784013 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:6,3,0,0:0,9,0,0:54:45:0:60:37:37:0:. 0/1:0/1:3:0,1,1,1:0,1,0,2:1:45:34:60,60:37:37,37:2:36 80 | 18 45423074 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:16:8,8,0,0:0,16,0,0:75:46:0:53:37:37:0:. 0/1:0/1:10:3,4,3,0:0,7,0,3:39:46:39:60,60:37:37,37:2:46 81 | 18 60985432 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:9,4,0,1:0,0,13,0:32:42:0:60:37:37:0:. 0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,57:37:37,37:2:41 82 | 19 39664512 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:8,6,0,0:0,14,0,0:69:42:0:60:37:37:0:. 0/1:0/1:4:0,2,1,1:0,2,0,2:30:42:35:60,60:37:37,37:2:42 83 | 19 49473085 . G C . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:4,4,0,0:0,0,8,0:51:37:0:60:37:37:0:. 0/1:0/1:4:1,1,2,0:0,2,2,0:30:37:30:48,56:37:37,37:2:32 84 | 20 34135210 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:2,6,0,0:0,8,0,0:51:36:0:59:37:37:0:. 0/1:0/1:6:2,2,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:32 85 | 20 35663882 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:2,5,0,0:0,0,7,0:48:37:0:56:37:37:0:. 0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:30 86 | X 70341572 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:11:6,5,0,0:0,11,0,0:60:33:0:56:37:37:0:. 0/1:0/1:7:3,2,1,1:0,5,0,2:26:33:26:58,60:37:37,37:2:33 87 | X 123164862 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,0,7,0:48:40:0:59:37:37:0:. 
0/1:0/1:3:1,0,2,0:2,0,1,0:1:40:33:60,60:37:37,37:2:30 88 | -------------------------------------------------------------------------------- /test/data/regression/pass1.ref: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.1 2 | ##fileDate=20140121 3 | ##phasing=none 4 | ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta 5 | ##FORMAT= 6 | ##FORMAT= 7 | ##FORMAT= 8 | ##FORMAT= 9 | ##FORMAT= 10 | ##FORMAT= 11 | ##FORMAT= 12 | ##FORMAT= 13 | ##FORMAT= 14 | ##FORMAT= 15 | ##FORMAT= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FILTER=5 and r.tumor.dp>7"> 19 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR 20 | 1 1636394 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:17:12,5,0,0:0,17,0,0:75:36:0:56:37:37:0:. 0/1:0/1:6:3,1,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:36 21 | 1 36217006 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:21:9,12,0,0:0,21,0,0:90:31:0:60:37:37:0:. 0/1:0/1:6:2,2,0,2:0,4,0,2:24:31:24:60,53:37:37,37:2:31 22 | 1 46527674 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:6,3,0,0:0,0,9,0:54:37:0:60:37:37:0:. 0/1:0/1:4:2,0,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:34 23 | 1 108417572 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,7,0,0:48:42:0:41:37:37:0:. 0/1:0/1:4:2,0,1,1:0,2,0,2:30:42:35:60,60:37:37,37:2:31 24 | 1 155170305 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:5,4,0,0:0,9,0,0:54:34:0:59:37:37:0:. 0/1:0/1:6:3,1,2,0:0,4,0,2:27:34:27:44,60:37:37,37:2:32 25 | 1 155449089 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:5,4,0,0:0,0,9,0:54:37:0:60:37:37:0:. 0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:34 26 | 1 169847826 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:6,3,0,0:0,0,9,0:54:37:0:60:37:37:0:. 0/1:0/1:4:2,0,2,0:2,0,2,0:30:37:30:60,60:37:37,37:2:34 27 | 1 203098164 . C T . . . 
GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:3,7,0,0:0,10,0,0:57:40:0:53:37:37:0:. 0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:37 28 | 2 39213209 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:16:8,8,0,0:0,0,16,0:75:42:0:56:37:37:0:. 0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,59:37:37,37:2:42 29 | 2 86691250 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:6,2,0,0:0,0,8,0:51:42:0:59:37:37:0:. 0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,60:37:37,37:2:33 30 | 2 88874243 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:17:12,5,0,0:0,0,17,0:78:34:0:59:37:37:0:. 0/1:0/1:5:2,1,0,2:2,0,3,0:27:34:27:60,60:37:37,37:2:34 31 | 2 121728044 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:6,3,0,0:0,0,9,0:54:42:0:55:37:37:0:. 0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,57:37:37,37:2:36 32 | 2 170062591 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:4,5,0,0:0,0,9,0:54:39:0:60:37:37:0:. 0/1:0/1:5:2,1,1,1:2,0,3,0:32:39:32:60,60:37:37,37:2:35 33 | 2 216257844 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:2,5,0,0:0,0,7,0:48:40:0:60:37:37:0:. 0/1:0/1:3:1,0,0,2:2,0,1,0:1:40:33:60,60:37:37,37:2:30 34 | 2 222322623 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:15:6,9,0,0:0,15,0,0:72:33:0:60:37:37:0:. 0/1:0/1:7:4,1,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:33 35 | 3 25675413 . A T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:6,2,0,0:8,0,0,0:51:32:0:59:37:37:0:. 0/1:0/1:4:1,1,0,2:2,0,0,2:6:32:25:34,37:37:37,37:2:30 36 | 3 36779638 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:13:8,5,0,0:0,13,0,0:66:36:0:60:37:37:0:. 0/1:0/1:6:1,3,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:36 37 | 3 123458847 . T C . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:4,6,0,0:0,0,0,10:54:39:0:41:37:37:0:. 0/1:0/1:5:1,2,1,1:0,2,0,3:32:39:32:55,45:37:37,37:2:35 38 | 3 124351308 . G A . . . 
GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:7,1,0,0:0,0,8,0:51:42:0:58:37:37:0:. 0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,60:37:37,37:2:33 39 | 3 142171996 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:8,2,0,0:0,10,0,0:57:37:0:59:37:37:0:. 0/1:0/1:4:0,2,2,0:0,2,0,2:30:37:30:60,60:37:37,37:2:35 40 | 3 189526168 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:23:11,12,0,0:0,0,23,0:96:36:0:59:37:37:0:. 0/1:0/1:6:1,3,1,1:2,0,4,0:29:36:29:60,60:37:37,37:2:36 41 | 4 82058553 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:6,1,0,0:0,7,0,0:48:37:0:60:37:37:0:. 0/1:0/1:4:2,0,2,0:0,2,0,2:30:37:30:60,60:37:37,37:2:30 42 | 4 122769998 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:7,0,0,0:0,7,0,0:48:40:0:54:37:37:0:. 0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:30 43 | 5 13850856 . G C . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:3,5,0,0:0,0,8,0:51:40:0:51:37:37:0:. 0/1:0/1:3:0,1,2,0:0,2,1,0:1:40:33:49,60:37:37,37:2:33 44 | 5 132038609 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:12:10,2,0,0:0,12,0,0:63:33:0:58:37:37:0:. 0/1:0/1:7:3,2,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:33 45 | 5 137756599 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:13:7,6,0,0:0,13,0,0:66:31:0:60:37:37:0:. 0/1:0/1:6:2,2,2,0:0,4,0,2:24:31:24:60,60:37:37,37:2:31 46 | 5 141974902 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:9,5,0,0:0,14,0,0:69:40:0:60:37:37:0:. 0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:40 47 | 6 2749400 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:8,2,0,0:0,10,0,0:57:35:0:60:37:37:0:. 0/1:0/1:4:2,0,0,2:0,2,0,2:28:35:28:60,39:37:37,37:2:34 48 | 7 95217113 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,0,7,0:48:36:0:60:37:37:0:. 0/1:0/1:6:1,3,1,1:2,0,4,0:29:36:29:60,60:37:37,37:2:30 49 | 7 140434525 . G A . . . 
GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:8,6,0,0:0,0,14,0:69:31:0:59:37:37:0:. 0/1:0/1:6:3,1,0,2:2,0,4,0:24:31:24:60,58:37:37,37:2:31 50 | 7 151856059 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:20:9,10,1,0:0,19,0,1:50:33:0:59:37:37:0:. 0/1:0/1:7:1,4,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:30 51 | 8 42958817 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:12:6,6,0,0:0,12,0,0:60:36:0:55:37:37:0:. 0/1:0/1:7:4,1,1,1:0,5,0,2:29:36:29:48,60:37:37,37:2:35 52 | 8 131070237 . A G . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:13:9,4,0,0:13,0,0,0:66:39:0:60:37:37:0:. 0/1:0/1:5:2,1,1,1:3,0,2,0:32:39:32:60,60:37:37,37:2:39 53 | 8 141711010 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:8,2,0,0:0,10,0,0:57:40:0:59:37:37:0:. 0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,55:37:37,37:2:37 54 | 8 145059674 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:7,7,0,0:0,14,0,0:69:34:0:56:37:37:0:. 0/1:0/1:5:2,1,0,2:0,3,0,2:27:34:27:60,60:37:37,37:2:34 55 | 9 111651620 . A T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:4,3,0,0:7,0,0,0:48:92:0:60:37:37:0:. 0/1:0/1:7:1,2,3,1:3,0,0,4:58:92:85:60,60:37:37,37:2:31 56 | 9 111685156 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:13:9,4,0,0:0,0,13,0:66:37:0:60:37:37:0:. 0/1:0/1:4:1,1,2,0:2,0,2,0:12:37:30:60,37:37:37,37:2:37 57 | 10 6525571 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:0,8,0,0:0,0,8,0:51:37:0:55:37:37:0:. 0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:32 58 | 10 97197246 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:7,3,0,0:0,10,0,0:57:39:0:58:37:37:0:. 0/1:0/1:5:3,0,1,1:0,3,0,2:32:39:32:60,53:37:37,37:2:36 59 | 11 58949455 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:12:8,4,0,0:0,0,12,0:60:33:0:54:37:37:0:. 0/1:0/1:7:3,2,1,1:2,0,5,0:26:33:26:60,36:37:37,37:2:33 60 | 11 65481082 . G A . . . 
GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:17:12,5,0,0:0,0,17,0:78:45:0:58:37:37:0:. 0/1:0/1:3:0,1,1,1:2,0,1,0:1:45:34:60,60:37:37,37:2:45 61 | 11 94180424 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:23:17,6,0,0:0,0,23,0:96:34:0:56:37:37:0:. 0/1:0/1:5:2,1,2,0:2,0,3,0:27:34:27:60,59:37:37,37:2:34 62 | 11 121036021 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:12:3,9,0,0:0,12,0,0:63:31:0:59:37:37:0:. 0/1:0/1:6:4,0,0,2:0,4,0,2:24:31:24:59,60:37:37,37:2:31 63 | 12 994952 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:1,8,0,0:0,9,0,0:54:33:0:59:37:37:0:. 0/1:0/1:7:2,3,1,1:0,5,0,2:26:33:26:51,55:37:37,37:2:32 64 | 12 69233187 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:5,4,0,0:0,9,0,0:51:33:0:53:37:37:0:. 0/1:0/1:7:4,1,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:30 65 | 12 77436879 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:4,6,0,0:0,0,10,0:57:34:0:60:37:37:0:. 0/1:0/1:5:2,1,0,2:2,0,3,0:27:34:27:60,60:37:37,37:2:33 66 | 12 96641273 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:4,4,0,0:0,0,8,0:51:33:0:60:37:37:0:. 0/1:0/1:7:2,3,1,1:2,0,5,0:26:33:26:60,50:37:37,37:2:30 67 | 12 110813986 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:4,4,0,0:0,0,8,0:51:39:0:59:37:37:0:. 0/1:0/1:5:1,2,1,1:2,0,3,0:32:39:32:60,60:37:37,37:2:33 68 | 12 122825587 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:21:12,9,0,0:0,21,0,0:90:34:0:59:37:37:0:. 0/1:0/1:5:3,0,2,0:0,3,0,2:27:34:27:60,60:37:37,37:2:34 69 | 14 30135337 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,7,0,0:48:40:0:60:37:37:0:. 0/1:0/1:3:1,0,0,2:0,1,0,2:1:40:33:60,60:37:37,37:2:30 70 | 14 51398458 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:4,5,0,0:0,0,9,0:54:36:0:55:37:37:0:. 0/1:0/1:6:0,4,1,1:2,0,4,0:29:36:29:60,59:37:37,37:2:33 71 | 15 43170722 . C T . . . 
GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:11:6,5,0,0:0,11,0,0:60:31:0:60:37:37:0:. 0/1:0/1:6:4,0,2,0:0,4,0,2:24:31:24:56,54:37:37,37:2:31 72 | 15 50862183 . C A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:7,2,0,0:0,9,0,0:54:35:0:60:37:37:0:. 0/1:0/1:4:0,2,0,2:2,2,0,0:28:35:28:45,59:37:37,37:2:33 73 | 15 64332347 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:6,1,0,0:0,7,0,0:48:39:0:60:37:37:0:. 0/1:0/1:5:3,0,1,1:0,3,0,2:32:39:32:58,56:37:37,37:2:30 74 | 15 80845030 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:1,8,0,0:0,9,0,0:54:31:0:60:37:37:0:. 0/1:0/1:6:2,2,2,0:0,4,0,2:24:31:24:60,60:37:37,37:2:30 75 | 16 1812938 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:11:8,3,0,0:0,0,11,0:60:34:0:46:37:37:0:. 0/1:0/1:5:1,2,1,1:2,0,3,0:27:34:27:46,55:37:37,37:2:33 76 | 16 3582808 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:10:6,4,0,0:0,0,10,0:57:34:0:59:37:37:0:. 0/1:0/1:5:0,3,2,0:2,0,3,0:27:34:27:60,60:37:37,37:2:33 77 | 16 14042032 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:6,1,0,0:0,7,0,0:48:37:0:60:37:37:0:. 0/1:0/1:4:0,2,2,0:0,2,0,2:30:37:30:57,60:37:37,37:2:30 78 | 16 23619204 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,0,7,0:48:42:0:60:37:37:0:. 0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,60:37:37,37:2:31 79 | 17 41256142 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:5,3,0,0:0,0,8,0:51:40:0:60:37:37:0:. 0/1:0/1:3:0,1,2,0:2,0,1,0:1:40:33:57,60:37:37,37:2:33 80 | 17 61784013 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:9:6,3,0,0:0,9,0,0:54:45:0:60:37:37:0:. 0/1:0/1:3:0,1,1,1:0,1,0,2:1:45:34:60,60:37:37,37:2:36 81 | 18 45423074 . C T . PASS . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:16:8,8,0,0:0,16,0,0:75:46:0:53:37:37:0:. 0/1:0/1:10:3,4,3,0:0,7,0,3:39:46:39:60,60:37:37,37:2:46 82 | 18 60985432 . G A . . . 
GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:9,4,0,1:0,0,13,0:32:42:0:60:37:37:0:. 0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,57:37:37,37:2:41 83 | 19 39664512 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:14:8,6,0,0:0,14,0,0:69:42:0:60:37:37:0:. 0/1:0/1:4:0,2,1,1:0,2,0,2:30:42:35:60,60:37:37,37:2:42 84 | 19 49473085 . G C . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:4,4,0,0:0,0,8,0:51:37:0:60:37:37:0:. 0/1:0/1:4:1,1,2,0:0,2,2,0:30:37:30:48,56:37:37,37:2:32 85 | 20 34135210 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:8:2,6,0,0:0,8,0,0:51:36:0:59:37:37:0:. 0/1:0/1:6:2,2,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:32 86 | 20 35663882 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:2,5,0,0:0,0,7,0:48:37:0:56:37:37:0:. 0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:30 87 | X 70341572 . C T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:11:6,5,0,0:0,11,0,0:60:33:0:56:37:37:0:. 0/1:0/1:7:3,2,1,1:0,5,0,2:26:33:26:58,60:37:37,37:2:33 88 | X 123164862 . G A . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC 0/0:0/0:7:5,2,0,0:0,0,7,0:48:40:0:59:37:37:0:. 
0/1:0/1:3:1,0,2,0:2,0,1,0:1:40:33:60,60:37:37,37:2:30 89 | -------------------------------------------------------------------------------- /bin/bio-vcf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # 3 | # bio-vcf parser and transformer 4 | # Author:: Pjotr Prins 5 | # License:: MIT 6 | # 7 | # Copyright (C) 2014-2021 Pjotr Prins 8 | 9 | USAGE = "Vcf parser" 10 | 11 | gempath = File.dirname(File.dirname(__FILE__)) 12 | $: << File.join(gempath,'lib') 13 | 14 | VERSION_FILENAME=File.join(gempath,'VERSION') 15 | version = File.new(VERSION_FILENAME).read.chomp 16 | 17 | require 'bio-vcf' 18 | require 'bio-vcf/pcows' 19 | require 'optparse' 20 | require 'timeout' 21 | require 'fileutils' 22 | require 'json' 23 | 24 | # Uncomment when using the bio-logger 25 | # require 'bio-logger' 26 | # log = Bio::Log::LoggerPlus.new 'vcf' 27 | # log.outputters = Bio::Log::Outputter.stderr 28 | # Bio::Log::CLI.logger('stderr') 29 | # Bio::Log::CLI.trace('info') 30 | 31 | options = { show_help: false, source: 'https://github.com/vcflib/bio-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 } 32 | opts = OptionParser.new do |o| 33 | o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. 
#{File.basename($0)} < test/data/input/somaticsniper.vcf" 34 | 35 | o.on('-i','--ignore-missing', 'Ignore missing data') do 36 | options[:ignore_missing] = true 37 | end 38 | o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd| 39 | options[:filter] = cmd 40 | end 41 | 42 | o.on('--sfilter cmd',String, 'Evaluate filter on each sample') do |cmd| 43 | options[:sfilter] = cmd 44 | end 45 | o.on("--sfilter-samples list", Array, "Filter on selected samples (e.g., 0,1") do |l| 46 | options[:sfilter_samples] = l 47 | end 48 | 49 | o.on('--ifilter cmd','--if cmd',String, 'Include filter') do |cmd| 50 | options[:ifilter] = cmd 51 | end 52 | o.on("--ifilter-samples list", Array, "Include set - implicitely defines exclude set") do |l| 53 | options[:ifilter_samples] = l 54 | end 55 | 56 | o.on('--efilter cmd','--ef cmd',String, 'Exclude filter') do |cmd| 57 | options[:efilter] = cmd 58 | end 59 | o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l| 60 | options[:efilter_samples] = l 61 | end 62 | o.on('--add-filter name',String, 'Set/add filter field to name') do |name| 63 | options[:add_filter] = name 64 | end 65 | 66 | o.on("--bed bedfile", String, "Filter on BED elements") do |bed| 67 | options[:bed] = bed 68 | end 69 | 70 | o.on('-e cmd', '--eval cmd',String, 'Evaluate command on each record') do |cmd| 71 | options[:eval] = cmd 72 | end 73 | o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd| 74 | options[:eval_once] = true 75 | options[:eval] = cmd 76 | # options[:num_threads] = 1 77 | # options[:thread_lines] = 1 78 | options[:skip_header] = true 79 | end 80 | o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd| 81 | options[:seval] = cmd 82 | options[:skip_header] = true 83 | end 84 | o.on("--rewrite eval", "Rewrite INFO") do |s| 85 | options[:rewrite] = s 86 | end 87 | o.on("--samples list", Array, "Output selected samples") do |l| 88 | options[:samples] = l 89 | 
end 90 | o.on("--json", "Try to coerce header into JSON (for records check out --template!)") do |b| 91 | options[:json] = true 92 | options[:skip_header] = true 93 | end 94 | o.on("--rdf", "Try to coerce header into Turtle RDF (requires RDF --template!)") do |b| 95 | require 'bio-vcf/vcfrdf' 96 | options[:rdf] = true 97 | options[:skip_header] = true 98 | end 99 | o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i| 100 | options[:num_threads] = i 101 | end 102 | o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i| 103 | options[:thread_lines] = i 104 | end 105 | o.on_tail("--id name", String, "Identifier") do |s| 106 | options[:id] = s 107 | end 108 | o.on_tail("--tags list", String, "Add tags") do |s| 109 | options[:tags] = s 110 | end 111 | 112 | o.on("--skip-header", "Do not output VCF header info") do 113 | options[:skip_header] = true 114 | end 115 | 116 | o.on("--set-header list", Array, "Set a special tab delimited output header (#samples expands to sample names)") do |list| 117 | options[:set_header] = list 118 | options[:skip_header] = true 119 | end 120 | 121 | o.on("-t erb","--template erb",String, "Use ERB template for output") do |s| 122 | require 'bio-vcf/vcfrdf' 123 | require 'erb' 124 | options[:template] = s 125 | options[:skip_header] = true 126 | end 127 | 128 | o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t| 129 | options[:tag] = true 130 | end 131 | 132 | o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i| 133 | options[:timeout] = i 134 | end 135 | 136 | # Uncomment the following when using the bio-logger 137 | # o.separator "" 138 | # o.on("--logger filename",String,"Log to file (default stderr)") do | name | 139 | # Bio::Log::CLI.logger(name) 140 | # end 141 | # 142 | # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s | 143 | # 
Bio::Log::CLI.trace(s) 144 | # end 145 | # 146 | o.on("--names", "Output sample names") do |q| 147 | options[:quiet] = true 148 | options[:num_threads] = nil 149 | options[:eval_once] = true 150 | options[:eval] = 'header.samples.join("\t")' 151 | # options[:num_threads] = 1 152 | # options[:thread_lines] = 1 153 | options[:skip_header] = true 154 | end 155 | o.on("--statistics", "Output statistics") do |q| 156 | options[:statistics] = true 157 | options[:num_threads] = nil 158 | end 159 | o.on("-q", "--quiet", "Run quietly") do |q| 160 | # Bio::Log::CLI.trace('error') 161 | options[:quiet] = true 162 | end 163 | 164 | o.on("-v", "--verbose", "Run verbosely") do |v| 165 | options[:verbose] = true 166 | end 167 | 168 | o.on("--debug", "Show debug messages and keep intermediate output") do |v| 169 | # Bio::Log::CLI.trace('debug') 170 | options[:debug] = true 171 | end 172 | 173 | o.separator "" 174 | o.on_tail('-h', '--help', 'display this help and exit') do 175 | options[:show_help] = true 176 | end 177 | end 178 | 179 | opts.parse!(ARGV) 180 | 181 | BIOVCF_VERSION=version 182 | BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2021\n" 183 | $stderr.print BIOVCF_BANNER if !options[:quiet] 184 | 185 | if options[:show_help] 186 | print opts 187 | print USAGE 188 | exit 1 189 | end 190 | 191 | if RUBY_VERSION =~ /^1/ 192 | $stderr.print "WARNING: bio-vcf does not run on Ruby 1.x\n" 193 | end 194 | 195 | $stderr.print "Options: ",options,"\n" if !options[:quiet] 196 | 197 | if options[:template] 198 | include BioVcf::RDF 199 | require 'bio-vcf/template' 200 | fn = options[:template] 201 | raise "No template #{fn}!" 
if not File.exist?(fn) 202 | # template = ERB.new(File.read(fn)) 203 | template = Bio::Template.new(fn,options[:json]) 204 | end 205 | 206 | stats = nil 207 | if options[:statistics] 208 | options[:num_threads] = nil 209 | stats = BioVcf::VcfStatistics.new 210 | end 211 | 212 | # Check for option combinations: a *-samples list is rejected unless its matching filter expression is also given 213 | raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter] 214 | raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter] 215 | raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter] 216 | # raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter] 217 | # raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter] 218 | 219 | if options[:samples] 220 | samples = options[:samples].map { |s| s.to_i } 221 | end 222 | 223 | include BioVcf 224 | 225 | # Parse the header section of a VCF file (chomping STDIN). `line` is a header line supplied by the caller and is added first; further lines are read from STDIN until one does not start with '#'. Header lines are echoed to stdout unless options[:skip_header] is set. `samples` (optional) is a list of zero-based sample indexes used to trim the #CHROM column-names line. Returns [header, line] where line is the first non-header line read, or nil when no record line was read. 226 | def parse_header line, samples, options 227 | header = VcfHeader.new(options[:debug]) 228 | header.add(line) 229 | do_parse_header = !options[:skip_header] 230 | print line if do_parse_header 231 | STDIN.each_line do | headerline | 232 | if headerline !~ /^#/ 233 | # If no records in VCF, we never get here 234 | line = headerline 235 | break # end of header 236 | end 237 | header.add(headerline) 238 | if do_parse_header 239 | if headerline =~ /^#CHR/ 240 | # Parse the column names line #CHROM POS ID REF ALT QUAL... 
241 | # and form the selected header.column_names 242 | # 243 | # The header before actual data contains the sample names, 244 | # therefore first inject the BioVcf meta information 245 | print header.tag(options),"\n" if options[:tag] 246 | # Then the additional filter(s) 247 | # ##FILTER= (NOTE(review): this comment and the literal printed below look truncated in this copy - confirm the full FILTER header text against the repository) 248 | add_filter = options[:add_filter] 249 | if add_filter 250 | print "##FILTER=\n" 251 | end 252 | 253 | selected = header.column_names 254 | if samples 255 | newfields = selected[0..8] 256 | samples.each do |s| 257 | newfields << selected[s+9] # sample columns follow the first nine columns, so index s maps to column s+9 258 | end 259 | selected = newfields 260 | end 261 | print "#",selected.join("\t"),"\n" 262 | else 263 | print headerline 264 | end 265 | end 266 | end 267 | print header.printable_header_line(options[:set_header]),"\n" if options[:set_header] 268 | VcfRdf::header if options[:rdf] 269 | if line =~ /^#/ 270 | # We did not read a record (line still holds a header line) 271 | line = nil 272 | end 273 | return header,line 274 | end 275 | 276 | # Parse a VCF line and return the (template) result as a string buffer 277 | # This is the main work horse that parses through every VCF record: 278 | def parse_line line,header,options,bedfilter,samples,template,stats=nil 279 | fields = VcfLine.parse(line) 280 | rec = VcfRecord.new(fields,header) 281 | r = rec # alias 282 | 283 | filter = options[:filter] 284 | sfilter = options[:sfilter] 285 | efilter = options[:efilter] 286 | ifilter = options[:ifilter] 287 | add_filter = options[:add_filter] # contains a filter name (soft filter) 288 | seval = options[:seval] 289 | ignore_missing = options[:ignore_missing] 290 | quiet = options[:quiet] 291 | set_filter_field = nil 292 | 293 | if sfilter or efilter or ifilter or seval 294 | # check for samples 295 | header_samples = header.column_names[9..-1] 296 | raise "Empty sample list, can not execute query!" 
if not header_samples 297 | end 298 | 299 | # -------------------------- 300 | # Filtering and set analysis 301 | if bedfilter 302 | bed = bedfilter.contains(rec) 303 | return if not bed 304 | end 305 | 306 | skip = lambda { |&m| 307 | matched = m.call 308 | if add_filter 309 | set_filter_field = true if matched 310 | false # always continue processing with an add-filter 311 | else 312 | not matched 313 | end 314 | } 315 | 316 | if filter 317 | return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) } 318 | end 319 | 320 | if sfilter # sample 'or' filter 321 | rec.each_sample(options[:sfilter_samples]) do | sample | 322 | return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) } 323 | end 324 | end 325 | 326 | if ifilter # include sample filter 327 | found = false 328 | rec.each_sample(options[:ifilter_samples]) do | sample | 329 | if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet) 330 | found = true 331 | break 332 | end 333 | end 334 | # Skip if there are no matches 335 | return if skip.call {found} 336 | end 337 | 338 | if efilter # exclude sample filter 339 | rec.each_sample(options[:efilter_samples]) do | sample | 340 | return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) } 341 | end 342 | end 343 | 344 | stats.add(rec) if stats 345 | 346 | # ----------------------------- 347 | # From here on decide on output 348 | 349 | rec.add_to_filter_field(add_filter) if set_filter_field 350 | 351 | if samples 352 | # Select certain samples for output 353 | newfields = fields[0..8] 354 | samples.each do |s| 355 | newfields << fields[s+9] 356 | end 357 | fields = newfields 358 | end 359 | if options[:eval] or seval 360 | begin 361 | results = nil # result string 362 | if options[:eval] 363 | res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet) 364 | results = res if res 365 | end 366 | if seval 367 | list = 
(results ? [] : [rec.chr,rec.pos]) 368 | rec.each_sample(options[:sfilter_samples]) { | sample | 369 | list << sample.eval(seval,ignore_missing_data: ignore_missing,quiet: quiet) 370 | } 371 | results = (results ? results.to_s + "\t" : "" ) + list.join("\t") 372 | end 373 | rescue => e 374 | $stderr.print "\nLine: ",line 375 | $stderr.print "ERROR evaluating --eval <#{options[:eval]}> #{e.message}\n" 376 | raise if options[:verbose] 377 | exit 1 378 | end 379 | if results 380 | str = if options[:json] 381 | results.to_json 382 | else 383 | results.to_s 384 | end 385 | return str+"\n" 386 | end 387 | else 388 | if options[:rdf] 389 | # Output Turtle RDF 390 | VcfRdf::record(options[:id],rec,options[:tags]) 391 | elsif options[:template] 392 | # Use ERB template 393 | begin 394 | template.body(binding) 395 | rescue Exception => e 396 | $stderr.print e,": ",fields,"\n" 397 | $stderr.print e.backtrace.inspect if options[:verbose] 398 | raise 399 | end 400 | elsif options[:rewrite] 401 | # Default behaviour prints VCF line, but rewrite info 402 | eval(options[:rewrite]) 403 | (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n" 404 | elsif stats 405 | # do nothing 406 | else 407 | # Default behaviour prints VCF line 408 | fields.join("\t")+"\n" 409 | end 410 | end 411 | end 412 | 413 | CHUNK_SIZE = options[:thread_lines] 414 | 415 | pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout], 416 | options[:quiet],options[:debug]) 417 | header = nil 418 | header_output_completed = false 419 | chunk_lines = [] 420 | line_number=0 421 | 422 | if options[:bed] 423 | bedfilter = BedFilter.new(options[:bed]) 424 | end 425 | 426 | begin 427 | # Define linear parser function (going through one chunk) 428 | process = lambda { | lines | 429 | res = [] 430 | lines.each do | line | 431 | res << parse_line(line,header,options,bedfilter,samples,template,stats) 432 | end 433 | res 434 | } 435 | 436 | # ---- Main loop 437 | STDIN.each_line do | line | 438 | 
line_number += 1 439 | 440 | # ---- Skip embedded headers down the line... 441 | next if header_output_completed and line =~ /^#/ 442 | 443 | # ---- In the following section header information is handled - 444 | # this only happens once. 445 | 446 | # ---- Parse the header lines (chomps from STDIN) 447 | # and returns header info and the current line 448 | if line =~ /^#/ 449 | header, line = parse_header(line,samples,options) 450 | if line.nil? 451 | # No line after header, to there are no records to process 452 | break 453 | end 454 | end 455 | # p [line_number,line] 456 | # ---- After the header continue processing 457 | if not header_output_completed 458 | # one-time post-header processing 459 | if not options[:efilter_samples] and options[:ifilter_samples] 460 | # Create exclude set as a complement of include set 461 | options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples] 462 | end 463 | print template.header(binding) if template 464 | header_output_completed = true 465 | end 466 | 467 | if options[:eval_once] 468 | # this happens if we only want one line evaluated - say to get 469 | # the number of samples, or parse an item in the header 470 | print parse_line(line,header,options,bedfilter,samples,template,stats) 471 | exit 0 472 | end 473 | 474 | # ---- Lines are collected in one buffer and the lines buffer 475 | # is added to the chunks list (for the threads) 476 | chunk_lines << line 477 | 478 | # ---- In the following section the VCF lines are parsed by chunks 479 | # The chunks may go into different threads 480 | 481 | if chunk_lines.size >= CHUNK_SIZE 482 | # ---- process one chunk 483 | $stderr.print '.' 
if not options[:quiet] 484 | pcows.wait_for_worker_slot() 485 | pcows.submit_worker(process,chunk_lines) 486 | pcows.process_output() 487 | 488 | chunk_lines = [] 489 | end 490 | end 491 | pcows.submit_final_worker(process,chunk_lines) 492 | pcows.wait_for_workers() 493 | pcows.process_remaining_output() 494 | 495 | print template.footer(binding) if template 496 | stats.print if stats 497 | 498 | rescue Exception => e 499 | if e.message != 'exit' 500 | $stderr.print "ERROR: " 501 | $stderr.print e.message,"\n" 502 | end 503 | pcows.cleanup() 504 | raise if options[:verbose] 505 | exit 1 506 | end 507 | --------------------------------------------------------------------------------