├── test2.vcf
├── VERSION
├── ragel
    ├── .gitignore
    ├── generate.sh
    └── gen_vcfheaderline_parser.rl
├── features
    ├── step_definitions
    │   ├── bio-vcf_steps.rb
    │   ├── cli-feature.rb
    │   ├── vcf_header.rb
    │   ├── diff_count.rb
    │   ├── somaticsniper.rb
    │   ├── sfilter.rb
    │   └── multisample.rb
    ├── support
    │   └── env.rb
    ├── filter.feature
    ├── diff_count.feature
    ├── vcf_header.feature
    ├── sfilter.feature
    ├── somaticsniper.feature
    ├── cli.feature
    └── multisample.feature
├── .travis.yml
├── lib
    ├── bio-vcf
    │   ├── vcf.rb
    │   ├── vcfline.rb
    │   ├── utils.rb
    │   ├── vcfstatistics.rb
    │   ├── variant.rb
    │   ├── bedfilter.rb
    │   ├── vcffile.rb
    │   ├── template.rb
    │   ├── vcfrdf.rb
    │   ├── vcfsample.rb
    │   ├── vcfheader.rb
    │   ├── vcfgenotypefield.rb
    │   ├── vcfrecord.rb
    │   └── pcows.rb
    ├── regressiontest.rb
    ├── bio-vcf.rb
    └── regressiontest
    │   └── cli_exec.rb
├── doc
    ├── json.png
    ├── pcows.org
    ├── Compare_VCFs.md
    ├── Using_RDF.md
    ├── GATK_comparison.md
    └── Using_Mongo.md
├── test
    ├── data
    │   ├── input
    │   │   ├── empty.vcf
    │   │   └── somaticsniper.vcf
    │   └── regression
    │   │   ├── empty.ref
    │   │   ├── thread4_4_failed_filter-stderr.ref
    │   │   ├── ifilter_s.dp.ref
    │   │   ├── sfilter_seval_s.dp.ref
    │   │   ├── seval_s.dp.ref
    │   │   ├── eval_once.ref
    │   │   ├── eval_r.info.dp.ref
    │   │   └── pass1.ref
    ├── stress
    │   └── stress_test.sh
    └── performance
    │   └── metrics.md
├── Gemfile
├── .gitignore
├── template
    ├── vcf2rdf.erb
    ├── vcf2json.erb
    ├── vcf2json_full_header.erb
    ├── vcf2rdf_header.erb
    ├── vcf2json_expanded.erb
    ├── gatk_vcf2rdf.erb
    └── vcf2json_use_meta.erb
├── Rakefile
├── LICENSE
├── RELEASE_NOTES.md
├── bio-vcf.gemspec
├── guix.scm
└── bin
    └── bio-vcf


/test2.vcf:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.9.6
2 | 


--------------------------------------------------------------------------------
/ragel/.gitignore:
--------------------------------------------------------------------------------
1 | *.rb
2 | 


--------------------------------------------------------------------------------
/features/step_definitions/bio-vcf_steps.rb:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | 
3 | arch: arm64
4 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/vcf.rb:
--------------------------------------------------------------------------------
1 | # Top-level namespace module of the bio-vcf library.
2 | module BioVcf
3 | end
4 | 


--------------------------------------------------------------------------------
/doc/json.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vcflib/bio-vcf/HEAD/doc/json.png


--------------------------------------------------------------------------------
/test/data/input/empty.vcf:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.0
2 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
3 | 


--------------------------------------------------------------------------------
/test/data/regression/empty.ref:
--------------------------------------------------------------------------------
1 | ##fileformat=VCFv4.0
2 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
3 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
 1 | source "http://rubygems.org"
 2 | 
 3 | group :development do
 4 |   gem "rake"
 5 |   gem "rspec"
 6 |   gem "cucumber"
 7 | end
 8 | 
 9 | 
10 | 


--------------------------------------------------------------------------------
/ragel/generate.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | # Regenerate the Ragel header-line parser and copy it into lib/bio-vcf.
3 | ragel -R gen_vcfheaderline_parser.rl || exit 1
4 | 
5 | # Run the generated file; stop here so a failing parser is not installed
6 | ruby gen_vcfheaderline_parser.rb || exit 1
7 | 
8 | cp gen_vcfheaderline_parser.rb ../lib/bio-vcf/vcfheader_line.rb
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | file.vcf
 2 | Gemfile.lock
 3 | notes.txt
 4 | .bundle
 5 | vendor/
 6 | orig/
 7 | pkg/
 8 | scripts/timings.sh
 9 | TAGS
10 | test/data/regression/*.new
11 | data/
12 | fedor.txt
13 | test.vcf
14 | out.bed
15 | rdoc/
16 | *.gem
17 | 


--------------------------------------------------------------------------------
/test/data/regression/thread4_4_failed_filter-stderr.ref:
--------------------------------------------------------------------------------
1 | Unknown field name <t> in record, did you mean r.info.t?
2 | Unknown field name <t> in record, did you mean r.info.t?
3 | Unknown field name <t> in record, did you mean r.info.t?
4 | Unknown field name <t> in record, did you mean r.info.t?
5 | ERROR: execution expired
6 | 


--------------------------------------------------------------------------------
/template/vcf2rdf.erb:
--------------------------------------------------------------------------------
 1 | <%
 2 |   id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_')) 
 3 | %>
 4 | :<%= id %>
 5 |   :query_id    "<%= id %>";
 6 |   seq:chr      "<%= rec.chrom %>" ;
 7 |   seq:pos      <%= rec.pos %> ;
 8 |   seq:ref      "<%= rec.ref %>" ;
 9 |   seq:alt      "<%= rec.alt[0] %>" ;
10 |   seq:dp       <%= rec.info.dp %> ;
11 |   db:vcf       true .
12 | 
13 | 


--------------------------------------------------------------------------------
/features/support/env.rb:
--------------------------------------------------------------------------------
 1 | 
 2 | $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
 3 | require 'bio-vcf'
 4 | 
 5 | require 'rspec/expectations'
 6 | 
 7 | # Add the regression module if in the path (it can also be a gem)
 8 | rootdir = File.dirname(__FILE__) + '/../..'
 9 | $LOAD_PATH.unshift(rootdir+'/lib/regressiontest',rootdir+'/../regressiontest/lib')
10 | require 'regressiontest'
11 | 
12 | include BioVcf
13 | 


--------------------------------------------------------------------------------
/lib/regressiontest.rb:
--------------------------------------------------------------------------------
 1 | # Please require your code below, respecting the naming conventions in the
 2 | # bioruby directory tree.
 3 | #
 4 | # For example, say you have a plugin named bio-plugin, the only uncommented
 5 | # line in this file would be 
 6 | #
 7 | #   require 'bio/bio-plugin/plugin'
 8 | #
 9 | # In this file only require other files. Avoid other source code.
10 | 
11 | require 'regressiontest/cli_exec'
12 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/vcfline.rb:
--------------------------------------------------------------------------------
 1 | module BioVcf
 2 |   module VcfLine
 3 | 
 4 |     # Split a line into fields and check size
 5 |     def VcfLine.parse line,expected_size=nil
 6 |       fields = line.strip.split(/\t/)
 7 |       raise "Unexpected line #{line}" if line.strip.size == 0 or fields.size < 6
 8 |       raise "Expected #{expected_size} fields but got #{fields.size} in "+fields.to_s if expected_size and fields.size != expected_size
 9 |       fields
10 |     end
11 |   end
12 | end
13 | 


--------------------------------------------------------------------------------
/template/vcf2json.erb:
--------------------------------------------------------------------------------
 1 | =HEADER
 2 | <% require 'json' %>
 3 | {
 4 |     "HEADER": {
 5 | 	"options": <%= options.to_h.to_json %>,
 6 | 	"files": <%= ARGV %>,
 7 | 	"version": "<%= BIOVCF_VERSION %>"
 8 |     },
 9 |     "BODY": [
10 | =BODY
11 | 	{
12 | 	    "seq:chr": "<%= rec.chrom %>",
13 | 	    "seq:pos": <%= rec.pos %>,
14 | 	    "seq:ref": "<%= rec.ref %>",
15 | 	    "seq:alt": "<%= rec.alt[0] %>",
16 | 	    "dp":      <%= rec.info.dp %>
17 | 	},
18 | =FOOTER
19 |     ]
20 | }
21 | 


--------------------------------------------------------------------------------
/template/vcf2json_full_header.erb:
--------------------------------------------------------------------------------
 1 | =HEADER
 2 | <% require 'json' %>
 3 | {
 4 |     "HEADER": {
 5 | 	"options":  <%= options.to_h.to_json %>,
 6 | 	"files":    <%= ARGV %>,
 7 | 	"version":  "<%= BIOVCF_VERSION %>"
 8 |     },
 9 |     "COLUMNS": <%= header.column_names.to_json %>,
10 |     "META": <%= header.meta.to_json %>,
11 |     "BODY": [
12 | =BODY
13 | 	{
14 | 	    "seq:chr": "<%= rec.chrom %>" ,
15 | 	    "seq:pos": <%= rec.pos %> ,
16 | 	    "seq:ref": "<%= rec.ref %>" ,
17 | 	    "seq:alt": "<%= rec.alt[0] %>"
18 | 	    <% if rec.info.dp %> , "dp": <%= rec.info.dp %> <% end %>
19 | 	},
20 | =FOOTER
21 |     ]
22 | }


--------------------------------------------------------------------------------
/lib/bio-vcf/utils.rb:
--------------------------------------------------------------------------------
 1 | module BioVcf
 2 | 
 3 |   module ConvertStringToValue
 4 |     def self::integer?(str)
 5 |       !!Integer(str) rescue false
 6 |     end
 7 | 
 8 |     def self::float?(str)
 9 |       !!Float(str) rescue false
10 |     end
11 | 
12 |     def self::convert str
13 |       if str =~ /,/
14 |         str.split(/,/).map { |item| convert(item) }
15 |       else
16 |         if integer?(str)
17 |           str.to_i 
18 |         else
19 |           if float?(str)
20 |             str.to_f 
21 |           else
22 |             str
23 |           end
24 |         end
25 |       end
26 |     end
27 |   end
28 | 
29 | end
30 | 


--------------------------------------------------------------------------------
/features/filter.feature:
--------------------------------------------------------------------------------
 1 | @filter
 2 | Feature: Adding filters
 3 | 
 4 |   bio-vcf can add soft filters. Rather than removing failing items we can
 5 |   inject filter state into the FILTER field. To add state such as PASS or
 6 |   LowDepth simply use a filter and the --add-filter switch. If a filter already
 7 |   has state the new one is appended with a semi-colon.
 8 |   
 9 |   Scenario: Test the info filter using dp and threads
10 |     Given I have input file(s) named "test/data/input/somaticsniper.vcf"
11 |     When I execute "./bin/bio-vcf --add-filter PASS --filter 'r.normal.dp>5 and r.tumor.dp>7'"
12 |     Then I expect the named output to match the named output "pass1"
13 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | # encoding: utf-8
 2 | 
 3 | # require 'rubygems'
 4 | require 'rake'
 5 | # require 'cucumber/rake/task'
 6 | 
 7 | # Cucumber::Rake::Task.new(:features) do |t|
 8 |   # t.cucumber_opts = "--bundler false"
 9 | # end
10 | 
11 | desc 'Run cucumber' # without bundler
12 | task :features do
13 |   sh 'cucumber features'
14 | end
15 | 
16 | task :default => :features
17 | 
18 | task :test => [ :features ]
19 | 
20 | require 'rdoc/task'
21 | Rake::RDocTask.new do |rdoc|
22 |   version = File.exist?('VERSION') ? File.read('VERSION') : ""
23 | 
24 |   rdoc.rdoc_dir = 'rdoc'
25 |   rdoc.title = "bio-vcf #{version}"
26 |   rdoc.rdoc_files.include('README*')
27 |   rdoc.rdoc_files.include('lib/**/*.rb')
28 | end
29 | 


--------------------------------------------------------------------------------
/lib/bio-vcf.rb:
--------------------------------------------------------------------------------
 1 | # Please require your code below, respecting the naming conventions in the
 2 | # bioruby directory tree.
 3 | #
 4 | # For example, say you have a plugin named bio-plugin, the only uncommented
 5 | # line in this file would be 
 6 | #
 7 | #   require 'bio/bio-plugin/plugin'
 8 | #
 9 | # In this file only require other files. Avoid other source code.
10 | 
11 | require 'bio-vcf/utils'
12 | require 'bio-vcf/vcf'
13 | require 'bio-vcf/vcfsample'
14 | require 'bio-vcf/vcfheader_line'
15 | require 'bio-vcf/vcfheader'
16 | require 'bio-vcf/vcfline'
17 | require 'bio-vcf/vcfgenotypefield'
18 | require 'bio-vcf/vcfrecord'
19 | require 'bio-vcf/variant'
20 | require 'bio-vcf/vcfstatistics'
21 | require 'bio-vcf/bedfilter'
22 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/vcfstatistics.rb:
--------------------------------------------------------------------------------
 1 | module BioVcf
 2 | 
 3 |   class VcfStatistics
 4 | 
 5 |     def initialize
 6 |       @count = 0
 7 |       @ref_alt_count = {}
 8 |     end
 9 | 
10 |     def add rec
11 |       @count += 1
12 |       s = rec.ref+">"+rec.alt[0]
13 |       @ref_alt_count[s] ||= 0
14 |       @ref_alt_count[s] += 1
15 |     end
16 | 
17 |     def print
18 |       puts "## ==== Statistics =================================="
19 |       @ref_alt_count.sort_by {|k,v| v}.reverse.each do |k,v|
20 |         printf k+"\t%d\t%2.0d%%\n",v,(v.to_f/@count*100).round
21 |       end
22 |       puts "Total\t#{@count}"
23 |       puts "## =================================================="
24 |     end
25 |   end
26 | 
27 | end
28 | 
29 | 


--------------------------------------------------------------------------------
/test/stress/stress_test.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | #
 3 | # Stress test bio-vcf by running it on large files and comparing
 4 | # results using threads
 5 | 
 6 | input=test/data/input/multisample.vcf
 7 | # Double quotes keep the '>' inside the value. With nested single quotes
 8 | # the shell read '>70 ...' as a redirection and cut the filter short.
 9 | # $filter is expanded unquoted below so it splits into separate arguments.
10 | filter="--sfilter s.dp>70 --seval s.dp"
11 | 
12 | echo "cat $input | ./bin/bio-vcf --num-threads 1 $filter > stress_simple01.vcf"
13 | cat $input | ./bin/bio-vcf --num-threads 1 $filter > stress_simple01.vcf
14 | cat $input | ./bin/bio-vcf --num-threads 2 $filter > stress_simple02.vcf
15 | cat $input | ./bin/bio-vcf --num-threads 4 $filter > stress_simple03.vcf
16 | cat $input | ./bin/bio-vcf $filter > stress_simple04.vcf
17 | cat $input | ./bin/bio-vcf --thread-lines 3 $filter > stress_simple05.vcf
18 | cat $input | ./bin/bio-vcf --thread-lines 1 $filter > stress_simple06.vcf
19 | 


--------------------------------------------------------------------------------
/template/vcf2rdf_header.erb:
--------------------------------------------------------------------------------
 1 | =HEADER
 2 | @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
 3 | @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
 4 | @prefix dc: <http://purl.org/dc/elements/1.1/> .
 5 | @prefix hgnc: <http://identifiers.org/hgnc.symbol/> .
 6 | @prefix doi: <http://dx.doi.org/> .
 7 | @prefix seq: <http://biobeat.org/rdf/seq#> .
 8 | @prefix db: <http://biobeat.org/rdf/db#> .
 9 | @prefix : <http://biobeat.org/rdf/dbsnp#> .
10 | 
11 | =BODY
12 | <%
13 |   id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_')) 
14 | %>
15 | :<%= id %>
16 |   :query_id    "<%= id %>";
17 |   seq:chr      "<%= rec.chrom %>" ;
18 |   seq:pos      <%= rec.pos %> ;
19 |   seq:ref      "<%= rec.ref %>" ;
20 |   seq:alt      "<%= rec.alt[0] %>" ;
21 |   seq:dp       <%= rec.info.dp %> ;
22 |   db:vcf       true .
23 | 
24 | =FOOTER


--------------------------------------------------------------------------------
/template/vcf2json_expanded.erb:
--------------------------------------------------------------------------------
 1 | =HEADER
 2 | <% require 'json' %>
 3 | {
 4 |     "HEADER": {
 5 | 	"options": <%= options.to_h.to_json %>,
 6 | 	"files": <%= ARGV %>,
 7 | 	"version": "<%= BIOVCF_VERSION %>"
 8 |     },
 9 |     "BODY": [
10 | =BODY
11 | 	{
12 | 	    "CHR": "<%= rec.chrom %>",
13 | 	    "POS": <%= rec.pos %>,
14 | 	    "REF": "<%= rec.ref %>",
15 | 	    "ALT": <%= rec.alt %>,
16 | 	    "QUAL": <%= rec.qual %>,
17 | 
18 | 	    "DP":  <%= rec.info.dp %>,
19 | 	    "AF":  <%= rec.info.af %>,
20 | 	    "AN":  <%= rec.info.an %>,
21 | 	    "MQ":  <%= rec.info.mq %>,
22 | 	    "QD":  <%= rec.info.qd %>,
23 | 	    "BaseQRankSum":  <%= rec.info.baseqranksum %>,
24 | 	    "HaplotypeScore":  <%= rec.info.HaplotypeScore %>,
25 | 
26 |             "samples" : { <% rec.each_sample do |s| %>
27 |                "<%= s.name %>": {
28 |                  "GT": "<%= s.gt %>",
29 |                  "AD": <%= s.ad %>,
30 |                  "DP": <%= s.dp %>
31 |                } <%= (s.is_last? ? "" : ",") %>
32 |                <% end %>
33 |             }
34 | 
35 | 	}
36 | =FOOTER
37 |     ]
38 | }
39 | 


--------------------------------------------------------------------------------
/template/gatk_vcf2rdf.erb:
--------------------------------------------------------------------------------
 1 | <%
 2 |   # Writes one Turtle entry for the record, followed by one entry for
 3 |   # each sample that is not empty
 4 |   id = Turtle::mangle_identifier(['ch'+rec.chrom,rec.pos,rec.alt.join('')].join('_')) 
 5 |   sample_num = 0
 6 | %>
 7 | :<%= id %>
 8 |   :query_id    "<%= id %>";
 9 |   seq:chr      "<%= rec.chrom %>" ;
10 |   seq:pos      <%= rec.pos %> ;
11 |   seq:ref      "<%= rec.ref %>" ;
12 |   seq:alt      "<%= rec.alt[0] %>" ;
13 |   db:gatk      true .
14 | 
15 | <% rec.each_sample do | s | %>
16 |   <%
17 |     # header.samples is looked up by position, so advance the index for
18 |     # every sample -- also the empty ones that produce no output.
19 |     # Otherwise each sample after an empty one gets the wrong name.
20 |     sample_name = header.samples[sample_num]
21 |     sample_num += 1
22 |     if not s.empty?
23 |     sample_id = id + '_' + Turtle::mangle_identifier(sample_name)
24 |     if s.ad[0]+s.ad[1] != 0
25 |       alt_bias = (s.ad[1].to_f/(s.ad[0]+s.ad[1])).round(2) 
26 |     end
27 |   %>
28 | :<%= sample_id %>
29 |   :call_id     :<%= id %> ;
30 |   sample:name  "<%= sample_name  %>" ;
31 |   sample:gt    "<%= s.gt %>" ;
32 |   <% s.gti.each do | index | %>
33 |   sample:ad<%= index %>    <%= s.ad[index] %> ;
34 |   sample:gts<%= index %>   "<%= s.gts[index] %>" ;
35 |   <% end %>
36 |   sample:dp        <%= s.dp %> ;
37 |   sample:alt_bias  <%= alt_bias %> .
38 |   <% end %>
39 | <% end %>
40 | 
41 | 
42 | 


--------------------------------------------------------------------------------
/features/step_definitions/cli-feature.rb:
--------------------------------------------------------------------------------
 1 | # Cucumber steps that assemble the shell command line under test
 2 | Given /^I have input file\(s\) named "(.*?)"$/ do |arg1|
 3 |   @filenames = arg1.split(/,/)
 4 | end
 5 | # Only the first input file is redirected into the command's stdin
 6 | When /^I execute "(.*?)"$/ do |arg1|
 7 |   @cmd = arg1 + ' < ' + @filenames[0]
 8 | end
 9 | 
10 | # Then(/^I expect the named output to match the named output "(.*?)"$/) do |arg1|
11 | #   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)').should be_truthy
12 | # end
13 | 
14 | # Then(/^I expect the named output to match the named output "([^"]*)" in under (\d+) seconds$/) do |arg1, arg2|
15 | #   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(##BioVcf|date|"version":)',timeout: arg2.to_i).should be_truthy
16 | # end
17 | 
18 | 
19 | # Then(/^I expect an error and the named output to match the named output "(.*?)" in under (\d+) seconds$/) do |arg1,arg2|
20 | #   RegressionTest::CliExec::exec(@cmd,arg1,ignore: '(FATAL|Waiting|from|vcf|Options|Final pid)',should_fail: true,timeout:arg2.to_i).should be_truthy
21 | # end
22 | 
23 | # Then(/^I expect no errors$/) do 
24 | #   RegressionTest::CliExec::exec(@cmd, "empty").should be_truthy
25 | # end
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013-2021 Pjotr Prins <pjotr.public68@thebird.nl>
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/RELEASE_NOTES.md:
--------------------------------------------------------------------------------
 1 | ## ChangeLog v0.9.6 (?)
 2 | 
 3 | + Added JSON VCF header output with --json switch
 4 | 
 5 | ## ChangeLog v0.9.5 (20210118)
 6 | 
 7 | + Improved README and installation instructions
 8 | + Added guix.scm build and instructions (no need for bundler)
 9 | + Moved regressiontest into tree
10 | 
11 | ## ChangeLog v0.9.4 (20201222)
12 | 
13 | This is an important maintenance release of bio-vcf:
14 | 
15 | + Rename bioruby-vcf to bio-vcf and migrate project to [vcflib](https://github.com/vcflib/bio-vcf)
16 | + Fixed tests to match recent Ruby updates
17 | 
18 | ## Older release notes
19 | 
20 | + Getting ready for a 1.0 release
21 | + Released 0.9.2 as a gem
22 | + 0.9.1 removed a rare threading bug and cleanup on error
23 | + Added support for soft filters (request by Brad Chapman)
24 | + The outputter now writes (properly) in parallel with the parser
25 | + bio-vcf turns any VCF into JSON with header information, and
26 |   allows you to pipe that JSON directly into any JSON supporting
27 |   language, including Python and Javascript!
28 | 
29 | ## Older changes
30 | 
31 | For older changes view the git [log](https://github.com/vcflib/bio-vcf/commits/master).
32 | 


--------------------------------------------------------------------------------
/template/vcf2json_use_meta.erb:
--------------------------------------------------------------------------------
 1 | =HEADER
 2 | <% require 'json' %>
 3 | {
 4 |     "HEADER": {
 5 | 	"options":<%= options.to_h.to_json %>,
 6 | 	"files": <%= ARGV %>,
 7 | 	"version": "<%= BIOVCF_VERSION %>"
 8 |     },
 9 |     "COLUMNS": <%= header.column_names.to_json %>,
10 |     "META": <%= header.meta.to_json %>,
11 |     "BODY": [
12 | =BODY
13 | 	<% sample_num = 0; emitted = 0 # emitted counts the samples written so far
14 | 	sample_name = nil
15 | 	sample_size = header.samples.size
16 | 	%>
17 | 	{
18 | 	    "seq:chr": "<%= rec.chrom %>" ,
19 | 	    "seq:pos": <%= rec.pos %> ,
20 | 	    "seq:ref": "<%= rec.ref %>" ,
21 | 	    "seq:alt": "<%= rec.alt[0] %>"
22 | 	    <% if rec.info.dp %> , "dp": <%= rec.info.dp %> <% end %>,
23 | 	    "samples" : {
24 | 		<% rec.each_sample do |s| %>
25 | 		<% sample_name = header.samples[sample_num]
26 | 		sample_num += 1 # header position advances for empty samples too
27 | 		if not s.empty? %>
28 | 		<%= (emitted!=0 ? "," : "" ) %>
29 | 		<% emitted += 1%>
30 | 		"<%= sample_name %>": {
31 | 		    <% header.meta['FORMAT'].each_key do |k| %>
32 | 		    "<%= k %>": <%= s[k].to_json %><%= (k==header.meta['FORMAT'].keys.last ? "" : "," ) %>
33 | 		    <% end %>
34 | 		}
35 | 		<% end %>
36 | 		<% end %>
37 | 	    }
38 | 	},
39 | =FOOTER
40 |     ]
41 | }
42 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/variant.rb:
--------------------------------------------------------------------------------
 1 | module BioVcf
 2 | 
 3 |   module Variant
 4 | 
 5 |     def Variant.diff normal,tumor
 6 |       tumor.each_with_index.map {|t,i| t-normal[i]}
 7 |     end
 8 | 
 9 |     def Variant.threshold_diff t,normal,tumor
10 |       normal2,tumor2 = apply_threshold(t,normal,tumor)
11 |       diff(normal2,tumor2)
12 |     end
13 | 
14 |     def Variant.relative_diff normal,tumor
15 |       d = diff(normal,tumor)
16 |       total = tumor.each_with_index.map {|t,i| t+normal[i]}
17 |       total.each_with_index.map {|t,i| (t==0 ? 0 : ((d[i].to_f/t)*100.0).round/100.0)}
18 |     end
19 | 
20 |     def Variant.relative_threshold_diff t,normal,tumor
21 |       normal2,tumor2 = apply_threshold(t,normal,tumor)
22 |       relative_diff(normal2,tumor2)
23 |     end
24 | 
25 |     def Variant.index normal,tumor
26 |       rd = relative_diff(normal,tumor) 
27 |       max = rd.reduce(0){|mem,v| (v>mem ? v : mem) }
28 |       rd.index(max)
29 |     end
30 | 
31 |     def Variant.apply_threshold t,normal,tumor
32 |       normal2 = normal.map{|v| (v>t ? 0 : v) }
33 |       tumor2 = tumor.each_with_index.map{|v,i| (normal2[i]==0 ? 0 : v) }
34 |       return normal2,tumor2
35 |     end
36 |   end
37 | 
38 | end
39 | 


--------------------------------------------------------------------------------
/test/data/regression/ifilter_s.dp.ref:
--------------------------------------------------------------------------------
 1 | 1	10257	159	242	249	249	186	212	218
 2 | 1	10291	165	249	249	247	161	163	189
 3 | 1	10297	182	246	250	246	165	158	183
 4 | 1	10303	198	247	248	248	172	157	182
 5 | 1	10315	212	246	242	245	190	157	189
 6 | 1	10321	218	246	248	248	193	164	196
 7 | 1	10327	237	238	229	237	209	183	210
 8 | 1	12783	58	164	144	182	126	103	158
 9 | 1	13116	32	131	102	152	104	88	109
10 | 1	13118	34	129	101	145	99	85	108
11 | 1	13178	52	172	137	172	129	119	148
12 | 1	13302	36	136	99	146	90	65	117
13 | 1	13757	53	201	181	250	152	130	182
14 | 1	13868	75	192	182	224	142	111	167
15 | 1	13896	62	135	143	175	112	81	121
16 | 1	14354	43	158	115	145		72	119
17 | 1	14464	51	155	141	150	83	89	140
18 | 1	14673	36	142	117	157	95	76	131
19 | 1	14699	43	128	109	147	98	78	114
20 | 1	14907	57	216	162	205	153	118	158
21 | 1	14930	68	216	170	210	136	125	164
22 | 1	14933	68	216	169	212	132	128	164
23 | 1	14948	63	192	181	211	129	121	153
24 | 1	14976	56	166	161	196	109	116	135
25 | 1	15118	46	198	129	230	113	126	158
26 | 1	15190	53	208	170	200	126	145	179
27 | 1	15211	54	183	161	171	120	134	168
28 | 1	15274	37	121	102	137	71	67	98
29 | 1	15447	46	242	183	226	137	173	175
30 | 1	15688	37	182	147	184	100	101	148
31 | 1	16103	50	79	86	106	60	61	84
32 | 


--------------------------------------------------------------------------------
/test/data/regression/sfilter_seval_s.dp.ref:
--------------------------------------------------------------------------------
 1 | 1	10257	159	242	249	249	186	212	218
 2 | 1	10291	165	249	249	247	161	163	189
 3 | 1	10297	182	246	250	246	165	158	183
 4 | 1	10303	198	247	248	248	172	157	182
 5 | 1	10315	212	246	242	245	190	157	189
 6 | 1	10321	218	246	248	248	193	164	196
 7 | 1	10327	237	238	229	237	209	183	210
 8 | 1	12783	58	164	144	182	126	103	158
 9 | 1	13116	32	131	102	152	104	88	109
10 | 1	13118	34	129	101	145	99	85	108
11 | 1	13178	52	172	137	172	129	119	148
12 | 1	13302	36	136	99	146	90	65	117
13 | 1	13757	53	201	181	250	152	130	182
14 | 1	13868	75	192	182	224	142	111	167
15 | 1	13896	62	135	143	175	112	81	121
16 | 1	14464	51	155	141	150	83	89	140
17 | 1	14673	36	142	117	157	95	76	131
18 | 1	14699	43	128	109	147	98	78	114
19 | 1	14907	57	216	162	205	153	118	158
20 | 1	14930	68	216	170	210	136	125	164
21 | 1	14933	68	216	169	212	132	128	164
22 | 1	14948	63	192	181	211	129	121	153
23 | 1	14976	56	166	161	196	109	116	135
24 | 1	15118	46	198	129	230	113	126	158
25 | 1	15190	53	208	170	200	126	145	179
26 | 1	15211	54	183	161	171	120	134	168
27 | 1	15274	37	121	102	137	71	67	98
28 | 1	15447	46	242	183	226	137	173	175
29 | 1	15688	37	182	147	184	100	101	148
30 | 1	16068	33	57	68	81	49	49	58
31 | 1	16103	50	79	86	106	60	61	84
32 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/bedfilter.rb:
--------------------------------------------------------------------------------
 1 | module BioVcf
 2 | 
 3 |   # Answers whether a record position lies inside a region listed in a
 4 |   # BED file
 5 |   class BedFilter
 6 |     # Read the tab-separated file bedfilen (columns: chr, start, stop and
 7 |     # a fourth value stored as-is) into per-chromosome sorted stop lists.
 8 |     def initialize bedfilen
 9 |       require 'binary_search/native'
10 | 
11 |       # Parse Bed file and build up search array
12 |       chrs = {}
13 |       info = {}
14 |       # File.foreach closes the file once all lines are read, whereas
15 |       # File.open(...).each_line left the handle open
16 |       File.foreach(bedfilen) { | line |
17 |         next if line.strip.empty? # a blank line carries no region
18 |         (chr,start,stop,gene) = line.strip.split(/\t/)[0..3]
19 |         chrs[chr] ||= []
20 |         chrs[chr].push(stop.to_i)
21 |         info[chr+':'+stop] = [chr,start.to_i,stop.to_i,gene]
22 |       }
23 |       # Make sure chrs is sorted
24 |       @chrs = {}
25 |       chrs.each { | k,list |
26 |         @chrs[k] = list.sort
27 |       }
28 |       @info = info
29 |     end
30 | 
31 |     # Return [chr,start,stop,gene] of the region on rec.chrom with the
32 |     # smallest stop that is >= rec.pos, provided rec.pos >= its start;
33 |     # nil when there is no such region.
34 |     def contains(rec)
35 |       stop_list = @chrs[rec.chrom]
36 |       if stop_list
37 |         pos = rec.pos
38 |         stop = stop_list.bsearch { |bedstop| bedstop >= pos }
39 |         if stop
40 |           rinfo = @info[rec.chrom+':'+stop.to_s]
41 |           raise "Unexpected error in BED record for #{rec.chrom}:#{stop} position" if rinfo == nil
42 |           start = rinfo[1]
43 |           if pos >= start
44 |             # p [rec.chrom,rec.pos,rinfo]
45 |             return rinfo
46 |           end
47 |         end
48 |       end
49 |       nil
50 |     end
51 |   end
52 | 
53 | end
54 | 


--------------------------------------------------------------------------------
/features/diff_count.feature:
--------------------------------------------------------------------------------
 1 | @diff
 2 | 
 3 | Feature: Variant calling (filters) - diffing nucleotide counts
 4 | 
 5 |   Basic filtering happens on the command line with the --filter switch. To
 6 |   support somewhat more advanced features the following features are
 7 |   included.
 8 | 
 9 |   When diffing nucleotide counts we want to find out which nucleotide defines
10 |   the tumor. The difference has to be larger than 0 and the relative difference
11 |   is the max. When a threshold is set only those nucleotides are included which
12 |   pass the threshold (i.e., no more than x supporting nucleotides in the
13 |   reference). 
14 | 
15 |   The advantage is that filtering is possible without actually looking at
16 |   the rec.alt and rec.ref values, i.e., no assumptions are being made 
17 |   about the underlying nucleotides.
18 | 
19 |   Scenario: Diffing nucleotide counts
20 | 
21 |     Given normal and tumor counts [0,25,0,1] and [0,40,0,12]
22 |     When I look for the difference
23 |     Then I expect the diff to be [0,15,0,11]
24 |     And I expect the defining tumor nucleotide to be "T"
25 |     And I expect the tumor count to be 12
26 |     When I set an inclusion threshold for the reference
27 |     Then I expect the diff for threshold 2 to be [0,0,0,11]
28 |     And the relative diff to be [0,0,0,0.85]
29 |    
30 | 


--------------------------------------------------------------------------------
/test/data/regression/seval_s.dp.ref:
--------------------------------------------------------------------------------
 1 | 1	10257	159	242	249	249	186	212	218
 2 | 1	10291	165	249	249	247	161	163	189
 3 | 1	10297	182	246	250	246	165	158	183
 4 | 1	10303	198	247	248	248	172	157	182
 5 | 1	10315	212	246	242	245	190	157	189
 6 | 1	10321	218	246	248	248	193	164	196
 7 | 1	10327	237	238	229	237	209	183	210
 8 | 1	10583	8	24	21	23	15	19	19
 9 | 1	10665			7	5		2	7
10 | 1	10694			5	5			
11 | 1	10723			4	5			6
12 | 1	12783	58	164	144	182	126	103	158
13 | 1	13116	32	131	102	152	104	88	109
14 | 1	13118	34	129	101	145	99	85	108
15 | 1	13178	52	172	137	172	129	119	148
16 | 1	13302	36	136	99	146	90	65	117
17 | 1	13757	53	201	181	250	152	130	182
18 | 1	13868	75	192	182	224	142	111	167
19 | 1	13896	62	135	143	175	112	81	121
20 | 1	14354	43	158	115	145		72	119
21 | 1	14464	51	155	141	150	83	89	140
22 | 1	14673	36	142	117	157	95	76	131
23 | 1	14699	43	128	109	147	98	78	114
24 | 1	14907	57	216	162	205	153	118	158
25 | 1	14930	68	216	170	210	136	125	164
26 | 1	14933	68	216	169	212	132	128	164
27 | 1	14948	63	192	181	211	129	121	153
28 | 1	14976	56	166	161	196	109	116	135
29 | 1	15118	46	198	129	230	113	126	158
30 | 1	15190	53	208	170	200	126	145	179
31 | 1	15211	54	183	161	171	120	134	168
32 | 1	15274	37	121	102	137	71	67	98
33 | 1	15447	46	242	183	226	137	173	175
34 | 1	15688	37	182	147	184	100	101	148
35 | 1	16068	33	57	68	81	49	49	58
36 | 1	16103	50	79	86	106	60	61	84
37 | 


--------------------------------------------------------------------------------
/bio-vcf.gemspec:
--------------------------------------------------------------------------------
 1 | # No longer generated by jeweler
 2 | # -*- encoding: utf-8 -*-
 3 | # Gem specification for bio-vcf: gem metadata plus the list of packaged files.
 4 | Gem::Specification.new do |s|
 5 |   s.name = "bio-vcf"
 6 |   s.version = File.read("VERSION")  # relative path: resolved against the current working directory
 7 | 
 8 |   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
 9 |   s.authors = ["Pjotr Prins"]
10 |   s.description = "Smart lazy multi-threaded parser for VCF format with useful filtering and output rewriting (JSON, RDF etc.)"
11 |   s.email = "pjotr.public01@thebird.nl"
12 |   s.executables = ["bio-vcf"]
13 |   s.extra_rdoc_files = [
14 |     "LICENSE",
15 |     "README.md"
16 |   ]
17 |   s.files = [
18 |     ".travis.yml",
19 |     "Gemfile",
20 |     "LICENSE",
21 |     "README.md",
22 |     "Rakefile",
23 |     "VERSION",
24 |     "bin/bio-vcf",
25 |     "bio-vcf.gemspec",
26 |     "ragel/gen_vcfheaderline_parser.rl",
27 |     "ragel/generate.sh",
28 |   ]
29 |   s.files += Dir['lib/**/*.rb'] + Dir['bin/*'] 
30 |   s.files += Dir['[A-Z]*'] + Dir['test/**/*'] + Dir['features/**/*'] +
31 |              Dir['template/**/*']
32 |   # The explicit list is extended with whatever the globs match on disk when this file is evaluated.
33 |   s.homepage = "http://github.com/vcflib/bio-vcf"
34 |   s.licenses = ["MIT"]
35 |   s.require_paths = ["lib"]
36 |   s.required_ruby_version = Gem::Requirement.new(">= 2.0.0")
37 |   # s.rubygems_version = "2.0.3"
38 |   s.summary = "Fast multi-purpose multi-threaded VCF parser"
39 | 
40 | end
41 | 
42 | 


--------------------------------------------------------------------------------
/features/step_definitions/vcf_header.rb:
--------------------------------------------------------------------------------
 1 | Given(/^the VCF header lines$/) do |string|
 2 |   header = VcfHeader.new
 3 |   header.add string
 4 |   @vcf = header
 5 | end
 6 | 
 7 | When(/^I parse the VCF header$/) do
 8 | end
 9 | 
10 | Then(/^I expect vcf\.columns to be \[CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR'\]$/) do
11 |   expect(@vcf.column_names).to eq ['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
12 | end
13 | 
14 | Then(/^I expect vcf\.fileformat to be "(.*?)"$/) do |arg1|
15 |   expect(@vcf.fileformat).to eq arg1
16 | end
17 | 
18 | Then(/^I expect vcf\.fileDate to be "(.*?)"$/) do |arg1|
19 |   expect(@vcf.fileDate).to eq arg1
20 | end
21 | 
22 | Then(/^I expect vcf.field\['fileDate'\] to be "(.*?)"$/) do |arg1|
23 |   expect(@vcf.field['fileDate']).to eq arg1
24 | end
25 | 
26 | Then(/^I expect vcf\.phasing to be "(.*?)"$/) do |arg1|
27 |   expect(@vcf.phasing).to eq arg1
28 | end
29 | 
30 | Then(/^I expect vcf\.reference to be "(.*?)"$/) do |arg1|
31 |   expect(@vcf.reference).to eq arg1
32 | end
33 | 
34 | Then(/^I expect vcf\.format\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
35 |   expect(@vcf.format[arg1].to_s).to eq arg2
36 | end
37 | 
38 | Then(/^I expect vcf\.info\['(\w+)'\] to be (\{[^}]+\})/) do |arg1,arg2|
39 |   expect(@vcf.info[arg1].to_s).to eq arg2
40 | end
41 | 
42 | Then(/^I expect vcf\.meta to contain all header meta information$/) do
43 |   m = @vcf.meta
44 |   expect(m['fileformat']).to eq "VCFv4.1"
45 |   expect(m['FORMAT']['DP']['Number']).to eq "1"
46 |   expect(m.size).to be 9
47 | end
48 | 
49 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/vcffile.rb:
--------------------------------------------------------------------------------
 1 | module BioVcf
 2 |     # This class abstracts a VCF file that can be iterated. 
 3 |     # The VCF can be plain text or compressed with gzip
 4 |     # Note that files compressed with bgzip will not work, as thie ruby implementation of Zlib don't allow concatenated files
 5 |     class VCFfile
 6 |     
 7 |         def initialize(file: "", is_gz: true)
 8 |             @file = file
 9 |             @is_gz = is_gz
10 |         end
11 |     
12 |         def parseVCFheader(head_line="")
13 |             m=/##INFO=<ID=(.+),Number=(.+),Type=(.+),Description="(.+)">/.match(head_line)
14 |             {:id=>m[1],:number=>m[2],:type=>m[3],:desc=>m[4]}
15 |         end
16 |     
17 |     
18 |         #Returns an enum that can be used as an iterator. 
19 |         def each
20 |             return enum_for(:each) unless block_given? 
21 |             io = nil
22 |             if @is_gz
23 |                 infile = open(@file)
24 |                 io = Zlib::GzipReader.new(infile) 
25 |             else
26 |                 io =  File.open(@file)
27 |             end
28 |             
29 |             header = BioVcf::VcfHeader.new 
30 |             io.each_line do |line|  
31 |                 line.chomp!
32 |                 if line =~ /^##fileformat=/
33 |                     header.add(line)  
34 |                     next
35 |                 end
36 |                 if line =~ /^#/
37 |                     header.add(line)
38 |                     next
39 |                 end
40 |                 fields = BioVcf::VcfLine.parse(line)
41 |                 rec    = BioVcf::VcfRecord.new(fields,header)
42 |                 yield rec
43 |             end
44 |         end
45 |     end
46 | end


--------------------------------------------------------------------------------
/features/step_definitions/diff_count.rb:
--------------------------------------------------------------------------------
 1 | 
 2 | Given(/^normal and tumor counts \[(\d+),(\d+),(\d+),(\d+)\] and \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8|
 3 |   @normal = [arg1,arg2,arg3,arg4].map{|i|i.to_i}
 4 |   @tumor = [arg5,arg6,arg7,arg8].map{|i|i.to_i}
 5 | end
 6 | 
 7 | When(/^I look for the difference$/) do
 8 | end
 9 | 
10 | Then(/^I expect the diff to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
11 |   expect(Variant.diff(@normal,@tumor)).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
12 | end
13 | 
14 | Then(/^the relative diff to be \[(\d+),(\d+)\.(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6|
15 |   res = [arg1.to_f,(arg2+'.'+arg3).to_f,arg4.to_i,(arg5+'.'+arg6).to_f]
16 |   expect(Variant.relative_diff(@normal,@tumor)).to eq res
17 | end
18 | 
19 | Then(/^I expect the defining tumor nucleotide to be "(.*?)"$/) do |arg1|
20 |   expect(['A','C','G','T'][Variant.index(@normal,@tumor)]).to eq arg1
21 | end
22 | 
23 | Then(/^I expect the tumor count to be (\d+)$/) do |arg1|
24 |   expect(@tumor[Variant.index(@normal,@tumor)]).to eq arg1.to_i
25 | end
26 | 
27 | When(/^I set an inclusion threshold for the reference$/) do
28 | end
29 | 
30 | Then(/^I expect the diff for threshold (\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
31 |   @t = arg1.to_i
32 |   @t_diff = Variant.threshold_diff(@t,@normal,@tumor) 
33 |   expect(@t_diff).to eq [arg2.to_i,arg3.to_i,arg4.to_i,arg5.to_i]
34 | end
35 | 
36 | Then(/^the relative diff to be \[(\d+),(\d+),(\d+),(\d+)\.(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
37 |   res = [arg1.to_f,arg2.to_i,arg3.to_i,(arg4.to_s+'.'+arg5.to_s).to_f]
38 |   expect(Variant.relative_threshold_diff(@t,@normal,@tumor)).to eq res
39 | end
40 | 
41 | 
42 | 


--------------------------------------------------------------------------------
/features/vcf_header.feature:
--------------------------------------------------------------------------------
 1 | @meta
 2 | Feature: Parsing VCF meta information from the header
 3 | 
 4 |   Take a header and parse that information as defined by the VCF standard.
 5 |   
 6 |   Scenario: When parsing a header line
 7 | 
 8 |     Given the VCF header lines
 9 |     """
10 | ##fileformat=VCFv4.1
11 | ##fileDate=20140121
12 | ##phasing=none
13 | ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
14 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
15 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
16 | ##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
17 | ##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
18 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NORMAL	TUMOR
19 |     """
20 |     When I parse the VCF header
21 |     Then I expect vcf.columns to be [CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','NORMAL','TUMOR']
22 |     And I expect vcf.fileformat to be "VCFv4.1"
23 |     And I expect vcf.fileDate to be "20140121"
24 |     And I expect vcf.field['fileDate'] to be "20140121"
25 |     And I expect vcf.phasing to be "none"
26 |     And I expect vcf.reference to be "file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta"
27 |     And I expect vcf.format['GT'] to be {"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}
28 |     And I expect vcf.format['DP'] to be {"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth"}
29 |     And I expect vcf.format['DP4'] to be {"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}
30 |     And I expect vcf.info['PM'] to be {"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}
31 |     And I expect vcf.meta to contain all header meta information
32 | 
33 |   Scenario: When parsing the header of somatic_sniper.vcf
34 | 
35 |     Do something
36 | 


--------------------------------------------------------------------------------
/guix.scm:
--------------------------------------------------------------------------------
 1 | ;; To use this file to build HEAD of bio-vcf:
 2 | ;;
 3 | ;;   guix build -f guix.scm
 4 | ;;
 5 | ;; To get a development container (emacs shell will work)
 6 | ;;
 7 | ;;   rm Gemfile.lock # remove any dependencies
 8 | ;;   guix environment -C -l guix.scm
 9 | ;;   ruby ./bin/bio-vcf
10 | ;;
11 | ;;   rake test # for testing
12 | ;;   rake rdoc # for generating docs
13 | 
14 | (use-modules
15 |   ((guix licenses) #:prefix license:)
16 |   (guix gexp)
17 |   (guix packages)
18 |   (guix git-download)
19 |   (guix build-system ruby)
20 |   (guix build-system trivial)
21 |   (gnu packages ruby)
22 |   (gn packages ruby)
23 |   (srfi srfi-1)
24 |   (ice-9 popen)
25 |   (ice-9 rdelim))
26 | ;; Directory that contains this file; used below as the package source.
27 | (define %source-dir (dirname (current-filename)))
28 | ;; Second space-separated field of the first line printed by `git show HEAD`.
29 | (define %git-commit
30 |     (read-string (open-pipe "git show HEAD | head -1 | cut -d ' ' -f 2" OPEN_READ)))
31 | ;; Package for the local checkout; the trivial builder only writes and creates <out>/share.
32 | (define-public bio-vcf-source
33 |   (package
34 |     (name "bio-vcf-source")
35 |     (version (git-version "0.9.5" "HEAD" %git-commit))
36 |     (source (local-file %source-dir #:recursive? #t))
37 |     (build-system trivial-build-system)
38 |     (propagated-inputs
39 |      `(("ruby" ,ruby)
40 |        ("ruby-rake" ,ruby-rake)))
41 |     (native-inputs
42 |      `(("ruby-cucumber" ,ruby-cucumber)
43 |     ))
44 |     (arguments
45 |      `(#:modules ((guix build utils))
46 |        #:builder
47 |        (begin
48 |          (use-modules (guix build utils))
49 |          (let ((target (string-append (assoc-ref %outputs "out")
50 |                                       "/share")))
51 |            (write target)
52 |            (mkdir-p target)
53 |            #t))))
54 |     (synopsis "Smart VCF parser DSL")
55 |     (description
56 |      "Bio-vcf provides a @acronym{DSL, domain specific language} for processing
57 | the VCF format.  Record named fields can be queried with regular expressions.
58 | Bio-vcf is a new generation VCF parser, filter and converter.  Bio-vcf is not
59 | only very fast for genome-wide (WGS) data, it also comes with a filtering,
60 | evaluation and rewrite language and can output any type of textual data,
61 | including VCF header and contents in RDF and JSON.")
62 |     (home-page "http://github.com/vcflib/bio-vcf")
63 |     (license license:expat)))
64 | 
65 | ;; The file evaluates to the package, as used by `guix build -f guix.scm`.
66 | bio-vcf-source
67 | 


--------------------------------------------------------------------------------
/features/sfilter.feature:
--------------------------------------------------------------------------------
 1 | @sfilter
 2 | Feature: Sample filters
 3 | 
 4 |   Bio-vcf supports sample filters, where every sample is evaluated
 5 |   independently, though they have the rec information (chrom, pos, info)
 6 |   available.
 7 | 
 8 |   Scenario: Example of a sample
 9 |   
10 |     Given the VCF line
11 |     """
12 | 1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL
13 |     """
14 |     When I evaluate '0/0:6,0:6:3:0,3,33'
15 |     Then I expect s.empty? to be false
16 |     Then I expect s.dp? to be true
17 |     Then I expect s.dp to be 6
18 |     And sfilter 's.dp>4' to be true
19 | 
20 |   # Scenario: Sample with missing data
21 |     When I evaluate missing '0/0:6,0:.:3:0,3,33'
22 |     Then I expect s.empty? to be false
23 |     Then I expect s.dp? to be false
24 |     Then I expect s.dp to be nil
25 |     And sfilter 's.dp>4' to throw an error
26 | 
27 |   # Scenario: Sample with missing data with ignore missing set
28 |     When I evaluate missing '0/0:6,0:.:3:0,3,33' with ignore missing
29 |     Then I expect s.empty? to be false
30 |     Then I expect s.dp? to be false
31 |     Then I expect s.dp to be nil
32 |     And sfilter 's.dp>4' to be false
33 | 
34 |   # Scenario: Missing sample
35 |     When I evaluate empty './.'
36 |     Then I expect s.empty? to be true
37 |     Then I expect s.dp? to be false
38 |     Then I expect s.dp to be nil
39 |     And sfilter 's.dp>4' to throw an error
40 | 
41 |   # Scenario: Missing sample with ignore missing set
42 |     When I evaluate empty './.' with ignore missing
43 |     Then I expect s.empty? to be true
44 |     Then I expect s.dp? to be false
45 |     Then I expect s.dp to be nil
46 |     And sfilter 's.dp>4' to be false
47 | 
48 |   # Scenario: Wrong field name in sample
49 |     When I evaluate '0/0:6,0:6:3:0,3,33'
50 |     Then I expect s.empty? to be false
51 |     Then I expect s.dp? to be true
52 |     Then I expect s.what? to throw an error
53 |     And I expect s.what to throw an error
54 | 
55 |   # Scenario: Get other information for a sample
56 |     When I evaluate '0/0:6,0:6:3:0,3,33'
57 |     Then I expect r.chrom to be "1"
58 |     And I expect r.alt to be ["G"]
59 |     And I expect r.info.af to be 0.667
60 | 
61 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/template.rb:
--------------------------------------------------------------------------------
  1 | require 'erb'
  2 | 
  3 | module Bio
  4 | 
  5 |   class Template
  6 | 
  7 |     def initialize fn, handle_comma=false
  8 |       @handle_comma = handle_comma
  9 |       raise "Can not find template #{fn}!" if not File.exist?(fn)
 10 |       parse(File.read(fn))
 11 |       @is_first = true
 12 |     end
 13 | 
 14 |     # Parse the template and split into HEADER, BODY and FOOTER sections
 15 |     def parse buf
 16 |       header = []
 17 |       body = []
 18 |       footer = []
 19 |       where = :header
 20 |       buf.split("\n").each do | line |
 21 |         case where
 22 |           when :header
 23 |             next if line =~ /=HEADER/
 24 |             if line =~ /=BODY/
 25 |               body = []
 26 |               where = :body
 27 |               next
 28 |             end
 29 |             header << line
 30 |           when :body
 31 |             if line =~ /=FOOTER/
 32 |               footer = []
 33 |               where = :footer
 34 |               next
 35 |             end
 36 |             body << line
 37 |           else
 38 |             footer << line
 39 |         end
 40 |       end
 41 |       if body == []
 42 |         body = header
 43 |         header = []
 44 |       end
 45 |       @erb_header = ERB.new(header.join("\n")) if header.size
 46 |       body2 = body.join("\n").reverse
 47 |       # if there is a comma at the end, eat it
 48 |       if @handle_comma
 49 |         body2.chars.each_with_index { |c,i|
 50 |           break if [']','}'].include?(c)
 51 |           if c == ','
 52 |             body2[i] = " "
 53 |             break
 54 |           end
 55 |         }
 56 |       end
 57 |       @erb_body   = ERB.new(body2.reverse) if body.size
 58 |       @erb_footer = ERB.new(footer.join("\n")) if footer.size
 59 |     end
 60 | 
 61 |     def result env
 62 |       @erb.result(env)
 63 |     end
 64 | 
 65 |     # Call the HEADER template (once)
 66 |     def header env
 67 |       if @erb_header
 68 |         @erb_header.result(env)
 69 |       else
 70 |         ""
 71 |       end
 72 |     end
 73 | 
 74 |     # For every record call the BODY template
 75 |     def body env
 76 |       if @erb_body
 77 |         res =
 78 |           if @handle_comma and not @is_first
 79 |             ","
 80 |           else
 81 |             ""
 82 |           end
 83 |         @is_first = false
 84 |         res + @erb_body.result(env)
 85 |       else
 86 |         ""
 87 |       end
 88 |     end
 89 | 
 90 |     # Call the FOOTER template (once)
 91 |     def footer env
 92 |       if @erb_footer
 93 |         @erb_footer.result(env)
 94 |       else
 95 |         ""
 96 |       end
 97 |     end
 98 |   end
 99 | end
100 | 


--------------------------------------------------------------------------------
/doc/pcows.org:
--------------------------------------------------------------------------------
 1 | * Parallel copy-on-write streaming (PCOWS)
 2 | 
 3 | Copy-on-write can be offered by an operating system to provide
 4 | efficient parallelisation for streaming operations typical in biology
 5 | where chunks of data can be processed independently from the same
 6 | starting point. 
 7 | 
 8 | PCOWS compares to a regular thread pool with the difference that data
 9 | is not communicated through messages or pipes, but simply through
10 | memory. The only catch is that PCOWS threads can not easily communicate
11 | back. The Ruby parallel gem communicates back through pipes but that
12 | means the main thread can not proceed until all the pipes complete.
13 | 
14 | PCOWS was first explored in the bio-ruby VCF tool (bio-vcf 0.7 series)
15 | and then replaced by the parallel gem (bio-vcf 0.8
16 | series). Performance, unfortunately, deteriorated to the extent that
17 | PCOWS got retrofitted.
18 | 
19 | PCOWS basically reads a file and chunks it on the main thread. Every
20 | chunk gets fed to a copy-on-write version of the Ruby interpreter
21 | which contains the full state up to the point of forking - this is an
22 | inexpensive procedure. Each chunk gets processed to a file. The main
23 | thread tracks these files and issues a separate thread for harvesting
24 | the ordered files so each gets piped to STDOUT in order.
25 | 
26 | The reader thread tracks the running forks. This means the reader thread is
27 | only held up by the number of allowed forks running at a time.
28 | 
29 | Based on this description the interface can be expressed as a reader
30 | creating chunks of state that get passed on to a chunk processor named
31 | 'run' which gets a callback 'func'. Every time a chunk gets processed
32 | the reader checks the size of the thread pool and also checks whether
33 | output has become available.
34 | 
35 | The threads communicate simply through a file. Each thread writes to
36 | STDOUT which has been redirected to a temporary file with extension
37 | '.part'.  When the thread is complete, the file gets renamed by
38 | removing the '.part'. Files that lack the '.part' get printed to the
39 | real STDOUT and deleted on completion. I.e.:
40 | 
41 | 1. STDOUT for a worker is redirected to a tempfile with extension '.part'
42 | 2. Worker thread writes to STDOUT
43 | 3. On completion '.part' gets removed from the filename
44 | 4. The main thread checks the next file to be printed based on filenames
45 | 5. A print thread gets invoked
46 | 6. The print thread removes the file
47 | 
48 | The only communication between PCOW threads and the main running
49 | thread is therefore the renaming of the temporary file. This will also
50 | work on network mounted systems. For performance it can make a
51 | difference to use a fast disk for temporary files, though the Linux
52 | file caching will make it likely that buffers are still in RAM.
53 | 
54 | 


--------------------------------------------------------------------------------
/lib/regressiontest/cli_exec.rb:
--------------------------------------------------------------------------------
  1 | require 'fileutils'
  2 | require 'timeout'
  3 | module RegressionTest
  4 | 
  5 |   DEFAULT_TESTDIR = "test/data/regression"
  6 | 
  7 |   # Regression test runner compares output in ./test/data/regression
  8 |   # (by default).  The convention is to have a file with names .ref
  9 |   # (reference) and create .new
 10 |   #
 11 |   # You can add an :ignore regex option which ignores lines in the
 12 |   # comparson files matching a regex
 13 |   #
 14 |   # :timeout sets the time out for calling a system command
 15 |   #
 16 |   # :should_fail expects the system command to return a non-zero
 17 |   module CliExec
 18 |     FilePair = Struct.new(:outfn,:reffn)
 19 | 
 20 |     def CliExec::exec command, testname, options = {}
 21 |       # ---- Find .ref file
 22 |       fullname = DEFAULT_TESTDIR + "/" + testname 
 23 |       basefn = if File.exist?(testname+".ref") || File.exist?(testname+"-stderr.ref")
 24 |                 testname 
 25 |               elsif File.exist?(fullname + ".ref") || File.exist?(fullname+"-stderr.ref")
 26 |                 FileUtils.mkdir_p DEFAULT_TESTDIR
 27 |                 fullname
 28 |               else
 29 |                 raise "Can not find reference file for #{testname} - expected #{fullname}.ref"
 30 |               end
 31 |       std_out = FilePair.new(basefn + ".new", basefn + ".ref")
 32 |       std_err = FilePair.new(basefn + "-stderr.new", basefn + "-stderr.ref")
 33 |       files = [std_out,std_err]
 34 |       # ---- Create .new file
 35 |       cmd = command + " > #{std_out.outfn} 2>#{std_err.outfn}"
 36 |       $stderr.print cmd,"\n"
 37 |       exec_ret = nil
 38 |       if options[:timeout] && options[:timeout] > 0
 39 |         Timeout.timeout(options[:timeout]) do
 40 |           begin
 41 |             exec_ret = Kernel.system(cmd)
 42 |           rescue Timeout::Error
 43 |             $stderr.print cmd, " failed to finish in under #{options[:timeout]}\n"
 44 |             return false
 45 |           end
 46 |         end
 47 |       else
 48 |         exec_ret = Kernel.system(cmd)
 49 |       end
 50 |       expect_fail = (options[:should_fail] != nil)
 51 |       if !expect_fail and exec_ret==0
 52 |         $stderr.print cmd," returned an error\n"
 53 |         return false 
 54 |       end
 55 |       if expect_fail and exec_ret
 56 |         $stderr.print cmd," did not return an error\n"
 57 |         return false 
 58 |       end
 59 |       if options[:ignore]
 60 |         regex = options[:ignore]
 61 |         files.each do |f|
 62 |           outfn = f.outfn
 63 |           outfn1 = outfn + ".1"
 64 |           FileUtils.mv(outfn,outfn1)
 65 |           f1 = File.open(outfn1)
 66 |           f2 = File.open(outfn,"w")
 67 |           f1.each_line do | line |
 68 |             f2.print(line) if line !~ /#{regex}/
 69 |           end
 70 |           f1.close
 71 |           f2.close
 72 |           FileUtils::rm(outfn1)
 73 |         end
 74 |       end
 75 |       # ---- Compare the two files
 76 |       files.each do |f|
 77 |         next unless File.exist?(f.reffn)
 78 |         return false unless compare_files(f.outfn,f.reffn,options[:ignore])
 79 |       end
 80 |       return true
 81 |     end
 82 | 
 83 |     def CliExec::compare_files fn1, fn2, ignore = nil
 84 |       if not File.exist?(fn2)
 85 |         FileUtils::cp(fn1,fn2)
 86 |         true
 87 |       else
 88 |         cmd = "diff #{fn2} #{fn1}"
 89 |         $stderr.print cmd+"\n"
 90 |         return true if Kernel.system(cmd) == true
 91 |         # Hmmm. We have a different result. We are going to try again
 92 |         # because sometimes threads have not completed
 93 |         sleep 0.25 
 94 |         return true if Kernel.system(cmd) == true
 95 |         $stderr.print "If it is correct, execute \"cp #{fn1} #{fn2}\", and run again"
 96 |         false
 97 |       end
 98 |     end
 99 |   end
100 | 
101 | end
102 | 


--------------------------------------------------------------------------------
/features/somaticsniper.feature:
--------------------------------------------------------------------------------
 1 | @sniper
 2 | Feature: VCF for Somatic Sniper
 3 | 
 4 |   Here we take a VCF line and parse the information given by Somatic Sniper. 
 5 | 
 6 |   At this position the reference contains: AAAGAAAAGAAAAA  (12A,2G)
 7 |   At this position the tumor contains:     AAAAACACAA      (8A,2C)
 8 | 
 9 |   rec.alt contains variants C,G.  rec.tumor.bcount reflects the contents of the
10 |   tumor (8A,2C) so rec.tumor.bcount[rec.alt] reflects the actual number of
11 |   variants in the tumor. 
12 | 
13 |   The mapping quality in the BAM file is 37/37 and base quality is 55/60 in normal
14 |   and tumor respectively.
15 | 
16 |   For the second scenario:
17 | 
18 |   At this position the reference contains: (15A)
19 |   At this position the tumor contains:     AAAAAAAAATATTA (13A, 3T)
20 | 
21 |   Scenario: When parsing a record
22 | 
23 |     Given the somatic sniper vcf line
24 |     """
25 | 1       27691244        .       A       C,G     .       .       .       GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC        0/2:0/2:14:0,12,0,2:12,0,2,0:14:35:14:14,35:37:37,37:1:.        0/1:0/1:10:0,8,0,2:8,2,0,0:18:35:18:20,51:37:37,37:2:33
26 |     """
27 |     When I parse the record
28 |     Then I expect rec.chrom to contain "1"
29 |     Then I expect rec.pos to contain 27691244
30 |     Then I expect rec.ref to contain "A"
31 |     And I expect rec.alt to contain ["C","G"]
32 |     And I expect rec.tumor.dp to be 10
33 |     And I expect rec.tumor.dp4 to be [0,8,0,2]
34 |     And I expect rec.tumor.bcount.to_ary to be [8,2,0,0]
35 |     And I expect rec.tumor.bcount[rec.alt] to be [2,0]
36 |     And I expect rec.tumor.bcount["G"] to be 0 
37 |     And I expect rec.tumor.bcount[1] to be 2
38 |     And I expect rec.tumor.bcount[3] to be 0
39 |     And I expect rec.tumor.bcount.sum to be 2
40 |     And I expect rec.tumor.bcount.max to be 2
41 |     And I expect rec.tumor.bq.to_ary to be [20,51]
42 |     And I expect rec.tumor.bq["G"] to be 51
43 |     And I expect rec.tumor.bq[1] to be 51
44 |     And I expect rec.tumor.bq.min to be 20
45 |     And I expect rec.tumor.bq.max to be 51
46 |     And I expect rec.tumor.amq.to_ary to be [37,37]
47 |     And I expect rec.tumor.mq to be 37
48 |     And I expect rec.tumor.ss to be 2
49 |     And I expect rec.tumor.ssc to be 33
50 |     And I expect rec.normal.ssc to be nil
51 |     # The following are additional functions
52 |     And I expect rec.call_diff to be [-4,2,-2,0]
53 |     And I expect rec.call_nuc to be "C"
54 |     And I expect rec.call_tumor_count to be 2
55 |     And I expect rec.call_normal_count to be 0
56 |     And I expect rec.call_tumor_relative_count to be 1.0
57 | 
58 |     Given the somatic sniper vcf line
59 |     """
60 | 1 27686841  . A T . . . GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC  0/0:0/0:15:3,12,0,0:15,0,0,0:66:37:0:25:37:37:0:. 0/1:0/1:16:2,11,0,3:13,0,0,3:30:37:30:34,55:37:37,37:2:37
61 |     """
62 |     When I parse the record
63 |     Then I expect rec.chrom to contain "1"
64 |     Then I expect rec.pos to contain 27686841
65 |     Then I expect rec.ref to contain "A"
66 |     And I expect rec.alt to contain one ["T"]
67 |     And I expect rec.tumor.dp to be 16
68 |     And I expect rec.tumor.dp4 to be [2,11,0,3]
69 |     And I expect rec.tumor.bcount.to_ary to be [13,0,0,3]
70 |     And I expect rec.tumor.bcount[rec.alt] to be one [3]
71 |     And I expect rec.tumor.bcount["G"] to be 0 
72 |     And I expect rec.tumor.bcount["T"] to be 3
73 |     And I expect rec.tumor.bcount[1] to be 0
74 |     And I expect rec.tumor.bcount[3] to be 3
75 |     And I expect rec.tumor.bcount.sum to be 3
76 |     And I expect rec.tumor.bcount.max to be 3
77 |     And I expect rec.tumor.bq.to_ary to be [34,55]
78 |     And I expect rec.tumor.bq["T"] to be 34
79 |     And I expect rec.tumor.bq[1] to be 55
80 |     And I expect rec.tumor.bq.min to be 34
81 |     And I expect rec.tumor.bq.max to be 55
82 |     And I expect rec.tumor.amq.to_ary to be [37,37]
83 |     And I expect rec.tumor.mq to be 37
84 |     And I expect rec.tumor.ss to be 2
85 | 
86 |     
87 | 


--------------------------------------------------------------------------------
/doc/Compare_VCFs.md:
--------------------------------------------------------------------------------
 1 | # Comparing VCF files
 2 | 
 3 | Between two different pipeline runs we ended up with different VCF
 4 | files. The starting point (BAMs) was the same, but in each pipeline
 5 | different procedures may have been followed, i.e. the processing steps
 6 | for variant calling were not exactly the same. The first
 7 | freebayes+varscan2 pipeline (P1) I wrote after testing many callers
 8 | including somatic sniper and strelka, so I should know that well
 9 | enough. The second pipeline (P2) includes more variant callers. To
10 | find out how they compared and which output was preferred I decided to
11 | do some analysis using bio-vcf.
12 | 
13 | ## Comparing freebayes output
14 | 
15 | The freebayes somatic variant calling files differed in size. Just
16 | looking at one sample P1 had 479 lines and P2 had 729. The germline
17 | calls, however, were comparable in size. But when I ran a diff it
18 | showed these differed significantly too:
19 | 
20 |     wc -l germline*vcf
21 |       3527 germline1.vcf
22 |       3500 germline2.vcf
23 |     cat germline1.vcf|bio-vcf -e "[r.chr,r.pos]"|sort > germline1.txt
24 |     cat germline2.vcf|bio-vcf -e "[r.chr,r.pos]"|sort > germline2.txt
25 |     diff germline1.txt germline2.txt |wc
26 |        1751
27 | 
28 | To zoom in on settings, let's look at read depth on chromosome 7 (-v
29 | and --num-threads=1 options I typically use when trying new filters
30 | because they give digestible output):
31 | 
32 |     cat germline1.vcf|bio-vcf -v --num-threads 1 --filter 'rec.chr=="7"' -e '[r.chr,r.pos,r.info.dp]'|sort
33 |         7       90225928        34
34 |         7       95216394        69
35 |         7       97821397        97
36 |         7       97821398        98
37 |         7       97822115        96
38 |         7       97822210        94
39 |         7       98503849        109
40 |         7       98543545        68
41 |         7       98543546        69
42 |         7       98582562        38
43 |         7       98650051        48
44 |         7       99690690        78
45 |         7       99690747        27
46 |         7       99693552        34
47 | 
48 |     cat germline2.vcf|bio-vcf -v --num-threads 1 --filter 'rec.chr=="7"' -e '[r.chr,r.pos,r.info.dp]'|sort
49 |         7       90225928        121
50 |         7       95216394        534
51 |         7       97822115        1053
52 |         7       97822210        1044
53 |         7       97834704        249
54 |         7       98503849        1579
55 |         7       98547176        59
56 |         7       98553739        21
57 |         7       98648517        75
58 |         7       98650051        344
59 |         7       99690690        455
60 |         7       99690747        168
61 |         7       99693552        107
62 | 
63 | OK, this is informative. P1 called variants after removing duplicate reads. P2
64 | did not. That explains the difference in the number of variants called.
65 | 
66 | Unfortunately the sequencing concerns an FFPE dataset. FFPE degrades
67 | over time and DNA changes. In itself this is not a problem because we
68 | sequence many cells and the changed ones do not necessarily
69 | dominate. We do, however, amplify the DNA before sequencing through a
70 | PCR-type process. This means that randomly these FFPE changes may
71 | become dominant at a certain position and variant callers score them
72 | as genuine variants. I have studied this data and there is ample
73 | evidence of this effect. The only way to address this is by removing
74 | duplicate reads - so the amplified reads get compressed into one
75 | (theoretically, because sometimes there are multiple errors confusing
76 | things a bit). Removing duplicates is the *only* way and this can not
77 | happen *after* variant calling.
78 | 
79 | This means P2 is out of the window. It is useless data also for the
80 | other variant callers. I don't even have to check the somatic calling.
81 | 
82 | ## Conclusion
83 | 
84 | A simple read depth check with bio-vcf proved that P2 had no
85 | merit. Either we rerun it after removing duplicates or we rely on P1.
86 | 


--------------------------------------------------------------------------------
/test/data/regression/eval_once.ref:
--------------------------------------------------------------------------------
1 | {"UnifiedGenotyper"=>{"ID"=>"UnifiedGenotyper", "Version"=>"2.8-1-g932cd3a", "Date"=>"Sat Jan 25 10:33:56 CET 2014", "Epoch"=>1390642436187, "CommandLineOptions"=>"analysis_type=UnifiedGenotyper input_file=[/data_fedor12/BAM/sander/Liver_clones/BIOPSY17513D/mapping/BIOPSY17513D_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/clone3/mapping/clone3_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/clone4/mapping/clone4_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/clone10/mapping/clone10_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/subclone33/mapping/subclone33_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/subclone46/mapping/subclone46_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/subclone105/mapping/subclone105_dedup_realigned_recalibrated.bam] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/data_fedor13/sander/variant_calling/Liver_clones/.queue/scatterGather/UnifiedGenotyper_noref-1-sg/temp_001_of_500/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/data_fedor13/common_data/references/H_sapiens/GATK_b37_bundle_reference/basespace/human_g1k_v37.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false 
sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=DYNAMIC_SEEK variant_index_parameter=-1 logging_level=INFO log_to_file=null help=false version=false genotype_likelihoods_model=SNP pcr_error_rate=1.0E-4 computeSLOD=false annotateNDA=false pair_hmm_implementation=LOGLESS_CACHING min_base_quality_score=17 max_deletion_fraction=0.05 allSitePLs=false min_indel_count_for_genotyping=5 min_indel_fraction_per_sample=0.25 indelGapContinuationPenalty=10 indelGapOpenPenalty=45 indelHaplotypeSize=80 indelDebug=false ignoreSNPAlleles=false allReadsSP=false ignoreLaneInfo=false reference_sample_calls=(RodBinding name= source=UNBOUND) reference_sample_name=null sample_ploidy=2 min_quality_score=1 max_quality_score=40 site_quality_prior=20 min_power_threshold_for_calling=0.95 min_reference_depth=100 exclude_filtered_reference_sites=false output_mode=EMIT_VARIANTS_ONLY heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=6 input_prior=[] contamination_fraction_to_filter=0.0 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null dbsnp=(RodBinding name= source=UNBOUND) comp=[] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub onlyEmitSamples=[] debug_file=null 
metrics_file=null annotation=[] excludeAnnotation=[] filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"}}
2 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/vcfrdf.rb:
--------------------------------------------------------------------------------
  1 | module BioVcf
  2 | 
  3 |   # This is primarily RDF support - which may be moved to another gem
  4 |   #
  5 |   # Note that this functionality is superseded by the --template command! Though
  6 |   # this can be useful for one-liners.
  7 | 
  module VcfRdf

    # Print the Turtle @prefix declarations used by the emitted records to
    # standard output.
    def VcfRdf::header
      prefixes = [
        ['rdf',  'http://www.w3.org/1999/02/22-rdf-syntax-ns#'],
        ['rdfs', 'http://www.w3.org/2000/01/rdf-schema#'],
        ['dc',   'http://purl.org/dc/elements/1.1/'],
        ['hgnc', 'http://identifiers.org/hgnc.symbol/'],
        ['doi',  'http://dx.doi.org/'],
        ['db',   'http://biobeat.org/rdf/db#'],
        ['seq',  'http://biobeat.org/rdf/seq#'],
        ['',     'http://biobeat.org/rdf/vcf#']
      ]
      print prefixes.map { |name, uri| "@prefix #{name}: <#{uri}> .\n" }.join
    end

    # Print one record as Turtle triples to standard output.
    #
    # id   - prefix for the subject identifier
    # rec  - object responding to chrom, pos and alt (alt is indexed and joined)
    # tags - String holding a Ruby expression (default "{}") that is eval'ed;
    #        each key/value pair of the result is printed as an extra
    #        predicate/object pair on the same subject
    #
    # The subject is built from id, 'ch' + chrom, pos and all alt alleles
    # joined by '_'. Only the first alt allele is written as seq:alt.
    #
    # NOTE(review): tags is passed to eval and can therefore run arbitrary
    # code with access to this method's locals (id, rec, id2). Never pass
    # untrusted input here.
    def VcfRdf::record id,rec,tags = "{}"
      id2 = [id, 'ch' + rec.chrom, rec.pos, rec.alt.join('')].join('_')
      print [
        "seq:chr \"#{rec.chrom}\"",
        "seq:pos #{rec.pos}",
        "seq:alt \"#{rec.alt[0]}\"",
        "db:vcf true"
      ].map { |pred_obj| ":#{id2} #{pred_obj} .\n" }.join
      hash = eval(tags)
      hash.each { |predicate, object| print ":#{id2} #{predicate} #{object} .\n" } if hash
      print "\n"
    end
  end
 40 | 
 41 | 
 42 | # RDF support module. Original is part of bioruby-rdf by Pjotr Prins
 43 | #
 44 | 
 45 |   module RDF
 46 | 
 47 |     def RDF::valid_uri? uri
 48 |       uri =~ /^([!#$&-;=?_a-z~]|%[0-9a-f]{2})+$/i
 49 |     end
 50 | 
 51 |     def RDF::escape_string_literal(literal)
 52 |       s = literal.to_s
 53 |       # Put a slash before every double quote if there is no such slash already
 54 |       s = s.gsub(/(?<!\\)"/,'\"')
 55 |       # Put a slash before a single slash if it is not \["utnr>\]
 56 |       if s =~ /[^\\]\\[^\\]/
 57 |         s2 = []
 58 |         s.each_char.with_index { |c,i| 
 59 |           res = c
 60 |           if i>0 and c == '\\' and s[i-1] != '\\' and s[i+1] !~ /^[uUtnr\\"]/
 61 |             res = '\\' + c
 62 |           end
 63 |           # p [i,c,s[i+1],res]
 64 |           s2 << res
 65 |         }
 66 |         s = s2.join('')
 67 |       end
 68 |       s
 69 |     end
 70 | 
 71 |     def RDF::stringify_literal(literal)
 72 |       RDF::escape_string_literal(literal.to_s)
 73 |     end
 74 | 
 75 |     def RDF::quoted_stringify_literal(literal)
 76 |       '"' + stringify_literal(literal) + '"'
 77 |     end
 78 |   end
 79 | 
 80 |   module Turtle
 81 | 
 82 |     def Turtle::stringify_literal(literal)
 83 |       RDF::stringify_literal(literal)
 84 |     end
 85 | 
 86 |     def Turtle::identifier(id)
 87 |       raise "Illegal identifier #{id}" if id != Turtle::mangle_identifier(id)
 88 |     end
 89 | 
 90 |     # Replace letters/symbols that are not allowed in a Turtle identifier
 91 |     # (short hand URI). This should be the definite mangler and replace the 
 92 |     # ones in bioruby-table and bio-exominer. Manglers are useful when using
 93 |     # data from other sources and trying to transform them into simple RDF 
 94 |     # identifiers.
 95 | 
 96 |     def Turtle::mangle_identifier(s)
 97 |       id = s.strip.gsub(/[^[:print:]]/, '').gsub(/[#)(,]/,"").gsub(/[%]/,"perc").gsub(/(\s|\.|\$|\/|\\|\>)+/,"_")
 98 |       id = id.gsub(/\[|\]/,'')
 99 |       # id = URI::escape(id)
100 |       id = id.gsub(/\|/,'_')
101 |       id = id.gsub(/\-|:/,'_')
102 |       if id != s 
103 |         # Don't want Bio depency in templates!
104 |         # logger = Bio::Log::LoggerPlus.new 'bio-rdf'
105 |         # logger.warn "\nWARNING: Changed identifier <#{s}> to <#{id}>"
106 |         # $stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
107 |       end
108 |       if not RDF::valid_uri?(id)
109 |         raise "Invalid URI after mangling <#{s}> to <#{id}>!"
110 |       end
111 |       valid_id = if id =~ /^\d/
112 |                    'r' + id
113 |                  else
114 |                    id
115 |                  end
116 |       valid_id  # we certainly hope so!
117 |     end
118 |   end
119 | end
120 | 


--------------------------------------------------------------------------------
/features/cli.feature:
--------------------------------------------------------------------------------
 1 | @cli
 2 | Feature: Command-line interface (CLI)
 3 | 
 4 |   bio-vcf has a powerful command line interface. Here we regression test features.
 5 | 
 6 |   Scenario: Test the info filter using dp
 7 |     Given I have input file(s) named "test/data/input/multisample.vcf"
 8 |     When I execute "./bin/bio-vcf -i --filter 'r.info.dp>100'"
 9 |     Then I expect the named output to match the named output "r.info.dp"
10 | 
11 |   Scenario: Test the info filter using dp and threads
12 |     Given I have input file(s) named "test/data/input/multisample.vcf"
13 |     When I execute "./bin/bio-vcf -i --num-threads 4 --filter 'r.info.dp>2'"
14 |     Then I expect the named output to match the named output "thread4" in under 30 seconds
15 | 
16 |   Scenario: Test the info filter using dp and threads with lines
17 |     Given I have input file(s) named "test/data/input/multisample.vcf"
18 |     When I execute "./bin/bio-vcf -i --num-threads 4 --thread-lines 4 --filter 'r.info.dp>2'"
19 |     Then I expect the named output to match the named output "thread4_4" in under 30 seconds
20 | 
21 |   Scenario: Test the sample filter using dp
22 |     Given I have input file(s) named "test/data/input/multisample.vcf"
23 |     When I execute "./bin/bio-vcf -i --sfilter 's.dp>20'"
24 |     Then I expect the named output to match the named output "s.dp"
25 | 
26 |   Scenario: Test the include sample filter using dp
27 |     Given I have input file(s) named "test/data/input/multisample.vcf"
28 |     When I execute "./bin/bio-vcf -i --ifilter 's.dp>100' --seval s.dp"
29 |     Then I expect the named output to match the named output "ifilter_s.dp"
30 | 
31 |   Scenario: Test the info eval using dp
32 |     Given I have input file(s) named "test/data/input/multisample.vcf"
33 |     When I execute "./bin/bio-vcf -i --eval 'r.info.dp'"
34 |     Then I expect the named output to match the named output "eval_r.info.dp"
35 | 
36 |   Scenario: Test the sample eval using dp
37 |     Given I have input file(s) named "test/data/input/multisample.vcf"
38 |     When I execute "./bin/bio-vcf -i --seval 's.dp'"
39 |     Then I expect the named output to match the named output "seval_s.dp"
40 | 
41 |   Scenario: Test the sample filter + eval using dp
42 |     Given I have input file(s) named "test/data/input/multisample.vcf"
43 |     When I execute "./bin/bio-vcf -i --sfilter 's.dp>10' --seval 's.dp'"
44 |     Then I expect the named output to match the named output "sfilter_seval_s.dp"
45 | 
46 |   Scenario: Rewrite an info field
47 |     Given I have input file(s) named "test/data/input/multisample.vcf"
48 |     When I execute "./bin/bio-vcf --rewrite rec.info[\'sample\']=\'XXXXX\'"
49 |     Then I expect the named output to match the named output "rewrite.info.sample"
50 | 
51 |   Scenario: Test eval-once
52 |     Given I have input file(s) named "test/data/input/multisample.vcf"
53 |     When I execute "./bin/bio-vcf --eval-once header.meta[\'GATKCommandLine\']"
54 |     Then I expect the named output to match the named output "eval_once"
55 | 
56 |   Scenario: Test JSON output with header meta data
57 |     Given I have input file(s) named "test/data/input/multisample.vcf"
58 |     When I execute "./bin/bio-vcf --template template/vcf2json_full_header.erb"
59 |     Then I expect the named output to match the named output "vcf2json_full_header"
60 | 
61 |   Scenario: Test JSON output with header meta data and query samples
62 |     Given I have input file(s) named "test/data/input/multisample.vcf"
63 |     When I execute "./bin/bio-vcf --template template/vcf2json_use_meta.erb"
64 |     Then I expect the named output to match the named output "vcf2json_use_meta"
65 | 
66 |   Scenario: Test deadlock on failed filter with threads
67 |     Given I have input file(s) named "test/data/input/multisample.vcf"
68 |     When I execute "./bin/bio-vcf -q --timeout 4 --num-threads 4 --thread-lines 4 --filter 't.info.dp>2'"
69 |     Then I expect an error and the named output to match the named output "thread4_4_failed_filter" in under 30 seconds
70 | 
71 |   Scenario: Test VCF with no records
72 |     Given I have input file(s) named "test/data/input/empty.vcf"
73 |     When I execute "./bin/bio-vcf --timeout=5"
74 |     Then I expect no errors
75 | 


--------------------------------------------------------------------------------
/features/step_definitions/somaticsniper.rb:
--------------------------------------------------------------------------------
  1 | Given(/^the somatic sniper vcf line$/) do |string|
  2 |   @fields = VcfLine.parse(string.split(/\s+/).join("\t"))
  3 | end
  4 | 
  5 | When(/^I parse the record$/) do
  6 |   header = VcfHeader.new
  7 |   @rec = VcfRecord.new(@fields,header)
  8 | end
  9 | 
 10 | Then(/^I expect rec\.chrom to contain "(.*?)"$/) do |arg1|
 11 |   expect(@rec.chrom).to eq  "1"
 12 | end
 13 | 
 14 | Then(/^I expect rec\.pos to contain (\d+)$/) do |arg1|
 15 |   expect(@rec.pos).to eq arg1.to_i
 16 | end
 17 | 
 18 | Then(/^I expect rec\.ref to contain "(.*?)"$/) do |arg1|
 19 |   expect(@rec.ref).to eq arg1
 20 | end
 21 | 
 22 | Then(/^I expect rec\.alt to contain \["(.*?)","(.*?)"\]$/) do |arg1, arg2|
 23 |   expect(@rec.alt).to eq [arg1,arg2]
 24 | end
 25 | 
 26 | Then(/^I expect rec\.alt to contain one \["(.*?)"\]$/) do |arg1|
 27 |   expect(@rec.alt).to eq [arg1]
 28 | end
 29 | 
 30 | Then(/^I expect rec\.tumor\.dp to be (\d+)$/) do |arg1|
 31 |   expect(@rec.tumor.dp).to eq arg1.to_i
 32 | end
 33 | 
 34 | Then(/^I expect rec\.tumor\.dp(\d+) to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
 35 |   expect(@rec.tumor.dp4).to eq [arg2.to_i,arg3.to_i,arg4.to_i,arg5.to_i]
 36 | end
 37 | 
 38 | 
 39 | Then(/^I expect rec\.tumor\.bcount.to_ary to be \[(\d+),(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
 40 |   expect(@rec.tumor.bcount.to_ary).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
 41 | end
 42 | 
 43 | Then(/^I expect rec\.tumor\.bcount\[rec\.alt\] to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
 44 |   expect(@rec.tumor.bcount[@rec.alt]).to eq [arg1.to_i,arg2.to_i]
 45 | end
 46 | 
 47 | Then(/^I expect rec\.tumor\.bcount\[rec\.alt\] to be one \[(\d+)\]$/) do |arg1|
 48 |   expect(@rec.tumor.bcount[@rec.alt]).to eq [arg1.to_i]
 49 | end
 50 | 
 51 | Then(/^I expect rec\.tumor\.bcount\["(.*?)"\] to be (\d+)$/) do |arg1, arg2|
 52 |   expect(@rec.tumor.bcount[arg1]).to eq arg2.to_i
 53 | end
 54 | 
 55 | Then(/^I expect rec\.tumor\.bcount\[(\d+)\] to be (\d+)$/) do |arg1, arg2|
 56 |   expect(@rec.tumor.bcount[arg1.to_i]).to eq arg2.to_i
 57 | end
 58 | 
 59 | Then(/^I expect rec\.tumor\.bcount\.sum to be (\d+)$/) do |arg1|
 60 |   expect(@rec.tumor.bcount.sum).to eq arg1.to_i
 61 | end
 62 | 
 63 | Then(/^I expect rec\.tumor\.bcount\.max to be (\d+)$/) do |arg1|
 64 |   expect(@rec.tumor.bcount.max).to eq arg1.to_i
 65 | end
 66 | 
 67 | 
 68 | Then(/^I expect rec\.tumor\.bq\.to_ary to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
 69 |   expect(@rec.tumor.bq.to_ary).to eq [arg1.to_i,arg2.to_i]
 70 | end
 71 | 
 72 | Then(/^I expect rec\.tumor\.bq\["(.*?)"\] to be (\d+)$/) do |arg1, arg2|
 73 |   expect(@rec.tumor.bq[arg1]).to eq arg2.to_i
 74 | end
 75 | 
 76 | Then(/^I expect rec\.tumor\.bq\[(\d+)\] to be (\d+)$/) do |arg1, arg2|
 77 |   expect(@rec.tumor.bq[arg1.to_i]).to eq arg2.to_i
 78 | end
 79 | 
 80 | Then(/^I expect rec\.tumor\.bq\.min to be (\d+)$/) do |arg1|
 81 |   expect(@rec.tumor.bq.min).to eq arg1.to_i
 82 | end
 83 | 
 84 | Then(/^I expect rec\.tumor\.bq\.max to be (\d+)$/) do |arg1|
 85 |   expect(@rec.tumor.bq.max).to eq arg1.to_i
 86 | end
 87 | 
 88 | 
 89 | Then(/^I expect rec\.tumor\.amq.to_ary to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
 90 |   expect(@rec.tumor.amq.to_ary).to eq [arg1.to_i,arg2.to_i]
 91 | end
 92 | 
 93 | Then(/^I expect rec\.tumor\.mq to be (\d+)$/) do |arg1|
 94 |   expect(@rec.tumor.mq).to eq arg1.to_i
 95 | end
 96 | 
 97 | Then(/^I expect rec\.tumor\.ss to be (\d+)$/) do |arg1|
 98 |   expect(@rec.tumor.ss).to eq arg1.to_i
 99 | end
100 | 
101 | 
102 | Then(/^I expect rec\.tumor\.ssc to be (\d+)$/) do |arg1|
103 |   expect(@rec.tumor.ssc).to be 33
104 | end
105 | 
106 | Then(/^I expect rec\.normal\.ssc to be nil$/) do
107 |   expect(@rec.normal.ssc).to be nil
108 | end
109 | 
# The dots in "rec.call_*" are escaped in every pattern below so the steps
# only match the literal method path (an unescaped dot matches any character).

# call_diff: four integers, the first and third negative.
Then(/^I expect rec\.call_diff to be \[(\-\d+),(\d+),(\-\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4|
  expect(@rec.call_diff).to eq [arg1.to_i,arg2.to_i,arg3.to_i,arg4.to_i]
end

# call_nuc compared as a String.
Then(/^I expect rec\.call_nuc to be "(.*?)"$/) do |arg1|
  expect(@rec.call_nuc).to eq arg1
end

# call_tumor_count compared as an Integer.
Then(/^I expect rec\.call_tumor_count to be (\d+)$/) do |arg1|
  expect(@rec.call_tumor_count).to eq arg1.to_i
end

# call_normal_count compared as an Integer.
Then(/^I expect rec\.call_normal_count to be (\d+)$/) do |arg1|
  expect(@rec.call_normal_count).to eq arg1.to_i
end

# The integer and fractional parts are captured separately and re-joined into
# a Float before comparing.
Then(/^I expect rec\.call_tumor_relative_count to be (\d+)\.(\d+)$/) do |arg1, arg2|
  expect(@rec.call_tumor_relative_count).to eq((arg1.to_s+'.'+arg2.to_s).to_f)
end
129 | 
130 | 
131 | 


--------------------------------------------------------------------------------
/features/multisample.feature:
--------------------------------------------------------------------------------
 1 | @multi
 2 | Feature: Multi-sample VCF
 3 | 
 4 |   Here we take a VCF line and parse the information for multiple named 
 5 |   samples
 6 | 
 7 |   Scenario: When parsing a record
 8 | 
 9 |     Given the multi sample header line
10 |     """
11 | #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  Original	s1t1	s2t1	s3t1	s1t2	s2t2	s3t2
12 |     """
13 |     When I parse the header
14 |     Given multisample vcf line
15 |     """
16 | 1       10321   .       C       T       106.30  .       AC=5;AF=0.357;AN=14;BaseQRankSum=3.045;DP=1537;Dels=0.01;FS=5.835;HaplotypeScore=220.1531;MLEAC=5;MLEAF=0.357;MQ=26.69;MQ0=258;MQRankSum=-4.870;QD=0.10;ReadPosRankSum=0.815    GT:AD:DP:GQ:PL  0/1:189,25:218:30:30,0,810      0/0:219,22:246:24:0,24,593      0/1:218,27:248:34:34,0,1134     0/0:220,22:248:56:0,56,1207     0/1:168,23:193:19:19,0,493      0/1:139,22:164:46:46,0,689      0/1:167,26:196:20:20,0,522    
17 |     """
18 |     When I parse the record
19 |     Then I expect rec.valid? to be true
20 |     Then I expect rec.chrom to contain "1"
21 |     Then I expect rec.pos to contain 10321
22 |     Then I expect rec.ref to contain "C"
23 |     And I expect multisample rec.alt to contain ["T"]
24 |     And I expect rec.qual to be 106.30
25 |     And I expect rec.info.ac to be 5
26 |     And I expect rec.info.af to be 0.357
27 |     And I expect rec.info.dp to be 1537
28 |     And I expect rec.info['dp'] to be 1537
29 |     And I expect rec.info.readposranksum to be 0.815
30 |     And I expect rec.info['ReadPosRankSum'] to be 0.815
31 |     And I expect rec.info.fields to contain ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"]
32 |     And I expect rec.sample['Original'].ad to be [189,25]
33 |     And I expect rec.sample['Original'].gt to be "0/1"
34 |     And I expect rec.sample['s3t2'].ad to be [167,26]
35 |     And I expect rec.sample['s3t2'].dp to be 196 
36 |     And I expect rec.sample['s3t2'].gq to be 20
37 |     And I expect rec.sample['s3t2'].pl to be [20,0,522]
38 |     # And the nicer self resolving
39 |     And I expect rec.sample.original.gt to be "0/1"
40 |     And I expect rec.sample.s3t2.pl to be [20,0,522]
41 |     # And the even better
42 |     And I expect rec.original.gt? to be true
43 |     And I expect rec.original.gt to be "0/1"
44 |     And I expect rec.s3t2.pl to be [20,0,522]
45 |     # Check for missing data
46 |     And I expect test rec.missing_samples? to be false 
47 |     And I expect test rec.original? to be true
48 |     # Special functions
49 |     And I expect r.original? to be true
50 |     And I expect r.original.gti? to be true
51 |     And I expect r.original.gti to be [0,1]
52 |     And I expect r.original.gti[1] to be 1
53 |     And I expect r.original.gts? to be true
54 |     And I expect r.original.gts to be ["C","T"]
55 |     And I expect r.original.gts[1] to be "T"
56 | 
57 |     Given multisample vcf line
58 |     """
59 | 1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL  ./. ./. 1/1:2,2:4:6:66,6,0  1/1:4,1:5:3:36,3,0  ./. ./.  0/0:6,0:6:3:0,3,33
60 |     """
61 |     When I parse the record
62 |     Then I expect rec.pos to contain 10723
63 |     Then I expect rec.valid? to be true
64 |     And I expect rec.original? to be false
65 |     And I expect rec.sample.s1t1? to be false
66 |     And I expect rec.sample.s3t2? to be true
67 |     And I expect rec.missing_samples? to be true
68 | 
69 |     # Phased genotype
70 |     Given multisample vcf line
71 |     """
72 | 1 10723 . C G 73.85 . AC=4;AF=0.667;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL  0|1 ./. 1/1:2,2:4:6:66,6,0  1/1:4,1:5:3:36,3,0  ./. ./.  0/0:6,0:6:3:0,3,33
73 |     """
74 |     When I parse the record
75 |     Then I expect rec.pos to contain 10723
76 |     Then I expect rec.valid? to be true
77 |     And I expect r.original? to be true
78 |     And I expect r.original.gts? to be true
79 |     And I expect r.original.gts to be ["C","G"]
80 |     And I expect r.original.gts[0] to be "C"
81 |     And I expect r.original.gts[1] to be "G"
82 |     
83 |     # INFO fields with matching tails
84 |     Given multisample vcf line
85 |     """
86 | 1 10723 . C G 73.85 . AC=4;AF=0.667;CIEND=999;END=111;AN=6;BaseQRankSum=1.300;DP=18;Dels=0.00;FS=3.680;HaplotypeScore=0.0000;MLEAC=4;MLEAF=0.667;MQ=20.49;MQ0=11;MQRankSum=1.754;QD=8.21;ReadPosRankSum=0.000 GT:AD:DP:GQ:PL  0|1 ./. 1/1:2,2:4:6:66,6,0  1/1:4,1:5:3:36,3,0  ./. ./.  0/0:6,0:6:3:0,3,33
87 |     """
88 |     When I parse the record
89 |     Then I expect r.info.end to be 111
90 |     And I expect r.info.ciend to be 999
91 | 


--------------------------------------------------------------------------------
/features/step_definitions/sfilter.rb:
--------------------------------------------------------------------------------
  1 | Given(/^the VCF line$/) do |string|
  2 |   @header = VcfHeader.new
  3 |   @header.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample")
  4 |   @vcfline = string
  5 | end
  6 | 
  7 | When(/^I evaluate '([^']+)'$/) do |arg1|
  8 |   # concat VCF line with sample (arg1)
  9 |   @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t"))
 10 |   @rec = VcfRecord.new(@fields,@header)
 11 |   p @rec
 12 |   @g = @rec.sample['Sample']
 13 |   p @g
 14 |   expect(@g).not_to be nil
 15 |   @s = VcfSample::Sample.new(@rec,@g)
 16 |   @ignore_missing = false
 17 | end
 18 | 
 19 | Then(/^I expect s\.empty\? to be false$/) do
 20 |   expect(@s.empty?).to be false
 21 |   expect(@s.sfilter("s.empty?",do_cache: false)).to be false
 22 | end
 23 | 
 24 | Then(/^I expect s\.dp\? to be true$/) do
 25 |   p ['eval s.dp?',@s.eval("s.dp?",do_cache: false)]
 26 |   p ['eval s.dp',@s.eval("s.dp",do_cache: false)]
 27 |   p @g.dp
 28 |   p @s.dp
 29 |   p @s.sfilter("s.dp?",do_cache: false)
 30 |   expect(@s.eval("s.dp?",do_cache: false)).to be true
 31 | end
 32 | 
 33 | Then(/^I expect s\.dp to be (\d+)$/) do |arg1|
 34 |   # p @s.eval("s.dp")
 35 |   p :now
 36 |   p ['eval s.dp?',@s.eval("s.dp?",do_cache: false)]
 37 |   p ['eval s.dp',@s.eval("s.dp",do_cache: false)]
 38 |   expect(@s.eval("s.dp",do_cache: false)).to equal arg1.to_i
 39 | end
 40 | 
 41 | Then(/^sfilter 's\.dp>(\d+)' to be true$/) do |arg1|
 42 |   expect(@s.sfilter("dp>#{arg1}",do_cache: false)).to be true
 43 | end
 44 | 
 45 | When(/^I evaluate missing '([^']+)'$/) do |arg1|
 46 |   # concat VCF line with sample (arg1)
 47 |   @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t"))
 48 |   @rec = VcfRecord.new(@fields,@header)
 49 |   p @rec
 50 |   @g = @rec.sample['Sample']
 51 |   @s = VcfSample::Sample.new(@rec,@g)
 52 |   p @s
 53 |   expect(@s).not_to be nil
 54 |   @ignore_missing = false
 55 | end
 56 | 
 57 | Then(/^I expect s\.dp\? to be false$/) do
 58 |   expect(@s.eval("s.dp?",do_cache: false)).to be false
 59 | end
 60 | 
 61 | Then(/^I expect s\.dp to be nil$/) do
 62 |   expect(@s.eval("s.dp",ignore_missing_data: @ignore_missing, do_cache: false)).to be nil 
 63 | end
 64 | 
 65 | Then(/^sfilter 's\.dp>(\d+)' to throw an error$/) do |arg1|
 66 |   expect { @s.eval("s.dp>#{arg1}",do_cache: false) }.to raise_error NoMethodError
 67 | end
 68 | 
 69 | Then(/^sfilter 's\.dp>(\d+)' to be false$/) do |arg1|
 70 |    expect(@s.sfilter("s.dp>#{arg1}",ignore_missing_data: @ignore_missing, do_cache: false)).to be false
 71 | end
 72 | 
 73 | When(/^I evaluate empty '\.\/\.'$/) do
 74 |   # concat VCF line with sample (arg1)
 75 |   @fields = VcfLine.parse((@vcfline.split(/\s+/)+['./.']).join("\t"))
 76 |   @rec = VcfRecord.new(@fields,@header)
 77 |   p @rec
 78 |   @g = @rec.sample['Sample']
 79 |   @s = VcfSample::Sample.new(@rec,@g)
 80 |   p @s
 81 |   expect(@s).not_to be nil
 82 |   @ignore_missing = false
 83 | end
 84 | 
 85 | When(/^I evaluate missing '([^']+)' with ignore missing$/) do |arg1|
 86 |   # concat VCF line with sample (arg1)
 87 |   @fields = VcfLine.parse((@vcfline.split(/\s+/)+[arg1]).join("\t"))
 88 |   @rec = VcfRecord.new(@fields,@header)
 89 |   p @rec
 90 |   @g = @rec.sample['Sample']
 91 |   @s = VcfSample::Sample.new(@rec,@g)
 92 |   p @s
 93 |   expect(@s).not_to be nil
 94 |   @ignore_missing = true
 95 | end
 96 | 
 97 | Then(/^I expect s\.empty\? to be true$/) do
 98 |   expect(@s.sfilter("s.empty?",do_cache: false)).to be true
 99 | end
100 | 
# Evaluating s.dp is expected to raise NoMethodError.
#
# The eval must only happen inside the expect block: a debug print of the
# same eval in front of it would raise first and fail the step before the
# expectation gets a chance to catch the error.
Then(/^I expect s\.dp to throw an error$/) do
  # @s.instance_eval { undef :dp }
  expect { @s.eval("s.dp",do_cache: false) }.to raise_error NoMethodError
end
106 | 
# Setup with a fixed './.' sample column; later steps that read
# @ignore_missing will ignore missing data. Record and sample are dumped
# with p.
When(/^I evaluate empty '\.\/\.' with ignore missing$/) do
  columns = @vcfline.split(/\s+/) + ['./.']
  @fields = VcfLine.parse(columns.join("\t"))
  @rec = VcfRecord.new(@fields,@header)
  p @rec
  @g = @rec.sample['Sample']
  @s = VcfSample::Sample.new(@rec,@g)
  p @s
  expect(@s).not_to be nil
  @ignore_missing = true
end

# Evaluating the "what?" predicate is expected to raise NoMethodError.
Then(/^I expect s\.what\? to throw an error$/) do
  attempt = lambda { @s.eval("s.what?",do_cache: false) }
  expect(&attempt).to raise_error NoMethodError
end

# Evaluating the "what" field is expected to raise NoMethodError.
Then(/^I expect s\.what to throw an error$/) do
  attempt = lambda { @s.eval("s.what",do_cache: false) }
  expect(&attempt).to raise_error NoMethodError
end
126 | 
# The three steps below evaluate an expression on the record through the
# sample and compare the result with the value written in the step text.
# The captured values are used instead of hard-coded constants, so the steps
# check what the feature file actually states.

Then(/^I expect r\.chrom to be "(.*?)"$/) do |arg1|
  expect(@s.eval("r.chrom",do_cache: false)).to eq arg1
end

Then(/^I expect r\.alt to be \["(.*?)"\]$/) do |arg1|
  expect(@s.eval("r.alt",do_cache: false)).to eq [arg1]
end

# The integer and fractional parts are captured separately and re-joined into
# a Float before comparing.
Then(/^I expect r\.info\.af to be (\d+)\.(\d+)$/) do |arg1, arg2|
  expect(@s.eval("r.info.af",do_cache: false)).to eq "#{arg1}.#{arg2}".to_f
end
138 | 
139 | 
140 | 


--------------------------------------------------------------------------------
/doc/Using_RDF.md:
--------------------------------------------------------------------------------
  1 | # Using bio-vcf with RDF
  2 | 
  3 | bio-vcf can write many output formats. In this exercise we will load
  4 | a triple store (4store) with VCF data and run some queries on it.
  5 | 
  6 | ## Install and start 4store
  7 | 
  8 | ### On GNU Guix
  9 | 
 10 | See https://github.com/pjotrp/guix-notes/blob/master/packages/4store.org
 11 | 
 12 | ### On Debian
 13 | 
 14 | Get root
 15 | 
 16 | ```sh
 17 | su
 18 | apt-get install avahi-daemon
 19 | apt-get install raptor-utils
 20 | exit
 21 | ```
 22 | 
 23 | As normal user
 24 | 
 25 | ```sh
 26 | guix package -i sparql-query curl
 27 | ```
 28 | 
 29 | Initialize and start the server again as root (or another user)
 30 | 
 31 | ```
 32 | su
 33 | export PATH=/home/user/.guix-profile/bin:$PATH
 34 | mkdir -p /var/lib/4store
 35 | dbname=test
 36 | 4s-backend-setup $dbname
 37 | 4s-backend $dbname
 38 | 4s-httpd -p 8000 $dbname
 39 | ```
 40 | 
 41 | Try the web browser and point it to http://localhost:8000/status/
 42 | 
 43 | Open a new terminal as user.
 44 | 
 45 | 
 46 | Generate rdf with bio-vcf template
 47 | 
 48 | ```ruby
 49 | =HEADER
 50 | @prefix : <http://biobeat.org/rdf/ns#> .
 51 | =BODY
 52 | <%
 53 | id = ['chr'+rec.chr,rec.pos,rec.alt].join('_')
 54 | %>
 55 | :<%= id %>
 56 |   :query_id "<%= id %>";
 57 |   :chr "<%= rec.chr %>" ;
 58 |   :alt "<%= rec.alt.join("") %>" ;
 59 |   :pos <%= rec.pos %> .
 60 | 
 61 | 
 62 | ```
 63 | 
 64 | so it looks like
 65 | 
 66 | ```
 67 | :chrX_134713855_A
 68 |   :query_id "chrX_134713855_A";
 69 |   :chr "X" ;
 70 |   :alt "A" ;
 71 |   :pos 134713855 .
 72 | ```
 73 | 
 74 | and test with rapper using [gatk_exome.vcf](https://github.com/pjotrp/bioruby-vcf/blob/master/test/data/input/gatk_exome.vcf)
 75 | 
 76 | ```sh
 77 | cat gatk_exome.vcf |bio-vcf -v --template rdf_template.erb
 78 | cat gatk_exome.vcf |bio-vcf -v --template rdf_template.erb > my.rdf
 79 | rapper -i turtle my.rdf
 80 | ```
 81 | 
 82 | Load into 4store (if rapper reported no errors)
 83 | 
 84 | ```bash
 85 | rdf=my.rdf
 86 | uri=http://localhost:8000/data/http://biobeat.org/data/$rdf
 87 | curl -X DELETE $uri
 88 | curl -T $rdf -H 'Content-Type: application/x-turtle' $uri
 89 | 201 imported successfully
 90 | This is a 4store SPARQL server
 91 | ```
 92 | 
 93 | First SPARQL query
 94 | 
 95 | ```sparql
 96 | SELECT ?id
 97 | WHERE
 98 | {
 99 |   ?id   <http://biobeat.org/rdf/ns#chr>    "X".
100 | }
101 | ```
102 | 
103 | ```
104 | cat sparql1.rq |sparql-query "http://localhost:8000/sparql/" -p
105 | ┌──────────────────────────────────────────────┐
106 | │ ?id                                          │
107 | ├──────────────────────────────────────────────┤
108 | │ <http://biobeat.org/rdf/ns#chrX_107911706_C> │
109 | │ <http://biobeat.org/rdf/ns#chrX_55172537_A>  │
110 | │ <http://biobeat.org/rdf/ns#chrX_134713855_A> │
111 | └──────────────────────────────────────────────┘
112 | ```
113 | 
114 | A simple python query may look like
115 | 
116 | ```python
117 | import requests
118 | import subprocess
119 | 
120 | host = "http://localhost:8000/"
121 | 
122 | query = """
123 | SELECT ?s ?p ?o WHERE {
124 |     ?s ?p ?o .
125 | } LIMIT 10
126 | """
127 | 
128 | r = requests.post(host, data={ "query": query, "output": "text" })
129 | # print r.url
130 | 
131 | print r.text
132 | ```
133 | 
134 | renders
135 | 
136 | ```
137 | ?id
138 | <http://biobeat.org/rdf/ns#chrX_107911706_C>
139 | <http://biobeat.org/rdf/ns#chrX_55172537_A>
140 | <http://biobeat.org/rdf/ns#chrX_134713855_A>
141 | ```
142 | 
143 | A working example if you are using the server
144 | http://guix.genenetwork.org and the correct PREFIX:
145 | 
146 | ```python
147 | #! /usr/bin/env python
148 | import requests
149 | import subprocess
150 | 
151 | host = "http://guix.genenetwork.org/sparql/"
152 | query = """
153 | PREFIX : <http://biobeat.org/rdf/pjotr/ns#>
154 | SELECT ?id ?chr ?pos ?alt
155 | WHERE
156 | {
157 |   { ?id   :chr      "X" . }
158 |   UNION
159 |   { ?id   :chr      "1" . }
160 |   ?id   :chr    ?chr .
161 |   ?id   :alt    ?alt .
162 |   ?id   :pos    ?pos .
163 |   FILTER (?pos > 107911705) .
164 | }
165 | """
166 | r = requests.post(host, data={ "query": query, "output": "text" })
167 | print r.text
168 | ```
169 | 
170 | ## EBI
171 | 
172 | 
173 | EBI SPARQL has some advanced examples of queries, such as
174 | 
175 | ```
176 | https://www.ebi.ac.uk/rdf/services/ensembl/sparql
177 | PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
178 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
179 | PREFIX dcterms: <http://purl.org/dc/terms/>
180 | PREFIX dc: <http://purl.org/dc/elements/1.1/>
181 | PREFIX obo: <http://purl.obolibrary.org/obo/>
182 | PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
183 | PREFIX sio: <http://semanticscience.org/resource/>
184 | PREFIX faldo: <http://biohackathon.org/resource/faldo#>
185 | PREFIX identifiers: <http://identifiers.org/>
186 | PREFIX ensembl: <http://rdf.ebi.ac.uk/resource/ensembl/>
187 | PREFIX ensembltranscript: <http://rdf.ebi.ac.uk/resource/ensembl.transcript/>
188 | PREFIX ensemblexon: <http://rdf.ebi.ac.uk/resource/ensembl.exon/>
189 | PREFIX ensemblprotein: <http://rdf.ebi.ac.uk/resource/ensembl.protein/>
190 | PREFIX ensemblterms: <http://rdf.ebi.ac.uk/terms/ensembl/>
191 | 
192 | SELECT DISTINCT ?transcript ?id ?typeLabel ?reference ?begin ?end ?location {
193 |   ?transcript obo:SO_transcribed_from ensembl:ENSG00000139618 ;
194 |               a ?type;
195 |               dc:identifier ?id .
196 |   OPTIONAL {
197 |     ?transcript faldo:location ?location .
198 |     ?location faldo:begin [faldo:position ?begin] .
199 |     ?location faldo:end [faldo:position ?end ] .
200 |     ?location faldo:reference ?reference .
201 |   }
202 |   OPTIONAL {?type rdfs:label ?typeLabel}
203 | }
204 | ```
205 | 
206 | See https://www.ebi.ac.uk/rdf/services/ensembl/sparql
207 | 
208 | # Exercise
209 | 
210 | Today's exercise is to create a graph using bio-vcf and/or a small program using
211 | RDF triples and define a SPARQL query.
212 | 
213 | The more interesting the graph/SPARQL the better.
214 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/vcfsample.rb:
--------------------------------------------------------------------------------
  1 | module BioVcf
  2 |   module VcfSample
  3 | 
  4 |     # Check whether a sample is empty (on the raw string value)
  5 |     def VcfSample::empty? s
  6 |       s==nil or s == './.' or s == '' or s[0..2]=='./.' or s[0..1] == '.:'
  7 |     end
  8 | 
  9 |     class Sample
 10 |       # Build a Sample around one genotype field of a record.
 11 |       # num is the sample number, rec the owning record; genotypefield looks like
 12 |       # #<BioVcf::VcfGenotypeField:0x00000001a0c188 @values=["0/0", "151,8", "159", "99", "0,195,2282"], @format={"GT"=>0, "AD"=>1, "DP"=>2, "GQ"=>3, "PL"=>4},
 13 |       def initialize num,rec,genotypefield
 14 |         @num = num                 # compared with header.num_samples-1 in is_last?
 15 |         @rec = rec
 16 |         @sample = genotypefield
 17 |         @format = @sample.format   # field name => position in @values (see example above)
 18 |         @values = @sample.values   # the values of this sample, indexed via @format
 19 |       end
 20 | 
 21 |       def empty?
 22 |        cache_empty ||= VcfSample::empty?(@sample.to_s)
 23 |       end
 24 | 
 25 |       def is_last?
 26 |         # $stderr.print(@num,@rec.header.num_samples)
 27 |         @num == @rec.header.num_samples-1
 28 |       end
 29 | 
 30 |       def name
 31 |         @sample.name
 32 |       end
 33 | 
 34 |       def eval expr, ignore_missing_data: false, quiet: false, do_cache: true # NOTE: inside this class a bare eval(...) call resolves to this method
 35 |         caching_eval :eval, :call_cached_eval, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
 36 |       end
 37 | 
 38 |       def sfilter expr, ignore_missing_data: false, quiet: true, do_cache: true # unlike its siblings, quiet defaults to true here
 39 |         caching_eval :sfilter, :call_cached_sfilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
 40 |       end
 41 | 
 42 |       def ifilter expr, ignore_missing_data: false, quiet: false, do_cache: true # same as eval, but compiled under :call_cached_ifilter
 43 |         caching_eval :ifilter, :call_cached_ifilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
 44 |       end
 45 | 
 46 |       def efilter expr, ignore_missing_data: false, quiet: false, do_cache: true # same as eval, but compiled under :call_cached_efilter
 47 |         caching_eval :efilter, :call_cached_efilter, expr, ignore_missing_data: ignore_missing_data, quiet: quiet, do_cache: do_cache
 48 |       end
 49 | 
 50 |       # Split GT into index values
 51 |       def gti
 52 |         v = fetch_values("GT")
 53 |         v = './.' if v == '.' #In case that you have a single missing value, make both as missing.
 54 |         v.split(/[\/\|]/).map{ |v| (v=='.' ? nil : v.to_i) }
 55 |       end
 56 | 
 57 |       def gtindex
 58 |         v = fetch_values("GT")
 59 |         return case v
 60 |                when nil then nil
 61 |                when '.' then nil
 62 |                when './.' then nil
 63 |                when '0/0' then 0
 64 |                when '0/1' then 1
 65 |                when '1/1' then 2
 66 |                else
 67 |                  raise "Unknown genotype #{v}"
 68 |                end
 69 |       end
 70 | 
 71 |       # Split GT into into a nucleode sequence
 72 |       def gts
 73 |         gti.map { |i| (i ? @rec.get_gt(i) : nil) }
 74 |       end
 75 | 
 76 |       def cache_method(name, &block) # defines `name` on the class itself, so the method exists for every instance afterwards
 77 |         self.class.send(:define_method, name, &block)
 78 |       end
 79 | 
 80 |       def [] name
 81 |         if @format[name]
 82 |           v = fetch_values(name)
 83 |           return nil if VcfValue::empty?(v)
 84 |           return ConvertStringToValue::convert(v)
 85 |         end
 86 |         nil
 87 |       end
 88 | 
 89 |       def method_missing(m, *args, &block)
 90 |         name = m.to_s.upcase
 91 |         # Dynamic field access on the upcased method name: s.dp reads field "DP",
 92 |         # s.dp? tells whether that field holds a non-empty value.
 93 |         if name =~ /\?$/
 94 |           # test for valid field
 95 |           return !VcfValue::empty?(fetch_values(name.chop)) # fetch_values raises NoMethodError when the field is not in @format
 96 |         else
 97 |           if @format[name]
 98 |             cache_method(m) { # define the accessor on the class once, then call it below
 99 |               v = fetch_values(name)
100 |               return nil if VcfValue::empty?(v)
101 |               ConvertStringToValue::convert(v)
102 |             }
103 |             self.send(m)
104 |           else
105 |             super(m, *args, &block) # not a known field: fall back to the default handling
106 |           end
107 |         end
108 |       end
109 | 
110 |   private
111 | 
112 |       def fetch_values name
113 |         n = @format[name]
114 |         raise NoMethodError.new("Unknown sample field <#{name}>") if not n
115 |         @values[n]  # <-- save names with upcase!
116 |       end
117 | 
118 |       def caching_eval method, cached_method, expr, ignore_missing_data: false, quiet: false, do_cache: true # shared worker behind eval/sfilter/ifilter/efilter
119 |         begin
120 |           if do_cache
121 |             if not respond_to?(cached_method) # compile expr into a method only once; later calls reuse it whatever expr they pass
122 |               code =
123 |               """
124 |               def #{cached_method}(rec,sample)
125 |                 r = rec
126 |                 s = sample
127 |                 #{expr}
128 |               end
129 |               """
130 |               self.class.class_eval(code) # the compiled method lives on the class, so all instances share it
131 |             end
132 |             self.send(cached_method,@rec,self) # inside expr, r is the record and s is this Sample
133 |           else
134 |             # This is used for testing mostly
135 |             print "WARNING: NOT CACHING #{method}\n"
136 |             self.class.class_eval { undef :call_cached_eval } if respond_to?(:call_cached_eval) # only the eval and sfilter
137 |             self.class.class_eval { undef :call_cached_sfilter } if respond_to?(:call_cached_sfilter) # compiled methods are dropped
138 |             r = @rec
139 |             s = @sample
140 |             eval(expr) # NOTE(review): this class defines its own eval, so this call appears to re-enter Sample#eval rather than Kernel#eval - confirm intent
141 |           end
142 |         rescue NoMethodError => e
143 |           $stderr.print "\n#{method} trying to evaluate on an empty sample #{@sample.values.to_s}!\n" if not empty? and not quiet
144 |           if not quiet
145 |             $stderr.print [:format,@format,:sample,@values],"\n"
146 |             $stderr.print [:filter,expr],"\n"
147 |           end
148 |           if ignore_missing_data
149 |             $stderr.print e.message if not quiet and not empty?
150 |             return false # with ignore_missing_data a failed evaluation counts as "no match"
151 |           else
152 |             raise NoMethodError.new(e.message + ". Can not evaluate empty sample data by default: test for s.empty? or use the -i switch!")
153 |           end
154 |         end
155 |       end
156 | 
157 |     end
158 | 
159 |   end
160 | end
161 | 


--------------------------------------------------------------------------------
/test/performance/metrics.md:
--------------------------------------------------------------------------------
  1 | Round of testing on a MacBook Pro running Linux with an Intel(R) Core(TM) i5-3210M CPU @ 2.50GHz
  2 | and
  3 | 
  4 |     ruby -v
  5 |     ruby 2.1.0p0 (2013-12-25 revision 44422) [x86_64-linux]
  6 | 
  7 |     wc test/tmp/test.vcf 
  8 |       12469  137065 2053314 test/tmp/test.vcf
  9 | 
 10 |     time ./bin/bio-vcf -i --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null
 11 |     vcf 0.0.3-pre4 (biogem Ruby 2.1.0) by Pjotr Prins 2014
 12 |     Options: {:show_help=>false, :ignore_missing=>true, :filter=>"r.info.dp>20", :sfilter=>"s.dp>10"}
 13 |     real    0m1.215s
 14 |     user    0m1.208s
 15 |     sys     0m0.004s
 16 | 
 17 | Reload
 18 | 
 19 |     time ./bin/bio-vcf -i --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null
 20 |     vcf 0.0.3-pre4 (biogem Ruby 2.1.0) by Pjotr Prins 2014
 21 |     Options: {:show_help=>false, :ignore_missing=>true, :filter=>"r.info.dp>20", :sfilter=>"s.dp>10"}
 22 |     real    0m1.194s
 23 |     user    0m1.172s
 24 |     sys     0m0.016s
 25 | 
 26 | Introduced method caching
 27 |         
 28 |     real    0m1.190s
 29 |     user    0m1.180s
 30 |     sys     0m0.004s
 31 | 
 32 | Introduce !!Float test
 33 | 
 34 |     real    0m1.187s
 35 |     user    0m1.180s
 36 |     sys     0m0.004s
 37 | 
 38 | Cache sample index 
 39 | 
 40 |     real    0m1.156s
 41 |     user    0m1.148s
 42 |     sys     0m0.004s
 43 | 
 44 | Run the profiler 
 45 | 
 46 |     ruby  -rprofile  ./bin/bio-vcf -i --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null
 47 |     vcf 0.0.3-pre4 (biogem Ruby 2.1.0) by Pjotr Prins 2014
 48 |     Options: {:show_help=>false, :ignore_missing=>true, :filter=>"r.info.dp>20", :sfilter=>"s.dp>10"}
 49 |       %   cumulative   self              self     total
 50 |      time   seconds   seconds    calls  ms/call  ms/call  name
 51 |       9.45     2.19      2.19    34968     0.06     0.76  Object#parse_line
 52 |       7.25     3.87      1.68    75031     0.02     0.03  BioVcf::VcfRecordInfo#[]=
 53 |       7.12     5.52      1.65    34968     0.05     0.29  Kernel.eval
 54 |       6.86     7.11      1.59    87481     0.02     0.10  BioVcf::VcfRecordInfo#initialize
 55 |       5.57     8.40      1.29    35994     0.04     0.47  Array#each
 56 |       4.14     9.36      0.96    34253     0.03     0.65  BioVcf::VcfRecord#each_sample
 57 |       3.93    10.27      0.91    93880     0.01     0.03  BioVcf::VcfRecordParser.get_format
 58 |       3.88    11.17      0.90   145920     0.01     0.01  String#split
 59 | 
 60 | Late parsing of info field without split:
 61 | 
 62 |     real    0m1.124s
 63 |     user    0m1.120s
 64 |     sys     0m0.008s
 65 | 
 66 | Global sample info caching
 67 | 
 68 |     real    0m1.032s
 69 |     user    0m1.020s
 70 |     sys     0m0.008s
 71 | 
 72 | Assign some repeated Hash queries
 73 | 
 74 |     real    0m1.028s
 75 |     user    0m1.024s
 76 |     sys     0m0.000s
 77 | 
 78 | Profiler now picking out eval for further optimization
 79 | 
 80 |       %   cumulative   self              self     total
 81 |      time   seconds   seconds    calls  ms/call  ms/call  name
 82 |      10.45     1.80      1.80    34968     0.05     0.59  Object#parse_line
 83 |       7.89     3.16      1.36    34968     0.04     0.17  Kernel.eval
 84 |       5.69     4.14      0.98    34253     0.03     0.57  BioVcf::VcfRecord#each_sample
 85 |       4.93     4.99      0.85    12497     0.07     1.37  nil#
 86 | 
 87 | Compiling sample eval
 88 | 
 89 |     real    0m0.820s
 90 |     user    0m0.812s
 91 |     sys     0m0.004s
 92 | 
 93 | Compiling record eval
 94 | 
 95 |     real    0m0.647s
 96 |     user    0m0.644s
 97 |     sys     0m0.000s
 98 | 
 99 | Walk examples by index, rather than by name
100 | 
101 |     real    0m0.612s
102 |     user    0m0.596s
103 |     sys     0m0.012s
104 | 
105 | More caching
106 | 
107 |     real    0m0.600s
108 |     user    0m0.592s
109 |     sys     0m0.004s
110 | 
111 | And the latest profiling
112 | 
113 |       %   cumulative   self              self     total
114 |      time   seconds   seconds    calls  ms/call  ms/call  name
115 |      12.98     2.02      2.02    34968     0.06     0.51  Object#parse_line
116 |       7.78     3.23      1.21    22518     0.05     0.14  BioVcf::VcfRecord#sample_by_index
117 |       5.59     4.10      0.87    34253     0.03     0.47  BioVcf::VcfRecord#each_sample
118 |       4.82     4.85      0.75    34968     0.02     0.03  BioVcf::ConvertStringToValue.integer?
119 |       4.50     5.55      0.70    12450     0.06     0.13  BioVcf::VcfRecordInfo#method_missing
120 |       4.31     6.22      0.67    69974     0.01     0.03  Class#new
121 |       4.24     6.88      0.66    12499     0.05     1.23  nil#
122 |       3.79     7.47      0.59    12450     0.05     0.06  BioVcf::VcfLine.parse
123 | 
124 | Introduced --num-threads
125 | 
126 |     time ./bin/bio-vcf -i --num-threads --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null
127 | 
128 | on a dual-core running Linux
129 | 
130 |     real    0m0.389s
131 |     user    0m1.132s
132 |     sys     0m0.148s
133 | 
134 | Latest
135 | 
136 |     time ./bin/bio-vcf -i --num-threads 4 --thread-lines 2000 --filter 'r.info.dp>20' --sfilter 's.dp>10' < test/tmp/test.vcf > /dev/null
137 |     vcf 0.8.3-pre1 (biogem Ruby 2.1.0) by Pjotr Prins 2014
138 |     Options: {:show_help=>false, :source=>"https://github.com/CuppenResearch/bioruby-vcf", :version=>"0.8.3-pre1 (Pjotr Prins)", :date=>"2014-12-31 13:30:32 +0300", :thread_lines=>2000, :ignore_missing=>true, :num_threads=>4, :filter=>"r.info.dp>20", :sfilter=>"s.dp>10"}
139 |     real    0m0.600s
140 |     user    0m1.472s
141 |     sys     0m0.068s
142 | 
143 | Profiling:
144 | 
145 | .  %   cumulative   self              self     total
146 |  time   seconds   seconds    calls  ms/call  ms/call  name
147 |  15.01     3.23      3.23    34968     0.09     0.82  Object#parse_line
148 |   8.22     5.00      1.77    22518     0.08     0.22  BioVcf::VcfRecord#sample_by_index
149 |   4.97     6.07      1.07    22518     0.05     0.27  BioVcf::VcfSample::Sample#sfilter
150 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/vcfheader.rb:
--------------------------------------------------------------------------------
  1 | # This module parses the VCF header. A header consists of lines
  2 | # containing fields. Most fields are of 'key=value' type and appear
  3 | # only once.  These can be retrieved with the find_field method.
  4 | #
  5 | # INFO, FORMAT and contig fields are special as they appear multiple times
  6 | # and contain multiple key values (identified by an ID field).
  7 | # To retrieve these call 'info' and 'format' functions respectively,
  8 | # which return a hash on the contained ID.
  9 | #
 10 | # For the INFO and FORMAT fields a Ragel parser is used, mostly to
 11 | # deal with embedded quoted fields.
 12 | 
 13 | module BioVcf
 14 | 
 15 |   module VcfHeaderParser
 16 |     def VcfHeaderParser.get_column_names(lines)
 17 |       lines.each do | line |
 18 |         if line =~ /^#[^#]/
 19 |           # the first line that starts with a single hash
 20 |           names = line.split
 21 |           names[0].sub!(/^#/,'')
 22 |           return names
 23 |         end
 24 |       end
 25 |       nil
 26 |     end
 27 | 
 28 |     def VcfHeaderParser.parse_field(line, debug) # hands one header line to RagelKeyValues.run_lexer and returns its result
 29 |       BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(line, debug: debug)
 30 |     end
 31 |   end
 32 | 
 33 |   class VcfHeader
 34 | 
 35 |     attr_reader :lines, :field
 36 | 
 37 |     def initialize(debug = false)
 38 |       @debug = debug              # passed on to VcfHeaderParser.parse_field
 39 |       @lines = []                 # header lines, in the order they were added
 40 |       @field = {}                 # values already found by find_field
 41 |       @meta = nil                 # result of meta(), once computed
 42 |       @cached_filter_index = {}   # cache used by sample_subset_index
 43 |     end
 44 | 
 45 |     # Append header text; a multi-line string is split into separate lines
 46 |     def add line
 47 |       @lines += line.split(/\n/)
 48 |     end
 49 | 
 50 |     # Push a special key value list to the header
 51 |     def tag h
 52 |       h2 = h.dup # work on a copy so the caller's hash is not modified
 53 |       [:show_help,:skip_header,:verbose,:quiet,:debug].each { |key| h2.delete(key) } # these keys are left out of the tag
 54 |       info = h2.map { |k,v| k.to_s.capitalize+'='+'"'+v.to_s+'"' }.join(',')
 55 |       line = '##BioVcf=<'+info+'>'
 56 |       @lines.insert(-2,line) # goes in just before the last header line
 57 |       line
 58 |     end
 59 | 
 60 |     def version
 61 |       @version ||= lines[0].scan(/##fileformat=VCFv(\d+\.\d+)/)[0][0]
 62 |     end
 63 | 
 64 |     def column_names # names from the first header line that starts with a single '#'; memoized
 65 |       @column_names ||= VcfHeaderParser::get_column_names(@lines)
 66 |     end
 67 | 
 68 |     def columns # number of column names; memoized
 69 |       @column ||= column_names.size
 70 |     end
 71 | 
 72 |     def printable_header_line(fields)
 73 |       fields.map { | field |
 74 |         if field == '#samples'
 75 |           samples
 76 |         else
 77 |           field
 78 |         end
 79 |       }.join("\t")
 80 |     end
 81 | 
 82 |     def samples
 83 |       @samples ||= if column_names.size > 8
 84 |                      column_names[9..-1]
 85 |                    else
 86 |                      []
 87 |                    end
 88 |     end
 89 | 
 90 |     def samples_index_array
 91 |       @all_samples_index ||= column_names[9..-1].fill{|i| i}
 92 |     end
 93 | 
 94 |     def num_samples # size of the samples list (0 if samples is nil); memoized
 95 |       @num_samples ||= ( samples == nil ? 0 : samples.size )
 96 |     end
 97 | 
 98 |     # Returns the field number for a sample (starting with 9)
 99 |     def sample_index
100 |       return @sample_index if @sample_index
101 |       index = {}
102 |       samples.each_with_index { |k,i| index[k] = i+9 ; index[k.downcase] = i+9 }
103 |       @sample_index = index
104 |       index
105 |     end
106 | 
107 |     # Give a list of samples (by index and/or name) and return 0-based index values
108 |     # The cache has to be able to hanle multiple lists - that is why it is a hash.
109 |     def sample_subset_index list
110 |       cached = @cached_filter_index[list]
111 |       if cached
112 |         l = cached
113 |       else
114 |         l = []
115 |         list = samples_index_array() if not list
116 |         list.each { |i|
117 |           value =
118 |             begin
119 |               Integer(i)
120 |             rescue
121 |               idx = samples.index(i)
122 |               if idx != nil
123 |                 idx
124 |               else
125 |                 raise "Unknown sample name '#{i}'"
126 |               end
127 |             end
128 |           l << value
129 |         }
130 |         @cached_filter_index[list] = l
131 |       end
132 |       l
133 |     end
134 | 
135 |     # Look for a line in the header with the field name and return the
136 |     # value, otherwise return nil
137 |     def find_field name
138 |       return field[name] if field[name]
139 |       @lines.each do | line |
140 |         value = line.scan(/###{name}=(.*)/)
141 |         if value[0]
142 |           v = value[0][0]
143 |           field[name] = v
144 |           return v
145 |         end
146 |       end
147 |       nil
148 |     end
149 | 
150 |     # Look for all the lines that match the field name and return
151 |     # a hash of hashes. An empty hash is returned when there are
152 |     # no matches.
153 |     def find_fields name
154 |       res = {}
155 |       @lines.each do | line |
156 |         value = line.scan(/###{name}=<(.*)>/)
157 |         if value[0]
158 |           str = value[0][0]
159 |           # p str
160 |           v = VcfHeaderParser.parse_field(line,@debug)
161 |           id = v['ID']
162 |           res[id] = v
163 |         end
164 |       end
165 |       # p res
166 |       res
167 |     end
168 | 
169 |     def format # all '##FORMAT=<...>' lines, keyed by ID (see find_fields)
170 |       find_fields('FORMAT')
171 |     end
172 | 
173 |     def filter # all '##FILTER=<...>' lines, keyed by ID
174 |       find_fields('FILTER')
175 |     end
176 | 
177 |     def contig # all '##contig=<...>' lines, keyed by ID
178 |       find_fields('contig')
179 |     end
180 | 
181 |     def info # all '##INFO=<...>' lines, keyed by ID
182 |       find_fields('INFO')
183 |     end
184 | 
185 |     def gatkcommandline # all '##GATKCommandLine=<...>' lines, keyed by ID
186 |       find_fields('GATKCommandLine')
187 |     end
188 | 
189 |     def meta
190 |       return @meta if @meta
191 |       res = { 'INFO' => {}, 'FORMAT' => {}, 'FILTER' => {}, 'contig' => {}, 'GATKCommandLine' => {} }
192 |       @lines.each do | line |
193 |         value = line.scan(/##(.*?)=(.*)/)
194 |         if value[0]
195 |           k,v = value[0]
196 |           if k != 'FORMAT' and k != 'INFO' and k != 'FILTER' and k != 'contig' and k != 'GATKCommandLine'
197 |             # p [k,v]
198 |             res[k] = v
199 |           end
200 |         end
201 |       end
202 |       res['INFO'] = info()
203 |       res['FORMAT'] = format()
204 |       res['FILTER'] = filter()
205 |       res['contig'] = contig()
206 |       res['GATKCommandLine'] = gatkcommandline()
207 |       # p [:res, res]
208 |       @meta = res # cache values
209 |       res
210 |     end
211 | 
212 |     def method_missing(m, *args, &block) # header.foo returns the value of the '##foo=...' line
213 |       name = m.to_s
214 |       value = find_field(name)
215 |       return value if value
216 |       raise "Unknown VCF header query '#{name}'" # NOTE: any other unknown method ends here too, as a RuntimeError
217 |     end
218 | 
219 |   end
220 | end
221 | 


--------------------------------------------------------------------------------
/features/step_definitions/multisample.rb:
--------------------------------------------------------------------------------
  1 | Given(/^the multi sample header line$/) do |string| # builds @header from the text attached to the step
  2 |   @header = VcfHeader.new
  3 |   @header.add(string)
  4 | end
  5 | 
  6 | When(/^I parse the header$/) do # expectations are fixed: 16 columns, 7 samples with these names
  7 |   expect(@header.column_names.size).to eq 16
  8 |   expect(@header.samples.size).to eq 7
  9 |   expect(@header.samples).to eq ["Original", "s1t1", "s2t1", "s3t1", "s1t2", "s2t2", "s3t2"]
 10 | end
 11 | 
 12 | Given(/^multisample vcf line$/) do |string| # whitespace in the step text is turned into tabs before parsing
 13 |   @fields = VcfLine.parse(string.split(/\s+/).join("\t"))
 14 |   @rec1 = VcfRecord.new(@fields,@header)
 15 | end
 16 | 
 17 | Then(/^I expect multisample rec\.alt to contain \["(.*?)"\]$/) do |arg1| # NOTE: arg1 is unused; ["T"] is hard-coded
 18 |   expect(@rec1.alt).to eq ["T"]
 19 | end
 20 | 
 21 | Then(/^I expect rec\.qual to be (\d+)\.(\d+)$/) do |arg1, arg2| # NOTE: captures unused; 106.3 is hard-coded
 22 |   expect(@rec1.qual).to eq 106.3
 23 | end
 24 | 
 25 | Then(/^I expect rec\.info\.ac to be (\d+)$/) do |arg1| # this step does use its captured value
 26 |   expect(@rec1.info.ac).to eq arg1.to_i
 27 | end
 28 | Then(/^I expect rec\.info\.af to be (\d+)\.(\d+)$/) do |arg1, arg2|
 29 |   expect(@rec1.info.af).to eq 0.357
 30 | end
 31 | 
 32 | Then(/^I expect rec\.info\.dp to be (\d+)$/) do |arg1|
 33 |   expect(@rec1.info.dp).to eq 1537
 34 | end
 35 | 
 36 | Then(/^I expect rec\.info\.readposranksum to be (\d+)\.(\d+)$/) do |arg1, arg2|
 37 |   expect(@rec1.info.readposranksum).to eq 0.815
 38 | end
 39 | 
 40 | Then(/^I expect rec\.info\['dp'\] to be (\d+)$/) do |arg1|
 41 |   expect(@rec1.info['dp']).to eq 1537
 42 | end
 43 | 
 44 | Then(/^I expect rec\.info\['ReadPosRankSum'\] to be (\d+)\.(\d+)$/) do |arg1, arg2|
 45 |   expect(@rec1.info['ReadPosRankSum']).to eq 0.815
 46 | end
 47 | 
 48 | Then(/^I expect rec\.info\.fields to contain \["(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)", "(.*?)"\]$/) do |arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15|
 49 |   expect(@rec1.info.fields).to eq ["AC", "AF", "AN", "BASEQRANKSUM", "DP", "DELS", "FS", "HAPLOTYPESCORE", "MLEAC", "MLEAF", "MQ", "MQ0", "MQRANKSUM", "QD", "READPOSRANKSUM"] # the 15 captures are unused; the list is hard-coded
 50 | end
 51 | 
 52 | Then(/^I expect rec\.sample\.original\.gt to be "(.*?)"$/) do |arg1|
 53 |   expect(@rec1.sample['Original'].gt).to eq "0/1"
 54 | end
 55 | 
 56 | Then(/^I expect rec\.original\.gt to be "(.*?)"$/) do |arg1|
 57 |   expect(@rec1.original.gt).to eq "0/1"
 58 | end
 59 | 
 60 | Then(/^I expect rec\.sample\['Original'\]\.gt to be "(.*?)"$/) do |arg1|
 61 |   expect(@rec1.sample['Original'].gt).to eq "0/1"
 62 | end
 63 | 
 64 | Then(/^I expect rec\.sample\['Original'\]\.ad to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
 65 |   expect(@rec1.sample['Original'].ad).to eq [189,25]
 66 | end
 67 | 
 68 | Then(/^I expect rec\.sample\['Original'\]\.gt to be \[(\d+),(\d+)\]$/) do |arg1, arg2| # NOTE: pattern captures two integers, yet the check is against the string "0/1"
 69 |   expect(@rec1.sample['Original'].gt).to eq "0/1"
 70 | end
 71 | 
 72 | Then(/^I expect rec\.sample\['s(\d+)t(\d+)'\]\.ad to be \[(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4| # NOTE: always checks sample s3t2, whatever name was matched
 73 |   expect(@rec1.sample['s3t2'].ad).to eq [167,26]
 74 | end
 75 | 
 76 | Then(/^I expect rec\.sample\['s(\d+)t(\d+)'\]\.dp to be (\d+)$/) do |arg1, arg2, arg3|
 77 |   expect(@rec1.sample['s3t2'].dp).to eq 196
 78 | end
 79 | 
 80 | Then(/^I expect rec\.sample\['s(\d+)t(\d+)'\]\.gq to be (\d+)$/) do |arg1, arg2, arg3|
 81 |   expect(@rec1.sample['s3t2'].gq).to eq 20
 82 | end
 83 | 
 84 | Then(/^I expect rec\.sample\['s(\d+)t(\d+)'\]\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
 85 |   expect(@rec1.sample['s3t2'].pl).to eq [20,0,522]
 86 | end
 87 | 
 88 | Then(/^I expect rec\.sample\.original\.gt to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
 89 |   expect(@rec1.sample.original.gt).to eq "0/1"
 90 | end
 91 | 
 92 | Then(/^I expect rec\.sample\.s(\d+)t(\d+)\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
 93 |   expect(@rec1.sample.s3t2.pl).to eq [20,0,522]
 94 | end
 95 | 
 96 | Then(/^I expect rec\.original\.gt to be \[(\d+),(\d+)\]$/) do |arg1, arg2|
 97 |   expect(@rec1.original.gt).to eq "0/1"
 98 | end
 99 | 
100 | Then(/^I expect rec\.s(\d+)t(\d+)\.pl to be \[(\d+),(\d+),(\d+)\]$/) do |arg1, arg2, arg3, arg4, arg5|
101 |   expect(@rec1.s3t2.pl).to eq [20,0,522]
102 | end
103 | 
104 | Then(/^I expect test rec\.missing_samples\? to be false$/) do
105 |   expect(@rec1.missing_samples?).to be false
106 | end
107 | 
108 | Then(/^I expect test rec\.original\? to be true$/) do
109 |   expect(@rec1.original?).to be true
110 | end
111 | 
112 | Then(/^I expect rec\.missing_samples\? to be true$/) do
113 |   expect(@rec1.missing_samples?).to be true
114 | end
115 | 
116 | Then(/^I expect rec\.original\? to be true$/) do # NOTE(review): this exact pattern is registered a second time further down in this file
117 |   expect(@rec1.original?).to be true
118 | end
119 | 
120 | Then(/^I expect rec\.original\? to be false$/) do
121 |   expect(@rec1.original?).to eq false
122 | end
123 | 
124 | Then(/^I expect rec\.sample\.s(\d+)t(\d+)\? to be false$/) do |arg1, arg2| # NOTE: always checks s1t1; the captured digits are unused
125 |   expect(@rec1.sample.s1t1?).to eq false
126 | end
127 | 
128 | Then(/^I expect rec\.sample\.s(\d+)t(\d+)\? to be true$/) do |arg1, arg2| # NOTE: always checks s3t2; the captured digits are unused
129 |   expect(@rec1.sample.s3t2?).to eq true
130 | end
131 | 
132 | Then(/^I expect rec\.valid\? to be true$/) do
133 |   expect(@rec1.valid?).to eq true
134 | end
135 | 
136 | Then(/^I expect r\.original\.gt\? to be true$/) do
137 |   expect(@rec1.original.gt?).to be true
138 | end
139 | 
140 | Then(/^I expect r\.original\? to be true$/) do
141 |   expect(@rec1.original?).to be true
142 | end
143 | 
144 | Then(/^I expect rec\.original\? to be true$/) do # NOTE(review): same pattern and body as the step registered earlier in this file
145 |   expect(@rec1.original?).to be true
146 | end
147 | 
148 | Then(/^I expect rec\.original\.gt\? to be true$/) do
149 |   expect(@rec1.original.gt?).to be true
150 | end
151 | 
152 | Then(/^I expect r\.original\.gti\? to be true$/) do
153 |   expect(@rec1.original.gti?).to eq true
154 | end
155 | 
156 | Then(/^I expect r\.original\.gti to be \[(\d+),(\d+)\]$/) do |arg1, arg2| # from here on the steps compare against their captured values
157 |   expect(@rec1.original.gti).to eq [arg1.to_i,arg2.to_i]
158 | end
159 | 
160 | Then(/^I expect r\.original\.gti\[(\d+)\] to be (\d+)$/) do |arg1, arg2|
161 |   expect(@rec1.original.gti[arg1.to_i]).to eq arg2.to_i
162 | end
163 | 
164 | Then(/^I expect r\.original\.gts\? to be true$/) do
165 |   expect(@rec1.original.gts?).to eq true
166 | end
167 | 
168 | Then(/^I expect r\.original\.gts to be \["(.*?)","(.*?)"\]$/) do |arg1, arg2|
169 |   expect(@rec1.original.gts).to eq [arg1,arg2]
170 | end
171 | 
172 | Then(/^I expect r\.original\.gts\[(\d+)\] to be "(.*?)"$/) do |arg1, arg2|
173 |   expect(@rec1.original.gts[arg1.to_i]).to eq arg2
174 | end
175 | 
176 | Then(/^I expect r\.info\.end to be (\d+)$/) do |arg1|
177 |   expect(@rec1.info.end).to eq arg1.to_i
178 | end
179 | 
180 | Then(/^I expect r\.info\.ciend to be (\d+)$/) do |arg1|
181 |   expect(@rec1.info.ciend).to eq arg1.to_i
182 | end
183 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/vcfgenotypefield.rb:
--------------------------------------------------------------------------------
  1 | module BioVcf
  2 | 
  3 |   MAXINT=100_000 # used by VcfNucleotideCount4#min
  4 | 
  5 |   class ValueError < Exception # NOTE: derives from Exception, so a bare `rescue` does not catch it
  6 |   end
  7 | 
  8 |   module VcfValue
  9 |     def VcfValue::empty? v # nil, '' and '.' count as "no value"
 10 |       v == nil or v == '' or v == '.'
 11 |     end
 12 |   end
 13 | 
 14 |   # Helper class for a list of (variant) values, such as A,G.
 15 |   # The [] function does the hard work. You can pass in an index (integer)
 16 |   # or nucleotide which translates to an index.
 17 |   # (see ./features for examples)
 18 |   class VcfNucleotideCount4
 19 |     def initialize alt,list
 20 |       @alt = alt
 21 |       @list = list.split(/,/).map{|i| i.to_i}
 22 |     end
 23 | 
 24 |     def [] idx
 25 |       if idx.kind_of?(Integer)
 26 |         # return a value
 27 |         @list[idx]
 28 |       elsif idx.kind_of?(String)
 29 |         # return a value
 30 |         @list[["A","C","G","T"].index(idx)]
 31 |       else idx.kind_of?(Array)
 32 |         # return a list of values
 33 |         idx.map { |nuc|
 34 |           idx2 = ["A","C","G","T"].index(nuc)
 35 |           # p [idx,nuc,idx2,@list]
 36 |           @list[idx2]
 37 |         }
 38 |       end
 39 |     end
 40 | 
 41 |     def to_ary
 42 |       @list
 43 |     end
 44 | 
 45 |     # Return the max value on the nucleotides in the list (typically rec.alt)
 46 |     def max list = @alt
 47 |       values = self[list]
 48 |       values.reduce(0){ |memo,v| (v>memo ? v : memo) }
 49 |     end
 50 | 
 51 |     def min list = @alt
 52 |       values = self[list]
 53 |       values.reduce(MAXINT){ |memo,v| (v<memo ? v : memo) }
 54 |     end
 55 | 
 56 |     def sum list = @alt
 57 |       values = self[list]
 58 |       values.reduce(0){ |memo,v| v+memo }
 59 |     end
 60 | 
 61 | 
 62 |   end
 63 | 
 64 |   # Handle info fields with multiple entries, possibly relating to ALT (single nucleotide only)
 65 |   class VcfAltInfoList
 66 |     def initialize alt,list
 67 |       @alt = alt
 68 |       @list = list.split(/,/).map{|i| i.to_i}
 69 |     end
 70 | 
 71 |     def [] idx
 72 |       if idx.kind_of?(Integer)
 73 |         @list[idx].to_i
 74 |       elsif idx.kind_of?(String)
 75 |         @list[@alt.index(idx)].to_i
 76 |       else idx.kind_of?(Array)
 77 |         idx.map { |nuc|
 78 |           idx2 = @alt.index(nuc)
 79 |           # p [idx,nuc,idx2,@list]
 80 |           @list[idx2].to_i
 81 |         }
 82 |       end
 83 |     end
 84 | 
 85 |     def to_ary
 86 |       @list
 87 |     end
 88 | 
 89 |     # Return the max value on the nucleotides in the list (typically rec.alt)
 90 |     def max
 91 |       @list.reduce(0){ |memo,v| (v>memo ? v : memo) }
 92 |     end
 93 | 
 94 |     def min
 95 |       @list.reduce(MAXINT){ |memo,v| (v<memo ? v : memo) }
 96 |     end
 97 | 
 98 |     def sum
 99 |       @list.reduce(0){ |memo,v| v+memo }
100 |     end
101 |   end
102 | 
103 |   # One sample column of a VCF record. The colon separated values are
104 |   # split lazily and looked up through the format index (field name to
105 |   # position) that is handed to the constructor.
106 |   class VcfGenotypeField
107 | 
108 |     # values is defined explicitly below; listing it here as well only
109 |     # triggered a 'method redefined' warning
110 |     attr_reader :format, :header
111 | 
112 |     # sample_num: position used for the name lookup in header.samples
113 |     # s:          raw sample string
114 |     # format:     Hash mapping field names to positions in s
115 |     # ref, alt:   REF allele and list of ALT alleles (used by gts)
116 |     def initialize sample_num, s, format, header, ref, alt
117 |       @is_empty = VcfSample::empty?(s)
118 |       @sample_num = sample_num
119 |       @original_s = s
120 |       @format = format
121 |       @header = header
122 |       @ref = ref
123 |       @alt = alt
124 |     end
125 | 
126 |     def to_s
127 |       @original_s
128 |     end
129 | 
130 |     def name
131 |       @header.samples[@sample_num]
132 |     end
133 | 
134 |     # The sample string split on ':' (cached)
135 |     def values
136 |       @cache_values ||= @original_s.split(/:/)
137 |     end
138 | 
139 |     def empty?
140 |       @is_empty
141 |     end
142 | 
143 |     def valid?
144 |       !empty?
145 |     end
146 | 
147 |     def dp4
148 |       ilist('DP4')
149 |     end
150 |     def ad
151 |       ilist('AD')
152 |     end
153 |     def pl
154 |       ilist('PL')
155 |     end
156 | 
157 |     def bcount
158 |       VcfNucleotideCount4.new(@alt,values[fetch('BCOUNT')])
159 |     end
160 | 
161 |     def bq
162 |       VcfAltInfoList.new(@alt,values[fetch('BQ')])
163 |     end
164 | 
165 |     def amq
166 |       VcfAltInfoList.new(@alt,values[fetch('AMQ')])
167 |     end
168 | 
169 |     def gti?
170 |       not VcfValue::empty?(fetch_value("GT"))
171 |     end
172 | 
173 |     # GT as a list of allele indices, e.g. "0/1" -> [0,1].
174 |     # NOTE: a missing allele ('.') becomes 0 through to_i.
175 |     def gti
176 |       genotype = gt
177 |       # method_missing converts an all-digit GT (e.g. the haploid call
178 |       # "1") into an Integer, which has no split
179 |       genotype = genotype.to_s if genotype.kind_of?(Numeric)
180 |       genotype.split(/[\/\|]/).map { |g| g.to_i }
181 |     end
182 | 
183 |     def gts?
184 |       not VcfValue::empty?(fetch_value("GT"))
185 |     end
186 | 
187 |     # GT as a list of alleles, where index 0 is REF and 1.. are the ALTs
188 |     def gts
189 |       genotypes = [@ref] + @alt
190 |       gti.map { |i| genotypes[i] }
191 |     end
192 | 
193 |     # Returns the value of a field. The method name is upcased and looked
194 |     # up in the format index; a name that is not listed raises (see
195 |     # fetch). All-digit values are returned as Integer, digits.digits as
196 |     # Float, everything else as String. An empty sample returns nil.
197 |     def method_missing(m, *args, &block)
198 |       return nil if @is_empty
199 |       if m =~ /\?$/
200 |         # query if a value exists, e.g., r.info.dp? or s.dp?
201 |         v = values[fetch(m.to_s.upcase.chop)]
202 |         return (not VcfValue::empty?(v))
203 |       else
204 |         v = values[fetch(m.to_s.upcase)]
205 |         return nil if VcfValue::empty?(v)
206 |         return v.to_i if v =~ /^\d+$/
207 |         return v.to_f if v =~ /^\d+\.\d+$/
208 |         v
209 |       end
210 |     end
211 | 
212 |   private
213 | 
214 |     # Fetch a value and throw an error if it does not exist
215 |     def fetch name
216 |       raise "ERROR: Field with name <#{name}> does not exist!" if !@format[name]
217 |       @format[name]
218 |     end
219 | 
220 |     def fetch_value name
221 |       values[fetch(name)]
222 |     end
223 | 
224 |     # Return an integer list
225 |     def ilist name
226 |       v = fetch_value(name)
227 |       return nil if not v
228 |       v.split(',').map{|i| i.to_i}
229 |     end
230 | 
231 |   end
212 | 
213 |   # Holds all samples
214 |   class VcfGenotypeFields
215 |     def initialize fields, format, header, ref, alt
216 |       @fields = fields
217 |       @format = format
218 |       @header = header
219 |       @ref = ref
220 |       @alt = alt
221 |       @samples = {} # lazy cache
222 |       @sample_index = @header.sample_index()
223 |     end
224 | 
225 |     def [] name
226 |       begin
227 |         if name.is_a? String
228 |           field_num = @sample_index[name]
229 |         else
230 |           field_num = name + 9 # assume integer
231 |         end
232 |         @samples[name] ||= VcfGenotypeField.new(field_num-9,@fields[field_num],@format,@header,@ref,@alt)
233 |       rescue TypeError
234 |         $stderr.print "Unknown field name <#{name}> in record, did you mean r.info.#{name}?\n"
235 |         raise
236 |       end
237 |     end
238 | 
239 |     def method_missing(m, *args, &block)
240 |       name = m.to_s
241 |       if name =~ /\?$/
242 |         # test for valid sample
243 |         return !VcfSample::empty?(@fields[@sample_index[name.chop]])
244 |       else
245 |         num = @sample_index[name]-9
246 |         @samples[name] ||= VcfGenotypeField.new(num,@fields[@sample_index[name]],@format,@header,@ref,@alt)
247 |       end
248 |     end
249 | 
250 |   end
251 | end
252 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/vcfrecord.rb:
--------------------------------------------------------------------------------
  1 | module BioVcf
  2 | 
  3 |   class VcfRecordInfo
  4 |     def initialize s
  5 |       @info = s
  6 |     end
  7 | 
  8 |     def to_s
  9 |       if @h
 10 |         @h.map { |k,v| (v ? @original_key[k] + '=' + v : @original_key[k])  }.join(';')
 11 |       else
 12 |         @info
 13 |       end
 14 |     end
 15 | 
 16 |     def [] k
 17 |       # split_fields if not @h
 18 |       # /#{m}=(?<value>[^;])/.@info
 19 |       kupper = k.upcase
 20 |       v = if @h
 21 |             @h[kupper]
 22 |           else
 23 |             @info =~ /[\A;]#{k}=([^;]+)/i
 24 |             value = $1
 25 |             # p [m,value]
 26 |             # m = @info.match(/#{m.to_s.upcase}=(?<value>[^;]+)/) slower!
 27 |             # value = m[:value]
 28 |             if value == nil
 29 |               split_fields # no option but to split
 30 |               @h[kupper]
 31 |             else
 32 |               value
 33 |             end
 34 |           end
 35 |       ConvertStringToValue::convert(v)
 36 |     end
 37 | 
 38 |     # Set INFO fields (used by --rewrite)
 39 |     def []= k, v
 40 |       split_fields if not @h
 41 |       kupper = k.upcase
 42 |       @h[kupper] = v
 43 |       @original_key[kupper] = k
 44 |     end
 45 | 
 46 |     def fields
 47 |       split_fields
 48 |       @h.keys
 49 |     end
 50 | 
 51 |     def method_missing(m, *args, &block)
 52 |       self[m.to_s]
 53 |     end
 54 | 
 55 |   private
 56 | 
 57 |     def split_fields
 58 |       return @h if @h
 59 |       @h = {}
 60 |       @original_key = {}
 61 |       @info.split(/;/).each do |f|
 62 |         k,v = f.split(/=/)
 63 |         kupper = k.upcase
 64 |         @h[kupper] = v
 65 |         @original_key[kupper] = k
 66 |       end
 67 |     end
 68 |   end
 69 | 
 70 |   module VcfRecordParser
 71 |     # Parse the format field into a Hash
 72 |     def VcfRecordParser.get_format s
 73 |       if s==$cached_sample_format_s
 74 |         $cached_sample_format
 75 |       else
 76 |         h = {}
 77 |         s.split(/:/).each_with_index { |v,i| h[v] = i }
 78 |         $cached_sample_format = h
 79 |         $cached_sample_format_s = s
 80 |         h
 81 |       end
 82 |     end
 83 |     def VcfRecordParser.get_info s
 84 |       VcfRecordInfo.new(s)
 85 |     end
 86 |   end
 87 | 
 88 |   module VcfRecordCall # included by VcfRecord; uses its normal, tumor, ref and alt methods
 89 |     def call_diff
 90 |       Variant.diff(normal.bcount.to_ary,tumor.bcount.to_ary)
 91 |     end
 92 | 
 93 |     def call_nuc # nucleotide at the position returned by index()
 94 |       ['A','C','G','T'][index()]
 95 |     end
 96 | 
 97 |     # Get the GT when 0 is REF and >0 is ALT
 98 |     def get_gt(index) # the parameter shadows the index() method below
 99 |       if index == 0
100 |         ref()
101 |       else
102 |         alt[index-1]
103 |       end
104 |     end
105 | 
106 |     def call_tumor_count
107 |       tumor.bcount.to_ary[index()]
108 |     end
109 | 
110 |     def call_tumor_relative_count
111 |       Variant.relative_diff(normal.bcount.to_ary,tumor.bcount.to_ary)[index()]
112 |     end
113 | 
114 |     def call_normal_count
115 |       normal.bcount.to_ary[index()]
116 |     end
117 | 
118 |     def index # delegates to Variant.index with the normal and tumor BCOUNT lists
119 |       Variant.index(self.normal.bcount.to_ary,self.tumor.bcount.to_ary)
120 |     end
121 |   end
122 | 
123 |   class VcfRecord
124 | 
125 |     include VcfRecordCall
126 | 
127 |     attr_reader :header
128 | 
129 |     def initialize fields, header
130 |       @fields = fields
131 |       @header = header
132 |       @sample_by_index = []
133 |     end
134 | 
135 |     def chrom
136 |       @fields[0]
137 |     end
138 | 
139 |     alias :chr :chrom
140 | 
141 |     def pos
142 |       @pos ||= @fields[1].to_i
143 |     end
144 | 
145 |     def ids
146 |       @ids ||= @fields[2].split(';')
147 |     end
148 | 
149 |     def id
150 |       ids[0]
151 |     end
152 | 
153 |     def ref
154 |       @refs ||= @fields[3]
155 |     end
156 | 
157 |     def alt
158 |       @alt ||= @fields[4].split(/,/)
159 |     end
160 | 
161 |     def qual
162 |       @qual ||= @fields[5].to_f
163 |     end
164 | 
165 |     def filter
166 |       @filter ||= @fields[6]
167 |     end
168 | 
169 |     def info
170 |       @info ||= VcfRecordParser.get_info(@fields[7])
171 |     end
172 | 
173 |     def format
174 |       @format ||= VcfRecordParser.get_format(@fields[8])
175 |     end
176 | 
177 |     # Return the first (single) sample (used in one sample VCF)
178 |     def first
179 |       @first ||= VcfGenotypeField.new(0,@fields[9],format,@header,ref,alt)
180 |     end
181 | 
182 |     # Return the normal sample (used in two sample VCF)
183 |     def normal
184 |       first
185 |     end
186 | 
187 |     # Return the tumor sample (used in two sample VCF)
188 |     def tumor
189 |       @tumor ||= VcfGenotypeField.new(1,@fields[10],format,@header,ref,alt)
190 |     end
191 | 
192 |     # Return the sample as a named hash
193 |     def sample
194 |       @sample ||= VcfGenotypeFields.new(@fields,format,@header,ref,alt)
195 |     end
196 | 
197 |     def sample_by_name name
198 |       sample[name]
199 |     end
200 | 
201 |     def sample_by_index i
202 |       raise "Can not index sample on parameter <#{i}>" if not i.kind_of?(Integer)
203 |       @sample_by_index[i] ||= VcfGenotypeField.new(i,@fields[i+9],format,@header,ref,alt)
204 |     end
205 | 
206 |     # Walk the samples. list contains an Array of int (the index)
207 |     def each_sample(list = nil)
208 |       @header.sample_subset_index(list).each { |i|
209 |         yield VcfSample::Sample.new(i,self,sample_by_index(i))
210 |       }
211 |     end
212 | 
213 |     def samples
214 |       list = []
215 |       each_sample { |s| list << s }
216 |       list
217 |     end
218 | 
219 |     def missing_samples?
220 |       @fields[9..-1].each { |sample|
221 |         return true if VcfSample::empty?(sample)
222 |       }
223 |       false
224 |     end
225 | 
226 |     def valid?
227 |       @fields.size == @header.column_names.size
228 |     end
229 | 
230 |     def eval expr, ignore_missing_data: true, quiet: false
231 |       begin
232 |         if not respond_to?(:call_cached_eval)
233 |           code =
234 |           """
235 |           def call_cached_eval(rec,fields)
236 |             r = rec
237 |             #{expr}
238 |           end
239 |           """
240 |           self.class.class_eval(code)
241 |         end
242 |         res = call_cached_eval(self,@fields)
243 |         if res.kind_of?(Array)
244 |           res.join("\t")
245 |         else
246 |           res
247 |         end
248 |       rescue NoMethodError => e
249 |         if not quiet
250 |           $stderr.print "RECORD ERROR!\n"
251 |           $stderr.print [@fields],"\n"
252 |           $stderr.print expr,"\n"
253 |           $stderr.print "To ignore this error use the -i switch!\n"
254 |         end
255 |         if ignore_missing_data
256 |           $stderr.print e.message if not quiet
257 |           return false
258 |         else
259 |           raise
260 |         end
261 |       end
262 |     end
263 | 
264 |     def gfilter expr, ignore_missing_data: true, quiet: false
265 |       begin
266 |         if not respond_to?(:call_cached_filter)
267 |           code =
268 |           """
269 |           def call_cached_gfilter(rec,fields)
270 |             r = rec
271 |             #{expr}
272 |           end
273 |           """
274 |           self.class.class_eval(code)
275 |         end
276 |         res = call_cached_gfilter(self,@fields)
277 |         if res.kind_of?(Array)
278 |           res.join("\t")
279 |         else
280 |           res
281 |         end
282 |       rescue NoMethodError => e
283 |         if not quiet
284 |           $stderr.print "RECORD ERROR!\n"
285 |           $stderr.print [@fields],"\n"
286 |           $stderr.print expr,"\n"
287 |           $stderr.print "To ignore this error use the -i switch!\n"
288 |         end
289 |         if ignore_missing_data
290 |           $stderr.print e.message if not quiet
291 |           return false
292 |         else
293 |           raise
294 |         end
295 |       end
296 |     end
297 | 
298 |     def add_to_filter_field str
299 |       filter = @fields[6]
300 |       if not filter or filter == '.' or filter == 'PASS'
301 |         filter = str
302 |       else
303 |         values = filter.split(/;/)
304 |         if not values.include?(str)
305 |           filter = filter +';'+str
306 |         end
307 |       end
308 |       filter = '.' if filter == nil or filter == ''
309 |       @fields[6] = filter
310 |       filter
311 |     end
312 | 
313 |     # Return the sample
314 |     def method_missing(m, *args, &block)
315 |       name = m.to_s
316 |       if name =~ /\?$/
317 |         # Query for empty sample name
318 |         @sample_index ||= @header.sample_index
319 |         return !VcfSample::empty?(@fields[@sample_index[name.chop]])
320 |       else
321 |         sample[name]
322 |       end
323 |     end
324 | 
325 |   end
326 | end
327 | 


--------------------------------------------------------------------------------
/doc/GATK_comparison.md:
--------------------------------------------------------------------------------
  1 | # Comparing two large GATK files
  2 | 
  3 | This is the exercise to explore the differences in the full BWA-GATK
  4 | pipeline vs. a chunking-scatter-gather approach that is orders of
  5 | magnitude faster. Using my tools
  6 | [bio-vcf](https://github.com/pjotrp/bioruby-vcf),
  7 | [bio-table](https://github.com/pjotrp/bioruby-table) and
  8 | [bio-locus](https://github.com/pjotrp/bio-locus) interesting
  9 | edge effects were found.
 10 | 
 11 | The GATK output variant files are similar in size:
 12 | 
 13 |     -rw-r--r-- 1 pjotr users 992725762 Aug 29 11:22 HiSeqX_R1.fastq.sorted_dedup_realigned.bam.realigned.raw_variants.vcf
 14 |     -rw-r--r-- 1 pjotr users 987147441 Aug 29 11:26 HiSeqX_R1.fastq_dedup_realigned.bam.realigned.raw_variants.vcf
 15 | 
 16 | Naming suggests the second one (scatter) is unsorted but it is actually sorted.
 17 | 
 18 | Install bio-vcf, add it to the path if required, and you should see
 19 | 
 20 |     gem env
 21 |     export PATH=$GEM_HOME/bin:$PATH
 22 |     gem install bio-vcf
 23 |     bio-vcf
 24 |     bio-vcf 0.9.0 (biogem Ruby 2.1.2 with pcows) by Pjotr Prins 2015
 25 | 
 26 | Create simple position files with calls
 27 | 
 28 |     /usr/bin/time -v bio-vcf -e '[r.chrom,r.pos,r.alt]' < scatter/HiSeqX_R1.fastq_dedup_realigned.bam.realigned.raw_variants.vcf > scatter_calls.vcf
 29 |     /usr/bin/time -v bio-vcf -e '[r.chrom,r.pos,r.alt]' < full/HiSeqX_R1.fastq.sorted_dedup_realigned.bam.realigned.raw_variants.vcf > full_calls.vcf
 30 | 
 31 | Count the calls (we are ignoring the limited header info)
 32 | 
 33 |     wc -l scatter_calls.vcf 
 34 |       4773423 scatter_calls.vcf
 35 | 
 36 |     wc -l full_calls.vcf 
 37 |       4795998 full_calls.vcf
 38 | 
 39 | (4795998-4773423)/4795998*100 or 0.5%. Do a diff and count the diffs
 40 | 
 41 |     egrep -c '^>' calls.diff 
 42 |       30401
 43 |     egrep -c '^<' calls.diff 
 44 |       52976
 45 | 
 46 | 52976/4795998*100 or 1.1% of different calls, hmmm. Remove the GL contig
 47 | 
 48 |     grep -v GL00 calls.diff > calls_wo_GL00.diff
 49 |     egrep -c '^<' calls_wo_GL00.diff 
 50 |       48797
 51 | 
 52 | Now install bio-table and bio-locus
 53 | 
 54 |      gem install bio-table
 55 |      bio-table
 56 |        bio-table 1.0.0 Copyright (C) 2012-2014 Pjotr Prins <pjotr.prins@thebird.nl>
 57 |      gem install bio-locus
 58 |      bio-locus
 59 |        bio-locus 0.0.6 (biogem Ruby 2.1.2) by Pjotr Prins 2014
 60 | 
 61 | Create a new VCF file using the diff information. Find all the calls
 62 | 
 63 |      egrep '^(<|>)' calls_wo_GL00.diff |grep -v GATKCommandLine| grep -v CHROM > all_diff.txt
 64 |      bio-table --columns 0,1 < all_diff.txt > chrom_pos_diff.txt
 65 | 
 66 | With vi, remove the leading > and < and make sure there is a tab:
 67 | 
 68 |      %s/^..//g
 69 |      %s/$/^INA/g
 70 | 
 71 | Now use bio-locus to create full VCF files containing only these entries
 72 | 
 73 |      bio-locus --store --alt exclude < chrom_pos_diff.txt 
 74 |      bio-locus 0.0.6 (biogem Ruby 2.1.2) by Pjotr Prins 2014
 75 |        Stored 73644 positions out of 75414 in locus.db (1770 duplicate hits)
 76 |      bio-locus --match --alt exclude < ../full/HiSeqX_R1.fastq.sorted_dedup_realigned.bam.realigned.raw_variants.vcf > diff_full.vcf
 77 |      bio-locus --match --verbose -d < ../scatter/HiSeqX_R1.fastq_dedup_realigned.bam.realigned.raw_variants.vcf > diff_scatter.vcf
 78 |      (note: bio-locus is - still - a slow tool)
 79 | 
 80 | Interestingly there is very little overlap between the call positions (only 1770 are shared)...
 81 | 
 82 |      wc -l diff*.vcf
 83 |        48903 diff_full.vcf
 84 |        26731 diff_scatter.vcf
 85 | 
 86 | So, arguably the difference is
 87 | (48903+26731)/4795998*100 or 1.6% of calls.
 88 | 
 89 | Now we can compare the call contents
 90 | 
 91 |      bio-vcf -i -e '[r.chrom,r.pos,r.info.af]' --seval 's.dp' < diff_scatter.vcf > diff_scatter_af_sdp.txt
 92 | 
 93 | For those that do match we see a difference in sample read depth
 94 | pointing out the two methods differ in placing reads. So, let's see
 95 | if we can find significant differences in frequency and read depth.
 96 | 
 97 | First I am reducing the data to one chromosome to be able to work a bit
 98 | faster
 99 | 
100 |      bio-vcf --filter 'r.chrom=="3"' < ../full/HiSeqX_R1.fastq.sorted_dedup_realigned.bam.realigned.raw_variants.vcf > full.vcf
101 |      bio-vcf --filter 'r.chrom=="3"' < ../scatter/HiSeqX_R1.fastq_dedup_realigned.bam.realigned.raw_variants.vcf > scatter.vcf
102 |      wc -l *.vcf
103 |        301922 full.vcf
104 |        300326 scatter.vcf
105 | 
106 |      bio-vcf -i -e '[r.pos,r.alt,r.info.af]' --seval 's.dp' < full.vcf > full_af_dp.txt
107 |      bio-vcf -i -e '[r.pos,r.alt, r.info.af]' --seval 's.dp' < scatter.vcf > scatter_af_dp.txt
108 |      wc -l *.txt
109 |        40001 full_af_dp.txt
110 |        40001 scatter_af_dp.txt
111 |      diff scatter_af_dp.txt full_af_dp.txt |grep -c '>'
112 |        2189
113 | 
114 | Differences typically look like 
115 | 
116 |       < 402884        G       0.5     26
117 |       < 403020        A       0.5     21
118 |       ---
119 |       > 402884        G       0.5     29
120 |       > 403020        A       0.5     25
121 | 
122 | i.e. the scatter approach has a different read depth 26 instead of 29
123 | and 21 instead of 25 - reads end up in other places. Frequency-wise we
124 | don't see much difference. More intriguing, a difference would be
125 | 
126 |     1525d1529
127 |     < 663038        CATATGTTATATGTGTATGTATTGTATACAT 1.0     4
128 | 
129 | where an extra call for an insertion was made in the scatter approach
130 | with a DP of 4.
131 | 
132 | Now let's quantify how much DP differs 
133 | 
134 | Combine the tables
135 | 
136 |       bio-table --columns 0,3 < full_af_dp.txt > f_dp.txt 
137 |       bio-table --columns 0,3 < scatter_af_dp.txt > s_dp.txt
138 |       bio-table --merge f_dp.txt s_dp.txt > merged.txt
139 | 
140 | After editing the table header to show 'pos\tfull\tscatter' 
141 | 
142 |       bio-table --num-filter 'value[1]!=value[0]' < merged.txt |wc -l
143 |         2416
144 | 
145 | it shows that for chromosome 3, 2416 calls out of 301922 (0.8%) have a
146 | different read depth. 
147 | 
148 |       bio-table --verbose --debug --num-filter '(value[1]-value[0]).abs > 3' < merged_no_NA.txt|wc -l
149 |         356
150 | 
151 | 356 (0.1% of total calls) showed a read depth difference larger than 3
152 | reads. And 61 showed a read depth difference larger than 10 reads:
153 | 
154 |       fedor13:~/bcosc/gvcf/chr3$ bio-table --verbose --debug --num-filter '(value[1]-value[0]).abs > 10' < merged_no_NA.txt 
155 |       bio-table 1.0.0 Copyright (C) 2012-2014 Pjotr Prins <pjotr.prins@thebird.nl>
156 |        INFO bio-table: Array: [{:show_help=>false, :write_header=>true, :skip=>0, :debug=>true, :num_filter=>"(value[1]-value[0]).abs > 10"}]
157 |       DEBUG bio-table: Filtering on (value[1]-value[0]).abs > 10
158 |        INFO bio-table: Array: ["pos", "full", "scatter"]
159 |       pos             full    scatter
160 |       855606          24      9
161 |       855609          24      10
162 |       855610          24      10
163 |       855617          22      9
164 |       1432738         17      3
165 |       1434184         35      22
166 |       3173353         37      7
167 |       3713421         42      11
168 |       3713601         31      13
169 |       3713646         39      24
170 |       3713647         39      24
171 |       3713669         43      29
172 |       3714214         33      19
173 |       3762444         18      3
174 |       3764753         29      15
175 |       3764808         29      15
176 |       3764830         26      15
177 |       3764904         22      11
178 |       3764918         24      13
179 |       3937379         26      13
180 |       3937468         19      7
181 |       4005161         17      4
182 |       4005568         35      22
183 |       4297382         27      13
184 |       4297644         34      20
185 |       4958959         31      10
186 |       4958960         31      10
187 |       4959272         35      11
188 |       4959609         27      9
189 |       4960175         27      5
190 |       6159995         17      0.5
191 |       8432601         34      23
192 |       8432973         24      13
193 |       8432986         26      15
194 |       11414301        12      28
195 |       11414307        12      28
196 |       11414313        12      28
197 |       11414323        12      28
198 |       11528593        21      6
199 |       12070044        39      23
200 |       12070272        34      18
201 |       12071772        34      22
202 |       15010075        27      16
203 |       17740441        43      27
204 |       17740468        52      33
205 |       18367833        26      13
206 |       18367863        29      16
207 |       18367950        32      19
208 |       18973094        39      27
209 |       18976962        30      16
210 |       18977299        20      8
211 |       19000198        42      29
212 |       19000214        40      26
213 |       19898931        15      2
214 |       19898945        13      2
215 |       19898997        16      5
216 |       19899206        37      24
217 |       21293536        40      25
218 |       21293836        26      15
219 |       21294685        36      24
220 | 
221 | Based on this information (you can see clustering at certain 'hot
222 | spots') I suspect that these are border effects in the regions where
223 | chunking took place. Note also that scatter more often has *fewer*
224 | reads which means we are missing reads because of edges (it is a BWA
225 | thing).
226 | 
227 | Even so, even *with* these downsides, the method may work for rapid
228 | diagnostics, provided the chunking affected FN/FP calls do not fall in
229 | the regions of interest. For this setup, where time to diagnostic
230 | counts (including cancer), it may prove a valuable approach. I also
231 | suggest we find a way of setting chunking in regions of little
232 | interest (outside coding genes and/or regions of low variation as Brad
233 | suggested).
234 | 
235 | 


--------------------------------------------------------------------------------
/ragel/gen_vcfheaderline_parser.rl:
--------------------------------------------------------------------------------
  1 | # Ragel lexer for VCF-header
  2 | #
  3 | # This is a compact parser/lexer for the VCF header format. Bio-vcf
  4 | # uses the parser to generate meta information that can be output to
  5 | # (for example) JSON format. The advantage of using ragel as a state
  6 | # engine is that it allows for easy parsing of key-value pairs with
  7 | # syntax checking and, for example, escaped quotes in quoted string
  8 | # values. This ragel parser/lexer generates valid Ruby; it should be
  9 | # fairly trivial to generate python/C/JAVA instead. Note that this
 10 | # edition validates ID and Number fields only.  Other fields are
 11 | # dumped 'AS IS'.
 12 | #
 13 | # Note the .rb version is generated from ./ragel/gen_vcfheaderline_parser.rl
 14 | #
 15 | # by Pjotr Prins (c) 2014/2015
 16 | 
 17 | module BioVcf
 18 | 
 19 |   module VcfHeaderParser
 20 | 
 21 |     module RagelKeyValues
 22 | 
 23 |       def self.debug msg
 24 |         # nothing
 25 | 	# $stderr.print "DEBUG: ",msg,"\n"
 26 |       end
 27 |       
 28 | =begin
 29 | %%{
 30 | 
 31 |   machine simple_lexer;
 32 |   
 33 |   action mark { ts=p }
 34 |   action endquoted {
 35 |     emit.call(:value,data,ts,p)
 36 |   }
 37 | 
 38 |   action kw {
 39 |     emit.call(:kw,data,ts,p)
 40 |   }
 41 |   
 42 |   squote = "'";
 43 |   dquote = '"';
 44 |   not_squote_or_escape = [^'\\];
 45 |   not_dquote_or_escape = [^"\\];
 46 |   escaped_something = /\\./;
 47 |   ss = squote ( not_squote_or_escape | escaped_something )* >mark %endquoted squote;
 48 |   dd = dquote ( not_dquote_or_escape | escaped_something )* >mark %endquoted dquote;
 49 | 
 50 |   integer     = ('+'|'-')?digit+;
 51 |   float       = ('+'|'-')?digit+'.'digit+;
 52 |   assignment  = '=';
 53 |   identifier  = ( alnum (alnum|'.'|'_')* ); 
 54 |   version     = ( digit (alnum|'.'|'_'|'-')* ); 
 55 |   str         = (ss|dd)* ;
 56 |   boolean     = '.';
 57 |   date        = str;
 58 |   key_word    = ( ('Type'|'Description'|'Source'|identifier - ('ID'|'Number'|'length'|'Version'|'assembly'|'Date'|'CommandLineOptions')) >mark %{ emit.call(:key_word,data,ts,p) } );
 59 |   any_value   = ( str|( integer|float|boolean|identifier >mark %{ emit.call(:value,data,ts,p) } ));
 60 |   id_value   = ( identifier >mark %{ emit.call(:value,data,ts,p) } );
 61 | 
 62 |   version_value  = ( str| ( version >mark %{ emit.call(:value,data,ts,p) } ));
 63 |   date_value  = ( date );
 64 |   gatk_value  = ( str );
 65 |   number_value = ( ( integer|boolean|'A'|'R'|'G' ) >mark %{ emit.call(:value,data,ts,p) } );
 66 | 
 67 |   id_kv     = ( ( ('ID'|'assembly') %kw '=' id_value ) %{ debug("ID FOUND") } @!{ error_code="Malformed ID"} );
 68 |   version_kv = ( ( ('Version') %kw '=' version_value ) @!{ error_code="Version"} );
 69 |   number_kv = ( ( ('Number'|'length') %kw '=' number_value ) @!{ error_code="Number"} );
 70 |   date_kv =  ( ( ('Date') %kw '=' date_value ) %{ debug("DATE FOUND") } @!{ error_code="Date"} );
 71 |   gatk_kv =  ( ( ('CommandLineOptions') %kw '=' gatk_value ) @!{ error_code="GATK"} );
 72 |   key_value = ( id_kv | version_kv | date_kv | number_kv | gatk_kv | (key_word '=' any_value) ) %{ debug("KEY_VALUE found") } >mark @!{ error_code="unknown key-value " };
 73 | 
 74 |   main := ( '##' ('FILTER'|'FORMAT'|'contig'|'INFO'|'ALT'|'GATKCommandLine') '=') (('<'|',') key_value )* '>';
 75 | }%%
 76 | =end
 77 | 
 78 | %% write data;
 79 | # %% this just fixes syntax highlighting...
 80 | 
 81 | def self.run_lexer(buf, options = {})
 82 |   do_debug = (options[:debug] == true)
 83 |   $stderr.print "---> ",buf,"\n" if do_debug
 84 |   data = buf.unpack("c*") if(buf.is_a?(String))
 85 |   eof = data.length
 86 |   values = []
 87 |   stack = []
 88 | 
 89 |   emit = lambda { |type, data, ts, p|
 90 |     # Print the type and text of the last read token
 91 |     # p ts,p
 92 |     $stderr.print "EMITTED: #{type}: #{data[ts...p].pack('c*')}\n" if do_debug
 93 |     values << [type,data[ts...p].pack('c*')]
 94 |   }
 95 | 
 96 |   error_code = nil
 97 |   
 98 |   %% write init;
 99 |   %% write exec;
100 | 
101 |   raise "ERROR: "+error_code+" in "+buf if error_code
102 | 
103 |   begin
104 |     res = {}
105 |     # p values
106 |     values.each_slice(2) do | a,b |
107 |       $stderr.print '*',a,b if do_debug
108 |       keyword = a[1]
109 |       value = b[1]
110 |       value = value.to_i if ['length','Epoch'].index(keyword)
111 |       res[keyword] = value
112 |       # p h[:value] if h[:name]==:identifier or h[:name]==:value or h[:name]==:string
113 |     end
114 |   rescue
115 |     print "ERROR: "
116 |     p values
117 |     raise
118 |   end
119 |   $stderr.print(res,"\n") if do_debug
120 |   res
121 | end
122 |     end
123 |   end 
124 | end
125 | 
126 | if __FILE__ == $0
127 | 
128 | gatkcommandline = <<LINE1
129 | ##GATKCommandLine=<ID=CombineVariants,Version=3.2-2-gec30cee,Date="Thu Oct 30 13:41:59 CET 2014",Epoch=1414672919266,CommandLineOptions="analysis_type=CombineVariants input_file=[] showFullBamList=false read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/hpc/cog_bioinf/GENOMES/Homo_sapiens.GRCh37.GATK.illumina/Homo_sapiens.GRCh37.GATK.illumina.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=1000 baq=OFF baqGapOpenPenalty=40.0 refactor_NDN_cigar_string=false fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=DYNAMIC_SEEK variant_index_parameter=-1 logging_level=INFO log_to_file=null help=false version=false variant=[(RodBindingCollection [(RodBinding name=variant source=/hpc/cog_bioinf/data/robert/testIAP/testSubsetExome/tmp/testSubsetExome.filtered_snps.vcf)]), (RodBindingCollection [(RodBinding name=variant2 source=/hpc/cog_bioinf/data/robert/testIAP/testSubsetExome/tmp/testSubsetExome.filtered_indels.vcf)])] out=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub 
no_cmdline_in_header=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub genotypemergeoption=UNSORTED filteredrecordsmergetype=KEEP_IF_ANY_UNFILTERED multipleallelesmergetype=BY_TYPE rod_priority_list=null printComplexMerges=false filteredAreUncalled=false minimalVCF=false excludeNonVariants=false setKey=set assumeIdenticalSamples=false minimumN=1 suppressCommandLineHeader=false mergeInfoWithMaxAC=false filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
130 | LINE1
131 | 
132 | h = {}
133 | s = gatkcommandline.strip
134 | # print s,"\n"
135 | result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true)
136 | # h[result['ID']] = result
137 | # p result
138 | 
139 | lines = <<LINES
140 | ##FILTER=<ID=HaplotypeScoreHigh,Description="HaplotypeScore > 13.0">
141 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
142 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth",Extra="Yes?">
143 | ##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
144 | ##INFO=<ID=PM,Number=0,Type=Flag,Description="Variant is Precious(Clinical,Pubmed Cited)">
145 | ##INFO=<ID=VP,Number=1,Type=String,Description="Variation Property.  Documentation is at ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf",Source="dbsnp",Version="138">
146 | ##INFO=<ID=GENEINFO,Number=1,Type=String,Description="Pairs each of gene symbol:gene id.  The gene symbol and id are delimited by a colon (:), and each pair is delimited by a vertical bar (|)">
147 | ##INFO=<ID=CLNHGVS,Number=.,Type=String,Description="Variant names from HGVS. The order of these variants corresponds to the order of the info in the other clinical  INFO tags.">
148 | ##INFO=<ID=CLNHGVS1,Number=.,Type=String,Description="Variant names from \\"HGVS\\". The order of these 'variants' corresponds to the order of the info in the other clinical  INFO tags.">
149 | ##contig=<ID=XXXY12>
150 | ##contig=<ID=Y,length=59373566>
151 | LINES
152 | 
153 | h = {}
154 | lines.strip.split("\n").each { |s|
155 |   # print s,"\n"
156 |   result = BioVcf::VcfHeaderParser::RagelKeyValues.run_lexer(s, debug: true)
157 |   h[result['ID']] = result
158 |   p result
159 | }
160 | p h
161 | 
162 | raise "ERROR" if h != {"HaplotypeScoreHigh"=>{"ID"=>"HaplotypeScoreHigh", "Description"=>"HaplotypeScore > 13.0"}, "GT"=>{"ID"=>"GT", "Number"=>"1", "Type"=>"String", "Description"=>"Genotype"}, "DP"=>{"ID"=>"DP", "Number"=>"1", "Type"=>"Integer", "Description"=>"Total read depth", "Extra"=>"Yes?"}, "DP4"=>{"ID"=>"DP4", "Number"=>"4", "Type"=>"Integer", "Description"=>"# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases"}, "PM"=>{"ID"=>"PM", "Number"=>"0", "Type"=>"Flag", "Description"=>"Variant is Precious(Clinical,Pubmed Cited)"}, "VP"=>{"ID"=>"VP", "Number"=>"1", "Type"=>"String", "Description"=>"Variation Property.  Documentation is at ftp://ftp.ncbi.nlm.nih.gov/snp/specs/dbSNP_BitField_latest.pdf", "Source"=>"dbsnp", "Version"=>"138"}, "GENEINFO"=>{"ID"=>"GENEINFO", "Number"=>"1", "Type"=>"String", "Description"=>"Pairs each of gene symbol:gene id.  The gene symbol and id are delimited by a colon (:), and each pair is delimited by a vertical bar (|)"}, "CLNHGVS"=>{"ID"=>"CLNHGVS", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from HGVS. The order of these variants corresponds to the order of the info in the other clinical  INFO tags."}, "CLNHGVS1"=>{"ID"=>"CLNHGVS1", "Number"=>".", "Type"=>"String", "Description"=>"Variant names from \\\"HGVS\\\". The order of these 'variants' corresponds to the order of the info in the other clinical  INFO tags."}, "XXXY12"=>{"ID"=>"XXXY12"}, "Y"=>{"ID"=>"Y", "length"=>59373566}}
163 | 
164 | 
165 | end # test
166 | 


--------------------------------------------------------------------------------
/lib/bio-vcf/pcows.rb:
--------------------------------------------------------------------------------
  1 | # Parallel copy-on-write streaming (PCOWS)
  2 | 
  3 | require 'tempfile'
  4 | 
  5 | class PCOWS
  6 | 
  7 |   RUNNINGEXT = 'part' # file extension
  8 |   
  9 |   def initialize(num_threads,chunk_size,name=File.basename(__FILE__),timeout=180,quiet=false,debug=false)
 10 |     num_threads = cpu_count() if not num_threads # FIXME: set to cpu_num by default
 11 |     # $stderr.print "Using ",num_threads,"threads \n"
 12 |     @num_threads = num_threads
 13 |     @chunk_size = chunk_size
 14 |     @pid_list = []
 15 |     @name = name
 16 |     @timeout = timeout
 17 |     @quiet = quiet
 18 |     @debug = debug
 19 |     if @debug
 20 |       $stderr.print "PCOWS running in DEBUG MODE\n"
 21 |     end
 22 |     if multi_threaded
 23 |       @tmpdir =  Dir::mktmpdir(@name+'_')
 24 |     end
 25 |     @last_output = 0 # counter
 26 |     @output_locked = false
 27 |   end
 28 | 
 29 |   # Feed the worker 'func and state' to COWS. Note that func is a
 30 |   # lambda closure so it can pick up surrounding scope at invocation
 31 |   # in addition to the data captured in 'state'.
 32 |   
 33 |   def submit_worker(func,state)
 34 |     pid = nil
 35 |     if multi_threaded
 36 |       count = @pid_list.size+1
 37 |       fn = mktmpfilename(count)
 38 |       pid = fork do
 39 |         # ---- This is running a new copy-on-write process
 40 |         tempfn = fn+'.'+RUNNINGEXT
 41 |         STDOUT.reopen(File.open(tempfn, 'w+'))
 42 |         func.call(state).each { | line | print line }
 43 |         STDOUT.flush
 44 |         STDOUT.close
 45 |         # sleep 0.1
 46 |         # f.flush
 47 |         # f.close
 48 |         # sleep 0.2  # interval to make sure we are done writing,
 49 |                    # otherwise there may be misses at the end of a
 50 |                    # block (maybe the f.close fixed it)
 51 | 
 52 |         FileUtils::mv(tempfn,fn)
 53 |         exit(0)
 54 |       end
 55 |       Process.detach(pid)
 56 |     else
 57 |       # ---- Single threaded: call in main process and output immediately
 58 |       func.call(state).each { | line | print line }
 59 |     end
 60 |     @pid_list << [ pid,count,fn ]
 61 |     return true
 62 |   end
 63 | 
 64 |   def submit_final_worker(func,state)
 65 |     @final_worker = true
 66 |     submit_worker(func,state)
 67 |   end
 68 |   
 69 |   # Make sure no more than num_threads are running at the same time -
 70 |   # this is achieved by checking the PID table and the running files
 71 |   # in the tmpdir
 72 | 
 73 |   def wait_for_worker_slot()
 74 |     return if single_threaded
 75 |     Timeout.timeout(@timeout) do
 76 |       printed_timeout_message = false
 77 |       while true
 78 |         # ---- count running pids
 79 |         running = @pid_list.reduce(0) do | sum, info |
 80 |           (pid,count,fn) = info
 81 |           if pid_or_file_running?(pid,fn)
 82 |             sum+1
 83 |           else
 84 |             sum
 85 |           end
 86 |         end
 87 |         return if running < @num_threads
 88 |         if not printed_timeout_message
 89 |           $stderr.print "Waiting for slot (timeout=#{@timeout})\n" if not @quiet
 90 |           printed_timeout_message = true
 91 |         end
 92 |         sleep 0.1        
 93 |       end
 94 |     end
 95 |   end
 96 | 
 97 |   # ---- In this section the output gets collected and passed on to a
 98 |   #      printer thread. This function makes sure the printing is
 99 |   #      ordered and that no printers are running at the same
100 |   #      time. The printer thread should be doing as little processing
101 |   #      as possible.
102 |   #
103 |   #      In this implementation type==:by_line will call func for
104 |   #      each line. Otherwise it is called once with the filename.
105 |   def process_output(func=nil,type=:by_line, blocking=false)
106 |     return if single_threaded
107 |     output = lambda { |fn|
108 |       if type == :by_line
109 |         File.new(fn).each_line { |buf|
110 |           print buf
111 |         }
112 |       else
113 |         func.call(fn)
114 |       end
115 |     }
116 |     if @output_locked
117 |       # ---- is the other thread still running? We wait until it
118 |       #      is finished to start the next one
119 |       (pid,count,fn) = @output_locked
120 |       $stderr.print "Checking for output_lock on existing #{fn}\n" if not @quiet
121 |       return if File.exist?(fn)  # continue because thread still processing
122 |       # Now we should remove the .keep file
123 |       cleanup_keep_file(fn)
124 |       @last_output += 1          # get next one in line
125 |       @output_locked = false
126 |     end
127 |     # ---- process the next output chunk. After completion it
128 |     #      gets renamed to chunk.keep. This to avoid missing
129 |     #      output (if we unlink the file prematurely)
130 |     if info = @pid_list[@last_output]
131 |       (pid,count,fn) = info
132 |       $stderr.print "Testing (#{@last_output}) for output file ",[info],"\n" if @debug
133 |       if File.exist?(fn)
134 |         # Yes! We have the next output, create outputter
135 |         @output_locked = info
136 |         $stderr.print "Set lock on ",[info],"\n" if not @quiet
137 |         if not blocking
138 |           $stderr.print "Processing output file #{fn} (non-blocking)\n" if not @quiet
139 |           pid = fork do
140 |             output.call(fn)
141 |             # after finishing output move it to .keep
142 |             FileUtils::mv(fn,fn+'.keep')
143 |             exit(0)
144 |           end
145 |           Process.detach(pid)
146 |         else
147 |           $stderr.print "Processing output file #{fn} (blocking)\n" if not @quiet
148 |           output.call(fn)
149 |           FileUtils::mv(fn,fn+'.keep')
150 |         end
151 |       else
152 |         sleep 0.2
153 |       end
154 |     end
155 |   end
156 | 
157 |   # Wait for a worker slot to appear. When working the pid is writing
158 |   # a file with extension .part(ial). After completion the file is
159 |   # renamed without .part and a slot is free.
160 |   def wait_for_worker(info)
161 |     (pid,count,fn) = info
162 |     if pid_or_file_running?(pid,fn)
163 |       $stderr.print "Waiting up to #{@timeout} seconds for pid=#{pid} to complete #{fn}\n" if not @quiet
164 |       begin
165 |         Timeout.timeout(@timeout) do
166 |           while not File.exist?(fn)  # wait for the result to appear
167 |             sleep 0.2
168 |             return if not pid_or_file_running?(pid,fn) # worker is gone
169 |           end
170 |         end
171 |         # Partial file should have been renamed:
172 |         raise "FATAL: child process #{pid} appears to have crashed #{fn}" if not File.exist?(fn)
173 |         $stderr.print "OK pid=#{pid}, processing starts of #{fn}\n" if not @quiet
174 |       rescue Timeout::Error
175 |         # Kill it to speed up exit
176 |         Process.kill 9, pid
177 |         Process.wait pid
178 |         $stderr.print "FATAL: child process killed because it stopped responding, pid = #{pid}, fn = #{fn}, count = #{count}\n"
179 |         $stderr.print "Bailing out"
180 |         raise
181 |       end
182 |     end
183 |   end
184 |   
185 |   # This is the final cleanup after the reader thread is done. All workers
186 |   # need to complete.
187 |   
188 |   def wait_for_workers()
189 |     return if single_threaded
190 |     @pid_list.each do |info|
191 |       wait_for_worker(info)
192 |     end
193 |   end
194 | 
195 |   def process_remaining_output()
196 |     return if single_threaded
197 |     $stderr.print "Processing remaining output...\n" if not @quiet
198 |     while @output_locked
199 |       sleep 0.2
200 |       process_output() # keep trying
201 |     end
202 |     @pid_list.each do |info|
203 |       (pid,count,fn) = info
204 |       while pid_or_file_running?(pid,fn) or File.exist?(fn)
205 |         $stderr.print "Trying: ",[info],"\n" if not @quiet
206 |         process_output(nil,:by_line,true)
207 |         sleep 0.2
208 |       end
209 |     end
210 |     while @output_locked
211 |       sleep 0.1
212 |       process_output(nil,:by_line,true)
213 |     end
214 |     cleanup_tmpdir()
215 |   end
216 | 
217 |   def cleanup()
218 |     @pid_list.each do |info|
219 |       (pid,count,fn) = info
220 |       if pid_running?(pid)
221 |         $stderr.print "Killing child ",[info],"\n"
222 |         begin
223 |           Process.kill 9, pid
224 |           Process.wait pid
225 |         rescue Errno::ENOENT
226 |           $stdout.puts "INFO: #{pidfile} did not exist: Errno::ENOENT" if not @quiet
227 |         rescue Errno::ESRCH
228 |           $stdout.puts "INFO: The process #{opid} did not exist: Errno::ESRCH" if not @quiet
229 |         end
230 |       end
231 |       File.unlink(fn) if File.exist?(fn)
232 |       cleanup_keep_file(fn,wait: false)
233 |       tempfn = fn+'.'+RUNNINGEXT
234 |       File.unlink(tempfn) if File.exist?(tempfn)
235 |     end
236 |     cleanup_tmpdir()
237 |   end
238 |   
239 |   private
240 |   
241 |   def mktmpfilename(num,ext=nil)
242 |     @tmpdir+sprintf("/%0.6d-",num)+@name+(ext ? '.'+ext : '')
243 |   end
244 |   
245 |   def pid_or_file_running?(pid,fn)
246 |     (pid && pid_running?(pid)) or File.exist?(fn+'.'+RUNNINGEXT)
247 |   end
248 |   
249 |   def pid_running?(pid)
250 |     begin
251 |       fpid,status=Process.waitpid2(pid,Process::WNOHANG)
252 |     rescue Errno::ECHILD, Errno::ESRCH
253 |       return false
254 |     end
255 |     return true if nil == fpid && nil == status
256 |     return ! (status.exited? || status.signaled?)
257 |   end
258 | 
259 |   def single_threaded
260 |     @num_threads == 1
261 |   end
262 |   
263 |   def multi_threaded
264 |     @num_threads > 1
265 |   end
266 | 
267 |   def cpu_count
268 |     begin
269 |       return File.read('/proc/cpuinfo').scan(/^processor\s*:/).size if File.exist? '/proc/cpuinfo'
270 |       # Actually, the JVM does not allow fork...
271 |       return Java::Java.lang.Runtime.getRuntime.availableProcessors if defined? Java::Java
272 |     rescue LoadError
273 |       # Count on MAC
274 |       return Integer `sysctl -n hw.ncpu 2>/dev/null`
275 |     end
276 |     $stderr.print "Could not determine number of CPUs" if not @quiet
277 |     1
278 |   end
279 | 
280 |   def cleanup_keep_file(fn, opts = { wait: true })
281 |     if not @debug
282 |       keep = fn+'.keep'
283 |       return if not opts[:wait] and !File.exist?(keep)
284 |       $stderr.print "Trying to remove #{keep}\n" if not @quiet
285 |       while true
286 |         if File.exist?(keep)
287 |           $stderr.print "Removing #{keep}\n" if not @quiet
288 |           File.unlink(keep)
289 |           break # forever loop
290 |         end
291 |         sleep 0.1
292 |       end #forever
293 |     end
294 |   end
295 | 
296 |   def cleanup_tmpdir
297 |     if not @debug
298 |       $stderr.print "Removing dir #{@tmpdir}\n" if not @quiet
299 |       Dir.unlink(@tmpdir) if @tmpdir
300 |     end
301 |   end
302 |   
303 | end
304 | 


--------------------------------------------------------------------------------
/test/data/regression/eval_r.info.dp.ref:
--------------------------------------------------------------------------------
  1 | ##fileformat=VCFv4.1
  2 | ##FILTER=<ID=LowQual,Description="Low quality">
  3 | ##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
  4 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
  5 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
  6 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  7 | ##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
  8 | ##GATKCommandLine=<ID=UnifiedGenotyper,Version=2.8-1-g932cd3a,Date="Sat Jan 25 10:33:56 CET 2014",Epoch=1390642436187,CommandLineOptions="analysis_type=UnifiedGenotyper input_file=[/data_fedor12/BAM/sander/Liver_clones/BIOPSY17513D/mapping/BIOPSY17513D_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/clone3/mapping/clone3_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/clone4/mapping/clone4_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/clone10/mapping/clone10_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/subclone33/mapping/subclone33_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/subclone46/mapping/subclone46_dedup_realigned_recalibrated.bam, /data_fedor12/BAM/sander/Liver_clones/subclone105/mapping/subclone105_dedup_realigned_recalibrated.bam] read_buffer_size=null phone_home=AWS gatk_key=null tag=NA read_filter=[] intervals=[/data_fedor13/sander/variant_calling/Liver_clones/.queue/scatterGather/UnifiedGenotyper_noref-1-sg/temp_001_of_500/scatter.intervals] excludeIntervals=null interval_set_rule=UNION interval_merging=ALL interval_padding=0 reference_sequence=/data_fedor13/common_data/references/H_sapiens/GATK_b37_bundle_reference/basespace/human_g1k_v37.fasta nonDeterministicRandomSeed=false disableDithering=false maxRuntime=-1 maxRuntimeUnits=MINUTES downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 fix_misencoded_quality_scores=false allow_potentially_misencoded_quality_scores=false useOriginalQualities=false defaultBaseQualities=-1 performanceLog=null BQSR=null quantize_quals=0 disable_indel_quals=false emit_original_quals=false preserve_qscores_less_than=6 globalQScorePrior=-1.0 allow_bqsr_on_reduced_bams_despite_repeated_warnings=false validation_strictness=SILENT remove_program_records=false keep_program_records=false sample_rename_mapping_file=null unsafe=null 
disable_auto_index_creation_and_locking_when_reading_rods=false num_threads=1 num_cpu_threads_per_data_thread=1 num_io_threads=0 monitorThreadEfficiency=false num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false generateShadowBCF=false variant_index_type=DYNAMIC_SEEK variant_index_parameter=-1 logging_level=INFO log_to_file=null help=false version=false genotype_likelihoods_model=SNP pcr_error_rate=1.0E-4 computeSLOD=false annotateNDA=false pair_hmm_implementation=LOGLESS_CACHING min_base_quality_score=17 max_deletion_fraction=0.05 allSitePLs=false min_indel_count_for_genotyping=5 min_indel_fraction_per_sample=0.25 indelGapContinuationPenalty=10 indelGapOpenPenalty=45 indelHaplotypeSize=80 indelDebug=false ignoreSNPAlleles=false allReadsSP=false ignoreLaneInfo=false reference_sample_calls=(RodBinding name= source=UNBOUND) reference_sample_name=null sample_ploidy=2 min_quality_score=1 max_quality_score=40 site_quality_prior=20 min_power_threshold_for_calling=0.95 min_reference_depth=100 exclude_filtered_reference_sites=false output_mode=EMIT_VARIANTS_ONLY heterozygosity=0.001 indel_heterozygosity=1.25E-4 genotyping_mode=DISCOVERY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 alleles=(RodBinding name= source=UNBOUND) max_alternate_alleles=6 input_prior=[] contamination_fraction_to_filter=0.0 contamination_fraction_per_sample_file=null p_nonref_model=EXACT_INDEPENDENT exactcallslog=null dbsnp=(RodBinding name= source=UNBOUND) comp=[] out=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub no_cmdline_in_header=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub bcf=org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub onlyEmitSamples=[] debug_file=null metrics_file=null annotation=[] excludeAnnotation=[] 
filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false">
  9 | ##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
 10 | ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
 11 | ##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
 12 | ##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
 13 | ##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
 14 | ##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
 15 | ##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
 16 | ##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
 17 | ##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
 18 | ##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
 19 | ##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
 20 | ##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
 21 | ##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
 22 | ##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
 23 | ##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
 24 | ##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
 25 | ##INFO=<ID=RPA,Number=.,Type=Integer,Description="Number of times tandem repeat unit is repeated, for each allele (including reference)">
 26 | ##INFO=<ID=RU,Number=1,Type=String,Description="Tandem repeat unit (bases)">
 27 | ##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
 28 | ##INFO=<ID=STR,Number=0,Type=Flag,Description="Variant is a short tandem repeat">
 29 | ##contig=<ID=1,length=249250621,assembly=b37>
 30 | ##contig=<ID=2,length=243199373,assembly=b37>
 31 | ##contig=<ID=3,length=198022430,assembly=b37>
 32 | ##contig=<ID=4,length=191154276,assembly=b37>
 33 | ##contig=<ID=5,length=180915260,assembly=b37>
 34 | ##contig=<ID=6,length=171115067,assembly=b37>
 35 | ##contig=<ID=7,length=159138663,assembly=b37>
 36 | ##contig=<ID=8,length=146364022,assembly=b37>
 37 | ##contig=<ID=9,length=141213431,assembly=b37>
 38 | ##contig=<ID=10,length=135534747,assembly=b37>
 39 | ##contig=<ID=11,length=135006516,assembly=b37>
 40 | ##contig=<ID=12,length=133851895,assembly=b37>
 41 | ##contig=<ID=13,length=115169878,assembly=b37>
 42 | ##contig=<ID=14,length=107349540,assembly=b37>
 43 | ##contig=<ID=15,length=102531392,assembly=b37>
 44 | ##contig=<ID=16,length=90354753,assembly=b37>
 45 | ##contig=<ID=17,length=81195210,assembly=b37>
 46 | ##contig=<ID=18,length=78077248,assembly=b37>
 47 | ##contig=<ID=19,length=59128983,assembly=b37>
 48 | ##contig=<ID=20,length=63025520,assembly=b37>
 49 | ##contig=<ID=21,length=48129895,assembly=b37>
 50 | ##contig=<ID=22,length=51304566,assembly=b37>
 51 | ##contig=<ID=X,length=155270560,assembly=b37>
 52 | ##contig=<ID=Y,length=59373566,assembly=b37>
 53 | ##contig=<ID=MT,length=16569,assembly=b37>
 54 | ##contig=<ID=GL000207.1,length=4262,assembly=b37>
 55 | ##contig=<ID=GL000226.1,length=15008,assembly=b37>
 56 | ##contig=<ID=GL000229.1,length=19913,assembly=b37>
 57 | ##contig=<ID=GL000231.1,length=27386,assembly=b37>
 58 | ##contig=<ID=GL000210.1,length=27682,assembly=b37>
 59 | ##contig=<ID=GL000239.1,length=33824,assembly=b37>
 60 | ##contig=<ID=GL000235.1,length=34474,assembly=b37>
 61 | ##contig=<ID=GL000201.1,length=36148,assembly=b37>
 62 | ##contig=<ID=GL000247.1,length=36422,assembly=b37>
 63 | ##contig=<ID=GL000245.1,length=36651,assembly=b37>
 64 | ##contig=<ID=GL000197.1,length=37175,assembly=b37>
 65 | ##contig=<ID=GL000203.1,length=37498,assembly=b37>
 66 | ##contig=<ID=GL000246.1,length=38154,assembly=b37>
 67 | ##contig=<ID=GL000249.1,length=38502,assembly=b37>
 68 | ##contig=<ID=GL000196.1,length=38914,assembly=b37>
 69 | ##contig=<ID=GL000248.1,length=39786,assembly=b37>
 70 | ##contig=<ID=GL000244.1,length=39929,assembly=b37>
 71 | ##contig=<ID=GL000238.1,length=39939,assembly=b37>
 72 | ##contig=<ID=GL000202.1,length=40103,assembly=b37>
 73 | ##contig=<ID=GL000234.1,length=40531,assembly=b37>
 74 | ##contig=<ID=GL000232.1,length=40652,assembly=b37>
 75 | ##contig=<ID=GL000206.1,length=41001,assembly=b37>
 76 | ##contig=<ID=GL000240.1,length=41933,assembly=b37>
 77 | ##contig=<ID=GL000236.1,length=41934,assembly=b37>
 78 | ##contig=<ID=GL000241.1,length=42152,assembly=b37>
 79 | ##contig=<ID=GL000243.1,length=43341,assembly=b37>
 80 | ##contig=<ID=GL000242.1,length=43523,assembly=b37>
 81 | ##contig=<ID=GL000230.1,length=43691,assembly=b37>
 82 | ##contig=<ID=GL000237.1,length=45867,assembly=b37>
 83 | ##contig=<ID=GL000233.1,length=45941,assembly=b37>
 84 | ##contig=<ID=GL000204.1,length=81310,assembly=b37>
 85 | ##contig=<ID=GL000198.1,length=90085,assembly=b37>
 86 | ##contig=<ID=GL000208.1,length=92689,assembly=b37>
 87 | ##contig=<ID=GL000191.1,length=106433,assembly=b37>
 88 | ##contig=<ID=GL000227.1,length=128374,assembly=b37>
 89 | ##contig=<ID=GL000228.1,length=129120,assembly=b37>
 90 | ##contig=<ID=GL000214.1,length=137718,assembly=b37>
 91 | ##contig=<ID=GL000221.1,length=155397,assembly=b37>
 92 | ##contig=<ID=GL000209.1,length=159169,assembly=b37>
 93 | ##contig=<ID=GL000218.1,length=161147,assembly=b37>
 94 | ##contig=<ID=GL000220.1,length=161802,assembly=b37>
 95 | ##contig=<ID=GL000213.1,length=164239,assembly=b37>
 96 | ##contig=<ID=GL000211.1,length=166566,assembly=b37>
 97 | ##contig=<ID=GL000199.1,length=169874,assembly=b37>
 98 | ##contig=<ID=GL000217.1,length=172149,assembly=b37>
 99 | ##contig=<ID=GL000216.1,length=172294,assembly=b37>
100 | ##contig=<ID=GL000215.1,length=172545,assembly=b37>
101 | ##contig=<ID=GL000205.1,length=174588,assembly=b37>
102 | ##contig=<ID=GL000219.1,length=179198,assembly=b37>
103 | ##contig=<ID=GL000224.1,length=179693,assembly=b37>
104 | ##contig=<ID=GL000223.1,length=180455,assembly=b37>
105 | ##contig=<ID=GL000195.1,length=182896,assembly=b37>
106 | ##contig=<ID=GL000212.1,length=186858,assembly=b37>
107 | ##contig=<ID=GL000222.1,length=186861,assembly=b37>
108 | ##contig=<ID=GL000200.1,length=187035,assembly=b37>
109 | ##contig=<ID=GL000193.1,length=189789,assembly=b37>
110 | ##contig=<ID=GL000194.1,length=191469,assembly=b37>
111 | ##contig=<ID=GL000225.1,length=211173,assembly=b37>
112 | ##contig=<ID=GL000192.1,length=547496,assembly=b37>
113 | ##reference=file:human_g1k_v37.fasta
114 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Original	s1t1	s2t1	s3t1	s1t2	s2t2	s3t2
115 | 1518
116 | 1433
117 | 1440
118 | 1460
119 | 1500
120 | 1537
121 | 1641
122 | 129
123 | 28
124 | 22
125 | 18
126 | 939
127 | 721
128 | 703
129 | 929
130 | 692
131 | 1149
132 | 1108
133 | 830
134 | 764
135 | 809
136 | 754
137 | 719
138 | 1083
139 | 1092
140 | 1089
141 | 1050
142 | 939
143 | 1004
144 | 1090
145 | 992
146 | 636
147 | 1213
148 | 904
149 | 398
150 | 526
151 | 


--------------------------------------------------------------------------------
/doc/Using_Mongo.md:
--------------------------------------------------------------------------------
  1 | # Using bio-vcf with MongoDB
  2 | 
  3 | bio-vcf can output many types of formats. In this exercise we will load
  4 | Mongo with VCF data and do some queries on that.
  5 | 
  6 | ## Install Mongo (Debian)
  7 | 
  8 | With su (password 'bioinformatics')
  9 | 
 10 | ```sh
 11 | su
 12 | apt-get install mongodb
 13 | ```
 14 | 
 15 | ## Install Mongo in $HOME
 16 | 
 17 | MongoDB is packaged by many distributions. Here we install it with GNU Guix. First check that the package is available:
 18 | 
 19 | ```sh
 20 | guix package -A mongodb
 21 |   mongodb 3.3.3   out     gn/packages/mongodb.scm:31:2
 22 | ```
 23 | 
 24 | Create a directory for the database
 25 | 
 26 | ```sh
 27 | mkdir -p ~/opt/var/mongodb
 28 | mkdir -p ~/opt/etc
 29 | ```
 30 | 
 31 | And create a configuration file ~/opt/etc/mongo.conf
 32 | 
 33 | ```
 34 | verbose = true
 35 | port = 27017
 36 | dbpath = /home/user/opt/var/mongodb/
 37 | noauth = true
 38 | maxConns = 5
 39 | rest = true
 40 | ```
 41 | 
 42 | and run Mongo
 43 | 
 44 | ```sh
 45 | env LC_ALL=C mongod --config ~/opt/etc/mongo.conf
 46 | ```
 47 | 
 48 | ```javascript
 49 | use admin
 50 | db.createUser({user:"admin", pwd:"admin123", roles:[{role:"root", db:"admin"}]})
 51 | ```
 52 | 
 53 | ## Use client
 54 | 
 55 | ```python
 56 | mongo
 57 | use admin
 58 | db.createUser({user:"admin", pwd:"admin123", roles:[{role:"root", db:"admin"}]})
 59 | ```
 60 | 
 61 | or on a different host
 62 | 
 63 | ```sh
 64 | mongo --host 192.168.1.24
 65 | ```
 66 | 
 67 | ## Tutorial Mongo
 68 | 
 69 | Using the example on MongoDB's [website](https://docs.mongodb.org/getting-started/shell/import-data/)
 70 | 
 71 | ### Load data
 72 | 
 73 | Records look like:
 74 | 
 75 | ```javascript
 76 | {"address": {"building": "2780", "coord": [-73.98241999999999, 40.579505], "street": "Stillwell Avenue", "zipcode": "11224"}, "borough": "Brooklyn", "cuisine": "American ", "grades": [{"date": {"$date": 1402358400000}, "grade": "A", "score": 5}, {"date": {"$date": 1370390400000}, "grade": "A", "score": 7}, {"date": {"$date": 1334275200000}, "grade": "A", "score": 12}, {"date": {"$date": 1318377600000}, "grade": "A", "score": 12}], "name": "Riviera Caterer", "restaurant_id": "40356018"}
 77 | {"address": {"building": "351", "coord": [-73.98513559999999, 40.7676919], "street": "West   57 Street", "zipcode": "10019"}, "borough": "Manhattan", "cuisine": "Irish", "grades": [{"date": {"$date": 1409961600000}, "grade": "A", "score": 2}, {"date": {"$date": 1374451200000}, "grade": "A", "score": 11}, {"date": {"$date": 1343692800000}, "grade": "A", "score": 12}, {"date": {"$date": 1325116800000}, "grade": "A", "score": 12}], "name": "Dj Reynolds Pub And Restaurant", "restaurant_id": "30191841"}
 78 | ```
 79 | 
 80 | Note there are no specific identifiers. Or are there?
 81 | 
 82 | ```sh
 83 | wget https://raw.githubusercontent.com/mongodb/docs-assets/primer-dataset/primer-dataset.json
 84 | mongoimport --db test --collection restaurants --drop --file primer-dataset.json
 85 | Mon Apr 11 00:24:50.963 dropping: test.restaurants
 86 | Mon Apr 11 00:24:52.375 check 9 25359
 87 | Mon Apr 11 00:24:52.448 imported 25359 objects
 88 | ```
 89 | 
 90 | ### Use the shell
 91 | 
 92 | Run the mongo shell with
 93 | 
 94 | ```sh
 95 | mongo
 96 | ```
 97 | 
 98 | ```javascript
 99 | use test
100 | db.restaurants.find()
101 | db.restaurants.find( { "borough": "Manhattan" } )
102 | db.restaurants.find( { "grades.score": { $gt: 30 } } )
103 | ... AND ...
104 | db.restaurants.find( { "cuisine": "Italian", "address.zipcode": "10075" ,"grades.score": { $gt: 30 }} )
105 | ... OR ...
106 | db.restaurants.find(
107 |    { $or: [ { "cuisine": "Italian" }, { "address.zipcode": "10075" } ] }
108 | )
109 | ... SORT ...
110 | db.restaurants.find().sort( { "borough": 1, "address.zipcode": 1 } )
111 | ... Count ...
112 | db.restaurants.aggregate(
113 |    [
114 |      { $group: { "_id": "$borough", "count": { $sum: 1 } } }
115 |    ]
116 |    );
117 | 
118 | db.restaurants.aggregate(
119 |    [
120 |      { $match: { "borough": "Queens", "cuisine": "Brazilian" } },
121 |      { $group: { "_id": "$address.zipcode" , "count": { $sum: 1 } } }
122 |    ]
123 |    );
124 | ... Index ...
125 | db.restaurants.createIndex( { "cuisine": 1, "address.zipcode": -1 } )
126 | ```
127 | 
128 | ### Prepare template with bio-vcf
129 | 
130 | ```sh
131 | wget https://github.com/pjotrp/bioruby-vcf/raw/master/test/data/input/gatk_exome.vcf
132 | cat gatk_exome.vcf |bio-vcf --eval '[r.chr,r.pos]'
133 | ```
134 | 
135 | Let's create a template named gatk_template.json
136 | 
137 | ```ruby
138 | 
139 | {
140 |     "rec": {
141 |             "chr": "<%= rec.chrom %>",
142 |             "pos": <%= rec.pos %>,
143 |             "ref": "<%= rec.ref %>",
144 |             "alt": "<%= rec.alt[0] %>",
145 |             "dp":  <%= rec.info.dp %>
146 |     }
147 | }
148 | ```
149 | 
150 | And run it
151 | 
152 | ```sh
153 | cat gatk_exome.vcf |bio-vcf --template gatk_template.json |less
154 | cat gatk_exome.vcf |bio-vcf --template gatk_template.json > gatk_exome.json
155 | ```
156 | 
157 | The output looks like
158 | 
159 | ```
160 | {
161 |   "rec": {
162 |             "chr": "X",
163 |             "pos": 134713855,
164 |             "ref": "G",
165 |             "alt": "A",
166 |             "dp":  4
167 |    }
168 | }
169 | ```
170 | 
171 | Import into mongo
172 | 
173 | With mongo v. 2.0.6:
174 | ```sh
175 | mongoimport --db gatk --collection vcf --drop --file gatk_exome.json --jsonArray
176 | ```
177 | With mongo v. 3.2.3:
178 | ```sh
179 | mongoimport --db gatk --collection vcf --drop --file gatk_exome.json
180 | ```
181 | 
182 | 
183 | 
184 | ```ruby
185 | use gatk
186 | db.vcf.find()
187 | db.vcf.find( { "rec.chr": "X" } )
188 | db.vcf.find( { "rec.chr": "X" } ).count()
189 | 3
190 | db.vcf.find( { "rec.dp": { $gt: 5 }}  )
191 | db.vcf.find( { "rec.dp": { $gt: 5 }}  ).count()
192 | 25
193 | ```
194 | 
195 | Comparable bio-vcf statements
196 | 
197 | ```
198 | cat gatk_exome.vcf |bio-vcf --eval '[r.chr,r.pos,r.ref,r.alt,r.info.dp]' --filter "r.chr=='X'"|grep -v '#' |wc -l
199 | =>"[r.chr,r.pos,r.ref,r.alt,r.info.dp]", :filter=>"r.chr=='X'"}
200 | 3
201 | cat gatk_exome.vcf |bio-vcf --eval '[r.chr,r.pos,r.ref,r.alt,r.info.dp]' --filter "r.info.dp>5"|grep -v '#' |wc -l
202 | =>"[r.chr,r.pos,r.ref,r.alt,r.info.dp]", :filter=>"r.info.dp>5"}
203 | 25
204 | ```
205 | 
206 | Exercise 1.
207 | 
208 | Using bio-vcf, take the QD field ("Variant Confidence/Quality by Depth")
209 | and filter on QD>12.0. How many records match? Answer: 112 out of 175.
210 | 
211 | Exercise 2.
212 | 
213 | Do the same with MongoDB: add a `qd` field to the template and re-import the data, so that you can run
214 | 
215 | ```ruby
216 | db.vcf.find( { "rec.qd": { $gt: 12.0 }}  ).count()
217 | 112
218 | ```
219 | 
220 | ## Now for some real data
221 | 
222 | Let's use our PIK3CA data in two samples
223 | 
224 | ```
225 | cat gene_PIK3CA.vcf |bio-vcf --samples 2,3  --seval s.dp
226 | cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3  --seval s.dp --sfilter "s.dp>7"
227 | cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 0,3 --sfilter 's.dp>20' --seval s.dp
228 | 3       178916645       24      39
229 | 3       178916651       30      31
230 | 3       178921407       32      43
231 | 3       178936082       24      24
232 | 3       178936091       27      32
233 | 3       178947904       23      33
234 | 3       178952072       38      45
235 | 3       178952085       35      45
236 | 3       178952088       34      45
237 | ```
238 | 
239 | Looking at annotations
240 | 
241 | ```
242 | cat gene_PIK3CA.vcf |bio-vcf --eval [r.chr,r.pos,r.info.ann] |grep ENST00000263967|wc -l
243 | 30
244 | ```
245 | 
246 | Alternatively, filter inside bio-vcf instead of using grep:
247 | 
248 | ```
249 | cat gene_PIK3CA.vcf |bio-vcf --eval '[r.chr,r.pos,r.info.ann]' --filter 'r.info.ann =~ /ENST00000263967/' --seval 's.dp'
250 | 3       178921407       T|synonymous_variant|LOW|PIK3CA|ENSG00000121879|transcript|ENST00000263967|protein_coding|5/21|c.889C>T|p.Leu297Leu|1046/9093|889/3207|297/1068||    32      32      38      43      27      34      30      37      32      36      44      37  25       27      43      30      11      23      19      37      28      17      13 ...
251 | ```
252 | 
253 | Let's try to do the same with Mongo. First add an `ann` field to the template:
254 | 
255 | ```
256 | 
257 | {
258 |   "rec": {
259 |             "chr": "<%= rec.chrom %>",
260 |             "pos": <%= rec.pos %>,
261 |             "ref": "<%= rec.ref %>",
262 |             "alt": "<%= rec.alt[0] %>",
263 |             "dp":  <%= rec.info.dp %>,
264 |             "ann":  '"<%= rec.info.ann %>"'
265 |    }
266 | }
267 | ```
268 | 
269 | ```sh
270 | mongoimport --db PIK3CA --collection vcf --drop --file PIK3CA.json --jsonArray
271 | ```
272 | 
273 | ```ruby
274 | db.vcf.find({"rec.ann": /ENST00000263967/i }).count()
275 | 30
276 | ```
277 | 
278 | ## Load results into Python
279 | 
280 | ```sh
281 | guix package -i python2-pip
282 | export PYTHONPATH="/home/user/.guix-profile/lib/python2.7/site-packages"
283 | pip install --install-option="--prefix=$HOME/opt/python" pymongo
284 | export PYTHONPATH="/home/user/.guix-profile/lib/python2.7/site-packages:$HOME/opt/python/lib/python2.7/site-packages"
285 | ```
286 | 
287 | Now start python:
288 | 
289 | ```python
290 | from pymongo import MongoClient
291 | 
292 | client = MongoClient()
293 | db = client.test
294 | # cursor = db.restaurants.find()
295 | cursor = db.restaurants.find({"borough": "Manhattan"})
296 | for document in cursor:
297 |     print(document)
298 |     print(document["cuisine"])
299 |     print(document["grades"][0]["score"]>10)
300 | 
301 | ```
302 | 
303 | ## Exercise 1
304 | 
305 | Write a Python script that queries the PIK3CA data imported into MongoDB for the annotation, as in
306 | 
307 | 
308 | ```ruby
309 | db.vcf.find({"rec.ann": /ENST00000263967/i }).count()
310 | 30
311 | ```
312 | 
313 | ## Exercise 2
314 | 
315 | Write a Python script using pymongo that queries the PIK3CA collection
316 | for something similar to
317 | 
318 | ```sh
319 | cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3  --seval s.dp --sfilter "s.dp>7"
320 | ```
321 | 
322 | when the bio-vcf template is
323 | 
324 | ```ruby
325 | 
326 | {
327 |   "rec": {
328 |             "chr": "<%= rec.chrom %>",
329 |             "pos": <%= rec.pos %>,
330 |             "ref": "<%= rec.ref %>",
331 |             "alt": "<%= rec.alt[0] %>",
332 |             "dp":  <%= rec.info.dp %>,
333 |             "samples": [
334 |               <%=
335 |               a = []
336 |               rec.each_sample { |s| a.push s.dp }
337 |               a.join(',')
338 |               %>
339 |    ]
340 |   }
341 | }
342 | ```
343 | 
344 | so that the output looks like
345 | 
346 | ```ruby
347 | {
348 |   "rec": {
349 |             "chr": "3",
350 |             "pos": 178916581,
351 |             "ref": "T",
352 |             "alt": "C",
353 |             "dp":  2345,
354 |             "samples": [
355 |               11,11,21,20,27,10,16,17,19,15,18,20,16,9,18,22,6,2,6,9,8,7,7,10,11,12,4,9,7,9,8,10,7,18,8,7
356 | ,7,4,11,4,8,8,8,14,13,23,13,11,12,3,10,27,31,16,12,1,3,4,15,10,20,8,4,0,25,2,10,9,13,20,17,14,25,15,19,16
357 | ,29,13,10,7,4,5,1,1,2,26,17,16,8,4,5,14,14,6,5,0,5,5,11,10,17,8,5,20,9,16,5,21,14,5,4,3,13,7,0,9,5,12,0,2
358 | ,9,14,2,4,7,1,15,7,14,12,4,14,16,26,7,22,5,4,7,10,11,14,19,25,11,2,28,25,29,30,23,30,35,33,32,27,4,30,25,
359 | 33,32,5,9,19,13,13,16,17,8,1,19,8,6,1,20,1,21,6,8,12,33,22,2,16,9,26,23
360 |    ]
361 |   }
362 | }
363 | 
364 | ```
365 | 
366 | Hint: the answer is 25
367 | 
368 | ```sh
369 | user@debian:~$ cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3  --seval s.dp --sfilter "s.dp>7"
370 | bio-vcf 0.9.2 (biogem Ruby 2.3.0 with pcows) by Pjotr Prins 2015
371 | Options: {:show_help=>false, :source=>"https://github.com/pjotrp/bioruby-vcf", :version=>"0.9.2 (Pjotr Prins)", :date=>"2016-04-11 12:11:27 +0200", :thread_lines=>40000, :timeout=>180, :sfilter_samples=>["2", "3"], :seval=>"s.dp", :skip_header=>true, :sfilter=>"s.dp>7"}
372 | 3       178916581       21      20
373 | 3       178916644       35      29
374 | 3       178916645       35      39
375 | 3       178916651       36      31
376 | 3       178916931       26      23
377 | 3       178917478       21      19
378 | 3       178919190       21      24
379 | 3       178921407       38      43
380 | 3       178921525       13      18
381 | 3       178921553       14      11
382 | 3       178922274       15      16
383 | 3       178922277       17      17
384 | 3       178922364       19      24
385 | 3       178927401       32      26
386 | 3       178927410       36      31
387 | 3       178927969       13      11
388 | 3       178928098       18      26
389 | 3       178936082       22      24
390 | 3       178936091       24      32
391 | 3       178938747       12      12
392 | 3       178941853       15      15
393 | 3       178947904       25      33
394 | 3       178952072       46      45
395 | 3       178952085       51      45
396 | 3       178952088       47      45
397 | user@debian:~$ cat gene_PIK3CA.vcf |bio-vcf --sfilter-samples 2,3  --seval s.dp --sfilter "s.dp>7"|wc -l
398 | bio-vcf 0.9.2 (biogem Ruby 2.3.0 with pcows) by Pjotr Prins 2015
399 | Options: {:show_help=>false, :source=>"https://github.com/pjotrp/bioruby-vcf", :version=>"0.9.2 (Pjotr Prins)", :date=>"2016-04-11 12:11:42 +0200", :thread_lines=>40000, :timeout=>180, :sfilter_samples=>["2", "3"], :seval=>"s.dp", :skip_header=>true, :sfilter=>"s.dp>7"}
400 | 25
401 | ```
402 | 


--------------------------------------------------------------------------------
/test/data/input/somaticsniper.vcf:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##fileDate=20140121
 3 | ##phasing=none
 4 | ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
 5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 6 | ##FORMAT=<ID=IGT,Number=1,Type=String,Description="Genotype when called independently (only filled if called in joint prior mode)">
 7 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
 8 | ##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
 9 | ##FORMAT=<ID=BCOUNT,Number=4,Type=Integer,Description="Occurrence count for each base at this site (A,C,G,T)">
10 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
11 | ##FORMAT=<ID=JGQ,Number=1,Type=Integer,Description="Joint genotype quality (only filled if called in join prior mode)">
12 | ##FORMAT=<ID=VAQ,Number=1,Type=Integer,Description="Variant allele quality">
13 | ##FORMAT=<ID=BQ,Number=.,Type=Integer,Description="Average base quality">
14 | ##FORMAT=<ID=MQ,Number=1,Type=Integer,Description="Average mapping quality across all reads">
15 | ##FORMAT=<ID=AMQ,Number=.,Type=Integer,Description="Average mapping quality for each allele present in the genotype">
16 | ##FORMAT=<ID=SS,Number=1,Type=Integer,Description="Variant status relative to non-adjacent Normal, 0=wildtype,1=germline,2=somatic,3=LOH,4=unknown">
17 | ##FORMAT=<ID=SSC,Number=1,Type=Integer,Description="Somatic Score">
18 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NORMAL	TUMOR
19 | 1	1636394	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:17:12,5,0,0:0,17,0,0:75:36:0:56:37:37:0:.	0/1:0/1:6:3,1,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:36
20 | 1	36217006	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:21:9,12,0,0:0,21,0,0:90:31:0:60:37:37:0:.	0/1:0/1:6:2,2,0,2:0,4,0,2:24:31:24:60,53:37:37,37:2:31
21 | 1	46527674	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:6,3,0,0:0,0,9,0:54:37:0:60:37:37:0:.	0/1:0/1:4:2,0,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:34
22 | 1	108417572	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,7,0,0:48:42:0:41:37:37:0:.	0/1:0/1:4:2,0,1,1:0,2,0,2:30:42:35:60,60:37:37,37:2:31
23 | 1	155170305	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:5,4,0,0:0,9,0,0:54:34:0:59:37:37:0:.	0/1:0/1:6:3,1,2,0:0,4,0,2:27:34:27:44,60:37:37,37:2:32
24 | 1	155449089	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:5,4,0,0:0,0,9,0:54:37:0:60:37:37:0:.	0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:34
25 | 1	169847826	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:6,3,0,0:0,0,9,0:54:37:0:60:37:37:0:.	0/1:0/1:4:2,0,2,0:2,0,2,0:30:37:30:60,60:37:37,37:2:34
26 | 1	203098164	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:3,7,0,0:0,10,0,0:57:40:0:53:37:37:0:.	0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:37
27 | 2	39213209	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:16:8,8,0,0:0,0,16,0:75:42:0:56:37:37:0:.	0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,59:37:37,37:2:42
28 | 2	86691250	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:6,2,0,0:0,0,8,0:51:42:0:59:37:37:0:.	0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,60:37:37,37:2:33
29 | 2	88874243	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:17:12,5,0,0:0,0,17,0:78:34:0:59:37:37:0:.	0/1:0/1:5:2,1,0,2:2,0,3,0:27:34:27:60,60:37:37,37:2:34
30 | 2	121728044	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:6,3,0,0:0,0,9,0:54:42:0:55:37:37:0:.	0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,57:37:37,37:2:36
31 | 2	170062591	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:4,5,0,0:0,0,9,0:54:39:0:60:37:37:0:.	0/1:0/1:5:2,1,1,1:2,0,3,0:32:39:32:60,60:37:37,37:2:35
32 | 2	216257844	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:2,5,0,0:0,0,7,0:48:40:0:60:37:37:0:.	0/1:0/1:3:1,0,0,2:2,0,1,0:1:40:33:60,60:37:37,37:2:30
33 | 2	222322623	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:15:6,9,0,0:0,15,0,0:72:33:0:60:37:37:0:.	0/1:0/1:7:4,1,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:33
34 | 3	25675413	.	A	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:6,2,0,0:8,0,0,0:51:32:0:59:37:37:0:.	0/1:0/1:4:1,1,0,2:2,0,0,2:6:32:25:34,37:37:37,37:2:30
35 | 3	36779638	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:13:8,5,0,0:0,13,0,0:66:36:0:60:37:37:0:.	0/1:0/1:6:1,3,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:36
36 | 3	123458847	.	T	C	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:4,6,0,0:0,0,0,10:54:39:0:41:37:37:0:.	0/1:0/1:5:1,2,1,1:0,2,0,3:32:39:32:55,45:37:37,37:2:35
37 | 3	124351308	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:7,1,0,0:0,0,8,0:51:42:0:58:37:37:0:.	0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,60:37:37,37:2:33
38 | 3	142171996	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:8,2,0,0:0,10,0,0:57:37:0:59:37:37:0:.	0/1:0/1:4:0,2,2,0:0,2,0,2:30:37:30:60,60:37:37,37:2:35
39 | 3	189526168	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:23:11,12,0,0:0,0,23,0:96:36:0:59:37:37:0:.	0/1:0/1:6:1,3,1,1:2,0,4,0:29:36:29:60,60:37:37,37:2:36
40 | 4	82058553	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:6,1,0,0:0,7,0,0:48:37:0:60:37:37:0:.	0/1:0/1:4:2,0,2,0:0,2,0,2:30:37:30:60,60:37:37,37:2:30
41 | 4	122769998	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:7,0,0,0:0,7,0,0:48:40:0:54:37:37:0:.	0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:30
42 | 5	13850856	.	G	C	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:3,5,0,0:0,0,8,0:51:40:0:51:37:37:0:.	0/1:0/1:3:0,1,2,0:0,2,1,0:1:40:33:49,60:37:37,37:2:33
43 | 5	132038609	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:12:10,2,0,0:0,12,0,0:63:33:0:58:37:37:0:.	0/1:0/1:7:3,2,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:33
44 | 5	137756599	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:13:7,6,0,0:0,13,0,0:66:31:0:60:37:37:0:.	0/1:0/1:6:2,2,2,0:0,4,0,2:24:31:24:60,60:37:37,37:2:31
45 | 5	141974902	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:9,5,0,0:0,14,0,0:69:40:0:60:37:37:0:.	0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:40
46 | 6	2749400	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:8,2,0,0:0,10,0,0:57:35:0:60:37:37:0:.	0/1:0/1:4:2,0,0,2:0,2,0,2:28:35:28:60,39:37:37,37:2:34
47 | 7	95217113	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,0,7,0:48:36:0:60:37:37:0:.	0/1:0/1:6:1,3,1,1:2,0,4,0:29:36:29:60,60:37:37,37:2:30
48 | 7	140434525	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:8,6,0,0:0,0,14,0:69:31:0:59:37:37:0:.	0/1:0/1:6:3,1,0,2:2,0,4,0:24:31:24:60,58:37:37,37:2:31
49 | 7	151856059	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:20:9,10,1,0:0,19,0,1:50:33:0:59:37:37:0:.	0/1:0/1:7:1,4,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:30
50 | 8	42958817	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:12:6,6,0,0:0,12,0,0:60:36:0:55:37:37:0:.	0/1:0/1:7:4,1,1,1:0,5,0,2:29:36:29:48,60:37:37,37:2:35
51 | 8	131070237	.	A	G	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:13:9,4,0,0:13,0,0,0:66:39:0:60:37:37:0:.	0/1:0/1:5:2,1,1,1:3,0,2,0:32:39:32:60,60:37:37,37:2:39
52 | 8	141711010	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:8,2,0,0:0,10,0,0:57:40:0:59:37:37:0:.	0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,55:37:37,37:2:37
53 | 8	145059674	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:7,7,0,0:0,14,0,0:69:34:0:56:37:37:0:.	0/1:0/1:5:2,1,0,2:0,3,0,2:27:34:27:60,60:37:37,37:2:34
54 | 9	111651620	.	A	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:4,3,0,0:7,0,0,0:48:92:0:60:37:37:0:.	0/1:0/1:7:1,2,3,1:3,0,0,4:58:92:85:60,60:37:37,37:2:31
55 | 9	111685156	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:13:9,4,0,0:0,0,13,0:66:37:0:60:37:37:0:.	0/1:0/1:4:1,1,2,0:2,0,2,0:12:37:30:60,37:37:37,37:2:37
56 | 10	6525571	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:0,8,0,0:0,0,8,0:51:37:0:55:37:37:0:.	0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:32
57 | 10	97197246	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:7,3,0,0:0,10,0,0:57:39:0:58:37:37:0:.	0/1:0/1:5:3,0,1,1:0,3,0,2:32:39:32:60,53:37:37,37:2:36
58 | 11	58949455	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:12:8,4,0,0:0,0,12,0:60:33:0:54:37:37:0:.	0/1:0/1:7:3,2,1,1:2,0,5,0:26:33:26:60,36:37:37,37:2:33
59 | 11	65481082	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:17:12,5,0,0:0,0,17,0:78:45:0:58:37:37:0:.	0/1:0/1:3:0,1,1,1:2,0,1,0:1:45:34:60,60:37:37,37:2:45
60 | 11	94180424	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:23:17,6,0,0:0,0,23,0:96:34:0:56:37:37:0:.	0/1:0/1:5:2,1,2,0:2,0,3,0:27:34:27:60,59:37:37,37:2:34
61 | 11	121036021	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:12:3,9,0,0:0,12,0,0:63:31:0:59:37:37:0:.	0/1:0/1:6:4,0,0,2:0,4,0,2:24:31:24:59,60:37:37,37:2:31
62 | 12	994952	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:1,8,0,0:0,9,0,0:54:33:0:59:37:37:0:.	0/1:0/1:7:2,3,1,1:0,5,0,2:26:33:26:51,55:37:37,37:2:32
63 | 12	69233187	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:5,4,0,0:0,9,0,0:51:33:0:53:37:37:0:.	0/1:0/1:7:4,1,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:30
64 | 12	77436879	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:4,6,0,0:0,0,10,0:57:34:0:60:37:37:0:.	0/1:0/1:5:2,1,0,2:2,0,3,0:27:34:27:60,60:37:37,37:2:33
65 | 12	96641273	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:4,4,0,0:0,0,8,0:51:33:0:60:37:37:0:.	0/1:0/1:7:2,3,1,1:2,0,5,0:26:33:26:60,50:37:37,37:2:30
66 | 12	110813986	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:4,4,0,0:0,0,8,0:51:39:0:59:37:37:0:.	0/1:0/1:5:1,2,1,1:2,0,3,0:32:39:32:60,60:37:37,37:2:33
67 | 12	122825587	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:21:12,9,0,0:0,21,0,0:90:34:0:59:37:37:0:.	0/1:0/1:5:3,0,2,0:0,3,0,2:27:34:27:60,60:37:37,37:2:34
68 | 14	30135337	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,7,0,0:48:40:0:60:37:37:0:.	0/1:0/1:3:1,0,0,2:0,1,0,2:1:40:33:60,60:37:37,37:2:30
69 | 14	51398458	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:4,5,0,0:0,0,9,0:54:36:0:55:37:37:0:.	0/1:0/1:6:0,4,1,1:2,0,4,0:29:36:29:60,59:37:37,37:2:33
70 | 15	43170722	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:11:6,5,0,0:0,11,0,0:60:31:0:60:37:37:0:.	0/1:0/1:6:4,0,2,0:0,4,0,2:24:31:24:56,54:37:37,37:2:31
71 | 15	50862183	.	C	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:7,2,0,0:0,9,0,0:54:35:0:60:37:37:0:.	0/1:0/1:4:0,2,0,2:2,2,0,0:28:35:28:45,59:37:37,37:2:33
72 | 15	64332347	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:6,1,0,0:0,7,0,0:48:39:0:60:37:37:0:.	0/1:0/1:5:3,0,1,1:0,3,0,2:32:39:32:58,56:37:37,37:2:30
73 | 15	80845030	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:1,8,0,0:0,9,0,0:54:31:0:60:37:37:0:.	0/1:0/1:6:2,2,2,0:0,4,0,2:24:31:24:60,60:37:37,37:2:30
74 | 16	1812938	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:11:8,3,0,0:0,0,11,0:60:34:0:46:37:37:0:.	0/1:0/1:5:1,2,1,1:2,0,3,0:27:34:27:46,55:37:37,37:2:33
75 | 16	3582808	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:6,4,0,0:0,0,10,0:57:34:0:59:37:37:0:.	0/1:0/1:5:0,3,2,0:2,0,3,0:27:34:27:60,60:37:37,37:2:33
76 | 16	14042032	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:6,1,0,0:0,7,0,0:48:37:0:60:37:37:0:.	0/1:0/1:4:0,2,2,0:0,2,0,2:30:37:30:57,60:37:37,37:2:30
77 | 16	23619204	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,0,7,0:48:42:0:60:37:37:0:.	0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,60:37:37,37:2:31
78 | 17	41256142	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:5,3,0,0:0,0,8,0:51:40:0:60:37:37:0:.	0/1:0/1:3:0,1,2,0:2,0,1,0:1:40:33:57,60:37:37,37:2:33
79 | 17	61784013	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:6,3,0,0:0,9,0,0:54:45:0:60:37:37:0:.	0/1:0/1:3:0,1,1,1:0,1,0,2:1:45:34:60,60:37:37,37:2:36
80 | 18	45423074	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:16:8,8,0,0:0,16,0,0:75:46:0:53:37:37:0:.	0/1:0/1:10:3,4,3,0:0,7,0,3:39:46:39:60,60:37:37,37:2:46
81 | 18	60985432	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:9,4,0,1:0,0,13,0:32:42:0:60:37:37:0:.	0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,57:37:37,37:2:41
82 | 19	39664512	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:8,6,0,0:0,14,0,0:69:42:0:60:37:37:0:.	0/1:0/1:4:0,2,1,1:0,2,0,2:30:42:35:60,60:37:37,37:2:42
83 | 19	49473085	.	G	C	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:4,4,0,0:0,0,8,0:51:37:0:60:37:37:0:.	0/1:0/1:4:1,1,2,0:0,2,2,0:30:37:30:48,56:37:37,37:2:32
84 | 20	34135210	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:2,6,0,0:0,8,0,0:51:36:0:59:37:37:0:.	0/1:0/1:6:2,2,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:32
85 | 20	35663882	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:2,5,0,0:0,0,7,0:48:37:0:56:37:37:0:.	0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:30
86 | X	70341572	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:11:6,5,0,0:0,11,0,0:60:33:0:56:37:37:0:.	0/1:0/1:7:3,2,1,1:0,5,0,2:26:33:26:58,60:37:37,37:2:33
87 | X	123164862	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,0,7,0:48:40:0:59:37:37:0:.	0/1:0/1:3:1,0,2,0:2,0,1,0:1:40:33:60,60:37:37,37:2:30
88 | 


--------------------------------------------------------------------------------
/test/data/regression/pass1.ref:
--------------------------------------------------------------------------------
 1 | ##fileformat=VCFv4.1
 2 | ##fileDate=20140121
 3 | ##phasing=none
 4 | ##reference=file:///data/GENOMES/human_GATK_GRCh37/GRCh37_gatk.fasta
 5 | ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 6 | ##FORMAT=<ID=IGT,Number=1,Type=String,Description="Genotype when called independently (only filled if called in joint prior mode)">
 7 | ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total read depth">
 8 | ##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="# high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases">
 9 | ##FORMAT=<ID=BCOUNT,Number=4,Type=Integer,Description="Occurrence count for each base at this site (A,C,G,T)">
10 | ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
11 | ##FORMAT=<ID=JGQ,Number=1,Type=Integer,Description="Joint genotype quality (only filled if called in join prior mode)">
12 | ##FORMAT=<ID=VAQ,Number=1,Type=Integer,Description="Variant allele quality">
13 | ##FORMAT=<ID=BQ,Number=.,Type=Integer,Description="Average base quality">
14 | ##FORMAT=<ID=MQ,Number=1,Type=Integer,Description="Average mapping quality across all reads">
15 | ##FORMAT=<ID=AMQ,Number=.,Type=Integer,Description="Average mapping quality for each allele present in the genotype">
16 | ##FORMAT=<ID=SS,Number=1,Type=Integer,Description="Variant status relative to non-adjacent Normal, 0=wildtype,1=germline,2=somatic,3=LOH,4=unknown">
17 | ##FORMAT=<ID=SSC,Number=1,Type=Integer,Description="Somatic Score">
18 | ##FILTER=<ID=PASS,Description="r.normal.dp>5 and r.tumor.dp>7">
19 | #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NORMAL	TUMOR
20 | 1	1636394	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:17:12,5,0,0:0,17,0,0:75:36:0:56:37:37:0:.	0/1:0/1:6:3,1,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:36
21 | 1	36217006	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:21:9,12,0,0:0,21,0,0:90:31:0:60:37:37:0:.	0/1:0/1:6:2,2,0,2:0,4,0,2:24:31:24:60,53:37:37,37:2:31
22 | 1	46527674	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:6,3,0,0:0,0,9,0:54:37:0:60:37:37:0:.	0/1:0/1:4:2,0,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:34
23 | 1	108417572	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,7,0,0:48:42:0:41:37:37:0:.	0/1:0/1:4:2,0,1,1:0,2,0,2:30:42:35:60,60:37:37,37:2:31
24 | 1	155170305	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:5,4,0,0:0,9,0,0:54:34:0:59:37:37:0:.	0/1:0/1:6:3,1,2,0:0,4,0,2:27:34:27:44,60:37:37,37:2:32
25 | 1	155449089	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:5,4,0,0:0,0,9,0:54:37:0:60:37:37:0:.	0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:34
26 | 1	169847826	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:6,3,0,0:0,0,9,0:54:37:0:60:37:37:0:.	0/1:0/1:4:2,0,2,0:2,0,2,0:30:37:30:60,60:37:37,37:2:34
27 | 1	203098164	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:3,7,0,0:0,10,0,0:57:40:0:53:37:37:0:.	0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:37
28 | 2	39213209	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:16:8,8,0,0:0,0,16,0:75:42:0:56:37:37:0:.	0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,59:37:37,37:2:42
29 | 2	86691250	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:6,2,0,0:0,0,8,0:51:42:0:59:37:37:0:.	0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,60:37:37,37:2:33
30 | 2	88874243	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:17:12,5,0,0:0,0,17,0:78:34:0:59:37:37:0:.	0/1:0/1:5:2,1,0,2:2,0,3,0:27:34:27:60,60:37:37,37:2:34
31 | 2	121728044	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:6,3,0,0:0,0,9,0:54:42:0:55:37:37:0:.	0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,57:37:37,37:2:36
32 | 2	170062591	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:4,5,0,0:0,0,9,0:54:39:0:60:37:37:0:.	0/1:0/1:5:2,1,1,1:2,0,3,0:32:39:32:60,60:37:37,37:2:35
33 | 2	216257844	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:2,5,0,0:0,0,7,0:48:40:0:60:37:37:0:.	0/1:0/1:3:1,0,0,2:2,0,1,0:1:40:33:60,60:37:37,37:2:30
34 | 2	222322623	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:15:6,9,0,0:0,15,0,0:72:33:0:60:37:37:0:.	0/1:0/1:7:4,1,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:33
35 | 3	25675413	.	A	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:6,2,0,0:8,0,0,0:51:32:0:59:37:37:0:.	0/1:0/1:4:1,1,0,2:2,0,0,2:6:32:25:34,37:37:37,37:2:30
36 | 3	36779638	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:13:8,5,0,0:0,13,0,0:66:36:0:60:37:37:0:.	0/1:0/1:6:1,3,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:36
37 | 3	123458847	.	T	C	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:4,6,0,0:0,0,0,10:54:39:0:41:37:37:0:.	0/1:0/1:5:1,2,1,1:0,2,0,3:32:39:32:55,45:37:37,37:2:35
38 | 3	124351308	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:7,1,0,0:0,0,8,0:51:42:0:58:37:37:0:.	0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,60:37:37,37:2:33
39 | 3	142171996	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:8,2,0,0:0,10,0,0:57:37:0:59:37:37:0:.	0/1:0/1:4:0,2,2,0:0,2,0,2:30:37:30:60,60:37:37,37:2:35
40 | 3	189526168	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:23:11,12,0,0:0,0,23,0:96:36:0:59:37:37:0:.	0/1:0/1:6:1,3,1,1:2,0,4,0:29:36:29:60,60:37:37,37:2:36
41 | 4	82058553	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:6,1,0,0:0,7,0,0:48:37:0:60:37:37:0:.	0/1:0/1:4:2,0,2,0:0,2,0,2:30:37:30:60,60:37:37,37:2:30
42 | 4	122769998	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:7,0,0,0:0,7,0,0:48:40:0:54:37:37:0:.	0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:30
43 | 5	13850856	.	G	C	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:3,5,0,0:0,0,8,0:51:40:0:51:37:37:0:.	0/1:0/1:3:0,1,2,0:0,2,1,0:1:40:33:49,60:37:37,37:2:33
44 | 5	132038609	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:12:10,2,0,0:0,12,0,0:63:33:0:58:37:37:0:.	0/1:0/1:7:3,2,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:33
45 | 5	137756599	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:13:7,6,0,0:0,13,0,0:66:31:0:60:37:37:0:.	0/1:0/1:6:2,2,2,0:0,4,0,2:24:31:24:60,60:37:37,37:2:31
46 | 5	141974902	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:9,5,0,0:0,14,0,0:69:40:0:60:37:37:0:.	0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,60:37:37,37:2:40
47 | 6	2749400	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:8,2,0,0:0,10,0,0:57:35:0:60:37:37:0:.	0/1:0/1:4:2,0,0,2:0,2,0,2:28:35:28:60,39:37:37,37:2:34
48 | 7	95217113	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,0,7,0:48:36:0:60:37:37:0:.	0/1:0/1:6:1,3,1,1:2,0,4,0:29:36:29:60,60:37:37,37:2:30
49 | 7	140434525	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:8,6,0,0:0,0,14,0:69:31:0:59:37:37:0:.	0/1:0/1:6:3,1,0,2:2,0,4,0:24:31:24:60,58:37:37,37:2:31
50 | 7	151856059	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:20:9,10,1,0:0,19,0,1:50:33:0:59:37:37:0:.	0/1:0/1:7:1,4,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:30
51 | 8	42958817	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:12:6,6,0,0:0,12,0,0:60:36:0:55:37:37:0:.	0/1:0/1:7:4,1,1,1:0,5,0,2:29:36:29:48,60:37:37,37:2:35
52 | 8	131070237	.	A	G	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:13:9,4,0,0:13,0,0,0:66:39:0:60:37:37:0:.	0/1:0/1:5:2,1,1,1:3,0,2,0:32:39:32:60,60:37:37,37:2:39
53 | 8	141711010	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:8,2,0,0:0,10,0,0:57:40:0:59:37:37:0:.	0/1:0/1:3:1,0,2,0:0,1,0,2:1:40:33:60,55:37:37,37:2:37
54 | 8	145059674	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:7,7,0,0:0,14,0,0:69:34:0:56:37:37:0:.	0/1:0/1:5:2,1,0,2:0,3,0,2:27:34:27:60,60:37:37,37:2:34
55 | 9	111651620	.	A	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:4,3,0,0:7,0,0,0:48:92:0:60:37:37:0:.	0/1:0/1:7:1,2,3,1:3,0,0,4:58:92:85:60,60:37:37,37:2:31
56 | 9	111685156	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:13:9,4,0,0:0,0,13,0:66:37:0:60:37:37:0:.	0/1:0/1:4:1,1,2,0:2,0,2,0:12:37:30:60,37:37:37,37:2:37
57 | 10	6525571	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:0,8,0,0:0,0,8,0:51:37:0:55:37:37:0:.	0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:32
58 | 10	97197246	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:7,3,0,0:0,10,0,0:57:39:0:58:37:37:0:.	0/1:0/1:5:3,0,1,1:0,3,0,2:32:39:32:60,53:37:37,37:2:36
59 | 11	58949455	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:12:8,4,0,0:0,0,12,0:60:33:0:54:37:37:0:.	0/1:0/1:7:3,2,1,1:2,0,5,0:26:33:26:60,36:37:37,37:2:33
60 | 11	65481082	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:17:12,5,0,0:0,0,17,0:78:45:0:58:37:37:0:.	0/1:0/1:3:0,1,1,1:2,0,1,0:1:45:34:60,60:37:37,37:2:45
61 | 11	94180424	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:23:17,6,0,0:0,0,23,0:96:34:0:56:37:37:0:.	0/1:0/1:5:2,1,2,0:2,0,3,0:27:34:27:60,59:37:37,37:2:34
62 | 11	121036021	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:12:3,9,0,0:0,12,0,0:63:31:0:59:37:37:0:.	0/1:0/1:6:4,0,0,2:0,4,0,2:24:31:24:59,60:37:37,37:2:31
63 | 12	994952	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:1,8,0,0:0,9,0,0:54:33:0:59:37:37:0:.	0/1:0/1:7:2,3,1,1:0,5,0,2:26:33:26:51,55:37:37,37:2:32
64 | 12	69233187	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:5,4,0,0:0,9,0,0:51:33:0:53:37:37:0:.	0/1:0/1:7:4,1,1,1:0,5,0,2:26:33:26:60,60:37:37,37:2:30
65 | 12	77436879	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:4,6,0,0:0,0,10,0:57:34:0:60:37:37:0:.	0/1:0/1:5:2,1,0,2:2,0,3,0:27:34:27:60,60:37:37,37:2:33
66 | 12	96641273	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:4,4,0,0:0,0,8,0:51:33:0:60:37:37:0:.	0/1:0/1:7:2,3,1,1:2,0,5,0:26:33:26:60,50:37:37,37:2:30
67 | 12	110813986	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:4,4,0,0:0,0,8,0:51:39:0:59:37:37:0:.	0/1:0/1:5:1,2,1,1:2,0,3,0:32:39:32:60,60:37:37,37:2:33
68 | 12	122825587	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:21:12,9,0,0:0,21,0,0:90:34:0:59:37:37:0:.	0/1:0/1:5:3,0,2,0:0,3,0,2:27:34:27:60,60:37:37,37:2:34
69 | 14	30135337	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,7,0,0:48:40:0:60:37:37:0:.	0/1:0/1:3:1,0,0,2:0,1,0,2:1:40:33:60,60:37:37,37:2:30
70 | 14	51398458	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:4,5,0,0:0,0,9,0:54:36:0:55:37:37:0:.	0/1:0/1:6:0,4,1,1:2,0,4,0:29:36:29:60,59:37:37,37:2:33
71 | 15	43170722	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:11:6,5,0,0:0,11,0,0:60:31:0:60:37:37:0:.	0/1:0/1:6:4,0,2,0:0,4,0,2:24:31:24:56,54:37:37,37:2:31
72 | 15	50862183	.	C	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:7,2,0,0:0,9,0,0:54:35:0:60:37:37:0:.	0/1:0/1:4:0,2,0,2:2,2,0,0:28:35:28:45,59:37:37,37:2:33
73 | 15	64332347	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:6,1,0,0:0,7,0,0:48:39:0:60:37:37:0:.	0/1:0/1:5:3,0,1,1:0,3,0,2:32:39:32:58,56:37:37,37:2:30
74 | 15	80845030	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:1,8,0,0:0,9,0,0:54:31:0:60:37:37:0:.	0/1:0/1:6:2,2,2,0:0,4,0,2:24:31:24:60,60:37:37,37:2:30
75 | 16	1812938	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:11:8,3,0,0:0,0,11,0:60:34:0:46:37:37:0:.	0/1:0/1:5:1,2,1,1:2,0,3,0:27:34:27:46,55:37:37,37:2:33
76 | 16	3582808	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:10:6,4,0,0:0,0,10,0:57:34:0:59:37:37:0:.	0/1:0/1:5:0,3,2,0:2,0,3,0:27:34:27:60,60:37:37,37:2:33
77 | 16	14042032	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:6,1,0,0:0,7,0,0:48:37:0:60:37:37:0:.	0/1:0/1:4:0,2,2,0:0,2,0,2:30:37:30:57,60:37:37,37:2:30
78 | 16	23619204	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,0,7,0:48:42:0:60:37:37:0:.	0/1:0/1:4:1,1,1,1:2,0,2,0:35:42:35:60,60:37:37,37:2:31
79 | 17	41256142	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:5,3,0,0:0,0,8,0:51:40:0:60:37:37:0:.	0/1:0/1:3:0,1,2,0:2,0,1,0:1:40:33:57,60:37:37,37:2:33
80 | 17	61784013	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:9:6,3,0,0:0,9,0,0:54:45:0:60:37:37:0:.	0/1:0/1:3:0,1,1,1:0,1,0,2:1:45:34:60,60:37:37,37:2:36
81 | 18	45423074	.	C	T	.	PASS	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:16:8,8,0,0:0,16,0,0:75:46:0:53:37:37:0:.	0/1:0/1:10:3,4,3,0:0,7,0,3:39:46:39:60,60:37:37,37:2:46
82 | 18	60985432	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:9,4,0,1:0,0,13,0:32:42:0:60:37:37:0:.	0/1:0/1:4:2,0,1,1:2,0,2,0:30:42:35:60,57:37:37,37:2:41
83 | 19	39664512	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:14:8,6,0,0:0,14,0,0:69:42:0:60:37:37:0:.	0/1:0/1:4:0,2,1,1:0,2,0,2:30:42:35:60,60:37:37,37:2:42
84 | 19	49473085	.	G	C	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:4,4,0,0:0,0,8,0:51:37:0:60:37:37:0:.	0/1:0/1:4:1,1,2,0:0,2,2,0:30:37:30:48,56:37:37,37:2:32
85 | 20	34135210	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:8:2,6,0,0:0,8,0,0:51:36:0:59:37:37:0:.	0/1:0/1:6:2,2,1,1:0,4,0,2:29:36:29:60,60:37:37,37:2:32
86 | 20	35663882	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:2,5,0,0:0,0,7,0:48:37:0:56:37:37:0:.	0/1:0/1:4:1,1,0,2:2,0,2,0:30:37:30:60,60:37:37,37:2:30
87 | X	70341572	.	C	T	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:11:6,5,0,0:0,11,0,0:60:33:0:56:37:37:0:.	0/1:0/1:7:3,2,1,1:0,5,0,2:26:33:26:58,60:37:37,37:2:33
88 | X	123164862	.	G	A	.	.	.	GT:IGT:DP:DP4:BCOUNT:GQ:JGQ:VAQ:BQ:MQ:AMQ:SS:SSC	0/0:0/0:7:5,2,0,0:0,0,7,0:48:40:0:59:37:37:0:.	0/1:0/1:3:1,0,2,0:2,0,1,0:1:40:33:60,60:37:37,37:2:30
89 | 


--------------------------------------------------------------------------------
/bin/bio-vcf:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env ruby
  2 | #
  3 | # bio-vcf parser and transformer
  4 | # Author:: Pjotr Prins
  5 | # License:: MIT
  6 | #
  7 | # Copyright (C) 2014-2021 Pjotr Prins <pjotr.prins@thebird.nl>
  8 | 
  9 | USAGE = "Vcf parser"
 10 | 
 11 | gempath = File.dirname(File.dirname(__FILE__))
 12 | $: << File.join(gempath,'lib')
 13 | 
 14 | VERSION_FILENAME=File.join(gempath,'VERSION')
 15 | version = File.new(VERSION_FILENAME).read.chomp
 16 | 
 17 | require 'bio-vcf'
 18 | require 'bio-vcf/pcows'
 19 | require 'optparse'
 20 | require 'timeout'
 21 | require 'fileutils'
 22 | require 'json'
 23 | 
 24 | # Uncomment when using the bio-logger
 25 | # require 'bio-logger'
 26 | # log = Bio::Log::LoggerPlus.new 'vcf'
 27 | # log.outputters = Bio::Log::Outputter.stderr
 28 | # Bio::Log::CLI.logger('stderr')
 29 | # Bio::Log::CLI.trace('info')
 30 | 
 31 | options = { show_help: false, source: 'https://github.com/vcflib/bio-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 }
 32 | opts = OptionParser.new do |o|
 33 |   o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g.  #{File.basename($0)} < test/data/input/somaticsniper.vcf"
 34 | 
 35 |   o.on('-i','--ignore-missing', 'Ignore missing data') do
 36 |     options[:ignore_missing] = true
 37 |   end
 38 |   o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd|
 39 |     options[:filter] = cmd
 40 |   end
 41 | 
 42 |   o.on('--sfilter cmd',String, 'Evaluate filter on each sample') do |cmd|
 43 |     options[:sfilter] = cmd
 44 |   end
 45 |   o.on("--sfilter-samples list", Array, "Filter on selected samples (e.g., 0,1") do |l|
 46 |     options[:sfilter_samples] = l
 47 |   end
 48 | 
 49 |   o.on('--ifilter cmd','--if cmd',String, 'Include filter') do |cmd|
 50 |     options[:ifilter] = cmd
 51 |   end
 52 |   o.on("--ifilter-samples list", Array, "Include set - implicitely defines exclude set") do |l|
 53 |     options[:ifilter_samples] = l
 54 |   end
 55 | 
 56 |   o.on('--efilter cmd','--ef cmd',String, 'Exclude filter') do |cmd|
 57 |     options[:efilter] = cmd
 58 |   end
 59 |   o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l|
 60 |     options[:efilter_samples] = l
 61 |   end
 62 |   o.on('--add-filter name',String, 'Set/add filter field to name') do |name|
 63 |     options[:add_filter] = name
 64 |   end
 65 | 
 66 |   o.on("--bed bedfile", String, "Filter on BED elements") do |bed|
 67 |     options[:bed] = bed
 68 |   end
 69 | 
 70 |   o.on('-e cmd', '--eval cmd',String, 'Evaluate command on each record') do |cmd|
 71 |     options[:eval] = cmd
 72 |   end
 73 |   o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd|
 74 |     options[:eval_once] = true
 75 |     options[:eval] = cmd
 76 |     # options[:num_threads] = 1
 77 |     # options[:thread_lines] = 1
 78 |     options[:skip_header] = true
 79 |   end
 80 |   o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd|
 81 |     options[:seval] = cmd
 82 |     options[:skip_header] = true
 83 |   end
 84 |   o.on("--rewrite eval", "Rewrite INFO") do |s|
 85 |     options[:rewrite] = s
 86 |   end
 87 |   o.on("--samples list", Array, "Output selected samples") do |l|
 88 |     options[:samples] = l
 89 |   end
 90 |   o.on("--json", "Try to coerce header into JSON (for records check out --template!)") do |b|
 91 |     options[:json] = true
 92 |     options[:skip_header] = true
 93 |   end
 94 |   o.on("--rdf", "Try to coerce header into Turtle RDF (requires RDF --template!)") do |b|
 95 |     require 'bio-vcf/vcfrdf'
 96 |     options[:rdf] = true
 97 |     options[:skip_header] = true
 98 |   end
 99 |   o.on("--num-threads [num]", Integer, "Multi-core version (default ALL)") do |i|
100 |     options[:num_threads] = i
101 |   end
102 |   o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i|
103 |     options[:thread_lines] = i
104 |   end
105 |   o.on_tail("--id name", String, "Identifier") do |s|
106 |     options[:id] = s
107 |   end
108 |   o.on_tail("--tags list", String, "Add tags") do |s|
109 |     options[:tags] = s
110 |   end
111 | 
112 |   o.on("--skip-header", "Do not output VCF header info") do
113 |     options[:skip_header] = true
114 |   end
115 | 
116 |   o.on("--set-header list", Array, "Set a special tab delimited output header (#samples expands to sample names)") do |list|
117 |     options[:set_header] = list
118 |     options[:skip_header] = true
119 |   end
120 | 
121 |   o.on("-t erb","--template erb",String, "Use ERB template for output") do |s|
122 |     require 'bio-vcf/vcfrdf'
123 |     require 'erb'
124 |     options[:template] = s
125 |     options[:skip_header] = true
126 |   end
127 | 
128 |   o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t|
129 |     options[:tag] = true
130 |   end
131 | 
132 |   o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i|
133 |     options[:timeout] = i
134 |   end
135 | 
136 |   # Uncomment the following when using the bio-logger
137 |   # o.separator ""
138 |   # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
139 |   #   Bio::Log::CLI.logger(name)
140 |   # end
141 |   #
142 |   # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
143 |   #   Bio::Log::CLI.trace(s)
144 |   # end
145 |   #
146 |   o.on("--names", "Output sample names") do |q|
147 |     options[:quiet] = true
148 |     options[:num_threads] = nil
149 |     options[:eval_once] = true
150 |     options[:eval] = 'header.samples.join("\t")'
151 |     # options[:num_threads] = 1
152 |     # options[:thread_lines] = 1
153 |     options[:skip_header] = true
154 |   end
155 |   o.on("--statistics", "Output statistics") do |q|
156 |     options[:statistics] = true
157 |     options[:num_threads] = nil
158 |   end
159 |   o.on("-q", "--quiet", "Run quietly") do |q|
160 |     # Bio::Log::CLI.trace('error')
161 |     options[:quiet] = true
162 |   end
163 | 
164 |   o.on("-v", "--verbose", "Run verbosely") do |v|
165 |     options[:verbose] = true
166 |   end
167 | 
168 |   o.on("--debug", "Show debug messages and keep intermediate output") do |v|
169 |     # Bio::Log::CLI.trace('debug')
170 |     options[:debug] = true
171 |   end
172 | 
173 |   o.separator ""
174 |   o.on_tail('-h', '--help', 'display this help and exit') do
175 |     options[:show_help] = true
176 |   end
177 | end
178 | 
# Parse the command line into the options hash
opts.parse!(ARGV)

BIOVCF_VERSION=version
BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015-2021\n"
$stderr.print BIOVCF_BANNER if !options[:quiet]

if options[:show_help]
  print opts
  print USAGE
  exit 1
end

# Anchor on the dot so that only 1.x matches; a bare /^1/ would also
# match 10.x, 11.x, ...
if RUBY_VERSION =~ /^1\./
  $stderr.print "WARNING: bio-vcf does not run on Ruby 1.x\n"
end

$stderr.print "Options: ",options,"\n" if !options[:quiet]

# Load the output template, if one was requested
if options[:template]
  include BioVcf::RDF
  require 'bio-vcf/template'
  fn = options[:template]
  raise "No template #{fn}!" if not File.exist?(fn)
  # template = ERB.new(File.read(fn))
  template = Bio::Template.new(fn,options[:json])
end

# Statistics are collected in a single collector object
stats = nil
if options[:statistics]
  options[:num_threads] = nil
  stats = BioVcf::VcfStatistics.new
end

# Check for option combinations: a sample list needs its matching filter
raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter]
raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter]
raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter]
# raise "Soft filter not supported with --ifilter" if options[:add_filter] and options[:ifilter]
# raise "Soft filter not supported with --efilter" if options[:add_filter] and options[:efilter]

# --samples arrives as a list of strings; convert to integer indices
if options[:samples]
  samples = options[:samples].map { |s| s.to_i }
end

include BioVcf
224 | 
# Read the header section of a VCF file, consuming lines from STDIN.
#
# line::    the first header line (already read by the caller)
# samples:: optional list of integer sample indices to keep in the
#           column names line, or nil for all
# options:: command line options hash
#
# Unless options[:skip_header] is set, the header is echoed to stdout as
# it is read. Returns [header, line] where line is the first non-header
# line that was read, or nil when no record followed the header.
def parse_header line, samples, options
  vcf_header = VcfHeader.new(options[:debug])
  vcf_header.add(line)
  echo = !options[:skip_header]
  print line if echo
  STDIN.each_line do | hline |
    unless hline =~ /^#/
      # First record reached: end of header.
      # If there are no records in the VCF we never get here.
      line = hline
      break
    end
    vcf_header.add(hline)
    next unless echo
    unless hline =~ /^#CHR/
      # Ordinary meta line: pass through unchanged
      print hline
      next
    end
    # This is the column names line (#CHROM POS ID REF ALT QUAL...). It
    # is the last line before the data and carries the sample names, so
    # first inject the BioVcf meta information
    print vcf_header.tag(options),"\n" if options[:tag]
    # Then the additional filter, e.g.
    # ##FILTER=<ID=LowQual,Description="Low quality">
    soft_filter = options[:add_filter]
    print "##FILTER=<ID=",soft_filter,",Description=\"",options[:filter],"\">\n" if soft_filter

    # Finally the column names, limited to the selected samples if any
    # (the first nine columns are fixed, samples follow)
    columns = vcf_header.column_names
    columns = columns[0..8] + samples.map { |s| columns[s+9] } if samples
    print "#",columns.join("\t"),"\n"
  end
  print vcf_header.printable_header_line(options[:set_header]),"\n" if options[:set_header]
  VcfRdf::header if options[:rdf]
  # Still holding a header line means we did not read a record
  line = nil if line =~ /^#/
  return vcf_header,line
end
275 | 
# Parse a VCF line and return the (template) result as a string buffer
# This is the main work horse that parses through every VCF record.
#
# line::      raw VCF record line
# header::    parsed header object (column_names is read from it)
# options::   command line options hash
# bedfilter:: optional BED filter (contains(rec) is called on it), or nil
# samples::   optional list of integer sample indices to output, or nil
# template::  optional template object (body(binding) is called), or nil
# stats::     optional statistics collector (add(rec) is called), or nil
#
# Returns the output for this record (a string ending in a newline for
# the eval, rewrite and default cases), or nil when the record is
# filtered out or there is nothing to output. When an --eval/--seval
# expression raises, the error is reported on stderr and the process
# exits with status 1 (the exception is re-raised with --verbose).
#
# NOTE: local variable names in this method are visible to templates and
# to --rewrite code through `binding`/`eval`, so do not rename them
# lightly.
def parse_line line,header,options,bedfilter,samples,template,stats=nil
  fields = VcfLine.parse(line)
  rec = VcfRecord.new(fields,header)
  r = rec # alias

  filter = options[:filter]
  sfilter = options[:sfilter]
  efilter = options[:efilter]
  ifilter = options[:ifilter]
  add_filter = options[:add_filter] # contains a filter name (soft filter)
  seval = options[:seval]
  ignore_missing = options[:ignore_missing]
  quiet = options[:quiet]
  set_filter_field = nil

  if sfilter or efilter or ifilter or seval
    # check for samples; the slice is nil when there are fewer than nine
    # columns and an empty array when there are exactly nine. An empty
    # array is truthy in Ruby, so both cases are tested explicitly.
    header_samples = header.column_names[9..-1]
    raise "Empty sample list, can not execute query!" if header_samples.nil? or header_samples.empty?
  end

  # --------------------------
  # Filtering and set analysis
  if bedfilter
    bed = bedfilter.contains(rec)
    return if not bed
  end

  # Decide whether a record should be skipped given the outcome of a
  # filter block. With --add-filter (soft filter) records are never
  # skipped; a match is only remembered so the FILTER field can be set.
  skip = lambda { |&m|
    matched = m.call
    if add_filter
      set_filter_field = true if matched
      false  # always continue processing with an add-filter
    else
      not matched
    end
  }

  if filter
    return if skip.call { rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) }
  end

  if sfilter # sample 'or' filter
    rec.each_sample(options[:sfilter_samples]) do | sample |
      return if skip.call { sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) }
    end
  end

  if ifilter # include sample filter
    found = false
    rec.each_sample(options[:ifilter_samples]) do | sample |
      if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet)
        found = true
        break
      end
    end
    # Skip if there are no matches
    return if skip.call {found}
  end

  if efilter # exclude sample filter
    rec.each_sample(options[:efilter_samples]) do | sample |
      return if skip.call{ sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) }
    end
  end

  stats.add(rec) if stats

  # -----------------------------
  # From here on decide on output

  rec.add_to_filter_field(add_filter) if set_filter_field

  if samples
    # Select certain samples for output
    newfields = fields[0..8]
    samples.each do |s|
      newfields << fields[s+9]
    end
    fields = newfields
  end
  if options[:eval] or seval
    # Track which expression is being evaluated so that a failure is
    # reported against the right option
    failed_option = '--eval'
    failed_expr = options[:eval]
    begin
      results = nil # result string
      if options[:eval]
        res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet)
        results = res if res
      end
      if seval
        failed_option = '--seval'
        failed_expr = seval
        # Without an --eval result the line starts with chromosome and position
        list = (results ? [] : [rec.chr,rec.pos])
        # NOTE(review): the sample selection here reuses --sfilter-samples;
        # there is no separate option for --seval
        rec.each_sample(options[:sfilter_samples]) { | sample |
          list << sample.eval(seval,ignore_missing_data: ignore_missing,quiet: quiet)
        }
        results = (results ? results.to_s + "\t" : "" ) + list.join("\t")
      end
    rescue => e
      $stderr.print "\nLine: ",line
      $stderr.print "ERROR evaluating #{failed_option} <#{failed_expr}> #{e.message}\n"
      raise if options[:verbose]
      exit 1
    end
    if results
      str = if options[:json]
              results.to_json
            else
              results.to_s
            end
      return str+"\n"
    end
  else
    if options[:rdf]
      # Output Turtle RDF
      VcfRdf::record(options[:id],rec,options[:tags])
    elsif options[:template]
      # Use ERB template
      begin
        template.body(binding)
      rescue Exception => e
        $stderr.print e,": ",fields,"\n"
        $stderr.print e.backtrace.inspect if options[:verbose]
        raise
      end
    elsif options[:rewrite]
      # Default behaviour prints VCF line, but rewrite info
      # NOTE: this evaluates Ruby code given on the command line with
      # full access to this method's locals; never pass untrusted input
      eval(options[:rewrite])
      (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n"
    elsif stats
      # do nothing
    else
      # Default behaviour prints VCF line
      fields.join("\t")+"\n"
    end
  end
end
412 | 
# Number of record lines handed to one worker
CHUNK_SIZE = options[:thread_lines]

pcows = PCOWS.new(options[:num_threads],CHUNK_SIZE,'bio-vcf',options[:timeout],
                  options[:quiet],options[:debug])
header = nil
header_output_completed = false
chunk_lines = []
line_number=0

if options[:bed]
  bedfilter = BedFilter.new(options[:bed])
end

begin
  # Define linear parser function (going through one chunk)
  process = lambda { | lines |
    res = []
    lines.each do | line |
      res << parse_line(line,header,options,bedfilter,samples,template,stats)
    end
    res
  }

  # ---- Main loop
  STDIN.each_line do | line |
    line_number += 1

    # ---- Skip embedded headers down the line...
    next if header_output_completed and line =~ /^#/

    # ---- In the following section header information is handled -
    #      this only happens once.

    # ---- Parse the header lines (chomps from STDIN)
    #      and returns header info and the current line
    if line =~ /^#/
      header, line = parse_header(line,samples,options)
      if line.nil?
        # No line after header, so there are no records to process
        break
      end
    end
    # p [line_number,line]
    # ---- After the header continue processing
    if not header_output_completed
      # one-time post-header processing
      if not options[:efilter_samples] and options[:ifilter_samples]
        # Create exclude set as a complement of include set: all sample
        # indices (as strings) minus the included ones
        options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples]
      end
      print template.header(binding) if template
      header_output_completed = true
    end

    if options[:eval_once]
      # this happens if we only want one line evaluated - say to get
      # the number of samples, or parse an item in the header
      print parse_line(line,header,options,bedfilter,samples,template,stats)
      exit 0
    end

    # ---- Lines are collected in one buffer and the lines buffer
    #      is added to the chunks list (for the threads)
    chunk_lines << line

    # ---- In the following section the VCF lines are parsed by chunks
    #      The chunks may go into different threads

    if chunk_lines.size >= CHUNK_SIZE
      # ---- process one chunk
      $stderr.print '.' if not options[:quiet]
      pcows.wait_for_worker_slot()
      pcows.submit_worker(process,chunk_lines)
      pcows.process_output()

      chunk_lines = []
    end
  end
  pcows.submit_final_worker(process,chunk_lines)
  pcows.wait_for_workers()
  pcows.process_remaining_output()

  print template.footer(binding) if template
  stats.print if stats

rescue SystemExit
  # A deliberate exit (such as `exit 0` after --eval-once, or `exit 1`
  # from parse_line): clean up and keep the requested exit status.
  # Previously this fell into the generic handler below, which turned a
  # successful `exit 0` into exit status 1.
  pcows.cleanup()
  raise
rescue Exception => e
  $stderr.print "ERROR: "
  $stderr.print e.message,"\n"
  pcows.cleanup()
  raise if options[:verbose]
  exit 1
end
507 | 


--------------------------------------------------------------------------------