├── .gitignore ├── .gitmodules ├── .travis.yml ├── 1KG_Index ├── 1KG_IDs.txt ├── 1kG_ethnic_index.py └── README ├── LICENSE ├── Makefile ├── README.md ├── bin ├── plotBfst.R ├── plotHapLrt.R ├── plotHaplotypes.R ├── plotPfst.R ├── plotSmoothed.R ├── plotWCfst.R ├── plotXPEHH.R ├── plot_roc.r ├── vcf2sqlite.py ├── vcf_strip_extra_headers ├── vcfbiallelic ├── vcfclearid ├── vcfclearinfo ├── vcfcomplex ├── vcfgtcompare.sh ├── vcfindelproximity ├── vcfindels ├── vcfmultiallelic ├── vcfmultiway ├── vcfmultiwayscripts ├── vcfnobiallelicsnps ├── vcfnoindels ├── vcfnosnps ├── vcfnulldotslashdot ├── vcfplotaltdiscrepancy.r ├── vcfplotaltdiscrepancy.sh ├── vcfplotsitediscrepancy.r ├── vcfplottstv.sh ├── vcfprintaltdiscrepancy.r ├── vcfprintaltdiscrepancy.sh ├── vcfqualfilter ├── vcfregionreduce ├── vcfregionreduce_and_cut ├── vcfregionreduce_pipe ├── vcfregionreduce_uncompressed ├── vcfremovenonATGC ├── vcfsnps ├── vcfsort └── vcfvarstats ├── examples ├── 612.bFst_SatApr12_12_29_17.png ├── 612.counts_FriApr11_13_16_42.png ├── 612.nocounts.smoothed_TueApr15_13_25_53.png ├── 612.nocounts_FriApr11_13_16_38.png ├── 612.wcfst.txt_FriApr11_13_17_02.png ├── headCrest.haps.txt_ThuMay29_15_08_53.pdf ├── headCrest.haps.txt_ThuMay29_15_08_53.png ├── phasing-diff-res.png ├── phasing-results.png ├── scaffold612.d-stat.10kb.txt_abba-baba_WedOct29_10_17_10.png ├── t.hapass.txt_TueJun24_11_52_08.png ├── xp-phased.txt ├── xp-phased.txt_TueApr29_13_20_47.png ├── xp-unphased.txt ├── xp-unphased.txt_TueApr29_13_20_58.png └── xpehh_WedApr23_11_30_10.png ├── logos ├── websiteLogo.pdf └── websiteLogo.png ├── samples ├── sample.vcf ├── scaffold612.phased.vcf └── scaffold612.vcf └── src ├── BedReader.h ├── LD.cpp ├── Variant.cpp ├── Variant.h ├── abba-baba.cpp ├── bFst.cpp ├── cdflib.cpp ├── cdflib.hpp ├── convert.h ├── dumpContigsFromHeader.cpp ├── genotypeSummary.cpp ├── gl-XPEHH.cpp ├── gpatInfo.hpp ├── hapLrt.cpp ├── iHS.cpp ├── join.h ├── mt19937ar.h ├── pFst.cpp ├── pdflib.cpp ├── pdflib.hpp 
├── permuteGPAT++.cpp ├── permuteGPATsmoother.cpp ├── plotHaps.cpp ├── popStats.cpp ├── rnglib.cpp ├── rnglib.hpp ├── segmentFst.cpp ├── sequenceDiversity.cpp ├── smoother.cpp ├── split.cpp ├── split.h ├── ssw.c ├── ssw.h ├── ssw_cpp.cpp ├── ssw_cpp.h ├── var.cpp ├── var.hpp ├── vcf2fasta.cpp ├── vcf2tsv.cpp ├── vcfaddinfo.cpp ├── vcfafpath.cpp ├── vcfallelicprimitives.cpp ├── vcfaltcount.cpp ├── vcfannotate.cpp ├── vcfannotategenotypes.cpp ├── vcfbreakmulti.cpp ├── vcfcat.cpp ├── vcfcheck.cpp ├── vcfclassify.cpp ├── vcfcleancomplex.cpp ├── vcfcombine.cpp ├── vcfcommonsamples.cpp ├── vcfcountalleles.cpp ├── vcfcreatemulti.cpp ├── vcfdistance.cpp ├── vcfecho.cpp ├── vcfentropy.cpp ├── vcfevenregions.cpp ├── vcffilter.cpp ├── vcffixup.cpp ├── vcfflatten.cpp ├── vcfgeno2alleles.cpp ├── vcfgeno2haplo.cpp ├── vcfgenosamplenames.cpp ├── vcfgenosummarize.cpp ├── vcfgenotypecompare.cpp ├── vcfgenotypes.cpp ├── vcfglbound.cpp ├── vcfglxgt.cpp ├── vcfhetcount.cpp ├── vcfhethomratio.cpp ├── vcfindex.cpp ├── vcfintersect.cpp ├── vcfkeepgeno.cpp ├── vcfkeepinfo.cpp ├── vcfkeepsamples.cpp ├── vcfleftalign.cpp ├── vcflength.cpp ├── vcfnumalt.cpp ├── vcfoverlay.cpp ├── vcfparsealts.cpp ├── vcfprimers.cpp ├── vcfqual2info.cpp ├── vcfrandom.cpp ├── vcfrandomsample.cpp ├── vcfremap.cpp ├── vcfremoveaberrantgenotypes.cpp ├── vcfremovesamples.cpp ├── vcfroc.cpp ├── vcfsample2info.cpp ├── vcfsamplediff.cpp ├── vcfsamplenames.cpp ├── vcfsamplestats.cpp ├── vcfsitesummarize.cpp ├── vcfsom.cpp ├── vcfstats.cpp ├── vcfstreamsort.cpp ├── vcfuniq.cpp ├── vcfuniqalleles.cpp ├── wcFst.cpp └── xpEHH.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .Rhistory 3 | .*swp 4 | .nfs* 5 | *.o 6 | BedReader.cpp 7 | Fasta.cpp 8 | Fasta.h 9 | Makefile.bad 10 | Multinomial.cpp 11 | Multinomial.h 12 | Pilot1 13 | Pilot2 14 | VCF.h 15 | VariantFilter.h 16 | b.vcf 17 | bugs/ 18 | callgrind.out.7143 19 | f 20 | 
freebayes.chr20.integrated.nogeno.20101123.vcf 21 | glorder 22 | glorder.cpp 23 | glorder.py 24 | glorder.pyc 25 | gmon.out 26 | multimaptest.cpp 27 | pooled.sqlite 28 | pooled.sqlite3 29 | shunt 30 | shunt.c 31 | t.bed 32 | test.db 33 | test.vcf 34 | test.vcf.gz 35 | test.vcf.gz.tbi 36 | test/ 37 | vcf2tsv 38 | vcfaddinfo 39 | vcfaddtag.cpp 40 | vcfafpath 41 | vcfallelicprimitives 42 | vcfaltcount 43 | vcfannotate 44 | vcfannotategenotypes 45 | vcfbreakmulti 46 | vcfcheck 47 | vcfclassify 48 | vcfcleancomplex 49 | vcfcommonsamples 50 | vcfcountalleles 51 | vcfcreatemulti 52 | vcfdistance 53 | vcfecho 54 | vcfentropy 55 | vcffilter 56 | vcffixup 57 | vcffixup.cpp.bak 58 | vcfflatten 59 | vcfgeno2haplo 60 | vcfgenotypecompare 61 | vcfgenotypes 62 | vcfglxgt 63 | vcfhaplotyecompare.cpp 64 | vcfhetcount 65 | vcfhethomratio 66 | vcfintersect 67 | vcfkeepfields 68 | vcfkeepgeno 69 | vcfkeepinfo 70 | vcfkeepsamples 71 | vcflength 72 | vcfmultiwaywwwindexfilter 73 | vcfnogeno 74 | vcfnogeno.cpp 75 | vcfnumalt 76 | vcfoverlay 77 | vcfparallel 78 | vcfparsealts 79 | vcfphylo.cpp 80 | vcfplotaltdiscrepancy.r.loess 81 | vcfplottstv.r 82 | vcfprimers 83 | vcfrandom 84 | vcfrandomsample 85 | vcfremap 86 | vcfremoveaberrantgenotypes 87 | vcfremovesamples 88 | vcfroc 89 | vcfsamplediff 90 | vcfsamplenames 91 | vcfsitesummarize 92 | vcfsom 93 | vcfsplit.cpp 94 | vcfstats 95 | vcfstreamsort 96 | vcfuniqalleles 97 | #vcfcountalleles.cpp# 98 | .vcfplotaltdiscrepancy.r.swo 99 | .vcfstats.cpp.swn 100 | .vcfstats.cpp.swo 101 | a.out 102 | vcfuniq 103 | vcfcat 104 | vcfevenregions 105 | vcfgenosummarize 106 | vcfgenosamplenames 107 | vcf2fasta 108 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tabixpp"] 2 | path = tabixpp 3 | url = https://github.com/ekg/tabixpp.git 4 | [submodule "smithwaterman"] 5 | path = smithwaterman 6 | url = 
"""Print the sample-column indices of 1000 Genomes individuals, by population.

Usage: 1kG_ethnic_index.py VCF Population

The population of every individual is looked up in "1KG_IDs.txt", a
tab-separated table (population code, individual ID) that must sit next to
this script.  Indices are counted from the first VCF header column whose
name contains "NA" or "HG".

The print calls take a single argument so the script runs unchanged on
Python 2 and Python 3.
"""
import argparse, csv, os, sys

ID_TABLE = "1KG_IDs.txt"


def load_population_ids(path):
    """Return {individual ID: population code} read from the ID table.

    Each row holds the population code in column 0 and the individual ID in
    column 1.  Rows with fewer than two columns are ignored.  Raises IOError
    (OSError on Python 3) when the table cannot be opened.
    """
    popdict = {}
    with open(path) as table:
        for row in csv.reader(table, delimiter='\t'):
            if len(row) >= 2:
                popdict[row[1]] = row[0]
    return popdict


def find_sample_columns(vcf_path):
    """Return the VCF header columns from the first 1000 Genomes sample on.

    Returns None when the file has no "#CHROM" header line and an empty list
    when the header names no column containing "NA" or "HG".
    """
    with open(vcf_path) as vcf:
        for raw in vcf:
            if raw.strip() and not raw.startswith('#'):
                break  # first record reached: the header is over
            fields = raw.rstrip('\r\n').split('\t')
            if fields[0] != '#CHROM':
                continue
            for j, name in enumerate(fields):
                if 'NA' in name or 'HG' in name:
                    return fields[j:]
            return []
    return None


def index_by_population(samples, popdict):
    """Group the positions of `samples` by population.

    Returns (index, unknown): `index` maps population code -> list of
    positions within `samples`; `unknown` lists the sample names absent from
    `popdict`.  Unknown samples still occupy their position, they are simply
    not assigned to any population (the original code filed them under the
    previous sample's population, or crashed on the first sample).
    """
    index = {}
    unknown = []
    for i, sample in enumerate(samples):
        if sample not in popdict:
            unknown.append(sample)
            continue
        index.setdefault(popdict[sample], []).append(i)
    return index, unknown


def main():
    parser = argparse.ArgumentParser(
        description="Determines the index of individuals of a given "
                    "ethnicity within a 1000 Genomes VCF")
    parser.add_argument("VCF", type=str,
                        help="VCF of 1000 Genomes individuals")
    parser.add_argument("Population", type=str,
                        help="1KG identifier for population to be found in "
                             "the index, enter \"ALL\" to print index for "
                             "all populations in the VCF")
    arg = parser.parse_args()

    # The ID table is looked up next to the script, not in the working dir.
    here = os.path.dirname(os.path.abspath(sys.argv[0]))
    try:
        popdict = load_population_ids(os.path.join(here, ID_TABLE))
    except IOError:
        sys.exit("Missing file \"1KG_IDs.txt\" containing the IDs for 1000 Genomes")

    samples = find_sample_columns(arg.VCF)
    if samples is None:
        sys.exit("No #CHROM header line found in VCF")

    indict, unknown = index_by_population(samples, popdict)
    if unknown:
        # Diagnostics go to stderr so they never mix with the index output.
        sys.stderr.write("Non 1000 Genomes individuals found in VCF: %s\n"
                         % ",".join(unknown))

    # "All" (the spelling the original code tested) and "ALL" (the spelling
    # the help text advertised) are both accepted.
    if arg.Population.lower() == "all":
        for pop in sorted(indict):
            print("%s = %s" % (pop, ",".join(str(i) for i in indict[pop])))
    elif arg.Population in indict:
        print(",".join(str(i) for i in indict[arg.Population]))
    else:
        sys.exit("Population not in VCF or invalid population ID")


if __name__ == "__main__":
    main()
# Build file for vcflib.
#
#   make                  build the support objects and every tool in bin/
#   make <tool>           build a single tool, e.g. `make vcffilter`
#   make openmp           full build with OpenMP enabled
#   make profiling|gprof  full build with -g / -pg
#   make clean            remove everything this Makefile (and the sub-builds) produced

#OBJ_DIR = ./
HEADERS = src/Variant.h \
	src/split.h \
	src/pdflib.hpp \
	src/var.hpp \
	src/cdflib.hpp \
	src/rnglib.hpp \
	src/join.h
SOURCES = src/Variant.cpp \
	src/rnglib.cpp \
	src/var.cpp \
	src/pdflib.cpp \
	src/cdflib.cpp \
	src/split.cpp
OBJECTS= $(SOURCES:.cpp=.o)

# TODO
#vcfstats.cpp

# One executable is produced per entry.  The list ends without a trailing
# backslash so that whatever follows can never be swallowed into it.
BIN_SOURCES = src/vcfecho.cpp \
	src/dumpContigsFromHeader.cpp \
	src/bFst.cpp \
	src/hapLrt.cpp \
	src/popStats.cpp \
	src/wcFst.cpp \
	src/segmentFst.cpp \
	src/genotypeSummary.cpp \
	src/sequenceDiversity.cpp \
	src/pFst.cpp \
	src/smoother.cpp \
	src/LD.cpp \
	src/plotHaps.cpp \
	src/abba-baba.cpp \
	src/permuteGPAT++.cpp \
	src/vcfaltcount.cpp \
	src/vcfhetcount.cpp \
	src/vcfhethomratio.cpp \
	src/vcffilter.cpp \
	src/vcf2tsv.cpp \
	src/vcfgenotypes.cpp \
	src/vcfannotategenotypes.cpp \
	src/vcfcommonsamples.cpp \
	src/vcfremovesamples.cpp \
	src/vcfkeepsamples.cpp \
	src/vcfsamplenames.cpp \
	src/vcfgenotypecompare.cpp \
	src/vcffixup.cpp \
	src/vcfclassify.cpp \
	src/vcfsamplediff.cpp \
	src/vcfremoveaberrantgenotypes.cpp \
	src/vcfrandom.cpp \
	src/vcfparsealts.cpp \
	src/vcfstats.cpp \
	src/vcfflatten.cpp \
	src/vcfprimers.cpp \
	src/vcfnumalt.cpp \
	src/vcfcleancomplex.cpp \
	src/vcfintersect.cpp \
	src/vcfannotate.cpp \
	src/vcfallelicprimitives.cpp \
	src/vcfoverlay.cpp \
	src/vcfaddinfo.cpp \
	src/vcfkeepinfo.cpp \
	src/vcfkeepgeno.cpp \
	src/vcfafpath.cpp \
	src/vcfcountalleles.cpp \
	src/vcflength.cpp \
	src/vcfdistance.cpp \
	src/vcfrandomsample.cpp \
	src/vcfentropy.cpp \
	src/vcfglxgt.cpp \
	src/vcfroc.cpp \
	src/vcfsom.cpp \
	src/vcfcheck.cpp \
	src/vcfstreamsort.cpp \
	src/vcfuniq.cpp \
	src/vcfuniqalleles.cpp \
	src/vcfremap.cpp \
	src/vcf2fasta.cpp \
	src/vcfsitesummarize.cpp \
	src/vcfbreakmulti.cpp \
	src/vcfcreatemulti.cpp \
	src/vcfevenregions.cpp \
	src/vcfcat.cpp \
	src/vcfgenosummarize.cpp \
	src/vcfgenosamplenames.cpp \
	src/vcfgeno2haplo.cpp \
	src/vcfleftalign.cpp \
	src/vcfcombine.cpp \
	src/vcfgeno2alleles.cpp \
	src/vcfindex.cpp \
	src/vcfsample2info.cpp \
	src/vcfqual2info.cpp \
	src/vcfglbound.cpp

#BINS = $(BIN_SOURCES:.cpp=)
# bin/<tool> for every source above; SHORTBINS are the bare tool names.
BINS = $(addprefix bin/,$(notdir $(BIN_SOURCES:.cpp=)))
SHORTBINS = $(notdir $(BIN_SOURCES:.cpp=))

# Objects produced by the bundled submodules.
TABIX = tabixpp/tabix.o

FASTAHACK = fastahack/Fasta.o

SMITHWATERMAN = smithwaterman/SmithWatermanGotoh.o

REPEATS = smithwaterman/Repeats.o

INDELALLELE = smithwaterman/IndelAllele.o

DISORDER = smithwaterman/disorder.c

LEFTALIGN = smithwaterman/LeftAlign.o

FSOM = fsom/fsom.o

FILEVERCMP = filevercmp/filevercmp.o

INCLUDES = -I. -L. -Ltabixpp/ -ltabix -lz -lm

all: $(OBJECTS) $(BINS)


# Baked into every binary through -DVERSION.
GIT_VERSION := $(shell git describe --abbrev=4 --dirty --always)

CXX = g++
CXXFLAGS = -O3 -D_FILE_OFFSET_BITS=64 -std=c++0x -DVERSION=\"$(GIT_VERSION)\"
#CXXFLAGS = -O2
#CXXFLAGS = -pedantic -Wall -Wshadow -Wpointer-arith -Wcast-qual

SSW = src/ssw.o src/ssw_cpp.o

# The SSW objects are built in src/ by the implicit rules, so the header
# dependencies must name the src/ paths to have any effect.
src/ssw.o: src/ssw.h
src/ssw_cpp.o: src/ssw_cpp.h

openmp:
	$(MAKE) CXXFLAGS="$(CXXFLAGS) -fopenmp -D HAS_OPENMP" all

profiling:
	$(MAKE) CXXFLAGS="$(CXXFLAGS) -g" all

gprof:
	$(MAKE) CXXFLAGS="$(CXXFLAGS) -pg" all

$(OBJECTS): $(SOURCES) $(HEADERS) $(TABIX)
	$(CXX) -c -o $@ src/$(*F).cpp $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)

# Submodule builds.  $(MAKE) (rather than a bare `make`) passes -j, -n and
# command-line variables down to the sub-makes.
$(TABIX):
	cd tabixpp && $(MAKE)

$(SMITHWATERMAN):
	cd smithwaterman && $(MAKE)

$(DISORDER): $(SMITHWATERMAN)

$(REPEATS): $(SMITHWATERMAN)

$(LEFTALIGN): $(SMITHWATERMAN)

$(INDELALLELE): $(SMITHWATERMAN)

$(FASTAHACK):
	cd fastahack && $(MAKE)

$(FSOM):
	cd fsom && $(CXX) $(CXXFLAGS) -c fsom.c -lm

$(FILEVERCMP):
	cd filevercmp && $(MAKE)

# `make <tool>` is shorthand for `make bin/<tool>`.
$(SHORTBINS):
	$(MAKE) bin/$@

$(BINS): $(BIN_SOURCES) $(OBJECTS) $(SMITHWATERMAN) $(FASTAHACK) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FSOM) $(FILEVERCMP)
	$(CXX) $(OBJECTS) $(SMITHWATERMAN) $(REPEATS) $(DISORDER) $(LEFTALIGN) $(INDELALLELE) $(SSW) $(FASTAHACK) $(FSOM) $(FILEVERCMP) tabixpp/tabix.o tabixpp/bgzf.o src/$(notdir $@).cpp -o $@ $(INCLUDES) $(LDFLAGS) $(CXXFLAGS)


pull:
	git pull

update: pull all

clean:
	rm -f $(BINS) $(OBJECTS)
	rm -f $(SSW) $(FSOM) ssw_cpp.o ssw.o
	cd tabixpp && $(MAKE) clean
	cd smithwaterman && $(MAKE) clean
	cd fastahack && $(MAKE) clean

# None of these targets name a file they create.  Declaring the short tool
# names phony also keeps a stale binary in the top-level directory from
# making `make <tool>` a no-op.
.PHONY: all clean pull update openmp profiling gprof $(SHORTBINS)
#usage: nohup R --vanilla < plotBfst.R --args bFst.txt
#
# Reads a whitespace-delimited, header-less table and plots column V9 against
# column V2 (divided by 1000 and labelled "KB position"), with a vertical
# segment from V10 to V11 at every point.  The PNG is written next to the
# input as <input>_<timestamp>.png.

cmd_args <- commandArgs(trailingOnly = TRUE)

# x: character vector of command-line arguments; x[1] is the input table.
plotPfst <- function(x) {
  if (length(x) < 1) {
    stop("usage: R --vanilla < plotBfst.R --args bFst.txt")
  }
  # library() stops immediately when ggplot2 is missing; require() only
  # returned FALSE and the failure surfaced later as "could not find
  # function ggplot".
  library("ggplot2")

  infile <- x[1]
  dat <- read.table(infile, header = FALSE)
  dat$V2 <- dat$V2 / 1e3  # base pairs -> kilobases

  theplot <- ggplot(dat, aes(x = V2, y = V9)) +
    geom_point() +
    geom_segment(aes(x = V2, xend = V2, y = V10, yend = V11)) +
    labs(x = "KB position", y = "Fst") +
    theme_grey(15)

  pngName <- paste(c(infile, format(Sys.time(), "%a%b%d_%H_%M_%S.png")), collapse = "_")
  ggsave(filename = pngName, plot = theplot, width = 20, height = 4, units = "in")
}

plotPfst(cmd_args)
#usage: nohup R --vanilla < plotSmoothed.R --args smoothed.txt pFst|wcFst|xpEHH|abba-baba
#
# Plots the output of a smoothing run.  The second argument selects which
# statistic the file holds and therefore which columns and axis label are
# used.  The PNG is written as <input>_<statistic>_<timestamp>.png.

cmd_args <- commandArgs(trailingOnly = TRUE)

# x: character vector of command-line arguments;
#    x[1] is the input table, x[2] the statistic name.
plotPfst <- function(x) {
  if (length(x) < 2) {
    stop("usage: R --vanilla < plotSmoothed.R --args smoothed.txt pFst|wcFst|xpEHH|abba-baba")
  }
  # Loaded before the table below is built because aes() comes from ggplot2.
  library("ggplot2")

  # Per-statistic aesthetics and y-axis label.  Colour always encodes V4.
  # NOTE(review): abba-baba plots V3 on the x axis while the others plot the
  # running index in V2, yet the axis is still labelled "SNP index" - kept
  # exactly as in the original; confirm that this is intended.
  specs <- list(
    "pFst"      = list(mapping = aes(x = V2, y = -log10(V5), colour = V4),
                       ylab = "-log10(smoothed pFst)"),
    "wcFst"     = list(mapping = aes(x = V2, y = V5, colour = V4),
                       ylab = "smoothed wcFst"),
    "xpEHH"     = list(mapping = aes(x = V2, y = V5, colour = V4),
                       ylab = "smoothed xpEHH"),
    "abba-baba" = list(mapping = aes(x = V3, y = V5, colour = V4),
                       ylab = "D-statistic")
  )
  # Reject an unknown statistic up front; the original left the plot NULL
  # and failed later inside ggsave() with an unrelated message.
  if (!(x[2] %in% names(specs))) {
    stop(paste0("unknown statistic '", x[2], "'; expected one of: ",
                paste(names(specs), collapse = ", ")))
  }
  spec <- specs[[x[2]]]

  dat <- read.table(x[1], header = FALSE)
  # Replace the position by a running index; seq_along() also copes with an
  # empty table, where 1:length() would yield c(1, 0).
  dat$V2 <- seq_along(dat$V2)

  pngName <- paste(c(x, format(Sys.time(), "%a%b%d_%H_%M_%S.png")), collapse = "_")

  theplot <- ggplot(dat, spec$mapping) +
    geom_point() +
    theme_grey(15) +
    labs(x = "SNP index", y = spec$ylab) +
    scale_colour_continuous(low = "grey", high = "red", name = "variants in window")

  ggsave(filename = pngName, plot = theplot, width = 20, height = 4, units = "in")
}

plotPfst(cmd_args)
# Scatter plot of column V6 ("raw XPEHH") against column V2 expressed in
# kilobases.  The input is a header-less table named on the command line
# after --args; the image is saved as <input>_<timestamp>.png.

cmd_args <- commandArgs(trailingOnly = TRUE)

plotPfst <- function(x) {
  require("ggplot2")

  scores <- read.table(x, header = FALSE)
  scores$V2 <- scores$V2 / 1e3  # base pairs -> kilobases

  axes <- labs(x = "KB position", y = "raw XPEHH")
  theplot <- ggplot(scores, aes(x = V2, y = V6)) + geom_point() + axes + theme_grey(15)

  stamp <- format(Sys.time(), "%a%b%d_%H_%M_%S.png")
  pngName <- paste(c(x, stamp), collapse = "_")
  ggsave(filename = pngName, plot = theplot, width = 20, height = 4, units = "in")
}

plotPfst(cmd_args)
45 | 46 | # write the table schema 47 | 48 | infotype_to_sqltype = {} 49 | infotype_to_sqltype["Flag"] = "boolean" 50 | infotype_to_sqltype["Integer"] = "integer" 51 | infotype_to_sqltype["Float"] = "real" 52 | infotype_to_sqltype["String"] = "text" 53 | 54 | tablecmd = """create table alleles""" 55 | specs = ["CHROM text", 56 | "POS integer", 57 | "ID text", 58 | "REF text", 59 | "ALT text", 60 | "QUAL real", 61 | "FILTER text"] 62 | 63 | sorted_fields = sorted(infotypes.keys()) 64 | for field in sorted_fields: 65 | infotype = infotypes[field] 66 | sqltype = infotype_to_sqltype[infotype] 67 | field = field.replace(".", "_") # escape periods, which are not allowed 68 | specs.append(field + " " + sqltype) 69 | 70 | tablecmd += " (" + ", ".join(specs) + ")" 71 | 72 | conn = sqlite3.connect(dbname) 73 | conn.execute(tablecmd) 74 | 75 | # for each record 76 | # parse the record 77 | # for each allele 78 | 79 | for line in sys.stdin: 80 | fields = line.split('\t') 81 | chrom, pos, id, ref, alts, qual, filter, info = fields[:8] 82 | alts = alts.split(",") 83 | altindex = 0 84 | chrom = "\'" + chrom + "\'" 85 | id = "\'" + id + "\'" 86 | ref = "\'" + ref + "\'" 87 | filter = "\'" + filter + "\'" 88 | for alt in alts: 89 | alt = "\'" + alt + "\'" 90 | info_values = {} 91 | for pair in info.split(";"): 92 | if pair.find("=") is not -1: 93 | pair = pair.split("=") 94 | key = pair[0] 95 | value = pair[1] 96 | if not infonumbers.has_key(key): 97 | continue 98 | if infonumbers[key] == -1: 99 | values = value.split(",") 100 | value = values[altindex] 101 | info_values[key] = value 102 | else: 103 | # boolean flag 104 | info_values[pair] = "1" 105 | ordered_insertion = [] 106 | for field in sorted_fields: 107 | value = "null" 108 | if info_values.has_key(field): 109 | value = info_values[field] 110 | if infotypes[field] == "String": 111 | value = "\'" + value + "\'" 112 | else: 113 | # missing flag means "false" for that flag 114 | if infotypes[field] == "Flag": 115 | value = "0" 
#!/usr/bin/perl
#
# vcfbiallelic: copy a VCF from STDIN to STDOUT, keeping header lines and
# only those records whose ALT column holds a single allele (no comma).

use strict;
use warnings;

# Read from STDIN explicitly: a bare `while ()` never reads anything and
# loops forever.
while (my $line = <STDIN>) {
    # header and meta lines pass through untouched
    if ($line =~ /^#/) {
        print $line;
        next;
    }

    # CHROM POS ID REF ALT rest...
    my @fields = split(/\t/, $line, 6);
    if (@fields < 6) {
        # Without this check a short line would be judged by the ALT value
        # captured from the previous record.
        warn "vcfbiallelic: skipping malformed record at input line $.\n";
        next;
    }

    my $alt = $fields[4];
    # a comma in ALT means more than one alternate allele: drop the record
    print $line unless $alt =~ /,/;
}
#!/usr/bin/perl
#
# vcfindelproximity: copy a VCF from STDIN to STDOUT, dropping every
# non-indel record that lies closer than <distance> bases to an indel on
# the same chromosome.  Header lines and the indels themselves are kept.
#
# usage: vcfindelproximity <distance> <in.vcf >out.vcf
#
# Records are queued until the next indel (or a chromosome change, or end
# of input) is seen; only then is it known whether a queued record is close
# to the indel that follows it.

use strict;
use warnings;

my $prox = $ARGV[0];
unless (defined $prox && $prox =~ /^\d+$/) {
    die "usage: $0 [distance] <in.vcf >out.vcf\n";
}

my @pending;     # records seen since the last flush, oldest first:
                 # [ line, position, is_indel ]
my $lastchrom;   # chromosome of the previous record
my $indelpos;    # position of the latest indel on this chromosome, if any

# Print the queued records and empty the queue.  $next_indelpos is the
# position of the indel that triggered the flush, or undef when the flush
# is caused by a chromosome change or the end of the input.
sub flush_pending {
    my ($next_indelpos) = @_;
    foreach my $rec (@pending) {
        my ($line, $pos, $is_indel) = @$rec;
        # indels are always printed
        if ($is_indel) {
            print $line;
            next;
        }
        # other events are printed only when at least $prox away from the
        # indel before them and from the indel after them
        next if defined $indelpos      && abs($indelpos - $pos)      < $prox;
        next if defined $next_indelpos && abs($next_indelpos - $pos) < $prox;
        print $line;
    }
    @pending = ();
}

while (my $line = <STDIN>) {

    if ($line =~ /^#/) {
        print $line;
        next;
    }

    # CHROM POS ID REF ALT rest...
    my @fields = split(/\t/, $line, 6);
    if (@fields < 6) {
        warn "vcfindelproximity: skipping malformed record at input line $.\n";
        next;
    }
    my ($chrom, $pos, $ref, $alt) = @fields[0, 1, 3, 4];

    # On a new chromosome, print everything left over from the previous one
    # and forget its last indel.  Chromosome names are strings, so they are
    # compared with `ne`; a numeric `!=` treats every non-numeric name as 0.
    if (defined $lastchrom && $chrom ne $lastchrom) {
        flush_pending(undef);
        $indelpos = undef;
    }
    $lastchrom = $chrom;

    # REF and ALT of different length marks the record as an indel
    my $is_indel = (length($ref) != length($alt)) ? 1 : 0;
    push(@pending, [$line, $pos, $is_indel]);

    if ($is_indel) {
        flush_pending($pos);
        $indelpos = $pos;
    }
}

# Flush whatever follows the last indel.  The original tested the scalar
# $lines instead of the array @lines here, so these records were never
# printed.  NOTE(review): the tail is filtered against the last indel, like
# every other record; the original intent looks like an unfiltered flush -
# confirm which is wanted.
flush_pending(undef);
#!/bin/bash
#
# vcfmultiway: all-against-all comparison of a set of VCF files.
#
# usage: vcfmultiway [reference] [file.vcf ...]
#
# Written to the current directory, for every ordered pair (A, B):
#   A.stats.txt             vcfstats of A itself (when A and B are the same file)
#   A.common.B.stats.txt    vcfstats of `vcfintersect -i A B`
#   A.unique.B.stats.txt    vcfstats of `vcfintersect -v -i B A`
# where A and B are the base names of the input files.

if [ $# -lt 1 ]
then
    echo "usage: $0 [reference] [file.vcf ...]" >&2
    exit 1
fi

reference=$1
shift

echo comparing "$@"

# Every expansion is quoted so that paths containing spaces or glob
# characters reach the tools as a single argument.
for fileA in "$@"
do
    nameA=$(basename "$fileA")
    for fileB in "$@"
    do
        nameB=$(basename "$fileB")
        if [ "$fileA" = "$fileB" ]
        then
            vcfstats "$fileA" >"$nameA.stats.txt"
        else
            vcfintersect -r "$reference" -i "$fileA" "$fileB" | vcfstats >"$nameA.common.$nameB.stats.txt"
            vcfintersect -r "$reference" -v -i "$fileB" "$fileA" | vcfstats >"$nameA.unique.$nameB.stats.txt"
        fi
    done
done
>$outdir/$(basename $fileA).stats" >$scriptsdir/$(basename $fileA).sh 25 | else 26 | echo "vcfintersect -r $reference -i $fileA $fileB | vcfstats >$outdir/$(basename $fileA).common.$(basename $fileB).stats" >$scriptsdir/$(basename $fileA).common.$(basename $fileB).sh 27 | echo "vcfintersect -r $reference -v -i $fileB $fileA | vcfstats >$outdir/$(basename $fileA).unique.$(basename $fileB).stats" >$scriptsdir/$(basename $fileA).unique.$(basename $fileB).sh 28 | fi 29 | done 30 | done 31 | -------------------------------------------------------------------------------- /bin/vcfnobiallelicsnps: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | 4 | while () { 5 | if ($_ =~ /^#/) { 6 | print; 7 | } else { 8 | $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/; 9 | $chrom = $1; 10 | $pos = $2; 11 | $tag = $3; 12 | $ref = $4; 13 | $alts = $5; 14 | $hasnonsnp = 0; 15 | $biallelic = 1; 16 | if ($alts =~ /,/) { 17 | $biallelic = 0; 18 | } 19 | @alts = split(/,/, $alts); 20 | foreach $alt (@alts) { 21 | if (!(length($alt)==1 && length($alt) == length($ref))) { 22 | $hasnonsnp = 1; 23 | } 24 | } 25 | if ($hasnonsnp || !$biallelic) { 26 | print; 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /bin/vcfnoindels: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | 4 | while () { 5 | if ($_ =~ /^#/) { 6 | print; 7 | } else { 8 | $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/; 9 | $chrom = $1; 10 | $pos = $2; 11 | $tag = $3; 12 | $ref = $4; 13 | $alts = $5; 14 | $hasindel = 0; 15 | @alts = split(/,/, $alts); 16 | foreach $alt (@alts) { 17 | if (length($alt ) != length($ref)) { 18 | $hasindel = 1; 19 | } 20 | } 21 | if (! 
$hasindel) { 22 | print; 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /bin/vcfnosnps: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | 4 | while () { 5 | if ($_ =~ /^#/) { 6 | print; 7 | } else { 8 | $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/; 9 | $chrom = $1; 10 | $pos = $2; 11 | $tag = $3; 12 | $ref = $4; 13 | $alts = $5; 14 | $hasnonsnp = 0; 15 | @alts = split(/,/, $alts); 16 | foreach $alt (@alts) { 17 | if (!(length($alt)==1 && length($alt) == length($ref))) { 18 | $hasnonsnp = 1; 19 | } 20 | } 21 | if ($hasnonsnp) { 22 | print; 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /bin/vcfnulldotslashdot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import math 5 | 6 | def bincoeff(n,k): return math.factorial(n) / (math.factorial(n-k)*math.factorial(k)) 7 | def multcoeff(n,k): return bincoeff(n+k-1,k) 8 | 9 | for line in sys.stdin: 10 | if line.startswith("#"): 11 | print line.strip() 12 | continue 13 | fields = line.strip().split("\t") 14 | alleles = len(fields[4].split(","))+1 15 | # assume that we have GT:GL 16 | # how many genotypes? 
assume diploid 17 | flatgls = ",".join(map(str,[0]*multcoeff(alleles,2))) 18 | for i in range(9, len(fields)): 19 | if fields[i] == ".": 20 | fields[i] = "./.:" + flatgls 21 | print "\t".join(fields) 22 | 23 | -------------------------------------------------------------------------------- /bin/vcfplotaltdiscrepancy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | filename=$1 4 | tag=$2 5 | 6 | vcf2tsv \ 7 | | tsvsplit \ 8 | QUAL \ 9 | AC \ 10 | $tag.has_variant \ 11 | | tf2binary \ 12 | | vcfplotsitediscrepancy.r $filename $tag 13 | -------------------------------------------------------------------------------- /bin/vcfplotsitediscrepancy.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript --vanilla --slave 2 | 3 | # get the input VCF tabular format, assert that sites must have AC > 0 4 | vcf <- subset(read.table(pipe('cat /dev/stdin'), header=T), AC > 0) 5 | 6 | filename <- commandArgs(TRUE)[1] 7 | tag <- commandArgs(TRUE)[2] 8 | 9 | tag.has_variant <- paste(tag, '.has_variant', sep='') 10 | 11 | vcf.numberOfSites <- length(vcf$AC) 12 | vcf.sitesTruePositive <- mean(vcf[, tag.has_variant]) 13 | 14 | # false detection count 15 | x <- cbind(by(vcf$AC, vcf$AC, function(i) length(i))) 16 | 17 | byac <- data.frame(ac=as.numeric(rownames(x)), sites=as.vector(x)) 18 | 19 | # count true positive sites 20 | byac$site_tpc <- as.vector(cbind(by(vcf[, tag.has_variant], vcf$AC, function(i) sum(i)))) 21 | # fpc == false detection count 22 | byac$site_fpc <- byac$sites - byac$site_tpc 23 | # site detection fpr is 1 - true positive rate 24 | byac$site_fpr <- 1 - ( byac$site_tpc / byac$sites ) 25 | 26 | #byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac <= i, select=site_fpr))))) 27 | byac$site_fprlt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 28 | s <- subset(byac, ac <= i, select=c(site_fpc, sites)) 29 | 
return(sum(s$site_fpc) / sum(s$sites)) 30 | }))) 31 | 32 | #byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) mean(subset(byac, ac >= i, select=site_fpr))))) 33 | byac$site_fprgt <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) { 34 | s <- subset(byac, ac >= i, select=c(site_fpc, sites)) 35 | return(sum(s$site_fpc) / sum(s$sites)) 36 | }))) 37 | 38 | byac$cfs <- as.vector(cbind(tapply(byac$ac, byac$ac, function(i) sum(subset(byac, ac <= i, select=sites)) / length(vcf$AC)))) 39 | 40 | 41 | pdf(paste(filename, '.', tag, '.site_FDR.vs.AC.smooth.pdf', sep='')) 42 | par(cex=0.75) 43 | par(mar=c(5,4,4,5) + 0.1) 44 | plot(byac$cfs, ylim=c(0,1.0), 45 | xlab='alternate allele count (AC)', xaxt='n', 46 | ylab='false discovery rate (FDR)', yaxt='n', type='l', col='red') 47 | axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1)) 48 | axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75) 49 | grid(lty=5) 50 | par(new=T) 51 | title(paste(filename, 'putative site false discovery rate versus', tag, '(smoothed)')) 52 | par(new=T) 53 | countTicks <- seq(0,1,0.1) * vcf.numberOfSites 54 | axis(4, at=seq(0,1,0.1), labels=round(countTicks)) 55 | par(col='red') 56 | mtext("number of sites", side=4, line=3, cex=0.75) 57 | par(col='black') 58 | par(new=T) 59 | plot(byac$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n') 60 | par(new=T) 61 | lines(byac$ac, predict(loess(byac$site_fpr ~ byac$ac, span=0.5)), col="blue") 62 | par(new=T, cex=0.65) 63 | mtext(paste("site FDR: ", round(1 - vcf.sitesTruePositive, digits=4), sep='')) 64 | par(new=T, cex=0.65) 65 | legend('topleft', c('cumulative sites', 'site FDR (loess smoothed)', 'FDR at AC'), 66 | fill=c('red', 'blue', 'black')) 67 | garbage <- dev.off() 68 | 69 | 70 | 71 | pdf(paste(filename, '.', tag, '.site_FDR.vs.AC.cumulative.pdf', sep='')) 72 | par(cex=0.75) 73 | par(mar=c(5,4,4,5) + 0.1) 74 | plot(byac$cfs, ylim=c(0,1.0), 75 | xlab='alternate allele count (AC)', xaxt='n', 76 | 
ylab='false discovery rate (FDR)', yaxt='n', type='l', col='red') 77 | axis(2, at=seq(0,1,0.1), labels=seq(0,1,0.1)) 78 | axis(1, at=seq(0,max(byac$ac),10), labels=seq(0,max(byac$ac),10), cex=0.75) 79 | grid(lty=5) 80 | par(new=T) 81 | title(paste(filename, 'putative false discovery rate versus', tag, '(cumulative)')) 82 | par(new=T) 83 | countTicks <- seq(0,1,0.1) * vcf.numberOfSites 84 | axis(4, at=seq(0,1,0.1), labels=round(countTicks)) 85 | par(col='red') 86 | mtext("number of sites", side=4, line=3, cex=0.75) 87 | par(col='black') 88 | par(new=T) 89 | plot(byac$site_fpr, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n') 90 | par(new=T) 91 | plot(byac$site_fprgt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='green') 92 | par(new=T) 93 | plot(byac$site_fprlt, ylim=c(0,1.0), xlab='', xaxt='n', ylab='', yaxt='n', type='l', col='blue') 94 | par(new=T, cex=0.65) 95 | mtext(paste("site FDR: ", round(1 - vcf.sitesTruePositive, digits=4), sep='')) 96 | par(new=T, cex=0.65) 97 | legend('topleft', c('cumulative sites', 'site FDR <= AC', 'site FDR >= AC', 'FDR at AC'), 98 | fill=c('red', 'blue', 'green', 'black')) 99 | garbage <- dev.off() 100 | -------------------------------------------------------------------------------- /bin/vcfplottstv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | filename=$1 4 | title=$2 5 | 6 | vcf2tsv \ 7 | | tsvsplit \ 8 | QUAL \ 9 | AC \ 10 | AF \ 11 | TS \ 12 | | tf2binary \ 13 | | vcfplottstv.r $filename $title 14 | -------------------------------------------------------------------------------- /bin/vcfprintaltdiscrepancy.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript --vanilla --slave 2 | 3 | # get the input VCF tabular format, assert that sites must have AC > 0 4 | vcf <- subset(read.table(pipe('cat /dev/stdin'), header=T), AC > 0) 5 | 6 | tag <- commandArgs(TRUE)[1] 7 | 8 | 
tag.genotypes_alternate_count <- paste(tag, '.genotypes.alternate_count', sep='') 9 | tag.non_reference_discrepancy_count <- paste(tag, '.site.non_reference_discrepancy.count', sep='') 10 | tag.non_reference_discrepancy_normalizer <- paste(tag, '.site.non_reference_discrepancy.normalizer', sep='') 11 | tag.non_reference_sensitivity_count <- paste(tag, '.site.non_reference_sensitivity.count', sep='') 12 | tag.non_reference_sensitivity_normalizer <- paste(tag, '.site.non_reference_sensitivity.normalizer', sep='') 13 | tag.alternate_positive_discrepancy <- paste(tag, '.site.alternate_positive_discrepancy', sep='') 14 | tag.alternate_negative_discrepancy <- paste(tag, '.site.alternate_negative_discrepancy', sep='') 15 | tag.has_variant <- paste(tag, '.has_variant', sep='') 16 | 17 | vcf.numberOfSites <- length(vcf[, tag.genotypes_alternate_count]) 18 | vcf.totalAltAlleles <- sum(vcf[, tag.genotypes_alternate_count]) 19 | vcf.positiveDiscrepancy <- sum(vcf[, tag.alternate_positive_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count]) 20 | vcf.negativeDiscrepancy <- sum(vcf[, tag.alternate_negative_discrepancy]) / sum(vcf[, tag.genotypes_alternate_count]) 21 | vcf.sitesTruePositive <- sum(vcf[, tag.has_variant]) / nrow(vcf) 22 | 23 | cat('number of sites', vcf.numberOfSites, '\n') 24 | cat('total alternate alleles', vcf.totalAltAlleles, '\n') 25 | cat('positive discrepancy', vcf.positiveDiscrepancy, '\n') 26 | cat('negative discrepancy', vcf.negativeDiscrepancy, '\n') 27 | 28 | x <- cbind(by(vcf, vcf$AC, 29 | function(x) { 30 | sum(x[, tag.alternate_positive_discrepancy]) / sum(x[, tag.genotypes_alternate_count]) 31 | })) 32 | 33 | byac <- data.frame(ac=as.numeric(rownames(x)), fdr=as.vector(x)) 34 | 35 | print(byac) 36 | 37 | 38 | -------------------------------------------------------------------------------- /bin/vcfprintaltdiscrepancy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | tag=$1 4 | 5 | 
vcf2tsv \ 6 | | tsvsplit \ 7 | QUAL \ 8 | AC \ 9 | $tag.has_variant \ 10 | $tag.site.alternate_negative_discrepancy \ 11 | $tag.site.alternate_positive_discrepancy \ 12 | $tag.genotypes.alternate_count \ 13 | $tag.site.non_reference_sensitivity.count \ 14 | $tag.site.non_reference_sensitivity.normalizer \ 15 | $tag.site.non_reference_discrepancy.count \ 16 | $tag.site.non_reference_discrepancy.normalizer \ 17 | | tf2binary \ 18 | | vcfprintaltdiscrepancy.r $tag 19 | -------------------------------------------------------------------------------- /bin/vcfqualfilter: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | # 4 | 5 | use Getopt::Long; 6 | my $cutoff = -1; 7 | my $max = -1; 8 | my $indel = 0; 9 | my $snp = 0; 10 | $result = GetOptions ("c|cutoff=i" => \$cutoff, 11 | "m|max=i" => \$max, 12 | "i|indel=i" => \$indel, 13 | "s|snp=i" => \$snp); 14 | 15 | 16 | while () { 17 | if ($_ =~ /^#/) { 18 | print $_; 19 | next; 20 | } 21 | 22 | if ($_ =~ /^(.*?\t){6}(.*?)\t/) { 23 | $qual = $1; 24 | } 25 | if ($cutoff ne -1 and $qual >= $cutoff and ($max eq -1 or $qual <= $max)) { 26 | print $_; 27 | } elsif ($snp and $_ =~ "SNP" and $qual >= $snp) { 28 | print $_; 29 | } elsif ($indel and $_ =~ "INS\|DEL" and $qual >= $indel) { 30 | print $_; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /bin/vcfregionreduce: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 2 ]; 4 | then 5 | echo "Usage: $0 [region file] [directory]" 6 | echo 7 | echo "Generates \`basename directory\`.vcf.gz, which is the concatenation" 8 | echo "of files in the directory named [directory]/[region1].vcf.gz," 9 | echo "[directory]/[region2].vcf.gz, etc. in the order in which they" 10 | echo "occur in the region file." 11 | echo 12 | echo "A tabix index is subsequently generated." 
13 | exit 1 14 | fi 15 | 16 | regionfile=$1 17 | mergedir=$2 18 | mergename=$(basename $mergedir) 19 | vcfgenotypes=$mergename.vcf.gz 20 | #vcfsites=$mergename.sites.vcf.gz 21 | 22 | firstfile=$mergedir/$(head -1 $regionfile).vcf.gz 23 | files=$(for region in $(cat $regionfile); do echo $mergedir/$region.vcf.gz; done) 24 | 25 | ( zcat $firstfile | head -1000 | grep ^# 26 | for file in $files 27 | do 28 | zcat $file | grep -v "^#" 29 | done ) | ( bgzip >$vcfgenotypes && tabix -p vcf $vcfgenotypes ) 30 | -------------------------------------------------------------------------------- /bin/vcfregionreduce_and_cut: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 2 ]; 4 | then 5 | echo "Usage: $0 [region file] [directory]" 6 | echo 7 | echo "Generates \`basename directory\`.vcf.gz and \`basename directory\`.sites.vcf.gz" 8 | echo "which are the concatenation of files in the directory named [directory]/[region1].vcf.gz," 9 | echo "[directory]/[region2].vcf.gz, etc. in the order in which they occur in the region file." 10 | echo 11 | echo "Tabix indexes are simultaneously generated." 
12 | exit 1 13 | fi 14 | 15 | regionfile=$1 16 | mergedir=$2 17 | mergename=$(basename $mergedir) 18 | vcfgenotypes=$mergename.vcf.gz 19 | vcfsites=$mergename.sites.vcf.gz 20 | 21 | regions=$(cat $regionfile) 22 | 23 | firstfile=$mergedir/$(echo $regions | cut -f 1 -d\ ).vcf.gz 24 | files=$(for region in $regions; do echo $mergedir/$region.vcf.gz; done) 25 | 26 | ( zcat $firstfile | head -1000 | grep ^# 27 | for file in $files 28 | do 29 | zcat $file | grep -v "^#" 30 | done ) | uniq | pee \ 31 | "bgzip >$vcfgenotypes && tabix -p vcf $vcfgenotypes" \ 32 | "cut -f -8 | bgzip >$vcfsites && tabix -p vcf $vcfsites" 33 | -------------------------------------------------------------------------------- /bin/vcfregionreduce_pipe: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 2 ]; 4 | then 5 | echo "Usage: $0 [region file] [directory]" 6 | echo 7 | echo "Generates \`basename directory\`.vcf.gz, which is the concatenation" 8 | echo "of files in the directory named [directory]/[region1].vcf.gz," 9 | echo "[directory]/[region2].vcf.gz, etc. in the order in which they" 10 | echo "occur in the region file." 11 | echo 12 | echo "A tabix index is subsequently generated." 
13 | exit 1 14 | fi 15 | 16 | regionfile=$1 17 | mergedir=$2 18 | mergename=$(basename $mergedir) 19 | vcfgenotypes=$mergename.vcf.gz 20 | #vcfsites=$mergename.sites.vcf.gz 21 | 22 | firstfile=$mergedir/$(head -1 $regionfile).vcf.gz 23 | files=$(for region in $(cat $regionfile); do echo $mergedir/$region.vcf.gz; done) 24 | 25 | zcat $firstfile | head -1000 | grep ^# 26 | for file in $files 27 | do 28 | zcat $file | grep -v "^#" 29 | done 30 | -------------------------------------------------------------------------------- /bin/vcfregionreduce_uncompressed: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 2 ]; 4 | then 5 | echo "Usage: $0 [region file] [directory]" 6 | echo 7 | echo "Generates \`basename directory\`.vcf.gz, which is the concatenation" 8 | echo "of files in the directory named [directory]/[region1].vcf.gz," 9 | echo "[directory]/[region2].vcf.gz, etc. in the order in which they" 10 | echo "occur in the region file." 11 | echo 12 | echo "A tabix index is subsequently generated." 
13 | exit 1 14 | fi 15 | 16 | regionfile=$1 17 | mergedir=$2 18 | mergename=$(basename $mergedir) 19 | vcfgenotypes=$mergename.vcf.gz 20 | #vcfsites=$mergename.sites.vcf.gz 21 | 22 | firstfile=$mergedir/$(head -1 $regionfile).vcf 23 | files=$(for region in $(cat $regionfile); do echo $mergedir/$region.vcf; done) 24 | 25 | ( cat $firstfile | head -1000 | grep ^# 26 | for file in $files 27 | do 28 | cat $file | grep -v "^#" 29 | done ) | ( bgzip >$vcfgenotypes && tabix -p vcf $vcfgenotypes ) 30 | -------------------------------------------------------------------------------- /bin/vcfremovenonATGC: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | 4 | while () { 5 | if ($_ =~ /^#/) { 6 | print; 7 | } else { 8 | $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/; 9 | $chrom = $1; 10 | $pos = $2; 11 | $tag = $3; 12 | $ref = $4; 13 | $alts = $5; 14 | $hasJunk = 0; 15 | @alts = split(/,/, $alts); 16 | 17 | if (!($ref =~ /A|T|G|C/)) { 18 | $hasJunk = 1; 19 | } 20 | foreach $alt (@alts) { 21 | if (!($alt =~ /A|T|G|C/)) { 22 | $hasJunk = 1; 23 | } 24 | } 25 | if (!$hasJunk) { 26 | print; 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /bin/vcfsnps: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | 4 | while () { 5 | if ($_ =~ /^#/) { 6 | print; 7 | } else { 8 | $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/; 9 | $chrom = $1; 10 | $pos = $2; 11 | $tag = $3; 12 | $ref = $4; 13 | $alts = $5; 14 | $hasindel = 0; 15 | @alts = split(/,/, $alts); 16 | $snp = 1; 17 | foreach $alt (@alts) { 18 | if (length($ref) > 1 || length($alt) != length($ref)) { 19 | $snp = 0; 20 | } 21 | } 22 | if ($snp) { 23 | print; 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /bin/vcfsort: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash 2 | 3 | head -1000 $1 | grep "^#"; cat $@ | grep -v "^#" | sort -k1,1d -k2,2n 4 | -------------------------------------------------------------------------------- /bin/vcfvarstats: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | 4 | use IPC::Open2; 5 | 6 | sub revcomplement { 7 | $revcom = reverse shift; 8 | $revcom =~ tr/ACGTacgt/TGCAtgca/; 9 | return $revcom; 10 | } 11 | 12 | $reference = $ARGV[0]; 13 | 14 | if ($reference) { 15 | $pid = open2(\*FASTAHACK_OUT, \*FASTAHACK_IN, "fastahack -c $reference"); 16 | } 17 | 18 | #print FASTAHACK_IN "1:10000\n"; 19 | #$result = ; 20 | #print $result; 21 | 22 | 23 | #open(VCF, $file); 24 | 25 | $ts = 0; 26 | $tv = 0; 27 | $cpg = 0; 28 | $total = 0; 29 | $snp = 0; 30 | $mnp = 0; 31 | $mnplen = 0; 32 | %mnp = (); 33 | $ins = 0; 34 | $inslen = 0; 35 | %ins = (); 36 | $del = 0; 37 | $dellen = 0; 38 | %del = (); 39 | 40 | %dint = (); # di-nucleotide distribution 41 | 42 | while () { 43 | if ($_ =~ /^#/) { 44 | next; 45 | } else { 46 | $_ =~ /^(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+?)\t/; 47 | $chrom = $1; 48 | $pos = $2; 49 | $tag = $3; 50 | $ref = $4; 51 | $alt = $5; 52 | #print "chrom: $chrom, pos: $pos, ref: $ref, alt: $alt\n"; 53 | } 54 | 55 | $diff = length($ref) - length($alt); 56 | 57 | $is_snp = 0; 58 | if ($_ =~ /SNP/) { 59 | $is_snp = 1; 60 | $snp += 1; 61 | # get di-nt's 62 | if ($reference) { 63 | if ($_ =~ /^(\d+)\t(\d+)/) { 64 | $seq = $1; 65 | $start = $2; 66 | $end = $2 + 1; 67 | print FASTAHACK_IN "$seq:$start..$end\n"; 68 | $dibp = ; 69 | chomp $dibp; 70 | $dint{$dibp} += 1; 71 | } 72 | } 73 | } elsif ($diff eq 0 and length($ref) eq 1) { 74 | $snp += 1; 75 | $is_snp = 1; 76 | } elsif ($diff eq 0 and length($ref) gt 1) { 77 | $mnp += 1; 78 | $mnplen += length($ref); 79 | $mnp{length($ref)} += 1; 80 | } 81 | if ($is_snp) { 82 | if ((($ref eq "A" and $alt eq "G") or ($ref eq "G" and $alt eq "A")) 83 | or 84 | (($ref eq "C" and $alt eq "T") or ($ref 
eq "T" and $alt eq "C"))) { 85 | $ts += 1; 86 | } else { 87 | $tv += 1; 88 | } 89 | if ($_ =~ /CpG/) { $cpg += 1; } 90 | } 91 | 92 | if ($diff lt 0) { 93 | $len = abs($diff); 94 | $ins += 1; 95 | $inslen += $len; 96 | $ins{$len} += 1; 97 | } 98 | 99 | if ($diff gt 0) { 100 | $len = abs($diff); 101 | $del += 1; 102 | $dellen += $len; 103 | $del{$len} += 1; 104 | } 105 | #elsif (length($ref) > 1 and $diff eq 0) { 106 | # print $_ . "\n"; 107 | # $mnp += 1; 108 | # $mnplen += length($ref); 109 | # $mnp{length($ref)} += 1; 110 | #} 111 | 112 | $total += 1; 113 | } 114 | 115 | if ($total == 0) { 116 | die "no VCF records read on stdin\n"; 117 | } 118 | 119 | print "total variants:\t$total" . "\n"; 120 | print "\n"; 121 | if ($snp > 0) { 122 | print "total snps:\t$snp\n"; 123 | print "transitions:\t$ts\n"; 124 | print "transversions:\t$tv\n"; 125 | if ($tv > 0) { 126 | print "ts/tv ratio:\t" . ($ts / $tv) . "\n"; 127 | } 128 | print "CpG sites:\t$cpg\n"; 129 | if ($cpg > 0) { 130 | print "CpG/total snps:\t" . ($cpg / $snp) . "\n"; 131 | } 132 | } 133 | 134 | if (($ins + $del) > 0) { 135 | print "\n"; 136 | print "total indels:\t" . ($ins + $del) . "\n"; 137 | print "insertions:\t$ins\t$inslen bp\n"; 138 | print "deletions:\t$del\t$dellen bp\n"; 139 | 140 | $max = 0; 141 | while ( my ($size, $count) = each(%ins) ) { 142 | if ($size > $max) { $max = $size; } 143 | } 144 | while ( my ($size, $count) = each(%del) ) { 145 | if ($size > $max) { $max = $size; } 146 | } 147 | 148 | print "\n"; 149 | 150 | if ($inslen > 0 and $dellen > 0) { 151 | $indel_length_ratio = $inslen / $dellen; 152 | print "ins/del length ratio:\t$indel_length_ratio\n"; 153 | print "\n"; 154 | print "indel size frequency distribution\n"; 155 | print "size\tins\tdel\tins/del\tcurr/prev\n"; 156 | 157 | $last_delcount = 0; 158 | $last_inscount = 0; 159 | $last_ratio_del = 0; 160 | $last_ratio_ins = 0; 161 | for (1 .. 
$max) { 162 | $inscount = $ins{$_}; 163 | $delcount = $del{$_}; 164 | if ($last_delcount != 0) { 165 | $last_ratio_del = $delcount / $last_delcount; 166 | } 167 | if ($last_inscount != 0) { 168 | $last_ratio_ins = $inscount / $last_inscount; 169 | } 170 | $last_delcount = $delcount; 171 | $last_inscount = $inscount; 172 | if ($inscount > 0 and $delcount > 0) { 173 | $ratio = $inscount / $delcount; 174 | } else { 175 | $ratio = ""; 176 | } 177 | print "$_\t$inscount\t$delcount\t" 178 | . sprintf("%.3f", $ratio); 179 | if ($last_ratio_ins != 0 or $last_ratio_del != 0) { 180 | print "\t"; 181 | if ($last_ratio_ins != 0) { 182 | print sprintf("%.3f", $last_ratio_ins); 183 | } 184 | print "\t"; 185 | if ($last_ratio_del != 0) { 186 | print sprintf("%.3f", $last_ratio_del); 187 | } 188 | print "\n"; 189 | } else { 190 | print "\n"; 191 | } 192 | } 193 | # FIXME 194 | #print "\t\t\t\t" . sprintf("%.3f", $even_odd_ratio_sum_ins / $ins) 195 | # . "\t" . sprintf("%.3f", $even_odd_ratio_sum_del / $del); 196 | } 197 | } 198 | 199 | if ($mnplen > 0) { 200 | print "\n"; 201 | print "total mnps:\t$mnp\n"; 202 | print "mnps length:\t$mnplen\n"; 203 | print "mnp size distribution\n"; 204 | $max = 0; 205 | while ( my ($size, $count) = each(%mnp) ) { 206 | if ($size > $max) { $max = $size; } 207 | } 208 | print "size\tcount\n"; 209 | for (2 .. $max) { 210 | print $_ . "\t" . $mnp{$_} . "\n"; 211 | } 212 | } 213 | 214 | if ($reference) { 215 | 216 | print "\n"; 217 | 218 | print "di-nucleotide distribution for SNPs\n"; 219 | print "di-nt\tcount\tcount/(total snps / 16)\n"; 220 | while ( my ($dibp, $count) = each(%dint) ) { 221 | print "$dibp\t$count\t" . ($count / ($snp / 16)) . 
"\n"; 222 | } 223 | 224 | } 225 | 226 | -------------------------------------------------------------------------------- /examples/612.bFst_SatApr12_12_29_17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/612.bFst_SatApr12_12_29_17.png -------------------------------------------------------------------------------- /examples/612.counts_FriApr11_13_16_42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/612.counts_FriApr11_13_16_42.png -------------------------------------------------------------------------------- /examples/612.nocounts.smoothed_TueApr15_13_25_53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/612.nocounts.smoothed_TueApr15_13_25_53.png -------------------------------------------------------------------------------- /examples/612.nocounts_FriApr11_13_16_38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/612.nocounts_FriApr11_13_16_38.png -------------------------------------------------------------------------------- /examples/612.wcfst.txt_FriApr11_13_17_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/612.wcfst.txt_FriApr11_13_17_02.png -------------------------------------------------------------------------------- /examples/headCrest.haps.txt_ThuMay29_15_08_53.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/headCrest.haps.txt_ThuMay29_15_08_53.pdf -------------------------------------------------------------------------------- /examples/headCrest.haps.txt_ThuMay29_15_08_53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/headCrest.haps.txt_ThuMay29_15_08_53.png -------------------------------------------------------------------------------- /examples/phasing-diff-res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/phasing-diff-res.png -------------------------------------------------------------------------------- /examples/phasing-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/phasing-results.png -------------------------------------------------------------------------------- /examples/scaffold612.d-stat.10kb.txt_abba-baba_WedOct29_10_17_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/scaffold612.d-stat.10kb.txt_abba-baba_WedOct29_10_17_10.png -------------------------------------------------------------------------------- /examples/t.hapass.txt_TueJun24_11_52_08.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/t.hapass.txt_TueJun24_11_52_08.png -------------------------------------------------------------------------------- /examples/xp-phased.txt_TueApr29_13_20_47.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/xp-phased.txt_TueApr29_13_20_47.png -------------------------------------------------------------------------------- /examples/xp-unphased.txt_TueApr29_13_20_58.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/xp-unphased.txt_TueApr29_13_20_58.png -------------------------------------------------------------------------------- /examples/xpehh_WedApr23_11_30_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/examples/xpehh_WedApr23_11_30_10.png -------------------------------------------------------------------------------- /logos/websiteLogo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/logos/websiteLogo.pdf -------------------------------------------------------------------------------- /logos/websiteLogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zeeev/vcflib/276af31c9a4e0d5c23b4cbfe6df2a35678791c87/logos/websiteLogo.png -------------------------------------------------------------------------------- /samples/sample.vcf: -------------------------------------------------------------------------------- 1 | ##fileformat=VCFv4.0 2 | ##fileDate=20090805 3 | ##source=myImputationProgramV3.1 4 | ##reference=1000GenomesPilot-NCBI36 5 | ##phasing=partial 6 | ##INFO= 7 | ##INFO= 8 | ##INFO= 9 | ##INFO= 10 | ##INFO= 11 | ##INFO= 12 | ##INFO= 13 | ##INFO= 14 | ##FILTER= 15 | ##FILTER= 16 | ##FORMAT= 17 | ##FORMAT= 18 | ##FORMAT= 19 | ##FORMAT= 
20 | ##ALT= 21 | ##ALT= 22 | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 23 | 19 111 . A C 9.6 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 24 | 19 112 . A G 10 . . GT:HQ 0|0:10,10 0|0:10,10 0/1:3,3 25 | 20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. 26 | 20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3:.,. 27 | 20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4:.,. 28 | 20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:.:56,60 0|0:48:4:51,51 0/0:61:2:.,. 29 | 20 1234567 microsat1 G GA,GAC 50 PASS NS=3;DP=9;AA=G;AN=6;AC=3,1 GT:GQ:DP 0/1:.:4 0/2:17:2 1/1:40:3 30 | 20 1235237 . T . . . . GT 0/0 0|0 ./. 31 | X 10 rsTest AC A,ATG 10 PASS . GT 0 0/1 0|2 32 | -------------------------------------------------------------------------------- /src/BedReader.h: -------------------------------------------------------------------------------- 1 | #ifndef BEDREADER_H 2 | #define BEDREADER_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "intervaltree/IntervalTree.h" 13 | #include "split.h" 14 | 15 | using namespace std; 16 | 17 | string strip(string const& str, char const* separators = " \t") { 18 | string::size_type const first = str.find_first_not_of(separators); 19 | return (first == string::npos) ? 
string() 20 | : str.substr(first, str.find_last_not_of(separators) - first + 1); 21 | } 22 | 23 | void parseRegion( 24 | string& region, 25 | string& startSeq, 26 | int& startPos, 27 | int& stopPos) { 28 | 29 | size_t foundFirstColon = region.find(":"); 30 | 31 | // we only have a single string, use the whole sequence as the target 32 | if (foundFirstColon == string::npos) { 33 | startSeq = region; 34 | startPos = 0; 35 | stopPos = -1; 36 | } else { 37 | startSeq = region.substr(0, foundFirstColon); 38 | string sep = ".."; 39 | size_t foundRangeSep = region.find(sep, foundFirstColon); 40 | if (foundRangeSep == string::npos) { 41 | sep = "-"; 42 | foundRangeSep = region.find("-", foundFirstColon); 43 | } 44 | if (foundRangeSep == string::npos) { 45 | startPos = atoi(region.substr(foundFirstColon + 1).c_str()); 46 | // differ from bamtools in this regard, in that we process only 47 | // the specified position if a range isn't given 48 | stopPos = startPos + 1; 49 | } else { 50 | startPos = atoi(region.substr(foundFirstColon + 1, foundRangeSep - foundFirstColon).c_str()); 51 | // if we have range sep specified, but no second number, read to the end of sequence 52 | if (foundRangeSep + sep.size() != region.size()) { 53 | stopPos = atoi(region.substr(foundRangeSep + sep.size()).c_str()); // end-exclusive, bed-format 54 | } else { 55 | //stopPos = reference.sequenceLength(startSeq); 56 | stopPos = -1; 57 | } 58 | } 59 | } 60 | } 61 | 62 | // stores the posiitional information of a bed target entry 63 | class BedTarget { 64 | 65 | public: 66 | 67 | string seq; // sequence name 68 | int left; // left position 69 | int right; // right position, adjusted to 0-base 70 | string desc; // descriptive information, target name typically 71 | 72 | BedTarget(string s) { 73 | parseRegion(s, seq, left, right); 74 | } 75 | 76 | BedTarget(string s, int l, int r, string d = "") 77 | : seq(s) 78 | , left(l) 79 | , right(r) 80 | , desc(d) 81 | { } 82 | 83 | }; 84 | 85 | 86 | class 
BedReader { 87 | 88 | bool _isOpen; 89 | ifstream file; 90 | 91 | public: 92 | 93 | bool isOpen(void) { return _isOpen; } 94 | 95 | vector targets; 96 | map > intervals; // intervals by reference sequence 97 | 98 | vector entries(void) { 99 | 100 | vector entries; 101 | 102 | if (!isOpen()) { 103 | cerr << "bed targets file is not open" << endl; 104 | exit(1); 105 | } 106 | 107 | string line; 108 | while (std::getline(file, line)) { 109 | vector fields = split(line, " \t"); 110 | BedTarget entry(strip(fields[0]), 111 | atoi(strip(fields[1]).c_str()), 112 | atoi(strip(fields[2]).c_str()), 113 | (fields.size() >= 4) ? strip(fields[3]) : ""); 114 | entries.push_back(entry); 115 | } 116 | 117 | return entries; 118 | 119 | } 120 | 121 | vector targetsContained(BedTarget& target) { 122 | vector > results; 123 | intervals[target.seq].findContained(target.left, target.right, results); 124 | vector contained; 125 | for (vector >::iterator r = results.begin(); r != results.end(); ++r) { 126 | contained.push_back(r->value); 127 | } 128 | return contained; 129 | } 130 | 131 | vector targetsOverlapping(BedTarget& target) { 132 | vector > results; 133 | intervals[target.seq].findOverlapping(target.left, target.right, results); 134 | vector overlapping; 135 | for (vector >::iterator r = results.begin(); r != results.end(); ++r) { 136 | overlapping.push_back(r->value); 137 | } 138 | return overlapping; 139 | } 140 | 141 | BedReader(void) 142 | : _isOpen(false) 143 | { } 144 | 145 | BedReader(string& fname) 146 | : _isOpen(false) { 147 | open(fname); 148 | } 149 | 150 | void addTargets(vector& targets) { 151 | map > > intervalsBySeq; 152 | for (vector::iterator t = targets.begin(); t != targets.end(); ++t) { 153 | intervalsBySeq[t->seq].push_back(Interval(1 + t->left, t->right, &*t)); 154 | } 155 | for (map > >::iterator s = intervalsBySeq.begin(); s != intervalsBySeq.end(); ++s) { 156 | intervals[s->first] = IntervalTree(s->second); 157 | } 158 | } 159 | 160 | void open(const 
string& fname) { 161 | file.open(fname.c_str()); 162 | _isOpen = true; 163 | targets = entries(); 164 | map > > intervalsBySeq; 165 | for (vector::iterator t = targets.begin(); t != targets.end(); ++t) { 166 | intervalsBySeq[t->seq].push_back(Interval(1 + t->left, t->right, &*t)); 167 | } 168 | for (map > >::iterator s = intervalsBySeq.begin(); s != intervalsBySeq.end(); ++s) { 169 | intervals[s->first] = IntervalTree(s->second); 170 | } 171 | } 172 | 173 | }; 174 | 175 | #endif 176 | 177 | -------------------------------------------------------------------------------- /src/convert.h: -------------------------------------------------------------------------------- 1 | #ifndef __CONVERT_H 2 | #define __CONVERT_H 3 | 4 | #include 5 | 6 | // converts the string into the specified type, setting r to the converted 7 | // value and returning true/false on success or failure 8 | template 9 | bool convert(const std::string& s, T& r) { 10 | std::istringstream iss(s); 11 | iss >> r; 12 | return iss.eof() ? 
true : false; 13 | } 14 | 15 | template 16 | std::string convert(const T& r) { 17 | std::ostringstream oss; 18 | oss << r; 19 | return oss.str(); 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /src/dumpContigsFromHeader.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include "var.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace vcf; 12 | 13 | int main(int argc, char** argv) { 14 | 15 | string filename = argv[1]; 16 | 17 | VariantCallFile variantFile; 18 | 19 | variantFile.open(filename); 20 | 21 | vector headerLines = split (variantFile.header, "\n"); 22 | 23 | for(vector::iterator it = headerLines.begin(); it != headerLines.end(); it++){ 24 | 25 | // cerr << "h:" << (*it) << endl; 26 | 27 | if((*it).substr(0,8) == "##contig"){ 28 | string contigInfo = (*it).substr(10, (*it).length() -11); 29 | // cerr << contigInfo << endl; 30 | vector info = split(contigInfo, ","); 31 | for(vector::iterator sub = info.begin(); sub != info.end(); sub++){ 32 | // cerr << "s:" << (*sub) << endl; 33 | vector subfield = split((*sub), "="); 34 | if(subfield[0] == "ID"){ 35 | cout << subfield[1] << "\t"; 36 | } 37 | if(subfield[0] == "length"){ 38 | cout << subfield[1] << endl; 39 | } 40 | } 41 | 42 | } 43 | 44 | 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/gpatInfo.hpp: -------------------------------------------------------------------------------- 1 | #ifndef gpatInfo_H 2 | #define gpatInfo_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | void printVersion(void){ 10 | 11 | std::cerr << "------------------------------------------------------" << std::endl; 12 | std::cerr << " This is a vcflib::GPAT++ tool " << std::endl; 13 | 14 | std::cerr << "-Version : " << VERSION << std::endl; 15 | std::cerr << "-Contact : 
zev.kronenberg [at] gmail.com " << std::endl; 16 | std::cerr << "-Notes : If you find a bug, please open a report on github!" << std::endl; 17 | std::cerr << "-Support : Please post questions to biostars.org " << std::endl; 18 | std::cerr << "-Contributers : " << std::endl; 19 | std::cerr << " Zev Kronenberg (UW Genome Sciences) " << std::endl; 20 | std::cerr << " Mark Yandell (UU Human genetics) " << std::endl; 21 | std::cerr << " Mike Shario (UU Biology) " << std::endl; 22 | std::cerr << " EJ Osborne (UU Human genetics) " << std::endl; 23 | std::cerr << " Brett Kennedy (UU Human genetics) " << std::endl; 24 | std::cerr << " Daniel Ence (UU Human genetics) " << std::endl; 25 | std::cerr << " Erik Garrison (Wellcome Trust Sanger Institute) " << std::endl; 26 | std::cerr << " Travis Collier (UC Davis) " << std::endl; 27 | std::cerr << " - Your name goes here -' " << std::endl; 28 | 29 | std::cerr << "------------------------------------------------------" << std::endl; 30 | } 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /src/join.h: -------------------------------------------------------------------------------- 1 | #ifndef __JOIN_H 2 | #define __JOIN_H 3 | 4 | // functions to split a string by a specific delimiter 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | // join a vector of elements by a delimiter object. ostream<< must be defined 11 | // for both class S and T and an ostream, as it is e.g. 
in the case of strings 12 | // and character arrays 13 | template 14 | std::string join(std::vector& elems, S& delim) { 15 | std::stringstream ss; 16 | typename std::vector::iterator e = elems.begin(); 17 | ss << *e++; 18 | for (; e != elems.end(); ++e) { 19 | ss << delim << *e; 20 | } 21 | return ss.str(); 22 | } 23 | 24 | // same for lists 25 | template 26 | std::string join(std::list& elems, S& delim) { 27 | std::stringstream ss; 28 | typename std::list::iterator e = elems.begin(); 29 | ss << *e++; 30 | for (; e != elems.end(); ++e) { 31 | ss << delim << *e; 32 | } 33 | return ss.str(); 34 | } 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/pdflib.hpp: -------------------------------------------------------------------------------- 1 | double i4_binomial_pdf ( int n, double p, int k ); 2 | int i4_binomial_sample ( int n, double pp ); 3 | double i4vec_multinomial_pdf ( int n, double p[], int m, int x[] ); 4 | int *i4vec_multinomial_sample ( int n, double p[], int ncat ); 5 | double r8_beta_pdf ( double alpha, double beta, double rval ); 6 | double r8_beta_sample ( double aa, double bb ); 7 | double r8_chi_pdf ( double df, double rval ); 8 | double r8_chi_sample ( double df ); 9 | double r8_choose ( int n, int k ); 10 | double r8_epsilon ( void ); 11 | double r8_exponential_pdf ( double beta, double rval ); 12 | double r8_exponential_sample ( double lambda ); 13 | double r8_exponential_01_pdf ( double rval ); 14 | double r8_exponential_01_sample ( ); 15 | double r8_gamma_log ( double x ); 16 | double r8_gamma_pdf ( double beta, double alpha, double rval ); 17 | double r8_gamma_sample ( double a, double r ); 18 | double r8_gamma_01_pdf ( double alpha, double rval ); 19 | double r8_gamma_01_sample ( double a ); 20 | double r8_invchi_pdf ( double df, double rval ); 21 | double r8_invchi_sample ( double df ); 22 | double r8_invgam_pdf ( double beta, double alpha, double rval ); 23 | double r8_invgam_sample ( 
double beta, double alpha ); 24 | double r8_max ( double x, double y ); 25 | double r8_min ( double x, double y ); 26 | double r8_normal_pdf ( double av, double sd, double rval ); 27 | double r8_normal_sample ( double av, double sd ); 28 | double r8_normal_01_pdf ( double rval ); 29 | double r8_normal_01_sample ( ); 30 | double r8_scinvchi_pdf ( double df, double s, double rval ); 31 | double r8_scinvchi_sample ( double df, double s ); 32 | double r8_uniform_pdf ( double lower, double upper, double rval ); 33 | double r8_uniform_sample ( double low, double high ); 34 | double r8_uniform_01_pdf ( double rval ); 35 | double r8_uniform_01_sample ( void ); 36 | double *r8mat_mtv_new ( int m, int n, double a[], double x[] ); 37 | double *r8mat_mv_new ( int m, int n, double a[], double x[] ); 38 | double r8mat_podet ( int n, double r[] ); 39 | double *r8mat_pofac ( int n, double a[] ); 40 | double *r8mat_poinv ( int n, double r[] ); 41 | double *r8mat_upsol ( int n, double r[], double b[] ); 42 | double *r8mat_utsol ( int n, double r[], double b[] ); 43 | double r8vec_dot_product ( int n, double a1[], double a2[] ); 44 | double r8vec_multinormal_pdf ( int n, double mu[], double r[], double c_det, 45 | double x[] ); 46 | double *r8vec_multinormal_sample ( int n, double mu[], double r[] ); 47 | 48 | -------------------------------------------------------------------------------- /src/permuteGPAT++.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | This program was created at: Fri Apr 17 14:59:53 2015 4 | This program was created by: Zev N. 
Kronenberg 5 | 6 | 7 | Contact: zev.kronenber@gmail.com 8 | 9 | Organization: Unviersity of Utah 10 | School of Medicine 11 | Salt Lake City, Utah 12 | 13 | 14 | The MIT License (MIT) 15 | 16 | Copyright (c) <2015> 17 | 18 | Permission is hereby granted, free of charge, to any person obtaining a copy 19 | of this software and associated documentation files (the "Software"), to deal 20 | in the Software without restriction, including without limitation the rights 21 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 22 | copies of the Software, and to permit persons to whom the Software is 23 | furnished to do so, subject to the following conditions: 24 | 25 | The above copyright notice and this permission notice shall be included in 26 | all copies or substantial portions of the Software. 27 | 28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 31 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 32 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 33 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 34 | THE SOFTWARE. 
35 | 36 | 37 | */ 38 | #include 39 | #include "split.h" 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | struct options{ 52 | std::string file; 53 | int npermutation; 54 | int nsuc; 55 | }globalOpts; 56 | 57 | static const char *optString = "f:n:s:"; 58 | 59 | using namespace std; 60 | 61 | 62 | //------------------------------- OPTIONS -------------------------------- 63 | int parseOpts(int argc, char** argv) 64 | { 65 | int opt = 0; 66 | globalOpts.file = "NA"; 67 | 68 | globalOpts.nsuc = 1; 69 | globalOpts.npermutation = 1000; 70 | 71 | opt = getopt(argc, argv, optString); 72 | while(opt != -1){ 73 | switch(opt){ 74 | case 'f': 75 | { 76 | globalOpts.file = optarg; 77 | break; 78 | } 79 | case 'n': 80 | { 81 | globalOpts.npermutation = atoi(((string)optarg).c_str()); 82 | cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl; 83 | break; 84 | } 85 | case 's': 86 | { 87 | globalOpts.nsuc = atoi(((string)optarg).c_str()); 88 | cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl; 89 | break; 90 | } 91 | case '?': 92 | { 93 | break; 94 | } 95 | } 96 | 97 | opt = getopt( argc, argv, optString ); 98 | } 99 | return 1; 100 | } 101 | //------------------------------- SUBROUTINE -------------------------------- 102 | /* 103 | Function input : NA 104 | 105 | Function does : prints help 106 | 107 | Function returns: NA 108 | 109 | */ 110 | void printHelp() 111 | { 112 | 113 | cerr << endl << endl; 114 | cerr << "INFO: help" << endl; 115 | cerr << "INFO: description:" << endl; 116 | cerr << " permuteGPAT++ is a method for adding empirical p-values to a GPAT++ score." << endl ; 117 | cerr << " Currently permuteGPAT++ only supports wcFst, but will be extended. " << endl ; 118 | cerr << endl; 119 | cerr << "OUTPUT: permuteGPAT++ will append three additional columns:" << endl; 120 | cerr << " 1. 
The number of successes " << endl; 121 | cerr << " 2. The number of trials " << endl; 122 | cerr << " 3. The empirical p-value " << endl << endl; 123 | 124 | cerr << "INFO: usage: permuteGPAT++ -f gpat.txt -n 5 -s 1 "<< endl; 125 | cerr << endl; 126 | cerr << "INFO: file: f -- argument: the input file "<< endl; 127 | cerr << "INFO: number: n -- argument: the number of permutations to run for each value [1000]" << endl; 128 | cerr << "INFO: success: s -- argument: stop permutations after \'s\' successes [1]" << endl; 129 | 130 | 131 | cerr << endl; 132 | 133 | } 134 | 135 | //------------------------------- MAIN -------------------------------- 136 | /* 137 | Comments: 138 | */ 139 | 140 | int main( int argc, char** argv) 141 | { 142 | int parse = parseOpts(argc, argv); 143 | 144 | if(globalOpts.file.compare("NA") == 0){ 145 | cerr << "FATAL: no file was provided" << endl; 146 | printHelp(); 147 | exit(1); 148 | } 149 | 150 | 151 | vector data; 152 | 153 | ifstream gpat (globalOpts.file.c_str()); 154 | 155 | string line; 156 | 157 | if(gpat.is_open()){ 158 | 159 | while(getline(gpat, line)){ 160 | vector region = split(line, "\t"); 161 | // will change for other output 162 | double fst = atof(region[4].c_str()); 163 | 164 | if(fst < 0){ 165 | fst = 0; 166 | } 167 | 168 | data.push_back(fst); 169 | } 170 | } 171 | else{ 172 | cerr << "FATAL: coult not open file: " << globalOpts.file << endl; 173 | exit(1); 174 | } 175 | 176 | gpat.clear(); 177 | gpat.seekg(0, gpat.beg); 178 | 179 | cerr << "INFO: read values to permute: " << data.size() << endl; 180 | 181 | srand (time(NULL)); 182 | 183 | if(gpat.is_open()){ 184 | 185 | while(getline(gpat, line)){ 186 | vector region = split(line, "\t"); 187 | 188 | double value = atof(region[4].c_str()); 189 | 190 | if(value < 0){ 191 | value = 0; 192 | } 193 | 194 | 195 | double suc = 0; 196 | double per = 0; 197 | int datas = data.size(); 198 | double pv = (1.0 / globalOpts.npermutation); 199 | 200 | while( suc < globalOpts.nsuc 
&& per < globalOpts.npermutation){ 201 | per += 1.0; 202 | 203 | int r = rand() % datas; 204 | 205 | if(value < data[r]){ 206 | suc += 1; 207 | } 208 | } 209 | if(suc > 0){ 210 | pv = suc / per; 211 | } 212 | cout << line << "\t" << suc << "\t" << per << "\t" << pv << endl; 213 | } 214 | 215 | } 216 | else{ 217 | cerr << "FATAL: coult not open file: " << globalOpts.file << endl; 218 | exit(1); 219 | } 220 | 221 | 222 | return 0; 223 | } 224 | -------------------------------------------------------------------------------- /src/permuteGPATsmoother.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | This program was created at: Fri Apr 17 14:59:53 2015 4 | This program was created by: Zev N. Kronenberg 5 | 6 | 7 | Contact: zev.kronenber@gmail.com 8 | 9 | Organization: Unviersity of Utah 10 | School of Medicine 11 | Salt Lake City, Utah 12 | 13 | 14 | The MIT License (MIT) 15 | 16 | Copyright (c) <2015> 17 | 18 | Permission is hereby granted, free of charge, to any person obtaining a copy 19 | of this software and associated documentation files (the "Software"), to deal 20 | in the Software without restriction, including without limitation the rights 21 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 22 | copies of the Software, and to permit persons to whom the Software is 23 | furnished to do so, subject to the following conditions: 24 | 25 | The above copyright notice and this permission notice shall be included in 26 | all copies or substantial portions of the Software. 27 | 28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 31 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 32 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 33 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 34 | THE SOFTWARE. 35 | 36 | 37 | */ 38 | #include 39 | #include "split.h" 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | struct options{ 52 | std::string file; 53 | int npermutation; 54 | int nsuc; 55 | }globalOpts; 56 | 57 | static const char *optString = "f:n:s:"; 58 | 59 | using namespace std; 60 | 61 | 62 | //------------------------------- OPTIONS -------------------------------- 63 | int parseOpts(int argc, char** argv) 64 | { 65 | int opt = 0; 66 | globalOpts.file = "NA"; 67 | 68 | globalOpts.nsuc = 1; 69 | globalOpts.npermutation = 1000; 70 | 71 | opt = getopt(argc, argv, optString); 72 | while(opt != -1){ 73 | switch(opt){ 74 | case 'f': 75 | { 76 | globalOpts.file = optarg; 77 | break; 78 | } 79 | case 'n': 80 | { 81 | globalOpts.npermutation = atoi(((string)optarg).c_str()); 82 | cerr << "INFO: permuteGPAT++ will do N permutations: " << globalOpts.npermutation << endl; 83 | break; 84 | } 85 | case 's': 86 | { 87 | globalOpts.nsuc = atoi(((string)optarg).c_str()); 88 | cerr << "INFO: permuteGPAT++ will stop permutations after N successes: " << globalOpts.nsuc << endl; 89 | break; 90 | } 91 | case '?': 92 | { 93 | break; 94 | } 95 | } 96 | 97 | opt = getopt( argc, argv, optString ); 98 | } 99 | return 1; 100 | } 101 | //------------------------------- SUBROUTINE -------------------------------- 102 | /* 103 | Function input : NA 104 | 105 | Function does : prints help 106 | 107 | Function returns: NA 108 | 109 | */ 110 | void printHelp() 111 | { 112 | 113 | cerr << endl << endl; 114 | cerr << "INFO: help" << endl; 115 | cerr << "INFO: description:" << endl; 116 | cerr << " permuteGPAT++ is a method 
for adding empirical p-values to a GPAT++ score." << endl ; 117 | cerr << " Currently permuteGPAT++ only supports wcFst, but will be extended. " << endl ; 118 | cerr << endl; 119 | cerr << "OUTPUT: permuteGPAT++ will append three additional columns:" << endl; 120 | cerr << " 1. The number of successes " << endl; 121 | cerr << " 2. The number of trials " << endl; 122 | cerr << " 3. The empirical p-value " << endl << endl; 123 | 124 | cerr << "INFO: usage: permuteGPAT++ -f gpat.txt -n 5 -s 1 "<< endl; 125 | cerr << endl; 126 | cerr << "INFO: file: f -- argument: the input file "<< endl; 127 | cerr << "INFO: number: n -- argument: the number of permutations to run for each value [1000]" << endl; 128 | cerr << "INFO: success: s -- argument: stop permutations after \'s\' successes [1]" << endl; 129 | 130 | 131 | cerr << endl; 132 | 133 | } 134 | 135 | //------------------------------- MAIN -------------------------------- 136 | /* 137 | Comments: 138 | */ 139 | 140 | int main( int argc, char** argv) 141 | { 142 | int parse = parseOpts(argc, argv); 143 | 144 | if(globalOpts.file.compare("NA") == 0){ 145 | cerr << "FATAL: no file was provided" << endl; 146 | printHelp(); 147 | exit(1); 148 | } 149 | 150 | 151 | vector data; 152 | 153 | ifstream gpat (globalOpts.file.c_str()); 154 | 155 | string line; 156 | 157 | if(gpat.is_open()){ 158 | 159 | while(getline(gpat, line)){ 160 | vector region = split(line, "\t"); 161 | // will change for other output 162 | double fst = atof(region[4].c_str()); 163 | 164 | if(fst < 0){ 165 | fst = 0; 166 | } 167 | 168 | data.push_back(fst); 169 | } 170 | } 171 | else{ 172 | cerr << "FATAL: coult not open file: " << globalOpts.file << endl; 173 | exit(1); 174 | } 175 | 176 | gpat.clear(); 177 | gpat.seekg(0, gpat.beg); 178 | 179 | cerr << "INFO: read values to permute: " << data.size() << endl; 180 | 181 | srand (time(NULL)); 182 | 183 | if(gpat.is_open()){ 184 | 185 | while(getline(gpat, line)){ 186 | vector region = split(line, "\t"); 187 | 
188 | double value = atof(region[4].c_str()); 189 | 190 | if(value < 0){ 191 | value = 0; 192 | } 193 | 194 | 195 | double suc = 0; 196 | double per = 0; 197 | int datas = data.size(); 198 | double pv = (1.0 / globalOpts.npermutation); 199 | 200 | while( suc < globalOpts.nsuc && per < globalOpts.npermutation){ 201 | per += 1.0; 202 | 203 | int r = rand() % datas; 204 | 205 | if(value < data[r]){ 206 | suc += 1; 207 | } 208 | } 209 | if(suc > 0){ 210 | pv = suc / per; 211 | } 212 | cout << line << "\t" << suc << "\t" << per << "\t" << pv << endl; 213 | } 214 | 215 | } 216 | else{ 217 | cerr << "FATAL: coult not open file: " << globalOpts.file << endl; 218 | exit(1); 219 | } 220 | 221 | 222 | return 0; 223 | } 224 | -------------------------------------------------------------------------------- /src/rnglib.hpp: -------------------------------------------------------------------------------- 1 | void advance_state ( int k ); 2 | bool antithetic_get ( ); 3 | void antithetic_memory ( int i, bool &value ); 4 | void antithetic_set ( bool value ); 5 | void cg_get ( int g, int &cg1, int &cg2 ); 6 | void cg_memory ( int i, int g, int &cg1, int &cg2 ); 7 | void cg_set ( int g, int cg1, int cg2 ); 8 | int cgn_get ( ); 9 | void cgn_memory ( int i, int &g ); 10 | void cgn_set ( int g ); 11 | void get_state ( int &cg1, int &cg2 ); 12 | int i4_uni ( ); 13 | void ig_get ( int g, int &ig1, int &ig2 ); 14 | void ig_memory ( int i, int g, int &ig1, int &ig2 ); 15 | void ig_set ( int g, int ig1, int ig2 ); 16 | void init_generator ( int t ); 17 | void initialize ( ); 18 | bool initialized_get ( ); 19 | void initialized_memory ( int i, bool &initialized ); 20 | void initialized_set ( ); 21 | void lg_get ( int g, int &lg1, int &lg2 ); 22 | void lg_memory ( int i, int g, int &lg1, int &lg2 ); 23 | void lg_set ( int g, int lg1, int lg2 ); 24 | int multmod ( int a, int s, int m ); 25 | float r4_uni_01 ( ); 26 | double r8_uni_01 ( ); 27 | void set_initial_seed ( int ig1, int ig2 ); 28 | 
// from Marius, http://stackoverflow.com/a/1493195/238609
//
// Splits `str` at every character that appears in `delimiters` and appends
// the pieces to `tokens` (any container with push_back whose value_type can be
// built from a pointer and a length).
//
// Adjacent delimiters, and a delimiter at either end, produce empty tokens.
// These are kept unless trimEmpty is true. An empty `str` therefore yields one
// empty token when trimEmpty is false and nothing when it is true.
template < class ContainerT >
void tokenize(const std::string& str, ContainerT& tokens,
              const std::string& delimiters = " ", const bool trimEmpty = false)
{
    typedef typename ContainerT::value_type Token;

    std::string::size_type lastPos = 0;
    while (true)
    {
        std::string::size_type pos = str.find_first_of(delimiters, lastPos);

        // no further delimiter: the final token runs to the end of the string
        const bool atEnd = (pos == std::string::npos);
        if (atEnd) {
            pos = str.length();
        }

        if (pos != lastPos || !trimEmpty) {
            tokens.push_back(Token(str.data() + lastPos, pos - lastPos));
        }

        if (atEnd) {
            break;
        }

        lastPos = pos + 1;
    }
}

// thanks to Evan Teran, http://stackoverflow.com/questions/236129/how-to-split-a-string/236803#236803

// split a string on a single delimiter character (delim), appending to elems
std::vector<std::string>& split(const std::string &s, char delim, std::vector<std::string> &elems) {
    tokenize(s, elems, std::string(1, delim));
    return elems;
}

// split a string on a single delimiter character (delim)
std::vector<std::string> split(const std::string &s, char delim) {
    std::vector<std::string> elems;
    split(s, delim, elems);
    return elems;
}

// split a string on any character found in the string of delimiters (delims),
// appending to elems
std::vector<std::string>& split(const std::string &s, const std::string& delims, std::vector<std::string> &elems) {
    tokenize(s, elems, delims);
    return elems;
}

// split a string on any character found in the string of delimiters (delims)
std::vector<std::string> split(const std::string &s, const std::string& delims) {
    std::vector<std::string> elems;
    split(s, delims, elems);
    return elems;
}
2 | 3 | #ifndef __VAR_H 4 | #define __VAR_H 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "split.h" 14 | 15 | using namespace std; 16 | 17 | class zvar{ 18 | public: 19 | 20 | string name; 21 | 22 | int npop; 23 | 24 | string seqid; 25 | long int pos; 26 | 27 | double nalt ; 28 | double nref ; 29 | double af ; 30 | 31 | double alpha; 32 | double beta ; 33 | 34 | virtual void loadPop(vector< map< string, vector > >& group, string seqid, long int position) = 0; 35 | virtual void estimatePosterior() = 0 ; 36 | virtual ~zvar() = 0; 37 | void setPopName(string popName); 38 | 39 | }; 40 | 41 | class genotype : public zvar { 42 | 43 | public: 44 | 45 | double nhomr; 46 | double nhoma; 47 | double nhet ; 48 | double ngeno; 49 | double fis ; 50 | double hfrq ; 51 | 52 | vector genoIndex; 53 | vector gts ; 54 | vector< vector < double > > genoLikelihoods; 55 | vector< vector < double > > genoLikelihoodsCDF; 56 | 57 | virtual double unphred(map< string, vector > & geno, int index) = 0; 58 | virtual void loadPop(vector< map< string, vector > >& group, string seqid, long int position); 59 | virtual ~genotype() = 0; 60 | void estimatePosterior(); 61 | 62 | 63 | }; 64 | 65 | class pooled : public zvar{ 66 | public: 67 | 68 | double ntot ; 69 | double afsum ; 70 | 71 | vector nalts; 72 | vector nrefs; 73 | vector afs ; 74 | 75 | void loadPop(vector< map< string, vector > >& group, string seqid, long int position); 76 | void estimatePosterior(); 77 | 78 | ~pooled(); 79 | 80 | double bound(double v); 81 | 82 | pooled(void); 83 | 84 | }; 85 | 86 | class gt : public genotype{ 87 | public: 88 | gt(void); 89 | double unphred(map< string, vector > & geno, int index); 90 | ~gt(); 91 | }; 92 | 93 | class gl : public genotype{ 94 | public: 95 | gl(void); 96 | double unphred(map< string, vector > & geno, int index); 97 | ~gl(); 98 | }; 99 | 100 | class gp : public genotype{ 101 | public: 102 | gp(void); 103 | double 
unphred(map< string, vector > & geno, int index); 104 | ~gp(); 105 | }; 106 | 107 | 108 | class pl : public genotype{ 109 | public: 110 | pl(void); 111 | double unphred(map< string, vector > & geno, int index); 112 | ~pl(); 113 | }; 114 | 115 | #endif 116 | -------------------------------------------------------------------------------- /src/vcfaddinfo.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | // adds non-overlapping info fields from varB to varA 11 | void addInfo(Variant& varA, Variant& varB) { 12 | for (map >::iterator i = varB.info.begin(); i != varB.info.end(); ++i) { 13 | if (varA.info.find(i->first) == varA.info.end()) { 14 | varA.info[i->first] = i->second; 15 | } 16 | } 17 | } 18 | 19 | int main(int argc, char** argv) { 20 | 21 | if (argc != 3) { 22 | cerr << "usage: " << argv[0] << " " << endl 23 | << "Adds info fields from the second file which are not present in the first vcf file." << endl; 24 | return 1; 25 | } 26 | 27 | string filenameA = argv[1]; 28 | string filenameB = argv[2]; 29 | 30 | if (filenameA == filenameB) { 31 | cerr << "it won't help to add info data from the same file!" 
<< endl; 32 | return 1; 33 | } 34 | 35 | VariantCallFile variantFileA; 36 | if (filenameA == "-") { 37 | variantFileA.open(std::cin); 38 | } else { 39 | variantFileA.open(filenameA); 40 | } 41 | 42 | VariantCallFile variantFileB; 43 | if (filenameB == "-") { 44 | variantFileB.open(std::cin); 45 | } else { 46 | variantFileB.open(filenameB); 47 | } 48 | 49 | if (!variantFileA.is_open() || !variantFileB.is_open()) { 50 | return 1; 51 | } 52 | 53 | Variant varA(variantFileA); 54 | Variant varB(variantFileB); 55 | 56 | // while the first file doesn't match the second positionally, 57 | // step forward, annotating each genotype record with an empty genotype 58 | // when the two match, iterate through the genotypes from the first file 59 | // and get the genotypes reported in the second file 60 | 61 | variantFileA.getNextVariant(varA); 62 | variantFileB.getNextVariant(varB); 63 | 64 | variantFileA.header = unionInfoHeaderLines(variantFileA.header, variantFileB.header); 65 | 66 | cout << variantFileA.header << endl; 67 | 68 | do { 69 | 70 | while (!variantFileB.done() 71 | && (varB.sequenceName < varA.sequenceName 72 | || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)) 73 | ) { 74 | variantFileB.getNextVariant(varB); 75 | } 76 | 77 | while (!variantFileA.done() 78 | && (varA.sequenceName < varB.sequenceName 79 | || (varA.sequenceName == varB.sequenceName && varA.position < varB.position)) 80 | ) { 81 | cout << varA << endl; 82 | variantFileA.getNextVariant(varA); 83 | } 84 | 85 | while (!variantFileB.done() 86 | && (varB.sequenceName < varA.sequenceName 87 | || (varB.sequenceName == varA.sequenceName && varB.position < varA.position)) 88 | ) { 89 | variantFileB.getNextVariant(varB); 90 | } 91 | 92 | while (!variantFileA.done() && varA.sequenceName == varB.sequenceName && varA.position == varB.position) { 93 | addInfo(varA, varB); 94 | cout << varA << endl; 95 | variantFileA.getNextVariant(varA); 96 | variantFileB.getNextVariant(varB); 97 | } 98 | 
99 | } while (!variantFileA.done() && !variantFileB.done()); 100 | 101 | if (!variantFileA.done()) { 102 | cout << varA << endl; 103 | while (variantFileA.getNextVariant(varA)) { 104 | cout << varA << endl; 105 | } 106 | } 107 | 108 | return 0; 109 | 110 | } 111 | 112 | -------------------------------------------------------------------------------- /src/vcfafpath.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | int main(int argc, char** argv) { 10 | 11 | VariantCallFile variantFile; 12 | 13 | if (argc > 1) { 14 | string filename = argv[1]; 15 | variantFile.open(filename); 16 | } else { 17 | variantFile.open(std::cin); 18 | } 19 | 20 | if (!variantFile.is_open()) { 21 | return 1; 22 | } 23 | 24 | //cout << variantFile.header << endl; 25 | 26 | Variant var(variantFile); 27 | while (variantFile.getNextVariant(var)) { 28 | //cout << var << endl; 29 | double afref = 1; 30 | map > allelesByAf; 31 | vector afd; 32 | vector& afstr = var.info["AF"]; 33 | for (vector::iterator af = afstr.begin(); af != afstr.end(); ++af) { 34 | double r; convert(*af, r); 35 | afd.push_back(r); 36 | } 37 | vector::iterator af = afd.begin(); 38 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a, ++af) { 39 | afref -= *af; 40 | allelesByAf[*af].push_back(*a); 41 | } 42 | cout << var.ref; 43 | for (map >::reverse_iterator a = allelesByAf.rbegin(); a != allelesByAf.rend(); ++a) { 44 | cout << " -> " << join(a->second, ", "); 45 | } 46 | cout << endl; 47 | } 48 | 49 | return 0; 50 | 51 | } 52 | 53 | -------------------------------------------------------------------------------- /src/vcfaltcount.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | 10 | 
int main(int argc, char** argv) { 11 | 12 | if (argc != 2) { 13 | cerr << "usage: " << argv[0] << " " << endl 14 | << "count the number of alternate alleles in all records in the vcf file" << endl; 15 | return 1; 16 | } 17 | 18 | string filename = argv[1]; 19 | 20 | VariantCallFile variantFile; 21 | variantFile.open(filename); 22 | if (!variantFile.is_open()) { 23 | return 1; 24 | } 25 | 26 | unsigned int alternateAlleleCount = 0; 27 | 28 | Variant var(variantFile); 29 | while (variantFile.getNextVariant(var)) { 30 | //cout << var << endl; 31 | for (map > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { 32 | //string& name = s->first; 33 | map >& sample = s->second; 34 | string& genotype = sample["GT"].front(); 35 | vector gt = split(genotype, "|/"); 36 | int alt = 0; 37 | for (vector::iterator g = gt.begin(); g != gt.end(); ++g) { 38 | if (*g != "0") 39 | ++alt; 40 | } 41 | alternateAlleleCount += alt; 42 | } 43 | } 44 | 45 | cout << alternateAlleleCount << endl; 46 | 47 | return 0; 48 | 49 | } 50 | 51 | -------------------------------------------------------------------------------- /src/vcfannotate.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "BedReader.h" 3 | #include 4 | 5 | using namespace std; 6 | using namespace vcf; 7 | 8 | 9 | void printSummary(char** argv) { 10 | cerr << "usage: " << argv[0] << " [options] []" << endl 11 | << endl 12 | << "options:" << endl 13 | << " -b, --bed use annotations provided by this BED file" << endl 14 | << " -k, --key use this INFO field key for the annotations" << endl 15 | << " -d, --default use this INFO field key for records without annotations" << endl 16 | << endl 17 | << "Intersect the records in the VCF file with targets provided in a BED file." << endl 18 | << "Intersections are done on the reference sequences in the VCF file." 
<< endl 19 | << "If no VCF filename is specified on the command line (last argument) the VCF" << endl 20 | << "read from stdin." << endl; 21 | exit(0); 22 | } 23 | 24 | int main(int argc, char** argv) { 25 | 26 | string bedFileName; 27 | string annotationInfoKey; 28 | string defaultAnnotationValue; 29 | 30 | if (argc == 1) 31 | printSummary(argv); 32 | 33 | int c; 34 | while (true) { 35 | static struct option long_options[] = 36 | { 37 | /* These options set a flag. */ 38 | //{"verbose", no_argument, &verbose_flag, 1}, 39 | {"help", no_argument, 0, 'h'}, 40 | {"bed", required_argument, 0, 'b'}, 41 | {"key", required_argument, 0, 'k'}, 42 | {"default", required_argument, 0, 'd'}, 43 | {0, 0, 0, 0} 44 | }; 45 | /* getopt_long stores the option index here. */ 46 | int option_index = 0; 47 | 48 | c = getopt_long (argc, argv, "hb:k:d:", 49 | long_options, &option_index); 50 | 51 | if (c == -1) 52 | break; 53 | 54 | switch (c) { 55 | case 'b': 56 | bedFileName = string(optarg); 57 | break; 58 | 59 | case 'k': 60 | annotationInfoKey = string(optarg); 61 | break; 62 | 63 | case 'd': 64 | defaultAnnotationValue = string(optarg); 65 | break; 66 | 67 | case 'h': 68 | printSummary(argv); 69 | break; 70 | 71 | case '?': 72 | printSummary(argv); 73 | exit(1); 74 | break; 75 | 76 | default: 77 | abort (); 78 | } 79 | } 80 | 81 | if (bedFileName.empty()) { 82 | cerr << "a BED file is required when intersecting" << endl; 83 | exit(1); 84 | } 85 | 86 | BedReader bed(bedFileName); 87 | 88 | VariantCallFile variantFile; 89 | string inputFilename; 90 | if (optind == argc - 1) { 91 | inputFilename = argv[optind]; 92 | variantFile.open(inputFilename); 93 | } else { 94 | variantFile.open(std::cin); 95 | } 96 | 97 | if (!variantFile.is_open()) { 98 | cout << "could not open VCF file" << endl; 99 | return 1; 100 | } 101 | 102 | string line = "##INFO="; 104 | variantFile.addHeaderLine(line); 105 | 106 | cout << variantFile.header << endl; 107 | 108 | Variant var(variantFile); 109 | while 
(variantFile.getNextVariant(var)) { 110 | BedTarget record(var.sequenceName, var.position, var.position + var.ref.size() - 1, ""); 111 | vector overlaps = bed.targetsOverlapping(record); 112 | vector annotations; 113 | if (!overlaps.empty()) { 114 | for (vector::iterator t = overlaps.begin(); t != overlaps.end(); ++t) { 115 | annotations.push_back((*t)->desc); 116 | } 117 | var.info[annotationInfoKey].push_back(join(annotations, ":")); 118 | } else if (!defaultAnnotationValue.empty()) { 119 | var.info[annotationInfoKey].push_back(defaultAnnotationValue); 120 | } 121 | cout << var << endl; 122 | } 123 | 124 | return 0; 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/vcfbreakmulti.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "convert.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | 10 | double convertStrDbl(const string& s) { 11 | double r; 12 | convert(s, r); 13 | return r; 14 | } 15 | 16 | void printSummary(char** argv) { 17 | cerr << "usage: " << argv[0] << " [options] [file]" << endl 18 | << endl 19 | << "If multiple alleles are specified in a single record, break the record into" << endl 20 | << "multiple lines, preserving allele-specific INFO fields." << endl; 21 | exit(0); 22 | } 23 | 24 | int main(int argc, char** argv) { 25 | 26 | bool includePreviousBaseForIndels = true; 27 | bool useMNPs = false; 28 | 29 | VariantCallFile variantFile; 30 | 31 | int c; 32 | while (true) { 33 | static struct option long_options[] = 34 | { 35 | /* These options set a flag. */ 36 | //{"verbose", no_argument, &verbose_flag, 1}, 37 | {"help", no_argument, 0, 'h'}, 38 | {0, 0, 0, 0} 39 | }; 40 | /* getopt_long stores the option index here. 
*/ 41 | int option_index = 0; 42 | 43 | c = getopt_long (argc, argv, "h", 44 | long_options, &option_index); 45 | 46 | if (c == -1) 47 | break; 48 | 49 | switch (c) { 50 | 51 | case 'h': 52 | printSummary(argv); 53 | break; 54 | 55 | case '?': 56 | printSummary(argv); 57 | exit(1); 58 | break; 59 | 60 | default: 61 | abort (); 62 | } 63 | } 64 | 65 | if (optind < argc) { 66 | string filename = argv[optind]; 67 | variantFile.open(filename); 68 | } else { 69 | variantFile.open(std::cin); 70 | } 71 | 72 | if (!variantFile.is_open()) { 73 | return 1; 74 | } 75 | 76 | cout << variantFile.header << endl; 77 | 78 | Variant var(variantFile); 79 | while (variantFile.getNextVariant(var)) { 80 | 81 | int numalt = var.alt.size(); 82 | 83 | if (numalt == 1) { 84 | cout << var << endl; 85 | continue; 86 | } 87 | 88 | vector variants; 89 | for (int i = 0; i < numalt; ++i) { 90 | variants.push_back(var); 91 | } 92 | 93 | for (int i = 0; i < numalt; ++i) { 94 | Variant& v = variants.at(i); 95 | vector altsToRemove; 96 | for (int j = 0; j < numalt; ++j) { 97 | if (j != i) { 98 | altsToRemove.push_back(var.alt.at(j)); 99 | } 100 | } 101 | for (vector::iterator a = altsToRemove.begin(); a != altsToRemove.end(); ++a) { 102 | v.removeAlt(*a); 103 | } 104 | } 105 | 106 | for (vector::iterator v = variants.begin(); v != variants.end(); ++v) { 107 | cout << *v << endl; 108 | } 109 | } 110 | 111 | return 0; 112 | 113 | } 114 | 115 | -------------------------------------------------------------------------------- /src/vcfcat.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | 3 | using namespace std; 4 | using namespace vcf; 5 | 6 | int main(int argc, char** argv) { 7 | 8 | if (argc == 1) { 9 | cout << "usage: " << argv[0] << " [file1] [file2] ... [fileN]" << endl 10 | << "Concatenates VCF files." 
<< endl; 11 | return 0; 12 | } else { 13 | for (int i = 1; i < argc; ++i) { 14 | VariantCallFile variantFile; 15 | string filename = argv[i]; 16 | variantFile.open(filename); 17 | if (!variantFile.is_open()) { 18 | cerr << "could not open " << argv[i] << endl; 19 | return 1; 20 | } 21 | if (i == 1) { 22 | cout << variantFile.header << endl; 23 | } 24 | Variant var(variantFile); 25 | while (variantFile.getNextVariant(var)) { 26 | cout << var << endl; 27 | } 28 | } 29 | } 30 | 31 | return 0; 32 | 33 | } 34 | 35 | -------------------------------------------------------------------------------- /src/vcfcheck.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include "fastahack/Fasta.h" 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | void printSummary(char** argv) { 10 | cerr << "usage: " << argv[0] << " [options] " << endl 11 | << endl 12 | << "options:" << endl 13 | << " -f, --fasta-reference FASTA reference file to use to obtain primer sequences" << endl 14 | << " -x, --exclude-failures If a record fails, don't print it. Otherwise do." << endl 15 | << " -k, --keep-failures Print if the record fails, otherwise not." << endl 16 | << endl 17 | << "Verifies that the VCF REF field matches the reference as described." << endl 18 | << endl; 19 | exit(0); 20 | } 21 | 22 | 23 | int main(int argc, char** argv) { 24 | 25 | int c; 26 | string fastaRef; 27 | bool keepFailures = false; 28 | bool excludeFailures = false; 29 | 30 | if (argc == 1) 31 | printSummary(argv); 32 | 33 | while (true) { 34 | static struct option long_options[] = 35 | { 36 | /* These options set a flag. 
*/ 37 | //{"verbose", no_argument, &verbose_flag, 1}, 38 | {"help", no_argument, 0, 'h'}, 39 | {"fasta-reference", required_argument, 0, 'f'}, 40 | {"exclude-failures", no_argument, 0, 'x'}, 41 | {"keep-failures", no_argument, 0, 'k'}, 42 | //{"length", no_argument, &printLength, true}, 43 | {0, 0, 0, 0} 44 | }; 45 | /* getopt_long stores the option index here. */ 46 | int option_index = 0; 47 | 48 | c = getopt_long (argc, argv, "hxkf:", 49 | long_options, &option_index); 50 | 51 | /* Detect the end of the options. */ 52 | if (c == -1) 53 | break; 54 | 55 | switch (c) 56 | { 57 | case 0: 58 | /* If this option set a flag, do nothing else now. */ 59 | if (long_options[option_index].flag != 0) 60 | break; 61 | printf ("option %s", long_options[option_index].name); 62 | if (optarg) 63 | printf (" with arg %s", optarg); 64 | printf ("\n"); 65 | break; 66 | 67 | case 'f': 68 | fastaRef = optarg; 69 | break; 70 | 71 | case 'x': 72 | excludeFailures = true; 73 | break; 74 | 75 | case 'k': 76 | keepFailures = true; 77 | break; 78 | 79 | case 'h': 80 | printSummary(argv); 81 | exit(0); 82 | break; 83 | 84 | case '?': 85 | /* getopt_long already printed an error message. 
*/ 86 | printSummary(argv); 87 | exit(1); 88 | break; 89 | 90 | default: 91 | abort (); 92 | } 93 | } 94 | 95 | if (fastaRef.empty()) { 96 | cerr << "a FASTA reference sequence must be specified" << endl; 97 | exit(1); 98 | } 99 | 100 | FastaReference ref; 101 | ref.open(fastaRef); 102 | 103 | VariantCallFile variantFile; 104 | string inputFilename; 105 | if (optind == argc - 1) { 106 | inputFilename = argv[optind]; 107 | variantFile.open(inputFilename); 108 | } else { 109 | variantFile.open(std::cin); 110 | } 111 | 112 | if (!variantFile.is_open()) { 113 | return 1; 114 | } 115 | 116 | if (keepFailures || excludeFailures) { 117 | cout << variantFile.header << endl; 118 | } 119 | 120 | Variant var(variantFile); 121 | while (variantFile.getNextVariant(var)) { 122 | int refstart = var.position - 1; // convert to 0-based 123 | string matchedRef = ref.getSubSequence(var.sequenceName, refstart, var.ref.size()); 124 | if (var.ref != matchedRef) { 125 | if (keepFailures) { 126 | cout << var << endl; 127 | } else if (!excludeFailures) { 128 | cout << "mismatched reference " << var.ref << " should be " << matchedRef << " at " 129 | << var.sequenceName << ":" << var.position << endl; 130 | } 131 | } else if (excludeFailures) { 132 | cout << var << endl; 133 | } 134 | } 135 | 136 | return 0; 137 | 138 | } 139 | 140 | -------------------------------------------------------------------------------- /src/vcfclassify.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | bool isTransition(string& ref, string& alt) { 11 | if (((ref == "A" && alt == "G") || (ref == "G" && alt == "A")) || 12 | ((ref == "C" && alt == "T") || (ref == "T" && alt == "C"))) { 13 | return true; 14 | } else { 15 | return false; 16 | } 17 | } 18 | 19 | bool hasTransition(Variant& var) { 20 | string& ref = var.ref; 21 | for 
(vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 22 | string& alt = *a; 23 | if (isTransition(ref, alt)) { 24 | return true; 25 | } 26 | } 27 | return false; 28 | } 29 | 30 | bool hasTransversion(Variant& var) { 31 | string& ref = var.ref; 32 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 33 | string& alt = *a; 34 | if (!isTransition(ref, alt)) { 35 | return true; 36 | } 37 | } 38 | return false; 39 | } 40 | 41 | bool hasInsertion(Variant& var) { 42 | string& ref = var.ref; 43 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 44 | string& alt = *a; 45 | if (ref.size() < alt.size()) { 46 | return true; 47 | } 48 | } 49 | return false; 50 | } 51 | 52 | bool hasDeletion(Variant& var) { 53 | string& ref = var.ref; 54 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 55 | string& alt = *a; 56 | if (ref.size() > alt.size()) { 57 | return true; 58 | } 59 | } 60 | return false; 61 | } 62 | 63 | bool hasMNP(Variant& var) { 64 | string& ref = var.ref; 65 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 66 | string& alt = *a; 67 | if (ref.size() > 1 && alt.size() == ref.size()) { 68 | return true; 69 | } 70 | } 71 | return false; 72 | } 73 | 74 | bool hasSNP(Variant& var) { 75 | string& ref = var.ref; 76 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 77 | string& alt = *a; 78 | if (ref.size() == 1 && alt.size() == 1) { 79 | return true; 80 | } 81 | } 82 | return false; 83 | } 84 | 85 | int main(int argc, char** argv) { 86 | 87 | if (argc != 2) { 88 | cerr << "usage: " << argv[0] << " " << endl 89 | << "outputs a VCF stream each variant is tagged by allele class: snp, ts/tv, indel, mnp" << endl; 90 | return 1; 91 | } 92 | 93 | string filename = argv[1]; 94 | 95 | VariantCallFile variantFile; 96 | if (filename == "-") { 97 | variantFile.open(std::cin); 98 | } else { 99 | variantFile.open(filename); 100 | } 101 | 102 | if (!variantFile.is_open()) 
{ 103 | cerr << "could not open " << filename << endl; 104 | return 1; 105 | } 106 | 107 | Variant var(variantFile); 108 | 109 | string line; 110 | line = "##INFO="; 111 | variantFile.addHeaderLine(line); 112 | line = "##INFO="; 113 | variantFile.addHeaderLine(line); 114 | line = "##INFO="; 115 | variantFile.addHeaderLine(line); 116 | line = "##INFO="; 117 | variantFile.addHeaderLine(line); 118 | line = "##INFO="; 119 | variantFile.addHeaderLine(line); 120 | line = "##INFO="; 121 | variantFile.addHeaderLine(line); 122 | // TODO handle lengths at poly-allelic sites 123 | //line = "##INFO="; 124 | //variantFile.addHeaderLine(line); 125 | 126 | // write the new header 127 | cout << variantFile.header << endl; 128 | 129 | 130 | while (variantFile.getNextVariant(var)) { 131 | 132 | if (hasSNP(var)) { 133 | var.infoFlags["SNP"] = true; 134 | } 135 | 136 | if (hasTransition(var)) { 137 | var.infoFlags["TS"] = true; 138 | } 139 | 140 | if (hasTransversion(var)) { 141 | var.infoFlags["TV"] = true; 142 | } 143 | 144 | if (hasInsertion(var)) { 145 | var.infoFlags["INS"] = true; 146 | } 147 | 148 | if (hasDeletion(var)) { 149 | var.infoFlags["DEL"] = true; 150 | } 151 | 152 | if (hasMNP(var)) { 153 | var.infoFlags["MNP"] = true; 154 | } 155 | 156 | cout << var << endl; 157 | } 158 | 159 | return 0; 160 | 161 | } 162 | 163 | -------------------------------------------------------------------------------- /src/vcfcleancomplex.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | 11 | int main(int argc, char** argv) { 12 | 13 | if (argc != 2) { 14 | cerr << "usage: " << argv[0] << " " << endl 15 | << "outputs a VCF stream in which 'long' non-complex" 16 | << "alleles have their position corrected." 
<< endl 17 | << "assumes that VCF records can't overlap 5'->3'" << endl; 18 | return 1; 19 | } 20 | 21 | string filename = argv[1]; 22 | 23 | VariantCallFile variantFile; 24 | if (filename == "-") { 25 | variantFile.open(std::cin); 26 | } else { 27 | variantFile.open(filename); 28 | } 29 | 30 | if (!variantFile.is_open()) { 31 | cerr << "could not open " << filename << endl; 32 | return 1; 33 | } 34 | 35 | Variant var(variantFile); 36 | 37 | // write the new header 38 | cout << variantFile.header << endl; 39 | 40 | // print the records, filtering is done via the setting of varA's output sample names 41 | while (variantFile.getNextVariant(var)) { 42 | // if we just have one parsed alternate (non-complex case) 43 | map > parsedAlts = var.parsedAlternates(true, true); // use mnps, and previous for indels 44 | // but the alt string is long 45 | //cerr << var.alt.size() << " " << parsedAlts.size() << endl; 46 | if (var.alt.size() == 1 && parsedAlts.size() > 1) { 47 | string& alternate = var.alt.front(); 48 | vector& vs = parsedAlts[alternate]; 49 | vector valleles; 50 | for (vector::iterator a = vs.begin(); a != vs.end(); ++a) { 51 | if (a->ref != a->alt) { 52 | valleles.push_back(*a); 53 | } 54 | } 55 | if (valleles.size() == 1) { 56 | // do we have extra sequence hanging around? 
57 | VariantAllele& varallele = valleles.front(); 58 | if (vs.front().ref == vs.front().alt) { 59 | var.position = varallele.position; 60 | var.ref = var.ref.substr(vs.front().ref.size(), varallele.ref.size()); 61 | var.alt.front() = varallele.alt; 62 | } 63 | } 64 | } 65 | cout << var << endl; 66 | } 67 | 68 | return 0; 69 | 70 | } 71 | 72 | -------------------------------------------------------------------------------- /src/vcfcommonsamples.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | template 10 | vector intersection(vector& a, vector& b) { 11 | map inA; 12 | map inAB; 13 | for (typename vector::iterator i = a.begin(); i != a.end(); ++i) { 14 | inA[*i] = true; 15 | } 16 | for (typename vector::iterator i = b.begin(); i != b.end(); ++i) { 17 | if (inA.find(*i) != inA.end()) { 18 | inAB[*i] = true; 19 | } 20 | } 21 | vector aIb; 22 | for (typename map::iterator i = inAB.begin(); i != inAB.end(); ++i) { 23 | aIb.push_back(i->first); 24 | } 25 | return aIb; 26 | } 27 | 28 | int main(int argc, char** argv) { 29 | 30 | if (argc != 3) { 31 | cerr << "usage: " << argv[0] << " " << endl 32 | << "outputs each record in the first file, removing samples not present in the second" << endl; 33 | return 1; 34 | } 35 | 36 | string filenameA = argv[1]; 37 | string filenameB = argv[2]; 38 | 39 | if (filenameA == filenameB) { 40 | cerr << "you're just spinning your wheels matching the samples in " 41 | << filenameA << " to the samples in " << filenameB << endl; 42 | return 1; 43 | } 44 | 45 | VariantCallFile variantFileA; 46 | if (filenameA == "-") { 47 | variantFileA.open(std::cin); 48 | } else { 49 | variantFileA.open(filenameA); 50 | } 51 | 52 | VariantCallFile variantFileB; 53 | if (filenameB == "-") { 54 | variantFileB.open(std::cin); 55 | } else { 56 | variantFileB.open(filenameB); 57 | } 58 | 59 | 
if (!variantFileA.is_open() || !variantFileB.is_open()) { 60 | return 1; 61 | } 62 | 63 | Variant varA(variantFileA); 64 | Variant varB(variantFileB); 65 | 66 | vector commonSamples = intersection(variantFileA.sampleNames, variantFileB.sampleNames); 67 | 68 | // update sample list in header 69 | variantFileA.updateSamples(commonSamples); 70 | 71 | // and restrict the output sample names in the variant to those we are keeping 72 | varA.setOutputSampleNames(commonSamples); 73 | 74 | // write the new header 75 | cout << variantFileA.header << endl; 76 | 77 | // print the records, filtering is done via the setting of varA's output sample names 78 | while (variantFileA.getNextVariant(varA)) { 79 | cout << varA << endl; 80 | } 81 | 82 | return 0; 83 | 84 | } 85 | 86 | -------------------------------------------------------------------------------- /src/vcfcountalleles.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | 3 | using namespace std; 4 | using namespace vcf; 5 | 6 | int main(int argc, char** argv) { 7 | 8 | VariantCallFile variantFile; 9 | 10 | if (argc > 1) { 11 | string filename = argv[1]; 12 | variantFile.open(filename); 13 | } else { 14 | variantFile.open(std::cin); 15 | } 16 | 17 | if (!variantFile.is_open()) { 18 | return 1; 19 | } 20 | 21 | int uniqueAlleles = 0; 22 | 23 | Variant var(variantFile); 24 | while (variantFile.getNextVariant(var)) { 25 | uniqueAlleles += var.alleles.size(); 26 | } 27 | 28 | cout << uniqueAlleles << endl; 29 | 30 | return 0; 31 | 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/vcfcreatemulti.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "convert.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | 11 | double convertStrDbl(const string& s) { 12 | double r; 13 | convert(s, r); 14 | 
return r; 15 | } 16 | 17 | void printSummary(char** argv) { 18 | cerr << "usage: " << argv[0] << " [options] [file]" << endl 19 | << endl 20 | << "If overlapping alleles are represented across multiple records, merge" << endl 21 | << "them into a single record. Currently only for indels." << endl; 22 | exit(0); 23 | } 24 | 25 | Variant createMultiallelic(vector& vars) { 26 | 27 | if (vars.size() == 1) { 28 | return vars.front(); 29 | } 30 | 31 | int maxpos = vars.front().position + vars.front().ref.size(); 32 | for (vector::iterator v = vars.begin(); v != vars.end(); ++v) { 33 | //cerr << *v << endl; 34 | if (maxpos < v->position + v->ref.size()) { 35 | maxpos = v->position + v->ref.size(); 36 | } 37 | } 38 | 39 | int numalt = vars.size(); 40 | //cerr << "gots overlapping vars " << vars.front().position << "-" << vars.back().position << endl; 41 | 42 | // get REF 43 | // use start position to extend all other alleles 44 | int start = vars.front().position; 45 | string ref = vars.front().ref; 46 | 47 | for (vector::iterator v = vars.begin() + 1; v != vars.end(); ++v) { 48 | int sdiff = (v->position + v->ref.size()) - (start + ref.size()); 49 | int pdiff = (start + ref.size()) - v->position; 50 | if (sdiff > 0) { 51 | ref.append(v->ref.substr(pdiff, sdiff)); 52 | } 53 | } 54 | 55 | //cerr << "ref would be " << ref << " for vars from " 56 | // << vars.front().position << " to " << vars.back().position << endl; 57 | 58 | Variant var = vars.front(); 59 | var.alt.clear(); 60 | var.ref = ref; 61 | 62 | for (vector::iterator v = vars.begin(); v != vars.end(); ++v) { 63 | // add alternates and splice them into the reference 64 | int p5diff = v->position - var.position; 65 | int p3diff = (var.position + var.ref.size()) - (v->position + v->ref.size()); 66 | string before; 67 | string after; 68 | if (p5diff > 0) { 69 | before = var.ref.substr(0, p5diff); 70 | } 71 | if (p3diff > 0 && p3diff < var.ref.size()) { 72 | after = var.ref.substr(var.ref.size() - p3diff); 73 | } 74 | 
if (p5diff || p3diff) { 75 | for (vector::iterator a = v->alt.begin(); a != v->alt.end(); ++a) { 76 | var.alt.push_back(before); 77 | string& alt = var.alt.back(); 78 | alt.append(*a); 79 | alt.append(after); 80 | } 81 | } else { 82 | for (vector::iterator a = v->alt.begin(); a != v->alt.end(); ++a) { 83 | var.alt.push_back(*a); 84 | } 85 | } 86 | } 87 | 88 | stringstream s; 89 | s << vars.front().position << "-" << vars.back().position; 90 | var.info["combined"].push_back(s.str()); 91 | 92 | return var; 93 | 94 | } 95 | 96 | int main(int argc, char** argv) { 97 | 98 | VariantCallFile variantFile; 99 | 100 | int c; 101 | while (true) { 102 | static struct option long_options[] = 103 | { 104 | /* These options set a flag. */ 105 | //{"verbose", no_argument, &verbose_flag, 1}, 106 | {"help", no_argument, 0, 'h'}, 107 | {0, 0, 0, 0} 108 | }; 109 | /* getopt_long stores the option index here. */ 110 | int option_index = 0; 111 | 112 | c = getopt_long (argc, argv, "h", 113 | long_options, &option_index); 114 | 115 | if (c == -1) 116 | break; 117 | 118 | switch (c) { 119 | 120 | case 'h': 121 | printSummary(argv); 122 | break; 123 | 124 | case '?': 125 | printSummary(argv); 126 | exit(1); 127 | break; 128 | 129 | default: 130 | abort (); 131 | } 132 | } 133 | 134 | if (optind < argc) { 135 | string filename = argv[optind]; 136 | variantFile.open(filename); 137 | } else { 138 | variantFile.open(std::cin); 139 | } 140 | 141 | if (!variantFile.is_open()) { 142 | return 1; 143 | } 144 | 145 | variantFile.addHeaderLine("##INFO="); 146 | 147 | cout << variantFile.header << endl; 148 | 149 | bool first = true; 150 | bool already = false; 151 | Variant var(variantFile); 152 | vector vars; 153 | string lastSeq; 154 | 155 | while (variantFile.getNextVariant(var)) { 156 | 157 | if (lastSeq.empty()) { 158 | lastSeq = var.sequenceName; 159 | } 160 | 161 | if (vars.empty()) { 162 | vars.push_back(var); 163 | continue; 164 | } else { 165 | int maxpos = vars.front().position + 
vars.front().ref.size(); 166 | for (vector::iterator v = vars.begin(); v != vars.end(); ++v) { 167 | if (maxpos < v->position + v->ref.size()) { 168 | maxpos = v->position + v->ref.size(); 169 | } 170 | } 171 | if (var.sequenceName != lastSeq) { 172 | Variant result = createMultiallelic(vars); 173 | cout << result << endl; 174 | vars.clear(); 175 | lastSeq = var.sequenceName; 176 | vars.push_back(var); 177 | } else if (var.position < maxpos) { 178 | vars.push_back(var); 179 | } else { 180 | Variant result = createMultiallelic(vars); 181 | cout << result << endl; 182 | vars.clear(); 183 | vars.push_back(var); 184 | } 185 | } 186 | 187 | } 188 | 189 | Variant result = createMultiallelic(vars); 190 | cout << result << endl; 191 | 192 | return 0; 193 | 194 | } 195 | 196 | -------------------------------------------------------------------------------- /src/vcfdistance.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | int main(int argc, char** argv) { 10 | 11 | if (argc > 1) { 12 | cerr << "usage: " << argv[0] << " <[vcf file]" << endl 13 | << "adds a tag (BasesToNextVariant) to each variant record which indicates" << endl 14 | << "the distance to the nearest variant" << endl; 15 | return 1; 16 | } 17 | 18 | VariantCallFile variantFile; 19 | variantFile.open(std::cin); 20 | 21 | if (!variantFile.is_open()) { 22 | return 1; 23 | } 24 | 25 | Variant varA(variantFile); 26 | Variant varB(variantFile); 27 | Variant varC(variantFile); 28 | 29 | vector vars; 30 | vars.push_back(&varA); 31 | vars.push_back(&varB); 32 | vars.push_back(&varC); 33 | 34 | for (vector::iterator v = vars.begin(); v != vars.end(); ++v) { 35 | variantFile.getNextVariant(**v); 36 | } 37 | 38 | string tag = "BasesToClosestVariant"; 39 | string line = "##INFO="; 41 | variantFile.addHeaderLine(line); 42 | 43 | cout << variantFile.header 
<< endl; 44 | 45 | // get the first distances 46 | if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) { 47 | vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position)); 48 | } 49 | 50 | while (variantFile.getNextVariant(*vars.back())) { 51 | 52 | if (vars.at(1)->sequenceName == vars.at(0)->sequenceName && 53 | vars.at(1)->sequenceName == vars.at(2)->sequenceName) { 54 | vars.at(1)->info[tag].push_back(convert(min(vars.at(1)->position - vars.at(0)->position, 55 | vars.at(2)->position - vars.at(1)->position))); 56 | } else if (vars.at(1)->sequenceName == vars.at(0)->sequenceName) { 57 | vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position)); 58 | } else if (vars.at(2)->sequenceName == vars.at(1)->sequenceName) { 59 | vars.at(1)->info[tag].push_back(convert(vars.at(2)->position - vars.at(1)->position)); 60 | } else { 61 | // don't add the tag 62 | } 63 | cout << *vars.front() << endl; 64 | // rotate 65 | Variant* v = vars.at(0); 66 | vars.at(0) = vars.at(1); 67 | vars.at(1) = vars.at(2); 68 | vars.at(2) = v; 69 | 70 | } 71 | 72 | // assign the last distances 73 | 74 | if (vars.at(0)->sequenceName == vars.at(1)->sequenceName) { 75 | vars.at(0)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position)); 76 | cout << *vars.at(0) << endl; 77 | 78 | vars.at(1)->info[tag].push_back(convert(vars.at(1)->position - vars.at(0)->position)); 79 | cout << *vars.at(1) << endl; 80 | } 81 | 82 | return 0; 83 | 84 | } 85 | 86 | -------------------------------------------------------------------------------- /src/vcfecho.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | 3 | using namespace std; 4 | using namespace vcf; 5 | 6 | int main(int argc, char** argv) { 7 | 8 | VariantCallFile variantFile; 9 | 10 | if (argc > 1) { 11 | string filename = argv[1]; 12 | variantFile.open(filename); 13 | } else { 14 | variantFile.open(std::cin); 15 | } 16 | 
17 | if (!variantFile.is_open()) { 18 | return 1; 19 | } 20 | 21 | cout << variantFile.header << endl; 22 | 23 | Variant var(variantFile); 24 | while (variantFile.getNextVariant(var)) { 25 | cout << var << endl; 26 | } 27 | 28 | return 0; 29 | 30 | } 31 | 32 | -------------------------------------------------------------------------------- /src/vcfentropy.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include "fastahack/Fasta.h" 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | void printSummary(char** argv) { 10 | cerr << "usage: " << argv[0] << " [options] " << endl 11 | << endl 12 | << "options:" << endl 13 | << " -f, --fasta-reference FASTA reference file to use to obtain flanking sequences" << endl 14 | << " -w, --window-size Size of the window over which to calculate entropy" << endl 15 | << endl 16 | << "Anotates the output VCF file with, for each record, EntropyLeft, EntropyRight," << endl 17 | << "EntropyCenter, which are the entropies of the sequence of the given window size to the" << endl 18 | << "left, right, and center of the record. Also adds EntropyRef and EntropyAlt for each alt." << endl 19 | << endl; 20 | exit(0); 21 | } 22 | 23 | 24 | int main(int argc, char** argv) { 25 | 26 | int c; 27 | string fastaRef; 28 | int windowSize = 0; 29 | 30 | if (argc == 1) 31 | printSummary(argv); 32 | 33 | while (true) { 34 | static struct option long_options[] = 35 | { 36 | /* These options set a flag. */ 37 | //{"verbose", no_argument, &verbose_flag, 1}, 38 | {"help", no_argument, 0, 'h'}, 39 | {"fasta-reference", required_argument, 0, 'f'}, 40 | {"window-size", required_argument, 0, 'w'}, 41 | //{"length", no_argument, &printLength, true}, 42 | {0, 0, 0, 0} 43 | }; 44 | /* getopt_long stores the option index here. 
*/ 45 | int option_index = 0; 46 | 47 | c = getopt_long (argc, argv, "hf:w:", 48 | long_options, &option_index); 49 | 50 | /* Detect the end of the options. */ 51 | if (c == -1) 52 | break; 53 | 54 | switch (c) 55 | { 56 | case 0: 57 | /* If this option set a flag, do nothing else now. */ 58 | if (long_options[option_index].flag != 0) 59 | break; 60 | printf ("option %s", long_options[option_index].name); 61 | if (optarg) 62 | printf (" with arg %s", optarg); 63 | printf ("\n"); 64 | break; 65 | 66 | case 'f': 67 | fastaRef = optarg; 68 | break; 69 | 70 | case 'w': 71 | windowSize = atoi(optarg); 72 | break; 73 | 74 | case 'h': 75 | printSummary(argv); 76 | exit(0); 77 | break; 78 | 79 | case '?': 80 | /* getopt_long already printed an error message. */ 81 | printSummary(argv); 82 | exit(1); 83 | break; 84 | 85 | default: 86 | abort (); 87 | } 88 | } 89 | 90 | if (windowSize == 0) { 91 | cerr << "a window size must be specified" << endl; 92 | exit(1); 93 | } 94 | if (fastaRef.empty()) { 95 | cerr << "a FASTA reference sequence must be specified" << endl; 96 | exit(1); 97 | } 98 | 99 | FastaReference ref; 100 | ref.open(fastaRef); 101 | 102 | VariantCallFile variantFile; 103 | string inputFilename; 104 | if (optind == argc - 1) { 105 | inputFilename = argv[optind]; 106 | variantFile.open(inputFilename); 107 | } else { 108 | variantFile.open(std::cin); 109 | } 110 | 111 | if (!variantFile.is_open()) { 112 | return 1; 113 | } 114 | 115 | variantFile.addHeaderLine("##INFO="); 116 | variantFile.addHeaderLine("##INFO="); 117 | variantFile.addHeaderLine("##INFO="); 118 | variantFile.addHeaderLine("##INFO="); 119 | variantFile.addHeaderLine("##INFO="); 120 | 121 | cout << variantFile.header << endl; 122 | 123 | Variant var(variantFile); 124 | while (variantFile.getNextVariant(var)) { 125 | 126 | // get the ref start and end positions 127 | int refstart = var.position - 1; // convert to 0-based 128 | int refend = var.position + var.ref.size() - 1; 129 | string leftseq = 
ref.getSubSequence(var.sequenceName, refstart - windowSize, windowSize); 130 | string rightseq = ref.getSubSequence(var.sequenceName, refend, windowSize); 131 | string centerseq = ref.getSubSequence(var.sequenceName, refstart - windowSize/2, windowSize); 132 | double entropyLeft = shannon_H((char*) &leftseq[0], windowSize); 133 | double entropyRight = shannon_H((char*) &rightseq[0], windowSize); 134 | double entropyCenter = shannon_H((char*) ¢erseq[0], windowSize); 135 | double entropyRef = shannon_H((char*) var.ref.c_str(), var.ref.size()); 136 | 137 | var.info["EntropyLeft"].clear(); 138 | var.info["EntropyRight"].clear(); 139 | var.info["EntropyCenter"].clear(); 140 | var.info["EntropyRef"].clear(); 141 | var.info["EntropyAlt"].clear(); 142 | 143 | var.info["EntropyLeft"].push_back(convert(entropyLeft)); 144 | var.info["EntropyRight"].push_back(convert(entropyRight)); 145 | var.info["EntropyCenter"].push_back(convert(entropyCenter)); 146 | var.info["EntropyRef"].push_back(convert(entropyRef)); 147 | 148 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 149 | double entropyAlt = shannon_H((char*) a->c_str(), a->size()); 150 | var.info["EntropyAlt"].push_back(convert(entropyAlt)); 151 | } 152 | 153 | cout << var << endl; 154 | } 155 | 156 | return 0; 157 | 158 | } 159 | 160 | -------------------------------------------------------------------------------- /src/vcffixup.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | int countAlts(Variant& var, int alleleIndex) { 11 | int alts = 0; 12 | for (map > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { 13 | map >& sample = s->second; 14 | map >::iterator gt = sample.find("GT"); 15 | if (gt != sample.end()) { 16 | map genotype = decomposeGenotype(gt->second.front()); 17 | for (map::iterator g = 
genotype.begin(); g != genotype.end(); ++g) { 18 | if (g->first == alleleIndex) { 19 | alts += g->second; 20 | } 21 | } 22 | } 23 | } 24 | return alts; 25 | } 26 | 27 | int countAlleles(Variant& var) { 28 | int alleles = 0; 29 | for (map > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { 30 | map >& sample = s->second; 31 | map >::iterator gt = sample.find("GT"); 32 | if (gt != sample.end()) { 33 | map genotype = decomposeGenotype(gt->second.front()); 34 | for (map::iterator g = genotype.begin(); g != genotype.end(); ++g) { 35 | if (g->first != NULL_ALLELE) { 36 | alleles += g->second; 37 | } 38 | } 39 | } 40 | } 41 | return alleles; 42 | } 43 | 44 | int main(int argc, char** argv) { 45 | 46 | if (argc != 2) { 47 | cerr << "usage: " << argv[0] << " " << endl 48 | << "outputs a VCF stream where AC and NS have been generated for each record using sample genotypes" << endl; 49 | return 1; 50 | } 51 | 52 | string filename = argv[1]; 53 | 54 | VariantCallFile variantFile; 55 | if (filename == "-") { 56 | variantFile.open(std::cin); 57 | } else { 58 | variantFile.open(filename); 59 | } 60 | 61 | if (!variantFile.is_open()) { 62 | cerr << "could not open " << filename << endl; 63 | return 1; 64 | } 65 | 66 | Variant var(variantFile); 67 | 68 | // remove header lines we're going to add 69 | variantFile.removeInfoHeaderLine("AC"); 70 | variantFile.removeInfoHeaderLine("AF"); 71 | variantFile.removeInfoHeaderLine("NS"); 72 | variantFile.removeInfoHeaderLine("AN"); 73 | 74 | // and add them back, so as not to duplicate them if they are already there 75 | variantFile.addHeaderLine("##INFO="); 76 | variantFile.addHeaderLine("##INFO="); 77 | variantFile.addHeaderLine("##INFO="); 78 | variantFile.addHeaderLine("##INFO="); 79 | 80 | // write the new header 81 | cout << variantFile.header << endl; 82 | 83 | // print the records, filtering is done via the setting of varA's output sample names 84 | while (variantFile.getNextVariant(var)) { 85 | stringstream ns; 86 
| ns << var.samples.size(); 87 | var.info["NS"].clear(); 88 | var.info["NS"].push_back(ns.str()); 89 | 90 | var.info["AC"].clear(); 91 | var.info["AF"].clear(); 92 | var.info["AN"].clear(); 93 | 94 | int allelecount = countAlleles(var); 95 | stringstream an; 96 | an << allelecount; 97 | var.info["AN"].push_back(an.str()); 98 | 99 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 100 | string& allele = *a; 101 | int altcount = countAlts(var, var.getAltAlleleIndex(allele) + 1); 102 | stringstream ac; 103 | ac << altcount; 104 | var.info["AC"].push_back(ac.str()); 105 | stringstream af; 106 | af << (double) altcount / (double) allelecount; 107 | var.info["AF"].push_back(af.str()); 108 | } 109 | cout << var << endl; 110 | } 111 | 112 | return 0; 113 | 114 | } 115 | 116 | -------------------------------------------------------------------------------- /src/vcfgeno2alleles.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | 10 | int main(int argc, char** argv) { 11 | 12 | if (argc > 1) { 13 | cerr << "usage: " << argv[0] << " <[vcf file]" << endl 14 | << "modifies the genotypes field to provide the literal alleles rather than indexes" << endl; 15 | return 1; 16 | } 17 | 18 | VariantCallFile variantFile; 19 | 20 | variantFile.open(std::cin); 21 | 22 | if (!variantFile.is_open()) { 23 | return 1; 24 | } 25 | 26 | cout << variantFile.header << endl; 27 | 28 | Variant var(variantFile); 29 | while (variantFile.getNextVariant(var)) { 30 | map > >::iterator s = var.samples.begin(); 31 | map > >::iterator sEnd = var.samples.end(); 32 | 33 | for (; s != sEnd; ++s) { 34 | map >& sample = s->second; 35 | vector& gtstrs = sample["GT"]; 36 | string& genotype = gtstrs.front(); 37 | vector gt = split(genotype, "|/"); 38 | 39 | // report the sample and it's genotype 40 | stringstream o; 41 | for 
(vector::iterator g = gt.begin(); g != gt.end(); ++g) { 42 | int index = atoi(g->c_str()); 43 | o << var.alleles[index]; 44 | if (g != (gt.end()-1)) o << "/"; 45 | } 46 | gtstrs.clear(); 47 | gtstrs.push_back(o.str()); 48 | } 49 | cout << var << endl; 50 | } 51 | return 0; 52 | 53 | } 54 | 55 | -------------------------------------------------------------------------------- /src/vcfgenosamplenames.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | 3 | using namespace std; 4 | using namespace vcf; 5 | 6 | int main(int argc, char** argv) { 7 | 8 | VariantCallFile variantFile; 9 | 10 | if (argc > 1) { 11 | string filename = argv[1]; 12 | variantFile.open(filename); 13 | } else { 14 | variantFile.open(std::cin); 15 | } 16 | 17 | if (!variantFile.is_open()) { 18 | return 1; 19 | } 20 | 21 | variantFile.addHeaderLine("##FORMAT="); 22 | 23 | cout << variantFile.header << endl; 24 | 25 | Variant var(variantFile); 26 | while (variantFile.getNextVariant(var)) { 27 | var.format.push_back("SN"); 28 | for (map > >::iterator s = var.samples.begin(); 29 | s != var.samples.end(); ++s) { 30 | s->second["SN"].clear(); 31 | s->second["SN"].push_back(s->first); 32 | } 33 | cout << var << endl; 34 | } 35 | 36 | return 0; 37 | 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/vcfgenosummarize.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | using namespace std; 9 | using namespace vcf; 10 | 11 | 12 | int main(int argc, char** argv) { 13 | 14 | if (argc > 1 && (argv[1] == "-h" || argv[1] == "--help")) { 15 | cerr << "usage: " << argv[0] << " <[input file] >[output vcf]" << endl 16 | << "Adds summary statistics to each record summarizing qualities reported in" << endl 17 | << "called genotypes. 
Uses:" << endl 18 | << "RO (reference observation count), QR (quality sum reference observations)" << endl 19 | << "AO (alternate observation count), QA (quality sum alternate observations)" << endl; 20 | return 1; 21 | } 22 | 23 | VariantCallFile variantFile; 24 | if (argc == 1) { 25 | variantFile.open(std::cin); 26 | } else { 27 | string filename = argv[argc-1]; 28 | variantFile.open(filename); 29 | if (!variantFile.is_open()) { 30 | cerr << "could not open " << filename << endl; 31 | return 1; 32 | } 33 | } 34 | 35 | Variant var(variantFile); 36 | 37 | variantFile.removeInfoHeaderLine("AQR"); 38 | variantFile.addHeaderLine("##INFO="); 39 | variantFile.removeInfoHeaderLine("AQA"); 40 | variantFile.addHeaderLine("##INFO="); 41 | variantFile.removeInfoHeaderLine("QR"); 42 | variantFile.addHeaderLine("##INFO="); 43 | variantFile.removeInfoHeaderLine("QA"); 44 | variantFile.addHeaderLine("##INFO="); 45 | variantFile.removeInfoHeaderLine("RQA"); 46 | variantFile.addHeaderLine("##INFO="); 47 | 48 | // write the new header 49 | cout << variantFile.header << endl; 50 | 51 | // print the records, filtering is done via the setting of varA's output sample names 52 | while (variantFile.getNextVariant(var)) { 53 | int refobs = 0; 54 | int refqual = 0; 55 | vector altobs(var.alt.size(), 0); 56 | vector altqual(var.alt.size(), 0); 57 | for (map > >::iterator s = var.samples.begin(); 58 | s != var.samples.end(); ++s) { 59 | map >& sample = s->second; 60 | int x; 61 | if (sample.find("RO") != sample.end()) { 62 | convert(sample["RO"].front(), x); 63 | refobs += x; 64 | if (sample.find("QR") != sample.end()) { 65 | convert(sample["QR"].front(), x); 66 | refqual += x; 67 | } 68 | } 69 | if (sample.find("AO") != sample.end()) { 70 | vector& aos = sample["AO"]; 71 | for (int i = 0; i != var.alt.size(); ++i) { 72 | convert(aos[i], x); 73 | altobs[i] += x; 74 | } 75 | if (sample.find("QA") != sample.end()) { 76 | vector& qas = sample["QA"]; 77 | for (int i = 0; i != var.alt.size(); 
++i) { 78 | convert(qas[i], x); 79 | altqual[i] += x; 80 | } 81 | } 82 | } 83 | } 84 | var.info["QR"].push_back(convert(refqual)); 85 | if (refobs == 0 || refqual == 0) { 86 | var.info["AQR"].push_back(convert(0)); 87 | } else { 88 | var.info["AQR"].push_back(convert((double)refqual/(double)refobs)); 89 | } 90 | 91 | for (int i = 0; i != var.alt.size(); ++i) { 92 | var.info["QA"].push_back(convert(altqual[i])); 93 | var.info["AQA"].push_back(convert((double)altqual[i]/(double)altobs[i])); 94 | if (refobs == 0 || refqual == 0) { 95 | var.info["RQA"].push_back(convert(1)); 96 | } else { 97 | var.info["RQA"].push_back(convert(((double)altqual[i]/(double)altobs[i]) / 98 | ((double)refqual/(double)refobs))); 99 | } 100 | } 101 | cout << var << endl; 102 | } 103 | 104 | return 0; 105 | 106 | } 107 | 108 | -------------------------------------------------------------------------------- /src/vcfgenotypes.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | 10 | int main(int argc, char** argv) { 11 | 12 | if (argc != 2) { 13 | cerr << "usage: " << argv[0] << " " << endl 14 | << "report the genotypes for each sample, for each variant in the vcf file" << endl; 15 | return 1; 16 | } 17 | 18 | string filename = argv[1]; 19 | 20 | VariantCallFile variantFile; 21 | 22 | if (filename == "-") { 23 | variantFile.open(std::cin); 24 | } else { 25 | variantFile.open(filename); 26 | } 27 | 28 | if (!variantFile.is_open()) { 29 | return 1; 30 | } 31 | 32 | Variant var(variantFile); 33 | while (variantFile.getNextVariant(var)) { 34 | map > >::iterator s = var.samples.begin(); 35 | map > >::iterator sEnd = var.samples.end(); 36 | 37 | cout << var.sequenceName << "\t" 38 | << var.position << "\t" 39 | << var.ref << "\t"; 40 | var.printAlt(cout); cout << "\t"; 41 | var.printAlleles(cout); cout << "\t"; 42 | 43 | for (; s != 
sEnd; ++s) { 44 | map >& sample = s->second; 45 | string& genotype = sample["GT"].front(); // XXX assumes we can only have one GT value 46 | vector gt = split(genotype, "|/"); 47 | 48 | // report the sample and it's genotype 49 | cout << s->first << ":"; 50 | for (vector::iterator g = gt.begin(); g != gt.end(); ++g) { 51 | int index = atoi(g->c_str()); 52 | cout << var.alleles[index]; 53 | if (g != (gt.end()-1)) cout << "/"; 54 | } 55 | cout << "\t"; 56 | } 57 | cout << endl; 58 | } 59 | return 0; 60 | 61 | } 62 | 63 | -------------------------------------------------------------------------------- /src/vcfglbound.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | 5 | using namespace std; 6 | using namespace vcf; 7 | 8 | void printSummary(char** argv) { 9 | cerr << "usage: " << argv[0] << " [options] " << endl 10 | << endl 11 | << "options:" << endl 12 | << " -b, --bound N Bound GLs to this limit." << endl 13 | << " -x, --exclude-broken If GLs are > 0, remove site." << endl 14 | << endl 15 | << "Adjust GLs so that the maximum GL is 0 by dividing all GLs for each sample by the max." << endl 16 | << "Then cap (bound) at N (e.g. -10)." << endl; 17 | exit(0); 18 | } 19 | 20 | 21 | int main(int argc, char** argv) { 22 | 23 | bool excludeBroken = false; 24 | double glBound = 0; 25 | int c; 26 | 27 | while (true) { 28 | static struct option long_options[] = 29 | { 30 | /* These options set a flag. */ 31 | //{"verbose", no_argument, &verbose_flag, 1}, 32 | {"help", no_argument, 0, 'h'}, 33 | {"bound", required_argument, 0, 'b'}, 34 | {"exclude-broken", no_argument, 0, 'x'}, 35 | //{"length", no_argument, &printLength, true}, 36 | {0, 0, 0, 0} 37 | }; 38 | /* getopt_long stores the option index here. */ 39 | int option_index = 0; 40 | 41 | c = getopt_long (argc, argv, "hxb:", 42 | long_options, &option_index); 43 | 44 | /* Detect the end of the options. 
*/ 45 | if (c == -1) 46 | break; 47 | 48 | switch (c) 49 | { 50 | case 0: 51 | /* If this option set a flag, do nothing else now. */ 52 | if (long_options[option_index].flag != 0) 53 | break; 54 | printf ("option %s", long_options[option_index].name); 55 | if (optarg) 56 | printf (" with arg %s", optarg); 57 | printf ("\n"); 58 | break; 59 | 60 | case 'b': 61 | glBound = atof(optarg); 62 | break; 63 | 64 | case 'x': 65 | excludeBroken = true; 66 | break; 67 | 68 | case 'h': 69 | printSummary(argv); 70 | exit(0); 71 | break; 72 | 73 | case '?': 74 | /* getopt_long already printed an error message. */ 75 | printSummary(argv); 76 | exit(1); 77 | break; 78 | 79 | default: 80 | abort (); 81 | } 82 | } 83 | 84 | if (glBound == 0) { 85 | cerr << "a bound is required when running vcfglbound (try -10)" << endl; 86 | exit(1); 87 | } 88 | 89 | VariantCallFile variantFile; 90 | string inputFilename; 91 | if (optind == argc - 1) { 92 | inputFilename = argv[optind]; 93 | variantFile.open(inputFilename); 94 | } else { 95 | variantFile.open(std::cin); 96 | } 97 | 98 | if (!variantFile.is_open()) { 99 | return 1; 100 | } 101 | 102 | cout << variantFile.header << endl; 103 | 104 | Variant var(variantFile); 105 | while (variantFile.getNextVariant(var)) { 106 | if (find(var.format.begin(), var.format.end(), "GL") == var.format.end()) { 107 | cout << var << endl; 108 | continue; 109 | } 110 | if (find(var.format.begin(), var.format.end(), "GT") == var.format.end()) { 111 | var.format.push_back("GT"); 112 | reverse(var.format.begin(), var.format.end()); 113 | } 114 | bool isbroken = false; 115 | for (map > >::iterator s = var.samples.begin(); 116 | s != var.samples.end(); ++s) { 117 | map >& sample = s->second; 118 | map >::iterator l = sample.find("GL"); 119 | if (l != sample.end()) { 120 | 121 | // find the gl max 122 | vector& glstrs = l->second; 123 | vector gls; 124 | for (vector::iterator gl = glstrs.begin(); gl != glstrs.end(); ++gl) { 125 | double d; 126 | convert(*gl, d); 127 | 
gls.push_back(d); 128 | } 129 | 130 | isbroken = false; // reset every iteration 131 | for (vector::iterator g = gls.begin(); g != gls.end(); ++g) { 132 | if (*g > 0) { 133 | isbroken = true; 134 | break; 135 | } 136 | } 137 | if (isbroken) { 138 | if (excludeBroken) { 139 | cerr << var.sequenceName << ":" << var.position << ", sample " << s->first << " has GL > 0" << endl; 140 | break; 141 | } else { 142 | cerr << "VCF record @ " << var.sequenceName << ":" << var.position << ", sample " << s->first << " has GL > 0, not processing, but outputting" << endl; 143 | continue; 144 | } 145 | } 146 | 147 | // normalize GLs to -10 min 0 max using division by max and bounding at -10 148 | double minGL = 0; 149 | for (vector::iterator g = gls.begin(); g != gls.end(); ++g) { 150 | if (*g < minGL) minGL = *g; 151 | } 152 | double maxGL = minGL; 153 | for (vector::iterator g = gls.begin(); g != gls.end(); ++g) { 154 | if (*g > maxGL) maxGL = *g; 155 | } 156 | // modify gls 157 | for (vector::iterator g = gls.begin(); g != gls.end(); ++g) { 158 | *g = max(glBound, *g - maxGL); 159 | } 160 | 161 | // and pack back into GL field 162 | glstrs.clear(); 163 | for (vector::iterator g = gls.begin(); g != gls.end(); ++g) { 164 | glstrs.push_back(convert(*g)); 165 | } 166 | } 167 | } 168 | if (excludeBroken && isbroken) { 169 | cerr << "excluding VCF record @ " << var.sequenceName << ":" << var.position << " due to GLs > 0" << endl; 170 | } else { 171 | cout << var << endl; 172 | } 173 | } 174 | 175 | return 0; 176 | 177 | } 178 | 179 | -------------------------------------------------------------------------------- /src/vcfglxgt.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | 5 | using namespace std; 6 | using namespace vcf; 7 | 8 | void printSummary(char** argv) { 9 | cerr << "usage: " << argv[0] << " [options] " << endl 10 | << endl 11 | << "options:" << endl 12 | << " -n, 
--fix-null-genotypes only apply to null and partly-null genotypes" << endl 13 | << endl 14 | << "Set genotypes using the maximum genotype likelihood for each sample." << endl 15 | << endl; 16 | exit(0); 17 | } 18 | 19 | 20 | int main(int argc, char** argv) { 21 | 22 | bool fixNull = false; 23 | int c; 24 | 25 | while (true) { 26 | static struct option long_options[] = 27 | { 28 | /* These options set a flag. */ 29 | //{"verbose", no_argument, &verbose_flag, 1}, 30 | {"help", no_argument, 0, 'h'}, 31 | {"fix-null-genotypes", no_argument, 0, 'n'}, 32 | //{"length", no_argument, &printLength, true}, 33 | {0, 0, 0, 0} 34 | }; 35 | /* getopt_long stores the option index here. */ 36 | int option_index = 0; 37 | 38 | c = getopt_long (argc, argv, "hn", 39 | long_options, &option_index); 40 | 41 | /* Detect the end of the options. */ 42 | if (c == -1) 43 | break; 44 | 45 | switch (c) 46 | { 47 | case 0: 48 | /* If this option set a flag, do nothing else now. */ 49 | if (long_options[option_index].flag != 0) 50 | break; 51 | printf ("option %s", long_options[option_index].name); 52 | if (optarg) 53 | printf (" with arg %s", optarg); 54 | printf ("\n"); 55 | break; 56 | 57 | case 'n': 58 | fixNull = true; 59 | break; 60 | 61 | case 'h': 62 | printSummary(argv); 63 | exit(0); 64 | break; 65 | 66 | case '?': 67 | /* getopt_long already printed an error message. 
*/ 68 | printSummary(argv); 69 | exit(1); 70 | break; 71 | 72 | default: 73 | abort (); 74 | } 75 | } 76 | 77 | VariantCallFile variantFile; 78 | string inputFilename; 79 | if (optind == argc - 1) { 80 | inputFilename = argv[optind]; 81 | variantFile.open(inputFilename); 82 | } else { 83 | variantFile.open(std::cin); 84 | } 85 | 86 | if (!variantFile.is_open()) { 87 | return 1; 88 | } 89 | 90 | cout << variantFile.header << endl; 91 | 92 | map, list > > glOrderCache; 93 | 94 | Variant var(variantFile); 95 | while (variantFile.getNextVariant(var)) { 96 | if (find(var.format.begin(), var.format.end(), "GL") == var.format.end()) { 97 | cout << var << endl; 98 | continue; 99 | } 100 | if (find(var.format.begin(), var.format.end(), "GT") == var.format.end()) { 101 | var.format.push_back("GT"); 102 | reverse(var.format.begin(), var.format.end()); 103 | } 104 | for (map > >::iterator s = var.samples.begin(); 105 | s != var.samples.end(); ++s) { 106 | map >& sample = s->second; 107 | map >::iterator g = sample.find("GT"); 108 | map >::iterator l = sample.find("GL"); 109 | if (l != sample.end()) { 110 | if (g == sample.end()) { 111 | sample["GT"].push_back("./."); 112 | g = sample.find("GT"); 113 | } 114 | 115 | string& gt = g->second.front(); 116 | // if we are fixing null but the genotype is fully specified, continue 117 | if (fixNull && gt.find(".") == string::npos) continue; 118 | string splitter = "/"; 119 | if (gt.find("|") != string::npos) splitter = "|"; 120 | int samplePloidy = split(gt, splitter).size(); 121 | int numAlleles = var.alt.size() + 1; // including reference 122 | 123 | // get the gt GL ordering 124 | pair pa = make_pair(samplePloidy, numAlleles); 125 | map, list > >::iterator order = glOrderCache.find(pa); 126 | if (order == glOrderCache.end()) { 127 | glOrderCache[pa] = glorder(samplePloidy, numAlleles); 128 | } 129 | list >& glOrdering = glOrderCache[pa]; 130 | 131 | // find the gl max 132 | vector& gls = l->second; 133 | vector::iterator p = 
gls.begin(); 134 | double maxGl; 135 | convert(*p, maxGl); ++p; 136 | int i = 1, maxindex = 0; 137 | for (; p != gls.end(); ++p, ++i) { 138 | double cgl; 139 | convert(*p, cgl); 140 | if (cgl > maxGl) { 141 | maxGl = cgl; 142 | maxindex = i; // prefers == gls in order of listing 143 | } 144 | } 145 | 146 | // determine which genotype it represents 147 | // modify, if the GT is part-null 148 | vector& gtv = g->second; 149 | list >::iterator b = glOrdering.begin(); 150 | advance(b, maxindex); 151 | /* 152 | cout << "changing sample " << s->first << " gt from " << gt << " to " << join(*b, "/") 153 | << " gls are "; 154 | int q = 0; 155 | for (list >::iterator i = glOrdering.begin(); i != glOrdering.end(); ++i, ++q) { 156 | cout << join(*i, "/") << ":" << sample["GL"].at(q) << ", "; 157 | } 158 | cout << endl; 159 | */ 160 | 161 | gtv.clear(); 162 | gtv.push_back(join(*b, "/")); 163 | } 164 | } 165 | cout << var << endl; 166 | } 167 | 168 | return 0; 169 | 170 | } 171 | 172 | -------------------------------------------------------------------------------- /src/vcfhetcount.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | 10 | int main(int argc, char** argv) { 11 | 12 | if (argc == 2 && (argv[1] == "-h" || argv[1] == "--help")) { 13 | cerr << "usage: " << argv[0] << " " << endl 14 | << "count the number of alternate alleles in heterozygous genotypes in all records in the vcf file" << endl 15 | << "outputs a count for each individual in the file" << endl; 16 | return 1; 17 | } 18 | 19 | 20 | string inputFilename; 21 | VariantCallFile variantFile; 22 | if (optind == argc - 1) { 23 | inputFilename = argv[optind]; 24 | variantFile.open(inputFilename); 25 | } else { 26 | variantFile.open(std::cin); 27 | } 28 | 29 | if (!variantFile.is_open()) { 30 | return 1; 31 | } 32 | 33 | unsigned int hetAlleleCount = 
0; 34 | map hetCounts; 35 | for (vector::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { 36 | hetCounts[*s] = 0; 37 | } 38 | 39 | Variant var(variantFile); 40 | while (variantFile.getNextVariant(var)) { 41 | //cout << var << endl; 42 | for (map > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { 43 | string name = s->first; 44 | map >& sample = s->second; 45 | string& genotype = sample["GT"].front(); 46 | vector gt = split(genotype, "|/"); 47 | int alt = 0; 48 | for (vector::iterator g = gt.begin(); g != gt.end(); ++g) { 49 | if (*g != "0") 50 | ++alt; 51 | } 52 | if (alt != gt.size()) { 53 | hetCounts[name] += alt; 54 | //hetAlleleCount += alt; 55 | } 56 | } 57 | } 58 | 59 | //cout << hetAlleleCount << endl; 60 | for (vector::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { 61 | cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << *s; 62 | } 63 | cout << endl; 64 | for (vector::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { 65 | cout << (s == variantFile.sampleNames.begin() ? 
"" : "\t") << hetCounts[*s]; 66 | } 67 | cout << endl; 68 | 69 | return 0; 70 | 71 | } 72 | 73 | -------------------------------------------------------------------------------- /src/vcfhethomratio.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | 10 | int main(int argc, char** argv) { 11 | 12 | if (argc != 2) { 13 | cerr << "usage: " << argv[0] << " " << endl 14 | << "outputs the het/hom ratio for each individual in the file" << endl; 15 | return 1; 16 | } 17 | 18 | string filename = argv[1]; 19 | 20 | VariantCallFile variantFile; 21 | if (filename == "-") { 22 | variantFile.open(std::cin); 23 | } else { 24 | variantFile.open(filename); 25 | } 26 | if (!variantFile.is_open()) { 27 | cerr << "could not open " << filename << endl; 28 | return 1; 29 | } 30 | 31 | map hetCounts; 32 | map homCounts; 33 | for (vector::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { 34 | hetCounts[*s] = 0; 35 | homCounts[*s] = 0; 36 | } 37 | 38 | Variant var(variantFile); 39 | while (variantFile.getNextVariant(var)) { 40 | //cout << var << endl; 41 | for (map > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { 42 | string name = s->first; 43 | map >& sample = s->second; 44 | string& gt = sample["GT"].front(); 45 | map genotype = decomposeGenotype(gt); 46 | if (isHet(genotype)) { 47 | ++hetCounts[name]; 48 | } else if (isHomNonRef(genotype)) { 49 | ++homCounts[name]; 50 | } 51 | } 52 | } 53 | 54 | for (vector::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { 55 | cout << (s == variantFile.sampleNames.begin() ? "" : "\t") << *s; 56 | } 57 | cout << endl; 58 | for (vector::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { 59 | cout << (s == variantFile.sampleNames.begin() ? 
"" : "\t") << (double) hetCounts[*s] / (double) homCounts[*s]; 60 | } 61 | cout << endl; 62 | 63 | return 0; 64 | 65 | } 66 | 67 | -------------------------------------------------------------------------------- /src/vcfindex.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "convert.h" 3 | #include 4 | 5 | using namespace std; 6 | using namespace vcf; 7 | 8 | int main(int argc, char** argv) { 9 | 10 | VariantCallFile variantFile; 11 | 12 | if (argc > 1) { 13 | string filename = argv[1]; 14 | variantFile.open(filename); 15 | } else { 16 | variantFile.open(std::cin); 17 | } 18 | 19 | if (!variantFile.is_open()) { 20 | return 1; 21 | } 22 | 23 | string idname = "id"; 24 | long int uid = 0; 25 | 26 | variantFile.addHeaderLine("##INFO="); 27 | cout << variantFile.header << endl; 28 | 29 | Variant var(variantFile); 30 | while (variantFile.getNextVariant(var)) { 31 | vector& idxs = var.info[idname]; 32 | idxs.clear(); 33 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 34 | idxs.push_back(convert(uid++)); 35 | } 36 | cout << var << endl; 37 | } 38 | 39 | return 0; 40 | 41 | } 42 | 43 | -------------------------------------------------------------------------------- /src/vcfkeepgeno.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | 11 | int main(int argc, char** argv) { 12 | 13 | if (argc < 3) { 14 | cerr << "usage: " << argv[0] << " [FIELD1] [FIELD2] ..." 
<< endl 15 | << "outputs each record in the vcf file, removing FORMAT fields not listed" 16 | << "on the command line from sample specifications in the output" 17 | << endl; 18 | return 1; 19 | } 20 | 21 | string filename = argv[1]; 22 | 23 | vector newFormat; 24 | set fieldsToKeep; 25 | for (int i = 2; i < argc; ++i) { 26 | fieldsToKeep.insert(argv[i]); 27 | newFormat.push_back(argv[i]); 28 | } 29 | 30 | VariantCallFile variantFile; 31 | if (filename == "-") { 32 | variantFile.open(std::cin); 33 | } else { 34 | variantFile.open(filename); 35 | } 36 | 37 | if (!variantFile.is_open()) { 38 | return 1; 39 | } 40 | 41 | Variant var(variantFile); 42 | 43 | vector formatIds = variantFile.formatIds(); 44 | for (vector::iterator i = formatIds.begin(); i != formatIds.end(); ++i) { 45 | if (!fieldsToKeep.count(*i)) { 46 | variantFile.removeGenoHeaderLine(*i); 47 | } 48 | } 49 | 50 | // write the header 51 | cout << variantFile.header << endl; 52 | 53 | // print the records, filtering is done via the setting of varA's output sample names 54 | while (variantFile.getNextVariant(var)) { 55 | var.format = newFormat; 56 | cout << var << endl; 57 | } 58 | 59 | return 0; 60 | 61 | } 62 | 63 | -------------------------------------------------------------------------------- /src/vcfkeepinfo.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | int main(int argc, char** argv) { 11 | 12 | if (argc < 3) { 13 | cerr << "usage: " << argv[0] << " [FIELD1] [FIELD2] ..." 
<< endl 14 | << "outputs each record in the vcf file, removing INFO fields not listed on the command line" << endl; 15 | return 1; 16 | } 17 | 18 | string filename = argv[1]; 19 | 20 | set fieldsToKeep; 21 | for (int i = 2; i < argc; ++i) { 22 | fieldsToKeep.insert(argv[i]); 23 | } 24 | 25 | VariantCallFile variantFile; 26 | if (filename == "-") { 27 | variantFile.open(std::cin); 28 | } else { 29 | variantFile.open(filename); 30 | } 31 | 32 | if (!variantFile.is_open()) { 33 | return 1; 34 | } 35 | 36 | Variant var(variantFile); 37 | 38 | vector fieldsToErase; 39 | vector infoIds = variantFile.infoIds(); 40 | for (vector::iterator i = infoIds.begin(); i != infoIds.end(); ++i) { 41 | if (!fieldsToKeep.count(*i)) { 42 | fieldsToErase.push_back(*i); 43 | variantFile.removeInfoHeaderLine(*i); 44 | } 45 | } 46 | 47 | // write the header 48 | cout << variantFile.header << endl; 49 | 50 | // print the records, filtering is done via the setting of varA's output sample names 51 | while (variantFile.getNextVariant(var)) { 52 | for (vector::iterator f = fieldsToErase.begin(); f != fieldsToErase.end(); ++f) { 53 | var.info.erase(*f); 54 | var.infoFlags.erase(*f); 55 | } 56 | cout << var << endl; 57 | } 58 | 59 | return 0; 60 | 61 | } 62 | 63 | -------------------------------------------------------------------------------- /src/vcfkeepsamples.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | int main(int argc, char** argv) { 10 | 11 | if (argc < 3) { 12 | cerr << "usage: " << argv[0] << " [SAMPLE1] [SAMPLE2] ..." 
<< endl 13 | << "outputs each record in the vcf file, removing samples not listed on the command line" << endl; 14 | return 1; 15 | } 16 | 17 | string filename = argv[1]; 18 | 19 | vector samplesToKeep; 20 | for (int i = 2; i < argc; ++i) { 21 | samplesToKeep.push_back(argv[i]); 22 | } 23 | 24 | VariantCallFile variantFile; 25 | if (filename == "-") { 26 | variantFile.open(std::cin); 27 | } else { 28 | variantFile.open(filename); 29 | } 30 | 31 | if (!variantFile.is_open()) { 32 | return 1; 33 | } 34 | 35 | Variant var(variantFile); 36 | 37 | // update sample list in header 38 | variantFile.updateSamples(samplesToKeep); 39 | 40 | // and restrict the output sample names in the variant to those we are keeping 41 | var.setOutputSampleNames(samplesToKeep); 42 | 43 | // write the new header 44 | cout << variantFile.header << endl; 45 | 46 | // print the records, filtering is done via the setting of varA's output sample names 47 | while (variantFile.getNextVariant(var)) { 48 | cout << var << endl; 49 | } 50 | 51 | return 0; 52 | 53 | } 54 | 55 | -------------------------------------------------------------------------------- /src/vcflength.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "convert.h" 3 | #include 4 | 5 | using namespace std; 6 | using namespace vcf; 7 | 8 | int main(int argc, char** argv) { 9 | 10 | VariantCallFile variantFile; 11 | 12 | if (argc > 1) { 13 | string filename = argv[1]; 14 | variantFile.open(filename); 15 | } else { 16 | variantFile.open(std::cin); 17 | } 18 | 19 | if (!variantFile.is_open()) { 20 | return 1; 21 | } 22 | 23 | variantFile.addHeaderLine("##INFO="); 24 | variantFile.addHeaderLine("##INFO="); 25 | variantFile.addHeaderLine("##INFO="); 26 | cout << variantFile.header << endl; 27 | 28 | Variant var(variantFile); 29 | while (variantFile.getNextVariant(var)) { 30 | vector& lengths = var.info["length"]; 31 | lengths.clear(); 32 | for (vector::iterator a = 
var.alt.begin(); a != var.alt.end(); ++a) { 33 | lengths.push_back(convert((int) a->size() - (int) var.ref.size())); 34 | } 35 | vector& lengthsRef = var.info["length.ref"]; 36 | lengthsRef.clear(); 37 | lengthsRef.push_back(convert(var.ref.size())); 38 | vector& lengthsAlt = var.info["length.alt"]; 39 | lengthsAlt.clear(); 40 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 41 | lengthsAlt.push_back(convert((int) a->size())); 42 | } 43 | cout << var << endl; 44 | } 45 | 46 | return 0; 47 | 48 | } 49 | 50 | -------------------------------------------------------------------------------- /src/vcfnumalt.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | int main(int argc, char** argv) { 11 | 12 | if (argc != 2) { 13 | cerr << "usage: " << argv[0] << " " << endl 14 | << "outputs a VCF stream where NUMALT has been generated for each record using sample genotypes" << endl; 15 | return 1; 16 | } 17 | 18 | string filename = argv[1]; 19 | 20 | VariantCallFile variantFile; 21 | if (filename == "-") { 22 | variantFile.open(std::cin); 23 | } else { 24 | variantFile.open(filename); 25 | } 26 | 27 | if (!variantFile.is_open()) { 28 | cerr << "could not open " << filename << endl; 29 | return 1; 30 | } 31 | 32 | Variant var(variantFile); 33 | 34 | // remove header lines we're going to add 35 | variantFile.removeInfoHeaderLine("NUMALT"); 36 | 37 | // and add them back, so as not to duplicate them if they are already there 38 | variantFile.addHeaderLine("##INFO="); 39 | 40 | // write the new header 41 | cout << variantFile.header << endl; 42 | 43 | // print the records, filtering is done via the setting of varA's output sample names 44 | while (variantFile.getNextVariant(var)) { 45 | stringstream na; 46 | na << var.alt.size(); 47 | var.info["NUMALT"].clear(); 48 | 
var.info["NUMALT"].push_back(na.str()); 49 | cout << var << endl; 50 | } 51 | 52 | return 0; 53 | 54 | } 55 | 56 | -------------------------------------------------------------------------------- /src/vcfoverlay.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include 3 | 4 | using namespace std; 5 | using namespace vcf; 6 | 7 | 8 | void printSummary(char** argv) { 9 | cerr << "usage: " << argv[0] << " [options] [ ...]" << endl 10 | << endl 11 | << "options:" << endl 12 | << " -h, --help this dialog" << endl 13 | << endl 14 | << "Overlays records in the input vcf files in the order in which they appear." << endl; 15 | exit(0); 16 | } 17 | 18 | int main(int argc, char** argv) { 19 | 20 | if (argc == 1) 21 | printSummary(argv); 22 | 23 | int c; 24 | while (true) { 25 | static struct option long_options[] = 26 | { 27 | {"help", no_argument, 0, 'h'}, 28 | {0, 0, 0, 0} 29 | }; 30 | int option_index = 0; 31 | 32 | c = getopt_long (argc, argv, "h", 33 | long_options, &option_index); 34 | 35 | if (c == -1) 36 | break; 37 | 38 | switch (c) { 39 | case 'h': 40 | printSummary(argv); 41 | break; 42 | 43 | case '?': 44 | printSummary(argv); 45 | exit(1); 46 | break; 47 | 48 | default: 49 | abort (); 50 | } 51 | } 52 | 53 | // idea here is to shadow-merge 54 | // records from the VCF files, which are provided in order of desired merge 55 | 56 | map > variantFiles; 57 | map > > linesByPrecedence; 58 | int i = optind; 59 | 60 | if (optind < argc - 1) { 61 | while (i < argc) { 62 | int index = i++; 63 | VariantCallFile*& variantFile = variantFiles[index].first; 64 | Variant& var = variantFiles[index].second; 65 | string inputFilename = argv[optind++]; 66 | variantFile = new VariantCallFile; 67 | variantFile->open(inputFilename); 68 | var.setVariantCallFile(variantFile); 69 | if (!variantFile->is_open()) { 70 | cout << "could not open VCF file" << endl; 71 | exit(1); 72 | } else { 73 | if 
(variantFile->getNextVariant(var)) { 74 | linesByPrecedence[var.sequenceName][var.position][index] = variantFile->line; 75 | } 76 | } 77 | } 78 | } else { 79 | cerr << "no input files specified" << endl; 80 | exit(1); 81 | } 82 | 83 | cout << variantFiles.begin()->second.first->header << endl; 84 | 85 | while (!linesByPrecedence.empty()) { 86 | // get the lowest entry in the buffer of observed lines 87 | // print the first line 88 | // get the next variant from that file, put it back into the map 89 | const string& lowestChrom = linesByPrecedence.begin()->first; 90 | const long int lowestPosition = linesByPrecedence.begin()->second.begin()->first; 91 | map& lowestLine = linesByPrecedence.begin()->second.begin()->second; 92 | cout << lowestLine.begin()->second << endl; 93 | 94 | for (map::iterator g = lowestLine.begin(); g != lowestLine.end(); ++g) { 95 | int index = g->first; 96 | VariantCallFile& variantFile = *variantFiles[index].first; 97 | Variant& var = variantFiles[index].second; 98 | if (!variantFile.getNextVariant(var)) { 99 | variantFiles.erase(index); 100 | } else { 101 | linesByPrecedence[var.sequenceName][var.position][index] = variantFile.line; 102 | } 103 | } 104 | 105 | linesByPrecedence[lowestChrom].erase(lowestPosition); 106 | if (linesByPrecedence[lowestChrom].empty()) { 107 | linesByPrecedence.erase(lowestChrom); 108 | } 109 | } 110 | 111 | // flush the rest of the variant records if there are any 112 | 113 | return 0; 114 | } 115 | 116 | -------------------------------------------------------------------------------- /src/vcfparsealts.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | 3 | using namespace std; 4 | using namespace vcf; 5 | 6 | int main(int argc, char** argv) { 7 | 8 | VariantCallFile variantFile; 9 | 10 | if (argc > 1) { 11 | string filename = argv[1]; 12 | variantFile.open(filename); 13 | } else { 14 | variantFile.open(std::cin); 15 | } 16 | 17 | if 
(!variantFile.is_open()) { 18 | return 1; 19 | } 20 | 21 | cout << variantFile.header << endl; 22 | 23 | Variant var(variantFile); 24 | while (variantFile.getNextVariant(var)) { 25 | map > variants = var.parsedAlternates(); 26 | cout << var << endl; 27 | for (map >::iterator va = variants.begin(); va != variants.end(); ++va) { 28 | cout << " ( " << va->first << " :: "; 29 | vector& vars = va->second; 30 | vector::iterator g = vars.begin(); 31 | for (; g != vars.end(); ++g) { 32 | cout << *g << "; "; 33 | } 34 | cout << " ) "; 35 | } 36 | cout << endl; 37 | } 38 | 39 | return 0; 40 | 41 | } 42 | 43 | -------------------------------------------------------------------------------- /src/vcfprimers.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include "fastahack/Fasta.h" 4 | #include 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | void printSummary(char** argv) { 10 | cerr << "usage: " << argv[0] << " [options] " << endl 11 | << endl 12 | << "options:" << endl 13 | << " -f, --fasta-reference FASTA reference file to use to obtain primer sequences" << endl 14 | << " -l, --primer-length The length of the primer sequences on each side of the variant" << endl 15 | << endl 16 | << "For each VCF record, extract the flanking sequences, and write them to stdout as FASTA" << endl 17 | << "records suitable for alignment. This tool is intended for use in designing validation" << endl 18 | << "experiments. Primers extracted which would flank all of the alleles at multi-allelic" << endl 19 | << "sites. The name of the FASTA \"reads\" indicates the VCF record which they apply to." 
<< endl 20 | << "The form is >CHROM_POS_LEFT for the 3' primer and >CHROM_POS_RIGHT for the 5' primer," << endl 21 | << "for example:" << endl 22 | << endl 23 | << ">20_233255_LEFT" << endl 24 | << "CCATTGTATATATAGACCATAATTTCTTTATCCAATCATCTGTTGATGGA" << endl 25 | << ">20_233255_RIGHT" << endl 26 | << "ACTCAGTTGATTCCATACCTTTGCCATCATGAATCATGTTGTAATAAACA" << endl 27 | << endl; 28 | exit(0); 29 | } 30 | 31 | 32 | int main(int argc, char** argv) { 33 | 34 | int c; 35 | string fastaRef; 36 | int primerLength = 0; 37 | 38 | if (argc == 1) 39 | printSummary(argv); 40 | 41 | while (true) { 42 | static struct option long_options[] = 43 | { 44 | /* These options set a flag. */ 45 | //{"verbose", no_argument, &verbose_flag, 1}, 46 | {"help", no_argument, 0, 'h'}, 47 | {"fasta-reference", required_argument, 0, 'f'}, 48 | {"primer-length", required_argument, 0, 'l'}, 49 | //{"length", no_argument, &printLength, true}, 50 | {0, 0, 0, 0} 51 | }; 52 | /* getopt_long stores the option index here. */ 53 | int option_index = 0; 54 | 55 | c = getopt_long (argc, argv, "hf:l:", 56 | long_options, &option_index); 57 | 58 | /* Detect the end of the options. */ 59 | if (c == -1) 60 | break; 61 | 62 | switch (c) 63 | { 64 | case 0: 65 | /* If this option set a flag, do nothing else now. */ 66 | if (long_options[option_index].flag != 0) 67 | break; 68 | printf ("option %s", long_options[option_index].name); 69 | if (optarg) 70 | printf (" with arg %s", optarg); 71 | printf ("\n"); 72 | break; 73 | 74 | case 'f': 75 | fastaRef = optarg; 76 | break; 77 | 78 | case 'l': 79 | primerLength = atoi(optarg); 80 | break; 81 | 82 | case 'h': 83 | printSummary(argv); 84 | exit(0); 85 | break; 86 | 87 | case '?': 88 | /* getopt_long already printed an error message. 
*/ 89 | printSummary(argv); 90 | exit(1); 91 | break; 92 | 93 | default: 94 | abort (); 95 | } 96 | } 97 | 98 | if (primerLength == 0) { 99 | cerr << "a primer length must be specified" << endl; 100 | exit(1); 101 | } 102 | if (fastaRef.empty()) { 103 | cerr << "a FASTA reference sequence must be specified" << endl; 104 | exit(1); 105 | } 106 | 107 | FastaReference ref; 108 | ref.open(fastaRef); 109 | 110 | VariantCallFile variantFile; 111 | string inputFilename; 112 | if (optind == argc - 1) { 113 | inputFilename = argv[optind]; 114 | variantFile.open(inputFilename); 115 | } else { 116 | variantFile.open(std::cin); 117 | } 118 | 119 | if (!variantFile.is_open()) { 120 | return 1; 121 | } 122 | 123 | Variant var(variantFile); 124 | while (variantFile.getNextVariant(var)) { 125 | // get the ref start and end positions 126 | int refstart = var.position - 1; // convert to 0-based 127 | int refend = var.position + var.ref.size() - 1; 128 | string leftprimer = ref.getSubSequence(var.sequenceName, refstart - primerLength, primerLength); 129 | string rightprimer = ref.getSubSequence(var.sequenceName, refend, primerLength); 130 | //cout << var << endl; 131 | cout << ">" << var.sequenceName << "_" << var.position << "_LEFT" << endl 132 | << leftprimer << endl 133 | << ">" << var.sequenceName << "_" << var.position << "_RIGHT" << endl 134 | << rightprimer << endl; 135 | } 136 | 137 | return 0; 138 | 139 | } 140 | 141 | -------------------------------------------------------------------------------- /src/vcfqual2info.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | 3 | using namespace std; 4 | using namespace vcf; 5 | 6 | int main(int argc, char** argv) { 7 | 8 | VariantCallFile variantFile; 9 | 10 | if (argc == 1) { 11 | cerr << "usage: " << argv[0] << " [key] [vcf_file]" << endl 12 | << "Puts QUAL into an info field tag keyed by [key]." << endl 13 | << "The VCF file may be omitted and read from stdin." 
<< endl; 14 | return 1; 15 | } 16 | 17 | string key = argv[1]; 18 | 19 | if (argc > 2) { 20 | string filename = argv[2]; 21 | variantFile.open(filename); 22 | } else { 23 | variantFile.open(std::cin); 24 | } 25 | 26 | if (!variantFile.is_open()) { 27 | return 1; 28 | } 29 | 30 | variantFile.addHeaderLine("##INFO="); 31 | 32 | cout << variantFile.header << endl; 33 | 34 | Variant var(variantFile); 35 | while (variantFile.getNextVariant(var)) { 36 | var.info[key].clear(); 37 | var.info[key].push_back(convert(var.quality)); 38 | cout << var << endl; 39 | } 40 | 41 | return 0; 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/vcfrandom.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "Variant.h" 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | int main(int argc, char** argv) { 11 | 12 | VariantCallFile variantFile; 13 | 14 | stringstream headerss; 15 | headerss << "##fileformat=VCFv4.0" << endl 16 | << "##source=vcfrandom" << endl 17 | << "##reference=/d2/data/references/build_37/human_reference_v37.fa" << endl 18 | << "##phasing=none" << endl 19 | << "##INFO=" << endl 20 | << "##INFO=" << endl 21 | << "##INFO=" << endl 22 | << "##INFO=" << endl 23 | << "##INFO=" << endl 24 | << "##FORMAT=" << endl 25 | << "##FORMAT=" << endl 26 | << "##FORMAT=" << endl 27 | << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tbill"; 28 | 29 | string header = headerss.str(); 30 | variantFile.openForOutput(header); 31 | 32 | cout << variantFile.header << endl; 33 | 34 | srand(time(NULL)); 35 | 36 | vector atgc; 37 | atgc.push_back("A"); 38 | atgc.push_back("T"); 39 | atgc.push_back("G"); 40 | atgc.push_back("C"); 41 | 42 | for (int i = 1; i < 10; ++i) { 43 | Variant var(variantFile); 44 | var.sequenceName = "one"; 45 | var.id = "."; 46 | var.filter = "."; 47 | var.ref = atgc.at(rand() % 4); 48 | var.quality = 
100; 49 | stringstream s; 50 | s << rand() % 100; 51 | var.info["DP"].push_back(s.str()); 52 | var.format.push_back("GT"); 53 | var.format.push_back("DP"); 54 | var.position = i; 55 | for (vector::iterator s = var.sampleNames.begin(); s != var.sampleNames.end(); ++s) { 56 | string& name = *s; 57 | var.alt.clear(); 58 | var.alt.push_back(atgc.at(rand() % 4)); 59 | var.alt.push_back(atgc.at(rand() % 4)); 60 | var.samples[name]["GT"].push_back("0/1"); 61 | stringstream dp; 62 | dp << floor(rand() % 100); 63 | var.samples[name]["DP"].push_back(dp.str()); 64 | } 65 | cout << var << endl; 66 | } 67 | 68 | return 0; 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/vcfrandomsample.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "BedReader.h" 3 | #include 4 | #include "mt19937ar.h" 5 | #include 6 | #include 7 | #include "convert.h" 8 | 9 | using namespace std; 10 | using namespace vcf; 11 | 12 | 13 | void printSummary(char** argv) { 14 | cerr << "usage: " << argv[0] << " [options] []" << endl 15 | << endl 16 | << "options:" << endl 17 | << " -r, --rate RATE base sampling probability per locus" << endl 18 | << " -s, --scale-by KEY scale sampling likelihood by this Float info field" << endl 19 | << " -p, --random-seed N use this random seed (by default read from /dev/random)" << endl 20 | << " -q, --pseudorandom-seed use a pseudorandom seed (by default read from /dev/random)" << endl 21 | << endl 22 | << "Randomly sample sites from an input VCF file, which may be provided as stdin." << endl 23 | << "Scale the sampling probability by the field specified in KEY. This may be" << endl 24 | << "used to provide uniform sampling across allele frequencies, for instance." 
<< endl; 25 | exit(0); 26 | } 27 | 28 | int main(int argc, char** argv) { 29 | 30 | double rate = 1.0; 31 | int seed = 0; 32 | bool useprng = false; 33 | string scaleByKey; 34 | 35 | if (argc == 1) 36 | printSummary(argv); 37 | 38 | int c; 39 | while (true) { 40 | static struct option long_options[] = 41 | { 42 | {"help", no_argument, 0, 'h'}, 43 | {"rate", required_argument, 0, 'r'}, 44 | {"scale-by", required_argument, 0, 's'}, 45 | {"random-seed", required_argument, 0, 'p'}, 46 | {"pseudorandom-seed", required_argument, 0, 'q'}, 47 | {0, 0, 0, 0} 48 | }; 49 | 50 | int option_index = 0; 51 | c = getopt_long (argc, argv, "hqr:s:p:", 52 | long_options, &option_index); 53 | 54 | if (c == -1) 55 | break; 56 | 57 | switch (c) { 58 | case 'r': 59 | rate = atof(optarg); 60 | break; 61 | 62 | case 's': 63 | scaleByKey = optarg; 64 | break; 65 | 66 | case 'p': 67 | seed = atoi(optarg); 68 | break; 69 | 70 | case 'q': 71 | useprng = true; 72 | break; 73 | 74 | case 'h': 75 | printSummary(argv); 76 | break; 77 | 78 | case '?': 79 | printSummary(argv); 80 | exit(1); 81 | break; 82 | 83 | default: 84 | abort (); 85 | } 86 | } 87 | 88 | VariantCallFile variantFile; 89 | string inputFilename; 90 | if (optind == argc - 1) { 91 | inputFilename = argv[optind]; 92 | variantFile.open(inputFilename); 93 | } else { 94 | variantFile.open(std::cin); 95 | } 96 | 97 | if (!variantFile.is_open()) { 98 | cout << "could not open VCF file" << endl; 99 | return 1; 100 | } 101 | 102 | // seed prng with random bits from /dev/random 103 | if (!seed) { 104 | fstream random; 105 | if (useprng) { 106 | random.open("/dev/urandom", fstream::in); 107 | } else { 108 | random.open("/dev/random", fstream::in); 109 | } 110 | random.get((char*) &seed, sizeof(int)); 111 | random.close(); 112 | } 113 | 114 | init_genrand(seed); 115 | 116 | vector args; 117 | for (int i = 0; i < argc; ++i) { 118 | args.push_back(argv[i]); 119 | } 120 | 121 | stringstream liness; 122 | liness << "##sampling=\"random sampling 
using " 123 | << join(args, " ") 124 | << " using random seed " 125 | << seed << "\""; 126 | variantFile.addHeaderLine(liness.str()); 127 | 128 | cout << variantFile.header << endl; 129 | 130 | // check that we can use the scaling key 131 | if (!scaleByKey.empty()) { 132 | if (variantFile.infoTypes.find(scaleByKey) == variantFile.infoTypes.end()) { 133 | cerr << "could not find info key " << scaleByKey << endl; 134 | exit(1); 135 | } else { 136 | if (variantFile.infoTypes[scaleByKey] != FIELD_FLOAT) { 137 | cerr << "cannot use " << scaleByKey << " as a scaling factor, as it is not of type Float" << endl; 138 | exit(1); 139 | } 140 | } 141 | } 142 | 143 | Variant var(variantFile); 144 | while (variantFile.getNextVariant(var)) { 145 | double randN = genrand_real1(); 146 | if (!scaleByKey.empty()) { 147 | if (var.info.find(scaleByKey) != var.info.end()) { 148 | double val; 149 | 150 | // hack, sum the values of interest if we have multiple values 151 | // really, this is only suitable for AF stuff 152 | vector& vals = var.info[scaleByKey]; 153 | for (vector::iterator b = vals.begin(); b != vals.end(); ++b) { 154 | double f; 155 | convert(*b, f); 156 | val += f; 157 | } 158 | val /= vals.size(); 159 | 160 | if (val > 1) { 161 | cerr << "cannot scale by " << scaleByKey << "=" << val << " as it is > 1" << endl; 162 | exit(1); 163 | } 164 | randN *= val; 165 | } 166 | } 167 | if (randN < rate) { 168 | cout << var << endl; 169 | } 170 | } 171 | 172 | return 0; 173 | 174 | } 175 | -------------------------------------------------------------------------------- /src/vcfremoveaberrantgenotypes.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace std; 8 | using namespace vcf; 9 | 10 | void stripAberrant(Variant& var) { 11 | map > >::iterator s = var.samples.begin(); 12 | while (s != var.samples.end()) { 13 | map >& sample = s->second; 
14 | map genotype = decomposeGenotype(sample["GT"].front()); 15 | int refobs = 0; 16 | convert(sample["RO"].front(), refobs); 17 | if (isHomNonRef(genotype) && refobs > 0) { 18 | var.samples.erase(s); 19 | } else if (isHomRef(genotype)) { 20 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 21 | int alleleIndex = var.altAlleleIndexes[*a]; 22 | int altobs = 0; 23 | convert(sample["AO"].at(alleleIndex), altobs); 24 | if (altobs > 0) { 25 | var.samples.erase(s); 26 | break; 27 | } 28 | } 29 | } 30 | ++s; 31 | } 32 | } 33 | 34 | int main(int argc, char** argv) { 35 | 36 | if (argc != 2) { 37 | cerr << "usage: " << argv[0] << " " << endl 38 | << "strips samples which are homozygous but have observations implying heterozygosity" << endl; 39 | return 1; 40 | } 41 | 42 | string filename = argv[1]; 43 | 44 | VariantCallFile variantFile; 45 | if (filename == "-") { 46 | variantFile.open(std::cin); 47 | } else { 48 | variantFile.open(filename); 49 | } 50 | 51 | if (!variantFile.is_open()) { 52 | cerr << "could not open " << filename << endl; 53 | return 1; 54 | } 55 | 56 | Variant var(variantFile); 57 | 58 | // TODO check if AC is present 59 | // ensure that AC is listed as an info field 60 | string line = "##filter=\"removed homozygous genotypes which have observations implying heterozygosity\">"; 61 | variantFile.addHeaderLine(line); 62 | 63 | // write the new header 64 | cout << variantFile.header << endl; 65 | 66 | // print the records, filtering is done via the setting of varA's output sample names 67 | while (variantFile.getNextVariant(var)) { 68 | stripAberrant(var); 69 | cout << var << endl; 70 | } 71 | 72 | return 0; 73 | 74 | } 75 | 76 | -------------------------------------------------------------------------------- /src/vcfremovesamples.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace 
// Returns the distinct elements of `b` that do not occur in `a`.
// The result is in ascending order (it is read back out of an ordered map),
// not in the order the elements had in `b`.
template<class T>
vector<T> removeElems(vector<T>& a, vector<T>& b) {
    // membership table for everything that has to be dropped
    map<T, bool> excluded;
    for (size_t k = 0; k < a.size(); ++k) {
        excluded[a[k]] = true;
    }

    // survivors, de-duplicated and ordered by the map
    map<T, bool> kept;
    for (size_t k = 0; k < b.size(); ++k) {
        if (excluded.count(b[k]) == 0) {
            kept[b[k]] = true;
        }
    }

    vector<T> survivors;
    typename map<T, bool>::iterator entry = kept.begin();
    while (entry != kept.end()) {
        survivors.push_back(entry->first);
        ++entry;
    }
    return survivors;
}
1 | #include "Variant.h" 2 | 3 | using namespace std; 4 | using namespace vcf; 5 | 6 | int main(int argc, char** argv) { 7 | 8 | VariantCallFile variantFile; 9 | 10 | if (argc > 1) { 11 | string filename = argv[1]; 12 | variantFile.open(filename); 13 | } else { 14 | variantFile.open(std::cin); 15 | } 16 | 17 | if (!variantFile.is_open()) { 18 | return 1; 19 | } 20 | 21 | for (vector::iterator sample = variantFile.sampleNames.begin(); 22 | sample != variantFile.sampleNames.end(); ++sample) { 23 | cout << *sample << endl; 24 | } 25 | 26 | return 0; 27 | 28 | } 29 | 30 | -------------------------------------------------------------------------------- /src/vcfsamplestats.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include "split.h" 3 | #include 4 | #include 5 | #include 6 | 7 | using namespace vcf; 8 | 9 | 10 | void printSummary(char** argv) { 11 | cerr << "usage: " << argv[0] << " [options] " << endl 12 | << "options:" << endl 13 | << endl 14 | << " -h, --help this dialog" << endl 15 | << endl 16 | << "By default, output a table of this form:" << endl 17 | << "sample" << " " 18 | << "sitecount" << " " 19 | << "refcount" << " " 20 | << "altcount" << " " 21 | << "homcount" << " " 22 | << "hetcount" << " " 23 | << "avg_gq" << " " 24 | << "avg_dp" << endl 25 | << endl 26 | << "for each sample in the VCF file." << endl 27 | << "Reads from stdin if no file is specified on the command line." << endl 28 | << endl; 29 | exit(0); 30 | } 31 | 32 | 33 | int main(int argc, char** argv) { 34 | 35 | int c; 36 | //bool outputTotalStats = false; 37 | 38 | while (true) { 39 | static struct option long_options[] = 40 | { 41 | /* These options set a flag. */ 42 | //{"verbose", no_argument, &verbose_flag, 1}, 43 | {"help", no_argument, 0, 'h'}, 44 | //{"totals", no_argument, 0, 't'}, 45 | //{"length", no_argument, &printLength, true}, 46 | {0, 0, 0, 0} 47 | }; 48 | /* getopt_long stores the option index here. 
*/ 49 | int option_index = 0; 50 | 51 | c = getopt_long (argc, argv, "h", 52 | long_options, &option_index); 53 | 54 | /* Detect the end of the options. */ 55 | if (c == -1) 56 | break; 57 | 58 | switch (c) 59 | { 60 | case 0: 61 | /* If this option set a flag, do nothing else now. */ 62 | if (long_options[option_index].flag != 0) 63 | break; 64 | printf ("option %s", long_options[option_index].name); 65 | if (optarg) 66 | printf (" with arg %s", optarg); 67 | printf ("\n"); 68 | break; 69 | 70 | //case 't': 71 | // outputTotalStats = true; 72 | //break; 73 | 74 | case 'h': 75 | printSummary(argv); 76 | exit(0); 77 | break; 78 | 79 | case '?': 80 | /* getopt_long already printed an error message. */ 81 | printSummary(argv); 82 | exit(1); 83 | break; 84 | 85 | default: 86 | abort (); 87 | } 88 | } 89 | 90 | VariantCallFile variantFile; 91 | if (optind == argc - 1) { 92 | string inputFilename = argv[optind]; 93 | variantFile.open(inputFilename); 94 | } else { 95 | variantFile.open(std::cin); 96 | } 97 | 98 | if (!variantFile.is_open()) { 99 | return 1; 100 | } 101 | 102 | 103 | map sitecount; 104 | map refcount; 105 | map altcount; 106 | map homcount; 107 | map hetcount; 108 | map gqsum; 109 | map dpsum; 110 | 111 | for (vector::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { 112 | string& sample = *s; 113 | sitecount[sample] = 0; 114 | refcount[sample] = 0; 115 | altcount[sample] = 0; 116 | homcount[sample] = 0; 117 | hetcount[sample] = 0; 118 | gqsum[sample] = 0; 119 | } 120 | 121 | Variant var(variantFile); 122 | while (variantFile.getNextVariant(var)) { 123 | 124 | for (map > >::iterator s = var.samples.begin(); s != var.samples.end(); ++s) { 125 | 126 | string name = s->first; 127 | map >& sample = s->second; 128 | 129 | sitecount[name] += 1; 130 | 131 | int gq; 132 | if (convert(sample["GQ"].front(), gq)) { 133 | gqsum[name] += gq; 134 | } 135 | 136 | int dp; 137 | if (convert(sample["DP"].front(), dp)) 138 | dpsum[name] 
+= dp; 139 | 140 | string& genotype = sample["GT"].front(); 141 | vector gt = split(genotype, "|/"); 142 | 143 | int alt = 0; 144 | int ref = 0; 145 | 146 | for (vector::iterator g = gt.begin(); g != gt.end(); ++g) { 147 | if (*g != "0") { 148 | ++alt; 149 | } else { 150 | ++ref; 151 | } 152 | } 153 | 154 | if (alt != gt.size()) { 155 | hetcount[name] += alt; 156 | } 157 | 158 | if (alt == gt.size() || ref == gt.size()) { 159 | homcount[name] += 1; 160 | } 161 | 162 | refcount[name] += ref; 163 | altcount[name] += alt; 164 | 165 | } 166 | } 167 | 168 | cout << "sample" << "\t" 169 | << "sitecount" << "\t" 170 | << "refcount" << "\t" 171 | << "altcount" << "\t" 172 | << "homcount" << "\t" 173 | << "hetcount" << "\t" 174 | << "avg_gq" << "\t" 175 | << "avg_dp" << endl; 176 | for (vector::iterator s = variantFile.sampleNames.begin(); s != variantFile.sampleNames.end(); ++s) { 177 | string& sample = *s; 178 | cout << sample << "\t" 179 | 180 | << sitecount[sample] << "\t" 181 | << refcount[sample] << "\t" 182 | << altcount[sample] << "\t" 183 | << homcount[sample] << "\t" 184 | << hetcount[sample] << "\t" 185 | << (float) gqsum[sample] / (float) sitecount[sample] << "\t" 186 | << (float) dpsum[sample] / (float) sitecount[sample] 187 | << endl; 188 | } 189 | 190 | return 0; 191 | 192 | } 193 | 194 | -------------------------------------------------------------------------------- /src/vcfsitesummarize.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | 3 | using namespace std; 4 | using namespace vcf; 5 | 6 | int main(int argc, char** argv) { 7 | 8 | VariantCallFile variantFile; 9 | 10 | if (argc > 1) { 11 | string filename = argv[1]; 12 | variantFile.open(filename); 13 | } else { 14 | variantFile.open(std::cin); 15 | } 16 | 17 | if (!variantFile.is_open()) { 18 | return 1; 19 | } 20 | 21 | // obtain all possible field names 22 | vector infofields; 23 | vector infoflags; 24 | 25 | for (map::iterator i = 
variantFile.infoTypes.begin(); i != variantFile.infoTypes.end(); ++i) { 26 | if (variantFile.infoCounts[i->first] != ALLELE_NUMBER) { 27 | if (i->second == FIELD_BOOL) { 28 | infoflags.push_back(i->first); 29 | } else { 30 | infofields.push_back(i->first); 31 | } 32 | } 33 | } 34 | 35 | // write header 36 | 37 | // defaults 38 | cout << "CHROM\tPOS\tID\tREF\tQUAL\tFILTER"; 39 | 40 | // configurable info field 41 | for (vector::iterator i = infofields.begin(); i != infofields.end(); ++i) { 42 | cout << "\t" << *i; 43 | } 44 | for (vector::iterator i = infoflags.begin(); i != infoflags.end(); ++i) { 45 | cout << "\t" << *i; 46 | } 47 | cout << endl; 48 | 49 | Variant var(variantFile); 50 | while (variantFile.getNextVariant(var)) { 51 | 52 | cout << var.sequenceName << "\t" 53 | << var.position << "\t" 54 | << var.id << "\t" 55 | << var.ref << "\t" 56 | << var.quality << "\t" 57 | << var.filter; 58 | 59 | for (vector::iterator i = infofields.begin(); i != infofields.end(); ++i) { 60 | vector value; 61 | string& name = *i; 62 | map >::iterator f = var.info.find(name); 63 | if (f != var.info.end()) { 64 | value = f->second; 65 | if (value.size() == 1) { 66 | cout << "\t" << value.front(); 67 | } else { 68 | cout << "\t"; // null 69 | } 70 | } else { 71 | cout << "\t"; // null 72 | } 73 | } 74 | 75 | for (vector::iterator i = infoflags.begin(); i != infoflags.end(); ++i) { 76 | string value; 77 | string& name = *i; 78 | map::iterator f = var.infoFlags.find(name); 79 | cout << "\t"; 80 | if (f != var.infoFlags.end()) { 81 | cout << 1; 82 | } else { 83 | cout << 0; 84 | } 85 | } 86 | 87 | cout << endl; 88 | 89 | } 90 | 91 | return 0; 92 | 93 | } 94 | 95 | -------------------------------------------------------------------------------- /src/vcfstreamsort.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include 3 | #include 4 | #include "convert.h" 5 | 6 | using namespace std; 7 | using namespace vcf; 8 | 9 | 
bool listContains(list& l, string& v) { 10 | for (list::iterator i = l.begin(); i != l.end(); ++i) { 11 | if (*i == v) return true; 12 | } 13 | return false; 14 | } 15 | 16 | void printSummary(char** argv) { 17 | cerr << "usage: " << argv[0] << " [options] [vcf file]" << endl 18 | << endl 19 | << "Sorts the input (either stdin or file) using a streaming sort algorithm." 20 | << endl 21 | << "options:" << endl 22 | << endl 23 | << " -h, --help this dialog" << endl 24 | << " -w, --window N number of sites to sort (default 10000)" << endl 25 | << " -a, --all load all sites and then sort in memory" << endl; 26 | } 27 | 28 | int main(int argc, char** argv) { 29 | 30 | VariantCallFile variantFile; 31 | int sortSitesWindow = 10000; 32 | bool sortAll = false; 33 | 34 | int c; 35 | 36 | while (true) { 37 | static struct option long_options[] = 38 | { 39 | /* These options set a flag. */ 40 | //{"verbose", no_argument, &verbose_flag, 1}, 41 | {"help", no_argument, 0, 'h'}, 42 | {"window", required_argument, 0, 'w'}, 43 | {"all", required_argument, 0, 'a'}, 44 | {0, 0, 0, 0} 45 | }; 46 | /* getopt_long stores the option index here. 
*/ 47 | int option_index = 0; 48 | 49 | c = getopt_long (argc, argv, "haw:", 50 | long_options, &option_index); 51 | 52 | if (c == -1) 53 | break; 54 | 55 | string field; 56 | 57 | switch (c) 58 | { 59 | 60 | case 'w': 61 | if (!convert(optarg, sortSitesWindow)) { 62 | cerr << "could not parse --window, -w" << endl; 63 | exit(1); 64 | } 65 | break; 66 | 67 | case 'a': 68 | sortAll = true; 69 | break; 70 | 71 | case 'h': 72 | printSummary(argv); 73 | exit(0); 74 | break; 75 | 76 | default: 77 | break; 78 | } 79 | } 80 | 81 | if (optind == argc - 1) { 82 | string inputFilename = argv[optind]; 83 | variantFile.open(inputFilename); 84 | } else { 85 | variantFile.open(std::cin); 86 | } 87 | 88 | if (!variantFile.is_open()) { 89 | return 1; 90 | } 91 | 92 | cout << variantFile.header << endl; 93 | 94 | map > > > records; 95 | long int back = 0; 96 | int numrecords = 0; 97 | list sequenceNames; 98 | 99 | variantFile.parseSamples = false; 100 | Variant var(variantFile); 101 | while (variantFile.getNextVariant(var)) { 102 | //cerr << "at position " << var.sequenceName << ":" << var.position << endl; 103 | if (!listContains(sequenceNames, var.sequenceName)) { 104 | //cerr << "adding new sequence name " << var.sequenceName << endl; 105 | sequenceNames.push_back(var.sequenceName); 106 | } 107 | records[var.sequenceName][var.position][var.vrepr()].push_back(var); 108 | if (records[var.sequenceName][var.position].size() == 1) ++numrecords; 109 | if (!sortAll && numrecords > sortSitesWindow) { 110 | //cerr << "outputting a position" << endl; 111 | if (records[sequenceNames.front()].empty()) { 112 | //cerr << "end of reference sequence " << sequenceNames.front() << endl; 113 | sequenceNames.pop_front(); 114 | } 115 | map > >& frecords = records[sequenceNames.front()]; 116 | map >& vars = frecords.begin()->second; 117 | for (map >::iterator v = vars.begin(); v != vars.end(); ++v) { 118 | for (vector::iterator s = v->second.begin(); s != v->second.end(); ++s) { 119 | cout << 
s->originalLine << endl; 120 | } 121 | } 122 | frecords.erase(frecords.begin()); 123 | --numrecords; 124 | } 125 | } 126 | //cerr << "done processing input, cleaning up" << endl; 127 | for (list::iterator s = sequenceNames.begin(); s != sequenceNames.end(); ++s) { 128 | map > >& q = records[*s]; 129 | for (map > >::iterator r = q.begin(); r != q.end(); ++r) { 130 | for (map >::iterator v = r->second.begin(); v != r->second.end(); ++v) { 131 | for (vector::iterator s = v->second.begin(); s != v->second.end(); ++s) { 132 | cout << s->originalLine << endl; 133 | } 134 | } 135 | --numrecords; 136 | } 137 | } 138 | //cerr << numrecords << " remain" << endl; 139 | 140 | return 0; 141 | 142 | } 143 | 144 | -------------------------------------------------------------------------------- /src/vcfuniq.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | 3 | using namespace std; 4 | using namespace vcf; 5 | 6 | int main(int argc, char** argv) { 7 | 8 | VariantCallFile variantFile; 9 | 10 | if (argc > 1) { 11 | string filename = argv[1]; 12 | variantFile.open(filename); 13 | } else { 14 | variantFile.open(std::cin); 15 | } 16 | 17 | if (!variantFile.is_open()) { 18 | return 1; 19 | } 20 | 21 | cout << variantFile.header << endl; 22 | 23 | string lastsn; 24 | long int lastpos; 25 | string lastref; 26 | vector lastalt; 27 | 28 | variantFile.parseSamples = false; 29 | Variant var(variantFile); 30 | while (variantFile.getNextVariant(var)) { 31 | if (!lastsn.empty() 32 | && (lastsn == var.sequenceName 33 | && lastpos == var.position 34 | && lastref == var.ref 35 | && lastalt == var.alt)) { 36 | continue; 37 | } else { 38 | lastsn = var.sequenceName; 39 | lastpos = var.position; 40 | lastref = var.ref; 41 | lastalt = var.alt; 42 | cout << var.originalLine << endl; 43 | } 44 | } 45 | 46 | return 0; 47 | 48 | } 49 | 50 | -------------------------------------------------------------------------------- 
/src/vcfuniqalleles.cpp: -------------------------------------------------------------------------------- 1 | #include "Variant.h" 2 | #include 3 | 4 | using namespace std; 5 | using namespace vcf; 6 | 7 | int main(int argc, char** argv) { 8 | 9 | VariantCallFile variantFile; 10 | 11 | if (argc > 1) { 12 | string filename = argv[1]; 13 | variantFile.open(filename); 14 | } else { 15 | variantFile.open(std::cin); 16 | } 17 | 18 | if (!variantFile.is_open()) { 19 | return 1; 20 | } 21 | 22 | cout << variantFile.header << endl; 23 | 24 | string lastsn; 25 | long int lastpos; 26 | string lastref; 27 | vector lastalt; 28 | 29 | Variant var(variantFile); 30 | while (variantFile.getNextVariant(var)) { 31 | set alleles; 32 | vector alleles_to_remove; 33 | for (vector::iterator a = var.alt.begin(); a != var.alt.end(); ++a) { 34 | if (*a != var.ref) { 35 | if (alleles.find(*a) == alleles.end()) { 36 | alleles.insert(*a); 37 | } else { 38 | alleles_to_remove.push_back(*a); 39 | } 40 | } else { 41 | alleles_to_remove.push_back(*a); // same as ref 42 | } 43 | } 44 | for (vector::iterator a = alleles_to_remove.begin(); a != alleles_to_remove.end(); ++a) { 45 | cerr << "removing " << *a << " from " << var.sequenceName << ":" << var.position << endl; 46 | var.removeAlt(*a); 47 | } 48 | cout << var << endl; 49 | } 50 | 51 | return 0; 52 | 53 | } 54 | 55 | --------------------------------------------------------------------------------