├── README.md
└── code
    ├── extract_FORMAT_vcf.r
    └── get_Qvalue.r


/README.md:
--------------------------------------------------------------------------------
  1 | # Tips and tricks for VCF files
  2 | 
  3 | - [Usefull tools](https://github.com/IARC-bioinfo/VCF-tricks#usefull-tools)
  4 | - [Use R VariantAnnotation bioconductor package](https://github.com/IARC-bioinfo/VCF-tricks#use-r-variantannotation-bioconductor-package)
  5 | - [Manually processing VCF in R](https://github.com/IARC-bioinfo/VCF-tricks#manually-processing-vcf-in-r)
  6 | - [Manually processing VCF in bash](https://github.com/IARC-bioinfo/VCF-tricks#manually-processing-vcf-in-bash)
  7 | 
  8 | ## Usefull tools
  9 | ### Samtools organisation and repositories
 10 | - File format [specification](http://samtools.github.io/hts-specs/)
 11 | - Bcftools [github page](https://github.com/samtools/bcftools)
 12 | - Bcftools [webpage](http://samtools.github.io/bcftools/)
 13 | - Bcftools [man page](http://samtools.github.io/bcftools/bcftools.html)
 14 | 
 15 | Compilation (from [here](http://samtools.github.io/bcftools/)):
 16 | ```bash
 17 | git clone --branch=develop git://github.com/samtools/htslib.git
 18 | git clone --branch=develop git://github.com/samtools/bcftools.git
 19 | git clone --branch=develop git://github.com/samtools/samtools.git
 20 | cd bcftools; make
 21 | cd ../samtools; make
 22 | cd ../htslib; make
 23 | ```
 24 | 
 25 | ### Other tools
 26 | - R bioconductor package [Rsamtools](http://bioconductor.org/packages/release/bioc/html/Rsamtools.html)
 27 | - vcflib [github page](https://github.com/ekg/vcflib)
 28 | - vt [wiki](http://genome.sph.umich.edu/wiki/Vt) and [github page](https://github.com/atks/vt)
 29 | - bedtools [documentation](http://bedtools.readthedocs.org) and [github page](https://github.com/arq5x/bedtools2)
 30 | - PyVCF [github page](https://github.com/jamescasbon/PyVCF)
 31 | - VCFtools [webpage](https://vcftools.github.io/) and [github page](https://github.com/vcftools/vcftools). It has been mostly replaced with bcftools but some commands are still only available in VCFtools (in particular [vcf-annotate](https://vcftools.github.io/perl_module.html#vcf-annotate))
 32 | 
 33 | ## Use R VariantAnnotation bioconductor package
 34 | 
 35 | All the commands below assume the package `VariantAnnotation` has been loaded into R using `library(VariantAnnotation)`.
 36 | 
 37 | ### Replace INFO/DP field with GENO/DP field
 38 | ```R
 39 | vcf <- readVcf("test.vcf", "hg19")
 40 | info(vcf)$DP=geno(vcf)$DP
 41 | writeVcf(vcf,"test.vcf")
 42 | ```
 43 | 
 44 | ### Create a new INFO field
 45 | 
 46 | Here it's called `DP_N` and filled with `.` (a dot represents a missing value in VCF files), but it could be anything you like.
 47 | ```R
 48 | vcf <- readVcf("test.vcf", "hg19")
 49 | newInfo <- DataFrame(Number=1, Type="Integer",Description="DP in normal",row.names="DP_N")
 50 | info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
 51 | info(vcf)$DP_N="."
 52 | writeVcf(vcf,"test.vcf")
 53 | ```
 54 | 
 55 | ### Split a multi-sample VCF into n-single sample VCFs
 56 | ```R
 57 | vcf_file = "test.vcf"
 58 | for (cur_sample in samples(scanVcfHeader(vcf_file))) {
 59 |   writeVcf(readVcf(vcf_file, "hg19",ScanVcfParam(sample = cur_sample)),paste(cur_sample,".vcf",sep = ""))
 60 | }
 61 | ```
 62 | 
 63 | ## Manually processing VCF in R
 64 | 
 65 | Look at these functions too: https://github.com/sahilseth/vcfparser
 66 | 
 67 | ### Extract expected Q-value from a needlestack calling
 68 | Once a needlestack calling has been launched, a user may want to compute what the Q-value of a particular sample would be at a particular position.
 69 | This can help control the false negative rate.  
 70 | Given a VCF chunk read with the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) bioconductor package, a sample ID and a particular mutation, [this](https://github.com/IARCbioinfo/VCF-tricks/blob/master/code/get_Qvalue.r) function extracts the corresponding Q-value.
 71 | 
 72 | ### Loading a VCF file as a data frame
 73 | On Unix systems (Mac or Linux), automatically pipe it with `grep` and `sed` to remove the header.
 74 | ```R
 75 | my_vcf=read.table(pipe("grep -v '^##' test.vcf | sed s/^#//"),stringsAsFactors=F,header=T,sep="\t")
 76 | ```
 77 | On Windows, you can manually remove the header lines (starting with `##`) and also the `#` character from the line containing the column names. After that you can read it using:
 78 | ```R
 79 | my_vcf=read.table("test_noheader.vcf",stringsAsFactors=F,header=T,sep="\t")
 80 | ```
 81 | 
 82 | ### Two R functions to extract values from INFO or GENOTYPE fields
 83 | 
 84 | Gist: https://gist.github.com/mfoll/a4dfbb92068dc559f130
 85 | ```R
 86 | get_info=function(info,field,num=T) {
 87 |   get_single_info=function(single_info,field) {
 88 |     grep_res=grep(paste("^",field,"=",sep=""),unlist(strsplit(single_info,";")),value=T)
 89 |     if (length(grep_res)>0) strsplit(grep_res,"=")[[1]][2] else NA
 90 |   }
 91 |   res=unlist(lapply(info,get_single_info,field))
 92 |   if (num) as.numeric(res) else res
 93 | }
 94 | 
 95 | get_genotype=function(genotype,format,field,num=T) {
 96 |   get_single_genotype=function(single_genotype,format,field) {
 97 |     single_res=unlist(strsplit(single_genotype,":"))[which(unlist(strsplit(format,":"))==field)]
 98 |     if (length(single_res)>0) single_res else NA
 99 |   }
100 |   res=unlist(lapply(genotype,get_single_genotype,format,field))
101 |   if (num) as.numeric(res) else res
102 | }
103 | ```
104 | 
105 | Both functions are vectorized (i.e. you can give them a vector of `INFO` fields or a vector of `GENOTYPE` fields). The genotype function requires that you give the format of the field (for example `"GT:AO:DP"`). In both functions the `field` argument indicates which field you want to extract. By default the result is converted to a numeric value, unless you specify `num=FALSE` when you call the functions.
106 | 
107 | ### Get genotype columns and sample names:
108 | ```R
109 | # list of columns containing sample specific data
110 | GT_cols=(which(names(my_vcf)=="FORMAT")+1):ncol(my_vcf)
111 | # extract sample names
112 | SM=names(my_vcf)[GT_cols]
113 | ```
114 | 
115 | ### Get number of variants for each position in a VCF file
116 | The following function returns, for each position in the input VCF file, the number of samples having a `QVAL` greater than or equal to the input threshold (`QVAL_thr`, default=50), _i.e._ the number of variants.  
117 | 
118 | ```R
119 | # after reading a VCF with read.table(), see last example
# Count, for each line of a VCF data frame, the samples whose QVAL reaches a
# threshold.
#
# vcf:      data frame holding a VCF as read by read.table(), with a FORMAT
#           column followed by one column per sample.
# QVAL_thr: threshold; a sample is counted when its QVAL is >= QVAL_thr.
#
# Returns an integer vector with one count per row of `vcf`. Samples whose
# QVAL is missing are not counted. Relies on get_genotype().
get_number_of_variants <- function(vcf, QVAL_thr = 50) {
  format_col <- which(colnames(vcf) == "FORMAT")
  if (length(format_col) != 1 || format_col >= ncol(vcf)) {
    stop("'vcf' must have one FORMAT column followed by sample columns")
  }
  sample_cols <- (format_col + 1):ncol(vcf)
  # seq_len() yields no iteration on an empty table, unlike 1:nrow(vcf).
  vapply(seq_len(nrow(vcf)), function(i) {
    all_QVAL <- unlist(lapply(sample_cols, function(id) {
      get_genotype(vcf[i, id], vcf[i, "FORMAT"], field = "QVAL")
    }))
    # na.rm: one sample without a QVAL must not turn the whole count into NA.
    sum(all_QVAL >= QVAL_thr, na.rm = TRUE)
  }, integer(1))
}
127 | ```
128 | 
129 | ### Using all the above
130 | This assumes that you have a `my_vcf` data frame loaded, the two functions above and the objects `GT_cols` and `SM`.
131 | 
132 | - Extract the variant type (`TYPE`) from all lines from the INFO field:
133 | 
134 |   ```R
135 |   get_info(my_vcf$INFO,"TYPE",num=F)
136 |   ```
137 | - Use it to filter only variants with `TYPE=snv`:
138 | 
139 |   ```R
140 |   my_vcf[which(get_info(my_vcf$INFO,"TYPE",num=F)=="snv"),]
141 |   ```
142 | - Extract coverage (`DP`) of each sample at a given line (1 here):
143 | 
144 |   ```R
145 |   get_genotype(my_vcf[1,GT_cols],my_vcf$FORMAT[1],"DP")
146 |   ```
147 | 
148 | - Extract coverage of all lines of a given sample (`MY_SAMPLE` here):
149 | 
150 |   ```R
151 |   get_genotype(my_vcf[,"MY_SAMPLE"],my_vcf$FORMAT[1],"DP")
152 |   ```
153 | You can replace `"MY_SAMPLE"` with `SM[1]` to take the first sample without manually typing its name (useful if you have only one sample, for example).
154 | 
155 | - Plot the distribution of allelic fraction from the first sample (assuming `AO` and `DP` fields are available in the genotype column):
156 | 
157 |   ```R
158 |   AO=get_genotype(my_vcf[,SM[1]],my_vcf$FORMAT[1],"AO")
159 |   DP=get_genotype(my_vcf[,SM[1]],my_vcf$FORMAT[1],"DP")
160 |   hist(AO/DP)
161 |   ```
162 | 
163 | ### Build a TSV file from a VCF to extract a particular field from FORMAT
164 | 
165 | [extract_FORMAT_vcf.r](https://github.com/IARCbioinfo/VCF-tricks/blob/master/code/extract_FORMAT_vcf.r) is a script which extracts, for each variant in the VCF and for each sample, the value of a `field` in `FORMAT`.  
166 | Example of command line:
167 | ```
168 | Rscript extract_FORMAT_vcf.r --input_vcf=testepic.vcf.gz --field=DS
169 | ```
170 | 
171 | 
172 | ## Manually processing VCF in bash
173 | 
174 | ### Split a n-samples VCF
175 | 
176 | This bash script splits a big VCF of n samples into n VCFs, each named after its sample (save these lines into big_VCF_to_samples.sh).
177 | 
178 | Gist : https://gist.github.com/tdelhomme/cb28dec176b55c43e887
179 | ```bash
#!/bin/bash
# Split a multi-sample VCF into one VCF per sample, each named <sample>.vcf.
# usage : ./big_VCF_to_samples.sh <input big VCF> <result folder>

if [ $# -lt 2 ]; then # both the input VCF and the result folder are required
  echo 'usage : ./big_VCF_to_samples.sh <input big VCF> <result folder>'
else
  mkdir -p "$2"
  # Sample names are the tab-separated columns after FORMAT (column 10 onwards)
  # of the #CHROM header line. They are emitted one per line and read verbatim,
  # so nothing runs when the VCF has no sample column.
  grep "^#CHROM" "$1" | head -1 | awk -F'\t' '{for(i=10;i<=NF;++i)print $i}' |
  while IFS= read -r sample
  do
    bcftools view -s "$sample" "$1" > "$2/$sample.vcf"
  done
fi
193 | ```
194 | 
195 | ### Generate a sorted and merged BED file from positions in a VCF
196 | ```
197 | awk '{ if (!/^#/) print $1"	"$2"	"$2+1}' input.vcf | sort -k1,1 -k2,2n | bedtools merge -i stdin
198 | ```
199 | 
200 | ### Compute number of positions in a bed file
201 | ```
202 | cat mybedfile.bed | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2+1 }END{print SUM}'
203 | ```
204 | 
205 | ### Get the samples ID from a VCF
206 | ```
207 | grep "^#CHROM" input.vcf | tr '\t' '\n' | grep -v -E '#CHROM|POS|ID|REF|ALT|QUAL|FILTER|INFO|FORMAT'
208 | ```
209 | 
210 | ### Downsample a VCF
211 | Here a zipped one (assuming the header is less than 10000 lines), from which we randomly extract 1 million lines:
212 | ```
213 | (zcat human_9606_b150_GRCh38p7.vcf.gz | head -n 10000 | zgrep ^# ; zgrep -v ^# human_9606_b150_GRCh38p7.vcf.gz | shuf -n 1000000 | LC_ALL=C sort -k1,1V -k2,2n) | gzip > human_9606_b150_GRCh38p7_small.vcf.gz
214 | ```
215 | 


--------------------------------------------------------------------------------
/code/extract_FORMAT_vcf.r:
--------------------------------------------------------------------------------
 1 | library(VariantAnnotation)
 2 | 
 3 | args <- commandArgs(TRUE)
 4 | parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
 5 | argsL <- as.list(as.character(as.data.frame(do.call("rbind", parseArgs(args)))$V2))
 6 | names(argsL) <- as.data.frame(do.call("rbind", parseArgs(args)))$V1
 7 | args <- argsL;rm(argsL)
 8 | 
 9 | if(is.null(args$input_vcf))            {stop("no input VCF file")} else {input_vcf = args$input_vcf}
10 | if(is.null(args$field))                {stop("no FORMAT field to query")} else {field=args$field}
11 | if(is.null(args$out_txt))              {out_txt = paste(gsub(".vcf.gz","",input_vcf),"_",field,"_extract.txt",sep="")} else {out_txt=args$out_txt}
12 | if(is.null(args$nb_mut))               {nb_mut = 1000} else {nb_mut = as.numeric(args$nb_mut)}
13 | 
14 | vcf <- open(VcfFile(input_vcf,  yieldSize=nb_mut))
15 | vcf_chunk = readVcf(vcf, "hg19")
16 | 
17 | `#CHR` = c()
18 | `#START` = c()
19 | `#REF` = c()
20 | `#ALT` = c()
21 | `#FILTER` = c()
22 |   
23 | while(dim(vcf_chunk)[1] != 0) {
24 |   `#CHR` = c(`#CHR`, as.character(seqnames(rowRanges(vcf_chunk))))
25 |   `#START` = c(`#START`, start(ranges(rowRanges(vcf_chunk,"seqnames"))))
26 |   `#REF` = c(`#REF`, as.character(rowRanges(vcf_chunk)$REF))
27 |   `#ALT` = c(`#ALT`, as.character(unlist(rowRanges(vcf_chunk)$ALT)))
28 |   `#FILTER` = as.character(unlist(rowRanges(vcf_chunk)$FILTER))
29 |   if( exists("total_vcf_chunk") ) {
30 |     total_vcf_chunk = cbind(total_vcf_chunk, t(geno(vcf_chunk, field)))
31 |   } else { total_vcf_chunk = t(geno(vcf_chunk, field)) }
32 |   vcf_chunk = readVcf(vcf, "hg19")
33 | }
34 | 
35 | res_matrix = rbind(`#CHR`, `#START`, `#REF`, `#ALT`, `#FILTER`, total_vcf_chunk)
36 | 
37 | write.table(data.frame("#CHR:POS" = rownames(res_matrix), res_matrix, check.names = F), file = out_txt, row.names=FALSE, sep="\t", quote = F)
38 | 


--------------------------------------------------------------------------------
/code/get_Qvalue.r:
--------------------------------------------------------------------------------
 1 | # should build a vcf_chunk in a first step, as the following:
 2 | # vcf_file=open(VcfFile(vcf,  yieldSize=10000))
 3 | # vcf_chunk = readVcf(vcf_file, "hg19")
 4 | 
 5 | # mut is in the form: chr:start_ref/alt
 6 | # sm corresponds to one sample as it is defined in the VCF file
 7 | 
 8 | toQvalue <- function(vcf_chunk, sm, mut, VAF_coeff=1){
 9 |   id_sm = which(colnames(geno(vcf_chunk,"GT"))==sm)
10 |   id_mut = which(rownames(geno(vcf_chunk,"GT"))==mut)
11 |   if(isEmpty(id_mut)) return(0)
12 |   mu_est = info(vcf_chunk[id_mut,])$ERR
13 |   sig_est = info(vcf_chunk[id_mut,])$SIG
14 |   all_DP = unlist(as.list(geno(vcf_chunk[id_mut,],"DP")))
15 |   all_AO = unlist(as.list(geno(vcf_chunk[id_mut,],"AO")))
16 |   DPi = all_DP[id_sm]
17 |   AOi = round(all_AO[id_sm] * VAF_coeff)
18 | 
19 |   if(AOi>DPi) return(0)
20 |   res = unlist(-10*log10(p.adjust((dnbinom(c(all_AO,AOi),size=1/sig_est,mu=mu_est*c(all_DP,DPi)) +
21 |                                pnbinom(c(all_AO,AOi),size=1/sig_est,mu=mu_est*c(all_DP,DPi),lower.tail = F)))
22 |                    [length(all_DP)+1]))
23 |   if(res>1000){return(1000)} else {return(res)}
24 | }
25 | 


--------------------------------------------------------------------------------