├── README.md
└── code
├── extract_FORMAT_vcf.r
└── get_Qvalue.r
/README.md:
--------------------------------------------------------------------------------
1 | # Tips and tricks for VCF files
2 |
3 | - [Useful tools](https://github.com/IARC-bioinfo/VCF-tricks#useful-tools)
4 | - [Use R VariantAnnotation bioconductor package](https://github.com/IARC-bioinfo/VCF-tricks#use-r-variantannotation-bioconductor-package)
5 | - [Manually processing VCF in R](https://github.com/IARC-bioinfo/VCF-tricks#manually-processing-vcf-in-r)
6 | - [Manually processing VCF in bash](https://github.com/IARC-bioinfo/VCF-tricks#manually-processing-vcf-in-bash)
7 |
8 | ## Useful tools
9 | ### Samtools organisation and repositories
10 | - File format [specification](http://samtools.github.io/hts-specs/)
11 | - Bcftools [github page](https://github.com/samtools/bcftools)
12 | - Bcftools [webpage](http://samtools.github.io/bcftools/)
13 | - Bcftools [man page](http://samtools.github.io/bcftools/bcftools.html)
14 |
15 | Compilation (from [here](http://samtools.github.io/bcftools/)):
16 | ```bash
17 | git clone --branch=develop git://github.com/samtools/htslib.git
18 | git clone --branch=develop git://github.com/samtools/bcftools.git
19 | git clone --branch=develop git://github.com/samtools/samtools.git
20 | cd bcftools; make
21 | cd ../samtools; make
22 | cd ../htslib; make
23 | ```
24 |
25 | ### Other tools
26 | - R bioconductor package [Rsamtools](http://bioconductor.org/packages/release/bioc/html/Rsamtools.html)
27 | - vcflib [github page](https://github.com/ekg/vcflib)
28 | - vt [wiki](http://genome.sph.umich.edu/wiki/Vt) and [github page](https://github.com/atks/vt)
29 | - bedtools [documentation](http://bedtools.readthedocs.org) and [github page](https://github.com/arq5x/bedtools2)
30 | - PyVCF [github page](https://github.com/jamescasbon/PyVCF)
31 | - VCFtools [webpage](https://vcftools.github.io/) and [github page](https://github.com/vcftools/vcftools). It has been mostly replaced with bcftools but some commands are still only available in VCFtools (in particular [vcf-annotate](https://vcftools.github.io/perl_module.html#vcf-annotate))
32 |
33 | ## Use R VariantAnnotation bioconductor package
34 |
35 | All the commands below assume the package `VariantAnnotation` has been loaded into R using `library(VariantAnnotation)`.
36 |
37 | ### Replace INFO/DP field with GENO/DP field
38 | ```R
39 | vcf <- readVcf("test.vcf", "hg19")
40 | info(vcf)$DP=geno(vcf)$DP
41 | writeVcf(vcf,"test.vcf")
42 | ```
43 |
44 | ### Create a new INFO field
45 |
46 | Here it's called `DP_N` and filled with `.` (a dot represents a missing value in VCF files), but it could be anything you like.
47 | ```R
48 | vcf <- readVcf("test.vcf", "hg19")
49 | newInfo <- DataFrame(Number=1, Type="Integer",Description="DP in normal",row.names="DP_N")
50 | info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
51 | info(vcf)$DP_N="."
52 | writeVcf(vcf,"test.vcf")
53 | ```
54 |
55 | ### Split a multi-sample VCF into n-single sample VCFs
56 | ```R
57 | vcf_file = "test.vcf"
58 | for (cur_sample in samples(scanVcfHeader(vcf_file))) {
59 | writeVcf(readVcf(vcf_file, "hg19",ScanVcfParam(sample = cur_sample)),paste(cur_sample,".vcf",sep = ""))
60 | }
61 | ```
62 |
63 | ## Manually processing VCF in R
64 |
65 | Look at these functions too: https://github.com/sahilseth/vcfparser
66 |
67 | ### Extract expected Q-value from a needlestack calling
68 | Once a needlestack calling has been launched, the user may want to compute what the Q-value of a particular sample at a particular position would be.
69 | This can help control the false negative rate.
70 | Given a VCF chunk from the [VariantAnnotation](https://bioconductor.org/packages/release/bioc/html/VariantAnnotation.html) bioconductor package, a sample ID and a particular mutation, [this](https://github.com/IARCbioinfo/VCF-tricks/blob/master/code/get_Qvalue.r) function extracts the corresponding Q-value.
71 |
72 | ### Loading a VCF file as a data frame
73 | On Unix systems (Mac or Linux), automatically pipe it with `grep` and `sed` to remove the header.
74 | ```R
75 | my_vcf=read.table(pipe("grep -v '^##' test.vcf | sed s/^#//"),stringsAsFactors=F,header=T,sep="\t")
76 | ```
77 | On Windows, you can manually remove the header lines (starting with `##`) and also the `#` character from the line containing the column names. After that you can read it using:
78 | ```R
79 | my_vcf=read.table("test_noheader.vcf",stringsAsFactors=F,header=T,sep="\t")
80 | ```
81 |
82 | ### Two R functions to extract values from INFO or GENOTYPE fields
83 |
84 | Gist: https://gist.github.com/mfoll/a4dfbb92068dc559f130
85 | ```R
# Extract the value of one key from VCF INFO strings.
#
# info:  character vector (or factor) of INFO strings such as "DP=10;TYPE=snv".
# field: name of the INFO key to extract; it is matched literally, not as a
#        regular expression.
# num:   if TRUE (default) the result is converted with as.numeric().
#
# Returns one value per element of `info`. The value is NA when the key is
# absent, has an empty value, or is a flag written without "=value".
get_info <- function(info, field, num = TRUE) {
  prefix <- paste0(field, "=")
  get_single_info <- function(single_info) {
    entries <- strsplit(single_info, ";", fixed = TRUE)[[1]]
    # literal prefix comparison, so metacharacters in `field` are harmless
    hits <- entries[!is.na(entries) & substr(entries, 1, nchar(prefix)) == prefix]
    if (length(hits) == 0) {
      return(NA_character_)
    }
    # keep everything after the first "=" so values containing "=" survive
    value <- substring(hits[1], nchar(prefix) + 1)
    if (nzchar(value)) value else NA_character_
  }
  # as.character() lets factor columns (read.table with stringsAsFactors) work
  res <- vapply(as.character(info), get_single_info, character(1),
                USE.NAMES = FALSE)
  if (num) as.numeric(res) else res
}
94 |
# Extract one FORMAT field from VCF genotype strings.
#
# genotype: genotype strings such as "0/1:10:30"; a character vector, a factor,
#           or a one-row data frame of sample columns.
# format:   FORMAT string(s) such as "GT:AO:DP"; either a single string applied
#           to every genotype, or one string per genotype.
# field:    name of the FORMAT key to extract.
# num:      if TRUE (default) the result is converted with as.numeric().
#
# Returns one value per genotype; NA when the key is not in the FORMAT string
# or the genotype string has no value at that position.
get_genotype <- function(genotype, format, field, num = TRUE) {
  # flatten vectors, factors and data-frame columns to one character vector
  genotype <- unlist(lapply(genotype, as.character), use.names = FALSE)
  format <- as.character(format)
  if (length(genotype) == 0) {
    return(if (num) numeric(0) else character(0))
  }
  if (length(format) != 1 && length(format) != length(genotype)) {
    stop("`format` must have length 1 or the same length as `genotype`")
  }
  format <- rep_len(format, length(genotype))
  get_single_genotype <- function(i) {
    keys <- strsplit(format[i], ":", fixed = TRUE)[[1]]
    values <- strsplit(genotype[i], ":", fixed = TRUE)[[1]]
    # match() yields NA for an unknown key and indexing past the end yields NA,
    # so the result is always exactly one string
    values[match(field, keys)]
  }
  res <- vapply(seq_along(genotype), get_single_genotype, character(1))
  if (num) as.numeric(res) else res
}
103 | ```
104 |
105 | Both functions are vectorized (i.e. you can give them a vector of `INFO` fields or a vector of `GENOTYPE` fields). The genotype function requires that you give the format of the field (for example `"GT:AO:DP"`). In both functions the `field` argument indicates which field you want to extract. By default the result is converted to a numeric value, unless you specify `num=FALSE` when you call the functions.
106 |
107 | ### Get genotype columns and sample names:
108 | ```R
109 | # list of columns containing sample specific data
110 | GT_cols=(which(names(my_vcf)=="FORMAT")+1):ncol(my_vcf)
111 | # extract sample names
112 | SM=names(my_vcf)[GT_cols]
113 | ```
114 |
115 | ### Get number of variants for each position in a VCF file
116 | The following function returns, for each position in the input VCF file, the number of samples having a `QVAL` greater than or equal to the input threshold (`QVAL_thr`, default=50), _i.e._ the number of variants.
117 |
118 | ```R
119 | # after reading a VCF with read.table(), see last example
# Count, for every row of a VCF data frame, the samples whose QVAL reaches a
# threshold.
#
# vcf:      data frame holding one VCF record per row, with a "FORMAT" column
#           followed by one column per sample.
# QVAL_thr: a sample is counted when its QVAL is >= this value (default 50).
#
# Returns an integer vector with one count per row (empty for an empty data
# frame). Samples without a usable QVAL are not counted. Relies on
# get_genotype() being defined.
get_number_of_variants <- function(vcf, QVAL_thr = 50) {
  format_col <- which(colnames(vcf) == "FORMAT")
  if (length(format_col) != 1) {
    stop("`vcf` must contain exactly one FORMAT column")
  }
  # seq_len() keeps this empty when FORMAT is the last column
  sample_cols <- format_col + seq_len(ncol(vcf) - format_col)
  vapply(seq_len(nrow(vcf)), function(i) {
    all_QVAL <- vapply(sample_cols, function(id) {
      get_genotype(vcf[i, id], vcf[i, "FORMAT"], field = "QVAL")
    }, numeric(1))
    # na.rm: one sample with a missing QVAL must not turn the count into NA
    sum(all_QVAL >= QVAL_thr, na.rm = TRUE)
  }, integer(1))
}
127 | ```
128 |
129 | ### Using all the above
130 | This assumes that you have a `my_vcf` data frame loaded, the two functions above and the objects `GT_cols` and `SM`.
131 |
132 | - Extract the variant type (`TYPE`) from all lines from the INFO field:
133 |
134 | ```R
135 | get_info(my_vcf$INFO,"TYPE",num=F)
136 | ```
137 | - Use it to filter only variants with `TYPE=snv`:
138 |
139 | ```R
140 | my_vcf[which(get_info(my_vcf$INFO,"TYPE",num=F)=="snv"),]
141 | ```
142 | - Extract coverage (`DP`) of each sample at a given line (1 here):
143 |
144 | ```R
145 | get_genotype(my_vcf[1,GT_cols],my_vcf$FORMAT[1],"DP")
146 | ```
147 |
148 | - Extract coverage of all lines of a given sample (`MY_SAMPLE` here):
149 |
150 | ```R
151 | get_genotype(my_vcf[,"MY_SAMPLE"],my_vcf$FORMAT[1],"DP")
152 | ```
153 | You can replace `"MY_SAMPLE"` with `SM[1]` to take the first sample without manually typing its name (useful if you have only one, for example).
154 |
155 | - Plot the distribution of allelic fraction from the first sample (assuming `AO` and `DP` fields are available in the genotype column):
156 |
157 | ```R
158 | AO=get_genotype(my_vcf[,SM[1]],my_vcf$FORMAT[1],"AO")
159 | DP=get_genotype(my_vcf[,SM[1]],my_vcf$FORMAT[1],"DP")
160 | hist(AO/DP)
161 | ```
162 |
163 | ### Build a TSV file from a VCF to extract a particular field from FORMAT
164 |
165 | [extract_FORMAT_vcf.r](https://github.com/IARCbioinfo/VCF-tricks/blob/master/code/extract_FORMAT_vcf.r) is a script which extracts, for each variant in the VCF and for each sample, the value of a `field` in `FORMAT`.
166 | Example of command line:
167 | ```
168 | Rscript extract_FORMAT_vcf.r --input_vcf=testepic.vcf.gz --field=DS
169 | ```
170 |
171 |
172 | ## Manually processing VCF in bash
173 |
174 | ### Split a n-samples VCF
175 |
176 | This bash script splits a big VCF containing n samples into n single-sample VCFs, each named after its sample (save these lines into big_VCF_to_samples.sh)
177 |
178 | Gist : https://gist.github.com/tdelhomme/cb28dec176b55c43e887
179 | ```bash
#!/bin/bash
# Split a multi-sample VCF into one VCF per sample.
#   $1: input VCF
#   $2: output directory (created if needed); each file is named <sample>.vcf

if [ $# -ne 2 ]; then # both the input VCF and the output directory are required
    echo 'usage : ./big_VCF_to_samples.sh <input.vcf> <output_dir>' >&2
    exit 1
fi

mkdir -p "$2"
# "bcftools query -l" prints one sample name per line; reading line by line
# avoids word splitting on the sample list
bcftools query -l "$1" | while IFS= read -r sample; do
    bcftools view -s "$sample" "$1" > "$2/$sample.vcf"
done
193 | ```
194 |
195 | ### Generate a sorted and merged BED file from positions in a VCF
196 | ```
# BED is tab-separated, 0-based and half-open: VCF position p becomes [p-1, p)
awk 'BEGIN { OFS = "\t" } !/^#/ { print $1, $2 - 1, $2 }' input.vcf | sort -k1,1 -k2,2n | bedtools merge -i stdin
198 | ```
199 |
200 | ### Compute number of positions in a bed file
201 | ```
# BED intervals are half-open, so each one spans end - start positions
awk -F'\t' '{ SUM += $3 - $2 } END { print SUM + 0 }' mybedfile.bed
203 | ```
204 |
205 | ### Get the samples ID from a VCF
206 | ```
# sample names are the columns after FORMAT (10th column onwards); selecting by
# position keeps samples whose name contains e.g. "ID" or "POS"
grep -m1 "^#CHROM" input.vcf | cut -f10- | tr '\t' '\n'
208 | ```
209 |
210 | ### Downsample a VCF
211 | Here a zipped one (assuming the header is less than 10000 lines), from which we randomly extract 1 million lines:
212 | ```
213 | (zcat human_9606_b150_GRCh38p7.vcf.gz | head -n 10000 | zgrep ^# ; zgrep -v ^# human_9606_b150_GRCh38p7.vcf.gz | shuf -n 1000000 | LC_ALL=C sort -k1,1V -k2,2n) | gzip > human_9606_b150_GRCh38p7_small.vcf.gz
214 | ```
215 |
--------------------------------------------------------------------------------
/code/extract_FORMAT_vcf.r:
--------------------------------------------------------------------------------
# Extract one FORMAT field from a VCF into a tab-separated table.
#
# Usage:
#   Rscript extract_FORMAT_vcf.r --input_vcf=in.vcf.gz --field=DS \
#       [--out_txt=out.txt] [--nb_mut=1000]
#
# The output has one column per variant. Its first five rows are #CHR, #START,
# #REF, #ALT and #FILTER; they are followed by one row per sample holding the
# value of the requested FORMAT field. The VCF is read nb_mut records at a time.
library(VariantAnnotation)

# Turn "--key=value" arguments into a named list. The value is everything after
# the first "=", so values may contain "="; arguments without "=" get NA.
parseArgs <- function(x) {
  x <- sub("^--", "", x)
  has_value <- grepl("=", x, fixed = TRUE)
  values <- as.list(ifelse(has_value, sub("^[^=]*=", "", x), NA_character_))
  names(values) <- sub("=.*$", "", x)
  values
}
args <- parseArgs(commandArgs(TRUE))

# Exact-name lookup ([[ ]] avoids the partial matching of `$`); NULL when the
# argument is absent or was given without a value.
get_arg <- function(name) {
  value <- args[[name]]
  if (is.null(value) || is.na(value)) NULL else value
}

input_vcf <- get_arg("input_vcf")
if (is.null(input_vcf)) stop("no input VCF file")
field <- get_arg("field")
if (is.null(field)) stop("no FORMAT field to query")
out_txt <- get_arg("out_txt")
if (is.null(out_txt)) {
  # strip a literal trailing ".vcf.gz" only
  out_txt <- paste0(sub("\\.vcf\\.gz$", "", input_vcf), "_", field, "_extract.txt")
}
nb_mut <- get_arg("nb_mut")
nb_mut <- if (is.null(nb_mut)) 1000L else as.integer(nb_mut)
if (is.na(nb_mut) || nb_mut < 1L) stop("nb_mut must be a positive integer")

vcf <- open(VcfFile(input_vcf, yieldSize = nb_mut))
vcf_chunk <- readVcf(vcf, "hg19")

if (!field %in% names(geno(vcf_chunk))) {
  close(vcf)
  stop("FORMAT field '", field, "' not found in ", input_vcf)
}

# One list element per chunk, combined once after the loop, so every
# annotation row accumulates across chunks and nothing is re-copied per chunk.
chr <- list()
pos <- list()
ref <- list()
alt <- list()
flt <- list()
values <- list()
n_chunks <- 0L

while (dim(vcf_chunk)[1] != 0) {
  n_chunks <- n_chunks + 1L
  ranges_chunk <- rowRanges(vcf_chunk)
  chr[[n_chunks]] <- as.character(seqnames(ranges_chunk))
  pos[[n_chunks]] <- start(ranges_chunk)
  ref[[n_chunks]] <- as.character(ranges_chunk$REF)
  # one string per record: multi-allelic ALT alleles are joined with ","
  # instead of being unlisted, which would shift the following columns
  alt[[n_chunks]] <- vapply(
    as.list(ranges_chunk$ALT),
    function(alleles) paste(as.character(alleles), collapse = ","),
    character(1),
    USE.NAMES = FALSE
  )
  flt[[n_chunks]] <- as.character(unlist(ranges_chunk$FILTER))
  # samples in rows, variants in columns; t() needs a matrix
  # NOTE(review): FORMAT fields with several values per sample look unsupported
  values[[n_chunks]] <- t(geno(vcf_chunk, field))
  vcf_chunk <- readVcf(vcf, "hg19")
}
close(vcf)

if (n_chunks == 0L) stop("no variant found in ", input_vcf)

# the names given here become the row names, i.e. the first output column
res_matrix <- rbind(
  "#CHR" = unlist(chr, use.names = FALSE),
  "#START" = unlist(pos, use.names = FALSE),
  "#REF" = unlist(ref, use.names = FALSE),
  "#ALT" = unlist(alt, use.names = FALSE),
  "#FILTER" = unlist(flt, use.names = FALSE),
  do.call(cbind, values)
)

write.table(
  data.frame("#CHR:POS" = rownames(res_matrix), res_matrix, check.names = FALSE),
  file = out_txt, row.names = FALSE, sep = "\t", quote = FALSE
)
38 |
--------------------------------------------------------------------------------
/code/get_Qvalue.r:
--------------------------------------------------------------------------------
1 | # should build a vcf_chunk in a first step, as the following:
2 | # vcf_file=open(VcfFile(vcf, yieldSize=10000))
3 | # vcf_chunk = readVcf(vcf_file, "hg19")
4 |
5 | # mut is in the form: chr:start_ref/alt
6 | # sm corresponds to one sample as it is defined in the VCF file
7 |
#' Q-value of one sample at one mutation of a VCF chunk
#'
#' Appends the (optionally rescaled) observation of sample `sm` to the AO/DP
#' values of all samples at `mut`, computes for each one the upper-tail
#' probability P(X >= AO) of a negative binomial with `size = 1 / SIG` and
#' `mu = ERR * DP`, adjusts the p-values with `p.adjust()` and converts the
#' appended one to `-10 * log10(p)`.
#'
#' @param vcf_chunk VCF object as returned by `readVcf()`, with INFO fields
#'   `ERR` and `SIG` and FORMAT fields `GT`, `DP` and `AO`.
#' @param sm Sample name, as it appears in the VCF.
#' @param mut Row name of the mutation, in the form `chr:start_ref/alt`.
#' @param VAF_coeff Factor applied to the sample's AO before rounding
#'   (default 1, i.e. the observed count).
#' @return The Q-value, capped at 1000; 0 when `mut` is not in the chunk or
#'   when the rescaled AO exceeds DP; `NA` when AO or DP is missing for `sm`.
toQvalue <- function(vcf_chunk, sm, mut, VAF_coeff = 1) {
  gt <- geno(vcf_chunk, "GT")
  id_sm <- which(colnames(gt) == sm)
  id_mut <- which(rownames(gt) == mut)
  if (length(id_mut) == 0) {
    return(0)
  }
  # without this check the comparison below fails on a zero-length condition
  if (length(id_sm) != 1) {
    stop("sample '", sm, "' must match exactly one sample of the VCF")
  }

  record <- vcf_chunk[id_mut, ]
  mu_est <- info(record)$ERR
  sig_est <- info(record)$SIG
  all_DP <- unlist(as.list(geno(record, "DP")))
  all_AO <- unlist(as.list(geno(record, "AO")))
  DPi <- all_DP[id_sm]
  AOi <- round(all_AO[id_sm] * VAF_coeff)

  if (is.na(AOi) || is.na(DPi)) {
    return(NA_real_)
  }
  if (AOi > DPi) {
    return(0)
  }

  # the rescaled observation is added as one extra sample, placed last
  counts <- c(all_AO, AOi)
  depths <- c(all_DP, DPi)
  # P(X = k) + P(X > k) = P(X >= k)
  p_values <- dnbinom(counts, size = 1 / sig_est, mu = mu_est * depths) +
    pnbinom(counts, size = 1 / sig_est, mu = mu_est * depths, lower.tail = FALSE)
  # p.adjust() runs with its default method (Holm)
  # NOTE(review): confirm this matches the correction used to produce the
  # Q-values stored in the VCF
  res <- unlist(-10 * log10(p.adjust(p_values)[length(p_values)]))
  min(res, 1000)
}
25 |
--------------------------------------------------------------------------------