├── NAMESPACE
├── QuASAR.Rproj
├── DESCRIPTION
├── man
    ├── PrepForGenotyping.Rd
    ├── comp.rho.Rd
    ├── logLikBetaBinomial2.Rd
    ├── gLogLikBetaBinomial.Rd
    ├── lrtEpsRhoBinom.Rd
    ├── logLikBetaBinomialRhoEps.Rd
    ├── fitAseNull.Rd
    ├── UnionExtractFields.Rd
    ├── aseInference.Rd
    └── fitAseNullMulti.Rd
├── mpra
    ├── mpra.R
    ├── README.md
    └── preprocess.R
├── R
    ├── comp.rho.R
    ├── logLikBetaBinomial2.R
    ├── logLikBetaBinomialRhoEps.R
    ├── QuASAR-package.R
    ├── gLogLikBetaBinomial.R
    ├── lrtEpsRhoBinom.R
    ├── PrepForGenotyping.R
    ├── UnionExtractFields.R
    ├── fitQuasarMpra.R
    ├── fitAseNull.R
    ├── fitAseNullMulti.R
    └── aseInference.R
├── LICENSE
├── scripts
    ├── exampleWorkflow.R
    └── convertPileupToQuasar.R
└── README.md


/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2 (4.0.1): do not edit by hand
 2 | 
 3 | export(PrepForGenotyping)
 4 | export(UnionExtractFields)
 5 | export(aseInference)
 6 | export(comp.rho)
 7 | export(fitAseNull)
 8 | export(fitAseNullMulti)
 9 | export(gLogLikBetaBinomial)
10 | export(logLikBetaBinomial2)
11 | export(logLikBetaBinomialRhoEps)
12 | export(lrtEpsRhoBinom)
13 | export(fitQuasarMpra)


--------------------------------------------------------------------------------
/QuASAR.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: QuASAR
 2 | Type: Package
 3 | Title: Quantitative Allele Specific Analysis of Reads
 4 | Version: 0.1
 5 | Date: 2014-06-16
 6 | Author: Chris Harvey, Greg Moyerbrailean and Roger Pique-Regi
 7 | Maintainer: Roger Pique-Regi <rpique@gmail.com>
 8 | Description: Joint genotyping, error estimation, and Allele Specific Expression
 9 |     inference for multiple RNA-Seq samples.
10 | License: MIT + file LICENSE
11 | 


--------------------------------------------------------------------------------
/man/PrepForGenotyping.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{PrepForGenotyping}
 3 | \alias{PrepForGenotyping}
 4 | \title{PrepForGenotyping}
 5 | \usage{
 6 | PrepForGenotyping(ase.dat, min.coverage, dampen.priors = TRUE)
 7 | }
 8 | \arguments{
 9 | \item{ase.dat}{data from UnionExtractFields.R}
10 | 
11 | \item{min.coverage}{minimum coverage across all combined samples for locus inclusion}
12 | 
13 | \item{dampen.priors}{prevents underflow}
14 | }
15 | \value{
16 | a list with reference allele counts, alternate allele counts, a matrix of genotype priors, and annotations
17 | }
18 | \description{
19 | PrepForGenotyping
20 | }
21 | 
22 | 


--------------------------------------------------------------------------------
/man/comp.rho.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{comp.rho}
 3 | \alias{comp.rho}
 4 | \title{rho calculation}
 5 | \usage{
 6 | comp.rho(ref, alt, eps)
 7 | }
 8 | \arguments{
 9 | \item{ref}{integer. reference allele count}
10 | 
11 | \item{alt}{integer. alternate allele count}
12 | 
13 | \item{eps}{float. error estimate in [0,1]}
14 | }
15 | \value{
16 | rho float. estimate of the reference allele frequency
17 | }
18 | \description{
19 | given reference and alternate read counts, and an error estimate, return the
20 | estimated reference allele frequency (with eps = 0 this is ref / (ref + alt))
21 | }
22 | 
23 | 


--------------------------------------------------------------------------------
/mpra/mpra.R:
--------------------------------------------------------------------------------
 1 | ## QuASAR-MPRA example ###############
 2 | 
 3 | library(QuASAR)
 4 | library(qqman)
 5 | 
 6 | ## The input table is produced by preprocess.R; a ready-made copy is
 7 | ## read here from a text file so the example can be run directly.
 8 | HepG2 <- read.table("http://genome.grid.wayne.edu/quasar/samplempra/HepG2.mpra.txt", header = TRUE, sep = "\t", as.is = TRUE)
 9 | 
10 | ## Fit the QuASAR model using columns R, A and DNA_prop of the table
11 | HepG2.res <- with(HepG2, fitQuasarMpra(R, A, DNA_prop))
12 | 
13 | ## Number of significant hits at 10% FDR
14 | sum(HepG2.res$padj_quasar < 0.1)
15 | 
16 | ## QQ-plot of the p-values
17 | qq(HepG2.res$pval3)
18 | 
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/R/comp.rho.R:
--------------------------------------------------------------------------------
 1 | #' @title rho calculation
 2 | #'
 3 | #' @description
 4 | #' Given reference and alternate read counts and an error estimate, return the
 5 | #' error-corrected estimate of the reference allele frequency. With eps = 0
 6 | #' this is simply ref / (ref + alt); the result is clamped to [0, 1].
 7 | #'
 8 | #' @param ref integer. reference allele count
 9 | #' @param alt integer. alternate allele count
10 | #' @param eps float. error estimate in [0,1]
11 | #' @return rho float. estimate of the reference allele frequency
12 | #' @export
13 | comp.rho <- function(ref,alt,eps){
14 |   depth <- ref + alt
15 |   corrected.ref <- ref * (1 - eps) - eps * alt
16 |   rho <- corrected.ref / (depth * (1 - 2 * eps))
17 |   ## clamp estimates pushed outside [0, 1] by the error correction
18 |   rho[rho>1] <- 1
19 |   rho[rho<0] <- 0
20 |   rho
21 | }
22 | 


--------------------------------------------------------------------------------
/man/logLikBetaBinomial2.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{logLikBetaBinomial2}
 3 | \alias{logLikBetaBinomial2}
 4 | \title{log-likelihood of beta-binomial model for heterozygotes}
 5 | \usage{
 6 | logLikBetaBinomial2(logit.p, D, R, A)
 7 | }
 8 | \arguments{
 9 | \item{logit.p}{logit(rho), a real number}
10 | 
11 | \item{D}{dispersion estimate}
12 | 
13 | \item{R}{reference read count}
14 | 
15 | \item{A}{alternate read count}
16 | }
17 | \value{
18 | negative log likelihood, summed over all observations
19 | }
20 | \description{
21 | returns the log likelihood of reference & alternate read count data
22 | given rho, & dispersion.
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/R/logLikBetaBinomial2.R:
--------------------------------------------------------------------------------
 1 | #' @title log-likelihood of beta-binomial model for heterozygotes
 2 | #'
 3 | #' @description
 4 | #' Evaluates the beta-binomial log-likelihood of reference & alternate read
 5 | #' counts given logit(rho) and a dispersion, and returns its negative sum so
 6 | #' that it can be handed to a minimizer. The binomial coefficient, which does
 7 | #' not depend on rho, is left out.
 8 | #'
 9 | #' @param logit.p logit(rho); any real number
10 | #' @param D dispersion estimate
11 | #' @param R reference read count
12 | #' @param A alternate read count
13 | #' @return negative log likelihood, summed over all observations
14 | #' @export
15 | logLikBetaBinomial2 <- function(logit.p,D,R,A){
16 |   rho <- plogis(logit.p)
17 |   ## beta shape parameters for the reference and the alternate allele
18 |   shape.ref <- rho * D
19 |   shape.alt <- (1 - rho) * D
20 |   loglik <- lgamma(R + shape.ref) + lgamma(A + shape.alt) - lgamma(R + A + D) -
21 |     lgamma(shape.ref) - lgamma(shape.alt) + lgamma(D)
22 |   -sum(loglik)
23 | }
24 | 


--------------------------------------------------------------------------------
/man/gLogLikBetaBinomial.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{gLogLikBetaBinomial}
 3 | \alias{gLogLikBetaBinomial}
 4 | \title{gradient of log-likelihood of beta-binomial model}
 5 | \usage{
 6 | gLogLikBetaBinomial(logit.p, D, R, A)
 7 | }
 8 | \arguments{
 9 | \item{logit.p}{logit(rho)}
10 | 
11 | \item{D}{dispersion estimate}
12 | 
13 | \item{R}{reference read count}
14 | 
15 | \item{A}{alternate read count}
16 | }
17 | \value{
18 | gradient evaluated at logit(rho)
19 | }
20 | \description{
21 | returns the gradient of the log likelihood of reference & alternate
22 | read count data given logit(rho) and a dispersion estimate.
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/man/lrtEpsRhoBinom.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{lrtEpsRhoBinom}
 3 | \alias{lrtEpsRhoBinom}
 4 | \title{critical value for likelihood ratio test with misspecified genotypes}
 5 | \usage{
 6 | lrtEpsRhoBinom(ref, alt, eps, rho)
 7 | }
 8 | \arguments{
 9 | \item{ref}{reference read count}
10 | 
11 | \item{alt}{alternate read count}
12 | 
13 | \item{eps}{estimate of error}
14 | 
15 | \item{rho}{rho estimate}
16 | }
17 | \value{
18 | llrt critical value
19 | }
20 | \description{
21 | returns the critical value of llrt comparing with
22 | H0: the het is mis-specified or rho=0.5 & H1: the het has imbalance with estimated rho
23 | }
24 | 
25 | 


--------------------------------------------------------------------------------
/man/logLikBetaBinomialRhoEps.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{logLikBetaBinomialRhoEps}
 3 | \alias{logLikBetaBinomialRhoEps}
 4 | \title{log-likelihood of beta-binomial model}
 5 | \usage{
 6 | logLikBetaBinomialRhoEps(rho, eps, D, R, A)
 7 | }
 8 | \arguments{
 9 | \item{rho}{rho value in [0, 1]}
10 | 
11 | \item{eps}{estimate of error}
12 | 
13 | \item{D}{dispersion estimate}
14 | 
15 | \item{R}{reference read count}
16 | 
17 | \item{A}{alternate read count}
18 | }
19 | \value{
20 | log likelihood
21 | }
22 | \description{
23 | returns the log likelihood of reference & alternate read count data
24 | given rho, dispersion, and error
25 | }
26 | 
27 | 


--------------------------------------------------------------------------------
/R/logLikBetaBinomialRhoEps.R:
--------------------------------------------------------------------------------
 1 | #' @title log-likelihood of beta-binomial model
 2 | #'
 3 | #' @description
 4 | #' Returns, for every observation, the beta-binomial log likelihood of the
 5 | #' reference & alternate read counts given rho, dispersion, and error.
 6 | #' The binomial coefficient term is not included.
 7 | #'
 8 | #' @param rho rho value in [0, 1]
 9 | #' @param eps estimate of error
10 | #' @param D dispersion estimate
11 | #' @param R reference read count
12 | #' @param A alternate read count
13 | #' @return log likelihood (one value per observation, not summed)
14 | #' @export
15 | logLikBetaBinomialRhoEps <- function(rho,eps,D,R,A){
16 |   ## probability of a reference read once the error rate eps is mixed in
17 |   p.ref <- (rho*(1-eps)+(1-rho)*eps)
18 |   shape.ref <- p.ref * D
19 |   shape.alt <- (1 - p.ref) * D
20 |   lgamma(R + shape.ref) + lgamma(A + shape.alt) - lgamma(R + A + D) -
21 |     lgamma(shape.ref) - lgamma(shape.alt) + lgamma(D)
22 | }
23 | 


--------------------------------------------------------------------------------
/R/QuASAR-package.R:
--------------------------------------------------------------------------------
 1 | ##' Quantitative allele-specific analysis of reads
 2 | ##' 
 3 | ##' \tabular{ll}{
 4 | ##' Package: \tab QuASAR\cr
 5 | ##' Type: \tab Package\cr
 6 | ##' Version: \tab 0.1\cr
 7 | ##' Date: \tab 28-07-2014\cr
 8 | ##' License: \tab MIT + file LICENSE \cr
 9 | ##' LazyLoad: \tab yes\cr
10 | ##' }
11 | ##' 
12 | ##' Joint genotyping, error estimation, and Allele Specific Expression
13 | ##' inference for multiple RNA-Seq samples.
14 | ##' 
15 | ##' @name QuASAR-package
16 | ##' @aliases QuASAR
17 | ##' @docType package
18 | ##' @title Quantitative allele-specific analysis of reads
19 | ##' @author Roger Pique-Regi \email{rpique@wayne.edu}, Chris Harvey \email{fl9788@wayne.edu}, Gregory Moyerbrailean \email{ex7689@wayne.edu}
20 | ##' 
21 | ##roxygen()
22 | 


--------------------------------------------------------------------------------
/R/gLogLikBetaBinomial.R:
--------------------------------------------------------------------------------
 1 | #' @title gradient of log-likelihood of beta-binomial model
 2 | #'
 3 | #' @description
 4 | #' Returns the gradient, with respect to logit(rho), of the negative
 5 | #' beta-binomial log likelihood of reference & alternate read count data
 6 | #' given logit(rho) and a dispersion estimate.
 7 | #'
 8 | #' @param logit.p logit(rho)
 9 | #' @param D dispersion estimate
10 | #' @param R reference read count
11 | #' @param A alternate read count
12 | #' @return gradient evaluated at logit(rho), as a matrix
13 | #' @export
14 | gLogLikBetaBinomial <- function(logit.p,D,R,A){
15 |   rho <- plogis(logit.p)
16 |   ## chain rule: d rho / d logit(rho) = rho * (1 - rho)
17 |   chain <- rho * (1 - rho)
18 |   digamma.part <- digamma(R + rho * D) - digamma(A + (1 - rho) * D) -
19 |     digamma(rho * D) + digamma((1 - rho) * D)
20 |   grad <- chain * D * digamma.part
21 |   ## sign flipped so that the value refers to the negative log likelihood
22 |   as.matrix(-grad)
23 | }
24 | 


--------------------------------------------------------------------------------
/man/fitAseNull.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{fitAseNull}
 3 | \alias{fitAseNull}
 4 | \title{fit QuASAR for a single sample}
 5 | \usage{
 6 | fitAseNull(ref, alt, eps = 0.1, log.gmat, max.it = 100, tol = 1e-16,
 7 |   fixGprior = TRUE)
 8 | }
 9 | \arguments{
10 | \item{ref}{reference read count}
11 | 
12 | \item{alt}{alternate read count}
13 | 
14 | \item{eps}{starting value}
15 | 
16 | \item{log.gmat}{log of genotype priors}
17 | 
18 | \item{max.it}{maximum number of iterations for algorithm}
19 | 
20 | \item{tol}{tolerance to assess convergence}
21 | 
22 | \item{fixGprior}{logical to fix genotype priors across all steps of the algorithm}
23 | }
24 | \value{
25 | list of genotypes, log genotypes, error estimate, log-likelihood, and the sum of
26 | log likelihoods
27 | }
28 | \description{
29 | for a single sample conduct genotyping and estimate sample error
30 | }
31 | 
32 | 


--------------------------------------------------------------------------------
/man/UnionExtractFields.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{UnionExtractFields}
 3 | \alias{UnionExtractFields}
 4 | \title{UnionExtractFields}
 5 | \usage{
 6 | UnionExtractFields(fileList, combine = FALSE)
 7 | }
 8 | \arguments{
 9 | \item{fileList}{List of files *.quasar.in.gz}
10 | 
11 | \item{combine}{Collapses all samples into a single sample if true. Default is false}
12 | }
13 | \value{
14 | returns a R list with the following elements:
15 | Ref: Matrix with number of reads matching the reference allele.
16 | Alt: Matrix with number of reads matching the alternate allele.
17 | Err: Matrix with number of reads matching neither the ref. nor the alt. allele.
18 | anno: Object with the annotation for the SNPs.
19 | }
20 | \description{
21 | From a list of files (fileList), find the union of all loci and return a list with the data prepared to run QuASAR.
22 | }
23 | \author{
24 | Chris Harvey
25 | }
26 | 
27 | 


--------------------------------------------------------------------------------
/R/lrtEpsRhoBinom.R:
--------------------------------------------------------------------------------
 1 | #' @title critical value for likelihood ratio test with misspecified genotypes
 2 | #'
 3 | #' @description
 4 | #' returns the log likelihood ratio comparing
 5 | #' H0: the het is mis-specified or rho=0.5 & H1: the het has imbalance with estimated rho.
 6 | #' The H0 likelihood is the largest of three binomial likelihoods: homozygous
 7 | #' reference (g0), heterozygous with rho=0.5 (g1t0), and homozygous alternate (g2).
 8 | #'
 9 | #' @param ref reference read count
10 | #' @param alt alternate read count
11 | #' @param eps estimate of error
12 | #' @param rho rho estimate
13 | #' @return log likelihood ratio of H1 against the best H0 model (not multiplied by 2)
14 | #' @export
15 | lrtEpsRhoBinom <- function(ref,alt,eps,rho){
16 |   log.gt <- cbind(g0   = ref * log(1 - eps) + alt * log(eps),
17 |                   g1t0 = (ref + alt) * log(0.5),
18 |                   g1t1 = ref * log((1 - eps) * rho + eps * (1 - rho)) + alt * log((1 - eps) * (1 - rho) + eps * rho),
19 |                   g2   = ref * log(eps) + alt * log(1 - eps)
20 |                   )
21 |   ## best-fitting null model for every locus
22 |   best.null <- pmax(log.gt[,"g0"],log.gt[,"g1t0"],log.gt[,"g2"])
23 |   ## return the value visibly (the function used to end on an assignment,
24 |   ## which made the result invisible at the console)
25 |   log.gt[,"g1t1"] - best.null
26 | }
27 | 


--------------------------------------------------------------------------------
/man/aseInference.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{aseInference}
 3 | \alias{aseInference}
 4 | \title{aseInference}
 5 | \usage{
 6 | aseInference(gts, eps.vect, priors, ref.mat, alt.mat, min.cov, sample.names,
 7 |   annos)
 8 | }
 9 | \arguments{
10 | \item{gts}{posterior probabilities of genotypes from QuASAR}
11 | 
12 | \item{eps.vect}{QuASAR estimates of sequencing error for each sample}
13 | 
14 | \item{priors}{1K genomes minor allele frequencies as priors}
15 | 
16 | \item{ref.mat}{matrix of reference allele counts}
17 | 
18 | \item{alt.mat}{matrix of alternate allele counts}
19 | 
20 | \item{min.cov}{threshold for the minimum coverage across all samples}
21 | 
22 | \item{sample.names}{vector of sample names}
23 | 
24 | \item{annos}{annotations for all loci}
25 | }
26 | \value{
27 | inference.data list.
28 | }
29 | \description{
30 | using genotypes from QuASAR, conduct inference on allelic imbalance
31 | }
32 | 
33 | 


--------------------------------------------------------------------------------
/man/fitAseNullMulti.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2 (4.0.1): do not edit by hand
 2 | \name{fitAseNullMulti}
 3 | \alias{fitAseNullMulti}
 4 | \title{fit QuASAR for multiple samples}
 5 | \usage{
 6 | fitAseNullMulti(ref, alt, eps = rep(0.1, ncol(ref)), log.gmat, max.it = 100,
 7 |   tol = 1e-16, fixGprior = TRUE, verbose = TRUE)
 8 | }
 9 | \arguments{
10 | \item{ref}{matrix of reference read count}
11 | 
12 | \item{alt}{matrix alternate read count}
13 | 
14 | \item{eps}{vector of starting values}
15 | 
16 | \item{log.gmat}{log of genotype priors}
17 | 
18 | \item{max.it}{maximum number of iterations for algorithm}
19 | 
20 | \item{tol}{tolerance to assess convergence}
21 | 
22 | \item{fixGprior}{logical to fix genotype priors across all steps of the algorithm}
23 | 
24 | \item{verbose}{logical turn on reporting during algorithm iterations}
25 | }
26 | \value{
27 | list of genotypes, log genotypes, vector of error estimates, log-likelihood, and the sum of
28 | log likelihoods
29 | }
30 | \description{
31 | jointly genotype across samples and estimate sample error
32 | }
33 | 
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Wayne State University 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/R/PrepForGenotyping.R:
--------------------------------------------------------------------------------
 1 | ##' @title PrepForGenotyping
 2 | ##' @description
 3 | ##' Keeps the loci whose coverage summed over all samples exceeds
 4 | ##' min.coverage and builds the genotype prior matrix from the allele
 5 | ##' frequencies stored in the annotation.
 6 | ##' @param ase.dat data from UnionExtractFields.R (a list holding the reference counts, the alternate counts, and an \code{anno} table with an \code{af} column as 4th element)
 7 | ##' @param min.coverage minimum coverage across all combined samples for locus inclusion; a locus is kept when its total coverage is strictly greater than this value
 8 | ##' @param dampen.priors prevents underflow by moving the priors slightly away from 0 and 1
 9 | ##' @return a list with reference allele counts, alternate allele counts, a matrix of gt priors, and annotations
10 | ##' @export
11 | PrepForGenotyping <- function(ase.dat, min.coverage, dampen.priors=TRUE){
12 |   ref.all <- ase.dat[[1]]
13 |   alt.all <- ase.dat[[2]]
14 |   phi.all <- ase.dat[[4]]$af
15 | 
16 |   ## keep ~ loci whose read count summed over all samples is above the floor
17 |   keep <- rowSums(ref.all + alt.all) > min.coverage
18 | 
19 |   ## drop=FALSE keeps the counts as matrices even when there is a single
20 |   ## sample or a single surviving locus
21 |   ref.all.final <- ref.all[keep, , drop=FALSE]
22 |   alt.all.final <- alt.all[keep, , drop=FALSE]
23 |   phi.all.final <- phi.all[keep]
24 |   annotations.included.samples <- ase.dat$anno[keep, ]
25 | 
26 |   ## gmat ~ genotype priors (g0, g1, g2) derived from the allele frequency
27 |   gmat <- cbind(g0=(1-phi.all.final)^2, g1=2*phi.all.final*(1-phi.all.final), g2=phi.all.final^2)
28 | 
29 |   if(dampen.priors){
30 |     ## 0.0001 is added to each of the three priors, hence the 1.0003
31 |     gmat <- (gmat+0.0001)/1.0003
32 |   }
33 | 
34 |   list(ref=ref.all.final, alt=alt.all.final, gmat=gmat, annotations=annotations.included.samples)
35 | }
36 | 


--------------------------------------------------------------------------------
/R/UnionExtractFields.R:
--------------------------------------------------------------------------------
 1 | ##' @title UnionExtractFields
 2 | ##' @description
 3 | ##' From a list of files (fileList), find the union of all loci and return a list with the data prepared to run QuASAR.
 4 | ##' The work is delegated to shell pipelines that call zcat, grep, cut, uniq, gzip, sortBed and intersectBed.
 5 | ##' @param fileList List of files *.quasar.in.gz
 6 | ##' @param combine If TRUE, an extra element \code{all} holding the per-locus counts summed over all samples is added. Default is FALSE
 7 | ##' @return returns a R list with the following elements:
 8 | ##' ref: Matrix with number of reads matching the reference allele.
 9 | ##' alt: Matrix with number of reads matching the alternate allele.
10 | ##' err: Matrix with number of reads matching neither the ref. nor the alt. allele.
11 | ##' anno: data.frame with the annotation for the SNPs (columns chr, pos0, pos, rsID, af).
12 | ##' all: only when combine is TRUE; matrix with columns allRef, allAlt, allErr.
13 | ##' @author Chris Harvey
14 | ##' @export
15 | UnionExtractFields <- function(fileList, combine=FALSE){
16 |   ## Temporary file for the union of loci; removed again when the function
17 |   ## exits, whether normally or through an error.
18 |   tmpFile <- tempfile()
19 |   on.exit(unlink(tmpFile), add=TRUE)
20 |   ## Union of loci: annotation columns of all files, sorted and de-duplicated
21 |   system(paste("zcat ", paste(fileList, collapse=" "), " | grep -v -w '^chr\\|^NA' | cut -f 1-3,6-7 | sortBed -i stdin | uniq | gzip > ", tmpFile))
22 |   anno <- read.table(gzfile(tmpFile),sep="\t",as.is=TRUE)
23 |   ## One column of aux per file; its three rows hold the three count columns.
24 |   ## sapply is kept on purpose: the row indexing below relies on it
25 |   ## simplifying the per-file data frames into a matrix of lists.
26 |   aux <- sapply(fileList,function(fn){
27 |     cat("Processing:",fn,"\n")
28 |     command <- paste("intersectBed -a ",tmpFile," -b ",fn," -wao | cut -f 1-3,13-15 ",sep="")
29 |     aa <- read.table(pipe(command),sep="\t",as.is=TRUE,na.strings=".")
30 |     ## loci absent from this file are reported as "." and count as 0 reads
31 |     aa[is.na(aa)] <- 0
32 |     ## rows must line up with the union; anno still has its default column
33 |     ## names at this point, which is what makes identical() succeed
34 |     stopifnot(identical(aa[,1:3],anno[,1:3]))
35 |     aa[,-(1:3)]
36 |   })
37 |   ## must stay after the loop above (see the identical() check)
38 |   colnames(anno) <- c("chr","pos0","pos", "rsID", "af")
39 |   Ref <- as.matrix(do.call(cbind,aux[1,]))
40 |   Alt <- as.matrix(do.call(cbind,aux[2,]))
41 |   Err <- as.matrix(do.call(cbind,aux[3,]))
42 |   return.list <- list(ref=Ref,alt=Alt,err=Err,anno=anno)
43 |   if(combine==TRUE){
44 |     allRef <- apply(Ref, MARGIN=1, sum)
45 |     allAlt <- apply(Alt, MARGIN=1, sum)
46 |     allErr <- apply(Err, MARGIN=1, sum)
47 |     return.list$all <- as.matrix(cbind(allRef, allAlt, allErr))
48 |   }
49 |   return.list
50 | }
51 | 


--------------------------------------------------------------------------------
/scripts/exampleWorkflow.R:
--------------------------------------------------------------------------------
 1 | ##################################################################
 2 | ## QuASAR Example Workflow
 3 | ##################################################################
 4 | ## NOTE: this script does not attach the package itself; it assumes the
 5 | ##################################################################    
 6 | ## 1.) Download example pre-processed data
 7 | ##################################################################
 8 | ## QuASAR functions are already available, e.g. via library(QuASAR).
 9 | ## Each file is saved under its own name in the current working directory.
10 | urlData="http://genome.grid.wayne.edu/quasar/sampleinput/"
11 | fileNames <- paste0("t",c(2,4,6,12,18,24),"hr_Huvec_Rep1.quasar.in.gz")
12 | sapply(fileNames,function (ii) download.file(paste0(urlData,ii),ii))
13 | 
14 | 
15 | ##################################################################    
16 | ## 2.) Load data into an R object
17 | ##################################################################
18 | 
19 | ase.dat <- UnionExtractFields(fileNames, combine=TRUE)
20 | ase.dat.gt <- PrepForGenotyping(ase.dat, min.coverage=5)
21 | str(ase.dat.gt)
22 | sample.names <- colnames(ase.dat.gt$ref)
23 | 
24 | ################################################################## 
25 | ## 3.) QuASAR Model Fitting
26 | ## ase.joint ~ object for joint genotyping across all samples
27 | ##################################################################
28 | 
29 | ase.joint <- fitAseNullMulti(ase.dat.gt$ref, ase.dat.gt$alt, log.gmat=log(ase.dat.gt$gmat))
30 | str(ase.joint)
31 | 
32 | ## The ase.joint$gt object contains the posterior probability for each genotype: g0) is homozygote reference, g1) heterozygote, g2) homozygote alternate.
33 | head(ase.joint$gt)
34 | 
35 | ## The base-calling error parameters are stored in ase.joint$eps
36 | 
37 | ## Saving the output genotype probabilities (annotations without their 5th column, followed by ase.joint$gt)
38 | out_dat <- data.frame(ase.dat.gt$annotations[, -5], map=ase.joint$gt)
39 | write.table(out_dat, file='genotypes.txt', row.names=FALSE, col.names=FALSE, quote=FALSE,sep="\t")
40 | 
41 | ##################################################################
42 | ## 4.) ASE inference
43 | ##################################################################
44 | ourInferenceData <- aseInference(gts=ase.joint$gt, eps.vect=ase.joint$eps, priors=ase.dat.gt$gmat, ref.mat=ase.dat.gt$ref, alt.mat=ase.dat.gt$alt, min.cov=10, sample.names=sample.names, annos=ase.dat.gt$annotations)
45 | str(ourInferenceData)
46 | 


--------------------------------------------------------------------------------
/R/fitQuasarMpra.R:
--------------------------------------------------------------------------------
 1 | #' @title fitQuasarMpra
 2 | #'
 3 | #' @description
 4 | #' For an MPRA experiment assess allele specific expression.
 5 | #' Constructs are grouped into bins of total coverage, one dispersion value
 6 | #' per bin is chosen by grid search, and the allelic skew of every construct
 7 | #' is then fitted and tested against the expected DNA proportion.
 8 | #'
 9 | #' @param ref reference read count
10 | #' @param alt alternate read count
11 | #' @param prop reference DNA proportion; one value per construct, or a single
12 | #'   value that is recycled for all constructs (def.=0.5)
13 | #' @param eps base-calling error rate (def.=0.001)
14 | #' @param nbreaks number of breaks of coverage to estimate dispersion (def.=10)
15 | #'
16 | #' @return data.frame with the following fields:
17 | #' - bin total coverage bin used
18 | #' - betas.beta.binom  logit transformation of the allelic skew
19 | #' - betas_se standard error of beta
20 | #' - betas_z zscore for beta - qlogis(prop)
21 | #' - pval3 p.value of the likelihood ratio test (chi-square, 1 df)
22 | #' - padj_quasar BH adjusted p.value
23 | #'
24 | #' @export
25 | fitQuasarMpra <- function(ref,alt,prop=0.5,eps=0.001,nbreaks=10){
26 |   stopifnot(length(ref) == length(alt))
27 |   ## A single prop value (such as the default) has to be expanded to one
28 |   ## value per construct: it is subset per bin below, and indexing a
29 |   ## length-one vector with a longer mask would only produce NAs.
30 |   prop <- rep_len(prop, length(ref))
31 |   tot <- ref + alt
32 |   ## Coverage bins delimited by the quantiles of the total coverage. The
33 |   ## intervals are left-open and start at 0, so a construct without any
34 |   ## reads gets an NA bin.
35 |   cov_breaks <- unique(c(0,quantile(tot,(1:nbreaks)/nbreaks)))
36 |   bin <- cut(tot,cov_breaks)
37 |   ## M ~ grid of candidate dispersion values, from 1 to exp(10)
38 |   M <- exp((0:500)/50)
39 |   ## Mvec ~ for every bin, the dispersion on the grid which maximizes the
40 |   ##        beta-binomial log-likelihood at the null proportion
41 |   Mvec <- vapply(levels(bin),function(mybin){
42 |     in.bin <- which(bin==mybin)
43 |     llk <- vapply(M,function(disp){
44 |       sum(logLikBetaBinomialRhoEps(prop[in.bin],eps,disp,ref[in.bin],alt[in.bin]))
45 |     },numeric(1))
46 |     M[which.max(llk)]
47 |   },numeric(1))
48 |   ## Per construct: estimate logit(rho) with the dispersion of its bin; the
49 |   ## standard error is taken from the hessian at the optimum
50 |   aux2 <- t(vapply(seq_along(ref),function(ii){
51 |     auxLogis <- optim(0,
52 |                       fn=logLikBetaBinomial2,
53 |                       gr=gLogLikBetaBinomial,
54 |                       D=Mvec[bin[ii]],
55 |                       R=ref[ii],
56 |                       A=alt[ii],
57 |                       method="L-BFGS-B",
58 |                       hessian=TRUE,
59 |                       lower=-5,
60 |                       upper=5)
61 |     c(auxLogis$par,1/(auxLogis$hessian)^.5)
62 |   },numeric(2)))
63 |   betas.beta.binom <- aux2[,1]
64 |   betas_se <- aux2[,2]
65 |   rho3 <- plogis(betas.beta.binom)
66 |   ## Likelihood ratio: fitted proportion against the expected DNA proportion
67 |   lrt3 <-
68 |     logLikBetaBinomialRhoEps(rho3,eps,Mvec[bin],ref,alt) -
69 |     logLikBetaBinomialRhoEps(prop,eps,Mvec[bin],ref,alt)
70 |   ## upper tail computed directly so that very small p-values do not
71 |   ## round to exactly 0
72 |   pval3 <- pchisq(2*lrt3,df=1,lower.tail=FALSE)
73 |   betas_z <- (betas.beta.binom - qlogis(prop))/betas_se
74 |   padj_quasar <- p.adjust(pval3,method="BH")
75 |   data.frame(bin,betas.beta.binom,betas_se,betas_z,pval3,padj_quasar)
76 | }
77 | 


--------------------------------------------------------------------------------
/mpra/README.md:
--------------------------------------------------------------------------------
 1 | # QuASAR-MPRA: Accurate allele-specific analysis for massively parallel  reporter assays
 2 | We have further developed our method for allele specific analysis QuASAR (quantitative allele-specific analysis of reads) ([Harvey et al, 2015]) to analyze allele specific signals in barcoded read counts data from MPRAs (preprint available in [Kalita et al,2017]). Using this approach, we can take into account the uncertainty on the original plasmid proportions, over-dispersion, and sequencing errors. Here, we demonstrate how to use QuASAR-MPRA to analyze the MPRA data by [Tewhey et al,2016].
 3 | 
 4 | The current software is still in development, and we would appreciate any comments and bug reports. We assume that you have already installed the QuASAR library. 
 5 | 
 6 | The [mpra.R] script contains the instructions to run the test. As an example we provide the HepG2 data for the forward strand in [Tewhey et al,2016]. [preprocess.R](preprocess.R) contains the steps to prepare this input file.
 7 | 
 8 | ```R
 9 | library(QuASAR)
10 | 
11 | ## Loading the sample data:
12 | HepG2 <- read.table("http://genome.grid.wayne.edu/quasar/samplempra/HepG2.mpra.txt",sep='\t',as.is=T,header=T)
13 | 
14 | ## Fitting the QuASAR model:
15 | HepG2.res <- fitQuasarMpra(HepG2$R,HepG2$A,HepG2$DNA_prop)
16 | 
17 | ## Number of significant hits 10% FDR:
18 | sum(HepG2.res$padj_quasar<0.1)
19 | 
20 | ## QQ-plot: 
21 | library(qqman)
22 | qq(HepG2.res$pval3)
23 | 
24 | ```
25 | 
26 | To fit the model we use the `fitQuasarMpra` function that needs three input vectors: 
27 | - `ref` = number of RNA reads for the reference allele 
28 | - `alt` = number of RNA reads for the alternate allele 
29 | - `prop` = reference DNA proportion in the plasmid library
30 | 
31 | The returned data frame `HepG2.res` has the following fields:
32 | - `bin` total coverage bin used
33 | - `betas.beta.binom`  logit transformation of the RNA allelic skew
34 | - `betas_se` standard error for the beta parameter estimate
35 | - `betas_z` z-score for `betas.beta.binom` - qlogis(`prop`) 
36 | - `pval3` p.value 
37 | - `padj_quasar` BH adjusted p.value 
38 | 
39 | ```R
40 | > head(HepG2.res)
41 |                   bin betas.beta.binom  betas_se    betas_z       pval3 padj_quasar
42 | 1   (2.7e+03,3.3e+03]      -0.12164712 0.1409832 -0.2849424 0.774655868   1.0000000
43 | 2   (2.1e+03,2.7e+03]      -0.22659942 0.1630645 -1.0884486 0.273956397   1.0000000
44 | 3  (3.3e+03,3.96e+03]      -0.05672033 0.1282004 -0.6543586 0.513012947   1.0000000
45 | 4 (7.56e+03,2.22e+06]      -0.46219112 0.2338646 -0.1999347 0.838163133   1.0000000
46 | 5   (2.7e+03,3.3e+03]      -0.04063415 0.1403660  0.8889998 0.376112030   1.0000000
47 | 6             (0,859]      -1.17993132 0.5370852 -2.8797701 0.001298818   0.1951728
48 | ```
49 | 
50 | <!-- links -->
51 | [Kalita et al,2017]:http://biorxiv.org/content/early/2017/02/03/105627
52 | [Harvey et al, 2015]:http://bioinformatics.oxfordjournals.org/content/31/8/1235
53 | [mpra.R]:mpra.R
54 | [preprocessing.R]:preprocess.R
55 | [Tewhey et al,2016]:https://www.ncbi.nlm.nih.gov/pubmed/27259153
56 | 
57 | 


--------------------------------------------------------------------------------
/R/fitAseNull.R:
--------------------------------------------------------------------------------
 1 | #' @title fit QuASAR for a single sample
 2 | #'
 3 | #' @description
 4 | #' for a single sample conduct genotyping and estimate sample error with an EM algorithm
 5 | #'
 6 | #' @param ref vector of reference read counts, one entry per locus
 7 | #' @param alt vector of alternate read counts, same length as ref
 8 | #' @param eps starting value for the sequencing error rate
 9 | #' @param log.gmat log of genotype priors: matrix with columns "g0", "g1", "g2" and one
10 | #' row per locus or a single shared row; defaults to log(c(0.25,0.5,0.25)) for all loci
11 | #' @param max.it maximum number of iterations for algorithm
12 | #' @param tol tolerance to assess convergence
13 | #' @param fixGprior logical to fix genotype priors across all steps of the algorithm
14 | #' @param verbose logical turn on reporting during algorithm iterations
15 | #' @return invisible list of genotype posteriors (gt), log genotype posteriors (log.gt),
16 | #' error estimate (eps), per-locus log-likelihood (loglik), and the sum of
17 | #' log likelihoods (logliksum)
18 | #' @export
19 | fitAseNull <- function(ref,alt,eps=0.1,log.gmat,max.it=100,tol=1E-16,fixGprior=TRUE,verbose=TRUE){
20 |   L <- length(ref);
21 |   stopifnot(L == length(alt));
22 |   if(missing(log.gmat)){
23 |     log.gmat <- matrix(log(c(0.25,0.5,0.25)),1,3)
24 |     colnames(log.gmat) <- c("g0","g1","g2");
25 |   }
26 |   ## With fixGprior=FALSE the priors are overwritten locus by locus, so a
27 |   ## shared one-row prior is first expanded to one row per locus.
28 |   if(!fixGprior && nrow(log.gmat) == 1 && L > 1)
29 |     log.gmat <- log.gmat[rep(1,L),,drop=FALSE]
30 |   ## Row-wise normalization on the log scale. Plain matrix arithmetic (the
31 |   ## length-L vector is recycled down the columns) keeps the L x 3 shape
32 |   ## even for a single locus, where apply() would return a plain vector.
33 |   normalizeRows <- function(lg){
34 |     lg.max <- pmax(lg[,"g0"],lg[,"g1"],lg[,"g2"])
35 |     lg <- lg - lg.max
36 |     lse <- log(exp(lg) %*% rep(1,3))
37 |     post <- lg - as.vector(lse)
38 |     rownames(post) <- NULL
39 |     list(loglik=lse+lg.max,log.gt=post)
40 |   }
41 |   it.num <- 0
42 |   logit.eps <- qlogis(eps);  ## start from the caller supplied value
43 |   converged <- FALSE;
44 |   logliksum <- 0;
45 |   while(it.num <= max.it){
46 |     ############ E-step #############
47 |     log.gt <- cbind(g0 = ref * log(1 - eps) + alt * log(eps) + log.gmat[,"g0"],
48 |                     g1 = (ref + alt) * log(0.5) + log.gmat[,"g1"],
49 |                     g2 = ref * log(eps) + alt * log(1 - eps) + log.gmat[,"g2"])
50 |     norm <- normalizeRows(log.gt)
51 |     loglik <- norm$loglik   ## per-locus marginal log-likelihood
52 |     log.gt <- norm$log.gt   ## log posteriors; rows of exp(log.gt) add to 1
53 |     gt <- exp(log.gt);
54 |     new.logliksum <- sum(loglik);
55 |     if(abs(new.logliksum-logliksum)<tol)
56 |       converged <- TRUE;
57 |     if((it.num == max.it) || converged)
58 |       break;
59 |     ###### M-STEP  #####
60 |     ## logit(eps): posterior weighted mismatching vs matching reads at homozygotes
61 |     converged <- TRUE;
62 |     new.logit.eps <- log(sum( gt[,"g2"] * ref + gt[,"g0"] * alt)) - log(sum( gt[,"g0"] * ref + gt[,"g2"] * alt))
63 |     ## a move in either direction means the estimate has not settled yet
64 |     if(abs(logit.eps-new.logit.eps)>tol)
65 |       converged <- FALSE;
66 |     logit.eps <- new.logit.eps;
67 |     eps <- plogis(logit.eps)
68 |     if(!fixGprior){
69 |       log.gmat[,"g0"] <- log.gt[,"g0"]
70 |       log.gmat[,"g1"] <- log.gt[,"g1"]
71 |       log.gmat[,"g2"] <- log.gt[,"g2"]
72 |     }
73 |     it.num <- it.num +1;
74 |     if(verbose)
75 |       cat("#it:",it.num,"eps=",eps,"Post:",colMeans(gt),"loglik",logliksum,"DeltaLogLik",abs(new.logliksum-logliksum),"\n");
76 |     stopifnot(!is.na(eps))
77 |     logliksum <- new.logliksum;
78 |   }
79 |   log.gt <- normalizeRows(log.gt)$log.gt
80 |   gt <- exp(log.gt);
81 |   invisible(list(gt=gt,log.gt=log.gt,eps=eps,loglik=loglik,logliksum=logliksum))
82 | }
83 | 


--------------------------------------------------------------------------------
/scripts/convertPileupToQuasar.R:
--------------------------------------------------------------------------------
 1 | ###################################################################
 2 | ## Generate read counts covering reference and alternate alleles ##
 3 | ## from a bed-formatted pileup file                              ##
 4 | ###################################################################
 5 | ## Usage:
 6 | ##   R --vanilla --args <pileup.bed.gz> [mincov] [maxcov] < convertPileupToQuasar.R
 7 | 
 8 | # Get the numerical base call quality scores from the reported ascii values
 9 | # (character code minus 33, one score per character of the string)
10 | baseQual <- function(char) { strtoi(charToRaw(char),16L)-33 };
11 | 
12 | ## Default values
13 | mincov <- 4				# Minimum coverage
14 | maxcov <- 200000 	# Max coverage
15 | 
16 | ## Get inputs from command line
17 | cargs <- commandArgs(trailingOnly=TRUE);
18 | if(length(cargs) < 1)
19 | 	stop("usage: R --vanilla --args <pileup.bed.gz> [mincov] [maxcov] < convertPileupToQuasar.R", call.=FALSE)
20 | pileupFile <- cargs[1];
21 | if(length(cargs) >= 2)
22 | 	mincov <- cargs[2];
23 | if(length(cargs) >= 3)
24 | 	maxcov <- cargs[3];
25 | ## The thresholds are pasted into an awk program below: accept numbers only
26 | mincov <- as.numeric(mincov)
27 | maxcov <- as.numeric(maxcov)
28 | stopifnot(!is.na(mincov), !is.na(maxcov))
29 | 
30 | # Get the input data; awk keeps the records whose read count (column 5)
31 | # lies between mincov and maxcov. The file name is shell-quoted.
32 | command <- paste("less ", shQuote(pileupFile),
33 | 		"| awk ' $5 >=", format(mincov, scientific=FALSE),
34 | 		" && $5 <=", format(maxcov, scientific=FALSE), "'")
35 | pileup <- read.table(file=pipe(command), header=FALSE, quote="", comment.char="",
36 | 	as.is=TRUE, sep="\t") 
37 | names(pileup) <- c("chr", "pos-1", "pos", "ref", "num.reads", "read.alleles",
38 | 	"read.quality", "rsID", "TKG.Ref", "alt", "af")
39 | 
40 | # See if the ref alleles match, then discard unnecessary columns
41 | indMatch <- (toupper(pileup$ref) == pileup$TKG.Ref)
42 | pileup <- pileup[indMatch, c("chr", "pos-1", "pos", "ref", "alt", "rsID",
43 | 	"num.reads", "read.alleles", "read.quality", "af")]
44 | stopifnot(mean(indMatch)>0.8) ## Stop if too many errors
45 | rm(indMatch)
46 | 
47 | # Duplicates can arise for (at least) 3 reasons: tri+ allelic SNPs,
48 | # indels (should already be filtered), and incongruencies between 
49 | # genome assembly reference allele and 1KG reference allele
50 | d1 <- duplicated(paste(pileup$chr, pileup$pos, sep=":"))
51 | d2 <- duplicated(paste(pileup$chr, pileup$pos, sep=":"), fromLast=TRUE)
52 | pileup <- pileup[!(d1 | d2), ]
53 | rm(d1,d2)
54 | 
55 | ## Filter the alleles to remove those at the beginning 
56 | ## and end of a mapped read, then remove records with no reads
57 | pileup$read.alleles.filt <- mapply(gsub, '[a-zA-Z., ]\\$', '$', 
58 | 																		pileup$read.alleles)
59 | pileup$read.alleles.filt <- mapply(gsub, '\\^[[:punct:][:alnum:]][a-zA-Z., ]',
60 | 																		'^', pileup$read.alleles.filt)
61 | pileup <- pileup[nchar(pileup$read.alleles.filt)>0, ]
62 | 
63 | ## Examine the base quality scores. Output summaries, but 
64 | ## don't filter anything (the table keeps its "qual" label in the log)
65 | baseQuals <- unlist(lapply(pileup$read.quality, baseQual))
66 | qtr <- quantile(baseQuals, seq(0, 1, 0.1))
67 | qual.table <- table(qual=baseQuals);
68 | qtr
69 | qual.table
70 | 
71 | ## Clean up the read alleles and count the matches
72 | pileup$read.alleles.clean <- mapply(gsub, '[\\.\\, ]', pileup$ref, 
73 | 																		pileup$read.alleles.filt)
74 | pileup$read.alleles.clean <- toupper(pileup$read.alleles.clean)
75 | pileup$ref <- toupper(pileup$ref)
76 | pileup$ref.matches <- as.integer(nchar(mapply(gsub, paste('[^', 
77 | 	as.character(pileup$ref), ']', sep=""), '', pileup$read.alleles.clean)))
78 | pileup$alt.matches <- as.integer(nchar(mapply(gsub, paste('[^', 
79 | 	as.character(pileup$alt), ']', sep=""), '', pileup$read.alleles.clean)))
80 | 
81 | ## Log the number of reads not matching either ref or alt allele
82 | pileup$errors <- as.integer(nchar(mapply(gsub, paste('[^ACGT]', sep=""), 
83 | 	'', pileup$read.alleles.clean))) - (pileup$alt.matches + pileup$ref.matches)
84 | 
85 | ## Reorder by position so chr names appear right
86 | pileup <- pileup[order(pileup$chr, pileup$pos), c("chr", "pos-1", "pos", 
87 | 	"ref", "alt", "rsID", "af", "ref.matches", "alt.matches", "errors")];
88 | 
89 | ## Output the clean pileup file
90 | oName <- gsub(".*/", "", gsub(".pileup.bed.gz", "", pileupFile));
91 | outFile <- paste(oName, ".quasar.in.gz", sep="");
92 | write.table(pileup, gzfile(outFile), quote=FALSE, col.names=FALSE, 
93 | 	row.names=FALSE, sep="\t")
94 | 


--------------------------------------------------------------------------------
/R/fitAseNullMulti.R:
--------------------------------------------------------------------------------
  1 | #' @title fit QuASAR for multiple samples
  2 | #'
  3 | #' @description
  4 | #' joint genotype across samples and estimate sample error
  5 | #'
  6 | #' @param ref matrix of reference read count (loci in rows, samples in columns)
  7 | #' @param alt matrix alternate read count, same dimensions as ref
  8 | #' @param eps vector of starting values, one error rate per sample
  9 | #' @param log.gmat log of genotype priors: matrix with columns "g0", "g1", "g2" and one
 10 | #' row per locus or a single shared row; defaults to log(c(0.25,0.5,0.25)) for all loci
 11 | #' @param max.it maximum number of iterations for algorithm
 12 | #' @param tol tolerance to assess convergence
 13 | #' @param fixGprior logical to fix genotype priors across all steps of the algorithm
 14 | #' @param verbose logical turn on reporting during algorithm iterations
 15 | #' @return invisible list of genotype posteriors (gt), log genotype posteriors (log.gt),
 16 | #' vector of error estimates (eps), per-locus log-likelihood (loglik), and the sum of
 17 | #' log likelihoods (logliksum)
 18 | #' @export
 19 | fitAseNullMulti <- function(ref,alt,eps=rep(0.1,ncol(ref)),log.gmat,max.it=100,tol=1E-16,fixGprior=TRUE,verbose=TRUE){
 20 | 	L <- nrow(ref);
 21 | 	S <- ncol(ref);
 22 | 	## ref and alt must cover the same loci and samples, with one starting
 23 | 	## error rate per sample, for the matrix products below to conform.
 24 | 	stopifnot(L == nrow(alt), S == ncol(alt), S == length(eps));
 25 | 
 26 | 	## Parameter initialization
 27 | 	if(missing(log.gmat)){
 28 | 		log.gmat <- matrix(log(c(0.25,0.5,0.25)),1,3)
 29 | 		colnames(log.gmat) <- c("g0","g1","g2");
 30 | 	}
 31 | 	## With fixGprior=FALSE the priors are overwritten locus by locus, so a
 32 | 	## shared one-row prior is first expanded to one row per locus.
 33 | 	if(!fixGprior && nrow(log.gmat) == 1 && L > 1)
 34 | 		log.gmat <- log.gmat[rep(1,L),,drop=FALSE]
 35 | 
 36 | 	it.num <- 0
 37 | 	logit.eps <- qlogis(eps);
 38 | 	converged <- FALSE;
 39 | 	logliksum <- 0;
 40 | 	## The heterozygote term does not involve eps, so it is computed once.
 41 | 	rs <- rowSums((ref + alt)) * log(0.5);
 42 | 
 43 | 	## Unnormalized log posterior of the three genotypes for given error rates.
 44 | 	## log.gmat is looked up at call time, so updated priors are picked up.
 45 | 	logPosterior <- function(eps){
 46 | 		lg <- cbind(ref %*% log(1 - eps) + alt %*% log(eps) + log.gmat[,"g0"],
 47 | 				rs  + log.gmat[,"g1"],
 48 | 				ref %*% log(eps) + alt %*% log(1 - eps) + log.gmat[,"g2"]
 49 | 		)
 50 | 		colnames(lg) <- c("g0","g1","g2");
 51 | 		lg
 52 | 	}
 53 | 
 54 | 	## Row-wise normalization on the log scale. Plain matrix arithmetic (the
 55 | 	## length-L vector is recycled down the columns) keeps the L x 3 shape
 56 | 	## even for a single locus, where apply() would return a plain vector.
 57 | 	normalizeRows <- function(lg){
 58 | 		lg.max <- pmax(lg[,"g0"],lg[,"g1"],lg[,"g2"])
 59 | 		lg <- lg - lg.max
 60 | 		lse <- log(exp(lg) %*% rep(1,3))
 61 | 		post <- lg - as.vector(lse)
 62 | 		rownames(post) <- NULL
 63 | 		list(loglik=lse+lg.max,log.gt=post)
 64 | 	}
 65 | 
 66 | 	while(it.num <= max.it){
 67 | 		############ E-step #############
 68 | 		norm <- normalizeRows(logPosterior(eps))
 69 | 		loglik <- norm$loglik   ## per-locus marginal log-likelihood
 70 | 		log.gt <- norm$log.gt   ## log posteriors; rows of exp(log.gt) add to 1
 71 | 		gt <- exp(log.gt);
 72 | 		new.logliksum <- sum(loglik);
 73 | 		if(abs(new.logliksum-logliksum)<tol)
 74 | 			converged <- TRUE;
 75 | 		if((it.num == max.it) || converged)
 76 | 			break;
 77 | 		###### M-STEP  #####
 78 | 		## logit(eps) per sample: posterior weighted mismatching vs matching reads at homozygotes
 79 | 		converged <- TRUE;
 80 | 		num <- t(gt[,"g2"]) %*% ref + t(gt[,"g0"]) %*% alt + 0.01;  ## 0.01 added pseudo-counts to avoid 0,0 reads
 81 | 		den <- t(gt[,"g0"]) %*% ref + t(gt[,"g2"]) %*% alt + 0.01;
 82 | 		new.logit.eps <- as.vector(log(num) - log(den));
 83 | 		if(max(abs(logit.eps-new.logit.eps))>tol)
 84 | 			converged <- FALSE;
 85 | 		logit.eps <- new.logit.eps;
 86 | 		eps <- plogis(logit.eps)
 87 | 		if(!fixGprior){
 88 | 			log.gmat[,"g0"] <- log.gt[,"g0"]
 89 | 			log.gmat[,"g1"] <- log.gt[,"g1"]
 90 | 			log.gmat[,"g2"] <- log.gt[,"g2"]
 91 | 		}
 92 | 		it.num <- it.num +1;
 93 | 		if(verbose){
 94 | 			cat("#it:",it.num,"eps=",eps,"Post:",colMeans(gt),"loglik",logliksum,"DeltaLogLik",abs(new.logliksum-logliksum),"\n");
 95 | 		}
 96 | 		stopifnot(!is.na(eps))
 97 | 		logliksum <- new.logliksum;
 98 | 	}
 99 | 	## Posteriors for the final error rates
100 | 	log.gt <- normalizeRows(logPosterior(eps))$log.gt
101 | 	gt <- exp(log.gt);
102 | 	invisible(list(gt=gt,log.gt=log.gt,eps=eps,loglik=loglik,logliksum=logliksum))
103 | }
104 | 


--------------------------------------------------------------------------------
/mpra/preprocess.R:
--------------------------------------------------------------------------------
  1 | # This script does the preprocessing for the HepG2 cell-line data from
  2 | # Tewhey et al. The other cell-lines can be processed in a similar manner.
  3 | 
  4 | library(data.table)
  5 | 
  6 | ## Input table: ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75661/suppl/GSE75661_79k_collapsed_counts.txt.gz
  7 | 
  8 | MPRA_counts <- fread("curl 'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75661/suppl/GSE75661_79k_collapsed_counts.txt.gz' | zcat")
  9 | 
 10 | ## Preprocessing DNA: keep columns 1-6 and sum the five Plasmid_r* replicate counts into DNA
 11 | mpra <- MPRA_counts[,c(1:6)]
 12 | mpra <- transform(mpra, DNA=Plasmid_r1+Plasmid_r2+Plasmid_r3+Plasmid_r4+Plasmid_r5)
 13 | mpra$Oligo <- as.character(mpra$Oligo)
 14 | alt <- mpra[grep('alt', mpra$Oligo),]  # oligos whose name contains 'alt'
 15 | RC <- mpra[ grep('RC', mpra$Oligo), ]  # oligos whose name contains 'RC'
 16 | 
 17 | mpra_F <- subset(mpra, !(mpra$Oligo %in% RC$Oligo))  # all remaining (non-RC) oligos
 18 | x <- strsplit(mpra_F$Oligo, "_")  # oligo name fields: 1 -> rsID, 2 -> Allele
 19 | mpra_F$rsID <- sapply(x, function(y) { y[1] })
 20 | mpra_F$Allele <- sapply(x, function(y) { y[2] })
 21 | R <- mpra_F[grep('A',mpra_F$Allele),]  # rows whose Allele contains 'A'; labelled "R" below, all others "A"
 22 | mpra_F <- transform(mpra_F, Allele_class=ifelse(Oligo %in% R$Oligo, "R", "A"), alt_hap=ifelse(Oligo %in% alt$Oligo, "1", "0"))
 23 | 
 24 | x <- strsplit(RC$Oligo, "_")
 25 | RC$rsID <- sapply(x, function(y) { y[1] })
 26 | #mpra$Allele_class <- sapply(x, function(y) { y[2] })
 27 | RC$Allele <- sapply(x, function(y) { y[3] })  # for RC oligos the allele is the third name field
 28 | R <- RC[grep('A',RC$Allele),]
 29 | RC <- transform(RC, Allele_class=ifelse(Oligo %in% R$Oligo, "R", "A"),alt_hap=ifelse(Oligo %in% alt$Oligo, "1", "0"))
 30 | ## Split by alt_hap ("1" when the oligo name contains 'alt', "0" otherwise)
 31 | mpra_ref <- subset(mpra_F, alt_hap=="0")
 32 | RC_ref <- subset(RC, alt_hap=="0")
 33 | mpra_alt <- subset(mpra_F, alt_hap=="1")
 34 | RC_alt <- subset(RC, alt_hap=="1")
 35 | allele_count <- dcast(mpra_ref, rsID ~ Allele_class, value.var="DNA", sum)  # summed DNA counts per rsID and allele class
 36 | allele_count_5 <- subset(allele_count,  R>= 5 & A>=5)  # both classes need at least 5 counts
 37 | y_100 <- subset(allele_count_5, R+A>=100)  # and at least 100 counts in total
 38 | y_100 <- transform(y_100, DNA_prop=R/(A+R), logit_prop=log2(R/A))  # DNA_prop: share of the DNA counts in class R
 39 | mpra_ref_Filt <- merge(mpra_ref, y_100, by="rsID")
 40 | 
 41 | allele_count <- dcast(mpra_alt, rsID ~ Allele_class, value.var="DNA", sum)
 42 | allele_count_5 <- subset(allele_count,  R>= 5 & A>=5)
 43 | y_100 <- subset(allele_count_5, R+A>=100)
 44 | y_100 <- transform(y_100, DNA_prop=R/(A+R), logit_prop=log2(R/A))
 45 | mpra_alt_Filt <- merge(mpra_alt, y_100, by="rsID")
 46 | ## DNA summaries of the non-RC oligos (alt_hap "0" and "1" sets stacked)
 47 | dna_mpra_Filt <- rbind(mpra_ref_Filt, mpra_alt_Filt)
 48 | 
 49 | allele_count <- dcast(RC_ref, rsID ~ Allele_class, value.var="DNA", sum)
 50 | allele_count_5 <- subset(allele_count,  R>= 5 & A>=5)
 51 | y_100 <- subset(allele_count_5, R+A>=100)
 52 | y_100 <- transform(y_100, DNA_prop=R/(A+R), logit_prop=log2(R/A))
 53 | RC_ref_Filt <- merge(RC_ref, y_100, by="rsID")
 54 | 
 55 | allele_count <- dcast(RC_alt, rsID ~ Allele_class, value.var="DNA", sum)
 56 | allele_count_5 <- subset(allele_count,  R>= 5 & A>=5)
 57 | y_100 <- subset(allele_count_5, R+A>=100)
 58 | y_100 <- transform(y_100, DNA_prop=R/(A+R), logit_prop=log2(R/A))
 59 | RC_alt_Filt <- merge(RC_alt, y_100, by="rsID")
 60 | ## DNA summaries of the RC oligos (alt_hap "0" and "1" sets stacked)
 61 | dna_RC_Filt <- rbind(RC_ref_Filt, RC_alt_Filt)
 62 | 
 63 | #### Preprocessing RNA: column 1 plus columns 15-19, summing the five HepG2_r* replicate counts into RNA
 64 | 
 65 | 
 66 | 
 67 | mpra <- MPRA_counts[,c(1,15:19)]
 68 | mpra <- transform(mpra, RNA=HepG2_r1+HepG2_r2+HepG2_r3+HepG2_r4+HepG2_r5)
 69 | ##mpra$Oligo <- as.character(mpra$Oligo)
 70 | alt <- mpra[grep('alt', mpra$Oligo),]
 71 | RC <- mpra[ grep('RC', mpra$Oligo), ]
 72 | names(dna_mpra_Filt)[12] <- "DNA_A"  # NOTE(review): columns 12/13 are presumably the A and R sums added by the merge - confirm
 73 | names(dna_mpra_Filt)[13] <- "DNA_R"
 74 | names(dna_RC_Filt)[12] <- "DNA_A"
 75 | names(dna_RC_Filt)[13] <- "DNA_R"
 76 | dna_RC_Filt <- dna_RC_Filt[,-c(3:7)]  # NOTE(review): columns 3-7 are presumably the Plasmid_r* replicates - confirm
 77 | dna_mpra_Filt <- dna_mpra_Filt[,-c(3:7)]
 78 | RC <- merge(RC,dna_RC_Filt, by="Oligo")  # attach the DNA summaries to the RNA counts
 79 | mpra_F <- subset(mpra, !(mpra$Oligo %in% RC$Oligo))
 80 | mpra_F <- merge(mpra_F,dna_mpra_Filt, by="Oligo")
 81 | mpra_F <- subset(mpra_F, !(mpra_F$Oligo %in% RC$Oligo))
 82 | x <- strsplit(mpra_F$Oligo, "_")
 83 | mpra_F$rsID <- sapply(x, function(y) { y[1] })
 84 | mpra_F$Allele <- sapply(x, function(y) { y[2] })
 85 | R <- mpra_F[grep('A',mpra_F$Allele),]
 86 | mpra_F <- transform(mpra_F, Allele_class=ifelse(Oligo %in% R$Oligo, "R", "A"), alt_hap=ifelse(Oligo %in% alt$Oligo, "1", "0"))
 87 | 
 88 | x <- strsplit(RC$Oligo, "_")
 89 | RC$rsID <- sapply(x, function(y) { y[1] })
 90 | #mpra$Allele_class <- sapply(x, function(y) { y[2] })
 91 | RC$Allele <- sapply(x, function(y) { y[3] })
 92 | R <- RC[grep('A',RC$Allele),]
 93 | RC <- transform(RC, Allele_class=ifelse(Oligo %in% R$Oligo, "R", "A"),alt_hap=ifelse(Oligo %in% alt$Oligo, "1", "0"))
 94 | 
 95 | mpra_ref <- subset(mpra_F, alt_hap=="0")
 96 | RC_ref <- subset(RC, alt_hap=="0")
 97 | mpra_alt <- subset(mpra_F, alt_hap=="1")
 98 | RC_alt <- subset(RC, alt_hap=="1")
 99 | allele_count <- dcast(mpra_ref, rsID ~ Allele_class, value.var="RNA", sum)  # summed RNA counts per rsID and allele class
100 | allele_count_5 <- subset(allele_count,  R>= 5 & A>=5)  # both classes need at least 5 RNA counts
101 | mpra_ref_Filt <- merge(mpra_ref, allele_count_5, by="rsID")
102 | 
103 | allele_count <- dcast(mpra_alt, rsID ~ Allele_class, value.var="RNA", sum)
104 | allele_count_5 <- subset(allele_count,  R>= 5 & A>=5)
105 | mpra_alt_Filt <- merge(mpra_alt, allele_count_5, by="rsID")
106 | mpra_Filt <- rbind(mpra_ref_Filt, mpra_alt_Filt)
107 | 
108 | allele_count <- dcast(RC_ref, rsID ~ Allele_class, value.var="RNA", sum)
109 | allele_count_5 <- subset(allele_count,  R>= 5 & A>=5)
110 | RC_ref_Filt <- merge(RC_ref, allele_count_5, by="rsID")
111 | 
112 | allele_count <- dcast(RC_alt, rsID ~ Allele_class, value.var="RNA", sum)
113 | allele_count_5 <- subset(allele_count,  R>= 5 & A>=5)
114 | RC_alt_Filt <- merge(RC_alt, allele_count_5, by="rsID")
115 | RC_Filt <- rbind(RC_ref_Filt, RC_alt_Filt)
116 | 
117 | ## Last step for the object needed to run QuASAR: drop columns 2-11 and keep the distinct rows
118 | HepG2 <- unique(mpra_Filt[,-c(2:11)])
119 | 
120 | HepG2_RC <- unique(RC_Filt[,-c(2:11)])
121 | 
122 | ##write.table(HepG2,file=gzfile("mpra/HepG2.mpra.txt.gz"),row.names=F,quote=F,sep="\t")
123 | 
124 | 


--------------------------------------------------------------------------------
/R/aseInference.R:
--------------------------------------------------------------------------------
  1 | #' @title aseInference
  2 | #'
  3 | #' @description
  4 | #' using genotypes from QuASAR, conduct inference on allelic imbalance
  5 | #'
  6 | #' @param gts posterior probabilities of genotypes from QuASAR; column 2 is
  7 | #' read as the heterozygote probability
  8 | #' @param eps.vect QuASAR estimates of sequencing error for each sample
  9 | #' @param priors 1K genomes minor allele frequencies as priors (kept in the
 10 | #' signature for compatibility; not used in the calculations)
 11 | #' @param ref.mat matrix of reference allele counts, one column per sample
 12 | #' @param alt.mat matrix of alternate allele counts, one column per sample
 13 | #' @param min.cov coverage threshold: within each sample only loci with
 14 | #' ref + alt strictly greater than min.cov are analyzed
 15 | #' @param sample.names vector of sample names
 16 | #' @param annos annotations for all loci; columns rsID, chr and pos0 are reported
 17 | #' @return inference.data list named by sample.names. Each element is a list
 18 | #' with dat (data frame with annotation, beta, standard error and p-value of
 19 | #' every heterozygous locus), n.hets (number of heterozygotes) and dispersion
 20 | #' (beta-binomial dispersion estimate).
 21 | #'
 22 | #' @export
 23 | 
 24 | aseInference <- function(gts, eps.vect, priors, ref.mat, alt.mat, min.cov, sample.names, annos){
 25 |   ##################################################################
 26 |   ## inference, one sample (column of ref.mat / alt.mat) at a time
 27 |   ##################################################################
 28 |   n.eps <- length(eps.vect)
 29 |   inference.data <- lapply(seq_len(n.eps), function(sample){
 30 | 
 31 |     ## loci of this sample that pass the coverage filter
 32 |     this.sample <- sample.names[sample]
 33 |     coverage <- (ref.mat[, sample] + alt.mat[, sample])
 34 |     coverage.ind <- (coverage>min.cov)
 35 |     ref <- ref.mat[coverage.ind, sample]
 36 |     alt <- alt.mat[coverage.ind, sample]
 37 |     eps <- eps.vect[sample]
 38 | 
 39 |     ## confident heterozygotes among those loci
 40 |     het.ind <- (gts[coverage.ind, 2] > 0.99)
 41 |     numb.hets <- sum(het.ind)
 42 |     ref.het <- ref[het.ind]
 43 |     alt.het <- alt[het.ind]
 44 |     annotations <- annos[coverage.ind, ][het.ind, ]
 45 | 
 46 |     cat("============================================================\n", sep="")
 47 |     cat("==========Processing Sample: ", this.sample, "==========\n", sep="")
 48 |     cat("==========", numb.hets, " heterozygotes with [P(het)>.99]==========","\n", sep="")
 49 | 
 50 |     ##################################################################
 51 |     ## rho ~ calculate rho using eps
 52 |     rho <- comp.rho(ref,alt,eps)
 53 | 
 54 |     ##################################################################
 55 |     ## D ~ a grid of possible dispersion values for the Beta-binomial model
 56 |     ## aux ~ loglikelihood of the beta binomial model across D values
 57 |     ## with null \rho value
 58 |     ## Dmax ~ dispersion which maximizes the llk
 59 |     D <- exp((0:500)/50)
 60 |     aux <- vapply(D,function(d){
 61 |       sum(logLikBetaBinomialRhoEps(0.5,eps,d,ref.het,alt.het))
 62 |     }, numeric(1))
 63 |     Dmax <- D[which.max(aux)]
 64 | 
 65 |     cat("==========Dispersion estimate: ", round(Dmax, 3), "==========\n", sep="")
 66 | 
 67 |     ##################################################################
 68 |     ## Find MLE for \rho using the Beta-Binomial model
 69 |     ## aux2 ~ optimization of the Beta-binomial model in terms of
 70 |     ##        logit(\rho), one heterozygote at a time; column 1 holds the
 71 |     ##        estimate and column 2 is 1/sqrt(Hessian). seq_len() and
 72 |     ##        vapply() yield a 0 x 2 matrix when there is no heterozygote,
 73 |     ##        where 1:0 would index loci that do not exist
 74 |     ## betas.beta.binom ~ logit(\rho) or beta value for heterozygotes
 75 |     ## rho3 ~ vector of rho estimates from expit(logit(\rho))
 76 |     aux2 <- t(vapply(seq_len(numb.hets),function(jj){
 77 |                          auxLogis <- optim(0,fn=logLikBetaBinomial2,
 78 |                                             gr=gLogLikBetaBinomial,
 79 |                                             D=Dmax,
 80 |                                             R=ref.het[jj],
 81 |                                             A=alt.het[jj],
 82 |                                             method="L-BFGS-B", hessian=TRUE)
 83 |                          c(auxLogis$par,1/(auxLogis$hessian)^.5)
 84 |                        }, numeric(2)))
 85 |     betas.beta.binom <- aux2[,1]
 86 |     rho3 <- plogis(betas.beta.binom)
 87 | 
 88 |     ## lrt3 ~ Het LRT using the beta-binomial against the null \rho = 0.5
 89 |     ## pval3 ~ pval of the llk ratio test using the chi-square approximation;
 90 |     ##         the upper tail is requested directly because 1-pchisq()
 91 |     ##         rounds very small p-values to exactly 0
 92 |     lrt3 <- logLikBetaBinomialRhoEps(rho3,eps,Dmax,ref.het,alt.het) -
 93 |       logLikBetaBinomialRhoEps(0.5,eps,Dmax,ref.het,alt.het)
 94 |     pval3 <- pchisq(2*lrt3,df=1,lower.tail=FALSE)
 95 | 
 96 |     ## betas.se ~ standard error of the beta value implied by pval3; where
 97 |     ##            that is NaN the Hessian based value is used instead
 98 |     betas.se <- abs(betas.beta.binom/qnorm(pval3/2))
 99 |     se.nan <- which(is.nan(betas.se))
100 |     betas.se[se.nan] <- aux2[se.nan,2]
101 | 
102 |     ##################################################################
103 |     ## rho2 ~ reassign rho calculated with a simple estimate from epsilon
104 |     ## rho2[het.ind] ~ heterozygotes are re-assigned rho values
105 |     ##         calculated from the optimization above
106 |     ## aux ~ choose the null model with largest probability
107 |     ## lrt2 ~ llkRT with Beta-binomial and the possibility that
108 |     ##        the heterozygote is of a different genotype
109 |     ## pval2 ~ pval of the llk ratio test using the chi-square approximation
110 |     ##         (includes uncertainty in genotyping)
111 |     rho2 <- rho
112 |     rho2[het.ind] <- rho3
113 |     aux <- pmax(logLikBetaBinomialRhoEps(0.0,eps,Dmax,ref,alt),
114 |                 logLikBetaBinomialRhoEps(1.0,eps,Dmax,ref,alt),
115 |                 logLikBetaBinomialRhoEps(0.5,eps,Dmax,ref,alt))
116 |     lrt2 <- logLikBetaBinomialRhoEps(rho2,eps,Dmax,ref,alt) - aux;
117 |     pval2 <- pchisq(2*lrt2,df=1,lower.tail=FALSE)
118 | 
119 |     ##################################################################
120 |     ## return data & metaData of this sample; the data.frame arguments are
121 |     ## kept as written because the column names are derived from them
122 |     betas <- betas.beta.binom
123 |     list(dat=data.frame(annotations$rsID, annotations$chr, annotations$pos0, betas, betas.se, pval2[het.ind]), n.hets=numb.hets, dispersion= Dmax)
124 | 
125 |   })
126 | 
127 |   names(inference.data) <- sample.names
128 |   inference.data
129 | }
130 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # QuASAR: Quantitative Allele Specific Analysis of Reads
  2 | QuASAR ([Harvey et al, 2015]) is an R package that implements a statistical method for: i) genotyping from next-generation sequencing reads, and ii) conducting inference on allelic imbalance at heterozygous sites. The sequencing data can be RNA-seq, DNase-seq, ATAC-seq or any other type of high-throughput sequencing data. The input data to QuASAR is a processed pileup file (as detailed later). Here, we do not cover in depth important pre-processing steps such as choice of the aligner, read filtering and duplicate removal. For our new method for MPRA data analysis, please see the [mpra folder](mpra/). 
  3 | 
  4 | We also want to emphasize that the current software is still in development; we would appreciate any comments and bug reports.
  5 | <!---
  6 | Prior to analsyis, RNA-Seq data must undergo alignment with a modern aligner, quality filtering, duplicate removal, and the creation of pileups. There are many tools and tutorials available for preprocessing Next Generation Sequencing data, but we will only describe the tools we used and expect the user to have basic familiarity with standard bioinformatics command-line tools. Our goal with this tutorial is to cover the following:
  7 | 
  8 | 1. Installing QuASAR
  9 | 2. Preprocessing 
 10 |    * Alignment, filtering, and removing duplicates. (Description of, not a tutorial how)
 11 |    * Pileups and clean pileups
 12 | 3. QuASAR analyis pipeline
 13 |    * Genotyping single or multiple samples
 14 |    * Inference on ASE
 15 |    * Sample workflow
 16 | 
 17 | **Quick-start**: Users comfortable processing RNA-Seq data to the level of pileups should skip to the second step of preprocessing. 
 18 | -->
 19 | 
 20 | ## 1. Installation
 21 | 
 22 | To install from within an R session:
 23 | 
 24 | ```R
 25 | require(devtools)
 26 | install_github('piquelab/QuASAR')
 27 | library('QuASAR')
 28 | ```
 29 | 
 30 | However, this method is occasionally problematic. Alternatively, you can clone/fork this repository and then build the package:
 31 | ```C
 32 | git clone https://github.com/piquelab/QuASAR.git
 33 | R CMD build QuASAR
 34 | ```
 35 | then in R,
 36 | ```R
 37 | install.packages('QuASAR_x.y.tar.gz')
 38 | library(QuASAR)
 39 | ```
 40 | 
 41 | ## 2. Preprocessing
 42 | ### Alignment & filtering
 43 | Raw reads can be aligned to the reference genome using your favorite aligner. Because allele-specific analysis is extremely sensitive to read biases and mapping errors, we strongly recommend adding steps to remove PCR duplicates and to remove reads aligning to areas with known mappability issues (e.g., [Degner et al, 2009]).
 44 | 
 45 | 
 46 | ### Pileups & cleaned pileups
 47 | Note: These steps require [samtools] and [bedtools].
 48 | 
 49 | Using the samtools mpileup command, create a pileup file from aligned reads. Provide a fasta-formatted reference genome (hg19.fa) and a bed file of positions you wish to pileup on (e.g., 1K genomes SNP positions [1KG snp file]):
 50 | 
 51 | ```C
 52 | samtools mpileup -f hg19.fa -l snps.af.bed input.bam | gzip > input.pileup.gz
 53 | ```
 54 | 
 55 | Next, convert the pileup file into bed format and use intersectBed to include the allele frequencies from a bed file. The bed file with allele frequencies should be seven columns: 1-3) coordinate, 4) SNP ID, 5) reference allele, 6) alternate allele, 7) allele frequency. This can be the same [1KG snp file] used in the pileup stage. The awk filter step (below) removes positions not covered by a read, positions covered by indels, and reference skips:
 56 | 
 57 | ```C
 58 | less input.pileup.gz | awk -v OFS='\t' '{ if ($4>0 && $5 !~ /[^\^][<>]/ && $5 !~ /\+[0-9]+[ACGTNacgtn]+/ && $5 !~ /-[0-9]+[ACGTNacgtn]+/ && $5 !~ /[^\^]\*/) print $1,$2-1,$2,$3,$4,$5,$6}' | sortBed -i stdin | intersectBed -a stdin -b snps.af.bed -wo | cut -f 1-7,11-14 | gzip > input.pileup.bed.gz
 59 | ```
 60 | 
 61 | Finally, get the read counts at each position, and, if desired, perform any additional filtering. The result will be the input file for QuASAR. An example processing script is provided here: [scripts/convertPileupToQuasar.R].
 62 | 
 63 | ```C
 64 | R --vanilla --args input.pileup.bed.gz < convertPileupToQuasar.R
 65 | ```
 66 | 
 67 | Here is an example of how the QuASAR input file should look:
 68 | 
 69 | ```C
 70 | zless input.quasar.in.gz | head -5
 71 | chr1	879910	879911	G	A	rs143853699	0.02	21	0	0
 72 | chr1	892379	892380	G	A	rs150615968	0.0041	22	0	0
 73 | chr1	893384	893385	G	A	rs140972868	0.01	6	0	0
 74 | chr1	894101	894102	A	T	rs188691615	0.01	6	0	0
 75 | chr1	894430	894431	G	A	rs201791495	9e-04	9	0	0
 76 | ```
 77 | 
 78 | The fields are as follows: 
 79 | 1. Chromosome 
 80 | 2. Start position 
 81 | 3. End position 
 82 | 4. Reference allele 
 83 | 5. Alternate allele 
 84 | 6. SNP ID 
 85 | 7. SNP allele frequency 
 86 | 8. Number of reads mapping to the reference allele 
 87 | 9. Number of reads mapping to the alternate allele 
 88 | 10. Number of reads not mapping to either allele
 89 | 
 90 | ## 3. Running QuASAR
 91 | 
 92 | ### Prepare the input samples 
 93 | For a test run we provide a small sample dataset containing 6 samples from the same individual. 
 94 | The following commands will download the data to the current folder:
 95 | 
 96 | ```R
 97 | urlData="http://genome.grid.wayne.edu/quasar/sampleinput/"
 98 | fileNames <- paste0("t",c(2,4,6,12,18,24),"hr_Huvec_Rep1.quasar.in.gz")
 99 | sapply(fileNames,function (ii) download.file(paste0(urlData,ii),ii))
100 | ```
101 | 
102 | To run the sample data, or any data, we provide a few helper functions to merge samples across the union of all annotated sites (`UnionExtractFields`), and to filter sites with insufficient coverage across all samples (`PrepForGenotyping`). Note: these functions utilize calls to [bedtools].
103 | 
104 | ```R
105 | ase.dat <- UnionExtractFields(fileNames, combine=TRUE)
106 | ase.dat.gt <- PrepForGenotyping(ase.dat, min.coverage=5)
107 | sample.names <- colnames(ase.dat.gt$ref)
108 | ```
109 | 
110 | ### Genotyping an individual from multiple samples
111 | Genotyping an individual using `fitAseNullMulti` requires a matrix of reference counts and a matrix of alternate counts where the columns are ordered by sample. The final argument is a matrix of priors for the minor allele frequency, for which we use the 1K genomes MAFs assumed to be at Hardy-Weinberg equilibrium.  
112 | ```R
113 | ase.joint <- fitAseNullMulti(ase.dat.gt$ref, ase.dat.gt$alt, log.gmat=log(ase.dat.gt$gmat))
114 | ```
115 | This function returns a list with the following members:
116 | ```R
117 | names(ase.joint)
118 | [1] "gt"        "log.gt"    "eps"       "loglik"    "logliksum"
119 | ```
120 | where the posterior probabilities of the genotypes, `gt`, across all samples are accessed as follows:
121 | ```C
122 | head(ase.joint$gt)
123 |                g0           g1           g2
124 | [1,] 2.870026e-98 1.000000e+00 2.939460e-70
125 | [2,] 1.465195e-27 7.773259e-04 9.992227e-01
126 | [3,] 3.732811e-61 4.308038e-07 9.999996e-01
127 | [4,] 9.992226e-01 7.774208e-04 1.714236e-27
128 | [5,] 9.435425e-87 9.726281e-10 1.000000e+00
129 | [6,] 9.999863e-01 1.372351e-05 6.274482e-46
130 | ```
131 | 
132 | g0=homozygous reference, g1=heterozygous, & g2=homozygous alternate. To save the output genotype probabilities together with the SNP annotation, we do:                                                                                                
133 | ```R
134 | out_dat <- data.frame(ase.dat.gt$annotations[, -5], map=ase.joint$gt)
135 | write.table(out_dat, file='genotypes.txt', row.names=FALSE, col.names=FALSE, quote=FALSE,sep="\t")
136 | ```
137 | 
138 | Estimates of error parameters `eps` for each sample are:
139 | 
140 | ```C
141 | ase.joint$eps
142 | [1] 0.0008748778 0.0007617141 0.0008152132 0.0007819780 0.0008956686
143 | [6] 0.0007597717
144 | ```
145 | 
146 | 
147 | ### Inference on ASE
148 | Using `aseInference` to conduct inference on ASE for an individual requires the posterior probabilities of each genotype from the previous step `"gt"`, estimates of sequencing error for each sample `"eps"`, the same priors used in the previous step, reference counts, alternate counts, minimum coverage, sample names, and variant annotations. 
149 | ```R
150 | ourInferenceData <- aseInference(gts=ase.joint$gt, eps.vect=ase.joint$eps, priors=ase.dat.gt$gmat, ref.mat=ase.dat.gt$ref, alt.mat=ase.dat.gt$alt, min.cov=10, sample.names=sample.names, annos=ase.dat.gt$annotations)
151 | ```
152 | This function returns a list where each element corresponds to an input sample:
153 | ```R
154 | names(ourInferenceData[[1]])
155 | [1] "dat"        "n.hets"     "dispersion"
156 | ```
157 | where `dat` contains estimates of allelic imbalance `betas`, standard errors `betas.se`, & pvalues from an LRT for ASE detailed in [Harvey et al, 2015]. Note that the number of rows (SNPs) in each sample corresponds to the number of heterozygous SNPs passing a minimum coverage filter. 
158 | ```R
159 |  head(ourInferenceData[[1]]$dat)
160 |  annotations.rsID annotations.chr annotations.pos0       betas  betas.se    pval3 
161 | 1        rs2272757            chr1           881626  0.15175892 0.6005410 0.80049721
162 | 2        rs2465128            chr1           981930  0.17948875 0.6445723 0.78065789
163 | 3        rs9442391            chr1           984301 -0.15175892 0.6005410 0.80049721
164 | 4       rs12142199            chr1          1249186 -0.43478406 0.4845478 0.36955958
165 | 5           rs7290            chr1          1477243 -0.99328368 0.5969363 0.09611857
166 | 6           rs7533            chr1          1479332 -0.09853221 0.3981711 0.80455070
167 | ```
168 | The final members of the list are the number of heterozygotes and the estimate of dispersion for each sample.
169 | ```R
170 | head(ourInferenceData[[1]]$n.hets)
171 | [1] 2856
172 | head(ourInferenceData[[1]]$dispersion)
173 | [1] 64.07152
174 | ```
175 | 
176 | The code for this sample workflow is located here:
177 | [scripts/exampleWorkflow.R]
178 | 
179 | <!-- links -->
180 | [Harvey et al, 2015]:http://bioinformatics.oxfordjournals.org/content/31/8/1235
181 | [Degner et al, 2009]:http://www.ncbi.nlm.nih.gov/pubmed/19808877
182 | [samtools]:http://samtools.sourceforge.net/
183 | [bedtools]:https://github.com/arq5x/bedtools2
184 | [scripts/convertPileupToQuasar.R]:scripts/convertPileupToQuasar.R
185 | [scripts/exampleWorkflow.R]:scripts/exampleWorkflow.R
186 | [1KG snp file]:http://genome.grid.wayne.edu/centisnps/1kgSnps.html
187 | 


--------------------------------------------------------------------------------