├── .Rbuildignore ├── .gitignore ├── DESCRIPTION ├── MeRIPtools.Rproj ├── NAMESPACE ├── R ├── AllGenerics.R ├── Guitar_functions.R ├── MeRIP_class.R ├── PlotGene.R ├── QTL_PoissonGamma.R ├── QTL_betaBinomial.R ├── QTL_betaBinomial.neg.R ├── RcppExports.R ├── countReads.R ├── countReadsCarRNA.R ├── countReadsFromBam.R ├── get_peak_logOR.R ├── gtfToGeneModel.R ├── initialization.R ├── methods.R ├── plotGeneCov.R ├── plotMetaGene.R ├── plotPCA.DESeq2.R ├── qqplot.pvalue.R └── randomPeak.R ├── README.md ├── man ├── BetaBinTest.Rd ├── GuitarPlotNew.Rd ├── IP.files.Rd ├── Input.files.Rd ├── MetaGene.Rd ├── PrepCoveragePlot-MeRIP.Peak-method.Rd ├── QNBtest.Rd ├── QTL_BetaBin.Rd ├── QTL_BetaBin2.Rd ├── QTL_BetaBin_permute.Rd ├── QTL_PoissonGamma.Rd ├── RADARtest.Rd ├── adjustExprLevel-MeRIP.Peak-method.Rd ├── annotatePeak-MeRIP.Peak-method.Rd ├── callPeakBinomial.Rd ├── callPeakFisher.Rd ├── consistentPeak-MeRIP.Peak-method.Rd ├── countReads.Rd ├── counts.Rd ├── dot-Bino_test.Rd ├── geneBins-MeRIP-method.Rd ├── geneExpression-MeRIP.Peak-method.Rd ├── geneExpression.Rd ├── geneExpressionTMP.Rd ├── get_peak_logOR.Rd ├── gtfToGeneModel.Rd ├── jointPeakCount.Rd ├── normalizeLibrary-MeRIP.Peak-method.Rd ├── peakDistribution.Rd ├── plotGene.Rd ├── plotGeneCov-MeRIP.Peak-method.Rd ├── plotGeneCoverage.Rd ├── plotGeneMonster.Rd ├── plotGenePair.Rd ├── plotMetaGene.Rd ├── plotMetaGeneMulti.Rd ├── plotPCAfromMatrix.Rd ├── plotTPM.Rd ├── qqplot.pvalue.Rd ├── reportJointPeak.Rd ├── results.Rd ├── select.Rd ├── swapChr22.QTL_BetaBin.Rd └── swapChr22.QTL_BetaBin.permute.Rd └── src ├── .PoissonGamma.cpp.swp ├── .gitignore ├── Makevars ├── PoissonGamma.cpp └── RcppExports.cpp /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: MeRIPtools 2 | Type: Package 3 | Title: Analyze MeRIP-seq data and QTL calling 4 | Version: 0.2.1 5 | Author: Zijie Zhang, Kaixuan Luo , Chuan He, Xin He 6 | Maintainer: The package maintainer 7 | Description: The MeRIPtools is a comprehensive toolset for analyzing MeRIP-seq (e.g. m6A-seq) and data visualization. 8 | License: MIT License 9 | Encoding: UTF-8 10 | LazyData: true 11 | Depends: GenomicFeatures, Rsamtools, ggplot2, doParallel, foreach,grid,rtracklayer,GenomicAlignments,reshape2,Rcpp,RcppArmadillo, qvalue 12 | RoxygenNote: 6.1.1 13 | LinkingTo: Rcpp, RcppArmadillo 14 | Imports: Guitar, stringr,vcfR, broom, DESeq2, ggsci 15 | #Imports: Guitar, stringr,vcfR,gamlss, broom, DESeq2, ggsci 16 | 17 | -------------------------------------------------------------------------------- /MeRIPtools.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | importFrom(Rcpp,sourceCpp) 2 | useDynLib(MeRIPtools) 3 | exportPattern("^[[:alpha:]]+") 4 | 5 | 6 | import(BSgenome) 7 | import(DESeq2) 8 | import(GenomicRanges) 9 | import(Guitar) 10 | #import(QNB) 11 | import(broom) 12 | #import(gamlss) 13 | #import(gamlss.dist) 14 | import(rtracklayer) 15 | import(stringr) 16 | import(vcfR) 17 | -------------------------------------------------------------------------------- /R/AllGenerics.R: -------------------------------------------------------------------------------- 1 | #' @import DESeq2 2 | #' @export 3 | setGeneric("counts", getGeneric("counts", package = "DESeq2")) 4 | 5 | #' @export 6 | #' @rdname counts 7 | setMethod("counts", signature("MeRIP"), function(object){ 8 | object@reads 9 | }) 10 | 11 | #' @export 12 | #' @rdname Input.files 13 | setGeneric("Input.files", function(object) { 14 | standardGeneric("Input.files") 15 | }) 16 | 17 | #' @export 18 | #' @rdname IP.files 19 | setGeneric("IP.files", function(object) { 20 | standardGeneric("IP.files") 21 | }) 22 | 23 | #' @export 24 | setGeneric("geneBins", function(object) { 25 | standardGeneric("geneBins") 26 | }) 27 | 28 | #' @export 29 | setGeneric("jointPeak", function(object) { 30 | standardGeneric("jointPeak") 31 | }) 32 | 33 | #' @export 34 | setGeneric("filter", function(object, ... ) { 35 | standardGeneric("filter") 36 | }) 37 | 38 | #' @export 39 | setGeneric("extractInput", function(object) { 40 | standardGeneric("extractInput") 41 | }) 42 | 43 | #' @export 44 | setGeneric("extractIP", function(object, ...) { 45 | standardGeneric("extractIP") 46 | }) 47 | 48 | #' @export 49 | setGeneric("PrepCoveragePlot", function(object, ...) { 50 | standardGeneric("PrepCoveragePlot") 51 | }) 52 | 53 | #' @export 54 | setGeneric("normalizeLibrary",function(object, ...){ 55 | standardGeneric("normalizeLibrary") 56 | }) 57 | 58 | #' @export 59 | setGeneric("adjustExprLevel",function(object, adjustBy = "geneSum" ){ 60 | standardGeneric("adjustExprLevel") 61 | }) 62 | 63 | #' @export 64 | setGeneric("geneExpression",function(object, ...){ 65 | standardGeneric("geneExpression") 66 | }) 67 | 68 | #' @export 69 | setGeneric("consistentPeak",function(object, samplenames = NULL, joint_threshold = NA, threads = 1){ 70 | standardGeneric("consistentPeak") 71 | }) 72 | 73 | #' @export 74 | setGeneric("variable",function(object){ 75 | standardGeneric("variable") 76 | }) 77 | 78 | #' @export 79 | setGeneric("variable<-",function(object, value){ 80 | standardGeneric("variable<-") 81 | }) 82 | 83 | #' @export 84 | setGeneric("samplenames",function(object){ 85 | standardGeneric("samplenames") 86 | }) 87 | 88 | #' @export 89 | setGeneric("samplenames<-",function(object, value){ 90 | standardGeneric("samplenames<-") 91 | }) 92 | 93 | #' @export 94 | setGeneric("QNBtest",function(object){ 95 | standardGeneric("QNBtest") 96 | }) 97 | 98 | #' @export 99 | setGeneric("peakDistribution",function(object){ 100 | standardGeneric("peakDistribution") 101 | }) 102 | 103 | #' @export 104 | setGeneric("plotGeneCov", function(object, geneName, libraryType = "opposite", center = mean,ZoomIn = NULL, adjustExprLevel = F , split = FALSE, ... ){ 105 | standardGeneric("plotGeneCov") 106 | }) 107 | 108 | #' @export 109 | setGeneric("plotSNPpeakPairs",function(object, genotypeFile, SNPID, geneName, libraryType = "opposite", center = mean,ZoomIn = NULL, adjustExprLevel,adjustExpr_peak_range = NULL ){ 110 | standardGeneric("plotSNPpeakPairs") 111 | }) 112 | 113 | #' @export 114 | setGeneric("geneExpressionTMP",function(object, meanFragmentLength = 150, normalize = T){ 115 | standardGeneric("geneExpressionTMP") 116 | }) 117 | 118 | #' @export 119 | setGeneric("RADARtest",function(object, exclude ,maxPsi){ 120 | standardGeneric("RADARtest") 121 | }) 122 | 123 | #' @export 124 | setGeneric("select" ,function(object, samples, keepData=TRUE){ 125 | standardGeneric("select") 126 | }) 127 | 128 | #' @export 129 | setGeneric("results", function(object){standardGeneric("results")}) 130 | 131 | #' @export 132 | setGeneric("BetaBinTest",function(object, AdjIPeffi = TRUE , AdjustGC = FALSE, BSgenome = BSgenome.Hsapiens.UCSC.hg38, thread = 1){standardGeneric("BetaBinTest")}) 133 | 134 | #' @export 135 | setGeneric("annotatePeak",function(object, threads = 1){standardGeneric("annotatePeak")}) 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /R/MeRIP_class.R: -------------------------------------------------------------------------------- 1 | 2 | #' @export 3 | MeRIP <- setClass("MeRIP", 4 | representation( reads = "matrix", 5 | binSize = "numeric", 6 | geneModel = "GRangesList", 7 | gtf = "character", 8 | bamPath.input = "character", 9 | bamPath.ip = "character", 10 | samplenames = "character", 11 | geneBins = "data.frame", 12 | geneSum = "matrix", 13 | GTF = "GRanges", 14 | mode = "character" 15 | ), 16 | validity = function(object){ 17 | errors <- character() 18 | ## Check basic element 19 | if( all(dim(object@reads) <= 1) ){ 20 | errors <- c(errors, paste0("read count table is a required data for MeRIP instance!")) 21 | } 22 | if( length(object@binSize)==0){ 23 | errors <- c(errors, paste0("bin size to slice the transcript is a required parameter for MeRIP instance!")) 24 | } 25 | if( length(object@geneModel)<=0 ){ 26 | errors <- c(errors, paste0("GeneModel is a required data for MeRIP instance!")) 27 | } 28 | if( length(object@bamPath.input)<=0){ 29 | errors <- c(errors, paste0("path to the input BAM files are the required data for MeRIP instance!")) 30 | } 31 | if( length(object@bamPath.ip)<=0){ 32 | errors <- c(errors, paste0("path to the IP BAM files are the required data for MeRIP instance!")) 33 | } 34 | if( length(object@samplenames)<=0){ 35 | errors <- c(errors, paste0("Sample names are the required data for MeRIP instance!")) 36 | } 37 | ## match data dimension with num of samples 38 | if(ncol(object@reads) != 2 * length(object@samplenames) ){ 39 | errors <- c(errors, paste0("The number of colnumns of read count is ",ncol(object@reads),". The number of samples is ",length(object@samplenames),". The number of colnumns of read count should be 2x the number of samples!")) 40 | } 41 | if(length(object@bamPath.input) != length(object@samplenames) ){ 42 | errors <- c(errors, paste0("The number of input bam file path(es) should equal to the number of samplenames!")) 43 | } 44 | if(length(object@bamPath.ip) != length(object@samplenames) ){ 45 | errors <- c(errors, paste0("The number of IP bam file path(es) should equal to the number of samplenames!")) 46 | } 47 | 48 | if (length(errors) == 0) TRUE else errors 49 | }, 50 | prototype(geneBins = data.frame(gene = character(), bin = character() ), geneSum = matrix(),GTF = GRanges(), mode = "mRNA" ) 51 | ) 52 | 53 | #' @export 54 | MeRIP.Peak <- setClass("MeRIP.Peak",representation( peakCallResult = "matrix", 55 | jointPeak_id_pairs = "matrix", 56 | jointPeaks = "data.frame", 57 | jointPeak_ip = "matrix", 58 | jointPeak_input = "matrix", 59 | norm.jointPeak_ip = "matrix", 60 | sizeFactor = "data.frame", 61 | variate = "data.frame", 62 | jointPeak_adjExpr = "matrix", 63 | test.est = "matrix", 64 | peakCalling = "character", 65 | jointPeak_threshold = "numeric", 66 | test.method = "character"), 67 | contains = "MeRIP", 68 | prototype(peakCalling = "none", jointPeak_threshold = 0, test.method = "none") 69 | ) 70 | 71 | 72 | 73 | #' @export 74 | setMethod("show",signature("MeRIP"), function(object){ 75 | summary(object) 76 | }) 77 | #' @export 78 | setMethod("show",signature("MeRIP.Peak"), function(object){ 79 | summary(object) 80 | }) 81 | 82 | 83 | #' @export 84 | setMethod("summary", signature("MeRIP"), function(object){ 85 | cat(paste0("MeRIP dataset of ",length(object@samplenames)," samples.\n")) 86 | cat("The total read count for Input and IP samples are (Million reads):\n") 87 | totoReads <- rbind("Input" = round(colSums(object@reads[,1:length(object@samplenames)])/1e6, digits = 2), 88 | "IP" = round(colSums(object@reads[,-c(1:length(object@samplenames))])/1e6, digits = 2)) 89 | colnames(totoReads) <- object@samplenames 90 | print(totoReads) 91 | }) 92 | #' @export 93 | setMethod("summary", signature("MeRIP.Peak"), function(object){ 94 | cat(paste0("MeRIP dataset of ",length(object@samplenames)," samples.\n")) 95 | cat("The total read count for Input and IP samples are (Million reads):\n") 96 | totoReads <- rbind("Input" = round(colSums(object@reads[,1:length(object@samplenames)])/1e6, digits = 2), 97 | "IP" = round(colSums(object@reads[,-c(1:length(object@samplenames))])/1e6, digits = 2)) 98 | colnames(totoReads) <- object@samplenames 99 | 100 | print(totoReads) 101 | if(object@peakCalling != "none"){cat(paste0("\nPeak calling done by ",object@peakCalling,".\n"))} 102 | if(nrow(object@jointPeaks)>0 & object@jointPeak_threshold> 0 ){ 103 | cat(paste0(nrow(object@jointPeaks)," joint peak reported at threshold ",object@jointPeak_threshold," (requiring peaks to be called in at least ",object@jointPeak_threshold," samples).\n") ) 104 | } 105 | if(nrow(object@geneSum)>0){ cat("Input gene level count available.\n") } 106 | if(nrow(object@variate)>0){ cat(paste0("There are ",ncol(object@variate)," predictor variables/covariates. Can access by function variable(MeRIPdata). \n"))} 107 | if(object@test.method != "none" & nrow(object@test.est)>0){ print(paste0("Differential peaks tested by ",object@test.method,".\n"))} 108 | }) 109 | 110 | 111 | ## A function to convert old version list data into S4 class MeRIP dataset 112 | #' @export 113 | makeMeRIPfromList <- function(x, gtf){ 114 | if( all(c("reads","binSize","geneModel","bamPath.input","bamPath.ip","samplenames") %in% names(x) ) ){ 115 | return(MeRIP(reads = x$reads, binSize = x$binSize, geneModel = x$geneModel, gtf = gtf ,bamPath.input = x$bamPath.input, bamPath.ip = x$bamPath.ip, samplenames = x$samplenames) ) 116 | }else{ 117 | stop("The list must have the most basic elements for MeRIP dataset! ") 118 | } 119 | 120 | } 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /R/PlotGene.R: -------------------------------------------------------------------------------- 1 | #' @title old coverage plot function. 2 | #' @param IP_BAM The bam files for IP samples 3 | #' @param INPUT_BAM The bam files for INPUT samples 4 | #' @param size.IP The size factor for IP libraries 5 | #' @param size.INPUT The size factor for INPUT libraries 6 | #' @param geneName The name (as defined in gtf file) of the gene you want to plot 7 | #' @param geneModel The gene model generated by gtfToGeneModel() function 8 | #' @param libraryType "opposite" for mRNA stranded library, "same" for samll RNA library 9 | #' @param GTF gtf annotation as GRanges object. Can be obtained by GTF <- rtracklayer::import("xxx.gtf",format = "gtf") 10 | #' @export 11 | ## the main function to plot m6A-seq on one group of data 12 | plotGene <- function(IP_BAM, INPUT_BAM, size.IP, size.INPUT, geneName, geneModel, libraryType = "opposite", center = mean ,GTF,ZoomIn=NULL){ 13 | IP.cov <- getAveCoverage(geneModel= geneModel,bamFiles = IP_BAM,geneName = geneName,size.factor = size.IP, libraryType = libraryType, center = center, ZoomIn = ZoomIn) 14 | INPUT.cov <- getAveCoverage(geneModel= geneModel,bamFiles = INPUT_BAM,geneName = geneName,size.factor = size.INPUT, libraryType = libraryType, center = center,ZoomIn = ZoomIn) 15 | cov.data <- data.frame(IP=IP.cov,Input=INPUT.cov,genome_location=as.numeric(names(IP.cov) ) ) 16 | yscale <- max(IP.cov,INPUT.cov) 17 | p1 <- "ggplot(data = cov.data,aes(genome_location))+geom_line(aes(y=Input,colour =\"Input\"))+geom_line(aes(y=IP,colour=\"IP\"))+labs(y=\"normalized coverage\")+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) ),1))+ 18 | theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), 19 | panel.grid.minor = element_blank(), axis.line = element_line(colour = \"black\"))" 20 | 21 | p2 <- .getGeneModelAnno(geneModel,geneName,GTF,ZoomIn) 22 | p <- paste(p1,p2,sep = "+") 23 | eval(parse( text = p )) 24 | } 25 | 26 | ## the main function to plot m6A-seq on two group of data 27 | #' @title plotGenePair 28 | #' @description plot tow groups of samples in the same figure 29 | #' @param Ctl_IP_BAM The bam files for Control IP samples 30 | #' @param Ctl_INPUT_BAM The bam files for Control INPUT samples 31 | #' @param Treat_IP_BAM The bam files for treated IP samples 32 | #' @param Treat_INPUT_BAM The bam files for treated INPUT samples 33 | #' @param Ctl_size.IP The size factor for IP libraries 34 | #' @param Ctl_size.INPUT The size factor for INPUT libraries 35 | #' @param Treat_size.IP The size factor for IP libraries 36 | #' @param Treat_size.INPUT The size factor for INPUT libraries 37 | #' @param geneName The name (as defined in gtf file) of the gene you want to plot 38 | #' @param geneModel The gene model generated by gtfToGeneModel() function 39 | #' @param libraryType "opposite" for mRNA stranded library, "same" for samll RNA library 40 | #' @export 41 | plotGenePair <- function(Ctl_IP_BAM,Ctl_INPUT_BAM,Treat_IP_BAM,Treat_INPUT_BAM,Ctl_size.IP,Ctl_size.INPUT,Treat_size.IP,Treat_size.INPUT,geneName,geneModel, libraryType = "ooposite",center = mean, GTF ,ZoomIn=NULL){ 42 | Ctl_IP.cov <- getAveCoverage(geneModel= geneModel,bamFiles = Ctl_IP_BAM,geneName = geneName,size.factor = Ctl_size.IP, libraryType = libraryType,center = center, ZoomIn = ZoomIn) 43 | Ctl_INPUT.cov <- getAveCoverage(geneModel= geneModel,bamFiles = Ctl_INPUT_BAM,geneName = geneName,size.factor = Ctl_size.INPUT,libraryType = libraryType, center = center , ZoomIn = ZoomIn) 44 | Treat_IP.cov <- getAveCoverage(geneModel= geneModel,bamFiles = Treat_IP_BAM,geneName = geneName,size.factor = Treat_size.IP, libraryType = libraryType, center = center,ZoomIn = ZoomIn) 45 | Treat_INPUT.cov <- getAveCoverage(geneModel= geneModel,bamFiles = Treat_INPUT_BAM,geneName = geneName,size.factor = Treat_size.INPUT, libraryType = libraryType, center = center,ZoomIn = ZoomIn) 46 | cov.data <- data.frame(Ctl_IP=Ctl_IP.cov, Ctl_Input = Ctl_INPUT.cov, 47 | Treat_IP=Treat_IP.cov, Treat_Input = Treat_INPUT.cov, 48 | genome_location=as.numeric(names(Ctl_IP.cov) ) ) 49 | yscale <- max(Ctl_IP.cov,Ctl_INPUT.cov,Treat_IP.cov,Treat_INPUT.cov) 50 | p1 <- "ggplot(data = cov.data,aes(genome_location))+geom_line(aes(y=Ctl_Input,colour =\"Ctl Input\"))+geom_line(aes(y=Treat_IP,colour=\"Treat IP\"))+geom_line(aes(y=Treat_Input,colour =\"Treat Input\"))+geom_line(aes(y=Ctl_IP,colour=\"Ctl IP\"))+labs(y=\"normalized coverage\")+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) ),1))+ 51 | theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), 52 | panel.grid.minor = element_blank(), axis.line = element_line(colour = \"black\"))" 53 | p2 <- .getGeneModelAnno(geneModel,geneName,GTF,ZoomIn) 54 | p <- paste(p1,p2,sep = "+") 55 | eval(parse( text = p )) 56 | 57 | } 58 | 59 | 60 | #' @title plotGeneMonster 61 | #' @param readsOut The data list from countReads and other analysis. 62 | #' @param geneName The gene symbol to be ploted. 63 | #' @param GTF The GRanges object containing gtf annotation. 64 | #' @param ZoomIn c(start,end) The coordinate to zoom in at the gene to be ploted. 65 | #' @export 66 | plotGeneMonster <- function(readsOut, geneName, libraryType = "opposite", center = "mean", GTF, ZoomIn = NULL){ 67 | if("X" %in% names(readsOut) ){ 68 | X <- readsOut$X 69 | plotGenePair(Ctl_IP_BAM = readsOut$bamPath.ip[X == unique(X)[1]], 70 | Ctl_INPUT_BAM = readsOut$bamPath.input[X == unique(X)[1]], 71 | Treat_IP_BAM = readsOut$bamPath.ip[X == unique(X)[2]], 72 | Treat_INPUT_BAM = readsOut$bamPath.input[X == unique(X)[2]], 73 | Ctl_size.IP = readsOut$sizeFactor$ip[X == unique(X)[1]], 74 | Ctl_size.INPUT = readsOut$sizeFactor$input[X == unique(X)[1]], 75 | Treat_size.IP = readsOut$sizeFactor$ip[X == unique(X)[2]], 76 | Treat_size.INPUT = readsOut$sizeFactor$input[X == unique(X)[2]], 77 | geneName = geneName, 78 | geneModel = readsOut$geneModel, 79 | libraryType = libraryType,center = center,GTF = GTF,ZoomIn = ZoomIn ) 80 | }else{ 81 | plotGene(IP_BAM = readsOut$bamPath.ip, 82 | INPUT_BAM = readsOut$bamPath.input, 83 | size.IP = readsOut$sizeFactor$ip, 84 | size.INPUT = readsOut$sizeFactor$input, 85 | geneName = geneName, 86 | geneModel = readsOut$geneModel, 87 | libraryType = libraryType,center = center,GTF = GTF,ZoomIn = ZoomIn) 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /R/QTL_PoissonGamma.R: -------------------------------------------------------------------------------- 1 | #' @title QTL_PoissonGamma 2 | #' @param pheno The phenotype data matrix. Needs to be IP read count that has been normalized for expression level. 3 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!! 4 | #' @param peak_bed The peak file in BED12 format that needs to correspond to phenotype data matrix. 5 | #' @param testWindow Integer. Test SNPs in bp window flanking the peak. 6 | #' @param Chromosome The chromsome to run QTL test. 7 | #' @param Range The position range on a chromosome to test. 8 | #' @param Covariates The matrix for covariates to be included in the test. 9 | #' @param maxPsi The max estimation for the random effect parameter Psi. 10 | #' @import stringr 11 | #' @import vcfR 12 | #' @export 13 | QTL_PoissonGamma <- function( pheno, vcf_file, peak_bed, testWindow = 100000, Chromosome, Range = NULL, Covariates = NULL, maxPsi = 100, thread = 1 ){ 14 | 15 | ##check input 16 | if(nrow(pheno) != nrow(peak_bed) ){ 17 | stop("The number of row of phenotype is not equal to the number of peaks!") 18 | } 19 | 20 | ## set ranges on the chromosome that can be tested 21 | con <- pipe(paste0("zcat ",vcf_file," | awk '!/^#/ {print $2}' | tail -n1")) 22 | vcfRange <- c( read.table(gzfile(vcf_file), nrows = 1)[,2] , 23 | scan( con , quiet = T )) 24 | close(con) 25 | ## update test Range if necessary 26 | if(!is.null(Range)){ 27 | vcfRange <- intersect(IRanges(vcfRange[1],vcfRange[2]),IRanges(Range[1],Range[2]) ) 28 | }else{ 29 | vcfRange <- IRanges(vcfRange[1],vcfRange[2]) 30 | } 31 | 32 | 33 | ## parse bed12 file 34 | colnames(peak_bed) <- c("chr","start","end","name","score","strand","thickStart","thickEnd","RGB","numBlock","blockSize","blockStart") 35 | peak_bed.gr <- makeGRangesFromDataFrame(peak_bed, keep.extra.columns = T) 36 | test.id <- which(peak_bed$chr == Chromosome & (( peak_bed$end+testWindow ) > start(vcfRange) ) & (( peak_bed$start-testWindow ) < end(vcfRange) ) ) 37 | peak_bed.gr <- peak_bed.gr[test.id ] 38 | phenoY <- pheno[test.id,] 39 | 40 | ## test each peak 41 | startTime <- Sys.time() 42 | registerDoParallel(thread) 43 | testResult <- foreach( i = 1:length(peak_bed.gr), .combine = rbind )%dopar% { 44 | 45 | ## get the range where SNPs are available 46 | testRange <- intersect(IRanges(start(peak_bed.gr[i])-testWindow, end(peak_bed.gr[i])+testWindow ),vcfRange) 47 | 48 | ## Test association if there is SNP available for this peak 49 | if(length(testRange)==1){ 50 | ## read genotype 51 | system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}(!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ~/tmp",i,".vcf.gz")) 52 | geno.vcf <-try( read.vcfR( file =paste0("~/tmp",i,".vcf.gz"), verbose = F ) , silent = T) 53 | #################################################################### 54 | ### This is to handle a wired error in the read.vcfR function. 55 | if(class(geno.vcf) == "try-error"){ 56 | system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0} (!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ~/tmp",i,".vcf.gz")) 57 | geno.vcf <-read.vcfR( file =paste0("~/tmp",i,".vcf.gz"), verbose = F ) 58 | } 59 | #################################################################### 60 | file.remove(paste0("~/tmp",i,".vcf.gz")) 61 | ## filter biallelic snps 62 | geno.vcf <- geno.vcf[is.biallelic(geno.vcf),] 63 | 64 | ## get genotype as Dosage 65 | tmp_geno <- extract.gt(geno.vcf, element = 'GT' ) 66 | geno <- t( apply( tmp_geno ,1, .genoDosage ) ) 67 | colnames(geno) <- colnames(tmp_geno) 68 | 69 | ## Determine whether to include covariates 70 | if( is.null(Covariates) ){ 71 | Y <- round( phenoY[i,] ) 72 | psi <- 10 73 | tmp_est <- t( apply(geno,1,function(X){ 74 | model1 <- glm(Y ~ X, family = poisson(link = 'log')) 75 | coef <- model1$coefficients 76 | mu2 <- coef[1] 77 | beta <- coef[2] 78 | est <- try(unlist(PoissionGamma(Y, X, beta, psi, mu2, gamma = 0.75, steps = 50, down = 0.1,psi_cutoff = maxPsi))) 79 | if(class(est) != "try-error"){ return(est) } 80 | }) 81 | ) 82 | 83 | ## Post process estimations 84 | tested.id <- match(rownames(tmp_est),geno.vcf@fix[,"ID"]) 85 | ## calculate distance with respect to transcript(gene) strand 86 | distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){ 87 | as.integer(geno.vcf@fix[tested.id,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) 88 | }else{ 89 | round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[tested.id,"POS"]) 90 | } 91 | report <- data.frame( 92 | SNP = paste(geno.vcf@fix[tested.id,"CHROM"],geno.vcf@fix[tested.id,"POS"],sep = ":"), 93 | SNPID = rownames(tmp_est), 94 | REF = geno.vcf@fix[tested.id,"REF"], 95 | ALT = geno.vcf@fix[tested.id,"ALT"], 96 | PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ), 97 | DISTANCE = distance, 98 | beta = tmp_est[,"beta"], 99 | pvalue = tmp_est[,"p_value"], 100 | psi = tmp_est[,"psi"] 101 | ) 102 | 103 | }else{ 104 | ## Check covariates 105 | if(!is.numeric(Covariates)){stop("Please convert covariates into numerical variables...")} 106 | Y <- unlist( round( phenoY[i,] ) ) 107 | psi <- 10 108 | ## Test against each genotype 109 | tmp_est <- t( apply(geno,1,function(X1){ 110 | X.all <- cbind(X1,Covariates) # new design matrix 111 | colnames(X.all) <- paste("X",1:ncol(X.all),sep = "") 112 | design.multiBeta <- formula( paste( "log(Y+1) ~ ",paste("X.all[,", 1:ncol(X.all),"]", sep = "",collapse = " + ")) ) 113 | ## Run multi-beta PoissonGamma 114 | aa <- unlist(summary( lm( design.multiBeta ) )$coefficients[, 1]) 115 | mu2 <- aa[1] 116 | beta <- aa[2:(ncol(X.all)+1 )] 117 | est <- try(unlist(PoissonGamma::PoissionGamma_multiple_beta(Y, X.all, beta, psi, mu2, gamma = 0.25, steps = 10, down = 0.1,psi_cutoff = maxPsi))) 118 | if(class(est) != "try-error"){ return(est) } 119 | }) 120 | ) 121 | 122 | ## Post process estimations 123 | tested.id <- match(rownames(tmp_est),geno.vcf@fix[,"ID"]) 124 | ## calculate distance with respect to transcript(gene) strand 125 | distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){ 126 | as.integer(geno.vcf@fix[tested.id,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) 127 | }else{ 128 | round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[tested.id,"POS"]) 129 | } 130 | report <- data.frame( 131 | SNP = paste(geno.vcf@fix[tested.id,"CHROM"],geno.vcf@fix[tested.id,"POS"],sep = ":"), 132 | SNPID = rownames(tmp_est), 133 | REF = geno.vcf@fix[tested.id,"REF"], 134 | ALT = geno.vcf@fix[tested.id,"ALT"], 135 | PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ), 136 | DISTANCE = distance, 137 | beta = tmp_est[,"beta1"], 138 | pvalue = tmp_est[,"p_value3"], 139 | psi = tmp_est[,"psi"] 140 | ) 141 | 142 | } 143 | report 144 | } 145 | 146 | } 147 | rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals) 148 | endTime <- Sys.time() 149 | cat(paste("Time used to test association: ",difftime(endTime, startTime, units = "mins")," mins... \n")) 150 | 151 | return(testResult) 152 | } 153 | 154 | 155 | 156 | .genoDosage <- function(x){ 157 | return( stringr::str_count(x,"1") ) 158 | } 159 | 160 | -------------------------------------------------------------------------------- /R/QTL_betaBinomial.R: -------------------------------------------------------------------------------- 1 | #' @title QTL_BetaBin 2 | #' @param MeRIPdata The MeRIP.Peak object 3 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!! 4 | #' @param BSgenome The BSgenome object. This needs to match the genome version of the gtf files. 5 | #' @param testWindow Integer. Test SNPs in bp window flanking the peak. 6 | #' @param Chromosome The chromsome to run QTL test. 7 | #' @param Range The position range on a chromosome to test. 8 | #' @param Covariates The matrix for covariates to be included in the test. 9 | #' @param AdjustGC Logic. Choose whether explicitly adjust GC bias. 10 | #' @param AdjIPeffi Logic. Choose whether explicitly adjust overall IP efficiency 11 | #' @param normalizeGenotype Logic. Choose whether genotype is normalized to mean = 0, var = 1 before regression. 12 | #' @import stringr 13 | #' @import vcfR 14 | #' @import BSgenome 15 | #' @import gamlss 16 | #' @import gamlss.dist 17 | #' @import broom 18 | #' @export 19 | QTL_BetaBin <- function( MeRIPdata , vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19,testWindow = 100000, Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE, AdjIPeffi = TRUE , PCsToInclude = 0 , normalizeGenotype = FALSE, thread = 1 ){ 20 | 21 | ##check input 22 | if( !is(MeRIPdata, "MeRIP.Peak") ){ 23 | stop("The input MeRIPdata needs to be an MeRIP.Peak object!") 24 | }else if( !nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_ip) & nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_input) ){ 25 | stop("The peak counts matrix dimension must match the dimension of jointPeaks!") 26 | } 27 | 28 | ## check samples in genotype files and in MeRIP.Peak object 29 | tmpVcf <- tempfile(fileext = ".vcf.gz") 30 | system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}'| gzip > ",tmpVcf)) 31 | tmp.vcf <-try( read.vcfR( file =tmpVcf, verbose = F ) , silent = T) 32 | ############################################################################################################################################################ 33 | ### This is to handle a wired error in the read.vcfR function. ############################################################################################# 34 | if(class(tmp.vcf) == "try-error"){ ## 35 | system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0}'| gzip >",tmpVcf)) ## 36 | tmp.vcf <-read.vcfR( file =tmpVcf, verbose = F ) ## 37 | } ## 38 | ############################################################################################################################################################ 39 | unlink(tmpVcf) # remove the temp file to free space. 40 | genotypeSamples <- colnames(tmp.vcf@gt)[-c(1)] 41 | if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) == 0 ){ 42 | stop("The samplenames must match in VCF file and in MeRIP.Peak object! We found no overlap between sample names in these two files!") 43 | }else if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) != length(samplenames(MeRIPdata) )){ 44 | cat("The samples in the VCF don't totally match samples in the MeRIP.Peak object; ") 45 | cat("Only samples in MeRIP.Peak object overlapping samples in VCF file will be analyzed in QTL mapping!\nSubsetting samples...\n") 46 | MeRIPdata <- MeRIPtools::select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata)) ) 47 | cat(paste0(paste(intersect(genotypeSamples,samplenames(MeRIPdata)),collapse = " "), "\n(",length(intersect(genotypeSamples,samplenames(MeRIPdata))),") samples will be analyzed!")) 48 | }else{ 49 | ## make sure the order of samples aligned between phenotype and genotype 50 | MeRIPdata <- select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata)) ) 51 | } 52 | 53 | ### Preprocess 54 | ##fitler out peaks with zero count 55 | MeRIPdata <- filter(MeRIPdata, !apply(extractInput(MeRIPdata), 1, function(x) any(x == 0 )) ) 56 | cat("Peaks with zero read count in input data have been removed.\n") 57 | 58 | T0 <- colSums(counts(MeRIPdata)[,1:length(MeRIPdata@samplenames)] ) 59 | T1 <- colSums(counts(MeRIPdata)[,(length(MeRIPdata@samplenames)+1) : (2*length(MeRIPdata@samplenames)) ] ) 60 | 61 | ##filter out peaks with OR < 1 62 | enrichFlag <- apply( t( t(extractIP(MeRIPdata))/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ),1,function(x){sum(x>1)> MeRIPdata@jointPeak_threshold}) 63 | MeRIPdata <- filter(MeRIPdata, enrichFlag ) 64 | cat(paste0("Peaks with odd ratio > 1 in more than ",MeRIPdata@jointPeak_threshold," samples will be retained.\n",nrow(jointPeak(MeRIPdata))," peaks remaining for QTL mapping.\n")) 65 | 66 | ## estimate IP efficiency 67 | OR <- t( apply(extractIP(MeRIPdata),1,.noZero)/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ) 68 | colnames(OR) <- MeRIPdata@samplenames 69 | logOR <- log(OR) 70 | 71 | logOR.id <- which( rowMeans(logOR) < quantile( rowMeans(logOR), 0.95 ) & rowMeans(logOR) > quantile( rowMeans(logOR), 0.05) )# remove two tails 72 | K_IPe_ij <- apply(logOR[logOR.id,], 2, function(x){ 73 | 74 | fit <- lm(y~m, data = data.frame(y = x, m=rowMeans(logOR)[logOR.id] )) 75 | y.est <- predict(fit, newdata = data.frame(m = rowMeans(logOR))) 76 | return( y.est - rowMeans(logOR) ) 77 | }) 78 | 79 | ## estimate GC bias offset 80 | if(AdjustGC){ 81 | cat("Computing GC content for peaks\n") 82 | ## GC content correction 83 | peak.gr <- .peakToGRangesList( jointPeak(MeRIPdata)) 84 | cat("...") 85 | 86 | registerDoParallel( thread ) 87 | peakSeq <- foreach( i = 1:length(peak.gr), .combine = c )%dopar%{ 88 | paste( getSeq( BSgenome , peak.gr[[i]] ,as.character =T ) , collapse = "") 89 | } 90 | peakGC <- sapply( peakSeq, function(x){ sum( str_count(x, c("G","g","C","c")) )/nchar(x) } ) 91 | 92 | cat("...") 93 | 94 | peakGC_l <- round(peakGC,digits = 2) 95 | peakGC_l[which(peakGC_l<0.2)] <-median(peakGC_l[which(peakGC_l<0.2)] ) # combine some bins at low GC due to low number of peaks 96 | peakGC_l[which(peakGC_l>0.84)] <-median(peakGC_l[which(peakGC_l>0.84)] ) # combine some bins at high GC due to low number of peaks 97 | l <- sort(unique(peakGC_l)) 98 | if(AdjIPeffi){y <- (log( OR ) - K_IPe_ij )}else{y <- log(OR) } 99 | colnames(y) <- MeRIPdata@samplenames 100 | b.l <- tapply(rowMeans( y ), peakGC_l , median) 101 | bil <- apply( y, 2, tapply, peakGC_l, median ) 102 | bi. <- apply( y , 2, median ) 103 | b.. <- median( y ) 104 | Fil <- as.data.frame( as.matrix(bil) - as.vector(b.l) ) - ( bi. - b.. ) 105 | Fij <- foreach( ii = 1:length(MeRIPdata@samplenames), .combine = cbind)%dopar%{ 106 | GC_fit <- lm(Fil[,ii] ~ poly(l,4) ) 107 | predict(GC_fit, newdata = data.frame(l = peakGC) ) 108 | } 109 | colnames(Fij) <- MeRIPdata@samplenames 110 | cat("...\n") 111 | } 112 | ##Principal components 113 | if(PCsToInclude > 0 & PCsToInclude <= length(MeRIPdata@samplenames) ){ 114 | cat("Computing Principal components.\n") 115 | if(AdjustGC & AdjIPeffi){ 116 | PCs <- prcomp(t( (log(OR ) - K_IPe_ij - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ) )$x 117 | }else if( AdjustGC & !AdjIPeffi){ 118 | PCs <- prcomp(t( (log(OR ) - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x 119 | }else if(AdjIPeffi){ 120 | PCs <- prcomp(t( (log(OR ) - K_IPe_ij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x 121 | }else{ 122 | PCs <- prcomp( t( log(OR)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x 123 | } 124 | }else if(PCsToInclude > length(MeRIPdata@samplenames) ){ 125 | stop("The number of PCs needs to be no larger than the sample size!") 126 | } 127 | 128 | cat("The following message can be ignored...\n----------------\n") 129 | ## set ranges on the chromosome that can be tested 130 | con1 <- pipe(paste0("zcat ",vcf_file," | grep -w '",Chromosome,"' |awk '!/^#/ {print $2}' | head -n1")) 131 | con2 <- pipe(paste0("zcat ",vcf_file," | grep -w '",Chromosome,"' |awk '!/^#/ {print $2}' | tail -n1")) 132 | vcfRange <- c( scan( con1 , quiet = T ) , 133 | scan( con2 , quiet = T )) 134 | close(con1) 135 | close(con2) 136 | cat("----------------\n") 137 | ## update test Range if necessary 138 | if(!is.null(Range)){ 139 | vcfRange <- intersect(IRanges(vcfRange[1],vcfRange[2]),IRanges(Range[1],Range[2]) ) 140 | }else{ 141 | vcfRange <- IRanges(vcfRange[1],vcfRange[2]) 142 | } 143 | 144 | 145 | ## parse bed12 file 146 | peak_bed <- jointPeak(MeRIPdata) 147 | peak_bed.gr <- makeGRangesFromDataFrame(peak_bed, keep.extra.columns = T) 148 | test.id <- which(peak_bed$chr == Chromosome & (( peak_bed$end+testWindow ) > start(vcfRange) ) & (( peak_bed$start-testWindow ) < end(vcfRange) ) ) 149 | 150 | peak_bed.gr <- peak_bed.gr[test.id ] 151 | Y1 <- extractIP(MeRIPdata)[test.id,] 152 | Y0 <- extractInput(MeRIPdata)[test.id,] 153 | if(AdjustGC){FIj <- Fij[test.id,] } 154 | 155 | ## ditermine study design according to parameters 156 | variables <- "offset(log(T1/T0))" 157 | if( AdjIPeffi ){ variables <- paste(variables, "offset(K_IPe_j)", sep = " +") } 158 | if( AdjustGC ){ variables <- paste(variables, "offset(Fj)", sep = " +") } 159 | if(! is.null(Covariates) ){ 160 | variables <- paste(variables, paste(colnames(Covariates),collapse = " + "), sep = "+") 161 | } 162 | if( PCsToInclude > 0 ){ 163 | variables <- paste(variables, paste("PC", 1:PCsToInclude, sep = "",collapse = " + "), sep = "+") 164 | } 165 | design <- formula( paste0("cbind(Y1i , Y0i) ~" ," G + ", variables) ) 166 | 167 | cat(paste0("Start beta-binomial regression for ",length(peak_bed.gr)," peaks and SNPs in ",round(testWindow/1000,digits = 1),"kb flanking each peaks on chromosome ",Chromosome,".\n")) 168 | if(AdjustGC){cat("Will correct sample specific GC bias\n")} 169 | ## test each peak 170 | startTime <- Sys.time() 171 | registerDoParallel(thread) 172 | testResult <- foreach( i = 1:length(peak_bed.gr), .combine = rbind )%dopar% { 173 | 174 | ## get the range where SNPs are available 175 | testRange <- GenomicRanges::intersect(IRanges(start(peak_bed.gr[i])-testWindow, end(peak_bed.gr[i])+testWindow ),vcfRange) 176 | 177 | ## Test association if there is SNP available for this peak 178 | if(length(testRange)==1){ 179 | 180 | ## Use unix command line to accesss the genotype from vcf file. This is for fast data access. 181 | tmpVcf <- tempfile(fileext = ".vcf.gz") 182 | system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}(!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ",tmpVcf)) 183 | geno.vcf <-try( read.vcfR( file =tmpVcf, verbose = F ) , silent = T) 184 | ############################################################################################################################################################ 185 | ### This is to handle a wired error in the read.vcfR function. ############################################################################################# 186 | if(class(geno.vcf) == "try-error"){ ## 187 | system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0} (!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip >",tmpVcf)) ## 188 | geno.vcf <-read.vcfR( file =tmpVcf, verbose = F ) ## 189 | } ## 190 | ############################################################################################################################################################ 191 | unlink(tmpVcf) # remove the temp file to free space. 192 | 193 | ## check if genotype available for this peak 194 | if( nrow(geno.vcf@fix) == 0 ){ return(NULL) } 195 | 196 | ## Get genotype as dosage format and filter for MAF 197 | if( unique(geno.vcf@gt[,"FORMAT"]) == "DS" ){ 198 | ## Directly extract dosage 199 | geno <- if(nrow(geno.vcf@fix) == 1){ 200 | t(apply(extract.gt(geno.vcf, element = 'DS' ),2,as.numeric ) ) 201 | }else{ 202 | apply(extract.gt(geno.vcf, element = 'DS' ),2,as.numeric ) 203 | } 204 | rownames(geno) <- geno.vcf@fix[,"ID"] 205 | ## filter out any genotype that has MAF<0.05 206 | MAF <- apply(geno,1,function(x) !any(table(round(x) )>0.95*ncol(geno)) ) 207 | geno <- geno[MAF,] 208 | geno.vcf <- geno.vcf[MAF,] 209 | }else if(unique(geno.vcf@gt[,"FORMAT"]) == "GT"){ 210 | geno.vcf <- geno.vcf[is.biallelic(geno.vcf),] 211 | ## get genotype as Dosage 212 | tmp_geno <- extract.gt(geno.vcf, element = 'GT' ) 213 | geno <- t( apply( tmp_geno ,1, .genoDosage ) ) 214 | colnames(geno) <- colnames(tmp_geno) 215 | ## filter out any genotype that has MAF<0.05 216 | MAF <- apply(geno,1,function(x) !any(table(x)>0.95*ncol(geno)) ) 217 | geno <- geno[MAF,] 218 | geno.vcf <- geno.vcf[MAF,] 219 | } 220 | if( nrow(geno) == 0 ){ return(NULL) } ## skip this iteration if no genotype left after filtering 221 | 222 | ## normalized genotype is necessary 223 | if(normalizeGenotype){ 224 | geno <- t( apply(geno,1,function(x){ (x - mean(x) )/sd(x) }) ) 225 | } 226 | 227 | if(AdjustGC){Fj <- FIj[i,]} 228 | K_IPe_j <- K_IPe_ij[i,] 229 | 230 | ## Test QTLs for peak.j 231 | tmp_est <- as.data.frame(matrix(nrow = nrow(geno),ncol = 5),row.names = rownames(geno) ) 232 | for( ii in 1:nrow(geno) ){ 233 | if(AdjustGC){ 234 | fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j , Fj , G = geno[ii,]) 235 | }else{ 236 | fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j , G = geno[ii,]) 237 | } 238 | 239 | ## add PCs to data 240 | if(PCsToInclude > 1 ){ fit_data <- cbind(fit_data,data.frame(PCs[,1:PCsToInclude]) )}else if(PCsToInclude == 1){fit_data <- cbind(fit_data,data.frame(PC1 = PCs[,1]) ) } 241 | ## add covariates 242 | if(! is.null(Covariates) ){fit_data <- cbind(fit_data,as.data.frame(Covariates) ) } 243 | 244 | fit <- try( gamlss( design ,data = fit_data , family = BB(mu.link = "logit")) ) 245 | if(class(fit)[1]!= "try-error"){ 246 | est <- tidy(fit) 247 | tmp_est[ii,] <- data.frame(beta = est[est$term == "G","estimate"], 248 | std.err = est[est$term == "G","std.error"], 249 | pvalue = est[est$term == "G","p.value"], 250 | theta = 1/exp(est[est$parameter == "sigma","estimate"]), 251 | p.theta = est[est$parameter == "sigma","p.value"] ) 252 | }else{ 253 | tmp_est[ii,] <- data.frame(beta = NA, std.err = NA, pvalue =NA,theta = NA, p.theta = NA ) 254 | } 255 | 256 | } 257 | colnames(tmp_est) <- c("beta","std.err","pvalue","theta","p.theta") 258 | 259 | ## calculate distance with respect to transcript(gene) strand 260 | distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){ 261 | as.integer(geno.vcf@fix[,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) 262 | }else{ 263 | round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[,"POS"]) 264 | } 265 | report <- cbind(data.frame( 266 | SNP = paste(geno.vcf@fix[,"CHROM"],geno.vcf@fix[,"POS"],sep = ":"), 267 | SNPID = rownames(tmp_est), 268 | REF = geno.vcf@fix[,"REF"], 269 | ALT = geno.vcf@fix[,"ALT"], 270 | PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ), 271 | DISTANCE = distance 272 | ),tmp_est) 273 | report[!is.na(report$beta),] 274 | } 275 | 276 | } 277 | rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals) 278 | endTime <- Sys.time() 279 | cat(paste("Time used to test association: ",round(difftime(endTime, startTime, units = "mins"),digits = 1)," mins. \n")) 280 | cat(paste0(nrow(testResult)," SNP-peak pair tested.\n")) 281 | return(testResult) 282 | } 283 | 284 | 285 | ## Helper function to convert genotype into dosage. 286 | .genoDosage <- function(x){ 287 | return( stringr::str_count(x,"1") ) 288 | } 289 | 290 | -------------------------------------------------------------------------------- /R/QTL_betaBinomial.neg.R: -------------------------------------------------------------------------------- 1 | #' @title swapChr22.QTL_BetaBin 2 | #' @param MeRIPdata The MeRIP.Peak object 3 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!! 4 | #' @param BSgenome The BSgenome object. This needs to match the genome version of the gtf files. 5 | #' @param testWindow Integer. Test SNPs in bp window flanking the peak. 6 | #' @param Chromosome The chromsome to run QTL test. 7 | #' @param Range The position range on a chromosome to test. 8 | #' @param Covariates The matrix for covariates to be included in the test. 9 | #' @import stringr 10 | #' @import vcfR 11 | #' @import BSgenome 12 | #' @import gamlss 13 | #' @import gamlss.dist 14 | #' @import broom 15 | #' @export 16 | swapChr22.QTL_BetaBin <- function( MeRIPdata , vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19,testWindow = 100000, Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE, AdjIPeffi = TRUE , PCsToInclude = 0 , thread = 1 ){ 17 | 18 | ##check input 19 | if( !is(MeRIPdata, "MeRIP.Peak") ){ 20 | stop("The input MeRIPdata needs to be an MeRIP.Peak object!") 21 | }else if( !nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_ip) & nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_input) ){ 22 | stop("The peak counts matrix dimension must match the dimension of jointPeaks!") 23 | } 24 | 25 | ## check samples in genotype files and in MeRIP.Peak object 26 | tmpVcf <- tempfile(fileext = ".vcf.gz") 27 | system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}'| gzip > ",tmpVcf)) 28 | tmp.vcf <-try( read.vcfR( file =tmpVcf, verbose = F ) , silent = T) 29 | ############################################################################################################################################################ 30 | ### This is to handle a wired error in the read.vcfR function. ############################################################################################# 31 | if(class(tmp.vcf) == "try-error"){ ## 32 | system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0}'| gzip >",tmpVcf)) ## 33 | tmp.vcf <-read.vcfR( file =tmpVcf, verbose = F ) ## 34 | } ## 35 | ############################################################################################################################################################ 36 | unlink(tmpVcf) # remove the temp file to free space. 37 | genotypeSamples <- colnames(tmp.vcf@gt)[-c(1)] 38 | if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) == 0 ){ 39 | stop("The samplenames must match in VCF file and in MeRIP.Peak object! We found no overlap between sample names in these two files!") 40 | }else if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) != length(samplenames(MeRIPdata) )){ 41 | cat("The samples in the VCF don't totally match samples in the MeRIP.Peak object; ") 42 | cat("Only samples in MeRIP.Peak object overlapping samples in VCF file will be analyzed in QTL mapping!\nSubsetting samples...\n") 43 | MeRIPdata <- MeRIPtools::select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata)) ) 44 | cat(paste0(paste(intersect(genotypeSamples,samplenames(MeRIPdata)),collapse = " "), "\n(",length(intersect(genotypeSamples,samplenames(MeRIPdata))),") samples will be analyzed!")) 45 | }else{ 46 | ## make sure the order of samples aligned between phenotype and genotype 47 | MeRIPdata <- select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata)) ) 48 | } 49 | 50 | 51 | ### Preprocess 52 | ##fitler out peaks with zero count 53 | MeRIPdata <- filter(MeRIPdata, !apply(extractInput(MeRIPdata), 1, function(x) any(x == 0 )) ) 54 | cat("Peaks with zero read count in input data have been removed.\n") 55 | 56 | T0 <- colSums(counts(MeRIPdata)[,1:length(MeRIPdata@samplenames)] ) 57 | T1 <- colSums(counts(MeRIPdata)[,(length(MeRIPdata@samplenames)+1) : (2*length(MeRIPdata@samplenames)) ] ) 58 | 59 | ##filter out peaks with OR < 1 60 | enrichFlag <- apply( t( t(extractIP(MeRIPdata))/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ),1,function(x){sum(x>1)> MeRIPdata@jointPeak_threshold}) 61 | MeRIPdata <- filter(MeRIPdata, enrichFlag ) 62 | cat(paste0("Peaks with odd ratio < 1 in more than ",MeRIPdata@jointPeak_threshold," samples have been removed.\n",nrow(jointPeak(MeRIPdata))," peaks remaining for QTL mapping.\n")) 63 | 64 | ## estimate IP efficiency 65 | OR <- t( apply(extractIP(MeRIPdata),1,.noZero)/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ) 66 | colnames(OR) <- MeRIPdata@samplenames 67 | logOR <- log(OR) 68 | 69 | logOR.id <- which( rowMeans(logOR) < quantile( rowMeans(logOR), 0.95 ) & rowMeans(logOR) > quantile( rowMeans(logOR), 0.05) )# remove two tails 70 | K_IPe_ij <- apply(logOR[logOR.id,], 2, function(x){ 71 | 72 | fit <- lm(y~m, data = data.frame(y = x, m=rowMeans(logOR)[logOR.id] )) 73 | y.est <- predict(fit, newdata = data.frame(m = rowMeans(logOR))) 74 | return( y.est - rowMeans(logOR) ) 75 | }) 76 | 77 | if(AdjustGC){ 78 | cat("Computing GC content for peaks\n") 79 | ## GC content correction 80 | peak.gr <- .peakToGRangesList( jointPeak(MeRIPdata)) 81 | cat("...") 82 | registerDoParallel( thread ) 83 | peakGC <- foreach( i = 1:length(peak.gr), .combine = c)%dopar%{ 84 | peakSeq <- paste( getSeq( BSgenome , peak.gr[[i]] ,as.character =T ) , collapse = "") 85 | sum( str_count(peakSeq, c("G","g","C","c")) )/nchar(peakSeq) 86 | } 87 | cat("...") 88 | peakGC_l <- round(peakGC,digits = 2) 89 | peakGC_l[which(peakGC_l<0.2)] <-median(peakGC_l[which(peakGC_l<0.2)] ) # combine some bins at low GC due to low number of peaks 90 | peakGC_l[which(peakGC_l>0.84)] <-median(peakGC_l[which(peakGC_l>0.84)] ) # combine some bins at high GC due to low number of peaks 91 | l <- sort(unique(peakGC_l)) 92 | if(AdjIPeffi){y <- (log( OR ) - K_IPe_ij)}else{y <- log(OR) } 93 | colnames(y) <- MeRIPdata@samplenames 94 | b.l <- tapply(rowMeans( y ), peakGC_l , median) 95 | bil <- apply( y, 2, tapply, peakGC_l, median ) 96 | bi. <- apply( y , 2, median ) 97 | b.. <- median( y ) 98 | Fil <- as.data.frame( as.matrix(bil) - as.vector(b.l) ) - ( bi. - b.. ) 99 | Fij <- foreach( ii = 1:length(MeRIPdata@samplenames), .combine = cbind)%dopar%{ 100 | GC_fit <- lm(Fil[,ii] ~ poly(l,4) ) 101 | predict(GC_fit, newdata = data.frame(l = peakGC) ) 102 | } 103 | colnames(Fij) <- MeRIPdata@samplenames 104 | cat("...\n") 105 | } 106 | ##Principal components 107 | if(PCsToInclude > 0 & PCsToInclude <= length(MeRIPdata@samplenames) ){ 108 | cat("Computing Principal components.\n") 109 | if(AdjustGC & AdjIPeffi){ 110 | PCs <- prcomp(t( (log(OR ) - K_IPe_ij - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ) )$x 111 | }else if( AdjustGC & !AdjIPeffi){ 112 | PCs <- prcomp(t( (log(OR ) - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x 113 | }else if(AdjIPeffi){ 114 | PCs <- prcomp(t( (log(OR ) - K_IPe_ij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x 115 | }else{ 116 | PCs <- prcomp( t( log(OR)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x 117 | } 118 | }else if(PCsToInclude > length(MeRIPdata@samplenames) ){ 119 | stop("The number of PCs needs to be no larger than the sample size!") 120 | } 121 | 122 | MeRIPdata@jointPeaks$chr <- gsub("chr22","chr222",MeRIPdata@jointPeaks$chr ) 123 | MeRIPdata@jointPeaks$chr <- gsub("chr21","chr22",MeRIPdata@jointPeaks$chr ) 124 | ## set ranges on the chromosome that can be tested 125 | con <- pipe(paste0("zcat ",vcf_file," | awk '!/^#/ {print $2}' | tail -n1")) 126 | vcfRange <- c( read.table(gzfile(vcf_file), nrows = 1)[,2] , 127 | scan( con , quiet = T )) 128 | close(con) 129 | ## update test Range if necessary 130 | if(!is.null(Range)){ 131 | vcfRange <- intersect(IRanges(vcfRange[1],vcfRange[2]),IRanges(Range[1],Range[2]) ) 132 | }else{ 133 | vcfRange <- IRanges(vcfRange[1],vcfRange[2]) 134 | } 135 | 136 | ## parse bed12 file 137 | peak_bed <- jointPeak(MeRIPdata) 138 | peak_bed.gr <- makeGRangesFromDataFrame(peak_bed, keep.extra.columns = T) 139 | test.id <- which(peak_bed$chr == Chromosome & (( peak_bed$end+testWindow ) > start(vcfRange) ) & (( peak_bed$start-testWindow ) < end(vcfRange) ) ) 140 | 141 | peak_bed.gr <- peak_bed.gr[test.id ] 142 | Y1 <- extractIP(MeRIPdata)[test.id,] 143 | Y0 <- extractInput(MeRIPdata)[test.id,] 144 | if(AdjustGC){FIj <- Fij[test.id,] } 145 | 146 | ## ditermine study design according to parameters 147 | variables <- "offset(log(T1/T0))" 148 | if( AdjIPeffi ){ variables <- paste(variables, "offset(K_IPe_ij)", sep = " +") } 149 | if( AdjustGC ){ variables <- paste(variables, "offset(Fj)", sep = " +") } 150 | if(! is.null(Covariates) ){ 151 | colnames(Covariates) 152 | variables <- paste(variables, paste(colnames(Covariates),collapse = " + "), sep = "+") 153 | } 154 | if( PCsToInclude > 0 ){ 155 | variables <- paste(variables, paste("PC", 1:PCsToInclude, sep = "",collapse = " + "), sep = "+") 156 | } 157 | design <- formula( paste0("cbind(Y1i , Y0i) ~" ," G + ", variables) ) 158 | 159 | cat(paste0("Start beta-binomial regression for ",length(peak_bed.gr)," peaks and SNPs in ",round(testWindow/1000,digits = 1),"kb flanking each peaks on chromosome ",Chromosome,".\n")) 160 | if(AdjustGC){cat("Will correct sample specific GC bias\n")} 161 | ## test each peak 162 | startTime <- Sys.time() 163 | registerDoParallel(thread) 164 | testResult <- foreach( i = 1:length(peak_bed.gr), .combine = rbind )%dopar% { 165 | 166 | ## get the range where SNPs are available 167 | testRange <- GenomicRanges::intersect(IRanges(start(peak_bed.gr[i])-testWindow, end(peak_bed.gr[i])+testWindow ),vcfRange) 168 | 169 | ## Test association if there is SNP available for this peak 170 | if(length(testRange)==1){ 171 | 172 | ## Use unix command line to accesss the genotype from vcf file. This is for fast data access. 173 | tmpVcf <- tempfile(fileext = ".vcf.gz") 174 | system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}(!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ",tmpVcf)) 175 | geno.vcf <-try( read.vcfR( file =tmpVcf, verbose = F ) , silent = T) 176 | #################################################################### 177 | ### This is to handle a wired error in the read.vcfR function. 178 | if(class(geno.vcf) == "try-error"){ 179 | system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0} (!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip >",tmpVcf)) 180 | geno.vcf <-read.vcfR( file =tmpVcf, verbose = F ) 181 | } 182 | #################################################################### 183 | unlink(tmpVcf) # remove the temp file to free space. 184 | 185 | ## filter biallelic snps 186 | geno.vcf <- geno.vcf[is.biallelic(geno.vcf),] 187 | 188 | ## get genotype as Dosage 189 | tmp_geno <- extract.gt(geno.vcf, element = 'GT' ) 190 | geno <- t( apply( tmp_geno ,1, .genoDosage ) ) 191 | colnames(geno) <- colnames(tmp_geno) 192 | ## filter out any genotype that has M(major)AF<0.05 193 | MAF <- apply(geno,1,function(x) !any(table(x)>0.95*ncol(geno)) ) 194 | geno <- geno[MAF,] 195 | geno.vcf <- geno.vcf[MAF,] 196 | 197 | if(AdjustGC){Fj <- FIj[i,]} 198 | K_IPe_j <- K_IPe_ij[i,] 199 | 200 | tmp_est <- as.data.frame(matrix(nrow = nrow(geno),ncol = 4),row.names = rownames(geno) ) 201 | for( ii in 1:nrow(geno) ){ 202 | if(AdjustGC){fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j, Fj , G = geno[ii,])}else{fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j , G = geno[ii,])} 203 | ## add PCs to data 204 | if(PCsToInclude > 1 ){ fit_data <- cbind(fit_data,data.frame(PCs[,1:PCsToInclude]) )}else if(PCsToInclude == 1){fit_data <- cbind(fit_data,data.frame(PC1 = PCs[,1]) ) } 205 | ## add covariates 206 | if(! is.null(Covariates) ){fit_data <- cbind(fit_data,as.data.frame(Covariates) ) } 207 | 208 | fit <- try( gamlss( design ,data = fit_data , family = BB(mu.link = "logit")) ) 209 | if(class(fit)[1]!= "try-error"){ 210 | est <- tidy(fit) 211 | tmp_est[ii,] <- data.frame(beta = est[est$term == "G","estimate"], 212 | theta = 1/exp(est[est$parameter == "sigma","estimate"]), 213 | pvalue = est[est$term == "G","p.value"], 214 | p.theta = est[est$parameter == "sigma","p.value"] ) 215 | }else{ 216 | tmp_est[ii,] <- data.frame(beta = NA, theta = NA, pvalue =NA, p.theta = NA ) 217 | } 218 | 219 | } 220 | colnames(tmp_est) <- c("beta","theta","pvalue","p.theta") 221 | 222 | ## calculate distance with respect to transcript(gene) strand 223 | distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){ 224 | as.integer(geno.vcf@fix[,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) 225 | }else{ 226 | round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[,"POS"]) 227 | } 228 | report <- cbind(data.frame( 229 | SNP = paste(geno.vcf@fix[,"CHROM"],geno.vcf@fix[,"POS"],sep = ":"), 230 | SNPID = rownames(tmp_est), 231 | REF = geno.vcf@fix[,"REF"], 232 | ALT = geno.vcf@fix[,"ALT"], 233 | PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ), 234 | DISTANCE = distance 235 | ),tmp_est) 236 | report[!is.na(report$beta),] 237 | } 238 | 239 | } 240 | rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals) 241 | endTime <- Sys.time() 242 | cat(paste("Time used to test association: ",round(difftime(endTime, startTime, units = "mins"),digits = 1)," mins. \n")) 243 | 244 | return(testResult) 245 | } 246 | 247 | 248 | #' @title swapChr1.QTL_BetaBin 249 | #' @description Take the phenotype on chr2 and test their association with SNPs on chr1 250 | #' @param MeRIPdata The MeRIP.Peak object 251 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!! 252 | #' @param BSgenome The BSgenome object. This needs to match the genome version of the gtf files. 253 | #' @param testWindow Integer. Test SNPs in bp window flanking the peak. 254 | #' @param Chromosome The chromsome to run QTL test. 255 | #' @param Range The position range on a chromosome to test. 256 | #' @param Covariates The matrix for covariates to be included in the test. 257 | #' @import stringr 258 | #' @import vcfR 259 | #' @import BSgenome 260 | #' @import gamlss 261 | #' @import gamlss.dist 262 | #' @import broom 263 | #' @export 264 | swapChr1.QTL_BetaBin <- function( MeRIPdata , vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19,testWindow = 100000, Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE, AdjIPeffi = TRUE , PCsToInclude = 0 , thread = 1 ){ 265 | 266 | ##check input 267 | if( !is(MeRIPdata, "MeRIP.Peak") ){ 268 | stop("The input MeRIPdata needs to be an MeRIP.Peak object!") 269 | }else if( !nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_ip) & nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_input) ){ 270 | stop("The peak counts matrix dimension must match the dimension of jointPeaks!") 271 | } 272 | 273 | ## check samples in genotype files and in MeRIP.Peak object 274 | tmpVcf <- tempfile(fileext = ".vcf.gz") 275 | system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}'| gzip > ",tmpVcf)) 276 | tmp.vcf <-try( read.vcfR( file =tmpVcf, verbose = F ) , silent = T) 277 | ############################################################################################################################################################ 278 | ### This is to handle a wired error in the read.vcfR function. ############################################################################################# 279 | if(class(tmp.vcf) == "try-error"){ ## 280 | system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0}'| gzip >",tmpVcf)) ## 281 | tmp.vcf <-read.vcfR( file =tmpVcf, verbose = F ) ## 282 | } ## 283 | ############################################################################################################################################################ 284 | unlink(tmpVcf) # remove the temp file to free space. 285 | genotypeSamples <- colnames(tmp.vcf@gt)[-c(1)] 286 | if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) == 0 ){ 287 | stop("The samplenames must match in VCF file and in MeRIP.Peak object! We found no overlap between sample names in these two files!") 288 | }else if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) != length(samplenames(MeRIPdata) )){ 289 | cat("The samples in the VCF don't totally match samples in the MeRIP.Peak object; ") 290 | cat("Only samples in MeRIP.Peak object overlapping samples in VCF file will be analyzed in QTL mapping!\nSubsetting samples...\n") 291 | MeRIPdata <- MeRIPtools::select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata)) ) 292 | cat(paste0(paste(intersect(genotypeSamples,samplenames(MeRIPdata)),collapse = " "), "\n(",length(intersect(genotypeSamples,samplenames(MeRIPdata))),") samples will be analyzed!")) 293 | }else{ 294 | ## make sure the order of samples aligned between phenotype and genotype 295 | MeRIPdata <- select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata)) ) 296 | } 297 | 298 | 299 | ### Preprocess 300 | ##fitler out peaks with zero count 301 | MeRIPdata <- filter(MeRIPdata, !apply(extractInput(MeRIPdata), 1, function(x) any(x == 0 )) ) 302 | cat("Peaks with zero read count in input data have been removed.\n") 303 | 304 | T0 <- colSums(counts(MeRIPdata)[,1:length(MeRIPdata@samplenames)] ) 305 | T1 <- colSums(counts(MeRIPdata)[,(length(MeRIPdata@samplenames)+1) : (2*length(MeRIPdata@samplenames)) ] ) 306 | 307 | ##filter out peaks with OR < 1 308 | enrichFlag <- apply( t( t(extractIP(MeRIPdata))/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ),1,function(x){sum(x>1)> MeRIPdata@jointPeak_threshold}) 309 | MeRIPdata <- filter(MeRIPdata, enrichFlag ) 310 | cat(paste0("Peaks with odd ratio < 1 in more than ",MeRIPdata@jointPeak_threshold," samples have been removed.\n",nrow(jointPeak(MeRIPdata))," peaks remaining for QTL mapping.\n")) 311 | 312 | ## estimate IP efficiency 313 | OR <- t( apply(extractIP(MeRIPdata),1,.noZero)/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ) 314 | colnames(OR) <- MeRIPdata@samplenames 315 | logOR <- log(OR) 316 | 317 | logOR.id <- which( rowMeans(logOR) < quantile( rowMeans(logOR), 0.95 ) & rowMeans(logOR) > quantile( rowMeans(logOR), 0.05) )# remove two tails 318 | K_IPe_ij <- apply(logOR[logOR.id,], 2, function(x){ 319 | 320 | fit <- lm(y~m, data = data.frame(y = x, m=rowMeans(logOR)[logOR.id] )) 321 | y.est <- predict(fit, newdata = data.frame(m = rowMeans(logOR))) 322 | return( y.est - rowMeans(logOR) ) 323 | }) 324 | 325 | if(AdjustGC){ 326 | cat("Computing GC content for peaks\n") 327 | ## GC content correction 328 | peak.gr <- .peakToGRangesList( jointPeak(MeRIPdata)) 329 | cat("...") 330 | registerDoParallel( thread ) 331 | peakGC <- foreach( i = 1:length(peak.gr), .combine = c)%dopar%{ 332 | peakSeq <- paste( getSeq( BSgenome , peak.gr[[i]] ,as.character =T ) , collapse = "") 333 | sum( str_count(peakSeq, c("G","g","C","c")) )/nchar(peakSeq) 334 | } 335 | cat("...") 336 | peakGC_l <- round(peakGC,digits = 2) 337 | peakGC_l[which(peakGC_l<0.2)] <-median(peakGC_l[which(peakGC_l<0.2)] ) # combine some bins at low GC due to low number of peaks 338 | peakGC_l[which(peakGC_l>0.84)] <-median(peakGC_l[which(peakGC_l>0.84)] ) # combine some bins at high GC due to low number of peaks 339 | l <- sort(unique(peakGC_l)) 340 | if(AdjIPeffi){y <- (log( OR ) - K_IPe_ij)}else{y <- log(OR) } 341 | colnames(y) <- MeRIPdata@samplenames 342 | b.l <- tapply(rowMeans( y ), peakGC_l , median) 343 | bil <- apply( y, 2, tapply, peakGC_l, median ) 344 | bi. <- apply( y , 2, median ) 345 | b.. <- median( y ) 346 | Fil <- as.data.frame( as.matrix(bil) - as.vector(b.l) ) - ( bi. - b.. ) 347 | Fij <- foreach( ii = 1:length(MeRIPdata@samplenames), .combine = cbind)%dopar%{ 348 | GC_fit <- lm(Fil[,ii] ~ poly(l,4) ) 349 | predict(GC_fit, newdata = data.frame(l = peakGC) ) 350 | } 351 | colnames(Fij) <- MeRIPdata@samplenames 352 | cat("...\n") 353 | } 354 | ##Principal components 355 | if(PCsToInclude > 0 & PCsToInclude <= length(MeRIPdata@samplenames) ){ 356 | cat("Computing Principal components.\n") 357 | if(AdjustGC & AdjIPeffi){ 358 | PCs <- prcomp(t( (log(OR ) - K_IPe_ij - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ) )$x 359 | }else if( AdjustGC & !AdjIPeffi){ 360 | PCs <- prcomp(t( (log(OR ) - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x 361 | }else if(AdjIPeffi){ 362 | PCs <- prcomp(t( (log(OR ) - K_IPe_ij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x 363 | }else{ 364 | PCs <- prcomp( t( log(OR)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x 365 | } 366 | }else if(PCsToInclude > length(MeRIPdata@samplenames) ){ 367 | stop("The number of PCs needs to be no larger than the sample size!") 368 | } 369 | 370 | MeRIPdata@jointPeaks$chr <- gsub("chr1","chr111",MeRIPdata@jointPeaks$chr ) 371 | MeRIPdata@jointPeaks$chr <- gsub("chr2","chr1",MeRIPdata@jointPeaks$chr ) 372 | ## set ranges on the chromosome that can be tested 373 | con <- pipe(paste0("zcat ",vcf_file," | awk '!/^#/ {print $2}' | tail -n1")) 374 | vcfRange <- c( read.table(gzfile(vcf_file), nrows = 1)[,2] , 375 | scan( con , quiet = T )) 376 | close(con) 377 | ## update test Range if necessary 378 | if(!is.null(Range)){ 379 | vcfRange <- intersect(IRanges(vcfRange[1],vcfRange[2]),IRanges(Range[1],Range[2]) ) 380 | }else{ 381 | vcfRange <- IRanges(vcfRange[1],vcfRange[2]) 382 | } 383 | 384 | ## parse bed12 file 385 | peak_bed <- jointPeak(MeRIPdata) 386 | peak_bed.gr <- makeGRangesFromDataFrame(peak_bed, keep.extra.columns = T) 387 | test.id <- which(peak_bed$chr == Chromosome & (( peak_bed$end+testWindow ) > start(vcfRange) ) & (( peak_bed$start-testWindow ) < end(vcfRange) ) ) 388 | 389 | peak_bed.gr <- peak_bed.gr[test.id ] 390 | Y1 <- extractIP(MeRIPdata)[test.id,] 391 | Y0 <- extractInput(MeRIPdata)[test.id,] 392 | if(AdjustGC){FIj <- Fij[test.id,] } 393 | 394 | ## ditermine study design according to parameters 395 | variables <- "offset(log(T1/T0))" 396 | if( AdjIPeffi ){ variables <- paste(variables, "offset(K_IPe_j)", sep = " +") } 397 | if( AdjustGC ){ variables <- paste(variables, "offset(Fj)", sep = " +") } 398 | if(! is.null(Covariates) ){ 399 | colnames(Covariates) 400 | variables <- paste(variables, paste(colnames(Covariates),collapse = " + "), sep = "+") 401 | } 402 | if( PCsToInclude > 0 ){ 403 | variables <- paste(variables, paste("PC", 1:PCsToInclude, sep = "",collapse = " + "), sep = "+") 404 | } 405 | design <- formula( paste0("cbind(Y1i , Y0i) ~" ," G + ", variables) ) 406 | 407 | cat(paste0("Start beta-binomial regression for ",length(peak_bed.gr)," peaks and SNPs in ",round(testWindow/1000,digits = 1),"kb flanking each peaks on chromosome ",Chromosome,".\n")) 408 | if(AdjustGC){cat("Will correct sample specific GC bias\n")} 409 | ## test each peak 410 | startTime <- Sys.time() 411 | registerDoParallel(thread) 412 | testResult <- foreach( i = 1:length(peak_bed.gr), .combine = rbind )%dopar% { 413 | 414 | ## get the range where SNPs are available 415 | testRange <- GenomicRanges::intersect(IRanges(start(peak_bed.gr[i])-testWindow, end(peak_bed.gr[i])+testWindow ),vcfRange) 416 | 417 | ## Test association if there is SNP available for this peak 418 | if(length(testRange)==1){ 419 | 420 | ## Use unix command line to accesss the genotype from vcf file. This is for fast data access. 421 | tmpVcf <- tempfile(fileext = ".vcf.gz") 422 | system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}(!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ",tmpVcf)) 423 | geno.vcf <-try( read.vcfR( file =tmpVcf, verbose = F ) , silent = T) 424 | #################################################################### 425 | ### This is to handle a wired error in the read.vcfR function. 426 | if(class(geno.vcf) == "try-error"){ 427 | system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0} (!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip >",tmpVcf)) 428 | geno.vcf <-read.vcfR( file =tmpVcf, verbose = F ) 429 | } 430 | #################################################################### 431 | unlink(tmpVcf) # remove the temp file to free space. 432 | 433 | ## filter biallelic snps 434 | geno.vcf <- geno.vcf[is.biallelic(geno.vcf),] 435 | 436 | ## get genotype as Dosage 437 | tmp_geno <- extract.gt(geno.vcf, element = 'GT' ) 438 | geno <- t( apply( tmp_geno ,1, .genoDosage ) ) 439 | colnames(geno) <- colnames(tmp_geno) 440 | ## filter out any genotype that has M(major)AF<0.05 441 | MAF <- apply(geno,1,function(x) !any(table(x)>0.95*ncol(geno)) ) 442 | geno <- geno[MAF,] 443 | geno.vcf <- geno.vcf[MAF,] 444 | 445 | if(AdjustGC){Fj <- FIj[i,]} 446 | K_IPe_j <- K_IPe_ij[i,] 447 | 448 | tmp_est <- as.data.frame(matrix(nrow = nrow(geno),ncol = 4),row.names = rownames(geno) ) 449 | for( ii in 1:nrow(geno) ){ 450 | if(AdjustGC){fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j, Fj , G = geno[ii,])}else{fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j , G = geno[ii,])} 451 | ## add PCs to data 452 | if(PCsToInclude > 1 ){ fit_data <- cbind(fit_data,data.frame(PCs[,1:PCsToInclude]) )}else if(PCsToInclude == 1){fit_data <- cbind(fit_data,data.frame(PC1 = PCs[,1]) ) } 453 | ## add covariates 454 | if(! is.null(Covariates) ){fit_data <- cbind(fit_data,as.data.frame(Covariates) ) } 455 | 456 | fit <- try( gamlss( design ,data = fit_data , family = BB(mu.link = "logit")) ) 457 | if(class(fit)[1]!= "try-error"){ 458 | est <- tidy(fit) 459 | tmp_est[ii,] <- data.frame(beta = est[est$term == "G","estimate"], 460 | theta = 1/exp(est[est$parameter == "sigma","estimate"]), 461 | pvalue = est[est$term == "G","p.value"], 462 | p.theta = est[est$parameter == "sigma","p.value"] ) 463 | }else{ 464 | tmp_est[ii,] <- data.frame(beta = NA, theta = NA, pvalue =NA, p.theta = NA ) 465 | } 466 | 467 | } 468 | colnames(tmp_est) <- c("beta","theta","pvalue","p.theta") 469 | 470 | ## calculate distance with respect to transcript(gene) strand 471 | distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){ 472 | as.integer(geno.vcf@fix[,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) 473 | }else{ 474 | round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[,"POS"]) 475 | } 476 | report <- cbind(data.frame( 477 | SNP = paste(geno.vcf@fix[,"CHROM"],geno.vcf@fix[,"POS"],sep = ":"), 478 | SNPID = rownames(tmp_est), 479 | REF = geno.vcf@fix[,"REF"], 480 | ALT = geno.vcf@fix[,"ALT"], 481 | PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ), 482 | DISTANCE = distance 483 | ),tmp_est) 484 | report[!is.na(report$beta),] 485 | } 486 | 487 | } 488 | rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals) 489 | endTime <- Sys.time() 490 | cat(paste("Time used to test association: ",round(difftime(endTime, startTime, units = "mins"),digits = 1)," mins. \n")) 491 | 492 | return(testResult) 493 | } 494 | 495 | 496 | ## Helper function to convert genotype into dosage. 497 | .genoDosage <- function(x){ 498 | return( stringr::str_count(x,"1") ) 499 | } 500 | 501 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | gradient_beta_psi_mu2 <- function(Y, X, beta, psi, mu2, n) { 5 | .Call('_MeRIPtools_gradient_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, n) 6 | } 7 | 8 | log_factorial <- function(Y) { 9 | .Call('_MeRIPtools_log_factorial', PACKAGE = 'MeRIPtools', Y) 10 | } 11 | 12 | log_factorial_calculated <- function(N) { 13 | .Call('_MeRIPtools_log_factorial_calculated', PACKAGE = 'MeRIPtools', N) 14 | } 15 | 16 | LogLikelihood_beta_psi_mu2 <- function(Y, X, beta, psi, mu2, n, sum_log_factorial_Y) { 17 | .Call('_MeRIPtools_LogLikelihood_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, n, sum_log_factorial_Y) 18 | } 19 | 20 | select_stepsize_for_beta <- function(Y, X, gra_beta, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down) { 21 | .Call('_MeRIPtools_select_stepsize_for_beta', PACKAGE = 'MeRIPtools', Y, X, gra_beta, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down) 22 | } 23 | 24 | select_stepsize_for_psi <- function(Y, X, gra_psi, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down, psi_cutoff) { 25 | .Call('_MeRIPtools_select_stepsize_for_psi', PACKAGE = 'MeRIPtools', Y, X, gra_psi, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down, psi_cutoff) 26 | } 27 | 28 | select_stepsize_for_mu2 <- function(Y, X, gra_mu2, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down) { 29 | .Call('_MeRIPtools_select_stepsize_for_mu2', PACKAGE = 'MeRIPtools', Y, X, gra_mu2, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down) 30 | } 31 | 32 | gradient_descent_beta_psi_mu2 <- function(Y, X, beta, psi, mu2, gamma, steps, sum_log_factorial_Y, down, psi_cutoff) { 33 | .Call('_MeRIPtools_gradient_descent_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, sum_log_factorial_Y, down, psi_cutoff) 34 | } 35 | 36 | gradient_descent_alt <- function(Y, X, psi, mu2, gamma, steps, sum_log_factorial_Y, down, psi_cutoff) { 37 | .Call('_MeRIPtools_gradient_descent_alt', PACKAGE = 'MeRIPtools', Y, X, psi, mu2, gamma, steps, sum_log_factorial_Y, down, psi_cutoff) 38 | } 39 | 40 | PoissionGamma <- function(Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff) { 41 | .Call('_MeRIPtools_PoissionGamma', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff) 42 | } 43 | 44 | Fisher_information_one_beta <- function(Y, X, beta, psi, mu2, n) { 45 | .Call('_MeRIPtools_Fisher_information_one_beta', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, n) 46 | } 47 | 48 | PoissionGamma_FISHER <- function(Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff) { 49 | .Call('_MeRIPtools_PoissionGamma_FISHER', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff) 50 | } 51 | 52 | gradient_and_LogLikelihood_for_individual_sample <- function(Y, W, V, WY, WWY, W3Y, W4Y, VY, VVY, V3Y, V4Y, WW, W3, W4, VV, V3, V4, a0, a1, a2, a3, a4, b1, b2, b3, b4, n, sum_log_factorial_Y) { 53 | .Call('_MeRIPtools_gradient_and_LogLikelihood_for_individual_sample', PACKAGE = 'MeRIPtools', Y, W, V, WY, WWY, W3Y, W4Y, VY, VVY, V3Y, V4Y, WW, W3, W4, VV, V3, V4, a0, a1, a2, a3, a4, b1, b2, b3, b4, n, sum_log_factorial_Y) 54 | } 55 | 56 | LogLikelihood_for_individual_sample <- function(Y, W, V, WW, VV, W3, V3, W4, V4, a0, a1, a2, a3, a4, b1, b2, b3, b4, n, sum_log_factorial_Y) { 57 | .Call('_MeRIPtools_LogLikelihood_for_individual_sample', PACKAGE = 'MeRIPtools', Y, W, V, WW, VV, W3, V3, W4, V4, a0, a1, a2, a3, a4, b1, b2, b3, b4, n, sum_log_factorial_Y) 58 | } 59 | 60 | select_stepsize_for_a_parameter <- function(Y, W, V, WW, VV, W3, V3, W4, V4, ll, sum_log_factorial_Y, gradient, parameters, ind, gamma, n, down) { 61 | .Call('_MeRIPtools_select_stepsize_for_a_parameter', PACKAGE = 'MeRIPtools', Y, W, V, WW, VV, W3, V3, W4, V4, ll, sum_log_factorial_Y, gradient, parameters, ind, gamma, n, down) 62 | } 63 | 64 | gradient_descent_for_individual_sample <- function(Y, W, V, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down) { 65 | .Call('_MeRIPtools_gradient_descent_for_individual_sample', PACKAGE = 'MeRIPtools', Y, W, V, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down) 66 | } 67 | 68 | coordinate_descent_for_individual_sample <- function(Y, W, V, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down) { 69 | .Call('_MeRIPtools_coordinate_descent_for_individual_sample', PACKAGE = 'MeRIPtools', Y, W, V, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down) 70 | } 71 | 72 | gradient_multiple_beta_psi_mu2 <- function(Y, X, Xbeta, psi, mu2, n, k) { 73 | .Call('_MeRIPtools_gradient_multiple_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, Xbeta, psi, mu2, n, k) 74 | } 75 | 76 | LogLikelihood_multiple_beta_psi_mu2 <- function(Y, Xbeta, psi, mu2, n, sum_log_factorial_Y) { 77 | .Call('_MeRIPtools_LogLikelihood_multiple_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, Xbeta, psi, mu2, n, sum_log_factorial_Y) 78 | } 79 | 80 | select_stepsize_for_multiple_beta <- function(Y, X, betas, Xbeta, ind, m_gra_beta, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down) { 81 | .Call('_MeRIPtools_select_stepsize_for_multiple_beta', PACKAGE = 'MeRIPtools', Y, X, betas, Xbeta, ind, m_gra_beta, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down) 82 | } 83 | 84 | select_stepsize_for_psi_with_multiple_beta <- function(Y, Xbeta, gra_psi, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down, psi_cutoff) { 85 | .Call('_MeRIPtools_select_stepsize_for_psi_with_multiple_beta', PACKAGE = 'MeRIPtools', Y, Xbeta, gra_psi, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down, psi_cutoff) 86 | } 87 | 88 | select_stepsize_for_mu2_with_multiple_beta <- function(Y, Xbeta, gra_mu2, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down) { 89 | .Call('_MeRIPtools_select_stepsize_for_mu2_with_multiple_beta', PACKAGE = 'MeRIPtools', Y, Xbeta, gra_mu2, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down) 90 | } 91 | 92 | gradient_descent_multiple_beta_psi_mu2 <- function(Y, X, beta, psi, mu2, gamma, steps, sum_log_factorial_Y, n, k, down, psi_cutoff) { 93 | .Call('_MeRIPtools_gradient_descent_multiple_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, sum_log_factorial_Y, n, k, down, psi_cutoff) 94 | } 95 | 96 | Fisher_information <- function(Y, X, Xbeta, psi, mu2, n, k) { 97 | .Call('_MeRIPtools_Fisher_information', PACKAGE = 'MeRIPtools', Y, X, Xbeta, psi, mu2, n, k) 98 | } 99 | 100 | PoissionGamma_multiple_beta <- function(Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff) { 101 | .Call('_MeRIPtools_PoissionGamma_multiple_beta', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff) 102 | } 103 | 104 | -------------------------------------------------------------------------------- /R/countReads.R: -------------------------------------------------------------------------------- 1 | ## main function that takes a gtf file and bam files to count read for continuous windows 2 | ### The function requires for each sample, input and IP named for the same prefix (filename). input and IP each have postfix of input.bam/m6A.bam 3 | #### If multiple IP sample used a shared input library, sharedInput can be specified. 4 | #' @title countReads 5 | #' @description This is the very first function in MeRIP-seq data analysis that initianize a `MeRIP` object. This function takes BAM files of Input/IP library of each samples as input and use given GTF file as gene annotation to divide genes into consecutive bins of user defined size. 6 | #' @param samplenames The names of each sample (prefix for bam files). 7 | #' @param gtf The gtf format gene annotation file 8 | #' @param fragmentLength The RNA fragment length (insert size of the library). 9 | #' @param modification The modification used to name the BAM files. 10 | #' @param bamFolder Path to the folder where bam file locates 11 | #' @param binSize The size of consecutive bins to slice the transcripts 12 | #' @param threads The number of threads to use for hyperthreading 13 | #' @param strandToKeep According to library preparation protocol, choose which strand to count. Stranded RNA library usually seq the "ooposite" strand. Small RNA library seq the "same" strand. 14 | #' @param outputDir The directory to save output files 15 | #' @param saveOutput Logical option indicating whether to save output as an RDS file. 16 | #' @param paired Logical indicating whether the input bam files are from paired end sequencing. Default is FALSE. If using paired end data, the read length will be estimated from the data and only good mate are counted. 17 | #' @export 18 | countReads<-function( 19 | samplenames,# file name of samples 20 | gtf, # gtf file used for peak calling 21 | fragmentLength = 150, 22 | bamFolder, 23 | outputDir=NA, 24 | modification = "m6A", 25 | binSize = 50, 26 | strandToKeep = "opposite", 27 | paired = FALSE, 28 | threads = 1, 29 | saveOutput = FALSE 30 | ){ 31 | 32 | ##read bam files 33 | bamPath.input = paste(bamFolder,"/",samplenames,".input.bam",sep="") 34 | bamPath.IP = paste(bamFolder,"/",samplenames,".",modification,".bam",sep="") 35 | no.samples = length(samplenames) 36 | 37 | ## Check for missing files and index bam files 38 | if( !all(file.exists(bamPath.input)) ) stop( "input bam file missing!!!" ) 39 | if( !all(file.exists(bamPath.IP)) ) stop( "IP bam file missing!!!" ) 40 | num_bam_files <- length(bamPath.input) 41 | for (ibam in 1:num_bam_files) { 42 | inputfile = bamPath.input[ibam] 43 | IPfile = bamPath.IP[ibam] 44 | if (! file.exists(paste(inputfile,'.bai',sep=""))) { 45 | print(paste("Stage: index bam file", inputfile)) 46 | indexBam(inputfile) 47 | } 48 | if (! file.exists(paste(IPfile,'.bai',sep=""))) { 49 | print(paste("Stage: index bam file", IPfile)) 50 | indexBam(IPfile) 51 | } 52 | } 53 | 54 | 55 | 56 | ## This step removes ambiguous annotations and returns gene model 57 | cat("Reading gtf file to obtain gene model\nFilter out ambiguous model...\n") 58 | geneGRList = gtfToGeneModel(gtf) #get the gene model in GRList with only single chromosome and strand. 59 | cat("Gene model obtained from gtf file...\n") 60 | 61 | ## Check BAM headers and remove chr in geneModel that is not in BAM file. 62 | bamHeader <- scanBamHeader(bamPath.input, what=c("targets") ) 63 | seqLevels <- unique( unlist( lapply( bamHeader, function(x) names( x$targets) ) ) ) 64 | geneGRList <- geneGRList[ unlist( runValue( seqnames( geneGRList ) ) ) %in% seqLevels ] 65 | 66 | 67 | no.genes=length(geneGRList)## define number of genes 68 | 69 | cat("counting reads for each genes, this step may takes a few hours....\n") 70 | start_time <- Sys.time() 71 | registerDoParallel( cores = threads) 72 | cat(paste("Hyper-thread registered:",getDoParRegistered(),"\n")) 73 | cat(paste("Using",getDoParWorkers(),"thread(s) to count reads in continuous bins...\n")) 74 | reads <- foreach(i = 1:no.genes, .combine = rbind) %dopar%{ 75 | 76 | geneName = names(geneGRList)[i] 77 | geneModel =reduce( geneGRList[geneName][[1]] )## merge overlapping exons 78 | 79 | # DNA location to gene location conversion 80 | df.geneModel= as.data.frame(geneModel) ##data frame of gene model 81 | dna.range = as.data.frame(range(geneModel)) 82 | df.geneModel$end = df.geneModel$end - dna.range$start + 1 83 | df.geneModel$start = df.geneModel$start - dna.range$start + 1 84 | DNA2RNA = rep(0,dna.range$end - dna.range$start +1) 85 | no.exon = dim(df.geneModel)[1] 86 | for (j in 1:no.exon){DNA2RNA[df.geneModel$start[j]:df.geneModel$end[j]]=1} 87 | exon.length = sum(DNA2RNA) 88 | DNA2RNA=cumsum(DNA2RNA)*DNA2RNA 89 | 90 | ## skip any gene with smaller than 200bp transcript 91 | if(exon.length < 200) {return(NULL)} 92 | 93 | #creat a corresponding map from RNA to DNA 94 | #RNA2DNA = 1:exon.length 95 | #pointer = 1 96 | #for (j in 1:no.exon){ 97 | # RNA2DNA[pointer:(pointer+df.geneModel$width[j]-1) ]= RNA2DNA[pointer:(pointer+df.geneModel$width[j]-1)] + df.geneModel$start[j] -pointer 98 | # pointer = pointer + df.geneModel$width[j] 99 | #} 100 | #RNA2DNA = RNA2DNA + dna.range$start -1 #back to chromosome coordinates 101 | 102 | ## switch strand because stranded RNA library protocol sequence reverse strand 103 | if(strandToKeep == "opposite"){ 104 | reads.strand = character() 105 | if(dna.range$strand == "+"){reads.strand = "-"}else{reads.strand = "+"} ## switch strand on RNA reads for Truseq protocol 106 | }else if(strandToKeep == "same"){ 107 | reads.strand = as.character(dna.range$strand) 108 | }else{ 109 | cat("Currently m6Amonter only support strand specific RNA-seq data.\nCounting reads at opposite strand by defalt...\n") 110 | reads.strand = character() 111 | if(dna.range$strand == "+"){reads.strand = "-"}else{reads.strand = "+"} 112 | } 113 | 114 | #create start points of continuous window 115 | if(exon.length <= binSize){ 116 | slidingStart = 1 117 | }else{ 118 | ## use the 3' end terminal bin as a elastic-size bin 119 | if(dna.range$strand == "+"){ 120 | slidingStart = seq(from = 1, to = ( exon.length - binSize - exon.length %% binSize + 1) , length.out = floor(exon.length/binSize) ) 121 | }else{ # make the first bin elastic bin if a gene is on reverse strand 122 | slidingStart = c(1, seq(from = binSize + exon.length %% binSize + 1, to = ( exon.length - binSize + 1) , length.out = floor(exon.length/binSize) - 1 ) ) 123 | } 124 | } 125 | 126 | 127 | #count reads in all samples 128 | ba.IP = sapply(bamPath.IP,.countReadFromBam,which = range(geneModel),reads.strand = reads.strand,DNA2RNA = DNA2RNA,fragmentLength=fragmentLength,left=dna.range$start,sliding = slidingStart, binSize = binSize, paired = paired) 129 | ba.input = sapply(bamPath.input,.countReadFromBam,which = range(geneModel),reads.strand = reads.strand,DNA2RNA = DNA2RNA,fragmentLength=fragmentLength,left=dna.range$start,sliding = slidingStart, binSize = binSize, paired = paired) 130 | 131 | if(is.vector(ba.IP) ){# if there is only one window for this gene, make it a matrix to avoid bug 132 | ba.IP = matrix(ba.IP, nrow = 1 ) 133 | ba.input = matrix( ba.input, nrow = 1 ) 134 | } 135 | ba.counts <- cbind(ba.input,ba.IP) 136 | rownames(ba.counts) <- paste(geneName,slidingStart,sep = ",") 137 | 138 | ba.counts 139 | } 140 | rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals) 141 | end_time <- Sys.time() 142 | cat(paste("Time used to count reads:",difftime(end_time, start_time, units = "mins"),"mins... \n")) 143 | 144 | colnames(reads) <- c(paste(samplenames,"input",sep = "-"),paste(samplenames,"IP",sep = "-")) 145 | 146 | 147 | data.out <- MeRIP(reads = reads, binSize = binSize, gtf = gtf, geneModel = geneGRList, bamPath.input = bamPath.input, bamPath.ip = bamPath.IP, samplenames = samplenames) 148 | if(saveOutput){ 149 | ## create output directory 150 | dir.create(outputDir, showWarnings = FALSE, recursive = TRUE) 151 | saveRDS(data.out,paste0(outputDir,"/MeRIP_readCounts.RDS")) 152 | } 153 | 154 | 155 | return(data.out) 156 | } 157 | -------------------------------------------------------------------------------- /R/countReadsCarRNA.R: -------------------------------------------------------------------------------- 1 | #' @title countReadsCarRNA 2 | #' @description This is the very first function in MeRIP-seq data analysis that initianize a `MeRIP` object. This function takes BAM files of Input/IP library of each samples as input and use given GTF file as gene annotation to divide genes into consecutive bins of user defined size. 3 | #' @param samplenames The names of each sample (prefix for bam files). 4 | #' @param gtf The gtf format gene annotation file 5 | #' @param fragmentLength The RNA fragment length (insert size of the library). 6 | #' @param modification The modification used to name the BAM files. 7 | #' @param bamFolder Path to the folder where bam file locates 8 | #' @param binSize The size of consecutive bins to slice the transcripts 9 | #' @param nonCodingAnnotation The annotation file or files for non-coding RNA in SAF format. Can be a character or vector of path(s) to the SAF file(s). The default is NA. 10 | #' @param threads The number of threads to use for hyperthreading 11 | #' @param strandToKeep According to library preparation protocol, choose which strand to count. Stranded RNA library usually seq the "ooposite" strand. Small RNA library seq the "same" strand. 12 | #' @param outputDir The directory to save output files 13 | #' @param saveOutput Logical option indicating whether to save output as an RDS file. 14 | #' @param paired Logical indicating whether the input bam files are from paired end sequencing. Default is FALSE. If using paired end data, the read length will be estimated from the data and only good mate are counted. 15 | #' @export 16 | countReadsCarRNA<-function( 17 | samplenames,# file name of samples 18 | gtf, # gtf file used for peak calling 19 | fragmentLength = 150, 20 | bamFolder, 21 | nonCodingAnnotation = NA, 22 | outputDir=NA, 23 | modification = "m6A", 24 | binSize = 50, 25 | strandToKeep = "opposite", 26 | paired = FALSE, 27 | threads = 1, 28 | saveOutput = FALSE 29 | ){ 30 | 31 | ##read bam files 32 | bamPath.input = paste(bamFolder,"/",samplenames,".input.bam",sep="") 33 | bamPath.IP = paste(bamFolder,"/",samplenames,".",modification,".bam",sep="") 34 | no.samples = length(samplenames) 35 | 36 | ## Check for missing files and index bam files 37 | if( !all(file.exists(bamPath.input)) ) stop( "input bam file missing!!!" ) 38 | if( !all(file.exists(bamPath.IP)) ) stop( "IP bam file missing!!!" ) 39 | num_bam_files <- length(bamPath.input) 40 | for (ibam in 1:num_bam_files) { 41 | inputfile = bamPath.input[ibam] 42 | IPfile = bamPath.IP[ibam] 43 | if (! file.exists(paste(inputfile,'.bai',sep=""))) { 44 | print(paste("Stage: index bam file", inputfile)) 45 | indexBam(inputfile) 46 | } 47 | if (! file.exists(paste(IPfile,'.bai',sep=""))) { 48 | print(paste("Stage: index bam file", IPfile)) 49 | indexBam(IPfile) 50 | } 51 | } 52 | 53 | 54 | ## This step removes ambiguous annotations and returns gene model 55 | cat("Reading gtf file to obtain gene model\nFilter out ambiguous model...\n") 56 | geneGRListFromGTF = gtfToGeneModel(gtf) #get the gene model in GRList with only single chromosome and strand. 57 | cat("Gene model obtained from gtf file...\n") 58 | geneGRList.wholeGene <- range(geneGRListFromGTF) # collpase exons into one transcript 59 | 60 | ## Build model for non-coding RNAs if annotation is supplied. 61 | if( any( ! is.na( nonCodingAnnotation) ) ){ 62 | cat( "Non-coding RNA model obtained from SAF file...\n" ) 63 | if( length(nonCodingAnnotation) ==1 ){ 64 | nonCodingAnnotation.saf <- read.table(nonCodingAnnotation, sep = "\t", header = TRUE, col.names = c("GeneID", "Chr", "Start", "End","Strand") ) 65 | }else{ 66 | nonCodingAnnotation.saf <- foreach( ann = nonCodingAnnotation, .combine = rbind )%do%{ read.table( ann, sep = "\t", header = TRUE, col.names = c("GeneID", "Chr", "Start", "End","Strand") ) } 67 | } 68 | 69 | geneGRList.nonCoding <- makeGRangesListFromDataFrame(nonCodingAnnotation.saf, names.field = "GeneID") 70 | names(geneGRList.nonCoding) <- nonCodingAnnotation.saf$GeneID 71 | 72 | geneGRListCombine <- c( geneGRList.wholeGene, geneGRList.nonCoding ) 73 | }else{ 74 | geneGRListCombine <- geneGRList.wholeGene 75 | } 76 | 77 | ## Check BAM headers and remove chr in geneModel that is not in BAM file. 78 | bamHeader <- scanBamHeader(bamPath.input, what=c("targets") ) 79 | seqLevels <- unique( unlist( lapply( bamHeader, function(x) names( x$targets) ) ) ) 80 | geneGRListCombine <- geneGRListCombine[ unlist( runValue( seqnames( geneGRListCombine ) ) ) %in% seqLevels ] 81 | 82 | 83 | ## Process the gene region first. 84 | ## Count reads in transcripts. Because we are interested in carRNA, we count reads on pre-mRNA instead of mRNA. 85 | no.genes=length(geneGRListCombine)## define number of genes 86 | 87 | ## Divide annotations into batches for counting reads 88 | batchNum <- max( round( no.genes/2e4 ), 1 ) # start with at least 1 batch 89 | batchPoints <- round( seq( 1, no.genes, length.out = batchNum+1 ) ) # Stop points of each batch 90 | batches <- foreach( j = 1:(length(batchPoints) -1 ) )%do%{ return(batchPoints[j]:batchPoints[j+1] ) } 91 | 92 | cat(paste("Dividing annotations into", batchNum,"batches for read count quantification... \n")) 93 | 94 | 95 | for( iBatch in 1:batchNum ){ 96 | 97 | cat(paste0( "Counting reads for each gene in batch # ",iBatch, ", this step may takes a few hours...\n") ) 98 | 99 | tmpReads <- .CountReadsBatch(geneGRList = geneGRListCombine[ batches[[iBatch]] ], 100 | binSize = binSize, 101 | bamPath.IP = bamPath.IP, 102 | bamPath.input = bamPath.input, 103 | strandToKeep = strandToKeep, 104 | fragmentLength = fragmentLength, 105 | paired = paired, 106 | threads = threads 107 | ) 108 | 109 | cat(paste0( "Finished quantification for batch # ",iBatch,"; ",batchNum - iBatch, " batches to be processed...\n\n") ) 110 | 111 | ## filter out zero count genes 112 | rowTotoal <- rowSums(tmpReads) > 0 113 | tmpGeneName <- unlist( lapply( strsplit( rownames(tmpReads) , ","), function(x) x[1] ) ) 114 | tmpZeroGene <- tapply(rowTotoal , tmpGeneName, sum) 115 | if( any(tmpZeroGene == 0) ){ 116 | tmpReads <- tmpReads[ tmpGeneName %in% names( tmpZeroGene[tmpZeroGene >= 0] ), ] 117 | 118 | } 119 | 120 | ## save the current data 121 | eval(parse(text = paste0("Reads_",iBatch," <- tmpReads ") ) ) 122 | 123 | } 124 | 125 | cat(paste0( "Finished reads quantification. Combining read counts from batch runs...\n") ) 126 | 127 | ## combine all reads 128 | eval(parse(text = paste0("reads <- rbind(",paste0( "Reads_",1:batchNum,collapse = ",") , ") ") ) ) 129 | 130 | ## remove temp read count tables 131 | eval(parse(text = paste0("rm(",paste0( "Reads_",1:batchNum,collapse = ",") , ") ") ) ) 132 | 133 | colnames(reads) <- c(paste(samplenames,"input",sep = "-"),paste(samplenames,"IP",sep = "-")) 134 | 135 | 136 | data.out <- MeRIP(reads = reads, binSize = binSize, gtf = gtf, geneModel = geneGRListCombine, bamPath.input = bamPath.input, bamPath.ip = bamPath.IP, samplenames = samplenames, mode = "carRNA") 137 | if(saveOutput){ 138 | ## create output directory 139 | dir.create(outputDir, showWarnings = FALSE, recursive = TRUE) 140 | saveRDS(data.out,paste0(outputDir,"/MeRIP_readCounts.RDS")) 141 | } 142 | 143 | 144 | return(data.out) 145 | } 146 | 147 | 148 | ## A helper function to process a batch of annotation features. 149 | .CountReadsBatch <- function(geneGRList, 150 | binSize, 151 | bamPath.IP, 152 | bamPath.input, 153 | strandToKeep, 154 | fragmentLength, 155 | paired, 156 | threads 157 | ){ 158 | 159 | 160 | start_time <- Sys.time() 161 | registerDoParallel( cores = threads) 162 | cat(paste("Hyper-thread registered:",getDoParRegistered(),"\n")) 163 | cat(paste("Using",getDoParWorkers(),"thread(s) to count reads in continuous bins...\n")) 164 | reads <- foreach(i = 1:length(geneGRList), .combine = rbind) %dopar%{ 165 | 166 | geneName = names(geneGRList)[i] 167 | geneModel =geneGRList[geneName][[1]] 168 | 169 | # DNA location to gene location conversion 170 | df.geneModel= as.data.frame(geneModel) ##data frame of gene model 171 | dna.range = as.data.frame(range(geneModel)) 172 | df.geneModel$end = df.geneModel$end - dna.range$start + 1 173 | df.geneModel$start = df.geneModel$start - dna.range$start + 1 174 | DNA2RNA = 1:dna.range$width 175 | #DNA2RNA = rep(0,dna.range$end - dna.range$start +1) 176 | #no.exon = dim(df.geneModel)[1] 177 | #for (j in 1:no.exon){DNA2RNA[df.geneModel$start[j]:df.geneModel$end[j]]=1} 178 | #exon.length = sum(DNA2RNA) 179 | #DNA2RNA=cumsum(DNA2RNA)*DNA2RNA 180 | geneLength = dna.range$width 181 | 182 | ## skip any gene with smaller than 200bp transcript 183 | if(geneLength < 200) {return(NULL)} 184 | 185 | 186 | ## switch strand because stranded RNA library protocol sequence reverse strand 187 | if(strandToKeep == "opposite"){ 188 | reads.strand = character() 189 | if(dna.range$strand == "+"){reads.strand = "-"}else if(dna.range$strand == "-"){reads.strand = "+"}else{reads.strand = "*"} ## switch strand on RNA reads for Truseq protocol 190 | }else if(strandToKeep == "same"){ 191 | reads.strand = as.character(dna.range$strand) 192 | }else{ 193 | cat("Currently RADAR only support strand specific RNA-seq data.\nCounting reads at opposite strand by defalt...\n") 194 | reads.strand = character() 195 | if(dna.range$strand == "+"){reads.strand = "-"}else if(dna.range$strand == "-"){reads.strand = "+"}else{reads.strand = "*"} 196 | } 197 | 198 | #create start points of continuous window 199 | if(exon.length <= binSize){ 200 | slidingStart = 1 201 | }else{ 202 | ## use the 3' end terminal bin as a elastic-size bin 203 | if(dna.range$strand == "+"){ 204 | slidingStart = seq(from = 1, to = ( exon.length - binSize - exon.length %% binSize + 1) , length.out = floor(exon.length/binSize) ) 205 | }else{ # make the first bin elastic bin if a gene is on reverse strand 206 | slidingStart = c(1, seq(from = binSize + exon.length %% binSize + 1, to = ( exon.length - binSize + 1) , length.out = floor(exon.length/binSize) - 1 ) ) 207 | } 208 | } 209 | 210 | #count reads in all samples 211 | ba.IP = sapply(bamPath.IP,.countReadFromBam,which = range(geneModel),reads.strand = reads.strand,DNA2RNA = DNA2RNA,fragmentLength=fragmentLength,left=dna.range$start,sliding = slidingStart, binSize = binSize, paired = paired) 212 | ba.input = sapply(bamPath.input,.countReadFromBam,which = range(geneModel),reads.strand = reads.strand,DNA2RNA = DNA2RNA,fragmentLength=fragmentLength,left=dna.range$start,sliding = slidingStart, binSize = binSize, paired = paired) 213 | 214 | if(is.vector(ba.IP) ){# if there is only one window for this gene, make it a matrix to avoid bug 215 | ba.IP = matrix(ba.IP, nrow = 1) 216 | ba.input = matrix( ba.input, nrow = 1 ) 217 | } 218 | ba.counts <- cbind(ba.input,ba.IP) 219 | rownames(ba.counts) <- paste(geneName,slidingStart,sep = ",") 220 | 221 | ba.counts 222 | } 223 | rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals) 224 | end_time <- Sys.time() 225 | cat(paste("Time used to count reads in this batch:",difftime(end_time, start_time, units = "mins"),"mins... \n")) 226 | 227 | return(reads) 228 | 229 | } 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | -------------------------------------------------------------------------------- /R/countReadsFromBam.R: -------------------------------------------------------------------------------- 1 | ## a helper function to count reads from bam files 2 | .countReadFromBam <- 3 | function(bam,which,DNA2RNA,reads.strand,fragmentLength,left,sliding, binSize ,paired){ 4 | 5 | if(paired){ # pair-end sequence 6 | ba = scanBam(bam, param=ScanBamParam( which=which, what =c("pos","strand","qwidth","isize"), flag = scanBamFlag(isProperPair = T,isSecondMateRead = F) ) ) 7 | ba = data.frame( pos = ba[[1]]$pos, strand = ba[[1]]$strand ,readLength = ba[[1]]$qwidth, isize = ba[[1]]$isize ) 8 | ba = ba[ ba$pos > left, ] 9 | ba = ba[ba$strand == reads.strand | reads.strand == "*", ] ## filter for strand 10 | ba$pos = DNA2RNA[ba$pos - left] ## convert mapped read pos into RNA position 11 | ba = ba[ ba$pos > 0, ] ## drop intron reads. 12 | ##shift the read pos to the center of the reads 13 | if(reads.strand == "+"){ba$pos = ba$pos + round(abs(ba$isize)/2) }else{ba$pos = ba$pos + ba$readLength - round(abs(ba$isize)/2) } 14 | ##count the reads in the sliding windows 15 | no.window = length(sliding) 16 | windowCounts = vector(length = no.window) 17 | for(j in 1:no.window){ 18 | windowCounts[j]= sum( ba$pos >= sliding[j] & ba$pos < (sliding[j] + binSize) ) 19 | } # count regular bins 20 | 21 | if( as.character( strand(which) ) == "+" ){ 22 | windowCounts[no.window] = windowCounts[no.window] + sum( ba$pos >= (sliding[no.window] + binSize) & ba$pos <= max(DNA2RNA) ) #count the extra part of the last bin on positive strand 23 | }else{ 24 | windowCounts[1] = windowCounts[1] + sum( ba$pos >= (sliding[1] + binSize) & ba$pos <= (sliding[1] + binSize + max(DNA2RNA) %% binSize ) ) #count the extra part of the first bin on negative strand 25 | } 26 | 27 | 28 | }else{ # single-end sequence 29 | ba = scanBam(bam, param=ScanBamParam( which=which, what =c("pos","strand","qwidth") ) ) 30 | ba = data.frame( pos = ba[[1]]$pos, strand = ba[[1]]$strand, readLength = ba[[1]]$qwidth ) 31 | ba = ba[ ba$pos > left, ] 32 | ba = ba[ba$strand == reads.strand | reads.strand == "*", ] ## filter for strand 33 | ba$pos = DNA2RNA[ba$pos - left] ## convert mapped read pos into RNA position 34 | ba = ba[ ba$pos > 0, ] ## drop intron reads. 35 | ##shift the read pos to the center of the reads 36 | if(reads.strand == "+"){ba$pos = ba$pos + round(fragmentLength/2) }else{ba$pos = ba$pos + ba$readLength - round(fragmentLength/2) } 37 | ##count the reads in the sliding windows 38 | no.window = length(sliding) 39 | windowCounts = vector(length = no.window) 40 | for(j in 1:no.window){ 41 | windowCounts[j]= sum( ba$pos >= sliding[j] & ba$pos < (sliding[j] + binSize) ) 42 | } # count regular bins 43 | 44 | if( as.character( strand(which) ) == "+" ){ 45 | windowCounts[no.window] = windowCounts[no.window] + sum( ba$pos >= (sliding[no.window] + binSize) & ba$pos <= max(DNA2RNA) ) #count the extra part of the last bin on positive strand 46 | }else{ 47 | windowCounts[1] = windowCounts[1] + sum( ba$pos >= (sliding[1] + binSize) & ba$pos <= (sliding[1] + binSize + max(DNA2RNA) %% binSize ) ) #count the extra part of the first bin on negative strand 48 | } 49 | 50 | 51 | } 52 | 53 | return(windowCounts) 54 | } 55 | -------------------------------------------------------------------------------- /R/get_peak_logOR.R: -------------------------------------------------------------------------------- 1 | #' @title get_peak_logOR 2 | #' @description Compute peak logOR, adjust GC bias and IP efficiency for m6A QTL analysis 3 | #' @param MeRIPdata The MeRIP.Peak object containing peak calling result. 4 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!! 5 | #' @param BSgenome The BSgenome object. This needs to match the genome version of the gtf files. 6 | #' @param AdjustGC Logic. Choose whether explicitly adjust GC bias (default: TRUE). 7 | #' @param AdjIPeffi Logic. Choose whether explicitly adjust overall IP efficiency (default: TRUE). 8 | #' @import vcfR 9 | #' @import BSgenome 10 | #' @export 11 | get_peak_logOR <- function( MeRIPdata, vcf_file = NULL, BSgenome = BSgenome.Hsapiens.UCSC.hg19, AdjustGC = TRUE, AdjIPeffi = TRUE, thread = 1 ){ 12 | 13 | ## check input 14 | if( !is(MeRIPdata, "MeRIP.Peak") ){ 15 | stop("The input MeRIPdata needs to be an MeRIP.Peak object!") 16 | }else if( !nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_ip) & nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_input) ){ 17 | stop("The peak counts matrix dimension must match the dimension of jointPeaks!") 18 | } 19 | 20 | ## select samples matching the vcf file 21 | if(!is.null(vcf_file)){ 22 | ## check samples in genotype files and in MeRIP.Peak object 23 | tmpVcf <- tempfile(fileext = ".vcf.gz") 24 | system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}'| gzip > ",tmpVcf)) 25 | tmp.vcf <- try(read.vcfR( file =tmpVcf, verbose = F ) , silent = T) 26 | ########################################################################################################## 27 | ### This is to handle a wired error in the read.vcfR function. ########################################### 28 | if(class(tmp.vcf) == "try-error"){ ## 29 | system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0}'| gzip >",tmpVcf)) ## 30 | tmp.vcf <-read.vcfR( file =tmpVcf, verbose = F ) ## 31 | } ## 32 | ########################################################################################################## 33 | unlink(tmpVcf) # remove the temp file to free space. 34 | genotypeSamples <- colnames(tmp.vcf@gt)[-c(1)] 35 | 36 | if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) == 0 ){ 37 | stop("The samplenames must match in VCF file and in MeRIP.Peak object! We found no overlap between sample names in these two files!") 38 | }else if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) != length(samplenames(MeRIPdata) )){ 39 | cat("The samples in the VCF don't totally match samples in the MeRIP.Peak object; ") 40 | cat("Only samples in MeRIP.Peak object overlapping samples in VCF file will be analyzed in QTL mapping!\nSubsetting samples...\n") 41 | MeRIPdata <- MeRIPtools::select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata)) ) 42 | cat(paste0(paste(intersect(genotypeSamples,samplenames(MeRIPdata)),collapse = " "), "\n(",length(intersect(genotypeSamples,samplenames(MeRIPdata))),") samples will be analyzed!")) 43 | }else{ 44 | ## make sure the order of samples aligned between phenotype and genotype 45 | MeRIPdata <- select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata)) ) 46 | } 47 | 48 | }else{ 49 | cat("No VCF file input. Use all samples in the MeRIPdata data. ") 50 | } 51 | 52 | 53 | ### Preprocess 54 | ## Filter out peaks with zero count 55 | MeRIPdata <- filter(MeRIPdata, !apply(extractInput(MeRIPdata), 1, function(x) any(x == 0 )) ) 56 | cat("Peaks with zero read count in input data have been removed.\n") 57 | 58 | T0 <- colSums(counts(MeRIPdata)[,1:length(MeRIPdata@samplenames)] ) 59 | T1 <- colSums(counts(MeRIPdata)[,(length(MeRIPdata@samplenames)+1) : (2*length(MeRIPdata@samplenames)) ] ) 60 | 61 | ## Filter out peaks with OR < 1 62 | enrichFlag <- apply( t( t(extractIP(MeRIPdata))/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ),1,function(x){sum(x>1)> MeRIPdata@jointPeak_threshold}) 63 | MeRIPdata <- filter(MeRIPdata, enrichFlag ) 64 | cat(paste0("Peaks with odd ratio > 1 in more than ",MeRIPdata@jointPeak_threshold," samples will be retained.\n",nrow(jointPeak(MeRIPdata))," peaks remaining for QTL mapping.\n")) 65 | 66 | ## Estimate IP efficiency 67 | OR <- t( apply(extractIP(MeRIPdata),1,.noZero)/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ) 68 | colnames(OR) <- MeRIPdata@samplenames 69 | logOR <- log(OR) 70 | 71 | logOR.id <- which( rowMeans(logOR) < quantile( rowMeans(logOR), 0.95 ) & rowMeans(logOR) > quantile( rowMeans(logOR), 0.05) )# remove two tails 72 | K_IPe_ij <- apply(logOR[logOR.id,], 2, function(x){ 73 | 74 | fit <- lm(y~m, data = data.frame(y = x, m=rowMeans(logOR)[logOR.id] )) 75 | y.est <- predict(fit, newdata = data.frame(m = rowMeans(logOR))) 76 | return( y.est - rowMeans(logOR) ) 77 | }) 78 | 79 | ## Estimate GC bias offset 80 | if(AdjustGC){ 81 | cat("Computing GC content for peaks\n") 82 | ## GC content correction 83 | peak.gr <- .peakToGRangesList( jointPeak(MeRIPdata)) 84 | cat("...") 85 | 86 | registerDoParallel( thread ) 87 | peakSeq <- foreach( i = 1:length(peak.gr), .combine = c )%dopar%{ 88 | paste( getSeq( BSgenome , peak.gr[[i]] ,as.character =T ) , collapse = "") 89 | } 90 | peakGC <- sapply( peakSeq, function(x){ sum( str_count(x, c("G","g","C","c")) )/nchar(x) } ) 91 | 92 | cat("...") 93 | 94 | peakGC_l <- round(peakGC,digits = 2) 95 | peakGC_l[which(peakGC_l<0.2)] <-median(peakGC_l[which(peakGC_l<0.2)] ) # combine some bins at low GC due to low number of peaks 96 | peakGC_l[which(peakGC_l>0.84)] <-median(peakGC_l[which(peakGC_l>0.84)] ) # combine some bins at high GC due to low number of peaks 97 | l <- sort(unique(peakGC_l)) 98 | 99 | if(AdjIPeffi){ 100 | y <- log(OR) - K_IPe_ij 101 | }else{ 102 | y <- log(OR) 103 | } 104 | colnames(y) <- MeRIPdata@samplenames 105 | 106 | b.l <- tapply(rowMeans( y ), peakGC_l , median) 107 | bil <- apply( y, 2, tapply, peakGC_l, median ) 108 | bi. <- apply( y , 2, median ) 109 | b.. <- median( y ) 110 | Fil <- as.data.frame( as.matrix(bil) - as.vector(b.l) ) - ( bi. - b.. ) 111 | Fij <- foreach( ii = 1:length(MeRIPdata@samplenames), .combine = cbind)%dopar%{ 112 | GC_fit <- lm(Fil[,ii] ~ poly(l,4) ) 113 | predict(GC_fit, newdata = data.frame(l = peakGC) ) 114 | } 115 | colnames(Fij) <- MeRIPdata@samplenames 116 | cat("...\n") 117 | } 118 | 119 | ## Adjust logOR with GC and IP efficiency 120 | if(AdjustGC & AdjIPeffi){ 121 | cat("Adjust logOR with GC and IP efficiency ... \n") 122 | logOR_adjusted <- log(OR) - K_IPe_ij - Fij 123 | }else if( AdjustGC & !AdjIPeffi){ 124 | cat("Adjust logOR with GC ... \n") 125 | logOR_adjusted <- log(OR) - Fij 126 | }else if(AdjIPeffi){ 127 | cat("Adjust logOR with IP efficiency ... \n") 128 | logOR_adjusted <- log(OR) - K_IPe_ij 129 | }else{ 130 | logOR_adjusted <- log(OR) 131 | } 132 | 133 | ## parse bed12 file to get peak coordinates 134 | peak_bed <- jointPeak(MeRIPdata) 135 | peak_bed$PEAK <- paste0(peak_bed$chr,":", peak_bed$start,"-",peak_bed$end,"_",peak_bed$name,"_",peak_bed$strand ) 136 | 137 | peak_logOR <- data.frame(peak_bed[, c("chr", "start", "end", "PEAK")], logOR_adjusted) 138 | 139 | return(list(logOR = peak_logOR, MeRIPdata = MeRIPdata)) 140 | 141 | } 142 | 143 | 144 | -------------------------------------------------------------------------------- /R/gtfToGeneModel.R: -------------------------------------------------------------------------------- 1 | #' @title gtfToGeneModel 2 | #' @description to remove ambiguous gene model and return gene model as genomic ranges object 3 | #' @param gtf gtf file to build gene model 4 | #' @return geneModel A genomic range list containing gene model 5 | #' @export 6 | ##helper function to remove ambiguous gene model and return gene model as genomic ranges object 7 | gtfToGeneModel = function(gtf){ 8 | 9 | #load gene models for genes 10 | ## make the tx database object 11 | if(is.character(gtf)){ 12 | txdb=makeTxDbFromGFF(gtf,format="gtf") 13 | }else{ 14 | txdb = gtf 15 | } 16 | 17 | cols <- c("tx_chrom", "tx_strand") 18 | single_strand_genes <- genes(txdb, columns=cols) 19 | 20 | single_strand_genes_names = names(single_strand_genes) 21 | all_gene_names = names(genes(txdb,single.strand.genes.only=F)) 22 | ambiguious_genes = setdiff(all_gene_names,single_strand_genes_names) 23 | ambi_genes = genes(txdb,single.strand.genes.only=F)[ambiguious_genes] 24 | resol_ambi_genes = lapply(ambi_genes,function(x){ return(x[1]) }) 25 | 26 | #deal with ambiguous chromosome 27 | wantedChrom=unique(sapply(single_strand_genes$tx_chrom,unlist) )[nchar(unique(sapply(single_strand_genes$tx_chrom,unlist) )) <=5] 28 | id = which(sapply(single_strand_genes$tx_chrom,unlist) %in% wantedChrom ) 29 | wantedGeneNames = names(single_strand_genes[id]) 30 | 31 | geneModel = exonsBy(txdb,by="gene") 32 | 33 | single_strand_geneModel = geneModel[wantedGeneNames] 34 | ambi_geneModel = geneModel[ambiguious_genes] 35 | resol_ambi_geneModel = GRangesList() 36 | if(length(ambiguious_genes) > 0 ){ 37 | for(i in 1: length(ambi_geneModel)){ 38 | id = as.data.frame(findOverlaps(ambi_geneModel[[i]],resol_ambi_genes[names(ambi_geneModel)[i]][[1]]) )$queryHits 39 | newModel = GRangesList(ambi_geneModel[[i]][id]) 40 | names(newModel) = names(ambi_geneModel)[i] 41 | if( all(match (as.character( seqnames(newModel[[1]]) ), wantedChrom,nomatch=0 )>0 ) ) { 42 | resol_ambi_geneModel = c(resol_ambi_geneModel,newModel) 43 | } 44 | } 45 | } 46 | 47 | 48 | final_geneModel = c(single_strand_geneModel,resol_ambi_geneModel) 49 | seqlevels(final_geneModel) = wantedChrom 50 | 51 | return(final_geneModel) 52 | } 53 | -------------------------------------------------------------------------------- /R/initialization.R: -------------------------------------------------------------------------------- 1 | 2 | new_initialization_of_parameters2 <- function(Y, W, V){ 3 | logY <- log(Y) 4 | W2 <- W^2 5 | W3 <- W*W2 6 | W4 <- W*W3 7 | V2 <- V^2 8 | V3 <- V*V2 9 | V4 <- V*V3 10 | 11 | 12 | fitlm <- summary(lm(Y ~ V + V2 + V3 + V4 -1)) 13 | Bs <- fitlm$coefficients[, 1] 14 | B1 <- Bs[1] 15 | B2 <- Bs[2] 16 | B3 <- Bs[3] 17 | B4 <- Bs[4] 18 | C1 <- B2/B1 19 | C2 <- B3/B1 20 | C3 <- B4/B1 21 | 22 | newV <- log(V + C1*V2 + C2*V3 + C3*V4) 23 | newY <- logY - newV 24 | as <- summary(lm(newY ~ W + W2 + W3 + W4))$coefficients[, 1] 25 | a0 <- -0.01 26 | b1 <- exp(as[1] - a0) 27 | b2 <- b1*C1 28 | b3 <- b1*C2 29 | b4 <- b1*C3 30 | as[1] <- a0 31 | bs <- c(b1, b2, b3, b4) 32 | est <- list(as = as, bs = bs) 33 | return(est) 34 | } 35 | 36 | 37 | 38 | 39 | new_initialization_of_parameters <- function(Y, W, V){ 40 | logY <- log(Y) 41 | W2 <- W^2 42 | W3 <- W*W2 43 | W4 <- W*W3 44 | V2 <- V^2 45 | V3 <- V*V2 46 | V4 <- V*V3 47 | 48 | fitlm <- summary(lm(Y ~ V + V2 + V3 + V4 -1)) 49 | Bs <- fitlm$coefficients[, 1] 50 | B1 <- Bs[1] 51 | B2 <- Bs[2] 52 | B3 <- Bs[3] 53 | B4 <- Bs[4] 54 | C1 <- B2/B1 55 | C2 <- B3/B1 56 | C3 <- B4/B1 57 | mV <- mean(V) 58 | mV2 <- mV*mV 59 | mV3 <- mV2*mV 60 | mV4 <- mV3*mV 61 | F1 <- mV + C1*mV2+ C2*mV3 + C3*mV4 62 | 63 | as <- summary(lm(logY ~ W + W2 + W3 + W4))$coefficients[, 1] 64 | f1 <- as[1] 65 | a1 <- as[2] 66 | a2 <- as[3] 67 | a3 <- as[4] 68 | a4 <- as[5] 69 | mW <- mean(W) 70 | mW2 <- mW*mW 71 | mW3 <- mW2*mW 72 | mW4 <- mW3*mW 73 | F2 <- a1*mW + a2*mW2 + a3*mW3 + a4*mW4 74 | a0 <- (log(B1) + f1 - F2 - log(F1))/2 75 | b1 <- exp(f1 - a0)/F1 76 | b2 <- b1*C1 77 | b3 <- b1*C2 78 | b4 <- b1*C3 79 | 80 | newV <- log(b1*V + b2*V2 + b3*V3 + b4*V4) 81 | newY <- logY - newV 82 | as <- summary(lm(newY ~ W + W2 + W3 + W4))$coefficients[, 1] 83 | 84 | bs <- c(b1, b2, b3, b4) 85 | est <- list(as = as, bs = bs) 86 | return(est) 87 | } 88 | 89 | 90 | 91 | 92 | correct_individual_sample <- function(Y, gc, reads, gamma = 0.8, steps = 50, down = 0.1){ 93 | 94 | data <- Y 95 | flag <- Y!=0 96 | Y <- Y[flag] 97 | gc <- gc[flag] 98 | reads <- reads[flag] 99 | est <- new_initialization_of_parameters(Y, gc, reads) 100 | as <- unlist(est$as) 101 | bs <- unlist(est$bs) 102 | a0 <- as[1] 103 | a1 <- as[2] 104 | a2 <- as[3] 105 | a3 <- as[4] 106 | a4 <- as[5] 107 | b1 <- bs[1] 108 | b2 <- bs[2] 109 | b3 <- bs[3] 110 | b4 <- bs[4] 111 | res <- gradient_descent_for_individual_sample(Y, gc, reads, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down) 112 | 113 | data[flag] <- res$corrected 114 | report <- list(parameter = res$parameter, corrected = data) 115 | 116 | return(report) 117 | } 118 | -------------------------------------------------------------------------------- /R/plotGeneCov.R: -------------------------------------------------------------------------------- 1 | #' @title plotGeneCoverage 2 | #' @param IP_BAM The bam files for IP samples 3 | #' @param INPUT_BAM The bam files for INPUT samples 4 | #' @param size.IP The size factor for IP libraries 5 | #' @param size.INPUT The size factor for INPUT libraries 6 | #' @param geneName The name (as defined in gtf file) of the gene you want to plot 7 | #' @param geneModel The gene model generated by gtfToGeneModel() function 8 | #' @param libraryType "opposite" for mRNA stranded library, "same" for samll RNA library 9 | #' @param GTF gtf annotation as GRanges object. Can be obtained by GTF <- rtracklayer::import("xxx.gtf",format = "gtf") 10 | #' @param adjustExprLevel Logic parameter determining whether adjust coverage so that input are at "same" expression level. 11 | #' @param plotSNP The option to plot SNP on the figure. Null by default. If want to include SNP in the plot, the parameter needs to ba a dataframe like this: data.frame(loc= position, anno="A/G") 12 | #' @import ggsci 13 | #' @export 14 | plotGeneCoverage <- function(IP_BAMs, INPUT_BAMs, size.IP, size.INPUT,X, geneName, geneModel, libraryType = "opposite", center = mean ,GTF,ZoomIn=NULL, adjustExprLevel = FALSE, adjustExpr_peak_range = NULL, plotSNP = NULL){ 15 | 16 | ## Get INPUT coverage first if adjust for expression level 17 | if(adjustExprLevel){ 18 | locus <- as.data.frame( range(geneModel[geneName][[1]]) ) 19 | 20 | if( !is.null(adjustExpr_peak_range) ){ 21 | locus$start = adjustExpr_peak_range[1] 22 | locus$end = adjustExpr_peak_range[2] 23 | locus$width = adjustExpr_peak_range[2] - adjustExpr_peak_range[1] + 1 24 | }else if(is.null(ZoomIn)){ 25 | }else{ 26 | locus$start = ZoomIn[1] 27 | locus$end = ZoomIn[2] 28 | locus$width = ZoomIn[2] - ZoomIn[1] + 1 29 | } 30 | local.covs <- sapply(INPUT_BAMs,getCov,locus=locus, libraryType = libraryType) 31 | cov.size <- colSums( local.covs) / mean(colSums(local.covs) ) 32 | ## add expression level adjust factor into library size factor 33 | size.INPUT.adj <- size.INPUT*cov.size 34 | size.IP.adj <- size.IP*cov.size 35 | 36 | registerDoParallel( length(levels(X)) ) 37 | INPUT.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{ 38 | getAveCoverage(geneModel= geneModel,bamFiles = INPUT_BAMs[X==ii],geneName = geneName,size.factor = size.INPUT.adj[X==ii], libraryType = libraryType, center = center,ZoomIn = ZoomIn) 39 | } 40 | IP.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{ 41 | getAveCoverage(geneModel= geneModel,bamFiles = IP_BAMs[X==ii],geneName = geneName,size.factor = size.IP.adj[X==ii], libraryType = libraryType, center = center, ZoomIn = ZoomIn) 42 | } 43 | rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals) 44 | }else{ 45 | 46 | registerDoParallel( length(levels(X)) ) 47 | INPUT.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{ 48 | getAveCoverage(geneModel= geneModel,bamFiles = INPUT_BAMs[X==ii],geneName = geneName,size.factor = size.INPUT[X==ii], libraryType = libraryType, center = center,ZoomIn = ZoomIn) 49 | } 50 | IP.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{ 51 | getAveCoverage(geneModel= geneModel,bamFiles = IP_BAMs[X==ii],geneName = geneName,size.factor = size.IP[X==ii], libraryType = libraryType, center = center, ZoomIn = ZoomIn) 52 | } 53 | rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals) 54 | 55 | } 56 | 57 | 58 | 59 | 60 | cov.data <- data.frame(genome_location=rep(as.numeric(rownames(IP.cov) ),length(levels(X))), 61 | IP=c(IP.cov),Input=c(INPUT.cov), 62 | Group = factor( rep(levels(X),rep(nrow(IP.cov),length(levels(X)) ) ), levels = levels(X) ) 63 | ) 64 | yscale <- max(IP.cov,INPUT.cov) 65 | 66 | chr <- unique(as.character(as.data.frame(geneModel[geneName])$seqnames)) 67 | 68 | p1 <- "ggplot(data = cov.data,aes(genome_location))+geom_line(aes(y=Input,colour =Group))+geom_ribbon(aes(ymax = IP,ymin=0,fill=Group), alpha = 0.4)+labs(y=\"normalized coverage\",x = paste0( \"Genome location on chromosome: \", chr) )+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) )),expand = c(0,0,0,0))+theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), 69 | panel.grid.minor = element_blank(), axis.line = element_line(colour = \"black\"), axis.ticks = element_line(colour = \"black\"), axis.title = element_text(color = \"black\", size = 18),axis.text = element_text(color = \"black\", size = 15) ) + scale_fill_nejm(name=\"IP\") + scale_colour_nejm(name=\"INPUT\")+ scale_y_continuous(expand = c(0, 0))" 70 | 71 | p2 <- .getGeneModelAnno(geneModel,geneName,GTF,ZoomIn) 72 | 73 | ## handle the option of plot the SNP in the gene model. 74 | if(is.null(plotSNP) ){ 75 | p <- paste(p1,p2,sep = "+") 76 | }else{ 77 | 78 | ## if the SNP is outside of the gene 79 | if(plotSNP$loc >max(cov.data$genome_location) ){ 80 | 81 | plotSNP_new <- max(cov.data$genome_location) + 0.02*length(cov.data$genome_location) 82 | p3 <- "annotate(\"rect\",xmin = ( plotSNP_new -2 ), xmax = ( plotSNP_new +2 ) , ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"red\")+ 83 | annotate(\"text\", ,x=plotSNP_new, y = -0.1*yscale, label= paste0( chr,\":\",as.character(plotSNP$loc)), alpha = .99, colour = \"black\")+ 84 | annotate(\"text\", ,x=plotSNP_new, y = 0, label=as.character(plotSNP$anno), alpha = .99, colour = \"blue\")+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) )),expand = c(0,0,0.06,0))" 85 | p <- paste(p1,p2,p3,sep = "+") 86 | 87 | }else if( plotSNP$locmax(cov.data$genome_location) ){ 187 | 188 | plotSNP_new <- max(cov.data$genome_location) + 0.02*length(cov.data$genome_location) 189 | p3 <- "annotate(\"rect\",xmin = ( plotSNP_new -2 ), xmax = ( plotSNP_new +2 ) , ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"red\")+ 190 | annotate(\"text\", ,x=plotSNP_new, y = -0.1*yscale, label= paste0( chr,\":\",as.character(plotSNP$loc)), alpha = .99, colour = \"black\")+ 191 | annotate(\"text\", ,x=plotSNP_new, y = 0, label=as.character(plotSNP$anno), alpha = .99, colour = \"blue\")+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) )),expand = c(0,0,0.06,0))" 192 | p <- paste(p1,p2,p3,sep = "+") 193 | 194 | }else if( plotSNP$loc0){ 298 | for(i in 1:length(anno.intron)){ 299 | anno.intron[i] <- paste0("annotate(\"segment\", x =", df.exon$end[i] ,", xend =", df.exon$start[i+1] ,", y = -0.05*yscale, yend = -0.05*yscale, alpha = .99, colour = \"black\")") 300 | } 301 | p <- paste( paste(anno.exon,collapse = "+"), paste(anno.intron,collapse = "+"), sep = "+") 302 | }else{ 303 | p <-paste(anno.exon,collapse = "+") 304 | } 305 | 306 | 307 | return(p) 308 | 309 | }else{ 310 | zoomIn.gr <- exon.new[1] 311 | ranges(zoomIn.gr) <- IRanges(start = zoomIn[1],end = zoomIn[2]) 312 | exon.zoom <- GenomicRanges::intersect(exon.new, zoomIn.gr) 313 | cds.current.zoom <- GenomicRanges::intersect(exon.zoom, cds.current) 314 | utr.current.zoom <- GenomicRanges::setdiff(exon.zoom,cds.current.zoom) 315 | exon.zoom.new <- sort( c(cds.current.zoom,utr.current.zoom) ) 316 | 317 | cds.id <- unique( queryHits( findOverlaps(exon.zoom.new, cds.current.zoom)) ) 318 | df.exon <- as.data.frame(exon.zoom.new) 319 | anno.exon <- character(length = length(exon.zoom)) 320 | ## add exon plot if # exon > 0 321 | if(length(exon.zoom.new) > 0){ 322 | for(i in 1:length(exon.zoom.new)){ 323 | if( i %in% cds.id){ 324 | anno.exon[i] <- paste0("annotate(\"rect\", xmin =",df.exon$start[i] ,", xmax = ",df.exon$end[i] ,", ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"black\")" ) 325 | }else{ 326 | anno.exon[i] <- paste0("annotate(\"rect\",xmin =",df.exon$start[i] ,", xmax = ",df.exon$end[i] ,", ymin = -0.06*yscale, ymax = -0.04*yscale, alpha = .99, colour = \"black\")") 327 | } 328 | } 329 | } 330 | 331 | ## plot intron when there are more than two exons 332 | anno.intron <- character(length = max( length(exon.zoom.new)-1, 0 ) ) 333 | if(length(anno.intron)>0){ 334 | for(i in 1:length(anno.intron)){ 335 | anno.intron[i] <- paste0("annotate(\"segment\", x =", df.exon$end[i] ,", xend =", df.exon$start[i+1] ,", y = -0.05*yscale, yend = -0.05*yscale, alpha = .99, colour = \"black\")") 336 | } 337 | } 338 | ## When there is only one exon and zoomIn range spans intron 339 | if( length(exon.zoom.new) > 0 && start(zoomIn.gr) 0 && end(zoomIn.gr) > end(exon.zoom)[length(exon.zoom)] ){ 344 | anno.intron <- c(anno.intron, 345 | paste0("annotate(\"segment\", x =", end(exon.zoom)[length(exon.zoom)] ,", xend =", end(zoomIn.gr) ,", y = -0.05*yscale, yend = -0.05*yscale, alpha = .99, colour = \"black\")") ) 346 | } 347 | ## When there is no exon but zoomIn ranges is in intron 348 | if( length(exon.zoom.new) == 0 ){ 349 | anno.intron <- c(anno.intron, 350 | paste0("annotate(\"segment\", x =", start(zoomIn.gr) ,", xend =", end(zoomIn.gr) ,", y = -0.05*yscale, yend = -0.05*yscale, alpha = .99, colour = \"black\")") ) 351 | } 352 | 353 | ## combine intron and exon plots 354 | if( length(anno.intron) > 0 & length(anno.exon) >0 ){ 355 | p <- paste( paste(anno.exon,collapse = "+"), paste(anno.intron,collapse = "+"), sep = "+") 356 | }else if( length(anno.exon) >0 ){ 357 | p <- paste(anno.exon,collapse = "+") 358 | }else{ 359 | p <- paste(anno.intron,collapse = "+") 360 | } 361 | 362 | return(p) 363 | } 364 | 365 | } 366 | -------------------------------------------------------------------------------- /R/plotMetaGene.R: -------------------------------------------------------------------------------- 1 | #' @title plotMetaGene 2 | #' @param peak the data frame of peak in bed12 format. 3 | #' @param gtf The annotation file. 4 | #' @import Guitar 5 | #' @import ggsci 6 | #' @export 7 | plotMetaGene <- function(peak, gtf ){ 8 | feature <- list('peak'=.peakToGRangesList(peak) ) 9 | txdb <- makeTxDbFromGFF(gtf,format = "gtf") 10 | gc_txdb <- .makeGuitarCoordsFromTxDb(txdb, noBins=50) 11 | GuitarCoords <- gc_txdb 12 | m <- .countGuitarDensity( 13 | feature[[1]], 14 | GuitarCoords, 15 | 5) 16 | ct = cbind(m,Feature="peak") 17 | ct[[4]] <- as.character(ct[[4]]) 18 | 19 | ## make plot_no-fill 20 | ct$weight <- ct$count # as numeric 21 | ct1 <- ct[ct$category=="mRNA",] # mRNA 22 | ct2 <- ct[ct$category=="lncRNA",] # lncRNA 23 | 24 | d <- mcols(GuitarCoords) 25 | 26 | pos=Feature=weight=NULL 27 | 28 | id1 <- which(match(ct1$comp,c("Front","Back")) >0 ) 29 | ct1 <- ct1[-id1,] 30 | id2 <- which(match(ct2$comp,c("Front","Back")) >0 ) 31 | ct2 <- ct2[-id2,] 32 | 33 | # normalize feature 34 | featureSet <- as.character(unique(ct$Feature)) 35 | for (i in 1:length(featureSet)) { 36 | id <- (ct1$Feature==featureSet[i]) 37 | ct1$weight[id] <- ct1$weight[id]/sum(ct1$weight[id]) 38 | 39 | id <- (ct2$Feature==featureSet[i]) 40 | ct2$weight[id] <- ct2$weight[id]/sum(ct2$weight[id]) 41 | } 42 | 43 | 44 | p2 <- 45 | ggplot(ct2, aes(x=pos, weight=weight)) + 46 | ggtitle("Distribution on lncRNA") + 47 | xlab("") + 48 | ylab("Frequency") + 49 | geom_density(adjust=1,aes(fill=factor(Feature),colour=factor(Feature)) ) + 50 | annotate("text", x = 0.5, y = -0.25, label = "lncRNA", size = 5)+ 51 | annotate("rect", xmin = 0, xmax = 1, ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+ 52 | theme_bw() + theme(axis.ticks = element_blank(), axis.text.x = element_blank(),panel.border = element_blank(), panel.grid.major = element_blank(), 53 | panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),axis.ticks.y = element_line(colour = "black"), 54 | plot.title = element_text(face = "bold",hjust = 0.5, size = 20), 55 | axis.text.y = element_text( color = "black", size = 15), 56 | axis.title.y = element_text( color = "black", size = 15) 57 | )+ scale_color_aaas(name = "Peak(s)")+scale_fill_aaas(name = "Peak(s)") 58 | 59 | # normalization by length of components in mRNA 60 | # calculate relative length of each components 61 | temp <- unique(d[,c(1,3,4,5)]) 62 | id1 <- which(match(temp$comp,c("Front","Back")) >0 ) 63 | temp <- temp[-id1,] # remove DNA 64 | id1 <- which(match(temp$category,"mRNA") >0 ) 65 | temp <- temp[id1,] 66 | temp <- matrix(temp$interval,ncol=3) 67 | temp <- temp/rowSums(temp) 68 | temp <- colSums(temp) 69 | temp <-temp/sum(temp) 70 | weight <- temp 71 | names(weight) <- c("5'UTR","CDS","3'UTR") 72 | 73 | # density 74 | cds_id <- which(ct1$comp=="CDS") 75 | utr3_id <- which(ct1$comp=="UTR3") 76 | utr5_id <- which(ct1$comp=="UTR5") 77 | ct1$count[utr5_id] <- ct1$count[utr5_id]*weight["5'UTR"] 78 | ct1$count[cds_id] <- ct1$count[cds_id]*weight["CDS"] 79 | ct1$count[utr3_id] <- ct1$count[utr3_id]*weight["3'UTR"] 80 | 81 | # re-normalization 82 | featureSet <- as.character(unique(ct$Feature)) 83 | for (i in 1:length(featureSet)) { 84 | id <- (ct1$Feature==featureSet[i]) 85 | ct1$weight[id] <- ct1$count[id]/sum(ct1$count[id]) 86 | } 87 | x <- cumsum(weight) 88 | ct1$pos[utr5_id] <- ct1$pos[utr5_id]*weight["5'UTR"] + 0 89 | ct1$pos[cds_id] <- ct1$pos[cds_id]*weight["CDS"] + x[1] 90 | ct1$pos[utr3_id] <- ct1$pos[utr3_id]*weight["3'UTR"] + x[2] 91 | 92 | p1 <- 93 | ggplot(ct1, aes(x=pos, weight=weight)) + 94 | ggtitle("Distribution on mRNA") + 95 | xlab("") + 96 | ylab("Frequency") + 97 | geom_density(adjust=1,aes(fill=factor(Feature),colour=factor(Feature)) ) + 98 | annotate("text", x = x[1]/2, y = -0.25, label = "5'UTR", size = 5) + 99 | annotate("text", x = x[1] + weight[2]/2, y = -0.28, label = "CDS", size = 5) + 100 | annotate("text", x = x[2] + weight[3]/2, y = -0.25, label = "3'UTR", size = 5) + 101 | geom_vline(xintercept= x[1:2], linetype="dotted") + 102 | annotate("rect", xmin = 0, xmax = x[1], ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+ 103 | annotate("rect", xmin = x[2], xmax = 1, ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+ 104 | annotate("rect", xmin = x[1], xmax = x[2], ymin = -0.16, ymax = -0.04, alpha = .2, colour = "black")+ 105 | theme_bw() + theme(axis.ticks = element_blank(), axis.text.x = element_blank(),panel.border = element_blank(), panel.grid.major = element_blank(), 106 | panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),axis.ticks.y = element_line(colour = "black"), 107 | plot.title = element_text(face = "bold",hjust = 0.5, size = 20), 108 | axis.text.y = element_text( color = "black", size = 15), 109 | axis.title.y = element_text( color = "black", size = 15) 110 | )+ scale_color_aaas(name = "Peak(s)")+scale_fill_aaas(name = "Peak(s)" ) 111 | 112 | .multiplot(p1, p2, cols=2) 113 | 114 | cat("NOTE this function is a wrapper for R package \"Guitar\".\nIf you use the metaGene plot in publication, please cite the original reference:\nCui et al 2016 BioMed Research International \n") 115 | } 116 | 117 | 118 | #' @title plotMEtaGeneMulti A wrapper function for Guitar to plot meta gene plot of multiple samples overlaid on each other. 119 | #' @param peakList The list of peak, each object of list should have a data frame of peak in bed12 format. 120 | #' @param gtf The annotation file for the gene model. 121 | #' @param saveToPDFprefix Set a name to save the plot to PDF. 122 | #' @param includeNeighborDNA Whether to include upstrean and downstream region in the meta gene. 123 | #' @param fill The logic option to chose whether to fill the density curve. 124 | #' @import Guitar 125 | #' @import ggsci 126 | #' @export 127 | plotMetaGeneMulti <- function(peakList,gtf,saveToPDFprefix=NA, fill=FALSE, 128 | includeNeighborDNA=FALSE){ 129 | 130 | gfeature <- lapply(peakList,.peakToGRangesList) 131 | names(gfeature) <- names(peakList) 132 | txdb <- makeTxDbFromGFF(gtf,format = "gtf") 133 | gc_txdb <- .makeGuitarCoordsFromTxDb(txdb, noBins=50) 134 | 135 | GuitarPlotNew(gfeature, GuitarCoordsFromTxDb = gc_txdb,saveToPDFprefix=saveToPDFprefix,fill = fill, 136 | includeNeighborDNA=includeNeighborDNA) 137 | 138 | cat("NOTE this function is a wrapper for R package \"Guitar\".\nIf you use the metaGene plot in publication, please cite the original reference:\nCui et al 2016 BioMed Research International \n") 139 | 140 | } 141 | -------------------------------------------------------------------------------- /R/plotPCA.DESeq2.R: -------------------------------------------------------------------------------- 1 | 2 | plotPCA.DESeq2 <- function(data,group = NULL, returnPC = FALSE){ 3 | if(is.null(group)){ 4 | colData <- data.frame(group=colnames(data)) 5 | }else{ 6 | colData <- data.frame(group=group) 7 | } 8 | 9 | rownames(colData) <- colnames(data) 10 | dds <- DESeq2::DESeqDataSetFromMatrix(data,colData,design = ~group) 11 | cat("Using regularized log transformation of DESeq2 to tranform data...\n") 12 | rld <- DESeq2::rlog(dds) 13 | 14 | if(returnPC){ 15 | PCs <- DESeq2::plotPCA(rld,intgroup = "group",returnData=TRUE) 16 | return(PCs) 17 | }else{ 18 | cat("Plot PCA using the rlog transformed data...\n") 19 | DESeq2::plotPCA(rld,intgroup = "group") 20 | } 21 | 22 | } 23 | 24 | #' @title plotPCAfromMatrix 25 | #' @param m The matrix of count data 26 | #' @param group The factor levels to color the samples. Should be the save number as the # of matrix columns 27 | #' @param standardize Logic parameter indicating whether to standardize the count data to have unit variance. The default is TRUE. 28 | #' @param loglink Logic parameter determine whether to take log of the metrix data. Default is TRUE. If your input matrix is at log scale, use FALSE. 29 | #' @import ggsci 30 | #' @export 31 | plotPCAfromMatrix <- function(m,group, standardize = TRUE,loglink = TRUE ){ 32 | if(loglink){ 33 | mm <- log(m + 1) 34 | }else{ 35 | mm <- m 36 | } 37 | pc <- prcomp(t(mm), scale. = standardize) 38 | pca.df <- as.data.frame(pc$x) 39 | vars <- apply(pca.df ,2, var) 40 | props <- 100*(vars / sum(vars) ) 41 | makeLab <- function(x,pc) paste0("PC",pc,": ",round(x,digits = 2),"% variance") 42 | ggplot(data = pca.df,aes(PC1,PC2,label = rownames(pca.df),colour=group) )+geom_text()+ xlab(makeLab(props[1],1)) + ylab(makeLab(props[2],2))+ 43 | theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), 44 | panel.grid.minor = element_blank(), axis.line = element_line(colour = "black",size = 1), 45 | axis.title.x=element_text(size=20, color="black", hjust=0.5 ), 46 | axis.title.y=element_text(size=20, color = "black", vjust=0.4, angle=90 ), 47 | legend.title=element_text(size = 18),legend.text = element_text(size = 18 ), 48 | axis.text.x = element_text(size = 18 ) ,axis.text.y = element_text(size = 18 ) ) 49 | 50 | } 51 | 52 | -------------------------------------------------------------------------------- /R/qqplot.pvalue.R: -------------------------------------------------------------------------------- 1 | #' @title qqplot.pvalue 2 | #' @param x can be a vector (p value of one group) or a list of vector (p value of multiple groups). 3 | #' @param pointSize The size of data points 4 | #' @param legendSize The size of points in the legend 5 | #' @export 6 | qqplot.pvalue <- function(x,pointSize = 1,legendSize = 4){ 7 | library(ggplot2) 8 | if(is.list(x)){ 9 | nn<-sapply(x, length) 10 | rs<-cumsum(nn) 11 | re<-rs-nn+1 12 | n<-min(nn) 13 | if (!is.null(names(x))) { 14 | grp=factor(rep(names(x), nn), levels=names(x)) 15 | names(x)<-NULL 16 | } else { 17 | grp=factor(rep(1:length(x), nn)) 18 | } 19 | pvo<-x 20 | pvalues<-numeric(sum(nn)) 21 | exp.x<-numeric(sum(nn)) 22 | for(i in 1:length(pvo)) { 23 | pvalues[rs[i]:re[i]] <- -log10(pvo[[i]]) 24 | exp.x[rs[i]:re[i]] <- -log10((rank(pvo[[i]], ties.method="first")-.5)/nn[i]) 25 | } 26 | thin <- unique(data.frame(pvalues = round(pvalues, 3), 27 | exp.x = round(exp.x, 3), 28 | grp=grp)) 29 | grp = thin$grp 30 | pvalues <- thin$pvalues 31 | exp.x <- thin$exp.x 32 | qq.melt <- data.frame(expected=exp.x, Observed = pvalues, label = grp) 33 | ggplot(data = qq.melt, aes(expected, Observed,colour = label) )+geom_point(size = pointSize)+geom_abline(slope = 1)+xlab("Expected(-log10 p-value)")+ylab("Observed(-log10 p-value)")+ 34 | theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), 35 | panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))+ 36 | guides(color = guide_legend(override.aes = list(size=legendSize))) 37 | 38 | }else{ 39 | expected <- -log10(seq(0,1,1/(length(x)-1 ) )) 40 | p <- -log10(sort(x)) 41 | 42 | ## remove some 43 | tail <- which(p>quantile(x,0.75) ) 44 | head <-seq(1,length(which(p<=quantile(x,0.75) )),10 ) 45 | qq.data <- data.frame(expected=c(expected[head],expected[tail]), Observed=c(p[head],p[tail]) ) 46 | ggplot(data = qq.data, aes(expected, Observed) )+geom_point(size = 1.5,colour = "blue")+geom_abline(slope = 1)+xlab("Expected(-log10 p-value)")+ylab("Observed(-log10 p-value)")+ 47 | theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), 48 | panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) 49 | 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /R/randomPeak.R: -------------------------------------------------------------------------------- 1 | ## the main function to generate fixed size random sampled windows on transcriptome given annotation GTF file 2 | randomPeaks = function(size,gtf){ 3 | library(GenomicFeatures) 4 | 5 | geneModel = gtfToGeneModel(gtf) 6 | no.genes = length(geneModel) 7 | bed12=data.frame() # initiate the bed12 data frame to store the peaks 8 | pb <- txtProgressBar(min = 1, max = no.genes, style = 3) 9 | for(i in 1:no.genes){ 10 | # DNA location to gene location conversion 11 | df.geneModel= as.data.frame(reduce(geneModel[i][[1]]) )##data frame of gene model 12 | dna.range = as.data.frame(range(geneModel[i])) 13 | df.geneModel$end = df.geneModel$end - dna.range$start + 1 14 | df.geneModel$start = df.geneModel$start - dna.range$start + 1 15 | DNA2RNA = rep(0,dna.range$end - dna.range$start +1) 16 | no.exon = dim(df.geneModel)[1] 17 | for (j in 1:no.exon){DNA2RNA[df.geneModel$start[j]:df.geneModel$end[j]]=1} 18 | exon.length = sum(DNA2RNA) 19 | DNA2RNA=cumsum(DNA2RNA)*DNA2RNA 20 | 21 | #creat a corresponding map from RNA to DNA 22 | RNA2DNA = 1:exon.length 23 | pointer = 1 24 | for (j in 1:no.exon){ 25 | RNA2DNA[pointer:(pointer+df.geneModel$width[j]-1) ]= RNA2DNA[pointer:(pointer+df.geneModel$width[j]-1)] + df.geneModel$start[j] -pointer 26 | pointer = pointer + df.geneModel$width[j] 27 | } 28 | RNA2DNA = RNA2DNA + dna.range$start -1 #back to chromosome coordinates 29 | 30 | no.peak.to.sample = round(exon.length/1000) 31 | peaks.rna = ceiling(runif(no.peak.to.sample,size/2+1,exon.length-size/2-1)) 32 | 33 | if(no.peak.to.sample > 0 ){ 34 | peaks.dna = as.data.frame ( t(sapply(peaks.rna,function(x,RNA2DNA,size,strand,chrom){ 35 | left = RNA2DNA[x-size/2] 36 | right = RNA2DNA[x + size/2] 37 | return(c(chrom,left,right,strand) ) 38 | },RNA2DNA = RNA2DNA,size = size,strand = as.character(dna.range$strand), chrom =as.character(dna.range$seqnames)) ) ) 39 | colnames(peaks.dna)=c("chr","start","end","strand") 40 | peak.gr = makeGRangesFromDataFrame( peaks.dna ) 41 | for(j in 1:no.peak.to.sample){ 42 | tmp = GenomicRanges::intersect(peak.gr[j],reduce(geneModel[i][[1]]) ) 43 | bed.tmp = data.frame(matrix(nrow=1,ncol=12)) 44 | colnames(bed.tmp)=c("chr","start","end","name","score","strand","thickStart","thickEnd","itemRgb","blockCount","blockSize","blockStart") 45 | bed.tmp["chr"] = unique(as.character(seqnames(tmp))) 46 | bed.tmp[c(2,3)] = as.data.frame(range(tmp))[1,c(2,3)] 47 | bed.tmp["strand"]= unique(as.character(strand(tmp))) 48 | bed.tmp["name"] = names(geneModel)[i] 49 | bed.tmp["score"] = 1 50 | bed.tmp[c("thickStart","thickEnd")] = bed.tmp[c(2,3)] 51 | bed.tmp["itemRgb"] = NA 52 | bed.tmp["blockCount"] = length(tmp) 53 | bed.tmp["blockSize"] = paste(as.data.frame(tmp)[,4],collapse=",") 54 | bed.tmp["blockStart"] = paste(as.data.frame(tmp)[,2] - rep(as.numeric(bed.tmp[2]),as.numeric(bed.tmp["blockCount"])),collapse=",") 55 | bed12 = rbind(bed12,bed.tmp) 56 | } 57 | 58 | } 59 | 60 | setTxtProgressBar(pb, i) 61 | } 62 | return(bed12) 63 | } 64 | 65 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MeRIPtools 2 | Tool sets to analyze high throughput data for RNA modifications 3 | 4 | ### Install the R package from Github 5 | 6 | Depends: GenomicFeatures, Rsamtools, ggplot2, doParallel, foreach,grid,rtracklayer,GenomicAlignments,reshape2,Rcpp,RcppArmadillo, 7 | Guitar, stringr,vcfR,gamlss, broom, DESeq2 8 | 9 | install.packages("devtools") 10 | library(devtools) 11 | install_github("scottzijiezhang/MeRIPtools") 12 | library("MeRIPtools") 13 | 14 | ## Manual page 15 | 16 | Please refer to [manual page](https://scottzijiezhang.github.io/MeRIPtoolsManual/) for detailed instructions. 17 | 18 | ## Citation 19 | MeRIPtools is a tool sets that implemented functions for peak calling, QTL calling, differential methylation analysis, visualization. 20 | 21 | If you used MeRIPtools in your publication, please cite: 22 | Zhang, Z., Luo, K., Zou, Z. et al. Genetic analyses support the contribution of mRNA N6-methyladenosine (m6A) modification to human disease heritability. _Nat Genet_ 52, 939–949 (2020). https://doi.org/10.1038/s41588-020-0644-z 23 | 24 | **Note** MeRIPtools also have wrapper functions to call functions from other R packages to do specific analysis. 25 | If you used the `plotMetaGene` or `MetaGene` function, please cite the original R package [`Guitar`](https://bioconductor.org/packages/release/bioc/html/Guitar.html) 26 | Cui X, Wei Z, Zhang L, Liu H, Sun L, Zhang s, Huang Y, Meng J (2016). “Guitar: an R/Bioconductor package for gene annotation guided transcriptomic analysis of RNA related genomic features.” BioMed Research International. 27 | -------------------------------------------------------------------------------- /man/BetaBinTest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{BetaBinTest} 5 | \alias{BetaBinTest} 6 | \title{BetaBinTest} 7 | \usage{ 8 | \S4method{BetaBinTest}{MeRIP.Peak}(object, AdjIPeffi = TRUE, 9 | AdjustGC = FALSE, BSgenome = BSgenome.Hsapiens.UCSC.hg38) 10 | } 11 | \arguments{ 12 | \item{object}{The MeRIP.Peak object} 13 | 14 | \item{AdjIPeffi}{Logic option determining whether to adjust overall IP efficiency, default is TRUE.} 15 | 16 | \item{AdjustGC}{Logic option determining whether to adjust GC bias in each bins, default is FALSE.} 17 | 18 | \item{BSgenome}{The BSgenome object that will used for calculating GC content for each bin if AdjustGC = TRUE.} 19 | } 20 | \description{ 21 | Perform inferential test using beta-binomial regression model. 22 | } 23 | -------------------------------------------------------------------------------- /man/GuitarPlotNew.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Guitar_functions.R 3 | \name{GuitarPlotNew} 4 | \alias{GuitarPlotNew} 5 | \title{GuitarPlotNew} 6 | \usage{ 7 | GuitarPlotNew(gfeatures, GuitarCoordsFromTxDb = NA, txdb = NA, 8 | genome = NA, noBins = 10, saveToPDFprefix = NA, 9 | returnCount = FALSE, includeNeighborDNA = FALSE, 10 | maximalFeatureAmbiguity = 5, rescaleComponent = TRUE, fill = FALSE, 11 | adjust = 1) 12 | } 13 | \description{ 14 | GuitarPlotNew 15 | } 16 | -------------------------------------------------------------------------------- /man/IP.files.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/methods.R 3 | \docType{methods} 4 | \name{IP.files} 5 | \alias{IP.files} 6 | \alias{IP.files,MeRIP-method} 7 | \title{IP.files} 8 | \usage{ 9 | IP.files(object) 10 | 11 | \S4method{IP.files}{MeRIP}(object) 12 | } 13 | \arguments{ 14 | \item{object}{The MeRIP object} 15 | } 16 | \description{ 17 | Extract path to IP BAM files. 18 | } 19 | -------------------------------------------------------------------------------- /man/Input.files.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/methods.R 3 | \docType{methods} 4 | \name{Input.files} 5 | \alias{Input.files} 6 | \alias{Input.files,MeRIP-method} 7 | \title{Input.files} 8 | \usage{ 9 | Input.files(object) 10 | 11 | \S4method{Input.files}{MeRIP}(object) 12 | } 13 | \arguments{ 14 | \item{object}{The MeRIP object} 15 | } 16 | \description{ 17 | Extract path to INPUT BAM files. 18 | } 19 | -------------------------------------------------------------------------------- /man/MetaGene.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{MetaGene} 4 | \alias{MetaGene} 5 | \title{MetaGene} 6 | \usage{ 7 | MetaGene(MeRIP.Peak) 8 | } 9 | \arguments{ 10 | \item{MeRIP.Peak}{The MeRIP.Peak object} 11 | } 12 | \description{ 13 | MetaGene 14 | } 15 | -------------------------------------------------------------------------------- /man/PrepCoveragePlot-MeRIP.Peak-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{PrepCoveragePlot,MeRIP.Peak-method} 5 | \alias{PrepCoveragePlot,MeRIP.Peak-method} 6 | \title{Prepare coverage plot} 7 | \usage{ 8 | \S4method{PrepCoveragePlot}{MeRIP.Peak}(object, gtf = NULL) 9 | } 10 | \arguments{ 11 | \item{object}{The MeRIP object} 12 | 13 | \item{gtf}{optional gtf file if the stored path to gtf file has changed.} 14 | } 15 | \description{ 16 | import GTF into the MeRIP object for plot 17 | } 18 | -------------------------------------------------------------------------------- /man/QNBtest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{QNBtest} 5 | \alias{QNBtest} 6 | \title{Wraper function to use QNB package to test for differential peaks.} 7 | \usage{ 8 | \S4method{QNBtest}{MeRIP.Peak}(object) 9 | } 10 | \arguments{ 11 | \item{object}{The MeRIP.Peak object} 12 | } 13 | \description{ 14 | Wraper function to use QNB package to test for differential peaks. 15 | } 16 | -------------------------------------------------------------------------------- /man/QTL_BetaBin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/QTL_betaBinomial.R 3 | \name{QTL_BetaBin} 4 | \alias{QTL_BetaBin} 5 | \title{QTL_BetaBin} 6 | \usage{ 7 | QTL_BetaBin(MeRIPdata, vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19, 8 | testWindow = 1e+05, Chromosome, Range = NULL, Covariates = NULL, 9 | AdjustGC = TRUE, AdjIPeffi = TRUE, PCsToInclude = 0, 10 | normalizeGenotype = FALSE, thread = 1) 11 | } 12 | \arguments{ 13 | \item{MeRIPdata}{The MeRIP.Peak object} 14 | 15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!} 16 | 17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.} 18 | 19 | \item{testWindow}{Integer. Test SNPs in bp window flanking the peak.} 20 | 21 | \item{Chromosome}{The chromsome to run QTL test.} 22 | 23 | \item{Range}{The position range on a chromosome to test.} 24 | 25 | \item{Covariates}{The matrix for covariates to be included in the test.} 26 | 27 | \item{AdjustGC}{Logic. Choose whether explicitly adjust GC bias.} 28 | 29 | \item{AdjIPeffi}{Logic. Choose whether explicitly adjust overall IP efficiency} 30 | 31 | \item{normalizeGenotype}{Logic. Choose whether genotype is normalized to mean = 0, var = 1 before regression.} 32 | } 33 | \description{ 34 | QTL_BetaBin 35 | } 36 | -------------------------------------------------------------------------------- /man/QTL_BetaBin2.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/QTL_betaBinomial2.R 3 | \name{QTL_BetaBin2} 4 | \alias{QTL_BetaBin2} 5 | \title{QTL_BetaBin2} 6 | \usage{ 7 | QTL_BetaBin2(MeRIPdata, vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19, 8 | testWindow = 1e+05, Chromosome, Range = NULL, Covariates = NULL, 9 | AdjustGC = TRUE, PCsToInclude = 0, normalizeGenotype = FALSE, 10 | thread = 1) 11 | } 12 | \arguments{ 13 | \item{MeRIPdata}{The MeRIP.Peak object} 14 | 15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!} 16 | 17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.} 18 | 19 | \item{testWindow}{Integer. Test SNPs in bp window flanking the peak.} 20 | 21 | \item{Chromosome}{The chromsome to run QTL test.} 22 | 23 | \item{Range}{The position range on a chromosome to test.} 24 | 25 | \item{Covariates}{The matrix for covariates to be included in the test.} 26 | 27 | \item{AdjustGC}{Logic. Choose whether explicitly adjust GC bias.} 28 | 29 | \item{normalizeGenotype}{Logic. Choose whether genotype is normalized to mean = 0, var = 1 before regression.} 30 | } 31 | \description{ 32 | Same as QTL_BetaBin except sizeFactor estimated by Median of ratio method was used to scale library size instead of total coverage. 33 | } 34 | -------------------------------------------------------------------------------- /man/QTL_BetaBin_permute.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/QTL_betaBinomial_permute.R 3 | \name{QTL_BetaBin_permute} 4 | \alias{QTL_BetaBin_permute} 5 | \title{QTL_BetaBin_permute} 6 | \usage{ 7 | QTL_BetaBin_permute(MeRIPdata, vcf_file, 8 | BSgenome = BSgenome.Hsapiens.UCSC.hg19, testWindow = 1e+05, 9 | Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE, 10 | AdjIPeffi = TRUE, PCsToInclude = 0, thread = 1, Nround = 1) 11 | } 12 | \arguments{ 13 | \item{MeRIPdata}{The MeRIP.Peak object} 14 | 15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!} 16 | 17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.} 18 | 19 | \item{testWindow}{Integer. Test SNPs in bp window flanking the peak.} 20 | 21 | \item{Chromosome}{The chromsome to run QTL test.} 22 | 23 | \item{Range}{The position range on a chromosome to test.} 24 | 25 | \item{Covariates}{The matrix for covariates to be included in the test.} 26 | } 27 | \description{ 28 | QTL_BetaBin_permute 29 | } 30 | -------------------------------------------------------------------------------- /man/QTL_PoissonGamma.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/QTL_PoissonGamma.R 3 | \name{QTL_PoissonGamma} 4 | \alias{QTL_PoissonGamma} 5 | \title{QTL_PoissonGamma} 6 | \usage{ 7 | QTL_PoissonGamma(pheno, vcf_file, peak_bed, testWindow = 1e+05, 8 | Chromosome, Range = NULL, Covariates = NULL, maxPsi = 100, 9 | thread = 1) 10 | } 11 | \arguments{ 12 | \item{pheno}{The phenotype data matrix. Needs to be IP read count that has been normalized for expression level.} 13 | 14 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!} 15 | 16 | \item{peak_bed}{The peak file in BED12 format that needs to correspond to phenotype data matrix.} 17 | 18 | \item{testWindow}{Integer. Test SNPs in bp window flanking the peak.} 19 | 20 | \item{Chromosome}{The chromsome to run QTL test.} 21 | 22 | \item{Range}{The position range on a chromosome to test.} 23 | 24 | \item{Covariates}{The matrix for covariates to be included in the test.} 25 | 26 | \item{maxPsi}{The max estimation for the random effect parameter Psi.} 27 | } 28 | \description{ 29 | QTL_PoissonGamma 30 | } 31 | -------------------------------------------------------------------------------- /man/RADARtest.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{RADARtest} 5 | \alias{RADARtest} 6 | \title{RADARtest} 7 | \usage{ 8 | \S4method{RADARtest}{MeRIP.Peak}(object, exclude = NULL, maxPsi = 100) 9 | } 10 | \arguments{ 11 | \item{object}{The MeRIP.Peak object} 12 | 13 | \item{exclude}{A vector to specify sample names to be excluded in the test.} 14 | 15 | \item{maxPsi}{The max random effect parameter Psi} 16 | } 17 | \description{ 18 | Perform inferential test using Poisson Random effect model in RADAR package 19 | } 20 | -------------------------------------------------------------------------------- /man/adjustExprLevel-MeRIP.Peak-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{adjustExprLevel,MeRIP.Peak-method} 5 | \alias{adjustExprLevel,MeRIP.Peak-method} 6 | \title{adjustExprLevel} 7 | \usage{ 8 | \S4method{adjustExprLevel}{MeRIP.Peak}(object, adjustBy = "geneSum") 9 | } 10 | \arguments{ 11 | \item{object}{The MeRIP.Peak object that has been normalized for library size.} 12 | 13 | \item{adjustBy}{By default, adjust post-IP count by INPUT geneSum. Can also choose "pos" to use current position count to adjust for expression level.} 14 | } 15 | \value{ 16 | object The MeRIP.Peak object now with IP-count adjusted for expression level. 17 | } 18 | \description{ 19 | adjustExprLevel 20 | } 21 | -------------------------------------------------------------------------------- /man/annotatePeak-MeRIP.Peak-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{annotatePeak,MeRIP.Peak-method} 5 | \alias{annotatePeak,MeRIP.Peak-method} 6 | \title{annotatePeak} 7 | \usage{ 8 | \S4method{annotatePeak}{MeRIP.Peak}(object, threads = 1) 9 | } 10 | \arguments{ 11 | \item{MeRIP.Peak}{The MeRIP.Peak object} 12 | } 13 | \description{ 14 | annotatePeak 15 | } 16 | -------------------------------------------------------------------------------- /man/callPeakBinomial.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{callPeakBinomial} 4 | \alias{callPeakBinomial} 5 | \title{callPeakBinomial} 6 | \usage{ 7 | callPeakBinomial(MeRIP, min_counts = 15, peak_cutoff_fdr = 0.05, 8 | peak_cutoff_oddRatio = 1, threads = 1) 9 | } 10 | \arguments{ 11 | \item{MeRIP}{The MeRIP object.} 12 | 13 | \item{min_counts}{The minimal number of reads (input + IP) required in a bin to be called a peak.} 14 | 15 | \item{peak_cutoff_fdr}{The cutoff of fdr to call a bin peak.} 16 | 17 | \item{peak_cutoff_oddRatio}{The minimal oddRatio (IP/input) threshold to call a peak.} 18 | 19 | \item{threads}{The number of threads to use. Default uses 1 threads.} 20 | } 21 | \description{ 22 | callPeakBinomial 23 | } 24 | -------------------------------------------------------------------------------- /man/callPeakFisher.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{callPeakFisher} 4 | \alias{callPeakFisher} 5 | \title{callPeakFisher} 6 | \usage{ 7 | callPeakFisher(MeRIP, min_counts = 15, peak_cutoff_fdr = 0.05, 8 | peak_cutoff_oddRatio = 1, threads = 1) 9 | } 10 | \arguments{ 11 | \item{MeRIP}{The MeRIP object from countReads function.} 12 | 13 | \item{min_counts}{The minimal number of reads present in a bin to be called a peak.} 14 | 15 | \item{peak_cutoff_fdr}{The cutoff of fdr of fisher's exact test to call peak.} 16 | 17 | \item{peak_cutoff_oddRatio}{The minimal oddRatio of fisher's exact test to call peak.} 18 | 19 | \item{threads}{The number of threads to use. Default uses 1 threads.} 20 | } 21 | \description{ 22 | callPeakFisher 23 | } 24 | -------------------------------------------------------------------------------- /man/consistentPeak-MeRIP.Peak-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{consistentPeak,MeRIP.Peak-method} 5 | \alias{consistentPeak,MeRIP.Peak-method} 6 | \title{consistentPeak} 7 | \usage{ 8 | \S4method{consistentPeak}{MeRIP.Peak}(object, samplenames = NULL, 9 | joint_threshold = NA, threads = 1) 10 | } 11 | \arguments{ 12 | \item{object}{The MeRIP.Peak object contain peak calling result.} 13 | 14 | \item{samplenames}{The samplenames to be reported for consistent peaks.} 15 | 16 | \item{joint_threshold}{Define the number of sample required to have consistent peak in a locus to call consistent peak in a group.} 17 | 18 | \item{threads}{The number of threads to use.} 19 | } 20 | \value{ 21 | Peak consistent across specified samples at specified joint_threshod. If joint_threshold not specified, report consistent peaks across all samples specified. If no sample specified, report consistent peak across all samples. 22 | } 23 | \description{ 24 | consistentPeak 25 | } 26 | -------------------------------------------------------------------------------- /man/countReads.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/countReads.R 3 | \name{countReads} 4 | \alias{countReads} 5 | \title{countReads} 6 | \usage{ 7 | countReads(samplenames, gtf, fragmentLength = 150, bamFolder, 8 | outputDir = NA, modification = "m6A", binSize = 50, 9 | strandToKeep = "opposite", paired = FALSE, threads = 1, 10 | saveOutput = T) 11 | } 12 | \arguments{ 13 | \item{samplenames}{The names of each sample (prefix for bam files).} 14 | 15 | \item{gtf}{The gtf format gene annotation file} 16 | 17 | \item{fragmentLength}{The RNA fragment length (insert size of the library).} 18 | 19 | \item{bamFolder}{Path to the folder where bam file locates} 20 | 21 | \item{outputDir}{The directory to save output files} 22 | 23 | \item{modification}{The modification used to name the BAM files.} 24 | 25 | \item{binSize}{The size of consecutive bins to slice the transcripts} 26 | 27 | \item{strandToKeep}{According to library preparation protocol, choose which strand to count. Stranded RNA library usually seq the "ooposite" strand. Small RNA library seq the "same" strand.} 28 | 29 | \item{paired}{Logical indicating whether the input bam files are from paired end sequencing. Default is FALSE. If using paired end data, the read length will be estimated from the data and only good mate are counted.} 30 | 31 | \item{threads}{The number of threads to use for hyperthreading} 32 | 33 | \item{saveOutput}{Logical option indicating whether to save output as an RDS file.} 34 | } 35 | \description{ 36 | This is the very first function in MeRIP-seq data analysis that initianize a `MeRIP` object. This function takes BAM files of Input/IP library of each samples as input and use given GTF file as gene annotation to divide genes into consecutive bins of user defined size. 37 | } 38 | -------------------------------------------------------------------------------- /man/counts.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/AllGenerics.R, R/methods.R 3 | \docType{methods} 4 | \name{counts,MeRIP-method} 5 | \alias{counts,MeRIP-method} 6 | \alias{counts,MeRIP.Peak-method} 7 | \title{counts} 8 | \usage{ 9 | \S4method{counts}{MeRIP}(object) 10 | 11 | \S4method{counts}{MeRIP.Peak}(object) 12 | } 13 | \arguments{ 14 | \item{object}{The MeRIP object} 15 | } 16 | \description{ 17 | Extract all (Input + IP) read count of the MeRIP object. 18 | } 19 | -------------------------------------------------------------------------------- /man/dot-Bino_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{.Bino_test} 4 | \alias{.Bino_test} 5 | \title{Binomial test} 6 | \usage{ 7 | .Bino_test(IP, input, IP_overall, input_overall, pseudo_count = 1) 8 | } 9 | \value{ 10 | data frame with p-values and odds ratio 11 | } 12 | \description{ 13 | Binomial test 14 | } 15 | -------------------------------------------------------------------------------- /man/geneBins-MeRIP-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{geneBins,MeRIP-method} 5 | \alias{geneBins,MeRIP-method} 6 | \title{geneBins} 7 | \usage{ 8 | \S4method{geneBins}{MeRIP}(object) 9 | } 10 | \arguments{ 11 | \item{object}{The MeRIP object} 12 | } 13 | \value{ 14 | data.frame 15 | } 16 | \description{ 17 | Extract the data.frame that maps the consecutive bins to genes. 18 | } 19 | -------------------------------------------------------------------------------- /man/geneExpression-MeRIP.Peak-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{geneExpression,MeRIP.Peak-method} 5 | \alias{geneExpression,MeRIP.Peak-method} 6 | \title{geneExpression} 7 | \usage{ 8 | \S4method{geneExpression}{MeRIP.Peak}(object) 9 | } 10 | \arguments{ 11 | \item{object}{The MeRIP object} 12 | } 13 | \description{ 14 | Extract gene level expression (RNAseq) data in normalized read counts 15 | } 16 | -------------------------------------------------------------------------------- /man/geneExpression.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{geneExpression} 5 | \alias{geneExpression} 6 | \title{extractor for RNAseq data} 7 | \usage{ 8 | \S4method{geneExpression}{MeRIP.Peak}(object) 9 | } 10 | \arguments{ 11 | \item{object}{The MeRIP object} 12 | } 13 | \description{ 14 | Extract gene level expression (RNAseq) data in normalized read counts 15 | } 16 | -------------------------------------------------------------------------------- /man/geneExpressionTMP.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{geneExpressionTMP} 5 | \alias{geneExpressionTMP} 6 | \title{getTPM from geneSum} 7 | \usage{ 8 | \S4method{geneExpressionTMP}{MeRIP.Peak}(object, 9 | meanFragmentLength = 150, normalize = T) 10 | } 11 | \arguments{ 12 | \item{object}{The MeRIP.Peak object} 13 | 14 | \item{meanFragmentLength}{The mean length of RNA fragment (insert of RNA library). Default is 150bp.} 15 | 16 | \item{normalize}{Logical indicating whether normalized TPM or raw TPM should be returned.} 17 | } 18 | \description{ 19 | getTPM from geneSum 20 | } 21 | -------------------------------------------------------------------------------- /man/get_peak_logOR.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/get_peak_logOR.R 3 | \name{get_peak_logOR} 4 | \alias{get_peak_logOR} 5 | \title{get_peak_logOR} 6 | \usage{ 7 | get_peak_logOR(MeRIPdata, vcf_file = NULL, 8 | BSgenome = BSgenome.Hsapiens.UCSC.hg19, AdjustGC = TRUE, 9 | AdjIPeffi = TRUE, thread = 1) 10 | } 11 | \arguments{ 12 | \item{MeRIPdata}{The MeRIP.Peak object containing peak calling result.} 13 | 14 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!} 15 | 16 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.} 17 | 18 | \item{AdjustGC}{Logic. Choose whether explicitly adjust GC bias (default: TRUE).} 19 | 20 | \item{AdjIPeffi}{Logic. Choose whether explicitly adjust overall IP efficiency (default: TRUE).} 21 | } 22 | \description{ 23 | Compute peak logOR, adjust GC bias and IP efficiency for m6A QTL analysis 24 | } 25 | -------------------------------------------------------------------------------- /man/gtfToGeneModel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/gtfToGeneModel.R 3 | \name{gtfToGeneModel} 4 | \alias{gtfToGeneModel} 5 | \title{gtfToGeneModel} 6 | \usage{ 7 | gtfToGeneModel(gtf) 8 | } 9 | \arguments{ 10 | \item{gtf}{gtf file to build gene model} 11 | } 12 | \value{ 13 | geneModel A genomic range list containing gene model 14 | } 15 | \description{ 16 | to remove ambiguous gene model and return gene model as genomic ranges object 17 | } 18 | -------------------------------------------------------------------------------- /man/jointPeakCount.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{jointPeakCount} 4 | \alias{jointPeakCount} 5 | \title{jointPeakCount} 6 | \usage{ 7 | jointPeakCount(MeRIPdata) 8 | } 9 | \arguments{ 10 | \item{MeRIPdata}{The data list as output of callPeakFisher()} 11 | } 12 | \value{ 13 | Returns the MeRIP object with joint peak read count for Input and IP stored. 14 | } 15 | \description{ 16 | Extract read count of joint peaks for each sample and store the read counts in the MeRIP.Peak object. 17 | } 18 | -------------------------------------------------------------------------------- /man/normalizeLibrary-MeRIP.Peak-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{normalizeLibrary,MeRIP.Peak-method} 5 | \alias{normalizeLibrary,MeRIP.Peak-method} 6 | \title{normalizeLibrary} 7 | \usage{ 8 | \S4method{normalizeLibrary}{MeRIP.Peak}(object) 9 | } 10 | \arguments{ 11 | \item{object}{MeRIP.Peak object.} 12 | } 13 | \description{ 14 | Normalized the input as RNA-seq data and normalize IP by enrichment. Specifically, we normalize ip libraries sizes so that the geometry mean of enrichment are the same. 15 | } 16 | -------------------------------------------------------------------------------- /man/peakDistribution.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{peakDistribution} 5 | \alias{peakDistribution} 6 | \title{plot distribution of peaks on gene annotation} 7 | \usage{ 8 | \S4method{peakDistribution}{MeRIP.Peak}(object) 9 | } 10 | \arguments{ 11 | \item{object}{The MeRIP.Peak object} 12 | 13 | \item{saveName}{the file name to save ditribution plot} 14 | } 15 | \description{ 16 | plot distribution of peaks on gene annotation 17 | } 18 | -------------------------------------------------------------------------------- /man/plotGene.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/PlotGene.R 3 | \name{plotGene} 4 | \alias{plotGene} 5 | \title{old coverage plot function.} 6 | \usage{ 7 | plotGene(IP_BAM, INPUT_BAM, size.IP, size.INPUT, geneName, geneModel, 8 | libraryType = "opposite", center = mean, GTF, ZoomIn = NULL) 9 | } 10 | \arguments{ 11 | \item{IP_BAM}{The bam files for IP samples} 12 | 13 | \item{INPUT_BAM}{The bam files for INPUT samples} 14 | 15 | \item{size.IP}{The size factor for IP libraries} 16 | 17 | \item{size.INPUT}{The size factor for INPUT libraries} 18 | 19 | \item{geneName}{The name (as defined in gtf file) of the gene you want to plot} 20 | 21 | \item{geneModel}{The gene model generated by gtfToGeneModel() function} 22 | 23 | \item{libraryType}{"opposite" for mRNA stranded library, "same" for samll RNA library} 24 | 25 | \item{GTF}{gtf annotation as GRanges object. Can be obtained by GTF <- rtracklayer::import("xxx.gtf",format = "gtf")} 26 | } 27 | \description{ 28 | old coverage plot function. 29 | } 30 | -------------------------------------------------------------------------------- /man/plotGeneCov-MeRIP.Peak-method.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{plotGeneCov,MeRIP.Peak-method} 5 | \alias{plotGeneCov,MeRIP.Peak-method} 6 | \title{plotGeneCov} 7 | \usage{ 8 | \S4method{plotGeneCov}{MeRIP.Peak}(object, geneName, 9 | libraryType = "opposite", center = mean, ZoomIn = NULL, 10 | adjustExprLevel = F, adjustExpr_peak_range = NULL) 11 | } 12 | \arguments{ 13 | \item{object}{The data list from countReads and other analysis.} 14 | 15 | \item{geneName}{The gene symbol to be ploted.} 16 | 17 | \item{libraryType}{Specify whether the library is the same or opposite strand of the original RNA molecule. Default is "opposite".} 18 | 19 | \item{center}{Specify the method to calculate average coverage of each group. Could be mean or median.} 20 | 21 | \item{ZoomIn}{c(start,end) The coordinate to zoom in at the gene to be ploted.} 22 | 23 | \item{adjustExprLevel}{logical parameter. Specify whether normalize the two group so that they have similar expression level.} 24 | 25 | \item{GTF}{The GRanges object containing gtf annotation. Can obtain by rtracklayer::import("file.gtf", format= "gtf").} 26 | } 27 | \description{ 28 | plotGeneCov 29 | } 30 | -------------------------------------------------------------------------------- /man/plotGeneCoverage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotGeneCov.R 3 | \name{plotGeneCoverage} 4 | \alias{plotGeneCoverage} 5 | \title{plotGeneCoverage} 6 | \usage{ 7 | plotGeneCoverage(IP_BAMs, INPUT_BAMs, size.IP, size.INPUT, X, geneName, 8 | geneModel, libraryType = "opposite", center = mean, GTF, 9 | ZoomIn = NULL, adjustExprLevel = FALSE, 10 | adjustExpr_peak_range = NULL, plotSNP = NULL) 11 | } 12 | \arguments{ 13 | \item{size.IP}{The size factor for IP libraries} 14 | 15 | \item{size.INPUT}{The size factor for INPUT libraries} 16 | 17 | \item{geneName}{The name (as defined in gtf file) of the gene you want to plot} 18 | 19 | \item{geneModel}{The gene model generated by gtfToGeneModel() function} 20 | 21 | \item{libraryType}{"opposite" for mRNA stranded library, "same" for samll RNA library} 22 | 23 | \item{GTF}{gtf annotation as GRanges object. Can be obtained by GTF <- rtracklayer::import("xxx.gtf",format = "gtf")} 24 | 25 | \item{adjustExprLevel}{Logic parameter determining whether adjust coverage so that input are at "same" expression level.} 26 | 27 | \item{plotSNP}{The option to plot SNP on the figure. Null by default. If want to include SNP in the plot, the parameter needs to ba a dataframe like this: data.frame(loc= position, anno="A/G")} 28 | 29 | \item{IP_BAM}{The bam files for IP samples} 30 | 31 | \item{INPUT_BAM}{The bam files for INPUT samples} 32 | } 33 | \description{ 34 | plotGeneCoverage 35 | } 36 | -------------------------------------------------------------------------------- /man/plotGeneMonster.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/PlotGene.R 3 | \name{plotGeneMonster} 4 | \alias{plotGeneMonster} 5 | \title{plotGeneMonster} 6 | \usage{ 7 | plotGeneMonster(readsOut, geneName, libraryType = "opposite", 8 | center = "mean", GTF, ZoomIn = NULL) 9 | } 10 | \arguments{ 11 | \item{readsOut}{The data list from countReads and other analysis.} 12 | 13 | \item{geneName}{The gene symbol to be ploted.} 14 | 15 | \item{GTF}{The GRanges object containing gtf annotation.} 16 | 17 | \item{ZoomIn}{c(start,end) The coordinate to zoom in at the gene to be ploted.} 18 | } 19 | \description{ 20 | plotGeneMonster 21 | } 22 | -------------------------------------------------------------------------------- /man/plotGenePair.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/PlotGene.R 3 | \name{plotGenePair} 4 | \alias{plotGenePair} 5 | \title{plotGenePair} 6 | \usage{ 7 | plotGenePair(Ctl_IP_BAM, Ctl_INPUT_BAM, Treat_IP_BAM, Treat_INPUT_BAM, 8 | Ctl_size.IP, Ctl_size.INPUT, Treat_size.IP, Treat_size.INPUT, geneName, 9 | geneModel, libraryType = "ooposite", center = mean, GTF, 10 | ZoomIn = NULL) 11 | } 12 | \arguments{ 13 | \item{Ctl_IP_BAM}{The bam files for Control IP samples} 14 | 15 | \item{Ctl_INPUT_BAM}{The bam files for Control INPUT samples} 16 | 17 | \item{Treat_IP_BAM}{The bam files for treated IP samples} 18 | 19 | \item{Treat_INPUT_BAM}{The bam files for treated INPUT samples} 20 | 21 | \item{Ctl_size.IP}{The size factor for IP libraries} 22 | 23 | \item{Ctl_size.INPUT}{The size factor for INPUT libraries} 24 | 25 | \item{Treat_size.IP}{The size factor for IP libraries} 26 | 27 | \item{Treat_size.INPUT}{The size factor for INPUT libraries} 28 | 29 | \item{geneName}{The name (as defined in gtf file) of the gene you want to plot} 30 | 31 | \item{geneModel}{The gene model generated by gtfToGeneModel() function} 32 | 33 | \item{libraryType}{"opposite" for mRNA stranded library, "same" for samll RNA library} 34 | } 35 | \description{ 36 | plot tow groups of samples in the same figure 37 | } 38 | -------------------------------------------------------------------------------- /man/plotMetaGene.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotMetaGene.R 3 | \name{plotMetaGene} 4 | \alias{plotMetaGene} 5 | \title{plotMetaGene} 6 | \usage{ 7 | plotMetaGene(peak, gtf) 8 | } 9 | \arguments{ 10 | \item{peak}{the data frame of peak in bed12 format.} 11 | 12 | \item{gtf}{The annotation file.} 13 | } 14 | \description{ 15 | plotMetaGene 16 | } 17 | -------------------------------------------------------------------------------- /man/plotMetaGeneMulti.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotMetaGene.R 3 | \name{plotMetaGeneMulti} 4 | \alias{plotMetaGeneMulti} 5 | \title{plotMEtaGeneMulti A wrapper function for Guitar to plot meta gene plot of multiple samples overlaid on each other.} 6 | \usage{ 7 | plotMetaGeneMulti(peakList, gtf, saveToPDFprefix = NA, 8 | includeNeighborDNA = FALSE) 9 | } 10 | \arguments{ 11 | \item{peakList}{The list of peak, each object of list should have a data frame of peak in bed12 format.} 12 | 13 | \item{gtf}{The annotation file for the gene model.} 14 | 15 | \item{saveToPDFprefix}{Set a name to save the plot to PDF.} 16 | 17 | \item{includeNeighborDNA}{Whether to include upstrean and downstream region in the meta gene.} 18 | } 19 | \description{ 20 | plotMEtaGeneMulti A wrapper function for Guitar to plot meta gene plot of multiple samples overlaid on each other. 21 | } 22 | -------------------------------------------------------------------------------- /man/plotPCAfromMatrix.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plotPCA.DESeq2.R 3 | \name{plotPCAfromMatrix} 4 | \alias{plotPCAfromMatrix} 5 | \title{plotPCAfromMatrix} 6 | \usage{ 7 | plotPCAfromMatrix(m, group, standardize = TRUE, loglink = TRUE) 8 | } 9 | \arguments{ 10 | \item{m}{The matrix of count data} 11 | 12 | \item{group}{The factor levels to color the samples. Should be the save number as the # of matrix columns} 13 | 14 | \item{standardize}{Logic parameter indicating whether to standardize the count data to have unit variance. The default is TRUE.} 15 | 16 | \item{loglink}{Logic parameter determine whether to take log of the metrix data. Default is TRUE. If your input matrix is at log scale, use FALSE.} 17 | } 18 | \description{ 19 | plotPCAfromMatrix 20 | } 21 | -------------------------------------------------------------------------------- /man/plotTPM.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{plotTPM} 4 | \alias{plotTPM} 5 | \title{plotTPM} 6 | \usage{ 7 | plotTPM(TPM, geneName, group, logCount = FALSE, facet_grid = FALSE) 8 | } 9 | \arguments{ 10 | \item{TPM}{Dataframe of gene TPM} 11 | 12 | \item{geneName}{The name of genes to be ploted.} 13 | 14 | \item{group}{Categorical info for each sample.} 15 | 16 | \item{logCount}{where to plot count at log scale} 17 | } 18 | \description{ 19 | plotTPM 20 | } 21 | -------------------------------------------------------------------------------- /man/qqplot.pvalue.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/qqplot.pvalue.R 3 | \name{qqplot.pvalue} 4 | \alias{qqplot.pvalue} 5 | \title{qqplot.pvalue} 6 | \usage{ 7 | qqplot.pvalue(x, pointSize = 1, legendSize = 4) 8 | } 9 | \arguments{ 10 | \item{x}{can be a vector (p value of one group) or a list of vector (p value of multiple groups).} 11 | 12 | \item{pointSize}{The size of data points} 13 | 14 | \item{legendSize}{The size of points in the legend} 15 | } 16 | \description{ 17 | qqplot.pvalue 18 | } 19 | -------------------------------------------------------------------------------- /man/reportJointPeak.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \name{reportJointPeak} 4 | \alias{reportJointPeak} 5 | \title{reportJointPeak} 6 | \usage{ 7 | reportJointPeak(MeRIPdata, joint_threshold = 2, threads = 1) 8 | } 9 | \arguments{ 10 | \item{MeRIPdata}{The MeRIP.Peak object containing peak calling result} 11 | 12 | \item{joint_threshold}{Define the number of sample required to have consistent peak in a locus to call this bin a joint peak.} 13 | 14 | \item{threads}{The number of threads to use.} 15 | } 16 | \value{ 17 | MeRIP.Peak object with jointPeaks data 18 | } 19 | \description{ 20 | reportJointPeak 21 | } 22 | -------------------------------------------------------------------------------- /man/results.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{results} 5 | \alias{results} 6 | \title{export results} 7 | \usage{ 8 | \S4method{results}{MeRIP.Peak}(object) 9 | } 10 | \arguments{ 11 | \item{object}{The MeRIP.Peak object.} 12 | } 13 | \value{ 14 | joint peaks (with test result) in a data.frame. 15 | } 16 | \description{ 17 | The extractor for final test result. 18 | } 19 | -------------------------------------------------------------------------------- /man/select.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/methods.R 3 | \docType{methods} 4 | \name{select} 5 | \alias{select} 6 | \title{subset MeRIPdata} 7 | \usage{ 8 | \S4method{select}{MeRIP}(object, samples) 9 | 10 | \S4method{select}{MeRIP.Peak}(object, samples, keepData = TRUE) 11 | } 12 | \arguments{ 13 | \item{object}{The MeRIP object} 14 | 15 | \item{samples}{The samplenames to be subset or the index number of samples to be subset.} 16 | 17 | \item{object}{The MeRIP.Peak object} 18 | 19 | \item{samples}{The samplenames to be subset or the index number of samples to be subset.} 20 | } 21 | \value{ 22 | an MeRIP object of selected samples. 23 | 24 | an MeRIP.Peak object of selected samples. 25 | } 26 | \description{ 27 | subset dataset by samples. 28 | 29 | subset dataset by samples. 30 | } 31 | -------------------------------------------------------------------------------- /man/swapChr22.QTL_BetaBin.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/QTL_betaBinomial.neg.R 3 | \name{swapChr22.QTL_BetaBin} 4 | \alias{swapChr22.QTL_BetaBin} 5 | \title{QTL_BetaBin} 6 | \usage{ 7 | swapChr22.QTL_BetaBin(MeRIPdata, vcf_file, 8 | BSgenome = BSgenome.Hsapiens.UCSC.hg19, testWindow = 1e+05, 9 | Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE, 10 | AdjIPeffi = TRUE, PCsToInclude = 0, thread = 1) 11 | } 12 | \arguments{ 13 | \item{MeRIPdata}{The MeRIP.Peak object} 14 | 15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!} 16 | 17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.} 18 | 19 | \item{testWindow}{Integer. Test SNPs in bp window flanking the peak.} 20 | 21 | \item{Chromosome}{The chromsome to run QTL test.} 22 | 23 | \item{Range}{The position range on a chromosome to test.} 24 | 25 | \item{Covariates}{The matrix for covariates to be included in the test.} 26 | } 27 | \description{ 28 | QTL_BetaBin 29 | } 30 | -------------------------------------------------------------------------------- /man/swapChr22.QTL_BetaBin.permute.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/QTL_betaBinomial.neg.permute.R 3 | \name{swapChr22.QTL_BetaBin.permute} 4 | \alias{swapChr22.QTL_BetaBin.permute} 5 | \title{swapChr22.QTL_BetaBin.permute} 6 | \usage{ 7 | swapChr22.QTL_BetaBin.permute(MeRIPdata, vcf_file, 8 | BSgenome = BSgenome.Hsapiens.UCSC.hg19, testWindow = 1e+05, 9 | Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE, 10 | AdjIPeffi = TRUE, PCsToInclude = 0, thread = 1, Nround = 1) 11 | } 12 | \arguments{ 13 | \item{MeRIPdata}{The MeRIP.Peak object} 14 | 15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!} 16 | 17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.} 18 | 19 | \item{testWindow}{Integer. Test SNPs in bp window flanking the peak.} 20 | 21 | \item{Chromosome}{The chromsome to run QTL test.} 22 | 23 | \item{Range}{The position range on a chromosome to test.} 24 | 25 | \item{Covariates}{The matrix for covariates to be included in the test.} 26 | } 27 | \description{ 28 | swapChr22.QTL_BetaBin.permute 29 | } 30 | -------------------------------------------------------------------------------- /src/.PoissonGamma.cpp.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scottzijiezhang/MeRIPtools/626569af9b0b5ee48f8e0d165d229cc906ce9851/src/.PoissonGamma.cpp.swp -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | *.dll 4 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | ## Use the R_HOME indirection to support installations of multiple R version 2 | PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"` $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) 3 | --------------------------------------------------------------------------------