├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── MeRIPtools.Rproj
├── NAMESPACE
├── R
    ├── AllGenerics.R
    ├── Guitar_functions.R
    ├── MeRIP_class.R
    ├── PlotGene.R
    ├── QTL_PoissonGamma.R
    ├── QTL_betaBinomial.R
    ├── QTL_betaBinomial.neg.R
    ├── RcppExports.R
    ├── countReads.R
    ├── countReadsCarRNA.R
    ├── countReadsFromBam.R
    ├── get_peak_logOR.R
    ├── gtfToGeneModel.R
    ├── initialization.R
    ├── methods.R
    ├── plotGeneCov.R
    ├── plotMetaGene.R
    ├── plotPCA.DESeq2.R
    ├── qqplot.pvalue.R
    └── randomPeak.R
├── README.md
├── man
    ├── BetaBinTest.Rd
    ├── GuitarPlotNew.Rd
    ├── IP.files.Rd
    ├── Input.files.Rd
    ├── MetaGene.Rd
    ├── PrepCoveragePlot-MeRIP.Peak-method.Rd
    ├── QNBtest.Rd
    ├── QTL_BetaBin.Rd
    ├── QTL_BetaBin2.Rd
    ├── QTL_BetaBin_permute.Rd
    ├── QTL_PoissonGamma.Rd
    ├── RADARtest.Rd
    ├── adjustExprLevel-MeRIP.Peak-method.Rd
    ├── annotatePeak-MeRIP.Peak-method.Rd
    ├── callPeakBinomial.Rd
    ├── callPeakFisher.Rd
    ├── consistentPeak-MeRIP.Peak-method.Rd
    ├── countReads.Rd
    ├── counts.Rd
    ├── dot-Bino_test.Rd
    ├── geneBins-MeRIP-method.Rd
    ├── geneExpression-MeRIP.Peak-method.Rd
    ├── geneExpression.Rd
    ├── geneExpressionTMP.Rd
    ├── get_peak_logOR.Rd
    ├── gtfToGeneModel.Rd
    ├── jointPeakCount.Rd
    ├── normalizeLibrary-MeRIP.Peak-method.Rd
    ├── peakDistribution.Rd
    ├── plotGene.Rd
    ├── plotGeneCov-MeRIP.Peak-method.Rd
    ├── plotGeneCoverage.Rd
    ├── plotGeneMonster.Rd
    ├── plotGenePair.Rd
    ├── plotMetaGene.Rd
    ├── plotMetaGeneMulti.Rd
    ├── plotPCAfromMatrix.Rd
    ├── plotTPM.Rd
    ├── qqplot.pvalue.Rd
    ├── reportJointPeak.Rd
    ├── results.Rd
    ├── select.Rd
    ├── swapChr22.QTL_BetaBin.Rd
    └── swapChr22.QTL_BetaBin.permute.Rd
└── src
    ├── .PoissonGamma.cpp.swp
    ├── .gitignore
    ├── Makevars
    ├── PoissonGamma.cpp
    └── RcppExports.cpp


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: MeRIPtools
 2 | Type: Package
 3 | Title: Analyze MeRIP-seq data and QTL calling
 4 | Version: 0.2.1
 5 | Author: Zijie Zhang, Kaixuan Luo , Chuan He, Xin He
 6 | Maintainer: The package maintainer <scottzjzhang@uchicago.edu>
 7 | Description: The MeRIPtools is a comprehensive toolset for analyzing MeRIP-seq (e.g. m6A-seq) and data visualization. 
 8 | License: MIT License
 9 | Encoding: UTF-8
10 | LazyData: true
11 | Depends: GenomicFeatures, Rsamtools, ggplot2, doParallel, foreach,grid,rtracklayer,GenomicAlignments,reshape2,Rcpp,RcppArmadillo, qvalue
12 | RoxygenNote: 6.1.1
13 | LinkingTo: Rcpp, RcppArmadillo
14 | Imports: Guitar, stringr,vcfR, broom, DESeq2, ggsci
15 | #Imports: Guitar, stringr,vcfR,gamlss, broom, DESeq2, ggsci
16 |  
17 | 


--------------------------------------------------------------------------------
/MeRIPtools.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | importFrom(Rcpp,sourceCpp)
 2 | useDynLib(MeRIPtools)
 3 | exportPattern("^[[:alpha:]]+")
 4 | 
 5 | 
 6 | import(BSgenome)
 7 | import(DESeq2)
 8 | import(GenomicRanges)
 9 | import(Guitar)
10 | #import(QNB)
11 | import(broom)
12 | #import(gamlss)
13 | #import(gamlss.dist)
14 | import(rtracklayer)
15 | import(stringr)
16 | import(vcfR)
17 | 


--------------------------------------------------------------------------------
/R/AllGenerics.R:
--------------------------------------------------------------------------------
  1 | #' @import DESeq2
  2 | #' @export
  3 | setGeneric("counts", getGeneric("counts", package = "DESeq2"))
  4 | 
  5 | #' @export
  6 | #' @rdname counts
  7 | setMethod("counts", signature("MeRIP"), function(object){
  8 |   object@reads
  9 | })
 10 | 
 11 | #' @export
 12 | #' @rdname Input.files
 13 | setGeneric("Input.files", function(object) {
 14 |   standardGeneric("Input.files")
 15 | })
 16 | 
 17 | #' @export
 18 | #' @rdname IP.files
 19 | setGeneric("IP.files", function(object) {
 20 |   standardGeneric("IP.files")
 21 | })
 22 | 
 23 | #' @export
 24 | setGeneric("geneBins", function(object) {
 25 |   standardGeneric("geneBins")
 26 | })
 27 | 
 28 | #' @export
 29 | setGeneric("jointPeak", function(object) {
 30 |   standardGeneric("jointPeak")
 31 | })
 32 | 
 33 | #' @export
 34 | setGeneric("filter", function(object, ... ) {
 35 |   standardGeneric("filter")
 36 | })
 37 | 
 38 | #' @export
 39 | setGeneric("extractInput", function(object) {
 40 |   standardGeneric("extractInput")
 41 | })
 42 | 
 43 | #' @export
 44 | setGeneric("extractIP", function(object, ...) {
 45 |   standardGeneric("extractIP")
 46 | })
 47 | 
 48 | #' @export
 49 | setGeneric("PrepCoveragePlot", function(object, ...) {
 50 |   standardGeneric("PrepCoveragePlot")
 51 | })
 52 | 
 53 | #' @export
 54 | setGeneric("normalizeLibrary",function(object, ...){
 55 |   standardGeneric("normalizeLibrary")
 56 | })
 57 | 
 58 | #' @export
 59 | setGeneric("adjustExprLevel",function(object,  adjustBy = "geneSum" ){
 60 |   standardGeneric("adjustExprLevel")
 61 | })
 62 | 
 63 | #' @export
 64 | setGeneric("geneExpression",function(object, ...){
 65 |   standardGeneric("geneExpression")
 66 | })
 67 | 
 68 | #' @export
 69 | setGeneric("consistentPeak",function(object, samplenames = NULL, joint_threshold = NA, threads = 1){
 70 |   standardGeneric("consistentPeak")
 71 | })
 72 | 
 73 | #' @export
 74 | setGeneric("variable",function(object){
 75 |   standardGeneric("variable")
 76 | })
 77 | 
 78 | #' @export
 79 | setGeneric("variable<-",function(object, value){
 80 |   standardGeneric("variable<-")
 81 | })
 82 | 
 83 | #' @export
 84 | setGeneric("samplenames",function(object){
 85 |   standardGeneric("samplenames")
 86 | })
 87 | 
 88 | #' @export
 89 | setGeneric("samplenames<-",function(object, value){
 90 |   standardGeneric("samplenames<-")
 91 | })
 92 | 
 93 | #' @export
 94 | setGeneric("QNBtest",function(object){
 95 |   standardGeneric("QNBtest")
 96 | })
 97 | 
 98 | #' @export
 99 | setGeneric("peakDistribution",function(object){
100 |   standardGeneric("peakDistribution")
101 | })
102 | 
103 | #' @export
104 | setGeneric("plotGeneCov", function(object, geneName, libraryType = "opposite", center = mean,ZoomIn = NULL, adjustExprLevel = F , split = FALSE, ... ){
105 |   standardGeneric("plotGeneCov")
106 |   })
107 | 
108 | #' @export
109 | setGeneric("plotSNPpeakPairs",function(object, genotypeFile, SNPID, geneName, libraryType = "opposite", center = mean,ZoomIn = NULL, adjustExprLevel,adjustExpr_peak_range = NULL ){
110 |   standardGeneric("plotSNPpeakPairs")
111 |   })
112 | 
113 | #' @export
114 | setGeneric("geneExpressionTMP",function(object, meanFragmentLength = 150, normalize = T){
115 |   standardGeneric("geneExpressionTMP")
116 | })
117 | 
118 | #' @export
119 | setGeneric("RADARtest",function(object,  exclude ,maxPsi){
120 |   standardGeneric("RADARtest")
121 | })
122 | 
123 | #' @export
124 | setGeneric("select" ,function(object, samples, keepData=TRUE){
125 |   standardGeneric("select")
126 | })
127 | 
128 | #' @export
129 | setGeneric("results", function(object){standardGeneric("results")})
130 | 
131 | #' @export
132 | setGeneric("BetaBinTest",function(object,  AdjIPeffi = TRUE ,  AdjustGC = FALSE, BSgenome = BSgenome.Hsapiens.UCSC.hg38, thread = 1){standardGeneric("BetaBinTest")})
133 | 
134 | #' @export
135 | setGeneric("annotatePeak",function(object, threads = 1){standardGeneric("annotatePeak")})
136 | 
137 | 
138 | 
139 |  


--------------------------------------------------------------------------------
/R/MeRIP_class.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #' @export
  3 | MeRIP <- setClass("MeRIP",
  4 |          representation( reads = "matrix",
  5 |                          binSize = "numeric",
  6 |                          geneModel = "GRangesList",
  7 |                          gtf = "character",
  8 |                          bamPath.input = "character",
  9 |                          bamPath.ip = "character",
 10 |                          samplenames = "character",
 11 |                          geneBins = "data.frame",
 12 |                          geneSum = "matrix",
 13 |                          GTF = "GRanges",
 14 |                          mode = "character"
 15 |                          ),
 16 |          validity = function(object){
 17 |            errors <- character()
 18 |            ## Check basic element
 19 |            if( all(dim(object@reads) <= 1) ){
 20 |              errors <- c(errors, paste0("read count table is a required data for MeRIP instance!"))
 21 |            }
 22 |            if( length(object@binSize)==0){
 23 |              errors <- c(errors, paste0("bin size to slice the transcript is a required parameter for MeRIP instance!"))
 24 |            }
 25 |            if( length(object@geneModel)<=0 ){
 26 |              errors <- c(errors, paste0("GeneModel is a required data for MeRIP instance!"))
 27 |            }
 28 |            if( length(object@bamPath.input)<=0){
 29 |              errors <- c(errors, paste0("path to the input BAM files are the required data for MeRIP instance!"))
 30 |            }
 31 |            if( length(object@bamPath.ip)<=0){
 32 |              errors <- c(errors, paste0("path to the IP BAM files are the required data for MeRIP instance!"))
 33 |            }
 34 |            if( length(object@samplenames)<=0){
 35 |              errors <- c(errors, paste0("Sample names are the required data for MeRIP instance!"))
 36 |            }
 37 |            ## match data dimension with num of samples
 38 |            if(ncol(object@reads) !=  2 * length(object@samplenames) ){
 39 |              errors <- c(errors, paste0("The number of colnumns of read count is ",ncol(object@reads),". The number of samples is ",length(object@samplenames),". The number of colnumns of read count should be 2x the number of samples!"))
 40 |            }
 41 |            if(length(object@bamPath.input) != length(object@samplenames) ){
 42 |              errors <- c(errors, paste0("The number of input bam file path(es) should equal to the number of samplenames!"))
 43 |            }
 44 |            if(length(object@bamPath.ip) != length(object@samplenames) ){
 45 |              errors <- c(errors, paste0("The number of IP bam file path(es) should equal to the number of samplenames!"))
 46 |            }
 47 |            
 48 |            if (length(errors) == 0) TRUE else errors
 49 |          },
 50 |          prototype(geneBins = data.frame(gene = character(), bin = character() ), geneSum = matrix(),GTF = GRanges(),  mode = "mRNA" )
 51 |          )
 52 | 
 53 | #' @export
 54 | MeRIP.Peak <- setClass("MeRIP.Peak",representation( peakCallResult = "matrix",
 55 |                                                     jointPeak_id_pairs = "matrix",
 56 |                                                     jointPeaks = "data.frame",
 57 |                                                     jointPeak_ip = "matrix",
 58 |                                                     jointPeak_input = "matrix",
 59 |                                                     norm.jointPeak_ip = "matrix",
 60 |                                                     sizeFactor = "data.frame",
 61 |                                                     variate = "data.frame",
 62 |                                                     jointPeak_adjExpr = "matrix",
 63 |                                                     test.est = "matrix",
 64 |                                                     peakCalling = "character",
 65 |                                                     jointPeak_threshold = "numeric",
 66 |                                                     test.method = "character"),
 67 |                        contains = "MeRIP",
 68 |                        prototype(peakCalling = "none", jointPeak_threshold = 0, test.method = "none")
 69 |                        )
 70 | 
 71 | 
 72 | 
 73 | #' @export
 74 | setMethod("show",signature("MeRIP"), function(object){
 75 |  summary(object)
 76 | })
 77 | #' @export
 78 | setMethod("show",signature("MeRIP.Peak"), function(object){
 79 |   summary(object)
 80 | })
 81 | 
 82 | 
 83 | #' @export
 84 | setMethod("summary", signature("MeRIP"), function(object){
 85 |   cat(paste0("MeRIP dataset of ",length(object@samplenames)," samples.\n"))
 86 |   cat("The total read count for Input and IP samples are (Million reads):\n")
 87 |   totoReads <- rbind("Input" = round(colSums(object@reads[,1:length(object@samplenames)])/1e6, digits = 2),
 88 |                      "IP" = round(colSums(object@reads[,-c(1:length(object@samplenames))])/1e6, digits = 2))
 89 |   colnames(totoReads) <- object@samplenames
 90 |   print(totoReads)
 91 | })
 92 | #' @export
 93 | setMethod("summary", signature("MeRIP.Peak"), function(object){
 94 |   cat(paste0("MeRIP dataset of ",length(object@samplenames)," samples.\n"))
 95 |   cat("The total read count for Input and IP samples are (Million reads):\n")
 96 |   totoReads <- rbind("Input" = round(colSums(object@reads[,1:length(object@samplenames)])/1e6, digits = 2),
 97 |                      "IP" = round(colSums(object@reads[,-c(1:length(object@samplenames))])/1e6, digits = 2))
 98 |   colnames(totoReads) <- object@samplenames
 99 |   
100 |   print(totoReads)
101 |   if(object@peakCalling != "none"){cat(paste0("\nPeak calling done by ",object@peakCalling,".\n"))}
102 |   if(nrow(object@jointPeaks)>0 & object@jointPeak_threshold> 0 ){
103 |     cat(paste0(nrow(object@jointPeaks)," joint peak reported at threshold ",object@jointPeak_threshold," (requiring peaks to be called in at least ",object@jointPeak_threshold," samples).\n") )
104 |   }
105 |   if(nrow(object@geneSum)>0){ cat("Input gene level count available.\n") }
106 |   if(nrow(object@variate)>0){ cat(paste0("There are ",ncol(object@variate)," predictor variables/covariates. Can access by function variable(MeRIPdata). \n"))}
107 |   if(object@test.method != "none" & nrow(object@test.est)>0){ print(paste0("Differential peaks tested by ",object@test.method,".\n"))}
108 | })
109 | 
110 | 
111 | ## A function to convert old version list data into S4 class MeRIP dataset
112 | #' @export
113 | makeMeRIPfromList <- function(x, gtf){
114 |   if( all(c("reads","binSize","geneModel","bamPath.input","bamPath.ip","samplenames") %in% names(x) ) ){
115 |     return(MeRIP(reads = x$reads, binSize = x$binSize, geneModel = x$geneModel, gtf = gtf ,bamPath.input = x$bamPath.input, bamPath.ip = x$bamPath.ip, samplenames = x$samplenames) )
116 |   }else{
117 |     stop("The list must have the most basic elements for MeRIP dataset! ")
118 |       }
119 | 
120 | }
121 | 
122 | 
123 | 
124 | 


--------------------------------------------------------------------------------
/R/PlotGene.R:
--------------------------------------------------------------------------------
 1 | #' @title old coverage plot function.
 2 | #' @param IP_BAM The bam files for IP samples
 3 | #' @param INPUT_BAM The bam files for INPUT samples
 4 | #' @param size.IP The size factor for IP libraries
 5 | #' @param size.INPUT The size factor for INPUT libraries
 6 | #' @param geneName The name (as defined in gtf file) of the gene you want to plot
 7 | #' @param geneModel The gene model generated by gtfToGeneModel() function
 8 | #' @param libraryType "opposite" for mRNA stranded library, "same" for samll RNA library
 9 | #' @param GTF gtf annotation as GRanges object. Can be obtained by GTF <- rtracklayer::import("xxx.gtf",format = "gtf")
10 | #' @export
11 | ## the main function to plot m6A-seq on one group of data
12 | plotGene <- function(IP_BAM, INPUT_BAM, size.IP, size.INPUT, geneName, geneModel, libraryType = "opposite", center = mean ,GTF,ZoomIn=NULL){
13 |   IP.cov <- getAveCoverage(geneModel= geneModel,bamFiles = IP_BAM,geneName = geneName,size.factor = size.IP, libraryType = libraryType, center = center, ZoomIn = ZoomIn)
14 |   INPUT.cov <- getAveCoverage(geneModel= geneModel,bamFiles = INPUT_BAM,geneName = geneName,size.factor = size.INPUT, libraryType = libraryType, center = center,ZoomIn = ZoomIn)
15 |   cov.data <- data.frame(IP=IP.cov,Input=INPUT.cov,genome_location=as.numeric(names(IP.cov) ) )
16 |   yscale <- max(IP.cov,INPUT.cov)
17 |   p1 <- "ggplot(data = cov.data,aes(genome_location))+geom_line(aes(y=Input,colour =\"Input\"))+geom_line(aes(y=IP,colour=\"IP\"))+labs(y=\"normalized coverage\")+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) ),1))+
18 |     theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
19 |                        panel.grid.minor = element_blank(), axis.line = element_line(colour = \"black\"))"
20 | 
21 |   p2 <- .getGeneModelAnno(geneModel,geneName,GTF,ZoomIn)
22 |   p <- paste(p1,p2,sep = "+")
23 |   eval(parse( text = p ))
24 | }
25 | 
26 | ## the main function to plot m6A-seq on two group of data
27 | #' @title plotGenePair
28 | #' @description plot tow groups of samples in the same figure
29 | #' @param Ctl_IP_BAM The bam files for Control IP samples
30 | #' @param Ctl_INPUT_BAM The bam files for Control INPUT samples
31 | #' @param Treat_IP_BAM The bam files for treated IP samples
32 | #' @param Treat_INPUT_BAM The bam files for treated INPUT samples
33 | #' @param Ctl_size.IP The size factor for IP libraries
34 | #' @param Ctl_size.INPUT The size factor for INPUT libraries
35 | #' @param Treat_size.IP The size factor for IP libraries
36 | #' @param Treat_size.INPUT The size factor for INPUT libraries
37 | #' @param geneName The name (as defined in gtf file) of the gene you want to plot
38 | #' @param geneModel The gene model generated by gtfToGeneModel() function
39 | #' @param libraryType "opposite" for mRNA stranded library, "same" for samll RNA library
40 | #' @export
41 | plotGenePair <- function(Ctl_IP_BAM,Ctl_INPUT_BAM,Treat_IP_BAM,Treat_INPUT_BAM,Ctl_size.IP,Ctl_size.INPUT,Treat_size.IP,Treat_size.INPUT,geneName,geneModel, libraryType = "ooposite",center = mean, GTF ,ZoomIn=NULL){
42 |   Ctl_IP.cov <- getAveCoverage(geneModel= geneModel,bamFiles = Ctl_IP_BAM,geneName = geneName,size.factor = Ctl_size.IP, libraryType = libraryType,center = center, ZoomIn = ZoomIn)
43 |   Ctl_INPUT.cov <- getAveCoverage(geneModel= geneModel,bamFiles = Ctl_INPUT_BAM,geneName = geneName,size.factor = Ctl_size.INPUT,libraryType = libraryType, center = center , ZoomIn = ZoomIn)
44 |   Treat_IP.cov <- getAveCoverage(geneModel= geneModel,bamFiles = Treat_IP_BAM,geneName = geneName,size.factor = Treat_size.IP, libraryType = libraryType, center = center,ZoomIn = ZoomIn)
45 |   Treat_INPUT.cov <- getAveCoverage(geneModel= geneModel,bamFiles = Treat_INPUT_BAM,geneName = geneName,size.factor = Treat_size.INPUT, libraryType = libraryType, center = center,ZoomIn = ZoomIn)
46 |   cov.data <- data.frame(Ctl_IP=Ctl_IP.cov, Ctl_Input = Ctl_INPUT.cov,
47 |                          Treat_IP=Treat_IP.cov, Treat_Input = Treat_INPUT.cov,
48 |                          genome_location=as.numeric(names(Ctl_IP.cov) ) )
49 |   yscale <- max(Ctl_IP.cov,Ctl_INPUT.cov,Treat_IP.cov,Treat_INPUT.cov)
50 |   p1 <- "ggplot(data = cov.data,aes(genome_location))+geom_line(aes(y=Ctl_Input,colour =\"Ctl Input\"))+geom_line(aes(y=Treat_IP,colour=\"Treat IP\"))+geom_line(aes(y=Treat_Input,colour =\"Treat Input\"))+geom_line(aes(y=Ctl_IP,colour=\"Ctl IP\"))+labs(y=\"normalized coverage\")+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) ),1))+
51 |     theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
52 |                        panel.grid.minor = element_blank(), axis.line = element_line(colour = \"black\"))"
53 |   p2 <- .getGeneModelAnno(geneModel,geneName,GTF,ZoomIn)
54 |   p <- paste(p1,p2,sep = "+")
55 |   eval(parse( text = p ))
56 | 
57 | }
58 | 
59 | 
60 | #' @title plotGeneMonster
61 | #' @param readsOut The data list from countReads and other analysis.
62 | #' @param geneName The gene symbol to be ploted.
63 | #' @param GTF The GRanges object containing gtf annotation.
64 | #' @param ZoomIn c(start,end) The coordinate to zoom in at the gene to be ploted.
65 | #' @export
66 | plotGeneMonster <- function(readsOut, geneName, libraryType = "opposite", center = "mean", GTF, ZoomIn = NULL){
67 |   if("X" %in% names(readsOut) ){
68 |     X <- readsOut$X
69 |     plotGenePair(Ctl_IP_BAM = readsOut$bamPath.ip[X == unique(X)[1]],
70 |                  Ctl_INPUT_BAM = readsOut$bamPath.input[X == unique(X)[1]],
71 |                  Treat_IP_BAM = readsOut$bamPath.ip[X == unique(X)[2]],
72 |                  Treat_INPUT_BAM = readsOut$bamPath.input[X == unique(X)[2]],
73 |                  Ctl_size.IP = readsOut$sizeFactor$ip[X == unique(X)[1]],
74 |                  Ctl_size.INPUT = readsOut$sizeFactor$input[X == unique(X)[1]],
75 |                  Treat_size.IP = readsOut$sizeFactor$ip[X == unique(X)[2]],
76 |                  Treat_size.INPUT = readsOut$sizeFactor$input[X == unique(X)[2]],
77 |                  geneName = geneName,
78 |                  geneModel = readsOut$geneModel,
79 |                  libraryType = libraryType,center = center,GTF = GTF,ZoomIn = ZoomIn )
80 |   }else{
81 |     plotGene(IP_BAM = readsOut$bamPath.ip,
82 |              INPUT_BAM =  readsOut$bamPath.input,
83 |              size.IP = readsOut$sizeFactor$ip,
84 |              size.INPUT = readsOut$sizeFactor$input,
85 |              geneName = geneName,
86 |              geneModel = readsOut$geneModel,
87 |              libraryType = libraryType,center = center,GTF = GTF,ZoomIn = ZoomIn)
88 |   }
89 | }
90 | 


--------------------------------------------------------------------------------
/R/QTL_PoissonGamma.R:
--------------------------------------------------------------------------------
  1 | #' @title QTL_PoissonGamma
  2 | #' @param pheno The phenotype data matrix. Needs to be IP read count that has been normalized for expression level.
  3 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!!
  4 | #' @param peak_bed The peak file in BED12 format that needs to correspond to phenotype data matrix.
  5 | #' @param testWindow Integer. Test SNPs in <testWindow> bp window flanking the peak.
  6 | #' @param Chromosome The chromsome to run QTL test.
  7 | #' @param Range The position range on a chromosome to test.
  8 | #' @param Covariates The matrix for covariates to be included in the test.
  9 | #' @param maxPsi The max estimation for the random effect parameter Psi.
 10 | #' @import stringr
 11 | #' @import vcfR
 12 | #' @export
 13 | QTL_PoissonGamma <- function( pheno, vcf_file, peak_bed, testWindow = 100000, Chromosome, Range = NULL, Covariates = NULL, maxPsi = 100, thread = 1 ){
 14 | 
 15 |   ##check input
 16 |   if(nrow(pheno) != nrow(peak_bed) ){
 17 |     stop("The number of row of phenotype is not equal to the number of peaks!")
 18 |   }
 19 | 
 20 |   ## set ranges on the chromosome that can be tested
 21 |   con <- pipe(paste0("zcat ",vcf_file," | awk '!/^#/ {print $2}' | tail -n1"))
 22 |   vcfRange <- c( read.table(gzfile(vcf_file), nrows = 1)[,2] ,
 23 |                 scan( con , quiet = T ))
 24 |   close(con)
 25 |   ## update test Range if necessary
 26 |   if(!is.null(Range)){
 27 |     vcfRange <- intersect(IRanges(vcfRange[1],vcfRange[2]),IRanges(Range[1],Range[2]) )
 28 |   }else{
 29 |     vcfRange <- IRanges(vcfRange[1],vcfRange[2])
 30 |   }
 31 | 
 32 | 
 33 |   ## parse bed12 file
 34 |   colnames(peak_bed) <- c("chr","start","end","name","score","strand","thickStart","thickEnd","RGB","numBlock","blockSize","blockStart")
 35 |   peak_bed.gr <- makeGRangesFromDataFrame(peak_bed, keep.extra.columns = T)
 36 |   test.id <- which(peak_bed$chr == Chromosome & (( peak_bed$end+testWindow )  > start(vcfRange) ) & (( peak_bed$start-testWindow )  < end(vcfRange) ) )
 37 |   peak_bed.gr <- peak_bed.gr[test.id ]
 38 |   phenoY <- pheno[test.id,]
 39 | 
 40 |   ## test each peak
 41 |   startTime <- Sys.time()
 42 |   registerDoParallel(thread)
 43 |   testResult <- foreach( i = 1:length(peak_bed.gr), .combine = rbind  )%dopar% {
 44 | 
 45 |     ## get the range where SNPs are available
 46 |     testRange <- intersect(IRanges(start(peak_bed.gr[i])-testWindow, end(peak_bed.gr[i])+testWindow ),vcfRange)
 47 | 
 48 |     ## Test association if there is SNP available for this peak
 49 |     if(length(testRange)==1){
 50 |       ## read genotype
 51 |       system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}(!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ~/tmp",i,".vcf.gz"))
 52 |       geno.vcf <-try(  read.vcfR( file =paste0("~/tmp",i,".vcf.gz"), verbose = F ) , silent = T)
 53 |       ####################################################################
 54 |       ### This is to handle a wired error in the read.vcfR function.
 55 |       if(class(geno.vcf) == "try-error"){
 56 |         system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0} (!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ~/tmp",i,".vcf.gz"))
 57 |         geno.vcf <-read.vcfR( file =paste0("~/tmp",i,".vcf.gz"), verbose = F )
 58 |       }
 59 |       ####################################################################
 60 |       file.remove(paste0("~/tmp",i,".vcf.gz"))
 61 |       ## filter biallelic snps
 62 |       geno.vcf <- geno.vcf[is.biallelic(geno.vcf),]
 63 | 
 64 |       ## get genotype as Dosage
 65 |       tmp_geno <- extract.gt(geno.vcf, element = 'GT' )
 66 |       geno <- t( apply( tmp_geno ,1, .genoDosage ) )
 67 |       colnames(geno) <- colnames(tmp_geno)
 68 | 
 69 |       ## Determine whether to include covariates
 70 |       if( is.null(Covariates) ){
 71 |         Y <- round( phenoY[i,] )
 72 |         psi <- 10
 73 |         tmp_est <- t( apply(geno,1,function(X){
 74 |           model1 <- glm(Y ~ X, family = poisson(link = 'log'))
 75 |           coef <- model1$coefficients
 76 |           mu2 <- coef[1]
 77 |           beta <- coef[2]
 78 |           est <- try(unlist(PoissionGamma(Y, X, beta, psi, mu2, gamma = 0.75, steps = 50, down = 0.1,psi_cutoff = maxPsi)))
 79 |           if(class(est) != "try-error"){ return(est) }
 80 |         })
 81 |         )
 82 | 
 83 |         ## Post process estimations
 84 |         tested.id <- match(rownames(tmp_est),geno.vcf@fix[,"ID"])
 85 |         ## calculate distance with respect to transcript(gene) strand
 86 |         distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){
 87 |           as.integer(geno.vcf@fix[tested.id,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2)
 88 |         }else{
 89 |           round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[tested.id,"POS"])
 90 |           }
 91 |         report <- data.frame(
 92 |           SNP = paste(geno.vcf@fix[tested.id,"CHROM"],geno.vcf@fix[tested.id,"POS"],sep = ":"),
 93 |           SNPID = rownames(tmp_est),
 94 |           REF = geno.vcf@fix[tested.id,"REF"],
 95 |           ALT = geno.vcf@fix[tested.id,"ALT"],
 96 |           PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ),
 97 |           DISTANCE = distance,
 98 |           beta = tmp_est[,"beta"],
 99 |           pvalue = tmp_est[,"p_value"],
100 |           psi = tmp_est[,"psi"]
101 |         )
102 | 
103 |       }else{
104 |         ## Check covariates
105 |         if(!is.numeric(Covariates)){stop("Please convert covariates into numerical variables...")}
106 |         Y <- unlist( round( phenoY[i,] ) )
107 |         psi <- 10
108 |         ## Test against each genotype
109 |         tmp_est <- t( apply(geno,1,function(X1){
110 |           X.all <- cbind(X1,Covariates) # new design matrix
111 |           colnames(X.all) <- paste("X",1:ncol(X.all),sep = "")
112 |           design.multiBeta <- formula( paste( "log(Y+1) ~ ",paste("X.all[,", 1:ncol(X.all),"]", sep = "",collapse = " + ")) )
113 |           ## Run multi-beta PoissonGamma
114 |           aa <- unlist(summary( lm( design.multiBeta ) )$coefficients[, 1])
115 |           mu2 <- aa[1]
116 |           beta <- aa[2:(ncol(X.all)+1 )]
117 |           est <- try(unlist(PoissonGamma::PoissionGamma_multiple_beta(Y, X.all, beta, psi, mu2, gamma = 0.25, steps = 10, down = 0.1,psi_cutoff = maxPsi)))
118 |           if(class(est) != "try-error"){ return(est) }
119 |           })
120 |         )
121 | 
122 |         ## Post process estimations
123 |         tested.id <- match(rownames(tmp_est),geno.vcf@fix[,"ID"])
124 |         ## calculate distance with respect to transcript(gene) strand
125 |         distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){
126 |           as.integer(geno.vcf@fix[tested.id,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2)
127 |         }else{
128 |           round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[tested.id,"POS"])
129 |         }
130 |         report <- data.frame(
131 |           SNP = paste(geno.vcf@fix[tested.id,"CHROM"],geno.vcf@fix[tested.id,"POS"],sep = ":"),
132 |           SNPID = rownames(tmp_est),
133 |           REF = geno.vcf@fix[tested.id,"REF"],
134 |           ALT = geno.vcf@fix[tested.id,"ALT"],
135 |           PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ),
136 |           DISTANCE = distance,
137 |           beta = tmp_est[,"beta1"],
138 |           pvalue = tmp_est[,"p_value3"],
139 |           psi = tmp_est[,"psi"]
140 |         )
141 | 
142 |       }
143 |       report
144 |     }
145 | 
146 |   }
147 |   rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
148 |   endTime <- Sys.time()
149 |   cat(paste("Time used to test association: ",difftime(endTime, startTime, units = "mins")," mins... \n"))
150 | 
151 |   return(testResult)
152 | }
153 | 
154 | 
155 | 
156 | .genoDosage <- function(x){
157 |   return( stringr::str_count(x,"1") )
158 | }
159 | 
160 | 


--------------------------------------------------------------------------------
/R/QTL_betaBinomial.R:
--------------------------------------------------------------------------------
  1 | #' @title QTL_BetaBin
  2 | #' @param MeRIPdata The MeRIP.Peak object
  3 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!!
  4 | #' @param BSgenome The BSgenome object. This needs to match the genome version of the gtf files. 
  5 | #' @param testWindow Integer. Test SNPs in <testWindow> bp window flanking the peak.
  6 | #' @param Chromosome The chromsome to run QTL test.
  7 | #' @param Range The position range on a chromosome to test.
  8 | #' @param Covariates The matrix for covariates to be included in the test.
  9 | #' @param AdjustGC Logic. Choose whether explicitly adjust GC bias.
 10 | #' @param AdjIPeffi Logic. Choose whether explicitly adjust overall IP efficiency
 11 | #' @param normalizeGenotype Logic. Choose whether genotype is normalized to mean = 0, var = 1 before regression.
 12 | #' @import stringr
 13 | #' @import vcfR
 14 | #' @import BSgenome
 15 | #' @import gamlss
 16 | #' @import gamlss.dist
 17 | #' @import broom
 18 | #' @export
 19 | QTL_BetaBin <- function( MeRIPdata , vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19,testWindow = 100000, Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE, AdjIPeffi = TRUE , PCsToInclude = 0 , normalizeGenotype = FALSE, thread = 1 ){
 20 |   
 21 |   ##check input
 22 |   if( !is(MeRIPdata, "MeRIP.Peak") ){
 23 |     stop("The input MeRIPdata needs to be an MeRIP.Peak object!")
 24 |   }else if( !nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_ip) & nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_input) ){
 25 |     stop("The peak counts matrix dimension must match the dimension of jointPeaks!")
 26 |   }
 27 |   
 28 |   ## check samples in genotype files and in MeRIP.Peak object
 29 |   tmpVcf <- tempfile(fileext = ".vcf.gz")
 30 |   system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}'| gzip > ",tmpVcf))
 31 |   tmp.vcf <-try(  read.vcfR( file =tmpVcf, verbose = F ) , silent = T)
 32 |   ############################################################################################################################################################
 33 |   ### This is to handle a wired error in the read.vcfR function. #############################################################################################
 34 |   if(class(tmp.vcf) == "try-error"){                                                                                                                       ##
 35 |     system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0}'| gzip >",tmpVcf))  ##
 36 |     tmp.vcf <-read.vcfR( file =tmpVcf, verbose = F )                                                                                                       ##
 37 |   }                                                                                                                                                         ##
 38 |   ############################################################################################################################################################
 39 |   unlink(tmpVcf) # remove the temp file to free space.
 40 |   genotypeSamples <- colnames(tmp.vcf@gt)[-c(1)]
 41 |   if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) == 0 ){
 42 |     stop("The samplenames must match in VCF file and in MeRIP.Peak object! We found no overlap between sample names in these two files!")
 43 |   }else if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) != length(samplenames(MeRIPdata) )){
 44 |     cat("The samples in the VCF don't totally match samples in the MeRIP.Peak object; ")
 45 |     cat("Only samples in MeRIP.Peak object overlapping samples in VCF file will be analyzed in QTL mapping!\nSubsetting samples...\n")
 46 |     MeRIPdata <- MeRIPtools::select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata))  )
 47 |     cat(paste0(paste(intersect(genotypeSamples,samplenames(MeRIPdata)),collapse = " "), "\n(",length(intersect(genotypeSamples,samplenames(MeRIPdata))),") samples will be analyzed!"))
 48 |   }else{
 49 |     ## make sure the order of samples aligned between phenotype and genotype
 50 |     MeRIPdata <- select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata))  )
 51 |   }
 52 |   
 53 |   ### Preprocess
 54 |   ##fitler out peaks with zero count
 55 |   MeRIPdata <- filter(MeRIPdata, !apply(extractInput(MeRIPdata), 1, function(x) any(x == 0 )) )
 56 |   cat("Peaks with zero read count in input data have been removed.\n")
 57 |   
 58 |   T0 <- colSums(counts(MeRIPdata)[,1:length(MeRIPdata@samplenames)] )
 59 |   T1 <- colSums(counts(MeRIPdata)[,(length(MeRIPdata@samplenames)+1) : (2*length(MeRIPdata@samplenames)) ] )
 60 |   
 61 |   ##filter out peaks with OR < 1
 62 |   enrichFlag <- apply( t( t(extractIP(MeRIPdata))/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ),1,function(x){sum(x>1)> MeRIPdata@jointPeak_threshold})
 63 |   MeRIPdata <-  filter(MeRIPdata, enrichFlag )
 64 |   cat(paste0("Peaks with odd ratio > 1 in more than ",MeRIPdata@jointPeak_threshold," samples will be retained.\n",nrow(jointPeak(MeRIPdata))," peaks remaining for QTL mapping.\n"))
 65 |   
 66 |   ## estimate IP efficiency
 67 |   OR <- t( apply(extractIP(MeRIPdata),1,.noZero)/T1 )/ t( t( extractInput(MeRIPdata) )/T0 )
 68 |   colnames(OR) <- MeRIPdata@samplenames
 69 |   logOR <- log(OR)
 70 |   
 71 |   logOR.id <- which( rowMeans(logOR) < quantile( rowMeans(logOR), 0.95 ) & rowMeans(logOR) > quantile( rowMeans(logOR), 0.05) )# remove two tails
 72 |   K_IPe_ij <- apply(logOR[logOR.id,], 2, function(x){
 73 |     
 74 |     fit <- lm(y~m, data = data.frame(y = x, m=rowMeans(logOR)[logOR.id] ))
 75 |     y.est <- predict(fit, newdata =  data.frame(m = rowMeans(logOR)))
 76 |     return( y.est - rowMeans(logOR) )
 77 |   })
 78 |   
 79 |   ## estimate GC bias offset
 80 |   if(AdjustGC){
 81 |     cat("Computing GC content for peaks\n")
 82 |     ## GC content correction
 83 |     peak.gr <- .peakToGRangesList( jointPeak(MeRIPdata))
 84 |     cat("...")
 85 |     
 86 |     registerDoParallel( thread )
 87 |     peakSeq <- foreach( i = 1:length(peak.gr), .combine = c )%dopar%{
 88 |       paste( getSeq( BSgenome , peak.gr[[i]] ,as.character =T ) , collapse = "")
 89 |     }
 90 |     peakGC <- sapply( peakSeq, function(x){ sum( str_count(x, c("G","g","C","c"))  )/nchar(x) } )
 91 |     
 92 |     cat("...")
 93 |     
 94 |     peakGC_l <- round(peakGC,digits = 2)
 95 |     peakGC_l[which(peakGC_l<0.2)] <-median(peakGC_l[which(peakGC_l<0.2)] ) # combine some bins at low GC due to low number of peaks
 96 |     peakGC_l[which(peakGC_l>0.84)] <-median(peakGC_l[which(peakGC_l>0.84)] ) # combine some bins at high GC due to low number of peaks
 97 |     l <- sort(unique(peakGC_l))
 98 |     if(AdjIPeffi){y <- (log( OR ) - K_IPe_ij )}else{y <-  log(OR) }
 99 |     colnames(y) <- MeRIPdata@samplenames
100 |     b.l <- tapply(rowMeans( y ), peakGC_l ,  median) 
101 |     bil <- apply( y, 2, tapply, peakGC_l, median )
102 |     bi. <- apply(  y , 2, median )
103 |     b.. <- median(  y )
104 |     Fil <- as.data.frame( as.matrix(bil) - as.vector(b.l) ) - ( bi. - b.. )
105 |     Fij <- foreach( ii = 1:length(MeRIPdata@samplenames), .combine = cbind)%dopar%{
106 |       GC_fit <- lm(Fil[,ii] ~ poly(l,4) )
107 |       predict(GC_fit, newdata = data.frame(l = peakGC) )
108 |     }
109 |     colnames(Fij) <- MeRIPdata@samplenames
110 |     cat("...\n")
111 |   }
112 |   ##Principal components
113 |   if(PCsToInclude > 0 & PCsToInclude <= length(MeRIPdata@samplenames) ){
114 |     cat("Computing Principal components.\n")
115 |     if(AdjustGC & AdjIPeffi){
116 |       PCs <- prcomp(t( (log(OR ) -  K_IPe_ij - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ) )$x
117 |     }else if( AdjustGC & !AdjIPeffi){
118 |       PCs <- prcomp(t( (log(OR ) - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x
119 |     }else if(AdjIPeffi){
120 |       PCs <- prcomp(t( (log(OR ) - K_IPe_ij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x
121 |     }else{
122 |       PCs <- prcomp( t( log(OR)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x
123 |     }
124 |   }else if(PCsToInclude > length(MeRIPdata@samplenames) ){
125 |     stop("The number of PCs needs to be no larger than the sample size!")
126 |   }
127 |  
128 |   cat("The following message can be ignored...\n----------------\n")
129 |   ## set ranges on the chromosome that can be tested
130 |   con1 <- pipe(paste0("zcat ",vcf_file," | grep -w '",Chromosome,"' |awk '!/^#/ {print $2}' | head -n1"))
131 |   con2 <- pipe(paste0("zcat ",vcf_file," | grep -w '",Chromosome,"' |awk '!/^#/ {print $2}' | tail -n1"))
132 |   vcfRange <- c( scan( con1 , quiet = T ) ,
133 |                 scan( con2 , quiet = T ))
134 |   close(con1)
135 |   close(con2)
136 |   cat("----------------\n")
137 |   ## update test Range if necessary
138 |   if(!is.null(Range)){
139 |     vcfRange <- intersect(IRanges(vcfRange[1],vcfRange[2]),IRanges(Range[1],Range[2]) )
140 |   }else{
141 |     vcfRange <- IRanges(vcfRange[1],vcfRange[2])
142 |   }
143 |   
144 | 
145 |   ## parse bed12 file
146 |   peak_bed <- jointPeak(MeRIPdata)
147 |   peak_bed.gr <- makeGRangesFromDataFrame(peak_bed, keep.extra.columns = T)
148 |   test.id <- which(peak_bed$chr == Chromosome & (( peak_bed$end+testWindow )  > start(vcfRange) ) & (( peak_bed$start-testWindow )  < end(vcfRange) ) )
149 |   
150 |   peak_bed.gr <- peak_bed.gr[test.id ]
151 |   Y1 <- extractIP(MeRIPdata)[test.id,]
152 |   Y0 <- extractInput(MeRIPdata)[test.id,]
153 |   if(AdjustGC){FIj <- Fij[test.id,] }
154 |   
155 |   ## ditermine study design according to parameters
156 |   variables <-  "offset(log(T1/T0))"
157 |   if( AdjIPeffi ){ variables <- paste(variables, "offset(K_IPe_j)", sep  = " +") }
158 |   if( AdjustGC ){ variables <- paste(variables, "offset(Fj)", sep = " +") } 
159 |   if(! is.null(Covariates)  ){ 
160 |     variables <-  paste(variables, paste(colnames(Covariates),collapse = " + "), sep = "+")
161 |   }
162 |   if( PCsToInclude > 0 ){
163 |     variables <-  paste(variables, paste("PC", 1:PCsToInclude, sep = "",collapse = " + "), sep = "+")
164 |   }
165 |   design <- formula( paste0("cbind(Y1i , Y0i) ~" ,"  G + ", variables) )
166 |   
167 |   cat(paste0("Start beta-binomial regression for ",length(peak_bed.gr)," peaks and SNPs in ",round(testWindow/1000,digits = 1),"kb flanking each peaks on chromosome ",Chromosome,".\n"))
168 |   if(AdjustGC){cat("Will correct sample specific GC bias\n")}
169 |   ## test each peak
170 |   startTime <- Sys.time()
171 |   registerDoParallel(thread)
172 |   testResult <- foreach( i = 1:length(peak_bed.gr), .combine = rbind  )%dopar% {
173 | 
174 |     ## get the range where SNPs are available
175 |     testRange <- GenomicRanges::intersect(IRanges(start(peak_bed.gr[i])-testWindow, end(peak_bed.gr[i])+testWindow ),vcfRange)
176 | 
177 |     ## Test association if there is SNP available for this peak
178 |     if(length(testRange)==1){
179 |       
180 |       ## Use unix command line to accesss the genotype from vcf file. This is for fast data access.
181 |       tmpVcf <- tempfile(fileext = ".vcf.gz")
182 |       system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}(!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ",tmpVcf))
183 |       geno.vcf <-try(  read.vcfR( file =tmpVcf, verbose = F ) , silent = T)
184 |       ############################################################################################################################################################
185 |       ### This is to handle a wired error in the read.vcfR function. #############################################################################################
186 |       if(class(geno.vcf) == "try-error"){                                                                                                                       ##
187 |         system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0} (!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip >",tmpVcf))  ##
188 |         geno.vcf <-read.vcfR( file =tmpVcf, verbose = F )                                                                                                       ##
189 |       }                                                                                                                                                         ##
190 |       ############################################################################################################################################################
191 |       unlink(tmpVcf) # remove the temp file to free space.
192 |       
193 |       ## check if genotype available for this peak
194 |       if( nrow(geno.vcf@fix) == 0 ){ return(NULL) }
195 |       
196 |       ## Get genotype as dosage format and filter for MAF
197 |       if( unique(geno.vcf@gt[,"FORMAT"]) == "DS" ){
198 |         ## Directly extract dosage
199 |         geno <- if(nrow(geno.vcf@fix) == 1){ 
200 |           t(apply(extract.gt(geno.vcf, element = 'DS' ),2,as.numeric ) )
201 |         }else{
202 |           apply(extract.gt(geno.vcf, element = 'DS' ),2,as.numeric )
203 |           }
204 |         rownames(geno) <- geno.vcf@fix[,"ID"]
205 |         ## filter out any genotype that has MAF<0.05
206 |         MAF <- apply(geno,1,function(x) !any(table(round(x) )>0.95*ncol(geno)) )
207 |         geno <- geno[MAF,] 
208 |         geno.vcf <- geno.vcf[MAF,]
209 |       }else if(unique(geno.vcf@gt[,"FORMAT"]) == "GT"){
210 |         geno.vcf <- geno.vcf[is.biallelic(geno.vcf),]
211 |         ## get genotype as Dosage
212 |         tmp_geno <- extract.gt(geno.vcf, element = 'GT' )
213 |         geno <- t( apply( tmp_geno ,1, .genoDosage ) )
214 |         colnames(geno) <- colnames(tmp_geno)
215 |         ## filter out any genotype that has MAF<0.05
216 |         MAF <- apply(geno,1,function(x) !any(table(x)>0.95*ncol(geno)) )
217 |         geno <- geno[MAF,] 
218 |         geno.vcf <- geno.vcf[MAF,]
219 |       }
220 |       if( nrow(geno) == 0 ){ return(NULL) } ## skip this iteration if no genotype left after filtering
221 |       
222 |       ## normalized genotype is necessary
223 |       if(normalizeGenotype){
224 |         geno <- t( apply(geno,1,function(x){ (x - mean(x) )/sd(x) }) )
225 |       }
226 |       
227 |       if(AdjustGC){Fj <- FIj[i,]}
228 |       K_IPe_j <- K_IPe_ij[i,]
229 |       
230 |       ## Test QTLs for peak.j
231 |       tmp_est <- as.data.frame(matrix(nrow = nrow(geno),ncol = 5),row.names = rownames(geno) )
232 |       for( ii in 1:nrow(geno) ){
233 |         if(AdjustGC){
234 |           fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j , Fj , G = geno[ii,])
235 |         }else{
236 |             fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j , G = geno[ii,])
237 |         }
238 |         
239 |         ## add PCs to data
240 |         if(PCsToInclude > 1 ){ fit_data <- cbind(fit_data,data.frame(PCs[,1:PCsToInclude]) )}else if(PCsToInclude == 1){fit_data <- cbind(fit_data,data.frame(PC1 = PCs[,1]) ) }
241 |         ## add covariates
242 |         if(! is.null(Covariates) ){fit_data <- cbind(fit_data,as.data.frame(Covariates) )  }
243 |         
244 |         fit <- try( gamlss( design ,data = fit_data , family = BB(mu.link = "logit")) )
245 |         if(class(fit)[1]!= "try-error"){
246 |           est <- tidy(fit)
247 |           tmp_est[ii,] <- data.frame(beta =  est[est$term == "G","estimate"],
248 |                                      std.err = est[est$term == "G","std.error"],
249 |                                      pvalue = est[est$term == "G","p.value"], 
250 |                                      theta = 1/exp(est[est$parameter == "sigma","estimate"]),
251 |                                      p.theta = est[est$parameter == "sigma","p.value"] ) 
252 |         }else{
253 |             tmp_est[ii,] <- data.frame(beta = NA, std.err = NA,  pvalue =NA,theta = NA, p.theta = NA ) 
254 |           }
255 |         
256 |       }
257 |       colnames(tmp_est) <- c("beta","std.err","pvalue","theta","p.theta")
258 |         
259 |       ## calculate distance with respect to transcript(gene) strand
260 |       distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){
261 |         as.integer(geno.vcf@fix[,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2)
262 |       }else{
263 |         round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[,"POS"])
264 |       }
265 |       report <- cbind(data.frame(
266 |         SNP = paste(geno.vcf@fix[,"CHROM"],geno.vcf@fix[,"POS"],sep = ":"),
267 |         SNPID = rownames(tmp_est),
268 |         REF = geno.vcf@fix[,"REF"],
269 |         ALT = geno.vcf@fix[,"ALT"],
270 |         PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ),
271 |         DISTANCE = distance
272 |       ),tmp_est)
273 |       report[!is.na(report$beta),]
274 |     }
275 | 
276 |   }
277 |   rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
278 |   endTime <- Sys.time()
279 |   cat(paste("Time used to test association: ",round(difftime(endTime, startTime, units = "mins"),digits = 1)," mins. \n"))
280 |   cat(paste0(nrow(testResult)," SNP-peak pair tested.\n"))
281 |   return(testResult)
282 | }
283 | 
284 | 
285 | ## Helper function to convert genotype into dosage. 
286 | .genoDosage <- function(x){
287 |   return( stringr::str_count(x,"1") )
288 | }
289 | 
290 | 


--------------------------------------------------------------------------------
/R/QTL_betaBinomial.neg.R:
--------------------------------------------------------------------------------
  1 | #' @title swapChr22.QTL_BetaBin
  2 | #' @param MeRIPdata The MeRIP.Peak object
  3 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!!
  4 | #' @param BSgenome The BSgenome object. This needs to match the genome version of the gtf files. 
  5 | #' @param testWindow Integer. Test SNPs in <testWindow> bp window flanking the peak.
  6 | #' @param Chromosome The chromsome to run QTL test.
  7 | #' @param Range The position range on a chromosome to test.
  8 | #' @param Covariates The matrix for covariates to be included in the test.
  9 | #' @import stringr
 10 | #' @import vcfR
 11 | #' @import BSgenome
 12 | #' @import gamlss
 13 | #' @import gamlss.dist
 14 | #' @import broom
 15 | #' @export
 16 | swapChr22.QTL_BetaBin <- function( MeRIPdata , vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19,testWindow = 100000, Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE, AdjIPeffi = TRUE , PCsToInclude = 0 , thread = 1 ){
 17 |    
 18 |   ##check input
 19 |   if( !is(MeRIPdata, "MeRIP.Peak") ){
 20 |     stop("The input MeRIPdata needs to be an MeRIP.Peak object!")
 21 |   }else if( !nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_ip) & nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_input) ){
 22 |     stop("The peak counts matrix dimension must match the dimension of jointPeaks!")
 23 |   }
 24 |   
 25 |   ## check samples in genotype files and in MeRIP.Peak object
 26 |   tmpVcf <- tempfile(fileext = ".vcf.gz")
 27 |   system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}'| gzip > ",tmpVcf))
 28 |   tmp.vcf <-try(  read.vcfR( file =tmpVcf, verbose = F ) , silent = T)
 29 |   ############################################################################################################################################################
 30 |   ### This is to handle a wired error in the read.vcfR function. #############################################################################################
 31 |   if(class(tmp.vcf) == "try-error"){                                                                                                                       ##
 32 |     system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0}'| gzip >",tmpVcf))  ##
 33 |     tmp.vcf <-read.vcfR( file =tmpVcf, verbose = F )                                                                                                       ##
 34 |   }                                                                                                                                                         ##
 35 |   ############################################################################################################################################################
 36 |   unlink(tmpVcf) # remove the temp file to free space.
 37 |   genotypeSamples <- colnames(tmp.vcf@gt)[-c(1)]
 38 |   if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) == 0 ){
 39 |     stop("The samplenames must match in VCF file and in MeRIP.Peak object! We found no overlap between sample names in these two files!")
 40 |   }else if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) != length(samplenames(MeRIPdata) )){
 41 |     cat("The samples in the VCF don't totally match samples in the MeRIP.Peak object; ")
 42 |     cat("Only samples in MeRIP.Peak object overlapping samples in VCF file will be analyzed in QTL mapping!\nSubsetting samples...\n")
 43 |     MeRIPdata <- MeRIPtools::select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata))  )
 44 |     cat(paste0(paste(intersect(genotypeSamples,samplenames(MeRIPdata)),collapse = " "), "\n(",length(intersect(genotypeSamples,samplenames(MeRIPdata))),") samples will be analyzed!"))
 45 |   }else{
 46 |     ## make sure the order of samples aligned between phenotype and genotype
 47 |     MeRIPdata <- select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata))  )
 48 |   }
 49 |   
 50 |   
 51 |   ### Preprocess
 52 |   ##fitler out peaks with zero count
 53 |   MeRIPdata <- filter(MeRIPdata, !apply(extractInput(MeRIPdata), 1, function(x) any(x == 0 )) )
 54 |   cat("Peaks with zero read count in input data have been removed.\n")
 55 |   
 56 |   T0 <- colSums(counts(MeRIPdata)[,1:length(MeRIPdata@samplenames)] )
 57 |   T1 <- colSums(counts(MeRIPdata)[,(length(MeRIPdata@samplenames)+1) : (2*length(MeRIPdata@samplenames)) ] )
 58 |   
 59 |   ##filter out peaks with OR < 1
 60 |   enrichFlag <- apply( t( t(extractIP(MeRIPdata))/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ),1,function(x){sum(x>1)> MeRIPdata@jointPeak_threshold})
 61 |   MeRIPdata <-  filter(MeRIPdata, enrichFlag )
 62 |   cat(paste0("Peaks with odd ratio < 1 in more than ",MeRIPdata@jointPeak_threshold," samples have been removed.\n",nrow(jointPeak(MeRIPdata))," peaks remaining for QTL mapping.\n"))
 63 |   
 64 |   ## estimate IP efficiency
 65 |   OR <- t( apply(extractIP(MeRIPdata),1,.noZero)/T1 )/ t( t( extractInput(MeRIPdata) )/T0 )
 66 |   colnames(OR) <- MeRIPdata@samplenames
 67 |   logOR <- log(OR)
 68 |   
 69 |   logOR.id <- which( rowMeans(logOR) < quantile( rowMeans(logOR), 0.95 ) & rowMeans(logOR) > quantile( rowMeans(logOR), 0.05) )# remove two tails
 70 |   K_IPe_ij <- apply(logOR[logOR.id,], 2, function(x){
 71 |     
 72 |     fit <- lm(y~m, data = data.frame(y = x, m=rowMeans(logOR)[logOR.id] ))
 73 |     y.est <- predict(fit, newdata =  data.frame(m = rowMeans(logOR)))
 74 |     return( y.est - rowMeans(logOR) )
 75 |   })
 76 |   
 77 |   if(AdjustGC){
 78 |     cat("Computing GC content for peaks\n")
 79 |     ## GC content correction
 80 |     peak.gr <- .peakToGRangesList( jointPeak(MeRIPdata))
 81 |     cat("...")
 82 |     registerDoParallel( thread )
 83 |     peakGC <- foreach( i = 1:length(peak.gr), .combine = c)%dopar%{
 84 |       peakSeq <- paste( getSeq( BSgenome , peak.gr[[i]] ,as.character =T ) , collapse = "")
 85 |       sum( str_count(peakSeq, c("G","g","C","c"))  )/nchar(peakSeq)
 86 |     }
 87 |     cat("...")
 88 |     peakGC_l <- round(peakGC,digits = 2)
 89 |     peakGC_l[which(peakGC_l<0.2)] <-median(peakGC_l[which(peakGC_l<0.2)] ) # combine some bins at low GC due to low number of peaks
 90 |     peakGC_l[which(peakGC_l>0.84)] <-median(peakGC_l[which(peakGC_l>0.84)] ) # combine some bins at high GC due to low number of peaks
 91 |     l <- sort(unique(peakGC_l))
 92 |     if(AdjIPeffi){y <- (log( OR ) - K_IPe_ij)}else{y <-  log(OR) }
 93 |     colnames(y) <- MeRIPdata@samplenames
 94 |     b.l <- tapply(rowMeans( y ), peakGC_l ,  median) 
 95 |     bil <- apply( y, 2, tapply, peakGC_l, median )
 96 |     bi. <- apply(  y , 2, median )
 97 |     b.. <- median(  y )
 98 |     Fil <- as.data.frame( as.matrix(bil) - as.vector(b.l) ) - ( bi. - b.. )
 99 |     Fij <- foreach( ii = 1:length(MeRIPdata@samplenames), .combine = cbind)%dopar%{
100 |       GC_fit <- lm(Fil[,ii] ~ poly(l,4) )
101 |       predict(GC_fit, newdata = data.frame(l = peakGC) )
102 |     }
103 |     colnames(Fij) <- MeRIPdata@samplenames
104 |     cat("...\n")
105 |   }
106 |   ##Principal components
107 |   if(PCsToInclude > 0 & PCsToInclude <= length(MeRIPdata@samplenames) ){
108 |     cat("Computing Principal components.\n")
109 |     if(AdjustGC & AdjIPeffi){
110 |       PCs <- prcomp(t( (log(OR ) -  K_IPe_ij - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ) )$x
111 |     }else if( AdjustGC & !AdjIPeffi){
112 |       PCs <- prcomp(t( (log(OR ) - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x
113 |     }else if(AdjIPeffi){
114 |       PCs <- prcomp(t( (log(OR ) - K_IPe_ij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x
115 |     }else{
116 |       PCs <- prcomp( t( log(OR)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x
117 |     }
118 |   }else if(PCsToInclude > length(MeRIPdata@samplenames) ){
119 |     stop("The number of PCs needs to be no larger than the sample size!")
120 |   }
121 |  
122 |   MeRIPdata@jointPeaks$chr <- gsub("chr22","chr222",MeRIPdata@jointPeaks$chr )
123 |   MeRIPdata@jointPeaks$chr <- gsub("chr21","chr22",MeRIPdata@jointPeaks$chr )
124 |   ## set ranges on the chromosome that can be tested
125 |   con <- pipe(paste0("zcat ",vcf_file," | awk '!/^#/ {print $2}' | tail -n1"))
126 |   vcfRange <- c( read.table(gzfile(vcf_file), nrows = 1)[,2] ,
127 |                 scan( con , quiet = T ))
128 |   close(con)
129 |   ## update test Range if necessary
130 |   if(!is.null(Range)){
131 |     vcfRange <- intersect(IRanges(vcfRange[1],vcfRange[2]),IRanges(Range[1],Range[2]) )
132 |   }else{
133 |     vcfRange <- IRanges(vcfRange[1],vcfRange[2])
134 |   }
135 | 
136 |   ## parse bed12 file
137 |   peak_bed <- jointPeak(MeRIPdata)
138 |   peak_bed.gr <- makeGRangesFromDataFrame(peak_bed, keep.extra.columns = T)
139 |   test.id <- which(peak_bed$chr == Chromosome & (( peak_bed$end+testWindow )  > start(vcfRange) ) & (( peak_bed$start-testWindow )  < end(vcfRange) ) )
140 |   
141 |   peak_bed.gr <- peak_bed.gr[test.id ]
142 |   Y1 <- extractIP(MeRIPdata)[test.id,]
143 |   Y0 <- extractInput(MeRIPdata)[test.id,]
144 |   if(AdjustGC){FIj <- Fij[test.id,] }
145 |   
146 |   ## ditermine study design according to parameters
147 |   variables <-  "offset(log(T1/T0))"
148 |   if( AdjIPeffi ){ variables <- paste(variables, "offset(K_IPe_ij)", sep  = " +") }
149 |   if( AdjustGC ){ variables <- paste(variables, "offset(Fj)", sep = " +") } 
150 |   if(! is.null(Covariates)  ){ 
151 |     colnames(Covariates)
152 |     variables <-  paste(variables, paste(colnames(Covariates),collapse = " + "), sep = "+")
153 |   }
154 |   if( PCsToInclude > 0 ){
155 |     variables <-  paste(variables, paste("PC", 1:PCsToInclude, sep = "",collapse = " + "), sep = "+")
156 |   }
157 |   design <- formula( paste0("cbind(Y1i , Y0i) ~" ,"  G + ", variables) )
158 |   
159 |   cat(paste0("Start beta-binomial regression for ",length(peak_bed.gr)," peaks and SNPs in ",round(testWindow/1000,digits = 1),"kb flanking each peaks on chromosome ",Chromosome,".\n"))
160 |   if(AdjustGC){cat("Will correct sample specific GC bias\n")}
161 |   ## test each peak
162 |   startTime <- Sys.time()
163 |   registerDoParallel(thread)
164 |   testResult <- foreach( i = 1:length(peak_bed.gr), .combine = rbind  )%dopar% {
165 | 
166 |     ## get the range where SNPs are available
167 |     testRange <- GenomicRanges::intersect(IRanges(start(peak_bed.gr[i])-testWindow, end(peak_bed.gr[i])+testWindow ),vcfRange)
168 | 
169 |     ## Test association if there is SNP available for this peak
170 |     if(length(testRange)==1){
171 |       
172 |       ## Use unix command line to accesss the genotype from vcf file. This is for fast data access.
173 |       tmpVcf <- tempfile(fileext = ".vcf.gz")
174 |       system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}(!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ",tmpVcf))
175 |       geno.vcf <-try(  read.vcfR( file =tmpVcf, verbose = F ) , silent = T)
176 |       ####################################################################
177 |       ### This is to handle a wired error in the read.vcfR function.
178 |       if(class(geno.vcf) == "try-error"){
179 |         system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0} (!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip >",tmpVcf))
180 |         geno.vcf <-read.vcfR( file =tmpVcf, verbose = F )
181 |       }
182 |       ####################################################################
183 |       unlink(tmpVcf) # remove the temp file to free space.
184 |       
185 |       ## filter biallelic snps
186 |       geno.vcf <- geno.vcf[is.biallelic(geno.vcf),]
187 | 
188 |       ## get genotype as Dosage
189 |       tmp_geno <- extract.gt(geno.vcf, element = 'GT' )
190 |       geno <- t( apply( tmp_geno ,1, .genoDosage ) )
191 |       colnames(geno) <- colnames(tmp_geno)
192 |       ## filter out any genotype that has M(major)AF<0.05
193 |       MAF <- apply(geno,1,function(x) !any(table(x)>0.95*ncol(geno)) )
194 |       geno <- geno[MAF,] 
195 |       geno.vcf <- geno.vcf[MAF,] 
196 |       
197 |       if(AdjustGC){Fj <- FIj[i,]}
198 |       K_IPe_j <- K_IPe_ij[i,]
199 |       
200 |       tmp_est <- as.data.frame(matrix(nrow = nrow(geno),ncol = 4),row.names = rownames(geno) )
201 |       for( ii in 1:nrow(geno) ){
202 |         if(AdjustGC){fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j, Fj , G = geno[ii,])}else{fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j , G = geno[ii,])}
203 |         ## add PCs to data
204 |         if(PCsToInclude > 1 ){ fit_data <- cbind(fit_data,data.frame(PCs[,1:PCsToInclude]) )}else if(PCsToInclude == 1){fit_data <- cbind(fit_data,data.frame(PC1 = PCs[,1]) ) }
205 |         ## add covariates
206 |         if(! is.null(Covariates) ){fit_data <- cbind(fit_data,as.data.frame(Covariates) )  }
207 |         
208 |         fit <- try( gamlss( design ,data = fit_data , family = BB(mu.link = "logit")) )
209 |         if(class(fit)[1]!= "try-error"){
210 |           est <- tidy(fit)
211 |           tmp_est[ii,] <- data.frame(beta =  est[est$term == "G","estimate"], 
212 |                                      theta = 1/exp(est[est$parameter == "sigma","estimate"]), 
213 |                                      pvalue = est[est$term == "G","p.value"], 
214 |                                      p.theta = est[est$parameter == "sigma","p.value"] ) 
215 |         }else{
216 |             tmp_est[ii,] <- data.frame(beta = NA, theta = NA, pvalue =NA, p.theta = NA ) 
217 |           }
218 |         
219 |       }
220 |       colnames(tmp_est) <- c("beta","theta","pvalue","p.theta")
221 |         
222 |       ## calculate distance with respect to transcript(gene) strand
223 |       distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){
224 |         as.integer(geno.vcf@fix[,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2)
225 |       }else{
226 |         round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[,"POS"])
227 |       }
228 |       report <- cbind(data.frame(
229 |         SNP = paste(geno.vcf@fix[,"CHROM"],geno.vcf@fix[,"POS"],sep = ":"),
230 |         SNPID = rownames(tmp_est),
231 |         REF = geno.vcf@fix[,"REF"],
232 |         ALT = geno.vcf@fix[,"ALT"],
233 |         PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ),
234 |         DISTANCE = distance
235 |       ),tmp_est)
236 |       report[!is.na(report$beta),]
237 |     }
238 | 
239 |   }
240 |   rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
241 |   endTime <- Sys.time()
242 |   cat(paste("Time used to test association: ",round(difftime(endTime, startTime, units = "mins"),digits = 1)," mins. \n"))
243 | 
244 |   return(testResult)
245 | }
246 | 
247 | 
248 | #' @title swapChr1.QTL_BetaBin
249 | #' @description Take the phenotype on chr2 and test their association with SNPs on chr1
250 | #' @param MeRIPdata The MeRIP.Peak object
251 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!!
252 | #' @param BSgenome The BSgenome object. This needs to match the genome version of the gtf files. 
253 | #' @param testWindow Integer. Test SNPs in <testWindow> bp window flanking the peak.
254 | #' @param Chromosome The chromsome to run QTL test.
255 | #' @param Range The position range on a chromosome to test.
256 | #' @param Covariates The matrix for covariates to be included in the test.
257 | #' @import stringr
258 | #' @import vcfR
259 | #' @import BSgenome
260 | #' @import gamlss
261 | #' @import gamlss.dist
262 | #' @import broom
263 | #' @export
264 | swapChr1.QTL_BetaBin <- function( MeRIPdata , vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19,testWindow = 100000, Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE, AdjIPeffi = TRUE , PCsToInclude = 0 , thread = 1 ){
265 |   
266 |   ##check input
267 |   if( !is(MeRIPdata, "MeRIP.Peak") ){
268 |     stop("The input MeRIPdata needs to be an MeRIP.Peak object!")
269 |   }else if( !nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_ip) & nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_input) ){
270 |     stop("The peak counts matrix dimension must match the dimension of jointPeaks!")
271 |   }
272 |   
273 |   ## check samples in genotype files and in MeRIP.Peak object
274 |   tmpVcf <- tempfile(fileext = ".vcf.gz")
275 |   system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}'| gzip > ",tmpVcf))
276 |   tmp.vcf <-try(  read.vcfR( file =tmpVcf, verbose = F ) , silent = T)
277 |   ############################################################################################################################################################
278 |   ### This is to handle a wired error in the read.vcfR function. #############################################################################################
279 |   if(class(tmp.vcf) == "try-error"){                                                                                                                       ##
280 |     system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0}'| gzip >",tmpVcf))  ##
281 |     tmp.vcf <-read.vcfR( file =tmpVcf, verbose = F )                                                                                                       ##
282 |   }                                                                                                                                                         ##
283 |   ############################################################################################################################################################
284 |   unlink(tmpVcf) # remove the temp file to free space.
285 |   genotypeSamples <- colnames(tmp.vcf@gt)[-c(1)]
286 |   if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) == 0 ){
287 |     stop("The samplenames must match in VCF file and in MeRIP.Peak object! We found no overlap between sample names in these two files!")
288 |   }else if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) != length(samplenames(MeRIPdata) )){
289 |     cat("The samples in the VCF don't totally match samples in the MeRIP.Peak object; ")
290 |     cat("Only samples in MeRIP.Peak object overlapping samples in VCF file will be analyzed in QTL mapping!\nSubsetting samples...\n")
291 |     MeRIPdata <- MeRIPtools::select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata))  )
292 |     cat(paste0(paste(intersect(genotypeSamples,samplenames(MeRIPdata)),collapse = " "), "\n(",length(intersect(genotypeSamples,samplenames(MeRIPdata))),") samples will be analyzed!"))
293 |   }else{
294 |     ## make sure the order of samples aligned between phenotype and genotype
295 |     MeRIPdata <- select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata))  )
296 |   }
297 |   
298 |   
299 |   ### Preprocess
300 |   ##fitler out peaks with zero count
301 |   MeRIPdata <- filter(MeRIPdata, !apply(extractInput(MeRIPdata), 1, function(x) any(x == 0 )) )
302 |   cat("Peaks with zero read count in input data have been removed.\n")
303 |   
304 |   T0 <- colSums(counts(MeRIPdata)[,1:length(MeRIPdata@samplenames)] )
305 |   T1 <- colSums(counts(MeRIPdata)[,(length(MeRIPdata@samplenames)+1) : (2*length(MeRIPdata@samplenames)) ] )
306 |   
307 |   ##filter out peaks with OR < 1
308 |   enrichFlag <- apply( t( t(extractIP(MeRIPdata))/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ),1,function(x){sum(x>1)> MeRIPdata@jointPeak_threshold})
309 |   MeRIPdata <-  filter(MeRIPdata, enrichFlag )
310 |   cat(paste0("Peaks with odd ratio < 1 in more than ",MeRIPdata@jointPeak_threshold," samples have been removed.\n",nrow(jointPeak(MeRIPdata))," peaks remaining for QTL mapping.\n"))
311 |   
312 |   ## estimate IP efficiency
313 |   OR <- t( apply(extractIP(MeRIPdata),1,.noZero)/T1 )/ t( t( extractInput(MeRIPdata) )/T0 )
314 |   colnames(OR) <- MeRIPdata@samplenames
315 |   logOR <- log(OR)
316 |   
317 |   logOR.id <- which( rowMeans(logOR) < quantile( rowMeans(logOR), 0.95 ) & rowMeans(logOR) > quantile( rowMeans(logOR), 0.05) )# remove two tails
318 |   K_IPe_ij <- apply(logOR[logOR.id,], 2, function(x){
319 |     
320 |     fit <- lm(y~m, data = data.frame(y = x, m=rowMeans(logOR)[logOR.id] ))
321 |     y.est <- predict(fit, newdata =  data.frame(m = rowMeans(logOR)))
322 |     return( y.est - rowMeans(logOR) )
323 |   })
324 |   
325 |   if(AdjustGC){
326 |     cat("Computing GC content for peaks\n")
327 |     ## GC content correction
328 |     peak.gr <- .peakToGRangesList( jointPeak(MeRIPdata))
329 |     cat("...")
330 |     registerDoParallel( thread )
331 |     peakGC <- foreach( i = 1:length(peak.gr), .combine = c)%dopar%{
332 |       peakSeq <- paste( getSeq( BSgenome , peak.gr[[i]] ,as.character =T ) , collapse = "")
333 |       sum( str_count(peakSeq, c("G","g","C","c"))  )/nchar(peakSeq)
334 |     }
335 |     cat("...")
336 |     peakGC_l <- round(peakGC,digits = 2)
337 |     peakGC_l[which(peakGC_l<0.2)] <-median(peakGC_l[which(peakGC_l<0.2)] ) # combine some bins at low GC due to low number of peaks
338 |     peakGC_l[which(peakGC_l>0.84)] <-median(peakGC_l[which(peakGC_l>0.84)] ) # combine some bins at high GC due to low number of peaks
339 |     l <- sort(unique(peakGC_l))
340 |     if(AdjIPeffi){y <- (log( OR ) - K_IPe_ij)}else{y <-  log(OR) }
341 |     colnames(y) <- MeRIPdata@samplenames
342 |     b.l <- tapply(rowMeans( y ), peakGC_l ,  median) 
343 |     bil <- apply( y, 2, tapply, peakGC_l, median )
344 |     bi. <- apply(  y , 2, median )
345 |     b.. <- median(  y )
346 |     Fil <- as.data.frame( as.matrix(bil) - as.vector(b.l) ) - ( bi. - b.. )
347 |     Fij <- foreach( ii = 1:length(MeRIPdata@samplenames), .combine = cbind)%dopar%{
348 |       GC_fit <- lm(Fil[,ii] ~ poly(l,4) )
349 |       predict(GC_fit, newdata = data.frame(l = peakGC) )
350 |     }
351 |     colnames(Fij) <- MeRIPdata@samplenames
352 |     cat("...\n")
353 |   }
354 |   ##Principal components
355 |   if(PCsToInclude > 0 & PCsToInclude <= length(MeRIPdata@samplenames) ){
356 |     cat("Computing Principal components.\n")
357 |     if(AdjustGC & AdjIPeffi){
358 |       PCs <- prcomp(t( (log(OR ) -  K_IPe_ij - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ) )$x
359 |     }else if( AdjustGC & !AdjIPeffi){
360 |       PCs <- prcomp(t( (log(OR ) - Fij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x
361 |     }else if(AdjIPeffi){
362 |       PCs <- prcomp(t( (log(OR ) - K_IPe_ij)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x
363 |     }else{
364 |       PCs <- prcomp( t( log(OR)[apply(extractIP(MeRIPdata),1,function(x) all(x!=0)),] ))$x
365 |     }
366 |   }else if(PCsToInclude > length(MeRIPdata@samplenames) ){
367 |     stop("The number of PCs needs to be no larger than the sample size!")
368 |   }
369 |   
370 |   MeRIPdata@jointPeaks$chr <- gsub("chr1","chr111",MeRIPdata@jointPeaks$chr )
371 |   MeRIPdata@jointPeaks$chr <- gsub("chr2","chr1",MeRIPdata@jointPeaks$chr )
372 |   ## set ranges on the chromosome that can be tested
373 |   con <- pipe(paste0("zcat ",vcf_file," | awk '!/^#/ {print $2}' | tail -n1"))
374 |   vcfRange <- c( read.table(gzfile(vcf_file), nrows = 1)[,2] ,
375 |                  scan( con , quiet = T ))
376 |   close(con)
377 |   ## update test Range if necessary
378 |   if(!is.null(Range)){
379 |     vcfRange <- intersect(IRanges(vcfRange[1],vcfRange[2]),IRanges(Range[1],Range[2]) )
380 |   }else{
381 |     vcfRange <- IRanges(vcfRange[1],vcfRange[2])
382 |   }
383 |   
384 |   ## parse bed12 file
385 |   peak_bed <- jointPeak(MeRIPdata)
386 |   peak_bed.gr <- makeGRangesFromDataFrame(peak_bed, keep.extra.columns = T)
387 |   test.id <- which(peak_bed$chr == Chromosome & (( peak_bed$end+testWindow )  > start(vcfRange) ) & (( peak_bed$start-testWindow )  < end(vcfRange) ) )
388 |   
389 |   peak_bed.gr <- peak_bed.gr[test.id ]
390 |   Y1 <- extractIP(MeRIPdata)[test.id,]
391 |   Y0 <- extractInput(MeRIPdata)[test.id,]
392 |   if(AdjustGC){FIj <- Fij[test.id,] }
393 |   
394 |   ## ditermine study design according to parameters
395 |   variables <-  "offset(log(T1/T0))"
396 |   if( AdjIPeffi ){ variables <- paste(variables, "offset(K_IPe_j)", sep  = " +") }
397 |   if( AdjustGC ){ variables <- paste(variables, "offset(Fj)", sep = " +") } 
398 |   if(! is.null(Covariates)  ){ 
399 |     colnames(Covariates)
400 |     variables <-  paste(variables, paste(colnames(Covariates),collapse = " + "), sep = "+")
401 |   }
402 |   if( PCsToInclude > 0 ){
403 |     variables <-  paste(variables, paste("PC", 1:PCsToInclude, sep = "",collapse = " + "), sep = "+")
404 |   }
405 |   design <- formula( paste0("cbind(Y1i , Y0i) ~" ,"  G + ", variables) )
406 |   
407 |   cat(paste0("Start beta-binomial regression for ",length(peak_bed.gr)," peaks and SNPs in ",round(testWindow/1000,digits = 1),"kb flanking each peaks on chromosome ",Chromosome,".\n"))
408 |   if(AdjustGC){cat("Will correct sample specific GC bias\n")}
409 |   ## test each peak
410 |   startTime <- Sys.time()
411 |   registerDoParallel(thread)
412 |   testResult <- foreach( i = 1:length(peak_bed.gr), .combine = rbind  )%dopar% {
413 |     
414 |     ## get the range where SNPs are available
415 |     testRange <- GenomicRanges::intersect(IRanges(start(peak_bed.gr[i])-testWindow, end(peak_bed.gr[i])+testWindow ),vcfRange)
416 |     
417 |     ## Test association if there is SNP available for this peak
418 |     if(length(testRange)==1){
419 |       
420 |       ## Use unix command line to accesss the genotype from vcf file. This is for fast data access.
421 |       tmpVcf <- tempfile(fileext = ".vcf.gz")
422 |       system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}(!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip > ",tmpVcf))
423 |       geno.vcf <-try(  read.vcfR( file =tmpVcf, verbose = F ) , silent = T)
424 |       ####################################################################
425 |       ### This is to handle a wired error in the read.vcfR function.
426 |       if(class(geno.vcf) == "try-error"){
427 |         system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0} (!/^#/ && $2 > ",start(testRange)," && $2 < ",end(testRange)," ) {print $0}'| gzip >",tmpVcf))
428 |         geno.vcf <-read.vcfR( file =tmpVcf, verbose = F )
429 |       }
430 |       ####################################################################
431 |       unlink(tmpVcf) # remove the temp file to free space.
432 |       
433 |       ## filter biallelic snps
434 |       geno.vcf <- geno.vcf[is.biallelic(geno.vcf),]
435 |       
436 |       ## get genotype as Dosage
437 |       tmp_geno <- extract.gt(geno.vcf, element = 'GT' )
438 |       geno <- t( apply( tmp_geno ,1, .genoDosage ) )
439 |       colnames(geno) <- colnames(tmp_geno)
440 |       ## filter out any genotype that has M(major)AF<0.05
441 |       MAF <- apply(geno,1,function(x) !any(table(x)>0.95*ncol(geno)) )
442 |       geno <- geno[MAF,] 
443 |       geno.vcf <- geno.vcf[MAF,] 
444 |       
445 |       if(AdjustGC){Fj <- FIj[i,]}
446 |       K_IPe_j <- K_IPe_ij[i,]
447 |       
448 |       tmp_est <- as.data.frame(matrix(nrow = nrow(geno),ncol = 4),row.names = rownames(geno) )
449 |       for( ii in 1:nrow(geno) ){
450 |         if(AdjustGC){fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j, Fj , G = geno[ii,])}else{fit_data <- data.frame(Y0i = Y0[i,], Y1i = Y1[i,], T1, T0, K_IPe_j , G = geno[ii,])}
451 |         ## add PCs to data
452 |         if(PCsToInclude > 1 ){ fit_data <- cbind(fit_data,data.frame(PCs[,1:PCsToInclude]) )}else if(PCsToInclude == 1){fit_data <- cbind(fit_data,data.frame(PC1 = PCs[,1]) ) }
453 |         ## add covariates
454 |         if(! is.null(Covariates) ){fit_data <- cbind(fit_data,as.data.frame(Covariates) )  }
455 |         
456 |         fit <- try( gamlss( design ,data = fit_data , family = BB(mu.link = "logit")) )
457 |         if(class(fit)[1]!= "try-error"){
458 |           est <- tidy(fit)
459 |           tmp_est[ii,] <- data.frame(beta =  est[est$term == "G","estimate"], 
460 |                                      theta = 1/exp(est[est$parameter == "sigma","estimate"]), 
461 |                                      pvalue = est[est$term == "G","p.value"], 
462 |                                      p.theta = est[est$parameter == "sigma","p.value"] ) 
463 |         }else{
464 |           tmp_est[ii,] <- data.frame(beta = NA, theta = NA, pvalue =NA, p.theta = NA ) 
465 |         }
466 |         
467 |       }
468 |       colnames(tmp_est) <- c("beta","theta","pvalue","p.theta")
469 |       
470 |       ## calculate distance with respect to transcript(gene) strand
471 |       distance <- if(as.character(strand(peak_bed.gr[i])) == "+"){
472 |         as.integer(geno.vcf@fix[,"POS"])- round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2)
473 |       }else{
474 |         round((peak_bed.gr[i]$thickStart+peak_bed.gr[i]$thickEnd)/2) - as.integer(geno.vcf@fix[,"POS"])
475 |       }
476 |       report <- cbind(data.frame(
477 |         SNP = paste(geno.vcf@fix[,"CHROM"],geno.vcf@fix[,"POS"],sep = ":"),
478 |         SNPID = rownames(tmp_est),
479 |         REF = geno.vcf@fix[,"REF"],
480 |         ALT = geno.vcf@fix[,"ALT"],
481 |         PEAK = paste0(Chromosome,":",peak_bed.gr[i]$thickStart,"-",peak_bed.gr[i]$thickEnd,"_",peak_bed.gr[i]$name,"_",strand( peak_bed.gr[i]) ),
482 |         DISTANCE = distance
483 |       ),tmp_est)
484 |       report[!is.na(report$beta),]
485 |     }
486 |     
487 |   }
488 |   rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
489 |   endTime <- Sys.time()
490 |   cat(paste("Time used to test association: ",round(difftime(endTime, startTime, units = "mins"),digits = 1)," mins. \n"))
491 |   
492 |   return(testResult)
493 | }
494 | 
495 | 
496 | ## Helper function to convert genotype into dosage. 
497 | .genoDosage <- function(x){
498 |   return( stringr::str_count(x,"1") )
499 | }
500 | 
501 | 


--------------------------------------------------------------------------------
/R/RcppExports.R:
--------------------------------------------------------------------------------
  1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand
  2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
  3 | 
  4 | gradient_beta_psi_mu2 <- function(Y, X, beta, psi, mu2, n) {
  5 |     .Call('_MeRIPtools_gradient_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, n)
  6 | }
  7 | 
  8 | log_factorial <- function(Y) {
  9 |     .Call('_MeRIPtools_log_factorial', PACKAGE = 'MeRIPtools', Y)
 10 | }
 11 | 
 12 | log_factorial_calculated <- function(N) {
 13 |     .Call('_MeRIPtools_log_factorial_calculated', PACKAGE = 'MeRIPtools', N)
 14 | }
 15 | 
 16 | LogLikelihood_beta_psi_mu2 <- function(Y, X, beta, psi, mu2, n, sum_log_factorial_Y) {
 17 |     .Call('_MeRIPtools_LogLikelihood_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, n, sum_log_factorial_Y)
 18 | }
 19 | 
 20 | select_stepsize_for_beta <- function(Y, X, gra_beta, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down) {
 21 |     .Call('_MeRIPtools_select_stepsize_for_beta', PACKAGE = 'MeRIPtools', Y, X, gra_beta, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down)
 22 | }
 23 | 
 24 | select_stepsize_for_psi <- function(Y, X, gra_psi, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down, psi_cutoff) {
 25 |     .Call('_MeRIPtools_select_stepsize_for_psi', PACKAGE = 'MeRIPtools', Y, X, gra_psi, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down, psi_cutoff)
 26 | }
 27 | 
 28 | select_stepsize_for_mu2 <- function(Y, X, gra_mu2, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down) {
 29 |     .Call('_MeRIPtools_select_stepsize_for_mu2', PACKAGE = 'MeRIPtools', Y, X, gra_mu2, ll, beta, psi, mu2, n, gamma, sum_log_factorial_Y, down)
 30 | }
 31 | 
 32 | gradient_descent_beta_psi_mu2 <- function(Y, X, beta, psi, mu2, gamma, steps, sum_log_factorial_Y, down, psi_cutoff) {
 33 |     .Call('_MeRIPtools_gradient_descent_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, sum_log_factorial_Y, down, psi_cutoff)
 34 | }
 35 | 
 36 | gradient_descent_alt <- function(Y, X, psi, mu2, gamma, steps, sum_log_factorial_Y, down, psi_cutoff) {
 37 |     .Call('_MeRIPtools_gradient_descent_alt', PACKAGE = 'MeRIPtools', Y, X, psi, mu2, gamma, steps, sum_log_factorial_Y, down, psi_cutoff)
 38 | }
 39 | 
 40 | PoissionGamma <- function(Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff) {
 41 |     .Call('_MeRIPtools_PoissionGamma', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff)
 42 | }
 43 | 
 44 | Fisher_information_one_beta <- function(Y, X, beta, psi, mu2, n) {
 45 |     .Call('_MeRIPtools_Fisher_information_one_beta', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, n)
 46 | }
 47 | 
 48 | PoissionGamma_FISHER <- function(Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff) {
 49 |     .Call('_MeRIPtools_PoissionGamma_FISHER', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff)
 50 | }
 51 | 
 52 | gradient_and_LogLikelihood_for_individual_sample <- function(Y, W, V, WY, WWY, W3Y, W4Y, VY, VVY, V3Y, V4Y, WW, W3, W4, VV, V3, V4, a0, a1, a2, a3, a4, b1, b2, b3, b4, n, sum_log_factorial_Y) {
 53 |     .Call('_MeRIPtools_gradient_and_LogLikelihood_for_individual_sample', PACKAGE = 'MeRIPtools', Y, W, V, WY, WWY, W3Y, W4Y, VY, VVY, V3Y, V4Y, WW, W3, W4, VV, V3, V4, a0, a1, a2, a3, a4, b1, b2, b3, b4, n, sum_log_factorial_Y)
 54 | }
 55 | 
 56 | LogLikelihood_for_individual_sample <- function(Y, W, V, WW, VV, W3, V3, W4, V4, a0, a1, a2, a3, a4, b1, b2, b3, b4, n, sum_log_factorial_Y) {
 57 |     .Call('_MeRIPtools_LogLikelihood_for_individual_sample', PACKAGE = 'MeRIPtools', Y, W, V, WW, VV, W3, V3, W4, V4, a0, a1, a2, a3, a4, b1, b2, b3, b4, n, sum_log_factorial_Y)
 58 | }
 59 | 
 60 | select_stepsize_for_a_parameter <- function(Y, W, V, WW, VV, W3, V3, W4, V4, ll, sum_log_factorial_Y, gradient, parameters, ind, gamma, n, down) {
 61 |     .Call('_MeRIPtools_select_stepsize_for_a_parameter', PACKAGE = 'MeRIPtools', Y, W, V, WW, VV, W3, V3, W4, V4, ll, sum_log_factorial_Y, gradient, parameters, ind, gamma, n, down)
 62 | }
 63 | 
 64 | gradient_descent_for_individual_sample <- function(Y, W, V, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down) {
 65 |     .Call('_MeRIPtools_gradient_descent_for_individual_sample', PACKAGE = 'MeRIPtools', Y, W, V, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down)
 66 | }
 67 | 
 68 | coordinate_descent_for_individual_sample <- function(Y, W, V, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down) {
 69 |     .Call('_MeRIPtools_coordinate_descent_for_individual_sample', PACKAGE = 'MeRIPtools', Y, W, V, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down)
 70 | }
 71 | 
 72 | gradient_multiple_beta_psi_mu2 <- function(Y, X, Xbeta, psi, mu2, n, k) {
 73 |     .Call('_MeRIPtools_gradient_multiple_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, Xbeta, psi, mu2, n, k)
 74 | }
 75 | 
 76 | LogLikelihood_multiple_beta_psi_mu2 <- function(Y, Xbeta, psi, mu2, n, sum_log_factorial_Y) {
 77 |     .Call('_MeRIPtools_LogLikelihood_multiple_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, Xbeta, psi, mu2, n, sum_log_factorial_Y)
 78 | }
 79 | 
 80 | select_stepsize_for_multiple_beta <- function(Y, X, betas, Xbeta, ind, m_gra_beta, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down) {
 81 |     .Call('_MeRIPtools_select_stepsize_for_multiple_beta', PACKAGE = 'MeRIPtools', Y, X, betas, Xbeta, ind, m_gra_beta, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down)
 82 | }
 83 | 
 84 | select_stepsize_for_psi_with_multiple_beta <- function(Y, Xbeta, gra_psi, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down, psi_cutoff) {
 85 |     .Call('_MeRIPtools_select_stepsize_for_psi_with_multiple_beta', PACKAGE = 'MeRIPtools', Y, Xbeta, gra_psi, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down, psi_cutoff)
 86 | }
 87 | 
 88 | select_stepsize_for_mu2_with_multiple_beta <- function(Y, Xbeta, gra_mu2, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down) {
 89 |     .Call('_MeRIPtools_select_stepsize_for_mu2_with_multiple_beta', PACKAGE = 'MeRIPtools', Y, Xbeta, gra_mu2, ll, psi, mu2, n, gamma, sum_log_factorial_Y, down)
 90 | }
 91 | 
 92 | gradient_descent_multiple_beta_psi_mu2 <- function(Y, X, beta, psi, mu2, gamma, steps, sum_log_factorial_Y, n, k, down, psi_cutoff) {
 93 |     .Call('_MeRIPtools_gradient_descent_multiple_beta_psi_mu2', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, sum_log_factorial_Y, n, k, down, psi_cutoff)
 94 | }
 95 | 
 96 | Fisher_information <- function(Y, X, Xbeta, psi, mu2, n, k) {
 97 |     .Call('_MeRIPtools_Fisher_information', PACKAGE = 'MeRIPtools', Y, X, Xbeta, psi, mu2, n, k)
 98 | }
 99 | 
100 | PoissionGamma_multiple_beta <- function(Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff) {
101 |     .Call('_MeRIPtools_PoissionGamma_multiple_beta', PACKAGE = 'MeRIPtools', Y, X, beta, psi, mu2, gamma, steps, down, psi_cutoff)
102 | }
103 | 
104 | 


--------------------------------------------------------------------------------
/R/countReads.R:
--------------------------------------------------------------------------------
  1 | ## main function that takes a gtf file and bam files to count read for continuous windows
  2 | ### The function requires for each sample, input and IP named for the same prefix (filename). input and IP each have postfix of input.bam/m6A.bam
  3 | #### If multiple IP sample used a shared input library, sharedInput can be specified.
  4 | #' @title countReads
  5 | #' @description This is the very first function in MeRIP-seq data analysis that initianize a `MeRIP` object. This function takes BAM files of Input/IP library of each samples as input and use given GTF file as gene annotation to divide genes into consecutive bins of user defined size.  
  6 | #' @param samplenames The names of each sample (prefix for bam files).
  7 | #' @param gtf The gtf format gene annotation file
  8 | #' @param fragmentLength The RNA fragment length (insert size of the library).
  9 | #' @param modification The modification used to name the BAM files.
 10 | #' @param bamFolder Path to the folder where bam file locates
 11 | #' @param binSize The size of consecutive bins to slice the transcripts
 12 | #' @param threads The number of threads to use for hyperthreading
 13 | #' @param strandToKeep According to library preparation protocol, choose which strand to count. Stranded RNA library usually seq the "ooposite" strand. Small RNA library seq the "same" strand.
 14 | #' @param outputDir The directory to save output files
 15 | #' @param saveOutput Logical option indicating whether to save output as an RDS file.
 16 | #' @param paired Logical indicating whether the input bam files are from paired end sequencing. Default is FALSE. If using paired end data, the read length will be estimated from the data and only good mate are counted.
 17 | #' @export
 18 | countReads<-function(
 19 |   samplenames,# file name of samples
 20 |   gtf, # gtf file used for peak calling
 21 |   fragmentLength = 150,
 22 |   bamFolder,
 23 |   outputDir=NA,
 24 |   modification = "m6A",
 25 |   binSize = 50,
 26 |   strandToKeep = "opposite",
 27 |   paired = FALSE,
 28 |   threads = 1,
 29 |   saveOutput = FALSE
 30 | ){
 31 | 
 32 |   ##read bam files
 33 |   bamPath.input = paste(bamFolder,"/",samplenames,".input.bam",sep="")
 34 |   bamPath.IP = paste(bamFolder,"/",samplenames,".",modification,".bam",sep="")
 35 |   no.samples = length(samplenames)
 36 | 
 37 |   ## Check for missing files and index bam files
 38 |   if( !all(file.exists(bamPath.input)) ) stop( "input bam file missing!!!" )
 39 |   if( !all(file.exists(bamPath.IP)) ) stop( "IP bam file missing!!!" )
 40 |   num_bam_files <- length(bamPath.input)
 41 |   for (ibam in 1:num_bam_files) {
 42 |     inputfile = bamPath.input[ibam]
 43 |     IPfile = bamPath.IP[ibam]
 44 |     if (! file.exists(paste(inputfile,'.bai',sep=""))) {
 45 |       print(paste("Stage: index bam file", inputfile))
 46 |       indexBam(inputfile)
 47 |     }
 48 |     if (! file.exists(paste(IPfile,'.bai',sep=""))) {
 49 |       print(paste("Stage: index bam file", IPfile))
 50 |       indexBam(IPfile)
 51 |     }
 52 |   }
 53 | 
 54 | 
 55 |   
 56 |   ## This step removes ambiguous annotations and returns gene model
 57 |   cat("Reading gtf file to obtain gene model\nFilter out ambiguous model...\n")
 58 |   geneGRList = gtfToGeneModel(gtf) #get the gene model in GRList with only single chromosome and strand.
 59 |   cat("Gene model obtained from gtf file...\n")
 60 | 
 61 |   ## Check BAM headers and remove chr in geneModel that is not in BAM file. 
 62 |   bamHeader <- scanBamHeader(bamPath.input, what=c("targets") )
 63 |   seqLevels <- unique( unlist( lapply( bamHeader, function(x) names( x$targets) ) ) )
 64 |   geneGRList <- geneGRList[ unlist( runValue( seqnames( geneGRList ) ) ) %in% seqLevels ]
 65 |   
 66 |   
 67 |   no.genes=length(geneGRList)## define number of genes
 68 | 
 69 |   cat("counting reads for each genes, this step may takes a few hours....\n")
 70 |   start_time <- Sys.time()
 71 |   registerDoParallel( cores = threads)
 72 |   cat(paste("Hyper-thread registered:",getDoParRegistered(),"\n"))
 73 |   cat(paste("Using",getDoParWorkers(),"thread(s) to count reads in continuous bins...\n"))
 74 |   reads <- foreach(i = 1:no.genes, .combine = rbind) %dopar%{
 75 | 
 76 |     geneName = names(geneGRList)[i]
 77 |     geneModel =reduce( geneGRList[geneName][[1]] )## merge overlapping exons
 78 | 
 79 |     # DNA location to gene location conversion
 80 |     df.geneModel= as.data.frame(geneModel) ##data frame of gene model
 81 |     dna.range = as.data.frame(range(geneModel))
 82 |     df.geneModel$end = df.geneModel$end - dna.range$start + 1
 83 |     df.geneModel$start = df.geneModel$start - dna.range$start + 1
 84 |     DNA2RNA = rep(0,dna.range$end - dna.range$start +1)
 85 |     no.exon = dim(df.geneModel)[1]
 86 |     for (j in 1:no.exon){DNA2RNA[df.geneModel$start[j]:df.geneModel$end[j]]=1}
 87 |     exon.length = sum(DNA2RNA)
 88 |     DNA2RNA=cumsum(DNA2RNA)*DNA2RNA
 89 | 
 90 |     ## skip any gene with smaller than 200bp transcript
 91 |     if(exon.length < 200) {return(NULL)}
 92 | 
 93 |     #creat a corresponding map from RNA to DNA
 94 |     #RNA2DNA = 1:exon.length
 95 |     #pointer = 1
 96 |     #for (j in 1:no.exon){
 97 |     #  RNA2DNA[pointer:(pointer+df.geneModel$width[j]-1) ]= RNA2DNA[pointer:(pointer+df.geneModel$width[j]-1)] + df.geneModel$start[j] -pointer
 98 |     #  pointer = pointer + df.geneModel$width[j]
 99 |     #}
100 |     #RNA2DNA = RNA2DNA + dna.range$start -1 #back to chromosome coordinates
101 | 
102 |     ## switch strand because stranded RNA library protocol sequence reverse strand
103 |     if(strandToKeep == "opposite"){
104 |       reads.strand = character()
105 |       if(dna.range$strand == "+"){reads.strand = "-"}else{reads.strand = "+"} ## switch strand on RNA reads for Truseq protocol
106 |     }else if(strandToKeep == "same"){
107 |       reads.strand = as.character(dna.range$strand)
108 |     }else{
109 |       cat("Currently m6Amonter only support strand specific RNA-seq data.\nCounting reads at opposite strand by defalt...\n")
110 |       reads.strand = character()
111 |       if(dna.range$strand == "+"){reads.strand = "-"}else{reads.strand = "+"}
112 |     }
113 | 
114 |     #create start points of continuous window
115 |     if(exon.length <= binSize){
116 |       slidingStart = 1
117 |     }else{
118 |       ## use the 3' end terminal bin as a elastic-size bin
119 |       if(dna.range$strand == "+"){
120 |         slidingStart = seq(from = 1, to = ( exon.length - binSize - exon.length %% binSize + 1) , length.out = floor(exon.length/binSize) ) 
121 |       }else{ # make the first bin elastic bin if a gene is on reverse strand
122 |         slidingStart = c(1, seq(from = binSize + exon.length %% binSize + 1, to = ( exon.length - binSize + 1) , length.out = floor(exon.length/binSize) - 1 )  )
123 |       }
124 |     }
125 | 
126 | 
127 |     #count reads in all samples
128 |     ba.IP = sapply(bamPath.IP,.countReadFromBam,which = range(geneModel),reads.strand = reads.strand,DNA2RNA = DNA2RNA,fragmentLength=fragmentLength,left=dna.range$start,sliding = slidingStart, binSize = binSize, paired = paired)
129 |     ba.input = sapply(bamPath.input,.countReadFromBam,which = range(geneModel),reads.strand = reads.strand,DNA2RNA = DNA2RNA,fragmentLength=fragmentLength,left=dna.range$start,sliding = slidingStart, binSize = binSize, paired = paired)
130 | 
131 |     if(is.vector(ba.IP) ){# if there is only one window for this gene, make it a matrix to avoid bug
132 |       ba.IP = matrix(ba.IP, nrow = 1 )
133 |       ba.input = matrix( ba.input, nrow = 1 )
134 |     }
135 |     ba.counts <- cbind(ba.input,ba.IP)
136 |     rownames(ba.counts) <-  paste(geneName,slidingStart,sep = ",")
137 | 
138 |     ba.counts
139 |   }
140 |   rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
141 |   end_time <- Sys.time()
142 |   cat(paste("Time used to count reads:",difftime(end_time, start_time, units = "mins"),"mins... \n"))
143 | 
144 |   colnames(reads) <- c(paste(samplenames,"input",sep = "-"),paste(samplenames,"IP",sep = "-"))
145 | 
146 | 
147 |   data.out <- MeRIP(reads = reads, binSize = binSize, gtf = gtf, geneModel = geneGRList, bamPath.input = bamPath.input, bamPath.ip = bamPath.IP, samplenames = samplenames)
148 |   if(saveOutput){
149 |     ## create output directory
150 |     dir.create(outputDir, showWarnings = FALSE, recursive = TRUE)
151 |     saveRDS(data.out,paste0(outputDir,"/MeRIP_readCounts.RDS"))
152 |   }
153 | 
154 | 
155 |   return(data.out)
156 | }
157 | 


--------------------------------------------------------------------------------
/R/countReadsCarRNA.R:
--------------------------------------------------------------------------------
  1 | #' @title countReadsCarRNA
  2 | #' @description This is the very first function in MeRIP-seq data analysis that initianize a `MeRIP` object. This function takes BAM files of Input/IP library of each samples as input and use given GTF file as gene annotation to divide genes into consecutive bins of user defined size.  
  3 | #' @param samplenames The names of each sample (prefix for bam files).
  4 | #' @param gtf The gtf format gene annotation file
  5 | #' @param fragmentLength The RNA fragment length (insert size of the library).
  6 | #' @param modification The modification used to name the BAM files.
  7 | #' @param bamFolder Path to the folder where bam file locates
  8 | #' @param binSize The size of consecutive bins to slice the transcripts
  9 | #' @param nonCodingAnnotation The annotation file or files for non-coding RNA in SAF format. Can be a character or vector of path(s) to the SAF file(s). The default is NA.
 10 | #' @param threads The number of threads to use for hyperthreading
 11 | #' @param strandToKeep According to library preparation protocol, choose which strand to count. Stranded RNA library usually seq the "ooposite" strand. Small RNA library seq the "same" strand.
 12 | #' @param outputDir The directory to save output files
 13 | #' @param saveOutput Logical option indicating whether to save output as an RDS file.
 14 | #' @param paired Logical indicating whether the input bam files are from paired end sequencing. Default is FALSE. If using paired end data, the read length will be estimated from the data and only good mate are counted.
 15 | #' @export
 16 | countReadsCarRNA<-function(
 17 |   samplenames,# file name of samples
 18 |   gtf, # gtf file used for peak calling
 19 |   fragmentLength = 150,
 20 |   bamFolder,
 21 |   nonCodingAnnotation = NA,
 22 |   outputDir=NA,
 23 |   modification = "m6A",
 24 |   binSize = 50,
 25 |   strandToKeep = "opposite",
 26 |   paired = FALSE,
 27 |   threads = 1,
 28 |   saveOutput = FALSE
 29 | ){
 30 |   
 31 |   ##read bam files
 32 |   bamPath.input = paste(bamFolder,"/",samplenames,".input.bam",sep="")
 33 |   bamPath.IP = paste(bamFolder,"/",samplenames,".",modification,".bam",sep="")
 34 |   no.samples = length(samplenames)
 35 |   
 36 |   ## Check for missing files and index bam files
 37 |   if( !all(file.exists(bamPath.input)) ) stop( "input bam file missing!!!" )
 38 |   if( !all(file.exists(bamPath.IP)) ) stop( "IP bam file missing!!!" )
 39 |   num_bam_files <- length(bamPath.input)
 40 |   for (ibam in 1:num_bam_files) {
 41 |     inputfile = bamPath.input[ibam]
 42 |     IPfile = bamPath.IP[ibam]
 43 |     if (! file.exists(paste(inputfile,'.bai',sep=""))) {
 44 |       print(paste("Stage: index bam file", inputfile))
 45 |       indexBam(inputfile)
 46 |     }
 47 |     if (! file.exists(paste(IPfile,'.bai',sep=""))) {
 48 |       print(paste("Stage: index bam file", IPfile))
 49 |       indexBam(IPfile)
 50 |     }
 51 |   }
 52 |   
 53 |   
 54 |   ## This step removes ambiguous annotations and returns gene model
 55 |   cat("Reading gtf file to obtain gene model\nFilter out ambiguous model...\n")
 56 |   geneGRListFromGTF = gtfToGeneModel(gtf) #get the gene model in GRList with only single chromosome and strand.
 57 |   cat("Gene model obtained from gtf file...\n")
 58 |   geneGRList.wholeGene <- range(geneGRListFromGTF) # collpase exons into one transcript
 59 |   
 60 |   ## Build model for non-coding RNAs if annotation is supplied.
 61 |   if( any( ! is.na( nonCodingAnnotation) ) ){
 62 |     cat( "Non-coding RNA model obtained from SAF file...\n" )
 63 |     if( length(nonCodingAnnotation) ==1 ){
 64 |       nonCodingAnnotation.saf <- read.table(nonCodingAnnotation, sep = "\t", header = TRUE, col.names = c("GeneID", "Chr", "Start", "End","Strand") )
 65 |     }else{
 66 |       nonCodingAnnotation.saf <- foreach( ann = nonCodingAnnotation, .combine = rbind )%do%{ read.table( ann, sep = "\t", header = TRUE, col.names = c("GeneID", "Chr", "Start", "End","Strand") ) }
 67 |     }
 68 |     
 69 |     geneGRList.nonCoding <- makeGRangesListFromDataFrame(nonCodingAnnotation.saf, names.field = "GeneID")
 70 |     names(geneGRList.nonCoding) <- nonCodingAnnotation.saf$GeneID
 71 |     
 72 |     geneGRListCombine <- c( geneGRList.wholeGene, geneGRList.nonCoding )
 73 |   }else{
 74 |     geneGRListCombine <- geneGRList.wholeGene
 75 |   }
 76 |   
 77 |   ## Check BAM headers and remove chr in geneModel that is not in BAM file. 
 78 |   bamHeader <- scanBamHeader(bamPath.input, what=c("targets") )
 79 |   seqLevels <- unique( unlist( lapply( bamHeader, function(x) names( x$targets) ) ) )
 80 |   geneGRListCombine <- geneGRListCombine[ unlist( runValue( seqnames( geneGRListCombine ) ) ) %in% seqLevels ]
 81 |   
 82 |   
 83 |   ## Process the gene region first.
 84 |   ## Count reads in transcripts. Because we are interested in carRNA, we count reads on pre-mRNA instead of mRNA. 
 85 |   no.genes=length(geneGRListCombine)## define number of genes
 86 |   
 87 |   ## Divide annotations into batches for counting reads
 88 |   batchNum <-  max( round( no.genes/2e4 ), 1 ) # start with at least 1 batch
 89 |   batchPoints <- round( seq( 1, no.genes, length.out = batchNum+1 ) ) # Stop points of each batch
 90 |   batches <- foreach( j = 1:(length(batchPoints) -1 ) )%do%{ return(batchPoints[j]:batchPoints[j+1] ) }
 91 |   
 92 |   cat(paste("Dividing annotations into", batchNum,"batches for read count quantification... \n"))
 93 |   
 94 |   
 95 |   for( iBatch in 1:batchNum ){
 96 |     
 97 |     cat(paste0( "Counting reads for each gene in batch # ",iBatch, ", this step may takes a few hours...\n") )
 98 |     
 99 |     tmpReads <- .CountReadsBatch(geneGRList = geneGRListCombine[ batches[[iBatch]] ],
100 |                                  binSize = binSize,
101 |                                  bamPath.IP = bamPath.IP,
102 |                                  bamPath.input = bamPath.input,
103 |                                  strandToKeep = strandToKeep,
104 |                                  fragmentLength = fragmentLength,
105 |                                  paired = paired,
106 |                                  threads = threads
107 |     )
108 |     
109 |     cat(paste0( "Finished quantification for batch # ",iBatch,"; ",batchNum - iBatch, " batches to be processed...\n\n") )
110 |     
111 |     ## filter out zero count genes
112 |     rowTotoal <- rowSums(tmpReads) > 0
113 |     tmpGeneName <- unlist( lapply( strsplit( rownames(tmpReads) , ","), function(x) x[1] ) )
114 |     tmpZeroGene <-  tapply(rowTotoal , tmpGeneName, sum) 
115 |     if( any(tmpZeroGene == 0) ){
116 |       tmpReads <- tmpReads[ tmpGeneName %in% names( tmpZeroGene[tmpZeroGene >= 0] ), ]
117 |       
118 |     }
119 |     
120 |     ## save the current data
121 |     eval(parse(text = paste0("Reads_",iBatch," <- tmpReads ") ) )
122 |     
123 |   }
124 |   
125 |   cat(paste0( "Finished reads quantification. Combining read counts from batch runs...\n") )
126 |   
127 |   ## combine all reads
128 |   eval(parse(text = paste0("reads <- rbind(",paste0( "Reads_",1:batchNum,collapse = ",") , ") ") ) )
129 |   
130 |   ## remove temp read count tables
131 |   eval(parse(text = paste0("rm(",paste0( "Reads_",1:batchNum,collapse = ",") , ") ") ) )
132 |   
133 |   colnames(reads) <- c(paste(samplenames,"input",sep = "-"),paste(samplenames,"IP",sep = "-"))
134 |   
135 |   
136 |   data.out <- MeRIP(reads = reads, binSize = binSize, gtf = gtf, geneModel = geneGRListCombine, bamPath.input = bamPath.input, bamPath.ip = bamPath.IP, samplenames = samplenames, mode = "carRNA")
137 |   if(saveOutput){
138 |     ## create output directory
139 |     dir.create(outputDir, showWarnings = FALSE, recursive = TRUE)
140 |     saveRDS(data.out,paste0(outputDir,"/MeRIP_readCounts.RDS"))
141 |   }
142 |   
143 |   
144 |   return(data.out)
145 | }
146 | 
147 | 
148 | ## A helper function to process a batch of annotation features.
149 | .CountReadsBatch <- function(geneGRList,
150 |                              binSize,
151 |                              bamPath.IP,
152 |                              bamPath.input,
153 |                              strandToKeep,
154 |                              fragmentLength,
155 |                              paired,
156 |                              threads
157 | ){
158 |   
159 |   
160 |   start_time <- Sys.time()
161 |   registerDoParallel( cores = threads)
162 |   cat(paste("Hyper-thread registered:",getDoParRegistered(),"\n"))
163 |   cat(paste("Using",getDoParWorkers(),"thread(s) to count reads in continuous bins...\n"))
164 |   reads <- foreach(i = 1:length(geneGRList), .combine = rbind) %dopar%{
165 |     
166 |     geneName = names(geneGRList)[i]
167 |     geneModel =geneGRList[geneName][[1]] 
168 |     
169 |     # DNA location to gene location conversion
170 |     df.geneModel= as.data.frame(geneModel) ##data frame of gene model
171 |     dna.range = as.data.frame(range(geneModel))
172 |     df.geneModel$end = df.geneModel$end - dna.range$start + 1
173 |     df.geneModel$start = df.geneModel$start - dna.range$start + 1
174 |     DNA2RNA = 1:dna.range$width
175 |     #DNA2RNA = rep(0,dna.range$end - dna.range$start +1)
176 |     #no.exon = dim(df.geneModel)[1]
177 |     #for (j in 1:no.exon){DNA2RNA[df.geneModel$start[j]:df.geneModel$end[j]]=1}
178 |     #exon.length = sum(DNA2RNA)
179 |     #DNA2RNA=cumsum(DNA2RNA)*DNA2RNA
180 |     geneLength = dna.range$width
181 |     
182 |     ## skip any gene with smaller than 200bp transcript
183 |     if(geneLength < 200) {return(NULL)}
184 |     
185 |     
186 |     ## switch strand because stranded RNA library protocol sequence reverse strand
187 |     if(strandToKeep == "opposite"){
188 |       reads.strand = character()
189 |       if(dna.range$strand == "+"){reads.strand = "-"}else if(dna.range$strand == "-"){reads.strand = "+"}else{reads.strand = "*"} ## switch strand on RNA reads for Truseq protocol
190 |     }else if(strandToKeep == "same"){
191 |       reads.strand = as.character(dna.range$strand)
192 |     }else{
193 |       cat("Currently RADAR only support strand specific RNA-seq data.\nCounting reads at opposite strand by defalt...\n")
194 |       reads.strand = character()
195 |       if(dna.range$strand == "+"){reads.strand = "-"}else if(dna.range$strand == "-"){reads.strand = "+"}else{reads.strand = "*"}
196 |     }
197 |     
198 |     #create start points of continuous window
199 |     if(exon.length <= binSize){
200 |       slidingStart = 1
201 |     }else{
202 |       ## use the 3' end terminal bin as a elastic-size bin
203 |       if(dna.range$strand == "+"){
204 |         slidingStart = seq(from = 1, to = ( exon.length - binSize - exon.length %% binSize + 1) , length.out = floor(exon.length/binSize) ) 
205 |       }else{ # make the first bin elastic bin if a gene is on reverse strand
206 |         slidingStart = c(1, seq(from = binSize + exon.length %% binSize + 1, to = ( exon.length - binSize + 1) , length.out = floor(exon.length/binSize) - 1 )  )
207 |       }
208 |     }
209 |     
210 |     #count reads in all samples
211 |     ba.IP = sapply(bamPath.IP,.countReadFromBam,which = range(geneModel),reads.strand = reads.strand,DNA2RNA = DNA2RNA,fragmentLength=fragmentLength,left=dna.range$start,sliding = slidingStart, binSize = binSize, paired = paired)
212 |     ba.input = sapply(bamPath.input,.countReadFromBam,which = range(geneModel),reads.strand = reads.strand,DNA2RNA = DNA2RNA,fragmentLength=fragmentLength,left=dna.range$start,sliding = slidingStart, binSize = binSize, paired = paired)
213 |     
214 |     if(is.vector(ba.IP) ){# if there is only one window for this gene, make it a matrix to avoid bug
215 |       ba.IP = matrix(ba.IP, nrow = 1)
216 |       ba.input = matrix( ba.input, nrow = 1 )
217 |     }
218 |     ba.counts <- cbind(ba.input,ba.IP)
219 |     rownames(ba.counts) <-  paste(geneName,slidingStart,sep = ",")
220 |     
221 |     ba.counts
222 |   }
223 |   rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
224 |   end_time <- Sys.time()
225 |   cat(paste("Time used to count reads in this batch:",difftime(end_time, start_time, units = "mins"),"mins... \n"))
226 |   
227 |   return(reads)
228 |   
229 | }
230 | 
231 | 
232 | 
233 | 
234 | 
235 | 
236 | 
237 | 
238 | 
239 | 
240 | 


--------------------------------------------------------------------------------
/R/countReadsFromBam.R:
--------------------------------------------------------------------------------
 1 | ## a helper function to count reads from bam files
 2 | .countReadFromBam <-
 3 |   function(bam,which,DNA2RNA,reads.strand,fragmentLength,left,sliding, binSize ,paired){
 4 | 
 5 |     if(paired){  # pair-end sequence
 6 |       ba = scanBam(bam, param=ScanBamParam( which=which, what =c("pos","strand","qwidth","isize"), flag = scanBamFlag(isProperPair = T,isSecondMateRead = F) ) )
 7 |       ba = data.frame( pos = ba[[1]]$pos, strand = ba[[1]]$strand ,readLength = ba[[1]]$qwidth, isize = ba[[1]]$isize )
 8 |       ba = ba[ ba$pos > left, ]
 9 |       ba = ba[ba$strand == reads.strand | reads.strand == "*", ] ## filter for strand
10 |       ba$pos = DNA2RNA[ba$pos - left] ## convert mapped read pos into RNA position
11 |       ba = ba[ ba$pos > 0, ] ## drop intron reads.
12 |       ##shift the read pos to the center of the reads
13 |       if(reads.strand == "+"){ba$pos = ba$pos + round(abs(ba$isize)/2) }else{ba$pos = ba$pos + ba$readLength - round(abs(ba$isize)/2)  }
14 |       ##count the reads in the sliding windows
15 |       no.window = length(sliding)
16 |       windowCounts = vector(length = no.window)
17 |       for(j in 1:no.window){
18 |         windowCounts[j]= sum( ba$pos >= sliding[j] &  ba$pos < (sliding[j] + binSize)  ) 
19 |       } # count regular bins
20 |       
21 |       if( as.character( strand(which) ) == "+" ){
22 |         windowCounts[no.window] = windowCounts[no.window] + sum( ba$pos >=  (sliding[no.window] + binSize) & ba$pos <= max(DNA2RNA) ) #count the extra part of the last bin on positive strand
23 |       }else{
24 |         windowCounts[1] = windowCounts[1] + sum( ba$pos >=  (sliding[1] + binSize) & ba$pos <= (sliding[1] + binSize + max(DNA2RNA) %% binSize ) ) #count the extra part of the first bin on negative strand
25 |       }
26 |       
27 | 
28 |     }else{ # single-end sequence
29 |       ba = scanBam(bam, param=ScanBamParam( which=which, what =c("pos","strand","qwidth") ) )
30 |       ba = data.frame( pos = ba[[1]]$pos, strand = ba[[1]]$strand, readLength = ba[[1]]$qwidth )
31 |       ba = ba[ ba$pos > left, ]
32 |       ba = ba[ba$strand == reads.strand | reads.strand == "*", ] ## filter for strand
33 |       ba$pos = DNA2RNA[ba$pos - left] ## convert mapped read pos into RNA position
34 |       ba = ba[ ba$pos > 0, ] ## drop intron reads.
35 |       ##shift the read pos to the center of the reads
36 |       if(reads.strand == "+"){ba$pos = ba$pos + round(fragmentLength/2) }else{ba$pos = ba$pos + ba$readLength - round(fragmentLength/2)  }
37 |       ##count the reads in the sliding windows
38 |       no.window = length(sliding)
39 |       windowCounts = vector(length = no.window)
40 |       for(j in 1:no.window){
41 |         windowCounts[j]= sum( ba$pos >= sliding[j] &  ba$pos < (sliding[j] + binSize)  ) 
42 |       } # count regular bins
43 |       
44 |       if( as.character( strand(which) ) == "+" ){
45 |         windowCounts[no.window] = windowCounts[no.window] + sum( ba$pos >=  (sliding[no.window] + binSize) & ba$pos <= max(DNA2RNA) ) #count the extra part of the last bin on positive strand
46 |       }else{
47 |         windowCounts[1] = windowCounts[1] + sum( ba$pos >=  (sliding[1] + binSize) & ba$pos <= (sliding[1] + binSize + max(DNA2RNA) %% binSize ) ) #count the extra part of the first bin on negative strand
48 |       }
49 |       
50 |       
51 |     }
52 | 
53 |     return(windowCounts)
54 |   }
55 | 


--------------------------------------------------------------------------------
/R/get_peak_logOR.R:
--------------------------------------------------------------------------------
  1 | #' @title get_peak_logOR
  2 | #' @description Compute peak logOR, adjust GC bias and IP efficiency for m6A QTL analysis
  3 | #' @param MeRIPdata The MeRIP.Peak object containing peak calling result.
  4 | #' @param vcf_file The vcf file for genotype. The chromosome position must be sorted!!
  5 | #' @param BSgenome The BSgenome object. This needs to match the genome version of the gtf files. 
  6 | #' @param AdjustGC Logic. Choose whether explicitly adjust GC bias (default: TRUE). 
  7 | #' @param AdjIPeffi Logic. Choose whether explicitly adjust overall IP efficiency (default: TRUE). 
  8 | #' @import vcfR
  9 | #' @import BSgenome
 10 | #' @export
 11 | get_peak_logOR <- function( MeRIPdata, vcf_file = NULL, BSgenome = BSgenome.Hsapiens.UCSC.hg19, AdjustGC = TRUE, AdjIPeffi = TRUE, thread = 1 ){
 12 |   
 13 |   ## check input
 14 |   if( !is(MeRIPdata, "MeRIP.Peak") ){
 15 |     stop("The input MeRIPdata needs to be an MeRIP.Peak object!")
 16 |   }else if( !nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_ip) & nrow(MeRIPdata@jointPeaks) == nrow(MeRIPdata@jointPeak_input) ){
 17 |     stop("The peak counts matrix dimension must match the dimension of jointPeaks!")
 18 |   }
 19 |   
 20 |   ## select samples matching the vcf file
 21 |   if(!is.null(vcf_file)){
 22 |     ## check samples in genotype files and in MeRIP.Peak object
 23 |     tmpVcf <- tempfile(fileext = ".vcf.gz")
 24 |     system(paste0("zcat ",vcf_file," | awk 'NR==1 {print $0} (/^#CHROM/){print $0}'| gzip > ",tmpVcf))
 25 |     tmp.vcf <- try(read.vcfR( file =tmpVcf, verbose = F ) , silent = T)
 26 |     ##########################################################################################################
 27 |     ### This is to handle a wired error in the read.vcfR function. ###########################################
 28 |     if(class(tmp.vcf) == "try-error"){                                                                      ##
 29 |       system(paste0("zcat ",vcf_file," | awk '/^#/ {print $0}'| gzip >",tmpVcf))                            ##
 30 |       tmp.vcf <-read.vcfR( file =tmpVcf, verbose = F )                                                      ##
 31 |     }                                                                                                       ##
 32 |     ##########################################################################################################
 33 |     unlink(tmpVcf) # remove the temp file to free space.
 34 |     genotypeSamples <- colnames(tmp.vcf@gt)[-c(1)]
 35 |     
 36 |     if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) == 0 ){
 37 |       stop("The samplenames must match in VCF file and in MeRIP.Peak object! We found no overlap between sample names in these two files!")
 38 |     }else if(length(intersect(genotypeSamples,samplenames(MeRIPdata))) != length(samplenames(MeRIPdata) )){
 39 |       cat("The samples in the VCF don't totally match samples in the MeRIP.Peak object; ")
 40 |       cat("Only samples in MeRIP.Peak object overlapping samples in VCF file will be analyzed in QTL mapping!\nSubsetting samples...\n")
 41 |       MeRIPdata <- MeRIPtools::select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata))  )
 42 |       cat(paste0(paste(intersect(genotypeSamples,samplenames(MeRIPdata)),collapse = " "), "\n(",length(intersect(genotypeSamples,samplenames(MeRIPdata))),") samples will be analyzed!"))
 43 |     }else{
 44 |       ## make sure the order of samples aligned between phenotype and genotype
 45 |       MeRIPdata <- select(MeRIPdata, intersect(genotypeSamples,samplenames(MeRIPdata))  )
 46 |     }
 47 |     
 48 |   }else{
 49 |     cat("No VCF file input. Use all samples in the MeRIPdata data. ")
 50 |   }
 51 |   
 52 |   
 53 |   ### Preprocess
 54 |   ## Filter out peaks with zero count
 55 |   MeRIPdata <- filter(MeRIPdata, !apply(extractInput(MeRIPdata), 1, function(x) any(x == 0 )) )
 56 |   cat("Peaks with zero read count in input data have been removed.\n")
 57 |   
 58 |   T0 <- colSums(counts(MeRIPdata)[,1:length(MeRIPdata@samplenames)] )
 59 |   T1 <- colSums(counts(MeRIPdata)[,(length(MeRIPdata@samplenames)+1) : (2*length(MeRIPdata@samplenames)) ] )
 60 |   
 61 |   ## Filter out peaks with OR < 1
 62 |   enrichFlag <- apply( t( t(extractIP(MeRIPdata))/T1 )/ t( t( extractInput(MeRIPdata) )/T0 ),1,function(x){sum(x>1)> MeRIPdata@jointPeak_threshold})
 63 |   MeRIPdata <-  filter(MeRIPdata, enrichFlag )
 64 |   cat(paste0("Peaks with odd ratio > 1 in more than ",MeRIPdata@jointPeak_threshold," samples will be retained.\n",nrow(jointPeak(MeRIPdata))," peaks remaining for QTL mapping.\n"))
 65 |   
 66 |   ## Estimate IP efficiency
 67 |   OR <- t( apply(extractIP(MeRIPdata),1,.noZero)/T1 )/ t( t( extractInput(MeRIPdata) )/T0 )
 68 |   colnames(OR) <- MeRIPdata@samplenames
 69 |   logOR <- log(OR)
 70 |   
 71 |   logOR.id <- which( rowMeans(logOR) < quantile( rowMeans(logOR), 0.95 ) & rowMeans(logOR) > quantile( rowMeans(logOR), 0.05) )# remove two tails
 72 |   K_IPe_ij <- apply(logOR[logOR.id,], 2, function(x){
 73 |     
 74 |     fit <- lm(y~m, data = data.frame(y = x, m=rowMeans(logOR)[logOR.id] ))
 75 |     y.est <- predict(fit, newdata =  data.frame(m = rowMeans(logOR)))
 76 |     return( y.est - rowMeans(logOR) )
 77 |   })
 78 |   
 79 |   ## Estimate GC bias offset
 80 |   if(AdjustGC){
 81 |     cat("Computing GC content for peaks\n")
 82 |     ## GC content correction
 83 |     peak.gr <- .peakToGRangesList( jointPeak(MeRIPdata))
 84 |     cat("...")
 85 |     
 86 |     registerDoParallel( thread )
 87 |     peakSeq <- foreach( i = 1:length(peak.gr), .combine = c )%dopar%{
 88 |       paste( getSeq( BSgenome , peak.gr[[i]] ,as.character =T ) , collapse = "")
 89 |     }
 90 |     peakGC <- sapply( peakSeq, function(x){ sum( str_count(x, c("G","g","C","c"))  )/nchar(x) } )
 91 |     
 92 |     cat("...")
 93 |     
 94 |     peakGC_l <- round(peakGC,digits = 2)
 95 |     peakGC_l[which(peakGC_l<0.2)] <-median(peakGC_l[which(peakGC_l<0.2)] ) # combine some bins at low GC due to low number of peaks
 96 |     peakGC_l[which(peakGC_l>0.84)] <-median(peakGC_l[which(peakGC_l>0.84)] ) # combine some bins at high GC due to low number of peaks
 97 |     l <- sort(unique(peakGC_l))
 98 |     
 99 |     if(AdjIPeffi){ 
100 |       y <- log(OR) - K_IPe_ij
101 |     }else{
102 |       y <- log(OR)
103 |     }
104 |     colnames(y) <- MeRIPdata@samplenames
105 |     
106 |     b.l <- tapply(rowMeans( y ), peakGC_l ,  median) 
107 |     bil <- apply( y, 2, tapply, peakGC_l, median )
108 |     bi. <- apply(  y , 2, median )
109 |     b.. <- median(  y )
110 |     Fil <- as.data.frame( as.matrix(bil) - as.vector(b.l) ) - ( bi. - b.. )
111 |     Fij <- foreach( ii = 1:length(MeRIPdata@samplenames), .combine = cbind)%dopar%{
112 |       GC_fit <- lm(Fil[,ii] ~ poly(l,4) )
113 |       predict(GC_fit, newdata = data.frame(l = peakGC) )
114 |     }
115 |     colnames(Fij) <- MeRIPdata@samplenames
116 |     cat("...\n")
117 |   }
118 |   
119 |   ## Adjust logOR with GC and IP efficiency
120 |   if(AdjustGC & AdjIPeffi){
121 |     cat("Adjust logOR with GC and IP efficiency ... \n")
122 |     logOR_adjusted <- log(OR) -  K_IPe_ij - Fij
123 |   }else if( AdjustGC & !AdjIPeffi){
124 |     cat("Adjust logOR with GC ... \n")
125 |     logOR_adjusted <- log(OR) - Fij
126 |   }else if(AdjIPeffi){
127 |     cat("Adjust logOR with IP efficiency ... \n")
128 |     logOR_adjusted <- log(OR) - K_IPe_ij
129 |   }else{
130 |     logOR_adjusted <- log(OR)
131 |   }
132 |   
133 |   ## parse bed12 file to get peak coordinates
134 |   peak_bed <- jointPeak(MeRIPdata)
135 |   peak_bed$PEAK <- paste0(peak_bed$chr,":", peak_bed$start,"-",peak_bed$end,"_",peak_bed$name,"_",peak_bed$strand )
136 |   
137 |   peak_logOR <- data.frame(peak_bed[, c("chr", "start", "end", "PEAK")], logOR_adjusted)
138 |   
139 |   return(list(logOR = peak_logOR, MeRIPdata = MeRIPdata))
140 |   
141 | }
142 | 
143 | 
144 | 


--------------------------------------------------------------------------------
/R/gtfToGeneModel.R:
--------------------------------------------------------------------------------
 1 | #' @title gtfToGeneModel
 2 | #' @description to remove ambiguous gene model and return gene model as genomic ranges object
 3 | #' @param gtf gtf file to build gene model
 4 | #' @return geneModel A genomic range list containing gene model
 5 | #' @export
 6 | ##helper function to remove ambiguous gene model and return gene model as genomic ranges object
 7 | gtfToGeneModel = function(gtf){
 8 | 
 9 |   #load gene models for  genes
10 |   ## make the tx database object
11 |   if(is.character(gtf)){
12 |     txdb=makeTxDbFromGFF(gtf,format="gtf")
13 |   }else{
14 |     txdb = gtf
15 |   }
16 | 
17 |   cols <- c("tx_chrom", "tx_strand")
18 |   single_strand_genes <- genes(txdb, columns=cols)
19 | 
20 |   single_strand_genes_names = names(single_strand_genes)
21 |   all_gene_names = names(genes(txdb,single.strand.genes.only=F))
22 |   ambiguious_genes = setdiff(all_gene_names,single_strand_genes_names)
23 |   ambi_genes = genes(txdb,single.strand.genes.only=F)[ambiguious_genes]
24 |   resol_ambi_genes = lapply(ambi_genes,function(x){ return(x[1])  })
25 | 
26 |   #deal with ambiguous chromosome
27 |   wantedChrom=unique(sapply(single_strand_genes$tx_chrom,unlist) )[nchar(unique(sapply(single_strand_genes$tx_chrom,unlist) )) <=5]
28 |   id = which(sapply(single_strand_genes$tx_chrom,unlist) %in% wantedChrom )
29 |   wantedGeneNames = names(single_strand_genes[id])
30 | 
31 |   geneModel = exonsBy(txdb,by="gene")
32 | 
33 |   single_strand_geneModel = geneModel[wantedGeneNames]
34 |   ambi_geneModel = geneModel[ambiguious_genes]
35 |   resol_ambi_geneModel = GRangesList()
36 |   if(length(ambiguious_genes) > 0 ){
37 |     for(i in 1: length(ambi_geneModel)){
38 |     id = as.data.frame(findOverlaps(ambi_geneModel[[i]],resol_ambi_genes[names(ambi_geneModel)[i]][[1]]) )$queryHits
39 |     newModel = GRangesList(ambi_geneModel[[i]][id])
40 |     names(newModel) = names(ambi_geneModel)[i]
41 |     if( all(match (as.character( seqnames(newModel[[1]]) ), wantedChrom,nomatch=0 )>0 ) ) {
42 |       resol_ambi_geneModel = c(resol_ambi_geneModel,newModel)
43 |       }
44 |     }
45 |   }
46 | 
47 | 
48 |   final_geneModel = c(single_strand_geneModel,resol_ambi_geneModel)
49 |   seqlevels(final_geneModel) = wantedChrom
50 | 
51 |   return(final_geneModel)
52 | }
53 | 


--------------------------------------------------------------------------------
/R/initialization.R:
--------------------------------------------------------------------------------
  1 | 
  2 | new_initialization_of_parameters2 <- function(Y, W, V){
  3 |   logY <- log(Y)
  4 |   W2 <- W^2
  5 |   W3 <- W*W2
  6 |   W4 <- W*W3
  7 |   V2 <- V^2
  8 |   V3 <- V*V2
  9 |   V4 <- V*V3
 10 | 
 11 | 
 12 |   fitlm <- summary(lm(Y ~ V + V2 + V3 + V4 -1))
 13 |   Bs <- fitlm$coefficients[, 1]
 14 |   B1 <- Bs[1]
 15 |   B2 <- Bs[2]
 16 |   B3 <- Bs[3]
 17 |   B4 <- Bs[4]
 18 |   C1 <- B2/B1
 19 |   C2 <- B3/B1
 20 |   C3 <- B4/B1
 21 | 
 22 |   newV <- log(V + C1*V2 + C2*V3 + C3*V4)
 23 |   newY <- logY - newV
 24 |   as <- summary(lm(newY ~ W + W2 + W3 + W4))$coefficients[, 1]
 25 |   a0 <- -0.01
 26 |   b1 <- exp(as[1] - a0)
 27 |   b2 <- b1*C1
 28 |   b3 <- b1*C2
 29 |   b4 <- b1*C3
 30 |   as[1] <- a0
 31 |   bs <- c(b1, b2, b3, b4)
 32 |   est <- list(as = as, bs = bs)
 33 |   return(est)
 34 | }
 35 | 
 36 | 
 37 | 
 38 | 
 39 | new_initialization_of_parameters <- function(Y, W, V){
 40 |   logY <- log(Y)
 41 |   W2 <- W^2
 42 |   W3 <- W*W2
 43 |   W4 <- W*W3
 44 |   V2 <- V^2
 45 |   V3 <- V*V2
 46 |   V4 <- V*V3
 47 | 
 48 |   fitlm <- summary(lm(Y ~ V + V2 + V3 + V4 -1))
 49 |   Bs <- fitlm$coefficients[, 1]
 50 |   B1 <- Bs[1]
 51 |   B2 <- Bs[2]
 52 |   B3 <- Bs[3]
 53 |   B4 <- Bs[4]
 54 |   C1 <- B2/B1
 55 |   C2 <- B3/B1
 56 |   C3 <- B4/B1
 57 |   mV <- mean(V)
 58 |   mV2 <- mV*mV
 59 |   mV3 <- mV2*mV
 60 |   mV4 <- mV3*mV
 61 |   F1 <- mV + C1*mV2+ C2*mV3 + C3*mV4
 62 | 
 63 |   as <- summary(lm(logY ~ W + W2 + W3 + W4))$coefficients[, 1]
 64 |   f1 <- as[1]
 65 |   a1 <- as[2]
 66 |   a2 <- as[3]
 67 |   a3 <- as[4]
 68 |   a4 <- as[5]
 69 |   mW <- mean(W)
 70 |   mW2 <- mW*mW
 71 |   mW3 <- mW2*mW
 72 |   mW4 <- mW3*mW
 73 |   F2 <- a1*mW + a2*mW2 + a3*mW3 + a4*mW4
 74 |   a0 <- (log(B1) + f1 - F2 - log(F1))/2
 75 |   b1 <- exp(f1 - a0)/F1
 76 |   b2 <- b1*C1
 77 |   b3 <- b1*C2
 78 |   b4 <- b1*C3
 79 | 
 80 |   newV <- log(b1*V + b2*V2 + b3*V3 + b4*V4)
 81 |   newY <- logY - newV
 82 |   as <- summary(lm(newY ~ W + W2 + W3 + W4))$coefficients[, 1]
 83 | 
 84 |   bs <- c(b1, b2, b3, b4)
 85 |   est <- list(as = as, bs = bs)
 86 |   return(est)
 87 | }
 88 | 
 89 | 
 90 | 
 91 | 
 92 | correct_individual_sample <- function(Y, gc, reads, gamma =  0.8, steps = 50, down = 0.1){
 93 | 
 94 |   data <- Y
 95 |   flag <- Y!=0
 96 |   Y <- Y[flag]
 97 |   gc <- gc[flag]
 98 |   reads <- reads[flag]
 99 |   est <- new_initialization_of_parameters(Y, gc, reads)
100 |   as <- unlist(est$as)
101 |   bs <- unlist(est$bs)
102 |   a0 <- as[1]
103 |   a1 <- as[2]
104 |   a2 <- as[3]
105 |   a3 <- as[4]
106 |   a4 <- as[5]
107 |   b1 <- bs[1]
108 |   b2 <- bs[2]
109 |   b3 <- bs[3]
110 |   b4 <- bs[4]
111 |   res <- gradient_descent_for_individual_sample(Y, gc, reads, a0, a1, a2, a3, a4, b1, b2, b3, b4, gamma, steps, down)
112 | 
113 |   data[flag] <- res$corrected
114 |   report <- list(parameter = res$parameter, corrected = data)
115 | 
116 |   return(report)
117 | }
118 | 


--------------------------------------------------------------------------------
/R/plotGeneCov.R:
--------------------------------------------------------------------------------
  1 | #' @title plotGeneCoverage
  2 | #' @param IP_BAM The bam files for IP samples
  3 | #' @param INPUT_BAM The bam files for INPUT samples
  4 | #' @param size.IP The size factor for IP libraries
  5 | #' @param size.INPUT The size factor for INPUT libraries
  6 | #' @param geneName The name (as defined in gtf file) of the gene you want to plot
  7 | #' @param geneModel The gene model generated by gtfToGeneModel() function
  8 | #' @param libraryType "opposite" for mRNA stranded library, "same" for samll RNA library
  9 | #' @param GTF gtf annotation as GRanges object. Can be obtained by GTF <- rtracklayer::import("xxx.gtf",format = "gtf")
 10 | #' @param adjustExprLevel Logic parameter determining whether adjust coverage so that input are at "same" expression level.
 11 | #' @param plotSNP The option to plot SNP on the figure. Null by default. If want to include SNP in the plot, the parameter needs to ba a dataframe like this:  data.frame(loc= position, anno="A/G")
 12 | #' @import ggsci
 13 | #' @export
 14 | plotGeneCoverage <- function(IP_BAMs, INPUT_BAMs, size.IP, size.INPUT,X, geneName, geneModel, libraryType = "opposite", center = mean ,GTF,ZoomIn=NULL, adjustExprLevel = FALSE, adjustExpr_peak_range = NULL, plotSNP = NULL){
 15 |   
 16 |   ## Get INPUT coverage first if adjust for expression level
 17 |   if(adjustExprLevel){
 18 |     locus <- as.data.frame( range(geneModel[geneName][[1]]) )
 19 |     
 20 |     if( !is.null(adjustExpr_peak_range) ){
 21 |       locus$start = adjustExpr_peak_range[1]
 22 |       locus$end = adjustExpr_peak_range[2]
 23 |       locus$width = adjustExpr_peak_range[2] - adjustExpr_peak_range[1] + 1
 24 |     }else if(is.null(ZoomIn)){
 25 |     }else{
 26 |       locus$start = ZoomIn[1]
 27 |       locus$end = ZoomIn[2]
 28 |       locus$width = ZoomIn[2] - ZoomIn[1] + 1
 29 |     }
 30 |     local.covs <- sapply(INPUT_BAMs,getCov,locus=locus, libraryType = libraryType)
 31 |     cov.size <- colSums( local.covs) / mean(colSums(local.covs) )
 32 |     ## add expression level adjust factor into library size factor
 33 |     size.INPUT.adj <- size.INPUT*cov.size
 34 |     size.IP.adj <- size.IP*cov.size
 35 |     
 36 |     registerDoParallel( length(levels(X)) )
 37 |     INPUT.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{
 38 |       getAveCoverage(geneModel= geneModel,bamFiles = INPUT_BAMs[X==ii],geneName = geneName,size.factor = size.INPUT.adj[X==ii], libraryType = libraryType, center = center,ZoomIn = ZoomIn)
 39 |     }
 40 |     IP.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{
 41 |       getAveCoverage(geneModel= geneModel,bamFiles = IP_BAMs[X==ii],geneName = geneName,size.factor = size.IP.adj[X==ii], libraryType = libraryType, center = center, ZoomIn = ZoomIn)
 42 |     }
 43 |     rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
 44 |   }else{
 45 |     
 46 |     registerDoParallel( length(levels(X)) )
 47 |     INPUT.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{
 48 |       getAveCoverage(geneModel= geneModel,bamFiles = INPUT_BAMs[X==ii],geneName = geneName,size.factor = size.INPUT[X==ii], libraryType = libraryType, center = center,ZoomIn = ZoomIn)
 49 |     }
 50 |     IP.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{
 51 |       getAveCoverage(geneModel= geneModel,bamFiles = IP_BAMs[X==ii],geneName = geneName,size.factor = size.IP[X==ii], libraryType = libraryType, center = center, ZoomIn = ZoomIn)
 52 |     }
 53 |     rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
 54 |     
 55 |   }
 56 |   
 57 | 
 58 | 
 59 | 
 60 |   cov.data <- data.frame(genome_location=rep(as.numeric(rownames(IP.cov) ),length(levels(X))),
 61 |                          IP=c(IP.cov),Input=c(INPUT.cov),
 62 |                          Group = factor( rep(levels(X),rep(nrow(IP.cov),length(levels(X)) ) ), levels = levels(X) )
 63 |   )
 64 |   yscale <- max(IP.cov,INPUT.cov)
 65 | 
 66 |   chr <- unique(as.character(as.data.frame(geneModel[geneName])$seqnames))
 67 | 
 68 |   p1 <- "ggplot(data = cov.data,aes(genome_location))+geom_line(aes(y=Input,colour =Group))+geom_ribbon(aes(ymax = IP,ymin=0,fill=Group), alpha = 0.4)+labs(y=\"normalized coverage\",x = paste0( \"Genome location on chromosome: \", chr) )+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) )),expand = c(0,0,0,0))+theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
 69 |   panel.grid.minor = element_blank(), axis.line = element_line(colour = \"black\"),  axis.ticks = element_line(colour = \"black\"), axis.title = element_text(color = \"black\", size = 18),axis.text = element_text(color = \"black\", size = 15) ) + scale_fill_nejm(name=\"IP\") + scale_colour_nejm(name=\"INPUT\")+ scale_y_continuous(expand = c(0, 0))"
 70 | 
 71 |   p2 <- .getGeneModelAnno(geneModel,geneName,GTF,ZoomIn)
 72 | 
 73 |   ## handle the option of plot the SNP in the gene model.
 74 |   if(is.null(plotSNP) ){
 75 |     p <- paste(p1,p2,sep = "+")
 76 |   }else{
 77 | 
 78 |     ## if the SNP is outside of the gene
 79 |     if(plotSNP$loc >max(cov.data$genome_location) ){
 80 | 
 81 |       plotSNP_new <- max(cov.data$genome_location) + 0.02*length(cov.data$genome_location)
 82 |       p3 <- "annotate(\"rect\",xmin = ( plotSNP_new -2 ), xmax = ( plotSNP_new +2 ) , ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"red\")+
 83 |       annotate(\"text\", ,x=plotSNP_new, y = -0.1*yscale, label= paste0(  chr,\":\",as.character(plotSNP$loc)), alpha = .99, colour = \"black\")+
 84 |       annotate(\"text\", ,x=plotSNP_new, y = 0, label=as.character(plotSNP$anno), alpha = .99, colour = \"blue\")+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) )),expand = c(0,0,0.06,0))"
 85 |       p <- paste(p1,p2,p3,sep = "+")
 86 | 
 87 |     }else if( plotSNP$loc<min(cov.data$genome_location) ){
 88 | 
 89 |       plotSNP_new <- min(cov.data$genome_location) - 0.02*length(cov.data$genome_location)
 90 |       p3 <- "annotate(\"rect\",xmin = ( plotSNP_new -2 ), xmax = ( plotSNP_new +2 ) , ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"red\")+
 91 |       annotate(\"text\", ,x=plotSNP_new, y = -0.1*yscale, label= paste0(  chr,\":\",as.character(plotSNP$loc)), alpha = .99, colour = \"black\",fontface =2)+
 92 |       annotate(\"text\", ,x=plotSNP_new, y = 0, label=as.character(plotSNP$anno), alpha = .99, colour = \"blue\",fontface =2)+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) )),expand = c(0.06,0,0,0))"
 93 |       p <- paste(p1,p2,p3,sep = "+")
 94 | 
 95 |     }else{ ## if the SNP is within the gene
 96 |       p3 <- "annotate(\"rect\",xmin = (plotSNP$loc-2 ), xmax = ( plotSNP$loc+2 ) , ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"red\")+
 97 |       annotate(\"text\", ,x=plotSNP$loc, y = -0.1*yscale, label=as.character(plotSNP$anno), alpha = .99, colour = \"blue\")"
 98 |       p <- paste(p1,p2,p3,sep = "+")
 99 |     }
100 | 
101 |   }
102 | 
103 |   eval(parse( text = p ))
104 | }
105 | 
106 | #' @title plotGeneCoverageSplit
107 | #' @param IP_BAM The bam files for IP samples
108 | #' @param INPUT_BAM The bam files for INPUT samples
109 | #' @param size.IP The size factor for IP libraries
110 | #' @param size.INPUT The size factor for INPUT libraries
111 | #' @param geneName The name (as defined in gtf file) of the gene you want to plot
112 | #' @param geneModel The gene model generated by gtfToGeneModel() function
113 | #' @param libraryType "opposite" for mRNA stranded library, "same" for samll RNA library
114 | #' @param GTF gtf annotation as GRanges object. Can be obtained by GTF <- rtracklayer::import("xxx.gtf",format = "gtf")
115 | #' @param adjustExprLevel Logic parameter determining whether adjust coverage so that input are at "same" expression level.
116 | #' @param plotSNP The option to plot SNP on the figure. Null by default. If want to include SNP in the plot, the parameter needs to ba a dataframe like this:  data.frame(loc= position, anno="A/G")
117 | #' @param Names Sample names to be plotted. 
118 | #' @export
119 | plotGeneCoverageSplit <- function(IP_BAMs, INPUT_BAMs, size.IP, size.INPUT,X, geneName, geneModel, libraryType = "opposite", center = mean ,GTF,ZoomIn=NULL, adjustExprLevel = FALSE, adjustExpr_peak_range = NULL, plotSNP = NULL, Names){
120 |   
121 |   ## Get INPUT coverage first if adjust for expression level
122 |   if(adjustExprLevel){
123 |     locus <- as.data.frame( range(geneModel[geneName][[1]]) )
124 |     
125 |     if( !is.null(adjustExpr_peak_range) ){
126 |       locus$start = adjustExpr_peak_range[1]
127 |       locus$end = adjustExpr_peak_range[2]
128 |       locus$width = adjustExpr_peak_range[2] - adjustExpr_peak_range[1] + 1
129 |     }else if(is.null(ZoomIn)){
130 |     }else{
131 |       locus$start = ZoomIn[1]
132 |       locus$end = ZoomIn[2]
133 |       locus$width = ZoomIn[2] - ZoomIn[1] + 1
134 |     }
135 |     local.covs <- sapply(INPUT_BAMs,getCov,locus=locus, libraryType = libraryType)
136 |     cov.size <- colSums( local.covs) / mean(colSums(local.covs) )
137 |     ## add expression level adjust factor into library size factor
138 |     size.INPUT.adj <- size.INPUT*cov.size
139 |     size.IP.adj <- size.IP*cov.size
140 |     
141 |     registerDoParallel( length(levels(X)) )
142 |     INPUT.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{
143 |       getAllCoverage(geneModel= geneModel,bamFiles = INPUT_BAMs[X==ii],geneName = geneName,size.factor = size.INPUT.adj[X==ii], libraryType = libraryType, center = center,ZoomIn = ZoomIn)
144 |     }
145 |     IP.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{
146 |       getAllCoverage(geneModel= geneModel,bamFiles = IP_BAMs[X==ii],geneName = geneName,size.factor = size.IP.adj[X==ii], libraryType = libraryType, center = center, ZoomIn = ZoomIn)
147 |     }
148 |     rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
149 |   }else{
150 |     
151 |     registerDoParallel( length(levels(X)) )
152 |     INPUT.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{
153 |       getAllCoverage(geneModel= geneModel,bamFiles = INPUT_BAMs[X==ii],geneName = geneName,size.factor = size.INPUT[X==ii], libraryType = libraryType, center = center,ZoomIn = ZoomIn)
154 |     }
155 |     IP.cov <- foreach(ii = levels(X),.combine = cbind)%dopar%{
156 |       getAllCoverage(geneModel= geneModel,bamFiles = IP_BAMs[X==ii],geneName = geneName,size.factor = size.IP[X==ii], libraryType = libraryType, center = center, ZoomIn = ZoomIn)
157 |     }
158 |     rm(list=ls(name=foreach:::.foreachGlobals), pos=foreach:::.foreachGlobals)
159 |     
160 |   }
161 |   
162 |   cov.data <- data.frame(genome_location=rep(as.numeric(rownames(IP.cov) ),length( X )), 
163 |                          IP=c(IP.cov),Input=c(INPUT.cov),
164 |                          Group = factor( rep( X ,rep(nrow(IP.cov),length( X ) ) ), levels = levels(X) ),
165 |                          Indiv = rep(Names, rep(nrow(IP.cov),length( X ) )  ),
166 |                          Index = rep( unlist(sapply(table(X), function(x) 1:x ) ), rep(nrow(IP.cov),length( X ) )  )
167 |   )
168 |   
169 |   
170 |   
171 |   yscale <-  max(IP.cov,INPUT.cov)
172 |   
173 |   chr <- unique(as.character(as.data.frame(geneModel[geneName])$seqnames))
174 |   
175 |   p1 <- "ggplot(data = cov.data,aes(genome_location))+geom_line(aes(y=Input),colour = \"#00BFC4\")+geom_ribbon(aes(ymax = IP,ymin=0),fill= \"#F8766D\", alpha = 0.4)+labs(y=\"Normalized coverage\",x = paste0( chr) )+facet_grid(Index~Group) +scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/4) )[-c(1,5)] ),expand = c(0,0))+theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
176 |   panel.grid.minor = element_blank(),strip.text.x = element_text(color = \"black\", size = 18),strip.text.y = element_text(color = \"black\", size = 15) , axis.line = element_line(colour = \"black\", size = 1),axis.title = element_text(color = \"black\", size = 18),axis.text = element_text( colour = \"black\", size = 15) ) + scale_fill_nejm(name=\"IP\") + scale_colour_nejm(name=\"INPUT\") + scale_y_continuous(expand = c(0, 0)) +geom_text(aes(label = Indiv, x = mean(genome_location), y = yscale*0.88 ))"
177 |   
178 |   p2 <- .getGeneModelAnno(geneModel,geneName,GTF,ZoomIn)
179 |   
180 |   ## handle the option of plot the SNP in the gene model.
181 |   if(is.null(plotSNP) ){
182 |     p <- paste(p1,p2,sep = "+")
183 |   }else{
184 |     
185 |     ## if the SNP is outside of the gene
186 |     if(plotSNP$loc >max(cov.data$genome_location) ){
187 |       
188 |       plotSNP_new <- max(cov.data$genome_location) + 0.02*length(cov.data$genome_location)
189 |       p3 <- "annotate(\"rect\",xmin = ( plotSNP_new -2 ), xmax = ( plotSNP_new +2 ) , ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"red\")+
190 |       annotate(\"text\", ,x=plotSNP_new, y = -0.1*yscale, label= paste0(  chr,\":\",as.character(plotSNP$loc)), alpha = .99, colour = \"black\")+
191 |       annotate(\"text\", ,x=plotSNP_new, y = 0, label=as.character(plotSNP$anno), alpha = .99, colour = \"blue\")+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) )),expand = c(0,0,0.06,0))"
192 |       p <- paste(p1,p2,p3,sep = "+")
193 |       
194 |     }else if( plotSNP$loc<min(cov.data$genome_location) ){
195 |       
196 |       plotSNP_new <- min(cov.data$genome_location) - 0.02*length(cov.data$genome_location)
197 |       p3 <- "annotate(\"rect\",xmin = ( plotSNP_new -2 ), xmax = ( plotSNP_new +2 ) , ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"red\")+
198 |       annotate(\"text\", ,x=plotSNP_new, y = -0.1*yscale, label= paste0(  chr,\":\",as.character(plotSNP$loc)), alpha = .99, colour = \"black\",fontface =2)+
199 |       annotate(\"text\", ,x=plotSNP_new, y = 0, label=as.character(plotSNP$anno), alpha = .99, colour = \"blue\",fontface =2)+scale_x_continuous(breaks = round(seq(min(cov.data$genome_location), max(cov.data$genome_location), by = ((max(cov.data$genome_location)-min(cov.data$genome_location))/10) )),expand = c(0.06,0,0,0))"
200 |       p <- paste(p1,p2,p3,sep = "+")
201 |       
202 |     }else{ ## if the SNP is within the gene
203 |       p3 <- "annotate(\"rect\",xmin = (plotSNP$loc-2 ), xmax = ( plotSNP$loc+2 ) , ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"red\")+
204 |       annotate(\"text\", ,x=plotSNP$loc, y = -0.1*yscale, label=as.character(plotSNP$anno), alpha = .99, colour = \"blue\")"
205 |       p <- paste(p1,p2,p3,sep = "+")
206 |     }
207 |     
208 |   }
209 |   
210 |   eval(parse( text = p ))
211 | }
212 | 
213 | 
214 | 
215 | ## helper function to get average coverage of a gene of multiple samples
216 | getAveCoverage <- function(geneModel,bamFiles,geneName,size.factor, libraryType = libraryType, center ,ZoomIn){
217 |   locus <- as.data.frame( range(geneModel[geneName][[1]]) )
218 |   if(is.null(ZoomIn)){
219 |   }else{
220 |     locus$start = ZoomIn[1]
221 |     locus$end = ZoomIn[2]
222 |     locus$width = ZoomIn[2] - ZoomIn[1] + 1
223 |   }
224 |   covs <- sapply(bamFiles,getCov,locus=locus, libraryType = libraryType)
225 |   covs <- t( t(covs)/size.factor )
226 |   ave.cov <- apply(covs,1, center)
227 |   return(ave.cov)
228 | }
229 | 
230 | ## helper function to each get coverage of a gene of multiple samples
231 | getAllCoverage <- function(geneModel,bamFiles,geneName,size.factor, libraryType = libraryType, center ,ZoomIn){
232 |   locus <- as.data.frame( range(geneModel[geneName][[1]]) )
233 |   if(is.null(ZoomIn)){
234 |   }else{
235 |     locus$start = ZoomIn[1]
236 |     locus$end = ZoomIn[2]
237 |     locus$width = ZoomIn[2] - ZoomIn[1] + 1
238 |   }
239 |   covs <- sapply(bamFiles,getCov,locus=locus, libraryType = libraryType)
240 |   covs <- t( t(covs)/size.factor )
241 |   
242 |   return(covs)
243 | }
244 | 
245 | 
246 | getCov <- function(bf,locus, libraryType ){
247 |   s_param <- ScanBamParam(which = GRanges(locus$seqnames,IRanges(locus$start,locus$end)))
248 |   p_param <- PileupParam(max_depth=1000000,min_nucleotide_depth=0,distinguish_nucleotides=F)
249 |   #get coverage from the bam file
250 |   res <- pileup(bf,scanBamParam = s_param,pileupParam = p_param)
251 |   if(libraryType == "opposite"){
252 |     res <- res[res$strand!=locus$strand,]
253 |   }else if (libraryType == "same"){
254 |     res <- res[res$strand==locus$strand,]
255 |   }else{
256 |     stop("libraryType must be opposite or same... ")
257 |   }
258 |   cov <- vector(length = locus$width)
259 |   names(cov) <- c(locus$start:locus$end)
260 |   cov[1:locus$width] <- 0
261 |   cov[res$pos-locus$start+1] <- res$count
262 |   return(cov)
263 | }
264 | 
265 | .getGeneModelAnno <- function(geneModel,geneName,gtf_grange,zoomIn = NULL){
266 |   exon.current <- reduce( geneModel[geneName][[1]] )
267 |   startCodon <-  reduce( gtf_grange[gtf_grange$type == "start_codon" & gtf_grange$gene_id == geneName] )
268 |   stopCodon <- reduce( gtf_grange[gtf_grange$type == "stop_codon" & gtf_grange$gene_id == geneName] )
269 |   if(as.logical(strand(exon.current)[1]=="-")){
270 |     startCodon <- startCodon[which.max( start(startCodon) )]
271 |     stopCodon <- stopCodon[which.min( start(stopCodon) )]
272 |     cdsRange <- stopCodon
273 |     end(cdsRange) <- end(startCodon)
274 |     cds.current <- suppressWarnings( GenomicRanges::intersect(exon.current,cdsRange) )
275 |   }else{
276 |     startCodon <- startCodon[which.min( start(startCodon) )]
277 |     stopCodon <- stopCodon[which.max( start(stopCodon) )]
278 |     cdsRange <- startCodon
279 |     end(cdsRange) <- end(stopCodon)
280 |     cds.current <- suppressWarnings( GenomicRanges::intersect(exon.current,cdsRange) )
281 |   }
282 |   utr.current <- GenomicRanges::setdiff(exon.current,cds.current)
283 |   exon.new <- sort( c(cds.current,utr.current) )
284 | 
285 |   if(is.null(zoomIn)){
286 |     cds.id <- unique( queryHits( findOverlaps(exon.new, cds.current)) )
287 |     df.exon <- as.data.frame(exon.new)
288 |     anno.exon <- character(length = length(exon.new))
289 |     anno.intron <- character(length = length(exon.new)-1 )
290 |     for(i in 1:length(exon.new)){
291 |       if( i %in% cds.id){
292 |         anno.exon[i] <- paste0("annotate(\"rect\", xmin =",df.exon$start[i] ,", xmax = ",df.exon$end[i] ,", ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"black\")" )
293 |       }else{
294 |         anno.exon[i] <- paste0("annotate(\"rect\",xmin =",df.exon$start[i] ,", xmax = ",df.exon$end[i] ,", ymin = -0.06*yscale, ymax = -0.04*yscale, alpha = .99, colour = \"black\")")
295 |       }
296 |     }
297 |     if(length(anno.intron)>0){
298 |       for(i in 1:length(anno.intron)){
299 |         anno.intron[i] <- paste0("annotate(\"segment\", x =", df.exon$end[i] ,", xend =", df.exon$start[i+1] ,", y = -0.05*yscale, yend = -0.05*yscale, alpha = .99, colour = \"black\")")
300 |       }
301 |       p <- paste( paste(anno.exon,collapse = "+"), paste(anno.intron,collapse = "+"), sep = "+")
302 |     }else{
303 |       p <-paste(anno.exon,collapse = "+")
304 |     }
305 | 
306 | 
307 |     return(p)
308 | 
309 |   }else{
310 |     zoomIn.gr <- exon.new[1]
311 |     ranges(zoomIn.gr) <- IRanges(start = zoomIn[1],end = zoomIn[2])
312 |     exon.zoom <- GenomicRanges::intersect(exon.new, zoomIn.gr)
313 |     cds.current.zoom <- GenomicRanges::intersect(exon.zoom, cds.current)
314 |     utr.current.zoom <- GenomicRanges::setdiff(exon.zoom,cds.current.zoom)
315 |     exon.zoom.new <-  sort( c(cds.current.zoom,utr.current.zoom) )
316 | 
317 |     cds.id <- unique( queryHits( findOverlaps(exon.zoom.new, cds.current.zoom)) )
318 |     df.exon <- as.data.frame(exon.zoom.new)
319 |     anno.exon <- character(length = length(exon.zoom))
320 |     ## add exon plot if # exon > 0
321 |     if(length(exon.zoom.new) > 0){
322 |       for(i in 1:length(exon.zoom.new)){
323 |         if( i %in% cds.id){
324 |           anno.exon[i] <- paste0("annotate(\"rect\", xmin =",df.exon$start[i] ,", xmax = ",df.exon$end[i] ,", ymin = -0.08*yscale, ymax = -0.02*yscale, alpha = .99, colour = \"black\")" )
325 |         }else{
326 |           anno.exon[i] <- paste0("annotate(\"rect\",xmin =",df.exon$start[i] ,", xmax = ",df.exon$end[i] ,", ymin = -0.06*yscale, ymax = -0.04*yscale, alpha = .99, colour = \"black\")")
327 |         }
328 |       }
329 |     }
330 | 
331 |     ## plot intron when there are more than two exons
332 |     anno.intron <- character(length = max( length(exon.zoom.new)-1, 0 )  )
333 |     if(length(anno.intron)>0){
334 |       for(i in 1:length(anno.intron)){
335 |         anno.intron[i] <- paste0("annotate(\"segment\", x =", df.exon$end[i] ,", xend =", df.exon$start[i+1] ,", y = -0.05*yscale, yend = -0.05*yscale, alpha = .99, colour = \"black\")")
336 |       }
337 |     }
338 |     ## When there is only one exon and zoomIn range spans intron
339 |     if( length(exon.zoom.new) > 0 && start(zoomIn.gr)<start(exon.zoom)[1]){
340 |       anno.intron <- c(paste0("annotate(\"segment\", x =", start(zoomIn.gr) ,", xend =", start(exon.zoom)[1] ,", y = -0.05*yscale, yend = -0.05*yscale, alpha = .99, colour = \"black\")"),
341 |                        anno.intron)
342 |     }
343 |     if( length(exon.zoom.new) > 0 && end(zoomIn.gr) > end(exon.zoom)[length(exon.zoom)] ){
344 |       anno.intron <- c(anno.intron,
345 |                        paste0("annotate(\"segment\", x =", end(exon.zoom)[length(exon.zoom)] ,", xend =", end(zoomIn.gr) ,", y = -0.05*yscale, yend = -0.05*yscale, alpha = .99, colour = \"black\")") )
346 |     }
347 |     ## When there is no exon but zoomIn ranges is in intron
348 |     if( length(exon.zoom.new) == 0 ){
349 |       anno.intron <- c(anno.intron,
350 |                        paste0("annotate(\"segment\", x =", start(zoomIn.gr) ,", xend =", end(zoomIn.gr) ,", y = -0.05*yscale, yend = -0.05*yscale, alpha = .99, colour = \"black\")") )
351 |     }
352 | 
353 |     ## combine intron and exon plots
354 |     if( length(anno.intron) > 0 & length(anno.exon) >0 ){
355 |       p <- paste( paste(anno.exon,collapse = "+"), paste(anno.intron,collapse = "+"), sep = "+")
356 |     }else if( length(anno.exon) >0 ){
357 |       p <- paste(anno.exon,collapse = "+")
358 |     }else{
359 |       p <- paste(anno.intron,collapse = "+")
360 |     }
361 | 
362 |     return(p)
363 |   }
364 | 
365 | }
366 | 


--------------------------------------------------------------------------------
/R/plotMetaGene.R:
--------------------------------------------------------------------------------
  1 | #' @title plotMetaGene
  2 | #' @param peak the data frame of peak in bed12 format.
  3 | #' @param gtf The annotation file.
  4 | #' @import Guitar
  5 | #' @import ggsci
  6 | #' @export
  7 | plotMetaGene <- function(peak, gtf ){
  8 |   feature <- list('peak'=.peakToGRangesList(peak) )
  9 |   txdb <- makeTxDbFromGFF(gtf,format = "gtf")
 10 |   gc_txdb <- .makeGuitarCoordsFromTxDb(txdb, noBins=50)
 11 |   GuitarCoords <- gc_txdb
 12 |   m <- .countGuitarDensity(
 13 |     feature[[1]],
 14 |     GuitarCoords,
 15 |     5)
 16 |   ct = cbind(m,Feature="peak")
 17 |   ct[[4]] <- as.character(ct[[4]])
 18 | 
 19 |   ## make plot_no-fill
 20 |   ct$weight <- ct$count # as numeric
 21 |   ct1 <- ct[ct$category=="mRNA",] # mRNA
 22 |   ct2 <- ct[ct$category=="lncRNA",] # lncRNA
 23 | 
 24 |   d <- mcols(GuitarCoords)
 25 | 
 26 |   pos=Feature=weight=NULL
 27 | 
 28 |   id1 <- which(match(ct1$comp,c("Front","Back")) >0 )
 29 |   ct1 <- ct1[-id1,]
 30 |   id2 <- which(match(ct2$comp,c("Front","Back")) >0 )
 31 |   ct2 <- ct2[-id2,]
 32 | 
 33 |   # normalize feature
 34 |   featureSet <- as.character(unique(ct$Feature))
 35 |   for (i in 1:length(featureSet)) {
 36 |     id <- (ct1$Feature==featureSet[i])
 37 |     ct1$weight[id] <- ct1$weight[id]/sum(ct1$weight[id])
 38 | 
 39 |     id <- (ct2$Feature==featureSet[i])
 40 |     ct2$weight[id] <- ct2$weight[id]/sum(ct2$weight[id])
 41 |   }
 42 |   
 43 |   
 44 |   p2 <-
 45 |     ggplot(ct2, aes(x=pos, weight=weight)) +
 46 |     ggtitle("Distribution on lncRNA")  +
 47 |     xlab("") +
 48 |     ylab("Frequency") +
 49 |     geom_density(adjust=1,aes(fill=factor(Feature),colour=factor(Feature)) ) +
 50 |     annotate("text", x = 0.5, y = -0.25, label = "lncRNA", size = 5)+
 51 |     annotate("rect", xmin = 0, xmax = 1, ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+
 52 |     theme_bw() + theme(axis.ticks = element_blank(), axis.text.x = element_blank(),panel.border = element_blank(), panel.grid.major = element_blank(),
 53 |                        panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),axis.ticks.y = element_line(colour = "black"),
 54 |                        plot.title = element_text(face = "bold",hjust = 0.5, size = 20),
 55 |                        axis.text.y = element_text( color = "black", size = 15),
 56 |                        axis.title.y = element_text( color = "black", size = 15)
 57 |                        )+ scale_color_aaas(name = "Peak(s)")+scale_fill_aaas(name = "Peak(s)")
 58 | 
 59 |   # normalization by length of components in mRNA
 60 |   # calculate relative length of each components
 61 |   temp <- unique(d[,c(1,3,4,5)])
 62 |   id1 <- which(match(temp$comp,c("Front","Back")) >0 )
 63 |   temp <- temp[-id1,] # remove DNA
 64 |   id1 <- which(match(temp$category,"mRNA") >0 )
 65 |   temp <- temp[id1,]
 66 |   temp <- matrix(temp$interval,ncol=3)
 67 |   temp <- temp/rowSums(temp)
 68 |   temp <- colSums(temp)
 69 |   temp <-temp/sum(temp)
 70 |   weight <- temp
 71 |   names(weight) <- c("5'UTR","CDS","3'UTR")
 72 | 
 73 |   # density
 74 |   cds_id <- which(ct1$comp=="CDS")
 75 |   utr3_id <- which(ct1$comp=="UTR3")
 76 |   utr5_id <- which(ct1$comp=="UTR5")
 77 |   ct1$count[utr5_id] <- ct1$count[utr5_id]*weight["5'UTR"]
 78 |   ct1$count[cds_id] <- ct1$count[cds_id]*weight["CDS"]
 79 |   ct1$count[utr3_id] <- ct1$count[utr3_id]*weight["3'UTR"]
 80 | 
 81 |   # re-normalization
 82 |   featureSet <- as.character(unique(ct$Feature))
 83 |   for (i in 1:length(featureSet)) {
 84 |     id <- (ct1$Feature==featureSet[i])
 85 |     ct1$weight[id] <- ct1$count[id]/sum(ct1$count[id])
 86 |   }
 87 |   x <- cumsum(weight)
 88 |   ct1$pos[utr5_id] <- ct1$pos[utr5_id]*weight["5'UTR"] + 0
 89 |   ct1$pos[cds_id] <- ct1$pos[cds_id]*weight["CDS"] + x[1]
 90 |   ct1$pos[utr3_id] <- ct1$pos[utr3_id]*weight["3'UTR"] + x[2]
 91 | 
 92 |   p1 <-
 93 |     ggplot(ct1, aes(x=pos, weight=weight))  +
 94 |     ggtitle("Distribution on mRNA") +
 95 |     xlab("") +
 96 |     ylab("Frequency") +
 97 |     geom_density(adjust=1,aes(fill=factor(Feature),colour=factor(Feature)) ) +
 98 |     annotate("text", x = x[1]/2, y = -0.25, label = "5'UTR", size = 5) +
 99 |     annotate("text", x = x[1] + weight[2]/2, y = -0.28, label = "CDS", size = 5) +
100 |     annotate("text", x = x[2] + weight[3]/2, y = -0.25, label = "3'UTR", size = 5) +
101 |     geom_vline(xintercept= x[1:2], linetype="dotted") +
102 |     annotate("rect", xmin = 0, xmax = x[1], ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+
103 |     annotate("rect", xmin = x[2], xmax = 1, ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+
104 |     annotate("rect", xmin = x[1], xmax = x[2], ymin = -0.16, ymax = -0.04, alpha = .2, colour = "black")+
105 |     theme_bw() + theme(axis.ticks = element_blank(), axis.text.x = element_blank(),panel.border = element_blank(), panel.grid.major = element_blank(),
106 |                        panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),axis.ticks.y = element_line(colour = "black"),
107 |                        plot.title = element_text(face = "bold",hjust = 0.5, size = 20),
108 |                        axis.text.y = element_text( color = "black", size = 15),
109 |                        axis.title.y = element_text( color = "black", size = 15)
110 |     )+ scale_color_aaas(name = "Peak(s)")+scale_fill_aaas(name = "Peak(s)"  )
111 | 
112 |   .multiplot(p1, p2, cols=2)
113 |   
114 |   cat("NOTE this function is a wrapper for R package \"Guitar\".\nIf you use the metaGene plot in publication, please cite the original reference:\nCui et al 2016 BioMed Research International \n")
115 | }
116 | 
117 | 
118 | #' @title plotMEtaGeneMulti A wrapper function for Guitar to plot meta gene plot of multiple samples overlaid on each other.
119 | #' @param peakList The list of peak, each object of list should have a data frame of peak in bed12 format.
120 | #' @param gtf The annotation file for the gene model.
121 | #' @param saveToPDFprefix Set a name to save the plot to PDF.
122 | #' @param includeNeighborDNA Whether to include upstrean and downstream region in the meta gene.
123 | #' @param fill The logic option to chose whether to fill the density curve.
124 | #' @import Guitar
125 | #' @import ggsci
126 | #' @export
127 | plotMetaGeneMulti <- function(peakList,gtf,saveToPDFprefix=NA, fill=FALSE,
128 |                               includeNeighborDNA=FALSE){
129 | 
130 |   gfeature <- lapply(peakList,.peakToGRangesList)
131 |   names(gfeature) <- names(peakList)
132 |   txdb <- makeTxDbFromGFF(gtf,format = "gtf")
133 |   gc_txdb <- .makeGuitarCoordsFromTxDb(txdb, noBins=50)
134 | 
135 |   GuitarPlotNew(gfeature, GuitarCoordsFromTxDb = gc_txdb,saveToPDFprefix=saveToPDFprefix,fill = fill,
136 |              includeNeighborDNA=includeNeighborDNA)
137 |   
138 |   cat("NOTE this function is a wrapper for R package \"Guitar\".\nIf you use the metaGene plot in publication, please cite the original reference:\nCui et al 2016 BioMed Research International \n")
139 |   
140 | }
141 | 


--------------------------------------------------------------------------------
/R/plotPCA.DESeq2.R:
--------------------------------------------------------------------------------
 1 | 
 2 | plotPCA.DESeq2 <- function(data,group = NULL, returnPC = FALSE){
 3 |   if(is.null(group)){
 4 |     colData <- data.frame(group=colnames(data))
 5 |   }else{
 6 |     colData <- data.frame(group=group)
 7 |   }
 8 | 
 9 |   rownames(colData) <- colnames(data)
10 |   dds <- DESeq2::DESeqDataSetFromMatrix(data,colData,design = ~group)
11 |   cat("Using regularized log transformation of DESeq2 to tranform data...\n")
12 |   rld <- DESeq2::rlog(dds)
13 | 
14 |   if(returnPC){
15 |     PCs <- DESeq2::plotPCA(rld,intgroup = "group",returnData=TRUE)
16 |     return(PCs)
17 |   }else{
18 |     cat("Plot PCA using the rlog transformed data...\n")
19 |     DESeq2::plotPCA(rld,intgroup = "group")
20 |   }
21 | 
22 | }
23 | 
24 | #' @title plotPCAfromMatrix
25 | #' @param m The matrix of count data
26 | #' @param group The factor levels to color the samples. Should be the save number as the # of matrix columns
27 | #' @param standardize Logic parameter indicating whether to standardize the count data to have unit variance. The default is TRUE. 
28 | #' @param loglink Logic parameter determine whether to take log of the metrix data. Default is TRUE. If your input matrix is at log scale, use FALSE.
29 | #' @import ggsci
30 | #' @export
31 | plotPCAfromMatrix <- function(m,group, standardize = TRUE,loglink = TRUE ){
32 |   if(loglink){
33 |     mm <- log(m + 1)
34 |   }else{
35 |     mm <- m
36 |   }
37 |   pc <- prcomp(t(mm), scale. = standardize)
38 |   pca.df <- as.data.frame(pc$x)
39 |   vars <- apply(pca.df ,2, var)
40 |   props <- 100*(vars / sum(vars) )
41 |   makeLab <- function(x,pc) paste0("PC",pc,": ",round(x,digits = 2),"% variance")
42 |   ggplot(data = pca.df,aes(PC1,PC2,label = rownames(pca.df),colour=group) )+geom_text()+ xlab(makeLab(props[1],1)) + ylab(makeLab(props[2],2))+
43 |     theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
44 |                        panel.grid.minor = element_blank(), axis.line = element_line(colour = "black",size = 1),
45 |                        axis.title.x=element_text(size=20, color="black", hjust=0.5 ),
46 |                        axis.title.y=element_text(size=20, color = "black", vjust=0.4, angle=90 ),
47 |                        legend.title=element_text(size = 18),legend.text = element_text(size = 18 ),
48 |                        axis.text.x = element_text(size = 18 ) ,axis.text.y = element_text(size = 18 )  )
49 | 
50 | }
51 | 
52 | 


--------------------------------------------------------------------------------
/R/qqplot.pvalue.R:
--------------------------------------------------------------------------------
 1 | #' @title qqplot.pvalue
 2 | #' @param x can be a vector (p value of one group) or a list of vector (p value of multiple groups).
 3 | #' @param pointSize The size of data points
 4 | #' @param legendSize The size of points in the legend
 5 | #' @export
 6 | qqplot.pvalue <- function(x,pointSize = 1,legendSize = 4){
 7 |   library(ggplot2)
 8 |   if(is.list(x)){
 9 |     nn<-sapply(x, length)
10 |     rs<-cumsum(nn)
11 |     re<-rs-nn+1
12 |     n<-min(nn)
13 |     if (!is.null(names(x))) {
14 |       grp=factor(rep(names(x), nn), levels=names(x))
15 |       names(x)<-NULL
16 |     } else {
17 |       grp=factor(rep(1:length(x), nn))
18 |     }
19 |     pvo<-x
20 |     pvalues<-numeric(sum(nn))
21 |     exp.x<-numeric(sum(nn))
22 |     for(i in 1:length(pvo)) {
23 |       pvalues[rs[i]:re[i]] <- -log10(pvo[[i]])
24 |       exp.x[rs[i]:re[i]] <- -log10((rank(pvo[[i]], ties.method="first")-.5)/nn[i])
25 |     }
26 |     thin <- unique(data.frame(pvalues = round(pvalues, 3),
27 |                               exp.x = round(exp.x, 3),
28 |                               grp=grp))
29 |     grp = thin$grp
30 |     pvalues <- thin$pvalues
31 |     exp.x <- thin$exp.x
32 |     qq.melt <- data.frame(expected=exp.x, Observed = pvalues, label = grp)
33 |     ggplot(data = qq.melt, aes(expected, Observed,colour = label) )+geom_point(size = pointSize)+geom_abline(slope = 1)+xlab("Expected(-log10 p-value)")+ylab("Observed(-log10 p-value)")+
34 |       theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
35 |                          panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))+
36 |       guides(color = guide_legend(override.aes = list(size=legendSize)))
37 | 
38 |   }else{
39 |     expected <- -log10(seq(0,1,1/(length(x)-1 ) ))
40 |     p <- -log10(sort(x))
41 | 
42 |     ## remove some
43 |     tail <- which(p>quantile(x,0.75) )
44 |     head <-seq(1,length(which(p<=quantile(x,0.75) )),10 )
45 |     qq.data <- data.frame(expected=c(expected[head],expected[tail]), Observed=c(p[head],p[tail]) )
46 |     ggplot(data = qq.data, aes(expected, Observed) )+geom_point(size = 1.5,colour = "blue")+geom_abline(slope = 1)+xlab("Expected(-log10 p-value)")+ylab("Observed(-log10 p-value)")+
47 |       theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
48 |                          panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
49 | 
50 |   }
51 | }
52 | 


--------------------------------------------------------------------------------
/R/randomPeak.R:
--------------------------------------------------------------------------------
 1 | ## the main function to generate fixed size random sampled windows on transcriptome given annotation GTF file
 2 | randomPeaks = function(size,gtf){
 3 |   library(GenomicFeatures)
 4 | 
 5 |   geneModel = gtfToGeneModel(gtf)
 6 |   no.genes = length(geneModel)
 7 |   bed12=data.frame() # initiate the bed12 data frame to store the peaks
 8 |   pb <-  txtProgressBar(min = 1, max = no.genes, style = 3)
 9 |   for(i in 1:no.genes){
10 |     # DNA location to gene location conversion
11 |     df.geneModel= as.data.frame(reduce(geneModel[i][[1]]) )##data frame of gene model
12 |     dna.range = as.data.frame(range(geneModel[i]))
13 |     df.geneModel$end = df.geneModel$end - dna.range$start + 1
14 |     df.geneModel$start = df.geneModel$start - dna.range$start + 1
15 |     DNA2RNA = rep(0,dna.range$end - dna.range$start +1)
16 |     no.exon = dim(df.geneModel)[1]
17 |     for (j in 1:no.exon){DNA2RNA[df.geneModel$start[j]:df.geneModel$end[j]]=1}
18 |     exon.length = sum(DNA2RNA)
19 |     DNA2RNA=cumsum(DNA2RNA)*DNA2RNA
20 | 
21 |     #creat a corresponding map from RNA to DNA
22 |     RNA2DNA = 1:exon.length
23 |     pointer = 1
24 |     for (j in 1:no.exon){
25 |       RNA2DNA[pointer:(pointer+df.geneModel$width[j]-1) ]= RNA2DNA[pointer:(pointer+df.geneModel$width[j]-1)] + df.geneModel$start[j] -pointer
26 |       pointer = pointer + df.geneModel$width[j]
27 |     }
28 |     RNA2DNA = RNA2DNA + dna.range$start -1 #back to chromosome coordinates
29 | 
30 |     no.peak.to.sample = round(exon.length/1000)
31 |     peaks.rna = ceiling(runif(no.peak.to.sample,size/2+1,exon.length-size/2-1))
32 | 
33 |     if(no.peak.to.sample > 0 ){
34 |       peaks.dna = as.data.frame ( t(sapply(peaks.rna,function(x,RNA2DNA,size,strand,chrom){
35 |         left = RNA2DNA[x-size/2]
36 |         right = RNA2DNA[x + size/2]
37 |         return(c(chrom,left,right,strand) )
38 |       },RNA2DNA = RNA2DNA,size = size,strand = as.character(dna.range$strand), chrom =as.character(dna.range$seqnames))  ) )
39 |       colnames(peaks.dna)=c("chr","start","end","strand")
40 |       peak.gr = makeGRangesFromDataFrame( peaks.dna )
41 |       for(j in 1:no.peak.to.sample){
42 |         tmp = GenomicRanges::intersect(peak.gr[j],reduce(geneModel[i][[1]]) )
43 |         bed.tmp = data.frame(matrix(nrow=1,ncol=12))
44 |         colnames(bed.tmp)=c("chr","start","end","name","score","strand","thickStart","thickEnd","itemRgb","blockCount","blockSize","blockStart")
45 |         bed.tmp["chr"] = unique(as.character(seqnames(tmp)))
46 |         bed.tmp[c(2,3)] = as.data.frame(range(tmp))[1,c(2,3)]
47 |         bed.tmp["strand"]= unique(as.character(strand(tmp)))
48 |         bed.tmp["name"] = names(geneModel)[i]
49 |         bed.tmp["score"] = 1
50 |         bed.tmp[c("thickStart","thickEnd")] = bed.tmp[c(2,3)]
51 |         bed.tmp["itemRgb"] = NA
52 |         bed.tmp["blockCount"] = length(tmp)
53 |         bed.tmp["blockSize"] = paste(as.data.frame(tmp)[,4],collapse=",")
54 |         bed.tmp["blockStart"] = paste(as.data.frame(tmp)[,2] - rep(as.numeric(bed.tmp[2]),as.numeric(bed.tmp["blockCount"])),collapse=",")
55 |         bed12 = rbind(bed12,bed.tmp)
56 |       }
57 | 
58 |     }
59 | 
60 |     setTxtProgressBar(pb, i)
61 |   }
62 |   return(bed12)
63 | }
64 | 
65 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MeRIPtools
 2 | Tool sets to analyze high throughput data for RNA modifications
 3 | 
 4 | ### Install the R package from Github
 5 | 
 6 | Depends: GenomicFeatures, Rsamtools, ggplot2, doParallel, foreach,grid,rtracklayer,GenomicAlignments,reshape2,Rcpp,RcppArmadillo,
 7 | Guitar, stringr,vcfR,gamlss, broom, DESeq2
 8 | 
 9 | 	install.packages("devtools")
10 | 	library(devtools)
11 | 	install_github("scottzijiezhang/MeRIPtools")
12 | 	library("MeRIPtools")
13 | 
14 | ## Manual page
15 | 
16 | Please refer to [manual page](https://scottzijiezhang.github.io/MeRIPtoolsManual/) for detailed instructions.  
17 | 
18 | ## Citation 
19 | MeRIPtools is a tool sets that implemented functions for peak calling, QTL calling, differential methylation analysis, visualization. 
20 | 
21 | If you used MeRIPtools in your publication, please cite:
22 | Zhang, Z., Luo, K., Zou, Z. et al. Genetic analyses support the contribution of mRNA N6-methyladenosine (m6A) modification to human disease heritability. _Nat Genet_ 52, 939–949 (2020). https://doi.org/10.1038/s41588-020-0644-z
23 | 
24 | **Note** MeRIPtools also have wrapper functions to call functions from other R packages to do specific analysis.  
25 | If you used the `plotMetaGene` or `MetaGene` function, please cite the original R package [`Guitar`](https://bioconductor.org/packages/release/bioc/html/Guitar.html)  
26 | Cui X, Wei Z, Zhang L, Liu H, Sun L, Zhang s, Huang Y, Meng J (2016). “Guitar: an R/Bioconductor package for gene annotation guided transcriptomic analysis of RNA related genomic features.” BioMed Research International. 
27 | 


--------------------------------------------------------------------------------
/man/BetaBinTest.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{BetaBinTest}
 5 | \alias{BetaBinTest}
 6 | \title{BetaBinTest}
 7 | \usage{
 8 | \S4method{BetaBinTest}{MeRIP.Peak}(object, AdjIPeffi = TRUE,
 9 |   AdjustGC = FALSE, BSgenome = BSgenome.Hsapiens.UCSC.hg38)
10 | }
11 | \arguments{
12 | \item{object}{The MeRIP.Peak object}
13 | 
14 | \item{AdjIPeffi}{Logic option determining whether to adjust overall IP efficiency, default is TRUE.}
15 | 
16 | \item{AdjustGC}{Logic option determining whether to adjust GC bias in each bins, default is FALSE.}
17 | 
18 | \item{BSgenome}{The BSgenome object that will used for calculating GC content for each bin if AdjustGC = TRUE.}
19 | }
20 | \description{
21 | Perform inferential test using beta-binomial regression model.
22 | }
23 | 


--------------------------------------------------------------------------------
/man/GuitarPlotNew.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/Guitar_functions.R
 3 | \name{GuitarPlotNew}
 4 | \alias{GuitarPlotNew}
 5 | \title{GuitarPlotNew}
 6 | \usage{
 7 | GuitarPlotNew(gfeatures, GuitarCoordsFromTxDb = NA, txdb = NA,
 8 |   genome = NA, noBins = 10, saveToPDFprefix = NA,
 9 |   returnCount = FALSE, includeNeighborDNA = FALSE,
10 |   maximalFeatureAmbiguity = 5, rescaleComponent = TRUE, fill = FALSE,
11 |   adjust = 1)
12 | }
13 | \description{
14 | GuitarPlotNew
15 | }
16 | 


--------------------------------------------------------------------------------
/man/IP.files.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/AllGenerics.R, R/methods.R
 3 | \docType{methods}
 4 | \name{IP.files}
 5 | \alias{IP.files}
 6 | \alias{IP.files,MeRIP-method}
 7 | \title{IP.files}
 8 | \usage{
 9 | IP.files(object)
10 | 
11 | \S4method{IP.files}{MeRIP}(object)
12 | }
13 | \arguments{
14 | \item{object}{The MeRIP object}
15 | }
16 | \description{
17 | Extract path to IP BAM files.
18 | }
19 | 


--------------------------------------------------------------------------------
/man/Input.files.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/AllGenerics.R, R/methods.R
 3 | \docType{methods}
 4 | \name{Input.files}
 5 | \alias{Input.files}
 6 | \alias{Input.files,MeRIP-method}
 7 | \title{Input.files}
 8 | \usage{
 9 | Input.files(object)
10 | 
11 | \S4method{Input.files}{MeRIP}(object)
12 | }
13 | \arguments{
14 | \item{object}{The MeRIP object}
15 | }
16 | \description{
17 | Extract path to INPUT BAM files.
18 | }
19 | 


--------------------------------------------------------------------------------
/man/MetaGene.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \name{MetaGene}
 4 | \alias{MetaGene}
 5 | \title{MetaGene}
 6 | \usage{
 7 | MetaGene(MeRIP.Peak)
 8 | }
 9 | \arguments{
10 | \item{MeRIP.Peak}{The MeRIP.Peak object}
11 | }
12 | \description{
13 | MetaGene
14 | }
15 | 


--------------------------------------------------------------------------------
/man/PrepCoveragePlot-MeRIP.Peak-method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{PrepCoveragePlot,MeRIP.Peak-method}
 5 | \alias{PrepCoveragePlot,MeRIP.Peak-method}
 6 | \title{Prepare coverage plot}
 7 | \usage{
 8 | \S4method{PrepCoveragePlot}{MeRIP.Peak}(object, gtf = NULL)
 9 | }
10 | \arguments{
11 | \item{object}{The MeRIP object}
12 | 
13 | \item{gtf}{optional gtf file if the stored path to gtf file has changed.}
14 | }
15 | \description{
16 | import GTF into the MeRIP object for plot
17 | }
18 | 


--------------------------------------------------------------------------------
/man/QNBtest.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{QNBtest}
 5 | \alias{QNBtest}
 6 | \title{Wraper function to use QNB package to test for differential peaks.}
 7 | \usage{
 8 | \S4method{QNBtest}{MeRIP.Peak}(object)
 9 | }
10 | \arguments{
11 | \item{object}{The MeRIP.Peak object}
12 | }
13 | \description{
14 | Wraper function to use QNB package to test for differential peaks.
15 | }
16 | 


--------------------------------------------------------------------------------
/man/QTL_BetaBin.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/QTL_betaBinomial.R
 3 | \name{QTL_BetaBin}
 4 | \alias{QTL_BetaBin}
 5 | \title{QTL_BetaBin}
 6 | \usage{
 7 | QTL_BetaBin(MeRIPdata, vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19,
 8 |   testWindow = 1e+05, Chromosome, Range = NULL, Covariates = NULL,
 9 |   AdjustGC = TRUE, AdjIPeffi = TRUE, PCsToInclude = 0,
10 |   normalizeGenotype = FALSE, thread = 1)
11 | }
12 | \arguments{
13 | \item{MeRIPdata}{The MeRIP.Peak object}
14 | 
15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!}
16 | 
17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.}
18 | 
19 | \item{testWindow}{Integer. Test SNPs in <testWindow> bp window flanking the peak.}
20 | 
21 | \item{Chromosome}{The chromsome to run QTL test.}
22 | 
23 | \item{Range}{The position range on a chromosome to test.}
24 | 
25 | \item{Covariates}{The matrix for covariates to be included in the test.}
26 | 
27 | \item{AdjustGC}{Logic. Choose whether explicitly adjust GC bias.}
28 | 
29 | \item{AdjIPeffi}{Logic. Choose whether explicitly adjust overall IP efficiency}
30 | 
31 | \item{normalizeGenotype}{Logic. Choose whether genotype is normalized to mean = 0, var = 1 before regression.}
32 | }
33 | \description{
34 | QTL_BetaBin
35 | }
36 | 


--------------------------------------------------------------------------------
/man/QTL_BetaBin2.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/QTL_betaBinomial2.R
 3 | \name{QTL_BetaBin2}
 4 | \alias{QTL_BetaBin2}
 5 | \title{QTL_BetaBin2}
 6 | \usage{
 7 | QTL_BetaBin2(MeRIPdata, vcf_file, BSgenome = BSgenome.Hsapiens.UCSC.hg19,
 8 |   testWindow = 1e+05, Chromosome, Range = NULL, Covariates = NULL,
 9 |   AdjustGC = TRUE, PCsToInclude = 0, normalizeGenotype = FALSE,
10 |   thread = 1)
11 | }
12 | \arguments{
13 | \item{MeRIPdata}{The MeRIP.Peak object}
14 | 
15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!}
16 | 
17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.}
18 | 
19 | \item{testWindow}{Integer. Test SNPs in <testWindow> bp window flanking the peak.}
20 | 
21 | \item{Chromosome}{The chromsome to run QTL test.}
22 | 
23 | \item{Range}{The position range on a chromosome to test.}
24 | 
25 | \item{Covariates}{The matrix for covariates to be included in the test.}
26 | 
27 | \item{AdjustGC}{Logic. Choose whether explicitly adjust GC bias.}
28 | 
29 | \item{normalizeGenotype}{Logic. Choose whether genotype is normalized to mean = 0, var = 1 before regression.}
30 | }
31 | \description{
32 | Same as QTL_BetaBin except sizeFactor estimated by Median of ratio method was used to scale library size instead of total coverage.
33 | }
34 | 


--------------------------------------------------------------------------------
/man/QTL_BetaBin_permute.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/QTL_betaBinomial_permute.R
 3 | \name{QTL_BetaBin_permute}
 4 | \alias{QTL_BetaBin_permute}
 5 | \title{QTL_BetaBin_permute}
 6 | \usage{
 7 | QTL_BetaBin_permute(MeRIPdata, vcf_file,
 8 |   BSgenome = BSgenome.Hsapiens.UCSC.hg19, testWindow = 1e+05,
 9 |   Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE,
10 |   AdjIPeffi = TRUE, PCsToInclude = 0, thread = 1, Nround = 1)
11 | }
12 | \arguments{
13 | \item{MeRIPdata}{The MeRIP.Peak object}
14 | 
15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!}
16 | 
17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.}
18 | 
19 | \item{testWindow}{Integer. Test SNPs in <testWindow> bp window flanking the peak.}
20 | 
21 | \item{Chromosome}{The chromsome to run QTL test.}
22 | 
23 | \item{Range}{The position range on a chromosome to test.}
24 | 
25 | \item{Covariates}{The matrix for covariates to be included in the test.}
26 | }
27 | \description{
28 | QTL_BetaBin_permute
29 | }
30 | 


--------------------------------------------------------------------------------
/man/QTL_PoissonGamma.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/QTL_PoissonGamma.R
 3 | \name{QTL_PoissonGamma}
 4 | \alias{QTL_PoissonGamma}
 5 | \title{QTL_PoissonGamma}
 6 | \usage{
 7 | QTL_PoissonGamma(pheno, vcf_file, peak_bed, testWindow = 1e+05,
 8 |   Chromosome, Range = NULL, Covariates = NULL, maxPsi = 100,
 9 |   thread = 1)
10 | }
11 | \arguments{
12 | \item{pheno}{The phenotype data matrix. Needs to be IP read count that has been normalized for expression level.}
13 | 
14 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!}
15 | 
16 | \item{peak_bed}{The peak file in BED12 format that needs to correspond to phenotype data matrix.}
17 | 
18 | \item{testWindow}{Integer. Test SNPs in <testWindow> bp window flanking the peak.}
19 | 
20 | \item{Chromosome}{The chromsome to run QTL test.}
21 | 
22 | \item{Range}{The position range on a chromosome to test.}
23 | 
24 | \item{Covariates}{The matrix for covariates to be included in the test.}
25 | 
26 | \item{maxPsi}{The max estimation for the random effect parameter Psi.}
27 | }
28 | \description{
29 | QTL_PoissonGamma
30 | }
31 | 


--------------------------------------------------------------------------------
/man/RADARtest.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{RADARtest}
 5 | \alias{RADARtest}
 6 | \title{RADARtest}
 7 | \usage{
 8 | \S4method{RADARtest}{MeRIP.Peak}(object, exclude = NULL, maxPsi = 100)
 9 | }
10 | \arguments{
11 | \item{object}{The MeRIP.Peak object}
12 | 
13 | \item{exclude}{A vector to specify sample names to be excluded in the test.}
14 | 
15 | \item{maxPsi}{The max random effect parameter Psi}
16 | }
17 | \description{
18 | Perform inferential test using Poisson Random effect model in RADAR package
19 | }
20 | 


--------------------------------------------------------------------------------
/man/adjustExprLevel-MeRIP.Peak-method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{adjustExprLevel,MeRIP.Peak-method}
 5 | \alias{adjustExprLevel,MeRIP.Peak-method}
 6 | \title{adjustExprLevel}
 7 | \usage{
 8 | \S4method{adjustExprLevel}{MeRIP.Peak}(object, adjustBy = "geneSum")
 9 | }
10 | \arguments{
11 | \item{object}{The MeRIP.Peak object that has been normalized for library size.}
12 | 
13 | \item{adjustBy}{By default, adjust post-IP count by INPUT geneSum. Can also choose "pos" to use current position count to adjust for expression level.}
14 | }
15 | \value{
16 | object The MeRIP.Peak object now with IP-count adjusted for expression level.
17 | }
18 | \description{
19 | adjustExprLevel
20 | }
21 | 


--------------------------------------------------------------------------------
/man/annotatePeak-MeRIP.Peak-method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{annotatePeak,MeRIP.Peak-method}
 5 | \alias{annotatePeak,MeRIP.Peak-method}
 6 | \title{annotatePeak}
 7 | \usage{
 8 | \S4method{annotatePeak}{MeRIP.Peak}(object, threads = 1)
 9 | }
10 | \arguments{
11 | \item{MeRIP.Peak}{The MeRIP.Peak object}
12 | }
13 | \description{
14 | annotatePeak
15 | }
16 | 


--------------------------------------------------------------------------------
/man/callPeakBinomial.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \name{callPeakBinomial}
 4 | \alias{callPeakBinomial}
 5 | \title{callPeakBinomial}
 6 | \usage{
 7 | callPeakBinomial(MeRIP, min_counts = 15, peak_cutoff_fdr = 0.05,
 8 |   peak_cutoff_oddRatio = 1, threads = 1)
 9 | }
10 | \arguments{
11 | \item{MeRIP}{The MeRIP object.}
12 | 
13 | \item{min_counts}{The minimal number of reads (input + IP) required in a bin to be called a peak.}
14 | 
15 | \item{peak_cutoff_fdr}{The cutoff of fdr to call a bin peak.}
16 | 
17 | \item{peak_cutoff_oddRatio}{The minimal oddRatio (IP/input) threshold to call a peak.}
18 | 
19 | \item{threads}{The number of threads to use. Default uses 1 threads.}
20 | }
21 | \description{
22 | callPeakBinomial
23 | }
24 | 


--------------------------------------------------------------------------------
/man/callPeakFisher.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \name{callPeakFisher}
 4 | \alias{callPeakFisher}
 5 | \title{callPeakFisher}
 6 | \usage{
 7 | callPeakFisher(MeRIP, min_counts = 15, peak_cutoff_fdr = 0.05,
 8 |   peak_cutoff_oddRatio = 1, threads = 1)
 9 | }
10 | \arguments{
11 | \item{MeRIP}{The MeRIP object from countReads function.}
12 | 
13 | \item{min_counts}{The minimal number of reads present in a bin to be called a peak.}
14 | 
15 | \item{peak_cutoff_fdr}{The cutoff of fdr of fisher's exact test to call peak.}
16 | 
17 | \item{peak_cutoff_oddRatio}{The minimal oddRatio of fisher's exact test to call peak.}
18 | 
19 | \item{threads}{The number of threads to use. Default uses 1 threads.}
20 | }
21 | \description{
22 | callPeakFisher
23 | }
24 | 


--------------------------------------------------------------------------------
/man/consistentPeak-MeRIP.Peak-method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{consistentPeak,MeRIP.Peak-method}
 5 | \alias{consistentPeak,MeRIP.Peak-method}
 6 | \title{consistentPeak}
 7 | \usage{
 8 | \S4method{consistentPeak}{MeRIP.Peak}(object, samplenames = NULL,
 9 |   joint_threshold = NA, threads = 1)
10 | }
11 | \arguments{
12 | \item{object}{The MeRIP.Peak object contain peak calling result.}
13 | 
14 | \item{samplenames}{The samplenames to be reported for consistent peaks.}
15 | 
16 | \item{joint_threshold}{Define the number of sample required to have consistent peak in a locus to call consistent peak in a group.}
17 | 
18 | \item{threads}{The number of threads to use.}
19 | }
20 | \value{
21 | Peak consistent across specified samples at specified joint_threshod. If joint_threshold not specified, report consistent peaks across all samples specified. If no sample specified, report consistent peak across all samples.
22 | }
23 | \description{
24 | consistentPeak
25 | }
26 | 


--------------------------------------------------------------------------------
/man/countReads.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/countReads.R
 3 | \name{countReads}
 4 | \alias{countReads}
 5 | \title{countReads}
 6 | \usage{
 7 | countReads(samplenames, gtf, fragmentLength = 150, bamFolder,
 8 |   outputDir = NA, modification = "m6A", binSize = 50,
 9 |   strandToKeep = "opposite", paired = FALSE, threads = 1,
10 |   saveOutput = T)
11 | }
12 | \arguments{
13 | \item{samplenames}{The names of each sample (prefix for bam files).}
14 | 
15 | \item{gtf}{The gtf format gene annotation file}
16 | 
17 | \item{fragmentLength}{The RNA fragment length (insert size of the library).}
18 | 
19 | \item{bamFolder}{Path to the folder where bam file locates}
20 | 
21 | \item{outputDir}{The directory to save output files}
22 | 
23 | \item{modification}{The modification used to name the BAM files.}
24 | 
25 | \item{binSize}{The size of consecutive bins to slice the transcripts}
26 | 
27 | \item{strandToKeep}{According to library preparation protocol, choose which strand to count. Stranded RNA library usually seq the "ooposite" strand. Small RNA library seq the "same" strand.}
28 | 
29 | \item{paired}{Logical indicating whether the input bam files are from paired end sequencing. Default is FALSE. If using paired end data, the read length will be estimated from the data and only good mate are counted.}
30 | 
31 | \item{threads}{The number of threads to use for hyperthreading}
32 | 
33 | \item{saveOutput}{Logical option indicating whether to save output as an RDS file.}
34 | }
35 | \description{
36 | This is the very first function in MeRIP-seq data analysis that initianize a `MeRIP` object. This function takes BAM files of Input/IP library of each samples as input and use given GTF file as gene annotation to divide genes into consecutive bins of user defined size.
37 | }
38 | 


--------------------------------------------------------------------------------
/man/counts.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/AllGenerics.R, R/methods.R
 3 | \docType{methods}
 4 | \name{counts,MeRIP-method}
 5 | \alias{counts,MeRIP-method}
 6 | \alias{counts,MeRIP.Peak-method}
 7 | \title{counts}
 8 | \usage{
 9 | \S4method{counts}{MeRIP}(object)
10 | 
11 | \S4method{counts}{MeRIP.Peak}(object)
12 | }
13 | \arguments{
14 | \item{object}{The MeRIP object}
15 | }
16 | \description{
17 | Extract all (Input + IP) read count of the MeRIP object.
18 | }
19 | 


--------------------------------------------------------------------------------
/man/dot-Bino_test.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \name{.Bino_test}
 4 | \alias{.Bino_test}
 5 | \title{Binomial test}
 6 | \usage{
 7 | .Bino_test(IP, input, IP_overall, input_overall, pseudo_count = 1)
 8 | }
 9 | \value{
10 | data frame with p-values and odds ratio
11 | }
12 | \description{
13 | Binomial test
14 | }
15 | 


--------------------------------------------------------------------------------
/man/geneBins-MeRIP-method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{geneBins,MeRIP-method}
 5 | \alias{geneBins,MeRIP-method}
 6 | \title{geneBins}
 7 | \usage{
 8 | \S4method{geneBins}{MeRIP}(object)
 9 | }
10 | \arguments{
11 | \item{object}{The MeRIP object}
12 | }
13 | \value{
14 | data.frame
15 | }
16 | \description{
17 | Extract the data.frame that maps the consecutive bins to genes.
18 | }
19 | 


--------------------------------------------------------------------------------
/man/geneExpression-MeRIP.Peak-method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{geneExpression,MeRIP.Peak-method}
 5 | \alias{geneExpression,MeRIP.Peak-method}
 6 | \title{geneExpression}
 7 | \usage{
 8 | \S4method{geneExpression}{MeRIP.Peak}(object)
 9 | }
10 | \arguments{
11 | \item{object}{The MeRIP object}
12 | }
13 | \description{
14 | Extract gene level expression (RNAseq) data in normalized read counts
15 | }
16 | 


--------------------------------------------------------------------------------
/man/geneExpression.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{geneExpression}
 5 | \alias{geneExpression}
 6 | \title{extractor for RNAseq data}
 7 | \usage{
 8 | \S4method{geneExpression}{MeRIP.Peak}(object)
 9 | }
10 | \arguments{
11 | \item{object}{The MeRIP object}
12 | }
13 | \description{
14 | Extract gene level expression (RNAseq) data in normalized read counts
15 | }
16 | 


--------------------------------------------------------------------------------
/man/geneExpressionTMP.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{geneExpressionTMP}
 5 | \alias{geneExpressionTMP}
 6 | \title{getTPM from geneSum}
 7 | \usage{
 8 | \S4method{geneExpressionTMP}{MeRIP.Peak}(object,
 9 |   meanFragmentLength = 150, normalize = T)
10 | }
11 | \arguments{
12 | \item{object}{The MeRIP.Peak object}
13 | 
14 | \item{meanFragmentLength}{The mean length of RNA fragment (insert of RNA library). Default is 150bp.}
15 | 
16 | \item{normalize}{Logical indicating whether normalized TPM or raw TPM should be returned.}
17 | }
18 | \description{
19 | getTPM from geneSum
20 | }
21 | 


--------------------------------------------------------------------------------
/man/get_peak_logOR.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/get_peak_logOR.R
 3 | \name{get_peak_logOR}
 4 | \alias{get_peak_logOR}
 5 | \title{get_peak_logOR}
 6 | \usage{
 7 | get_peak_logOR(MeRIPdata, vcf_file = NULL,
 8 |   BSgenome = BSgenome.Hsapiens.UCSC.hg19, AdjustGC = TRUE,
 9 |   AdjIPeffi = TRUE, thread = 1)
10 | }
11 | \arguments{
12 | \item{MeRIPdata}{The MeRIP.Peak object containing peak calling result.}
13 | 
14 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!}
15 | 
16 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.}
17 | 
18 | \item{AdjustGC}{Logic. Choose whether explicitly adjust GC bias (default: TRUE).}
19 | 
20 | \item{AdjIPeffi}{Logic. Choose whether explicitly adjust overall IP efficiency (default: TRUE).}
21 | }
22 | \description{
23 | Compute peak logOR, adjust GC bias and IP efficiency for m6A QTL analysis
24 | }
25 | 


--------------------------------------------------------------------------------
/man/gtfToGeneModel.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/gtfToGeneModel.R
 3 | \name{gtfToGeneModel}
 4 | \alias{gtfToGeneModel}
 5 | \title{gtfToGeneModel}
 6 | \usage{
 7 | gtfToGeneModel(gtf)
 8 | }
 9 | \arguments{
10 | \item{gtf}{gtf file to build gene model}
11 | }
12 | \value{
13 | geneModel A genomic range list containing gene model
14 | }
15 | \description{
16 | to remove ambiguous gene model and return gene model as genomic ranges object
17 | }
18 | 


--------------------------------------------------------------------------------
/man/jointPeakCount.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \name{jointPeakCount}
 4 | \alias{jointPeakCount}
 5 | \title{jointPeakCount}
 6 | \usage{
 7 | jointPeakCount(MeRIPdata)
 8 | }
 9 | \arguments{
10 | \item{MeRIPdata}{The data list as output of callPeakFisher()}
11 | }
12 | \value{
13 | Returns the MeRIP object with joint peak read count for Input and IP stored.
14 | }
15 | \description{
16 | Extract read count of joint peaks for each sample and store the read counts in the MeRIP.Peak object.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/normalizeLibrary-MeRIP.Peak-method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{normalizeLibrary,MeRIP.Peak-method}
 5 | \alias{normalizeLibrary,MeRIP.Peak-method}
 6 | \title{normalizeLibrary}
 7 | \usage{
 8 | \S4method{normalizeLibrary}{MeRIP.Peak}(object)
 9 | }
10 | \arguments{
11 | \item{object}{MeRIP.Peak object.}
12 | }
13 | \description{
14 | Normalized the input as RNA-seq data and normalize IP by enrichment. Specifically, we normalize ip libraries sizes so that the geometry mean of enrichment are the same.
15 | }
16 | 


--------------------------------------------------------------------------------
/man/peakDistribution.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{peakDistribution}
 5 | \alias{peakDistribution}
 6 | \title{plot distribution of peaks on gene annotation}
 7 | \usage{
 8 | \S4method{peakDistribution}{MeRIP.Peak}(object)
 9 | }
10 | \arguments{
11 | \item{object}{The MeRIP.Peak object}
12 | 
13 | \item{saveName}{the file name to save ditribution plot}
14 | }
15 | \description{
16 | plot distribution of peaks on gene annotation
17 | }
18 | 


--------------------------------------------------------------------------------
/man/plotGene.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/PlotGene.R
 3 | \name{plotGene}
 4 | \alias{plotGene}
 5 | \title{old coverage plot function.}
 6 | \usage{
 7 | plotGene(IP_BAM, INPUT_BAM, size.IP, size.INPUT, geneName, geneModel,
 8 |   libraryType = "opposite", center = mean, GTF, ZoomIn = NULL)
 9 | }
10 | \arguments{
11 | \item{IP_BAM}{The bam files for IP samples}
12 | 
13 | \item{INPUT_BAM}{The bam files for INPUT samples}
14 | 
15 | \item{size.IP}{The size factor for IP libraries}
16 | 
17 | \item{size.INPUT}{The size factor for INPUT libraries}
18 | 
19 | \item{geneName}{The name (as defined in gtf file) of the gene you want to plot}
20 | 
21 | \item{geneModel}{The gene model generated by gtfToGeneModel() function}
22 | 
23 | \item{libraryType}{"opposite" for mRNA stranded library, "same" for samll RNA library}
24 | 
25 | \item{GTF}{gtf annotation as GRanges object. Can be obtained by GTF <- rtracklayer::import("xxx.gtf",format = "gtf")}
26 | }
27 | \description{
28 | old coverage plot function.
29 | }
30 | 


--------------------------------------------------------------------------------
/man/plotGeneCov-MeRIP.Peak-method.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{plotGeneCov,MeRIP.Peak-method}
 5 | \alias{plotGeneCov,MeRIP.Peak-method}
 6 | \title{plotGeneCov}
 7 | \usage{
 8 | \S4method{plotGeneCov}{MeRIP.Peak}(object, geneName,
 9 |   libraryType = "opposite", center = mean, ZoomIn = NULL,
10 |   adjustExprLevel = F, adjustExpr_peak_range = NULL)
11 | }
12 | \arguments{
13 | \item{object}{The data list from countReads and other analysis.}
14 | 
15 | \item{geneName}{The gene symbol to be ploted.}
16 | 
17 | \item{libraryType}{Specify whether the library is the same or opposite strand of the original RNA molecule. Default is "opposite".}
18 | 
19 | \item{center}{Specify the method to calculate average coverage of each group. Could be mean or median.}
20 | 
21 | \item{ZoomIn}{c(start,end) The coordinate to zoom in at the gene to be ploted.}
22 | 
23 | \item{adjustExprLevel}{logical parameter. Specify whether normalize the two group so that they have similar expression level.}
24 | 
25 | \item{GTF}{The GRanges object containing gtf annotation. Can obtain by rtracklayer::import("file.gtf", format= "gtf").}
26 | }
27 | \description{
28 | plotGeneCov
29 | }
30 | 


--------------------------------------------------------------------------------
/man/plotGeneCoverage.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plotGeneCov.R
 3 | \name{plotGeneCoverage}
 4 | \alias{plotGeneCoverage}
 5 | \title{plotGeneCoverage}
 6 | \usage{
 7 | plotGeneCoverage(IP_BAMs, INPUT_BAMs, size.IP, size.INPUT, X, geneName,
 8 |   geneModel, libraryType = "opposite", center = mean, GTF,
 9 |   ZoomIn = NULL, adjustExprLevel = FALSE,
10 |   adjustExpr_peak_range = NULL, plotSNP = NULL)
11 | }
12 | \arguments{
13 | \item{size.IP}{The size factor for IP libraries}
14 | 
15 | \item{size.INPUT}{The size factor for INPUT libraries}
16 | 
17 | \item{geneName}{The name (as defined in gtf file) of the gene you want to plot}
18 | 
19 | \item{geneModel}{The gene model generated by gtfToGeneModel() function}
20 | 
21 | \item{libraryType}{"opposite" for mRNA stranded library, "same" for samll RNA library}
22 | 
23 | \item{GTF}{gtf annotation as GRanges object. Can be obtained by GTF <- rtracklayer::import("xxx.gtf",format = "gtf")}
24 | 
25 | \item{adjustExprLevel}{Logic parameter determining whether adjust coverage so that input are at "same" expression level.}
26 | 
27 | \item{plotSNP}{The option to plot SNP on the figure. Null by default. If want to include SNP in the plot, the parameter needs to ba a dataframe like this:  data.frame(loc= position, anno="A/G")}
28 | 
29 | \item{IP_BAM}{The bam files for IP samples}
30 | 
31 | \item{INPUT_BAM}{The bam files for INPUT samples}
32 | }
33 | \description{
34 | plotGeneCoverage
35 | }
36 | 


--------------------------------------------------------------------------------
/man/plotGeneMonster.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/PlotGene.R
 3 | \name{plotGeneMonster}
 4 | \alias{plotGeneMonster}
 5 | \title{plotGeneMonster}
 6 | \usage{
 7 | plotGeneMonster(readsOut, geneName, libraryType = "opposite",
 8 |   center = "mean", GTF, ZoomIn = NULL)
 9 | }
10 | \arguments{
11 | \item{readsOut}{The data list from countReads and other analysis.}
12 | 
13 | \item{geneName}{The gene symbol to be ploted.}
14 | 
15 | \item{GTF}{The GRanges object containing gtf annotation.}
16 | 
17 | \item{ZoomIn}{c(start,end) The coordinate to zoom in at the gene to be ploted.}
18 | }
19 | \description{
20 | plotGeneMonster
21 | }
22 | 


--------------------------------------------------------------------------------
/man/plotGenePair.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/PlotGene.R
 3 | \name{plotGenePair}
 4 | \alias{plotGenePair}
 5 | \title{plotGenePair}
 6 | \usage{
 7 | plotGenePair(Ctl_IP_BAM, Ctl_INPUT_BAM, Treat_IP_BAM, Treat_INPUT_BAM,
 8 |   Ctl_size.IP, Ctl_size.INPUT, Treat_size.IP, Treat_size.INPUT, geneName,
 9 |   geneModel, libraryType = "ooposite", center = mean, GTF,
10 |   ZoomIn = NULL)
11 | }
12 | \arguments{
13 | \item{Ctl_IP_BAM}{The bam files for Control IP samples}
14 | 
15 | \item{Ctl_INPUT_BAM}{The bam files for Control INPUT samples}
16 | 
17 | \item{Treat_IP_BAM}{The bam files for treated IP samples}
18 | 
19 | \item{Treat_INPUT_BAM}{The bam files for treated INPUT samples}
20 | 
21 | \item{Ctl_size.IP}{The size factor for IP libraries}
22 | 
23 | \item{Ctl_size.INPUT}{The size factor for INPUT libraries}
24 | 
25 | \item{Treat_size.IP}{The size factor for IP libraries}
26 | 
27 | \item{Treat_size.INPUT}{The size factor for INPUT libraries}
28 | 
29 | \item{geneName}{The name (as defined in gtf file) of the gene you want to plot}
30 | 
31 | \item{geneModel}{The gene model generated by gtfToGeneModel() function}
32 | 
33 | \item{libraryType}{"opposite" for mRNA stranded library, "same" for samll RNA library}
34 | }
35 | \description{
36 | plot tow groups of samples in the same figure
37 | }
38 | 


--------------------------------------------------------------------------------
/man/plotMetaGene.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plotMetaGene.R
 3 | \name{plotMetaGene}
 4 | \alias{plotMetaGene}
 5 | \title{plotMetaGene}
 6 | \usage{
 7 | plotMetaGene(peak, gtf)
 8 | }
 9 | \arguments{
10 | \item{peak}{the data frame of peak in bed12 format.}
11 | 
12 | \item{gtf}{The annotation file.}
13 | }
14 | \description{
15 | plotMetaGene
16 | }
17 | 


--------------------------------------------------------------------------------
/man/plotMetaGeneMulti.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plotMetaGene.R
 3 | \name{plotMetaGeneMulti}
 4 | \alias{plotMetaGeneMulti}
 5 | \title{plotMEtaGeneMulti A wrapper function for Guitar to plot meta gene plot of multiple samples overlaid on each other.}
 6 | \usage{
 7 | plotMetaGeneMulti(peakList, gtf, saveToPDFprefix = NA,
 8 |   includeNeighborDNA = FALSE)
 9 | }
10 | \arguments{
11 | \item{peakList}{The list of peak, each object of list should have a data frame of peak in bed12 format.}
12 | 
13 | \item{gtf}{The annotation file for the gene model.}
14 | 
15 | \item{saveToPDFprefix}{Set a name to save the plot to PDF.}
16 | 
17 | \item{includeNeighborDNA}{Whether to include upstrean and downstream region in the meta gene.}
18 | }
19 | \description{
20 | plotMEtaGeneMulti A wrapper function for Guitar to plot meta gene plot of multiple samples overlaid on each other.
21 | }
22 | 


--------------------------------------------------------------------------------
/man/plotPCAfromMatrix.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plotPCA.DESeq2.R
 3 | \name{plotPCAfromMatrix}
 4 | \alias{plotPCAfromMatrix}
 5 | \title{plotPCAfromMatrix}
 6 | \usage{
 7 | plotPCAfromMatrix(m, group, standardize = TRUE, loglink = TRUE)
 8 | }
 9 | \arguments{
10 | \item{m}{The matrix of count data}
11 | 
12 | \item{group}{The factor levels to color the samples. Should be the save number as the # of matrix columns}
13 | 
14 | \item{standardize}{Logic parameter indicating whether to standardize the count data to have unit variance. The default is TRUE.}
15 | 
16 | \item{loglink}{Logic parameter determine whether to take log of the metrix data. Default is TRUE. If your input matrix is at log scale, use FALSE.}
17 | }
18 | \description{
19 | plotPCAfromMatrix
20 | }
21 | 


--------------------------------------------------------------------------------
/man/plotTPM.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \name{plotTPM}
 4 | \alias{plotTPM}
 5 | \title{plotTPM}
 6 | \usage{
 7 | plotTPM(TPM, geneName, group, logCount = FALSE, facet_grid = FALSE)
 8 | }
 9 | \arguments{
10 | \item{TPM}{Dataframe of gene TPM}
11 | 
12 | \item{geneName}{The name of genes to be ploted.}
13 | 
14 | \item{group}{Categorical info for each sample.}
15 | 
16 | \item{logCount}{where to plot count at log scale}
17 | }
18 | \description{
19 | plotTPM
20 | }
21 | 


--------------------------------------------------------------------------------
/man/qqplot.pvalue.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/qqplot.pvalue.R
 3 | \name{qqplot.pvalue}
 4 | \alias{qqplot.pvalue}
 5 | \title{qqplot.pvalue}
 6 | \usage{
 7 | qqplot.pvalue(x, pointSize = 1, legendSize = 4)
 8 | }
 9 | \arguments{
10 | \item{x}{can be a vector (p value of one group) or a list of vector (p value of multiple groups).}
11 | 
12 | \item{pointSize}{The size of data points}
13 | 
14 | \item{legendSize}{The size of points in the legend}
15 | }
16 | \description{
17 | qqplot.pvalue
18 | }
19 | 


--------------------------------------------------------------------------------
/man/reportJointPeak.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \name{reportJointPeak}
 4 | \alias{reportJointPeak}
 5 | \title{reportJointPeak}
 6 | \usage{
 7 | reportJointPeak(MeRIPdata, joint_threshold = 2, threads = 1)
 8 | }
 9 | \arguments{
10 | \item{MeRIPdata}{The MeRIP.Peak object containing peak calling result}
11 | 
12 | \item{joint_threshold}{Define the number of sample required to have consistent peak in a locus to call this bin a joint peak.}
13 | 
14 | \item{threads}{The number of threads to use.}
15 | }
16 | \value{
17 | MeRIP.Peak object with jointPeaks data
18 | }
19 | \description{
20 | reportJointPeak
21 | }
22 | 


--------------------------------------------------------------------------------
/man/results.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{results}
 5 | \alias{results}
 6 | \title{export results}
 7 | \usage{
 8 | \S4method{results}{MeRIP.Peak}(object)
 9 | }
10 | \arguments{
11 | \item{object}{The MeRIP.Peak object.}
12 | }
13 | \value{
14 | joint peaks (with test result) in a data.frame.
15 | }
16 | \description{
17 | The extractor for final test result.
18 | }
19 | 


--------------------------------------------------------------------------------
/man/select.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/methods.R
 3 | \docType{methods}
 4 | \name{select}
 5 | \alias{select}
 6 | \title{subset MeRIPdata}
 7 | \usage{
 8 | \S4method{select}{MeRIP}(object, samples)
 9 | 
10 | \S4method{select}{MeRIP.Peak}(object, samples, keepData = TRUE)
11 | }
12 | \arguments{
13 | \item{object}{The MeRIP object}
14 | 
15 | \item{samples}{The samplenames to be subset or the index number of samples to be subset.}
16 | 
17 | \item{object}{The MeRIP.Peak object}
18 | 
19 | \item{samples}{The samplenames to be subset or the index number of samples to be subset.}
20 | }
21 | \value{
22 | an MeRIP object of selected samples.
23 | 
24 | an MeRIP.Peak object of selected samples.
25 | }
26 | \description{
27 | subset dataset by samples.
28 | 
29 | subset dataset by samples.
30 | }
31 | 


--------------------------------------------------------------------------------
/man/swapChr22.QTL_BetaBin.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/QTL_betaBinomial.neg.R
 3 | \name{swapChr22.QTL_BetaBin}
 4 | \alias{swapChr22.QTL_BetaBin}
 5 | \title{QTL_BetaBin}
 6 | \usage{
 7 | swapChr22.QTL_BetaBin(MeRIPdata, vcf_file,
 8 |   BSgenome = BSgenome.Hsapiens.UCSC.hg19, testWindow = 1e+05,
 9 |   Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE,
10 |   AdjIPeffi = TRUE, PCsToInclude = 0, thread = 1)
11 | }
12 | \arguments{
13 | \item{MeRIPdata}{The MeRIP.Peak object}
14 | 
15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!}
16 | 
17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.}
18 | 
19 | \item{testWindow}{Integer. Test SNPs in <testWindow> bp window flanking the peak.}
20 | 
21 | \item{Chromosome}{The chromsome to run QTL test.}
22 | 
23 | \item{Range}{The position range on a chromosome to test.}
24 | 
25 | \item{Covariates}{The matrix for covariates to be included in the test.}
26 | }
27 | \description{
28 | QTL_BetaBin
29 | }
30 | 


--------------------------------------------------------------------------------
/man/swapChr22.QTL_BetaBin.permute.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/QTL_betaBinomial.neg.permute.R
 3 | \name{swapChr22.QTL_BetaBin.permute}
 4 | \alias{swapChr22.QTL_BetaBin.permute}
 5 | \title{swapChr22.QTL_BetaBin.permute}
 6 | \usage{
 7 | swapChr22.QTL_BetaBin.permute(MeRIPdata, vcf_file,
 8 |   BSgenome = BSgenome.Hsapiens.UCSC.hg19, testWindow = 1e+05,
 9 |   Chromosome, Range = NULL, Covariates = NULL, AdjustGC = TRUE,
10 |   AdjIPeffi = TRUE, PCsToInclude = 0, thread = 1, Nround = 1)
11 | }
12 | \arguments{
13 | \item{MeRIPdata}{The MeRIP.Peak object}
14 | 
15 | \item{vcf_file}{The vcf file for genotype. The chromosome position must be sorted!!}
16 | 
17 | \item{BSgenome}{The BSgenome object. This needs to match the genome version of the gtf files.}
18 | 
19 | \item{testWindow}{Integer. Test SNPs in <testWindow> bp window flanking the peak.}
20 | 
21 | \item{Chromosome}{The chromsome to run QTL test.}
22 | 
23 | \item{Range}{The position range on a chromosome to test.}
24 | 
25 | \item{Covariates}{The matrix for covariates to be included in the test.}
26 | }
27 | \description{
28 | swapChr22.QTL_BetaBin.permute
29 | }
30 | 


--------------------------------------------------------------------------------
/src/.PoissonGamma.cpp.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottzijiezhang/MeRIPtools/626569af9b0b5ee48f8e0d165d229cc906ce9851/src/.PoissonGamma.cpp.swp


--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 | 


--------------------------------------------------------------------------------
/src/Makevars:
--------------------------------------------------------------------------------
1 | ## Use the R_HOME indirection to support installations of multiple R version
2 | PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"` $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
3 | 


--------------------------------------------------------------------------------