├── DE_TCGA_2_updated_github.R ├── MatchMixeR ├── DESCRIPTION ├── MatchMixeR.Rproj ├── NAMESPACE ├── R │ ├── MatchMixeR.R │ ├── ckmeans.R │ ├── dwd.R │ ├── eb.R │ ├── functions.R │ ├── gq.R │ └── xpn.R ├── Results of devtools check.txt ├── data │ └── gpl570_gpl96.rda ├── man │ ├── CONOR.Rd │ ├── MM.Rd │ ├── flmer.Rd │ └── gpl570_gpl96.Rd ├── src │ ├── MatchMixeR.dll │ ├── XPN_MLE.c │ ├── XPN_MLE.o │ ├── ckmeans_c.c │ └── ckmeans_c.o └── vignettes │ ├── overview.Rmd │ └── overview.html ├── README.md └── Simulation_updated_github.R /DE_TCGA_2_updated_github.R: -------------------------------------------------------------------------------- 1 | 2 | ### This code was written by Dr. Serin Zhang, as a part of her Statistics PhD 3 | ### dissertation project at Florida State University 4 | 5 | library("genefilter") 6 | library("MatchMixeR") 7 | 8 | # setwd("C:/Users/Disa Yu/Dropbox/Cross-platform-normalization") 9 | # getwd() 10 | 11 | load("./Real_data/matched_TCGA_mat.Rdata") 12 | load("./Real_data/NormalvsTumor_mat.Rdata") 13 | 14 | ## na in array data -> 0 15 | matched_array_mat[is.na(matched_array_mat)] <- 0 16 | 17 | ## choose 100 samples(out of 523 matched samples) as training set 18 | train.ID <- sample(1:523,100,replace=FALSE) 19 | X <- matched_array_mat[,train.ID] ; Y <- matched_seq_mat[,train.ID] 20 | train.sample <- colnames(X) 21 | OLSmod <- MatchMixeR:::OLS(X, Y) 22 | FLMERmod <-MM(X, Y) 23 | 24 | ### DE analysis 25 | ## Get DEGs from Yuhang's list (SigGeneList_1.xls) 26 | SigGeneList <- read.csv("./Real_data/SigGeneList_nocomments.csv") 27 | 28 | SigGene <- SigGeneList[,c(1,3,7)] 29 | # using cut off p-value < 0.001 ---> # of DEGs = 9555 30 | SigGene_padj <- SigGene[SigGene$padj< .001,] 31 | colnames(SigGene_padj)[1] <- "SigGeneList_padj" 32 | SigG1 <- as.character(SigGene_padj$SigGeneList_padj) 33 | glist_1 <- strsplit(SigG1, "|", fixed=TRUE) 34 | SigG1_ID <-t(data.frame(glist_1)) 35 | SigG1_ID <- data.frame(SigG1_ID) 36 | colnames(SigG1_ID)[1] <- "geneID" 37 | common.gene <- data.frame(row.names(matched_array_mat)); colnames(common.gene) <-"geneID" 38 | DEG1 <- merge(cbind(common.gene,c(1:16146)),SigG1_ID,by="geneID") 39 | DEG_pa <- as.character(DEG1$geneID) 40 | 41 | ## remove train samples from the test sample pool 42 | normal_array_1 <- normal_array[ ,setdiff(colnames(normal_array), train.sample)] 43 | tumor_array_1 <- tumor_array[,setdiff (colnames(tumor_array), train.sample)] 44 | normal_seq_1 <- normal_seq[ ,setdiff(colnames(normal_seq), train.sample)] 45 | tumor_seq_1 <- tumor_seq[,setdiff (colnames(tumor_seq), train.sample)] 46 | s1 <- length(colnames(normal_array_1)); s2 <-length(colnames(tumor_array_1)) 47 | s3 <- length(colnames(normal_seq_1)); s4 <-length(colnames(tumor_seq_1)) 48 | 49 | TP <- matrix(0,30,8); FP <- matrix(0,30,8); com <- matrix(0,30,4) 50 | colnames(TP) <- c("raw","OLS","MM","DWD","XPN","EB","GQ","Fisher") 51 | colnames(FP) <- c("raw","OLS","MM","DWD","XPN","EB","GQ","Fisher") 52 | colnames(com) <- c("array.P", "seq.P", "com.P", "com.TP") 53 | 54 | for (i in 1:5) 55 | { print(i) 56 | n1 <- 20; n2 <- 0 # GroupA&PlatformX ; GroupB&PlatformX 57 | n3 <- 0; n4 <- 20 # GroupA&PlatformY ; GroupB&PlatformY 58 | normal_arrayID <- sample(1:s1,n1,replace=FALSE) 59 | tumor_arrayID <- sample(1:s2,n2,replace=FALSE) 60 | normal_seqID <- sample(1:s3,n3,replace=FALSE) 61 | tumor_seqID <- sample(1:s4,n4,replace=FALSE) 62 | test.array <- cbind(normal_array_1[,normal_arrayID],tumor_array_1[,tumor_arrayID]) 63 | test.array[is.na(test.array)] = 0 64 | test.seq <- cbind(normal_seq_1[,normal_seqID],tumor_seq_1[,tumor_seqID]) 65 | 66 | XtransOLS <- test.array*OLSmod$betamat[, "Slope"] + OLSmod$betamat[, "Intercept"] 67 | XtransFLMER <- test.array*FLMERmod$betamat[, "Slope"] + FLMERmod$betamat[, "Intercept"] 68 | test.array<- data.frame(test.array);test.seq<- data.frame(test.seq) 69 | DWDmod <- dwd(test.array, test.seq) 70 | EBmod <- eb(test.array, test.seq) 71 | GQmod <- gq(test.array,test.seq) # Check: is gq from genefilter package? 72 | XPNmod <- xpn(test.array, test.seq) 73 | 74 | ## check stat. power and type I error 75 | DataArray <- as.matrix(test.array) 76 | DataSeq <- as.matrix(test.seq) 77 | DataRaw <- as.matrix(cbind(test.array, test.seq)) 78 | DataOLS <- as.matrix(cbind(XtransOLS, test.seq)) 79 | DataFLMER <- as.matrix(cbind(XtransFLMER, test.seq)) 80 | DataDWD <- as.matrix(cbind(DWDmod$x,DWDmod$y)) 81 | DataXPN <- as.matrix(cbind(XPNmod$x,XPNmod$y)) 82 | DataEB <- as.matrix(cbind(EBmod$x,EBmod$y)) 83 | DataGQ <- as.matrix(cbind(GQmod$x,GQmod$y)) 84 | fac_1 <- as.factor(c(rep(0, n1), rep(1, n2),rep(0, n3), rep(1, n4))) 85 | fac_array <- as.factor(c(rep(0, n1), rep(1, n2))) 86 | fac_seq <- as.factor(c(rep(0, n3), rep(1, n4))) 87 | rr.raw <- rowttests(DataRaw, fac_1) 88 | rr.OLS <- rowttests(DataOLS, fac_1) 89 | rr.FLMER <- rowttests(DataFLMER, fac_1) 90 | rr.DWD <- rowttests(DataDWD, fac_1) 91 | rr.XPN <- rowttests(DataXPN, fac_1) 92 | rr.EB <- rowttests(DataEB, fac_1) 93 | rr.GQ <- rowttests(DataGQ, fac_1) 94 | rr.array <- rowttests(DataArray, fac_array) 95 | rr.seq <- rowttests(DataSeq, fac_seq) 96 | 97 | p.array <- rr.array[rr.array$p.value < 0.05,] 98 | TP.array <- as.numeric(dim(p.array[row.names(p.array) %in% DEG_pa,])[1]) 99 | FP.array <- as.numeric(dim(p.array)[1]) - TP.array 100 | 101 | p.seq <- rr.seq[rr.seq$p.value < 0.05,] 102 | TP.seq <- as.numeric(dim(p.seq[row.names(p.seq) %in% DEG_pa,])[1]) 103 | FP.seq <- as.numeric(dim(p.seq)[1]) - TP.seq 104 | 105 | # common TP bet.array&seq results 106 | tp.array <- p.array[row.names(p.array) %in% DEG_pa,] 107 | tp.seq <- p.seq[row.names(p.seq) %in% DEG_pa,] 108 | com.p <- as.numeric(dim(p.array[row.names(p.array) %in% row.names(p.seq),])[1]) 109 | com.tp <- as.numeric(dim(tp.array[row.names(tp.array) %in% row.names(tp.seq),])[1]) 110 | array.p <- as.numeric(dim(p.array)[1]) 111 | seq.p <- as.numeric(dim(p.seq)[1]) 112 | 113 | sumlogP <- -2*(log(rr.array$p.value) + log(rr.seq$p.value)) #sum of log 114 | combineP <- 1-pchisq(sumlogP,4) 115 | rr.combine <- cbind(rr.array,combineP) 116 | p.fisher <- rr.combine[rr.combine$combineP < 0.05,] 117 | TP.fisher <- as.numeric(dim(p.fisher[row.names(p.fisher) %in% DEG_pa,])[1]) 118 | FP.fisher<- as.numeric(dim(p.fisher)[1]) - TP.fisher 119 | 120 | 121 | p.raw <- rr.raw[rr.raw$p.value < 0.05,] 122 | TP.raw <- as.numeric(dim(p.raw[row.names(p.raw) %in% DEG_pa,])[1]) 123 | FP.raw <- as.numeric(dim(p.raw)[1]) - TP.raw 124 | 125 | 126 | p.OLS <- rr.OLS[rr.OLS$p.value < 0.05,] 127 | TP.OLS <- as.numeric(dim(p.OLS[row.names(p.OLS) %in% DEG_pa,])[1]) 128 | FP.OLS <- as.numeric(dim(p.OLS)[1]) - TP.OLS 129 | 130 | 131 | p.FLMER <- rr.FLMER[rr.FLMER$p.value < 0.05,] 132 | TP.FLMER <- as.numeric(dim(p.FLMER[row.names(p.FLMER) %in% DEG_pa,])[1]) 133 | FP.FLMER <- as.numeric(dim(p.FLMER)[1]) - TP.FLMER 134 | 135 | p.DWD <- rr.DWD[rr.DWD$p.value < 0.05,] 136 | TP.DWD <- as.numeric(dim(p.DWD[row.names(p.DWD) %in% DEG_pa,])[1]) 137 | FP.DWD <- as.numeric(dim(p.DWD)[1]) - TP.DWD 138 | 139 | p.XPN <- rr.XPN[rr.XPN$p.value < 0.05,] 140 | TP.XPN <- as.numeric(dim(p.XPN[row.names(p.XPN) %in% DEG_pa,])[1]) 141 | FP.XPN <- as.numeric(dim(p.XPN)[1]) - TP.XPN 142 | 143 | p.EB <- rr.EB[rr.EB$p.value < 0.05,] 144 | TP.EB <- as.numeric(dim(p.EB[row.names(p.EB) %in% DEG_pa,])[1]) 145 | FP.EB <- as.numeric(dim(p.EB)[1]) - TP.EB 146 | 147 | p.GQ <- rr.GQ[rr.GQ$p.value < 0.05,] 148 | TP.GQ <- as.numeric(dim(p.GQ[row.names(p.GQ) %in% DEG_pa,])[1]) 149 | FP.GQ <- as.numeric(dim(p.GQ)[1]) - TP.GQ 150 | 151 | TP[i,1]<-TP.raw; FP[i,1]<-FP.raw 152 | TP[i,2]<-TP.OLS; FP[i,2]<-FP.OLS 153 | TP[i,3]<-TP.FLMER; FP[i,3]<-FP.FLMER 154 | TP[i,4]<-TP.DWD; FP[i,4]<-FP.DWD 155 | TP[i,5]<-TP.XPN; FP[i,5]<-FP.XPN 156 | TP[i,6]<-TP.EB; FP[i,6]<-FP.EB 157 | TP[i,7]<-TP.GQ; FP[i,7]<-FP.GQ 158 | TP[i,8]<-TP.fisher; FP[i,8]<-FP.fisher 159 | 160 | com [i,1]<- array.p;com [i,2]<-seq.p; com [i,3]<-com.p; com [i,4]<-com.tp 161 | } 162 | 163 | ex6 <- cbind(TP,FP,com) #repeat from line 62 with different n1-n4 numbers: ex1(10,10,10,10), ex2(15,15,5,5), ex3(15,5,5,15), ex4(20,0,5,15), ex5(25,5,0,10), ex6(20,0,0,20) 164 | class(ex6) # "matrix" 165 | dim(ex6) # 30 20 166 | 167 | tp <- mean(ex6[,2:8]) 168 | fp <- mean(ex6[,10:16]) 169 | P <- tp/(tp + fp) 170 | R <- tp/9555 171 | F1 <- 2*(P*R)/(P+R) 172 | 173 | # Getting tp, fp, and F1 values for MM in ex6 174 | tp_MM = ex6[,3] 175 | fp_MM = ex6[,11] 176 | 177 | # Getting tp, fp, and F1 values for DWD in ex6 178 | tp_DWD = ex6[,4] 179 | fp_DWD = ex6[,12] 180 | 181 | ratio6 = "ex6(20,0,0,20)" 182 | # cat("iteration = ", iter <- iter + 1, "\n") 183 | 184 | cat("The ratio is ", ratio6, "\n", 185 | "The value of tp is ", tp, "\n", 186 | "The value of fp is ", fp, "\n", 187 | "The value of F1 is ", F1) 188 | -------------------------------------------------------------------------------- /MatchMixeR/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: MatchMixeR 2 | Type: Package 3 | Title: MatchMixeR: a cross-platform normalization tool 4 | Version: 0.1.1 5 | Author: Serin Zhang, Xing Qiu, Jinfeng Zhang 6 | Maintainer: Serin Zhang 7 | Description: A matched sample based mixed effect regression model for cross-platform normalization 8 | License: GPL (>= 2) 9 | Encoding: UTF-8 10 | VignetteBuilder: knitr 11 | Suggests: knitr, rmarkdown, limma 12 | LazyData: true 13 | RoxygenNote: 6.1.1 14 | -------------------------------------------------------------------------------- /MatchMixeR/MatchMixeR.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /MatchMixeR/NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(MM) 4 | export(dwd) 5 | export(eb) 6 | export(flmer) 7 | export(gq) 8 | export(xpn) 9 | useDynLib(MatchMixeR,ckmeans_c) 10 | -------------------------------------------------------------------------------- /MatchMixeR/R/MatchMixeR.R: -------------------------------------------------------------------------------- 1 | # Per-gene OLS model 2 | #'@param X gene expression matrix 3 | #'@param Y gene expression matrix 4 | OLS <- function(X, Y){ 5 | ## A fast row-wise univariate OLS regression based on linear algebra 6 | Xbar <- rowMeans(X); Ybar <- rowMeans(Y, na.rm = TRUE) 7 | X.c <- sweep(X, 1, Xbar); Y.c <- sweep(Y, 1, Ybar) 8 | CovXY <- rowSums(X.c * Y.c, na.rm = TRUE) 9 | VarX <- rowSums(X.c^2, na.rm = TRUE); VarY <- rowSums(Y.c^2, na.rm = TRUE) 10 | ## regression coefs. 11 | beta1 <- CovXY / VarX; beta0 <- Ybar - beta1 * Xbar 12 | betamat <- cbind("Intercept"=beta0, "Slope"=beta1) 13 | ## Pearson corr. coefs. 14 | cc <- CovXY / sqrt(VarX * VarY) 15 | ## predictions 16 | Yhat <- sweep(X, 1, beta1, FUN="*") + beta0 %o% rep(1, ncol(X)) 17 | ## RSS vector 18 | RSS <- rowSums((Y - Yhat)^2) 19 | SS1 <- rowSums(Yhat^2) 20 | SST <- rowSums(Y^2) 21 | return(list(betamat=betamat, corr=cc, covxy=CovXY, varx=VarX, Yhat=Yhat, RSS=RSS, var.explained.prop=SS1/SST, Xbar=Xbar, Ybar=Ybar)) 22 | } 23 | 24 | ## The main cross-platform normalization procedure: FLMER is based on 25 | ## the proposed moment-based method 26 | #'@param X gene expression matrix 27 | #'@param Y gene expression matrix 28 | #'@export 29 | flmer <- function(Xmat, Ymat){ 30 | Xmat <- as.matrix(Xmat); Ymat <- as.matrix(Ymat) 31 | ## 11/12/2018. Use new notations; p==ngenes; n==sample size 32 | p <- nrow(Xmat); n <- ncol(Xmat); N <- p*n 33 | ## calculate some useful items 34 | Xibar <- rowMeans(Xmat); Xbar <- mean(Xibar) 35 | Xmat.c <- Xmat - Xibar 36 | Yibar <- rowMeans(Ymat); Ybar <- mean(Yibar) 37 | Ymat.c <- Ymat - Yibar 38 | covXY <- rowSums(Xmat.c * Ymat.c) 39 | covXX <- rowSums(Xmat.c * Xmat.c) 40 | beta.yixi <- covXY / covXX 41 | ## The overall regression 42 | Xc <- Xmat - mean(Xmat); Yc <- Ymat - mean(Ymat) 43 | beta.yx <- sum(Xc*Yc) / sum(Xc^2) 44 | ## In this case, we assume that there is no collinearity in Z 45 | qprime <- 2*p 46 | var.epsilon <- sum((Ymat.c - Xmat.c * beta.yixi)^2) / (N - qprime) 47 | ## Preparing for calculating S, \| \mathbf{R}_{Z|\mathbf{X}} \|^2, 48 | ## and \|Z' \mathbf{R}_{Z|\mathbf{X}} \|^2 terms. 49 | XiYibar <- covXY/n + Xibar*Yibar 50 | ## Xi2bar = \|Xi\|^2 / n 51 | Xi2bar <- covXX/n + Xibar^2 52 | ## Xs is Xmat standardized by the global mean/sd 53 | Xs <- (Xmat - Xbar) / sqrt(sum((Xmat-Xbar)^2)) 54 | Xsbar <- rowMeans(Xs) 55 | ## XsX is the inner producd between Xs and X. It is denoted as 56 | ## \varsigma in the notes. 57 | XsX <- rowSums(Xs*Xmat) 58 | XsNorm2 <- sum(Xs^2) 59 | ## the normalized version of S 60 | S <- n^2 * (sum((Yibar - Ybar -(Xibar - Xbar)*beta.yx)^2) + sum((XiYibar -Ybar*Xibar -(Xi2bar - Xbar*Xibar)*beta.yx)^2)) / var.epsilon 61 | ## Calculate \| \mathbf{R}_{Z|\mathbf{X}} \|^2 62 | Projxz.norm2 <- n + XsNorm2 * ( n^2*sum(Xsbar^2) + sum(XsX^2)) + n*sum(Xibar^2)/p 63 | Rzx.norm2 <- sum(Xmat^2) + N -Projxz.norm2 64 | ## General terms; without n^4 yet. 65 | S1 <- sum(Xibar^2); S2 <- sum(Xsbar^2); S3 <- sum(XsX^2)/n^2 66 | ZPZ.norm2 <- p^2/N^2 + S2^2 + 2*( p*S1/N^2 + S2*S3) + S1^2/N^2 + 2*(sum(Xibar*XsX)^2)/(N*n^2) + S3^2 67 | ## subtractions; to multiply by *n^4 68 | Minus.terms <- sum((1/N + Xsbar^2)^2) + 2*sum((Xibar/N + Xsbar*XsX/n)^2) + sum((Xibar^2/N + XsX^2/n^2)^2) 69 | ## added terms 70 | XiNorm2 <- rowSums(Xmat^2) 71 | Add.terms <- sum((1/n -1/N -Xsbar^2)^2) + 2*sum((Xibar/n -Xibar/N -Xsbar*XsX/n)^2) + sum((XiNorm2/n^2 -Xibar^2/N -XsX^2/n^2)^2) 72 | ZRzx.norm2 <- n^4*(ZPZ.norm2 - Minus.terms + Add.terms) 73 | ## Estimate lambda based on Moment matching 74 | lambdahat <- max(0, (S - Rzx.norm2) / ZRzx.norm2) 75 | ## Inference of the fixed effects. First, compute some small 76 | ## "building blocks". 77 | Axx <- XiNorm2 - (lambdahat/(1+n*lambdahat)) * n^2 * Xibar^2 78 | A1x <- n*Xibar/(1+n*lambdahat) 79 | A1y <- n*Yibar/(1+n*lambdahat) 80 | Axy <- XiYibar*n - (lambdahat/(1+n*lambdahat)) * n^2 * Xibar* Yibar 81 | W11 <- n/(1+n*lambdahat) - lambdahat*A1x^2/(1+lambdahat*Axx) 82 | W1x <- A1x/(1+lambdahat*Axx) 83 | W1y <- A1y -lambdahat*A1x*Axy / (1+lambdahat*Axx) 84 | Wxx <- Axx/(1+lambdahat*Axx) 85 | Wxy <- Axy / (1+lambdahat*Axx) 86 | ## Now the actual estimation 87 | covBeta <- var.epsilon * solve(matrix(c(sum(W11), sum(W1x), sum(W1x), sum(Wxx)), 2)) 88 | betahat <- drop(covBeta %*% c(sum(W1y), sum(Wxy)) / var.epsilon) 89 | names(betahat) <- c("Intercept", "Slope") 90 | dimnames(covBeta) <- list(c("Intercept", "Slope"), c("Intercept", "Slope")) 91 | ## Mixed effects terms via EBLUP 92 | gamma0hat <- lambdahat * (W1y - betahat[1]*W11 - betahat[2]*W1x) 93 | gamma1hat <- lambdahat * (Wxy - betahat[1]*W1x - betahat[2]*Wxx) 94 | ## Individual betas 95 | betamat <- t(rbind(gamma0hat, gamma1hat) + betahat) 96 | colnames(betamat) <- c("Intercept", "Slope") 97 | ## t-statistics for the fixed effects 98 | t.fixed <- betahat/sqrt(diag(covBeta)) 99 | ## the predicted Yhat 100 | Yhat <- Xmat * betamat[, "Slope"] + betamat[, "Intercept"] 101 | return(list(betahat=betahat, betamat=betamat, Yhat=Yhat, 102 | lambdahat=lambdahat, var.epsilon=var.epsilon, 103 | covBeta=covBeta, t.fixed=t.fixed)) 104 | } 105 | 106 | 107 | 108 | ## Implement the covariance transformed FLMER as follows. 109 | #'Match-MixeR 110 | #' 111 | #'fit mixed effect regression model 112 | #'@param X gene expression matrix 113 | #'@param Y gene expression matrix 114 | #'@export 115 | MM <- function(Xmat, Ymat){ 116 | rr1 <- OLS(Xmat, Ymat) 117 | cov1 <- cov(rr1$betamat) 118 | s0 <- sqrt(cov1[1,1]); s1 <- sqrt(cov1[2,2]); rho <- cov1[1,2]/s0/s1 119 | a12 <- rho/sqrt(1-rho^2); a22 <- s1/(s0 * sqrt(1-rho^2)) 120 | A <- matrix(c(1, a12, 121 | 0, a22), 2, byrow=TRUE) 122 | ## the X transformation 123 | Xtilde <- a12 + a22*Xmat 124 | ## apply flmer() to the covariance transformed data 125 | rr3 <- flmer(Xtilde, Ymat) 126 | ## the reverse transformation. Note that each *row* of "betamat" is 127 | ## beta_i; so we have to transpose the reverse matrix multiplication. 128 | betamat <- rr3$betamat %*% t(A); colnames(betamat) <- colnames(rr3$betamat) 129 | ## other misc. items 130 | betahat <- drop(A %*% rr3$betahat); names(betahat) <- names(rr3$betahat) 131 | lambdahat <- rr3$lambdahat 132 | covGamma <- rr3$lambdahat * rr3$var.epsilon * (A %*% t(A)) 133 | Yhat <- rr3$Yhat 134 | var.epsilon <- rr3$var.epsilon 135 | covBeta <- A %*% rr3$covBeta %*% t(A) 136 | t.fixed <- betahat/sqrt(diag(covBeta)) 137 | return(list(betahat=betahat, betamat=betamat, Yhat=Yhat, 138 | lambdahat=lambdahat, var.epsilon=var.epsilon, 139 | covGamma <- covGamma, 140 | covBeta=covBeta, t.fixed=t.fixed)) 141 | } 142 | 143 | 144 | -------------------------------------------------------------------------------- /MatchMixeR/R/ckmeans.R: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010 3 | 4 | #This program is free software: you can redistribute it and/or modify 5 | #it under the terms of the GNU General Public License as published by 6 | #the Free Software Foundation, either version 3 of the License, or 7 | #(at your option) any later version. 8 | 9 | #This program is distributed in the hope that it will be useful, 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | #GNU General Public License for more details. 13 | 14 | #You should have received a copy of the GNU General Public License 15 | #along with this program. If not, see . 16 | ###################################################################### 17 | 18 | ckmeans = function (x, centers, iter.max = 10, nstart = 1, distance = "pearson") 19 | { 20 | 21 | 22 | if (distance == "pearson"){ 23 | distance = 1 24 | }else if(distance == "euclidean"){ 25 | distance = 2 26 | }else if(distance == "manhattan"){ 27 | distance = 3 28 | }else if(distance == "absolutepearson"){ 29 | distance = 4 30 | } 31 | nmeth = "lloyd" 32 | 33 | #Normalization consistent with flexclust 34 | x = x/sqrt(rowSums(x^2)) 35 | 36 | 37 | do_one <- function(nmeth) { 38 | #' @useDynLib MatchMixeR ckmeans_c 39 | Z <- .C(ckmeans_c, as.double(x), as.integer(m), 40 | as.integer(ncol(x)), centers = as.double(centers), 41 | as.integer(k), c1 = integer(m), iter = as.integer(iter.max), 42 | nc = integer(k), wss = double(k), as.integer(distance)) 43 | if (Z$iter > iter.max) { 44 | warning("did not converge in ", 45 | iter.max, " iterations", call. = FALSE) 46 | Z$converged = FALSE 47 | }else{ 48 | Z$converged = TRUE 49 | } 50 | if (any(Z$nc == 0)) { 51 | warning("empty cluster: try a better set of initial centers", 52 | call. = FALSE) 53 | Z$empty <- TRUE 54 | }else{ 55 | Z$empty <- FALSE 56 | } 57 | Z 58 | } 59 | x <- as.matrix(x) 60 | m <- nrow(x) 61 | if (missing(centers)) 62 | stop("'centers' must be a number or a matrix") 63 | 64 | if (length(centers) == 1L) { 65 | k <- centers 66 | if (nstart == 1) 67 | centers <- x[sample.int(m, k), , drop = FALSE] 68 | if (nstart >= 2 || any(duplicated(centers))) { 69 | cn <- unique(x) 70 | mm <- nrow(cn) 71 | if (mm < k) 72 | stop("more cluster centers than distinct data points.") 73 | centers <- cn[sample.int(mm, k), , drop = FALSE] 74 | } 75 | } 76 | else { 77 | centers <- as.matrix(centers) 78 | if (any(duplicated(centers))) 79 | stop("initial centers are not distinct") 80 | cn <- NULL 81 | k <- nrow(centers) 82 | if (m < k) 83 | stop("more cluster centers than data points") 84 | } 85 | if (iter.max < 1) 86 | stop("'iter.max' must be positive") 87 | if (ncol(x) != ncol(centers)) 88 | stop("must have same number of columns in 'x' and 'centers'") 89 | Z <- do_one(nmeth) 90 | if (nstart >= 2 && !is.null(cn)) { 91 | best <- sum(Z$wss) 92 | for (i in 2:nstart) { 93 | centers <- cn[sample.int(mm, k), , drop = FALSE] 94 | ZZ <- do_one(nmeth) 95 | if ((z <- sum(ZZ$wss)) < best) { 96 | Z <- ZZ 97 | best <- z 98 | } 99 | } 100 | } 101 | centers <- matrix(Z$centers, k) 102 | dimnames(centers) <- list(1L:k, dimnames(x)[[2L]]) 103 | cluster <- Z$c1 104 | if (!is.null(rn <- rownames(x))) 105 | names(cluster) <- rn 106 | out <- list(cluster = cluster, centers = centers, withinss = Z$wss, 107 | size = Z$nc, empty = Z$empty, converged = Z$converged) 108 | class(out) <- "kmeans" 109 | out 110 | } 111 | -------------------------------------------------------------------------------- /MatchMixeR/R/dwd.R: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010 3 | 4 | #This program is free software: you can redistribute it and/or modify 5 | #it under the terms of the GNU General Public License as published by 6 | #the Free Software Foundation, either version 3 of the License, or 7 | #(at your option) any later version. 8 | 9 | #This program is distributed in the hope that it will be useful, 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | #GNU General Public License for more details. 13 | 14 | #You should have received a copy of the GNU General Public License 15 | #along with this program. If not, see . 16 | ###################################################################### 17 | 18 | sepelimdwd = function(Xp,Xn,penalty, useSparse=TRUE){ 19 | #Xp and Xn are matrices. penalty is a scalar. This is an adaptation of the Matlab function of the same name, written by J. S. Marron, available at https://genome.unc.edu/pubsup/dwd/. 20 | 21 | flag <- 0 22 | #Dimensions of the data 23 | dp <- dim(Xp)[1] 24 | np <- dim(Xp)[2] 25 | dn <- dim(Xn)[1] 26 | nn <- dim(Xn)[2] 27 | if (dn != dp) {stop('The dimensions are incomapatible.')} 28 | d <- dp 29 | 30 | #Dimension reduction in HDLSS setting. 31 | XpnY <- as.matrix(cbind(Xp,-1*Xn)) 32 | XpnY11 <- XpnY[1,1] 33 | n <- np + nn 34 | if(d>n){ 35 | qrfact = qr(XpnY) 36 | Q = qr.Q(qrfact) 37 | R = qr.R(qrfact) 38 | RpnY = R 39 | dnew = n 40 | }else{ 41 | RpnY = XpnY 42 | dnew = d 43 | } 44 | 45 | y = ones(np + nn,1) 46 | y[(np+1):(np+nn),1] = -1 47 | ym = y[2:n,1] 48 | 49 | # nv is the number of variables (eliminating beta) 50 | # nc is the number of constraints 51 | nv = 1 + dnew + 4*n 52 | nc = 2*n 53 | #Set up the block structure, constraint matrix, rhs, and cost vector 54 | 55 | blk = list() 56 | blk$type = character() 57 | blk$size = list() 58 | blk$type[1] = 's' 59 | blk$size[[1]] = cbind(dnew+1,3*ones(1,n)) 60 | blk$type[2] = 'l' 61 | blk$size[[2]] = n 62 | 63 | 64 | Avec = list() 65 | A = zeros(nc,nv-n) 66 | col1 = RpnY[,1] 67 | A[1:(n-1),2:(dnew+1)] = t(RpnY[,2:n] - col1%*%t(ym)) 68 | A[1:(n-1),seq(dnew+5,dnew+1+3*n,3)] = -1*speye(n-1) 69 | A[1:(n-1),seq(dnew+6,dnew+2+3*n,3)] = speye(n-1) 70 | A[1:(n-1),dnew+2] = ym 71 | A[1:(n-1),dnew+3] = -1*ym 72 | A[n,1] = 1 73 | A[(n+1):(n+n),seq(dnew+4,dnew+3+3*n,3)] = speye(n) 74 | 75 | 76 | Avec[[1]] = t(A) 77 | Avec[[2]] = (rbind(cbind(-1*ym,speye(n-1)),zeros(1+n,n))) 78 | b = rbind(zeros(n-1,1),ones(1+n,1)) 79 | 80 | 81 | C = list() 82 | c = zeros(nv-n,1) 83 | c[seq(dnew+2,dnew+1+3*n,3),1] = ones(n,1) 84 | c[seq(dnew+3,dnew+2+3*n,3),1] = ones(n,1) 85 | 86 | 87 | C[[1]] = c 88 | C[[2]] = penalty*ones(n,1) 89 | 90 | 91 | #Solve the SOCP problem 92 | 93 | ##################CLSOCP############################ 94 | CL_K <- unlist(blk$size) 95 | CL_qlen <-sum(blk$size[[1]]) 96 | CL_llen <-sum(blk$size[[2]]) 97 | CL_type <- c(rep('q',length(blk$size[[1]])),rep('l',length(blk$size[[2]]))) 98 | CL_A <- cbind(t(Avec[[1]]),Avec[[2]]) 99 | CL_c <- rbind(C[[1]],C[[2]]) 100 | 101 | 102 | soln <- CLSOCP::socp(CL_A,b,CL_c,CL_K,CL_type,gamma_fac=.3,sigma0 = .1,use_sparse=useSparse) 103 | 104 | 105 | X1 <- soln$x[1:CL_qlen] 106 | X2 <- soln$x[(CL_qlen+1):(CL_qlen+CL_llen)] 107 | lambda <- soln$y 108 | ##################################################### 109 | 110 | 111 | # Compute the normal vector w and constant term beta. 112 | 113 | barw = X1[2:(dnew+1)] 114 | if (d>n){ 115 | w = Q %*% barw 116 | }else{ 117 | w = barw 118 | } 119 | beta = X1[dnew + 2] - X1[dnew + 3] - X2[1] - t(col1)%*%barw 120 | normw = norm(w) 121 | if (normw < 1 - 1e-3){ 122 | print(normw) 123 | } 124 | normwm1 = 0 125 | if (normw > 1 - 1e-3){ 126 | w = w/normw 127 | normwm1 = norm(w) - 1 128 | beta = beta/normw 129 | } 130 | 131 | 132 | # Compute the minimum of the supposedly positive 133 | # and the maximum of the supposedly negative residuals. 134 | # Refine the primal solution and print its objective value. 135 | 136 | residp = t(Xp) %*% w + beta[1] #optimization 137 | residn = t(Xn) %*% w + beta[1] #optimization 138 | minresidp = min(residp) 139 | maxresidn = max(residn) 140 | res = t(XpnY) %*% w + beta[1] * y 141 | rsc = 1/sqrt(penalty) 142 | xi = -1* res + rsc[1] 143 | xi[xi<0] <- 0 144 | totalviolation = sum(xi) 145 | minresidpmod = min(residp + xi[1:np]) 146 | maxresidnmod = max(residn - xi[(np+1):n]) 147 | minxi = min(xi) 148 | maxxi = max(xi) 149 | resn = res + xi 150 | rresn = 1 / resn 151 | primalobj = penalty * sum(xi) + sum(rresn) 152 | # print(primalobj) 153 | 154 | 155 | #Compute the dual solution alp and print its objective value. 156 | alp = zeros(n,1) 157 | lambda1 = lambda[1:(n-1)] 158 | alp[1] = -1*t(ym)%*%lambda1 159 | alp[2:n] = lambda1 160 | alp = alp * (as.numeric(alp>0)) 161 | sump = sum(alp[1:np]) 162 | sumn = sum(alp[(np+1):n]) 163 | sum2 = (sump + sumn)/2 164 | alp[1:np] = (sum2/sump)*alp[1:np] 165 | alp[(np+1):n] = (sum2/sumn)*alp[(np+1):n] 166 | maxalp = max(alp) 167 | if (maxalp > penalty | maxxi > 1e-3){ 168 | alp = (penalty[1]/maxalp)*alp 169 | } 170 | minalp = min(alp) 171 | p = RpnY%*%alp 172 | eta = -1*norm(p) 173 | gamma = 2*sqrt(alp) 174 | dualobj = eta + sum(gamma) 175 | # print(dualobj) 176 | 177 | 178 | 179 | #dualgap is a measure of the accuracy of the solution 180 | dualgap = primalobj - dualobj 181 | # print(dualgap) 182 | 183 | if (dualgap > 1e-4){ 184 | flag = -1 185 | } 186 | 187 | 188 | return(list(w=w,beta=beta,residp=residp,residn=residn,alp=alp,totalviolation=totalviolation,dualgap=dualgap,flag=flag)) 189 | 190 | } 191 | 192 | 193 | DWD1SM = function(trainp,trainn,threshfact = 100, useSparse = TRUE){ 194 | 195 | np = dim(trainp)[2] 196 | nn = dim(trainn)[2] 197 | # vpwdist2 = numeric(np*nn) 198 | vpwdist2x <- fields::rdist(t(trainp),t(trainn)) 199 | # for (ip in 1:np){ 200 | # vpwdist2[((ip-1)*nn+1):(ip*nn)] <- colSums((trainp[,ip] - trainn)^2) #optimization 201 | # } 202 | # medianpwdist2 = median(vpwdist2) 203 | medianpwdist2 = median(vpwdist2x)^2 204 | 205 | penalty = threshfact / medianpwdist2 206 | sepelimout = sepelimdwd(trainp,trainn,penalty,useSparse=useSparse) 207 | w = sepelimout$w 208 | flag = sepelimout$flag 209 | if (flag == -1){ 210 | cat("Inaccurate solution!\n") 211 | } 212 | if (flag == -2){ 213 | cat("Infeasible or unbounded optimization problem!\n") 214 | } 215 | dirvec = w/norm(w) 216 | return(dirvec) 217 | } 218 | 219 | #'@param X gene expression matrix 220 | #'@param Y gene expression matrix 221 | #'@export 222 | dwd = function(platform1.data, platform2.data, platform1.train = NULL, platform2.train=NULL, p1.names=0, p2.names=0,p1.train.names=0, p2.train.names=0, skip.match=FALSE, use.sparse = TRUE) { 223 | 224 | #Match names 225 | if(is.null(platform1.train) & is.null(platform2.train)){ 226 | input = processplatforms(list(x=platform1.data,y=platform2.data), namesvec = c(p1.names, p2.names), skip.match=skip.match) 227 | 228 | 229 | 230 | #Get normal vector for DWD adjustment 231 | dirvec = DWD1SM(input[[1]],input[[2]],useSparse=use.sparse) 232 | #Project the data 233 | vprojp = t(input[[1]]) %*% dirvec 234 | vprojn = t(input[[2]]) %*% dirvec 235 | meanprojp = mean(vprojp) 236 | meanprojn = mean(vprojn) 237 | output = list() 238 | p1.adjust <- -1 * meanprojp * dirvec 239 | p2.adjust <- -1 * meanprojn * dirvec 240 | names(p1.adjust) <- rownames(input[[1]]) 241 | names(p2.adjust) <- rownames(input[[2]]) 242 | output$x = input[[1]] + p1.adjust 243 | output$y = input[[2]] + p2.adjust 244 | output$p1.adjust <- p1.adjust 245 | output$p2.adjust <- p2.adjust 246 | 247 | }else{ 248 | input = processplatforms(list(x=platform1.data,y=platform2.data,platform1.train,platform2.train), namesvec = c(p1.names, p2.names, p1.train.names, p2.train.names), skip.match=skip.match) 249 | 250 | 251 | #Get normal vector for DWD adjustment 252 | dirvec = DWD1SM(input[[3]],input[[4]],useSparse=use.sparse) 253 | 254 | #Project the data 255 | vprojp = t(input[[3]]) %*% dirvec 256 | vprojn = t(input[[4]]) %*% dirvec 257 | 258 | meanprojp = mean(vprojp) 259 | meanprojn = mean(vprojn) 260 | 261 | output = list() 262 | p1.adjust <- -1 * meanprojp * dirvec 263 | p2.adjust <- -1 * meanprojn * dirvec 264 | names(p1.adjust) <- rownames(input[[1]]) 265 | names(p2.adjust) <- rownames(input[[2]]) 266 | output$x = input[[1]] + p1.adjust 267 | output$y = input[[2]] + p2.adjust 268 | output$p1.adjust <- p1.adjust 269 | output$p2.adjust <- p2.adjust 270 | } 271 | 272 | return(output) 273 | 274 | } 275 | 276 | norm = function(aMatrix){ 277 | #Returns the largest singular value of aMatrix 278 | o = svd(aMatrix,nu=0,nv=0) 279 | return(o$d[1]) 280 | 281 | } 282 | 283 | 284 | 285 | -------------------------------------------------------------------------------- /MatchMixeR/R/eb.R: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010 3 | 4 | #This program is free software: you can redistribute it and/or modify 5 | #it under the terms of the GNU General Public License as published by 6 | #the Free Software Foundation, either version 3 of the License, or 7 | #(at your option) any later version. 8 | 9 | #This program is distributed in the hope that it will be useful, 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | #GNU General Public License for more details. 13 | 14 | #You should have received a copy of the GNU General Public License 15 | #along with this program. If not, see . 16 | ###################################################################### 17 | 18 | ###################################################################### 19 | #This file is adapted from the ComBat program, available from 20 | #http://jlab.byu.edu/ComBat/Abstract.html. Please cite appropriately 21 | #when using these functions. Type citation("CONOR") at the R prompt for 22 | #details. 23 | 24 | #'@param X gene expression matrix 25 | #'@param Y gene expression matrix 26 | #'@export 27 | eb = function(platform1.data, platform2.data, par.prior=TRUE, filter=FALSE, prior.plots=FALSE, p1.names=0, p2.names=0, skip.match=FALSE){ 28 | 29 | #Match names 30 | input = processplatforms(list(x=platform1.data,y=platform2.data),namesvec = c(p1.names, p2.names), skip.match=skip.match) 31 | 32 | #Dimensions 33 | nx = dim(input$x)[2] 34 | ny = dim(input$y)[2] 35 | 36 | #Prepare the data for ComBat 37 | dat = cbind(input$x,input$y) 38 | batch = character(nx+ny) 39 | batch[1:nx] = "X" 40 | batch[(nx + 1):(nx + ny)] = "Y" 41 | saminfo = data.frame(as.character(1:(nx+ny)), as.character(1:(nx+ny)), batch) 42 | colnames(saminfo) = c("Array name", "sample name", "Batch") 43 | orignames = colnames(dat) 44 | colnames(dat) = as.character(1:(nx+ny)) 45 | 46 | #Call ComBat 47 | cb = ComBat(dat, saminfo, par.prior=par.prior, filter=filter, prior.plots=prior.plots) 48 | colnames(cb) = orignames 49 | 50 | #Return results 51 | return(list(x=cb[,1:nx], y=cb[,(nx+1):(nx+ny)])) 52 | } 53 | 54 | #The code from here down is from the original authors, with slight modification, and was obtained at: http://jlab.byu.edu//ComBat/Download_files/ComBat.R 55 | 56 | #This long comment is from the original authors, and describes the original, unmodified version. I have made changes only to the interface, so it can use a data.frame as input instead of a file name. 57 | # 'expression_xls' is the expression index file (e.g. outputted by dChip); 'sample_info_file' is a tab-delimited text file containing the colums: Array name, sample name, Batch, and any other covariates to be included in the modeling; 'type' currently supports two data file types 'txt' for a tab-delimited text file and 'csv' for an Excel .csv file (sometimes R handles the .csv file better, so use this if you have problems with a .txt file!); 'write' if 'T' ComBat writes adjusted data to a file, and if 'F' and ComBat outputs the adjusted data matrix if 'F' (so assign it to an object! i.e. NewData <- ComBat('my expression.xls','Sample info file.txt', write=F)); 'covariates=all' will use all of the columns in your sample info file in the modeling (except array/sample name), if you only want use a some of the columns in your sample info file, specify these columns here as a vector (you must include the Batch column in this list); 'par.prior' if 'T' uses the parametric adjustments, if 'F' uses the nonparametric adjustments--if you are unsure what to use, try the parametric adjustments (they run faster) and check the plots to see if these priors are reasonable; 'filter=value' filters the genes with absent calls in > 1-value of the samples. The defaut here (as well as in dchip) is .8. Filter if you can as the EB adjustments work better after filtering. Filter must be numeric if your expression index file contains presence/absence calls (but you can set it >1 if you don't want to filter any genes) and must be 'F' if your data doesn't have presence/absence calls; 'skip' is the number of columns that contain probe names and gene information, so 'skip=5' implies the first expression values are in column 6; 'prior.plots' if true will give prior plots with black as a kernal estimate of the empirical batch effect density and red as the parametric estimate. 58 | 59 | ComBat <- function(dat, saminfo, type='txt', covariates='all', par.prior=T, filter=F, skip=0, prior.plots=T){ 60 | #debug: expression_xls='exp.txt'; sample_info_file='sam.txt'; type='txt'; write=T; covariates='all'; par.prior=T; filter=F; skip=0; prior.plots=T 61 | 62 | #Two lines commented out by Jason Rudy here: 63 | #cat('Reading Sample Information File\n') 64 | #saminfo <- read.table(sample_info_file, header=T, sep='\t',comment.char='') 65 | 66 | if(sum(colnames(saminfo)=="Batch")!=1){return('ERROR: Sample Information File does not have a Batch column!')} 67 | 68 | ## 01/18/2019 modified by Xing 69 | dat <- as.matrix(dat) 70 | 71 | #Lines commented out by Jason Rudy from here: 72 | #cat('Reading Expression Data File\n') 73 | #if(type=='csv'){ 74 | # dat <- read.csv(expression_xls,header=T,as.is=T) 75 | #print(dat[1:2,]) 76 | # dat <- dat[,trim.dat(dat)] 77 | #print(colnames(dat)) 78 | # colnames(dat)=scan(expression_xls,what='character',nlines=1,sep=',',quiet=T)[1:ncol(dat)] 79 | #print(colnames(dat)) 80 | # }else{ 81 | # dat <- read.table(expression_xls,header=T,comment.char='',fill=T,sep='\t', as.is=T) 82 | # dat <- dat[,trim.dat(dat)] 83 | # colnames(dat)=scan(expression_xls,what='character',nlines=1,sep='\t',quiet=T)[1:ncol(dat)] 84 | # } 85 | #to here. 86 | 87 | 88 | 89 | #From here down, the code is unaltered but for one minor bug fix where ComBat returns. 90 | if (skip>0){ 91 | geneinfo <- as.matrix(dat[,1:skip]) 92 | dat <- dat[,-c(1:skip)] 93 | }else{geneinfo=NULL} 94 | #print(geneinfo[1:4]) 95 | #print(dat[1:2,]) 96 | 97 | if(filter){ 98 | ngenes <- nrow(dat) 99 | col <- ncol(dat)/2 100 | present <- apply(dat, 1, filter.absent, filter) 101 | dat <- dat[present, -(2*(1:col))] 102 | if (skip>0){geneinfo <- geneinfo[present,]} 103 | cat('Filtered genes absent in more than',filter,'of samples. Genes remaining:',nrow(dat),'; Genes filtered:',ngenes-nrow(dat),'\n') 104 | } 105 | 106 | if(any(apply(dat,2,mode)!='numeric')){return('ERROR: Array expression columns contain non-numeric values! (Check your .xls file for non-numeric values and if this is not the problem, make a .csv file and use the type=csv option)')} 107 | 108 | tmp <- match(colnames(dat),saminfo[,1]) 109 | if(any(is.na(tmp))){return('ERROR: Sample Information File and Data Array Names are not the same!')} 110 | tmp1 <- match(saminfo[,1],colnames(dat)) 111 | saminfo <- saminfo[tmp1[!is.na(tmp1)],] 112 | 113 | if(any(covariates != 'all')){saminfo <- saminfo[,c(1:2,covariates)]} 114 | design <- design.mat(saminfo) 115 | 116 | 117 | batches <- list.batch(saminfo) 118 | n.batch <- length(batches) 119 | n.batches <- sapply(batches, length) 120 | n.array <- sum(n.batches) 121 | 122 | ## Check for missing values 123 | NAs = any(is.na(dat)) 124 | if(NAs){cat(c('Found',sum(is.na(dat)),'Missing Data Values\n'),sep=' ')} 125 | #print(dat[1:2,]) 126 | ##Standardize Data across genes 127 | cat('Standardizing Data across genes\n') 128 | if (!NAs){B.hat <- solve(t(design)%*%design)%*%t(design)%*%t(as.matrix(dat))}else{B.hat=apply(dat,1,Beta.NA,design)} #Standarization Model 129 | grand.mean <- t(n.batches/n.array)%*%B.hat[1:n.batch,] 130 | if (!NAs){var.pooled <- ((dat-t(design%*%B.hat))^2)%*%rep(1/n.array,n.array)}else{var.pooled <- apply(dat-t(design%*%B.hat),1,var,na.rm=T)} 131 | 132 | stand.mean <- t(grand.mean)%*%t(rep(1,n.array)) 133 | if(!is.null(design)){tmp <- design;tmp[,c(1:n.batch)] <- 0;stand.mean <- stand.mean+t(tmp%*%B.hat)} 134 | s.data <- (dat-stand.mean)/(sqrt(var.pooled)%*%t(rep(1,n.array))) 135 | 136 | ##Get regression batch effect parameters 137 | cat("Fitting L/S model and finding priors\n") 138 | batch.design <- design[,1:n.batch] 139 | if (!NAs){gamma.hat <- solve(t(batch.design)%*%batch.design)%*%t(batch.design)%*%t(as.matrix(s.data))}else{gamma.hat=apply(s.data,1,Beta.NA,batch.design)} 140 | delta.hat <- NULL 141 | for (i in batches){ 142 | delta.hat <- rbind(delta.hat,apply(s.data[,i], 1, var,na.rm=T)) 143 | } 144 | 145 | ##Find Priors 146 | gamma.bar <- apply(gamma.hat, 1, mean) 147 | t2 <- apply(gamma.hat, 1, var) 148 | a.prior <- apply(delta.hat, 1, aprior) 149 | b.prior <- apply(delta.hat, 1, bprior) 150 | 151 | 152 | ##Plot empirical and parametric priors 153 | 154 | if (prior.plots & par.prior){ 155 | par(mfrow=c(2,2)) 156 | tmp <- density(gamma.hat[1,]) 157 | plot(tmp, type='l', main="Density Plot") 158 | xx <- seq(min(tmp$x), max(tmp$x), length=100) 159 | lines(xx,dnorm(xx,gamma.bar[1],sqrt(t2[1])), col=2) 160 | qqnorm(gamma.hat[1,]) 161 | qqline(gamma.hat[1,], col=2) 162 | 163 | tmp <- density(delta.hat[1,]) 164 | invgam <- 1/rgamma(ncol(delta.hat),a.prior[1],b.prior[1]) 165 | tmp1 <- density(invgam) 166 | plot(tmp, typ='l', main="Density Plot", ylim=c(0,max(tmp$y,tmp1$y))) 167 | lines(tmp1, col=2) 168 | qqplot(delta.hat[1,], invgam, xlab="Sample Quantiles", ylab='Theoretical Quantiles') 169 | lines(c(0,max(invgam)),c(0,max(invgam)),col=2) 170 | title('Q-Q Plot') 171 | } 172 | 173 | ##Find EB batch adjustments 174 | 175 | gamma.star <- delta.star <- NULL 176 | if(par.prior){ 177 | cat("Finding parametric adjustments\n") 178 | for (i in 1:n.batch){ 179 | temp <- it.sol(s.data[,batches[[i]]],gamma.hat[i,],delta.hat[i,],gamma.bar[i],t2[i],a.prior[i],b.prior[i]) 180 | gamma.star <- rbind(gamma.star,temp[1,]) 181 | delta.star <- rbind(delta.star,temp[2,]) 182 | } 183 | }else{ 184 | cat("Finding nonparametric adjustments\n") 185 | for (i in 1:n.batch){ 186 | temp <- int.eprior(as.matrix(s.data[,batches[[i]]]),gamma.hat[i,],delta.hat[i,]) 187 | gamma.star <- rbind(gamma.star,temp[1,]) 188 | delta.star <- rbind(delta.star,temp[2,]) 189 | } 190 | } 191 | 192 | 193 | ### Normalize the Data ### 194 | cat("Adjusting the Data\n") 195 | 196 | bayesdata <- s.data 197 | j <- 1 198 | 199 | for (i in batches){ 200 | bayesdata[,i] <- (bayesdata[,i]-t(batch.design[i,]%*%gamma.star))/(sqrt(delta.star[j,])%*%t(rep(1,n.batches[j]))) 201 | j <- j+1 202 | } 203 | 204 | bayesdata <- (bayesdata*(sqrt(var.pooled)%*%t(rep(1,n.array))))+stand.mean 205 | if(FALSE){ 206 | output_file <- paste('Adjusted',expression_xls,'.xls',sep='_') 207 | #print(geneinfo[1:2]) 208 | #print(bayesdata[1:2,1:4]) 209 | #cat(c(colnames(geneinfo),colnames(dat),'\n'),file=output_file,sep='\t') 210 | #suppressWarnings(write.table(cbind(geneinfo,formatC(as.matrix(bayesdata), format = "f")), file=output_file, sep="\t", quote=F,row.names=F,col.names=F,append=T)) 211 | if(!is.null(geneinfo)){ outdata <- cbind(ProbeID=geneinfo, bayesdata)}else{outdata=bayesdata}; 212 | write.table(outdata, file=output_file, sep="\t",row.names=F) 213 | cat("Adjusted data saved in file:",output_file,"\n") 214 | }else{ 215 | 216 | #Minor change here: This is a bug fix by Jason Rudy 217 | if(!is.null(geneinfo)){ 218 | return(cbind(geneinfo,bayesdata)) 219 | }else 220 | return(bayesdata) 221 | } 222 | } 223 | 224 | # filters data based on presence/absence call 225 | filter.absent <- function(x,pct){ 226 | present <- T 227 | col <- length(x)/2 228 | pct.absent <- (sum(x[2*(1:col)]=="A") + sum(x[2*(1:col)]=="M"))/col 229 | if(pct.absent > pct){present <- F} 230 | present 231 | } 232 | 233 | # Next two functions make the design matrix (X) from the sample info file 234 | build.design <- function(vec, des=NULL, start=2){ 235 | tmp <- matrix(0,length(vec),nlevels(vec)-start+1) 236 | for (i in 1:ncol(tmp)){tmp[,i] <- vec==levels(vec)[i+start-1]} 237 | cbind(des,tmp) 238 | } 239 | 240 | design.mat <- function(saminfo){ 241 | tmp <- which(colnames(saminfo) == 'Batch') 242 | tmp1 <- as.factor(saminfo[,tmp]) 243 | cat("Found",nlevels(tmp1),'batches\n') 244 | design <- build.design(tmp1,start=1) 245 | ncov <- ncol(as.matrix(saminfo[,-c(1:2,tmp)])) 246 | cat("Found",ncov,'covariate(s)\n') 247 | if(ncov>0){ 248 | for (j in 1:ncov){ 249 | tmp1 <- as.factor(as.matrix(saminfo[,-c(1:2,tmp)])[,j]) 250 | design <- build.design(tmp1,des=design) 251 | } 252 | } 253 | design 254 | } 255 | 256 | # Makes a list with elements pointing to which array belongs to which batch 257 | list.batch <- function(saminfo){ 258 | tmp1 <- as.factor(saminfo[,which(colnames(saminfo) == 'Batch')]) 259 | batches <- NULL 260 | for (i in 1:nlevels(tmp1)){batches <- append(batches, list((1:length(tmp1))[tmp1==levels(tmp1)[i]]))} 261 | batches 262 | } 263 | 264 | # Trims the data of extra columns, note your array names cannot be named 'X' or start with 'X.' 265 | trim.dat <- function(dat){ 266 | tmp <- strsplit(colnames(dat),'\\.') 267 | tr <- NULL 268 | for (i in 1:length(tmp)){tr <- c(tr,tmp[[i]][1]!='X')} 269 | tr 270 | } 271 | 272 | # Following four find empirical hyper-prior values 273 | aprior <- function(gamma.hat){m=mean(gamma.hat); s2=var(gamma.hat); (2*s2+m^2)/s2} 274 | 275 | bprior <- function(gamma.hat){m=mean(gamma.hat); s2=var(gamma.hat); (m*s2+m^3)/s2} 276 | 277 | postmean <- function(g.hat,g.bar,n,d.star,t2){(t2*n*g.hat+d.star*g.bar)/(t2*n+d.star)} 278 | 279 | postvar <- function(sum2,n,a,b){(.5*sum2+b)/(n/2+a-1)} 280 | 281 | 282 | # Pass in entire data set, the design matrix for the entire data, the batch means, the batch variances, priors (m, t2, a, b), columns of the data matrix for the batch. Uses the EM to find the parametric batch adjustments 283 | 284 | it.sol <- function(sdat,g.hat,d.hat,g.bar,t2,a,b,conv=.0001){ 285 | n <- apply(!is.na(sdat),1,sum) 286 | g.old <- g.hat 287 | d.old <- d.hat 288 | change <- 1 289 | count <- 0 290 | while(change>conv){ 291 | g.new <- postmean(g.hat,g.bar,n,d.old,t2) 292 | sum2 <- apply((sdat-g.new%*%t(rep(1,ncol(sdat))))^2, 1, sum,na.rm=T) 293 | d.new <- postvar(sum2,n,a,b) 294 | change <- max(abs(g.new-g.old)/g.old,abs(d.new-d.old)/d.old) 295 | g.old <- g.new 296 | d.old <- d.new 297 | count <- count+1 298 | } 299 | #cat("This batch took", count, "iterations until convergence\n") 300 | adjust <- rbind(g.new, d.new) 301 | rownames(adjust) <- c("g.star","d.star") 302 | adjust 303 | } 304 | 305 | #likelihood function used below 306 | L <- function(x,g.hat,d.hat){prod(dnorm(x,g.hat,sqrt(d.hat)))} 307 | 308 | # Monte Carlo integration function to find the nonparametric adjustments 309 | int.eprior <- function(sdat,g.hat,d.hat){ 310 | g.star <- d.star <- NULL 311 | r <- nrow(sdat) 312 | for(i in 1:r){ 313 | g <- g.hat[-i] 314 | d <- d.hat[-i] 315 | x <- sdat[i,!is.na(sdat[i,])] 316 | n <- length(x) 317 | j <- numeric(n)+1 318 | dat <- matrix(as.numeric(x),length(g),n,byrow=T) 319 | resid2 <- (dat-g)^2 320 | sum2 <- resid2%*%j 321 | LH <- 1/(2*pi*d)^(n/2)*exp(-sum2/(2*d)) 322 | LH[LH=="NaN"]=0 323 | g.star <- c(g.star,sum(g*LH)/sum(LH)) 324 | d.star <- c(d.star,sum(d*LH)/sum(LH)) 325 | #if(i%%1000==0){cat(i,'\n')} 326 | } 327 | adjust <- rbind(g.star,d.star) 328 | rownames(adjust) <- c("g.star","d.star") 329 | adjust 330 | } 331 | 332 | #fits the L/S model in the presence of missing data values 333 | 334 | Beta.NA = function(y,X){ 335 | des=X[!is.na(y),] 336 | y1=y[!is.na(y)] 337 | B <- solve(t(des)%*%des)%*%t(des)%*%y1 338 | B 339 | } 340 | -------------------------------------------------------------------------------- /MatchMixeR/R/functions.R: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010 3 | 4 | #This program is free software: you can redistribute it and/or modify 5 | #it under the terms of the GNU General Public License as published by 6 | #the Free Software Foundation, either version 3 of the License, or 7 | #(at your option) any later version. 8 | 9 | #This program is distributed in the hope that it will be useful, 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | #GNU General Public License for more details. 13 | 14 | #You should have received a copy of the GNU General Public License 15 | #along with this program. If not, see . 16 | ###################################################################### 17 | 18 | cross.platform.normalize=function(method,...){ 19 | if(is.character(method)){ 20 | method = get(method) 21 | } 22 | method(...) 23 | } 24 | 25 | 26 | processplatforms = function(datalist, namesvec=NULL, skip.match=FALSE){ 27 | #Convert data from various formats to the proper format for use 28 | #with all the crossnorm normalization functions 29 | 30 | for(i in 1:length(datalist)){ 31 | if(is.matrix(datalist[[i]])){ 32 | datalist[[i]] <- as.data.frame(datalist[[i]]) 33 | } 34 | } 35 | 36 | if (is.null(namesvec)){ 37 | namesvec <- numeric(length(datalist)) 38 | for (i in 1:length(datalist)){ 39 | namesvec[i] <- 0 40 | } 41 | } 42 | 43 | #Put the row names in their places 44 | for (i in 1:length(namesvec)){ 45 | if(namesvec[i] != 0){ 46 | rownames(datalist[[i]]) = datalist[[i]][,namesvec[i]] 47 | datalist[[i]] = datalist[[i]][,-1*namesvec[i],drop=FALSE] 48 | } 49 | } 50 | 51 | if(!skip.match){ 52 | #Create the common genes list 53 | commongenes <- rownames(datalist[[1]]) 54 | for (i in 2:length(datalist)){ 55 | commongenes <- intersect(commongenes,rownames(datalist[[i]])) 56 | } 57 | 58 | 59 | #Put it all together 60 | for (i in 1:length(datalist)){ 61 | datalist[[i]] <- datalist[[i]][commongenes,,drop=FALSE] 62 | } 63 | } 64 | return(datalist) 65 | } 66 | 67 | 68 | assert = function(aBool){ 69 | if (!aBool){ 70 | stop("Assert failed!") 71 | } 72 | } 73 | 74 | repmat = function(mat,m,n){ 75 | out = mat 76 | if(m>1){ 77 | for (i in 2:m){ 78 | out = rbind(out,mat) 79 | } 80 | } 81 | if(n>1){ 82 | mat = out 83 | for (j in 2:n){ 84 | out = cbind(out,mat) 85 | } 86 | } 87 | return(out) 88 | } 89 | ones = function(m,n){ 90 | return(matrix(1,nrow=m,ncol=n)) 91 | } 92 | 93 | zeros = function(m,n){ 94 | return(matrix(0,nrow=m,ncol=n)) 95 | } 96 | 97 | speye = function(m){ 98 | #This function is not actually sparse yet. Needs to be fixed. 99 | 100 | return(diag(m)) 101 | 102 | } 103 | 104 | 105 | 106 | rowMedians = function(aMatrix, na.rm = FALSE){ 107 | #Also works on data.frames 108 | m = dim(aMatrix)[1] 109 | ret = numeric(m) 110 | for (i in 1:m){ 111 | ret[i] = mean(median(aMatrix[i,],na.rm=na.rm),na.rm=TRUE) 112 | } 113 | return(ret) 114 | } 115 | 116 | randn = function(m,n){ 117 | return(matrix(rnorm(m*n),m,n)) 118 | } 119 | 120 | colRanks = function(aMatrix){ 121 | return(apply(aMatrix,2,rank)) 122 | } 123 | -------------------------------------------------------------------------------- /MatchMixeR/R/gq.R: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010 3 | 4 | #This program is free software: you can redistribute it and/or modify 5 | #it under the terms of the GNU General Public License as published by 6 | #the Free Software Foundation, either version 3 of the License, or 7 | #(at your option) any later version. 8 | 9 | #This program is distributed in the hope that it will be useful, 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | #GNU General Public License for more details. 13 | 14 | #You should have received a copy of the GNU General Public License 15 | #along with this program. If not, see . 16 | ###################################################################### 17 | 18 | 19 | ###################################################################### 20 | #This file contains code adapted from the WebArrayDB program, 21 | #available from http://www.webarraydb.org/webarray/index.html Please 22 | #cite appropriately when using these functions. Type 23 | #citation("CONOR") at the R prompt for details. 24 | 25 | ## Implementation of the "Gene Quantile" cross platform 26 | ## normalization algorithm, as available from webarraydb. 27 | ## The function gq() is a wrapper for normalizeGQ, which 28 | ## was provided by the authors of webarraydb. 29 | #' 30 | #'fit mixed effect regression model 31 | #'@param X gene expression matrix 32 | #'@param Y gene expression matrix 33 | #'@export 34 | gq = function(platform1.data, platform2.data, p1.names=0, p2.names=0, skip.match=FALSE){ 35 | #This function is basically a wrapper for normalizeGQ 36 | 37 | #Match names 38 | input = processplatforms(list(x=platform1.data,y=platform2.data),namesvec = c(p1.names, p2.names), skip.match=skip.match) 39 | 40 | #Prepare for normalizeGQ 41 | combined = cbind(input$x,input$y) 42 | pf = c(seq(1,1,length.out=dim(input$x)[2]),seq(2,2,length.out=dim(input$y)[2])) 43 | 44 | #Call normalizeGQ 45 | ngq = normalizeGQ(combined,pf) 46 | 47 | #Split the results and return 48 | out=split(seq(pf),pf) 49 | out[[1]] = ngq[,out[[1]]] 50 | out[[2]] = ngq[,out[[2]]] 51 | names(out) <- c("x","y") 52 | return(out) 53 | } 54 | 55 | 56 | normalizeGQ <- function(M, pf, ...) { 57 | #This function was provided by Xiao-Qin Xia, one of the authors of 58 | #webarraydb. 59 | # modified MRS 60 | # M is the data matrix 61 | # pf is the vector to specify the platform for each column of M. 62 | idx <- split(seq(pf), pf) 63 | if (length(pf)<=1) return(M) 64 | imax <- which.max(sapply(idx, length)) # for reference 65 | ref_med <- apply(M[, idx[[imax]]], 1, function(x) median(x, na.rm=TRUE)) 66 | ref_med_srt <- sort(ref_med) 67 | idx[imax] <- NULL 68 | lapply(idx, function(i) { 69 | MTMP <- sapply(i, function(x) ref_med_srt[rank(M[,x])]); 70 | M[,i] <<- MTMP - apply(MTMP, 1, median) + ref_med 71 | } ) 72 | invisible(M) 73 | } 74 | 75 | -------------------------------------------------------------------------------- /MatchMixeR/R/xpn.R: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010 3 | 4 | #This program is free software: you can redistribute it and/or modify 5 | #it under the terms of the GNU General Public License as published by 6 | #the Free Software Foundation, either version 3 of the License, or 7 | #(at your option) any later version. 8 | 9 | #This program is distributed in the hope that it will be useful, 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | #GNU General Public License for more details. 13 | 14 | #You should have received a copy of the GNU General Public License 15 | #along with this program. If not, see . 16 | ###################################################################### 17 | 18 | #Implementation of the xpn cross platform normalization algorithm. 19 | 20 | #'@param X gene expression matrix 21 | #'@param Y gene expression matrix 22 | #'@export 23 | xpn = function(platform1.data,platform2.data,K=10,L=4,p1.names=0,p2.names=0,gene.cluster="kmeans",assay.cluster="kmeans",corr="pearson", iterations=30, skip.match=FALSE){ 24 | #If K or L is not a single value, it is taken as a list of possible values. 25 | 26 | #Match names 27 | input = processplatforms(list(x=platform1.data,y=platform2.data),namesvec = c(p1.names, p2.names), skip.match=skip.match) 28 | x <- input$x 29 | y <- input$y 30 | input <- NA 31 | 32 | #Remove the medians 33 | x_demed = x - rowMedians(as.matrix(x)) 34 | y_demed = y - rowMedians(as.matrix(y)) 35 | 36 | #Get the dimensions 37 | nx=dim(x)[2] 38 | ny=dim(y)[2] 39 | mx=dim(x)[1] 40 | my=dim(y)[1] 41 | 42 | 43 | #Create the combined dataframe for clustering purposes 44 | combined <- cbind(x_demed,y_demed) 45 | 46 | #Detect K and L if necessary 47 | K = detect_K(combined,K,corr) 48 | L = detect_L(x_demed,y_demed,L,corr) 49 | 50 | 51 | xout = 0 52 | yout = 0 53 | 54 | 55 | for(iter in 1:iterations){ 56 | cat("XPN iteration",iter,"out of",iterations,"\n") 57 | 58 | #Do the assay clustering 59 | assayclusters = xpnassaycluster(x_demed, y_demed, L, assay.cluster, corr) 60 | 61 | #Do the gene clustering 62 | geneclusters = xpngenecluster(combined, K, gene.cluster, corr) 63 | 64 | #Estimate the XPN parameters 65 | xparams = xpnparamsp(x,geneclusters,assayclusters[1:nx]) 66 | yparams = xpnparamsp(y,geneclusters,assayclusters[(nx+1):(nx+ny)]) 67 | 68 | #Calcuate the weighted averages 69 | nor = 1/(nx + ny) 70 | Aav = try((1/(xparams$nblock+yparams$nblock))*(xparams$nblock*xparams$A + yparams$nblock*yparams$A)) 71 | bav = nor*(nx*xparams$b + ny*yparams$b) 72 | cav = nor*(nx*xparams$c + ny*yparams$c) 73 | sigmaav = sqrt(nor*(nx*xparams$s2 + ny*yparams$s2)) 74 | sigmax = sqrt(xparams$s2) 75 | sigmay = sqrt(yparams$s2) 76 | 77 | 78 | #Calculate the expanded A 79 | expAx = xpnclusterexpand(xparams$A,geneclusters,assayclusters[1:nx]) 80 | expAy = xpnclusterexpand(yparams$A,geneclusters,assayclusters[(nx+1):(nx+ny)]) 81 | 82 | #Calculate the residuals 83 | epsilonx = as.matrix(x) - (as.vector(xparams$b) * expAx + kronecker(ones(1,nx),xparams$c)) 84 | epsilony = as.matrix(y) - (as.vector(yparams$b) * expAy + kronecker(ones(1,ny),yparams$c)) 85 | 86 | #Calculate the expanded average A 87 | expAavx = xpnclusterexpand(Aav,geneclusters,assayclusters[1:nx]) 88 | expAavy = xpnclusterexpand(Aav,geneclusters,assayclusters[(nx+1):(nx+ny)]) 89 | 90 | #Calculate the output values 91 | xout = xout + (1/iterations)*((as.vector(bav) * expAavx) + kronecker(ones(1,nx),cav) + as.vector(sigmaav/sigmax) * epsilonx) 92 | yout = yout + (1/iterations)*((as.vector(bav) * expAavy) + kronecker(ones(1,ny),cav) + as.vector(sigmaav/sigmay) * epsilony) 93 | 94 | }#end of the enclosing for loop 95 | 96 | #Put the rownames back in and convert to data frames 97 | xout = as.data.frame(xout,row.names=rownames(x)) 98 | yout = as.data.frame(yout,row.names=rownames(y)) 99 | 100 | #All done! 101 | return(list(x=xout,y=yout)) 102 | } 103 | 104 | xpnparamsp = function(x, geneclusters, assayclusters, method){ 105 | #x should be a dataframe or hashdataframe 106 | 107 | #Get K and L 108 | K=max(geneclusters) 109 | L=max(assayclusters) 110 | 111 | #Get number of assays and genes 112 | numassays=length(x) 113 | numgenes=length(rownames(x)) 114 | 115 | #Number of assays in each cluster 116 | nj = matrix(table(as.factor(assayclusters)),1,L) 117 | 118 | #Set up the output variables 119 | A = matrix(nrow=K,ncol=L) 120 | nblock = matrix(nrow=K,ncol=L) 121 | b = matrix(nrow=numgenes, ncol=1) 122 | c = matrix(nrow=numgenes, ncol=1) 123 | sigma = matrix(nrow=numgenes, ncol=1) 124 | mu = matrix(nrow=numgenes,ncol=L) 125 | 126 | 127 | #For each gene cluster, estimate the XPN parameters 128 | for (a in 1:K){ 129 | 130 | #Get the logical indexing vector for gene cluster a 131 | geneinds = geneclusters==a 132 | numgenesa = sum(as.numeric(geneinds)) 133 | 134 | #Get the data for this gene cluster 135 | xa = as.matrix(x[geneinds,]) 136 | 137 | #Get the number of genes in this cluster 138 | Ga = dim(xa)[1] 139 | S = dim(xa)[2] 140 | 141 | #For each assay cluster and each gene, get the (relatively) unconstrained 142 | #MLE for mu 143 | mua = matrix(nrow=Ga,ncol=L) 144 | for (bb in 1:L){ 145 | assayinds = assayclusters==bb 146 | xab = matrix(xa[,assayinds],numgenesa,sum(as.numeric(assayinds))) 147 | mua[,bb]=t(t(rowMeans(xab))) 148 | } 149 | 150 | 151 | #Calculate sigma2 152 | expmua = xpnclusterexpand(mua, colclusters = assayclusters) 153 | sigma2 = xa - expmua 154 | sigma2 = sigma2*sigma2 155 | sigma2 = xpnclustercollapse(sigma2,colclusters = assayclusters) 156 | 157 | solna = XPN_MLE_C(mua,sigma2,nj) 158 | 159 | ba = solna$b 160 | Aa = solna$A 161 | ca = solna$c 162 | s2a = solna$s2 163 | 164 | #Write the values for this gene group into the larger arrays 165 | sigma[geneinds,] = s2a 166 | mu[geneinds,] = mua 167 | A[a,] = Aa 168 | nblock[a,] = nj 169 | b[geneinds,] = ba 170 | c[geneinds,] = ca 171 | } 172 | 173 | return(list(A=A,nblock=nblock,b=b,c=c,s2=sigma,mu=mu)) 174 | 175 | } 176 | 177 | 178 | xpnclustercollapse = function(aMatrix, rowclusters = 1:(dim(aMatrix)[1]), colclusters = 1:(dim(aMatrix)[2])){ 179 | #This undoes the work of xpnclusterexpand 180 | 181 | l = dim(aMatrix)[1] 182 | w = dim(aMatrix)[2] 183 | 184 | #collapse the rows 185 | rowidx = split(1:l,rowclusters) 186 | lc = length(rowidx) 187 | collapse1 = matrix(NA,lc,w) 188 | for ( i in 1:lc){ 189 | collapse1[i,] = colMeans(matrix(aMatrix[rowidx[[i]],],length(rowidx[[i]]),w)) 190 | } 191 | 192 | #collapse the columns 193 | colidx = split(1:w,colclusters) 194 | wc = length(colidx) 195 | collapse2 = matrix(NA,lc,wc) 196 | for ( i in 1:wc){ 197 | collapse2[,i] = rowMeans(matrix(collapse1[,colidx[[i]]],lc,length(colidx[[i]]))) 198 | } 199 | 200 | return(collapse2) 201 | 202 | } 203 | 204 | 205 | xpnclusterexpand = function(aMatrix, rowclusters = 1:(dim(aMatrix)[1]), colclusters = 1:(dim(aMatrix)[2]), noise = 0){ 206 | #This is basically a fancy repmat. rowclusters and colclusters 207 | #as produced by pam. aMatrix must have the same number of rows 208 | #and columns as there are row and column clusters. 209 | 210 | l = dim(aMatrix)[1] 211 | w = dim(aMatrix)[2] 212 | 213 | #Get the dimensions of the output 214 | m = length(rowclusters) 215 | n = length(colclusters) 216 | 217 | #Get the number of row and column clusters 218 | nrowclust = max(rowclusters) 219 | ncolclust = max(colclusters) 220 | 221 | #initialize the vertically expanded matrix 222 | vert = zeros(m,dim(aMatrix)[2]) 223 | 224 | #Do the vertical expansion 225 | for (i in 1:m){ 226 | vert[i,]=aMatrix[rowclusters[i],] + (noise*randn(1,w)) 227 | } 228 | 229 | #initialize the fully expanded matrix 230 | output = matrix(NA,m,n) 231 | 232 | #Do the horizontal expansion 233 | for (j in 1:n){ 234 | output[,j] = vert[,colclusters[j]] + (noise*randn(m,1)) 235 | } 236 | 237 | #That is it! 238 | return(output) 239 | } 240 | 241 | xpnassaycluster = function(x,y,L,cluster="kmeans",corr="pearson"){ 242 | 243 | if(is.numeric(cluster) | is.factor(cluster)){ 244 | return(as.numeric(cluster)) 245 | } 246 | 247 | #The numbers of assays 248 | nx = dim(x)[2] 249 | ny = dim(y)[2] 250 | 251 | #The number of genes 252 | m = dim(x)[1] 253 | 254 | #Compute the two correlation matrices if necessary 255 | if (cluster == 'pam' | (length(L)>1)){ 256 | xdiss = 1 - cor(as.matrix(x), method = corr) 257 | ydiss = 1 - cor(as.matrix(y), method = corr) 258 | } 259 | 260 | #Determine L if a range has been given 261 | if(length(L)>1){ 262 | Lx=pamk(xdiss,krange=L,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)$nc 263 | Ly = pamk(ydiss,krange=L,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)$nc 264 | L = max(Lx,Ly) 265 | cat(as.character(L), "assay clusters detected...\n") 266 | } 267 | 268 | #Do the clustering 269 | if (cluster == "classic"){ 270 | 271 | done = FALSE 272 | while (!done){ 273 | xyclust <- ckmeans(t(cbind(x,y)),centers=L, iter.max=20, distance=corr) 274 | #print(xyclust$cluster) 275 | xclust <- xyclust$cluster[1:nx] 276 | yclust <- xyclust$cluster[(nx+1):(nx+ny)] 277 | #print(xclust) 278 | # print(yclust) 279 | # print(nlevels(as.factor(xclust))) 280 | # print(nlevels(as.factor(yclust))) 281 | done <- nlevels(as.factor(xclust)) == L & nlevels(as.factor(yclust)) == L 282 | if(!done){ 283 | cat("Not all platforms contained all assay clusters. Trying again (only \"classic\" mode has this problem)...\n") 284 | } 285 | } 286 | return(xyclust$cluster) 287 | } 288 | else if (cluster == "pam"){ 289 | xclust = pam(xdiss, k=L, diss=TRUE, keep.diss=FALSE, keep.data=FALSE,cluster.only=TRUE) 290 | yclust = pam(ydiss, k=L, diss=TRUE, keep.diss=FALSE, keep.data=FALSE,cluster.only=TRUE) 291 | }else if (cluster == "kmeans"){ 292 | 293 | #This loop ensures there are no empty clusters. 294 | done = FALSE 295 | while (!done){ 296 | xclust = ckmeans(t(x), centers=L, iter.max = 20, distance = corr) 297 | yclust = ckmeans(t(y), centers=L, iter.max = 20, distance = corr) 298 | 299 | done = !xclust$empty & !yclust$empty 300 | } 301 | xclust = xclust$cluster 302 | yclust = yclust$cluster 303 | 304 | } else if (cluster == "flexclust"){ 305 | 306 | distCor1 = function (x, centers) 307 | { 308 | z <- matrix(0, nrow(x), ncol = nrow(centers)) 309 | for (k in 1:nrow(centers)) { 310 | z[, k] <- 1 - cor(t(x), centers[k, ], method=corr) 311 | } 312 | z 313 | } 314 | #This loop ensures there are no empty clusters. 315 | done = FALSE 316 | while (!done){ 317 | xclust = kcca(t(x), k=L, family=kccaFamily(dist=distCor1,cent="centMean"), simple=TRUE) 318 | yclust = kcca(t(y), k=L, family=kccaFamily(dist=distCor1,cent="centMean"), simple=TRUE) 319 | 320 | done = (dim(xclust@clusinfo)[1] == L) &(dim(yclust@clusinfo)[1] == L) 321 | } 322 | xclust = xclust@cluster 323 | yclust = yclust@cluster 324 | }else if(cluster == "random"){ 325 | xclust = sample(c(1:L,sample(1:L,nx-L,replace = TRUE)),nx,replace=FALSE) 326 | yclust = sample(c(1:L,sample(1:L,ny-L,replace = TRUE)),ny,replace=FALSE) 327 | } 328 | 329 | #Compute cluster averages 330 | xave = matrix(NA,m,L) 331 | yave = matrix(NA,m,L) 332 | for (i in 1:L){ 333 | xinds = xclust==i 334 | yinds = yclust==i 335 | xave[,i] = rowMeans(as.matrix(x[,xinds],nrow=m,ncol=sum(as.numeric(xinds)))) 336 | yave[,i] = rowMeans(as.matrix(y[,yinds],nrow=m,ncol=sum(as.numeric(yinds)))) 337 | } 338 | 339 | #Compute the cluster correlation matrix 340 | clustercor = cor(xave,yave, method = corr) 341 | 342 | #Map clusters 343 | xtoymap = matrix(NA,L,1) 344 | for (i in 1:L){ 345 | highest = which.max(clustercor) 346 | xind = highest%%L 347 | 348 | yind = highest%/%L + 1 349 | 350 | if (xind==0){ 351 | xind = L 352 | yind = yind-1 353 | } 354 | xtoymap[xind]=yind 355 | clustercor[xind,] = -2 356 | clustercor[,yind] = -2 357 | } 358 | 359 | #Change the x clusters using the map 360 | newxclust = xclust 361 | for (i in 1:L){ 362 | newxclust[xclust==i] = xtoymap[i] 363 | } 364 | xclust = newxclust 365 | 366 | #Return the clustering in a combined vector 367 | return(c(xclust,yclust)) 368 | } 369 | 370 | 371 | xpngenecluster = function(x,K,cluster="pam",corr="pearson"){ 372 | 373 | #Dimensions 374 | m = dim(x)[1] 375 | n = dim(x)[2] 376 | 377 | #Compute the dissimilarity matrix 378 | if (cluster=="pam"|(length(K)>1)){ 379 | xdiss = 1 - cor(t(x),method=corr) 380 | } 381 | 382 | #Determine K if a range was given 383 | if(length(K)>1 ){ 384 | pamklist=pamk(xdiss,krange=K,diss=TRUE, keep.diss=FALSE, keep.data=FALSE) 385 | K = pamklist$nc 386 | genepamobj = pamklist$pamobject 387 | }else{ 388 | genepamobj = NULL 389 | } 390 | 391 | 392 | #Do the gene clustering 393 | if(cluster=="pam"){ 394 | if (!is.null(genepamobj)){ 395 | geneclusters = genepamobj$clustering 396 | }else{ 397 | geneclusters = pam(xdiss, k=K, diss=TRUE, keep.diss=FALSE, keep.data=FALSE, cluster.only=TRUE) 398 | } 399 | }else if(cluster=="kmeans"){ 400 | #This loop ensures there are no empty clusters. 401 | done = FALSE 402 | while (!done){ 403 | geneclusters = ckmeans(x,centers=K,iter.max=1000,distance=corr) 404 | done = !geneclusters$empty 405 | } 406 | geneclusters = geneclusters$cluster 407 | }else if(cluster=="flexclust"){ 408 | 409 | 410 | distCor1 = function (x, centers) #change 411 | { 412 | z <- matrix(0, nrow(x), ncol = nrow(centers)) 413 | for (k in 1:nrow(centers)) { 414 | z[, k] <- 1 - cor(t(x), centers[k, ], method=corr) 415 | } 416 | z 417 | } 418 | #This loop ensures there are no empty clusters. 419 | done = FALSE 420 | while (!done){ 421 | geneclusters = kcca(x, k=K, family=kccaFamily(dist=distCor1,cent="centMean"), simple=TRUE) #change 422 | #geneclusters = ckmeans(x,centers=K,iter.max=1000,distance=corr) 423 | done = (dim(geneclusters@clusinfo)[1]==K)#change 424 | } 425 | geneclusters = geneclusters@cluster 426 | }else if (cluster == "random"){ 427 | geneclusters = sample(c(1:K,sample(1:K,m-K,replace = TRUE)),m,replace=FALSE) 428 | }else{ 429 | cat("Unknown gene clustering method:",cluster,"\n") 430 | } 431 | 432 | return(geneclusters) 433 | 434 | } 435 | 436 | 437 | XPN_MLE_C = function(xbar,sigma2,nj){ 438 | 439 | 440 | n = sum(nj) 441 | I = dim(xbar)[1] 442 | J = dim(xbar)[2] 443 | 444 | A = zeros(1,J) 445 | b = ones(I,1) 446 | c = zeros(I,1) 447 | s2 = ones(I,1) 448 | 449 | cout = .C("XPN_MLE_C_C",as.double(xbar),A=as.double(A),b=as.double(b),c=as.double(c),s2=as.double(s2),as.double(sigma2),as.integer(I),as.integer(J),as.integer(n),as.integer(nj)) 450 | 451 | A = matrix(cout$A,1,J) 452 | b = matrix(cout$b,I,1) 453 | c = matrix(cout$c,I,1) 454 | s2 = matrix(cout$s2,I,1) 455 | 456 | return(list(A=A,b=b,c=c,s2=s2)) 457 | } 458 | 459 | XPN_MLE = function(xbar,sigma2,nj){ 460 | #This function is never used. It serves to illustrate the method of maximum likelihood estimation used by XPN. It has been replaced by XPN_MLE_C, which calls a more efficient C function. The inputs and outputs of these functions are identical. 461 | 462 | n = sum(nj) 463 | I = dim(xbar)[1] 464 | J = dim(xbar)[2] 465 | 466 | A = zeros(1,J) 467 | b = ones(I,1) 468 | c = zeros(I,1) 469 | s2 = ones(I,1) 470 | 471 | old = c(A, b, c, s2)*0 472 | iter = 0 473 | while(sum((c(A, b, c, s2)-old)^2)>(1e-16)*max(old)){ 474 | 475 | print(iter<-iter+1) 476 | print(sum((c(A, b, c, s2)-old)^2)) 477 | old = c(A, b, c, s2) 478 | 479 | c = matrix(rowSums((xbar-repmat(b,1,J)*repmat(A,I,1))*repmat(nj,I,1))/n,I,1) 480 | 481 | if(sum(b)<0){ 482 | b = -1*b; 483 | } 484 | 485 | A = matrix(colSums(repmat(b,1,J)*(xbar-repmat(c,1,J))/repmat(s2,1,J)/sum(b^2/s2)),1,J) 486 | 487 | A = A - mean(A) 488 | A = A * sqrt(J/sum(A^2)) 489 | 490 | b = matrix(rowSums(repmat(A,I,1)*(xbar-repmat(c,1,J))*repmat(nj,I,1))/sum(A^2 * nj),I,1) 491 | 492 | 493 | s2 = matrix(rowSums(((xbar-repmat(c,1,J)-repmat(A,I,1)*repmat(b,1,J))^2 + sigma2)*repmat(nj,I,1))/n,I,1) 494 | 495 | 496 | 497 | s2[s2==0] = 2.2251e-308 498 | 499 | 500 | 501 | } 502 | 503 | return(list(A=A,b=b,c=c,s2=s2)) 504 | } 505 | 506 | 507 | 508 | detect_L = function(x,y,L,corr="pearson"){ 509 | 510 | #Determine L if a range has been given 511 | if(length(L)>1 & length(L) != (dim(x)[2]+dim(y)[2])){ 512 | xdiss = 1 - cor(as.matrix(x), method = corr) 513 | ydiss = 1 - cor(as.matrix(y), method = corr) 514 | Lx=pamk(xdiss,krange=L,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)$nc 515 | Ly = pamk(ydiss,krange=L,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)$nc 516 | L = max(Lx,Ly) 517 | cat(as.character(L), "assay clusters detected...\n") 518 | } 519 | return(L) 520 | } 521 | 522 | detect_K = function(x,K,corr="pearson"){ 523 | 524 | #Determine K if a range was given 525 | if(length(K)>1 & length(K) != dim(x)[1]){ 526 | xdiss = 1 - cor(t(x),method=corr) 527 | pamklist=pamk(xdiss,krange=K,diss=TRUE, keep.diss=FALSE, keep.data=FALSE) 528 | K = pamklist$nc 529 | genepamobj = pamklist$pamobject 530 | }else{ 531 | genepamobj = NULL 532 | } 533 | return(K) 534 | } 535 | -------------------------------------------------------------------------------- /MatchMixeR/Results of devtools check.txt: -------------------------------------------------------------------------------- 1 | ==> devtools::check() 2 | 3 | Updating MatchMixeR documentation 4 | Writing NAMESPACE 5 | Loading MatchMixeR 6 | Writing NAMESPACE 7 | Warning: The existing 'MM.Rd' file was not generated by roxygen2, and will not be overwritten. 8 | -- Building ------------ MatchMixeR -- 9 | Setting env vars: 10 | * CFLAGS : -Wall -pedantic -fdiagnostics-color=always 11 | * CXXFLAGS : -Wall -pedantic -fdiagnostics-color=always 12 | * CXX11FLAGS: -Wall -pedantic -fdiagnostics-color=always 13 | -------------------------------------- 14 | v checking for file 'C:\Disa_Research\MatchMixeR/DESCRIPTION' ... 15 | - preparing 'MatchMixeR': 16 | v checking DESCRIPTION meta-information ... 17 | - installing the package to build vignettes 18 | v creating vignettes (3.5s) 19 | - checking for LF line-endings in source and make files and shell scripts 20 | - checking for empty or unneeded directories 21 | - looking to see if a 'data/datalist' file should be added 22 | - building 'MatchMixeR_0.1.1.tar.gz' 23 | 24 | -- Checking ------------ MatchMixeR -- 25 | Setting env vars: 26 | * _R_CHECK_CRAN_INCOMING_USE_ASPELL_: TRUE 27 | * _R_CHECK_CRAN_INCOMING_REMOTE_ : FALSE 28 | * _R_CHECK_CRAN_INCOMING_ : FALSE 29 | * _R_CHECK_FORCE_SUGGESTS_ : FALSE 30 | -- R CMD check ----------------------------------------------------------------- 31 | - using log directory 'C:/Disa_Research/MatchMixeR.Rcheck' 32 | - using R version 3.5.1 (2018-07-02) 33 | - using platform: x86_64-w64-mingw32 (64-bit) 34 | - using session charset: ISO8859-1 35 | - using options '--no-manual --as-cran' 36 | v checking for file 'MatchMixeR/DESCRIPTION' 37 | - checking extension type ... Package 38 | - this is package 'MatchMixeR' version '0.1.1' 39 | - package encoding: UTF-8 40 | v checking package namespace information ... 41 | v checking package dependencies (2.6s) 42 | v checking if this is a source package ... 43 | v checking if there is a namespace 44 | v checking for executable files (403ms) 45 | v checking for hidden files and directories ... 46 | v checking for portable file names ... 47 | v checking serialization versions 48 | v checking whether package 'MatchMixeR' can be installed (2.7s) 49 | v checking installed package size ... 50 | v checking package directory 51 | v checking 'build' directory ... 52 | N checking DESCRIPTION meta-information ... 53 | Malformed Description field: should contain one or more complete sentences. 54 | v checking top-level files 55 | v checking for left-over files 56 | v checking index information ... 57 | v checking package subdirectories ... 58 | v checking R files for non-ASCII characters ... 59 | v checking R files for syntax errors ... 60 | v checking whether the package can be loaded ... 61 | v checking whether the package can be loaded with stated dependencies ... 62 | v checking whether the package can be unloaded cleanly ... 63 | v checking whether the namespace can be loaded with stated dependencies ... 64 | v checking whether the namespace can be unloaded cleanly ... 65 | v checking loading without being on the library search path ... 66 | v checking dependencies in R code ... 67 | v checking S3 generic/method consistency (515ms) 68 | v checking replacement functions ... 69 | W checking foreign function calls (408ms) 70 | Foreign function call without 'PACKAGE' argument: 71 | .C("XPN_MLE_C_C", ...) 72 | See chapter 'System and foreign language interfaces' in the 'Writing R 73 | Extensions' manual. 74 | N checking R code for possible problems (5.3s) 75 | ComBat: no visible binding for global variable 'var' 76 | ComBat: no visible global function definition for 'par' 77 | ComBat: no visible global function definition for 'density' 78 | ComBat: no visible global function definition for 'plot' 79 | ComBat: no visible global function definition for 'lines' 80 | ComBat: no visible global function definition for 'dnorm' 81 | ComBat: no visible global function definition for 'qqnorm' 82 | ComBat: no visible global function definition for 'qqline' 83 | ComBat: no visible global function definition for 'rgamma' 84 | ComBat: no visible global function definition for 'qqplot' 85 | ComBat: no visible global function definition for 'title' 86 | DWD1SM: no visible global function definition for 'rdist' 87 | DWD1SM: no visible global function definition for 'median' 88 | L: no visible global function definition for 'dnorm' 89 | MM: no visible global function definition for 'cov' 90 | aprior: no visible global function definition for 'var' 91 | bprior: no visible global function definition for 'var' 92 | detect_K: no visible global function definition for 'cor' 93 | detect_K: no visible global function definition for 'pamk' 94 | detect_L: no visible global function definition for 'cor' 95 | detect_L: no visible global function definition for 'pamk' 96 | randn: no visible global function definition for 'rnorm' 97 | rowMedians: no visible global function definition for 'median' 98 | sepelimdwd: no visible global function definition for 'socp' 99 | xpnassaycluster: no visible global function definition for 'cor' 100 | xpnassaycluster: no visible global function definition for 'pamk' 101 | xpnassaycluster: no visible global function definition for 'ckmeans' 102 | xpnassaycluster: no visible global function definition for 'pam' 103 | xpnassaycluster : distCor1: no visible global function definition for 104 | 'cor' 105 | xpnassaycluster: no visible global function definition for 'kcca' 106 | xpnassaycluster: no visible global function definition for 'kccaFamily' 107 | xpngenecluster: no visible global function definition for 'cor' 108 | xpngenecluster: no visible global function definition for 'pamk' 109 | xpngenecluster: no visible global function definition for 'pam' 110 | xpngenecluster: no visible global function definition for 'ckmeans' 111 | xpngenecluster : distCor1: no visible global function definition for 112 | 'cor' 113 | xpngenecluster: no visible global function definition for 'kcca' 114 | xpngenecluster: no visible global function definition for 'kccaFamily' 115 | Undefined global functions or variables: 116 | ckmeans cor cov density dnorm kcca kccaFamily lines median pam pamk 117 | par plot qqline qqnorm qqplot rdist rgamma rnorm socp title var 118 | Consider adding 119 | importFrom("graphics", "lines", "par", "plot", "title") 120 | importFrom("stats", "cor", "cov", "density", "dnorm", "median", 121 | "qqline", "qqnorm", "qqplot", "rgamma", "rnorm", "var") 122 | to your NAMESPACE file. 123 | v checking Rd files ... 124 | v checking Rd metadata ... 125 | v checking Rd line widths ... 126 | v checking Rd cross-references ... 127 | v checking for missing documentation entries ... 128 | v checking for code/documentation mismatches (625ms) 129 | W checking Rd \usage sections ... 130 | Undocumented arguments in documentation object 'MatchMixeR' 131 | 'Xmat' 'Ymat' 132 | Documented arguments not in \usage in documentation object 'MatchMixeR': 133 | 'X' 'Y' 134 | 135 | Functions with \usage entries need to have the appropriate \alias 136 | entries, and all their arguments documented. 137 | The \usage entries must correspond to syntactically valid R code. 138 | See chapter 'Writing R documentation files' in the 'Writing R 139 | Extensions' manual. 140 | v checking Rd contents (613ms) 141 | v checking for unstated dependencies in examples ... 142 | v checking contents of 'data' directory 143 | v checking data for non-ASCII characters ... 144 | W checking data for ASCII and uncompressed saves (3s) 145 | 146 | Note: significantly better compression could be obtained 147 | by using R CMD build --resave-data 148 | old_size new_size compress 149 | gpl570_gpl96.rda 2.7Mb 2.3Mb xz 150 | v checking installed files from 'inst/doc' ... 151 | v checking files in 'vignettes' ... 152 | v checking examples (559ms) 153 | v checking for unstated dependencies in vignettes ... 154 | v checking package vignettes in 'inst/doc' ... 155 | v checking re-building of vignette outputs (878ms) 156 | 157 | See 158 | 'C:/Disa_Research/MatchMixeR.Rcheck/00check.log' 159 | for details. 160 | 161 | 162 | -- R CMD check results ----------------------------------- MatchMixeR 0.1.1 ---- 163 | Duration: 23.5s 164 | 165 | > checking foreign function calls ... WARNING 166 | Foreign function call without 'PACKAGE' argument: 167 | .C("XPN_MLE_C_C", ...) 168 | See chapter 'System and foreign language interfaces' in the 'Writing R 169 | Extensions' manual. 170 | 171 | > checking Rd \usage sections ... WARNING 172 | Undocumented arguments in documentation object 'MatchMixeR' 173 | 'Xmat' 'Ymat' 174 | Documented arguments not in \usage in documentation object 'MatchMixeR': 175 | 'X' 'Y' 176 | 177 | Functions with \usage entries need to have the appropriate \alias 178 | entries, and all their arguments documented. 179 | The \usage entries must correspond to syntactically valid R code. 180 | See chapter 'Writing R documentation files' in the 'Writing R 181 | Extensions' manual. 182 | 183 | > checking data for ASCII and uncompressed saves ... WARNING 184 | 185 | Note: significantly better compression could be obtained 186 | by using R CMD build --resave-data 187 | old_size new_size compress 188 | gpl570_gpl96.rda 2.7Mb 2.3Mb xz 189 | 190 | > checking DESCRIPTION meta-information ... NOTE 191 | Malformed Description field: should contain one or more complete sentences. 192 | 193 | > checking R code for possible problems ... NOTE 194 | ComBat: no visible binding for global variable 'var' 195 | ComBat: no visible global function definition for 'par' 196 | ComBat: no visible global function definition for 'density' 197 | ComBat: no visible global function definition for 'plot' 198 | ComBat: no visible global function definition for 'lines' 199 | ComBat: no visible global function definition for 'dnorm' 200 | ComBat: no visible global function definition for 'qqnorm' 201 | ComBat: no visible global function definition for 'qqline' 202 | ComBat: no visible global function definition for 'rgamma' 203 | ComBat: no visible global function definition for 'qqplot' 204 | ComBat: no visible global function definition for 'title' 205 | DWD1SM: no visible global function definition for 'rdist' 206 | DWD1SM: no visible global function definition for 'median' 207 | L: no visible global function definition for 'dnorm' 208 | MM: no visible global function definition for 'cov' 209 | aprior: no visible global function definition for 'var' 210 | bprior: no visible global function definition for 'var' 211 | detect_K: no visible global function definition for 'cor' 212 | detect_K: no visible global function definition for 'pamk' 213 | detect_L: no visible global function definition for 'cor' 214 | detect_L: no visible global function definition for 'pamk' 215 | randn: no visible global function definition for 'rnorm' 216 | rowMedians: no visible global function definition for 'median' 217 | sepelimdwd: no visible global function definition for 'socp' 218 | xpnassaycluster: no visible global function definition for 'cor' 219 | xpnassaycluster: no visible global function definition for 'pamk' 220 | xpnassaycluster: no visible global function definition for 'ckmeans' 221 | xpnassaycluster: no visible global function definition for 'pam' 222 | xpnassaycluster : distCor1: no visible global function definition for 223 | 'cor' 224 | xpnassaycluster: no visible global function definition for 'kcca' 225 | xpnassaycluster: no visible global function definition for 'kccaFamily' 226 | xpngenecluster: no visible global function definition for 'cor' 227 | xpngenecluster: no visible global function definition for 'pamk' 228 | xpngenecluster: no visible global function definition for 'pam' 229 | xpngenecluster: no visible global function definition for 'ckmeans' 230 | xpngenecluster : distCor1: no visible global function definition for 231 | 'cor' 232 | xpngenecluster: no visible global function definition for 'kcca' 233 | xpngenecluster: no visible global function definition for 'kccaFamily' 234 | Undefined global functions or variables: 235 | ckmeans cor cov density dnorm kcca kccaFamily lines median pam pamk 236 | par plot qqline qqnorm qqplot rdist rgamma rnorm socp title var 237 | Consider adding 238 | importFrom("graphics", "lines", "par", "plot", "title") 239 | importFrom("stats", "cor", "cov", "density", "dnorm", "median", 240 | "qqline", "qqnorm", "qqplot", "rgamma", "rnorm", "var") 241 | to your NAMESPACE file. 242 | 243 | 0 errors v | 3 warnings x | 2 notes x 244 | Error: R CMD check found WARNINGs 245 | Execution halted 246 | 247 | Exited with status 1. 248 | -------------------------------------------------------------------------------- /MatchMixeR/data/gpl570_gpl96.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dy16b/Cross-Platform-Normalization/83619fd3bc744ee6e52e621dc7d206defb222e5b/MatchMixeR/data/gpl570_gpl96.rda -------------------------------------------------------------------------------- /MatchMixeR/man/CONOR.Rd: -------------------------------------------------------------------------------- 1 | \name{MatchMixeR} 2 | \alias{CONOR} 3 | \alias{dwd} 4 | \alias{eb} 5 | \alias{xpn} 6 | \alias{gq} 7 | 8 | \title{ 9 | Functions for cross-platform normalization of microarray data, from the CONOR package. 10 | } 11 | \description{ 12 | The functions below perform cross-platform normalization of microarray data, and come from the CONOR package. Dwd is distance weighted discrimination. Eb is empirical bayes. Xpn is cross-platform normalization. The dwd function requires the package rdist. 13 | The xpn function requires the package conclust. 14 | } 15 | 16 | \usage{ 17 | 18 | dwd(platform1.data, platform2.data, platform1.train = NULL, 19 | platform2.train = NULL, p1.names = 0, p2.names = 0, p1.train.names = 0, 20 | p2.train.names = 0, skip.match = FALSE, use.sparse = TRUE) 21 | 22 | eb(platform1.data, platform2.data, par.prior = TRUE, filter = FALSE, 23 | prior.plots = FALSE, p1.names = 0, p2.names = 0, skip.match=FALSE) 24 | 25 | xpn(platform1.data, platform2.data, K = 10, L = 4, p1.names = 0, 26 | p2.names = 0, gene.cluster = "kmeans", assay.cluster = "kmeans", 27 | corr = "pearson", iterations = 30, skip.match = FALSE) 28 | 29 | gq(platform1.data, platform2.data, p1.names = 0, p2.names = 0, skip.match=FALSE) 30 | } 31 | %- maybe also 'usage' for other objects documented here. 32 | 33 | \arguments{ 34 | \item{platform1.data}{ 35 | Expression data from platform 1. Should be formatted as a data.frame, with each column representing an array and each row a gene. 36 | } 37 | \item{platform2.data}{ 38 | Expression data from platform 2. Should be formatted as a data.frame, with each column representing an array and each row a gene. 39 | } 40 | \item{p1.names}{ 41 | One column of \code{platform1.data} may contain gene names. The column number containing the gene names should be specified as \code{p1.names}. If \code{p1.names} is zero, the rownames attribute will be used. Default value is zero. The gene names for platform 1 should correspond to the gene names for platform 2. 42 | } 43 | \item{p2.names}{ 44 | One column of \code{platform2.data} may contain gene names. The column number containing the gene names should be specified as \code{p2.names}. If \code{p2.names} is zero, the rownames attribute will be used. Default value is zero. The gene names for platform 2 should correspond to the gene names for platform 1. 45 | } 46 | 47 | \item{platform1.train}{Training data set for use with DWD.} 48 | \item{platform2.train}{Training data set for use with DWD.} 49 | \item{p1.train.names}{The column number containing the gene names for platform 1 training data.} 50 | \item{p2.train.names}{The column number containing the gene names for platform 2 training data.} 51 | \item{cluster}{Clustering method used by distran.} 52 | \item{par.prior}{Parameter for eb. \code{par.prior} specifies whether to use a parametric or non-parametric prior distribution.} 53 | \item{filter}{Parameter for eb.} 54 | \item{prior.plots}{Generate prior plots for eb method.} 55 | 56 | 57 | \item{skip.match}{ 58 | If \code{skip.match} is \code{FALSE}, rows of \code{platform1.data} and \code{platform2.data} will be matched using gene names. This process uses R's built-in \code{intercept} function, which can be quite inefficient. If the rows of \code{platform1.data} and \code{platform2.data} already match, time can be saved by setting \code{skip.match} to \code{TRUE}. 59 | } 60 | 61 | \item{use.sparse}{ 62 | For dwd only. Can be set to \code{TRUE} or \code{FALSE}. Determines whether dwd uses sparse matrices (via the Matrix package) for its internal calculations. Sparse matrix calculations are more efficient for large problems, but will not affect the final output. 63 | } 64 | 65 | \item{gene.cluster}{ 66 | For \code{xpn} only, \code{gene.cluster} specifies the gene clustering method to be used. Options are "kmeans", "pam", and "flexclust". Only "kmeans" is practical for large numbers or genes. 67 | } 68 | \item{assay.cluster}{ 69 | For \code{xpn} and \code{distran} only, \code{assay.cluster} specifies the assay clustering method to be used. Options are "classic", "kmeans", "pam", and "flexclust". 70 | } 71 | \item{corr}{ 72 | For \code{xpn} and \code{distran} only, \code{corr} is the type of correlation to use as a distance measure for sample or gene clustering. Ignored for the "kmeans" and "classic" clustering options, for which only Pearson's correlation is available. 73 | } 74 | \item{iterations}{ 75 | For \code{xpn} only, \code{iterations} gives the number of iterations of the XPN algorithm to perform. 76 | } 77 | 78 | \item{K}{ 79 | For \code{xpn} only, \code{K} is the number of gene clusters to use. Must be an integer. If a vector of more than one integer is given, the best value will be selected by the silhouette based method of the pamk function from the fpc package. This can be extremely slow for large numbers of genes. 80 | } 81 | 82 | \item{L}{ 83 | For \code{xpn} and \code{distran} only, \code{L} is the number of assay clusters to use. Must be an integer. If a vector of more than one integer is given, the best value will be selected by a silhouette based method using the pamk function from the fpc package. 84 | } 85 | 86 | } 87 | 88 | 89 | \value{ 90 | %% ~Describe the value returned 91 | %% If it is a LIST, use 92 | %% \item{comp1 }{Description of 'comp1'} 93 | %% \item{comp2 }{Description of 'comp2'} 94 | %% ... 95 | \item{x}{Normalized data from platform 1.} 96 | \item{y}{Normalized data from platform 2.} 97 | \item{p1.adjust}{For dwd only, vector of platform effects removed from the platform 1 data.} 98 | \item{p2.adjust}{For dwd only, vector of platform effects removed from the platform 2 data.} 99 | } 100 | \references{ 101 | 102 | Benito et al. Adjustment of systematic microarray data biases. Bioinformatics (2004) vol. 20 (1) pp. 105 103 | 104 | Bolstad et al. A comparison of normalization methods for high density oligonucleotide array data based on variance and bias. Bioinformatics (2003) vol. 19 (2) pp. 185 105 | 106 | Jiang et al. Joint analysis of two microarray gene-expression data sets to select lung adenocarcinoma marker genes. BMC bioinformatics (2004) vol. 5 pp. 81 107 | 108 | Martinez et al. GenMiner: mining informative association rules from genomic data. Proceeding of the IEEE International Conference on Binformatics and Biomedicine. (2007) pp. 15-22 109 | 110 | Shabalin et al. Merging two gene-expression studies via cross-platform normalization. Bioinformatics (2008) vol. 24 (9) pp. 1154 111 | 112 | Shi et al. The MicroArray Quality Control (MAQC) project shows inter-and intraplatform reproducibility of gene expression measurements. Nature biotechnology (2006) vol. 24 (9) pp. 1151-1161 113 | 114 | Walker et al. Empirical Bayes accomodation of batch-effects in microarray data using identical replicate reference samples: application to RNA expression profiling of blood .... BMC bioinformatics (2008) 115 | 116 | Warnat et al. Cross-platform analysis of cancer microarray data improves gene expression based classification of phenotypes. BMC bioinformatics (2005) vol. 6 pp. 265 117 | 118 | } 119 | \author{ 120 | Jason Rudy and Faramarz Valafar 121 | } 122 | 123 | 124 | 125 | \examples{ 126 | fit_dwd <- dwd(gpl96,gpl570) 127 | fit_eb <- eb(gpl96,gpl570) 128 | fit_xpn <- xpn(gpl96,gpl570) 129 | fit_gq <- gp(gpl96,gpl570) 130 | } 131 | 132 | \keyword{MatchMixeR} 133 | \keyword{dwd} 134 | \keyword{eb} 135 | \keyword{xpn} 136 | \keyword{distance weighted discrimination} 137 | \keyword{empirical bayes} 138 | \keyword{cross-platform normalization} 139 | 140 | -------------------------------------------------------------------------------- /MatchMixeR/man/MM.Rd: -------------------------------------------------------------------------------- 1 | \name{MatchMixeR} 2 | \alias{MM} 3 | 4 | \title{MatchMixeR: A cross-platform normalization procedure for matched samples.} 5 | 6 | \description{ 7 | Matched sample based cross-platform normalization function using mixed effect regression 8 | } 9 | \usage{MM(Xmat, Ymat)} 10 | 11 | \arguments{ 12 | \item{X}{gene expression level matrix on platform X (to be transformed)} 13 | 14 | \item{Y}{gene expression level matrix on platform Y ( not to be transformed)} 15 | } 16 | 17 | \details{ 18 | This function fits mixed effect regression model. 19 | } 20 | 21 | \value{8 lists including betahat, betamat, Yhat} 22 | 23 | \author{Xing Qiu} 24 | 25 | \examples{ 26 | fit_mm <- MM(gpl96,gpl570) 27 | } 28 | 29 | \keyword{MatchMixeR} 30 | \keyword{MM} -------------------------------------------------------------------------------- /MatchMixeR/man/flmer.Rd: -------------------------------------------------------------------------------- 1 | \name{MatchMixeR} 2 | \alias{flmer} 3 | 4 | \title{MatchMixeR: A cross-platform normalization procedure for matched samples.} 5 | 6 | \description{ 7 | Fast linear mixed effects regression (flmer); a moment-based method for the MM Model 8 | } 9 | 10 | \usage{flmer(Xmat, Ymat)} 11 | 12 | \arguments{ 13 | \item{X}{gene expression level matrix on platform X (to be transformed)} 14 | 15 | \item{Y}{gene expression level matrix on platform Y ( not to be transformed)} 16 | } 17 | 18 | \details{ 19 | This function fits the mixed effect regression model (MM) using fast moment based approaches. 20 | } 21 | 22 | \value{8 lists including betahat, betamat, Yhat} 23 | 24 | \author{Xing Qiu} 25 | 26 | \examples{ 27 | fit_flmer <- flmer(gpl96,gpl570) 28 | } 29 | 30 | \keyword{MatchMixeR} 31 | \keyword{flmer} 32 | -------------------------------------------------------------------------------- /MatchMixeR/man/gpl570_gpl96.Rd: -------------------------------------------------------------------------------- 1 | \name{gpl570_gpl96} 2 | \alias{gpl570} 3 | \alias{gpl96} 4 | 5 | \title{matched samples on GPL570 and GPL96} 6 | 7 | \usage{ 8 | data(gpl570_gpl96) 9 | } 10 | 11 | \description{ 12 | Gene expression data of 58 matched samples on two microarrays manufactured by Affymetrix (5147 genes and 58 tumor mRNA samples) from the CellMiner, which is a web application that provides genomic profile data and query tools for the NCI60 and additional cell types. The NCI60 is a group of 60 human cancer cell lines used by the National Cancer Institute for the screening of compounds to detect potential anticancer activity. } 13 | 14 | \value{ 15 | \item{gpl570}{matrix of gene expression levels for the 58 matched samples measured by gpl570(Human Genome U133 Plus 2.0 Array), rows correspond to genes (5147 genes) and columns to 58 cancer cell lines.} 16 | \item{gpl96}{matrix of gene expression levels for the 58 matched samples measured by gpl96(Human Genome U133A array), rows correspond to genes (5147 genes) and columns to 58 cancer cell lines. } 17 | } 18 | 19 | \keyword{datasets} 20 | -------------------------------------------------------------------------------- /MatchMixeR/src/MatchMixeR.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dy16b/Cross-Platform-Normalization/83619fd3bc744ee6e52e621dc7d206defb222e5b/MatchMixeR/src/MatchMixeR.dll -------------------------------------------------------------------------------- /MatchMixeR/src/XPN_MLE.c: -------------------------------------------------------------------------------- 1 | /* 2 | ###################################################################### 3 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010 4 | 5 | #This program is free software: you can redistribute it and/or modify 6 | #it under the terms of the GNU General Public License as published by 7 | #the Free Software Foundation, either version 3 of the License, or 8 | #(at your option) any later version. 9 | 10 | #This program is distributed in the hope that it will be useful, 11 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | #GNU General Public License for more details. 14 | 15 | #You should have received a copy of the GNU General Public License 16 | #along with this program. If not, see . 17 | ###################################################################### 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | 26 | void XPN_MLE_C_C(double *xbar, double *A, double *b, double *c, double *s2, double *sigma2, int *pI, int *pJ, int *pn, int *nj){ 27 | int I = *pI, J = *pJ, n = *pn; 28 | double *old; 29 | double maxold, tmp, sumb, sumA, sumA2, meanA, Anorm, sumnjA2, sumb2overs2, change = 1, radius = 0.0000000000000001; 30 | int changed = 1, oldlength = J+3*I; 31 | old = malloc((oldlength)*sizeof(double)); 32 | 33 | //Copy starting values 34 | int i,j; 35 | maxold = 0; 36 | for(i=0;i maxold){ 39 | maxold = old[i]; 40 | } 41 | } 42 | for(i=0;i maxold){ 45 | maxold = old[i+J]; 46 | } 47 | } 48 | for(i=0;i maxold){ 51 | maxold = old[i+J+I]; 52 | } 53 | } 54 | for(i=0;i maxold){ 57 | maxold = old[i+J+2*I]; 58 | } 59 | } 60 | 61 | 62 | 63 | //Until we stabilize 64 | int iter = 0; 65 | while(changed && (iter < 100)){ 66 | iter++; 67 | // Rprintf("%d ",iter); 68 | fflush(stdout); 69 | 70 | 71 | 72 | //update c 73 | for (i=0;i radius*maxold); 180 | //printf("%f ",change); 181 | 182 | //Copy previous values 183 | maxold = 0; 184 | for(i=0;i maxold){ 187 | maxold = old[i]; 188 | } 189 | } 190 | for(i=0;i maxold){ 193 | maxold = old[i+J]; 194 | } 195 | } 196 | for(i=0;i maxold){ 199 | maxold = old[i+J+I] ; 200 | } 201 | } 202 | for(i=0;i maxold){ 205 | maxold = old[i+J+2*I]; 206 | } 207 | } 208 | 209 | } 210 | 211 | free(old); 212 | 213 | } 214 | -------------------------------------------------------------------------------- /MatchMixeR/src/XPN_MLE.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dy16b/Cross-Platform-Normalization/83619fd3bc744ee6e52e621dc7d206defb222e5b/MatchMixeR/src/XPN_MLE.o -------------------------------------------------------------------------------- /MatchMixeR/src/ckmeans_c.c: -------------------------------------------------------------------------------- 1 | /* 2 | ###################################################################### 3 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010 4 | 5 | #This program is free software: you can redistribute it and/or modify 6 | #it under the terms of the GNU General Public License as published by 7 | #the Free Software Foundation, either version 3 of the License, or 8 | #(at your option) any later version. 9 | 10 | #This program is distributed in the hope that it will be useful, 11 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | #GNU General Public License for more details. 14 | 15 | #You should have received a copy of the GNU General Public License 16 | #along with this program. If not, see . 17 | ###################################################################### 18 | */ 19 | #include 20 | #include 21 | //#include "modreg.h" /* for declarations for registration */ 22 | 23 | double pearson(double *x, double *cen, int p, int i, int j, int n, int k); 24 | double euclidean(double *x, double *cen, int p, int i, int j, int n, int k); 25 | double manhattan(double *x, double *cen, int p, int i, int j, int n, int k); 26 | double absolutepearson(double *x, double *cen, int p, int i, int j, int n, int k); 27 | 28 | void ckmeans_c(double *x, int *pn, int *pp, double *cen, int *pk, int *cl, int *pmaxiter, int *nc, double *wss, int *pdistance ) 29 | { 30 | int n = *pn, k = *pk, p = *pp, maxiter = *pmaxiter, distance = *pdistance; 31 | int iter, i, j, c, it, inew = 0; 32 | double best, dd, tmp; 33 | Rboolean updated; 34 | 35 | for(i = 0; i < n; i++) cl[i] = -1; 36 | for(iter = 0; iter < maxiter; iter++) { 37 | updated = FALSE; 38 | for(i = 0; i < n; i++) { 39 | /* find nearest centre for each point */ 40 | best = R_PosInf; 41 | for(j = 0; j < k; j++) { 42 | dd = 0.0; 43 | 44 | switch (distance){ 45 | case 1: dd = pearson(x,cen,p, i, j, n, k); 46 | break; 47 | case 2: dd = euclidean(x,cen,p, i, j, n, k); 48 | break; 49 | case 3: dd = manhattan(x,cen,p, i, j, n, k); 50 | break; 51 | case 4: dd = absolutepearson(x,cen,p, i, j, n, k); 52 | break; 53 | default: error("Unkown distance measure provided to ckmeans."); 54 | } 55 | if(dd < best) { 56 | best = dd; 57 | inew = j+1; 58 | } 59 | } 60 | if(cl[i] != inew) { 61 | updated = TRUE; 62 | cl[i] = inew; 63 | } 64 | } 65 | if(!updated) break; 66 | /* update each centre */ 67 | for(j = 0; j < k*p; j++) cen[j] = 0.0; 68 | for(j = 0; j < k; j++) nc[j] = 0; 69 | for(i = 0; i < n; i++) { 70 | it = cl[i] - 1; nc[it]++; 71 | for(c = 0; c < p; c++) cen[it+c*k] += x[i+c*n]; 72 | } 73 | for(j = 0; j < k*p; j++) cen[j] /= nc[j % k]; 74 | } 75 | 76 | *pmaxiter = iter + 1; 77 | for(j = 0; j < k; j++) wss[j] = 0.0; 78 | for(i = 0; i < n; i++) { 79 | it = cl[i] - 1; 80 | for(c = 0; c < p; c++) { 81 | tmp = x[i+n*c] - cen[it+k*c]; 82 | wss[it] += tmp * tmp; 83 | } 84 | } 85 | } 86 | 87 | double pearson(double *x, double *cen, int p, int i, int j, int n, int k){ 88 | double top = 0; 89 | double bottom1 = 0; 90 | double bottom2 = 0; 91 | int c; 92 | for (c = 0; c < p; c++){ 93 | top += x[i+n*c] * cen[j+k*c]; 94 | bottom1 += x[i+n*c]*x[i+n*c]; 95 | bottom2 += cen[j+k*c]*cen[j+k*c]; 96 | } 97 | return(1 - top/(sqrt(bottom1)*sqrt(bottom2))); 98 | } 99 | 100 | 101 | 102 | 103 | 104 | 105 | double euclidean(double *x, double *cen, int p, int i, int j, int n, int k){ 106 | double tmp, dd = 0; 107 | int c; 108 | for(c = 0; c < p; c++) { 109 | tmp = x[i+n*c] - cen[j+k*c]; 110 | dd += tmp * tmp; 111 | } 112 | return(dd); 113 | } 114 | 115 | 116 | double manhattan(double *x, double *cen, int p, int i, int j, int n, int k){ 117 | double dd = 0; 118 | int c; 119 | for(c = 0; c < p; c++) { 120 | 121 | dd += abs(x[i+n*c] - cen[j+k*c]); 122 | } 123 | return(dd); 124 | } 125 | 126 | 127 | double absolutepearson(double *x, double *cen, int p, int i, int j, int n, int k){ 128 | 129 | return(abs(pearson(x,cen,p, i, j, n, k))); 130 | 131 | } 132 | -------------------------------------------------------------------------------- /MatchMixeR/src/ckmeans_c.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dy16b/Cross-Platform-Normalization/83619fd3bc744ee6e52e621dc7d206defb222e5b/MatchMixeR/src/ckmeans_c.o -------------------------------------------------------------------------------- /MatchMixeR/vignettes/overview.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MatchMixeR User Guide" 3 | author: "Xing Qiu, Serin Zhang" 4 | date: "`r Sys.Date()`" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{MatchMixeR User Guide} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r load-package} 13 | 14 | library(MatchMixeR) 15 | 16 | ``` 17 | 18 | In this Vignette, we describe the basic usage of the `MatchMixeR` package. This package implements a fast liner mixed effect model for cross-platform normalization of high throughput genomic data using matched samples. Below we describe the inspiring problem and the usage of the main R function, `MM()`. 19 | 20 | ## Introduction 21 | 22 | We will briefly describe a motivating problem for which the `MM` package is applicable. 23 | 24 | Cross-platform normalization method is highly demanding for data integration across different studies as researchers have more access to pre-existing gene expression data stored in public repositories. However, the systematic difference from heterogeneous features of high throughput techniques makes the data integration difficult. Several methods for cross-platform normalization have been proposed and their normalized data show high cross-platform concordance. However, their methods can also reduce the biological difference by chance since the biological effect can be compounded with the non-biological effects. 25 | 26 | Therefore, we proposed a new cross platform normalization method to eliminate only the heterogeneity caused by different platforms and preserve the biological information by employing matched samples which have only platform differences. In our method, a benchmark linear fixed effect regression model is acquired from a matched sample data and apply it to the research data to be combined. We call our method as Match-MixeR which means a matched sample based cross-platform normalization method using mixed effect regression model. 27 | 28 | ## Fast LMER 29 | In the typical LMER model, the $\hat{{\theta}}$ can be estimated by maximum likelihood(ML) or restricted maximum likelihood (REML) method and modern LMER algorithms use the REML by default because the ML method has some bias. However, both ML and REML methods are time consuming since they rely on inverting large matrices and iterative optimization procedure. So we proposed a new fast solution using moment matching algorithm. 30 | 31 | ## Practical workaraound for cross-platform normalization of gene expression data 32 | The original FLMER assumes that the 33 | covariance structure of the random intercepts and slopes is a 34 | scalar matrix of form $\sigma_{\gamma}^2 \mathbf{I}$. In cross-platform normalization study, the variance of random intercepts may be many times larger than that of the random slopes. 35 | If we model all these three parameters in the FLMER algorithm, we will need three independent moment matching equations, which will create more uncertainties in the estimator and dramatically reduce the computational efficiency. In other words, to include too many unknown parameters in the model defeats the very purpose of FLMER. 36 | 37 | As an alternative, we propose the following practical and fast workaround. 38 | 39 | ## Usage examples 40 | The primary function in our package is `MM()`. It takes the following primary arguments: `Xmat` and `Ymat`. Here `Xmat` and Here `Xmat` are $m\times n$ dimensional matched samples gene expression matrices. The MM function fits mixed effect regression model and gives the $\hat{{Y}}$, the transformed values for the gene expression values in Xmat. Here, only the values in platform X will be changed for data integration. One can assigne platform X and platform Y by considering the sample size and the popularity of the platforms. 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /MatchMixeR/vignettes/overview.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | MatchMixeR User Guide 18 | 19 | 20 | 21 | 22 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 |

MatchMixeR User Guide

72 |

Xing Qiu, Serin Zhang

73 |

2018-10-29

74 | 75 | 76 | 77 |
library(MatchMixeR)
78 |

In this Vignette, we describe the basic usage of the MatchMixeR package. This package implements a fast liner mixed effect model for cross-platform normalization of high throughput genomic data using matched samples. Below we describe the inspiring problem and the usage of the main R function, MM().

79 |
80 |

Introduction

81 |

We will briefly describe a motivating problem for which the MM package is applicable.

82 |

Cross-platform normalization method is highly demanding for data integration across different studies as researchers have more access to pre-existing gene expression data stored in public repositories. However, the systematic difference from heterogeneous features of high throughput techniques makes the data integration difficult. Several methods for cross-platform normalization have been proposed and their normalized data show high cross-platform concordance. However, their methods can also reduce the biological difference by chance since the biological effect can be compounded with the non-biological effects.

83 |

Therefore, we proposed a new cross platform normalization method to eliminate only the heterogeneity caused by different platforms and preserve the biological information by employing matched samples which have only platform differences. In our method, a benchmark linear fixed effect regression model is acquired from a matched sample data and apply it to the research data to be combined. We call our method as Match-MixeR which means a matched sample based cross-platform normalization method using mixed effect regression model.

84 |
85 |
86 |

Fast LMER

87 |

In the typical LMER model, the \(\hat{{\theta}}\) can be estimated by maximum likelihood(ML) or restricted maximum likelihood (REML) method and modern LMER algorithms use the REML by default because the ML method has some bias. However, both ML and REML methods are time consuming since they rely on inverting large matrices and iterative optimization procedure. So we proposed a new fast solution using moment matching algorithm.

88 |
89 |
90 |

Practical workaraound for cross-platform normalization of gene expression data

91 |

The original FLMER assumes that the covariance structure of the random intercepts and slopes is a scalar matrix of form \(\sigma_{\gamma}^2 \mathbf{I}\). In cross-platform normalization study, the variance of random intercepts may be many times larger than that of the random slopes.
92 | If we model all these three parameters in the FLMER algorithm, we will need three independent moment matching equations, which will create more uncertainties in the estimator and dramatically reduce the computational efficiency. In other words, to include too many unknown parameters in the model defeats the very purpose of FLMER.

93 |

As an alternative, we propose the following practical and fast workaround.

94 |
95 |
96 |

Usage examples

97 |

The primary function in our package is MM(). It takes the following primary arguments: Xmat and Ymat. Here Xmat and Here Xmat are \(m\times n\) dimensional matched samples gene expression matrices. The MM function fits mixed effect regression model and gives the \(\hat{{Y}}\), the transformed values for the gene expression values in Xmat. Here, only the values in platform X will be changed for data integration. One can assigne platform X and platform Y by considering the sample size and the popularity of the platforms.

98 |
99 | 100 | 101 | 102 | 103 | 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Cross-Platform-Normalization -------------------------------------------------------------------------------- /Simulation_updated_github.R: -------------------------------------------------------------------------------- 1 | 2 | ### This code was written by Dr. Serin Zhang, as a part of her Statistics PhD 3 | ### dissertation project at Florida State University 4 | 5 | # setwd("C:/Users/Disa Yu/Dropbox/Cross-platform-normalization/R_code") 6 | 7 | library("genefilter") 8 | library("MatchMixeR") 9 | 10 | ### create a "true" expression data and fit OLS/FLMER models 11 | set.seed(4321) 12 | m <- 10000; n <- 100 13 | ## TrueMeans is the true mean expression for each gene. 14 | TrueMeans <- rnorm(m, 7.5, .6) 15 | 16 | ## Generate the training data (matched data) 17 | matchdat.X <- matrix(rnorm(m*n, 0, .5), m) + TrueMeans 18 | beta0 <- rnorm(m, 2.5, 2.0); beta1 <- runif(m, .7, 1.0) 19 | betas <- cbind(beta0=beta0, beta1=beta1) 20 | matchdat.Y <- beta0 + matchdat.X * beta1 21 | 22 | ## fit models on the first matched data : OLS & FLMER 23 | OLSmod <- MatchMixeR:::OLS(matchdat.X, matchdat.Y) 24 | #FLMERmod <-ModIV(matchdat.X, matchdat.Y) 25 | FLMERmod <-MM(matchdat.X, matchdat.Y) 26 | 27 | ### Initial Method comparison 28 | ## create research data(test data) "WITHOUT" DEGs - "smaller" matched data 29 | #n1 <- 5 30 | n1 <- 30 31 | sim.match.X <- matrix(rnorm(m*n1, 0, .3), m) + TrueMeans 32 | sim.match.Y <- beta0 + sim.match.X * beta1 33 | 34 | #sim.match.X5 <- sim.match.X; sim.match.Y5 <- sim.match.Y 35 | #sim.match.X30 <- sim.match.X; sim.match.Y30 <- sim.match.Y 36 | 37 | ## Apply fitted OLS&FLMER on the first matched data to the smaller matched data 38 | XtransOLS <- sim.match.X*OLSmod$betamat[, "Slope"] + OLSmod$betamat[, "Intercept"] 39 | XtransFLMER <- sim.match.X*FLMERmod$betamat[, "Slope"] + FLMERmod$betamat[, "Intercept"] 40 | ## other methods normalization on smaller matched data 41 | DWDmod <- dwd(sim.match.X, sim.match.Y) 42 | EBmod <- eb(sim.match.X, sim.match.Y) 43 | #GQmod <- gq(sim.match.X, sim.match.Y) 44 | XPNmod <- xpn(sim.match.X, sim.match.Y) 45 | 46 | ## make plots : Coloumn-wise R2 comparison: DWD beats our methods 47 | corr.raw <- as.numeric(lapply(1:n1,function(i) cor(sim.match.X[,i], sim.match.Y[,i]))) 48 | corr.OLS <- as.numeric(lapply(1:n1,function(i) cor(XtransOLS[,i],sim.match.Y[,i]))) 49 | corr.FLMER <- as.numeric(lapply(1:n1,function(i) cor(XtransFLMER[,i],sim.match.Y[,i]))) 50 | corr.DWD <- as.numeric(lapply(1:n1,function(i) cor(DWDmod$x[,i],DWDmod$y[,i]))) 51 | corr.EB <- as.numeric(lapply(1:n1,function(i) cor(EBmod$x[,i],EBmod$y[,i]))) 52 | #corr.GQ <- as.numeric(lapply(1:n1,function(i) cor(GQmod$x[,i],GQmod$y[,i]))) 53 | corr.XPN <- as.numeric(lapply(1:n1,function(i) cor(XPNmod$x[,i],XPNmod$y[,i]))) 54 | r2.raw <- (corr.raw)^2 55 | r2.OLS <- (corr.OLS)^2 56 | r2.FLMER <- (corr.FLMER)^2 57 | r2.DWD <- (corr.DWD)^2 58 | r2.EB <- (corr.EB)^2 59 | #r2.GQ <- (corr.GQ)^2 60 | r2.XPN <- (corr.XPN)^2 61 | r2.results <- cbind(MM=r2.FLMER,DWD=r2.DWD,XPN=r2.XPN,EB=r2.EB) 62 | boxplot(r2.results,ylab="R(column wise)",ylim = c(0.999,1),cex.lab=1.3, cex.axis=1.3) 63 | 64 | #r2.sim.n5 <- r2.results 65 | r2.sim.n30 <- r2.results 66 | 67 | ## RSS comparison: OLS vs FLMER 68 | RSS.OLS <- mean((XtransOLS-sim.match.Y)^2) 69 | RSS.FLMER <- mean((XtransFLMER-sim.match.Y)^2) 70 | ## mean sum of difference squres for other methods 71 | Diff.raw <- mean((sim.match.X-sim.match.Y)^2) 72 | Diff.EB <- mean((EBmod$x - EBmod$y)^2) 73 | Diff.DWD <- mean(as.matrix(DWDmod$x - DWDmod$y)^2) 74 | #Diff.GQ <- mean((GQmod$x - GQmod$y)^2) 75 | Diff.XPN <- mean(as.matrix(XPNmod$x - XPNmod$y)^2) 76 | 77 | #diff.n5 <- c(raw = Diff.raw, MM = RSS.FLMER,DWD = Diff.DWD,XPN = Diff.XPN,EB = Diff.EB) 78 | diff.n30 <- c(raw = Diff.raw, MM = RSS.FLMER,DWD = Diff.DWD,XPN = Diff.XPN,EB = Diff.EB) 79 | 80 | save(r2.sim.n30, diff.n30, file="sim_results.RData") 81 | 82 | ### DE analysis 83 | ##Generate GroupA with n3 platformX samples & n4 platformY samples 84 | #GroupB with n5 platformX samples & n6 platformY samples 85 | n2 <- 30; n3 <- 0 # GroupA&PlatformX ; GroupB&PlatformX 86 | n4 <- 0; n5 <- 30 # GroupA&PlatformY ; GroupB&PlatformY 87 | TP <- matrix(0,30,6); FP <- matrix(0,30,6) #exclude GQ 88 | colnames(TP)<-c("raw","OLS","MM","DWD","XPN","EB") 89 | colnames(FP)<-c("raw","OLS","MM","DWD","XPN","EB") 90 | 91 | ## create research data with 1000 DEGs (By adding Effs to GroupB) 92 | m1 <- 1000 93 | Effs <- rep(0, m); Effs[1:m1] <- runif(m1, .2, 3) 94 | for (i in 1:30) 95 | { 96 | print(i) 97 | sim.A.X <- matrix(rnorm(m*n2, 0, .25), m) + TrueMeans 98 | sim.B.X <- matrix(rnorm(m*n3, 0, .25), m) + TrueMeans + Effs 99 | sim.X <- cbind(sim.A.X,sim.B.X) 100 | TrueExp.A.Y <- matrix(rnorm(m*n4, 0, .25), m) + TrueMeans 101 | TrueExp.B.Y <- matrix(rnorm(m*n5, 0, .25), m) + TrueMeans + Effs 102 | TrueExp.Y <- cbind(TrueExp.A.Y,TrueExp.B.Y) 103 | sim.Y <- beta0 + TrueExp.Y * beta1 104 | 105 | ## Apply the trained betas to the research data 106 | Xtrans.OLS_DE <- sim.X*OLSmod$betamat[, "Slope"] + OLSmod$betamat[, "Intercept"] 107 | Xtrans.FLMER_DE <- sim.X*FLMERmod$betamat[, "Slope"] + FLMERmod$betamat[, "Intercept"] 108 | ## other methods normalization on research data with DEGs 109 | DWDmod_DE <- dwd(sim.X,sim.Y) 110 | EBmod_DE <- eb(sim.X,sim.Y) 111 | #GQmod_DE <- gq(sim.X,sim.Y) 112 | XPNmod_DE <-xpn(sim.X,sim.Y) 113 | ## check stat. power and type I error 114 | simData <- cbind(sim.X, sim.Y) 115 | simDataOLS <- cbind(Xtrans.OLS_DE, sim.Y) 116 | simDataFLMER <- cbind(Xtrans.FLMER_DE, sim.Y) 117 | simDataDWD <- as.matrix(cbind(DWDmod_DE$x,DWDmod_DE$y)) 118 | simDataEB <- as.matrix(cbind(EBmod_DE$x,EBmod_DE$y)) 119 | #simDataGQ <- as.matrix(cbind(GQmod_DE$x,GQmod_DE$y)) 120 | simDataXPN <- as.matrix(cbind(XPNmod_DE$x,XPNmod_DE$y)) 121 | simFac <- as.factor(c(rep(0, n2), rep(1, n3),rep(0, n4), rep(1, n5))) 122 | rr.raw <- rowttests(simData, simFac) 123 | rr.OLS <- rowttests(simDataOLS, simFac) 124 | rr.FLMER <- rowttests(simDataFLMER, simFac) 125 | rr.DWD <- rowttests(simDataDWD, simFac) 126 | rr.EB <- rowttests(simDataEB, simFac) 127 | #rr.GQ <- rowttests(simDataGQ, simFac) 128 | rr.XPN <- rowttests(simDataXPN, simFac) 129 | padj.raw <- p.adjust(rr.raw[, "p.value"], "holm") 130 | padj.OLS <- p.adjust(rr.OLS[, "p.value"], "holm") 131 | padj.FLMER <- p.adjust(rr.FLMER[, "p.value"], "holm") 132 | padj.DWD <- p.adjust(rr.DWD[, "p.value"], "holm") 133 | padj.EB <- p.adjust(rr.EB[, "p.value"], "holm") 134 | #padj.GQ <- p.adjust(rr.GQ[, "p.value"], "holm") 135 | padj.XPN <- p.adjust(rr.XPN[, "p.value"], "holm") 136 | TP[i,1] <- sum(padj.raw[1:m1]<0.05); FP[i,1] <- sum(padj.raw[(m1+1):m]<0.05) 137 | TP[i,2] <- sum(padj.OLS[1:m1]<0.05); FP[i,2] <- sum(padj.OLS[(m1+1):m]<0.05) 138 | TP[i,3] <- sum(padj.FLMER[1:m1]<0.05); FP[i,3] <- sum(padj.FLMER[(m1+1):m]<0.05) 139 | TP[i,4] <- sum(padj.DWD[1:m1]<0.05); FP[i,4] <- sum(padj.DWD[(m1+1):m]<0.05) 140 | TP[i,6] <- sum(padj.EB[1:m1]<0.05); FP[i,6] <- sum(padj.EB[(m1+1):m]<0.05) 141 | #TP[i,7] <- sum(padj.GQ[1:m1]<0.05); FP[i,7] <- sum(padj.GQ[(m1+1):m]<0.05) 142 | TP[i,5] <- sum(padj.XPN[1:m1]<0.05); FP[i,5] <- sum(padj.XPN[(m1+1):m]<0.05) 143 | } 144 | 145 | DE <- matrix(1000,30,6) 146 | P <- TP/(TP+FP) 147 | R <- TP/DE 148 | F1 <- 2*(P*R)/(P+R) 149 | 150 | 151 | --------------------------------------------------------------------------------