├── DE_TCGA_2_updated_github.R
├── MatchMixeR
    ├── DESCRIPTION
    ├── MatchMixeR.Rproj
    ├── NAMESPACE
    ├── R
    │   ├── MatchMixeR.R
    │   ├── ckmeans.R
    │   ├── dwd.R
    │   ├── eb.R
    │   ├── functions.R
    │   ├── gq.R
    │   └── xpn.R
    ├── Results of devtools check.txt
    ├── data
    │   └── gpl570_gpl96.rda
    ├── man
    │   ├── CONOR.Rd
    │   ├── MM.Rd
    │   ├── flmer.Rd
    │   └── gpl570_gpl96.Rd
    ├── src
    │   ├── MatchMixeR.dll
    │   ├── XPN_MLE.c
    │   ├── XPN_MLE.o
    │   ├── ckmeans_c.c
    │   └── ckmeans_c.o
    └── vignettes
    │   ├── overview.Rmd
    │   └── overview.html
├── README.md
└── Simulation_updated_github.R


/DE_TCGA_2_updated_github.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ### This code was written by Dr. Serin Zhang, as a part of her Statistics PhD
  3 | ### dissertation project at Florida State University
  4 | 
  5 | library("genefilter")
  6 | library("MatchMixeR")
  7 | 
  8 | # setwd("C:/Users/Disa Yu/Dropbox/Cross-platform-normalization")
  9 | # getwd()
 10 | 
 11 | load("./Real_data/matched_TCGA_mat.Rdata")
 12 | load("./Real_data/NormalvsTumor_mat.Rdata")
 13 | 
 14 | ## na in array data -> 0 
 15 | matched_array_mat[is.na(matched_array_mat)] <- 0
 16 | 
 17 | ## choose 100 samples(out of 523 matched samples) as training set
 18 | train.ID <- sample(1:523,100,replace=FALSE)
 19 | X <- matched_array_mat[,train.ID] ; Y <- matched_seq_mat[,train.ID]
 20 | train.sample <- colnames(X)
 21 | OLSmod <- MatchMixeR:::OLS(X, Y)
 22 | FLMERmod <-MM(X, Y)
 23 | 
 24 | ### DE analysis 
 25 | ## Get DEGs from Yuhang's list (SigGeneList_1.xls)
 26 | SigGeneList <- read.csv("./Real_data/SigGeneList_nocomments.csv")
 27 | 
 28 | SigGene <- SigGeneList[,c(1,3,7)]
 29 | # using cut off p-value < 0.001 ---> # of DEGs = 9555
 30 | SigGene_padj <- SigGene[SigGene$padj< .001,]
 31 | colnames(SigGene_padj)[1] <- "SigGeneList_padj"
 32 | SigG1 <- as.character(SigGene_padj$SigGeneList_padj)
 33 | glist_1 <- strsplit(SigG1, "|", fixed=TRUE) 
 34 | SigG1_ID <-t(data.frame(glist_1))
 35 | SigG1_ID <- data.frame(SigG1_ID)
 36 | colnames(SigG1_ID)[1] <- "geneID"
 37 | common.gene <- data.frame(row.names(matched_array_mat)); colnames(common.gene) <-"geneID" 
 38 | DEG1 <- merge(cbind(common.gene,c(1:16146)),SigG1_ID,by="geneID")
 39 | DEG_pa <- as.character(DEG1$geneID)
 40 | 
 41 | ## remove train samples from the test sample pool  
 42 | normal_array_1 <- normal_array[ ,setdiff(colnames(normal_array), train.sample)]
 43 | tumor_array_1 <- tumor_array[,setdiff (colnames(tumor_array), train.sample)]
 44 | normal_seq_1 <- normal_seq[ ,setdiff(colnames(normal_seq), train.sample)]
 45 | tumor_seq_1 <- tumor_seq[,setdiff (colnames(tumor_seq), train.sample)]
 46 | s1 <- length(colnames(normal_array_1)); s2 <-length(colnames(tumor_array_1)) 
 47 | s3 <- length(colnames(normal_seq_1)); s4 <-length(colnames(tumor_seq_1)) 
 48 | 
 49 | TP <- matrix(0,30,8);  FP <- matrix(0,30,8); com <- matrix(0,30,4)
 50 | colnames(TP) <- c("raw","OLS","MM","DWD","XPN","EB","GQ","Fisher")
 51 | colnames(FP) <- c("raw","OLS","MM","DWD","XPN","EB","GQ","Fisher")
 52 | colnames(com) <- c("array.P", "seq.P", "com.P", "com.TP")
 53 | 
 54 | for (i in 1:5)
 55 | { print(i)
 56 |   n1 <- 20; n2 <- 0 # GroupA&PlatformX ; GroupB&PlatformX  
 57 |   n3 <- 0; n4 <- 20 # GroupA&PlatformY ; GroupB&PlatformY
 58 |   normal_arrayID <-  sample(1:s1,n1,replace=FALSE)
 59 |   tumor_arrayID <-  sample(1:s2,n2,replace=FALSE)
 60 |   normal_seqID <-  sample(1:s3,n3,replace=FALSE)
 61 |   tumor_seqID <-  sample(1:s4,n4,replace=FALSE)
 62 |   test.array <- cbind(normal_array_1[,normal_arrayID],tumor_array_1[,tumor_arrayID])
 63 |   test.array[is.na(test.array)] = 0
 64 |   test.seq <- cbind(normal_seq_1[,normal_seqID],tumor_seq_1[,tumor_seqID])
 65 |   
 66 |   XtransOLS <- test.array*OLSmod$betamat[, "Slope"] + OLSmod$betamat[, "Intercept"]
 67 |   XtransFLMER <- test.array*FLMERmod$betamat[, "Slope"] + FLMERmod$betamat[, "Intercept"]
 68 |   test.array<- data.frame(test.array);test.seq<- data.frame(test.seq)
 69 |   DWDmod <- dwd(test.array, test.seq)
 70 |   EBmod <- eb(test.array, test.seq)
 71 |   GQmod <- gq(test.array,test.seq)  # Check: is gq from genefilter package?
 72 |   XPNmod <- xpn(test.array, test.seq)
 73 |   
 74 |   ## check stat. power and type I error
 75 |   DataArray <- as.matrix(test.array)
 76 |   DataSeq <- as.matrix(test.seq)
 77 |   DataRaw <- as.matrix(cbind(test.array, test.seq))
 78 |   DataOLS <- as.matrix(cbind(XtransOLS, test.seq))
 79 |   DataFLMER <- as.matrix(cbind(XtransFLMER, test.seq))
 80 |   DataDWD <- as.matrix(cbind(DWDmod$x,DWDmod$y))
 81 |   DataXPN <- as.matrix(cbind(XPNmod$x,XPNmod$y))
 82 |   DataEB <- as.matrix(cbind(EBmod$x,EBmod$y))
 83 |   DataGQ <- as.matrix(cbind(GQmod$x,GQmod$y))
 84 |   fac_1 <- as.factor(c(rep(0, n1), rep(1, n2),rep(0, n3), rep(1, n4)))
 85 |   fac_array <- as.factor(c(rep(0, n1), rep(1, n2)))
 86 |   fac_seq <- as.factor(c(rep(0, n3), rep(1, n4)))                 
 87 |   rr.raw <- rowttests(DataRaw, fac_1)
 88 |   rr.OLS <- rowttests(DataOLS, fac_1)
 89 |   rr.FLMER <- rowttests(DataFLMER, fac_1)
 90 |   rr.DWD <- rowttests(DataDWD, fac_1)
 91 |   rr.XPN <- rowttests(DataXPN, fac_1)
 92 |   rr.EB <- rowttests(DataEB, fac_1)
 93 |   rr.GQ <- rowttests(DataGQ, fac_1)
 94 |   rr.array <- rowttests(DataArray, fac_array)
 95 |   rr.seq <- rowttests(DataSeq, fac_seq)   
 96 |   
 97 |   p.array <- rr.array[rr.array$p.value < 0.05,]
 98 |   TP.array <- as.numeric(dim(p.array[row.names(p.array) %in% DEG_pa,])[1])
 99 |   FP.array <- as.numeric(dim(p.array)[1]) - TP.array  
100 |   
101 |   p.seq <- rr.seq[rr.seq$p.value < 0.05,]  
102 |   TP.seq <- as.numeric(dim(p.seq[row.names(p.seq) %in% DEG_pa,])[1])
103 |   FP.seq <- as.numeric(dim(p.seq)[1]) - TP.seq
104 |   
105 |   # common TP bet.array&seq results 
106 |   tp.array <- p.array[row.names(p.array) %in% DEG_pa,]
107 |   tp.seq <- p.seq[row.names(p.seq) %in% DEG_pa,]
108 |   com.p <- as.numeric(dim(p.array[row.names(p.array) %in% row.names(p.seq),])[1])
109 |   com.tp <- as.numeric(dim(tp.array[row.names(tp.array) %in% row.names(tp.seq),])[1])
110 |   array.p <- as.numeric(dim(p.array)[1])
111 |   seq.p <- as.numeric(dim(p.seq)[1])
112 |   
113 |   sumlogP <- -2*(log(rr.array$p.value) + log(rr.seq$p.value)) #sum of log 
114 |   combineP <- 1-pchisq(sumlogP,4)
115 |   rr.combine <- cbind(rr.array,combineP)
116 |   p.fisher <- rr.combine[rr.combine$combineP < 0.05,]
117 |   TP.fisher <- as.numeric(dim(p.fisher[row.names(p.fisher) %in% DEG_pa,])[1])
118 |   FP.fisher<- as.numeric(dim(p.fisher)[1]) - TP.fisher
119 |  
120 |   
121 |   p.raw <- rr.raw[rr.raw$p.value < 0.05,]
122 |   TP.raw <- as.numeric(dim(p.raw[row.names(p.raw) %in% DEG_pa,])[1])
123 |   FP.raw <- as.numeric(dim(p.raw)[1]) - TP.raw 
124 |   
125 |   
126 |   p.OLS <- rr.OLS[rr.OLS$p.value < 0.05,]
127 |   TP.OLS <- as.numeric(dim(p.OLS[row.names(p.OLS) %in% DEG_pa,])[1])
128 |   FP.OLS <- as.numeric(dim(p.OLS)[1]) - TP.OLS 
129 |   
130 |   
131 |   p.FLMER <- rr.FLMER[rr.FLMER$p.value < 0.05,]
132 |   TP.FLMER <- as.numeric(dim(p.FLMER[row.names(p.FLMER) %in% DEG_pa,])[1])
133 |   FP.FLMER <- as.numeric(dim(p.FLMER)[1]) - TP.FLMER 
134 |   
135 |   p.DWD <- rr.DWD[rr.DWD$p.value < 0.05,]
136 |   TP.DWD <- as.numeric(dim(p.DWD[row.names(p.DWD) %in% DEG_pa,])[1])
137 |   FP.DWD <- as.numeric(dim(p.DWD)[1]) - TP.DWD  
138 |   
139 |   p.XPN <- rr.XPN[rr.XPN$p.value < 0.05,]
140 |   TP.XPN <- as.numeric(dim(p.XPN[row.names(p.XPN) %in% DEG_pa,])[1])
141 |   FP.XPN <- as.numeric(dim(p.XPN)[1]) - TP.XPN
142 |   
143 |   p.EB <- rr.EB[rr.EB$p.value < 0.05,]
144 |   TP.EB <- as.numeric(dim(p.EB[row.names(p.EB) %in% DEG_pa,])[1])
145 |   FP.EB <- as.numeric(dim(p.EB)[1]) - TP.EB
146 |   
147 |   p.GQ <- rr.GQ[rr.GQ$p.value < 0.05,]
148 |   TP.GQ <- as.numeric(dim(p.GQ[row.names(p.GQ) %in% DEG_pa,])[1])
149 |   FP.GQ <- as.numeric(dim(p.GQ)[1]) - TP.GQ
150 |   
151 |   TP[i,1]<-TP.raw; FP[i,1]<-FP.raw
152 |   TP[i,2]<-TP.OLS; FP[i,2]<-FP.OLS
153 |   TP[i,3]<-TP.FLMER; FP[i,3]<-FP.FLMER
154 |   TP[i,4]<-TP.DWD; FP[i,4]<-FP.DWD
155 |   TP[i,5]<-TP.XPN; FP[i,5]<-FP.XPN
156 |   TP[i,6]<-TP.EB; FP[i,6]<-FP.EB
157 |   TP[i,7]<-TP.GQ; FP[i,7]<-FP.GQ
158 |   TP[i,8]<-TP.fisher; FP[i,8]<-FP.fisher
159 |   
160 |   com [i,1]<- array.p;com [i,2]<-seq.p; com [i,3]<-com.p; com [i,4]<-com.tp
161 | }
162 | 
163 | ex6 <- cbind(TP,FP,com) #repeat from line 62 with different n1-n4 numbers: ex1(10,10,10,10), ex2(15,15,5,5), ex3(15,5,5,15), ex4(20,0,5,15), ex5(25,5,0,10), ex6(20,0,0,20)
164 | class(ex6) # "matrix"
165 | dim(ex6) # 30 20
166 | 
167 | tp <- mean(ex6[,2:8])
168 | fp <- mean(ex6[,10:16])
169 | P <- tp/(tp + fp)
170 | R <- tp/9555
171 | F1 <- 2*(P*R)/(P+R)
172 | 
173 | # Getting tp, fp, and F1 values for MM in ex6
174 | tp_MM = ex6[,3]
175 | fp_MM = ex6[,11]
176 | 
177 | # Getting tp, fp, and F1 values for DWD in ex6
178 | tp_DWD = ex6[,4]
179 | fp_DWD = ex6[,12]
180 | 
181 | ratio6 = "ex6(20,0,0,20)"
182 | # cat("iteration = ", iter <- iter + 1, "\n")
183 | 
184 | cat("The ratio is ", ratio6, "\n",
185 |     "The value of tp is ", tp, "\n", 
186 |     "The value of fp is ", fp, "\n",
187 |     "The value of F1 is ", F1)
188 | 


--------------------------------------------------------------------------------
/MatchMixeR/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: MatchMixeR
 2 | Type: Package
 3 | Title: MatchMixeR: a cross-platform normalization tool
 4 | Version: 0.1.1
 5 | Author: Serin Zhang, Xing Qiu, Jinfeng Zhang
 6 | Maintainer: Serin Zhang <serin20@gmail.com>
 7 | Description: A matched sample based mixed effect regression model for cross-platform normalization
 8 | License: GPL (>= 2)
 9 | Encoding: UTF-8
10 | VignetteBuilder: knitr
11 | Suggests: knitr, rmarkdown, limma
12 | LazyData: true
13 | RoxygenNote: 6.1.1
14 | 


--------------------------------------------------------------------------------
/MatchMixeR/MatchMixeR.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace,vignette
22 | 


--------------------------------------------------------------------------------
/MatchMixeR/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | export(MM)
 4 | export(dwd)
 5 | export(eb)
 6 | export(flmer)
 7 | export(gq)
 8 | export(xpn)
 9 | useDynLib(MatchMixeR,ckmeans_c)
10 | 


--------------------------------------------------------------------------------
/MatchMixeR/R/MatchMixeR.R:
--------------------------------------------------------------------------------
  1 | # Per-gene OLS model
  2 | #'@param X gene expression matrix
  3 | #'@param Y gene expression matrix
  4 | OLS <- function(X, Y){
  5 |   ## A fast row-wise univariate OLS regression based on linear algebra
  6 |   Xbar <- rowMeans(X); Ybar <- rowMeans(Y, na.rm = TRUE)
  7 |   X.c <- sweep(X, 1, Xbar); Y.c <- sweep(Y, 1, Ybar)
  8 |   CovXY <- rowSums(X.c * Y.c, na.rm = TRUE)
  9 |   VarX <- rowSums(X.c^2, na.rm = TRUE); VarY <- rowSums(Y.c^2, na.rm = TRUE)
 10 |   ## regression coefs.
 11 |   beta1 <- CovXY / VarX; beta0 <- Ybar - beta1 * Xbar
 12 |   betamat <- cbind("Intercept"=beta0, "Slope"=beta1)
 13 |   ## Pearson corr. coefs.
 14 |   cc <- CovXY / sqrt(VarX * VarY)
 15 |   ## predictions
 16 |   Yhat <- sweep(X, 1, beta1, FUN="*") + beta0 %o% rep(1, ncol(X))
 17 |   ## RSS vector
 18 |   RSS <- rowSums((Y - Yhat)^2)
 19 |   SS1 <- rowSums(Yhat^2)
 20 |   SST <- rowSums(Y^2)
 21 |   return(list(betamat=betamat, corr=cc, covxy=CovXY, varx=VarX, Yhat=Yhat, RSS=RSS, var.explained.prop=SS1/SST, Xbar=Xbar, Ybar=Ybar))
 22 | }
 23 | 
 24 | ## The main cross-platform normalization procedure: FLMER is based on
 25 | ## the proposed moment-based method
 26 | #'@param X gene expression matrix
 27 | #'@param Y gene expression matrix
 28 | #'@export
 29 | flmer <- function(Xmat, Ymat){
 30 |   Xmat <- as.matrix(Xmat); Ymat <- as.matrix(Ymat)
 31 |   ## 11/12/2018. Use new notations; p==ngenes; n==sample size
 32 |   p <- nrow(Xmat); n <- ncol(Xmat); N <- p*n
 33 |   ## calculate some useful items
 34 |   Xibar <- rowMeans(Xmat); Xbar <- mean(Xibar)
 35 |   Xmat.c <- Xmat - Xibar
 36 |   Yibar <- rowMeans(Ymat); Ybar <- mean(Yibar)
 37 |   Ymat.c <- Ymat - Yibar
 38 |   covXY <- rowSums(Xmat.c * Ymat.c)
 39 |   covXX <- rowSums(Xmat.c * Xmat.c)
 40 |   beta.yixi <- covXY / covXX
 41 |   ## The overall regression
 42 |   Xc <- Xmat - mean(Xmat); Yc <- Ymat - mean(Ymat)
 43 |   beta.yx <- sum(Xc*Yc) / sum(Xc^2)
 44 |   ## In this case, we assume  that there is no collinearity in Z
 45 |   qprime <- 2*p
 46 |   var.epsilon <- sum((Ymat.c - Xmat.c * beta.yixi)^2) / (N - qprime)
 47 |   ## Preparing for calculating S, \| \mathbf{R}_{Z|\mathbf{X}} \|^2,
 48 |   ## and \|Z' \mathbf{R}_{Z|\mathbf{X}} \|^2 terms.
 49 |   XiYibar <- covXY/n + Xibar*Yibar
 50 |   ## Xi2bar = \|Xi\|^2 / n
 51 |   Xi2bar <- covXX/n + Xibar^2
 52 |   ## Xs is Xmat standardized by the global mean/sd
 53 |   Xs <- (Xmat - Xbar) / sqrt(sum((Xmat-Xbar)^2))
 54 |   Xsbar <- rowMeans(Xs)
 55 |   ## XsX is the inner producd between Xs and X. It is denoted as
 56 |   ## \varsigma in the notes.
 57 |   XsX <- rowSums(Xs*Xmat)
 58 |   XsNorm2 <- sum(Xs^2)
 59 |   ## the normalized version of S
 60 |   S <- n^2 * (sum((Yibar - Ybar -(Xibar - Xbar)*beta.yx)^2) + sum((XiYibar -Ybar*Xibar -(Xi2bar - Xbar*Xibar)*beta.yx)^2)) / var.epsilon
 61 |   ## Calculate \| \mathbf{R}_{Z|\mathbf{X}} \|^2
 62 |   Projxz.norm2 <- n + XsNorm2 * ( n^2*sum(Xsbar^2) + sum(XsX^2)) + n*sum(Xibar^2)/p
 63 |   Rzx.norm2 <- sum(Xmat^2) + N -Projxz.norm2
 64 |   ## General terms; without n^4 yet.
 65 |   S1 <- sum(Xibar^2); S2 <- sum(Xsbar^2); S3 <- sum(XsX^2)/n^2
 66 |   ZPZ.norm2 <- p^2/N^2 + S2^2 + 2*( p*S1/N^2 + S2*S3) + S1^2/N^2 + 2*(sum(Xibar*XsX)^2)/(N*n^2) + S3^2
 67 |   ## subtractions; to multiply by *n^4
 68 |   Minus.terms <- sum((1/N + Xsbar^2)^2) + 2*sum((Xibar/N + Xsbar*XsX/n)^2) + sum((Xibar^2/N + XsX^2/n^2)^2)
 69 |   ## added terms
 70 |   XiNorm2 <- rowSums(Xmat^2)
 71 |   Add.terms <- sum((1/n -1/N -Xsbar^2)^2) + 2*sum((Xibar/n -Xibar/N -Xsbar*XsX/n)^2) + sum((XiNorm2/n^2 -Xibar^2/N -XsX^2/n^2)^2)
 72 |   ZRzx.norm2 <- n^4*(ZPZ.norm2 - Minus.terms + Add.terms)
 73 |   ## Estimate lambda based on Moment matching
 74 |   lambdahat <- max(0, (S - Rzx.norm2) / ZRzx.norm2)
 75 |   ## Inference of the fixed effects. First, compute some small
 76 |   ## "building blocks".
 77 |   Axx <- XiNorm2 - (lambdahat/(1+n*lambdahat)) * n^2 * Xibar^2
 78 |   A1x <- n*Xibar/(1+n*lambdahat)
 79 |   A1y <- n*Yibar/(1+n*lambdahat)
 80 |   Axy <- XiYibar*n - (lambdahat/(1+n*lambdahat)) * n^2 * Xibar* Yibar
 81 |   W11 <- n/(1+n*lambdahat) - lambdahat*A1x^2/(1+lambdahat*Axx)
 82 |   W1x <- A1x/(1+lambdahat*Axx)
 83 |   W1y <- A1y -lambdahat*A1x*Axy / (1+lambdahat*Axx)
 84 |   Wxx <- Axx/(1+lambdahat*Axx)
 85 |   Wxy <- Axy / (1+lambdahat*Axx)
 86 |   ## Now the actual estimation
 87 |   covBeta <- var.epsilon * solve(matrix(c(sum(W11), sum(W1x), sum(W1x), sum(Wxx)), 2))
 88 |   betahat <- drop(covBeta %*% c(sum(W1y), sum(Wxy)) / var.epsilon)
 89 |   names(betahat) <- c("Intercept", "Slope")
 90 |   dimnames(covBeta) <- list(c("Intercept", "Slope"), c("Intercept", "Slope"))
 91 |   ## Mixed effects terms via EBLUP
 92 |   gamma0hat <- lambdahat * (W1y - betahat[1]*W11 - betahat[2]*W1x)
 93 |   gamma1hat <- lambdahat * (Wxy - betahat[1]*W1x - betahat[2]*Wxx)
 94 |   ## Individual betas
 95 |   betamat <- t(rbind(gamma0hat, gamma1hat) + betahat)
 96 |   colnames(betamat) <- c("Intercept", "Slope")
 97 |   ## t-statistics for the fixed effects
 98 |   t.fixed <- betahat/sqrt(diag(covBeta))
 99 |   ## the predicted Yhat
100 |   Yhat <- Xmat * betamat[, "Slope"] + betamat[, "Intercept"]
101 |   return(list(betahat=betahat, betamat=betamat, Yhat=Yhat,
102 |               lambdahat=lambdahat, var.epsilon=var.epsilon,
103 |               covBeta=covBeta, t.fixed=t.fixed))
104 | }
105 | 
106 | 
107 | 
108 | ## Implement the covariance transformed FLMER as follows.
109 | #'Match-MixeR
110 | #'
111 | #'fit mixed effect regression model
112 | #'@param X gene expression matrix
113 | #'@param Y gene expression matrix
114 | #'@export
115 | MM <- function(Xmat, Ymat){
116 |   rr1 <- OLS(Xmat, Ymat)
117 |   cov1 <- cov(rr1$betamat)
118 |   s0 <- sqrt(cov1[1,1]); s1 <- sqrt(cov1[2,2]); rho <- cov1[1,2]/s0/s1
119 |   a12 <- rho/sqrt(1-rho^2); a22 <- s1/(s0 * sqrt(1-rho^2))
120 |   A <- matrix(c(1, a12,
121 |                 0, a22), 2, byrow=TRUE)
122 |   ## the X transformation
123 |   Xtilde <- a12 + a22*Xmat
124 |   ## apply flmer() to the covariance transformed data
125 |   rr3 <- flmer(Xtilde, Ymat)
126 |   ## the reverse transformation. Note that each *row* of "betamat" is
127 |   ## beta_i; so we have to transpose the reverse matrix multiplication.
128 |   betamat <- rr3$betamat %*% t(A); colnames(betamat) <- colnames(rr3$betamat)
129 |   ## other misc. items
130 |   betahat <- drop(A %*% rr3$betahat); names(betahat) <- names(rr3$betahat)
131 |   lambdahat <- rr3$lambdahat
132 |   covGamma <-  rr3$lambdahat * rr3$var.epsilon * (A %*% t(A))
133 |   Yhat <- rr3$Yhat
134 |   var.epsilon <- rr3$var.epsilon
135 |   covBeta <- A %*% rr3$covBeta %*% t(A)
136 |   t.fixed <- betahat/sqrt(diag(covBeta))
137 |   return(list(betahat=betahat, betamat=betamat, Yhat=Yhat,
138 |               lambdahat=lambdahat, var.epsilon=var.epsilon,
139 |               covGamma <- covGamma,
140 |               covBeta=covBeta, t.fixed=t.fixed))
141 | }
142 | 
143 | 
144 | 


--------------------------------------------------------------------------------
/MatchMixeR/R/ckmeans.R:
--------------------------------------------------------------------------------
  1 | ######################################################################
  2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010
  3 | 
  4 | #This program is free software: you can redistribute it and/or modify
  5 | #it under the terms of the GNU General Public License as published by
  6 | #the Free Software Foundation, either version 3 of the License, or
  7 | #(at your option) any later version.
  8 | 
  9 | #This program is distributed in the hope that it will be useful,
 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 | #GNU General Public License for more details.
 13 | 
 14 | #You should have received a copy of the GNU General Public License
 15 | #along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | ######################################################################
 17 | 
 18 | ckmeans = function (x, centers, iter.max = 10, nstart = 1, distance = "pearson")
 19 | {
 20 | 
 21 | 
 22 | 	if (distance == "pearson"){
 23 | 		distance = 1
 24 | 	}else if(distance == "euclidean"){
 25 | 		distance = 2
 26 | 	}else if(distance == "manhattan"){
 27 | 		distance = 3
 28 | 	}else if(distance == "absolutepearson"){
 29 | 		distance = 4
 30 | 	}
 31 | 	nmeth = "lloyd"
 32 | 
 33 | 	#Normalization consistent with flexclust
 34 | 	x = x/sqrt(rowSums(x^2))
 35 | 
 36 | 
 37 |     do_one <- function(nmeth) {
 38 |       #' @useDynLib MatchMixeR ckmeans_c
 39 |             Z <- .C(ckmeans_c, as.double(x), as.integer(m),
 40 |                 as.integer(ncol(x)), centers = as.double(centers),
 41 |                 as.integer(k), c1 = integer(m), iter = as.integer(iter.max),
 42 |                 nc = integer(k), wss = double(k), as.integer(distance))
 43 |             if (Z$iter > iter.max) {
 44 |             	warning("did not converge in ",
 45 |                 iter.max, " iterations", call. = FALSE)
 46 |                 Z$converged = FALSE
 47 |             }else{
 48 |                 Z$converged = TRUE
 49 |             }
 50 |             if (any(Z$nc == 0)) {
 51 |             	warning("empty cluster: try a better set of initial centers",
 52 |                 call. = FALSE)
 53 |                 Z$empty <- TRUE
 54 |             }else{
 55 |                 Z$empty <- FALSE
 56 |             }
 57 | 			Z
 58 |     }
 59 |     x <- as.matrix(x)
 60 |     m <- nrow(x)
 61 |     if (missing(centers))
 62 |         stop("'centers' must be a number or a matrix")
 63 | 
 64 |     if (length(centers) == 1L) {
 65 |         k <- centers
 66 |         if (nstart == 1)
 67 |             centers <- x[sample.int(m, k), , drop = FALSE]
 68 |         if (nstart >= 2 || any(duplicated(centers))) {
 69 |             cn <- unique(x)
 70 |             mm <- nrow(cn)
 71 |             if (mm < k)
 72 |                 stop("more cluster centers than distinct data points.")
 73 |             centers <- cn[sample.int(mm, k), , drop = FALSE]
 74 |         }
 75 |     }
 76 |     else {
 77 |         centers <- as.matrix(centers)
 78 |         if (any(duplicated(centers)))
 79 |             stop("initial centers are not distinct")
 80 |         cn <- NULL
 81 |         k <- nrow(centers)
 82 |         if (m < k)
 83 |             stop("more cluster centers than data points")
 84 |     }
 85 |     if (iter.max < 1)
 86 |         stop("'iter.max' must be positive")
 87 |     if (ncol(x) != ncol(centers))
 88 |         stop("must have same number of columns in 'x' and 'centers'")
 89 |     Z <- do_one(nmeth)
 90 |     if (nstart >= 2 && !is.null(cn)) {
 91 |         best <- sum(Z$wss)
 92 |         for (i in 2:nstart) {
 93 |             centers <- cn[sample.int(mm, k), , drop = FALSE]
 94 |             ZZ <- do_one(nmeth)
 95 |             if ((z <- sum(ZZ$wss)) < best) {
 96 |                 Z <- ZZ
 97 |                 best <- z
 98 |             }
 99 |         }
100 |     }
101 |     centers <- matrix(Z$centers, k)
102 |     dimnames(centers) <- list(1L:k, dimnames(x)[[2L]])
103 |     cluster <- Z$c1
104 |     if (!is.null(rn <- rownames(x)))
105 |         names(cluster) <- rn
106 |     out <- list(cluster = cluster, centers = centers, withinss = Z$wss,
107 |         size = Z$nc, empty = Z$empty, converged = Z$converged)
108 |     class(out) <- "kmeans"
109 |     out
110 | }
111 | 


--------------------------------------------------------------------------------
/MatchMixeR/R/dwd.R:
--------------------------------------------------------------------------------
  1 | ######################################################################
  2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010
  3 | 
  4 | #This program is free software: you can redistribute it and/or modify
  5 | #it under the terms of the GNU General Public License as published by
  6 | #the Free Software Foundation, either version 3 of the License, or
  7 | #(at your option) any later version.
  8 | 
  9 | #This program is distributed in the hope that it will be useful,
 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 | #GNU General Public License for more details.
 13 | 
 14 | #You should have received a copy of the GNU General Public License
 15 | #along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | ######################################################################
 17 | 
 18 | sepelimdwd = function(Xp,Xn,penalty, useSparse=TRUE){
 19 | 	#Xp and Xn are matrices.  penalty is a scalar.  This is an adaptation of the Matlab function of the same name, written by J. S. Marron, available at https://genome.unc.edu/pubsup/dwd/.
 20 | 
 21 | 	flag <- 0
 22 | 	#Dimensions of the data
 23 | 	dp <- dim(Xp)[1]
 24 | 	np <- dim(Xp)[2]
 25 | 	dn <- dim(Xn)[1]
 26 | 	nn <- dim(Xn)[2]
 27 | 	if (dn != dp) {stop('The dimensions are incomapatible.')}
 28 | 	d <- dp
 29 | 
 30 | 	#Dimension reduction in HDLSS setting.
 31 | 	XpnY <- as.matrix(cbind(Xp,-1*Xn))
 32 | 	XpnY11 <- XpnY[1,1]
 33 | 	n <- np + nn
 34 | 	if(d>n){
 35 | 		qrfact = qr(XpnY)
 36 | 		Q = qr.Q(qrfact)
 37 | 		R = qr.R(qrfact)
 38 | 		RpnY = R
 39 | 		dnew = n
 40 | 	}else{
 41 | 		RpnY = XpnY
 42 | 		dnew = d
 43 | 	}
 44 | 
 45 | 	y = ones(np + nn,1)
 46 | 	y[(np+1):(np+nn),1] = -1
 47 | 	ym = y[2:n,1]
 48 | 
 49 | 	# nv is the number of variables (eliminating beta)
 50 | 	# nc is the number of constraints
 51 | 	nv = 1 + dnew + 4*n
 52 | 	nc = 2*n
 53 | 	#Set up the block structure, constraint matrix, rhs, and cost vector
 54 | 
 55 | 	blk = list()
 56 | 	blk$type = character()
 57 | 	blk$size = list()
 58 | 	blk$type[1] = 's'
 59 | 	blk$size[[1]] = cbind(dnew+1,3*ones(1,n))
 60 | 	blk$type[2] = 'l'
 61 | 	blk$size[[2]] = n
 62 | 
 63 | 
 64 | 	Avec = list()
 65 | 	A = zeros(nc,nv-n)
 66 | 	col1 = RpnY[,1]
 67 | 	A[1:(n-1),2:(dnew+1)] = t(RpnY[,2:n] - col1%*%t(ym))
 68 | 	A[1:(n-1),seq(dnew+5,dnew+1+3*n,3)]	= -1*speye(n-1)
 69 | 	A[1:(n-1),seq(dnew+6,dnew+2+3*n,3)]	= speye(n-1)
 70 | 	A[1:(n-1),dnew+2] = ym
 71 | 	A[1:(n-1),dnew+3] = -1*ym
 72 | 	A[n,1] = 1
 73 | 	A[(n+1):(n+n),seq(dnew+4,dnew+3+3*n,3)] = speye(n)
 74 | 
 75 | 
 76 | 	Avec[[1]] = t(A)
 77 | 	Avec[[2]] = (rbind(cbind(-1*ym,speye(n-1)),zeros(1+n,n)))
 78 | 	b = rbind(zeros(n-1,1),ones(1+n,1))
 79 | 
 80 | 
 81 | 	C = list()
 82 | 	c = zeros(nv-n,1)
 83 | 	c[seq(dnew+2,dnew+1+3*n,3),1] = ones(n,1)
 84 | 	c[seq(dnew+3,dnew+2+3*n,3),1] = ones(n,1)
 85 | 
 86 | 
 87 | 	C[[1]] = c
 88 | 	C[[2]] = penalty*ones(n,1)
 89 | 
 90 | 
 91 | 	#Solve the SOCP problem
 92 | 
 93 | ##################CLSOCP############################
 94 | 	CL_K <- unlist(blk$size)
 95 | 	CL_qlen <-sum(blk$size[[1]])
 96 | 	CL_llen <-sum(blk$size[[2]])
 97 | 	CL_type <- c(rep('q',length(blk$size[[1]])),rep('l',length(blk$size[[2]])))
 98 | 	CL_A <- cbind(t(Avec[[1]]),Avec[[2]])
 99 | 	CL_c <- rbind(C[[1]],C[[2]])
100 | 
101 | 
102 | 	soln <- CLSOCP::socp(CL_A,b,CL_c,CL_K,CL_type,gamma_fac=.3,sigma0 = .1,use_sparse=useSparse)
103 | 
104 | 
105 | 	X1 <- soln$x[1:CL_qlen]
106 | 	X2 <- soln$x[(CL_qlen+1):(CL_qlen+CL_llen)]
107 | 	lambda <- soln$y
108 | #####################################################
109 | 
110 | 
111 | 	# Compute the normal vector w and constant term beta.
112 | 
113 | 	barw = X1[2:(dnew+1)]
114 | 	if (d>n){
115 | 		w = Q %*% barw
116 | 	}else{
117 | 		w = barw
118 | 	}
119 | 	beta = X1[dnew + 2] - X1[dnew + 3] - X2[1] - t(col1)%*%barw
120 | 	normw = norm(w)
121 | 	if (normw < 1 - 1e-3){
122 | 		print(normw)
123 | 	}
124 | 	normwm1 = 0
125 | 	if (normw > 1 - 1e-3){
126 | 		w = w/normw
127 | 		normwm1 = norm(w) - 1
128 | 		beta = beta/normw
129 | 	}
130 | 
131 | 
132 | 	# Compute the minimum of the supposedly positive
133 | 	# and the maximum of the supposedly negative residuals.
134 | 	# Refine the primal solution and print its objective value.
135 | 
136 | 	residp = t(Xp) %*% w + beta[1] #optimization
137 | 	residn = t(Xn) %*% w + beta[1] #optimization
138 | 	minresidp = min(residp)
139 | 	maxresidn = max(residn)
140 | 	res = t(XpnY) %*% w + beta[1] * y
141 | 	rsc = 1/sqrt(penalty)
142 | 	xi = -1* res + rsc[1]
143 | 	xi[xi<0] <- 0
144 | 	totalviolation = sum(xi)
145 | 	minresidpmod = min(residp + xi[1:np])
146 | 	maxresidnmod = max(residn - xi[(np+1):n])
147 | 	minxi = min(xi)
148 | 	maxxi = max(xi)
149 | 	resn = res + xi
150 | 	rresn = 1 / resn
151 | 	primalobj = penalty * sum(xi) + sum(rresn)
152 | #	print(primalobj)
153 | 
154 | 
155 | 	#Compute the dual solution alp and print its objective value.
156 | 	alp = zeros(n,1)
157 | 	lambda1 = lambda[1:(n-1)]
158 | 	alp[1] = -1*t(ym)%*%lambda1
159 | 	alp[2:n] = lambda1
160 | 	alp = alp * (as.numeric(alp>0))
161 | 	sump = sum(alp[1:np])
162 | 	sumn = sum(alp[(np+1):n])
163 | 	sum2 = (sump + sumn)/2
164 | 	alp[1:np] = (sum2/sump)*alp[1:np]
165 | 	alp[(np+1):n] = (sum2/sumn)*alp[(np+1):n]
166 | 	maxalp = max(alp)
167 | 	if (maxalp > penalty | maxxi > 1e-3){
168 | 		alp = (penalty[1]/maxalp)*alp
169 | 	}
170 | 	minalp = min(alp)
171 | 	p = RpnY%*%alp
172 | 	eta = -1*norm(p)
173 | 	gamma = 2*sqrt(alp)
174 | 	dualobj = eta + sum(gamma)
175 | #	print(dualobj)
176 | 
177 | 
178 | 
179 | 	#dualgap is a measure of the accuracy of the solution
180 | 	dualgap = primalobj - dualobj
181 | #	print(dualgap)
182 | 
183 | 	if (dualgap > 1e-4){
184 | 		flag = -1
185 | 	}
186 | 
187 | 
188 | 	return(list(w=w,beta=beta,residp=residp,residn=residn,alp=alp,totalviolation=totalviolation,dualgap=dualgap,flag=flag))
189 | 
190 | }
191 | 
192 | 
193 | DWD1SM = function(trainp,trainn,threshfact = 100, useSparse = TRUE){
194 | 
195 | 	np = dim(trainp)[2]
196 | 	nn = dim(trainn)[2]
197 | #	vpwdist2 = numeric(np*nn)
198 | 	vpwdist2x <- fields::rdist(t(trainp),t(trainn))
199 | #	for (ip in 1:np){
200 | #		vpwdist2[((ip-1)*nn+1):(ip*nn)] <- colSums((trainp[,ip] - trainn)^2) #optimization
201 | #	}
202 | #	medianpwdist2 = median(vpwdist2)
203 | 	medianpwdist2 = median(vpwdist2x)^2
204 | 
205 | 	penalty = threshfact / medianpwdist2
206 | 	sepelimout = sepelimdwd(trainp,trainn,penalty,useSparse=useSparse)
207 | 	w = sepelimout$w
208 | 	flag = sepelimout$flag
209 | 	if (flag == -1){
210 | 		cat("Inaccurate solution!\n")
211 | 	}
212 | 	if (flag == -2){
213 | 		cat("Infeasible or unbounded optimization problem!\n")
214 | 	}
215 | 	dirvec = w/norm(w)
216 | 	return(dirvec)
217 | }
218 | 
219 | #'@param X gene expression matrix
220 | #'@param Y gene expression matrix
221 | #'@export
222 | dwd = function(platform1.data, platform2.data, platform1.train = NULL, platform2.train=NULL, p1.names=0, p2.names=0,p1.train.names=0, p2.train.names=0, skip.match=FALSE, use.sparse = TRUE) {
223 | 
224 | 	#Match names
225 | 	if(is.null(platform1.train) & is.null(platform2.train)){
226 | 		input = processplatforms(list(x=platform1.data,y=platform2.data), namesvec = c(p1.names, p2.names), skip.match=skip.match)
227 | 
228 | 
229 | 
230 | 		#Get normal vector for DWD adjustment
231 | 		dirvec = DWD1SM(input[[1]],input[[2]],useSparse=use.sparse)
232 | 		#Project the data
233 | 		vprojp = t(input[[1]]) %*% dirvec
234 | 		vprojn = t(input[[2]]) %*% dirvec
235 | 		meanprojp = mean(vprojp)
236 | 		meanprojn = mean(vprojn)
237 | 		output = list()
238 | 		p1.adjust <- -1 * meanprojp * dirvec
239 | 		p2.adjust <- -1 * meanprojn * dirvec
240 | 		names(p1.adjust) <- rownames(input[[1]])
241 | 		names(p2.adjust) <- rownames(input[[2]])
242 | 		output$x = input[[1]] + p1.adjust
243 | 		output$y = input[[2]] + p2.adjust
244 | 		output$p1.adjust <- p1.adjust
245 | 		output$p2.adjust <- p2.adjust
246 | 
247 | 	}else{
248 | 		input = processplatforms(list(x=platform1.data,y=platform2.data,platform1.train,platform2.train), namesvec = c(p1.names, p2.names, p1.train.names, p2.train.names), skip.match=skip.match)
249 | 
250 | 
251 | 		#Get normal vector for DWD adjustment
252 | 		dirvec = DWD1SM(input[[3]],input[[4]],useSparse=use.sparse)
253 | 
254 | 		#Project the data
255 | 		vprojp = t(input[[3]]) %*% dirvec
256 | 		vprojn = t(input[[4]]) %*% dirvec
257 | 
258 | 		meanprojp = mean(vprojp)
259 | 		meanprojn = mean(vprojn)
260 | 
261 | 		output = list()
262 | 		p1.adjust <- -1 * meanprojp * dirvec
263 | 		p2.adjust <- -1 * meanprojn * dirvec
264 | 		names(p1.adjust) <- rownames(input[[1]])
265 | 		names(p2.adjust) <- rownames(input[[2]])
266 | 		output$x = input[[1]] + p1.adjust
267 | 		output$y = input[[2]] + p2.adjust
268 | 		output$p1.adjust <- p1.adjust
269 | 		output$p2.adjust <- p2.adjust
270 | 	}
271 | 
272 | 	return(output)
273 | 
274 | }
275 | 
276 | norm = function(aMatrix){
277 | #Returns the largest singular value of aMatrix
278 | 	o = svd(aMatrix,nu=0,nv=0)
279 | 	return(o$d[1])
280 | 
281 | }
282 | 
283 | 
284 | 
285 | 


--------------------------------------------------------------------------------
/MatchMixeR/R/eb.R:
--------------------------------------------------------------------------------
  1 | ######################################################################
  2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010
  3 | 
  4 | #This program is free software: you can redistribute it and/or modify
  5 | #it under the terms of the GNU General Public License as published by
  6 | #the Free Software Foundation, either version 3 of the License, or
  7 | #(at your option) any later version.
  8 | 
  9 | #This program is distributed in the hope that it will be useful,
 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 | #GNU General Public License for more details.
 13 | 
 14 | #You should have received a copy of the GNU General Public License
 15 | #along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | ######################################################################
 17 | 
 18 | ######################################################################
 19 | #This file is adapted from the ComBat program, available from
 20 | #http://jlab.byu.edu/ComBat/Abstract.html.  Please cite appropriately
 21 | #when using these functions.  Type citation("CONOR") at the R prompt for
 22 | #details.
 23 | 
 24 | #'@param X gene expression matrix
 25 | #'@param Y gene expression matrix
 26 | #'@export
 27 | eb = function(platform1.data, platform2.data, par.prior=TRUE, filter=FALSE, prior.plots=FALSE, p1.names=0, p2.names=0, skip.match=FALSE){
 28 | 
 29 | 	#Match names
 30 | 	input = processplatforms(list(x=platform1.data,y=platform2.data),namesvec = c(p1.names, p2.names), skip.match=skip.match)
 31 | 
 32 | 	#Dimensions
 33 | 	nx = dim(input$x)[2]
 34 | 	ny = dim(input$y)[2]
 35 | 
 36 | 	#Prepare the data for ComBat
 37 | 	dat = cbind(input$x,input$y)
 38 | 	batch = character(nx+ny)
 39 | 	batch[1:nx] = "X"
 40 | 	batch[(nx + 1):(nx + ny)] = "Y"
 41 | 	saminfo = data.frame(as.character(1:(nx+ny)), as.character(1:(nx+ny)), batch)
 42 | 	colnames(saminfo) = c("Array name", "sample name", "Batch")
 43 | 	orignames = colnames(dat)
 44 | 	colnames(dat) = as.character(1:(nx+ny))
 45 | 
 46 | 	#Call ComBat
 47 | 	cb = ComBat(dat, saminfo, par.prior=par.prior, filter=filter, prior.plots=prior.plots)
 48 | 	colnames(cb) = orignames
 49 | 
 50 | 	#Return results
 51 | 	return(list(x=cb[,1:nx], y=cb[,(nx+1):(nx+ny)]))
 52 | }
 53 | 
 54 | #The code from here down is from the original authors, with slight modification, and was obtained at: http://jlab.byu.edu//ComBat/Download_files/ComBat.R
 55 | 
 56 | #This long comment is from the original authors, and describes the original, unmodified version.  I have made changes only to the interface, so it can use a data.frame as input instead of a file name.
 57 | # 'expression_xls' is the expression index file (e.g. outputted by dChip); 'sample_info_file' is a tab-delimited text file containing the colums: Array  name, sample name, Batch, and any other covariates to be included in the modeling; 'type' currently supports two data file types 'txt' for a tab-delimited text file and 'csv' for an Excel .csv file (sometimes R handles the .csv file better, so use this if you have problems with a .txt file!); 'write' if 'T' ComBat writes adjusted data to a file, and if 'F' and ComBat outputs the adjusted data matrix if 'F' (so assign it to an object! i.e. NewData <- ComBat('my expression.xls','Sample info file.txt', write=F)); 'covariates=all' will use all of the columns in your sample info file in the modeling (except array/sample name), if you only want use a some of the columns in your sample info file, specify these columns here as a vector (you must include the Batch column in this list); 'par.prior' if 'T' uses the parametric adjustments, if 'F' uses the nonparametric adjustments--if you are unsure what to use, try the parametric adjustments (they run faster) and check the plots to see if these priors are reasonable; 'filter=value' filters the genes with absent calls in > 1-value of the samples. The defaut here (as well as in dchip) is .8. Filter if you can as the EB adjustments work better after filtering. Filter must be numeric if your expression index file contains presence/absence calls (but you can set it >1 if you don't want to filter any genes) and must be 'F' if your data doesn't have presence/absence calls; 'skip' is the number of columns that contain probe names and gene information, so 'skip=5' implies the first expression values are in column 6; 'prior.plots' if true will give prior plots with black as a kernal estimate of the empirical batch effect density and red as the parametric estimate.
 58 | 
 59 | ComBat <- function(dat, saminfo, type='txt', covariates='all', par.prior=T, filter=F, skip=0, prior.plots=T){
 60 | 	#debug: expression_xls='exp.txt'; sample_info_file='sam.txt'; type='txt'; write=T; covariates='all'; par.prior=T; filter=F; skip=0; prior.plots=T
 61 | 
 62 | 	#Two lines commented out by Jason Rudy here:
 63 | 	#cat('Reading Sample Information File\n')
 64 | 	#saminfo <- read.table(sample_info_file, header=T, sep='\t',comment.char='')
 65 | 
 66 | 	if(sum(colnames(saminfo)=="Batch")!=1){return('ERROR: Sample Information File does not have a Batch column!')}
 67 | 
 68 |   ## 01/18/2019 modified by Xing
 69 |   dat <- as.matrix(dat)
 70 | 
 71 | 	#Lines commented out by Jason Rudy from here:
 72 | 	#cat('Reading Expression Data File\n')
 73 | 	#if(type=='csv'){
 74 | 	#	dat <- read.csv(expression_xls,header=T,as.is=T)
 75 |                 #print(dat[1:2,])
 76 | 	#	dat <- dat[,trim.dat(dat)]
 77 |                 #print(colnames(dat))
 78 | 	#	colnames(dat)=scan(expression_xls,what='character',nlines=1,sep=',',quiet=T)[1:ncol(dat)]
 79 |                 #print(colnames(dat))
 80 | 	#	}else{
 81 | 	#	dat <- read.table(expression_xls,header=T,comment.char='',fill=T,sep='\t', as.is=T)
 82 | 	#	dat <- dat[,trim.dat(dat)]
 83 | 	#	colnames(dat)=scan(expression_xls,what='character',nlines=1,sep='\t',quiet=T)[1:ncol(dat)]
 84 | 	#	}
 85 | 	#to here.
 86 | 
 87 | 
 88 | 
 89 | 	#From here down, the code is unaltered but for one minor bug fix where ComBat returns.
 90 | 	if (skip>0){
 91 |               geneinfo <- as.matrix(dat[,1:skip])
 92 |               dat <- dat[,-c(1:skip)]
 93 |               }else{geneinfo=NULL}
 94 |         #print(geneinfo[1:4])
 95 |         #print(dat[1:2,])
 96 | 
 97 | 	if(filter){
 98 | 		ngenes <- nrow(dat)
 99 | 		col <- ncol(dat)/2
100 | 		present <- apply(dat, 1, filter.absent, filter)
101 | 		dat <- dat[present, -(2*(1:col))]
102 | 		if (skip>0){geneinfo <- geneinfo[present,]}
103 | 		cat('Filtered genes absent in more than',filter,'of samples. Genes remaining:',nrow(dat),'; Genes filtered:',ngenes-nrow(dat),'\n')
104 | 		}
105 | 
106 | 	if(any(apply(dat,2,mode)!='numeric')){return('ERROR: Array expression columns contain non-numeric values! (Check your .xls file for non-numeric values and if this is not the problem, make a .csv file and use the type=csv option)')}
107 | 
108 | 	tmp <- match(colnames(dat),saminfo[,1])
109 | 	if(any(is.na(tmp))){return('ERROR: Sample Information File and Data Array Names are not the same!')}
110 | 	tmp1 <- match(saminfo[,1],colnames(dat))
111 | 	saminfo <- saminfo[tmp1[!is.na(tmp1)],]
112 | 
113 | 	if(any(covariates != 'all')){saminfo <- saminfo[,c(1:2,covariates)]}
114 | 	design <- design.mat(saminfo)
115 | 
116 | 
117 | 	batches <- list.batch(saminfo)
118 | 	n.batch <- length(batches)
119 | 	n.batches <- sapply(batches, length)
120 | 	n.array <- sum(n.batches)
121 | 
122 | 	## Check for missing values
123 | 	NAs = any(is.na(dat))
124 | 	if(NAs){cat(c('Found',sum(is.na(dat)),'Missing Data Values\n'),sep=' ')}
125 |         #print(dat[1:2,])
126 | 	##Standardize Data across genes
127 | 	cat('Standardizing Data across genes\n')
128 | 	if (!NAs){B.hat <- solve(t(design)%*%design)%*%t(design)%*%t(as.matrix(dat))}else{B.hat=apply(dat,1,Beta.NA,design)} #Standarization Model
129 | 	grand.mean <- t(n.batches/n.array)%*%B.hat[1:n.batch,]
130 | 	if (!NAs){var.pooled <- ((dat-t(design%*%B.hat))^2)%*%rep(1/n.array,n.array)}else{var.pooled <- apply(dat-t(design%*%B.hat),1,var,na.rm=T)}
131 | 
132 | 	stand.mean <- t(grand.mean)%*%t(rep(1,n.array))
133 | 	if(!is.null(design)){tmp <- design;tmp[,c(1:n.batch)] <- 0;stand.mean <- stand.mean+t(tmp%*%B.hat)}
134 | 	s.data <- (dat-stand.mean)/(sqrt(var.pooled)%*%t(rep(1,n.array)))
135 | 
136 | 	##Get regression batch effect parameters
137 | 	cat("Fitting L/S model and finding priors\n")
138 | 	batch.design <- design[,1:n.batch]
139 | 	if (!NAs){gamma.hat <- solve(t(batch.design)%*%batch.design)%*%t(batch.design)%*%t(as.matrix(s.data))}else{gamma.hat=apply(s.data,1,Beta.NA,batch.design)}
140 | 	delta.hat <- NULL
141 | 	for (i in batches){
142 | 		delta.hat <- rbind(delta.hat,apply(s.data[,i], 1, var,na.rm=T))
143 | 		}
144 | 
145 | 	##Find Priors
146 | 	gamma.bar <- apply(gamma.hat, 1, mean)
147 | 	t2 <- apply(gamma.hat, 1, var)
148 | 	a.prior <- apply(delta.hat, 1, aprior)
149 | 	b.prior <- apply(delta.hat, 1, bprior)
150 | 
151 | 
152 | 	##Plot empirical and parametric priors
153 | 
154 | 	if (prior.plots & par.prior){
155 | 		par(mfrow=c(2,2))
156 | 		tmp <- density(gamma.hat[1,])
157 | 		plot(tmp,  type='l', main="Density Plot")
158 | 		xx <- seq(min(tmp$x), max(tmp$x), length=100)
159 | 		lines(xx,dnorm(xx,gamma.bar[1],sqrt(t2[1])), col=2)
160 | 		qqnorm(gamma.hat[1,])
161 | 		qqline(gamma.hat[1,], col=2)
162 | 
163 | 		tmp <- density(delta.hat[1,])
164 | 		invgam <- 1/rgamma(ncol(delta.hat),a.prior[1],b.prior[1])
165 | 		tmp1 <- density(invgam)
166 | 		plot(tmp,  typ='l', main="Density Plot", ylim=c(0,max(tmp$y,tmp1$y)))
167 | 		lines(tmp1, col=2)
168 | 		qqplot(delta.hat[1,], invgam, xlab="Sample Quantiles", ylab='Theoretical Quantiles')
169 | 		lines(c(0,max(invgam)),c(0,max(invgam)),col=2)
170 | 		title('Q-Q Plot')
171 | 	}
172 | 
173 | 	##Find EB batch adjustments
174 | 
175 | 	gamma.star <- delta.star <- NULL
176 | 	if(par.prior){
177 | 		cat("Finding parametric adjustments\n")
178 | 		for (i in 1:n.batch){
179 | 			temp <- it.sol(s.data[,batches[[i]]],gamma.hat[i,],delta.hat[i,],gamma.bar[i],t2[i],a.prior[i],b.prior[i])
180 | 			gamma.star <- rbind(gamma.star,temp[1,])
181 | 			delta.star <- rbind(delta.star,temp[2,])
182 | 			}
183 | 	}else{
184 | 		cat("Finding nonparametric adjustments\n")
185 | 		for (i in 1:n.batch){
186 | 			temp <- int.eprior(as.matrix(s.data[,batches[[i]]]),gamma.hat[i,],delta.hat[i,])
187 | 			gamma.star <- rbind(gamma.star,temp[1,])
188 | 			delta.star <- rbind(delta.star,temp[2,])
189 | 			}
190 | 		}
191 | 
192 | 
193 | 	### Normalize the Data ###
194 | 	cat("Adjusting the Data\n")
195 | 
196 | 	bayesdata <- s.data
197 | 	j <- 1
198 | 
199 | 	for (i in batches){
200 | 		bayesdata[,i] <- (bayesdata[,i]-t(batch.design[i,]%*%gamma.star))/(sqrt(delta.star[j,])%*%t(rep(1,n.batches[j])))
201 | 		j <- j+1
202 | 	}
203 | 
204 | 	bayesdata <- (bayesdata*(sqrt(var.pooled)%*%t(rep(1,n.array))))+stand.mean
205 | 	if(FALSE){
206 | 		output_file <- paste('Adjusted',expression_xls,'.xls',sep='_')
207 |                  #print(geneinfo[1:2])
208 |                  #print(bayesdata[1:2,1:4])
209 | 		 #cat(c(colnames(geneinfo),colnames(dat),'\n'),file=output_file,sep='\t')
210 | 		#suppressWarnings(write.table(cbind(geneinfo,formatC(as.matrix(bayesdata), format = "f")), file=output_file, sep="\t", quote=F,row.names=F,col.names=F,append=T))
211 |                 if(!is.null(geneinfo)){ outdata <- cbind(ProbeID=geneinfo, bayesdata)}else{outdata=bayesdata};
212 | 		write.table(outdata, file=output_file, sep="\t",row.names=F)
213 | 		cat("Adjusted data saved in file:",output_file,"\n")
214 | 		}else{
215 | 
216 | 			#Minor change here: This is a bug fix by Jason Rudy
217 | 			if(!is.null(geneinfo)){
218 | 				return(cbind(geneinfo,bayesdata))
219 | 			}else
220 | 				return(bayesdata)
221 | 		}
222 | }
223 | 
224 | # filters data based on presence/absence call
225 | filter.absent <- function(x,pct){
226 | 	present <- T
227 | 	col <- length(x)/2
228 | 	pct.absent <- (sum(x[2*(1:col)]=="A") + sum(x[2*(1:col)]=="M"))/col
229 | 	if(pct.absent > pct){present <- F}
230 | 	present
231 | }
232 | 
233 | # Next two functions make the design matrix (X) from the sample info file
234 | build.design <- function(vec, des=NULL, start=2){
235 | 	tmp <- matrix(0,length(vec),nlevels(vec)-start+1)
236 | 	for (i in 1:ncol(tmp)){tmp[,i] <- vec==levels(vec)[i+start-1]}
237 | 	cbind(des,tmp)
238 | }
239 | 
240 | design.mat <- function(saminfo){
241 | 	tmp <- which(colnames(saminfo) == 'Batch')
242 | 	tmp1 <- as.factor(saminfo[,tmp])
243 | 	cat("Found",nlevels(tmp1),'batches\n')
244 | 	design <- build.design(tmp1,start=1)
245 | 	ncov <- ncol(as.matrix(saminfo[,-c(1:2,tmp)]))
246 | 	cat("Found",ncov,'covariate(s)\n')
247 | 	if(ncov>0){
248 | 		for (j in 1:ncov){
249 | 			tmp1 <- as.factor(as.matrix(saminfo[,-c(1:2,tmp)])[,j])
250 | 			design <- build.design(tmp1,des=design)
251 | 			}
252 | 		}
253 | 	design
254 | }
255 | 
256 | # Makes a list with elements pointing to which array belongs to which batch
257 | list.batch <- function(saminfo){
258 | 	tmp1 <- as.factor(saminfo[,which(colnames(saminfo) == 'Batch')])
259 | 	batches <- NULL
260 | 	for (i in 1:nlevels(tmp1)){batches <- append(batches, list((1:length(tmp1))[tmp1==levels(tmp1)[i]]))}
261 | 	batches
262 | }
263 | 
264 | # Trims the data of extra columns, note your array names cannot be named 'X' or start with 'X.'
265 | trim.dat <- function(dat){
266 | 	tmp <- strsplit(colnames(dat),'\\.')
267 | 	tr <- NULL
268 | 	for (i in 1:length(tmp)){tr <- c(tr,tmp[[i]][1]!='X')}
269 | 	tr
270 | }
271 | 
272 | # Following four find empirical hyper-prior values
273 | aprior <- function(gamma.hat){m=mean(gamma.hat); s2=var(gamma.hat); (2*s2+m^2)/s2}
274 | 
275 | bprior <- function(gamma.hat){m=mean(gamma.hat); s2=var(gamma.hat); (m*s2+m^3)/s2}
276 | 
277 | postmean <- function(g.hat,g.bar,n,d.star,t2){(t2*n*g.hat+d.star*g.bar)/(t2*n+d.star)}
278 | 
279 | postvar <- function(sum2,n,a,b){(.5*sum2+b)/(n/2+a-1)}
280 | 
281 | 
282 | # Pass in entire data set, the design matrix for the entire data, the batch means, the batch variances, priors (m, t2, a, b), columns of the data  matrix for the batch. Uses the EM to find the parametric batch adjustments
283 | 
284 | it.sol  <- function(sdat,g.hat,d.hat,g.bar,t2,a,b,conv=.0001){
285 | 	n <- apply(!is.na(sdat),1,sum)
286 | 	g.old <- g.hat
287 | 	d.old <- d.hat
288 | 	change <- 1
289 | 	count <- 0
290 | 	while(change>conv){
291 | 		g.new <- postmean(g.hat,g.bar,n,d.old,t2)
292 | 		sum2 <- apply((sdat-g.new%*%t(rep(1,ncol(sdat))))^2, 1, sum,na.rm=T)
293 | 		d.new <- postvar(sum2,n,a,b)
294 | 		change <- max(abs(g.new-g.old)/g.old,abs(d.new-d.old)/d.old)
295 | 		g.old <- g.new
296 | 		d.old <- d.new
297 | 		count <- count+1
298 | 		}
299 | 	#cat("This batch took", count, "iterations until convergence\n")
300 | 	adjust <- rbind(g.new, d.new)
301 | 	rownames(adjust) <- c("g.star","d.star")
302 | 	adjust
303 | }
304 | 
305 | #likelihood function used below
306 | L <- function(x,g.hat,d.hat){prod(dnorm(x,g.hat,sqrt(d.hat)))}
307 | 
308 | # Monte Carlo integration function to find the nonparametric adjustments
309 | int.eprior <- function(sdat,g.hat,d.hat){
310 | 	g.star <- d.star <- NULL
311 | 	r <- nrow(sdat)
312 | 	for(i in 1:r){
313 | 		g <- g.hat[-i]
314 | 		d <- d.hat[-i]
315 | 		x <- sdat[i,!is.na(sdat[i,])]
316 | 		n <- length(x)
317 | 		j <- numeric(n)+1
318 | 		dat <- matrix(as.numeric(x),length(g),n,byrow=T)
319 | 		resid2 <- (dat-g)^2
320 | 		sum2 <- resid2%*%j
321 | 		LH <- 1/(2*pi*d)^(n/2)*exp(-sum2/(2*d))
322 | 		LH[LH=="NaN"]=0
323 | 		g.star <- c(g.star,sum(g*LH)/sum(LH))
324 | 		d.star <- c(d.star,sum(d*LH)/sum(LH))
325 | 		#if(i%%1000==0){cat(i,'\n')}
326 | 		}
327 | 	adjust <- rbind(g.star,d.star)
328 | 	rownames(adjust) <- c("g.star","d.star")
329 | 	adjust
330 | }
331 | 
332 | #fits the L/S model in the presence of missing data values
333 | 
334 | Beta.NA = function(y,X){
335 | 	des=X[!is.na(y),]
336 | 	y1=y[!is.na(y)]
337 | 	B <- solve(t(des)%*%des)%*%t(des)%*%y1
338 | 	B
339 | }
340 | 


--------------------------------------------------------------------------------
/MatchMixeR/R/functions.R:
--------------------------------------------------------------------------------
  1 | ######################################################################
  2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010
  3 | 
  4 | #This program is free software: you can redistribute it and/or modify
  5 | #it under the terms of the GNU General Public License as published by
  6 | #the Free Software Foundation, either version 3 of the License, or
  7 | #(at your option) any later version.
  8 | 
  9 | #This program is distributed in the hope that it will be useful,
 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 | #GNU General Public License for more details.
 13 | 
 14 | #You should have received a copy of the GNU General Public License
 15 | #along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | ######################################################################
 17 | 
 18 | cross.platform.normalize=function(method,...){
 19 | 	if(is.character(method)){
 20 | 		method = get(method)
 21 | 	}
 22 | 	method(...)
 23 | }
 24 | 
 25 | 
 26 | processplatforms = function(datalist, namesvec=NULL, skip.match=FALSE){
 27 | 	#Convert data from various formats to the proper format for use 
 28 | 	#with all the crossnorm normalization functions
 29 | 	
 30 | 	for(i in 1:length(datalist)){
 31 | 		if(is.matrix(datalist[[i]])){
 32 | 			datalist[[i]] <- as.data.frame(datalist[[i]])
 33 | 		}
 34 | 	}
 35 | 	
 36 | 	if (is.null(namesvec)){
 37 | 		namesvec <- numeric(length(datalist))
 38 | 		for (i in 1:length(datalist)){
 39 | 			namesvec[i] <- 0
 40 | 		}
 41 | 	}
 42 | 	
 43 | 	#Put the row names in their places
 44 | 	for (i in 1:length(namesvec)){
 45 | 		if(namesvec[i] != 0){
 46 | 			rownames(datalist[[i]]) = datalist[[i]][,namesvec[i]]
 47 | 			datalist[[i]] = datalist[[i]][,-1*namesvec[i],drop=FALSE]
 48 | 		}	
 49 | 	}
 50 | 
 51 | 	if(!skip.match){
 52 | 		#Create the common genes list
 53 | 		commongenes <- rownames(datalist[[1]])
 54 | 		for (i in 2:length(datalist)){
 55 | 			commongenes <- intersect(commongenes,rownames(datalist[[i]]))
 56 | 		}
 57 | 	
 58 | 	
 59 | 		#Put it all together
 60 | 		for (i in 1:length(datalist)){
 61 | 			datalist[[i]] <- datalist[[i]][commongenes,,drop=FALSE]
 62 | 		}
 63 | 	}
 64 | 	return(datalist)
 65 | }
 66 | 
 67 | 
 68 | assert = function(aBool){
 69 | 	if (!aBool){
 70 | 		stop("Assert failed!")
 71 | 	}	
 72 | }
 73 | 	
 74 | repmat = function(mat,m,n){
 75 | 	out = mat
 76 | 	if(m>1){
 77 | 	for (i in 2:m){
 78 | 		out = rbind(out,mat)
 79 | 	}
 80 | 	}
 81 | 	if(n>1){
 82 | 	mat = out
 83 | 	for (j in 2:n){
 84 | 		out = cbind(out,mat)
 85 | 	}
 86 | 	}
 87 | 	return(out)
 88 | }
 89 | ones = function(m,n){
 90 | 	return(matrix(1,nrow=m,ncol=n))
 91 | }
 92 | 
 93 | zeros = function(m,n){
 94 | 	return(matrix(0,nrow=m,ncol=n))
 95 | }
 96 | 
 97 | speye = function(m){
 98 | #This function is not actually sparse yet.  Needs to be fixed.
 99 | 
100 | 	return(diag(m))
101 | 	
102 | }
103 | 
104 | 
105 | 
106 | rowMedians = function(aMatrix, na.rm = FALSE){
107 | #Also works on data.frames
108 | 	m = dim(aMatrix)[1]
109 | 	ret = numeric(m)
110 | 	for (i in 1:m){
111 | 		ret[i] = mean(median(aMatrix[i,],na.rm=na.rm),na.rm=TRUE)
112 | 	}
113 | 	return(ret)
114 | }
115 | 
116 | randn = function(m,n){
117 | 	return(matrix(rnorm(m*n),m,n))	
118 | }
119 | 
120 | colRanks = function(aMatrix){
121 |     return(apply(aMatrix,2,rank))
122 | }
123 | 


--------------------------------------------------------------------------------
/MatchMixeR/R/gq.R:
--------------------------------------------------------------------------------
 1 | ######################################################################
 2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010
 3 | 
 4 | #This program is free software: you can redistribute it and/or modify
 5 | #it under the terms of the GNU General Public License as published by
 6 | #the Free Software Foundation, either version 3 of the License, or
 7 | #(at your option) any later version.
 8 | 
 9 | #This program is distributed in the hope that it will be useful,
10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 | #GNU General Public License for more details.
13 | 
14 | #You should have received a copy of the GNU General Public License
15 | #along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | ######################################################################
17 | 
18 | 
19 | ######################################################################
20 | #This file contains code adapted from the WebArrayDB program,
21 | #available from http://www.webarraydb.org/webarray/index.html  Please
22 | #cite appropriately when using these functions.  Type
23 | #citation("CONOR") at the R prompt for details.
24 | 
25 | ## Implementation of the "Gene Quantile" cross platform
26 | ## normalization algorithm, as available from webarraydb.
27 | ## The function gq() is a wrapper for normalizeGQ, which
28 | ## was provided by the authors of webarraydb.
29 | #'
30 | #'fit mixed effect regression model
31 | #'@param X gene expression matrix
32 | #'@param Y gene expression matrix
33 | #'@export
34 | gq = function(platform1.data, platform2.data, p1.names=0, p2.names=0, skip.match=FALSE){
35 | 	#This function is basically a wrapper for normalizeGQ
36 | 
37 | 	#Match names
38 | 	input = processplatforms(list(x=platform1.data,y=platform2.data),namesvec = c(p1.names, p2.names), skip.match=skip.match)
39 | 
40 | 	#Prepare for normalizeGQ
41 | 	combined = cbind(input$x,input$y)
42 | 	pf = c(seq(1,1,length.out=dim(input$x)[2]),seq(2,2,length.out=dim(input$y)[2]))
43 | 
44 | 	#Call normalizeGQ
45 | 	ngq = normalizeGQ(combined,pf)
46 | 
47 | 	#Split the results and return
48 | 	out=split(seq(pf),pf)
49 | 	out[[1]] = ngq[,out[[1]]]
50 | 	out[[2]] = ngq[,out[[2]]]
51 | 	names(out) <- c("x","y")
52 | 	return(out)
53 | }
54 | 
55 | 
56 | normalizeGQ <- function(M, pf, ...) {
57 | 	#This function was provided by Xiao-Qin Xia, one of the authors of
58 | 	#webarraydb.
59 |     # modified MRS
60 |     # M is the data matrix
61 |     # pf is the vector to specify the platform for each column of M.
62 |     idx <- split(seq(pf), pf)
63 |     if (length(pf)<=1) return(M)
64 |     imax <- which.max(sapply(idx, length)) # for reference
65 |     ref_med <- apply(M[, idx[[imax]]], 1, function(x) median(x, na.rm=TRUE))
66 |     ref_med_srt <- sort(ref_med)
67 |     idx[imax] <- NULL
68 |     lapply(idx, function(i) {
69 |          MTMP <- sapply(i, function(x) ref_med_srt[rank(M[,x])]);
70 |          M[,i] <<- MTMP - apply(MTMP, 1, median) + ref_med
71 |          } )
72 |     invisible(M)
73 | }
74 | 
75 | 


--------------------------------------------------------------------------------
/MatchMixeR/R/xpn.R:
--------------------------------------------------------------------------------
  1 | ######################################################################
  2 | #Copyright Jason Rudy & Faramarz Valafar 2009-2010
  3 | 
  4 | #This program is free software: you can redistribute it and/or modify
  5 | #it under the terms of the GNU General Public License as published by
  6 | #the Free Software Foundation, either version 3 of the License, or
  7 | #(at your option) any later version.
  8 | 
  9 | #This program is distributed in the hope that it will be useful,
 10 | #but WITHOUT ANY WARRANTY; without even the implied warranty of
 11 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12 | #GNU General Public License for more details.
 13 | 
 14 | #You should have received a copy of the GNU General Public License
 15 | #along with this program.  If not, see <http://www.gnu.org/licenses/>.
 16 | ######################################################################
 17 | 
 18 | #Implementation of the xpn cross platform normalization algorithm.
 19 | 
 20 | #'@param X gene expression matrix
 21 | #'@param Y gene expression matrix
 22 | #'@export
 23 | xpn = function(platform1.data,platform2.data,K=10,L=4,p1.names=0,p2.names=0,gene.cluster="kmeans",assay.cluster="kmeans",corr="pearson", iterations=30, skip.match=FALSE){
 24 | 	#If K or L is not a single value, it is taken as a list of possible values.
 25 | 
 26 | 	#Match names
 27 | 	input = processplatforms(list(x=platform1.data,y=platform2.data),namesvec = c(p1.names, p2.names), skip.match=skip.match)
 28 | 	x <- input$x
 29 | 	y <- input$y
 30 | 	input <- NA
 31 | 
 32 | 	#Remove the medians
 33 | 	x_demed = x - rowMedians(as.matrix(x))
 34 | 	y_demed = y - rowMedians(as.matrix(y))
 35 | 
 36 | 	#Get the dimensions
 37 | 	nx=dim(x)[2]
 38 | 	ny=dim(y)[2]
 39 | 	mx=dim(x)[1]
 40 | 	my=dim(y)[1]
 41 | 
 42 | 
 43 | 	#Create the combined dataframe for clustering purposes
 44 | 	combined <- cbind(x_demed,y_demed)
 45 | 
 46 | 	#Detect K and L if necessary
 47 | 	K = detect_K(combined,K,corr)
 48 | 	L = detect_L(x_demed,y_demed,L,corr)
 49 | 
 50 | 
 51 | 	xout = 0
 52 | 	yout = 0
 53 | 
 54 | 
 55 | 	for(iter in 1:iterations){
 56 | 	cat("XPN iteration",iter,"out of",iterations,"\n")
 57 | 
 58 | 	#Do the assay clustering
 59 | 	assayclusters = xpnassaycluster(x_demed, y_demed, L, assay.cluster, corr)
 60 | 
 61 | 	#Do the gene clustering
 62 | 	geneclusters = xpngenecluster(combined, K, gene.cluster, corr)
 63 | 
 64 | 	#Estimate the XPN parameters
 65 | 	xparams = xpnparamsp(x,geneclusters,assayclusters[1:nx])
 66 | 	yparams = xpnparamsp(y,geneclusters,assayclusters[(nx+1):(nx+ny)])
 67 | 
 68 | 	#Calcuate the weighted averages
 69 | 	nor = 1/(nx + ny)
 70 | 	Aav = try((1/(xparams$nblock+yparams$nblock))*(xparams$nblock*xparams$A + yparams$nblock*yparams$A))
 71 | 	bav = nor*(nx*xparams$b + ny*yparams$b)
 72 | 	cav = nor*(nx*xparams$c + ny*yparams$c)
 73 | 	sigmaav = sqrt(nor*(nx*xparams$s2 + ny*yparams$s2))
 74 | 	sigmax = sqrt(xparams$s2)
 75 | 	sigmay = sqrt(yparams$s2)
 76 | 
 77 | 
 78 | 	#Calculate the expanded A
 79 | 	expAx = xpnclusterexpand(xparams$A,geneclusters,assayclusters[1:nx])
 80 | 	expAy = xpnclusterexpand(yparams$A,geneclusters,assayclusters[(nx+1):(nx+ny)])
 81 | 
 82 | 	#Calculate the residuals
 83 | 	epsilonx = as.matrix(x) - (as.vector(xparams$b) * expAx + kronecker(ones(1,nx),xparams$c))
 84 | 	epsilony = as.matrix(y) - (as.vector(yparams$b) * expAy + kronecker(ones(1,ny),yparams$c))
 85 | 
 86 | 	#Calculate the expanded average A
 87 | 	expAavx = xpnclusterexpand(Aav,geneclusters,assayclusters[1:nx])
 88 | 	expAavy = xpnclusterexpand(Aav,geneclusters,assayclusters[(nx+1):(nx+ny)])
 89 | 
 90 | 	#Calculate the output values
 91 | 	xout = xout + (1/iterations)*((as.vector(bav) * expAavx) + kronecker(ones(1,nx),cav) + as.vector(sigmaav/sigmax) * epsilonx)
 92 | 	yout = yout + (1/iterations)*((as.vector(bav) * expAavy) + kronecker(ones(1,ny),cav) + as.vector(sigmaav/sigmay) * epsilony)
 93 | 
 94 | 	}#end of the enclosing for loop
 95 | 
 96 | 	#Put the rownames back in and convert to data frames
 97 | 	xout = as.data.frame(xout,row.names=rownames(x))
 98 | 	yout = as.data.frame(yout,row.names=rownames(y))
 99 | 
100 | 	#All done!
101 | 	return(list(x=xout,y=yout))
102 | }
103 | 
104 | xpnparamsp = function(x, geneclusters, assayclusters, method){
105 | 	#x should be a dataframe or hashdataframe
106 | 
107 | 	#Get K and L
108 | 	K=max(geneclusters)
109 | 	L=max(assayclusters)
110 | 
111 | 	#Get number of assays and genes
112 | 	numassays=length(x)
113 | 	numgenes=length(rownames(x))
114 | 
115 | 	#Number of assays in each cluster
116 | 	nj = matrix(table(as.factor(assayclusters)),1,L)
117 | 
118 | 	#Set up the output variables
119 | 	A = matrix(nrow=K,ncol=L)
120 | 	nblock = matrix(nrow=K,ncol=L)
121 | 	b = matrix(nrow=numgenes, ncol=1)
122 | 	c = matrix(nrow=numgenes, ncol=1)
123 | 	sigma = matrix(nrow=numgenes, ncol=1)
124 | 	mu = matrix(nrow=numgenes,ncol=L)
125 | 
126 | 
127 | 	#For each gene cluster, estimate the XPN parameters
128 | 	for (a in 1:K){
129 | 
130 | 		#Get the logical indexing vector for gene cluster a
131 | 		geneinds = geneclusters==a
132 | 		numgenesa = sum(as.numeric(geneinds))
133 | 
134 | 		#Get the data for this gene cluster
135 | 		xa = as.matrix(x[geneinds,])
136 | 
137 | 		#Get the number of genes in this cluster
138 | 		Ga = dim(xa)[1]
139 | 		S = dim(xa)[2]
140 | 
141 | 		#For each assay cluster and each gene, get the (relatively) unconstrained
142 | 		#MLE for mu
143 | 		mua = matrix(nrow=Ga,ncol=L)
144 | 		for (bb in 1:L){
145 | 			assayinds = assayclusters==bb
146 | 			xab = matrix(xa[,assayinds],numgenesa,sum(as.numeric(assayinds)))
147 | 			mua[,bb]=t(t(rowMeans(xab)))
148 | 		}
149 | 
150 | 
151 | 		#Calculate sigma2
152 | 		expmua = xpnclusterexpand(mua, colclusters = assayclusters)
153 | 		sigma2 = xa - expmua
154 | 		sigma2 = sigma2*sigma2
155 | 		sigma2 = xpnclustercollapse(sigma2,colclusters = assayclusters)
156 | 
157 | 		solna = XPN_MLE_C(mua,sigma2,nj)
158 | 
159 | 		ba = solna$b
160 | 		Aa = solna$A
161 | 		ca = solna$c
162 | 		s2a = solna$s2
163 | 
164 | 		#Write the values for this gene group into the larger arrays
165 | 		sigma[geneinds,] = s2a
166 | 		mu[geneinds,] = mua
167 | 		A[a,] = Aa
168 | 		nblock[a,] = nj
169 | 		b[geneinds,] = ba
170 | 		c[geneinds,] = ca
171 | 	}
172 | 
173 | 	return(list(A=A,nblock=nblock,b=b,c=c,s2=sigma,mu=mu))
174 | 
175 | }
176 | 
177 | 
178 | xpnclustercollapse = function(aMatrix, rowclusters = 1:(dim(aMatrix)[1]), colclusters = 1:(dim(aMatrix)[2])){
179 | 	#This undoes the work of xpnclusterexpand
180 | 
181 | 	l = dim(aMatrix)[1]
182 | 	w = dim(aMatrix)[2]
183 | 
184 | 	#collapse the rows
185 | 	rowidx = split(1:l,rowclusters)
186 | 	lc = length(rowidx)
187 | 	collapse1 = matrix(NA,lc,w)
188 | 	for ( i in 1:lc){
189 | 		collapse1[i,] =  colMeans(matrix(aMatrix[rowidx[[i]],],length(rowidx[[i]]),w))
190 | 	}
191 | 
192 | 	#collapse the columns
193 | 	colidx = split(1:w,colclusters)
194 | 	wc = length(colidx)
195 | 	collapse2 = matrix(NA,lc,wc)
196 | 	for ( i in 1:wc){
197 | 		collapse2[,i] =  rowMeans(matrix(collapse1[,colidx[[i]]],lc,length(colidx[[i]])))
198 | 	}
199 | 
200 | 	return(collapse2)
201 | 
202 | }
203 | 
204 | 
205 | xpnclusterexpand = function(aMatrix, rowclusters = 1:(dim(aMatrix)[1]), colclusters = 1:(dim(aMatrix)[2]), noise = 0){
206 | 	#This is basically a fancy repmat.  rowclusters and colclusters
207 | 	#as produced by pam.  aMatrix must have the same number of rows
208 | 	#and columns as there are row and column clusters.
209 | 
210 | 	l = dim(aMatrix)[1]
211 | 	w = dim(aMatrix)[2]
212 | 
213 | 	#Get the dimensions of the output
214 | 	m = length(rowclusters)
215 | 	n = length(colclusters)
216 | 
217 | 	#Get the number of row and column clusters
218 | 	nrowclust = max(rowclusters)
219 | 	ncolclust = max(colclusters)
220 | 
221 | 	#initialize the vertically expanded matrix
222 | 	vert = zeros(m,dim(aMatrix)[2])
223 | 
224 | 	#Do the vertical expansion
225 | 	for (i in 1:m){
226 | 		vert[i,]=aMatrix[rowclusters[i],] + (noise*randn(1,w))
227 | 	}
228 | 
229 | 	#initialize the fully expanded matrix
230 | 	output = matrix(NA,m,n)
231 | 
232 | 	#Do the horizontal expansion
233 | 	for (j in 1:n){
234 | 		output[,j] = vert[,colclusters[j]] + (noise*randn(m,1))
235 | 	}
236 | 
237 | 	#That is it!
238 | 	return(output)
239 | }
240 | 
241 | xpnassaycluster = function(x,y,L,cluster="kmeans",corr="pearson"){
242 | 
243 | 	if(is.numeric(cluster) | is.factor(cluster)){
244 | 		return(as.numeric(cluster))
245 | 	}
246 | 
247 | 	#The numbers of assays
248 | 	nx = dim(x)[2]
249 | 	ny = dim(y)[2]
250 | 
251 | 	#The number of genes
252 | 	m = dim(x)[1]
253 | 
254 | 	#Compute the two correlation matrices if necessary
255 | 	if (cluster == 'pam' | (length(L)>1)){
256 | 		xdiss = 1 - cor(as.matrix(x), method = corr)
257 | 		ydiss = 1 - cor(as.matrix(y), method = corr)
258 | 	}
259 | 
260 | 	#Determine L if a range has been given
261 | 	if(length(L)>1){
262 | 		Lx=pamk(xdiss,krange=L,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)$nc
263 | 		Ly = pamk(ydiss,krange=L,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)$nc
264 | 		L = max(Lx,Ly)
265 | 		cat(as.character(L), "assay clusters detected...\n")
266 | 	}
267 | 
268 | 	#Do the clustering
269 | 	if (cluster == "classic"){
270 | 
271 | 		done = FALSE
272 | 		while (!done){
273 | 			xyclust <- ckmeans(t(cbind(x,y)),centers=L, iter.max=20, distance=corr)
274 | 			#print(xyclust$cluster)
275 | 			xclust <- xyclust$cluster[1:nx]
276 | 			yclust <- xyclust$cluster[(nx+1):(nx+ny)]
277 | 			#print(xclust)
278 | #			print(yclust)
279 | #			print(nlevels(as.factor(xclust)))
280 | #			print(nlevels(as.factor(yclust)))
281 | 			done <- nlevels(as.factor(xclust)) == L & nlevels(as.factor(yclust)) == L
282 | 			if(!done){
283 | 				cat("Not all platforms contained all assay clusters.  Trying again (only \"classic\" mode has this problem)...\n")
284 | 			}
285 | 		}
286 | 		return(xyclust$cluster)
287 | 	}
288 | 	else if (cluster == "pam"){
289 | 		xclust = pam(xdiss, k=L, diss=TRUE, keep.diss=FALSE, keep.data=FALSE,cluster.only=TRUE)
290 | 		yclust = pam(ydiss, k=L, diss=TRUE, keep.diss=FALSE, keep.data=FALSE,cluster.only=TRUE)
291 | 	}else if (cluster == "kmeans"){
292 | 
293 | 		#This loop ensures there are no empty clusters.
294 | 		done = FALSE
295 | 		while (!done){
296 | 			xclust = ckmeans(t(x), centers=L, iter.max = 20, distance = corr)
297 | 			yclust = ckmeans(t(y), centers=L, iter.max = 20, distance = corr)
298 | 
299 | 			done = !xclust$empty & !yclust$empty
300 | 		}
301 | 		xclust = xclust$cluster
302 | 		yclust = yclust$cluster
303 | 
304 | 	} else if (cluster == "flexclust"){
305 | 
306 | 		distCor1 = function (x, centers)
307 | 		{
308 |     		z <- matrix(0, nrow(x), ncol = nrow(centers))
309 |     		for (k in 1:nrow(centers)) {
310 |         		z[, k] <- 1 - cor(t(x), centers[k, ], method=corr)
311 |     		}
312 |     		z
313 | 		}
314 | 		#This loop ensures there are no empty clusters.
315 | 		done = FALSE
316 | 		while (!done){
317 | 		xclust = kcca(t(x), k=L, family=kccaFamily(dist=distCor1,cent="centMean"), simple=TRUE)
318 | 		yclust = kcca(t(y), k=L, family=kccaFamily(dist=distCor1,cent="centMean"), simple=TRUE)
319 | 
320 | 			done = (dim(xclust@clusinfo)[1] == L) &(dim(yclust@clusinfo)[1] == L)
321 | 		}
322 | 		xclust = xclust@cluster
323 | 		yclust = yclust@cluster
324 | 	}else if(cluster == "random"){
325 | 			xclust = sample(c(1:L,sample(1:L,nx-L,replace = TRUE)),nx,replace=FALSE)
326 | 			yclust = sample(c(1:L,sample(1:L,ny-L,replace = TRUE)),ny,replace=FALSE)
327 | 	}
328 | 
329 | 	#Compute cluster averages
330 | 	xave = matrix(NA,m,L)
331 | 	yave = matrix(NA,m,L)
332 | 	for (i in 1:L){
333 | 		xinds = xclust==i
334 | 		yinds = yclust==i
335 | 		xave[,i] = rowMeans(as.matrix(x[,xinds],nrow=m,ncol=sum(as.numeric(xinds))))
336 | 		yave[,i] = rowMeans(as.matrix(y[,yinds],nrow=m,ncol=sum(as.numeric(yinds))))
337 | 	}
338 | 
339 | 	#Compute the cluster correlation matrix
340 | 	clustercor = cor(xave,yave, method = corr)
341 | 
342 | 	#Map clusters
343 | 	xtoymap = matrix(NA,L,1)
344 | 	for (i in 1:L){
345 | 		highest = which.max(clustercor)
346 | 		xind = highest%%L
347 | 
348 | 		yind = highest%/%L + 1
349 | 
350 | 		if (xind==0){
351 | 			xind = L
352 | 			yind = yind-1
353 | 		}
354 | 		xtoymap[xind]=yind
355 | 		clustercor[xind,] = -2
356 | 		clustercor[,yind] = -2
357 | 	}
358 | 
359 | 	#Change the x clusters using the map
360 | 	newxclust = xclust
361 | 	for (i in 1:L){
362 | 		newxclust[xclust==i] = xtoymap[i]
363 | 	}
364 | 	xclust = newxclust
365 | 
366 | 	#Return the clustering in a combined vector
367 | 	return(c(xclust,yclust))
368 | }
369 | 
370 | 
371 | xpngenecluster = function(x,K,cluster="pam",corr="pearson"){
372 | 
373 | 	#Dimensions
374 | 	m = dim(x)[1]
375 | 	n = dim(x)[2]
376 | 
377 | 	#Compute the dissimilarity matrix
378 | 	if (cluster=="pam"|(length(K)>1)){
379 | 		xdiss = 1 - cor(t(x),method=corr)
380 | 	}
381 | 
382 | 	#Determine K if a range was given
383 | 	if(length(K)>1 ){
384 | 		pamklist=pamk(xdiss,krange=K,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)
385 | 		K = pamklist$nc
386 | 		genepamobj = pamklist$pamobject
387 | 	}else{
388 | 		genepamobj = NULL
389 | 	}
390 | 
391 | 
392 | 	#Do the gene clustering
393 | 	if(cluster=="pam"){
394 | 		if (!is.null(genepamobj)){
395 | 			geneclusters = genepamobj$clustering
396 | 		}else{
397 | 			geneclusters = pam(xdiss, k=K, diss=TRUE, keep.diss=FALSE, keep.data=FALSE, cluster.only=TRUE)
398 | 		}
399 | 	}else if(cluster=="kmeans"){
400 | 		#This loop ensures there are no empty clusters.
401 | 		done = FALSE
402 | 		while (!done){
403 | 			geneclusters = ckmeans(x,centers=K,iter.max=1000,distance=corr)
404 | 			done = !geneclusters$empty
405 | 		}
406 | 		geneclusters = geneclusters$cluster
407 | 	}else if(cluster=="flexclust"){
408 | 
409 | 
410 | 		distCor1 = function (x, centers) #change
411 | 		{
412 |     		z <- matrix(0, nrow(x), ncol = nrow(centers))
413 |     		for (k in 1:nrow(centers)) {
414 |         		z[, k] <- 1 - cor(t(x), centers[k, ], method=corr)
415 |     		}
416 |     		z
417 | 		}
418 | 		#This loop ensures there are no empty clusters.
419 | 		done = FALSE
420 | 		while (!done){
421 | 			geneclusters = kcca(x, k=K, family=kccaFamily(dist=distCor1,cent="centMean"), simple=TRUE) #change
422 | 			#geneclusters = ckmeans(x,centers=K,iter.max=1000,distance=corr)
423 | 			done = (dim(geneclusters@clusinfo)[1]==K)#change
424 | 		}
425 | 		geneclusters = geneclusters@cluster
426 | 	}else if (cluster == "random"){
427 | 		geneclusters = sample(c(1:K,sample(1:K,m-K,replace = TRUE)),m,replace=FALSE)
428 | 	}else{
429 | 		cat("Unknown gene clustering method:",cluster,"\n")
430 | 	}
431 | 
432 | 	return(geneclusters)
433 | 
434 | }
435 | 
436 | 
437 | XPN_MLE_C = function(xbar,sigma2,nj){
438 | 
439 | 
440 | 	n = sum(nj)
441 | 	I = dim(xbar)[1]
442 | 	J = dim(xbar)[2]
443 | 
444 | 	A = zeros(1,J)
445 | 	b = ones(I,1)
446 | 	c = zeros(I,1)
447 | 	s2 = ones(I,1)
448 | 
449 | 	cout = .C("XPN_MLE_C_C",as.double(xbar),A=as.double(A),b=as.double(b),c=as.double(c),s2=as.double(s2),as.double(sigma2),as.integer(I),as.integer(J),as.integer(n),as.integer(nj))
450 | 
451 | 	A = matrix(cout$A,1,J)
452 | 	b = matrix(cout$b,I,1)
453 | 	c = matrix(cout$c,I,1)
454 | 	s2 = matrix(cout$s2,I,1)
455 | 
456 | 	return(list(A=A,b=b,c=c,s2=s2))
457 | }
458 | 
459 | XPN_MLE = function(xbar,sigma2,nj){
460 | 	#This function is never used.  It serves to illustrate the method of maximum likelihood estimation used by XPN.  It has been replaced by XPN_MLE_C, which calls a more efficient C function.  The inputs and outputs of these functions are identical.
461 | 
462 | 	n = sum(nj)
463 | 	I = dim(xbar)[1]
464 | 	J = dim(xbar)[2]
465 | 
466 | 	A = zeros(1,J)
467 | 	b = ones(I,1)
468 | 	c = zeros(I,1)
469 | 	s2 = ones(I,1)
470 | 
471 | 	old = c(A, b, c, s2)*0
472 | 	iter = 0
473 | 	while(sum((c(A, b, c, s2)-old)^2)>(1e-16)*max(old)){
474 | 
475 | 		print(iter<-iter+1)
476 | 		print(sum((c(A, b, c, s2)-old)^2))
477 | 		old = c(A, b, c, s2)
478 | 
479 | 		c = matrix(rowSums((xbar-repmat(b,1,J)*repmat(A,I,1))*repmat(nj,I,1))/n,I,1)
480 | 
481 | 		if(sum(b)<0){
482 | 			b = -1*b;
483 | 		}
484 | 
485 | 		A = matrix(colSums(repmat(b,1,J)*(xbar-repmat(c,1,J))/repmat(s2,1,J)/sum(b^2/s2)),1,J)
486 | 
487 | 		A = A - mean(A)
488 | 		A = A * sqrt(J/sum(A^2))
489 | 
490 | 		b = matrix(rowSums(repmat(A,I,1)*(xbar-repmat(c,1,J))*repmat(nj,I,1))/sum(A^2 * nj),I,1)
491 | 
492 | 
493 | 		s2 = matrix(rowSums(((xbar-repmat(c,1,J)-repmat(A,I,1)*repmat(b,1,J))^2 + sigma2)*repmat(nj,I,1))/n,I,1)
494 | 
495 | 
496 | 
497 | 		s2[s2==0] = 2.2251e-308
498 | 
499 | 
500 | 
501 | 	}
502 | 
503 | 	return(list(A=A,b=b,c=c,s2=s2))
504 | }
505 | 
506 | 
507 | 
508 | detect_L = function(x,y,L,corr="pearson"){
509 | 
510 | 	#Determine L if a range has been given
511 | 	if(length(L)>1 & length(L) != (dim(x)[2]+dim(y)[2])){
512 | 		xdiss = 1 - cor(as.matrix(x), method = corr)
513 | 		ydiss = 1 - cor(as.matrix(y), method = corr)
514 | 		Lx=pamk(xdiss,krange=L,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)$nc
515 | 		Ly = pamk(ydiss,krange=L,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)$nc
516 | 		L = max(Lx,Ly)
517 | 		cat(as.character(L), "assay clusters detected...\n")
518 | 	}
519 | 	return(L)
520 | }
521 | 
522 | detect_K = function(x,K,corr="pearson"){
523 | 
524 | 	#Determine K if a range was given
525 | 	if(length(K)>1 & length(K) != dim(x)[1]){
526 | 		xdiss = 1 - cor(t(x),method=corr)
527 | 		pamklist=pamk(xdiss,krange=K,diss=TRUE, keep.diss=FALSE, keep.data=FALSE)
528 | 		K = pamklist$nc
529 | 		genepamobj = pamklist$pamobject
530 | 	}else{
531 | 		genepamobj = NULL
532 | 	}
533 | 	return(K)
534 | }
535 | 


--------------------------------------------------------------------------------
/MatchMixeR/Results of devtools check.txt:
--------------------------------------------------------------------------------
  1 | ==> devtools::check()
  2 | 
  3 | Updating MatchMixeR documentation
  4 | Writing NAMESPACE
  5 | Loading MatchMixeR
  6 | Writing NAMESPACE
  7 | Warning: The existing 'MM.Rd' file was not generated by roxygen2, and will not be overwritten.
  8 | -- Building ------------ MatchMixeR --
  9 | Setting env vars:
 10 | * CFLAGS    : -Wall -pedantic -fdiagnostics-color=always
 11 | * CXXFLAGS  : -Wall -pedantic -fdiagnostics-color=always
 12 | * CXX11FLAGS: -Wall -pedantic -fdiagnostics-color=always
 13 | --------------------------------------
 14 | v  checking for file 'C:\Disa_Research\MatchMixeR/DESCRIPTION' ... 
 15 | -  preparing 'MatchMixeR':
 16 | v  checking DESCRIPTION meta-information ... 
 17 | -  installing the package to build vignettes
 18 | v  creating vignettes (3.5s)
 19 | -  checking for LF line-endings in source and make files and shell scripts
 20 | -  checking for empty or unneeded directories
 21 | -  looking to see if a 'data/datalist' file should be added
 22 | -  building 'MatchMixeR_0.1.1.tar.gz'
 23 |    
 24 | -- Checking ------------ MatchMixeR --
 25 | Setting env vars:
 26 | * _R_CHECK_CRAN_INCOMING_USE_ASPELL_: TRUE
 27 | * _R_CHECK_CRAN_INCOMING_REMOTE_    : FALSE
 28 | * _R_CHECK_CRAN_INCOMING_           : FALSE
 29 | * _R_CHECK_FORCE_SUGGESTS_          : FALSE
 30 | -- R CMD check -----------------------------------------------------------------
 31 | -  using log directory 'C:/Disa_Research/MatchMixeR.Rcheck'
 32 | -  using R version 3.5.1 (2018-07-02)
 33 | -  using platform: x86_64-w64-mingw32 (64-bit)
 34 | -  using session charset: ISO8859-1
 35 | -  using options '--no-manual --as-cran'
 36 | v  checking for file 'MatchMixeR/DESCRIPTION'
 37 | -  checking extension type ... Package
 38 | -  this is package 'MatchMixeR' version '0.1.1'
 39 | -  package encoding: UTF-8
 40 | v  checking package namespace information ...
 41 | v  checking package dependencies (2.6s)
 42 | v  checking if this is a source package ...
 43 | v  checking if there is a namespace
 44 | v  checking for executable files (403ms)
 45 | v  checking for hidden files and directories ...
 46 | v  checking for portable file names ...
 47 | v  checking serialization versions
 48 | v  checking whether package 'MatchMixeR' can be installed (2.7s)
 49 | v  checking installed package size ... 
 50 | v  checking package directory
 51 | v  checking 'build' directory ...
 52 | N  checking DESCRIPTION meta-information ... 
 53 |    Malformed Description field: should contain one or more complete sentences.
 54 | v  checking top-level files
 55 | v  checking for left-over files
 56 | v  checking index information ... 
 57 | v  checking package subdirectories ... 
 58 | v  checking R files for non-ASCII characters ... 
 59 | v  checking R files for syntax errors ... 
 60 | v  checking whether the package can be loaded ... 
 61 | v  checking whether the package can be loaded with stated dependencies ... 
 62 | v  checking whether the package can be unloaded cleanly ... 
 63 | v  checking whether the namespace can be loaded with stated dependencies ... 
 64 | v  checking whether the namespace can be unloaded cleanly ... 
 65 | v  checking loading without being on the library search path ... 
 66 | v  checking dependencies in R code ... 
 67 | v  checking S3 generic/method consistency (515ms)
 68 | v  checking replacement functions ... 
 69 | W  checking foreign function calls (408ms)
 70 |    Foreign function call without 'PACKAGE' argument:
 71 |      .C("XPN_MLE_C_C", ...)
 72 |    See chapter 'System and foreign language interfaces' in the 'Writing R
 73 |    Extensions' manual.
 74 | N  checking R code for possible problems (5.3s)
 75 |    ComBat: no visible binding for global variable 'var'
 76 |    ComBat: no visible global function definition for 'par'
 77 |    ComBat: no visible global function definition for 'density'
 78 |    ComBat: no visible global function definition for 'plot'
 79 |    ComBat: no visible global function definition for 'lines'
 80 |    ComBat: no visible global function definition for 'dnorm'
 81 |    ComBat: no visible global function definition for 'qqnorm'
 82 |    ComBat: no visible global function definition for 'qqline'
 83 |    ComBat: no visible global function definition for 'rgamma'
 84 |    ComBat: no visible global function definition for 'qqplot'
 85 |    ComBat: no visible global function definition for 'title'
 86 |    DWD1SM: no visible global function definition for 'rdist'
 87 |    DWD1SM: no visible global function definition for 'median'
 88 |    L: no visible global function definition for 'dnorm'
 89 |    MM: no visible global function definition for 'cov'
 90 |    aprior: no visible global function definition for 'var'
 91 |    bprior: no visible global function definition for 'var'
 92 |    detect_K: no visible global function definition for 'cor'
 93 |    detect_K: no visible global function definition for 'pamk'
 94 |    detect_L: no visible global function definition for 'cor'
 95 |    detect_L: no visible global function definition for 'pamk'
 96 |    randn: no visible global function definition for 'rnorm'
 97 |    rowMedians: no visible global function definition for 'median'
 98 |    sepelimdwd: no visible global function definition for 'socp'
 99 |    xpnassaycluster: no visible global function definition for 'cor'
100 |    xpnassaycluster: no visible global function definition for 'pamk'
101 |    xpnassaycluster: no visible global function definition for 'ckmeans'
102 |    xpnassaycluster: no visible global function definition for 'pam'
103 |    xpnassaycluster : distCor1: no visible global function definition for
104 |      'cor'
105 |    xpnassaycluster: no visible global function definition for 'kcca'
106 |    xpnassaycluster: no visible global function definition for 'kccaFamily'
107 |    xpngenecluster: no visible global function definition for 'cor'
108 |    xpngenecluster: no visible global function definition for 'pamk'
109 |    xpngenecluster: no visible global function definition for 'pam'
110 |    xpngenecluster: no visible global function definition for 'ckmeans'
111 |    xpngenecluster : distCor1: no visible global function definition for
112 |      'cor'
113 |    xpngenecluster: no visible global function definition for 'kcca'
114 |    xpngenecluster: no visible global function definition for 'kccaFamily'
115 |    Undefined global functions or variables:
116 |      ckmeans cor cov density dnorm kcca kccaFamily lines median pam pamk
117 |      par plot qqline qqnorm qqplot rdist rgamma rnorm socp title var
118 |    Consider adding
119 |      importFrom("graphics", "lines", "par", "plot", "title")
120 |      importFrom("stats", "cor", "cov", "density", "dnorm", "median",
121 |                 "qqline", "qqnorm", "qqplot", "rgamma", "rnorm", "var")
122 |    to your NAMESPACE file.
123 | v  checking Rd files ... 
124 | v  checking Rd metadata ... 
125 | v  checking Rd line widths ... 
126 | v  checking Rd cross-references ... 
127 | v  checking for missing documentation entries ... 
128 | v  checking for code/documentation mismatches (625ms)
129 | W  checking Rd \usage sections ... 
130 |    Undocumented arguments in documentation object 'MatchMixeR'
131 |      'Xmat' 'Ymat'
132 |    Documented arguments not in \usage in documentation object 'MatchMixeR':
133 |      'X' 'Y'
134 |    
135 |    Functions with \usage entries need to have the appropriate \alias
136 |    entries, and all their arguments documented.
137 |    The \usage entries must correspond to syntactically valid R code.
138 |    See chapter 'Writing R documentation files' in the 'Writing R
139 |    Extensions' manual.
140 | v  checking Rd contents (613ms)
141 | v  checking for unstated dependencies in examples ... 
142 | v  checking contents of 'data' directory
143 | v  checking data for non-ASCII characters ... 
144 | W  checking data for ASCII and uncompressed saves (3s)
145 |      
146 |      Note: significantly better compression could be obtained
147 |            by using R CMD build --resave-data
148 |                       old_size new_size compress
149 |      gpl570_gpl96.rda    2.7Mb    2.3Mb       xz
150 | v  checking installed files from 'inst/doc' ... 
151 | v  checking files in 'vignettes' ... 
152 | v  checking examples (559ms)
153 | v  checking for unstated dependencies in vignettes ... 
154 | v  checking package vignettes in 'inst/doc' ...
155 | v  checking re-building of vignette outputs (878ms)
156 |    
157 |    See
158 |      'C:/Disa_Research/MatchMixeR.Rcheck/00check.log'
159 |    for details.
160 |    
161 |    
162 | -- R CMD check results ----------------------------------- MatchMixeR 0.1.1 ----
163 | Duration: 23.5s
164 | 
165 | > checking foreign function calls ... WARNING
166 |   Foreign function call without 'PACKAGE' argument:
167 |     .C("XPN_MLE_C_C", ...)
168 |   See chapter 'System and foreign language interfaces' in the 'Writing R
169 |   Extensions' manual.
170 | 
171 | > checking Rd \usage sections ... WARNING
172 |   Undocumented arguments in documentation object 'MatchMixeR'
173 |     'Xmat' 'Ymat'
174 |   Documented arguments not in \usage in documentation object 'MatchMixeR':
175 |     'X' 'Y'
176 |   
177 |   Functions with \usage entries need to have the appropriate \alias
178 |   entries, and all their arguments documented.
179 |   The \usage entries must correspond to syntactically valid R code.
180 |   See chapter 'Writing R documentation files' in the 'Writing R
181 |   Extensions' manual.
182 | 
183 | > checking data for ASCII and uncompressed saves ... WARNING
184 |     
185 |     Note: significantly better compression could be obtained
186 |           by using R CMD build --resave-data
187 |                      old_size new_size compress
188 |     gpl570_gpl96.rda    2.7Mb    2.3Mb       xz
189 | 
190 | > checking DESCRIPTION meta-information ... NOTE
191 |   Malformed Description field: should contain one or more complete sentences.
192 | 
193 | > checking R code for possible problems ... NOTE
194 |   ComBat: no visible binding for global variable 'var'
195 |   ComBat: no visible global function definition for 'par'
196 |   ComBat: no visible global function definition for 'density'
197 |   ComBat: no visible global function definition for 'plot'
198 |   ComBat: no visible global function definition for 'lines'
199 |   ComBat: no visible global function definition for 'dnorm'
200 |   ComBat: no visible global function definition for 'qqnorm'
201 |   ComBat: no visible global function definition for 'qqline'
202 |   ComBat: no visible global function definition for 'rgamma'
203 |   ComBat: no visible global function definition for 'qqplot'
204 |   ComBat: no visible global function definition for 'title'
205 |   DWD1SM: no visible global function definition for 'rdist'
206 |   DWD1SM: no visible global function definition for 'median'
207 |   L: no visible global function definition for 'dnorm'
208 |   MM: no visible global function definition for 'cov'
209 |   aprior: no visible global function definition for 'var'
210 |   bprior: no visible global function definition for 'var'
211 |   detect_K: no visible global function definition for 'cor'
212 |   detect_K: no visible global function definition for 'pamk'
213 |   detect_L: no visible global function definition for 'cor'
214 |   detect_L: no visible global function definition for 'pamk'
215 |   randn: no visible global function definition for 'rnorm'
216 |   rowMedians: no visible global function definition for 'median'
217 |   sepelimdwd: no visible global function definition for 'socp'
218 |   xpnassaycluster: no visible global function definition for 'cor'
219 |   xpnassaycluster: no visible global function definition for 'pamk'
220 |   xpnassaycluster: no visible global function definition for 'ckmeans'
221 |   xpnassaycluster: no visible global function definition for 'pam'
222 |   xpnassaycluster : distCor1: no visible global function definition for
223 |     'cor'
224 |   xpnassaycluster: no visible global function definition for 'kcca'
225 |   xpnassaycluster: no visible global function definition for 'kccaFamily'
226 |   xpngenecluster: no visible global function definition for 'cor'
227 |   xpngenecluster: no visible global function definition for 'pamk'
228 |   xpngenecluster: no visible global function definition for 'pam'
229 |   xpngenecluster: no visible global function definition for 'ckmeans'
230 |   xpngenecluster : distCor1: no visible global function definition for
231 |     'cor'
232 |   xpngenecluster: no visible global function definition for 'kcca'
233 |   xpngenecluster: no visible global function definition for 'kccaFamily'
234 |   Undefined global functions or variables:
235 |     ckmeans cor cov density dnorm kcca kccaFamily lines median pam pamk
236 |     par plot qqline qqnorm qqplot rdist rgamma rnorm socp title var
237 |   Consider adding
238 |     importFrom("graphics", "lines", "par", "plot", "title")
239 |     importFrom("stats", "cor", "cov", "density", "dnorm", "median",
240 |                "qqline", "qqnorm", "qqplot", "rgamma", "rnorm", "var")
241 |   to your NAMESPACE file.
242 | 
243 | 0 errors v | 3 warnings x | 2 notes x
244 | Error: R CMD check found WARNINGs
245 | Execution halted
246 | 
247 | Exited with status 1.
248 | 


--------------------------------------------------------------------------------
/MatchMixeR/data/gpl570_gpl96.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dy16b/Cross-Platform-Normalization/83619fd3bc744ee6e52e621dc7d206defb222e5b/MatchMixeR/data/gpl570_gpl96.rda


--------------------------------------------------------------------------------
/MatchMixeR/man/CONOR.Rd:
--------------------------------------------------------------------------------
  1 | \name{MatchMixeR}
  2 | \alias{CONOR}
  3 | \alias{dwd}
  4 | \alias{eb}
  5 | \alias{xpn}
  6 | \alias{gq}
  7 | 
  8 | \title{
  9 | Functions for cross-platform normalization of microarray data, from the CONOR package.
 10 | }
 11 | \description{
 12 | The functions below perform cross-platform normalization of microarray data, and come from the CONOR package. Dwd is distance weighted discrimination. Eb is empirical bayes. Xpn is cross-platform normalization. The dwd function requires the package rdist.
 13 | The xpn function requires the package conclust.
 14 | }
 15 | 
 16 | \usage{
 17 | 
 18 | dwd(platform1.data, platform2.data, platform1.train = NULL,
 19 |     platform2.train = NULL, p1.names = 0, p2.names = 0, p1.train.names = 0,
 20 |     p2.train.names = 0, skip.match = FALSE, use.sparse = TRUE)
 21 | 
 22 | eb(platform1.data, platform2.data, par.prior = TRUE, filter = FALSE,
 23 | 	prior.plots = FALSE, p1.names = 0, p2.names = 0, skip.match=FALSE)
 24 | 
 25 | xpn(platform1.data, platform2.data, K = 10, L = 4, p1.names = 0,
 26 |     p2.names = 0, gene.cluster = "kmeans", assay.cluster = "kmeans",
 27 |     corr = "pearson", iterations = 30, skip.match = FALSE)
 28 | 
 29 | gq(platform1.data, platform2.data, p1.names = 0, p2.names = 0, skip.match=FALSE)
 30 | }
 31 | %- maybe also 'usage' for other objects documented here.
 32 | 
 33 | \arguments{
 34 |   \item{platform1.data}{
 35 | Expression data from platform 1.  Should be formatted as a data.frame, with each column representing an array and each row a gene.
 36 | }
 37 |   \item{platform2.data}{
 38 | Expression data from platform 2.  Should be formatted as a data.frame, with each column representing an array and each row a gene.
 39 | }
 40 |   \item{p1.names}{
 41 | One column of \code{platform1.data} may contain gene names.  The column number containing the gene names should be specified as \code{p1.names}.  If \code{p1.names} is zero, the rownames attribute will be used.  Default value is zero.  The gene names for platform 1 should correspond to the gene names for platform 2.
 42 | }
 43 |   \item{p2.names}{
 44 | One column of \code{platform2.data} may contain gene names.  The column number containing the gene names should be specified as \code{p2.names}.  If \code{p2.names} is zero, the rownames attribute will be used.  Default value is zero.  The gene names for platform 2 should correspond to the gene names for platform 1.
 45 | }
 46 | 
 47 |   \item{platform1.train}{Training data set for use with DWD.}
 48 |   \item{platform2.train}{Training data set for use with DWD.}
 49 |   \item{p1.train.names}{The column number containing the gene names for platform 1 training data.}
 50 |   \item{p2.train.names}{The column number containing the gene names for platform 2 training data.}
 51 |   \item{cluster}{Clustering method used by distran.}
 52 |   \item{par.prior}{Parameter for eb.  \code{par.prior} specifies whether to use a parametric or non-parametric prior distribution.}
 53 |   \item{filter}{Parameter for eb.}
 54 |   \item{prior.plots}{Generate prior plots for eb method.}
 55 | 
 56 | 
 57 |   \item{skip.match}{
 58 | If \code{skip.match} is \code{FALSE}, rows of \code{platform1.data} and \code{platform2.data} will be matched using gene names.  This process uses R's built-in \code{intercept} function, which can be quite inefficient.  If the rows of \code{platform1.data} and \code{platform2.data} already match, time can be saved by setting \code{skip.match} to \code{TRUE}.
 59 | }
 60 | 
 61 |   \item{use.sparse}{
 62 | For dwd only.  Can be set to \code{TRUE} or \code{FALSE}.  Determines whether dwd uses sparse matrices (via the Matrix package) for its internal calculations. Sparse matrix calculations are more efficient for large problems, but will not affect the final output.
 63 | }
 64 | 
 65 |   \item{gene.cluster}{
 66 | For \code{xpn} only, \code{gene.cluster} specifies the gene clustering method to be used.  Options are "kmeans", "pam", and "flexclust".  Only "kmeans" is practical for large numbers or genes.
 67 | }
 68 |   \item{assay.cluster}{
 69 | For \code{xpn} and \code{distran} only, \code{assay.cluster} specifies the assay clustering method to be used. Options are "classic", "kmeans", "pam", and "flexclust".
 70 | }
 71 |   \item{corr}{
 72 | For \code{xpn} and \code{distran} only, \code{corr} is the type of correlation to use as a distance measure for sample or gene clustering.  Ignored for the "kmeans" and "classic" clustering options, for which only Pearson's correlation is available.
 73 | }
 74 |   \item{iterations}{
 75 | For \code{xpn} only, \code{iterations} gives the number of iterations of the XPN algorithm to perform.
 76 | }
 77 | 
 78 |   \item{K}{
 79 | For \code{xpn} only, \code{K} is the number of gene clusters to use.  Must be an integer.  If a vector of more than one integer is given, the best value will be selected by the silhouette based method of the pamk function from the fpc package.  This can be extremely slow for large numbers of genes.
 80 | }
 81 | 
 82 |   \item{L}{
 83 | For \code{xpn} and \code{distran} only, \code{L} is the number of assay clusters to use.  Must be an integer.  If a vector of more than one integer is given, the best value will be selected by a silhouette based method using the pamk function from the fpc package.
 84 | }
 85 | 
 86 | }
 87 | 
 88 | 
 89 | \value{
 90 | %%  ~Describe the value returned
 91 | %%  If it is a LIST, use
 92 | %%  \item{comp1 }{Description of 'comp1'}
 93 | %%  \item{comp2 }{Description of 'comp2'}
 94 | %% ...
 95 | \item{x}{Normalized data from platform 1.}
 96 | \item{y}{Normalized data from platform 2.}
 97 | \item{p1.adjust}{For dwd only, vector of platform effects removed from the platform 1 data.}
 98 | \item{p2.adjust}{For dwd only, vector of platform effects removed from the platform 2 data.}
 99 | }
100 | \references{
101 | 
102 | Benito et al. Adjustment of systematic microarray data biases. Bioinformatics (2004) vol. 20 (1) pp. 105
103 | 
104 | Bolstad et al. A comparison of normalization methods for high density oligonucleotide array data based on variance and bias. Bioinformatics (2003) vol. 19 (2) pp. 185
105 | 
106 | Jiang et al. Joint analysis of two microarray gene-expression data sets to select lung adenocarcinoma marker genes. BMC bioinformatics (2004) vol. 5 pp. 81
107 | 
108 | Martinez et al. GenMiner: mining informative association rules from genomic data. Proceeding of the IEEE International Conference on Binformatics and Biomedicine. (2007) pp. 15-22
109 | 
110 | Shabalin et al. Merging two gene-expression studies via cross-platform normalization. Bioinformatics (2008) vol. 24 (9) pp. 1154
111 | 
112 | Shi et al. The MicroArray Quality Control (MAQC) project shows inter-and intraplatform reproducibility of gene expression measurements. Nature biotechnology (2006) vol. 24 (9) pp. 1151-1161
113 | 
114 | Walker et al. Empirical Bayes accomodation of batch-effects in microarray data using identical replicate reference samples: application to RNA expression profiling of blood .... BMC bioinformatics (2008)
115 | 
116 | Warnat et al. Cross-platform analysis of cancer microarray data improves gene expression based classification of phenotypes. BMC bioinformatics (2005) vol. 6 pp. 265
117 | 
118 | }
119 | \author{
120 | Jason Rudy and Faramarz Valafar
121 | }
122 | 
123 | 
124 | 
125 | \examples{
126 | fit_dwd <- dwd(gpl96,gpl570)
127 | fit_eb <- eb(gpl96,gpl570)
128 | fit_xpn <- xpn(gpl96,gpl570)
129 | fit_gq <- gp(gpl96,gpl570)
130 | }
131 | 
132 | \keyword{MatchMixeR}
133 | \keyword{dwd}
134 | \keyword{eb}
135 | \keyword{xpn}
136 | \keyword{distance weighted discrimination}
137 | \keyword{empirical bayes}
138 | \keyword{cross-platform normalization}
139 | 
140 | 


--------------------------------------------------------------------------------
/MatchMixeR/man/MM.Rd:
--------------------------------------------------------------------------------
 1 | \name{MatchMixeR}
 2 | \alias{MM}
 3 | 
 4 | \title{MatchMixeR: A cross-platform normalization procedure for matched samples.}
 5 | 
 6 | \description{
 7 |   Matched sample based cross-platform normalization function using mixed effect regression
 8 | }
 9 | \usage{MM(Xmat, Ymat)}
10 | 
11 | \arguments{
12 |   \item{X}{gene expression level matrix on platform X (to be transformed)}
13 | 
14 |   \item{Y}{gene expression level matrix on platform Y ( not to be transformed)}
15 | }
16 | 
17 | \details{
18 |   This function fits mixed effect regression model.
19 | }
20 | 
21 | \value{8 lists including betahat, betamat, Yhat}
22 | 
23 | \author{Xing Qiu}
24 | 
25 | \examples{
26 | fit_mm <- MM(gpl96,gpl570)
27 | }
28 | 
29 | \keyword{MatchMixeR}
30 | \keyword{MM}


--------------------------------------------------------------------------------
/MatchMixeR/man/flmer.Rd:
--------------------------------------------------------------------------------
 1 | \name{MatchMixeR}
 2 | \alias{flmer}
 3 | 
 4 | \title{MatchMixeR: A cross-platform normalization procedure for matched samples.}
 5 | 
 6 | \description{
 7 |   Fast linear mixed effects regression (flmer); a moment-based method for the MM Model
 8 | }
 9 | 
10 | \usage{flmer(Xmat, Ymat)}
11 | 
12 | \arguments{
13 |   \item{X}{gene expression level matrix on platform X (to be transformed)}
14 | 
15 |   \item{Y}{gene expression level matrix on platform Y ( not to be transformed)}
16 | }
17 | 
18 | \details{
19 |   This function fits the mixed effect regression model (MM) using fast moment based approaches.
20 | }
21 | 
22 | \value{8 lists including betahat, betamat, Yhat}
23 | 
24 | \author{Xing Qiu}
25 | 
26 | \examples{
27 | fit_flmer <- flmer(gpl96,gpl570)
28 | }
29 | 
30 | \keyword{MatchMixeR}
31 | \keyword{flmer}
32 | 


--------------------------------------------------------------------------------
/MatchMixeR/man/gpl570_gpl96.Rd:
--------------------------------------------------------------------------------
 1 | \name{gpl570_gpl96}
 2 | \alias{gpl570}
 3 | \alias{gpl96}
 4 | 
 5 | \title{matched samples on GPL570 and GPL96}
 6 | 
 7 | \usage{
 8 | data(gpl570_gpl96)
 9 | }
10 | 
11 | \description{
12 |   Gene expression data of 58 matched samples on two microarrays manufactured by Affymetrix (5147 genes and 58 tumor mRNA samples) from the CellMiner, which is a web application that provides genomic profile data and query tools for the NCI60 and additional cell types. The NCI60 is a group of 60 human cancer cell lines used by the National Cancer Institute for the screening of compounds to detect potential anticancer activity. }
13 | 
14 | \value{
15 |   \item{gpl570}{matrix of gene expression levels for the 58 matched samples measured by gpl570(Human Genome U133 Plus 2.0 Array), rows correspond to genes (5147 genes) and columns to 58 cancer cell lines.}
16 |   \item{gpl96}{matrix of gene expression levels for the 58 matched samples measured by gpl96(Human Genome U133A array), rows correspond to genes (5147 genes) and columns to 58 cancer cell lines. }
17 | }
18 | 
19 | \keyword{datasets}  
20 | 


--------------------------------------------------------------------------------
/MatchMixeR/src/MatchMixeR.dll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dy16b/Cross-Platform-Normalization/83619fd3bc744ee6e52e621dc7d206defb222e5b/MatchMixeR/src/MatchMixeR.dll


--------------------------------------------------------------------------------
/MatchMixeR/src/XPN_MLE.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  ######################################################################
  3 |  #Copyright Jason Rudy & Faramarz Valafar 2009-2010
  4 |  
  5 |  #This program is free software: you can redistribute it and/or modify
  6 |  #it under the terms of the GNU General Public License as published by
  7 |  #the Free Software Foundation, either version 3 of the License, or
  8 |  #(at your option) any later version.
  9 |  
 10 |  #This program is distributed in the hope that it will be useful,
 11 |  #but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 |  #GNU General Public License for more details.
 14 |  
 15 |  #You should have received a copy of the GNU General Public License
 16 |  #along with this program.  If not, see <http://www.gnu.org/licenses/>.
 17 |  ######################################################################
 18 |  */
 19 | 
 20 | #include <R.h>
 21 | #include <math.h>
 22 | #include <float.h>
 23 | #include <stdio.h>
 24 | 
 25 | 
 26 | void XPN_MLE_C_C(double *xbar, double *A, double *b, double *c, double *s2, double *sigma2, int *pI, int *pJ, int *pn, int *nj){
 27 | 	int I = *pI, J = *pJ, n = *pn;
 28 | 	double *old;
 29 | 	double maxold, tmp, sumb, sumA, sumA2, meanA, Anorm, sumnjA2, sumb2overs2, change = 1, radius = 0.0000000000000001;
 30 | 	int changed = 1, oldlength = J+3*I;
 31 | 	old = malloc((oldlength)*sizeof(double));
 32 | 	
 33 | 	//Copy starting values
 34 | 	int i,j;
 35 | 	maxold = 0;
 36 | 	for(i=0;i<J;i++){
 37 | 		old[i] = A[i];
 38 | 		if(old[i] > maxold){
 39 | 			maxold = old[i];
 40 | 		}
 41 | 	}
 42 | 	for(i=0;i<I;i++){
 43 | 		old[i + J] = b[i];
 44 | 		if(old[i+J] > maxold){
 45 | 			maxold = old[i+J];
 46 | 		}
 47 | 	}
 48 | 	for(i=0;i<I;i++){
 49 | 		old[i + J+I] = c[i];
 50 | 		if(old[i+J+I] > maxold){
 51 | 			maxold = old[i+J+I];
 52 | 		}
 53 | 	}
 54 | 	for(i=0;i<I;i++){
 55 | 		old[i + J+2*I] = s2[i];
 56 | 		if(old[i+J+2*I] > maxold){
 57 | 			maxold = old[i+J+2*I];
 58 | 		}
 59 | 	}
 60 | 
 61 | 	
 62 | 
 63 | 	//Until we stabilize
 64 | 	int iter = 0;
 65 | 	while(changed && (iter < 100)){
 66 | 		iter++;
 67 | 	//	Rprintf("%d ",iter);
 68 | 		fflush(stdout);
 69 | 
 70 | 
 71 | 	
 72 | 		//update c
 73 | 		for (i=0;i<I;i++){
 74 | 			c[i] = 0;
 75 | 			for (j=0;j<J;j++){
 76 | 				c[i] += (xbar[i + j*I] - b[i] * A[j]) * nj[j];
 77 | 			}
 78 | 			c[i] = c[i]/n;
 79 | 		//	Rprintf("c[%d]: %f\n",i,sumb);
 80 | 		}
 81 | 		
 82 | 		//Correct b if needed
 83 | 		sumb =0;
 84 | 		for (i=0;i<I;i++){
 85 | 			sumb += b[i];
 86 | 			
 87 | 		}
 88 | 		
 89 | 		if (sumb < 0) {
 90 | 			for(i=1;i<I;i++){
 91 | 				b[i] = -1*b[i];
 92 | 			}
 93 | 			sumb = -1*sumb;
 94 | 		}
 95 | 
 96 | 		
 97 | 		//Update A
 98 | 		sumb2overs2 = 0;
 99 | 		for(i=0;i<I;i++){
100 | 			sumb2overs2 += b[i]*b[i]/s2[i];
101 | 		}
102 | 		sumA = 0;
103 | 		for(j=0;j<J;j++){
104 | 			A[j] = 0;
105 | 			for(i=0;i<I;i++){
106 | 				A[j] += b[i]*(xbar[i+j*I]-c[i])/s2[i];
107 | 			}
108 | 			A[j] = A[j]/sumb2overs2;
109 | 			sumA += A[j];
110 | 		//	Rprintf("A[%d]: %f\n",j,A[j]);
111 | 		}
112 | 		meanA = sumA/J;
113 | 		for(j=0;j<J;j++){
114 | 			A[j] = A[j] - meanA;
115 | 		}
116 | 		sumA2 = 0;
117 | 		for(j=0;j<J;j++){
118 | 			sumA2 += A[j]*A[j];
119 | 		}
120 | 		Anorm = sqrt(J/sumA2);
121 | 		for(j=0;j<J;j++){
122 | 			A[j] = A[j]*Anorm;
123 | 		}
124 | 
125 | 		
126 | 		//Update b
127 | 		sumnjA2 = 0;
128 | 		for(j=0;j<J;j++){
129 | 			sumnjA2 += nj[j]*A[j]*A[j];
130 | 		}
131 | 		for (i=0;i<I;i++){
132 | 			b[i] = 0;
133 | 			for (j=0;j<J;j++){
134 | 				b[i] += A[j] * (xbar[i + I*j] - c[i]) * nj[j];
135 | 			}
136 | 			b[i] = b[i]/sumnjA2;
137 | 		//	Rprintf("b[%d]: %f\n",i,b[i]);
138 | 		}
139 | 		
140 | 		//update s2
141 | 		for (i=0;i<I;i++){
142 | 			s2[i] = 0;
143 | 			for (j=0;j<J;j++){
144 | 		//		Rprintf("Going into s2 calculation, xbar[%d,%d] = %f, c[%d] = %f, A[%d] = %f, b[%d] = %f n = %d\n", i,j,xbar[i + I*j] ,i, c[i],j,A[j],i,b[i],n);
145 | 				tmp = (xbar[i + I*j] - c[i] - A[j]*b[i]);
146 | 				s2[i] += (tmp*tmp + sigma2[i+j*I])*nj[j];
147 | 			}
148 | 			s2[i] = s2[i]/n;
149 | 			if (s2[i] == 0){
150 | 		//		Rprintf("OMG!!!\n");
151 | 				s2[i] = DBL_MIN;
152 | 			}
153 | 		//	Rprintf("s2[%d]: %f\n",i,s2[i]);
154 | 		}
155 | 
156 | 		//Calculate change
157 | 		change = 0;
158 | 		for(i=0;i<J;i++){
159 | 			tmp = old[i] - A[i];
160 | 		//	Rprintf("%f ",tmp);
161 | 			change += tmp*tmp;
162 | 		}
163 | 		for(i=0;i<I;i++){
164 | 			tmp = old[i + J] - b[i];
165 | 		//	Rprintf("%f ",tmp);
166 | 			change += tmp * tmp;
167 | 		}
168 | 		for(i=0;i<I;i++){
169 | 			tmp = old[i + J + I] - c[i];
170 | 		//	Rprintf("%f ",tmp);
171 | 			change += tmp*tmp;
172 | 		}
173 | 		for(i=0;i<I;i++){
174 | 			tmp = old[i + J + 2*I] - s2[i];
175 | 		//	Rprintf("%f ",tmp);
176 | 			change += tmp*tmp;
177 | 		}
178 | 		
179 | 		changed = (change > radius*maxold);
180 | 		//printf("%f ",change);
181 | 		
182 | 		//Copy previous values
183 | 		maxold = 0;
184 | 		for(i=0;i<J;i++){
185 | 			old[i] = A[i];
186 | 			if(old[i] > maxold){
187 | 				maxold = old[i];
188 | 			}
189 | 		}
190 | 		for(i=0;i<I;i++){
191 | 			old[i + J] = b[i];
192 | 			if(old[i+J] > maxold){
193 | 				maxold = old[i+J];
194 | 			}
195 | 		}
196 | 		for(i=0;i<I;i++){
197 | 			old[i + J+I] = c[i];
198 | 			if(old[i+J+I] > maxold){
199 | 				maxold = old[i+J+I] ;
200 | 			}
201 | 		}
202 | 		for(i=0;i<I;i++){
203 | 			old[i + J+2*I] = s2[i];
204 | 			if(old[i+J+2*I] > maxold){
205 | 				maxold = old[i+J+2*I];
206 | 			}
207 | 		}
208 | 
209 | 	}	
210 | 
211 | 	free(old);
212 | 
213 | }
214 | 


--------------------------------------------------------------------------------
/MatchMixeR/src/XPN_MLE.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dy16b/Cross-Platform-Normalization/83619fd3bc744ee6e52e621dc7d206defb222e5b/MatchMixeR/src/XPN_MLE.o


--------------------------------------------------------------------------------
/MatchMixeR/src/ckmeans_c.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  ######################################################################
  3 |  #Copyright Jason Rudy & Faramarz Valafar 2009-2010
  4 |  
  5 |  #This program is free software: you can redistribute it and/or modify
  6 |  #it under the terms of the GNU General Public License as published by
  7 |  #the Free Software Foundation, either version 3 of the License, or
  8 |  #(at your option) any later version.
  9 |  
 10 |  #This program is distributed in the hope that it will be useful,
 11 |  #but WITHOUT ANY WARRANTY; without even the implied warranty of
 12 |  #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 13 |  #GNU General Public License for more details.
 14 |  
 15 |  #You should have received a copy of the GNU General Public License
 16 |  #along with this program.  If not, see <http://www.gnu.org/licenses/>.
 17 |  ######################################################################
 18 | */
 19 | #include <R.h>
 20 | #include <math.h>
 21 | //#include "modreg.h" /* for declarations for registration */
 22 | 
 23 | double pearson(double *x, double *cen,  int p, int i, int j, int n, int k);
 24 | double euclidean(double *x, double *cen, int p, int i, int j, int n, int k);
 25 | double manhattan(double *x, double *cen, int p, int i, int j, int n, int k);
 26 | double absolutepearson(double *x, double *cen, int p, int i, int j, int n, int k);
 27 | 
 28 | void ckmeans_c(double *x, int *pn, int *pp, double *cen, int *pk, int *cl, int *pmaxiter, int *nc, double *wss, int *pdistance )
 29 | {
 30 |     int n = *pn, k = *pk, p = *pp, maxiter = *pmaxiter, distance = *pdistance;
 31 |     int iter, i, j, c, it, inew = 0;
 32 |     double best, dd, tmp;
 33 |     Rboolean updated;
 34 | 	
 35 |     for(i = 0; i < n; i++) cl[i] = -1;
 36 |     for(iter = 0; iter < maxiter; iter++) {
 37 | 	updated = FALSE;
 38 | 	for(i = 0; i < n; i++) {
 39 | 	    /* find nearest centre for each point */
 40 | 	    best = R_PosInf;
 41 | 	    for(j = 0; j < k; j++) {
 42 | 		dd = 0.0;
 43 | 		
 44 | 		switch (distance){
 45 | 			case 1: dd = pearson(x,cen,p, i, j, n, k);
 46 | 				break;
 47 | 			case 2: dd = euclidean(x,cen,p, i, j, n, k);
 48 | 				break;
 49 | 			case 3: dd = manhattan(x,cen,p, i, j, n, k);
 50 | 				break;
 51 | 			case 4: dd = absolutepearson(x,cen,p, i, j, n, k);
 52 | 				break;
 53 | 			default: error("Unkown distance measure provided to ckmeans.");
 54 | 		}
 55 | 		if(dd < best) {
 56 | 		    best = dd;
 57 | 		    inew = j+1;
 58 | 		}
 59 | 	    }
 60 | 	    if(cl[i] != inew) {
 61 | 		updated = TRUE;
 62 | 		cl[i] = inew;
 63 | 	    }
 64 | 	}
 65 | 	if(!updated) break;
 66 | 	/* update each centre */
 67 | 	for(j = 0; j < k*p; j++) cen[j] = 0.0;
 68 | 	for(j = 0; j < k; j++) nc[j] = 0;
 69 | 	for(i = 0; i < n; i++) {
 70 | 	    it = cl[i] - 1; nc[it]++;
 71 | 	    for(c = 0; c < p; c++) cen[it+c*k] += x[i+c*n];
 72 | 	}
 73 | 	for(j = 0; j < k*p; j++) cen[j] /= nc[j % k];
 74 |     }
 75 | 
 76 |     *pmaxiter = iter + 1;
 77 |     for(j = 0; j < k; j++) wss[j] = 0.0;
 78 |     for(i = 0; i < n; i++) {
 79 | 	it = cl[i] - 1;
 80 | 	for(c = 0; c < p; c++) {
 81 | 	    tmp = x[i+n*c] - cen[it+k*c];
 82 | 	    wss[it] += tmp * tmp;
 83 | 	}
 84 |     }
 85 | }
 86 | 
 87 | double pearson(double *x, double *cen,  int p, int i, int j, int n, int k){
 88 | 	double top = 0;
 89 | 	double bottom1 = 0;
 90 | 	double bottom2 = 0;
 91 | 	int c;
 92 | 	for (c = 0; c < p; c++){
 93 | 		top += x[i+n*c] * cen[j+k*c];
 94 | 		bottom1 += x[i+n*c]*x[i+n*c];
 95 | 		bottom2 += cen[j+k*c]*cen[j+k*c];
 96 | 	}
 97 | 	return(1 - top/(sqrt(bottom1)*sqrt(bottom2)));
 98 | }
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | double euclidean(double *x, double *cen, int p, int i, int j, int n, int k){
106 | 	double tmp, dd = 0;
107 | 	int c;
108 | 	for(c = 0; c < p; c++) {
109 | 	    tmp = x[i+n*c] - cen[j+k*c];
110 | 	    dd += tmp * tmp;
111 | 	}
112 | 	return(dd);
113 | }
114 | 
115 | 
116 | double manhattan(double *x, double *cen, int p, int i, int j, int n, int k){
117 | 	double dd = 0;
118 | 	int c;
119 | 	for(c = 0; c < p; c++) {
120 | 
121 | 	    dd += abs(x[i+n*c] - cen[j+k*c]);
122 | 	}
123 | 	return(dd);
124 | }
125 | 
126 | 
127 | double absolutepearson(double *x, double *cen, int p, int i, int j, int n, int k){
128 | 	
129 | 	return(abs(pearson(x,cen,p, i, j, n, k)));
130 | 
131 | }
132 | 


--------------------------------------------------------------------------------
/MatchMixeR/src/ckmeans_c.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dy16b/Cross-Platform-Normalization/83619fd3bc744ee6e52e621dc7d206defb222e5b/MatchMixeR/src/ckmeans_c.o


--------------------------------------------------------------------------------
/MatchMixeR/vignettes/overview.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "MatchMixeR User Guide"
 3 | author: "Xing Qiu, Serin Zhang"
 4 | date: "`r Sys.Date()`"
 5 | output: rmarkdown::html_vignette
 6 | vignette: >
 7 |   %\VignetteIndexEntry{MatchMixeR User Guide}
 8 |   %\VignetteEngine{knitr::rmarkdown}
 9 |   %\VignetteEncoding{UTF-8}
10 | ---
11 | 
12 | ```{r load-package}
13 | 
14 | library(MatchMixeR)
15 | 
16 | ```
17 | 
18 | In this Vignette, we describe the basic usage of the `MatchMixeR` package. This package implements a fast liner mixed effect model for cross-platform normalization of high throughput genomic data using matched samples. Below we describe the inspiring problem and the usage of the main R function, `MM()`. 
19 | 
20 | ## Introduction
21 | 
22 | We will briefly describe a motivating problem for which the `MM` package is applicable.
23 | 
24 | Cross-platform normalization method is highly demanding for data integration across different studies as researchers have more access to pre-existing gene expression data stored in public repositories. However, the systematic difference from heterogeneous features of high throughput techniques makes the data integration difficult. Several methods for cross-platform normalization have been proposed and their normalized data show high cross-platform concordance. However, their methods can also reduce the biological difference by chance since the biological effect can be compounded with the non-biological effects. 
25 | 
26 | Therefore, we proposed a new cross platform normalization method to eliminate only the heterogeneity caused by different platforms and preserve the biological information by employing matched samples which have only platform differences. In our method, a benchmark linear fixed effect regression model is acquired from a matched sample data and apply it to the research data to be combined. We call our method as Match-MixeR which means a matched sample based cross-platform normalization method using mixed effect regression model. 
27 | 
28 | ## Fast LMER 
29 | In the typical LMER model, the $\hat{{\theta}}$ can be estimated by maximum likelihood(ML) or restricted maximum likelihood (REML) method and modern LMER algorithms use the REML by default because the ML method has some bias. However,  both ML and REML methods are time consuming since they rely on inverting large matrices and iterative optimization procedure. So we proposed a new fast solution using moment matching algorithm. 
30 | 
31 | ## Practical workaraound for cross-platform normalization of gene expression data
32 | The original FLMER assumes that the
33 | covariance structure of the random intercepts and slopes is a
34 | scalar matrix of form $\sigma_{\gamma}^2 \mathbf{I}$. In cross-platform normalization study, the variance of random intercepts may be many times larger than that of the random slopes.  
35 | If we model all these three parameters in the FLMER algorithm, we will need three independent moment matching equations, which will create more uncertainties in the estimator and dramatically reduce the computational efficiency. In other words, to include too many unknown parameters in the model defeats the very purpose of FLMER. 
36 | 
37 | As an alternative, we propose the following practical and fast workaround.
38 | 
39 | ## Usage examples
40 | The primary function in our package is `MM()`.  It takes the following primary arguments: `Xmat` and `Ymat`. Here `Xmat` and Here `Xmat` are $m\times n$ dimensional matched samples gene expression matrices. The MM function fits mixed effect regression model and gives the $\hat{{Y}}$, the transformed values for the gene expression values in Xmat. Here, only the values in platform X will be changed for data integration. One can assigne platform X and platform Y by considering the sample size and the popularity of the platforms.  
41 | 
42 | 
43 | 
44 | 


--------------------------------------------------------------------------------
/MatchMixeR/vignettes/overview.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | 
  3 | <html xmlns="http://www.w3.org/1999/xhtml">
  4 | 
  5 | <head>
  6 | 
  7 | <meta charset="utf-8" />
  8 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  9 | <meta name="generator" content="pandoc" />
 10 | 
 11 | <meta name="viewport" content="width=device-width, initial-scale=1">
 12 | 
 13 | <meta name="author" content="Xing Qiu, Serin Zhang" />
 14 | 
 15 | <meta name="date" content="2018-10-29" />
 16 | 
 17 | <title>MatchMixeR User Guide</title>
 18 | 
 19 | 
 20 | 
 21 | <style type="text/css">code{white-space: pre;}</style>
 22 | <style type="text/css">
 23 | div.sourceCode { overflow-x: auto; }
 24 | table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
 25 |   margin: 0; padding: 0; vertical-align: baseline; border: none; }
 26 | table.sourceCode { width: 100%; line-height: 100%; }
 27 | td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
 28 | td.sourceCode { padding-left: 5px; }
 29 | code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
 30 | code > span.dt { color: #902000; } /* DataType */
 31 | code > span.dv { color: #40a070; } /* DecVal */
 32 | code > span.bn { color: #40a070; } /* BaseN */
 33 | code > span.fl { color: #40a070; } /* Float */
 34 | code > span.ch { color: #4070a0; } /* Char */
 35 | code > span.st { color: #4070a0; } /* String */
 36 | code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
 37 | code > span.ot { color: #007020; } /* Other */
 38 | code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
 39 | code > span.fu { color: #06287e; } /* Function */
 40 | code > span.er { color: #ff0000; font-weight: bold; } /* Error */
 41 | code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
 42 | code > span.cn { color: #880000; } /* Constant */
 43 | code > span.sc { color: #4070a0; } /* SpecialChar */
 44 | code > span.vs { color: #4070a0; } /* VerbatimString */
 45 | code > span.ss { color: #bb6688; } /* SpecialString */
 46 | code > span.im { } /* Import */
 47 | code > span.va { color: #19177c; } /* Variable */
 48 | code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
 49 | code > span.op { color: #666666; } /* Operator */
 50 | code > span.bu { } /* BuiltIn */
 51 | code > span.ex { } /* Extension */
 52 | code > span.pp { color: #bc7a00; } /* Preprocessor */
 53 | code > span.at { color: #7d9029; } /* Attribute */
 54 | code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
 55 | code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
 56 | code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
 57 | code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
 58 | </style>
 59 | 
 60 | 
 61 | 
 62 | <link href="data:text/css;charset=utf-8,body%20%7B%0Abackground%2Dcolor%3A%20%23fff%3B%0Amargin%3A%201em%20auto%3B%0Amax%2Dwidth%3A%20700px%3B%0Aoverflow%3A%20visible%3B%0Apadding%2Dleft%3A%202em%3B%0Apadding%2Dright%3A%202em%3B%0Afont%2Dfamily%3A%20%22Open%20Sans%22%2C%20%22Helvetica%20Neue%22%2C%20Helvetica%2C%20Arial%2C%20sans%2Dserif%3B%0Afont%2Dsize%3A%2014px%3B%0Aline%2Dheight%3A%201%2E35%3B%0A%7D%0A%23header%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0A%23TOC%20%7B%0Aclear%3A%20both%3B%0Amargin%3A%200%200%2010px%2010px%3B%0Apadding%3A%204px%3B%0Awidth%3A%20400px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Aborder%2Dradius%3A%205px%3B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Afont%2Dsize%3A%2013px%3B%0Aline%2Dheight%3A%201%2E3%3B%0A%7D%0A%23TOC%20%2Etoctitle%20%7B%0Afont%2Dweight%3A%20bold%3B%0Afont%2Dsize%3A%2015px%3B%0Amargin%2Dleft%3A%205px%3B%0A%7D%0A%23TOC%20ul%20%7B%0Apadding%2Dleft%3A%2040px%3B%0Amargin%2Dleft%3A%20%2D1%2E5em%3B%0Amargin%2Dtop%3A%205px%3B%0Amargin%2Dbottom%3A%205px%3B%0A%7D%0A%23TOC%20ul%20ul%20%7B%0Amargin%2Dleft%3A%20%2D2em%3B%0A%7D%0A%23TOC%20li%20%7B%0Aline%2Dheight%3A%2016px%3B%0A%7D%0Atable%20%7B%0Amargin%3A%201em%20auto%3B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dcolor%3A%20%23DDDDDD%3B%0Aborder%2Dstyle%3A%20outset%3B%0Aborder%2Dcollapse%3A%20collapse%3B%0A%7D%0Atable%20th%20%7B%0Aborder%2Dwidth%3A%202px%3B%0Apadding%3A%205px%3B%0Aborder%2Dstyle%3A%20inset%3B%0A%7D%0Atable%20td%20%7B%0Aborder%2Dwidth%3A%201px%3B%0Aborder%2Dstyle%3A%20inset%3B%0Aline%2Dheight%3A%2018px%3B%0Apadding%3A%205px%205px%3B%0A%7D%0Atable%2C%20table%20th%2C%20table%20td%20%7B%0Aborder%2Dleft%2Dstyle%3A%20none%3B%0Aborder%2Dright%2Dstyle%3A%20none%3B%0A%7D%0Atable%20thead%2C%20table%20tr%2Eeven%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Ap%20%7B%0Amargin%3A%200%2E5em%200%3B%0A%7D%0Ablockquote%20%7B%0Abackground%2Dcolor%3A%20%23f6f6f6%3B%0Apadding%3A%200%2E25em%200%2E75em%3B%0A%7D%0Ahr%20%7B%0Aborder%2Dstyle%3A%20solid%3B%0Aborder%3A%20none%3B%0Aborder%2Dtop%3A%201px%20solid%20%23777%3B%0Amargin%3A%2028px%200%3B%0A%7D%0Adl%20%7B%0Amargin%2Dleft%3A%200%3B%0A%7D%0Adl%20dd%20%7B%0Amargin%2Dbottom%3A%2013px%3B%0Amargin%2Dleft%3A%2013px%3B%0A%7D%0Adl%20dt%20%7B%0Afont%2Dweight%3A%20bold%3B%0A%7D%0Aul%20%7B%0Amargin%2Dtop%3A%200%3B%0A%7D%0Aul%20li%20%7B%0Alist%2Dstyle%3A%20circle%20outside%3B%0A%7D%0Aul%20ul%20%7B%0Amargin%2Dbottom%3A%200%3B%0A%7D%0Apre%2C%20code%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0Aborder%2Dradius%3A%203px%3B%0Acolor%3A%20%23333%3B%0Awhite%2Dspace%3A%20pre%2Dwrap%3B%20%0A%7D%0Apre%20%7B%0Aborder%2Dradius%3A%203px%3B%0Amargin%3A%205px%200px%2010px%200px%3B%0Apadding%3A%2010px%3B%0A%7D%0Apre%3Anot%28%5Bclass%5D%29%20%7B%0Abackground%2Dcolor%3A%20%23f7f7f7%3B%0A%7D%0Acode%20%7B%0Afont%2Dfamily%3A%20Consolas%2C%20Monaco%2C%20%27Courier%20New%27%2C%20monospace%3B%0Afont%2Dsize%3A%2085%25%3B%0A%7D%0Ap%20%3E%20code%2C%20li%20%3E%20code%20%7B%0Apadding%3A%202px%200px%3B%0A%7D%0Adiv%2Efigure%20%7B%0Atext%2Dalign%3A%20center%3B%0A%7D%0Aimg%20%7B%0Abackground%2Dcolor%3A%20%23FFFFFF%3B%0Apadding%3A%202px%3B%0Aborder%3A%201px%20solid%20%23DDDDDD%3B%0Aborder%2Dradius%3A%203px%3B%0Aborder%3A%201px%20solid%20%23CCCCCC%3B%0Amargin%3A%200%205px%3B%0A%7D%0Ah1%20%7B%0Amargin%2Dtop%3A%200%3B%0Afont%2Dsize%3A%2035px%3B%0Aline%2Dheight%3A%2040px%3B%0A%7D%0Ah2%20%7B%0Aborder%2Dbottom%3A%204px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Apadding%2Dbottom%3A%202px%3B%0Afont%2Dsize%3A%20145%25%3B%0A%7D%0Ah3%20%7B%0Aborder%2Dbottom%3A%202px%20solid%20%23f7f7f7%3B%0Apadding%2Dtop%3A%2010px%3B%0Afont%2Dsize%3A%20120%25%3B%0A%7D%0Ah4%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23f7f7f7%3B%0Amargin%2Dleft%3A%208px%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Ah5%2C%20h6%20%7B%0Aborder%2Dbottom%3A%201px%20solid%20%23ccc%3B%0Afont%2Dsize%3A%20105%25%3B%0A%7D%0Aa%20%7B%0Acolor%3A%20%230033dd%3B%0Atext%2Ddecoration%3A%20none%3B%0A%7D%0Aa%3Ahover%20%7B%0Acolor%3A%20%236666ff%3B%20%7D%0Aa%3Avisited%20%7B%0Acolor%3A%20%23800080%3B%20%7D%0Aa%3Avisited%3Ahover%20%7B%0Acolor%3A%20%23BB00BB%3B%20%7D%0Aa%5Bhref%5E%3D%22http%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0Aa%5Bhref%5E%3D%22https%3A%22%5D%20%7B%0Atext%2Ddecoration%3A%20underline%3B%20%7D%0A%0Acode%20%3E%20span%2Ekw%20%7B%20color%3A%20%23555%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Edt%20%7B%20color%3A%20%23902000%3B%20%7D%20%0Acode%20%3E%20span%2Edv%20%7B%20color%3A%20%2340a070%3B%20%7D%20%0Acode%20%3E%20span%2Ebn%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Efl%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Ech%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Est%20%7B%20color%3A%20%23d14%3B%20%7D%20%0Acode%20%3E%20span%2Eco%20%7B%20color%3A%20%23888888%3B%20font%2Dstyle%3A%20italic%3B%20%7D%20%0Acode%20%3E%20span%2Eot%20%7B%20color%3A%20%23007020%3B%20%7D%20%0Acode%20%3E%20span%2Eal%20%7B%20color%3A%20%23ff0000%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%0Acode%20%3E%20span%2Efu%20%7B%20color%3A%20%23900%3B%20font%2Dweight%3A%20bold%3B%20%7D%20%20code%20%3E%20span%2Eer%20%7B%20color%3A%20%23a61717%3B%20background%2Dcolor%3A%20%23e3d2d2%3B%20%7D%20%0A" rel="stylesheet" type="text/css" />
 63 | 
 64 | </head>
 65 | 
 66 | <body>
 67 | 
 68 | 
 69 | 
 70 | 
 71 | <h1 class="title toc-ignore">MatchMixeR User Guide</h1>
 72 | <h4 class="author"><em>Xing Qiu, Serin Zhang</em></h4>
 73 | <h4 class="date"><em>2018-10-29</em></h4>
 74 | 
 75 | 
 76 | 
 77 | <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(MatchMixeR)</code></pre></div>
 78 | <p>In this Vignette, we describe the basic usage of the <code>MatchMixeR</code> package. This package implements a fast liner mixed effect model for cross-platform normalization of high throughput genomic data using matched samples. Below we describe the inspiring problem and the usage of the main R function, <code>MM()</code>.</p>
 79 | <div id="introduction" class="section level2">
 80 | <h2>Introduction</h2>
 81 | <p>We will briefly describe a motivating problem for which the <code>MM</code> package is applicable.</p>
 82 | <p>Cross-platform normalization method is highly demanding for data integration across different studies as researchers have more access to pre-existing gene expression data stored in public repositories. However, the systematic difference from heterogeneous features of high throughput techniques makes the data integration difficult. Several methods for cross-platform normalization have been proposed and their normalized data show high cross-platform concordance. However, their methods can also reduce the biological difference by chance since the biological effect can be compounded with the non-biological effects.</p>
 83 | <p>Therefore, we proposed a new cross platform normalization method to eliminate only the heterogeneity caused by different platforms and preserve the biological information by employing matched samples which have only platform differences. In our method, a benchmark linear fixed effect regression model is acquired from a matched sample data and apply it to the research data to be combined. We call our method as Match-MixeR which means a matched sample based cross-platform normalization method using mixed effect regression model.</p>
 84 | </div>
 85 | <div id="fast-lmer" class="section level2">
 86 | <h2>Fast LMER</h2>
 87 | <p>In the typical LMER model, the <span class="math inline">\(\hat{{\theta}}\)</span> can be estimated by maximum likelihood(ML) or restricted maximum likelihood (REML) method and modern LMER algorithms use the REML by default because the ML method has some bias. However, both ML and REML methods are time consuming since they rely on inverting large matrices and iterative optimization procedure. So we proposed a new fast solution using moment matching algorithm.</p>
 88 | </div>
 89 | <div id="practical-workaraound-for-cross-platform-normalization-of-gene-expression-data" class="section level2">
 90 | <h2>Practical workaraound for cross-platform normalization of gene expression data</h2>
 91 | <p>The original FLMER assumes that the covariance structure of the random intercepts and slopes is a scalar matrix of form <span class="math inline">\(\sigma_{\gamma}^2 \mathbf{I}\)</span>. In cross-platform normalization study, the variance of random intercepts may be many times larger than that of the random slopes.<br />
 92 | If we model all these three parameters in the FLMER algorithm, we will need three independent moment matching equations, which will create more uncertainties in the estimator and dramatically reduce the computational efficiency. In other words, to include too many unknown parameters in the model defeats the very purpose of FLMER.</p>
 93 | <p>As an alternative, we propose the following practical and fast workaround.</p>
 94 | </div>
 95 | <div id="usage-examples" class="section level2">
 96 | <h2>Usage examples</h2>
 97 | <p>The primary function in our package is <code>MM()</code>. It takes the following primary arguments: <code>Xmat</code> and <code>Ymat</code>. Here <code>Xmat</code> and Here <code>Xmat</code> are <span class="math inline">\(m\times n\)</span> dimensional matched samples gene expression matrices. The MM function fits mixed effect regression model and gives the <span class="math inline">\(\hat{{Y}}\)</span>, the transformed values for the gene expression values in Xmat. Here, only the values in platform X will be changed for data integration. One can assigne platform X and platform Y by considering the sample size and the popularity of the platforms.</p>
 98 | </div>
 99 | 
100 | 
101 | 
102 | <!-- dynamically load mathjax for compatibility with self-contained -->
103 | <script>
104 |   (function () {
105 |     var script = document.createElement("script");
106 |     script.type = "text/javascript";
107 |     script.src  = "https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
108 |     document.getElementsByTagName("head")[0].appendChild(script);
109 |   })();
110 | </script>
111 | 
112 | </body>
113 | </html>
114 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Cross-Platform-Normalization


--------------------------------------------------------------------------------
/Simulation_updated_github.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ### This code was written by Dr. Serin Zhang, as a part of her Statistics PhD
  3 | ### dissertation project at Florida State University
  4 | 
  5 | # setwd("C:/Users/Disa Yu/Dropbox/Cross-platform-normalization/R_code")
  6 | 
  7 | library("genefilter")
  8 | library("MatchMixeR")
  9 | 
 10 | ### create a "true" expression data and fit OLS/FLMER models 
 11 | set.seed(4321)
 12 | m <- 10000; n <- 100
 13 | ## TrueMeans is the true mean expression for each gene.
 14 | TrueMeans <- rnorm(m, 7.5, .6)
 15 | 
 16 | ## Generate the training data (matched data)
 17 | matchdat.X <- matrix(rnorm(m*n, 0, .5), m) + TrueMeans
 18 | beta0 <- rnorm(m, 2.5, 2.0); beta1 <- runif(m, .7, 1.0)
 19 | betas <- cbind(beta0=beta0, beta1=beta1)
 20 | matchdat.Y <- beta0 + matchdat.X * beta1 
 21 | 
 22 | ## fit models on the first matched data : OLS & FLMER   
 23 | OLSmod <- MatchMixeR:::OLS(matchdat.X, matchdat.Y)
 24 | #FLMERmod <-ModIV(matchdat.X, matchdat.Y)
 25 | FLMERmod <-MM(matchdat.X, matchdat.Y)
 26 | 
 27 | ### Initial Method comparison
 28 | ## create research data(test data) "WITHOUT" DEGs - "smaller" matched data 
 29 | #n1 <- 5
 30 | n1 <- 30
 31 | sim.match.X <- matrix(rnorm(m*n1, 0, .3), m) + TrueMeans
 32 | sim.match.Y <- beta0 + sim.match.X * beta1 
 33 | 
 34 | #sim.match.X5 <- sim.match.X; sim.match.Y5 <- sim.match.Y
 35 | #sim.match.X30 <- sim.match.X; sim.match.Y30 <- sim.match.Y
 36 | 
 37 | ## Apply fitted OLS&FLMER on the first matched data to the smaller matched data   
 38 | XtransOLS <- sim.match.X*OLSmod$betamat[, "Slope"] + OLSmod$betamat[, "Intercept"]
 39 | XtransFLMER <- sim.match.X*FLMERmod$betamat[, "Slope"] + FLMERmod$betamat[, "Intercept"]
 40 | ## other methods normalization on smaller matched data
 41 | DWDmod <- dwd(sim.match.X, sim.match.Y)
 42 | EBmod <- eb(sim.match.X, sim.match.Y)
 43 | #GQmod <- gq(sim.match.X, sim.match.Y)
 44 | XPNmod <- xpn(sim.match.X, sim.match.Y)
 45 | 
 46 | ## make plots : Coloumn-wise R2 comparison: DWD beats our methods 
 47 | corr.raw <- as.numeric(lapply(1:n1,function(i) cor(sim.match.X[,i], sim.match.Y[,i])))
 48 | corr.OLS <- as.numeric(lapply(1:n1,function(i) cor(XtransOLS[,i],sim.match.Y[,i])))
 49 | corr.FLMER <- as.numeric(lapply(1:n1,function(i) cor(XtransFLMER[,i],sim.match.Y[,i])))
 50 | corr.DWD <- as.numeric(lapply(1:n1,function(i) cor(DWDmod$x[,i],DWDmod$y[,i])))
 51 | corr.EB <- as.numeric(lapply(1:n1,function(i) cor(EBmod$x[,i],EBmod$y[,i])))
 52 | #corr.GQ <- as.numeric(lapply(1:n1,function(i) cor(GQmod$x[,i],GQmod$y[,i])))
 53 | corr.XPN <- as.numeric(lapply(1:n1,function(i) cor(XPNmod$x[,i],XPNmod$y[,i])))
 54 | r2.raw <- (corr.raw)^2
 55 | r2.OLS <- (corr.OLS)^2
 56 | r2.FLMER <- (corr.FLMER)^2
 57 | r2.DWD <- (corr.DWD)^2
 58 | r2.EB <- (corr.EB)^2
 59 | #r2.GQ <- (corr.GQ)^2
 60 | r2.XPN <- (corr.XPN)^2
 61 | r2.results <- cbind(MM=r2.FLMER,DWD=r2.DWD,XPN=r2.XPN,EB=r2.EB)
 62 | boxplot(r2.results,ylab="R(column wise)",ylim = c(0.999,1),cex.lab=1.3, cex.axis=1.3) 
 63 | 
 64 | #r2.sim.n5 <- r2.results
 65 | r2.sim.n30 <- r2.results
 66 | 
 67 | ## RSS comparison: OLS vs FLMER
 68 | RSS.OLS <- mean((XtransOLS-sim.match.Y)^2)
 69 | RSS.FLMER <- mean((XtransFLMER-sim.match.Y)^2)
 70 | ## mean sum of difference squres for other methods
 71 | Diff.raw <- mean((sim.match.X-sim.match.Y)^2)
 72 | Diff.EB <- mean((EBmod$x - EBmod$y)^2)
 73 | Diff.DWD <- mean(as.matrix(DWDmod$x - DWDmod$y)^2)
 74 | #Diff.GQ <- mean((GQmod$x - GQmod$y)^2)
 75 | Diff.XPN <- mean(as.matrix(XPNmod$x - XPNmod$y)^2)
 76 | 
 77 | #diff.n5 <- c(raw = Diff.raw, MM = RSS.FLMER,DWD = Diff.DWD,XPN = Diff.XPN,EB = Diff.EB)
 78 | diff.n30 <- c(raw = Diff.raw, MM = RSS.FLMER,DWD = Diff.DWD,XPN = Diff.XPN,EB = Diff.EB)
 79 | 
 80 | save(r2.sim.n30, diff.n30, file="sim_results.RData")
 81 | 
 82 | ### DE analysis
 83 | ##Generate GroupA with n3 platformX samples & n4 platformY samples
 84 | #GroupB with n5 platformX samples & n6 platformY samples
 85 | n2 <- 30; n3 <- 0 # GroupA&PlatformX ; GroupB&PlatformX  
 86 | n4 <- 0; n5 <- 30 # GroupA&PlatformY ; GroupB&PlatformY
 87 | TP <- matrix(0,30,6);  FP <- matrix(0,30,6)  #exclude GQ
 88 | colnames(TP)<-c("raw","OLS","MM","DWD","XPN","EB")
 89 | colnames(FP)<-c("raw","OLS","MM","DWD","XPN","EB")
 90 | 
 91 | ## create research data with 1000 DEGs (By adding Effs to GroupB)
 92 | m1 <- 1000
 93 | Effs <- rep(0, m); Effs[1:m1] <- runif(m1, .2, 3)
 94 | for (i in 1:30)
 95 | {
 96 |   print(i)
 97 |   sim.A.X <- matrix(rnorm(m*n2, 0, .25), m) + TrueMeans 
 98 |   sim.B.X <- matrix(rnorm(m*n3, 0, .25), m) + TrueMeans + Effs
 99 |   sim.X <- cbind(sim.A.X,sim.B.X) 
100 |   TrueExp.A.Y <- matrix(rnorm(m*n4, 0, .25), m) + TrueMeans
101 |   TrueExp.B.Y <- matrix(rnorm(m*n5, 0, .25), m) + TrueMeans + Effs
102 |   TrueExp.Y <- cbind(TrueExp.A.Y,TrueExp.B.Y)
103 |   sim.Y <- beta0 + TrueExp.Y * beta1 
104 |   
105 |   ## Apply the trained betas to the research data
106 |   Xtrans.OLS_DE <- sim.X*OLSmod$betamat[, "Slope"] + OLSmod$betamat[, "Intercept"]
107 |   Xtrans.FLMER_DE <- sim.X*FLMERmod$betamat[, "Slope"] + FLMERmod$betamat[, "Intercept"]
108 |   ## other methods normalization on research data with DEGs 
109 |   DWDmod_DE <- dwd(sim.X,sim.Y)
110 |   EBmod_DE <- eb(sim.X,sim.Y)
111 |   #GQmod_DE <- gq(sim.X,sim.Y)
112 |   XPNmod_DE <-xpn(sim.X,sim.Y)
113 |   ## check stat. power and type I error
114 |   simData <- cbind(sim.X, sim.Y)
115 |   simDataOLS <- cbind(Xtrans.OLS_DE, sim.Y)
116 |   simDataFLMER <- cbind(Xtrans.FLMER_DE, sim.Y)
117 |   simDataDWD <- as.matrix(cbind(DWDmod_DE$x,DWDmod_DE$y))
118 |   simDataEB <- as.matrix(cbind(EBmod_DE$x,EBmod_DE$y))
119 |   #simDataGQ <- as.matrix(cbind(GQmod_DE$x,GQmod_DE$y))
120 |   simDataXPN <- as.matrix(cbind(XPNmod_DE$x,XPNmod_DE$y))
121 |   simFac <- as.factor(c(rep(0, n2), rep(1, n3),rep(0, n4), rep(1, n5)))
122 |   rr.raw <- rowttests(simData, simFac)
123 |   rr.OLS <- rowttests(simDataOLS, simFac)
124 |   rr.FLMER <- rowttests(simDataFLMER, simFac)
125 |   rr.DWD <- rowttests(simDataDWD, simFac)
126 |   rr.EB <- rowttests(simDataEB, simFac)
127 |   #rr.GQ <- rowttests(simDataGQ, simFac)
128 |   rr.XPN <- rowttests(simDataXPN, simFac)
129 |   padj.raw <- p.adjust(rr.raw[, "p.value"], "holm")
130 |   padj.OLS <- p.adjust(rr.OLS[, "p.value"], "holm")
131 |   padj.FLMER <- p.adjust(rr.FLMER[, "p.value"], "holm")
132 |   padj.DWD <- p.adjust(rr.DWD[, "p.value"], "holm")
133 |   padj.EB <- p.adjust(rr.EB[, "p.value"], "holm")
134 |   #padj.GQ <- p.adjust(rr.GQ[, "p.value"], "holm")
135 |   padj.XPN <- p.adjust(rr.XPN[, "p.value"], "holm")
136 |   TP[i,1] <- sum(padj.raw[1:m1]<0.05); FP[i,1] <- sum(padj.raw[(m1+1):m]<0.05)
137 |   TP[i,2] <- sum(padj.OLS[1:m1]<0.05); FP[i,2] <- sum(padj.OLS[(m1+1):m]<0.05)
138 |   TP[i,3] <- sum(padj.FLMER[1:m1]<0.05); FP[i,3] <- sum(padj.FLMER[(m1+1):m]<0.05)
139 |   TP[i,4] <- sum(padj.DWD[1:m1]<0.05); FP[i,4] <- sum(padj.DWD[(m1+1):m]<0.05)
140 |   TP[i,6] <- sum(padj.EB[1:m1]<0.05); FP[i,6] <- sum(padj.EB[(m1+1):m]<0.05)
141 |   #TP[i,7] <- sum(padj.GQ[1:m1]<0.05); FP[i,7] <- sum(padj.GQ[(m1+1):m]<0.05)
142 |   TP[i,5] <- sum(padj.XPN[1:m1]<0.05); FP[i,5] <- sum(padj.XPN[(m1+1):m]<0.05)
143 | }
144 | 
145 | DE <- matrix(1000,30,6) 
146 | P <- TP/(TP+FP)
147 | R <- TP/DE
148 | F1 <- 2*(P*R)/(P+R) 
149 | 
150 | 
151 | 


--------------------------------------------------------------------------------