├── README.md ├── RaceID2_StemID_class.R ├── RaceID2_StemID_sample.R ├── Reference_manual_RaceID2_StemID.pdf └── transcript_counts_intestine_5days_YFP.xls /README.md: -------------------------------------------------------------------------------- 1 | StemID and RaceID2 algorithms 2 | 3 | RaceID2 is an advanced version of RaceID, an algorithm for the identification of rare and abundant cell types from single cell transcriptome data. The method is based on transcript counts obtained with unique molecular identifies. 4 | 5 | StemID is an algorithm for the derivation of cell lineage trees based on RaceID2 results and predicts multipotent cell identites. 6 | 7 | RaceID2 and StemID are written in the R computing language. 8 | 9 | The following files are provided: 10 | 11 | StemID/RaceID2 class definition: RaceID2_StemID_class.R 12 | StemID/RaceID2 sample code: RaceID2_StemID_sample.R 13 | StemID/RaceID2 reference manual: Reference_manual_RaceID2_StemID.pdf 14 | StemID/RaceID2 sample data: transcript_counts_intestine_5days_YFP.xls 15 | 16 | -------------------------------------------------------------------------------- /RaceID2_StemID_class.R: -------------------------------------------------------------------------------- 1 | ## load required packages. 2 | require(tsne) 3 | require(pheatmap) 4 | require(MASS) 5 | require(cluster) 6 | require(mclust) 7 | require(flexmix) 8 | require(lattice) 9 | require(fpc) 10 | require(amap) 11 | require(RColorBrewer) 12 | require(locfit) 13 | require(vegan) 14 | 15 | ## class definition 16 | SCseq <- setClass("SCseq", slots = c(expdata = "data.frame", ndata = "data.frame", fdata = "data.frame", distances = "matrix", tsne = "data.frame", cluster = "list", background = "list", out = "list", cpart = "vector", fcol = "vector", filterpar = "list", clusterpar = "list", outlierpar ="list" )) 17 | 18 | setValidity("SCseq", 19 | function(object) { 20 | msg <- NULL 21 | if ( ! is.data.frame(object@expdata) ){ 22 | msg <- c(msg, "input data must be data.frame") 23 | }else if ( nrow(object@expdata) < 2 ){ 24 | msg <- c(msg, "input data must have more than one row") 25 | }else if ( ncol(object@expdata) < 2 ){ 26 | msg <- c(msg, "input data must have more than one column") 27 | }else if (sum( apply( is.na(object@expdata),1,sum ) ) > 0 ){ 28 | msg <- c(msg, "NAs are not allowed in input data") 29 | }else if (sum( apply( object@expdata,1,min ) ) < 0 ){ 30 | msg <- c(msg, "negative values are not allowed in input data") 31 | } 32 | if (is.null(msg)) TRUE 33 | else msg 34 | } 35 | ) 36 | 37 | setMethod("initialize", 38 | signature = "SCseq", 39 | definition = function(.Object, expdata ){ 40 | .Object@expdata <- expdata 41 | .Object@ndata <- expdata 42 | .Object@fdata <- expdata 43 | validObject(.Object) 44 | return(.Object) 45 | } 46 | ) 47 | 48 | setGeneric("filterdata", function(object, mintotal=3000, minexpr=5, minnumber=1, maxexpr=Inf, downsample=TRUE, dsn=1, rseed=17000) standardGeneric("filterdata")) 49 | 50 | setMethod("filterdata", 51 | signature = "SCseq", 52 | definition = function(object,mintotal,minexpr,minnumber,maxexpr,downsample,dsn,rseed) { 53 | if ( ! is.numeric(mintotal) ) stop( "mintotal has to be a positive number" ) else if ( mintotal <= 0 ) stop( "mintotal has to be a positive number" ) 54 | if ( ! is.numeric(minexpr) ) stop( "minexpr has to be a non-negative number" ) else if ( minexpr < 0 ) stop( "minexpr has to be a non-negative number" ) 55 | if ( ! is.numeric(minnumber) ) stop( "minnumber has to be a non-negative integer number" ) else if ( round(minnumber) != minnumber | minnumber < 0 ) stop( "minnumber has to be a non-negative integer number" ) 56 | if ( ! ( is.numeric(downsample) | is.logical(downsample) ) ) stop( "downsample has to be logical (TRUE/FALSE)" ) 57 | if ( ! is.numeric(dsn) ) stop( "dsn has to be a positive integer number" ) else if ( round(dsn) != dsn | dsn <= 0 ) stop( "dsn has to be a positive integer number" ) 58 | object@filterpar <- list(mintotal=mintotal, minexpr=minexpr, minnumber=minnumber, maxexpr=maxexpr, downsample=downsample, dsn=dsn) 59 | object@ndata <- object@expdata[,apply(object@expdata,2,sum,na.rm=TRUE) >= mintotal] 60 | if ( downsample ){ 61 | set.seed(rseed) 62 | object@ndata <- downsample(object@expdata,n=mintotal,dsn=dsn) 63 | }else{ 64 | x <- object@ndata 65 | object@ndata <- as.data.frame( t(t(x)/apply(x,2,sum))*median(apply(x,2,sum,na.rm=TRUE)) + .1 ) 66 | } 67 | x <- object@ndata 68 | object@fdata <- x[apply(x>=minexpr,1,sum,na.rm=TRUE) >= minnumber,] 69 | x <- object@fdata 70 | object@fdata <- x[apply(x,1,max,na.rm=TRUE) < maxexpr,] 71 | return(object) 72 | } 73 | ) 74 | 75 | downsample <- function(x,n,dsn){ 76 | x <- round( x[,apply(x,2,sum,na.rm=TRUE) >= n], 0) 77 | nn <- min( apply(x,2,sum) ) 78 | for ( j in 1:dsn ){ 79 | z <- data.frame(GENEID=rownames(x)) 80 | rownames(z) <- rownames(x) 81 | initv <- rep(0,nrow(z)) 82 | for ( i in 1:dim(x)[2] ){ 83 | y <- aggregate(rep(1,nn),list(sample(rep(rownames(x),x[,i]),nn)),sum) 84 | na <- names(x)[i] 85 | names(y) <- c("GENEID",na) 86 | rownames(y) <- y$GENEID 87 | z[,na] <- initv 88 | k <- intersect(rownames(z),y$GENEID) 89 | z[k,na] <- y[k,na] 90 | z[is.na(z[,na]),na] <- 0 91 | } 92 | rownames(z) <- as.vector(z$GENEID) 93 | ds <- if ( j == 1 ) z[,-1] else ds + z[,-1] 94 | } 95 | ds <- ds/dsn + .1 96 | return(ds) 97 | } 98 | 99 | dist.gen <- function(x,method="euclidean", ...) if ( method %in% c("spearman","pearson","kendall") ) as.dist( 1 - cor(t(x),method=method,...) ) else dist(x,method=method,...) 100 | 101 | dist.gen.pairs <- function(x,y,...) dist.gen(t(cbind(x,y)),...) 102 | 103 | binompval <- function(p,N,n){ 104 | pval <- pbinom(n,round(N,0),p,lower.tail=TRUE) 105 | pval[!is.na(pval) & pval > 0.5] <- 1-pval[!is.na(pval) & pval > 0.5] 106 | return(pval) 107 | } 108 | 109 | setGeneric("plotgap", function(object) standardGeneric("plotgap")) 110 | 111 | setMethod("plotgap", 112 | signature = "SCseq", 113 | definition = function(object){ 114 | if ( length(object@cluster$kpart) == 0 ) stop("run clustexp before plotgap") 115 | if ( sum(is.na(object@cluster$gap$Tab[,3])) > 0 ) stop("run clustexp with do.gap = TRUE first") 116 | plot(object@cluster$gap,ylim=c( min(object@cluster$gap$Tab[,3] - object@cluster$gap$Tab[,4]), max(object@cluster$gap$Tab[,3] + object@cluster$gap$Tab[,4]))) 117 | } 118 | ) 119 | 120 | setGeneric("plotjaccard", function(object) standardGeneric("plotjaccard")) 121 | 122 | setMethod("plotjaccard", 123 | signature = "SCseq", 124 | definition = function(object){ 125 | if ( length(object@cluster$kpart) == 0 ) stop("run clustexp before plotjaccard") 126 | if ( length(unique(object@cluster$kpart)) < 2 ) stop("only a single cluster: no Jaccard's similarity plot") 127 | barplot(object@cluster$jaccard,names.arg=1:length(object@cluster$jaccard),ylab="Jaccard's similarity") 128 | } 129 | ) 130 | 131 | setGeneric("plotoutlierprobs", function(object) standardGeneric("plotoutlierprobs")) 132 | 133 | setMethod("plotoutlierprobs", 134 | signature = "SCseq", 135 | definition = function(object){ 136 | if ( length(object@cpart) == 0 ) stop("run findoutliers before plotoutlierprobs") 137 | p <- object@cluster$kpart[ order(object@cluster$kpart,decreasing=FALSE)] 138 | x <- object@out$cprobs[names(p)] 139 | fcol <- object@fcol 140 | for ( i in 1:max(p) ){ 141 | y <- -log10(x + 2.2e-16) 142 | y[p != i] <- 0 143 | if ( i == 1 ) b <- barplot(y,ylim=c(0,max(-log10(x + 2.2e-16))*1.1),col=fcol[i],border=fcol[i],names.arg=FALSE,ylab="-log10prob") else barplot(y,add=TRUE,col=fcol[i],border=fcol[i],names.arg=FALSE,axes=FALSE) 144 | } 145 | abline(-log10(object@outlierpar$probthr),0,col="black",lty=2) 146 | d <- b[2,1] - b[1,1] 147 | y <- 0 148 | for ( i in 1:max(p) ) y <- append(y,b[sum(p <=i),1] + d/2) 149 | axis(1,at=(y[1:(length(y)-1)] + y[-1])/2,lab=1:max(p)) 150 | box() 151 | } 152 | ) 153 | 154 | setGeneric("plotbackground", function(object) standardGeneric("plotbackground")) 155 | 156 | setMethod("plotbackground", 157 | signature = "SCseq", 158 | definition = function(object){ 159 | if ( length(object@cpart) == 0 ) stop("run findoutliers before plotbackground") 160 | m <- apply(object@fdata,1,mean) 161 | v <- apply(object@fdata,1,var) 162 | fit <- locfit(v~lp(m,nn=.7),family="gamma",maxk=500) 163 | plot(log2(m),log2(v),pch=20,xlab="log2mean",ylab="log2var",col="grey") 164 | lines(log2(m[order(m)]),log2(object@background$lvar(m[order(m)],object)),col="red",lwd=2) 165 | lines(log2(m[order(m)]),log2(fitted(fit)[order(m)]),col="orange",lwd=2,lty=2) 166 | legend("topleft",legend=substitute(paste("y = ",a,"*x^2 + ",b,"*x + ",d,sep=""),list(a=round(coef(object@background$vfit)[3],2),b=round(coef(object@background$vfit)[2],2),d=round(coef(object@background$vfit)[1],2))),bty="n") 167 | } 168 | ) 169 | 170 | setGeneric("plotsensitivity", function(object) standardGeneric("plotsensitivity")) 171 | 172 | setMethod("plotsensitivity", 173 | signature = "SCseq", 174 | definition = function(object){ 175 | if ( length(object@cpart) == 0 ) stop("run findoutliers before plotsensitivity") 176 | plot(log10(object@out$thr), object@out$stest, type="l",xlab="log10 Probability cutoff", ylab="Number of outliers") 177 | abline(v=log10(object@outlierpar$probthr),col="red",lty=2) 178 | } 179 | ) 180 | 181 | setGeneric("diffgenes", function(object,cl1,cl2,mincount=5) standardGeneric("diffgenes")) 182 | 183 | setMethod("diffgenes", 184 | signature = "SCseq", 185 | definition = function(object,cl1,cl2,mincount){ 186 | part <- object@cpart 187 | cl1 <- c(cl1) 188 | cl2 <- c(cl2) 189 | if ( length(part) == 0 ) stop("run findoutliers before diffgenes") 190 | if ( ! is.numeric(mincount) ) stop("mincount has to be a non-negative number") else if ( mincount < 0 ) stop("mincount has to be a non-negative number") 191 | if ( length(intersect(cl1, part)) < length(unique(cl1)) ) stop( paste("cl1 has to be a subset of ",paste(sort(unique(part)),collapse=","),"\n",sep="") ) 192 | if ( length(intersect(cl2, part)) < length(unique(cl2)) ) stop( paste("cl2 has to be a subset of ",paste(sort(unique(part)),collapse=","),"\n",sep="") ) 193 | f <- apply(object@ndata[,part %in% c(cl1,cl2)],1,max) > mincount 194 | x <- object@ndata[f,part %in% cl1] 195 | y <- object@ndata[f,part %in% cl2] 196 | if ( sum(part %in% cl1) == 1 ) m1 <- x else m1 <- apply(x,1,mean) 197 | if ( sum(part %in% cl2) == 1 ) m2 <- y else m2 <- apply(y,1,mean) 198 | if ( sum(part %in% cl1) == 1 ) s1 <- sqrt(x) else s1 <- sqrt(apply(x,1,var)) 199 | if ( sum(part %in% cl2) == 1 ) s2 <- sqrt(y) else s2 <- sqrt(apply(y,1,var)) 200 | 201 | d <- ( m1 - m2 )/ apply( cbind( s1, s2 ),1,mean ) 202 | names(d) <- rownames(object@ndata)[f] 203 | if ( sum(part %in% cl1) == 1 ){ 204 | names(x) <- names(d) 205 | x <- x[order(d,decreasing=TRUE)] 206 | }else{ 207 | x <- x[order(d,decreasing=TRUE),] 208 | } 209 | if ( sum(part %in% cl2) == 1 ){ 210 | names(y) <- names(d) 211 | y <- y[order(d,decreasing=TRUE)] 212 | }else{ 213 | y <- y[order(d,decreasing=TRUE),] 214 | } 215 | return(list(z=d[order(d,decreasing=TRUE)],cl1=x,cl2=y,cl1n=cl1,cl2n=cl2)) 216 | } 217 | ) 218 | 219 | plotdiffgenes <- function(z,gene=g){ 220 | if ( ! is.list(z) ) stop("first arguments needs to be output of function diffgenes") 221 | if ( length(z) < 5 ) stop("first arguments needs to be output of function diffgenes") 222 | if ( sum(names(z) == c("z","cl1","cl2","cl1n","cl2n")) < 5 ) stop("first arguments needs to be output of function diffgenes") 223 | if ( length(gene) > 1 ) stop("only single value allowed for argument gene") 224 | if ( !is.numeric(gene) & !(gene %in% names(z$z)) ) stop("argument gene needs to be within rownames of first argument or a positive integer number") 225 | if ( is.numeric(gene) ){ if ( gene < 0 | round(gene) != gene ) stop("argument gene needs to be within rownames of first argument or a positive integer number") } 226 | x <- if ( is.null(dim(z$cl1)) ) z$cl1[gene] else t(z$cl1[gene,]) 227 | y <- if ( is.null(dim(z$cl2)) ) z$cl2[gene] else t(z$cl2[gene,]) 228 | plot(1:length(c(x,y)),c(x,y),ylim=c(0,max(c(x,y))),xlab="",ylab="Expression",main=gene,cex=0,axes=FALSE) 229 | axis(2) 230 | box() 231 | u <- 1:length(x) 232 | rect(u - .5,0,u + .5,x,col="red") 233 | v <- c(min(u) - .5,max(u) + .5) 234 | axis(1,at=mean(v),lab=paste(z$cl1n,collapse=",")) 235 | lines(v,rep(mean(x),length(v))) 236 | lines(v,rep(mean(x)-sqrt(var(x)),length(v)),lty=2) 237 | lines(v,rep(mean(x)+sqrt(var(x)),length(v)),lty=2) 238 | 239 | u <- ( length(x) + 1 ):length(c(x,y)) 240 | v <- c(min(u) - .5,max(u) + .5) 241 | rect(u - .5,0,u + .5,y,col="blue") 242 | axis(1,at=mean(v),lab=paste(z$cl2n,collapse=",")) 243 | lines(v,rep(mean(y),length(v))) 244 | lines(v,rep(mean(y)-sqrt(var(y)),length(v)),lty=2) 245 | lines(v,rep(mean(y)+sqrt(var(y)),length(v)),lty=2) 246 | abline(v=length(x) + .5) 247 | } 248 | 249 | setGeneric("plottsne", function(object,final=TRUE) standardGeneric("plottsne")) 250 | 251 | setMethod("plottsne", 252 | signature = "SCseq", 253 | definition = function(object,final){ 254 | if ( length(object@tsne) == 0 ) stop("run comptsne before plottsne") 255 | if ( final & length(object@cpart) == 0 ) stop("run findoutliers before plottsne") 256 | if ( !final & length(object@cluster$kpart) == 0 ) stop("run clustexp before plottsne") 257 | part <- if ( final ) object@cpart else object@cluster$kpart 258 | plot(object@tsne,xlab="Dim 1",ylab="Dim 2",pch=20,cex=1.5,col="lightgrey") 259 | for ( i in 1:max(part) ){ 260 | if ( sum(part == i) > 0 ) text(object@tsne[part == i,1],object@tsne[part == i,2],i,col=object@fcol[i],cex=.75,font=4) 261 | } 262 | } 263 | ) 264 | 265 | setGeneric("plotlabelstsne", function(object,labels=NULL) standardGeneric("plotlabelstsne")) 266 | 267 | setMethod("plotlabelstsne", 268 | signature = "SCseq", 269 | definition = function(object,labels){ 270 | if ( is.null(labels ) ) labels <- names(object@ndata) 271 | if ( length(object@tsne) == 0 ) stop("run comptsne before plotlabelstsne") 272 | plot(object@tsne,xlab="Dim 1",ylab="Dim 2",pch=20,cex=1.5,col="lightgrey") 273 | text(object@tsne[,1],object@tsne[,2],labels,cex=.5) 274 | } 275 | ) 276 | 277 | setGeneric("plotsymbolstsne", function(object,types=NULL) standardGeneric("plotsymbolstsne")) 278 | 279 | setMethod("plotsymbolstsne", 280 | signature = "SCseq", 281 | definition = function(object,types){ 282 | if ( is.null(types) ) types <- names(object@fdata) 283 | if ( length(object@tsne) == 0 ) stop("run comptsne before plotsymbolstsne") 284 | if ( length(types) != ncol(object@fdata) ) stop("types argument has wrong length. Length has to equal to the column number of object@ndata") 285 | coloc <- rainbow(length(unique(types))) 286 | syms <- c() 287 | plot(object@tsne,xlab="Dim 1",ylab="Dim 2",pch=20,col="grey") 288 | for ( i in 1:length(unique(types)) ){ 289 | f <- types == sort(unique(types))[i] 290 | syms <- append( syms, ( (i-1) %% 25 ) + 1 ) 291 | points(object@tsne[f,1],object@tsne[f,2],col=coloc[i],pch=( (i-1) %% 25 ) + 1,cex=1) 292 | } 293 | legend("topleft", legend=sort(unique(types)), col=coloc, pch=syms) 294 | } 295 | ) 296 | 297 | setGeneric("plotexptsne", function(object,g,n="",logsc=FALSE) standardGeneric("plotexptsne")) 298 | 299 | setMethod("plotexptsne", 300 | signature = "SCseq", 301 | definition = function(object,g,n="",logsc=FALSE){ 302 | if ( length(object@tsne) == 0 ) stop("run comptsne before plottsne") 303 | if ( length(intersect(g,rownames(object@ndata))) < length(unique(g)) ) stop("second argument does not correspond to set of rownames slot ndata of SCseq object") 304 | if ( !is.numeric(logsc) & !is.logical(logsc) ) stop("argument logsc has to be logical (TRUE/FALSE)") 305 | if ( n == "" ) n <- g[1] 306 | l <- apply(object@ndata[g,] - .1,2,sum) + .1 307 | if (logsc) { 308 | f <- l == 0 309 | l <- log2(l) 310 | l[f] <- NA 311 | } 312 | mi <- min(l,na.rm=TRUE) 313 | ma <- max(l,na.rm=TRUE) 314 | ColorRamp <- colorRampPalette(rev(brewer.pal(n = 7,name = "RdYlBu")))(100) 315 | ColorLevels <- seq(mi, ma, length=length(ColorRamp)) 316 | v <- round((l - mi)/(ma - mi)*99 + 1,0) 317 | layout(matrix(data=c(1,3,2,4), nrow=2, ncol=2), widths=c(5,1,5,1), heights=c(5,1,1,1)) 318 | par(mar = c(3,5,2.5,2)) 319 | plot(object@tsne,xlab="Dim 1",ylab="Dim 2",main=n,pch=20,cex=0,col="grey") 320 | for ( k in 1:length(v) ){ 321 | points(object@tsne[k,1],object@tsne[k,2],col=ColorRamp[v[k]],pch=20,cex=1.5) 322 | } 323 | par(mar = c(3,2.5,2.5,2)) 324 | image(1, ColorLevels, 325 | matrix(data=ColorLevels, ncol=length(ColorLevels),nrow=1), 326 | col=ColorRamp, 327 | xlab="",ylab="", 328 | xaxt="n") 329 | layout(1) 330 | } 331 | ) 332 | 333 | 334 | plot.err.bars.y <- function(x, y, y.err, col="black", lwd=1, lty=1, h=0.1){ 335 | arrows(x,y-y.err,x,y+y.err,code=0, col=col, lwd=lwd, lty=lty) 336 | arrows(x-h,y-y.err,x+h,y-y.err,code=0, col=col, lwd=lwd, lty=lty) 337 | arrows(x-h,y+y.err,x+h,y+y.err,code=0, col=col, lwd=lwd, lty=lty) 338 | } 339 | 340 | clusGapExt <-function (x, FUNcluster, K.max, B = 100, verbose = interactive(), method="euclidean",random=TRUE, 341 | ...) 342 | { 343 | stopifnot(is.function(FUNcluster), length(dim(x)) == 2, K.max >= 344 | 2, (n <- nrow(x)) >= 1, (p <- ncol(x)) >= 1) 345 | if (B != (B. <- as.integer(B)) || (B <- B.) <= 0) 346 | stop("'B' has to be a positive integer") 347 | if (is.data.frame(x)) 348 | x <- as.matrix(x) 349 | ii <- seq_len(n) 350 | W.k <- function(X, kk) { 351 | clus <- if (kk > 1) 352 | FUNcluster(X, kk, ...)$cluster 353 | else rep.int(1L, nrow(X)) 354 | 0.5 * sum(vapply(split(ii, clus), function(I) { 355 | xs <- X[I, , drop = FALSE] 356 | sum(dist.gen(xs,method=method)/nrow(xs)) 357 | }, 0)) 358 | } 359 | logW <- E.logW <- SE.sim <- numeric(K.max) 360 | if (verbose) 361 | cat("Clustering k = 1,2,..., K.max (= ", K.max, "): .. ", 362 | sep = "") 363 | for (k in 1:K.max) logW[k] <- log(W.k(x, k)) 364 | if (verbose) 365 | cat("done\n") 366 | xs <- scale(x, center = TRUE, scale = FALSE) 367 | m.x <- rep(attr(xs, "scaled:center"), each = n) 368 | V.sx <- svd(xs)$v 369 | rng.x1 <- apply(xs %*% V.sx, 2, range) 370 | logWks <- matrix(0, B, K.max) 371 | if (random){ 372 | if (verbose) 373 | cat("Bootstrapping, b = 1,2,..., B (= ", B, ") [one \".\" per sample]:\n", 374 | sep = "") 375 | for (b in 1:B) { 376 | z1 <- apply(rng.x1, 2, function(M, nn) runif(nn, min = M[1], 377 | max = M[2]), nn = n) 378 | z <- tcrossprod(z1, V.sx) + m.x 379 | ##z <- apply(x,2,function(m) runif(length(m),min=min(m),max=max(m))) 380 | ##z <- apply(x,2,function(m) sample(m)) 381 | for (k in 1:K.max) { 382 | logWks[b, k] <- log(W.k(z, k)) 383 | } 384 | if (verbose) 385 | cat(".", if (b%%50 == 0) 386 | paste(b, "\n")) 387 | } 388 | if (verbose && (B%%50 != 0)) 389 | cat("", B, "\n") 390 | E.logW <- colMeans(logWks) 391 | SE.sim <- sqrt((1 + 1/B) * apply(logWks, 2, var)) 392 | }else{ 393 | E.logW <- rep(NA,K.max) 394 | SE.sim <- rep(NA,K.max) 395 | } 396 | structure(class = "clusGap", list(Tab = cbind(logW, E.logW, 397 | gap = E.logW - logW, SE.sim), n = n, B = B, FUNcluster = FUNcluster)) 398 | } 399 | 400 | clustfun <- function(x,clustnr=20,bootnr=50,metric="pearson",do.gap=FALSE,sat=TRUE,SE.method="Tibs2001SEmax",SE.factor=.25,B.gap=50,cln=0,rseed=17000,FUNcluster="kmedoids",distances=NULL,link="single") 401 | { 402 | if ( clustnr < 2) stop("Choose clustnr > 1") 403 | di <- if ( FUNcluster == "kmedoids" ) t(x) else dist.gen(t(x),method=metric) 404 | if ( nrow(di) - 1 < clustnr ) clustnr <- nrow(di) - 1 405 | if ( do.gap | sat | cln > 0 ){ 406 | gpr <- NULL 407 | f <- if ( cln == 0 ) TRUE else FALSE 408 | if ( do.gap ){ 409 | set.seed(rseed) 410 | if ( FUNcluster == "kmeans" ) gpr <- clusGapExt(as.matrix(di), FUN = kmeans, K.max = clustnr, B = B.gap, iter.max=100) 411 | if ( FUNcluster == "kmedoids" ) gpr <- clusGapExt(as.matrix(di), FUN = function(x,k) pam(dist.gen(x,method=metric),k), K.max = clustnr, B = B.gap, method=metric) 412 | if ( FUNcluster == "hclust" ) gpr <- clusGapExt(as.matrix(di), FUN = function(x,k){ y <- hclusterCBI(x,k,link=link,scaling=FALSE); y$cluster <- y$partition; y }, K.max = clustnr, B = B.gap) 413 | if ( f ) cln <- maxSE(gpr$Tab[,3],gpr$Tab[,4],method=SE.method,SE.factor) 414 | } 415 | if ( sat ){ 416 | if ( ! do.gap ){ 417 | if ( FUNcluster == "kmeans" ) gpr <- clusGapExt(as.matrix(di), FUN = kmeans, K.max = clustnr, B = B.gap, iter.max=100, random=FALSE) 418 | if ( FUNcluster == "kmedoids" ) gpr <- clusGapExt(as.matrix(di), FUN = function(x,k) pam(dist.gen(x,method=metric),k), K.max = clustnr, B = B.gap, random=FALSE, method=metric) 419 | if ( FUNcluster == "hclust" ) gpr <- clusGapExt(as.matrix(di), FUN = function(x,k){ y <- hclusterCBI(x,k,link=link,scaling=FALSE); y$cluster <- y$partition; y }, K.max = clustnr, B = B.gap, random=FALSE) 420 | } 421 | g <- gpr$Tab[,1] 422 | y <- g[-length(g)] - g[-1] 423 | mm <- numeric(length(y)) 424 | nn <- numeric(length(y)) 425 | for ( i in 1:length(y)){ 426 | mm[i] <- mean(y[i:length(y)]) 427 | nn[i] <- sqrt(var(y[i:length(y)])) 428 | } 429 | if ( f ) cln <- max(min(which( y - (mm + nn) < 0 )),1) 430 | } 431 | if ( cln <= 1 ) { 432 | clb <- list(result=list(partition=rep(1,dim(x)[2])),bootmean=1) 433 | names(clb$result$partition) <- names(x) 434 | return(list(x=x,clb=clb,gpr=gpr,di=if ( FUNcluster == "kmedoids" ) dist.gen(di,method=metric) else di)) 435 | } 436 | if ( FUNcluster == "kmeans" ) clb <- clusterboot(di,B=bootnr,distances=FALSE,bootmethod="boot",clustermethod=kmeansCBI,krange=cln,scaling=FALSE,multipleboot=FALSE,bscompare=TRUE,seed=rseed) 437 | if ( FUNcluster == "kmedoids" ) clb <- clusterboot(dist.gen(di,method=metric),B=bootnr,bootmethod="boot",clustermethod=pamkCBI,k=cln,multipleboot=FALSE,bscompare=TRUE,seed=rseed) 438 | if ( FUNcluster == "hclust" ) clb <- clusterboot(di,B=bootnr,distances=FALSE,bootmethod="boot",clustermethod=hclusterCBI,k=cln,link=link,scaling=FALSE,multipleboot=FALSE,bscompare=TRUE,seed=rseed) 439 | return(list(x=x,clb=clb,gpr=gpr,di=if ( FUNcluster == "kmedoids" ) dist.gen(di,method=metric) else di)) 440 | } 441 | } 442 | 443 | setGeneric("clustexp", function(object,clustnr=20,bootnr=50,metric="pearson",do.gap=FALSE,sat=TRUE,SE.method="Tibs2001SEmax",SE.factor=.25,B.gap=50,cln=0,rseed=17000,FUNcluster="kmedoids") standardGeneric("clustexp")) 444 | 445 | setMethod("clustexp", 446 | signature = "SCseq", 447 | definition = function(object,clustnr,bootnr,metric,do.gap,sat,SE.method,SE.factor,B.gap,cln,rseed,FUNcluster) { 448 | if ( ! is.numeric(clustnr) ) stop("clustnr has to be a positive integer") else if ( round(clustnr) != clustnr | clustnr <= 0 ) stop("clustnr has to be a positive integer") 449 | if ( ! is.numeric(bootnr) ) stop("bootnr has to be a positive integer") else if ( round(bootnr) != bootnr | bootnr <= 0 ) stop("bootnr has to be a positive integer") 450 | if ( ! ( metric %in% c( "spearman","pearson","kendall","euclidean","maximum","manhattan","canberra","binary","minkowski") ) ) stop("metric has to be one of the following: spearman, pearson, kendall, euclidean, maximum, manhattan, canberra, binary, minkowski") 451 | if ( ! ( SE.method %in% c( "firstSEmax","Tibs2001SEmax","globalSEmax","firstmax","globalmax") ) ) stop("SE.method has to be one of the following: firstSEmax, Tibs2001SEmax, globalSEmax, firstmax, globalmax") 452 | if ( ! is.numeric(SE.factor) ) stop("SE.factor has to be a non-negative integer") else if ( SE.factor < 0 ) stop("SE.factor has to be a non-negative integer") 453 | if ( ! ( is.numeric(do.gap) | is.logical(do.gap) ) ) stop( "do.gap has to be logical (TRUE/FALSE)" ) 454 | if ( ! ( is.numeric(sat) | is.logical(sat) ) ) stop( "sat has to be logical (TRUE/FALSE)" ) 455 | if ( ! is.numeric(B.gap) ) stop("B.gap has to be a positive integer") else if ( round(B.gap) != B.gap | B.gap <= 0 ) stop("B.gap has to be a positive integer") 456 | if ( ! is.numeric(cln) ) stop("cln has to be a non-negative integer") else if ( round(cln) != cln | cln < 0 ) stop("cln has to be a non-negative integer") 457 | if ( ! is.numeric(rseed) ) stop("rseed has to be numeric") 458 | if ( !do.gap & !sat & cln == 0 ) stop("cln has to be a positive integer or either do.gap or sat has to be TRUE") 459 | if ( ! ( FUNcluster %in% c("kmeans","hclust","kmedoids") ) ) stop("FUNcluster has to be one of the following: kmeans, hclust,kmedoids") 460 | object@clusterpar <- list(clustnr=clustnr,bootnr=bootnr,metric=metric,do.gap=do.gap,sat=sat,SE.method=SE.method,SE.factor=SE.factor,B.gap=B.gap,cln=cln,rseed=rseed,FUNcluster=FUNcluster) 461 | y <- clustfun(object@fdata,clustnr,bootnr,metric,do.gap,sat,SE.method,SE.factor,B.gap,cln,rseed,FUNcluster) 462 | object@cluster <- list(kpart=y$clb$result$partition, jaccard=y$clb$bootmean, gap=y$gpr, clb=y$clb) 463 | object@distances <- as.matrix( y$di ) 464 | set.seed(111111) 465 | object@fcol <- sample(rainbow(max(y$clb$result$partition))) 466 | return(object) 467 | } 468 | ) 469 | 470 | setGeneric("findoutliers", function(object,outminc=5,outlg=2,probthr=1e-3,thr=2**-(1:40),outdistquant=.95) standardGeneric("findoutliers")) 471 | 472 | setMethod("findoutliers", 473 | signature = "SCseq", 474 | definition = function(object,outminc,outlg,probthr,thr,outdistquant) { 475 | if ( length(object@cluster$kpart) == 0 ) stop("run clustexp before findoutliers") 476 | if ( ! is.numeric(outminc) ) stop("outminc has to be a non-negative integer") else if ( round(outminc) != outminc | outminc < 0 ) stop("outminc has to be a non-negative integer") 477 | if ( ! is.numeric(outlg) ) stop("outlg has to be a non-negative integer") else if ( round(outlg) != outlg | outlg < 0 ) stop("outlg has to be a non-negative integer") 478 | if ( ! is.numeric(probthr) ) stop("probthr has to be a number between 0 and 1") else if ( probthr < 0 | probthr > 1 ) stop("probthr has to be a number between 0 and 1") 479 | if ( ! is.numeric(thr) ) stop("thr hast to be a vector of numbers between 0 and 1") else if ( min(thr) < 0 | max(thr) > 1 ) stop("thr hast to be a vector of numbers between 0 and 1") 480 | if ( ! is.numeric(outdistquant) ) stop("outdistquant has to be a number between 0 and 1") else if ( outdistquant < 0 | outdistquant > 1 ) stop("outdistquant has to be a number between 0 and 1") 481 | 482 | object@outlierpar <- list( outminc=outminc,outlg=outlg,probthr=probthr,thr=thr,outdistquant=outdistquant ) 483 | ### calibrate background model 484 | m <- log2(apply(object@fdata,1,mean)) 485 | v <- log2(apply(object@fdata,1,var)) 486 | f <- m > -Inf & v > -Inf 487 | m <- m[f] 488 | v <- v[f] 489 | mm <- -8 490 | repeat{ 491 | fit <- lm(v ~ m + I(m^2)) 492 | if( coef(fit)[3] >= 0 | mm >= -1){ 493 | break 494 | } 495 | mm <- mm + .5 496 | f <- m > mm 497 | m <- m[f] 498 | v <- v[f] 499 | } 500 | object@background <- list() 501 | object@background$vfit <- fit 502 | object@background$lvar <- function(x,object) 2**(coef(object@background$vfit)[1] + log2(x)*coef(object@background$vfit)[2] + coef(object@background$vfit)[3] * log2(x)**2) 503 | object@background$lsize <- function(x,object) x**2/(max(x + 1e-6,object@background$lvar(x,object)) - x) 504 | 505 | ### identify outliers 506 | out <- c() 507 | stest <- rep(0,length(thr)) 508 | cprobs <- c() 509 | for ( n in 1:max(object@cluster$kpart) ){ 510 | if ( sum(object@cluster$kpart == n) == 1 ){ cprobs <- append(cprobs,.5); names(cprobs)[length(cprobs)] <- names(object@cluster$kpart)[object@cluster$kpart == n]; next } 511 | x <- object@fdata[,object@cluster$kpart == n] 512 | x <- x[apply(x,1,max) > outminc,] 513 | z <- t( apply(x,1,function(x){ apply( cbind( pnbinom(round(x,0),mu=mean(x),size=object@background$lsize(mean(x),object)) , 1 - pnbinom(round(x,0),mu=mean(x),size=object@background$lsize(mean(x),object)) ),1, min) } ) ) 514 | cp <- apply(z,2,function(x){ y <- p.adjust(x,method="BH"); y <- y[order(y,decreasing=FALSE)]; return(y[outlg]);}) 515 | f <- cp < probthr 516 | cprobs <- append(cprobs,cp) 517 | if ( sum(f) > 0 ) out <- append(out,names(x)[f]) 518 | for ( j in 1:length(thr) ) stest[j] <- stest[j] + sum( cp < thr[j] ) 519 | } 520 | object@out <-list(out=out,stest=stest,thr=thr,cprobs=cprobs) 521 | 522 | ### cluster outliers 523 | clp2p.cl <- c() 524 | cols <- names(object@fdata) 525 | cpart <- object@cluster$kpart 526 | di <- as.data.frame(object@distances) 527 | for ( i in 1:max(cpart) ) { 528 | tcol <- cols[cpart == i] 529 | if ( sum(!(tcol %in% out)) > 1 ) clp2p.cl <- append(clp2p.cl,as.vector(t(di[tcol[!(tcol %in% out)],tcol[!(tcol %in% out)]]))) 530 | } 531 | clp2p.cl <- clp2p.cl[clp2p.cl>0] 532 | 533 | cadd <- list() 534 | if ( length(out) > 0 ){ 535 | if (length(out) == 1){ 536 | cadd <- list(out) 537 | }else{ 538 | n <- out 539 | m <- as.data.frame(di[out,out]) 540 | 541 | for ( i in 1:length(out) ){ 542 | if ( length(n) > 1 ){ 543 | o <- order(apply(cbind(m,1:dim(m)[1]),1,function(x) min(x[1:(length(x)-1)][-x[length(x)]])),decreasing=FALSE) 544 | m <- m[o,o] 545 | n <- n[o] 546 | f <- m[,1] < quantile(clp2p.cl,outdistquant) | m[,1] == min(clp2p.cl) 547 | ind <- 1 548 | if ( sum(f) > 1 ) for ( j in 2:sum(f) ) if ( apply(m[f,f][j,c(ind,j)] > quantile(clp2p.cl,outdistquant) ,1,sum) == 0 ) ind <- append(ind,j) 549 | cadd[[i]] <- n[f][ind] 550 | g <- ! n %in% n[f][ind] 551 | n <- n[g] 552 | m <- m[g,g] 553 | if ( sum(g) == 0 ) break 554 | 555 | }else if (length(n) == 1){ 556 | cadd[[i]] <- n 557 | break 558 | } 559 | } 560 | } 561 | 562 | for ( i in 1:length(cadd) ){ 563 | cpart[cols %in% cadd[[i]]] <- max(cpart) + 1 564 | } 565 | } 566 | 567 | ### determine final clusters 568 | for ( i in 1:max(cpart) ){ 569 | if ( sum(cpart == i) == 0 ) next 570 | f <- cols[cpart == i] 571 | d <- object@fdata 572 | if ( length(f) == 1 ){ 573 | cent <- d[,f] 574 | }else{ 575 | if ( object@clusterpar$FUNcluster == "kmedoids" ){ 576 | x <- apply(as.matrix(dist.gen(t(d[,f]),method=object@clusterpar$metric)),2,mean) 577 | cent <- d[,f[which(x == min(x))[1]]] 578 | }else{ 579 | cent <- apply(d[,f],1,mean) 580 | } 581 | } 582 | if ( i == 1 ) dcent <- data.frame(cent) else dcent <- cbind(dcent,cent) 583 | if ( i == 1 ) tmp <- data.frame(apply(d,2,dist.gen.pairs,y=cent,method=object@clusterpar$metric)) else tmp <- cbind(tmp,apply(d,2,dist.gen.pairs,y=cent,method=object@clusterpar$metric)) 584 | } 585 | cpart <- apply(tmp,1,function(x) order(x,decreasing=FALSE)[1]) 586 | 587 | for ( i in max(cpart):1){if (sum(cpart==i)==0) cpart[cpart>i] <- cpart[cpart>i] - 1 } 588 | 589 | object@cpart <- cpart 590 | 591 | set.seed(111111) 592 | object@fcol <- sample(rainbow(max(cpart))) 593 | return(object) 594 | } 595 | ) 596 | 597 | 598 | setGeneric("comptsne", function(object,rseed=15555,sammonmap=FALSE,initial_cmd=TRUE,...) standardGeneric("comptsne")) 599 | 600 | setMethod("comptsne", 601 | signature = "SCseq", 602 | definition = function(object,rseed,sammonmap,...){ 603 | if ( length(object@cluster$kpart) == 0 ) stop("run clustexp before comptsne") 604 | set.seed(rseed) 605 | di <- if ( object@clusterpar$FUNcluster == "kmedoids") as.dist(object@distances) else dist.gen(as.matrix(object@distances)) 606 | if ( sammonmap ){ 607 | object@tsne <- as.data.frame(sammon(di,k=2)$points) 608 | }else{ 609 | ts <- if ( initial_cmd ) tsne(di,initial_config=cmdscale(di,k=2),...) else tsne(di,k=2,...) 610 | object@tsne <- as.data.frame(ts) 611 | } 612 | return(object) 613 | } 614 | ) 615 | 616 | setGeneric("clustdiffgenes", function(object,pvalue=.01) standardGeneric("clustdiffgenes")) 617 | 618 | setMethod("clustdiffgenes", 619 | signature = "SCseq", 620 | definition = function(object,pvalue){ 621 | if ( length(object@cpart) == 0 ) stop("run findoutliers before clustdiffgenes") 622 | if ( ! is.numeric(pvalue) ) stop("pvalue has to be a number between 0 and 1") else if ( pvalue < 0 | pvalue > 1 ) stop("pvalue has to be a number between 0 and 1") 623 | cdiff <- list() 624 | x <- object@ndata 625 | y <- object@expdata[,names(object@ndata)] 626 | part <- object@cpart 627 | for ( i in 1:max(part) ){ 628 | if ( sum(part == i) == 0 ) next 629 | m <- if ( sum(part != i) > 1 ) apply(x[,part != i],1,mean) else x[,part != i] 630 | n <- if ( sum(part == i) > 1 ) apply(x[,part == i],1,mean) else x[,part == i] 631 | no <- if ( sum(part == i) > 1 ) median(apply(y[,part == i],2,sum))/median(apply(x[,part == i],2,sum)) else sum(y[,part == i])/sum(x[,part == i]) 632 | m <- m*no 633 | n <- n*no 634 | pv <- binompval(m/sum(m),sum(n),n) 635 | d <- data.frame(mean.ncl=m,mean.cl=n,fc=n/m,pv=pv)[order(pv,decreasing=FALSE),] 636 | cdiff[[paste("cl",i,sep=".")]] <- d[d$pv < pvalue,] 637 | } 638 | return(cdiff) 639 | } 640 | ) 641 | 642 | setGeneric("plotsaturation", function(object,disp=FALSE) standardGeneric("plotsaturation")) 643 | 644 | setMethod("plotsaturation", 645 | signature = "SCseq", 646 | definition = function(object,disp){ 647 | if ( length(object@cluster$gap) == 0 ) stop("run clustexp before plotsaturation") 648 | g <- object@cluster$gap$Tab[,1] 649 | y <- g[-length(g)] - g[-1] 650 | mm <- numeric(length(y)) 651 | nn <- numeric(length(y)) 652 | for ( i in 1:length(y)){ 653 | mm[i] <- mean(y[i:length(y)]) 654 | nn[i] <- sqrt(var(y[i:length(y)])) 655 | } 656 | cln <- max(min(which( y - (mm + nn) < 0 )),1) 657 | x <- 1:length(y) 658 | if (disp){ 659 | x <- 1:length(g) 660 | plot(x,g,pch=20,col="grey",xlab="k",ylab="log within cluster dispersion") 661 | f <- x == cln 662 | points(x[f],g[f],col="blue") 663 | }else{ 664 | plot(x,y,pch=20,col="grey",xlab="k",ylab="Change in log within cluster dispersion") 665 | points(x,mm,col="red",pch=20) 666 | plot.err.bars.y(x,mm,nn,col="red") 667 | points(x,y,col="grey",pch=20) 668 | f <- x == cln 669 | points(x[f],y[f],col="blue") 670 | } 671 | } 672 | ) 673 | 674 | setGeneric("plotsilhouette", function(object) standardGeneric("plotsilhouette")) 675 | 676 | setMethod("plotsilhouette", 677 | signature = "SCseq", 678 | definition = function(object){ 679 | if ( length(object@cluster$kpart) == 0 ) stop("run clustexp before plotsilhouette") 680 | if ( length(unique(object@cluster$kpart)) < 2 ) stop("only a single cluster: no silhouette plot") 681 | kpart <- object@cluster$kpart 682 | distances <- if ( object@clusterpar$FUNcluster == "kmedoids" ) as.dist(object@distances) else dist.gen(object@distances) 683 | si <- silhouette(kpart,distances) 684 | plot(si) 685 | } 686 | ) 687 | 688 | compmedoids <- function(x,part,metric="pearson"){ 689 | m <- c() 690 | for ( i in sort(unique(part)) ){ 691 | f <- names(x)[part == i] 692 | if ( length(f) == 1 ){ 693 | m <- append(m,f) 694 | }else{ 695 | y <- apply(as.matrix(dist.gen(t(x[,f]),method=metric)),2,mean) 696 | m <- append(m,f[which(y == min(y))[1]]) 697 | } 698 | } 699 | m 700 | } 701 | 702 | setGeneric("clustheatmap", function(object,final=FALSE,hmethod="single") standardGeneric("clustheatmap")) 703 | 704 | setMethod("clustheatmap", 705 | signature = "SCseq", 706 | definition = function(object,final,hmethod){ 707 | if ( final & length(object@cpart) == 0 ) stop("run findoutliers before clustheatmap") 708 | if ( !final & length(object@cluster$kpart) == 0 ) stop("run clustexp before clustheatmap") 709 | x <- object@fdata 710 | part <- if ( final ) object@cpart else object@cluster$kpart 711 | na <- c() 712 | j <- 0 713 | for ( i in 1:max(part) ){ 714 | if ( sum(part == i) == 0 ) next 715 | j <- j + 1 716 | na <- append(na,i) 717 | d <- x[,part == i] 718 | if ( sum(part == i) == 1 ) cent <- d else cent <- apply(d,1,mean) 719 | if ( j == 1 ) tmp <- data.frame(cent) else tmp <- cbind(tmp,cent) 720 | } 721 | names(tmp) <- paste("cl",na,sep=".") 722 | ld <- if ( object@clusterpar$FUNcluster == "kmedoids" ) dist.gen(t(tmp),method=object@clusterpar$metric) else dist.gen(as.matrix(dist.gen(t(tmp),method=object@clusterpar$metric))) 723 | if ( max(part) > 1 ) cclmo <- hclust(ld,method=hmethod)$order else cclmo <- 1 724 | q <- part 725 | for ( i in 1:max(part) ){ 726 | q[part == na[cclmo[i]]] <- i 727 | } 728 | part <- q 729 | di <- if ( object@clusterpar$FUNcluster == "kmedoids" ) object@distances else as.data.frame( as.matrix( dist.gen(t(object@distances)) ) ) 730 | pto <- part[order(part,decreasing=FALSE)] 731 | ptn <- c() 732 | for ( i in 1:max(pto) ){ pt <- names(pto)[pto == i]; z <- if ( length(pt) == 1 ) pt else pt[hclust(as.dist(t(di[pt,pt])),method=hmethod)$order]; ptn <- append(ptn,z) } 733 | col <- object@fcol 734 | mi <- min(di,na.rm=TRUE) 735 | ma <- max(di,na.rm=TRUE) 736 | layout(matrix(data=c(1,3,2,4), nrow=2, ncol=2), widths=c(5,1,5,1), heights=c(5,1,1,1)) 737 | ColorRamp <- colorRampPalette(brewer.pal(n = 7,name = "RdYlBu"))(100) 738 | ColorLevels <- seq(mi, ma, length=length(ColorRamp)) 739 | if ( mi == ma ){ 740 | ColorLevels <- seq(0.99*mi, 1.01*ma, length=length(ColorRamp)) 741 | } 742 | par(mar = c(3,5,2.5,2)) 743 | image(as.matrix(di[ptn,ptn]),col=ColorRamp,axes=FALSE) 744 | abline(0,1) 745 | box() 746 | 747 | tmp <- c() 748 | for ( u in 1:max(part) ){ 749 | ol <- (0:(length(part) - 1)/(length(part) - 1))[ptn %in% names(x)[part == u]] 750 | points(rep(0,length(ol)),ol,col=col[cclmo[u]],pch=15,cex=.75) 751 | points(ol,rep(0,length(ol)),col=col[cclmo[u]],pch=15,cex=.75) 752 | tmp <- append(tmp,mean(ol)) 753 | } 754 | axis(1,at=tmp,lab=cclmo) 755 | axis(2,at=tmp,lab=cclmo) 756 | par(mar = c(3,2.5,2.5,2)) 757 | image(1, ColorLevels, 758 | matrix(data=ColorLevels, ncol=length(ColorLevels),nrow=1), 759 | col=ColorRamp, 760 | xlab="",ylab="", 761 | xaxt="n") 762 | layout(1) 763 | return(cclmo) 764 | } 765 | ) 766 | 767 | 768 | 769 | 770 | 771 | 772 | ## class definition 773 | Ltree <- setClass("Ltree", slots = c(sc = "SCseq", ldata = "list", entropy = "vector", trproj = "list", par = "list", prback = "data.frame", prbacka = "data.frame", ltcoord = "matrix", prtree = "list", sigcell = "vector", cdata = "list" )) 774 | 775 | setValidity("Ltree", 776 | function(object) { 777 | msg <- NULL 778 | if ( class(object@sc)[1] != "SCseq" ){ 779 | msg <- c(msg, "input data must be of class SCseq") 780 | } 781 | if (is.null(msg)) TRUE 782 | else msg 783 | } 784 | ) 785 | 786 | setMethod("initialize", 787 | signature = "Ltree", 788 | definition = function(.Object, sc ){ 789 | .Object@sc <- sc 790 | validObject(.Object) 791 | return(.Object) 792 | } 793 | ) 794 | 795 | setGeneric("compentropy", function(object) standardGeneric("compentropy")) 796 | 797 | setMethod("compentropy", 798 | signature = "Ltree", 799 | definition = function(object){ 800 | probs <- t(t(object@sc@ndata)/apply(object@sc@ndata,2,sum)) 801 | object@entropy <- -apply(probs*log(probs)/log(nrow(object@sc@ndata)),2,sum) 802 | return(object) 803 | } 804 | ) 805 | 806 | 807 | compproj <- function(pdiloc,lploc,cnloc,mloc,d=NULL){ 808 | pd <- data.frame(pdiloc) 809 | k <- paste("X",sort(rep(1:nrow(pdiloc),length(mloc))),sep="") 810 | pd$k <- paste("X",1:nrow(pdiloc),sep="") 811 | pd <- merge(data.frame(k=k),pd,by="k") 812 | 813 | if ( is.null(d) ){ 814 | cnv <- t(matrix(rep(t(cnloc),nrow(pdiloc)),nrow=ncol(pdiloc))) 815 | pdcl <- paste("X",lploc[as.numeric(sub("X","",pd$k))],sep="") 816 | rownames(cnloc) <- paste("X",mloc,sep="") 817 | pdcn <- cnloc[pdcl,] 818 | v <- cnv - pdcn 819 | }else{ 820 | v <- d$v 821 | pdcn <- d$pdcn 822 | } 823 | w <- pd[,names(pd) != "k"] - pdcn 824 | 825 | h <- apply(cbind(v,w),1,function(x){ 826 | x1 <- x[1:(length(x)/2)]; 827 | x2 <- x[(length(x)/2 + 1):length(x)]; 828 | x1s <- sqrt(sum(x1**2)); x2s <- sqrt(sum(x2**2)); y <- sum(x1*x2)/x1s/x2s; return( if (x1s == 0 | x2s == 0 ) NA else y ) }) 829 | 830 | rma <- as.data.frame(matrix(h,ncol=nrow(pdiloc))) 831 | names(rma) <- unique(pd$k) 832 | pdclu <- lploc[as.numeric(sub("X","",names(rma)))] 833 | pdclp <- apply(t(rma),1,function(x) if (sum(!is.na(x)) == 0 ) NA else mloc[which(abs(x) == max(abs(x),na.rm=TRUE))[1]]) 834 | pdclh <- apply(t(rma),1,function(x) if (sum(!is.na(x)) == 0 ) NA else x[which(abs(x) == max(abs(x),na.rm=TRUE))[1]]) 835 | pdcln <- names(lploc)[as.numeric(sub("X","",names(rma)))] 836 | names(rma) <- pdcln 837 | rownames(rma) <- paste("X",mloc,sep="") 838 | res <- data.frame(o=pdclu,l=pdclp,h=pdclh) 839 | rownames(res) <- pdcln 840 | return(list(res=res[names(lploc),],rma=as.data.frame(t(rma[,names(lploc)])),d=list(v=v,pdcn=pdcn))) 841 | } 842 | 843 | pdishuffle <- function(pdi,lp,cn,m,all=FALSE){ 844 | if ( all ){ 845 | d <- as.data.frame(pdi) 846 | y <- t(apply(pdi,1,function(x) runif(length(x),min=min(x),max=max(x)))) 847 | names(y) <- names(d) 848 | rownames(y) <- rownames(d) 849 | return(y) 850 | }else{ 851 | fl <- TRUE 852 | for ( i in unique(lp)){ 853 | if ( sum(lp == i) > 1 ){ 854 | y <-data.frame( t(apply(as.data.frame(pdi[,lp == i]),1,sample)) ) 855 | }else{ 856 | y <- as.data.frame(pdi[,lp == i]) 857 | } 858 | names(y) <- names(lp)[lp == i] 859 | rownames(y) <- names(lp) 860 | z <- if (fl) y else cbind(z,y) 861 | fl <- FALSE 862 | } 863 | z <- t(z[,names(lp)]) 864 | return(z) 865 | } 866 | } 867 | 868 | setGeneric("projcells", function(object,cthr=0,nmode=FALSE) standardGeneric("projcells")) 869 | 870 | setMethod("projcells", 871 | signature = "Ltree", 872 | definition = function(object,cthr,nmode){ 873 | if ( ! is.numeric(cthr) ) stop( "cthr has to be a non-negative number" ) else if ( cthr < 0 ) stop( "cthr has to be a non-negative number" ) 874 | if ( ! length(object@sc@cpart == 0) ) stop( "please run findoutliers on the SCseq input object before initializing Ltree" ) 875 | if ( !is.numeric(nmode) & !is.logical(nmode) ) stop("argument nmode has to be logical (TRUE/FALSE)") 876 | 877 | object@par$cthr <- cthr 878 | object@par$nmode <- nmode 879 | 880 | lp <- object@sc@cpart 881 | ld <- object@sc@distances 882 | n <- aggregate(rep(1,length(lp)),list(lp),sum) 883 | n <- as.vector(n[order(n[,1],decreasing=FALSE),-1]) 884 | m <- (1:length(n))[n>cthr] 885 | f <- lp %in% m 886 | lp <- lp[f] 887 | ld <- ld[f,f] 888 | 889 | pdil <- object@sc@tsne[f,] 890 | cnl <- aggregate(pdil,by=list(lp),median) 891 | cnl <- cnl[order(cnl[,1],decreasing=FALSE),-1] 892 | 893 | pdi <- suppressWarnings( cmdscale(as.matrix(ld),k=ncol(ld)-1) ) 894 | cn <- as.data.frame(pdi[compmedoids(object@sc@fdata[,names(lp)],lp),]) 895 | rownames(cn) <- 1:nrow(cn) 896 | 897 | x <- compproj(pdi,lp,cn,m) 898 | res <- x$res 899 | 900 | if ( nmode ){ 901 | rma <- x$rma 902 | z <- paste("X",t(as.vector(apply(cbind(lp,ld),1,function(x){ f <- lp != x[1]; lp[f][which(x[-1][f] == min(x[-1][f]))[1]] }))),sep="") 903 | k <- apply(cbind(z,rma),1,function(x) (x[-1])[names(rma) == x[1]]) 904 | rn <- res 905 | rn$l <- as.numeric(sub("X","",z)) 906 | rn$h <- as.numeric(k) 907 | res <- rn 908 | x$res <- res 909 | } 910 | 911 | object@ldata <- list(lp=lp,ld=ld,m=m,pdi=pdi,pdil=pdil,cn=cn,cnl=cnl) 912 | object@trproj <- x 913 | return(object) 914 | } 915 | ) 916 | 917 | 918 | 919 | 920 | 921 | setGeneric("projback", function(object,pdishuf=2000,nmode=FALSE,rseed=17000) standardGeneric("projback")) 922 | 923 | setMethod("projback", 924 | signature = "Ltree", 925 | definition = function(object,pdishuf,nmode,rseed){ 926 | if ( !is.numeric(nmode) & !is.logical(nmode) ) stop("argument nmode has to be logical (TRUE/FALSE)") 927 | if ( ! is.numeric(pdishuf) ) stop( "pdishuf has to be a non-negative integer number" ) else if ( round(pdishuf) != pdishuf | pdishuf < 0 ) stop( "pdishuf has to be a non-negative integer number" ) 928 | if ( length(object@trproj) == 0 ) stop("run projcells before projback") 929 | object@par$pdishuf <- pdishuf 930 | object@par$rseed <- rseed 931 | 932 | if ( ! nmode ){ 933 | set.seed(rseed) 934 | for ( i in 1:pdishuf ){ 935 | cat("pdishuffle:",i,"\n") 936 | x <- compproj(pdishuffle(object@ldata$pdi,object@ldata$lp,object@ldata$cn,object@ldata$m,all=TRUE),object@ldata$lp,object@ldata$cn,object@ldata$m,d=object@trproj$d)$res 937 | y <- if ( i == 1 ) t(x) else cbind(y,t(x)) 938 | } 939 | ##important 940 | object@prback <- as.data.frame(t(y)) 941 | 942 | x <- object@prback 943 | x$n <- as.vector(t(matrix(rep(1:(nrow(x)/nrow(object@ldata$pdi)),nrow(object@ldata$pdi)),ncol=nrow(object@ldata$pdi)))) 944 | object@prbacka <- aggregate(data.frame(count=rep(1,nrow(x))),by=list(n=x$n,o=x$o,l=x$l),sum) 945 | } 946 | return( object ) 947 | } 948 | ) 949 | 950 | 951 | 952 | 953 | 954 | setGeneric("lineagetree", function(object,pthr=0.01,nmode=FALSE) standardGeneric("lineagetree")) 955 | 956 | setMethod("lineagetree", 957 | signature = "Ltree", 958 | definition = function(object,pthr,nmode){ 959 | if ( !is.numeric(nmode) & !is.logical(nmode) ) stop("argument nmode has to be logical (TRUE/FALSE)") 960 | if ( length(object@trproj) == 0 ) stop("run projcells before lineagetree") 961 | if ( max(dim(object@prback)) == 0 & ! nmode ) stop("run projback before lineagetree") 962 | if ( ! is.numeric(pthr) ) stop( "pthr has to be a non-negative number" ) else if ( pthr < 0 ) stop( "pthr has to be a non-negative number" ) 963 | object@par$pthr <- pthr 964 | cnl <- object@ldata$cnl 965 | pdil <- object@ldata$pdil 966 | cn <- object@ldata$cn 967 | pdi <- object@ldata$pdi 968 | m <- object@ldata$m 969 | lp <- object@ldata$lp 970 | res <- object@trproj$res 971 | rma <- object@trproj$rma 972 | prback <- object@prback 973 | 974 | cm <- as.matrix(dist(cnl))*0 975 | linl <- list() 976 | linn <- list() 977 | for ( i in 1:length(m) ){ 978 | for ( j in i:length(m) ){ 979 | linl[[paste(m[i],m[j],sep=".")]] <- c() 980 | linn[[paste(m[i],m[j],sep=".")]] <- c() 981 | } 982 | } 983 | sigcell <- c() 984 | for ( i in 1:nrow(res) ){ 985 | a <- which( m == res$o[i]) 986 | if ( sum( lp == m[a] ) == 1 ){ 987 | k <- t(cnl)[,a] 988 | k <- NA 989 | sigcell <- append(sigcell, FALSE) 990 | }else{ 991 | b <- which(m == res$l[i] ) 992 | h <- res$h[i] 993 | if ( nmode ){ 994 | sigcell <- append(sigcell, FALSE) 995 | }else{ 996 | f <- prback$o == m[a] & prback$l == m[b] 997 | if ( sum(f) > 0 ){ 998 | ql <- quantile(prback[f,"h"],pthr,na.rm=TRUE) 999 | qh <- quantile(prback[f,"h"],1 - pthr,na.rm=TRUE) 1000 | }else{ 1001 | ql <- 0 1002 | qh <- 0 1003 | } 1004 | sigcell <- if (is.na(h) ) append(sigcell, NA) else if ( h > qh | h < min(0,ql) ) append(sigcell, TRUE) else append(sigcell, FALSE) 1005 | } 1006 | if ( !is.na(res$h[i]) ){ 1007 | w <- t(pdil)[,i] - t(cnl)[,a] 1008 | v <- t(cnl)[,b] - t(cnl)[,a] 1009 | 1010 | wo <- t(pdi)[,i] - t(cn)[,a] 1011 | vo <- t(cn)[,b] - t(cn)[,a] 1012 | df <-( h*sqrt(sum(wo*wo)) )/sqrt(sum(vo*vo))*v 1013 | k <- df + t(cnl)[,a] 1014 | cm[a,b] <- cm[a,b] + 1 1015 | so <- m[sort(c(a,b))] 1016 | dfl <- sign(h)*sqrt( sum( df*df ) )/sqrt(sum(v*v)) 1017 | if ( a > b ) dfl <- 1 - dfl 1018 | linn[[paste(so[1],so[2],sep=".")]] <- append( linn[[paste(so[1],so[2],sep=".")]], rownames(pdi)[i] ) 1019 | linl[[paste(so[1],so[2],sep=".")]] <- append( linl[[paste(so[1],so[2],sep=".")]], dfl ) 1020 | }else{ 1021 | k <- t(cnl)[,a] 1022 | for ( j in unique(lp[lp != m[a]]) ){ 1023 | b <- which(j == m) 1024 | so <- m[sort(c(a,b))] 1025 | dfl <- 0 1026 | if ( a > b ) dfl <- 1 - dfl 1027 | linn[[paste(so[1],so[2],sep=".")]] <- append( linn[[paste(so[1],so[2],sep=".")]], rownames(pdi)[i] ) 1028 | linl[[paste(so[1],so[2],sep=".")]] <- append( linl[[paste(so[1],so[2],sep=".")]], dfl ) 1029 | } 1030 | } 1031 | } 1032 | lt <- if ( i == 1 ) data.frame(k) else cbind(lt,k) 1033 | } 1034 | lt <- t(lt) 1035 | cm <- as.data.frame(cm) 1036 | names(cm) <- paste("cl",m,sep=".") 1037 | rownames(cm) <- paste("cl",m,sep=".") 1038 | lt <- as.data.frame(lt) 1039 | rownames(lt) <- rownames(res) 1040 | object@ltcoord <- as.matrix(lt) 1041 | object@prtree <- list(n=linn,l=linl) 1042 | object@cdata$counts <- cm 1043 | names(sigcell) <- rownames(res) 1044 | object@sigcell <- sigcell 1045 | 1046 | return( object ) 1047 | } 1048 | ) 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | setGeneric("comppvalue", function(object,pethr=0.01,nmode=FALSE) standardGeneric("comppvalue")) 1055 | 1056 | setMethod("comppvalue", 1057 | signature = "Ltree", 1058 | definition = function(object,pethr,nmode){ 1059 | if ( !is.numeric(nmode) & !is.logical(nmode) ) stop("argument nmode has to be logical (TRUE/FALSE)") 1060 | if ( length(object@prtree) == 0 ) stop("run lineagetree before comppvalue") 1061 | if ( ! is.numeric(pethr) ) stop( "pethr has to be a non-negative number" ) else if ( pethr < 0 ) stop( "pethr has to be a non-negative number" ) 1062 | object@par$pethr <- pethr 1063 | cm <- object@cdata$counts 1064 | cmpv <- cm*NA 1065 | cmpvd <- cm*NA 1066 | cmbr <- cm*NA 1067 | cmpvn <- cm*NA 1068 | cmpvnd <- cm*NA 1069 | cmfr <- cm/apply(cm,1,sum) 1070 | if ( nmode ){ 1071 | N <- apply(cm,1,sum) + 1 1072 | N0 <- sum(N) - N 1073 | n0 <- t(matrix(rep(N,length(N)),ncol=length(N))) 1074 | p <- n0/N0 1075 | n <- cm 1076 | k <- cbind(N,p,n) 1077 | cmpv <- apply(k,1,function(x){N <- x[1]; p <- x[2:( ncol(cm) + 1 )]; n <- x[( ncol(cm) + 2 ):( 2*ncol(cm) + 1)]; apply(cbind(n,p),1,function(x,N) binom.test(x[1],N,min(1,x[2]),alternative="g")$p.value,N=N)}) 1078 | cmpvd <- apply(k,1,function(x){N <- x[1]; p <- x[2:( ncol(cm) + 1 )]; n <- x[( ncol(cm) + 2 ):( 2*ncol(cm) + 1)]; apply(cbind(n,p),1,function(x,N) binom.test(x[1],N,min(1,x[2]),alternative="l")$p.value,N=N)}) 1079 | cmpvn <- cmpv 1080 | cmpvnd <- cmpvd 1081 | cmbr <- as.data.frame(n0/N0*N) 1082 | names(cmbr) <- names(cm) 1083 | rownames(cmbr) <- rownames(cm) 1084 | }else{ 1085 | for ( i in 1:nrow(cm) ){ 1086 | for ( j in 1:ncol(cm) ){ 1087 | f <- object@prbacka$o == object@ldata$m[i] & object@prbacka$l == object@ldata$m[j] 1088 | x <- object@prbacka$count[f] 1089 | if ( sum(f) < object@par$pdishuf ) x <- append(x,rep(0, object@par$pdishuf - sum(f))) 1090 | cmbr[i,j] <- if ( sum(f) > 0 ) mean(x) else 0 1091 | cmpv[i,j] <- if ( quantile(x,1 - pethr) < cm[i,j] ) 0 else 0.5 1092 | cmpvd[i,j] <- if ( quantile(x,pethr) > cm[i,j] ) 0 else 0.5 1093 | cmpvn[i,j] <- sum( x >= cm[i,j])/length(x) 1094 | cmpvnd[i,j] <- sum( x <= cm[i,j])/length(x) 1095 | } 1096 | } 1097 | } 1098 | 1099 | diag(cmpv) <- .5 1100 | diag(cmpvd) <- .5 1101 | diag(cmpvn) <- NA 1102 | diag(cmpvnd) <- NA 1103 | 1104 | object@cdata$counts.br <- cmbr 1105 | object@cdata$pv.e <- cmpv 1106 | object@cdata$pv.d <- cmpvd 1107 | object@cdata$pvn.e <- cmpvn 1108 | object@cdata$pvn.d <- cmpvnd 1109 | 1110 | m <- object@ldata$m 1111 | linl <- object@prtree$l 1112 | ls <- as.data.frame(matrix(rep(NA,length(m)**2),ncol=length(m))) 1113 | names(ls) <- rownames(ls) <- paste("cl",m,sep=".") 1114 | for ( i in 1:( length(m) - 1 )){ 1115 | for ( j in (i + 1):length(m) ){ 1116 | na <- paste(m[i],m[j],sep=".") 1117 | if ( na %in% names(linl) & min(cmpv[i,j],cmpv[j,i],na.rm=TRUE) < pethr ){ 1118 | y <- sort(linl[[na]]) 1119 | nn <- ( 1 - max(y[-1] - y[-length(y)]) ) 1120 | }else{ 1121 | nn <- 0 1122 | } 1123 | ls[i,j] <- nn 1124 | } 1125 | } 1126 | object@cdata$linkscore <- ls 1127 | 1128 | return(object) 1129 | } 1130 | ) 1131 | 1132 | setGeneric("plotlinkpv", function(object) standardGeneric("plotlinkpv")) 1133 | 1134 | setMethod("plotlinkpv", 1135 | signature = "Ltree", 1136 | definition = function(object){ 1137 | if ( length(object@cdata) <= 0 ) stop("run comppvalue before plotlinkpv") 1138 | pheatmap(-log2(object@cdata$pvn.e + 1/object@par$pdishuf/2)) 1139 | } 1140 | ) 1141 | 1142 | setGeneric("plotlinkscore", function(object) standardGeneric("plotlinkscore")) 1143 | 1144 | setMethod("plotlinkscore", 1145 | signature = "Ltree", 1146 | definition = function(object){ 1147 | if ( length(object@cdata) <= 0 ) stop("run comppvalue before plotlinkscore") 1148 | pheatmap(object@cdata$linkscore,cluster_rows=FALSE,cluster_cols=FALSE) 1149 | } 1150 | ) 1151 | 1152 | setGeneric("plotmapprojections", function(object) standardGeneric("plotmapprojections")) 1153 | 1154 | setMethod("plotmapprojections", 1155 | signature = "Ltree", 1156 | definition = function(object){ 1157 | if ( length(object@cdata) <= 0 ) stop("run comppvalue before plotmapprojections") 1158 | 1159 | cent <- object@sc@fdata[,compmedoids(object@sc@fdata,object@sc@cpart)] 1160 | dc <- as.data.frame(1 - cor(cent)) 1161 | names(dc) <- sort(unique(object@sc@cpart)) 1162 | rownames(dc) <- sort(unique(object@sc@cpart)) 1163 | trl <- spantree(dc[object@ldata$m,object@ldata$m]) 1164 | 1165 | u <- object@ltcoord[,1] 1166 | v <- object@ltcoord[,2] 1167 | cnl <- object@ldata$cnl 1168 | plot(u,v,cex=1.5,col="grey",pch=20,xlab="Dim 1",ylab="Dim 2") 1169 | for ( i in unique(object@ldata$lp) ){ f <- object@ldata$lp == i; text(u[f],v[f],i,cex=.75,font=4,col=object@sc@fcol[i]) } 1170 | points(cnl[,1],cnl[,2]) 1171 | text(cnl[,1],cnl[,2],object@ldata$m,cex=2) 1172 | for ( i in 1:length(trl$kid) ){ 1173 | lines(c(cnl[i+1,1],cnl[trl$kid[i],1]),c(cnl[i+1,2],cnl[trl$kid[i],2]),col="black") 1174 | } 1175 | } 1176 | ) 1177 | 1178 | 1179 | 1180 | setGeneric("plotmap", function(object) standardGeneric("plotmap")) 1181 | 1182 | setMethod("plotmap", 1183 | signature = "Ltree", 1184 | definition = function(object){ 1185 | if ( length(object@cdata) <= 0 ) stop("run comppvalue before plotmap") 1186 | 1187 | cent <- object@sc@fdata[,compmedoids(object@sc@fdata,object@sc@cpart)] 1188 | dc <- as.data.frame(1 - cor(cent)) 1189 | names(dc) <- sort(unique(object@sc@cpart)) 1190 | rownames(dc) <- sort(unique(object@sc@cpart)) 1191 | trl <- spantree(dc[object@ldata$m,object@ldata$m]) 1192 | 1193 | 1194 | u <- object@ldata$pdil[,1] 1195 | v <- object@ldata$pdil[,2] 1196 | cnl <- object@ldata$cnl 1197 | plot(u,v,cex=1.5,col="grey",pch=20,xlab="Dim 1",ylab="Dim 2") 1198 | for ( i in unique(object@ldata$lp) ){ f <- object@ldata$lp == i; text(u[f],v[f],i,cex=.75,font=4,col=object@sc@fcol[i]) } 1199 | points(cnl[,1],cnl[,2]) 1200 | text(cnl[,1],cnl[,2],object@ldata$m,cex=2) 1201 | for ( i in 1:length(trl$kid) ){ 1202 | lines(c(cnl[i+1,1],cnl[trl$kid[i],1]),c(cnl[i+1,2],cnl[trl$kid[i],2]),col="black") 1203 | } 1204 | } 1205 | ) 1206 | 1207 | 1208 | setGeneric("plottree", function(object,showCells=TRUE,nmode=FALSE,scthr=0) standardGeneric("plottree")) 1209 | 1210 | setMethod("plottree", 1211 | signature = "Ltree", 1212 | definition = function(object,showCells,nmode,scthr){ 1213 | if ( length(object@cdata) <= 0 ) stop("run comppvalue before plotmap") 1214 | if ( !is.numeric(nmode) & !is.logical(nmode) ) stop("argument nmode has to be logical (TRUE/FALSE)") 1215 | if ( !is.numeric(showCells) & !is.logical(showCells) ) stop("argument showCells has to be logical (TRUE/FALSE)") 1216 | if ( ! is.numeric(scthr) ) stop( "scthr has to be a non-negative number" ) else if ( scthr < 0 | scthr > 1 ) stop( "scthr has to be a number between 0 and 1" ) 1217 | 1218 | 1219 | ramp <- colorRamp(c("darkgreen", "yellow", "brown")) 1220 | mcol <- rgb( ramp(seq(0, 1, length = 101)), max = 255) 1221 | co <- object@cdata 1222 | fc <- (co$counts/( co$counts.br + .5))*(co$pv.e < object@par$pethr) 1223 | fc <- fc*(fc > t(fc)) + t(fc)*(t(fc) >= fc) 1224 | fc <- log2(fc + (fc == 0)) 1225 | 1226 | k <- -log10(sort(unique(as.vector(t(co$pvn.e))[as.vector(t(co$pv.e)) mlpv[j,i] ){ 1241 | va <- append(va,dcc[i,j]) 1242 | }else{ 1243 | va <- append(va,dcc[j,i]) 1244 | } 1245 | cx <- append(cx,i) 1246 | cy <- append(cy,j) 1247 | } 1248 | } 1249 | } 1250 | 1251 | 1252 | 1253 | cnl <- object@ldata$cnl 1254 | u <- object@ltcoord[,1] 1255 | v <- object@ltcoord[,2] 1256 | layout( cbind(c(1, 1), c(2, 3)),widths=c(5,1,1),height=c(5,5,1)) 1257 | par(mar = c(12,5,1,1)) 1258 | 1259 | if ( showCells ){ 1260 | plot(u,v,cex=1.5,col="grey",pch=20,xlab="Dim 1",ylab="Dim 2") 1261 | if ( !nmode ) points(u[object@sigcell],v[object@sigcell],col="black") 1262 | }else{ 1263 | plot(u,v,cex=0,xlab="Dim 1",ylab="Dim 2") 1264 | } 1265 | 1266 | if ( length(va) > 0 ){ 1267 | f <- order(va,decreasing=TRUE) 1268 | for ( i in 1:length(va) ){ 1269 | if ( object@cdata$linkscore[cx[i],cy[i]] > scthr ){ 1270 | if ( showCells ){ 1271 | lines(cnl[c(cx[i],cy[i]),1],cnl[c(cx[i],cy[i]),2],col=va[i],lwd=2) 1272 | }else{ 1273 | ##nn <- min(10,fc[cx[i],cy[i]]) 1274 | lines(cnl[c(cx[i],cy[i]),1],cnl[c(cx[i],cy[i]),2],col=va[i],lwd=5*object@cdata$linkscore[cx[i],cy[i]]) 1275 | } 1276 | } 1277 | } 1278 | } 1279 | 1280 | 1281 | 1282 | en <- aggregate(object@entropy,list(object@sc@cpart),median) 1283 | en <- en[en$Group.1 %in% m,] 1284 | 1285 | mi <- min(en[,2],na.rm=TRUE) 1286 | ma <- max(en[,2],na.rm=TRUE) 1287 | w <- round((en[,2] - mi)/(ma - mi)*99 + 1,0) 1288 | ramp <- colorRamp(c("red","orange", "pink","purple", "blue")) 1289 | ColorRamp <- rgb( ramp(seq(0, 1, length = 101)), max = 255) 1290 | ColorLevels <- seq(mi, ma, length=length(ColorRamp)) 1291 | if ( mi == ma ){ 1292 | ColorLevels <- seq(0.99*mi, 1.01*ma, length=length(ColorRamp)) 1293 | } 1294 | for ( i in m ){ 1295 | f <- en[,1] == m 1296 | points(cnl[f,1],cnl[f,2],cex=5,col=ColorRamp[w[f]],pch=20) 1297 | } 1298 | text(cnl[,1],cnl[,2],m,cex=1.25,font=4,col="white") 1299 | par(mar = c(5, 4, 1, 2)) 1300 | image(1, ColorLevels, 1301 | matrix(data=ColorLevels, ncol=length(ColorLevels),nrow=1), 1302 | col=ColorRamp, 1303 | xlab="",ylab="", 1304 | xaxt="n") 1305 | coll <- seq(min(k), max(k), length=length(mcol)) 1306 | image(1, coll, 1307 | matrix(data=coll, ncol=length(mcol),nrow=1), 1308 | col=mcol, 1309 | xlab="",ylab="", 1310 | xaxt="n") 1311 | layout(1) 1312 | } 1313 | ) 1314 | 1315 | 1316 | setGeneric("plotdistanceratio", function(object) standardGeneric("plotdistanceratio")) 1317 | 1318 | setMethod("plotdistanceratio", 1319 | signature = "Ltree", 1320 | definition = function(object){ 1321 | if ( length(object@ldata) <= 0 ) stop("run projcells before plotdistanceratio") 1322 | l <- as.matrix(dist(object@ldata$pdi)) 1323 | z <- (l/object@ldata$ld) 1324 | hist(log2(z),breaks=100,xlab=" log2 emb. distance/distance",main="") 1325 | } 1326 | ) 1327 | 1328 | 1329 | setGeneric("getproj", function(object,i) standardGeneric("getproj")) 1330 | 1331 | setMethod("getproj", 1332 | signature = "Ltree", 1333 | definition = function(object,i){ 1334 | if ( length(object@ldata) <= 0 ) stop("run projcells before plotdistanceratio") 1335 | if ( ! i %in% object@ldata$m ) stop(paste("argument i has to be one of",paste(object@ldata$m,collapse=","))) 1336 | x <- object@trproj$rma[names(object@ldata$lp)[object@ldata$lp == i],] 1337 | x <- x[,names(x) != paste("X",i,sep="")] 1338 | f <- !is.na(x[,1]) 1339 | x <- x[f,] 1340 | if ( nrow(x) > 1 ){ 1341 | y <- x 1342 | y <- as.data.frame(t(apply(y,1,function(x) (x - mean(x))/sqrt(var(x))))) 1343 | } 1344 | names(x) = sub("X","cl.",names(x)) 1345 | names(y) = sub("X","cl.",names(y)) 1346 | return(list(pr=x,prz=y)) 1347 | } 1348 | ) 1349 | 1350 | 1351 | setGeneric("projenrichment", function(object) standardGeneric("projenrichment")) 1352 | 1353 | setMethod("projenrichment", 1354 | signature = "Ltree", 1355 | definition = function(object){ 1356 | if ( length(object@cdata) <= 0 ) stop("run comppvalue before plotmap") 1357 | 1358 | ze <- ( object@cdata$pv.e < object@par$pethr | object@cdata$pv.d < object@par$pethr) * (object@cdata$counts + .1)/( object@cdata$counts.br + .1 ) 1359 | pheatmap(log2(ze + ( ze == 0 ) ),cluster_rows=FALSE,cluster_cols=FALSE) 1360 | } 1361 | ) 1362 | 1363 | 1364 | 1365 | 1366 | 1367 | setGeneric("compscore", function(object,nn=1) standardGeneric("compscore")) 1368 | 1369 | setMethod("compscore", 1370 | signature = "Ltree", 1371 | definition = function(object,nn){ 1372 | if ( length(object@cdata) <= 0 ) stop("run comppvalue before compscore") 1373 | if ( ! is.numeric(nn) ) stop( "nn has to be a non-negative integer number" ) else if ( round(nn) != nn | nn < 0 ) stop( "nn has to be a non-negative integer number" ) 1374 | x <- object@cdata$counts*(object@cdata$pv.e < object@par$pethr)>0 1375 | y <- x | t(x) 1376 | 1377 | if ( max(y) > 0 ){ 1378 | z <- apply(y,1,sum) 1379 | nl <- list() 1380 | n <- list() 1381 | for ( i in 1:nn ){ 1382 | if ( i == 1 ){ 1383 | n[[i]] <- as.list(apply(y,1,function(x) grep(TRUE,x))) 1384 | nl <- data.frame( apply(y,1,sum) ) 1385 | } 1386 | if ( i > 1 ){ 1387 | v <- rep(0,nrow(nl)) 1388 | n[[i]] <- list() 1389 | for ( j in 1:length(n[[i-1]]) ){ 1390 | cl <- n[[i-1]][[j]] 1391 | if ( length(cl) == 0 ){ 1392 | n[[i]][[paste("d",j,sep="")]] <- NA 1393 | v[j] <- 0 1394 | }else{ 1395 | k <- if ( length(cl) > 1 ) apply(y[cl,],2,sum) > 0 else if ( length(cl) == 1 ) y[cl,] 1396 | n[[i]][[paste("d",j,sep="")]] <- sort(unique(c(cl,grep(TRUE,k)))) 1397 | v[j] <- length(n[[i]][[paste("d",j,sep="")]]) 1398 | } 1399 | } 1400 | names(n[[i]]) <- names(z) 1401 | nl <- cbind(nl,v) 1402 | 1403 | } 1404 | } 1405 | x <- nl[,i] 1406 | names(x) <- rownames(nl) 1407 | }else{ 1408 | x <- rep(0,length(object@ldata$m)) 1409 | names(x) <- paste("cl",object@ldata$m,sep=".") 1410 | } 1411 | 1412 | v <- aggregate(object@entropy,list(object@sc@cpart),median) 1413 | v <- v[v$Group.1 %in% object@ldata$m,] 1414 | w <- as.vector(v[,-1]) 1415 | names(w) <- paste("cl.",v$Group.1,sep="") 1416 | w <- w - min(w) 1417 | 1418 | return(list(links=x,entropy=w,StemIDscore=x*w)) 1419 | } 1420 | ) 1421 | 1422 | 1423 | 1424 | 1425 | setGeneric("plotscore", function(object,nn=1) standardGeneric("plotscore")) 1426 | 1427 | setMethod("plotscore", 1428 | signature = "Ltree", 1429 | definition = function(object,nn){ 1430 | if ( length(object@cdata) <= 0 ) stop("run comppvalue before plotscore") 1431 | x <- compscore(object,nn) 1432 | layout(1:3) 1433 | barplot(x$links,names.arg=sub("cl\\.","",object@ldata$m),xlab="Cluster",ylab="Number of links",cex.names=1) 1434 | barplot(x$entropy,names.arg=sub("cl\\.","",object@ldata$m),xlab="Cluster",ylab="Delta-Entropy",cex.names=1) 1435 | barplot(x$StemIDscore,names.arg=sub("cl\\.","",object@ldata$m),xlab="Cluster",ylab="Number of links * Delta-Entropy",cex.names=1) 1436 | layout(1) 1437 | } 1438 | ) 1439 | 1440 | 1441 | 1442 | 1443 | setGeneric("branchcells", function(object,br) standardGeneric("branchcells")) 1444 | 1445 | setMethod("branchcells", 1446 | signature = "Ltree", 1447 | definition = function(object,br){ 1448 | if ( length(object@ldata) <= 0 ) stop("run projcells before branchcells") 1449 | msg <- paste("br needs to be list of length two containing two branches, where each has to be one of", paste(names(object@prtree$n),collapse=",")) 1450 | if ( !is.list(br) ) stop(msg) else if ( length(br) != 2 ) stop(msg) else if ( ! br[[1]] %in% names(object@prtree$n) | ! br[[2]] %in% names(object@prtree$n) ) stop(msg) 1451 | 1452 | 1453 | n <- list() 1454 | scl <- object@sc 1455 | k <- c() 1456 | cl <- intersect( as.numeric(strsplit(br[[1]],"\\.")[[1]]), as.numeric(strsplit(br[[2]],"\\.")[[1]])) 1457 | if ( length(cl) == 0 ) stop("the two branches in br need to have one cluster in common.") 1458 | 1459 | for ( i in 1:length(br) ){ 1460 | f <- object@sc@cpart[ object@prtree$n[[br[[i]]]] ] %in% cl 1461 | if ( sum(f) > 0 ){ 1462 | n[[i]] <- names(object@sc@cpart[ object@prtree$n[[br[[i]]]] ])[f] 1463 | k <- append(k, max( scl@cpart ) + 1) 1464 | scl@cpart[n[[i]]] <- max( scl@cpart ) + 1 1465 | }else{ 1466 | stop(paste("no cells on branch",br[[i]],"fall into clusters",cl)) 1467 | } 1468 | } 1469 | set.seed(111111) 1470 | scl@fcol <- sample(rainbow(max(scl@cpart))) 1471 | z <- diffgenes(scl,k[1],k[2]) 1472 | return( list(n=n,scl=scl,k=k,diffgenes=z) ) 1473 | } 1474 | ) 1475 | 1476 | 1477 | 1478 | -------------------------------------------------------------------------------- /RaceID2_StemID_sample.R: -------------------------------------------------------------------------------- 1 | ## install required packages (only at first time) 2 | install.packages(c("tsne","pheatmap","MASS","cluster","mclust","flexmix","lattice","fpc","RColorBrewer","permute","amap","locfit","vegan")) 3 | 4 | ## load class definition and functions 5 | source("RaceID2_StemID_class.R") 6 | 7 | ## input data 8 | x <- read.csv("transcript_counts_intestine_5days_YFP.xls",sep="\t",header=TRUE) 9 | rownames(x) <- x$GENEID 10 | # prdata: data.frame with transcript counts for all genes (rows) in all cells (columns); with rownames == gene ids; remove ERCC spike-ins 11 | prdata <- x[grep("ERCC",rownames(x),invert=TRUE),-1] 12 | 13 | ## RaceID2 14 | # initialize SCseq object with transcript counts 15 | sc <- SCseq(prdata) 16 | # filtering of expression data 17 | sc <- filterdata(sc, mintotal=3000, minexpr=5, minnumber=1, maxexpr=500, downsample=TRUE, dsn=1, rseed=17000) 18 | # k-medoids clustering 19 | sc <- clustexp(sc,clustnr=30,bootnr=50,metric="pearson",do.gap=FALSE,sat=TRUE,SE.method="Tibs2001SEmax",SE.factor=.25,B.gap=50,cln=0,rseed=17000,FUNcluster="kmedoids") 20 | # compute t-SNE map 21 | sc <- comptsne(sc,rseed=15555) 22 | # detect outliers and redefine clusters 23 | sc <- findoutliers(sc, outminc=5,outlg=2,probthr=1e-3,thr=2**-(1:40),outdistquant=.95) 24 | 25 | ## diagnostic plots 26 | # gap statistics: only if do.gap == TRUE 27 | ##plotgap(sc) 28 | # plot within-cluster dispersion as a function of the cluster number: only if sat == TRUE 29 | plotsaturation(sc,disp=TRUE) 30 | # plot change of the within-cluster dispersion as a function of the cluster number: only if sat == TRUE 31 | plotsaturation(sc) 32 | # silhouette of k-medoids clusters 33 | plotsilhouette(sc) 34 | # Jaccard's similarity of k-medoids clusters 35 | plotjaccard(sc) 36 | # barchart of outlier probabilities 37 | plotoutlierprobs(sc) 38 | # regression of background model 39 | plotbackground(sc) 40 | # dependence of outlier number on probability threshold (probthr) 41 | plotsensitivity(sc) 42 | # heatmap of k-medoids cluster 43 | clustheatmap(sc,final=FALSE,hmethod="single") 44 | # heatmap of final cluster 45 | clustheatmap(sc,final=TRUE,hmethod="single") 46 | # highlight k-medoids clusters in t-SNE map 47 | plottsne(sc,final=FALSE) 48 | # highlight final clusters in t-SNE map 49 | plottsne(sc,final=TRUE) 50 | # highlight cell labels in t-SNE map 51 | plotlabelstsne(sc,labels=sub("(\\_\\d+)","",names(sc@ndata))) 52 | # highlight groups of cells by symbols in t-SNE map 53 | plotsymbolstsne(sc,types=sub("(\\_\\d+)$","", names(sc@ndata))) 54 | # highlight transcirpt counts of a set of genes in t-SNE map, e. g. all Apoa genes 55 | g <- c("Apoa1__chr9", "Apoa1bp__chr3", "Apoa2__chr1", "Apoa4__chr9", "Apoa5__chr9") 56 | plotexptsne(sc,g,n="Apoa genes",logsc=TRUE) 57 | 58 | ## identification of marker genes 59 | # differentially regulated genes in each cluster compared to the full ensemble 60 | cdiff <- clustdiffgenes(sc,pvalue=.01) 61 | 62 | ## write results to text files 63 | # final clusters 64 | x <- data.frame(CELLID=names(sc@cpart),cluster=sc@cpart) 65 | write.table(x[order(x$cluster,decreasing=FALSE),],"cell_clust.xls",row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE) 66 | 67 | # differentially expressed genes in cluster 68 | for ( n in names(cdiff) ) write.table(data.frame(GENEID=rownames(cdiff[[n]]),cdiff[[n]]),paste(paste("cell_clust_diff_genes",sub("\\.","\\_",n),sep="_"),".xls",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE) 69 | 70 | # differentially expressed genes between two sets of clusters, e. g. cluster 1 and clusters 2,3 71 | d <- diffgenes(sc,cl1=1,cl2=c(2,3),mincount=5) 72 | plotdiffgenes(d,gene=names(d$z)[1]) 73 | 74 | 75 | ## StemID 76 | 77 | # initialization 78 | ltr <- Ltree(sc) 79 | # computation of the entropy 80 | ltr <- compentropy(ltr) 81 | # computation of the projections for all cells 82 | ltr <- projcells(ltr,cthr=2,nmode=FALSE) 83 | # computation of the projections for all cells after randomization 84 | ltr <- projback(ltr,pdishuf=2000,nmode=FALSE,rseed=17000) 85 | # assembly of the lineage tree 86 | ltr <- lineagetree(ltr,pthr=0.01,nmode=FALSE) 87 | # determination of significant differentiation trajectories 88 | ltr <- comppvalue(ltr,pethr=0.01,nmode=FALSE) 89 | 90 | ## diagnostic plots 91 | # histogram of ratio between cell-to-cell distances in the embedded and the input space 92 | plotdistanceratio(ltr) 93 | # t-SNE map of the clusters with more than cthr cells including a minimum spanning tree for the cluster medoids 94 | plotmap(ltr) 95 | # visualization of the projections in t-SNE space overlayed with a minimum spanning tree connecting the cluster medoids 96 | plotmapprojections(ltr) 97 | # lineage tree showing the projections of all cells in t-SNE space 98 | plottree(ltr,showCells=TRUE,nmode=FALSE,scthr=0) 99 | # lineage tree without showing the projections of all cells 100 | plottree(ltr,showCells=FALSE,nmode=FALSE,scthr=0) 101 | # heatmap of the enrichment p-values for all inter-cluster links 102 | plotlinkpv(ltr) 103 | # heatmap of the link score for all inter-cluster links 104 | plotlinkscore(ltr) 105 | # heatmap showing the fold enrichment (or depletion) for significantly enriched or depleted links 106 | projenrichment(ltr) 107 | 108 | ## extract projections onto all links for all cells in a given cluster i 109 | x <- getproj(ltr,i=1) 110 | # heatmap of all projections for cluster i 111 | pheatmap(x$pr) 112 | # heatmap of z-score for all projections for cluster i 113 | pheatmap(x$prz) 114 | 115 | ## extracting all cells on two branches sharing the same cluster and computing differentially expressed genes between these two branches 116 | x <- branchcells(ltr,list("1.3","1.2")) 117 | # z-scores for differentially expressed genes 118 | head(x$diffgenes$z) 119 | # plotting the cells on the two branches as additional clusters in the t-SNE map 120 | plottsne(x$scl) 121 | 122 | 123 | ## computing the StemID score 124 | x <- compscore(ltr,nn=1) 125 | #plotting the StemID score 126 | plotscore(ltr,1) 127 | 128 | 129 | -------------------------------------------------------------------------------- /Reference_manual_RaceID2_StemID.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dgrun/StemID/edee40e9a253ea1e57fbdce1df9480d40007f6c7/Reference_manual_RaceID2_StemID.pdf --------------------------------------------------------------------------------