├── README.md └── DMM cluster transition model /README.md: -------------------------------------------------------------------------------- 1 | # Stewart_TEDDY_Microbiome_Analysis 2 | Temporal Development of the Gut Microbiome in Early Childhood - The TEDDY Study 3 | 4 | # TEDDY study infant gut microbiome analyses by Baylor College of Medicine 5 | 6 | This repository contains code for the transition model showing the progression of samples through each DMM cluster, which are presented in Figure 1 and Extended Data Figure 2 of the "Temporal Development of the Gut Microbiome in Early Childhood - The TEDDY Study" by Stewart et al. (citation to be included). 7 | 8 | The gut microbiome undergoes substantial development from birth into childhood and is believed to have significant consequences for long-term health, including the development of persistant islet autoimmunity (IA) and Type 1 Diabetes (T1D). In a multi-geographical population of 903 infants at-risk for T1D who developed IA and/or T1D by months 46 of age and their matched controls, we sought to characterize the gut microbiome from 3 to 46 months of life and determine the key covariates influencing its development. The study included two datasets, 1) 16S rRNA gene sequencing of >12,000 stool samples and 2) metagenomic sequencing of 10,867 stool samples. 9 | -------------------------------------------------------------------------------- /DMM cluster transition model: -------------------------------------------------------------------------------- 1 | 2 | #============================================================================== 3 | #=----------------------------------------------------------------------------- 4 | # Daniel Smith and Christopher Stewart 5 | # Baylor College of Medicine 6 | # 7 | # 8 | # ========================== 9 | # 10 | # Script for looking at sample clusters throughout 11 | # a timecourse with multiple subjects. 12 | # 13 | # 14 | #=----------------------------------------------------------------------------- 15 | #============================================================================== 16 | 17 | library("ggplot2") 18 | library("igraph") 19 | library("slam") 20 | library("scales") 21 | 22 | 23 | 24 | #============================================================================== 25 | # Load the pre-computed datasets 26 | #============================================================================== 27 | 28 | 29 | Metadata <- download.aws("") 30 | GenusMtx <- download.aws("") 31 | 32 | 33 | 34 | #============================================================================== 35 | # Top 25 Genera across the different DMM Clusters as a Heatmap 36 | #============================================================================== 37 | 38 | CLUSTER_COLUMN <- "DMM_Cluster" 39 | 40 | md <- Metadata 41 | genera <- t(t(GenusMtx) / colSums(GenusMtx)) 42 | 43 | anno <- md[colnames(genera), CLUSTER_COLUMN, drop=FALSE] 44 | anno[[CLUSTER_COLUMN]] <- factor(anno[[CLUSTER_COLUMN]]) 45 | anno <- anno[order(anno[[CLUSTER_COLUMN]]), , drop=FALSE] 46 | 47 | top25 <- head(names(rev(sort(rowSums(genera)))), 25) 48 | 49 | colors <- colorRampPalette(rev(RColorBrewer::brewer.pal(n=7, name="RdYlBu")), bias=3)(100) #use to set colour (name =) and scale (bias =) 50 | 51 | mat <- genera[top25, rownames(anno), drop=FALSE] 52 | mat <- t(apply(mat, 1L, scales::rescale)) #uncomment to normalize each taxa (row) 53 | 54 | pheatmap::pheatmap( 55 | mat = mat, 56 | color = colors, #uncheck if setting the colour scale manual 57 | annotation_col = anno, 58 | show_colnames = FALSE, 59 | cluster_rows = FALSE, 60 | cluster_cols = FALSE, 61 | gaps_col = cumsum(unname(table(anno[[CLUSTER_COLUMN]]))), 62 | labels_row = sub("^.*__", "", top25) 63 | ) 64 | 65 | # remove('CLUSTER_COLUMN') 66 | # remove('anno', 'genera', 'md', 'top30', 'mat') 67 | 68 | 69 | 70 | #========= 71 | #manual order of bacteria 72 | #========= 73 | 74 | top25 <- c('Bacteria; __Actinobacteria; __Actinobacteria; __Bifidobacteriales; __Bifidobacteriaceae; __Bifidobacterium', 75 | 'Bacteria; __Firmicutes; __Bacilli; __Lactobacillales; __Enterococcaceae; __Enterococcus', 76 | 'Bacteria; __Firmicutes; __Negativicutes; __Selenomonadales; __Veillonellaceae; __Veillonella', 77 | 'Bacteria; __Firmicutes; __Bacilli; __Lactobacillales; __Streptococcaceae; __Streptococcus', 78 | 'Bacteria; __Proteobacteria; __Gammaproteobacteria; __Enterobacteriales; __Enterobacteriaceae; __Enterobacter', 79 | 'Bacteria; __Proteobacteria; __Gammaproteobacteria; __Enterobacteriales; __Enterobacteriaceae; __Escherichia_Shigella', 80 | 'Bacteria; __Bacteroidetes; __Bacteroidia; __Bacteroidales; __Bacteroidaceae; __Bacteroides', 81 | 'Bacteria; __Verrucomicrobia; __Verrucomicrobiae; __Verrucomicrobiales; __Verrucomicrobiaceae; __Akkermansia', 82 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; ___Ruminococcus_gnavus_group', 83 | 'Bacteria; __Firmicutes; __Erysipelotrichia; __Erysipelotrichales; __Erysipelotrichaceae; __Erysipelatoclostridium', 84 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Tyzzerella_4', 85 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Lachnoclostridium', 86 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Blautia', 87 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Lachnospiraceae_UCG_008', 88 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Anaerostipes', 89 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; ___Eubacterium_rectale_group', 90 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Clostridiaceae_1; __Clostridium_sensu_stricto_1', 91 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Peptostreptococcaceae; __Romboutsia', 92 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Peptostreptococcaceae; __Intestinibacter', 93 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Fusicatenibacter', 94 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Ruminococcaceae; __Ruminococcus_2', 95 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Ruminococcaceae; __Faecalibacterium', 96 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Ruminococcaceae; __Subdoligranulum', 97 | 'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Ruminococcaceae; ___Eubacterium_coprostanoligenes_group', 98 | 'Bacteria; __Firmicutes; __Negativicutes; __Selenomonadales; __Veillonellaceae; __Dialister') 99 | 100 | mat <- genera[top25, rownames(anno), drop=FALSE] 101 | mat <- t(apply(mat, 1L, scales::rescale)) #uncomment to normalize each taxa (row) 102 | 103 | pheatmap::pheatmap( 104 | mat = mat, 105 | color = colors, #uncheck if setting the colour scale manual 106 | annotation_col = anno, 107 | show_colnames = FALSE, 108 | cluster_rows = FALSE, 109 | cluster_cols = FALSE, 110 | gaps_col = cumsum(unname(table(anno[[CLUSTER_COLUMN]]))), 111 | labels_row = sub("^.*; __", "", rownames(mat)) 112 | ) 113 | 114 | 115 | # remove('CLUSTER_COLUMN') 116 | # remove('anno', 'species', 'md', 'top25', 'mat') 117 | 118 | 119 | 120 | 121 | 122 | #============================================================================== 123 | # DMM Transitions Circle 124 | # - Answers: how often is cluster y preceded by cluster x? 125 | #============================================================================== 126 | 127 | #---------------------------------------------------------------------- 128 | # Modify these values as you please 129 | #---------------------------------------------------------------------- 130 | 131 | MAX_TIME_DELTA <- 120 132 | TIME_COLUMN <- "Age_in_Days" 133 | CLUSTER_COLUMN <- "DMM_Cluster" 134 | SUBJECT_COLUMN <- "SubjectID" 135 | 136 | md <- Metadata 137 | 138 | 139 | #---------------------------------------------------------------------- 140 | # Double check for the user that they provided valid column names 141 | #---------------------------------------------------------------------- 142 | 143 | vars <- c(TIME_COLUMN, CLUSTER_COLUMN, SUBJECT_COLUMN) 144 | if (!all(vars %in% colnames(md))) 145 | stop(sprintf("Column name not found: %s", paste(collapse=", ", setdiff(vars, colnames(md))))) 146 | 147 | 148 | #---------------------------------------------------------------------- 149 | # Filter out rows with missing data for time, subject, or cluster 150 | #---------------------------------------------------------------------- 151 | 152 | md <- md[!is.na(md[[TIME_COLUMN]]) & !is.na(md[[CLUSTER_COLUMN]]) & !is.na(md[[SUBJECT_COLUMN]]),] 153 | 154 | 155 | #---------------------------------------------------------------------- 156 | # Assemble a matrix to represent the number each transition 157 | #---------------------------------------------------------------------- 158 | 159 | stateNames <- unique(as.character(sort(md[[CLUSTER_COLUMN]]))) 160 | nStates <- length(stateNames) 161 | transitionMatrix <- matrix(0, nrow=nStates, ncol=nStates, dimnames=list(stateNames, stateNames)) 162 | 163 | plyr::d_ply(md[,vars], SUBJECT_COLUMN, function (x) { 164 | 165 | if (nrow(x) < 2) return (NULL) 166 | 167 | x <- x[order(x[[TIME_COLUMN]]),,drop=FALSE] 168 | 169 | for (i in seq_len(nrow(x) - 1)) { 170 | 171 | t1 <- x[i, TIME_COLUMN] 172 | t2 <- x[i+1, TIME_COLUMN] 173 | if (t2 - t1 > MAX_TIME_DELTA) next 174 | 175 | c1 <- as.character(x[i, CLUSTER_COLUMN]) 176 | c2 <- as.character(x[i+1, CLUSTER_COLUMN]) 177 | transitionMatrix[c1, c2] <<- transitionMatrix[c1, c2] + 1 178 | } 179 | }) 180 | 181 | 182 | #---------------------------------------------------------------------- 183 | # Normalize transition matrix so rows sum to 1 184 | #---------------------------------------------------------------------- 185 | 186 | transitionMatrix <- transitionMatrix / rowSums(transitionMatrix) 187 | 188 | 189 | #---------------------------------------------------------------------- 190 | # Create iGraph network 191 | #---------------------------------------------------------------------- 192 | 193 | g <- graph.adjacency(transitionMatrix, mode="directed", weighted=TRUE) 194 | 195 | 196 | #---------------------------------------------------------------------- 197 | # Create a function to generate a continuous color palette 198 | #---------------------------------------------------------------------- 199 | 200 | val2rgb <- colorRamp(rev(heat.colors(10)[1:6]), bias = 2) 201 | 202 | 203 | #---------------------------------------------------------------------- 204 | # Generate the figure and color bar legend 205 | #---------------------------------------------------------------------- 206 | 207 | plot.igraph( 208 | x = g, 209 | xlim = c(0, .8), 210 | ylim = c(-1, 1), 211 | layout = layout_in_circle, 212 | vertex.size = colSums(transitionMatrix) * 20, 213 | edge.width = E(g)$weight * 20, 214 | edge.color = rgb(val2rgb(E(g)$weight) / 255), 215 | edge.loop.angle = scales::rescale(seq_along(E(g)), to=c(0, -2 * 3.14)) ) 216 | 217 | legend( 218 | x = 1.5, 219 | y = 1, 220 | fill = rgb(val2rgb(c(0, .05, .1, .15, .2, .5, .75, 1)) / 255), 221 | legend = c("0%", "5%", "10%", "15%", "20%", "50%", "75%", "100%"), 222 | title = "Transition\nFrequency", 223 | bty = "n" ) 224 | 225 | 226 | #---------------------------------------------------------------------- 227 | # Save final figure to a pdf file 228 | #---------------------------------------------------------------------- 229 | 230 | dev.copy2pdf(file="16S_OTU_DMM_Transitions.pdf") 231 | 232 | 233 | #---------------------------------------------------------------------- 234 | # Cleanup 235 | #---------------------------------------------------------------------- 236 | 237 | remove('CLUSTER_COLUMN', 'MAX_TIME_DELTA', 'SUBJECT_COLUMN', 'TIME_COLUMN') 238 | remove('g', 'md', 'nStates', 'stateNames', 'transitionMatrix', 'val2rgb', 'vars') 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | #============================================================================== 249 | # Sample Overview 250 | # - Shows each sample for each subject colored by a variable 251 | #============================================================================== 252 | 253 | #---------------------------------------------------------------------- 254 | # Modify these values as you please 255 | #---------------------------------------------------------------------- 256 | 257 | TIME_COLUMN <- "Age_in_Months" 258 | TIME_MIN_MAX <- c(0, 40) 259 | COLOR_COLUMN <- "DMM_Cluster" 260 | SUBJECT_COLUMN <- "SubjectID" 261 | FACET_COLUMN <- "Birth_Mode_Simple" 262 | 263 | md <- Metadata 264 | 265 | 266 | #---------------------------------------------------------------------- 267 | # Double check for the user that they provided valid column names 268 | #---------------------------------------------------------------------- 269 | 270 | vars <- c(TIME_COLUMN, COLOR_COLUMN, SUBJECT_COLUMN, FACET_COLUMN) 271 | if (!all(vars %in% colnames(md))) 272 | stop(sprintf("Column name not found: %s", paste(collapse=", ", setdiff(vars, colnames(md))))) 273 | 274 | 275 | #---------------------------------------------------------------------- 276 | # Probably want the COLOR_COLUMN to be categorical 277 | #---------------------------------------------------------------------- 278 | if (length(unique(md[[COLOR_COLUMN]])) < 20) 279 | md[[COLOR_COLUMN]] <- as.character(md[[COLOR_COLUMN]]) 280 | 281 | 282 | #---------------------------------------------------------------------- 283 | # Filter out rows with missing data or outside specified time range 284 | #---------------------------------------------------------------------- 285 | 286 | md <- md[ !is.na(md[[TIME_COLUMN]]) & 287 | !is.na(md[[COLOR_COLUMN]]) & 288 | !is.na(md[[SUBJECT_COLUMN]]) & 289 | !is.na(md[[FACET_COLUMN]]), , drop=FALSE] 290 | 291 | md <- md[ md[[TIME_COLUMN]] >= min(TIME_MIN_MAX) & 292 | md[[TIME_COLUMN]] <= max(TIME_MIN_MAX), , drop=FALSE] 293 | 294 | 295 | #---------------------------------------------------------------------- 296 | # Generate the figure and color bar legend 297 | #---------------------------------------------------------------------- 298 | 299 | ggplot(md) + 300 | ggtitle("16S OTU DMM Sample Overview") + 301 | geom_point(aes_string(x = TIME_COLUMN, y = SUBJECT_COLUMN, color = COLOR_COLUMN)) + 302 | facet_grid(as.formula(sprintf("%s ~ .", FACET_COLUMN)), scales="free_y", space="free_y") + 303 | theme_bw() + 304 | theme( 305 | axis.title.x = element_text(size=8), 306 | axis.text.x = element_text(size=8), 307 | axis.title.y = element_text(size=8), 308 | axis.text.y = element_text(size=2), 309 | axis.ticks.y = element_line(size=0.2), 310 | plot.title = element_text(size=8), 311 | legend.text = element_text(size=7), 312 | legend.title = element_text(size=8), 313 | legend.key.height = unit(0.15, "in"), 314 | legend.key.width = unit(0.15, "in"), 315 | legend.background = element_blank(), 316 | legend.margin = margin(rep(unit(0,"mm"), 4)), 317 | plot.margin = unit(c(0.2,0,0,0),"mm"), 318 | strip.text = element_text(size=8) ) 319 | 320 | 321 | #---------------------------------------------------------------------- 322 | # Save final figure to a pdf file 323 | #---------------------------------------------------------------------- 324 | 325 | ggsave(file = "16S_OTU_DMM_Schedule.pdf", width=8, height=20) 326 | 327 | 328 | #---------------------------------------------------------------------- 329 | # Cleanup 330 | #---------------------------------------------------------------------- 331 | 332 | remove('COLOR_COLUMN', 'FACET_COLUMN', 'SUBJECT_COLUMN', 'TIME_COLUMN', 'TIME_MIN_MAX') 333 | remove('md', 'vars') 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | #============================================================================== 351 | # DMM Transitions Over Time 352 | # - Answers: is a cluster more common in a certain age range? 353 | # - If a subject has >1 samples in a time range, only the first is used. 354 | # - TIME_FUDGE is used to create a window around a specific time point 355 | # - If TIME_FUDGE == 0, use only exact matches to the time point 356 | # - When 0 < TIME_FUDGE < 1, interpret as a +/- percent, e.g. a TIME_FUDGE 357 | # of .2 would give a time point of 10 the range 8-12. 358 | # - When TIME_FUDGE >= 1, interpret as a +/- value, e.g. a TIME_FUDGE of 3 359 | # would give a time point of 10 the range 7-13. 360 | #============================================================================== 361 | 362 | #---------------------------------------------------------------------- 363 | # Modify these values as you please 364 | #---------------------------------------------------------------------- 365 | 366 | TIME_COLUMN <- "Age_in_Months" 367 | TIME_POINTS <- c(4.5, 8.5, 12.5, 16.5, 20.5, 24.5, 28.5, 32.5, 36.5, 40.5, 44.5) 368 | TIME_FUDGE <- 1.5 # +/- 1.5 months 369 | REQUIRE_ALL_TP <- FALSE # Drop subjects with missing time points 370 | CLUSTER_COLUMN <- "DMM_Cluster" 371 | SUBJECT_COLUMN <- "SubjectID" 372 | MINIMUM_PERCENT <- .04 373 | NODE_COLOR <- "#072b5a" 374 | EDGE_COLOR <- c("white", "orange", "darkorange", "orangered", "darkred") 375 | EDGE_PCT_RANGE <- c(0, 0.24) 376 | 377 | md <- Metadata #plots all 378 | #md <- Metadata[Metadata[['Birth_Mode_Simple']] == "Vaginal",,drop=FALSE] #00B9EB #plots specified 379 | #md <- Metadata[Metadata[['Birth_Mode_Simple']] == "Caesarian",,drop=FALSE] #ED5F16 380 | #md <- Metadata[Metadata[['Breastmilk_Combined']] == "Some_BM",,drop=FALSE] #072b5a 381 | #md <- Metadata[Metadata[['Breastmilk_Combined']] == "After_BM",,drop=FALSE] #ff0000 382 | # md <- Metadata[Metadata[['T1D_Outcome']] == "Before",,drop=FALSE] 383 | # md <- Metadata[Metadata[['T1D_Outcome']] == "Never",,drop=FALSE] 384 | # md <- Metadata[Metadata[['IA_Outcome']] == "Before",,drop=FALSE] 385 | # md <- Metadata[Metadata[['IA_Outcome']] == "Never",,drop=FALSE] 386 | 387 | 388 | 389 | #---------------------------------------------------------------------- 390 | # Double check for the user that they provided valid column names 391 | #---------------------------------------------------------------------- 392 | 393 | vars <- c(TIME_COLUMN, CLUSTER_COLUMN, SUBJECT_COLUMN) 394 | if (!all(vars %in% colnames(md))) 395 | stop(sprintf("Column name not found: %s", paste(collapse=", ", setdiff(vars, colnames(md))))) 396 | if (nrow(md) == 0) 397 | stop("Metadata object 'md' doesn't have any rows.") 398 | 399 | clusterNames <- unique(as.character(sort(md[[CLUSTER_COLUMN]]))) 400 | timePtNames <- unique(as.character(sort(TIME_POINTS))) 401 | stateNames <- apply(expand.grid(clusterNames, timePtNames), 1L, paste, collapse="@") 402 | nStates <- length(stateNames) 403 | 404 | 405 | #---------------------------------------------------------------------- 406 | # Filter out rows with missing data for time, subject, or cluster 407 | #---------------------------------------------------------------------- 408 | 409 | md <- md[!is.na(md[[TIME_COLUMN]]) & !is.na(md[[CLUSTER_COLUMN]]) & !is.na(md[[SUBJECT_COLUMN]]), vars, drop=FALSE] 410 | 411 | 412 | #---------------------------------------------------------------------- 413 | # Map samples to time points. For multiple, retain only closest. 414 | #---------------------------------------------------------------------- 415 | 416 | closestTP <- sapply(md[[TIME_COLUMN]], function (x) { 417 | x <- TIME_POINTS[which.min(abs(x - TIME_POINTS))] 418 | }) 419 | residuals <- abs(closestTP - md[[TIME_COLUMN]]) 420 | 421 | md[[TIME_COLUMN]] <- closestTP 422 | 423 | if (TIME_FUDGE < 1) inRange <- residuals <= TIME_FUDGE * closestTP 424 | if (TIME_FUDGE >= 1) inRange <- residuals <= TIME_FUDGE 425 | 426 | md <- md[inRange,,drop=FALSE] 427 | residuals <- residuals[inRange] 428 | 429 | md <- md[order(residuals),,drop=FALSE] 430 | md <- md[!duplicated(paste(md[[TIME_COLUMN]], md[[SUBJECT_COLUMN]])),,drop=FALSE] 431 | 432 | 433 | #---------------------------------------------------------------------- 434 | # Optionally require a subject to have samples from all the time points 435 | #---------------------------------------------------------------------- 436 | if (REQUIRE_ALL_TP) { 437 | md <- plyr::ddply(md, SUBJECT_COLUMN, function (x) { 438 | if (nrow(x) == length(TIME_POINTS)) return (x) 439 | return (NULL) 440 | }) 441 | } 442 | timePtCounts <- table(md[[TIME_COLUMN]]) 443 | nodeCounts <- unlist(as.list(table(paste(sep="@", md[[CLUSTER_COLUMN]], md[[TIME_COLUMN]])))) 444 | nodeCounts <- setNames(nodeCounts / timePtCounts[sub("^.*@", "", names(nodeCounts))], names(nodeCounts)) 445 | 446 | #---------------------------------------------------------------------- 447 | # Count the number of subjects at each time point and/or cluster 448 | #---------------------------------------------------------------------- 449 | 450 | table(md$Age_in_Months, dnn = TIME_COLUMN) 451 | table(md$DMM_Cluster, dnn = CLUSTER_COLUMN) 452 | table(md$DMM_Cluster, md$Age_in_Months, dnn = c(CLUSTER_COLUMN, TIME_COLUMN)) 453 | 454 | 455 | #---------------------------------------------------------------------- 456 | # Assemble a matrix to represent the number of each transition 457 | #---------------------------------------------------------------------- 458 | 459 | transitionMatrix <- matrix(0, nrow=nStates, ncol=nStates, dimnames=list(stateNames, stateNames)) 460 | 461 | md[[CLUSTER_COLUMN]] <- paste(sep="@", md[[CLUSTER_COLUMN]], md[[TIME_COLUMN]]) 462 | md[[TIME_COLUMN]] <- as.numeric(factor(md[[TIME_COLUMN]])) 463 | 464 | plyr::d_ply(md[,vars], SUBJECT_COLUMN, function (x) { 465 | 466 | if (nrow(x) < 2) return (NULL) 467 | 468 | x <- x[order(x[[TIME_COLUMN]]),,drop=FALSE] 469 | 470 | for (i in seq_len(nrow(x) - 1)) { 471 | 472 | t1 <- x[i, TIME_COLUMN] 473 | t2 <- x[i+1, TIME_COLUMN] 474 | if (t2 - t1 > 1) next 475 | 476 | c1 <- as.character(x[i, CLUSTER_COLUMN]) 477 | c2 <- as.character(x[i+1, CLUSTER_COLUMN]) 478 | transitionMatrix[c1, c2] <<- transitionMatrix[c1, c2] + 1 479 | } 480 | }) 481 | 482 | 483 | #---------------------------------------------------------------------- 484 | # Rescale the transition matrix to percentages 485 | #---------------------------------------------------------------------- 486 | indices <- sort(rep(1:length(timePtNames), length(clusterNames))) 487 | for (timePt in 1:(length(timePtNames) - 1)) { 488 | i <- which(indices == timePt) 489 | transitionMatrix[i,] <- transitionMatrix[i,] / sum(transitionMatrix[i,]) 490 | } 491 | 492 | 493 | #---------------------------------------------------------------------- 494 | # Sankey Diagram 495 | #---------------------------------------------------------------------- 496 | 497 | # Nodes <- data.frame( 498 | # 'NodeID' = stateNames, 499 | # 'Cluster' = sapply(stateNames, function (x) strsplit(x, '@', T)[[1]][[1]])) 500 | # 501 | # Links <- with( 502 | # subset(reshape2::melt(transitionMatrix), value > 0), 503 | # data.frame( 504 | # 'Source' = as.numeric(factor(Var1, levels=stateNames)) - 1, 505 | # 'Target' = as.numeric(factor(Var2, levels=stateNames)) - 1, 506 | # 'Value' = value )) 507 | # 508 | # networkD3::sankeyNetwork( 509 | # Links = Links, 510 | # Nodes = Nodes, 511 | # Source = 'Source', 512 | # Target = 'Target', 513 | # Value = 'Value', 514 | # NodeID = 'NodeID', 515 | # NodeGroup = 'Cluster', 516 | # fontSize = 10 ) 517 | # 518 | # remove('Links', 'Nodes') 519 | 520 | 521 | #---------------------------------------------------------------------- 522 | # Drop cluster/timePt combos with zero observations (after filtering) 523 | #---------------------------------------------------------------------- 524 | 525 | transitionMatrix[which(transitionMatrix < MINIMUM_PERCENT)] <- 0 526 | 527 | stateNames <- stateNames[rowSums(transitionMatrix) | colSums(transitionMatrix)] 528 | clusterNames <- clusterNames[sapply(clusterNames, function (x) any(grep(sprintf("^%s@", x), stateNames)))] 529 | timePtNames <- timePtNames[ sapply(timePtNames, function (x) any(grep(sprintf("@%s$", x), stateNames)))] 530 | 531 | transitionMatrix <- transitionMatrix[stateNames, stateNames, drop=FALSE] 532 | 533 | 534 | #---------------------------------------------------------------------- 535 | # Generate the figure, axis labels, and color bar legend 536 | #---------------------------------------------------------------------- 537 | 538 | g <- graph.adjacency(transitionMatrix, mode="upper", weighted=TRUE) 539 | 540 | layout <- matrix(unlist(strsplit(stateNames, "@", fixed=TRUE)), ncol=2, byrow=TRUE) 541 | layout <- matrix(ncol=2, c( 542 | as.numeric(factor(layout[,2], levels=timePtNames)), 543 | as.numeric(factor(layout[,1], levels=rev(clusterNames))) )) 544 | 545 | vertex_weights <- rescale(nodeCounts[names(V(g))]) 546 | edge_weights <- rescale(E(g)$weight, from=EDGE_PCT_RANGE) 547 | 548 | edgeColorRamp <- colorRamp(EDGE_COLOR) 549 | 550 | plot.igraph( 551 | x = g, 552 | xlim = c(0, .8), 553 | ylim = c(-1, 1), 554 | layout = layout, 555 | vertex.label = NA, 556 | vertex.size = rescale(sqrt(vertex_weights / 3.14)) * 25, 557 | vertex.color = NODE_COLOR, 558 | edge.width = edge_weights * 12 + 1, 559 | edge.color = rgb(edgeColorRamp(edge_weights) / 255) ) 560 | 561 | xLabelPos = (seq_along(timePtNames) - 1) * 2 / (length(timePtNames) - 1) - 1 562 | yLabelPos = (seq_along(clusterNames) - 1) * -2 / (length(clusterNames) - 1) + 1 563 | 564 | text(-1.3, yLabelPos, clusterNames) 565 | text(xLabelPos, 1.5, timePtNames) 566 | text(xLabelPos, 1.3, sprintf("n = %i", timePtCounts), cex=.6) 567 | 568 | with( 569 | cbreaks(EDGE_PCT_RANGE, pretty_breaks(10), percent), 570 | legend( 571 | x = 1.5, 572 | y = 1, 573 | fill = rgb(edgeColorRamp(rescale(breaks)) / 255), 574 | legend = labels, 575 | title = "Transition\nFrequency", 576 | bty = "n" )) 577 | 578 | 579 | #---------------------------------------------------------------------- 580 | # Save final figure to a pdf file 581 | #---------------------------------------------------------------------- 582 | 583 | dev.copy2pdf(file="16S_OTU_DMM_Transitions_Over_Time.pdf") 584 | 585 | 586 | 587 | #---------------------------------------------------------------------- 588 | # Cleanup 589 | #---------------------------------------------------------------------- 590 | 591 | # remove('CLUSTER_COLUMN', 'EDGE_COLOR', 'MINIMUM_PERCENT', 'NODE_COLOR', 'REQUIRE_ALL_TP') 592 | # remove('SUBJECT_COLUMN', 'TIME_COLUMN', 'TIME_FUDGE', 'TIME_POINTS', 'EDGE_PCT_RANGE') 593 | # remove('closestTP', 'clusterNames', 'edge_weights', 'edgeColorRamp', 'g') 594 | # remove('i', 'indices', 'inRange', 'layout', 'md', 'nStates', 'residuals') 595 | # remove('stateNames', 'timePt', 'timePtCounts', 'timePtNames', 'transitionMatrix') 596 | # remove('vars', 'nodeCounts', 'vertex_weights', 'xLabelPos', 'yLabelPos') 597 | 598 | --------------------------------------------------------------------------------