├── README.md
└── DMM cluster transition model


/README.md:
--------------------------------------------------------------------------------
1 | # Stewart_TEDDY_Microbiome_Analysis
2 | Temporal Development of the Gut Microbiome in Early Childhood - The TEDDY Study
3 | 
4 | # TEDDY study infant gut microbiome analyses by Baylor College of Medicine
5 | 
6 | This repository contains code for the transition model showing the progression of samples through each DMM cluster, which are presented in Figure 1 and Extended Data Figure 2 of the "Temporal Development of the Gut Microbiome in Early Childhood - The TEDDY Study" by Stewart et al. (citation to be included).
7 | 
8 | The gut microbiome undergoes substantial development from birth into childhood and is believed to have significant consequences for long-term health, including the development of persistant islet autoimmunity (IA) and Type 1 Diabetes (T1D). In a multi-geographical population of 903 infants at-risk for T1D who developed IA and/or T1D by months 46 of age and their matched controls, we sought to characterize the gut microbiome from 3 to 46 months of life and determine the key covariates influencing its development. The study included two datasets, 1) 16S rRNA gene sequencing of >12,000 stool samples and 2) metagenomic sequencing of 10,867 stool samples.
9 | 


--------------------------------------------------------------------------------
/DMM cluster transition model:
--------------------------------------------------------------------------------
  1 | 
  2 | #==============================================================================
  3 | #=-----------------------------------------------------------------------------
  4 | # Daniel Smith and Christopher Stewart
  5 | # Baylor College of Medicine
  6 | #
  7 | #
  8 | # ==========================
  9 | #
 10 | #   Script for looking at sample clusters throughout
 11 | #   a timecourse with multiple subjects.
 12 | # 
 13 | #
 14 | #=-----------------------------------------------------------------------------
 15 | #==============================================================================
 16 | 
 17 | library("ggplot2")
 18 | library("igraph")
 19 | library("slam")
 20 | library("scales")
 21 | 
 22 | 
 23 | 
 24 | #==============================================================================
 25 | # Load the pre-computed datasets
 26 | #==============================================================================
 27 | 
 28 | 
 29 | Metadata <- download.aws("")
 30 | GenusMtx <- download.aws("")
 31 | 
 32 | 
 33 | 
 34 | #==============================================================================
 35 | # Top 25 Genera across the different DMM Clusters as a Heatmap
 36 | #==============================================================================
 37 | 
 38 | CLUSTER_COLUMN <- "DMM_Cluster"
 39 | 
 40 | md     <- Metadata
 41 | genera <- t(t(GenusMtx) / colSums(GenusMtx))
 42 | 
 43 | anno <- md[colnames(genera), CLUSTER_COLUMN, drop=FALSE]
 44 | anno[[CLUSTER_COLUMN]] <- factor(anno[[CLUSTER_COLUMN]])
 45 | anno <- anno[order(anno[[CLUSTER_COLUMN]]), , drop=FALSE]
 46 | 
 47 | top25 <- head(names(rev(sort(rowSums(genera)))), 25)
 48 | 
 49 | colors <- colorRampPalette(rev(RColorBrewer::brewer.pal(n=7, name="RdYlBu")), bias=3)(100)  #use to set colour (name =) and scale (bias =)
 50 | 
 51 | mat <- genera[top25, rownames(anno), drop=FALSE]
 52 | mat <- t(apply(mat, 1L, scales::rescale))   #uncomment to normalize each taxa (row)
 53 | 
 54 | pheatmap::pheatmap(
 55 |   mat            = mat, 
 56 |   color          = colors,   #uncheck if setting the colour scale manual
 57 |   annotation_col = anno, 
 58 |   show_colnames  = FALSE,
 59 |   cluster_rows   = FALSE,
 60 |   cluster_cols   = FALSE,
 61 |   gaps_col       = cumsum(unname(table(anno[[CLUSTER_COLUMN]]))),
 62 |   labels_row     = sub("^.*__", "", top25)
 63 | )
 64 | 
 65 | # remove('CLUSTER_COLUMN')
 66 | # remove('anno', 'genera', 'md', 'top30', 'mat')
 67 | 
 68 | 
 69 | 
 70 | #=========
 71 | #manual order of bacteria
 72 | #=========
 73 | 
 74 | top25 <- c('Bacteria; __Actinobacteria; __Actinobacteria; __Bifidobacteriales; __Bifidobacteriaceae; __Bifidobacterium',
 75 |                     'Bacteria; __Firmicutes; __Bacilli; __Lactobacillales; __Enterococcaceae; __Enterococcus',
 76 |                    'Bacteria; __Firmicutes; __Negativicutes; __Selenomonadales; __Veillonellaceae; __Veillonella',
 77 |                    'Bacteria; __Firmicutes; __Bacilli; __Lactobacillales; __Streptococcaceae; __Streptococcus',
 78 |                    'Bacteria; __Proteobacteria; __Gammaproteobacteria; __Enterobacteriales; __Enterobacteriaceae; __Enterobacter',
 79 |                    'Bacteria; __Proteobacteria; __Gammaproteobacteria; __Enterobacteriales; __Enterobacteriaceae; __Escherichia_Shigella',
 80 |                    'Bacteria; __Bacteroidetes; __Bacteroidia; __Bacteroidales; __Bacteroidaceae; __Bacteroides',
 81 |                     'Bacteria; __Verrucomicrobia; __Verrucomicrobiae; __Verrucomicrobiales; __Verrucomicrobiaceae; __Akkermansia',
 82 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; ___Ruminococcus_gnavus_group',
 83 |                    'Bacteria; __Firmicutes; __Erysipelotrichia; __Erysipelotrichales; __Erysipelotrichaceae; __Erysipelatoclostridium',
 84 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Tyzzerella_4',
 85 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Lachnoclostridium',
 86 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Blautia',
 87 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Lachnospiraceae_UCG_008',
 88 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Anaerostipes',
 89 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; ___Eubacterium_rectale_group',
 90 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Clostridiaceae_1; __Clostridium_sensu_stricto_1',
 91 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Peptostreptococcaceae; __Romboutsia',
 92 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Peptostreptococcaceae; __Intestinibacter',
 93 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Lachnospiraceae; __Fusicatenibacter',
 94 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Ruminococcaceae; __Ruminococcus_2',
 95 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Ruminococcaceae; __Faecalibacterium',
 96 |                    'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Ruminococcaceae; __Subdoligranulum',
 97 |                     'Bacteria; __Firmicutes; __Clostridia; __Clostridiales; __Ruminococcaceae; ___Eubacterium_coprostanoligenes_group',
 98 |                    'Bacteria; __Firmicutes; __Negativicutes; __Selenomonadales; __Veillonellaceae; __Dialister')
 99 | 
100 | mat   <- genera[top25, rownames(anno), drop=FALSE]
101 | mat   <- t(apply(mat, 1L, scales::rescale))   #uncomment to normalize each taxa (row)
102 | 
103 | pheatmap::pheatmap(
104 |   mat            = mat, 
105 |   color          = colors,   #uncheck if setting the colour scale manual
106 |   annotation_col = anno, 
107 |   show_colnames  = FALSE,
108 |   cluster_rows   = FALSE,
109 |   cluster_cols   = FALSE,
110 |   gaps_col       = cumsum(unname(table(anno[[CLUSTER_COLUMN]]))),
111 |   labels_row     = sub("^.*; __", "", rownames(mat))
112 | )
113 | 
114 | 
115 | # remove('CLUSTER_COLUMN')
116 | # remove('anno', 'species', 'md', 'top25', 'mat')
117 | 
118 | 
119 | 
120 | 
121 | 
122 | #==============================================================================
123 | # DMM Transitions Circle
124 | #   - Answers: how often is cluster y preceded by cluster x?
125 | #==============================================================================
126 | 
127 | #----------------------------------------------------------------------
128 | # Modify these values as you please
129 | #----------------------------------------------------------------------
130 | 
131 | MAX_TIME_DELTA <- 120
132 | TIME_COLUMN    <- "Age_in_Days"
133 | CLUSTER_COLUMN <- "DMM_Cluster"
134 | SUBJECT_COLUMN <- "SubjectID"
135 | 
136 | md <- Metadata
137 | 
138 | 
139 | #----------------------------------------------------------------------
140 | # Double check for the user that they provided valid column names
141 | #----------------------------------------------------------------------
142 | 
143 | vars <- c(TIME_COLUMN, CLUSTER_COLUMN, SUBJECT_COLUMN)
144 | if (!all(vars %in% colnames(md)))
145 |   stop(sprintf("Column name not found: %s", paste(collapse=", ", setdiff(vars, colnames(md)))))
146 | 
147 | 
148 | #----------------------------------------------------------------------
149 | # Filter out rows with missing data for time, subject, or cluster
150 | #----------------------------------------------------------------------
151 | 
152 | md <- md[!is.na(md[[TIME_COLUMN]]) & !is.na(md[[CLUSTER_COLUMN]]) & !is.na(md[[SUBJECT_COLUMN]]),]
153 | 
154 | 
155 | #----------------------------------------------------------------------
156 | # Assemble a matrix to represent the number each transition
157 | #----------------------------------------------------------------------
158 | 
159 | stateNames       <- unique(as.character(sort(md[[CLUSTER_COLUMN]])))
160 | nStates          <- length(stateNames)
161 | transitionMatrix <- matrix(0, nrow=nStates, ncol=nStates, dimnames=list(stateNames, stateNames))
162 | 
163 | plyr::d_ply(md[,vars], SUBJECT_COLUMN, function (x) {
164 |   
165 |   if (nrow(x) < 2) return (NULL)
166 |   
167 |   x <- x[order(x[[TIME_COLUMN]]),,drop=FALSE]
168 |   
169 |   for (i in seq_len(nrow(x) - 1)) {
170 |     
171 |     t1 <- x[i,   TIME_COLUMN]
172 |     t2 <- x[i+1, TIME_COLUMN]
173 |     if (t2 - t1 > MAX_TIME_DELTA) next
174 |     
175 |     c1 <- as.character(x[i,   CLUSTER_COLUMN])
176 |     c2 <- as.character(x[i+1, CLUSTER_COLUMN])
177 |     transitionMatrix[c1, c2] <<- transitionMatrix[c1, c2] + 1
178 |   }
179 | })
180 | 
181 | 
182 | #----------------------------------------------------------------------
183 | # Normalize transition matrix so rows sum to 1
184 | #----------------------------------------------------------------------
185 | 
186 | transitionMatrix <- transitionMatrix / rowSums(transitionMatrix)
187 | 
188 | 
189 | #----------------------------------------------------------------------
190 | # Create iGraph network
191 | #----------------------------------------------------------------------
192 | 
193 | g <- graph.adjacency(transitionMatrix, mode="directed", weighted=TRUE)
194 | 
195 | 
196 | #----------------------------------------------------------------------
197 | # Create a function to generate a continuous color palette
198 | #----------------------------------------------------------------------
199 | 
200 | val2rgb <- colorRamp(rev(heat.colors(10)[1:6]), bias = 2)
201 | 
202 | 
203 | #----------------------------------------------------------------------
204 | # Generate the figure and color bar legend
205 | #----------------------------------------------------------------------
206 | 
207 | plot.igraph(
208 |   x               = g,
209 |   xlim            = c(0, .8),
210 |   ylim            = c(-1, 1),
211 |   layout          = layout_in_circle,
212 |   vertex.size     = colSums(transitionMatrix) * 20,
213 |   edge.width      = E(g)$weight * 20,
214 |   edge.color      = rgb(val2rgb(E(g)$weight) / 255),
215 |   edge.loop.angle = scales::rescale(seq_along(E(g)), to=c(0, -2 * 3.14)) )
216 | 
217 | legend(
218 |   x      = 1.5, 
219 |   y      = 1, 
220 |   fill   = rgb(val2rgb(c(0, .05, .1, .15, .2, .5, .75, 1)) / 255), 
221 |   legend = c("0%", "5%", "10%", "15%", "20%", "50%", "75%", "100%"), 
222 |   title  = "Transition\nFrequency", 
223 |   bty    = "n" )
224 | 
225 | 
226 | #----------------------------------------------------------------------
227 | # Save final figure to a pdf file
228 | #----------------------------------------------------------------------
229 | 
230 | dev.copy2pdf(file="16S_OTU_DMM_Transitions.pdf")
231 | 
232 | 
233 | #----------------------------------------------------------------------
234 | # Cleanup
235 | #----------------------------------------------------------------------
236 | 
237 | remove('CLUSTER_COLUMN', 'MAX_TIME_DELTA', 'SUBJECT_COLUMN', 'TIME_COLUMN')
238 | remove('g', 'md', 'nStates', 'stateNames', 'transitionMatrix', 'val2rgb', 'vars')
239 | 
240 | 
241 | 
242 | 
243 | 
244 | 
245 | 
246 | 
247 | 
248 | #==============================================================================
249 | # Sample Overview
250 | #   - Shows each sample for each subject colored by a variable
251 | #==============================================================================
252 | 
253 | #----------------------------------------------------------------------
254 | # Modify these values as you please
255 | #----------------------------------------------------------------------
256 | 
257 | TIME_COLUMN    <- "Age_in_Months"
258 | TIME_MIN_MAX   <- c(0, 40)
259 | COLOR_COLUMN   <- "DMM_Cluster"
260 | SUBJECT_COLUMN <- "SubjectID"
261 | FACET_COLUMN   <- "Birth_Mode_Simple"
262 | 
263 | md <- Metadata
264 | 
265 | 
266 | #----------------------------------------------------------------------
267 | # Double check for the user that they provided valid column names
268 | #----------------------------------------------------------------------
269 | 
270 | vars <- c(TIME_COLUMN, COLOR_COLUMN, SUBJECT_COLUMN, FACET_COLUMN)
271 | if (!all(vars %in% colnames(md)))
272 |   stop(sprintf("Column name not found: %s", paste(collapse=", ", setdiff(vars, colnames(md)))))
273 | 
274 | 
275 | #----------------------------------------------------------------------
276 | # Probably want the COLOR_COLUMN to be categorical
277 | #----------------------------------------------------------------------
278 | if (length(unique(md[[COLOR_COLUMN]])) < 20)
279 |   md[[COLOR_COLUMN]] <- as.character(md[[COLOR_COLUMN]])
280 | 
281 | 
282 | #----------------------------------------------------------------------
283 | # Filter out rows with missing data or outside specified time range
284 | #----------------------------------------------------------------------
285 | 
286 | md <- md[ !is.na(md[[TIME_COLUMN]]) & 
287 |           !is.na(md[[COLOR_COLUMN]]) & 
288 |           !is.na(md[[SUBJECT_COLUMN]]) &
289 |           !is.na(md[[FACET_COLUMN]]), , drop=FALSE]
290 | 
291 | md <- md[ md[[TIME_COLUMN]] >= min(TIME_MIN_MAX) & 
292 |           md[[TIME_COLUMN]] <= max(TIME_MIN_MAX), , drop=FALSE]
293 | 
294 | 
295 | #----------------------------------------------------------------------
296 | # Generate the figure and color bar legend
297 | #----------------------------------------------------------------------
298 | 
299 | ggplot(md) + 
300 |   ggtitle("16S OTU DMM Sample Overview") + 
301 |   geom_point(aes_string(x = TIME_COLUMN, y = SUBJECT_COLUMN, color = COLOR_COLUMN)) +
302 |   facet_grid(as.formula(sprintf("%s ~ .", FACET_COLUMN)), scales="free_y", space="free_y") +
303 |   theme_bw() + 
304 |   theme(
305 |     axis.title.x      = element_text(size=8),
306 |     axis.text.x       = element_text(size=8),
307 |     axis.title.y      = element_text(size=8),
308 |     axis.text.y       = element_text(size=2),
309 |     axis.ticks.y      = element_line(size=0.2),
310 |     plot.title        = element_text(size=8),
311 |     legend.text       = element_text(size=7),
312 |     legend.title      = element_text(size=8),
313 |     legend.key.height = unit(0.15, "in"),
314 |     legend.key.width  = unit(0.15, "in"),
315 |     legend.background = element_blank(),
316 |     legend.margin     = margin(rep(unit(0,"mm"), 4)),
317 |     plot.margin       = unit(c(0.2,0,0,0),"mm"),
318 |     strip.text        = element_text(size=8) )
319 | 
320 | 
321 | #----------------------------------------------------------------------
322 | # Save final figure to a pdf file
323 | #----------------------------------------------------------------------
324 | 
325 | ggsave(file = "16S_OTU_DMM_Schedule.pdf", width=8, height=20)
326 | 
327 | 
328 | #----------------------------------------------------------------------
329 | # Cleanup
330 | #----------------------------------------------------------------------
331 | 
332 | remove('COLOR_COLUMN', 'FACET_COLUMN', 'SUBJECT_COLUMN', 'TIME_COLUMN', 'TIME_MIN_MAX')
333 | remove('md', 'vars')
334 | 
335 | 
336 | 
337 | 
338 | 
339 | 
340 | 
341 | 
342 | 
343 | 
344 | 
345 | 
346 | 
347 | 
348 | 
349 | 
350 | #==============================================================================
351 | # DMM Transitions Over Time
352 | #   - Answers: is a cluster more common in a certain age range?
353 | #   - If a subject has >1 samples in a time range, only the first is used.
354 | #   - TIME_FUDGE is used to create a window around a specific time point
355 | #     - If TIME_FUDGE == 0, use only exact matches to the time point
356 | #     - When 0 < TIME_FUDGE < 1, interpret as a +/- percent, e.g. a TIME_FUDGE
357 | #       of .2 would give a time point of 10 the range 8-12.
358 | #     - When TIME_FUDGE >= 1, interpret as a +/- value, e.g. a TIME_FUDGE of 3
359 | #       would give a time point of 10 the range 7-13.
360 | #==============================================================================
361 | 
362 | #----------------------------------------------------------------------
363 | # Modify these values as you please
364 | #----------------------------------------------------------------------
365 | 
366 | TIME_COLUMN     <- "Age_in_Months"
367 | TIME_POINTS     <- c(4.5, 8.5, 12.5, 16.5, 20.5, 24.5, 28.5, 32.5, 36.5, 40.5, 44.5)
368 | TIME_FUDGE      <- 1.5            # +/- 1.5 months
369 | REQUIRE_ALL_TP  <- FALSE           # Drop subjects with missing time points
370 | CLUSTER_COLUMN  <- "DMM_Cluster"
371 | SUBJECT_COLUMN  <- "SubjectID"
372 | MINIMUM_PERCENT <- .04
373 | NODE_COLOR      <- "#072b5a"
374 | EDGE_COLOR      <- c("white", "orange", "darkorange", "orangered", "darkred")
375 | EDGE_PCT_RANGE  <- c(0, 0.24)   
376 | 
377 | md     <- Metadata     #plots all
378 | #md <- Metadata[Metadata[['Birth_Mode_Simple']] == "Vaginal",,drop=FALSE] #00B9EB #plots specified  
379 | #md <- Metadata[Metadata[['Birth_Mode_Simple']] == "Caesarian",,drop=FALSE]  #ED5F16
380 | #md <- Metadata[Metadata[['Breastmilk_Combined']] == "Some_BM",,drop=FALSE]  #072b5a
381 | #md <- Metadata[Metadata[['Breastmilk_Combined']] == "After_BM",,drop=FALSE]  #ff0000
382 | # md <- Metadata[Metadata[['T1D_Outcome']] == "Before",,drop=FALSE]
383 | # md <- Metadata[Metadata[['T1D_Outcome']] == "Never",,drop=FALSE]
384 | # md <- Metadata[Metadata[['IA_Outcome']] == "Before",,drop=FALSE]
385 | # md <- Metadata[Metadata[['IA_Outcome']] == "Never",,drop=FALSE]
386 | 
387 | 
388 | 
389 | #----------------------------------------------------------------------
390 | # Double check for the user that they provided valid column names
391 | #----------------------------------------------------------------------
392 | 
393 | vars <- c(TIME_COLUMN, CLUSTER_COLUMN, SUBJECT_COLUMN)
394 | if (!all(vars %in% colnames(md)))
395 |   stop(sprintf("Column name not found: %s", paste(collapse=", ", setdiff(vars, colnames(md)))))
396 | if (nrow(md) == 0)
397 |   stop("Metadata object 'md' doesn't have any rows.")
398 | 
399 | clusterNames <- unique(as.character(sort(md[[CLUSTER_COLUMN]])))
400 | timePtNames  <- unique(as.character(sort(TIME_POINTS)))
401 | stateNames   <- apply(expand.grid(clusterNames, timePtNames), 1L, paste, collapse="@")
402 | nStates      <- length(stateNames)
403 | 
404 | 
405 | #----------------------------------------------------------------------
406 | # Filter out rows with missing data for time, subject, or cluster
407 | #----------------------------------------------------------------------
408 | 
409 | md <- md[!is.na(md[[TIME_COLUMN]]) & !is.na(md[[CLUSTER_COLUMN]]) & !is.na(md[[SUBJECT_COLUMN]]), vars, drop=FALSE]
410 | 
411 | 
412 | #----------------------------------------------------------------------
413 | # Map samples to time points. For multiple, retain only closest.
414 | #----------------------------------------------------------------------
415 | 
416 | closestTP <- sapply(md[[TIME_COLUMN]], function (x) {
417 |   x <- TIME_POINTS[which.min(abs(x - TIME_POINTS))]
418 | })
419 | residuals <- abs(closestTP - md[[TIME_COLUMN]])
420 | 
421 | md[[TIME_COLUMN]] <- closestTP
422 | 
423 | if (TIME_FUDGE <  1) inRange <- residuals <= TIME_FUDGE * closestTP
424 | if (TIME_FUDGE >= 1) inRange <- residuals <= TIME_FUDGE
425 | 
426 | md        <- md[inRange,,drop=FALSE]
427 | residuals <- residuals[inRange]
428 | 
429 | md <- md[order(residuals),,drop=FALSE]
430 | md <- md[!duplicated(paste(md[[TIME_COLUMN]], md[[SUBJECT_COLUMN]])),,drop=FALSE]
431 | 
432 | 
433 | #----------------------------------------------------------------------
434 | # Optionally require a subject to have samples from all the time points
435 | #----------------------------------------------------------------------
436 | if (REQUIRE_ALL_TP) {
437 |   md <- plyr::ddply(md, SUBJECT_COLUMN, function (x) {
438 |     if (nrow(x) == length(TIME_POINTS)) return (x)
439 |     return (NULL)
440 |   })
441 | }
442 | timePtCounts <- table(md[[TIME_COLUMN]])
443 | nodeCounts   <- unlist(as.list(table(paste(sep="@", md[[CLUSTER_COLUMN]], md[[TIME_COLUMN]]))))
444 | nodeCounts   <- setNames(nodeCounts / timePtCounts[sub("^.*@", "", names(nodeCounts))], names(nodeCounts))
445 | 
446 | #----------------------------------------------------------------------
447 | # Count the number of subjects at each time point and/or cluster
448 | #----------------------------------------------------------------------
449 | 
450 | table(md$Age_in_Months, dnn = TIME_COLUMN)
451 | table(md$DMM_Cluster,   dnn = CLUSTER_COLUMN)
452 | table(md$DMM_Cluster, md$Age_in_Months, dnn = c(CLUSTER_COLUMN, TIME_COLUMN))
453 | 
454 | 
455 | #----------------------------------------------------------------------
456 | # Assemble a matrix to represent the number of each transition
457 | #----------------------------------------------------------------------
458 | 
459 | transitionMatrix <- matrix(0, nrow=nStates, ncol=nStates, dimnames=list(stateNames, stateNames))
460 | 
461 | md[[CLUSTER_COLUMN]] <- paste(sep="@", md[[CLUSTER_COLUMN]], md[[TIME_COLUMN]])
462 | md[[TIME_COLUMN]]    <- as.numeric(factor(md[[TIME_COLUMN]]))
463 | 
464 | plyr::d_ply(md[,vars], SUBJECT_COLUMN, function (x) {
465 |   
466 |   if (nrow(x) < 2) return (NULL)
467 |   
468 |   x <- x[order(x[[TIME_COLUMN]]),,drop=FALSE]
469 |   
470 |   for (i in seq_len(nrow(x) - 1)) {
471 |     
472 |     t1 <- x[i,   TIME_COLUMN]
473 |     t2 <- x[i+1, TIME_COLUMN]
474 |     if (t2 - t1 > 1) next
475 |     
476 |     c1 <- as.character(x[i,   CLUSTER_COLUMN])
477 |     c2 <- as.character(x[i+1, CLUSTER_COLUMN])
478 |     transitionMatrix[c1, c2] <<- transitionMatrix[c1, c2] + 1
479 |   }
480 | })
481 | 
482 | 
483 | #----------------------------------------------------------------------
484 | # Rescale the transition matrix to percentages
485 | #----------------------------------------------------------------------
486 | indices <- sort(rep(1:length(timePtNames), length(clusterNames)))
487 | for (timePt in 1:(length(timePtNames) - 1)) {
488 |   i <- which(indices == timePt)
489 |   transitionMatrix[i,] <- transitionMatrix[i,] / sum(transitionMatrix[i,])
490 | }
491 | 
492 | 
493 | #----------------------------------------------------------------------
494 | # Sankey Diagram
495 | #----------------------------------------------------------------------
496 | 
497 | # Nodes <- data.frame(
498 | #   'NodeID'  = stateNames,
499 | #   'Cluster' = sapply(stateNames, function (x) strsplit(x, '@', T)[[1]][[1]]))
500 | # 
501 | # Links <- with(
502 | #   subset(reshape2::melt(transitionMatrix), value > 0),
503 | #   data.frame(
504 | #     'Source' = as.numeric(factor(Var1, levels=stateNames)) - 1,
505 | #     'Target' = as.numeric(factor(Var2, levels=stateNames)) - 1,
506 | #     'Value'  = value ))
507 | # 
508 | # networkD3::sankeyNetwork(
509 | #   Links     = Links,
510 | #   Nodes     = Nodes,
511 | #   Source    = 'Source',
512 | #   Target    = 'Target',
513 | #   Value     = 'Value',
514 | #   NodeID    = 'NodeID',
515 | #   NodeGroup = 'Cluster',
516 | #   fontSize  = 10 )
517 | # 
518 | # remove('Links', 'Nodes')
519 | 
520 | 
521 | #----------------------------------------------------------------------
522 | # Drop cluster/timePt combos with zero observations (after filtering)
523 | #----------------------------------------------------------------------
524 | 
525 | transitionMatrix[which(transitionMatrix < MINIMUM_PERCENT)] <- 0
526 | 
527 | stateNames   <- stateNames[rowSums(transitionMatrix) | colSums(transitionMatrix)]
528 | clusterNames <- clusterNames[sapply(clusterNames, function (x) any(grep(sprintf("^%s@", x), stateNames)))]
529 | timePtNames  <- timePtNames[ sapply(timePtNames,  function (x) any(grep(sprintf("@%s$", x), stateNames)))]
530 | 
531 | transitionMatrix <- transitionMatrix[stateNames, stateNames, drop=FALSE]
532 | 
533 | 
534 | #----------------------------------------------------------------------
535 | # Generate the figure, axis labels, and color bar legend
536 | #----------------------------------------------------------------------
537 | 
538 | g <- graph.adjacency(transitionMatrix, mode="upper", weighted=TRUE)
539 | 
540 | layout <- matrix(unlist(strsplit(stateNames, "@", fixed=TRUE)), ncol=2, byrow=TRUE)
541 | layout <- matrix(ncol=2, c(
542 |   as.numeric(factor(layout[,2], levels=timePtNames)), 
543 |   as.numeric(factor(layout[,1], levels=rev(clusterNames))) ))
544 | 
545 | vertex_weights <- rescale(nodeCounts[names(V(g))])
546 | edge_weights   <- rescale(E(g)$weight, from=EDGE_PCT_RANGE)
547 | 
548 | edgeColorRamp <- colorRamp(EDGE_COLOR)
549 | 
550 | plot.igraph(
551 |   x                = g,
552 |   xlim             = c(0, .8),
553 |   ylim             = c(-1, 1),
554 |   layout           = layout,
555 |   vertex.label     = NA,
556 |   vertex.size      = rescale(sqrt(vertex_weights / 3.14)) * 25,
557 |   vertex.color     = NODE_COLOR,
558 |   edge.width       = edge_weights * 12 + 1,
559 |   edge.color       = rgb(edgeColorRamp(edge_weights) / 255) )
560 | 
561 | xLabelPos = (seq_along(timePtNames)  - 1) *  2 / (length(timePtNames)  - 1) - 1
562 | yLabelPos = (seq_along(clusterNames) - 1) * -2 / (length(clusterNames) - 1) + 1
563 | 
564 | text(-1.3, yLabelPos, clusterNames)
565 | text(xLabelPos, 1.5, timePtNames)
566 | text(xLabelPos, 1.3, sprintf("n = %i", timePtCounts), cex=.6)
567 | 
568 | with(
569 |   cbreaks(EDGE_PCT_RANGE, pretty_breaks(10), percent),
570 |   legend(
571 |     x      = 1.5, 
572 |     y      = 1, 
573 |     fill   = rgb(edgeColorRamp(rescale(breaks)) / 255), 
574 |     legend = labels, 
575 |     title  = "Transition\nFrequency", 
576 |     bty    = "n" ))
577 | 
578 | 
579 | #----------------------------------------------------------------------
580 | # Save final figure to a pdf file
581 | #----------------------------------------------------------------------
582 | 
583 | dev.copy2pdf(file="16S_OTU_DMM_Transitions_Over_Time.pdf")
584 | 
585 | 
586 | 
587 | #----------------------------------------------------------------------
588 | # Cleanup
589 | #----------------------------------------------------------------------
590 | 
591 | # remove('CLUSTER_COLUMN', 'EDGE_COLOR', 'MINIMUM_PERCENT', 'NODE_COLOR', 'REQUIRE_ALL_TP')
592 | # remove('SUBJECT_COLUMN', 'TIME_COLUMN', 'TIME_FUDGE', 'TIME_POINTS', 'EDGE_PCT_RANGE')
593 | # remove('closestTP', 'clusterNames', 'edge_weights', 'edgeColorRamp', 'g')
594 | # remove('i', 'indices', 'inRange', 'layout', 'md', 'nStates', 'residuals')
595 | # remove('stateNames', 'timePt', 'timePtCounts', 'timePtNames', 'transitionMatrix')
596 | # remove('vars', 'nodeCounts', 'vertex_weights', 'xLabelPos', 'yLabelPos')
597 | 
598 | 


--------------------------------------------------------------------------------