├── .DS_Store ├── .RData ├── .Rhistory ├── .gitattributes ├── .gitignore ├── README.md ├── bulkRNAseq.Rproj ├── data ├── GSE63310 │ └── GSE63310_RAW.tar ├── GSM1545535_10_6_5_11.txt ├── GSM1545536_9_6_5_11.txt ├── GSM1545537_mo906111-1_m09611-2.txt.gz ├── GSM1545538_purep53.txt ├── GSM1545539_JMS8-2.txt ├── GSM1545540_JMS8-3.txt ├── GSM1545541_JMS8-4.txt ├── GSM1545542_JMS8-5.txt ├── GSM1545543_JMS9-CDBG.txt.gz ├── GSM1545544_JMS9-P7c.txt └── GSM1545545_JMS9-P8c.txt ├── notebooks ├── 10_enrichment.Rmd ├── 1_install_packages.Rmd ├── 2_downloadRNAseq.Rmd ├── 3_annotate.Rmd ├── 4_cpm_n_log2cpm.Rmd ├── 5_filter_n_plot.Rmd ├── 6_norm_n_plot.Rmd ├── 7_unsupervised_clust.Rmd ├── 8_diff_expr.Rmd └── 9_RNA_data_viz.Rmd └── results ├── DEresults.txt ├── MDSplot-unnormalized.png ├── MDSplot.png ├── barplot_react.png ├── basal_lp_volcano.png ├── basal_ml_volcano.png ├── dotplot_react.png ├── heatmap.png └── lp_ml_volcano.png /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/.DS_Store -------------------------------------------------------------------------------- /.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/.RData -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | install_load("GEOquery", "edgeR") 2 | ## if you ARE using an R notebook (if not the comment out this line): 3 | knitr::opts_knit$set(root.dir = "~/Desktop/RNAseq/Data/") 4 | ## if you ARE NOT using an R notebook (un-comment this line): 5 | #setwd("~/Desktop/RNAseq/Data/") 6 | ?getGEOSuppFiles 7 | gse <- getGEOSuppFiles("GSE63310") 8 | untar("./GSE63310/GSE63310_RAW.tar") 9 | files <- c("GSM1545535_10_6_5_11.txt", 
"GSM1545536_9_6_5_11.txt", "GSM1545538_purep53.txt", "GSM1545539_JMS8-2.txt", "GSM1545540_JMS8-3.txt", "GSM1545541_JMS8-4.txt", "GSM1545542_JMS8-5.txt", "GSM1545544_JMS9-P7c.txt", "GSM1545545_JMS9-P8c.txt") 10 | for(i in paste(files, ".gz", sep="")) { 11 | gunzip(i, overwrite=TRUE) 12 | } 13 | files[1] 14 | f1 <- read.delim(files[1]) 15 | dim(f1) 16 | head(f1) 17 | ?readDGE 18 | dge <- readDGE(files, columns=c(1,3)) 19 | dge 20 | ## Defining install_load function - adapted from: https://gitlab.com/iembry/install.load 21 | install_load <- function(package1, ...) { 22 | # convert arguments to vector 23 | packages <- c(package1, ...) 24 | # start loop to determine if each package is installed 25 | for(package in packages) { 26 | # if package is installed, just load it 27 | if(package %in% (BiocManager::available())) 28 | do.call('library', list(package)) 29 | # if package is not installed locally, download, then load 30 | else { 31 | BiocManager::install(package) 32 | do.call("library", list(package)) 33 | } 34 | } 35 | } 36 | install_load("edgeR","org.Mm.eg.db") 37 | ## if you ARE using an R notebook (if not the comment out this line): 38 | knitr::opts_knit$set(root.dir = "~/Desktop/RNAseq/Data/") 39 | ## if you ARE NOT using an R notebook (un-comment this line): 40 | #setwd("~/Desktop/RNAseq/Data/") 41 | files 42 | dge <- readDGE(files, columns=c(1,3)) 43 | dge 44 | dge$samples$group <- as.factor(c("LP", "ML", "Basal", "Basal", "ML", "LP", "Basal", "ML", "LP")) 45 | dge$samples 46 | head(dge$counts) 47 | dge_ids <- rownames(dge) 48 | head(dge_ids) 49 | gene_ids <- select(org.Mm.eg.db, 50 | keys=dge_ids, 51 | keytype="ENTREZID", 52 | columns=c("SYMBOL")) 53 | head(gene_ids) 54 | names(dge) 55 | dge$gene_ids <- gene_ids 56 | names(dge) 57 | f1 <- read.delim(files[1]) 58 | dim(f1) 59 | head(f1) 60 | ?cpm 61 | cpm <- cpm(dge) 62 | log2_cpm <- cpm(dge, log=TRUE) 63 | head(dge$counts) 64 | head(cpm) 65 | head(log2_cpm) 66 | View(dge) 67 | ## Defining install_load function 
- adapted from: https://gitlab.com/iembry/install.load 68 | install_load <- function(package1, ...) { 69 | # convert arguments to vector 70 | packages <- c(package1, ...) 71 | # start loop to determine if each package is installed 72 | for(package in packages) { 73 | # if package is installed, just load it 74 | if(package %in% (BiocManager::available())) 75 | do.call('library', list(package)) 76 | # if package is not installed locally, download, then load 77 | else { 78 | BiocManager::install(package) 79 | do.call("library", list(package)) 80 | } 81 | } 82 | } 83 | install_load("edgeR","RColorBrewer") 84 | dge_sub <- dge[rowSums(cpm>1)>=3, ] 85 | head(cpm) 86 | head(rowSums(cpm>1)) 87 | head(rowSums(cpm>1)>=3) 88 | dim(dge) 89 | dim(dge_sub) 90 | nrow(dge_sub) / nrow(dge) 91 | ?RColorBrewer 92 | nsamples <- ncol(dge_sub) 93 | col <- brewer.pal(nsamples, "Paired") 94 | col 95 | log2cpm_raw <- cpm(dge, log=TRUE) 96 | log2cpm_sub <- cpm(dge_sub, log=TRUE) 97 | density(log2cpm_raw[,1]) 98 | density(log2cpm_sub[,1]) 99 | par(mfrow=c(1,2)) 100 | plot(density(log2cpm_raw[,1]), 101 | col=col[1], 102 | lwd=2, 103 | ylim=c(0,0.21), 104 | las=2, 105 | main="", 106 | xlab="") 107 | title(main="A. Raw data", xlab="Log2(CPM)") 108 | abline(v=0, lty=3) 109 | for (i in 2:nsamples){ 110 | den <- density(log2cpm_raw[,i]) 111 | lines(den$x, den$y, col=col[i], lwd=2) 112 | } 113 | par(mfrow=c(1,2)) 114 | plot(density(log2cpm_raw[,1]), 115 | col=col[1], 116 | lwd=2, 117 | ylim=c(0,0.21), 118 | las=2, 119 | main="", 120 | xlab="") 121 | title(main="A. Raw data", xlab="Log2(CPM)") 122 | abline(v=0, lty=3) 123 | for (i in 2:nsamples){ 124 | den <- density(log2cpm_raw[,i]) 125 | lines(den$x, den$y, col=col[i], lwd=2) 126 | } 127 | plot(density(log2cpm_sub[,1]), 128 | col=col[1], 129 | lwd=2, 130 | ylim=c(0,0.21), 131 | las=2, 132 | main="", 133 | xlab="") 134 | title(main="B. 
Filtered data", xlab="Log2(CPM)") 135 | abline(v=0, lty=3) 136 | for (i in 2:nsamples){ 137 | den <- density(log2cpm_sub[,i]) 138 | lines(den$x, den$y, col=col[i], lwd=2) 139 | } 140 | ?cpm 141 | ## Defining install_load function - adapted from: https://gitlab.com/iembry/install.load 142 | install_load <- function(package1, ...) { 143 | # convert arguments to vector 144 | packages <- c(package1, ...) 145 | # start loop to determine if each package is installed 146 | for(package in packages) { 147 | # if package is installed, just load it 148 | if(package %in% (BiocManager::available())) 149 | do.call('library', list(package)) 150 | # if package is not installed locally, download, then load 151 | else { 152 | BiocManager::install(package) 153 | do.call("library", list(package)) 154 | } 155 | } 156 | } 157 | install_load("edgeR","RColorBrewer") 158 | dge_sub$samples 159 | ?calcNormFactors 160 | dge_sub_norm <- calcNormFactors(dge_sub, method = "TMM") 161 | dge_sub_norm$samples 162 | dge_sub$counts[,1] <- dge_sub$counts[,1]*0.05 163 | dge_sub$counts[,2] <- dge_sub$counts[,2]*5 164 | log2cpm_unorm <- cpm(dge_sub, log=TRUE) 165 | log2cpm_norm <- cpm(dge_sub_norm, log=TRUE) 166 | par(mfrow=c(1,2)) 167 | boxplot(log2cpm_unorm, 168 | las=2, 169 | col=col, 170 | main="") 171 | title(main="A. Unnormalized data",ylab="Log2(CPM)") 172 | boxplot(log2cpm_norm, 173 | las=2, 174 | col=col, 175 | main="") 176 | title(main="B. 
Normalized data",ylab="Log2(CPM)") 177 | ## if you ARE using an R notebook (if not the comment out this line): 178 | knitr::opts_knit$set(root.dir = "~/Desktop/RNAseq/Results/") 179 | ## if you ARE NOT using an R notebook (un-comment this line): 180 | #setwd("~/Desktop/RNAseq/Results/") 181 | grps <- dge_sub$samples$group 182 | col.grp <- grps 183 | levels(col.grp) <- brewer.pal(nlevels(col.grp), "Set1") 184 | col.grp <- as.character(col.grp) 185 | col.grp 186 | grps 187 | ?plotMDS 188 | head(log2cpm_norm) 189 | #png("MDSplot.png") 190 | par(mfrow=c(1,1)) 191 | plotMDS(log2cpm_norm, 192 | labels=grps, 193 | col=col.grp, 194 | xlab="Log2(Fold-Change)", 195 | ylab="Log2(Fold-Change)") 196 | title(main="Sample groups") 197 | #dev.off() 198 | ## if you ARE using an R notebook (if not the comment out this line): 199 | knitr::opts_knit$set(root.dir = "~/Desktop/RNAseq/Results/") 200 | ## if you ARE NOT using an R notebook (un-comment this line): 201 | #setwd("~/Desktop/RNAseq/Results/") 202 | ?model.matrix 203 | grps 204 | design <- model.matrix(~0 + grps) 205 | design 206 | colnames(design) <- gsub("grps", "", colnames(design)) 207 | design 208 | ?makeContrasts 209 | contrast <- makeContrasts( 210 | BasalvsLP = Basal - LP, 211 | BasalvsML = Basal - ML, 212 | LPvsML = LP - ML, 213 | levels = colnames(design)) 214 | contrast 215 | ?voom 216 | dge_voom <- voom(dge_sub_norm, design) 217 | dge_voom 218 | ?lmFit 219 | dge_fit <- lmFit(dge_voom, design) 220 | ?contrasts.fit 221 | grp_fit <- contrasts.fit(dge_fit, contrasts=contrast) 222 | head(dge_fit$coefficients) 223 | head(grp_fit$coefficients) 224 | ?eBayes 225 | efit <- eBayes(grp_fit) 226 | efit 227 | ?plotSA 228 | plotSA(efit, main="Mean−variance trend") 229 | ?decideTests 230 | summary(decideTests(efit)) 231 | ?topTable 232 | ## look at top 10 overall by F statistics/p-values 233 | topTableF(efit, number=10) 234 | names(efit) 235 | efit$contrasts 236 | ## Basal vs. 
LP (coef=1) 237 | topTable(efit, coef=1, number=10, sort.by="p") 238 | ## Basal vs. ML (coef=2) 239 | topTable(efit, coef=2, number=10, sort.by="p") 240 | ## LP vs. ML (coef=3) 241 | topTable(efit, coef=3, number=10, sort.by="p") 242 | write.fit(efit, file="DEresults.txt", adjust="BH", method="separate") 243 | ## Defining install_load function - adapted from: https://gitlab.com/iembry/install.load 244 | install_load <- function(package1, ...) { 245 | # convert arguments to vector 246 | packages <- c(package1, ...) 247 | # start loop to determine if each package is installed 248 | for(package in packages) { 249 | # if package is installed, just load it 250 | if(package %in% (BiocManager::available())) 251 | do.call('library', list(package)) 252 | # if package is not installed locally, download, then load 253 | else { 254 | BiocManager::install(package) 255 | do.call("library", list(package)) 256 | } 257 | } 258 | } 259 | install_load("limma","gplots","RColorBrewer") 260 | ## if you ARE using an R notebook (if not the comment out this line): 261 | knitr::opts_knit$set(root.dir = "~/Desktop/RNAseq/Results/") 262 | ## if you ARE NOT using an R notebook (un-comment this line): 263 | #setwd("~/Desktop/RNAseq/Results/") 264 | ?apply 265 | var_genes <- apply(log2cpm_norm, 1, var) 266 | head(var_genes) 267 | select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] 268 | head(select_var) 269 | highly_variable_lcpm <- log2cpm_norm[select_var,] 270 | dim(highly_variable_lcpm) 271 | head(highly_variable_lcpm) 272 | ## Get some nicer colors 273 | ## to display brewer colors (use diverging colors in this instance): display.brewer.all() 274 | mypalette <- brewer.pal(11,"RdBu") 275 | morecols <- colorRampPalette(mypalette) 276 | # Set up color vector for celltype variable 277 | col.cell <- c("honeydew4", "lightblue3", "lightcyan2")[c(dge$samples$group)] 278 | # view dendograms, heatmap 279 | plot(heatmap$colDendrogram) 280 | ## Get some nicer colors 281 | ## to display brewer 
colors (use diverging colors in this instance): display.brewer.all() 282 | mypalette <- brewer.pal(11,"RdBu") 283 | morecols <- colorRampPalette(mypalette) 284 | # Set up color vector for celltype variable 285 | col.cell <- c("honeydew4", "lightblue3", "lightcyan2")[c(dge$samples$group)] 286 | # view dendograms, heatmap 287 | #plot(heatmap$colDendrogram) 288 | plot(heatmap$rowDendrogram) 289 | ## Get some nicer colors 290 | ## to display brewer colors (use diverging colors in this instance): display.brewer.all() 291 | mypalette <- brewer.pal(11,"RdBu") 292 | morecols <- colorRampPalette(mypalette) 293 | # Set up color vector for celltype variable 294 | col.cell <- c("honeydew4", "lightblue3", "lightcyan2")[c(dge$samples$group)] 295 | # view dendograms, heatmap 296 | #plot(heatmap$colDendrogram) 297 | #plot(heatmap$rowDendrogram) 298 | heatmap.2(highly_variable_lcpm, col=rev(morecols(50)), 299 | trace = "none", 300 | main = "Top 500 most variable genes - ML v. LP v. Basal", 301 | keysize = 1.75, 302 | key.title = "", 303 | lwid = c(0.09, 0.4), # helps format the legend - arbitrary 304 | ColSideColors = col.cell,scale="row", 305 | srtCol = 25, 306 | labRow = NA, 307 | margins = c(8,8), 308 | ) 309 | ## Get some nicer colors 310 | ## to display brewer colors (use diverging colors in this instance): display.brewer.all() 311 | mypalette <- brewer.pal(11,"RdBu") 312 | morecols <- colorRampPalette(mypalette) 313 | # Set up color vector for celltype variable 314 | col.cell <- c("honeydew4", "lightblue3", "lightcyan2")[c(dge$samples$group)] 315 | heatmapp <- heatmap.2(highly_variable_lcpm, col=rev(morecols(50)), 316 | trace = "none", 317 | main = "Top 500 most variable genes - ML v. LP v. 
Basal", 318 | keysize = 1.75, 319 | key.title = "", 320 | lwid = c(0.09, 0.4), # helps format the legend - arbitrary 321 | ColSideColors = col.cell,scale="row", 322 | srtCol = 25, 323 | labRow = NA, 324 | margins = c(8,8), 325 | ) 326 | heatmapp 327 | # view dendograms, heatmap 328 | plot(heatmapp$colDendrogram) 329 | plot(heatmapp$rowDendrogram) 330 | # define filename 331 | png(filename="heatmap.png") 332 | #store heatmap as object in GlobalEnv 333 | heatmap <- heatmap.2(highly_variable_lcpm, col=rev(morecols(50)), 334 | trace = "none", 335 | main = "Top 500 most variable genes - ML v. LP v. Basal", 336 | keysize = 1.75, 337 | key.title = "", 338 | lwid = c(0.08, 0.4), 339 | ColSideColors = col.cell,scale="row", 340 | srtCol = 25, 341 | labRow = NA, 342 | margins = c(8,8), 343 | ) 344 | # write out 345 | dev.off() 346 | #?heatmap.2 347 | top_500_clustered <- as.hclust(heatmap$rowDendrogram) 348 | # define the clusters (essentially picking where to "cut"" the dendogram and select clusters) 349 | # cutree() returns a vector of cluster membership in the order of the original data rows 350 | mycl <- cutree(top_500_clustered, h=max(40)) 351 | # examine it 352 | #mycl 353 | # examine the head and tail of cluster membership by it's order in the heatmap 354 | head(mycl[top_500_clustered$order], n=10) 355 | tail(mycl[top_500_clustered$order], n=10) 356 | # you could either grab a cluster 357 | #cluster1 <- highly_variable_lcpm[mycl == 1,] 358 | # or simply add the cluster ID to your data. This is what we'll do. 359 | cldat <- cbind(highly_variable_lcpm, clusterID=mycl) 360 | # examine the data with cluster ids attached, and ordered like the heat map 361 | top_diff_genes <- as.data.frame(cldat[top_500_clustered$order,]) 362 | # check is gene IDs table is still in memory: 363 | head(gene_ids) 364 | #use gene IDs to rename genes in hr... 
365 | top_diff_genes <- tibble::rownames_to_column(top_diff_genes, "ENTREZID") 366 | top_diff_genes <- merge(gene_ids, top_diff_genes, by="ENTREZID") 367 | head(top_diff_genes, n=10) 368 | mycl <- cutree(top_500_clustered, h=max(40)) 369 | head(mycl[top_500_clustered$order], n=10) 370 | tail(mycl[top_500_clustered$order], n=10) 371 | # Now we'll grab a cluster 372 | cluster1 <- highly_variable_lcpm[mycl == 1,] 373 | # as opposed to adding the cluster ID to the data, as we did previously. 374 | #cldat <- cbind(highly_variable_lcpm, clusterID=mycl) 375 | top_diff_genes <- as.data.frame(cluster1) 376 | #use gene IDs to rename genes in hr... 377 | top_diff_genes <- tibble::rownames_to_column(top_diff_genes, "ENTREZID") 378 | top_diff_genes <- merge(gene_ids, top_diff_genes, by="ENTREZID") 379 | head(top_diff_genes, n=10) 380 | mycl <- cutree(top_500_clustered, 381 | h=max(20) # here is the line to edit in this case - sets the height at which to "cut" the dendogram 382 | ) 383 | head(mycl[top_500_clustered$order], n=10) 384 | tail(mycl[top_500_clustered$order], n=10) 385 | # This time we're adding the cluster assignment to the data to observe what this threshold adjustment actually accomplished 386 | cldat <- cbind(highly_variable_lcpm, clusterID=mycl) 387 | top_diff_genes <- as.data.frame(cldat[top_500_clustered$order,]) 388 | top_diff_genes <- tibble::rownames_to_column(top_diff_genes, "ENTREZID") 389 | top_diff_genes <- merge(gene_ids, top_diff_genes, by="ENTREZID", sort = F) 390 | head(top_diff_genes) 391 | ### Rehashing how to format data for volano plots (from notebooks 6, 7, 8) 392 | # start with our groups 393 | grps # this variable contains our groups of samples (ML, LP, Basal) - created in notebook #7 (unsupervised clustering) 394 | # creating design matrix (dummy variable creation for each sample and the group it belongs to) 395 | design <- model.matrix(~0 + grps) 396 | # relabeling columns 397 | colnames(design) <- gsub("grps", "", colnames(design)) 398 | 
#creating contrast matrix 399 | contrast <- makeContrasts( 400 | BasalvsLP = Basal - LP, 401 | BasalvsML = Basal - ML, 402 | LPvsML = LP - ML, 403 | levels = colnames(design)) 404 | # Now we use `voom()` from `limma` to combine log2(CPM) values from the filtered, normalized data with the design matrix 405 | dge_voom <- voom(dge_sub_norm, design) 406 | # Using limma's lmfit() function - this is the linear model fit step 407 | dge_fit <- lmFit(dge_voom, design) 408 | # Using contrast matrix to compare the groups 409 | grp_fit <- contrasts.fit(dge_fit, contrasts=contrast) 410 | # Empirical Bayes method (limma) to use information across all genes - this should make the residual variances independent of the expression levels. 411 | efit <- eBayes(grp_fit) 412 | # convert gene IDs to Gene Symbols for easier interpretation 413 | efit_genes <- names(efit$Amean) 414 | efit_genes <- as.data.frame.AsIs(efit_genes) 415 | colnames(efit_genes) <- "ENTREZID" 416 | efit_genes <- merge(gene_ids, efit_genes, by="ENTREZID", sort = F) 417 | names(efit$Amean) <- efit_genes$SYMBOL 418 | ?volcanoplot 419 | # basal vs lp 420 | volcanoplot(efit, 421 | coef = 1, 422 | style = "p-value", 423 | highlight = 10, 424 | names = names(efit$Amean), 425 | hl.col="blue", 426 | main = "Basal vs LP", 427 | xlab = "Log2 Fold Change", 428 | ylab = NULL, 429 | pch=20, 430 | cex=0.35) 431 | # basal vs ml 432 | volcanoplot(efit, coef = 2, style = "B-statistic", highlight = 10, names = names(efit$Amean), 433 | hl.col="blue", 434 | main = "Basal vs ML", xlab = "Log2 Fold Change", ylab = NULL, pch=20, cex=0.35) 435 | # lp vs ml 436 | volcanoplot(efit,coef = 3, style = "p-value", highlight = 10, names = names(efit$Amean), 437 | hl.col="blue", 438 | main = "LP vs ML", xlab = "Log2 Fold Change", ylab = NULL, pch=20, cex=0.35) 439 | ?topTable 440 | # top hits: 441 | BasalvLP_top <- topTable(efit, coef = 1, number = 10, sort.by = 'P') 442 | BasalvLP_top 443 | # top hits: 444 | BasalvML_top <- topTable(efit, coef = 
2, number = 10, sort.by = 'P') 445 | BasalvML_top 446 | # top hits: 447 | ML_v_LP_top <- topTable(efit, coef = 3, number = 10, sort.by = 'P') 448 | ML_v_LP_top 449 | # convert gene IDs to Gene Symbols for easier interpretation 450 | ML_v_LP_top <- tibble::rownames_to_column(ML_v_LP_top, "ENTREZID") 451 | ML_v_LP_top <- merge(gene_ids, ML_v_LP_top, by="ENTREZID", sort = F) 452 | ML_v_LP_top 453 | ## Defining install_load function - adapted from: https://gitlab.com/iembry/install.load 454 | install_load <- function(package1, ...) { 455 | # convert arguments to vector 456 | packages <- c(package1, ...) 457 | # start loop to determine if each package is installed 458 | for(package in packages) { 459 | # if package is installed, just load it 460 | if(package %in% (BiocManager::available())) 461 | do.call('library', list(package)) 462 | # if package is not installed locally, download, then load 463 | else { 464 | BiocManager::install(package) 465 | do.call("library", list(package)) 466 | } 467 | } 468 | } 469 | install_load("gage","pathview") 470 | ## if you ARE using an R notebook (if not the comment out this line): 471 | knitr::opts_knit$set(root.dir = "~/Desktop/RNAseq/Results/") 472 | ## if you ARE NOT using an R notebook (un-comment this line): 473 | #setwd("~/Desktop/RNAseq/Results/") 474 | ?gage 475 | #head(EFITE$coefficients) 476 | fold_lpml <- efit$coefficients[, "LPvsML"] 477 | head(fold_lpml) ## Entrez IDs are retained from the coefficient matrix 478 | ?kegg.gsets 479 | kegg_mouse <- kegg.gsets(species="mmu", id.type="kegg") 480 | names(kegg_mouse) 481 | head(kegg_mouse$kg.sets, n=3) ## look at first 3 482 | ?gage 483 | kegg_all = gage(exprs=fold_lpml, gsets=kegg_mouse$kg.sets, same.dir=TRUE) 484 | lapply(kegg_all, head) 485 | names(kegg_mouse) 486 | head(kegg_mouse$met.idx) ## these are indices of the metabolic pathways 487 | kegg_mouse$kg.sets[1] ## this is the first metabolic pathway (index=1) 488 | kegg_met <- kegg_mouse$kg.sets[kegg_mouse$met.idx] ## all 
metabolic pathways 489 | length(kegg_mouse$met.idx) 490 | length(kegg_met) 491 | kegg_met = gage(exprs=fold_lpml, gsets=kegg_met, same.dir=TRUE) 492 | names(kegg_met) 493 | head(kegg_met$less, n=3) 494 | ## extract KEGG ID and pathway name (first row name of the kegg_met$less data) 495 | op <- rownames(kegg_met$less)[1] 496 | op 497 | ## extract the KEGG ID (this code removes everything after the first space from op) 498 | op_kegg <- sub(" .*$", "", op) 499 | op_kegg 500 | ?pathview 501 | pathview(gene.data=fold_lpml, pathway.id=op_kegg, species="mmu") 502 | par(mfrow=c(1,2)) 503 | boxplot(log2cpm_unorm, 504 | las=2, 505 | col=col, 506 | main="") 507 | title(main="A. Unnormalized data",ylab="Log2(CPM)") 508 | boxplot(log2cpm_norm, 509 | las=2, 510 | col=col, 511 | main="") 512 | title(main="B. Normalized data",ylab="Log2(CPM)") 513 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | *.Rmd linguist-language=R 3 | *.nb.html linguist-vendored 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | sandbox.nb.html 2 | .DS_store 3 | .Rproj.user 4 | .Rhistory 5 | .RData 6 | .Ruserdata 7 | 1_install_packages.nb.html 8 | 2_downloadRNAseq.nb.html 9 | 3_annotate.nb.html 10 | 4_cpm_n_log2cpm.nb.html 11 | 5_filter_n_plot.nb.html 12 | 6_norm_n_plot.nb.html 13 | 7_unsupervised_clust.nb.html 14 | 8_diff_expr.nb.html 15 | 9_RNA_data_viz.nb.html 16 | 10_gsea.nb.html 17 | DEresults.txt 18 | GSM1545539_JMS8-2.txt 19 | GSM1545544_JMS9-P7c.txt 20 | GSM1545535_10_6_5_11.txt 21 | GSM1545540_JMS8-3.txt 22 | GSM1545545_JMS9-P8c.txt 23 | GSM1545536_9_6_5_11.txt 24 | GSM1545541_JMS8-4.txt 25 | GSM1545538_purep53.txt 26 | GSM1545542_JMS8-5.txt 27 | heatmap.png 28 | mmu00190.pathview.png 29 | mmu00190.png 30 | mmu00190.xml 
31 | GSM1545537_mo906111-1_m09611-2.txt.gz 32 | GSM1545543_JMS9-CDBG.txt.gz 33 | /GSE63310 34 | GSM1545535_10_6_5_11.txt.gz 35 | GSM1545536_9_6_5_11.txt.gz 36 | GSM1545538_purep53.txt.gz 37 | GSM1545539_JMS8-2.txt.gz 38 | GSM1545540_JMS8-3.txt.gz 39 | GSM1545541_JMS8-4.txt.gz 40 | GSM1545542_JMS8-5.txt.gz 41 | GSM1545544_JMS9-P7c.txt.gz 42 | GSM1545545_JMS9-P8c.txt.gz 43 | 11_sandbox.nb.html 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UCSF DSI RNA seq 2 | 3 | Workshop Overview 4 | 5 | This workshop is intended for individuals who are already comfortable with R programming and who are interested in learning to use R for standard RNA-Seq analyses. We will take you through a complete RNA-Seq workflow using R Bioconductor packages. 6 | 7 | Learning Objectives 8 | By the end of the workshop, participants should be able to: 9 | 10 | - Download RNA-seq files from GEO 11 | - Annotate the samples 12 | - Calculate expression as read counts/million (CPM) and log2(CPM) 13 | - Filter out genes with low expression and plot 14 | - Normalize the filtered expression data and plot 15 | - Perform unsupervised clustering of expression data 16 | - Perform differential expression analysis 17 | - Create heatmaps and volcano plots 18 | - Perform enrichment and pathway analysis 19 | 20 | Prerequisites 21 | 22 | You must have some R programming experience and a basic understanding of the purpose of RNA-Seq analysis to benefit from this course. Feel free to contact the instructor (see below) if you have questions about these requirements. 23 | 24 | Software 25 | 26 | Please have the latest versions of R and R Bioconductor installed if you plan to run the R notebooks locally. RStudio (the free version) is highly recommended as well, since we will be teaching in this environment. 
27 | 28 | We package R installations in .Rmd format - you'll need to have [R](https://www.r-project.org), [R Bioconductor](https://bioconductor.org/install/), and [RStudio](https://www.rstudio.com/products/rstudio/download/) installed to proceed. 29 | 30 | ### Official UCSF DSI Course Materials 31 | 32 | Team-reviewed, tested, and ready-to-use materials will always be hosted on our official UCSF Collaborative Learning Environment (CLE) [RNA-seq webpage](https://courses.ucsf.edu/course/view.php?id=6137). 33 | 34 | _This GitHub repository is for development purposes and therefore may stray from the materials used in class._ CLE will remain as it was at the time of our most recently offered workshop and will only be updated if major changes are introduced to the course. 35 | 36 | Branch "master" should function as intended if cloned, but for verified materials please download all documents and data from CLE unless otherwise instructed. 37 | 38 | # Workshop Setup Instructions: 39 | 40 | #### *Please do not wait until the day of the workshop to do these installations*! We cannot spend class time working through installation issues. 41 | 42 | Installation instructions for this course are on our CLE RNA-seq webpage. If you have trouble with installations up until the day before the workshop, please email the instructor: 43 | 44 | - Karla Lindquist: [karla.lindquist@ucsf.edu](mailto:karla.lindquist@ucsf.edu) 45 | 46 | 47 | # GitHub and UCSF data security protocols: 48 | 49 | Please be aware that **GitHub is not certified for use with personal health information (PHI).** Do not store or share any sensitive information via GitHub, even if kept in private repositories. 
50 | 51 | For more information regarding UCSF's security guidelines, [please see the UCSF IRB guidelines for electronic data security.](https://irb.ucsf.edu/electronic-data-security) 52 | -------------------------------------------------------------------------------- /bulkRNAseq.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /data/GSE63310/GSE63310_RAW.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/data/GSE63310/GSE63310_RAW.tar -------------------------------------------------------------------------------- /data/GSM1545537_mo906111-1_m09611-2.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/data/GSM1545537_mo906111-1_m09611-2.txt.gz -------------------------------------------------------------------------------- /data/GSM1545543_JMS9-CDBG.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/data/GSM1545543_JMS9-CDBG.txt.gz -------------------------------------------------------------------------------- /notebooks/10_enrichment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Perform enrichment and pathway analysis" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | #### *Objective 9. 
Perform enrichment and pathway analysis* 8 | 9 | Packages used in this notebook: `gage`, `pathview`, `ReactomePA`. Make sure these packages are installed and loaded. 10 | 11 | ------------------------------------------------------------------------ 12 | 13 | The `limma` package can test for enrichment with [Gene Ontology](http://geneontology.org/) (GO) terms with the `goana()` function. It can also test for [Kyoto Encyclopedia of Genes and Genomes](https://www.genome.jp/kegg/) (KEGG) pathways with the `kegga()` function. But there are far better options these days. We will only scratch the surface of this complex topic in this workshop. See the [Gladstone Institute](https://github.com/gladstone-institutes/Bioinformatics-Workshops/wiki) for current enrichment and pathway analysis course offerings (there are several). 14 | 15 | For a recent overview of gene set analysis, see this paper "[Fifteen Years of Gene Set Analysis for High-Throughput Genomic Data: A Review of Statistical Approaches and Future Challenges](https://pubmed.ncbi.nlm.nih.gov/33286201/)" by Das, McClain, & Rai (2020). 16 | 17 | ------------------------------------------------------------------------ 18 | 19 | ##### Test for KEGG pathway enrichment using Generally Applicable Gene-set Enrichment (GAGE) 20 | 21 | First we will focus on using the `gage` package. See the vignettes for `gage` (run `vignette(package="gage")`) and also this R-bloggers [tutorial](https://www.r-bloggers.com/tutorial-rna-seq-differential-expression-pathway-analysis-with-sailfish-deseq2-gage-and-pathview/) for a full pipeline (note they use DESeq2 to start with, so this part is independent of how you got to your list of differentially expressed genes). 22 | 23 | The first input to the `gage()` function is a named vector of log2(Fold-change) coefficients. Let's start by doing a gene set analysis for LP vs. ML. To get these you can use the LPvsML coefficients from the `efit` coefficient matrix. 
24 | 25 | ```{r} 26 | ?gage 27 | head(efit$coefficients) 28 | 29 | fold_lpml <- efit$coefficients[, "LPvsML"] 30 | head(fold_lpml) 31 | ``` 32 | 33 | Let's test for KEGG pathway enrichment in the LP vs. ML samples. 34 | 35 | After getting the fold changes (coefficients) as above, the next thing we need to do is to get the latest mouse genome KEGG pathways to test. The `kegg.gsets` function comes from the `gage` package. 36 | 37 | ```{r} 38 | ?kegg.gsets 39 | 40 | kegg_mouse <- kegg.gsets(species="mmu", id.type="kegg") 41 | names(kegg_mouse) 42 | 43 | length(kegg_mouse$kg.sets) ## number of mouse pathways in kegg 44 | head(kegg_mouse$kg.sets, n=3) ## look at first 3 pathways listed 45 | ``` 46 | 47 | The number next to the name of each pathway is its KEGG accession number. The genes that are members of each pathway are given in ENTREZIDs. 48 | 49 | The functions below mostly use ENTREZIDs. Remember we have those for our corresponding gene names stored in an object called `gene_ids` (created in Objective 2). 50 | 51 | ```{r} 52 | head(gene_ids) 53 | ``` 54 | 55 | We can match the ENTREZIDs back into our `fold_lpml` object comparing fold changes for LP vs. ML samples. Using `match()` keeps every ID aligned with its own fold change, and any name without a matching symbol is left as it is. 56 | 57 | ```{r} 58 | head(fold_lpml) 59 | fold_ids <- gene_ids[match(names(fold_lpml), gene_ids$SYMBOL), ] ## one row per gene, in fold_lpml order (NA row when a name has no matching SYMBOL) 60 | names(fold_lpml) <- ifelse(is.na(fold_ids$ENTREZID), names(fold_lpml), fold_ids$ENTREZID) ## keep the original name when there is no match 61 | head(fold_lpml) 62 | ``` 63 | 64 | Now let's test whether any of these pathways are enriched with our differentially expressed genes in the LP vs. ML samples. In other words, are there more than expected by chance? Using the `same.dir` argument gives us separate lists of pathways that are enriched with up-regulated and down-regulated genes in LP vs. ML. 65 | 66 | ```{r} 67 | ?gage 68 | 69 | kegg_all <- gage(exprs=fold_lpml, gsets=kegg_mouse$kg.sets, same.dir=TRUE) 70 | ``` 71 | 72 | Notice that `kegg_all` is a list. We have up-regulated genes in LP vs. ML, ("greater"), down-regulated genes ("less"). 
The `q.val` column is the one we care most about because this is the p-value corrected for multiple comparisons (number of gene sets tested). 73 | 74 | ```{r} 75 | head(kegg_all$greater) 76 | head(kegg_all$less) 77 | ``` 78 | 79 | The `stat.mean` indicates the direction and magnitude of expression in the genes that are in the pathway. Again the `q.val` column is the adjusted significance level (false discovery rate - you can use 0.1 for example), and `set.size` represents the number of genes in the pathway. The data is sorted from most to least significant. If you want to look up more information about any pathway, you can search KEGG using the accession number, e.g. see [mmu04974](https://www.genome.jp/dbget-bin/www_bget?mmu04974). 80 | 81 | ------------------------------------------------------------------------ 82 | 83 | ##### Test for KEGG metabolism pathway enrichment 84 | 85 | You can also test the subset of KEGG signaling, metabolism, and disease pathways separately. The full set that we created above with the `kegg.gsets()` function (we named this `kegg_mouse`) can be subset for any one of these using indices. For example let's say we want to test if any metabolic pathways are perturbed - we'll start by subsetting these. 86 | 87 | ```{r} 88 | names(kegg_mouse) 89 | head(kegg_mouse$met.idx) ## these are indices of the metabolic pathways 90 | kegg_mouse$kg.sets[kegg_mouse$met.idx[1]] ## this is the first metabolic pathway (first entry of met.idx) 91 | ``` 92 | 93 | ```{r} 94 | kegg_met <- kegg_mouse$kg.sets[kegg_mouse$met.idx] ## all metabolic pathways 95 | ``` 96 | 97 | Make sure the number of pathways matches the number of metabolic indices. 98 | 99 | ```{r} 100 | length(kegg_mouse$met.idx) 101 | length(kegg_met) 102 | ``` 103 | 104 | Now we can use the kegg_met subset on the LP vs. ML fold changes to find metabolic pathways enriched with up- and down-regulated genes. Say we are interested only in the genes that are down-regulated in LP vs. ML.
105 | 106 | ```{r} 107 | kegg_met_res <- gage(exprs=fold_lpml, gsets=kegg_met, same.dir=TRUE) ## results go in a new object so the kegg_met gene sets are not overwritten and this chunk can be re-run 108 | names(kegg_met_res) 109 | head(kegg_met_res$less, n=3) 110 | ``` 111 | 112 | ------------------------------------------------------------------------ 113 | 114 | ##### Test for Reactome pathway enrichment using ReactomePA 115 | 116 | We can also use the [ReactomePA](https://pubmed.ncbi.nlm.nih.gov/26661513/) package (Yu & He, 2016) to do pathway analysis. This package uses a hypergeometric model to determine whether your list of genes is significantly associated with any Reactome pathways. Make sure this package is loaded. You may also want to look through the vignettes for this package using the `browseVignettes` function. 117 | 118 | We will start by using the `enrichPathway()` function which simply takes a vector of ENTREZIDs as input. We can use the names of the `fold_lpml` object since we changed them to ENTREZIDs above. We need to make sure to tell it that we are looking at mouse genome pathways. We will a-priori set a q-value cutoff of 0.01 to determine significance of the enrichment. This might take a while. 119 | 120 | ```{r} 121 | ?enrichPathway 122 | 123 | head(names(fold_lpml)) 124 | 125 | react_pwy <- enrichPathway(gene=names(fold_lpml), qvalueCutoff=0.01, readable=TRUE, organism = "mouse") ## NOTE(review): names(fold_lpml) holds every gene in fold_lpml, not a filtered subset of differentially expressed genes - confirm this is the intended `gene` input 126 | 127 | react_pwy_df <- as.data.frame(react_pwy) 128 | dim(react_pwy_df) 129 | react_pwy_df 130 | ``` 131 | 132 | So we have a lot of pathways that are significantly enriched. Again we can look them up by accession numbers ("ID") in the Reactome database, e.g. R-MMU-1500931. You may want to reduce this list ... again I recommend taking one of the Gladstone workshops to learn more about strategies for this. 133 | 134 | ------------------------------------------------------------------------ 135 | 136 | ##### Visualize Reactome pathway enrichment results 137 | 138 | There are lots of ways to visualize the results ... here are a few simple plots (saved as .png).
139 | 140 | ```{r} 141 | # png(paste0(projdir, "/results/barplot_react.png"), width=960) 142 | barplot(react_pwy, showCategory = 10) 143 | # dev.off() 144 | 145 | # png(paste0(projdir, "/results/dotplot_react.png"), width=960) 146 | dotplot(react_pwy, showCategory = 10) 147 | # dev.off() 148 | ``` 149 | 150 | ------------------------------------------------------------------------ 151 | 152 | ##### Resource for learning more about enrichment and pathway analysis 153 | 154 | We have covered just a few approaches to doing these types of analyses. There is a great online resource called the [Biomedical Knowledge Mining](https://yulab-smu.top/biomedical-knowledge-mining-book/index.html) book by Wu et al (2021) that I recommend you review on your own which covers other approaches and R Bioconductor packages. It gives a good overview and some ideas for other packages you can try. 155 | -------------------------------------------------------------------------------- /notebooks/1_install_packages.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DNA Variant Analysis with R Bioconductor" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | ------------------------------------------------------------------------ 8 | 9 | #### In this workshop, our objectives are to: 10 | 11 | 1. Download RNA-seq files from GEO 12 | 2. Annotate the samples 13 | 3. Calculate expression as read counts/million (CPM) and log2(CPM) 14 | 4. Filter out genes with low expression and plot 15 | 5. Normalize the filtered expression data and plot 16 | 6. Perform unsupervised clustering of expression data 17 | 7. Perform differential expression analysis 18 | 8. Create heatmaps and volcano plots 19 | 9. 
Perform enrichment and pathway analysis 20 | 21 | ------------------------------------------------------------------------ 22 | 23 | #### Install packages 24 | 25 | Instructions and notes: 26 | 27 | - Please make sure that you have *current* versions of R, Bioconductor, and RStudio installed. 28 | 29 | - [R](https://cran.r-project.org/) 30 | 31 | - [RStudio (free desktop version)](https://www.rstudio.com/products/rstudio/download/) 32 | 33 | - [Bioconductor](https://bioconductor.org/install/)\ 34 | 35 | - I recommend you start a fresh R/RStudio session *before* running the code below. If asked to restart R again during the installation process, it means you already have the package so you can say No. 36 | 37 | - If asked "Update all/some/none?", select No [n] during class (other times select Yes [y]). Older versions of the packages that we will be using will probably work OK. 38 | 39 | - If asked "Do you want to install from sources the package which needs compilation?", select No [n] during class (other times select Yes [y]). Note that source versions are typically a little newer than binary versions. 40 | 41 | - Lazy loading errors can be ignored, as can warnings about the version of R packages were built under. 42 | 43 | - If you get a message saying "Error in install.packages : Updating loaded packages" do not worry about this. 44 | 45 | - We will not be re-installing the packages during class, so it is important that you make sure that you have these installed before class. 46 | 47 | - If you have questions about installation issues, feel free to email [karla.lindquist\@ucsf.edu](mailto:karla.lindquist@ucsf.edu){.email}. 
48 | 49 | ```{r message=FALSE} 50 | ## install Bioconductor 51 | # if (!require("BiocManager", quietly = TRUE)) 52 | # install.packages("BiocManager") 53 | # BiocManager::install(version = "3.15") 54 | 55 | ## list Bioconductor packages 56 | bpkgs <- c("GEOquery","org.Mm.eg.db","GenomeInfoDbData","TxDb.Mmusculus.UCSC.mm10.knownGene","Mus.musculus","BiocStyle","limma","edgeR","RColorBrewer","gplots","gage","clusterProfiler","ReactomePA","GOSemSim","DOSE","enrichplot") 57 | ## note: some may not be necessary to follow along in the workshop (more for demo purposes) 58 | 59 | ## install Bioconductor packages 60 | # BiocManager::install(bpkgs, force = TRUE) 61 | 62 | 63 | ## list CRAN packages 64 | rpkgs <- c("ggplot2","here","cli","R.utils","rmarkdown","knitr","utf8") 65 | ## note: some but not all may need to install utf8 to get all Bioconductor packages to install; rmarkdown and knitr are only needed if you are using R notebooks 66 | 67 | ## install CRAN packages 68 | # install.packages(rpkgs) 69 | ``` 70 | 71 | #### Load all packages 72 | 73 | ```{r message=FALSE} 74 | ## list and load all packages 75 | pkgs <- c(bpkgs, rpkgs) 76 | for(package in pkgs) { 77 | do.call("library", list(package)) 78 | } 79 | ``` 80 | 81 | ------------------------------------------------------------------------ 82 | 83 | #### Acknowledgments 84 | 85 | To find package citations, you can use the `citation()` function with package name as the argument. 86 | 87 | ```{r} 88 | citation("enrichplot") 89 | ``` 90 | -------------------------------------------------------------------------------- /notebooks/2_downloadRNAseq.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Download RNA-seq files from GEO" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | #### *Objective 1. Download RNA-seq files from GEO* 8 | 9 | Packages used in this notebook: `GEOquery`, `edgeR`. Make sure these packages are installed and loaded. 
10 | 11 | ------------------------------------------------------------------------ 12 | 13 | ##### Read publicly available data from the Gene Expression Omnibus (GEO) repository 14 | 15 | We will be using data from a study titled: "**Transcriptome profiling of purified mouse mammary stem, progenitor and mature cell populations**" (GEO accession [GSE63310](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63310)). Take some time to review the summary page. Samples were sequenced on the Illumina HiSeq 2000 platform. 16 | 17 | First, make sure that your working directory is the main project directory which should contain a data directory. Whether you use files downloaded with R (as shown below) or if you manually downloaded the files, we will assume that you are starting from the main project directory. We will assign the project directory to an object called `projdir` so that we don't have to type the whole path every time. 18 | 19 | ```{r} 20 | getwd() ## confirm that your working directory is the "notebooks" folder 21 | projdir <- sub("notebooks/?$", "", getwd()) ## this removes only the trailing "notebooks" part of the directory string (anchored with $ so an earlier "notebooks" in the path is left alone) 22 | projdir ## confirm this is your main project directory 23 | ``` 24 | 25 | Retrieve the raw RNA-seq files using the GEO accession number for our dataset (GSE63310). These are usually provided in plain text files. This function `getGEOSuppFiles()` will detect the format. We will call this object `gse` which stands for GEO Series (GSE), the acronym used by the NCBI GEO repository. 26 | 27 | ```{r} 28 | ?getGEOSuppFiles 29 | 30 | gse <- getGEOSuppFiles(GEO = "GSE63310", baseDir=paste0(projdir, "data")) 31 | ``` 32 | 33 | Take a look at the data folder in your project directory now. Unless you had files in there already, the only thing you should see is another directory within it called GSE63310. Within that, you should see a .tar file, which we will now unpack into our main data directory.
You can create the path and filename by using the `paste0` function to string parts of the directory together. 34 | 35 | ```{r} 36 | ## check that path is correct for the untar() function to find the .tar file 37 | paste0(projdir, "data", "/GSE63310/GSE63310_RAW.tar") 38 | ``` 39 | 40 | Use the code above as the argument to the `untar()` function. If this is successful there should be no output in the console/notebook. 41 | 42 | ```{r} 43 | untar(tarfile=paste0(projdir, "data", "/GSE63310/GSE63310_RAW.tar"), 44 | exdir=paste0(projdir, "data")) 45 | ``` 46 | 47 | Move up to the data directory. You should see all the samples in .txt.gz files now. We will be analyzing 9 of the 11 samples (each sample is in a separate file). You can find out more from the GEO website about this study, we will be selecting the 9 primary cell types studied: *basal* cells, *luminal progenitor (LP)* cells, and *mature luminal-enriched (ML)* cells. Each cell type has 3 samples. 48 | 49 | Since there are just 9 samples, I will type these filenames out and assign them to a character vector called `files`. Then I will store the corresponding cell types in another character vector called `celltypes`. We will use this in the annotation notebook. 50 | 51 | ```{r} 52 | files <- c("GSM1545535_10_6_5_11.txt", ## LP 53 | "GSM1545536_9_6_5_11.txt", ## ML 54 | "GSM1545538_purep53.txt", ## basal 55 | "GSM1545539_JMS8-2.txt", ## basal 56 | "GSM1545540_JMS8-3.txt", ## ML 57 | "GSM1545541_JMS8-4.txt", ## LP 58 | "GSM1545542_JMS8-5.txt", ## basal 59 | "GSM1545544_JMS9-P7c.txt", ## ML 60 | "GSM1545545_JMS9-P8c.txt") ## LP 61 | celltypes <- c("LP", "ML", "Basal", "Basal", "ML", "LP", "Basal", "ML", "LP") 62 | ``` 63 | 64 | ------------------------------------------------------------------------ 65 | 66 | ##### Unpack and decompress files 67 | 68 | We are now going to unzip the files one by one so we can look at them. 
Since there are only 9 we can use a for loop (if you have a lot more files or very large files, then you may want to use some more efficient method). 69 | 70 | ```{r} 71 | for(f in paste(files, ".gz", sep="")) { 72 | gunzip(paste0(projdir, "data/", f), overwrite=TRUE) 73 | } 74 | ``` 75 | 76 | We could open any of these in a text editor since they are small, but let's read the *first* file into a data frame so we can take a look at it in R. We can access the first file by using the character vector's first index with `files[1]`. 77 | 78 | ```{r} 79 | files[1] 80 | f1 <- read.delim(file=paste0(projdir, "data/", files[1])) 81 | dim(f1) 82 | head(f1) 83 | tail(f1) 84 | ``` 85 | 86 | As we can see, this is a data frame with three columns: the gene ID (ENTREZID), length of the gene, and read counts from the run. 87 | 88 | ------------------------------------------------------------------------ 89 | 90 | ##### Create a digital gene expression (DGE) object 91 | 92 | Now we will create a `DGEList` object which will contain data from all 9 files. This requires us to indicate which columns represent the gene names and raw read counts. This assumes there is only one gene per row (no duplicates). Take a look at the `readDGE()` help file. This function is from the `edgeR` package. Note that you can specify which columns to read (we just want the genes and counts), which is important because in the filter and normalization steps to follow, it will just be working on the counts. You can also apply sample group names (the latter are stored in the `celltypes` object that we created above). 93 | 94 | ```{r} 95 | ?readDGE 96 | 97 | dge <- readDGE(files = files, path=paste0(projdir, "data"), 98 | columns = c(1,3), ## column 1 has the genes, column 3 has the counts 99 | group = celltypes) 100 | ``` 101 | 102 | If all goes well you will not see any output in the console. You can inspect the components of the `dge` object by expanding/opening it from the Environment window. 
Or you can just type `dge` to see the first few rows and to confirm that the files are associated with the right sample groups. 103 | 104 | ```{r} 105 | dge 106 | ``` 107 | -------------------------------------------------------------------------------- /notebooks/3_annotate.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Annotate the samples" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | #### *Objective 2. Annotate the samples* 8 | 9 | Packages used in this notebook: `edgeR`, `org.Mm.eg.db`. Make sure these packages are installed and loaded. 10 | 11 | ------------------------------------------------------------------------ 12 | 13 | As seen in the previous notebook, the `readDGE()` function from the `edgeR` package conveniently reads all of the files at once. Let's take another look at this object that we named `dge`. 14 | 15 | ```{r} 16 | dge 17 | ``` 18 | 19 | You can also look at the first few rows of the raw counts which is in a matrix, so you can access the values using the `$` operator. 20 | 21 | ```{r} 22 | names(dge) 23 | class(dge$counts) 24 | head(dge$counts) 25 | ``` 26 | 27 | ------------------------------------------------------------------------ 28 | 29 | ##### Label the gene names 30 | 31 | The counts part of the DGE object contains gene names as ENTREZIDs in rows and sample or file names in columns. You may want to convert these ENTREZIDs to gene symbols instead, which are easier to read. You can use the `rownames` function on the `dge` object to create a vector of ENTREZIDs that we can then translate to symbols. 32 | 33 | ```{r} 34 | dge_ids <- rownames(dge) 35 | 36 | head(dge_ids) ## look at first 6 gene names 37 | 38 | length(dge_ids) ## find out how many genes are represented 39 | ``` 40 | 41 | To annotate the data with gene SYMBOLS instead of ENTREZIDs, we need to use the mouse genome database `org.Mm.eg.db` since our samples come from mice. 
42 | 43 | Let\'s use the `select()` function from the `AnnotationDbi` package (comes with Bioconductor). Note that there are multiple packages that support a function with the same name. We want to look at the one from the package, so we will call up the help file from that one with `?AnnotationDbi::select`. Take some time to understand this function - it is used a lot with Bioconductor packages. 44 | 45 | Note the use of `keys = dge_ids` here (we wouldn't want to type them all out since this is a long vector). We will create a new vector but using SYMBOLS instead of ENTREZIDs, and we'll call this `gene_ids`. 46 | 47 | ```{r} 48 | ?AnnotationDbi::select 49 | gene_ids <- AnnotationDbi::select(org.Mm.eg.db, 50 | keys = dge_ids, 51 | keytype = "ENTREZID", 52 | columns = "SYMBOL") 53 | head(gene_ids) 54 | ``` 55 | 56 | Note: `select()` returned 1:1 mapping between keys and columns is what we are expecting here - there should be a 1:1 mapping between ENTREZIDs and gene SYMBOLs, so this is good. If you get 1:many or many:1 then you want to make sure you understand why ... e.g. this can happen if you are mapping genes names to multiple transcripts. 57 | 58 | ------------------------------------------------------------------------ 59 | 60 | Now, we can add these identifiers to our `dge` count matrix by re-assigning the row names with SYMBOLs from our `gene_ids` object. 61 | 62 | ```{r} 63 | head(dge$counts) 64 | head(gene_ids) 65 | ``` 66 | 67 | ```{r} 68 | rownames(dge$counts) <- gene_ids$SYMBOL 69 | dge 70 | ``` 71 | -------------------------------------------------------------------------------- /notebooks/4_cpm_n_log2cpm.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Estimate expression levels" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | #### *Objective 3. Calculate expression as read counts/million (CPM) and log2(CPM)* 8 | 9 | Packages used in this notebook: `edgeR`. 
Make sure this package is installed and loaded. 10 | 11 | ------------------------------------------------------------------------ 12 | 13 | ##### Comments on measures used to estimate expression levels from raw read counts 14 | 15 | We will convert our raw counts into counts per million (CPM). We will also be calculating log2 counts/million, log2(CPM). This is related to expression. It isn't necessarily the best measurement to use but it's the simplest one. You want to become familiar with the different ways of measuring expression and choose the best one for your data. 16 | 17 | Other measures to estimate expression from raw counts that you may hear about are RPKM and FPKM. For more discussion about this, see this article by Harold Pimentel called [What the FPKM? A review of RNA-Seq expression units](https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/). Here are a few excerpts from this article to summarize: 18 | 19 | - Counts per million (CPM) mapped reads are counts scaled by the number of fragments you sequenced (![N](https://s0.wp.com/latex.php?latex=N&bg=ffffff&fg=000000&s=0&c=20201002)) times one million. 20 | 21 | - Reads per kilobase of exon per million reads mapped (RPKM) or the more generic FPKM (substitute reads with fragments) are essentially the same thing. Contrary to some misconceptions, FPKM is not 2 \* RPKM if you have paired-end reads. FPKM == RPKM if you have single-end reads, and saying RPKM when you have paired-end reads is just weird, so don't do it :). 22 | 23 | ------------------------------------------------------------------------ 24 | 25 | ##### Calculate counts per million (CPM) and log-transform 26 | 27 | The `cpm()` function from the `edgeR` package calculates CPM values using the `dge` object that we have been working with. It can also calculate log2(CPM) values, which we will do by adding the log argument. 
28 | 29 | ```{r} 30 | ?cpm 31 | cpm <- cpm(dge) 32 | log2_cpm <- cpm(dge, log=TRUE) 33 | ``` 34 | 35 | Take a look at raw counts, raw CPM, and log2(CPM) values. 36 | 37 | ```{r} 38 | head(dge$counts) ## raw read counts 39 | head(cpm) ## counts per million mapped reads 40 | head(log2_cpm) ## log-transformed cpm 41 | ``` 42 | 43 | Note: when `log=TRUE`, the `cpm()` function adds a small pseudocount to every count to avoid taking the log of zero, which is undefined (`-Inf` in R). We'll be filtering these genes out anyway. 44 | 45 | Let's look at the distribution of each with simple histograms. The log transformation just makes the non-0 values more normally distributed which can simplify statistical testing of differences between sample types (more on this later). 46 | 47 | ```{r} 48 | hist(cpm) 49 | hist(log2_cpm) 50 | ``` 51 | -------------------------------------------------------------------------------- /notebooks/5_filter_n_plot.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Filter and plot expression" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | #### *Objective 4. Filter out genes with low expression and plot* 8 | 9 | Packages used in this notebook: `edgeR`, `RColorBrewer`. Make sure these packages are installed and loaded. 10 | 11 | ------------------------------------------------------------------------ 12 | 13 | The next step is to filter out genes with low intensity (this is a crude way of filtering - it's not a recommendation, just showing how this could be done). Say we want to select genes where at least 3 of the 9 samples have some expression (defined here as CPM\>1). We will call this new object `dge_sub` (a subset of the `dge` object). 14 | 15 | ```{r} 16 | dim(dge) 17 | dge_sub <- dge[rowSums(cpm>1)>=3, ] 18 | dim(dge_sub) 19 | ``` 20 | 21 | If this code seems mysterious, break it down piece by piece.
22 | 23 | ```{r} 24 | head(cpm) 25 | head(rowSums(cpm>1)) ## gets total number of samples with expression for each gene 26 | head(rowSums(cpm>1)>=3) ## indicates if each of the totals above are >= 3 27 | ``` 28 | 29 | ------------------------------------------------------------------------ 30 | 31 | ##### **Discussion** 32 | 33 | What are some ways to decide on the filtering criteria? How do you do it? What are the strengths and weaknesses of these options: 34 | 35 | - Intensity-based (e.g. decide a-priori: if genes are not expressed in x% of samples, remove). 36 | - Statistically-based (e.g. calculate coefficient of variation CV or adjusted p-values for each gene across samples, decide a-priori to remove genes outside of a certain range). 37 | - Biologically-based (e.g. look at genes of interest). 38 | 39 | ------------------------------------------------------------------------ 40 | 41 | ##### Compare filtered to unfiltered data 42 | 43 | Now compare the size of the original raw count data to the filtered data. 44 | 45 | ```{r} 46 | 100 * nrow(dge_sub) / nrow(dge) 47 | ``` 48 | 49 | So we're only going to be using \~52% of the data, but when we look at the distribution of the raw and unfiltered data, we see that this helps. 50 | 51 | Now let's plot the raw and filtered data distributions (density) side-by-side. Why? We want to check two things: a) to see if the filtered data is normally distributed and b) to check that the samples have similar distributions. This doesn't tell us which genes have differential expression between the samples but it has implications for the statistical testing to determine this. More on this later. 52 | 53 | First, let's assign different colors to each of the 9 samples so we can overlay them and be able to distinguish them from each other more easily. We can use the `RColorBrewer` package to assign the colors by sample type. 
54 | 55 | ```{r} 56 | ?RColorBrewer 57 | 58 | nsamples <- ncol(dge_sub) ## 9 samples 59 | col <- brewer.pal(n = nsamples, name = "Paired") ## the "Paired" palette gives up to 12 distinct colors 60 | col 61 | ``` 62 | 63 | Now we can plot the log2(CPM) distributions for both raw and filtered data. We created `log2_cpm` in Objective 3 but we'll create it here again and name it `log2cpm_raw` to distinguish it from the filtered data, which we will call `log2cpm_sub`. We are just using that `cpm` function again. 64 | 65 | ```{r} 66 | log2cpm_raw <- cpm(dge, log=TRUE) ## raw unfiltered expression (log2 CPM) 67 | log2cpm_sub <- cpm(dge_sub, log=TRUE) ## filtered 68 | ``` 69 | 70 | The `density()` function from the `stats` package will give us the data to plot the raw and filtered data. We will start with just the first sample (column 1) so we can compare the means and median etc. for that sample. 71 | 72 | ```{r} 73 | density(log2cpm_raw[,1]) 74 | density(log2cpm_sub[,1]) 75 | ``` 76 | 77 | Now, we will set up a plot with 2 side-by-side density plots. We'll put the raw data on the left plot and the filtered data on the right. This is done with the `par()` function (see `?par` for general plot parameters). 78 | 79 | We'll start by making the first plot of just the first sample, then we will combine the samples using a for loop and overlay these as lines on the same plot. 80 | 81 | ```{r} 82 | par(mfrow=c(1,2)) 83 | plot(density(log2cpm_raw[,1]), 84 | col=col[1], 85 | lwd=2, 86 | ylim=c(0,0.21), 87 | las=2, 88 | main="", 89 | xlab="") 90 | title(main="A. Raw data", xlab="Log2(CPM)") 91 | abline(v=0, lty=3) 92 | 93 | for (i in 2:nsamples){ 94 | den <- density(log2cpm_raw[,i]) 95 | lines(den$x, den$y, col=col[i], lwd=2) 96 | } 97 | ``` 98 | 99 | Next we can make the second plot of filtered data on the right like we did for the raw data. We'll put all of the code together for the final plot.
100 | 101 | ```{r} 102 | par(mfrow=c(1,2)) 103 | plot(density(log2cpm_raw[,1]), 104 | col=col[1], 105 | lwd=2, 106 | ylim=c(0,0.21), 107 | las=2, 108 | main="", 109 | xlab="") 110 | title(main="A. Raw data", xlab="Log2(CPM)") 111 | abline(v=0, lty=3) 112 | 113 | for (i in 2:nsamples){ 114 | den <- density(log2cpm_raw[,i]) 115 | lines(den$x, den$y, col=col[i], lwd=2) 116 | } 117 | 118 | plot(density(log2cpm_sub[,1]), 119 | col=col[1], 120 | lwd=2, 121 | ylim=c(0,0.21), 122 | las=2, 123 | main="", 124 | xlab="") 125 | title(main="B. Filtered data", xlab="Log2(CPM)") 126 | abline(v=0, lty=3) 127 | 128 | for (i in 2:nsamples){ 129 | den <- density(log2cpm_sub[,i]) 130 | lines(den$x, den$y, col=col[i], lwd=2) 131 | } 132 | ``` 133 | 134 | **Question:** why is there that peak on the left side of the raw data? Recall what I said about the `cpm` function in the previous notebook about what happens to zero counts. s 135 | -------------------------------------------------------------------------------- /notebooks/6_norm_n_plot.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Normalize and plot expression" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | #### *Objective 5. Normalize the filtered expression data and plot* 8 | 9 | Packages used in this notebook: `edgeR`, `RColorBrewer`. Make sure these packages are installed and loaded. 10 | 11 | ------------------------------------------------------------------------ 12 | 13 | Now we should normalize the filtered expression data. This is achieved by dividing each expression value by the total or sum of all expression values on the array. Simply put, this just gets everything on a similar scale. We'll demonstrate this below. Just like with estimating and filtering expression, there are a number of different methods for normalization. 14 | 15 | There are also other R packages to do this and to be able to compare normalization methods, e.g. 
the [NormExpression](https://www.frontiersin.org/articles/10.3389/fgene.2019.00400/full) package (Wu et al. 2019). A quote from the Wu et al. paper describing the package: 16 | 17 | *"NormExpression provides a framework and a fast and simple way for researchers to select the best method for the normalization of their gene expression data based on the evaluation of different methods (particularly some data-driven methods or their own methods) in the principle of the consistency of metrics and the consistency of datasets."* 18 | 19 | For now, we'll use the `calcNormFactors()` function from the `edgeR` package since we are using a lot of other functions from `edgeR`, but I encourage you to try multiple methods using the `NormExpression` package. 20 | 21 | Our normalization factors will go in the `norm.factors` column of the samples part of our filtered DGEList object `dge_sub`, which are all equal to 1 by default: 22 | 23 | ```{r} 24 | dge_sub$samples 25 | ``` 26 | 27 | Before we normalize, let's create boxplots of the un-normalized but filtered log2(CPM) values in `log2cpm_sub` for each sample. 28 | 29 | ```{r} 30 | boxplot(log2cpm_sub, 31 | las=2, 32 | col=col, 33 | main="") 34 | title(main="Un-normalized data",ylab="Log2(CPM)") 35 | ``` 36 | 37 | So this isn't too surprising because in the previous step we saw that the samples all have a similar, normal-looking distribution. So, to illustrate the impact of normalizing when some samples look very different from the rest, we first need to distort the data. 38 | 39 | To illustrate this point, I will artificially change the scale of the first 2 columns of the raw filtered counts by reducing the counts to 5% of their original values for the first sample, and inflating them to 5 times their original values for the second. I will then re-calculate the log2(CPM) values after messing them up (we'll call this `log2cpm_mess`). This will better allow us to see the effects of the normalization in less well-behaved samples.
40 | 41 | ```{r} 42 | dge_sub$counts[,1] <- dge_sub$counts[,1]*0.05 ## scale counts for sample 1 down to 5% of their original values 43 | dge_sub$counts[,2] <- dge_sub$counts[,2]*5 ## scale counts for sample 2 up to 5 times their original values 44 | 45 | log2cpm_mess <- cpm(dge_sub, log=TRUE) ## re-calculate the log2cpm (NOTE: dge_sub itself now holds the distorted counts, so every later step that starts from dge_sub uses them) 46 | ``` 47 | 48 | OK now let's see that plot again after artificially messing up a couple of samples. 49 | 50 | ```{r} 51 | boxplot(log2cpm_mess, 52 | las=2, 53 | col=col, 54 | main="") 55 | title(main="Un-normalized data",ylab="Log2(CPM)") 56 | ``` 57 | 58 | ------------------------------------------------------------------------ 59 | 60 | ##### Normalize using the trimmed mean of M-values (TMM) method 61 | 62 | As mentioned above there are different methods and you can try more than one, but we will use the TMM method for now. Once you know how to apply one method it is not very hard to try others. The TMM method accounts for different library sizes between the samples. Let's apply it and then look at the values in the norm.factors column. These are the "adjustment" values if you will. We will create a new object called `dge_sub_norm` (starting with the filtered data). 63 | 64 | ```{r} 65 | ?calcNormFactors 66 | dge_sub_norm <- calcNormFactors(dge_sub, method = "TMM") 67 | dge_sub_norm$samples 68 | ``` 69 | 70 | So now the norm.factors column of our new `dge_sub_norm` object has different values than before, where they were all = 1 in `dge_sub`. This doesn't do anything to our CPM values yet though - we need to re-calculate these after getting these norm.factors. 71 | 72 | ------------------------------------------------------------------------ 73 | 74 | ##### Calculate and plot normalized CPM values 75 | 76 | To re-calculate CPM using the norm.factor values, we apply the `cpm()` function again. Before we had `log2cpm_sub` (filtered un-normalized data) and now we are creating `log2cpm_norm` (filtered normalized data).
77 | 78 | ```{r} 79 | log2cpm_norm <- cpm(dge_sub_norm, log=TRUE) 80 | ``` 81 | 82 | Now let's create boxplots showing the normalized distribution of the log2(CPM) values. 83 | 84 | ```{r} 85 | boxplot(log2cpm_norm, 86 | las=2, 87 | col=col, 88 | main="") 89 | title(main="Normalized data",ylab="Log2(CPM)") 90 | ``` 91 | 92 | If you want to put these side-by-side you can use the `par()` function like we did in the last notebook. 93 | 94 | ```{r} 95 | par(mfrow=c(1,2)) 96 | boxplot(log2cpm_mess, 97 | las=2, 98 | col=col, 99 | main="") 100 | title(main="A. Un-normalized data",ylab="Log2(CPM)") 101 | 102 | boxplot(log2cpm_norm, 103 | las=2, 104 | col=col, 105 | main="") 106 | title(main="B. Normalized data",ylab="Log2(CPM)") 107 | ``` 108 | -------------------------------------------------------------------------------- /notebooks/7_unsupervised_clust.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Perform unsupervised clustering" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | #### *Objective 6. Perform unsupervised clustering of expression data* 8 | 9 | Packages used in this notebook: `limma`, `RColorBrewer`. Make sure these packages are installed and loaded. 10 | 11 | ------------------------------------------------------------------------ 12 | 13 | Now that we have filtered and normalized our expression levels across samples, we will visualize differences between groups of samples in terms of their average expression levels. 14 | 15 | We'll start by assigning different colors to each group of samples (LP, ML, and Basal) in a similar way to what we did in Objective 4 (there we had one color/sample, here we have one color/group). 
16 | 17 | ```{r} 18 | grps <- dge_sub$samples$group ## these are the group assignments to each sample - just creating the grps object as a shorthand 19 | grps 20 | col.grp <- grps ## make a new object to hold assigned colors 21 | levels(col.grp) <- brewer.pal(nlevels(col.grp), "Set1") ## assigning a color to each sample group using nlevels(col.grp) ... see ?brewer.pal and ?nlevels for help 22 | col.grp <- as.character(col.grp) 23 | col.grp 24 | ``` 25 | 26 | ------------------------------------------------------------------------ 27 | 28 | ##### Multidimensional scaling (MDS) and Principal Coordinate (PCoA) Plots 29 | 30 | First we will do an exploratory analysis by performing unsupervised clustering (i.e. determining whether any of the samples have similar average expression levels, without using actual group labels to guide this analysis). This step is optional, and the results are not feeding in to subsequent analyses, but doing these types of analyses can be useful to scan for patterns before diving into the formal differential expression analysis. 31 | 32 | Here, we will use a simple approach here with multidemensional scaling and principal coordinate analysis plots. Similar to principal component analysis plots (PCA), these methods attempt to reduce the number of dimensions when working with many features. This makes it easier to spot patterns. Other dimension reduction techniques that are commonly used in single cell RNA-seq analyses, such as PCA, tSNE, and UMAP. I don't cover the statistics of these methods here but I cover them in a little more in my [scRNA-seq workshop](https://tiny.ucsf.edu/dsiscrnaseq). A good overview of these other types is also provided by this PDF called "[Dimension Reduction: PCA, tSNE, UMAP](https://www.bioinformatics.babraham.ac.uk/training/10XRNASeq/Dimension%20Reduction.pdf)" by Simon Andrews from Babraham Bioinformatics (2020). 
For more general information about the differences between bulk and single cell RNA-seq, see [Recent Advances in Single-Cell Genomics Techniques](https://courses.ucsf.edu/mod/url/view.php?id=884244) by Dmitry Velmeshev (2019) for a good high-level description. 33 | 34 | The `plotMDS()` function from the `limma` package shows us a scatterplot where distances between samples represent typical log2(CPM) between the samples for genes that distinguish the samples (see Details in Help file and in this [reference article](https://pubmed.ncbi.nlm.nih.gov/25605792/) by the package authors, Ritchie et al. (2015)). 35 | 36 | If it appears that there is some clustering of samples, then doing differential expression analysis can follow to formally test these differences. The log2(CPM) values have been filtered and normalized and stored in the object `log2cpm_norm`, and should represent an estimate of **log2(Fold Change)** when comparing expression levels between samples. In producing the plot below we will visualize pairwise comparisons for the 500 genes that have the greatest differences between each pair of samples (these are default settings - see `?plotMDS` for more info and other options). 37 | 38 | Note that you can save this plot (code is commented out but you can use the `png()` function followed by the plot commands and then `dev.off()`). We will also attach sample numbers (1-9) to the group names so we know which sample comes from which file (e.g. the first sample belongs to the LP group so it will get labeled LP-1, etc). 39 | 40 | ```{r} 41 | ?plotMDS 42 | 43 | head(log2cpm_norm) 44 | 45 | grps_num <- paste(grps, seq(1:length(grps)), sep="-") ## paste sample number (i.e. 
file order in the files object) next to the group labels 46 | grps_num 47 | 48 | # png(paste0(projdir, "results/MDSplot.png")) ## uncomment this line to save a .png image file to your results folder 49 | par(mfrow=c(1,1)) 50 | plotMDS(log2cpm_norm, 51 | labels=grps_num, ## label with group and sample # 52 | col=col.grp, 53 | xlab="Log2(Fold-Change)", 54 | ylab="Log2(Fold-Change)") 55 | title(main="Sample groups") 56 | # dev.off() 57 | ``` 58 | 59 | ------------------------------------------------------------------------ 60 | 61 | ##### **Discussion** 62 | 63 | What is this plot telling us? As mentioned above, the Log2(Fold-Change) is estimated from the log2(CPM) values when comparing samples to each other. This is a principal coordinate plot where we are looking at pairwise comparisons for 500 genes that have the largest standard deviation (most variation) between each pair of samples. The percentages on each axis represent the amount of variance explained by each dimension. 64 | 65 | What if you were to use the un-normalized data that was perturbed in Objective 5, where sample #1 (LP) counts were artificially reduced to 5% of their original values and sample #2 (ML) counts were artificially increased 5-fold? Let's see! 66 | 67 | ```{r} 68 | dge_sub 69 | head(log2cpm_mess) 70 | 71 | # png(paste0(projdir, "results/MDSplot-unnormalized.png")) 72 | par(mfrow=c(1,1)) 73 | plotMDS(log2cpm_mess, 74 | labels=grps_num, 75 | col=col.grp, 76 | xlab="Log2(Fold-Change)", 77 | ylab="Log2(Fold-Change)") 78 | title(main="Sample groups") 79 | # dev.off() 80 | ``` 81 | 82 | Well, as expected, the un-normalized samples LP-1 and ML-2 stand apart from their groups because they deviate from the others so much. So now hopefully you can see why normalization is so important. 
83 | -------------------------------------------------------------------------------- /notebooks/8_diff_expr.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Perform differential expression analysis" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | #### *Objective 7. Perform differential expression analysis* 8 | 9 | Packages used in this notebook: `limma`. Make sure this package is installed and loaded. 10 | 11 | ------------------------------------------------------------------------ 12 | 13 | Here we will be formally testing whether genes are differentially expressed between the sample groups (LP vs. ML vs. Basal). Someone once pointed out that this notebook could be an entire workshop on its own. So be aware that this is a meaty one and that it may take some independent background study and practice if this is your first exposure to these methods. 14 | 15 | ##### Create design and contrast matrices 16 | 17 | First we need to do a little set-up which involves creating design and contrast matrices. The purpose of these should become a little more clear once we make them. 18 | 19 | Let's make a **design matrix** (also called a model matrix) with groups to be compared. This makes dummy variables (0/1) for each sample indicating which group they belong to. We can do this with the `model.matrix()` function from the `stats` package. 20 | 21 | ```{r} 22 | ?model.matrix 23 | grps 24 | design <- model.matrix(~0 + grps) 25 | design 26 | ``` 27 | 28 | Now we'll re-label the columns to clean it up (get rid of "grps"). 29 | 30 | ```{r} 31 | colnames(design) <- gsub("grps", "", colnames(design)) 32 | design 33 | ``` 34 | 35 | We also need a **contrast matrix** to indicate which groups to compare to each other. These are coded with the reference group having a value of 1, and the comparison group having a value of -1 for each contrast. 
It doesn't really matter which gets which code, but you need to keep track because that will tell you which group has higher or lower expression relative to the comparison group. 36 | 37 | We will do all pairwise contrasts, or paired sample comparisons. The `makeContrasts()` function is from the `limma` package. We'll have to remember what the labels refer to, i.e. "BasalvsLP" is taking expression levels in Basal samples minus expression levels in the LP samples, etc. So for this contrast, Basal cells are coded 1 (reference group) and LP samples are coded -1 (comparison group). 38 | 39 | ```{r} 40 | ?makeContrasts 41 | 42 | contrast <- makeContrasts( 43 | BasalvsLP = Basal - LP, 44 | BasalvsML = Basal - ML, 45 | LPvsML = LP - ML, 46 | levels = colnames(design)) 47 | contrast 48 | ``` 49 | 50 | OK now we are ready to put these into action. 51 | 52 | ------------------------------------------------------------------------ 53 | 54 | ##### Combine normalized expression levels with the design matrix 55 | 56 | We can use the `voom()` function from the `limma` to combine log2(CPM) values with the design matrix and to prepare it for modeling at the gene level. Note that the default normalization method for this function is "none". We will keep this default because we have already filtered and normalized the values, and remember we have stored these in the `dge_sub_norm` object. What this function does is It calculates weights for each gene. 57 | 58 | What are these weights and why are we using them? See [this article](https://pubmed.ncbi.nlm.nih.gov/25925576/) by Liu et al. (2015) to understand these better. Voom is a statistical modelling of the heterogeneity at both the sample and observational levels. Sample variance factors are converted to weights and combined with observational level weights obtained from the mean-variance relationship 59 | of the log2(CPM) values. It is supposed to "powerful analysis and fewer false discoveries". 
It takes as input the filtered, normalized counts and the design matrix. 60 | 61 | ```{r} 62 | ?voom 63 | 64 | dge_voom <- voom(counts=dge_sub_norm, design = design) 65 | dge_voom 66 | ``` 67 | 68 | ------------------------------------------------------------------------ 69 | 70 | ##### Fit a linear model to estimate expression in each sample group 71 | 72 | Now we can put this into a linear model for each gene using `lmFit()` from `limma`. Take a moment to read the details section of the help file so that you understand what this is doing. In a nutshell, it fits a weighted linear model to the log2(CPM) values of each gene. Because our design matrix has no intercept (`~0 + grps`), each coefficient is the estimated average log2(CPM) for one sample group, rather than a comparison of that group against the others - the comparisons between groups come from the contrasts in the next step. The log2(CPM) values and weights from `voom` and the design matrix are used as inputs. We will store our results in an object called `dge_fit`, which will be used below to compute the pairwise contrasts. 73 | 74 | ```{r} 75 | ?lmFit 76 | dge_fit <- lmFit(object = dge_voom, design = design) 77 | dge_fit 78 | ``` 79 | 80 | So let's first look at this part of the output from the coefficients: 81 | 82 | Basal LP ML 83 | Xkr4 2.813459 -3.825652 -4.285171 84 | 85 | This means that for gene Xkr4, the estimated average expression is about 2.8 log2(CPM) in Basal cells, versus about -3.8 in LP cells and -4.3 in ML cells, i.e. it is much more highly expressed in Basal cells than in the other two groups. You can actually calculate this manually if you want an extra challenge:) 86 | 87 | ------------------------------------------------------------------------ 88 | 89 | ##### Compare expression in each pair of sample groups 90 | 91 | Next we will compare the groups to each other individually in a pairwise way. So this requires the results of the models run above (in `dge_fit`) and the contrast matrix. 92 | 93 | ```{r} 94 | ?contrasts.fit 95 | grp_fit <- contrasts.fit(dge_fit, contrasts=contrast) 96 | grp_fit 97 | ``` 98 | 99 | Then let's look at a few coefficients from both sets of results above. 
100 | 101 | ```{r} 102 | head(dge_fit$coefficients) 103 | head(grp_fit$coefficients) 104 | ``` 105 | 106 | Notice that our `grp_fit` coefficients are equal to the differences between the coefficients from the `dge_fit` model above. For the Xkr4 gene, the Basal vs. LP coefficient is about 6.6 and the Basal vs. ML coefficient is about 7.1. These are log2 fold-changes, not fold-changes: a log2 fold-change of 6.6 corresponds to roughly 100-fold (`2^6.6`) higher expression in Basal cells than in LP cells, and 7.1 corresponds to roughly 137-fold (`2^7.1`) higher expression than in ML cells. 107 | 108 | ------------------------------------------------------------------------ 109 | 110 | ##### Inspect residuals 111 | 112 | Just like with any regression modeling, you want to check the assumptions of the model. One assumption is that the variance of the residuals (difference between the model's predicted values and the actual values) is independent of the actual values. We can use the `plotSA` (Sigma vs A) function from the `limma` package. Review the help file for more information. 113 | 114 | ```{r} 115 | ?plotSA 116 | 117 | plotSA(grp_fit, main="Mean−variance trend") 118 | ``` 119 | 120 | This looks pretty good. Why? We don't want to see any obvious patterns of association between variance (y-axis) and the average expression levels. This is a key assumption of many linear models, not just in this context. If there is a pattern, e.g. if the variance increases as expression increases or vice versa, that is an indication that the model does not fit very well. If this happens, you may want to investigate possible causes and determine if it might reflect some bias in your measurements, and then either correct it or take a different modeling approach (i.e. not using a linear model). 121 | 122 | Would you exclude/filter any genes based on this plot? Hint: some genes have high variance. You might be most interested in these, or not. This depends on what you want to learn and your context. 
123 | 124 | ------------------------------------------------------------------------ 125 | 126 | ##### Apply eBayes if necessary 127 | 128 | What if you have a strong correlation between residuals and expression levels, e.g. where variance (y-axis) increases with expression, and you suspect this is a bias and you want to correct for it? Applying the empirical Bayes method should make the residual variances independent of the expression levels. The `eBayes()` function is also from `limma`. It "moderates" the standard errors in some cases. 129 | 130 | ```{r} 131 | ?eBayes 132 | efit <- eBayes(fit = grp_fit, trend = TRUE) 133 | # efit ## if you want to look at all the output 134 | ``` 135 | 136 | We can plot the residuals again with a trend line, which in an ideal situation would be a flat horizontal (this is pretty close). 137 | 138 | ```{r} 139 | plotSA(efit, main="Mean−variance trend post-eBayes") 140 | ``` 141 | 142 | ------------------------------------------------------------------------ 143 | 144 | ##### Identify significant genes 145 | 146 | Now we can find out how many genes are significantly different between the groups. You can use the result of the eBayes method (`efit`) or the original fit (`grp_fit`). In this case, there isn't much of a difference. The `decideTests` function identifies significantly up- and down-regulated genes in the sample groups after correcting for multiple comparisons. The function uses the Benjamini-Hochberg method by default for the latter, which controls false discovery rates (although you can use other types - see the adjust.method description in the help file. 147 | 148 | ```{r} 149 | ?decideTests 150 | summary(decideTests(efit)) 151 | ``` 152 | 153 | ------------------------------------------------------------------------ 154 | 155 | ##### Identify top differentially expressed genes 156 | 157 | Now let's say we want to find the most significant genes (e.g. the top 10). 
The F statistic reflects the magnitude of the overall group differences, and the adjusted p-value corrects for multiple comparisons. It is possible to look at overall statistics for the three contrasts, but we will take a look at the top 10 genes for each contrast one at a time (i.e. Basal vs. LP, Basal vs. ML, LP vs. ML) by using the coef argument. 158 | 159 | ```{r} 160 | ?topTable 161 | 162 | ## look at top 10 overall by F statistics/p-values 163 | topTable(efit, number=10) 164 | 165 | names(efit) 166 | efit$contrasts 167 | 168 | ## Basal vs. LP (coef=1) 169 | topTable(efit, coef=1, number=10, sort.by="p") 170 | 171 | ## Basal vs. ML (coef=2) 172 | topTable(efit, coef=2, number=10, sort.by="p") 173 | 174 | ## LP vs. ML (coef=3) 175 | topTable(efit, coef=3, number=10, sort.by="p") 176 | ``` 177 | 178 | ------------------------------------------------------------------------ 179 | 180 | ##### Save list of significant genes to a file 181 | 182 | If you would like to save the results from the model above, including coefficients, test statistics and p-values, you can use the `write.fit()` function from the `limma` package. In this function, you need to specify if you want to adjust p-values for multiple comparisons (you should, and we will use "BH" here for Benjamini-Hochberg, to match what we did above with `topTable()`, and with what we will do next with visualizations and GSEA). We will also indicate that we want to adjust p-values for each contrast. 183 | 184 | ```{r} 185 | # write.fit(efit, file=paste0(projdir, "results/DEresults.txt"), adjust="BH") 186 | ``` 187 | 188 | Note that this will complain if you have any duplicated genes in your file, which we do (found out by trying to run `write.fit()` and getting an error). This is often because in the process of annotating with different gene IDs (we went from ENTREZIDs to SYMBOLs in Objective 2), some may not be matched. This will result in NA values for those genes. 
We can see which rows have NA values for the SYMBOLs in our dataset. We can simply remove the rows with NAs for gene IDs ... you want to be careful about this, maybe do some QC on them first. But we have relatively few of these and so we are not going to worry about it for now. 189 | 190 | ```{r} 191 | efit <- efit[which(!is.na(rownames(efit))),] 192 | ``` 193 | 194 | Now the `write.fit()` function should work. 195 | 196 | ```{r} 197 | write.fit(efit, file=paste0(projdir, "results/DEresults.txt"), adjust="BH") 198 | ``` 199 | -------------------------------------------------------------------------------- /notebooks/9_RNA_data_viz.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Visualize differential results" 3 | author: "Karla Lindquist" 4 | output: html_notebook 5 | --- 6 | 7 | #### *Objective 8. Create heatmap and volcano plots* 8 | 9 | Packages used in this notebook: `limma`, `gplots`, `RColorBrewer`. Make sure these packages are installed and loaded. 10 | 11 | ------------------------------------------------------------------------ 12 | 13 | Now that we've done some analysis of the data, we can visualize the results. While figures may not convey the depth of information that quantitative outputs might display, they do make our results far easier to interpret at a glance. Here we'll do a quick survey of some of the more popular RNA-seq visualizations. 14 | 15 | ------------------------------------------------------------------------ 16 | 17 | ##### Hierarchical clustering with heatmaps 18 | 19 | Hierarchical clustering can be a potent way to examine differences between samples, and heatmaps can succinctly visualise the clustering of your samples. We can do this using the `heatmap.2()` function from the `gplots` package. In this example `heatmap.2()` calculates a matrix of Euclidean distances from the log2(CPM) values for the 50 most significant genes. 
20 | 21 | We can get the gene SYMBOLs for the 50 most significant genes by basically doing a sort by p-values and then select 1-50. You can sort by adjusted p-values for pairwise comparisons, or as I demonstrate here, you can sort by the F-statistic p-values which represent the overall significance of differences across the 3 sample groups. It is probably easier to convert your results in the `efit` object to a data frame first - let's call this `efit_df`. 22 | 23 | ```{r} 24 | efit_df <- as.data.frame(efit, row.names = rownames(efit)) ## create data frame, retaining genes names as row names 25 | efit_df <- efit_df[order(efit_df$F.p.value),] ## sort by F statistic p-values 26 | head(efit_df) 27 | ``` 28 | 29 | If you want to see this full table more easily, you can show it in a new window using the tiny and faint icon at the top right corner of the output. Let's look at the top genes. 30 | 31 | ```{r} 32 | top50 <- rownames(efit_df)[1:50] 33 | top50 34 | ``` 35 | 36 | Now let's subset the `log2cpm_norm` matrix contain to contain just the top 50 genes using this vector of gene names. The heatmap needs to use these normalized expression levels to plot (more on this below). 37 | 38 | ```{r} 39 | log2cpm_top50 <- log2cpm_norm[top50,] 40 | dim(log2cpm_top50) ## should be 50 genes x 9 samples 41 | head(log2cpm_top50) 42 | ``` 43 | 44 | Now plot the heatmap with these 50 genes. First let's look at the help file for the `heatmap.2` function. There are a lot of options! The easiest way to customize your own is to look at the examples and try a few different things, or look online to find some others that you like. We will make a pretty bare-bones heatmap here. 45 | 46 | We will keep the samples together in columns and use the same colors that we used in the cluster plot in Objective 6 to indicate which is which. So at the top we will have the 3 ML samples (green), 3 LP samples (blue), and the 3 Basal samples (red). 
47 | 48 | The dendrograms are based on hierarchical clustering to show similar samples. By default, `heatmap.2` uses Euclidean distances with complete agglomeration. Different heatmap functions will use different methods, and there are lots of tutorials out there that can help you understand the differences between them, e.g. see "[Heatmap in R: Static and Interactive Visualization](https://www.datanovia.com/en/lessons/heatmap-in-r-static-and-interactive-visualization/)" from Datanovia. If you get an error message saying "Error in plot.new() : figure margins too large" that just means that the plot can't fit on your screen. You can always try to comment out the `png()` and `dev.off()` lines below and see if it works for you in the RStudio session. The dendrograms will still be output. 49 | 50 | ```{r} 51 | ?heatmap.2 52 | 53 | png(filename=paste0(projdir, "results/heatmap.png")) 54 | heatmap <- heatmap.2(log2cpm_top50, 55 | dendrogram = "both", ## create a dendrogram and reorder the rows by cluster 56 | trace = "none", ## don't draw lines on top 57 | scale = "row", ## use z-score scaling for rows 58 | col = bluered, ## use blues and reds 59 | srtCol = 25, ## angle the column/sample labels since they are long 60 | ColSideColors = col.grp, ## color sample groups as we did in the cluster plot (green ML, blue LP, red Basal) 61 | margins = c(8,8)) 62 | dev.off() 63 | 64 | # you can view the dendrogram too although the gene name labels are hard to see 65 | plot(heatmap$rowDendrogram, ylab="Cluster height", xlab="Gene") 66 | plot(heatmap$colDendrogram, ylab="Cluster height", xlab="Sample") 67 | ``` 68 | 69 | This reflects what we saw earlier with the MDS and PCoA plots in that we can clearly see that the LP and ML groups are much more similar to one another than either is to the Basal group. This is also doing some unsupervised clustering of both genes and samples, just with a different method of clustering (hierarchical). 
See the documentation for the `heatmap.2()` function for more information. 70 | 71 | ------------------------------------------------------------------------ 72 | 73 | ##### Extract gene clusters from the heatmap 74 | 75 | Sometimes you may want to pull out groups of interest from the heatmap. The `as.hclust` function can take the clustered data from the heatmap (the row dendogram that is, where genes are clustered). 76 | 77 | ```{r} 78 | top50_clustered <- as.hclust(heatmap$rowDendrogram) 79 | names(top50_clustered) ## these are the values stored - the height and order are included now - you can also view this in the environment tab 80 | ``` 81 | 82 | Now take this clustering information and "cut" the dendogram to select a fewer number of clusters to focus on. To do this, we need to cut based on the cluster height (higher numbers correspond to larger clusters). Look back at the dendogram above, or look at the distribution of height values. Picking a good cut point is somewhat of an art but often is practical (e.g. how many clusters do you want to keep?). Let's say for simplicity, we just want to know which genes fall into the clusters at height\>15 which should give us 4 gene clusters. We can use the `table` function to find out how many genes fall into each of the clusters. We can store these clusters in an object called `mycl`. 83 | 84 | ```{r} 85 | ?cutree 86 | mycl <- cutree(top50_clustered, h=15) 87 | table(mycl) 88 | ``` 89 | 90 | Now let's add the cluster ID to the rest of the data. The genes and cluster IDs should still be in the same order, but you can do a spot check first or to be really safe, you can join the data with the cluster IDs by SYMBOLs. We'll call this annotated version `cldat`. 91 | 92 | ```{r} 93 | head(log2cpm_top50) 94 | head(mycl) 95 | cldat <- cbind(log2cpm_top50, clusterID=mycl) 96 | head(cldat) 97 | ``` 98 | 99 | Notice the column called `clusterID` above. 
These are the cluster assigments for each gene (the numbers 1-4 are just identifiers - the values don't mean anything). 100 | 101 | ------------------------------------------------------------------------ 102 | 103 | ##### Make volcano plots 104 | 105 | Another way to visualize differential gene expression is via volcano plots. The `limma` package's function `volcanoplot()` function takes as input a fitted model object in the MArrayLM format (as is `efit`). We will create one for each of the pairwise comparisons that we did. I'll label the top 10 genes on each plot. If you want to remind yourself of the direction of the comparisons you can output the contrast matrix again. 106 | 107 | ```{r} 108 | ?volcanoplot 109 | # basal vs lp 110 | # png(filename=paste0(projdir, "results/basal_lp_volcano.png")) 111 | volcanoplot(efit, 112 | coef = 1, 113 | style = "p-value", 114 | highlight = 10, 115 | names = names(efit$Amean), 116 | hl.col="blue", 117 | main = "Basal - LP", 118 | xlab = "Log2 Fold Change", 119 | ylab = NULL, 120 | pch=20, 121 | cex=0.35) 122 | # dev.off() 123 | 124 | # basal vs ml 125 | # png(filename=paste0(projdir, "results/basal_ml_volcano.png")) 126 | volcanoplot(efit, coef = 2, style = "B-statistic", highlight = 10, names = names(efit$Amean), 127 | hl.col="blue", 128 | main = "Basal - ML", xlab = "Log2 Fold Change", ylab = NULL, pch=20, cex=0.35) 129 | # dev.off() 130 | 131 | # lp vs ml 132 | # png(filename=paste0(projdir, "results/lp_ml_volcano.png")) 133 | volcanoplot(efit,coef = 3, style = "p-value", highlight = 10, names = names(efit$Amean), 134 | hl.col="blue", 135 | main = "LP - ML", xlab = "Log2 Fold Change", ylab = NULL, pch=20, cex=0.35) 136 | # dev.off() 137 | ``` 138 | 139 | This is a succinct way to visualize differential expression patterns, but let's pull out the labeled genes (top 10 most significant) for further observation. 
140 | 141 | ```{r} 142 | ?topTable 143 | # top hits: 144 | BasalvLP_top <- topTable(efit, coef = 1, number = 10, sort.by = 'P') 145 | BasalvLP_top 146 | 147 | # top hits: 148 | BasalvML_top <- topTable(efit, coef = 2, number = 10, sort.by = 'P') 149 | BasalvML_top 150 | 151 | # top hits: 152 | ML_v_LP_top <- topTable(efit, coef = 3, number = 10, sort.by = 'P') 153 | ML_v_LP_top 154 | ``` 155 | 156 | ------------------------------------------------------------------------ 157 | 158 | ##### Some notes on volcano plots 159 | 160 | It is possible to refine this figure for publication, but I'd recommend using the `EnhancedVolcano` package to generate publication-ready plots. 161 | 162 | In the interest of time, we won't be covering the EnhancedVolcano package for highly customizable plots. If you wish to give this a try yourself, see [this bioconductor vignette](https://bioconductor.org/packages/devel/bioc/vignettes/EnhancedVolcano/inst/doc/EnhancedVolcano.html). Fair warning: it will take some data cleaning and restructuring to implement. 
163 | -------------------------------------------------------------------------------- /results/MDSplot-unnormalized.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/results/MDSplot-unnormalized.png -------------------------------------------------------------------------------- /results/MDSplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/results/MDSplot.png -------------------------------------------------------------------------------- /results/barplot_react.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/results/barplot_react.png -------------------------------------------------------------------------------- /results/basal_lp_volcano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/results/basal_lp_volcano.png -------------------------------------------------------------------------------- /results/basal_ml_volcano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/results/basal_ml_volcano.png -------------------------------------------------------------------------------- /results/dotplot_react.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/results/dotplot_react.png -------------------------------------------------------------------------------- 
/results/heatmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/results/heatmap.png -------------------------------------------------------------------------------- /results/lp_ml_volcano.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCSF-DSOS/bulk_RNA_seq/0fdde4a0a0f6477e641a35da65b90041902fdc91/results/lp_ml_volcano.png --------------------------------------------------------------------------------