├── README.md ├── cycle.Rmd ├── dimred.Rmd ├── flowsom.Rmd ├── intro.Rmd ├── normalize.Rmd ├── parallel ├── slurm-aaron.tmpl └── slurm.R ├── pics └── make_pics.R ├── preprocess.Rmd ├── resources └── marker_genes.csv └── variance.Rmd /README.md: -------------------------------------------------------------------------------- 1 | # Scripts for analyzing the 10X 1.3 million brain cell data 2 | 3 | The scripts should be executed in the following order: 4 | 5 | - `intro.Rmd`: An introduction, duh. 6 | - `preprocess.Rmd`: Downloading the data and quality control 7 | - `cycle.Rmd`: Cell cycle phase assignment 8 | - `normalize.Rmd`: Calculation of cell-specific size factors 9 | - `variance.Rmd`: Identification of highly variable genes 10 | - `dimred.Rmd`: Dimensionality reduction with randomized PCA 11 | 12 | Various output objects will be saved to `objects/`. 13 | A few of these objects are currently hosted at https://drive.google.com/open?id=1_0WbmJ2BriLKlyKEf1Bbb8K0_NwD9rw-. 14 | Note that `sce.rds` does not contain the actual counts or normalized expression values, and requires something like this: 15 | 16 | ```r 17 | library(TENxBrainData) 18 | tenx <- TENxBrainData() 19 | sce <- readRDS("sce.rds") 20 | tenx <- tenx[,colnames(sce)] # drop 19,672 cells from the raw TENxBrainData 21 | counts(sce) <- counts(tenx) # overwrite inbuilt absolute path 22 | library(scater) 23 | sce <- normalize(sce) # generate normalized expression values 24 | ``` 25 | 26 | The `pics/make_pics.R` scripts will generate the figures used in the paper. 27 | -------------------------------------------------------------------------------- /cycle.Rmd: -------------------------------------------------------------------------------- 1 | # Assigning cell cycle phase 2 | 3 | ```{r, echo=FALSE, results="hide"} 4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) 5 | ``` 6 | 7 | ```{r, echo=FALSE, results="hide"} 8 | library(BiocStyle) 9 | library(HDF5Array) 10 | library(SingleCellExperiment) 11 | sce <- readRDS("objects/sce.rds") 12 | ``` 13 | 14 | Here, we use the `cyclone` method to assign cells to the cell cycle phase. 15 | We do so using the pre-trained mouse classifier and multiple cores to reduce computational time. 16 | 17 | ```{r} 18 | library(scran) 19 | set.seed(100) 20 | mm.pairs <- readRDS(system.file("exdata", "mouse_cycle_markers.rds", package="scran")) 21 | assigned <- cyclone(sce, mm.pairs, gene.names=rowData(sce)$Ensembl, BPPARAM=MulticoreParam(3)) 22 | ``` 23 | 24 | Making a plot of the cell cycle scores. 25 | 26 | ```{r cyclescores} 27 | smoothScatter(assigned$scores$G1, assigned$scores$G2M, 28 | xlab="G1 score", ylab="G2M score") 29 | ``` 30 | 31 | We save the phases to the `SummarizedExperiment` object in case it's useful. 32 | 33 | ```{r} 34 | sce$Phase <- assigned$phase 35 | table(assigned$phase) 36 | ``` 37 | 38 | Saving all of the results to file for later use. 39 | 40 | ```{r} 41 | saveRDS(assigned, file="objects/cycle_output.rds") 42 | ``` 43 | 44 | 51 | -------------------------------------------------------------------------------- /dimred.Rmd: -------------------------------------------------------------------------------- 1 | # Dimensionality reduction 2 | 3 | ```{r, echo=FALSE, results="hide"} 4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) 5 | ``` 6 | 7 | ```{r, echo=FALSE, results="hide"} 8 | library(BiocStyle) 9 | library(SingleCellExperiment) 10 | sce <- readRDS("objects/sce.rds") 11 | dec <- read.table("objects/hvg_output.txt", stringsAsFactors=FALSE, header=TRUE) 12 | ``` 13 | 14 | Taking the HVGs to define the biological subspace of interest. 15 | 16 | ```{r} 17 | sig <- dec$FDR <= 0.05 & !is.na(dec$FDR) 18 | summary(sig) 19 | ``` 20 | 21 | Extracting the HVGs and mean-centering them. 22 | 23 | ```{r} 24 | exprs.mat <- logcounts(sce) 25 | exprs.mat <- exprs.mat[match(dec$Ensembl[sig], rowData(sce)$Ensembl), ] 26 | exprs.mat <- exprs.mat - dec$mean[sig] 27 | ``` 28 | 29 | Performing random PCA on the resulting expression matrix to retain the first 50 PCs. 30 | Here we use the `r Githubpkg("Bioconductor/BigDataAlgorithms")` package that contains a randomized SVD implementation for `DelayedArray` objects. 31 | 32 | ```{r} 33 | library(BigDataAlgorithms) 34 | set.seed(1000) 35 | options(DelayedArray.block.size=200e6) 36 | svd.out <- rsvd(t(exprs.mat), k=50) 37 | pc.out <- sweep(svd.out$u, 2, svd.out$d, "*") 38 | ``` 39 | 40 | Having a look at the variance explained by the top 50 PCs. 41 | 42 | ```{r screeplot} 43 | var.exp <- svd.out$d^2/(ncol(exprs.mat)-1) 44 | total.var <- sum(dec$total[sig]) 45 | plot(var.exp/total.var*100, xlab="PC number", ylab="% variance explained") 46 | ``` 47 | 48 | Making a plot of the first three PCs. 49 | 50 | ```{r pcaplot, fig.width=12, fig.height=6} 51 | par(mfrow=c(1,2)) 52 | smoothScatter(pc.out[,1], pc.out[,2], xlab="PC1", ylab="PC2") 53 | smoothScatter(pc.out[,1], pc.out[,3], xlab="PC1", ylab="PC3") 54 | ``` 55 | 56 | Storing the PCs in the `SingleCellExperiment` object. 57 | Also saving the SVD values for future use. 58 | 59 | ```{r} 60 | attr(pc.out, "percentVar") <- var.exp/total.var 61 | reducedDim(sce, "rPCA") <- pc.out 62 | saveRDS(svd.out, file="objects/rsvd.rds") 63 | ``` 64 | 65 | 72 | -------------------------------------------------------------------------------- /flowsom.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clustering 1.3M cells with FlowSOM" 3 | author: "" 4 | output: html_document 5 | --- 6 | 7 | ```{r} 8 | suppressPackageStartupMessages(library(TENxBrainData)) 9 | suppressPackageStartupMessages(library(scater)) 10 | suppressPackageStartupMessages(library(flowCore)) 11 | suppressPackageStartupMessages(library(FlowSOM)) 12 | suppressPackageStartupMessages(library(scran)) 13 | suppressPackageStartupMessages(library(pheatmap)) 14 | suppressPackageStartupMessages(library(tibble)) 15 | suppressPackageStartupMessages(library(dplyr)) 16 | suppressPackageStartupMessages(library(reshape2)) 17 | suppressPackageStartupMessages(library(ggplot2)) 18 | suppressPackageStartupMessages(library(Rtsne)) 19 | suppressPackageStartupMessages(library(mclust)) 20 | ``` 21 | 22 | ## Read data 23 | 24 | ```{r} 25 | tenx <- TENxBrainData() 26 | sce <- readRDS("objects/sce.rds") 27 | tenx <- tenx[, colnames(sce)] # drop 19,672 cells from the raw TENxBrainData 28 | counts(sce) <- counts(tenx) # overwrite inbuilt absolute path 29 | system.time(sce <- scater::normalize(sce)) # generate normalized expression values 30 | ``` 31 | 32 | ## Read rSVD results 33 | 34 | ```{r} 35 | svd <- readRDS("objects/rsvd.rds") 36 | pca <- sweep(svd$u, 2, svd$d, "*") 37 | colnames(pca) <- paste0("PC", seq_len(ncol(pca))) 38 | dim(pca) 39 | ``` 40 | 41 | ## Apply FlowSOM to cluster the cells based on the first 50 PCs 42 | 43 | ```{r} 44 | set.seed(123) 45 | system.time(ff <- flowFrame(exprs = pca)) 46 | system.time(fSOM <- FlowSOM::ReadInput(ff, compensate = FALSE, transform = FALSE, 47 | scale = FALSE, silent = TRUE)) 48 | system.time(fSOM <- FlowSOM::BuildSOM(fSOM, silent = TRUE, xdim = 13, ydim = 13)) 49 | system.time(metaClustering <- metaClustering_consensus(fSOM$map$codes, k = 16)) 50 | ``` 51 | 52 | ## Add cluster info to data object 53 | 54 | ```{r} 55 | colData(sce)$som100 <- fSOM$map$mapping[, 1] 56 | colData(sce)$sommeta <- metaClustering[fSOM$map$mapping[, 1]] 57 | ``` 58 | 59 | ## Add rownames 60 | 61 | ```{r} 62 | rownames(sce) <- paste0(rowData(sce)$Ensembl, ".", rowData(sce)$Symbol) 63 | ``` 64 | 65 | 66 | ## Plot PCA representation 67 | 68 | ```{r pca, fig.width = 8, fig.height = 8} 69 | dim(sce) 70 | dim(pca) 71 | cols <- c("#DC050C", "#E8601C", "#7BAFDE", "#1965B0", "#B17BA6", 72 | "#882E72", "#F1932D", "#F6C141", "#F7EE55", "#4EB265", 73 | "#90C987", "#CAEDAB", "#777777", "black", "cyan", "pink") 74 | names(cols) <- as.character(seq_len(length(cols))) 75 | dfpca <- data.frame(pca, som100 = factor(colData(sce)$som100), 76 | sommeta = factor(colData(sce)$sommeta), 77 | library_id = factor(colData(sce)$Library), 78 | mouse = factor(colData(sce)$Mouse), 79 | stringsAsFactors = FALSE) 80 | print(ggplot(dfpca, aes(x = PC1, y = PC2, color = sommeta)) + 81 | geom_point(size = 0.5) + scale_color_manual(values = cols) + 82 | theme_bw()) 83 | print(ggplot(dfpca, aes(x = PC3, y = PC4, color = sommeta)) + 84 | geom_point(size = 0.5) + scale_color_manual(values = cols) + 85 | theme_bw()) 86 | print(ggplot(dfpca, aes(x = PC5, y = PC6, color = sommeta)) + 87 | geom_point(size = 0.5) + scale_color_manual(values = cols) + 88 | theme_bw()) 89 | print(ggplot(dfpca, aes(x = PC7, y = PC8, color = sommeta)) + 90 | geom_point(size = 0.5) + scale_color_manual(values = cols) + 91 | theme_bw()) 92 | print(ggplot(dfpca, aes(x = PC9, y = PC10, color = sommeta)) + 93 | geom_point(size = 0.5) + scale_color_manual(values = cols) + 94 | theme_bw()) 95 | ``` 96 | 97 | Also plot PCA colored by library ID/mouse 98 | 99 | ```{r pca2} 100 | print(ggplot(dfpca, aes(x = PC1, y = PC2, color = library_id)) + 101 | geom_point(size = 0.5) + theme_bw()) 102 | print(ggplot(dfpca, aes(x = PC1, y = PC2, color = mouse)) + 103 | geom_point(size = 0.5) + theme_bw()) 104 | ``` 105 | 106 | ## Find marker genes 107 | 108 | ```{r scran_markers} 109 | system.time(scran_markers_all <- scran::findMarkers(sce, 110 | clusters = colData(sce)$sommeta, 111 | block = NULL, 112 | design = NULL, 113 | direction = "up", 114 | pval.type = "any", 115 | assay.type = "logcounts", 116 | get.spikes = FALSE, 117 | log.p = TRUE, 118 | lfc = 0.5)) 119 | for (i in seq_len(length(scran_markers_all))) { 120 | print(head(as.data.frame(scran_markers_all[[i]]))) 121 | } 122 | scran_markers <- unique(unlist(lapply(scran_markers_all, function(w) { 123 | rownames(subset(w, Top <= 1)) 124 | }))) 125 | 126 | scescranmarker <- sce[which(rownames(sce) %in% scran_markers), ] 127 | logcounts_scranmarkers <- logcounts(scescranmarker) 128 | rownames(logcounts_scranmarkers) <- rowData(scescranmarker)$Symbol 129 | dfscran <- as.data.frame(logcounts_scranmarkers) %>% tibble::rownames_to_column("gene") %>% 130 | reshape2::melt() %>% dplyr::left_join(as.data.frame(colData(scescranmarker)) %>% 131 | tibble::rownames_to_column("variable") %>% 132 | dplyr::select(variable, som100, sommeta)) %>% 133 | dplyr::mutate(sommeta = factor(sommeta)) %>% 134 | dplyr::mutate(som100 = factor(som100)) 135 | ``` 136 | 137 | Heatmap of inferred marker genes 138 | 139 | ```{r scran_markers_heatmap, fig.height = 12, fig.width = 10} 140 | dfsumscran <- dfscran %>% dplyr::group_by(gene, som100) %>% 141 | dplyr::summarize(value = quantile(value, probs = 0.75)) %>% 142 | tidyr::spread(som100, value) %>% as.data.frame() 143 | rownames(dfsumscran) <- dfsumscran$gene 144 | dfsumscran$gene <- NULL 145 | dfsumscran <- dfsumscran[apply(dfsumscran, 1, sd) > 0, ] 146 | pheatmap::pheatmap(dfsumscran, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE, 147 | annotation_col = data.frame(metaClust = factor(metaClustering), row.names = as.character(seq_len(length(metaClustering)))), 148 | show_colnames = FALSE, show_rownames = TRUE, fontsize_row = 6, 149 | annotation_colors = list(metaClust = cols)) 150 | 151 | ``` 152 | 153 | 154 | ## Look at known marker genes. 155 | Marker genes for cell types in mouse brain were obtained from two recent publications: 156 | 157 | - [http://science.sciencemag.org/content/347/6226/1138.full](http://science.sciencemag.org/content/347/6226/1138.full) 158 | - [https://www.nature.com/articles/nn.4216](https://www.nature.com/articles/nn.4216) 159 | 160 | ```{r markers, fig.width = 24, fig.height = 24} 161 | marker_genes <- read.csv("resources/marker_genes.csv", header = TRUE, as.is = TRUE) %>% 162 | dplyr::arrange(population) %>% dplyr::select(gene, population) %>% 163 | dplyr::distinct() 164 | 165 | keep <- which(rowData(sce)$Symbol %in% unique(marker_genes$gene)) 166 | scemarker <- sce[keep, ] 167 | logcounts_markers <- assays(scemarker)[["logcounts"]] 168 | rownames(logcounts_markers) <- rowData(scemarker)$Symbol 169 | df <- as.data.frame(logcounts_markers) %>% tibble::rownames_to_column("gene") %>% 170 | reshape2::melt() %>% dplyr::left_join(as.data.frame(colData(scemarker)) %>% 171 | tibble::rownames_to_column("variable") %>% 172 | dplyr::select(variable, som100, sommeta)) %>% 173 | dplyr::mutate(sommeta = factor(sommeta)) %>% 174 | dplyr::mutate(som100 = factor(som100)) %>% 175 | dplyr::left_join(marker_genes %>% dplyr::select(gene, population)) %>% 176 | dplyr::mutate(genepop = paste0(gene, " (", population, ")")) %>% 177 | dplyr::arrange(population) %>% 178 | dplyr::mutate(gene = factor(gene, levels = unique(marker_genes$gene))) 179 | 180 | print(ggplot(df, aes(x = sommeta, y = value, fill = sommeta)) + 181 | geom_boxplot(outlier.size = 0.5) + facet_wrap(~ genepop, scales = "free_y") + 182 | scale_fill_manual(values = cols) + theme_bw()) 183 | ``` 184 | 185 | Check the FDRs of the marker genes in each of the clusters 186 | 187 | ```{r known_markers_fdr, fig.height = 12} 188 | L <- do.call(rbind, lapply(structure(marker_genes$gene, names = marker_genes$gene), 189 | function(g) { 190 | do.call(rbind, lapply(seq_along(scran_markers_all), function(i) { 191 | j <- grep(paste0("\\.", g, "$"), rownames(scran_markers_all[[i]])) 192 | if (length(j) == 1) data.frame(gene = g, cluster = i, 193 | log.FDR = scran_markers_all[[i]][j, "log.FDR"], 194 | stringsAsFactors = FALSE) 195 | else data.frame(gene = g, cluster = i, log.FDR = NA, stringsAsFactors = FALSE) 196 | })) 197 | })) 198 | L <- tidyr::spread(L, cluster, log.FDR) 199 | rownames(L) <- L$gene 200 | L$gene <- NULL 201 | L <- L[rowSums(is.na(L)) == 0, ] 202 | pheatmap::pheatmap(exp(L), scale = "none", cluster_rows = TRUE, cluster_cols = TRUE) 203 | ``` 204 | 205 | 206 | Plot the individual marker gene expression in each of the 100 original FlowSOM 207 | clusters, color by the final cluster assignment. 208 | 209 | ```{r markers-indiv, fig.width = 18} 210 | for (g in unique(df$genepop)) { 211 | print(ggplot(df %>% dplyr::filter(genepop == g), 212 | aes(x = som100, y = value, fill = sommeta)) + 213 | geom_boxplot(outlier.size = 0.5) + ggtitle(g) + theme_bw() + 214 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) + 215 | scale_fill_manual(values = cols)) 216 | } 217 | ``` 218 | 219 | ## Heatmap of marker genes 220 | 221 | We make a heatmap of the marker genes, summarized for the cells in each of the 222 | original FlowSOM clusters. As a summarization function, we use the third 223 | quartile. 224 | 225 | ```{r, fig.width = 10, fig.height = 12} 226 | dfsum <- df %>% dplyr::group_by(genepop, som100) %>% 227 | dplyr::summarize(value = quantile(value, probs = 0.75)) %>% 228 | tidyr::spread(som100, value) %>% as.data.frame() 229 | rownames(dfsum) <- dfsum$genepop 230 | dfsum$genepop <- NULL 231 | dfsum <- dfsum[apply(dfsum, 1, sd) > 0, ] 232 | pheatmap::pheatmap(dfsum, scale = "row", cluster_rows = TRUE, cluster_cols = TRUE, 233 | annotation_col = data.frame(metaClust = factor(metaClustering), row.names = as.character(seq_len(length(metaClustering)))), 234 | show_colnames = FALSE, show_rownames = TRUE, fontsize_row = 6, 235 | annotation_colors = list(metaClust = cols)) 236 | ``` 237 | 238 | ## t-SNE of subsampled data 239 | 240 | Here we apply t-SNE to 10,000 randomly selected cells, using the 50 first PCs as 241 | the input. We color the resulting representation by the assigned cluster. 242 | 243 | ```{r tsnesub} 244 | set.seed(123) 245 | subs <- sample(seq_len(nrow(pca)), 10000, replace = FALSE) 246 | pcasub <- pca[subs, ] 247 | rtsne_out <- Rtsne(as.matrix(pcasub), pca = FALSE, verbose = TRUE, perplexity = 30) 248 | rtsne_out <- data.frame(rtsne_out$Y) 249 | colnames(rtsne_out) <- c("tSNE1", "tSNE2") 250 | rtsne_out$sommeta <- factor(colData(sce)$sommeta[subs]) 251 | print(ggplot(rtsne_out, aes(x = tSNE1, y = tSNE2, color = sommeta)) + 252 | geom_point(size = 0.75) + scale_color_manual(values = cols) + 253 | theme_bw()) 254 | ``` 255 | 256 | ## Session info 257 | 258 | ```{r} 259 | date() 260 | sessionInfo() 261 | ``` 262 | 263 | -------------------------------------------------------------------------------- /intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Short analysis of the 10X million neuron data set 3 | author: Aaron Lun 4 | date: 29 June 2017 5 | output: 6 | BiocStyle::html_document: 7 | fig.caption: no 8 | --- 9 | 10 | ```{r style, echo = FALSE, results = 'asis'} 11 | BiocStyle::markdown() 12 | ``` 13 | 14 | # Introduction 15 | 16 | Here we perform a brief analysis of the 1M neuron data set from 10X genomics (https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.3.0/1M_neurons). 17 | This uses a variety of low-level packages to represent the data on-disk, as well as high-level packages for scRNA-seq data analysis. 18 | The aim is to demonstrate the usefulness of the _beachmat_ API in facilitating high-level analyses from a `HDF5Matrix` object. 19 | 20 | -------------------------------------------------------------------------------- /normalize.Rmd: -------------------------------------------------------------------------------- 1 | # Normalization for cell-specific biases 2 | 3 | ```{r, echo=FALSE, results="hide"} 4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) 5 | knitr::opts_chunk$set(dpi=300, dev="png", dev.args=list(pointsize=15)) 6 | ``` 7 | 8 | ```{r, echo=FALSE, results="hide"} 9 | library(BiocStyle) 10 | library(HDF5Array) 11 | library(scran) 12 | sce <- readRDS("objects/sce.rds") 13 | ``` 14 | 15 | ## Performing a rough pre-clustering 16 | 17 | Here, we use the deconvolution method to compute size factors for each cell. 18 | We start by doing some pre-clustering to split the cells into sensible clusters, to avoid violating the non-DE assumption and distorting the size factors. 19 | 20 | ```{r} 21 | library(scran) 22 | system.time({ 23 | ids <- quickCluster(sce, min.mean=0.1, method="igraph", 24 | block=sce$Library, block.BPPARAM=BPPARAM) 25 | }) 26 | summary(tabulate(as.integer(ids))) 27 | ``` 28 | 29 | ## Calculating size factors 30 | 31 | We now compute a size factor for each cell using the specified parameters. 32 | 33 | ```{r} 34 | system.time({ 35 | sce <- computeSumFactors(sce, cluster=ids, min.mean=0.1, BPPARAM=BPPARAM) 36 | }) 37 | summary(sizeFactors(sce)) 38 | ``` 39 | 40 | We can have a look at them in more detail, compared to the library size for each cell. 41 | 42 | ```{r sizefacplot} 43 | plot(sce$scater_qc$all$total_counts, sizeFactors(sce), 44 | log="xy", xlab="Library size", 45 | ylab="Size factors", cex=0.2, pch=16) 46 | ``` 47 | 48 | ## Normalizing the expression values 49 | 50 | We calculate normalized log-expression values, using delayed operations for speed instead of saving to a new HDF5 file. 51 | 52 | ```{r} 53 | logcounts(sce) <- log2(t(t(counts(sce))/sizeFactors(sce)) + 1) 54 | ``` 55 | 56 | 62 | -------------------------------------------------------------------------------- /parallel/slurm-aaron.tmpl: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## Job Resource Interface Definition 4 | ## 5 | ## ntasks [integer(1)]: Number of required tasks, 6 | ## Set larger than 1 if you want to further parallelize 7 | ## with MPI within your job. 8 | ## ncpus [integer(1)]: Number of required cpus per task, 9 | ## Set larger than 1 if you want to further parallelize 10 | ## with multicore/parallel within each task. 11 | ## walltime [integer(1)]: Walltime for this job, in minutes. 12 | ## Must be at least 1 minute. 13 | ## memory [integer(1)]: Memory in megabytes for each cpu. 14 | ## Must be at least 100 (when I tried lower values my 15 | ## jobs did not start at all). 16 | ## 17 | ## Default resources can be set in your .batchtools.conf.R by defining the variable 18 | ## 'default.resources' as a named list. 19 | 20 | <% 21 | # relative paths are not handled well by Slurm 22 | log.file = fs::path_expand(log.file) 23 | -%> 24 | 25 | 26 | #SBATCH --job-name=<%= job.name %> 27 | #SBATCH --output=<%= log.file %> 28 | #SBATCH --error=<%= log.file %> 29 | #SBATCH --time=<%= ceiling(resources$walltime / 60) %> 30 | #SBATCH --ntasks=1 31 | #SBATCH --cpus-per-task=<%= resources$ncpus %> 32 | #SBATCH --mem-per-cpu=<%= resources$memory %> 33 | <%= if (!is.null(resources$partition)) sprintf(paste0("#SBATCH --partition='", resources$partition, "'")) %> 34 | <%= if (array.jobs) sprintf("#SBATCH --array=1-%i", nrow(jobs)) else "" %> 35 | 36 | ## Initialize work environment like 37 | ## source /etc/profile 38 | ## module add ... 39 | 40 | ## Export value of DEBUGME environemnt var to slave 41 | export DEBUGME=<%= Sys.getenv("DEBUGME") %> 42 | 43 | <%= sprintf("export OMP_NUM_THREADS=%i", resources$omp.threads) -%> 44 | <%= sprintf("export OPENBLAS_NUM_THREADS=%i", resources$blas.threads) -%> 45 | <%= sprintf("export MKL_NUM_THREADS=%i", resources$blas.threads) -%> 46 | 47 | ## Run R: 48 | ## we merge R output with stdout from SLURM, which gets then logged via --output option 49 | echo 'batchtools::doJobCollection("<%= uri %>")' | Rdevel --no-save --slave 50 | -------------------------------------------------------------------------------- /parallel/slurm.R: -------------------------------------------------------------------------------- 1 | # Defines a BatchtoolsParam object for submission on a SLURM cluster. 2 | 3 | BPPARAM <- BatchtoolsParam(10, 4 | cluster="slurm", template="parallel/slurm-aaron.tmpl", 5 | logdir="parallel", log=TRUE, 6 | RNGseed=10000L, 7 | resources=list(walltime=20000, memory=8000, ncpus=1)) 8 | -------------------------------------------------------------------------------- /pics/make_pics.R: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Making a cell cycle picture. 3 | 4 | assignments <- readRDS("../objects/cycle_output.rds") 5 | X <- assignments$scores$G1 6 | Y <- assignments$scores$G2M 7 | 8 | png("cycle.png", width=7, height=7, units="in", res=300, pointsize=12) 9 | smoothScatter(X, Y, xlab="G1 score", ylab="G2M score", cex.axis=1.2, cex.lab=1.4) 10 | 11 | segments(-1, 0.5, 0.5, 0.5, col="red", lty=2, lwd=2) 12 | segments(0.5, -1, 0.5, 0.5, col="red", lty=2, lwd=2) 13 | segments(0.5, 0.5, 2, 2, col="red", lty=2, lwd=2) 14 | 15 | text(0, 0.6, col="red", sprintf("G2M phase:\n%i", sum(assignments$phase=="G2M")), cex=1.2, pos=4) 16 | text(0, 0.4, col="red", sprintf("S phase:\n%i", sum(assignments$phase=="S")), cex=1.2, pos=4) 17 | text(0.5, 0.4, col="red", sprintf("G1 phase:\n%i", sum(assignments$phase=="G1")), cex=1.2, pos=4) 18 | dev.off() 19 | 20 | ################################################################################ 21 | # Making a plot of the size factors. 22 | 23 | library(SingleCellExperiment) 24 | sce <- readRDS("../objects/sce.rds") 25 | 26 | ratio <- log(sizeFactors(sce)/sce$total_counts) 27 | nmads <- abs((ratio - median(ratio))/mad(ratio)) 28 | 29 | library(viridis) 30 | my.cols <- rev(viridis(11)) 31 | coldex <- findInterval(nmads, 0:10/2) 32 | 33 | options(bitmapType="cairo") 34 | png("sizefacs.png", width=7, height=7, units="in", res=300, pointsize=12) 35 | plot(sce$total_counts/1e3, sizeFactors(sce), log="xy", col=my.cols[coldex], 36 | xlab=expression("Library size ("*10^3*")"), ylab="Size factor", 37 | cex.axis=1.2, cex.lab=1.4, pch=16, cex=0.2) 38 | dev.off() 39 | 40 | ################################################################################ 41 | # Making a plot of the top HVGs. 42 | 43 | hvg.out <- read.table("../objects/hvg_output.txt", header=TRUE) 44 | is.sig <- hvg.out$FDR <= 0.05 45 | 46 | png("hvg.png", width=7, height=7, units="in", res=300, pointsize=12) 47 | par(mar=c(5.1, 5.1, 4.1, 2.1)) 48 | plot(hvg.out$mean, hvg.out$total, pch=16, col="grey", 49 | xlab=expression("Mean log"[2]~"expression"), 50 | ylab=expression("Variance of log"[2]~"expression"), 51 | cex.axis=1.2, cex.lab=1.4) 52 | points(hvg.out$mean[is.sig], hvg.out$total[is.sig], col="orange", pch=16) 53 | o <- order(hvg.out$mean) 54 | lines(hvg.out$mean[o], hvg.out$tech[o], col="red", lwd=2) 55 | legend("bottomright", sprintf("%i HVGs out of %i genes", sum(is.sig & !is.na(is.sig)), length(is.sig)), bty="n", cex=1.1) 56 | 57 | # Adding the top set of genes. 58 | chosen <- hvg.out$Symbol[1:10] 59 | xoffset <- c(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2) 60 | yoffset <- c(0., -0.1, 0.1, 0.1, 0.1, 0., 0.1, 0.1, 0.1, 0.) 61 | 62 | for (i in seq_along(chosen)) { 63 | idex <- hvg.out$Symbol==chosen[i] 64 | xpos <- hvg.out$mean[idex] 65 | ypos <- hvg.out$total[idex] 66 | xpos2 <- xpos + xoffset[i] 67 | ypos2 <- ypos + yoffset[i] 68 | text(xpos2, ypos2, chosen[i], pos=4, offset=0.1, cex=1.1) 69 | segments(xpos, ypos, xpos2, ypos2) 70 | } 71 | dev.off() 72 | 73 | ################################################################################ 74 | # Making a PCA plot of the first two PCs. 75 | 76 | #pc.data <- readRDS("../objects/sce.rds") 77 | pc.mat <- reducedDim(sce) 78 | pc.var <- attr(pc.mat, "percentVar") 79 | 80 | options(bitmapType="cairo") 81 | png("pca.png", width=7, height=7, units="in", res=300, pointsize=12) 82 | smoothScatter(pc.mat[,1], pc.mat[,2], colramp=colorRampPalette(c("white", "black")), 83 | xlab=sprintf("PC1 (%.2f%%)", pc.var[1]*100), 84 | ylab=sprintf("PC2 (%.2f%%)", pc.var[2]*100), 85 | cex.axis=1.2, cex.lab=1.4) 86 | dev.off() 87 | 88 | -------------------------------------------------------------------------------- /preprocess.Rmd: -------------------------------------------------------------------------------- 1 | # Preprocessing 2 | 3 | ```{r, echo=FALSE, results="hide"} 4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) 5 | knitr::opts_chunk$set(dpi=300, dev="png", dev.args=list(pointsize=15)) 6 | ``` 7 | 8 | ```{r, echo=FALSE, results="hide"} 9 | library(BiocStyle) 10 | ``` 11 | 12 | ## Loading in the data 13 | 14 | The aim here is to convert the 10X data set into a `HDF5Matrix` object. 15 | We first obtain the data using the `r Biocpkg("TENxBrainData")` package: 16 | 17 | ```{r} 18 | #library(TENxBrainData) 19 | #sce <- TENxBrainData() # or we would, if it was working properly. 20 | library(SingleCellExperiment) 21 | library(HDF5Array) 22 | sce <- SingleCellExperiment( 23 | list(counts=HDF5Array("rawdata/counts.h5", "counts")), 24 | rowData=readRDS("rawdata/rowdata.rds"), 25 | colData=readRDS("rawdata/coldata.rds") 26 | ) 27 | sce 28 | ``` 29 | 30 | We have a look at some of the cell-level metadata. 31 | Data were obtained from multiple mice, which were captured and sequenced in multiple libraries. 32 | Note that the libraries and nested within the mice. 33 | 34 | ```{r} 35 | table(sce$Library, sce$Mouse) 36 | ``` 37 | 38 | We also add some gene-level annotation. 39 | There's already gene symbols, so we just add the chromosome location. 40 | 41 | ```{r} 42 | library(TxDb.Mmusculus.UCSC.mm10.ensGene) 43 | chr.loc <- mapIds(TxDb.Mmusculus.UCSC.mm10.ensGene, keys=rowData(sce)$Ensembl, 44 | keytype="GENEID", column="CDSCHROM") 45 | rowData(sce)$Chr <- chr.loc 46 | head(rowData(sce)) 47 | ``` 48 | 49 | ## Performing cell-based quality control 50 | 51 | We use `r Biocpkg("scater")` to calculate quality control summaries for each cell and gene. 52 | 53 | ```{r} 54 | library(scater) 55 | system.time({ 56 | sce <- calculateQCMetrics(sce, 57 | feature_controls=list(Mito=which(rowData(sce)$Chr=="chrM")), 58 | compact=TRUE, BPPARAM=BPPARAM) 59 | }) 60 | head(colnames(rowData(sce))) 61 | ``` 62 | 63 | We have a look at the three relevant metrics, plotted against the batch of origin for each cell. 64 | 65 | ```{r qchist, fig.height=6, fig.width=12} 66 | par(mfrow=c(1,3)) 67 | hist(sce$scater_qc$all$log10_total_counts, 68 | xlab=expression(Log[10]~"library size"), col="grey80") 69 | hist(sce$scater_qc$all$log10_total_features_by_counts, 70 | xlab=expression(Log[10]~"number of genes expressed"), col="grey80") 71 | hist(sce$scater_qc$feature_control_Mito$pct_counts, 72 | xlab="Percentage of mitochondrial reads", col="grey80", breaks=50) 73 | ``` 74 | 75 | We use some of these metrics for quality control on the cells. 76 | This is done within each batch to avoid discarding cells, e.g., if one batch was sequenced at lower depth. 77 | 78 | ```{r} 79 | low.libsize <- isOutlier(sce$scater_qc$all$log10_total_counts, 80 | nmad=3, batch=sce$Library, type="lower") 81 | low.ngenes <- isOutlier(sce$scater_qc$all$log10_total_features_by_counts, 82 | nmad=3, batch=sce$Library, type="lower") 83 | discard <- low.libsize | low.ngenes 84 | data.frame(LowLib=sum(low.libsize), LowGenes=sum(low.ngenes), Lost=sum(discard)) 85 | ``` 86 | 87 | Low-quality cells are discarded from the object. 88 | 89 | ```{r} 90 | sce <- sce[,!discard] 91 | ``` 92 | 93 | ## Looking at gene-based metrics 94 | 95 | For each feature, we plot the average count against the percentage of cells in which expression was detected. 96 | 97 | ```{r ave-per-gene} 98 | ave.count <- rowData(sce)$scater_qc$all$mean_counts 99 | pct.cells <- rowData(sce)$scater_qc$all$n_cells_by_counts/ncol(sce) * 100 100 | smoothScatter(log10(ave.count), pct.cells, ylab="Percentage of cells", 101 | xlab=expression("Log"[10]~"average count")) 102 | ``` 103 | 104 | We also inspect the identities of the top most-highly-expressed genes. 105 | 106 | ```{r} 107 | top.genes <- rowData(sce)[,c("Ensembl", "Symbol")] 108 | top.genes$AveCount <- ave.count 109 | top.genes$PctCells <- pct.cells 110 | top.genes <- top.genes[order(ave.count, decreasing=TRUE),] 111 | head(as.data.frame(top.genes), 20) 112 | ``` 113 | 114 | 122 | -------------------------------------------------------------------------------- /resources/marker_genes.csv: -------------------------------------------------------------------------------- 1 | gene,population,source Dlx5,Interneurons,http://science.sciencemag.org/content/347/6226/1138.full Arx,Interneurons,http://science.sciencemag.org/content/347/6226/1138.full Dlx2,Interneurons,http://science.sciencemag.org/content/347/6226/1138.full Dlx1,Interneurons,http://science.sciencemag.org/content/347/6226/1138.full Elavl2,Interneurons,http://science.sciencemag.org/content/347/6226/1138.full Sp9,Interneurons,http://science.sciencemag.org/content/347/6226/1138.full Tbr1,S1Pyramidal_ExcitatoryNeurons,http://science.sciencemag.org/content/347/6226/1138.full Pou3f1,CA1Pyramidal,http://science.sciencemag.org/content/347/6226/1138.full Lmo1,CA1Pyramidal,http://science.sciencemag.org/content/347/6226/1138.full Heyl,Vsmc_Peric,http://science.sciencemag.org/content/347/6226/1138.full Tbx2,Vsmc_Peric,http://science.sciencemag.org/content/347/6226/1138.full Foxs1,Vsmc_Peric,http://science.sciencemag.org/content/347/6226/1138.full Tbx18,Vsmc_Peric,http://science.sciencemag.org/content/347/6226/1138.full Bcl6b,Vend,http://science.sciencemag.org/content/347/6226/1138.full Meox1,Vend,http://science.sciencemag.org/content/347/6226/1138.full Nmi,Vend,http://science.sciencemag.org/content/347/6226/1138.full Sox7,Vend,http://science.sciencemag.org/content/347/6226/1138.full Foxl2,Vend,http://science.sciencemag.org/content/347/6226/1138.full Irf8,Microglia,http://science.sciencemag.org/content/347/6226/1138.full Nlrp3,Microglia,http://science.sciencemag.org/content/347/6226/1138.full Irf5,Microglia,http://science.sciencemag.org/content/347/6226/1138.full Hcls1,Microglia,http://science.sciencemag.org/content/347/6226/1138.full Spi1,Microglia,http://science.sciencemag.org/content/347/6226/1138.full Dbx2,Astrocytes,http://science.sciencemag.org/content/347/6226/1138.full Sall3,Astrocytes,http://science.sciencemag.org/content/347/6226/1138.full Sox21,Astrocytes,http://science.sciencemag.org/content/347/6226/1138.full Nkx6-2,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.full Sox10,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.full St18,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.full Olig1,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.full Olig2,Oligodendrocytes,http://science.sciencemag.org/content/347/6226/1138.full Snap25,Neurons,https://www.nature.com/articles/nn.4216 Gad1,GABAergic_InhibitoryNeurons,https://www.nature.com/articles/nn.4217 Vip,VipGABAergicNeurons,https://www.nature.com/articles/nn.4218 Sst,SstGABAergicNeurons,https://www.nature.com/articles/nn.4219 Pvalb,PvalbGABAergicNeurons,https://www.nature.com/articles/nn.4220 Slc17a7,GlutamatergicNeurons,https://www.nature.com/articles/nn.4221 Rorb,GlutamatergicNeuronsL4L5a,https://www.nature.com/articles/nn.4222 Foxp2,GlutamatergicNeuronsL6,https://www.nature.com/articles/nn.4223 Aqp4,Astrocytes,https://www.nature.com/articles/nn.4224 Pdgfra,OligodendrocytePrecursors,https://www.nature.com/articles/nn.4225 Mog,Oligodendrocytes,https://www.nature.com/articles/nn.4226 Itgam,Microglia,https://www.nature.com/articles/nn.4227 Flt1,Endothelial,https://www.nature.com/articles/nn.4228 Bgn,SMC,https://www.nature.com/articles/nn.4229 Myl9,SMC,https://www.nature.com/articles/nn.4230 Crip1,SMC,https://www.nature.com/articles/nn.4231 Vtn,SMC,https://www.nature.com/articles/nn.4232 Id1,Endothelial,https://www.nature.com/articles/nn.4233 Xdh,Endothelial,https://www.nature.com/articles/nn.4234 Tbc1d4,Endothelial,https://www.nature.com/articles/nn.4235 Exosc7,Endothelial,https://www.nature.com/articles/nn.4236 Ctss,Microglia,https://www.nature.com/articles/nn.4237 Cx3cr1,Microglia,https://www.nature.com/articles/nn.4238 C1qb,Microglia,https://www.nature.com/articles/nn.4239 Gpr34,Microglia,https://www.nature.com/articles/nn.4240 Opalin,Oligodendrocytes,https://www.nature.com/articles/nn.4241 Mag,Oligodendrocytes,https://www.nature.com/articles/nn.4242 Mbp,Oligodendrocytes,https://www.nature.com/articles/nn.4243 Cspg4,OligodendrocytePrecursors,https://www.nature.com/articles/nn.4244 Pcdh15,OligodendrocytePrecursors,https://www.nature.com/articles/nn.4245 Gria3,OligodendrocytePrecursors,https://www.nature.com/articles/nn.4246 Cxcl14,Astrocytes,https://www.nature.com/articles/nn.4247 Gja1,Astrocytes,https://www.nature.com/articles/nn.4248 F3,Astrocytes,https://www.nature.com/articles/nn.4249 Sox9,Astrocytes,https://www.nature.com/articles/nn.4250 Stmn2,Neurons,10xGenomicsChromiumWP Hes1,Glial,10xGenomicsChromiumWP Aldoc,Astrocytes,10xGenomicsChromiumWP Oligo1,Oligodendrocytes,10xGenomicsChromiumWP -------------------------------------------------------------------------------- /variance.Rmd: -------------------------------------------------------------------------------- 1 | # Detecting highly variable genes 2 | 3 | ```{r, echo=FALSE, results="hide"} 4 | knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) 5 | ``` 6 | 7 | ```{r, echo=FALSE, results="hide"} 8 | library(BiocStyle) 9 | library(scran) 10 | sce <- readRDS("objects/sce.rds") 11 | 12 | library(DelayedArray) 13 | setAutoBlockSize(100*100*8) # Ensure we load one chunk's worth of rows. 14 | ``` 15 | 16 | We compute the variance of the normalized log-expression values while blocking on the library of origin, and we fit a trend to it. 17 | Some fiddling with the trend parameters is necessary to obtain an appropriate fit at high abundances. 18 | 19 | ```{r} 20 | system.time({ 21 | fit <- trendVar(sce, method="loess", parametric=TRUE, 22 | design=sce$Library, use.spikes=FALSE, BPPARAM=BPPARAM, 23 | loess.args=list(span=0.05, control=loess.control(iterations=100))) 24 | }) 25 | ``` 26 | 27 | For comparison, we fit a trend corresponding to pure Poisson noise. 28 | 29 | ```{r} 30 | system.time({ 31 | poistrend <- makeTechTrend( 32 | means=2^seq(0, max(fit$mean), length.out=100) - 1, 33 | size.factors=sizeFactors(sce), 34 | approx.npts=10000, BPPARAM=BPPARAM) 35 | }) 36 | ``` 37 | 38 | We decompose the biological and technical component for each gene. 39 | Note that we use the Poisson trend here, as the trend fitted to the endogenous variances does not provide a good estimate of the technical noise for UMI data. 40 | 41 | ```{r} 42 | fit0 <- fit 43 | fit0$trend <- poistrend 44 | dec <- decomposeVar(fit=fit0) 45 | dec <- cbind(rowData(sce)[,1:2], dec) 46 | dec <- dec[order(dec$p.value, -dec$bio),] 47 | head(dec) 48 | ``` 49 | 50 | We examine the mean-variance relationship. 51 | 52 | ```{r hvgplot} 53 | plot(fit$mean, fit$var, pch=16, cex=0.5, xlab="Mean of log-expression", 54 | ylab="Variance of log-expression") 55 | curve(fit$trend(x), add=TRUE, col="red") 56 | curve(poistrend(x), add=TRUE, col="blue") 57 | ``` 58 | 59 | Finally we save the results to file. 60 | 61 | ```{r} 62 | write.table(file="objects/hvg_output.txt", dec, sep="\t", quote=FALSE, row.names=FALSE) 63 | ``` 64 | 65 | 72 | --------------------------------------------------------------------------------